def save_all(parts):
    """Save a set of parts to the DB, in a batch if possible."""
    if parts:
        start = time.time()

        group_name = list(parts.values())[0]['group_name']

        with db_session() as db:
            # this is a little tricky. parts have no uniqueness at all.
            # no uniqid and the posted dates can change since it's based off the first
            # segment that we see in that part, which is different for each scan.
            # what we do is get the next-closest thing (subject+author+group) and
            # order it by oldest first, so when it's building the dict the newest parts
            # end on top (which are the most likely to be being saved to).
            # realistically, it shouldn't be a big problem - parts aren't stored in the db
            # for very long anyway, and they're only a problem while there. saving 500 million
            # segments to the db is probably not a great idea anyway.
            existing_parts = dict(
                ((part.hash, part) for part in
                 db.query(Part.id, Part.hash)
                 .filter(Part.hash.in_(parts.keys()))
                 .filter(Part.group_name == group_name)
                 .order_by(Part.posted.asc())
                 .all()
                 )
            )

            part_inserts = []
            for hash, part in parts.items():
                existing_part = existing_parts.get(hash, None)
                if not existing_part:
                    segments = part.pop('segments')
                    part_inserts.append(part)
                    part['segments'] = segments

            if part_inserts:
                ordering = ['hash', 'subject', 'group_name', 'posted', 'posted_by', 'total_segments', 'xref']

                s = io.StringIO()
                for part in part_inserts:
                    for item in ordering:
                        if item == 'posted':
                            s.write('"' + part[item].replace(tzinfo=None).strftime('%Y-%m-%d %H:%M:%S').replace('"', '\\"') + '",')
                        elif item == 'xref':
                            # leave off the trailing comma
                            s.write('"' + part[item].encode('utf-8', 'replace').decode('utf-8').replace('"', '\\"') + '"')
                        else:
                            s.write('"' + str(part[item]).encode('utf-8', 'replace').decode().replace('"', '\\"') + '",')
                    s.write("\n")
                s.seek(0)

                if not copy_file(engine, s, ordering, Part):
                    return False

                s.close()
                db.close()

        with db_session() as db:
            existing_parts = dict(
                ((part.hash, part) for part in
                 db.query(Part)
                 .options(
                     subqueryload('segments'),
                     Load(Part).load_only(Part.id, Part.hash),
                     Load(Segment).load_only(Segment.id, Segment.segment)
                 )
                 .filter(Part.hash.in_(parts.keys()))
                 .filter(Part.group_name == group_name)
                 .order_by(Part.posted.asc())
                 .all()
                 )
            )

            segment_inserts = []
            for hash, part in parts.items():
                existing_part = existing_parts.get(hash, None)
                if existing_part:
                    segments = dict(((s.segment, s) for s in existing_part.segments))
                    for segment_number, segment in part['segments'].items():
                        if int(segment_number) not in segments:
                            segment['part_id'] = existing_part.id
                            segment_inserts.append(segment)
                        else:
                            # we hit a duplicate message for a part
                            # kinda wish people would stop reposting shit constantly
                            pass
                else:
                    log.critical(
                        'parts: part didn\'t exist when we went to save it. backfilling with dead_binary_age not set to 0?')
                    return False

            if segment_inserts:
                ordering = ['segment', 'size', 'message_id', 'part_id']

                s = io.StringIO()
                for segment in segment_inserts:
                    for item in ordering:
                        if item == 'part_id':
                            # leave off the trailing comma
                            s.write('"' + str(segment[item]).replace('"', '\\"') + '"')
                        else:
                            s.write('"' + str(segment[item]).encode('utf-8', 'replace').decode('utf-8').replace('"', '\\"') + '",')
                    s.write("\n")
                s.seek(0)

                if not copy_file(engine, s, ordering, Segment):
                    return False

                s.close()
                db.close()

        end = time.time()

        log.debug('parts: saved {} parts and {} segments in {:.2f}s'.format(
            len(part_inserts),
            len(segment_inserts),
            end - start
        ))

        del part_inserts[:]
        del segment_inserts[:]

    return True
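
# save_all() above and process() below both hand their CSV-style buffers to
# copy_file(engine, file, ordering, model), which is defined elsewhere in the
# project. The function below is only a hypothetical sketch of such a helper,
# assuming a PostgreSQL engine whose raw DBAPI connection is psycopg2; the COPY
# options and the True/False return convention are inferred from how the
# callers use it, not taken from the project's actual implementation.
def copy_file(engine, fileobj, ordering, model):
    """Hypothetical sketch: bulk-load a quoted, comma-separated buffer into the
    model's table via PostgreSQL COPY."""
    table = model.__table__.name
    columns = ', '.join(ordering)
    # The callers write double-quoted fields with backslash-escaped quotes,
    # so the COPY options mirror that format.
    sql = "COPY {} ({}) FROM STDIN WITH CSV QUOTE '\"' ESCAPE '\\'".format(table, columns)

    conn = engine.raw_connection()
    try:
        with conn.cursor() as cur:
            cur.copy_expert(sql, fileobj)
        conn.commit()
        return True
    except Exception:
        conn.rollback()
        return False
    finally:
        conn.close()
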
def process(precsv, processingFile=None):
    """Clean a chunk of the pre dump and COPY it into the pre table."""
    ordering = ['name', 'filename', 'nuked', 'category', 'pretime', 'source', 'requestid', 'requestgroup', 'searchname']

    # Clean up the file a bit.
    precsv.replace("'", "", inplace=True, regex=True)
    precsv["nuked"].replace("2", "0", inplace=True)
    precsv["nuked"].replace("3", "1", inplace=True)
    precsv["nuked"].replace("4", "1", inplace=True)
    precsv["nuked"].replace("5", "1", inplace=True)
    precsv["nuked"].replace("69", "0", inplace=True)
    precsv.replace(".\\N$", '', inplace=True, regex=True)

    # Sometimes there are duplicates within the table itself, remove them
    # (keep='last' replaces the take_last=True flag removed from pandas)
    precsv.drop_duplicates(subset='name', keep='last', inplace=True)

    # Add clean searchname column
    precsv['searchname'] = precsv['name'].map(lambda name: releases.clean_release_name(name))

    # Drop the pres without requestids
    precsv = precsv[precsv.requestid != '0']

    # Create a list of names to check if they exist
    names = list(precsv.name)

    # Query to find any existing pres; we need to delete them so COPY doesn't fail
    prenamelist = []

    with db_session() as db:
        if names:
            pres = db.query(Pre).filter(Pre.name.in_(names)).all()
            for pre in pres:
                prenamelist.append(pre.name)

        data = io.StringIO()
        precsv.to_csv(data, index=False, header=False)

        # Delete any pres found as we are essentially going to update them
        if prenamelist:
            for pre in pres:
                db.delete(pre)
            db.commit()
            print("pre-import: Deleted {} pres that will be re-inserted".format(len(prenamelist)))
        else:
            print("pre-import: File clean, no pres need to be deleted before re-insert")

        try:
            if processingFile is not None:
                print("pre-import: Attempting to add {} to the database".format(processingFile['lastfile']))

                data.seek(0)
                copy_file(engine, data, ordering, Pre)

                # Write out the last pre csv name so it can be restarted later without downloading all the pres.
                with open('lastfile.json', 'w') as outfile:
                    json.dump({'lastfile': int(processingFile['lastfile'])}, outfile)

            else:
                data.seek(0)
                copy_file(engine, data, ordering, Pre)
                data.close()

            print("pre-import: Chunk import successful")

        except Exception as e:
            print("pre-import: Error inserting into database - {}".format(e))

            if processingFile is not None:
                INSERTFAILS.append(processingFile['lastfile'])
            else:
                print("pre-import: Error processing chunk")
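
# process() expects a pandas DataFrame holding one chunk of a pre dump, plus an
# optional dict carrying the dump identifier for restart tracking. A minimal,
# hypothetical caller might look like the following; the dump filename, chunk
# size and lastfile value are illustrative assumptions, not values from the
# project.
import pandas as pd

PRE_COLUMNS = ['name', 'filename', 'nuked', 'category', 'pretime', 'source',
               'requestid', 'requestgroup']

# dtype=str keeps requestid as a string so the precsv.requestid != '0' filter works.
for chunk in pd.read_csv('predb_dump.csv', names=PRE_COLUMNS, dtype=str,
                         chunksize=10000):
    process(chunk, processingFile={'lastfile': 1})
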