import io
import json

import regex

# Imports below assume the surrounding pynab module layout.
from pynab import log, releases
from pynab.db import db_session, engine, copy_file, Pre

# Chunks that failed to insert, so a later run can retry them.
INSERTFAILS = []


def parseNzedbirc(unformattedPre):
    # Strip mIRC formatting codes (bold, colour, underline, etc.).
    CLEAN_REGEX = regex.compile(r'[\x02\x0F\x16\x1D\x1F]|\x03(\d{,2}(,\d{,2})?)?')
    PRE_REGEX = regex.compile(
        r'(?P<preType>.+): \[DT: (?P<pretime>.+)\] \[TT: (?P<name>.+)\] '
        r'\[SC: (?P<source>.+)\] \[CT: (?P<category>.+)\] \[RQ: (?P<request>.+)\] '
        r'\[SZ: (?P<size>.+)\] \[FL: (?P<files>.+)\] \[FN: (?P<filename>.+)\]')

    if unformattedPre is None:
        return None

    try:
        cleanPre = CLEAN_REGEX.sub('', unformattedPre)
        formattedPre = PRE_REGEX.search(cleanPre).groupdict()
    except Exception as e:
        log.debug("pre: Message prior to error - {}".format(unformattedPre))
        log.debug("pre: Error parsing nzedbirc - {}".format(e))
        return None

    formattedPre['nuked'] = formattedPre['preType'] == "NUK"

    # Split out the request id/group if a request exists.
    if formattedPre['request'] != "N/A":
        request = formattedPre['request'].split(":")
        formattedPre['requestid'] = request[0]
        formattedPre['requestgroup'] = request[1]
    else:
        formattedPre['requestid'] = None

    formattedPre['searchname'] = releases.clean_release_name(formattedPre['name'])

    # Remove any columns we don't need. Perhaps a way to filter these out
    # via regex? Or a way to ignore via sqlalchemy?
    for key in ('preType', 'size', 'files', 'request'):
        formattedPre.pop(key, None)

    return formattedPre
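
# A quick, self-contained sanity check for parseNzedbirc. The message below is
# a hypothetical example of the nzedbirc line format PRE_REGEX expects; all
# field values are made up for illustration.
def _exampleParseNzedbirc():
    sample = ("ADDPRE: [DT: 2015-06-01 12:00:00] [TT: Some.Release.720p.HDTV.x264-GRP] "
              "[SC: PRE] [CT: TV] [RQ: 12345:alt.binaries.teevee] "
              "[SZ: 1200MB] [FL: 30] [FN: some.release.720p.hdtv.x264-grp]")
    # Expect a dict carrying pretime/name/source/category/filename plus the
    # derived nuked/requestid/requestgroup/searchname keys.
    return parseNzedbirc(sample)
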
def process(precsv, processingFile=None):
    ordering = ['name', 'filename', 'nuked', 'category', 'pretime', 'source',
                'requestid', 'requestgroup', 'searchname']

    # Clean up the file a bit.
    precsv.replace("'", "", inplace=True, regex=True)
    # Collapse nzedb's multi-valued nuke codes down to a boolean.
    precsv["nuked"].replace({"2": "0", "3": "1", "4": "1", "5": "1", "69": "0"},
                            inplace=True)
    # Strip literal MySQL-dump NULL markers (\N) from the end of fields.
    precsv.replace(r'.\\N$', '', inplace=True, regex=True)

    # Sometimes there are duplicates within the table itself; remove them.
    # (take_last= was removed from pandas; keep='last' is the equivalent.)
    precsv.drop_duplicates(subset='name', keep='last', inplace=True)

    # Add a clean searchname column.
    precsv['searchname'] = precsv['name'].map(releases.clean_release_name)

    # Drop the pres without request ids.
    precsv = precsv[precsv.requestid != '0']

    # Create a list of names to check if they already exist.
    names = list(precsv.name)

    # Query for any existing pres; we need to delete them so COPY doesn't fail.
    prenamelist = []
    with db_session() as db:
        if names:
            pres = db.query(Pre).filter(Pre.name.in_(names)).all()
            for pre in pres:
                prenamelist.append(pre.name)

        data = io.StringIO()
        precsv.to_csv(data, index=False, header=False)

        # Delete any pres found, as we are essentially going to update them.
        if prenamelist:
            for pre in pres:
                db.delete(pre)
            db.commit()
            print("pre-import: Deleted {} pres that will be re-inserted".format(len(prenamelist)))
        else:
            print("pre-import: File clean, no pres need to be deleted before re-insert")

        try:
            if processingFile is not None:
                print("pre-import: Attempting to add {} to the database".format(processingFile['lastfile']))

                data.seek(0)
                copy_file(engine, data, ordering, Pre)

                # Write out the last pre csv name so the import can be restarted
                # later without downloading all the pres again.
                with open('lastfile.json', 'w') as outfile:
                    json.dump({'lastfile': int(processingFile['lastfile'])}, outfile)
            else:
                data.seek(0)
                copy_file(engine, data, ordering, Pre)
                data.close()
                print("pre-import: Chunk import successful")
        except Exception as e:
            print("pre-import: Error inserting into database - {}".format(e))
            if processingFile is not None:
                INSERTFAILS.append(processingFile['lastfile'])
            else:
                print("pre-import: Error processing chunk")
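
# Usage sketch: feed an nzedb pre dump through process() in chunks. The file
# name and raw column layout here are assumptions for illustration; the real
# dump's columns must line up with the `ordering` list inside process().
if __name__ == '__main__':
    import pandas as pd

    rawColumns = ['name', 'filename', 'nuked', 'category', 'pretime', 'source',
                  'requestid', 'requestgroup']

    # dtype=str keeps the "nuked" codes as strings so the replace() mapping
    # inside process() applies cleanly.
    for i, chunk in enumerate(pd.read_csv('predb_dump.csv', names=rawColumns,
                                          dtype=str, chunksize=10000)):
        process(chunk, processingFile={'lastfile': i})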