Example #1
def save_all(parts):
    """Save a set of parts to the DB, in a batch if possible."""
    if parts:
        start = time.time()
        group_name = list(parts.values())[0]['group_name']

        with db_session() as db:
            # this is a little tricky. parts have no uniqueness at all.
            # no uniqid and the posted dates can change since it's based off the first
            # segment that we see in that part, which is different for each scan.
            # what we do is get the next-closest thing (subject+author+group) and
            # order it by oldest first, so when it's building the dict the newest parts
            # end on top (which are the most likely to be being saved to).

            # realistically, it shouldn't be a big problem - parts aren't stored in the db
            # for very long anyway, and they're only a problem while there. saving 500 million
            # segments to the db is probably not a great idea anyway.
            existing_parts = dict(
                ((part.hash, part) for part in
                 db.query(Part.id, Part.hash).filter(Part.hash.in_(parts.keys())).filter(
                     Part.group_name == group_name).order_by(Part.posted.asc()).all()
                )
            )

            part_inserts = []
            for hash, part in parts.items():
                existing_part = existing_parts.get(hash, None)
                if not existing_part:
                    segments = part.pop('segments')
                    part_inserts.append(part)
                    part['segments'] = segments

            if part_inserts:
                ordering = ['hash', 'subject', 'group_name', 'posted', 'posted_by', 'total_segments', 'xref']

                s = io.StringIO()
                for part in part_inserts:
                    for item in ordering:
                        if item == 'posted':
                            # strip the timezone and format the timestamp for COPY
                            posted = part[item].replace(tzinfo=None).strftime('%Y-%m-%d %H:%M:%S')
                            s.write('"' + posted.replace('"', '\\"') + '",')
                        elif item == 'xref':
                            # last column, so leave off the trailing comma
                            s.write('"' + part[item].encode('utf-8', 'replace').decode('utf-8').replace('"', '\\"') + '"')
                        else:
                            s.write('"' + str(part[item]).encode('utf-8', 'replace').decode('utf-8').replace('"', '\\"') + '",')
                    s.write("\n")
                s.seek(0)

                if not copy_file(engine, s, ordering, Part):
                    return False

                s.close()
                db.close()

        with db_session() as db:
            existing_parts = dict(
                ((part.hash, part) for part in
                 db.query(Part)
                .options(
                     subqueryload('segments'),
                     Load(Part).load_only(Part.id, Part.hash),
                     Load(Segment).load_only(Segment.id, Segment.segment)
                 )
                .filter(Part.hash.in_(parts.keys()))
                .filter(Part.group_name == group_name)
                .order_by(Part.posted.asc())
                .all()
                )
            )

            segment_inserts = []
            for hash, part in parts.items():
                existing_part = existing_parts.get(hash, None)
                if existing_part:
                    segments = dict(((s.segment, s) for s in existing_part.segments))
                    for segment_number, segment in part['segments'].items():
                        if int(segment_number) not in segments:
                            segment['part_id'] = existing_part.id
                            segment_inserts.append(segment)
                        else:
                            # we hit a duplicate message for a part
                            # kinda wish people would stop reposting shit constantly
                            pass
                else:
                    log.critical(
                        'parts: part didn\'t exist when we went to save it. backfilling with dead_binary_age not set to 0?')
                    return False

            if segment_inserts:
                ordering = ['segment', 'size', 'message_id', 'part_id']

                s = io.StringIO()
                for segment in segment_inserts:
                    for item in ordering:
                        if item == 'part_id':
                            # last column, so leave off the trailing comma
                            s.write('"' + str(segment[item]).replace('"', '\\"') + '"')
                        else:
                            s.write('"' + str(segment[item]).encode('utf-8', 'replace').decode('utf-8').replace('"', '\\"') + '",')
                    s.write("\n")
                s.seek(0)

                if not copy_file(engine, s, ordering, Segment):
                    return False

                s.close()
                db.close()

        end = time.time()

        log.debug('parts: saved {} parts and {} segments in {:.2f}s'.format(
            len(part_inserts),
            len(segment_inserts),
            end - start
        ))

        del part_inserts[:]
        del segment_inserts[:]

    return True
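
save_all hands its quoted, comma-separated buffer to a copy_file(engine, file, ordering, model) helper that is not shown in these examples. Below is a minimal sketch of what such a helper could look like, assuming PostgreSQL behind SQLAlchemy with psycopg2; the table lookup, COPY options and error handling are assumptions, not the project's actual implementation.

# Hedged sketch of a copy_file-style bulk loader. The COPY options mirror the
# quoting that save_all writes into the buffer: every field double-quoted,
# embedded quotes escaped with a backslash.
def copy_file(engine, fileobj, ordering, model):
    sql = "COPY {} ({}) FROM STDIN WITH (FORMAT csv, QUOTE '\"', ESCAPE '\\')".format(
        model.__tablename__, ', '.join(ordering))
    conn = engine.raw_connection()
    try:
        with conn.cursor() as cur:
            cur.copy_expert(sql, fileobj)
        conn.commit()
        return True
    except Exception:
        conn.rollback()
        return False
    finally:
        conn.close()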
Example #2
def process(precsv, processingFile=None):
    ordering = ['name', 'filename', 'nuked', 'category', 'pretime', 'source', 'requestid', 'requestgroup', 'searchname']

    # Clean up the file a bit.
    precsv.replace("'", "", inplace=True, regex=True)
    precsv["nuked"].replace("2", "0", inplace=True)
    precsv["nuked"].replace("3", "1", inplace=True)
    precsv["nuked"].replace("4", "1", inplace=True)
    precsv["nuked"].replace("5", "1", inplace=True)
    precsv["nuked"].replace("69", "0", inplace=True)
    precsv.replace(".\\N$", '', inplace=True, regex=True)

    # Sometimes there are duplicates within the table itself, remove them
    precsv.drop_duplicates(subset='name', take_last=True, inplace=True)

    # Add clean searchname column
    precsv['searchname'] = precsv['name'].map(lambda name: releases.clean_release_name(name))

    # Drop the pres without requestid's
    precsv = precsv[precsv.requestid != '0']

    # Create a list of names to check if they exist
    names = list(precsv.name)

    # Query to find any existing pres, we need to delete them so COPY doesn't fail
    prenamelist = []
    with db_session() as db:

        if names:
            pres = db.query(Pre).filter(Pre.name.in_(names)).all()

            for pre in pres:
                prenamelist.append(pre.name)

        data = io.StringIO()
        precsv.to_csv(data, index=False, header=False)

        # Delete any pres found as we are essentially going to update them
        if prenamelist:
            for pre in pres:
                db.delete(pre)
            db.commit()
            print("pre-import: Deleted {} pres that will re-inserted".format(len(prenamelist)))
        else:
            print("pre-import: File clean, no pres need to be deleted before re-insert")

    try:
        if processingFile is not None:
            print("pre-import: Attempting to add {} to the database".format(processingFile['lastfile']))

            data.seek(0)
            copy_file(engine, data, ordering, Pre)

            # Write out the last pre csv name so it can be restarted later without downloading all the pres.
            with open('lastfile.json', 'w') as outfile:
                json.dump({'lastfile': int(processingFile['lastfile'])}, outfile)

        else:
            data.seek(0)
            copy_file(engine, data, ordering, Pre)
            data.close()
            print("pre-import: Chunk import successful")

    except Exception as e:
        print("pre-import: Error inserting into database - {}".format(e))

        if processingFile is not None:
            INSERTFAILS.append(processingFile['lastfile'])
        else:
            print("pre-import: Error processing chunk")
Example #3
def process(precsv, processingFile=None):
    ordering = [
        'name', 'filename', 'nuked', 'category', 'pretime', 'source',
        'requestid', 'requestgroup', 'searchname'
    ]

    # Clean up the file a bit.
    precsv.replace("'", "", inplace=True, regex=True)
    precsv["nuked"].replace("2", "0", inplace=True)
    precsv["nuked"].replace("3", "1", inplace=True)
    precsv["nuked"].replace("4", "1", inplace=True)
    precsv["nuked"].replace("5", "1", inplace=True)
    precsv["nuked"].replace("69", "0", inplace=True)
    precsv.replace(".\\N$", '', inplace=True, regex=True)

    # Sometimes there are duplicates within the table itself, remove them
    precsv.drop_duplicates(subset='name', take_last=True, inplace=True)

    # Add clean searchname column
    precsv['searchname'] = precsv['name'].map(
        lambda name: releases.clean_release_name(name))

    # Drop the pres without requestid's
    precsv = precsv[precsv.requestid != '0']

    # Create a list of names to check if they exist
    names = list(precsv.name)

    # Query to find any existing pres, we need to delete them so COPY doesn't fail
    prenamelist = []
    with db_session() as db:

        if names:
            pres = db.query(Pre).filter(Pre.name.in_(names)).all()

            for pre in pres:
                prenamelist.append(pre.name)

        data = io.StringIO()
        precsv.to_csv(data, index=False, header=False)

        # Delete any pres found as we are essentially going to update them
        if prenamelist:
            for pre in pres:
                db.delete(pre)
            db.commit()
            print("pre-import: Deleted {} pres that will re-inserted".format(
                len(prenamelist)))
        else:
            print(
                "pre-import: File clean, no pres need to be deleted before re-insert"
            )

    try:
        if processingFile is not None:
            print("pre-import: Attempting to add {} to the database".format(
                processingFile['lastfile']))

            data.seek(0)
            copy_file(engine, data, ordering, Pre)

            # Write out the last pre csv name so it can be restarted later without downloading all the pres.
            with open('lastfile.json', 'w') as outfile:
                json.dump({'lastfile': int(processingFile['lastfile'])},
                          outfile)

        else:
            data.seek(0)
            copy_file(engine, data, ordering, Pre)
            data.close()
            print("pre-import: Chunk import successful")

    except Exception as e:
        print("pre-import: Error inserting into database - {}".format(e))

        if processingFile is not None:
            INSERTFAILS.append(processingFile['lastfile'])
        else:
            print("pre-import: Error processing chunk")
Example #4
def save_all(parts):
    """Save a set of parts to the DB, in a batch if possible."""
    if parts:
        start = time.time()
        group_name = list(parts.values())[0]['group_name']

        with db_session() as db:
            # this is a little tricky. parts have no uniqueness at all.
            # no uniqid and the posted dates can change since it's based off the first
            # segment that we see in that part, which is different for each scan.
            # what we do is get the next-closest thing (subject+author+group) and
            # order it by oldest first, so when it's building the dict the newest parts
            # end on top (which are the most likely to be being saved to).

            # realistically, it shouldn't be a big problem - parts aren't stored in the db
            # for very long anyway, and they're only a problem while there. saving 500 million
            # segments to the db is probably not a great idea anyway.
            existing_parts = dict(((part.hash, part) for part in db.query(
                Part.id, Part.hash).filter(Part.hash.in_(parts.keys())).filter(
                    Part.group_name == group_name).order_by(
                        Part.posted.asc()).all()))

            part_inserts = []
            for hash, part in parts.items():
                existing_part = existing_parts.get(hash, None)
                if not existing_part:
                    segments = part.pop('segments')
                    part_inserts.append(part)
                    part['segments'] = segments

            if part_inserts:
                ordering = [
                    'hash', 'subject', 'group_name', 'posted', 'posted_by',
                    'total_segments', 'xref'
                ]

                s = io.StringIO()
                for part in part_inserts:
                    for item in ordering:
                        if item == 'posted':
                            s.write('"' + part[item].replace(
                                tzinfo=None).strftime(
                                    '%Y-%m-%d %H:%M:%S').replace('"', '\\"') +
                                    '",')
                        elif item == 'xref':
                            # leave off the comma
                            s.write('"' +
                                    part[item].encode('utf-8', 'replace').
                                    decode('utf-8').replace('"', '\\"') + '"')
                        else:
                            s.write('"' +
                                    str(part[item]).encode('utf-8', 'replace').
                                    decode().replace('"', '\\"') + '",')
                    s.write("\n")
                s.seek(0)

                if not copy_file(engine, s, ordering, Part):
                    return False

                s.close()
                db.close()

        with db_session() as db:
            existing_parts = dict(
                ((part.hash, part) for part in db.query(Part).options(
                    subqueryload('segments'),
                    Load(Part).load_only(Part.id, Part.hash),
                    Load(Segment).load_only(Segment.id, Segment.segment)).
                 filter(Part.hash.in_(parts.keys())).filter(
                     Part.group_name == group_name).order_by(
                         Part.posted.asc()).all()))

            segment_inserts = []
            for hash, part in parts.items():
                existing_part = existing_parts.get(hash, None)
                if existing_part:
                    segments = dict(
                        ((s.segment, s) for s in existing_part.segments))
                    for segment_number, segment in part['segments'].items():
                        if int(segment_number) not in segments:
                            segment['part_id'] = existing_part.id
                            segment_inserts.append(segment)
                        else:
                            # we hit a duplicate message for a part
                            # kinda wish people would stop reposting shit constantly
                            pass
                else:
                    log.critical(
                        'parts: part didn\'t exist when we went to save it. backfilling with dead_binary_age not set to 0?'
                    )
                    return False

            if segment_inserts:
                ordering = ['segment', 'size', 'message_id', 'part_id']

                s = io.StringIO()
                for segment in segment_inserts:
                    for item in ordering:
                        if item == 'part_id':
                            # last column, so leave off the trailing comma
                            s.write('"' +
                                    str(segment[item]).replace('"', '\\"') +
                                    '"')
                        else:
                            s.write(
                                '"' +
                                str(segment[item]).encode('utf-8', 'replace').
                                decode('utf-8').replace('"', '\\"') + '",')
                    s.write("\n")
                s.seek(0)

                if not copy_file(engine, s, ordering, Segment):
                    return False

                s.close()
                db.close()

        end = time.time()

        log.debug('parts: saved {} parts and {} segments in {:.2f}s'.format(
            len(part_inserts), len(segment_inserts), end - start))

        del part_inserts[:]
        del segment_inserts[:]

    return True
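
Reading save_all back from the other direction, the parts argument it expects is a dict keyed by part hash, where each part carries the columns listed in ordering plus a nested segments dict keyed by segment number. A hedged sketch of that shape, with made-up values purely for illustration:

# Hedged sketch of the input shape save_all works against, inferred from the
# keys it reads above. All values here are invented examples.
from datetime import datetime, timezone

parts = {
    'abc123': {                                  # keyed by part hash
        'hash': 'abc123',
        'subject': 'example post [01/10]',
        'group_name': 'alt.binaries.example',
        'posted': datetime.now(timezone.utc),    # tz-aware; save_all strips tzinfo
        'posted_by': 'poster@example.com',
        'total_segments': 10,
        'xref': 'news.example.com alt.binaries.example:1',
        'segments': {                            # keyed by segment number (string)
            '1': {'segment': 1, 'size': 768000, 'message_id': '<msgid-1@example.com>'},
        },
    },
}

save_all(parts)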