Example #1
def collect_entry(indexfh, identifier):
    """
    find position and size of entries corresponding to specified identifier
    from index
    """
    if args.duplicates:
        positions = get_positions(indexfh, identifier)
    elif args.zfound:
        positions = [get_position_last(indexfh, identifier)]
    else:
        positions = [get_position_first(indexfh, identifier)]

    if not positions or positions == [None]:  #identifier not found in index
        if args.verbose:
            eprint("    => WARNING: '{}' not found in index; skipping".format(
                identifier))
        return [], []

    entry_positions = list()
    entry_lengths = list()

    for position in positions:
        #1) extract position and entry size
        if args.encrypted:  #with iv
            posmatch = REESIVN.match(position)
        else:  #no iv
            posmatch = REES.match(position)
        entry_position, entry_length = posmatch.groups()

        #2) decode and append to lists
        entry_positions.append(b64_to_int(entry_position))
        entry_lengths.append(b64_to_int(entry_length))

    return entry_positions, entry_lengths
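
The index lines store each entry's position and size base64-encoded; the b64_to_int decoder itself is not among these examples. A minimal sketch of such a helper, assuming standard base64 over a big-endian integer (ffdb's actual encoding may differ):

from base64 import b64decode

def b64_to_int(b64string):
    # assumption: integers are stored as unpadded base64 of their
    # big-endian byte representation; restore '=' padding, then decode
    raw = b64decode(b64string + "=" * (-len(b64string) % 4))
    return int.from_bytes(raw, byteorder='big')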
Example #2
def check_index_compatibility(indexfile1, indexfile2):
    """
    check whether the old and new indexes are of the same type
    """

    with open(indexfile1, 'r', 1) as indexfh:
        line1 = indexfh.readline()  #first line of index

    index_type1, cipher1, keysize1, has_checksum1 = check_index(line1)

    with open(indexfile2, 'r', 1) as indexfh:
        line2 = indexfh.readline()  #first line of index

    index_type2, cipher2, keysize2, has_checksum2 = check_index(line2)

    if index_type1 != index_type2 or cipher1 != cipher2 or keysize1 != keysize2:
        eprint(
            "    => ERROR: The indexes you are merging '{}' and '{}'".format(
                indexfile1, indexfile2))
        eprint("              are of incompatible type! Cannot proceed!")
        sys.exit(22)
    if has_checksum1 != has_checksum2:
        eprint(
            "    => ERROR: The indexes you are merging '{}' and '{}'".format(
                indexfile1, indexfile2))
        eprint(
            "              are of the same type but only one contains checksums!"
        )
        eprint(
            "       Both should be either with or without checksums. Cannot proceed!"
        )
        sys.exit(22)
    args.index_type = index_type1
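
Every example reports diagnostics through eprint, which is not defined here; it is safe to assume the conventional print-to-stderr wrapper, keeping stdout free for data and index output:

import sys

def eprint(*args, **kwargs):
    # print to stderr so stdout stays clean for generated output
    print(*args, file=sys.stderr, **kwargs)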
Example #3
def check_files():
    """
    check for ability to open input/output filenames
    """
    check_iofiles([args.input_filename], [])

    args.outff_filename = None
    if args.encrypt:
        args.outff_filename = "{}.{}".format(args.input_filename, "enc")
    elif args.compress:  #if compressed but not encrypted
        args.outff_filename = "{}.{}".format(args.input_filename, "xz")

    if args.outff_filename is not None:
        check_iofiles([], [args.outff_filename])

    if args.threads > 1:  #multithread
        os.mkdir(args.mt_subfiles_dir, mode=0o700)
        args.chunk_itempfiles = list()
        for chunknum in range(args.chunks_count):
            chunknumstr = str(chunknum).zfill(len(str(
                args.chunks_count)))  #e.g. 00, 01..
            chunk_tempfile = args.mt_subfiles_iprefix + "." + chunknumstr
            try:
                myoutputfh = open(chunk_tempfile, 'w')
            except PermissionError:
                eprint(
                    "    => ERROR: Cannot open temporary file '{}' for writing"
                    .format(chunk_tempfile))
                sys.exit(1)
            args.chunk_itempfiles.append(chunk_tempfile)  #store temp filenames
            myoutputfh.close()
        delete_files(args.chunk_itempfiles)
        #for chunknum in range(args.chunks_count): #DEBUG!
        #   eprint(" >> the temporary index file for chunk #{} will be '{}'".format(
        #       chunknum, args.chunk_itempfiles[chunknum]))

        #if outff needs to be generated
        if args.compress or args.encrypt:
            args.chunk_otempfiles = list()
            for chunknum in range(args.chunks_count):
                chunknumstr = str(chunknum).zfill(len(str(
                    args.chunks_count)))  #e.g. 00, 01..
                chunk_tempfile = args.mt_subfiles_oprefix + "." + chunknumstr
                try:
                    myoutputfh = open(chunk_tempfile, 'w')
                except PermissionError:
                    eprint(
                        "    => ERROR: Cannot open temporary file '{}' for writing"
                        .format(chunk_tempfile))
                    sys.exit(1)
                args.chunk_otempfiles.append(
                    chunk_tempfile)  #store temp filenames
                myoutputfh.close()
            delete_files(args.chunk_otempfiles)
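
check_files probes each temporary file for writability and then removes it again through delete_files, which is not shown. A minimal stand-in (assumed, not ffdb's verbatim code):

import os

def delete_files(filenames):
    # best-effort removal of temporary files; a missing file is not an error
    for filename in filenames:
        try:
            os.remove(filename)
        except OSError:
            pass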
Example #4
def check_iofiles(read_filenames, write_filenames):
    """
    check for ability to open input/output filenames
    """
    if read_filenames is not None:
        for filename in read_filenames:
            try:
                inputfh = open(filename, 'r')
                inputfh.close()
            except FileNotFoundError:
                eprint(
                    "    => ERROR: Cannot open file '{}' for reading".format(
                        filename))
                sys.exit(2)

    if write_filenames is not None:
        for filename in write_filenames:
            if os.path.isfile(filename):
                eprint("    => ERROR: file '{}' exists".format(filename))
                eprint("       please remove it as we refuse to overwrite it!")
                sys.exit(1)
            try:
                myoutputfh = open(filename, 'w')
                myoutputfh.close()
            except PermissionError:
                eprint(
                    "    => ERROR: Cannot open file '{}' for writing".format(
                        filename))
                sys.exit(1)
            #eprint("deleting {}".format(filename)) #DEBUG
            delete_files([filename])
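
For illustration, a hypothetical call (filenames invented): check that one input is readable and that two outputs can be created and do not already exist:

check_iofiles(["db.dat"], ["db.dat.new", "db.pos.new"])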
Example #5
def print_stats(start_time):
    """
    if verbose print some final statistics on entries indexed
    """
    entries_sum = 0
    indexes_sum = 0
    skipped_sum = 0
    for chunknum in range(args.chunks_count):
        entries_sum += entries_count[chunknum]
        indexes_sum += indexes_count[chunknum]
        skipped_sum += skipped_count[chunknum]
    eprint(" '-- {} entries with {} indexes ~ {} entries skipped --'".format(
        entries_sum, indexes_sum, skipped_sum))
    eprint(" '-- Elapsed: {}, {} entries/sec --'".format(
        *elapsed_time(start_time, entries_sum)))
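
elapsed_time(start_time, count) is evidently expected to return a pair (formatted elapsed time, per-second rate), since it is unpacked into two format placeholders. A plausible sketch under that assumed contract (the real formatting may differ):

import time

def elapsed_time(start_time, processed_count):
    # assumed contract: (human-readable elapsed, integer items/sec)
    seconds = time.time() - start_time
    rate = int(processed_count / seconds) if seconds > 0 else processed_count
    return time.strftime("%H:%M:%S", time.gmtime(seconds)), rate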
Example #6
    def test_format_indexes(self):
        """
        Check for correct parsing of indexes
        """
        cipher_type = "A"
        for try_index_type in ("-", ".", ":", "+"):
            new_indexes = ffdb.format_indexes(entry,
                                              try_index_type,
                                              cipher_type,
                                              checksums=False)
            assert len(new_indexes) == len(entry['ids'])

            firstline = new_indexes[0].rstrip()
            ffdb.eprint("index_type: '{}', index_line: '{}'".format(
                try_index_type, firstline))

            index_type, cipher_name, keysize, _ = ffdb.check_index(firstline)

            assert try_index_type == index_type
            if cipher_name is not None:
                assert cipher_type == ffdb.get_cipher_type(cipher_name)
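
The four try_index_type characters exercised here are the markers that check_args (Example #15) writes into the first line of an index. For reference (the dict name is ours, not ffdb's):

INDEX_TYPE_MEANING = {
    "-": "plain entries, neither compressed nor encrypted",
    ":": "compressed entries (.xz flatfile)",
    ".": "encrypted entries (.enc flatfile)",
    "+": "compressed and encrypted entries (.enc flatfile)",
}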
Example #7
def check_files():
    """
    do some checks on availability of resources
    """
    check_iofiles([args.flatfile], [])
    check_iofiles([], [args.output_filename, args.outindex_filename])
    if args.threads > 1:  #multithread
        os.mkdir(args.mt_subfiles_dir, mode=0o700)
        args.chunk_itempfiles = list()
        for chunknum in range(args.chunks_count):
            chunknumstr = str(chunknum).zfill(len(str(
                args.chunks_count)))  #e.g. 00, 01..
            chunk_tempfile = args.mt_subfiles_iprefix + "." + chunknumstr
            try:
                myoutputfh = open(chunk_tempfile, 'w')
            except PermissionError:
                eprint(
                    "    => ERROR: Cannot open temporary file '{}' for writing"
                    .format(chunk_tempfile))
                sys.exit(1)
            args.chunk_itempfiles.append(chunk_tempfile)  #store temp filenames
            myoutputfh.close()
        delete_files(args.chunk_itempfiles)

        args.chunk_otempfiles = list()
        for chunknum in range(args.chunks_count):
            chunknumstr = str(chunknum).zfill(len(str(
                args.chunks_count)))  #e.g. 00, 01..
            chunk_tempfile = args.mt_subfiles_oprefix + "." + chunknumstr
            try:
                myoutputfh = open(chunk_tempfile, 'w')
            except PermissionError:
                eprint(
                    "    => ERROR: Cannot open temporary file '{}' for writing"
                    .format(chunk_tempfile))
                sys.exit(1)
            args.chunk_otempfiles.append(chunk_tempfile)  #store temp filenames
            myoutputfh.close()
        delete_files(args.chunk_otempfiles)
Example #8
    def test_ciphers(self):
        global SALT
        SALT = b'5ed3a4284d6a9c1e4e4f6b4729b254be'
        passphrase = "The quick brown fox jumps over the lazy dog"
        iv = ffdb.generate_iv()

        ffdb.eprint("Testing aes128")
        keysize = 16
        cipher_name, key = ffdb.derive_key(passphrase, keysize)
        cipher_type = ffdb.get_cipher_type(cipher_name)
        assert cipher_name == "aes128"
        assert key == b'c\x05i\xa5\x81c`\x8e(\xa4\xd3CR\xc9\xb0\xf1'
        assert cipher_type == "A"

        ffdb.eprint("Testing aes192")
        keysize = 24
        cipher_name, key = ffdb.derive_key(passphrase, keysize)
        cipher_type = ffdb.get_cipher_type(cipher_name)
        assert cipher_name == "aes192"
        assert key == b'c\x05i\xa5\x81c`\x8e(\xa4\xd3CR\xc9\xb0\xf1\x86L\x7f=a\xd3\x8cw'
        assert cipher_type == "B"

        ffdb.eprint("Testing aes256")
        keysize = 32
        cipher_name, key = ffdb.derive_key(passphrase, keysize)
        cipher_type = ffdb.get_cipher_type(cipher_name)
        assert cipher_name == "aes256"
        assert key == b'c\x05i\xa5\x81c`\x8e(\xa4\xd3CR\xc9\xb0\xf1\x86L\x7f=a\xd3\x8cw\xadc:\x899\xfe^\xfe'
        assert cipher_type == "C"

        ffdb.eprint("Testing encryption/decryption")
        for keysize in (16, 24, 32):
            cipher_name, key = ffdb.derive_key(passphrase, keysize)

            cipher = ffdb.init_cipher(key, iv)
            encrypted_data = cipher.encrypt(data.encode('UTF-8'))
            compressed_encrypted_data = cipher.encrypt(
                ffdb.deflate(data.encode('UTF-8'), 9))

            cipher = ffdb.init_cipher(key, iv)
            decrypted_data = cipher.decrypt(encrypted_data).decode('UTF-8')
            decrypted_uncompressed_data = ffdb.inflate(
                cipher.decrypt(compressed_encrypted_data)).decode('UTF-8')
            assert decrypted_data == data
            assert decrypted_uncompressed_data == data
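
The assertions above pin down get_cipher_type: it maps a cipher name to the one-letter type stored in the index. A sketch consistent with exactly those asserts:

def get_cipher_type(cipher_name):
    # mapping implied by the test: aes128->"A", aes192->"B", aes256->"C"
    return {"aes128": "A", "aes192": "B", "aes256": "C"}[cipher_name]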
Example #9
def print_stats(start_time):
    """
    print some final statistics on entries deleted
    """
    indexes_sum = 0
    deleted_sum = 0
    reindexed_sum = 0
    for chunknum in range(args.chunks_count):
        indexes_sum += indexes_count[chunknum]
        reindexed_sum += reindexed_count[chunknum]
        deleted_sum += deleted_count[chunknum]

    if found_count == 1:
        eprint(" |-- Found and removed 1 entry.")
    elif found_count > 0:
        eprint(" |-- Found and removed {} entries.".format(found_count))
    if found_count < requested_count:
        if found_count == 0:
            eprint(
                "    => WARNING: NONE of the {} requested identifiers found in index!"
                .format(requested_count))
        else:
            eprint(
                "    => WARNING: only {} of the {} requested identifiers found in index."
                .format(found_count, requested_count))
    eprint(" |-- Deleted {} and reindexed {} indexes out of total {}.".format(
        deleted_sum, reindexed_sum, indexes_sum))
    _, reindexed_speed = elapsed_time(start_time, reindexed_sum)
    eprint(" '-- Elapsed: {}, {} deletions/sec; {} reindexing/sec --'".format(
        *elapsed_time(start_time, found_count), reindexed_speed))
Example #10
def check_args():
    """
    parse arguments and check for error conditions
    """
    global args
    usagetxt = """{0} -f FLATFILE -i INDEXFILE -l LISTFILE [-o OUTPATH]
    [-f] : flatfile from which the entries should be removed
    [-i] : index of FLATFILE
    [-l] : file with list of identifiers for the entries that should be removed
    see {0} -h for tweaks and optional modes
    \nexamples:
    {0} -f entries.dat -i entries.pos -l removeme.list
      (will create entries.dat.new and entries.pos.new)
    {0} -f entries.dat -i entries.pos -l removeme.list -o cleaned
      (will create cleaned/entries.dat.new and cleaned/entries.pos.new)
    """.format(PROGNAME)
    parser = argparse.ArgumentParser(
        description='Use a positional index to delete \
                                     entries from a flatfile',
        usage=usagetxt)
    parser.add_argument(
        '-f',
        '--file',
        dest='flatfile',
        help="filename of flatfile from which entries should be deleted",
        required=True,
        type=str)
    parser.add_argument(
        '-i',
        '--index',
        dest='index_filename',
        help="filename of index file containing entry identifiers",
        required=True,
        type=str)
    parser.add_argument(
        '-l',
        '--list',
        dest='list_filename',
        help="a file containing a list of identifiers corresponding to entries \
                        to delete",
        required=True,
        type=str)
    parser.add_argument(
        '-o',
        '--outpath',
        dest='outpath',
        help="write new files to specified path rather than creating \
                        new files in the same location as the original ones",
        required=False,
        type=str)
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action='store_true',
                        help="verbose operation",
                        required=False)
    parser.add_argument(
        '-d',
        '--duplicates',
        dest='duplicates',
        action='store_true',
        help=
        "specify that INDEX_FILE could contain duplicate identifiers and request \
                        deletion of all of them (default is to delete a single entry)",
        required=False)
    parser.add_argument(
        '-z',
        '--zfound',
        dest='zfound',
        action='store_true',
        help="specify INDEX_FILE contains duplicate identifiers and request \
                        deletion of last entry appearing in the flatfile (default is the first)",
        required=False)
    parser.add_argument(
        '-t',
        '--threads',
        dest='threads',
        help="use specified number of multiple threads for parallel reindexing",
        required=False,
        type=int)
    parser.add_argument(
        '-b',
        '--blocksize',
        dest='index_blocksize',
        help="redefine blocksize used for parallel execution. By default \
                        it will be adjusted automatically to the number of threads",
        required=False,
        type=siprefix2num)
    args = parser.parse_args()

    randnum = str(randint(1000, 9999))

    args.progressbar = False
    if args.verbose:
        eprint(" .-- {} v{} -- by {} --.".format(PROGNAME, VERSION, AUTHOR))
        args.progressbar = True

    if args.zfound and args.duplicates:
        eprint(
            "    => ERROR: No sense specifying both --zfound and --duplicates at the same time"
        )
        sys.exit(22)

    if args.verbose:
        if args.zfound:
            eprint(" |-- [-z] option selected:")
            eprint(
                " |   if duplicates in index, the entry appearing last in ff will be deleted"
            )
        elif args.duplicates:
            eprint(" |-- [-d] option selected:")
            eprint(
                " |-- if duplicates in index, all corresponding entries in ff will be deleted"
            )
        else:
            eprint(
                " |-- if duplicates in index, the entry appearing first in ff will be deleted"
            )
            eprint(" |   you can change this behaviour with [-z] or [-d]")

    if args.flatfile[-3:] == ".gz":
        eprint("    => ERROR: -f argument has extension .gz; wrong file??")
        sys.exit(22)

    if args.flatfile.find("://") != -1 and not args.remote:
        eprint("    => NOTICE: -f argument appears to be an URL; wrong file??")
        sys.exit(22)

    if args.outpath is None:
        args.outindex_filename = args.index_filename + ".new"
        args.output_filename = args.flatfile + ".new"
    else:
        if not os.access(args.outpath, os.W_OK):
            eprint(
                "    => ERROR: specified outpath '{}' doesn't exist or is not writable!"
                .format(args.outpath))
            sys.exit(1)
        args.outindex_filename = os.path.join(args.outpath,
                                              args.index_filename + ".new")
        args.output_filename = os.path.join(args.outpath,
                                            args.flatfile + ".new")
    if args.verbose:
        eprint(" |-- updated flatfile and index will be '{}' '{}'".format(
            args.output_filename, args.outindex_filename))

    #gather information from first line of index
    args.encrypted = False
    with open(args.index_filename, 'r', 1) as indexfh:
        args.index_type, _, _, _ = check_index(indexfh.readline())
    if args.index_type in (".", "+"):
        args.encrypted = True
        if args.verbose:
            eprint(" |-- index made for encrypted entries")
            eprint(
                " `=> Please ensure the -f filename points to the encrypted flatfile"
            )
    if args.index_type in (":", "+"):
        if args.verbose:
            eprint(" |-- index made for compressed entries")
            eprint(
                " `=> Please ensure the -f filename points to the compressed flatfile"
            )

    if args.index_blocksize is not None:
        if args.threads is None:
            eprint(
                "    => ERROR: specifying blocksize makes sense only for -t execution"
            )
            sys.exit(22)
        if args.verbose:
            eprint(" |-- blocksize set to {} bytes".format(
                args.index_blocksize))
    if args.threads is not None:  #multithread
        if args.threads < 2:
            eprint(
                "    => ERROR: No sense specifying a number of threads lower than 2!"
            )
            sys.exit(22)
        if args.index_blocksize is None:
            #if not specified, use 1/threads of the index file size,
            #with MINBLOCKSIZE as a floor
            args.index_blocksize = max(
                calculate_blocksize(args.index_filename, args.threads),
                siprefix2num(MINBLOCKSIZE))
        args.mt_subfiles_dir = TEMPDIR + "/tmpREINDEX" + randnum + "/"
        args.mt_subfiles_iprefix = args.mt_subfiles_dir + "I"
        args.mt_subfiles_oprefix = args.mt_subfiles_dir + "O"
        args.list_filesize, args.chunks_count = calculate_chunknum(
            args.index_filename, args.index_blocksize)
        if args.verbose:
            eprint(
                " |-- Parallel work in {} chunks of maximum {} bytes (-b to change)"
                .format(args.chunks_count, args.index_blocksize))
            eprint(
                " |-- using maximum {} parallel threads (-t); your OS reports {} cpus."
                .format(args.threads, os.cpu_count()))
    else:  # if unspecified, set args.threads to 1
        args.threads = 1
        args.chunks_count = 1
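
siprefix2num is used as an argparse type= converter for -b, so it must turn size strings like '1M' into byte counts. A hypothetical implementation (binary multiples assumed; the real converter may accept more prefixes):

def siprefix2num(size_string):
    # hypothetical: accept a plain number or a k/M/G suffixed size
    multipliers = {"k": 1024, "K": 1024, "M": 1024**2, "G": 1024**3}
    if size_string and size_string[-1] in multipliers:
        return int(size_string[:-1]) * multipliers[size_string[-1]]
    return int(size_string)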
Example #11
    #2) delete entries from flatfile
    delete_entries(args.flatfile, args.output_filename, mysorted_positions,
                   myposition2size)

    #3) fill dict with cumulative offsets, used to update index
    myposition2offset = dict()
    size_offset = 0
    for myentry_position in mysorted_positions:
        size_offset += myposition2size[myentry_position]
        myposition2offset[myentry_position] = size_offset
    #eprint("removed a total of {} bytes".format(size_offset)) #debug
    if os.path.getsize(args.flatfile) - size_offset != os.path.getsize(
            args.output_filename):
        eprint(
            "    => ERROR: problems with deletion, file size of resulting file is wrong"
        )
        sys.exit(1)

    #4) update the index shifting positions, optionally with multithread
    if args.threads > 1:  #multithread
        if sys.version_info[1] > 7:  #from py3.8
            set_start_method('fork')  #spawn not implemented
        args.chunk_itempfiles, _ = split_file(args.index_filename,
                                              args.index_blocksize,
                                              args.mt_subfiles_iprefix)
        args.chunks_count = len(args.chunk_itempfiles)

        if args.verbose:
            eprint(
                " |-- parallel reindexing in chunks of maximum {} bytes (-b to change)"
Example #12
def check_args():
    """
    parse arguments and check for error conditions
    """
    global args, GZTOOL_EXE
    usagetxt = """{0} -f FLATFILE -i INDEXFILE -e ENTRIESFILE -n NEWINDEXFILE
    [-f] : flatfile into which the new entries should be added
    [-i] : index of FLATFILE
    [-e] : filename containing the new entries to be added
    [-n] : index of ENTRIESFILE
    see {0} -h for tweaks and optional modes
    \nexamples:
       {0} -f db.dat -i db.pos -e new.dat -n new.pos
         (will update db.dat and db.pos)
       {0} -c -f db.dat -i db.pos -e new.dat -n new.pos
         (will create db.dat.new and db.pos.new)
       {0} -c -o export -f db.dat -i db.pos -e new.dat -n new.pos
         (will create export/db.dat.new and export/db.pos.new)
    """.format(PROGNAME)
    parser = argparse.ArgumentParser(
        description='Merge new pre-indexed entries into an existing \
                                     flatfile',
        usage=usagetxt)
    parser.add_argument('-f',
                        '--file',
                        dest='ff_filename',
                        help="filename of flatfile to be processed",
                        required=True,
                        type=str)
    parser.add_argument('-i',
                        '--index',
                        dest='index_filename',
                        help="filename of index file with entry identifiers",
                        required=True,
                        type=str)
    parser.add_argument(
        '-e',
        '--entries',
        dest='newentries_filename',
        help="filename of new entries to be merged into flatfile",
        required=True,
        type=str)
    parser.add_argument('-n',
                        '--newindex',
                        dest='newindex_filename',
                        help="filename of index file with entry identifiers",
                        required=True,
                        type=str)
    parser.add_argument(
        '-c',
        '--create',
        dest='createmode',
        action='store_true',
        help="create new files (.new extension) rather than updating existing \
                        files (the default operation mode)",
        required=False)
    parser.add_argument(
        '-o',
        '--outpath',
        dest='outpath',
        help="optionally write new files to specified path rather than creating \
                        new files in the same location as the original ones",
        required=False,
        type=str)
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action='store_true',
                        help="verbose operation",
                        required=False)
    parser.add_argument(
        '-d',
        '--delete',
        dest='deleteafter',
        action='store_true',
        help="delete ENTRIESFILE and NEWINDEXFILE after merging is completed",
        required=False)
    parser.add_argument(
        '-g',
        '--gzip',
        dest='gzip',
        action='store_true',
        help="compress the final flatfile after merge, creating .gzi compressed \
                        index",
        required=False)
    parser.add_argument(
        '-s',
        '--small',
        dest='smallnew',
        action='store_true',
        help=
        "use this mode if the new index is small (<30k entries): performance \
                        should be better",
        required=False)
    args = parser.parse_args()

    if args.verbose:
        eprint(" .-- {} v{} -- by {} --.".format(PROGNAME, VERSION, AUTHOR))

    args.ff_compressed = False
    if args.ff_filename[-3:] == ".gz":
        args.ff_compressed = True
        if not args.gzip:
            eprint(
                "    => NOTICE: -f argument has extension .gz: assuming flatfile is compressed"
            )
            eprint(
                "               it will be uncompressed and then recompressed after merge"
            )
            args.gzip = True

    args.newentries_compressed = False
    if args.newentries_filename[-3:] == ".gz":
        args.newentries_compressed = True
        eprint(
            "    => NOTICE: -n argument has extension .gz: assuming newentriesfile compressed"
        )

    if args.ff_filename.find("://") != -1:
        eprint("    => ERROR: {} cannot operate on remote flatfiles".format(
            PROGNAME))
        sys.exit(22)

    if args.smallnew:
        if args.verbose:
            eprint(" |-- using tweak for smaller new index")
    else:
        randnum = str(randint(1000, 9999))
        args.itemp_filename = TEMPDIR + "/tmpMERGE" + randnum

    if args.outpath is None:
        args.outindex_filename = args.index_filename + ".new"
    else:
        if not os.access(args.outpath, os.W_OK):
            eprint(
                "    => ERROR: specified outpath '{}' doesn't exist or is not writable!"
                .format(args.outpath))
            sys.exit(1)
        args.outindex_filename = os.path.join(args.outpath,
                                              args.index_filename + ".new")
Example #13
if __name__ == '__main__':
    #parse and check arguments
    check_args()

    #check files (if they can be read/written)
    check_files()

    #check if old and new indexes are of same type:
    check_index_compatibility(args.index_filename, args.newindex_filename)

    #uncompress files if needed
    if args.ff_compressed:
        if args.verbose:
            eprint(
                " |-- uncompressing original flatfile.. this may take some time.."
            )
        args.ff_filename = uncompress_file(args.ff_filename)
    if args.newentries_compressed:
        if args.verbose:
            eprint(
                " |-- uncompressing newentries file.. this may take some time.."
            )
        args.newentries_filename = uncompress_file(args.newentries_filename)

    #calculate index offset
    pos_offset = os.path.getsize(args.ff_filename)

    #merge old and new identifiers' indexes
    if args.smallnew:
        tempfile = None
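
uncompress_file is not shown; given the GZTOOL_EXE global declared in check_args (Example #12), the real code may shell out to gztool. A minimal stand-in with plain gzip semantics (an assumption, not the actual implementation):

import subprocess

def uncompress_file(filename):
    # hypothetical: decompress in place and return the uncompressed name
    subprocess.run(["gunzip", filename], check=True)
    return filename[:-3]  # drop the ".gz" suffix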
Example #14
    def test_deflate(self):
        bytestring = data.encode('UTF-8')
        deflated = ffdb.deflate(bytestring, 9)
        inflated = ffdb.inflate(deflated).decode('UTF-8')
        ffdb.eprint("testing inflate/deflate of data")
        assert inflated == data, "problems with compression or decompression"
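
deflate/inflate are ffdb's compression pair; their implementation is not included. Since the indexer writes an .xz flatfile (Example #15), an LZMA-based sketch is plausible (an assumption; the real code could use zlib instead):

import lzma

def deflate(data, compresslevel):
    # hypothetical: xz/LZMA compression, matching the .xz output extension
    return lzma.compress(data, preset=compresslevel)

def inflate(data):
    return lzma.decompress(data)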
Example #15
def check_args():
    """
    parse arguments and check for error conditions
    """
    global args, patterns, joinedpatterns
    usagetxt = """{0} -f FLATFILE -i 'PATTERN' [-e ENDPATTERN] >INDEXFILE
    [-f] : flatfile to index
    [-i] : regex pattern for the identifiers; also [-j], see examples below
    [-e] : pattern for end of entry. defaults to "^-$"
    see '{0} -h' for tweaks and optional modes
    \nnotes: If compression or encryption is requested, an output flatfile will
         be created, and the resulting index will refer to it.
       If there are a LOT of identifiers and memory is an issue, you may wish to
         use the [-u] option and sort the resulting index after it has been generated.\nexamples:
       {0} -i '^AC   (.+?);' -f uniprot.dat -e '^//$' >up.pac
       {0} -i '^AC   (.+?);' 'ID   (.+?);' -f [...]
         (multiple patterns can be specified)

       {0} -i '^AC   (.+?);' -j '^OX   NCBI_(Tax)ID=(\\d+) ' -f [...]
         (complex patterns made of multiple parts can be specified with [-j];
          -i and -j patterns can be used together)

       {0} -a -j '^DR   (.+?);( .+?);' -f [...]
       {0} -a -i '^AC   (\\S+?); ?(\\S+)?;? ?(\\S+)?;?' -f [...]
         (use [-a] option to find all instances and capture groups of
          the provided patterns, not just the first one)
    """.format(PROGNAME)
    parser = argparse.ArgumentParser(
        description='Create a positional index for any flatfile, \
                                     optionally compressing or encrypting its entries',
        usage=usagetxt)
    parser.add_argument('-f',
                        '--file',
                        dest='input_filename',
                        help="Filename of flatfile to be processed",
                        required=True)
    parser.add_argument('-i',
                        '--id',
                        dest='patterns',
                        help="regexp pattern for identifier(s) to index",
                        required=False,
                        type=str,
                        nargs='+')
    parser.add_argument('-j',
                        '--joinedid',
                        dest='joinedpatterns',
                        help="regexp pattern for identifier(s) to index",
                        required=False,
                        type=str,
                        nargs='+')
    parser.add_argument(
        '-e',
        '--endpattern',
        dest='terminator',
        help="regexp pattern to identify the end of each entry. If unspecified \
                        it defaults to '^-$'",
        required=False)
    parser.add_argument(
        '-a',
        '--allmatches',
        dest='allmatches',
        action='store_true',
        help=
        "find all instances of the identifier pattern, not just the first one \
                        (the default behaviour)",
        required=False)
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action='store_true',
                        help="verbose operation",
                        required=False)
    parser.add_argument(
        '-t',
        '--threads',
        dest='threads',
        help="use specified number of threads for parallel indexing",
        required=False,
        type=int)
    parser.add_argument(
        '-b',
        '--blocksize',
        dest='input_blocksize',
        help="redefine blocksize used for parallel execution. By default \
                        it will be adjusted automatically to the number of threads",
        required=False,
        type=siprefix2num)
    parser.add_argument(
        '-o',
        '--offset',
        dest='pos_offset',
        help="optional offset (in bytes) to shift entry positions in index",
        required=False,
        type=int)
    parser.add_argument(
        '-k',
        '--keysize',
        dest='keysize',
        help="request entries to be encrypted and specify encryption strength: \
                        16=aes-128, 24=aes-192 or 32=aes-256. INPUT_FILENAME.enc will be created",
        required=False,
        type=int,
        choices=(16, 24, 32))
    parser.add_argument(
        '-p',
        '--passphrase',
        dest='passphrase',
        help="passphrase for encrypting the entries; if unspecified it will be \
                        requested interactively (safer)",
        required=False,
        type=str)
    parser.add_argument(
        '-c',
        '--compresslevel',
        dest='compresslevel',
        help="request entries to be compressed and specify a compress level. \
                        INPUT_FILENAME.xz will be created",
        required=False,
        type=int,
        choices=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9))
    parser.add_argument('-x',
                        '--xsanity',
                        dest='xsanity',
                        action='store_true',
                        help="compute entry checksums and add them to index",
                        required=False)
    parser.add_argument(
        '-u',
        '--unsorted',
        dest='unsorted',
        action='store_true',
        help="do not sort the index, leaving that task to a followup external \
                        command. Note that extraction requires a sorted index",
        required=False)
    parser.add_argument(
        '-n',
        '--nopos',
        dest='nopos',
        action='store_true',
        help="do not compute positions, just print matching identifiers",
        required=False)
    args = parser.parse_args()

    randnum = str(randint(1000, 9999))

    if args.patterns is None and args.joinedpatterns is None:
        eprint("    => ERROR: at least one of -i or -j needs to be given!")
        sys.exit(22)

    if args.patterns is None:
        args.patterns = []

    if args.joinedpatterns is None:
        args.joinedpatterns = []

    args.progressbar = False
    if args.verbose:
        eprint(" .-- {} v{} -- by {} --.".format(PROGNAME, VERSION, AUTHOR))
        args.progressbar = True

    if args.allmatches:
        if args.verbose:
            eprint(
                " |-- All matches of the pattern will be stored as identifiers"
            )

    if args.nopos:
        if args.pos_offset is not None or args.compresslevel is not None \
                or args.passphrase is not None or args.keysize is not None or \
                args.xsanity:
            eprint(
                "    => ERROR: No sense specifying compression, encryption, sanity"
            )
            eprint("    =>        or pos_offset when using --nopos option")
            sys.exit(22)
        if args.verbose:
            eprint(
                " |-- No positional index will be created. Only printing identifiers found"
            )

    if args.pos_offset is None:
        args.pos_offset = 0
    else:
        if args.verbose:
            eprint(" |-- positions to be offset by: {}".format(
                args.pos_offset))

    if args.unsorted:
        if args.verbose:
            eprint(
                "    => NOTICE: index will be printed unsorted as requested.")
            eprint("       Please sort index before using it for extraction")

    patterns = set()
    for pattern in args.patterns:
        if args.verbose:
            eprint(" |-- adding identifier pattern '{}'".format(pattern))
        patterns.add(re.compile(pattern.encode('UTF-8'), re.MULTILINE))
    joinedpatterns = set()
    for pattern in args.joinedpatterns:
        if args.verbose:
            eprint(
                " |-- adding joined identifier pattern '{}'".format(pattern))
        joinedpatterns.add(re.compile(pattern.encode('UTF-8'), re.MULTILINE))

    if args.terminator is None:
        args.terminator = "^-$"  #default
    if args.verbose:
        eprint(" |-- entry terminator pattern set as '{}'".format(
            args.terminator))

    if args.xsanity and args.verbose:
        eprint(" |-- entry checksums will be computed and added to index")

    if args.keysize is not None:
        if args.passphrase is None:
            eprint(" |-- keysize specified, please provide a passphrase:")
            args.passphrase = getpass.getpass(prompt=" |>> Passphrase: ")

    args.encrypt = False
    args.cipher_type = None
    args.key = None
    if args.passphrase is not None:
        args.encrypt = True
        if args.keysize is None:
            args.keysize = 16  #default
        args.cipher_name, args.key = derive_key(args.passphrase, args.keysize)
        args.cipher_type = get_cipher_type(args.cipher_name)
        if args.verbose:
            eprint(
                " |-- encrypted flatfile (with cipher {}) will be written to '{}.enc'"
                .format(args.cipher_name, args.input_filename))
            #eprint(" |-- the passphrase is: {}".format(args.passphrase)) #DEBUG!
            #eprint(" |-- the encryption key is: {}".format(args.key)) #DEBUG!

    args.compress = False
    if args.compresslevel is not None:
        args.compress = True
        if args.encrypt:
            args.index_type = "+"  #both compressed and encrypted
            if args.verbose:
                eprint(
                    " |-- entries will be compressed and encrypted to .enc file"
                )
        else:
            args.index_type = ":"  #compressed but not encrypted
            if args.verbose:
                eprint(" |-- entries will be compressed to .xz file")
    else:  #not compressed
        if args.encrypt:
            args.index_type = "."  #encrypted but not compressed
            if args.verbose:
                eprint(" |-- entries will be encrypted to .enc file")
        else:
            args.index_type = "-"  #not encrypted nor compressed but entry sizes stored

    if args.input_blocksize is not None:
        if args.threads is None:
            eprint(
                "    => ERROR: specifying blocksize makes sense only for -t execution"
            )
            sys.exit(22)
        if args.verbose:
            eprint(" |-- blocksize set to {} bytes".format(
                args.input_blocksize))
    if args.threads is not None:  #multithread
        if args.threads < 2:
            eprint(
                "    => ERROR: No sense specifying a number of threads lower than 2!"
            )
            sys.exit(22)
        if args.input_blocksize is None:
            #if not specified, use 1/threads of the input file size,
            #capped at MAXBLOCKSIZE and floored at MINBLOCKSIZE
            args.input_blocksize = max(
                siprefix2num(MINBLOCKSIZE),
                min(calculate_blocksize(args.input_filename, args.threads),
                    siprefix2num(MAXBLOCKSIZE)))
        args.input_filesize = os.path.getsize(args.input_filename)
        if args.input_blocksize > args.input_filesize // 2 * 3:  #i.e. over ~1.5x the file size
            eprint(
                "    => NOTICE: blocksize too BIG compared to flatfile size, -t not applicable!"
            )
            sys.exit(22)
        args.mt_subfiles_dir = TEMPDIR + "/tmpINDEX" + randnum + "/"
        args.mt_subfiles_fprefix = args.mt_subfiles_dir + "F"
        args.mt_subfiles_iprefix = args.mt_subfiles_dir + "I"
        args.mt_subfiles_oprefix = args.mt_subfiles_dir + "O"
        #find max number of chunks required (we'll adjust later on split)
        args.input_filesize, args.chunks_count = calculate_chunknum(
            args.input_filename, args.input_blocksize)
        if args.verbose:
            eprint(
                " |-- using maximum {} parallel threads (-t); your OS reports {} cpus."
                .format(args.threads, os.cpu_count()))
    else:  # if unspecified, set args.threads to 1
        args.threads = 1
        args.chunks_count = 1

    if args.nopos:
        args.index_type = ""  #no indexes
Example #16
        if sys.version_info[1] > 7:  #from py3.8
            set_start_method('fork')  #spawn not implemented
        args.chunk_ftemp_files = list()

        #find out where to split the input file without breaking entries
        args.chunk_ftemp_startpos, args.chunk_ftemp_filesizes = compute_split_positions(
            args.input_filename, args.input_blocksize, args.terminator)
        args.chunks_count = len(args.chunk_ftemp_filesizes)
        suffixlength = len(str(args.chunks_count))
        for mychunknum in range(args.chunks_count):
            chunk_suffix = str(mychunknum).zfill(suffixlength)
            args.chunk_ftemp_files.append(args.mt_subfiles_dir + chunk_suffix)

        if args.verbose:
            eprint(
                " |-- parallel work in chunks of maximum {} bytes (-b to change)"
                .format(args.input_blocksize))
            eprint(" |-- flatfile will be split into {} chunks".format(
                args.chunks_count))

        entries_count = Array('i', args.chunks_count)
        indexes_count = Array('i', args.chunks_count)
        skipped_count = Array('i', args.chunks_count)

        args.chunk_itempfiles = args.chunk_itempfiles[0:args.chunks_count]
        if args.outff_filename is not None:
            args.chunk_otempfiles = args.chunk_otempfiles[0:args.chunks_count]

        #init threads
        pool = Pool(args.threads,
                    initializer=init_thread,
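
The example breaks off mid-call, but the pattern is recognizable: per-chunk counters kept in shared multiprocessing.Array objects and handed to pool workers through an initializer. A self-contained sketch of that pattern (all names hypothetical):

from multiprocessing import Array, Pool

counters = None  # set inside each worker by init_thread

def init_thread(shared_counters):
    # make the shared counters visible in the worker processes
    global counters
    counters = shared_counters

def index_chunk(chunknum):
    counters[chunknum] += 1  # e.g. record entries indexed in this chunk

if __name__ == '__main__':
    shared = Array('i', 4)
    with Pool(2, initializer=init_thread, initargs=(shared,)) as pool:
        pool.map(index_chunk, range(4))
    print(list(shared))  # -> [1, 1, 1, 1]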