Example #1
def createDatabase(db, iterator,
                   force=False,
                   synonyms=None,
                   compression=None,
                   random_access_points=None,
                   regex_identifier=None,
                   clean_sequence=False,
                   ignore_duplicates=False,
                   allow_duplicates=False,
                   translator=None):
    """index files in filenames to create database.

    Two new files are created - db.fasta and db_name.idx

    If compression is enabled, provide random access points
    every # bytes.

    Dictzip is treated as an uncompressed file.

    regex_identifier: pattern to extract identifier from description line.
    If None, the part until the first white-space character is used.

    translator: optional callable applied to each fragment in place
    of the default whitespace removal.
    """

    if db.endswith(".fasta"):
        db = db[:-len(".fasta")]

    if compression:
        if compression == "lzo":
            import lzo

            def lzo_mangler(s):
                return lzo.compress(s, 9)
            mangler = lzo_mangler
            db_name = db + ".lzo"
            write_chunks = True
        elif compression == "zlib":
            def zlib_mangler(s):
                return zlib.compress(s, 9)
            mangler = zlib_mangler
            db_name = db + ".zlib"
            write_chunks = True
        elif compression == "gzip":
            mangler = gzip_mangler
            db_name = db + ".gz"
            write_chunks = True
        elif compression == "dictzip":
            import dictzip

            def mangler(x):
                return x

            db_name = db + ".dz"
            write_chunks = False
        elif compression == "bzip2":
            import bz2

            def bzip_mangler(x):
                return bz2.compress(x, 9)

            mangler = bzip_mangler
            db_name = db + ".bz2"
            write_chunks = True
        elif compression == "debug":
            def mangler(x):
                return x
            db_name = db + ".debug"
            write_chunks = True
        elif compression == "rle":
            import RLE
            mangler = RLE.compress
            db_name = db + ".rle"
            write_chunks = True
        else:
            raise ValueError("unknown compression library: %s" % compression)

        index_name = db + ".cdx"

        if write_chunks and (random_access_points is None
                             or random_access_points <= 0):
            raise ValueError("specify chunksize in --random-access-points")

    else:
        def mangler(x):
            return x
        db_name = db + ".fasta"
        write_chunks = False
        index_name = db + ".idx"

    if os.path.exists(db_name) and not force:
        raise ValueError("database %s already exists." % db_name)

    if os.path.exists(index_name) and not force:
        raise ValueError("database index %s already exists." % index_name)

    outfile_index = open(index_name, "w")
    if compression == "dictzip":
        if random_access_points is None or random_access_points <= 0:
            raise ValueError(
                "specify dictzip chunksize in --random-access-points")
        outfile_fasta = dictzip.open(
            db_name, "wb", buffersize=1000000, chunksize=random_access_points)
        compression = None
    else:
        outfile_fasta = open(db_name, "wb")

    identifiers = {}
    lsequence = 0
    identifier_pos, sequence_pos = 0, 0

    translation = string.maketrans("xX", "nN")

    fragments = []
    lfragment = 0

    last_identifier = None

    while True:

        try:
            result = next(iterator)
        except StopIteration:
            break

        if not result:
            break

        is_new, identifier, fragment = result

        if is_new:
            # check for duplicate identifiers
            if identifier in identifiers:
                if ignore_duplicates:
                    raise ValueError("ignore duplicates not implemented")
                elif allow_duplicates:
                    # the current implementation will fail if identical
                    # identifiers directly succeed each other; better: have
                    # the iterator return a flag that indicates a new
                    # identifier
                    out_identifier = identifier + \
                        "_%i" % (identifiers[identifier])
                    identifiers[identifier] += 1
                    identifiers[out_identifier] = 1
                else:
                    raise ValueError("%s occurs more than once" %
                                     (identifier,))
            else:
                identifiers[identifier] = 1
                out_identifier = identifier

            if last_identifier:
                if write_chunks:
                    writeFragments(outfile_fasta, outfile_index,
                                   fragments, mangler,
                                   size=random_access_points,
                                   write_all=True)

                    fragments = []
                    lfragment = 0
                else:
                    outfile_fasta.write("\n")

                outfile_index.write("\t%i\n" % lsequence)

            # write identifier
            identifier_pos = outfile_fasta.tell()
            outfile_fasta.write(mangler(">%s\n" % out_identifier))
            sequence_pos = outfile_fasta.tell()

            outfile_index.write("%s\t%i" % (out_identifier,
                                            identifier_pos))
            if write_chunks:
                outfile_index.write("\t%i" % random_access_points)
            else:
                outfile_index.write("\t%i" % sequence_pos)

            fragments = []
            lsequence = 0
            last_identifier = identifier

        if translator:
            s = translator(fragment)
        else:
            s = re.sub(r"\s", "", fragment.strip())
            if clean_sequence:
                s = s.translate(translation)

        lsequence += len(s)

        if write_chunks:
            fragments.append(s)
            lfragment += len(s)
            if lfragment > random_access_points:
                rest = writeFragments(outfile_fasta,
                                      outfile_index,
                                      fragments,
                                      mangler,
                                      size=random_access_points,
                                      write_all=False)
                fragments = [rest]
                lfragment = len(rest)
        else:
            outfile_fasta.write(mangler(s))

    if write_chunks:
        writeFragments(outfile_fasta, outfile_index, fragments, mangler,
                       size=random_access_points, write_all=True)
    else:
        outfile_fasta.write("\n")

    outfile_index.write("\t%i\n" % lsequence)

    # add synonyms for the table
    if synonyms:
        for key, vals in synonyms.items():
            for val in vals:
                outfile_index.write("%s\t%s\n" % (key, val))
Example #2
def createDatabase(db,
                   filenames,
                   force=False,
                   synonyms=None,
                   compression=None,
                   random_access_points=None,
                   regex_identifier=None):
    """index files in filenames to create database.

    Two new files are created: db.fasta (or a compressed variant)
    and its index db.idx.

    If compression is enabled, random access points are inserted
    every random_access_points bytes.

    Dictzip is treated as an uncompressed file.

    regex_identifier: pattern to extract identifier from description line.
    If None, the part until the first white-space character is used.
    """

    if compression:
        if compression == "lzo":
            import lzo

            def lzo_mangler(s):
                return lzo.compress(s, 9)

            mangler = lzo_mangler
            db_name = db + ".lzo"
            write_chunks = True
        elif compression == "zlib":

            def zlib_mangler(s):
                return zlib.compress(s, 9)

            mangler = zlib_mangler
            db_name = db + ".zlib"
            write_chunks = True
        elif compression == "gzip":
            mangler = gzip_mangler
            db_name = db + ".gz"
            write_chunks = True
        elif compression == "dictzip":
            import dictzip
            mangler = lambda x: x
            db_name = db + ".dz"
            write_chunks = False
        elif compression == "debug":
            mangler = lambda x: x
            db_name = db + ".debug"
            write_chunks = True
        else:
            raise "unknown compression library: %s" % compression

    else:
        mangler = lambda x: x
        db_name = db + ".fasta"
        write_chunks = False

    index_name = db + ".idx"

    if db in filenames:
        raise ValueError("database (%s) is part of input set." % db_name)

    if os.path.exists(db_name) and not force:
        raise ValueError("database %s already exists." % db_name)

    if os.path.exists(index_name) and not force:
        raise ValueError("database index %s already exists." % index_name)

    outfile_index = open(index_name, "w")
    if compression == "dictzip":
        import dictzip
        if random_access_points is None or random_access_points <= 0:
            raise ValueError(
                "specify dictzip chunksize in --random-access-points")
        outfile_fasta = dictzip.open(db_name,
                                     "wb",
                                     buffersize=1000000,
                                     chunksize=random_access_points)
        compression = None
    else:
        outfile_fasta = open(db_name, "wb")

    if isinstance(filenames, str):
        filenames = [filenames]

    identifiers = {}
    lsequence = 0
    identifier_pos, sequence_pos = 0, 0

    translation = string.maketrans("xX", "nN")

    for filename in filenames:

        if filename == "-":
            infile = sys.stdin
        elif filename.endswith(".gz"):
            infile = gzip.open(filename, "r")
        else:
            infile = open(filename, "r")

        fragments = []
        lfragment = 0
        first = True

        for line in infile:

            if line[0] == "#": continue

            if line[0] == ">":

                if not first:

                    if write_chunks:
                        writeFragments(outfile_fasta, outfile_index, fragments,
                                       mangler, random_access_points, True)

                        fragments = []
                        lfragment = 0
                    else:
                        outfile_fasta.write("\n")

                    outfile_index.write("\t%i\n" % lsequence)

                first = False

                if regex_identifier:
                    try:
                        identifier = re.search(regex_identifier,
                                               line[1:-1]).groups()[0]
                    except AttributeError:
                        raise ValueError(
                            "could not parse identifier from line %s" %
                            line[1:-1])
                else:
                    identifier = re.split(r"\s", line[1:-1])[0]

                # check for duplicate identifiers
                if identifier in identifiers:
                    raise ValueError(
                        "%s occurs more than once in %s and %s: line=%s" %
                        (identifier, identifiers[identifier], filename,
                         line[1:-1]))
                identifiers[identifier] = filename

                # write identifier, the identifier includes a new-line
                identifier_pos = outfile_fasta.tell()
                outfile_fasta.write("%s" % mangler(line))
                sequence_pos = outfile_fasta.tell()

                outfile_index.write("%s\t%i" % (identifier, identifier_pos))
                if write_chunks:
                    outfile_index.write("\t%i" % random_access_points)
                else:
                    outfile_index.write("\t%i" % sequence_pos)

                lsequence = 0

            else:

                s = re.sub(r"\s", "", line.strip())

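                # NOTE: 'options' is assumed to be a module-level object
                # of the enclosing script; it is not a parameter of this
                # variant.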
                if options.clean_sequence:
                    s = s.translate(translation)

                lsequence += len(s)

                if write_chunks:
                    fragments.append(s)
                    lfragment += len(s)
                    if lfragment > random_access_points:
                        rest = writeFragments(outfile_fasta, outfile_index,
                                              fragments, mangler,
                                              random_access_points, False)
                        fragments = [rest]
                        lfragment = len(rest)
                else:
                    outfile_fasta.write(mangler(s))

        if write_chunks:
            writeFragments(outfile_fasta, outfile_index, fragments, mangler,
                           random_access_points, True)
        else:
            outfile_fasta.write("\n")

        outfile_index.write("\t%i\n" % lsequence)

    # add synonyms for the table
    if synonyms:
        for key, vals in synonyms.items():
            for val in vals:
                outfile_index.write("%s\t%s\n" % (key, val))
Example #3
def createDatabase(db,
                   iterator,
                   force=False,
                   synonyms=None,
                   compression=None,
                   random_access_points=None,
                   regex_identifier=None,
                   clean_sequence=False,
                   ignore_duplicates=False,
                   allow_duplicates=False,
                   translator=None):
    """index files in filenames to create database.

    Two new files are created - db.fasta and db_name.idx

    If compression is enabled, provide random access points
    every # bytes.

    Dictzip is treated as an uncompressed file.

    regex_identifier: pattern to extract identifier from description line.
    If None, the part until the first white-space character is used.

    translator: optional callable applied to each fragment in place
    of the default whitespace removal.
    """

    if db.endswith(".fasta"):
        db = db[:-len(".fasta")]

    if compression:
        if compression == "lzo":
            import lzo

            def lzo_mangler(s):
                return lzo.compress(s, 9)

            mangler = lzo_mangler
            db_name = db + ".lzo"
            write_chunks = True
        elif compression == "zlib":

            def zlib_mangler(s):
                return zlib.compress(s, 9)

            mangler = zlib_mangler
            db_name = db + ".zlib"
            write_chunks = True
        elif compression == "gzip":
            mangler = gzip_mangler
            db_name = db + ".gz"
            write_chunks = True
        elif compression == "dictzip":
            import dictzip
            mangler = lambda x: x
            db_name = db + ".dz"
            write_chunks = False
        elif compression == "bzip2":
            import bz2

            def bzip_mangler(s):
                return bz2.compress(s, 9)

            mangler = bzip_mangler
            db_name = db + ".bz2"
            write_chunks = True
        elif compression == "debug":
            mangler = lambda x: x
            db_name = db + ".debug"
            write_chunks = True
        elif compression == "rle":
            import RLE
            mangler = RLE.compress
            db_name = db + ".rle"
            write_chunks = True
        else:
            raise ValueError("unknown compression library: %s" % compression)

        index_name = db + ".cdx"

        if write_chunks and (random_access_points is None
                             or random_access_points <= 0):
            raise ValueError("specify chunksize in --random-access-points")

    else:
        mangler = lambda x: x
        db_name = db + ".fasta"
        write_chunks = False
        index_name = db + ".idx"

    if os.path.exists(db_name) and not force:
        raise ValueError("database %s already exists." % db_name)

    if os.path.exists(index_name) and not force:
        raise ValueError("database index %s already exists." % index_name)

    outfile_index = open(index_name, "w")
    if compression == "dictzip":
        if random_access_points is None or random_access_points <= 0:
            raise ValueError(
                "specify dictzip chunksize in --random-access-points")
        outfile_fasta = dictzip.open(db_name,
                                     "wb",
                                     buffersize=1000000,
                                     chunksize=random_access_points)
        compression = None
    else:
        outfile_fasta = open(db_name, "wb")

    identifiers = {}
    lsequence = 0
    identifier_pos, sequence_pos = 0, 0

    translation = string.maketrans("xX", "nN")

    fragments = []
    lfragment = 0

    last_identifier = None

    while True:

        try:
            result = next(iterator)
        except StopIteration:
            break

        if not result:
            break

        is_new, identifier, fragment = result

        if is_new:
            # check for duplicate identifiers
            if identifier in identifiers:
                if ignore_duplicates:
                    raise ValueError("ignore duplicates not implemented")
                elif allow_duplicates:
                    # the current implementation will fail if identical
                    # identifiers directly succeed each other; better: have
                    # the iterator return a flag that indicates a new
                    # identifier
                    out_identifier = identifier + \
                        "_%i" % (identifiers[identifier])
                    identifiers[identifier] += 1
                    identifiers[out_identifier] = 1
                else:
                    raise ValueError("%s occurs more than once" %
                                     (identifier, ))
            else:
                identifiers[identifier] = 1
                out_identifier = identifier

            if last_identifier:
                if write_chunks:
                    writeFragments(outfile_fasta,
                                   outfile_index,
                                   fragments,
                                   mangler,
                                   size=random_access_points,
                                   write_all=True)

                    fragments = []
                    lfragment = 0
                else:
                    outfile_fasta.write("\n")

                outfile_index.write("\t%i\n" % lsequence)

            # write identifier
            identifier_pos = outfile_fasta.tell()
            outfile_fasta.write(mangler(">%s\n" % out_identifier))
            sequence_pos = outfile_fasta.tell()

            outfile_index.write("%s\t%i" % (out_identifier, identifier_pos))
            if write_chunks:
                outfile_index.write("\t%i" % random_access_points)
            else:
                outfile_index.write("\t%i" % sequence_pos)

            fragments = []
            lsequence = 0
            last_identifier = identifier

        if translator:
            s = translator(fragment)
        else:
            s = re.sub(r"\s", "", fragment.strip())
            if clean_sequence:
                s = s.translate(translation)

        lsequence += len(s)

        if write_chunks:
            fragments.append(s)
            lfragment += len(s)
            if lfragment > random_access_points:
                rest = writeFragments(outfile_fasta,
                                      outfile_index,
                                      fragments,
                                      mangler,
                                      size=random_access_points,
                                      write_all=False)
                fragments = [rest]
                lfragment = len(rest)
        else:
            outfile_fasta.write(mangler(s))

    if write_chunks:
        writeFragments(outfile_fasta,
                       outfile_index,
                       fragments,
                       mangler,
                       size=random_access_points,
                       write_all=True)
    else:
        outfile_fasta.write("\n")

    outfile_index.write("\t%i\n" % lsequence)

    # add synonyms for the table
    if synonyms:
        for key, vals in synonyms.items():
            for val in vals:
                outfile_index.write("%s\t%s\n" % (key, val))
Example #4
def createDatabase( db, 
                    filenames,
                    force = False,
                    synonyms = None,
                    compression = None,
                    random_access_points = None,
                    regex_identifier = None):
    """index files in filenames to create database.

    Two new files are created: db.fasta (or a compressed variant)
    and its index db.idx.

    If compression is enabled, random access points are inserted
    every random_access_points bytes.

    Dictzip is treated as an uncompressed file.

    regex_identifier: pattern to extract identifier from description line.
    If None, the part until the first white-space character is used.
    """

    if compression:
        if compression == "lzo":
            import lzo
            def lzo_mangler( s ): return lzo.compress(s, 9)
            mangler = lzo_mangler
            db_name = db + ".lzo"
            write_chunks = True
        elif compression == "zlib":
            def zlib_mangler( s ): return zlib.compress( s, 9)
            mangler = zlib_mangler
            db_name = db + ".zlib"
            write_chunks = True            
        elif compression == "gzip":
            mangler = gzip_mangler
            db_name = db + ".gz"
            write_chunks = True            
        elif compression == "dictzip":
            import dictzip
            mangler = lambda x: x
            db_name = db + ".dz"
            write_chunks = False
        elif compression == "debug":
            mangler = lambda x: x
            db_name = db + ".debug"
            write_chunks = True
        else:
            raise "unknown compression library: %s" % compression
        
    else:
        mangler = lambda x: x
        db_name = db + ".fasta"
        write_chunks = False
        
    index_name = db + ".idx"
    
    if db in filenames:
        raise ValueError( "database (%s) is part of input set." % db_name)

    if os.path.exists( db_name ) and not force:
        raise ValueError( "database %s already exists." % db_name )

    if os.path.exists( index_name ) and not force:
        raise ValueError( "database index %s already exists." % index_name )
    
    outfile_index = open( index_name, "w" )
    if compression == "dictzip":
        import dictzip
        if random_access_points is None or random_access_points <= 0:
            raise ValueError("specify dictzip chunksize in --random-access-points")
        outfile_fasta = dictzip.open( db_name, "wb", buffersize=1000000, chunksize=random_access_points )
        compression = None
    else:
        outfile_fasta = open( db_name, "wb" )

    if type(filenames) == types.StringType:
        filenames = [filenames]

    identifiers = {}
    lsequence = 0
    identifier_pos, sequence_pos = 0, 0

    translation = string.maketrans("xX", "nN")
    
    for filename in filenames:

        if filename == "-": 
            infile = sys.stdin
        elif filename[-3:] == ".gz":
            infile = gzip.open( filename, "r" )
        else:
            infile = open( filename, "r")

        fragments = []
        lfragment = 0
        first = True
        
        for line in infile:

            if line[0] == "#":
                continue
            
            if line[0] == ">" :
                
                if not first:
                    
                    if write_chunks:
                        writeFragments( outfile_fasta, outfile_index, fragments, mangler,
                                        random_access_points, True )
                        
                        fragments = []
                        lfragment = 0
                    else:
                        outfile_fasta.write( "\n" )
                        
                    outfile_index.write("\t%i\n" % lsequence)

                first = False
                
                if regex_identifier:
                    try:
                        identifier = re.search(regex_identifier, line[1:-1]).groups()[0]
                    except AttributeError:
                        raise ValueError(
                            "could not parse identifier from line %s" % line[1:-1])
                else:
                    identifier = re.split(r"\s", line[1:-1])[0]
                    
                # check for duplicate identifiers
                if identifier in identifiers:
                    raise ValueError(
                        "%s occurs more than once in %s and %s: line=%s" %
                        (identifier, identifiers[identifier], filename,
                         line[1:-1]))
                identifiers[identifier] = filename
                
                # write identifier, the identifier includes a new-line
                identifier_pos = outfile_fasta.tell()
                outfile_fasta.write( "%s" % mangler(line) )
                sequence_pos = outfile_fasta.tell()
                
                outfile_index.write( "%s\t%i" % (identifier,
                                                 identifier_pos ) )
                if write_chunks:
                    outfile_index.write( "\t%i" % random_access_points )
                else:
                    outfile_index.write( "\t%i" % sequence_pos )
                    
                lsequence = 0
                
            else:
                
                s = re.sub( r"\s", "", line.strip() )

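                # NOTE: 'options' is assumed to be a module-level object
                # of the enclosing script; it is not a parameter of this
                # variant.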
                if options.clean_sequence:
                    s = s.translate( translation )
                        
                lsequence += len(s)
                
                if write_chunks:
                    fragments.append(s)
                    lfragment += len(s)
                    if lfragment > random_access_points:
                        rest = writeFragments( outfile_fasta, outfile_index,
                                               fragments, mangler, random_access_points,
                                               False)
                        fragments = [rest]
                        lfragment = len(rest)
                else:
                    outfile_fasta.write( mangler(s) )
                    
        if write_chunks:
            writeFragments( outfile_fasta, outfile_index, fragments, mangler, random_access_points, True )
        else:
            outfile_fasta.write( "\n" )
            
        outfile_index.write("\t%i\n" % lsequence )

    # add synonyms for the table
    if synonyms:
        for key, vals in synonyms.items():
            for val in vals:
                outfile_index.write( "%s\t%s\n" % (key, val) )