Esempio n. 1
0
def combine(fasta_files, output):
    """
    Simply combines a given list of FASTA files together.  Aside from gzip
    support, this is just file concatenation.

    fasta_files -> An iterable of FASTA file paths, in the order in which
    their contents should appear in the output.
    output -> Path of the file to create; it is opened with mode 'w'.

    Every file is opened through gzOptOpen.  Contents are copied verbatim and
    no separator is inserted between files, so an input that does not end
    with a newline will run into the first line of the next one.

    Returns the output path that was passed in.
    """
    # Function-scope import keeps this block independent of the module's
    # import section.
    import shutil

    outputFile = gzOptOpen(output, mode = 'w')
    try:
        for fasta in fasta_files:
            fastaFile = gzOptOpen(fasta, mode = 'r')
            try:
                # Copy in chunks instead of read()-ing the whole file, so
                # memory use stays bounded for very large databases.
                shutil.copyfileobj(fastaFile, outputFile)
            finally:
                fastaFile.close()
    finally:
        # Reached even when an input cannot be opened or read, so the output
        # handle is never leaked.
        outputFile.close()

    return output
Esempio n. 2
0
def parse_to_generator(fasta_file, labelConverter = (lambda x: x),
                       filter_string = None):
    '''
    Parses a FASTA file and provides a generator of (header,sequence) tuples.
    Can be used efficiently even on very large FASTA files.

    fasta_file -> Path of the FASTA file; it is opened through gzOptOpen.
    labelConverter -> Function applied to each header (with the leading ">"
    and surrounding whitespace removed) before it is yielded.
    filter_string -> If given, entries whose unconverted header contains this
    string are skipped.

    Entries without a header or without any sequence line are skipped.  The
    same rules apply to every entry, including the last one in the file, so
    an empty file yields nothing.

    The file is closed when the generator is exhausted, explicitly closed, or
    garbage-collected.

    Example usage:
    for header, sequence in parse_to_generator(file_name):
        process(header, sequence)
    '''

    def keep(header, sequence):
        # An entry is emitted only if it has a header, at least one sequence
        # line, and is not excluded by filter_string.
        if not (header and sequence):
            return False
        return (not filter_string) or filter_string not in header

    fasta = gzOptOpen(fasta_file, mode = 'r')
    try:
        header = ''
        sequence = []

        for line in fasta:
            # Slicing (rather than indexing) is safe even for an empty string.
            if line[:1] == '>':
                # A new header terminates the entry collected so far.
                if keep(header, sequence):
                    yield (labelConverter(header), ''.join(sequence))
                sequence = []
                header = line[1:].strip()
            else:
                sequence.append(line.strip())

        # The last entry has no following ">" line to flush it; it goes
        # through the same checks as every other entry.
        if keep(header, sequence):
            yield (labelConverter(header), ''.join(sequence))
    finally:
        # Also runs when the consumer stops iterating early.
        fasta.close()
Esempio n. 3
0
def write_fasta(fasta, save_file, write_mode='w'):
    '''
    Creates a fasta file given a dictionary with prot: sequence or an iterator
    of (prot, sequence) pairs.

    fasta -> A dict (or dict subclass) mapping header to sequence, or any
    iterable of (header, sequence) pairs.
    save_file -> Path of the file to write; it is opened through gzOptOpen.
    write_mode -> Mode passed to gzOptOpen ('w' by default).

    This method adds ">" at the beginning of each header if there isn't one
    already; surrounding whitespace is stripped from the header first.  Each
    sequence is written on a single line and followed by a blank line.

    Entries from a dict are written in the dict's own iteration order;
    entries from an iterable are written in the order they are produced.
    '''

    # isinstance (instead of an exact type comparison) so that dict
    # subclasses such as OrderedDict are read as header -> sequence mappings
    # rather than being iterated as bare keys.
    if isinstance(fasta, dict):
        entries = fasta.items()
    else:
        entries = fasta

    fasta_file = gzOptOpen(save_file, mode = write_mode)
    try:
        for prot, seq in entries:
            header = prot.strip()
            # startswith() also copes with an empty header, where indexing
            # the first character would raise IndexError.
            if not header.startswith('>'):
                header = '>' + header
            fasta_file.write("%s\n" % header)
            fasta_file.write("%s\n\n" % seq)
    finally:
        # Close the handle even if an entry is malformed.
        fasta_file.close()
Esempio n. 4
0
def partial_database(fasta, output = None, search = '',
                     use_regex = False, include_matches = True):
    """
    fasta -> A target FASTA-format file
    output -> The output file.  If this is not specified, the result is
    written to the input path with 'partial_database.fasta' appended.  It must
    not be the same path as fasta: the output is opened for writing before the
    input is read.
    search -> Either a string or a list of strings.
    use_regex -> If this is set to True, strings in search will be interpreted
    as regular expressions.  Otherwise, a search string "matches" if it is
    contained by the header.
    include_matches -> If this is True (the default,) FASTA entries are
    included in the output if they match a search string; if False, FASTA
    entries are included if they match NONE of the search strings.

    Creates a new fasta database by copying each entry from the original,
    according to the rules outlined above.  Entries are read with
    parse_to_generator, so each copied sequence is written on a single line.

    Returns the path of the output file.
    """

    if not output:
        output = fasta + 'partial_database.fasta'

    # basestring exists only on Python 2; fall back to str elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(search, string_types):
        search = [search]

    if use_regex:
        # Compile once into a real list: a lazy map() object would be
        # exhausted after the first header was tested.
        patterns = [re.compile(term) for term in search]

        def matches(header):
            return any(pattern.search(header) for pattern in patterns)
    else:
        terms = list(search)

        def matches(header):
            return any(term in header for term in terms)

    # Normalise so that any truthy/falsy value behaves like True/False.
    include_matches = bool(include_matches)

    fastaGen = parse_to_generator(fasta)

    out = gzOptOpen(output, mode = 'w')
    try:
        for header, sequence in fastaGen:
            if matches(header) == include_matches:
                out.write('>' + header + '\n')
                out.write(sequence + '\n')
    finally:
        # Close the output even if reading or matching fails part-way.
        out.close()

    return output
Esempio n. 5
0
def parse_to_dict(fastaFile, labelConverter = (lambda x: x), 
                  nonredundant_labels = False,
                  filter_string = None):
    '''
    Parses a FASTA file and provides a dict object from headers to corresponding
    sequences.  Takes longer to initialize than the generator, but random-access
    lookup of sequences can be performed much faster.

    fastaFile -> Path of the FASTA file; it is opened through gzOptOpen.
    labelConverter -> Function applied to each label (stripped of surrounding
    whitespace) to produce its dict key.
    nonredundant_labels -> If True, each header is split on the '\\x01'
    character and every resulting label is mapped to the same sequence.
    filter_string -> If given, records whose raw header contains this string
    are left out.

    The same rules apply to every record, including the last one in the file.
    A header that is not followed by any line is skipped, and an empty file
    gives an empty dict.  Newlines are removed from each sequence.

    Raises AssertionError if sequence data appears before the first header.
    '''

    index = {}

    def store(raw_key, seq_lines):
        # Adds one completed record to the index.
        if not seq_lines:
            return
        if raw_key is None:
            # Raised explicitly so the check survives "python -O" while
            # keeping the exception type of the original assert.
            raise AssertionError("Missing or invalid identification line?")
        if filter_string and filter_string in raw_key:
            return

        seq = ''.join(seq_lines).replace('\n', '')
        if nonredundant_labels:
            labels = raw_key.split('\x01')
        else:
            labels = [raw_key]
        for label in labels:
            # strip() drops the newline left at the end of the header line.
            index[labelConverter(label.strip())] = seq

    fasta = gzOptOpen(fastaFile, mode = 'r')
    try:
        nextKey = None
        # Collected in a list and joined once per record; repeated string
        # concatenation is quadratic for long sequences.
        seqLines = []
        for line in fasta:
            if line[:1] == ">":
                store(nextKey, seqLines)
                seqLines = []
                nextKey = line[1:]
            else:
                seqLines.append(line)

        # The last record has no following ">" line to flush it.
        store(nextKey, seqLines)
    finally:
        fasta.close()

    return index
Esempio n. 6
0
 def __init__(self, filename, line_length = 80):
     """
     Opens the target file for writing and records the line length.

     filename -> Passed to gzOptOpen with mode 'w'; the returned handle is
     kept as self.file.
     line_length -> Stored as self.length (80 by default).  Presumably the
     maximum width of the lines this object writes -- confirm against the
     methods that read self.length.
     """
     handle = gzOptOpen(filename, mode = 'w')
     self.file = handle
     self.length = line_length
Esempio n. 7
0
def iterate_spectra_simple(xmlfile):
    """
    Generator over the spectra stored in an XML file.

    xmlfile -> Path of the XML file; it is opened through gzOptOpen.

    The file is parsed incrementally with xml.iterparse.  Every element whose
    tag equals ns('spectrum') is converted with readSpectrumXML and the
    result is yielded; the element is then cleared.

    The file is closed when the generator is exhausted, explicitly closed, or
    garbage-collected.
    """
    source = gzOptOpen(xmlfile)
    try:
        for _event, obj in xml.iterparse(source):
            if obj.tag == ns('spectrum'):
                yield readSpectrumXML(obj)
                # Runs once the consumer asks for the next spectrum, i.e.
                # after it is done with the value built from this element.
                obj.clear()
    finally:
        # Previously the handle was never closed; this also runs when the
        # consumer stops iterating early.
        source.close()