def combine(fasta_files, output):
    """
    Concatenate a list of FASTA files into a single output file.

    Aside from whatever transparent handling gzOptOpen provides, this is
    plain file concatenation: inputs are appended in the order given and no
    parsing or validation of the FASTA content is performed.

    fasta_files -> Iterable of input file paths.
    output      -> Path of the file to write (opened with mode 'w').

    Returns the output path.
    """
    # Local import keeps this block self-contained.
    import shutil

    outputFile = gzOptOpen(output, mode='w')
    try:
        for fasta in fasta_files:
            fastaFile = gzOptOpen(fasta, mode='r')
            try:
                # Stream in chunks so a large database is never held in
                # memory all at once.
                shutil.copyfileobj(fastaFile, outputFile)
            finally:
                fastaFile.close()
    finally:
        # Close the output even if one of the inputs fails part-way.
        outputFile.close()
    return output
def parse_to_generator(fasta_file, labelConverter=(lambda x: x), filter_string=None):
    """
    Parse a FASTA file lazily, yielding (header, sequence) tuples.

    Only one record is held in memory at a time, so this can be used on very
    large FASTA files.

    fasta_file     -> Path of the FASTA file (opened through gzOptOpen).
    labelConverter -> Callable applied to each header (without the leading
                      '>' and surrounding whitespace) before it is yielded.
    filter_string  -> If given, records whose header contains this substring
                      are skipped.

    Example usage:
        for header, sequence in parse_to_generator(file_name):
            process(header, sequence)

    Notes:
      * Records that are followed by another header and have no sequence
        lines are skipped.  The final record in the file is yielded even if
        it has no sequence lines (historical behaviour, kept for
        compatibility).
      * Lines appearing before the first header are ignored.
      * An input containing no header line yields nothing.
    """
    def passes_filter(header):
        return (not filter_string) or filter_string not in header

    fasta = gzOptOpen(fasta_file, mode='r')
    try:
        header = ''
        sequence = []
        for line in fasta:
            if line.startswith('>'):
                # NOTE: it would be slightly more efficient to test
                # filter_string when the header is first seen and skip
                # aggregating the sequence for filtered records.
                if header and sequence and passes_filter(header):
                    yield (labelConverter(header), ''.join(sequence))
                sequence = []
                header = line[1:].strip()
            else:
                sequence.append(line.strip())

        # The trailing record must obey the same filter as every other
        # record, and an input without any header has nothing to emit.
        if header and passes_filter(header):
            yield (labelConverter(header), ''.join(sequence))
    finally:
        # Runs on exhaustion, on error, and when the generator is closed or
        # garbage-collected early, so the handle is never leaked.
        fasta.close()
def write_fasta(fasta, save_file, write_mode='w'):
    """
    Write a FASTA file from a mapping or an iterable of pairs.

    fasta      -> Either a dict (or dict subclass) of {header: sequence} or
                  an iterable of (header, sequence) pairs.
    save_file  -> Path of the file to write (opened through gzOptOpen).
    write_mode -> Mode passed to gzOptOpen; defaults to 'w'.

    Each header is stripped of surrounding whitespace and prefixed with '>'
    unless it already starts with one.  Every record is written as the
    header line, the sequence on a single line, and then a blank line.
    Records are written in the iteration order of the input; for a plain
    dict that order is not otherwise controlled here.
    """
    # isinstance (rather than an exact type comparison) lets dict subclasses
    # such as OrderedDict be treated as mappings too.
    entries = fasta.items() if isinstance(fasta, dict) else fasta

    fasta_file = gzOptOpen(save_file, mode=write_mode)
    try:
        for prot, seq in entries:
            # Strip before testing for '>' so leading whitespace cannot
            # cause a doubled marker, and so an empty header does not raise
            # IndexError.
            header = prot.strip()
            if not header.startswith('>'):
                header = '>' + header
            fasta_file.write("%s\n" % header)
            fasta_file.write("%s\n\n" % seq)
    finally:
        fasta_file.close()
def partial_database(fasta, output=None, search='', use_regex=False, include_matches=True):
    """
    Create a new FASTA database containing a filtered subset of the entries
    in an existing one.

    fasta           -> A target FASTA-format file.
    output          -> The output file.  If not specified, the result is
                       written to the input path with
                       'partial_database.fasta' appended; the input file
                       itself is not modified.
    search          -> Either a string or a list of strings.
    use_regex       -> If True, strings in search are interpreted as regular
                       expressions (matched with re.search against the
                       header).  Otherwise a search string "matches" if it
                       is contained in the header.
    include_matches -> If True (the default), FASTA entries are included in
                       the output if they match any search string; if
                       False, entries are included only if they match NONE
                       of the search strings.

    Note that the default search of '' is contained in every header, so with
    the other defaults every entry is copied.

    Each selected entry is written as '>' + header on one line followed by
    the whole sequence on the next line.

    Returns the output path.
    """
    if not output:
        output = fasta + 'partial_database.fasta'

    # 'basestring' only exists on Python 2; fall back to 'str' elsewhere so
    # a single search string is still recognised.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(search, string_types):
        search = [search]

    if use_regex:
        # Build a real list: a lazy map() would be exhausted after the first
        # header on Python 3.
        patterns = [re.compile(term) for term in search]

        def matches(header):
            return any(pattern.search(header) for pattern in patterns)
    else:
        terms = list(search)

        def matches(header):
            return any(term in header for term in terms)

    fastaGen = parse_to_generator(fasta)
    out = gzOptOpen(output, mode='w')
    try:
        for header, sequence in fastaGen:
            if matches(header) == include_matches:
                out.write('>' + header + '\n')
                out.write(sequence + '\n')
    finally:
        out.close()

    return output
def parse_to_dict(fastaFile, labelConverter=(lambda x: x), nonredundant_labels=False, filter_string=None):
    """
    Parse a FASTA file into a dict mapping headers to sequences.

    Takes longer to initialize than the generator, but random-access lookup
    of sequences can then be performed much faster.

    fastaFile           -> Path of the FASTA file (opened through gzOptOpen).
    labelConverter      -> Callable applied to each (whitespace-stripped)
                           header to produce the dict key.
    nonredundant_labels -> If True, each header is split on the '\\x01'
                           character and the sequence is stored under every
                           resulting label.
    filter_string       -> If given, records whose header contains this
                           substring are left out.

    Returns a dict of {label: sequence}, with newlines removed from each
    sequence.  An empty input returns an empty dict.

    Raises AssertionError if sequence data appears without a preceding
    header line.

    Notes:
      * Records that are followed by another header and have no sequence
        lines are skipped; the final record is stored even when its
        sequence is empty (historical behaviour, kept for compatibility).
      * When the same label occurs more than once, later records overwrite
        earlier ones.
    """
    index = {}

    def store(raw_key, seq_lines):
        # Record one finished entry, applying the same rules to every record
        # in the file, including the last one.
        if raw_key is None:
            raise AssertionError("Missing or invalid identification line?")
        if filter_string and filter_string in raw_key:
            return
        seq = ''.join(seq_lines).replace('\n', '')
        if nonredundant_labels:
            for key in raw_key.split('\x01'):
                index[labelConverter(key.strip())] = seq
        else:
            index[labelConverter(raw_key.strip())] = seq

    fasta = gzOptOpen(fastaFile, mode='r')
    try:
        nextKey = None
        # Collect lines in a list and join once; repeated string += is
        # quadratic on long sequences.
        seqLines = []
        for line in fasta:
            if line.startswith('>'):
                if seqLines:
                    store(nextKey, seqLines)
                seqLines = []
                nextKey = line[1:]
            else:
                seqLines.append(line)

        if nextKey is not None:
            store(nextKey, seqLines)
        elif seqLines:
            # Sequence data but no header anywhere in the file.
            store(None, seqLines)
    finally:
        fasta.close()

    return index
def __init__(self, filename, line_length=80):
    """
    Open the target file for writing and record the line length.

    filename    -> Path handed to gzOptOpen with mode 'w'; the resulting
                   handle is kept as self.file.
    line_length -> Stored as self.length (default 80).  How it is used is
                   not visible from this method; presumably it is the
                   maximum width of written sequence lines -- confirm
                   against the rest of the class.
    """
    handle = gzOptOpen(filename, mode='w')
    self.file = handle
    self.length = line_length
def iterate_spectra_simple(xmlfile):
    """
    Lazily yield spectra from an XML file.

    The file is opened through gzOptOpen and parsed incrementally with
    xml.iterparse.  For every parsed element whose tag equals
    ns('spectrum'), the result of readSpectrumXML(element) is yielded, and
    the element is cleared once the consumer asks for the next item, so
    already-processed spectra do not accumulate in memory.

    xmlfile -> Path of the XML file to read.

    The underlying file handle is closed when the generator is exhausted,
    closed early, or terminated by an error.
    """
    source = gzOptOpen(xmlfile)
    try:
        for evt, obj in xml.iterparse(source):
            if obj.tag == ns('spectrum'):
                yield readSpectrumXML(obj)
                # Free the element's children and text only after the
                # consumer is done with the yielded spectrum.
                obj.clear()
    finally:
        # iterparse does not close a file object it was handed, so do it
        # here to avoid leaking the handle.
        source.close()