def process_file(self, filename):
    """Print the ID of every sequence record read from ``filename``.

    The input is opened with ``open_anything`` and parsed by
    ``fasta.Parser``; each record is passed through
    ``fasta.regexp_remapper`` (configured with
    ``self.options.sequence_id_regexp``) before its ID is printed.
    """
    self.log.info("Processing %s..." % filename)

    handle = open_anything(filename)
    records = fasta.regexp_remapper(
        fasta.Parser(handle), self.options.sequence_id_regexp
    )
    for record in records:
        print(record.id)
def separate_sequences(self, table, sequences_file):
    """Split a FASTA sequence file into one FASTA file per cluster.

    ``table`` maps each cluster name to an iterable of sequence IDs.
    For every cluster, a file named ``<cluster_name>.faa`` is written
    inside ``self.sequences_dir`` containing the sequences of that
    cluster, looked up by ID in ``sequences_file``.

    Raises ``KeyError`` if a cluster refers to a sequence ID that is
    not present in ``sequences_file``.
    """
    reader = fasta.Parser(open_anything(sequences_file))
    # Index all the sequences by ID so that clusters can look them up.
    seqs = {seq.id: seq for seq in reader}

    for cluster_name, cluster_seqs in table.items():
        output_file_name = self.sequences_dir + os.path.sep + cluster_name
        # The context manager guarantees that the output file is closed
        # even if a lookup or a write fails halfway through the cluster.
        with open(output_file_name + ".faa", "w") as output_fd:
            writer = fasta.Writer(output_fd)
            for sequence in cluster_seqs:
                obj = SeqRecord(seqs[sequence].seq, sequence, "", "")
                writer.write(obj)
def run_real(self):
    """Run the application.

    Prepares the set of valid sequence IDs and the set of accepted
    sources from ``self.options``, then processes every input given in
    ``self.args`` (falling back to ``"-"`` when there is none) with a
    freshly created output formatter.
    """
    sequences_file = self.options.sequences_file
    if not sequences_file:
        # No sequence file: every sequence ID is considered valid.
        self.valid_sequence_ids = complementerset()
        self.total_sequence_length = None
    else:
        self.log.info("Loading sequences from %s..." % sequences_file)
        self.total_sequence_length = 0
        valid_ids = self.valid_sequence_ids = set()
        records = fasta.regexp_remapper(
            fasta.Parser(open_anything(sequences_file)),
            self.options.sequence_id_regexp,
        )
        for record in records:
            valid_ids.add(record.id)
            self.total_sequence_length += len(record.seq)

    # Work out which sources are allowed: start from the included ones
    # (or from "everything") and take away the excluded ones.
    included = self.options.include_sources
    self.sources = set(included) if included else complementerset()
    self.sources.difference_update(self.options.exclude_sources)

    if isinstance(self.sources, complementerset):
        ignored = ", ".join(self.sources.iterexcluded())
        self.log.info("Ignored sources: %s" % ignored)
    else:
        accepted = ", ".join(self.sources)
        self.log.info("Accepted sources: %s" % accepted)

    if not self.args:
        self.args = ["-"]

    for infile in self.args:
        # Each input gets its own formatter, finished after processing.
        formatter_class = (
            GenomeLevelOutputFormatter
            if self.options.print_totals
            else SequenceLevelOutputFormatter
        )
        self.output_formatter = formatter_class(self)
        self.process_infile(infile)
        self.output_formatter.finish()
def load_sequences_from_file(self, fname):
    """Load the sequences stored in the given FASTA file.

    ``fname`` is handed to ``open_anything``, so (as the original
    documentation states) it may be a file pointer or the name of a
    gzipped/bzipped file. The parsed records are forwarded to
    ``self.load_sequences`` and its result is returned.
    """
    source = open_anything(fname)
    records = fasta.Parser(source)
    return self.load_sequences(records)
def process_sequences_file(self, seq_file):
    """Extract the requested fragments of each sequence as FASTA records.

    The sequences of ``seq_file`` are read one by one. For every
    sequence whose ID is a key of ``self.parts``, each ``(left, right)``
    pair stored there is cut out and written to standard output and,
    when ``self.output_file`` is not ``None``, to that file as well.
    Coordinates are 1-based and inclusive; negative values count from
    the end of the sequence.

    Returns 0 on success, or 1 if some IDs in ``self.parts`` were never
    seen in ``seq_file``.
    """
    self.log.info("Processing fasta file %s...", seq_file)
    parser = fasta.Parser(open_anything(seq_file))
    parser = fasta.regexp_remapper(parser, self.options.sequence_id_regexp)

    ids_to_process = set(self.parts.keys())
    writer = fasta.FastWriter(sys.stdout)

    output_fd = None
    writer_file = None
    if self.output_file is not None:
        output_fd = open(self.output_file, "w")

    try:
        if output_fd is not None:
            writer_file = fasta.FastWriter(output_fd)

        for seq in parser:
            seq_id = seq.id
            if seq_id not in self.parts:
                if not self.options.try_alternative_splicing:
                    continue
                # Drop a literal ".1" suffix only. str.rstrip(".1") would
                # strip *any* trailing run of "." and "1" characters and
                # mangle IDs such as "AT1G01011.1".
                seq_id = seq_id.strip()
                if seq_id.endswith(".1"):
                    seq_id = seq_id[:-2]
                if seq_id not in self.parts:
                    continue

            sequence = seq.seq
            length_seq = len(sequence)
            # discard() instead of remove(): the same ID may be reached
            # more than once (duplicate records, or "X" and "X.1" both
            # mapping to "X"), which must not raise KeyError.
            ids_to_process.discard(seq_id)

            for left, right in self.parts[seq_id]:
                # Negative coordinates are relative to the sequence end.
                if left < 0:
                    left = length_seq + left + 1
                if right < 0:
                    right = length_seq + right + 1
                right = min(right, length_seq)  # just in case...
                if left > right:
                    # again, just in case
                    self.log.warning(
                        "Problem with fragment of %s, "
                        "the right part is smaller than "
                        "the left", seq_id)
                    continue

                if left == 1 and right == length_seq:
                    # The fragment spans the whole sequence.
                    new_record = seq.fragment(not self.options.keep_ids)
                else:
                    if not self.options.keep_ids:
                        new_id = "%s:%d-%d" % (seq_id, left, right)
                    else:
                        new_id = seq_id
                    new_record = SeqRecord(sequence[(left - 1):right],
                                           id=new_id, name=seq.name,
                                           description="")

                writer.write(new_record)
                if writer_file is not None:
                    writer_file.write(new_record)
    finally:
        # Close the output file even if parsing or writing failed.
        if output_fd is not None:
            output_fd.close()

    if ids_to_process:
        # Sorted so that the message is deterministic.
        self.log.fatal(
            "The following identifiers of sequences (%s) were "
            "found in the fragments file, but not in the "
            "fasta file ", ",".join(sorted(ids_to_process)))
        return 1
    return 0