Beispiel #1
0
    def process_file(self, filename):
        """Print the ID of every sequence found in the given input file.

        The file is opened through ``open_anything`` and parsed as FASTA;
        each record passes through ``fasta.regexp_remapper`` together with
        ``self.options.sequence_id_regexp`` before its ID is printed.
        """
        self.log.info("Processing %s..." % filename)

        records = fasta.regexp_remapper(
            fasta.Parser(open_anything(filename)),
            self.options.sequence_id_regexp,
        )
        for record in records:
            print(record.id)
Beispiel #2
0
    def separate_sequences(self, table, sequences_file):
        """Split a FASTA sequence file into one FASTA file per cluster.

        Args:
            table: mapping from a cluster name to an iterable of the
                sequence IDs that belong to that cluster.
            sequences_file: FASTA input containing the sequences referenced
                by ``table``; it is opened through ``open_anything``.

        Each cluster is written to
        ``self.sequences_dir + os.path.sep + cluster_name + ".faa"``.

        Raises:
            KeyError: if ``table`` refers to a sequence ID that does not
                occur in ``sequences_file``.
        """
        reader = fasta.Parser(open_anything(sequences_file))
        # Index all the records by ID so that clusters can look them up.
        seqs = {seq.id: seq for seq in reader}

        for cluster_name, cluster_seqs in table.items():
            output_file_name = self.sequences_dir + os.path.sep + cluster_name
            # The context manager guarantees that the output file is closed
            # even when a lookup or a write fails half-way through a cluster.
            with open(output_file_name + ".faa", "w") as output_fd:
                writer = fasta.Writer(output_fd)
                for sequence in cluster_seqs:
                    # Re-wrap the sequence data under the ID used in the
                    # table; the two trailing fields are left empty.
                    obj = SeqRecord(seqs[sequence].seq, sequence, "", "")
                    writer.write(obj)
Beispiel #3
0
    def run_real(self):
        """Run the application.

        Three steps are performed in order:

        1. When ``self.options.sequences_file`` is set, the IDs of its
           sequences are collected in ``self.valid_sequence_ids`` and their
           lengths are summed in ``self.total_sequence_length``. Otherwise
           a ``complementerset`` is stored and the total length is ``None``.
        2. ``self.sources`` is built from the ``include_sources`` and
           ``exclude_sources`` options and logged.
        3. Every entry of ``self.args`` (``["-"]`` when empty) is passed to
           ``self.process_infile`` with a freshly created output formatter,
           which is finished afterwards.
        """
        opts = self.options

        # Step 1: valid sequence IDs and total sequence length
        if not opts.sequences_file:
            self.valid_sequence_ids = complementerset()
            self.total_sequence_length = None
        else:
            self.log.info("Loading sequences from %s..." %
                          opts.sequences_file)
            self.total_sequence_length = 0
            self.valid_sequence_ids = set()
            records = fasta.regexp_remapper(
                fasta.Parser(open_anything(opts.sequences_file)),
                opts.sequence_id_regexp)
            for record in records:
                self.valid_sequence_ids.add(record.id)
                self.total_sequence_length += len(record.seq)

        # Step 2: accepted sources
        if opts.include_sources:
            self.sources = set(opts.include_sources)
        else:
            self.sources = complementerset()
        self.sources.difference_update(opts.exclude_sources)

        if not isinstance(self.sources, complementerset):
            self.log.info("Accepted sources: %s" % ", ".join(self.sources))
        else:
            excluded = ", ".join(self.sources.iterexcluded())
            self.log.info("Ignored sources: %s" % excluded)

        # Step 3: process each input with its own formatter
        if not self.args:
            self.args = ["-"]

        for infile in self.args:
            formatter_class = (GenomeLevelOutputFormatter
                               if opts.print_totals
                               else SequenceLevelOutputFormatter)
            self.output_formatter = formatter_class(self)
            self.process_infile(infile)
            self.output_formatter.finish()
Beispiel #4
0
 def load_sequences_from_file(self, fname):
     """Read FASTA sequences from ``fname`` and load them.

     ``fname`` is handed to ``open_anything``; according to the original
     documentation it may be a file pointer or the name of a
     gzipped/bzipped file. The opened file is wrapped in a FASTA parser,
     which is passed to ``self.load_sequences``; that call's result is
     returned unchanged."""
     handle = open_anything(fname)
     records = fasta.Parser(handle)
     return self.load_sequences(records)
Beispiel #5
0
    def process_sequences_file(self, seq_file):
        """Extract the requested fragments of the sequences in a FASTA file.

        Every sequence of ``seq_file`` whose ID is a key of ``self.parts``
        is cut into the ``(left, right)`` fragments listed there. Each
        fragment is written to standard output and, when
        ``self.output_file`` is not ``None``, to that file as well.

        Coordinates are 1-based and inclusive. Negative coordinates count
        from the end of the sequence (``-1`` is the last position), and
        ``right`` is clamped to the sequence length. Fragments whose left
        end lies beyond their right end are skipped with a warning.

        When ``self.options.try_alternative_splicing`` is set, an ID that
        is not found in ``self.parts`` is retried after removing
        surrounding whitespace and one trailing ``".1"`` suffix.

        Args:
            seq_file: FASTA input, opened through ``open_anything``.

        Returns:
            0 when every ID of ``self.parts`` was seen in the FASTA file,
            1 otherwise (the missing IDs are logged).
        """
        self.log.info("Processing fasta file %s...", seq_file)

        parser = fasta.Parser(open_anything(seq_file))
        parser = fasta.regexp_remapper(parser, self.options.sequence_id_regexp)

        ids_to_process = set(self.parts.keys())

        # Standard output always receives the fragments; the optional
        # output file is an additional destination.
        writers = [fasta.FastWriter(sys.stdout)]
        output_fd = None
        try:
            if self.output_file is not None:
                output_fd = open(self.output_file, "w")
                writers.append(fasta.FastWriter(output_fd))

            for seq in parser:
                seq_id = seq.id
                if (seq_id not in self.parts
                        and self.options.try_alternative_splicing):
                    seq_id = seq_id.strip()
                    # Remove exactly one ".1" suffix. str.rstrip(".1")
                    # would strip any trailing run of '.' and '1'
                    # characters and mangle IDs such as "AT1G01011.1".
                    if seq_id.endswith(".1"):
                        seq_id = seq_id[:-2]
                if seq_id not in self.parts:
                    continue

                sequence = seq.seq
                length_seq = len(sequence)
                # discard() instead of remove(): two records may resolve
                # to the same ID (a duplicate, or "X" and "X.1"), and the
                # second one must not raise KeyError.
                ids_to_process.discard(seq_id)

                for left, right in self.parts[seq_id]:
                    # Translate end-relative coordinates to 1-based ones
                    if left < 0:
                        left = length_seq + left + 1
                    if right < 0:
                        right = length_seq + right + 1

                    # just in case...
                    right = min(right, length_seq)

                    if left > right:
                        # again, just in case
                        self.log.warning(
                            "Problem with fragment of %s, "
                            "the right part is smaller than "
                            "the left", seq_id)
                        continue

                    if left == 1 and right == length_seq:
                        # The fragment spans the whole sequence, so the
                        # record is asked to produce it itself.
                        new_record = seq.fragment(not self.options.keep_ids)
                    else:
                        if not self.options.keep_ids:
                            new_id = "%s:%d-%d" % (seq_id, left, right)
                        else:
                            new_id = seq_id
                        new_record = SeqRecord(sequence[(left - 1):right],
                                               id=new_id,
                                               name=seq.name,
                                               description="")

                    for writer in writers:
                        writer.write(new_record)
        finally:
            # Close the output file even if parsing or writing failed
            if output_fd is not None:
                output_fd.close()

        if ids_to_process:
            self.log.fatal(
                "The following identifiers of sequences (%s) were "
                "found in the fragments file, but not in the "
                "fasta file ", ",".join(ids_to_process))
            return 1
        return 0