Example #1
0
    def process_file(self, filename):
        """Print the ID of every sequence read from ``filename``.

        The input is opened with ``open_anything``, parsed by
        ``fasta.Parser`` and wrapped by ``fasta.regexp_remapper`` with
        ``self.options.sequence_id_regexp`` before the IDs are printed to
        standard output, one per line.
        """
        self.log.info("Processing %s..." % filename)

        raw_records = fasta.Parser(open_anything(filename))
        records = fasta.regexp_remapper(
            raw_records, self.options.sequence_id_regexp
        )
        for record in records:
            print(record.id)
Example #2
0
    def process_file(self, filename):
        """Print the ID of every sequence read from ``filename``.

        The input is opened with ``open_anything``, parsed by
        ``fasta.Parser`` and wrapped by ``fasta.regexp_remapper`` with
        ``self.options.sequence_id_regexp`` before the IDs are printed to
        standard output, one per line.
        """
        self.log.info("Processing %s..." % filename)

        parser = fasta.Parser(open_anything(filename))
        parser = fasta.regexp_remapper(parser, self.options.sequence_id_regexp)
        for seq in parser:
            # Function-call form: valid on Python 3 and, for a single
            # argument, prints the same text on Python 2.
            print(seq.id)
Example #3
0
 def process_sequences_file(self, fname):
     """Fill ``self.seq_ids_to_length`` with ``{sequence id: length}``.

     The mapping is reset to an empty dict, then every record parsed from
     ``fname`` (after ``fasta.regexp_remapper`` has been applied with
     ``self.sequence_id_regexp``) contributes one entry.
     """
     self.log.info("Loading sequences from %s..." % fname)
     self.seq_ids_to_length = {}
     raw_records = fasta.Parser(open_anything(fname))
     records = fasta.regexp_remapper(raw_records, self.sequence_id_regexp)
     # dict.update consumes the pairs one at a time, so entries are added
     # incrementally while the file is being read.
     self.seq_ids_to_length.update(
         (record.id, len(record.seq)) for record in records
     )
Example #4
0
    def load_sequences(self, seq_file):
        """Read a FASTA file into ``self.seqs``, keyed by sequence ID.

        Records come from ``fasta.Parser`` wrapped by
        ``fasta.regexp_remapper`` (using ``self.options.sequence_id_regexp``).
        If two records share an ID, the later one replaces the earlier one.
        """
        self.log.info("Loading sequences from %s..." % seq_file)

        raw_records = fasta.Parser(open_anything(seq_file))
        records = fasta.regexp_remapper(
            raw_records, self.options.sequence_id_regexp
        )

        self.seqs = {record.id: record for record in records}
Example #5
0
 def process_sequences_file_old(self, fname):
     """Old in-memory loader: map every sequence ID to its length.

     All entries are held in memory. The finished mapping is stored in
     ``self.seq_ids_to_length`` only after the whole file has been read.
     Progress is logged whenever the zero-based record index is a
     multiple of 1000000 (which includes the very first record).
     """
     self.log.info("Loading sequences from %s..." % fname)
     raw_records = fasta.Parser(open_anything(fname))
     records = fasta.regexp_remapper(raw_records, self.sequence_id_regexp)

     lengths_by_id = {}
     for index, record in enumerate(records):
         lengths_by_id[record.id] = len(record.seq)
         if index % 1000000 == 0:
             self.log.info("Read {} seqs".format(index))

     self.log.info("...loaded")
     self.seq_ids_to_length = lengths_by_id
Example #6
0
    def run_real(self):
        """Run the application over every input in ``self.args``.

        In order:

        1. Build ``self.valid_sequence_ids`` and
           ``self.total_sequence_length`` from
           ``self.options.sequences_file`` when one is given.
        2. Build ``self.sources`` from the include/exclude options and log
           the outcome.
        3. Default ``self.args`` to ``["-"]`` when it is empty.
        4. For each argument, create a fresh output formatter, call
           ``self.process_infile`` and finish the formatter.
        """

        # Step 1: valid sequence IDs and their total length
        if not self.options.sequences_file:
            # No sequences file: presumably complementerset() accepts
            # every ID -- confirm against its definition.
            self.valid_sequence_ids = complementerset()
            self.total_sequence_length = None
        else:
            self.log.info("Loading sequences from %s..." %
                          self.options.sequences_file)
            self.total_sequence_length = 0
            self.valid_sequence_ids = set()
            raw_records = fasta.Parser(
                open_anything(self.options.sequences_file))
            records = fasta.regexp_remapper(
                raw_records, self.options.sequence_id_regexp)
            for record in records:
                self.valid_sequence_ids.add(record.id)
                self.total_sequence_length += len(record.seq)

        # Step 2: which sources are allowed
        if self.options.include_sources:
            self.sources = set(self.options.include_sources)
        else:
            self.sources = complementerset()
        self.sources.difference_update(self.options.exclude_sources)

        if isinstance(self.sources, complementerset):
            summary = ("Ignored sources: %s" %
                       ", ".join(self.sources.iterexcluded()))
        else:
            summary = "Accepted sources: %s" % ", ".join(self.sources)
        self.log.info(summary)

        # Step 3: fall back to "-" when no input was named
        if not self.args:
            self.args = ["-"]

        # Step 4: one formatter per input
        for infile in self.args:
            if self.options.print_totals:
                formatter_class = GenomeLevelOutputFormatter
            else:
                formatter_class = SequenceLevelOutputFormatter
            self.output_formatter = formatter_class(self)

            self.process_infile(infile)
            self.output_formatter.finish()
Example #7
0
    def run_real(self):
        """Prepare the sequence and source filters, then process the inputs.

        ``self.valid_sequence_ids`` / ``self.total_sequence_length`` are
        derived from ``self.options.sequences_file`` when it is set;
        ``self.sources`` is derived from ``include_sources`` minus
        ``exclude_sources``. Each entry of ``self.args`` (``["-"]`` when
        empty) is then handled by ``self.process_infile`` with its own
        output formatter, which is finished right after.
        """

        # Sequence ID filter (only restricted when a sequences file is given)
        sequences_file = self.options.sequences_file
        if sequences_file:
            self.log.info("Loading sequences from %s..." % self.options.sequences_file)

            self.total_sequence_length = 0
            self.valid_sequence_ids = set()
            fasta_records = fasta.Parser(open_anything(self.options.sequences_file))
            remapped = fasta.regexp_remapper(fasta_records, self.options.sequence_id_regexp)
            for entry in remapped:
                self.valid_sequence_ids.add(entry.id)
                self.total_sequence_length += len(entry.seq)
        else:
            self.valid_sequence_ids = complementerset()
            self.total_sequence_length = None

        # Source filter: explicit include list, or a complementerset when
        # no include list was given; excluded sources are removed either way
        self.sources = (
            set(self.options.include_sources)
            if self.options.include_sources
            else complementerset()
        )
        self.sources.difference_update(self.options.exclude_sources)

        if isinstance(self.sources, complementerset):
            message = "Ignored sources: %s" % ", ".join(self.sources.iterexcluded())
        else:
            message = "Accepted sources: %s" % ", ".join(self.sources)
        self.log.info(message)

        if not self.args:
            self.args = ["-"]

        for input_name in self.args:
            # A new formatter is created for every input
            formatter_type = (
                GenomeLevelOutputFormatter
                if self.options.print_totals
                else SequenceLevelOutputFormatter
            )
            self.output_formatter = formatter_type(self)
            # Process the input, then emit its results
            self.process_infile(input_name)
            self.output_formatter.finish()
Example #8
0
    def process_sequences_file(self, fname):
        """Store ``{sequence id: length}`` for ``fname`` in an on-disk shelf.

        In this version we use `shelve` to save memory: the pairs
        (sequence ID, length) are stored in a temporary database instead of
        an in-memory dict. See `process_sequences_file_old` for the old
        version.

        Side effects:
            ``self.filename_shelve`` is set to ``<tempdir>/shelve_file`` and
            ``self.seq_ids_to_length`` to the open shelf. The shelf is left
            open; this method does not close it.
        """
        self.log.info("Loading sequences from {}...".format(fname))
        parser = fasta.Parser(open_anything(fname))
        parser = fasta.regexp_remapper(parser,
                                       self.sequence_id_regexp)
        self.filename_shelve = os.path.join(tempfile.gettempdir(),
                                            "shelve_file")
        # The file name is fixed, so a database left behind by a previous
        # run may already exist. flag="n" always starts from a new, empty
        # database; the default flag "c" would reopen the old one and keep
        # its stale entries in the mapping.
        self.seq_ids_to_length = shelve.open(self.filename_shelve, flag="n")

        for i, seq in enumerate(parser):
            self.seq_ids_to_length[seq.id] = len(seq.seq)
            if i % 1000000 == 0:
                # Periodic progress report and flush to disk
                self.log.info("Read {} seqs".format(i))
                self.seq_ids_to_length.sync()
        self.log.info("...loaded")
Example #9
0
    def process_sequences_file(self, seq_file):
        """Extract the requested fragments of each sequence as FASTA records.

        Reads ``seq_file`` record by record. For every record whose ID is a
        key of ``self.parts``, each ``(left, right)`` pair stored under that
        ID is written to standard output and, when ``self.output_file`` is
        not ``None``, to that file as well.

        Coordinates are 1-based and inclusive. A negative coordinate counts
        from the end of the sequence (``-1`` is the last position), and
        ``right`` is clamped to the sequence length.

        When ``self.options.try_alternative_splicing`` is set, an ID that is
        not in ``self.parts`` is retried with surrounding whitespace and a
        trailing ``".1"`` suffix removed.

        Returns:
            ``1`` if some IDs of ``self.parts`` were never found in the
            FASTA file (a fatal message is logged), ``None`` otherwise.
        """
        self.log.info("Processing fasta file %s..." % seq_file)

        parser = fasta.Parser(open_anything(seq_file))
        parser = fasta.regexp_remapper(parser,
            self.options.sequence_id_regexp)

        ids_to_process = set(self.parts.keys())

        writer = fasta.FastWriter(sys.stdout)
        output_fd = None
        writer_file = None

        try:
            if self.output_file is not None:
                output_fd = open(self.output_file, "w")
                writer_file = fasta.FastWriter(output_fd)

            for seq in parser:
                seq_id = seq.id
                if seq_id not in self.parts:
                    if not self.options.try_alternative_splicing:
                        continue
                    # Remove exactly one ".1" suffix. str.rstrip(".1") would
                    # strip every trailing "." and "1" character instead
                    # (e.g. "AT1G01011.1" -> "AT1G0").
                    seq_id = seq_id.strip()
                    if seq_id.endswith(".1"):
                        seq_id = seq_id[:-2]
                    if seq_id not in self.parts:
                        continue

                sequence = seq.seq
                length_seq = len(sequence)
                # discard, not remove: the same ID can be matched more than
                # once (duplicate records, or "X" and "X.1" both present).
                ids_to_process.discard(seq_id)

                for left, right in self.parts[seq_id]:

                    # Negative coordinates are offsets from the end
                    if left < 0:
                        left = length_seq + left + 1
                    if right < 0:
                        right = length_seq + right + 1

                    # just in case...
                    right = min(right, length_seq)

                    if left > right:
                        # again, just in case
                        self.log.warning("Problem with fragment of %s, "
                            "the right part is smaller than the left" % seq_id)
                        continue

                    if left == 1 and right == length_seq:
                        # The fragment spans the whole sequence.
                        # NOTE(review): the argument presumably tells
                        # fragment() whether to rewrite the ID -- confirm.
                        new_record = seq.fragment(not self.options.keep_ids)
                    else:
                        if not self.options.keep_ids:
                            new_id = "%s:%d-%d" % (seq_id, left, right)
                        else:
                            new_id = seq_id
                        new_record = SeqRecord(sequence[(left-1):right],
                                id=new_id, name=seq.name, description="")
                    writer.write(new_record)
                    if writer_file is not None:
                        writer_file.write(new_record)
        finally:
            # Close the output file even if parsing or writing failed
            if output_fd is not None:
                output_fd.close()

        if ids_to_process:
            self.log.fatal("The following identifiers of sequences (%s) were "
                    "found in the fragments file, but not in the fasta file"
                    % ",".join(ids_to_process))
            return 1
Example #10
0
    def process_sequences_file(self, seq_file):
        """Extract the requested fragments of each sequence as FASTA records.

        Reads ``seq_file`` record by record. For every record whose ID is a
        key of ``self.parts``, each ``(left, right)`` pair stored under that
        ID is written to standard output and, when ``self.output_file`` is
        not ``None``, to that file as well.

        Coordinates are 1-based and inclusive. A negative coordinate counts
        from the end of the sequence (``-1`` is the last position), and
        ``right`` is clamped to the sequence length.

        When ``self.options.try_alternative_splicing`` is set, an ID that is
        not in ``self.parts`` is retried with surrounding whitespace and a
        trailing ``".1"`` suffix removed.

        Returns:
            ``1`` if some IDs of ``self.parts`` were never found in the
            FASTA file (a fatal message is logged), ``0`` otherwise.
        """
        self.log.info("Processing fasta file %s...", seq_file)

        parser = fasta.Parser(open_anything(seq_file))
        parser = fasta.regexp_remapper(parser, self.options.sequence_id_regexp)

        ids_to_process = set(self.parts.keys())

        writer = fasta.FastWriter(sys.stdout)
        output_fd = None
        writer_file = None

        try:
            if self.output_file is not None:
                output_fd = open(self.output_file, "w")
                writer_file = fasta.FastWriter(output_fd)

            for seq in parser:
                seq_id = seq.id
                if seq_id not in self.parts:
                    if not self.options.try_alternative_splicing:
                        continue
                    # Remove exactly one ".1" suffix. str.rstrip(".1") would
                    # strip every trailing "." and "1" character instead
                    # (e.g. "AT1G01011.1" -> "AT1G0").
                    seq_id = seq_id.strip()
                    if seq_id.endswith(".1"):
                        seq_id = seq_id[:-2]
                    if seq_id not in self.parts:
                        continue

                sequence = seq.seq
                length_seq = len(sequence)
                # discard, not remove: the same ID can be matched more than
                # once (duplicate records, or "X" and "X.1" both present).
                ids_to_process.discard(seq_id)

                for left, right in self.parts[seq_id]:

                    # Negative coordinates are offsets from the end
                    if left < 0:
                        left = length_seq + left + 1
                    if right < 0:
                        right = length_seq + right + 1

                    # just in case...
                    right = min(right, length_seq)

                    if left > right:
                        # again, just in case
                        self.log.warning(
                            "Problem with fragment of %s, "
                            "the right part is smaller than "
                            "the left", seq_id)
                        continue

                    if left == 1 and right == length_seq:
                        # The fragment spans the whole sequence.
                        # NOTE(review): the argument presumably tells
                        # fragment() whether to rewrite the ID -- confirm.
                        new_record = seq.fragment(not self.options.keep_ids)
                    else:
                        if not self.options.keep_ids:
                            new_id = "%s:%d-%d" % (seq_id, left, right)
                        else:
                            new_id = seq_id
                        new_record = SeqRecord(sequence[(left - 1):right],
                                               id=new_id,
                                               name=seq.name,
                                               description="")
                    writer.write(new_record)
                    if writer_file is not None:
                        writer_file.write(new_record)
        finally:
            # Close the output file even if parsing or writing failed
            if output_fd is not None:
                output_fd.close()

        if ids_to_process:
            self.log.fatal(
                "The following identifiers of sequences (%s) were "
                "found in the fragments file, but not in the "
                "fasta file ", ",".join(ids_to_process))
            return 1
        return 0