Example #1
0
    def parse_reference_output(self):
        """Reads and processes DIAMOND tabular output of the first DIAMOND
        search.

        Note: this function finds query sequences similar to reference
        proteins. Since a query sequence may have more than one area of
        similarity (for instance, in fusion proteins of two subunits or
        in multi-domain proteins), it will try to find as many such areas
        as possible.

        DIAMOND hits are filtered in two steps: rows whose alignment length
        is below the length cutoff are dropped as they are read, and every
        per-query hit list is passed through filter_list_by_identity after
        overlapping hits have been removed and the hits annotated.

        Rows are grouped by consecutive runs of the same query ID. If a
        query ID re-appeared later in the file, its entry in 'reads' would
        be replaced by the later group.

        This function does not return anything. Instead, it populates
        'reads' dictionary with AnnotatedRead objects keyed by query ID.

        """
        tsvfile = os.path.join(
            self.options.get_project_dir(self.sample.sample_id),
            self.sample.sample_id + '_' + self.end + '_' + self.options.ref_output_name
        )
        # TODO: cleanup identity_cutoff = self.config.get_identity_cutoff(self.collection)
        length_cutoff = self.config.get_length_cutoff(self.collection)
        print('Length cutoff:', length_cutoff)
        # The overlap cutoff does not depend on the row, so look it up once.
        overlap_cutoff = self.config.get_overlap_cutoff(self.collection)

        def store_hits(read_id, collected_hits):
            """Filter and annotate hits of one query, keep the read if any survive."""
            # filtering: remove overlapping hits
            collected_hits.filter_list(overlap_cutoff)
            # assign functions to hits, then filter by identity
            collected_hits.annotate_hits(self.ref_data)
            collected_hits.filter_list_by_identity(self.ref_data)
            # if any hits left, populate reads dictionary
            if collected_hits.hits_number != 0:
                read = AnnotatedRead(read_id)
                read.hit_list = collected_hits
                self.reads[read_id] = read

        current_sequence_read_id = ''
        hit_list = DiamondHitList(current_sequence_read_id)
        with open(tsvfile, 'r', newline='') as infile:
            tsvin = csv.reader(infile, delimiter='\t')
            for row in tsvin:
                if not row:
                    # csv.reader yields an empty list for a blank line;
                    # indexing row[0] below would raise IndexError
                    continue
                hit = DiamondHit()
                row[0], _ = parse_fastq_seqid(row[0])
                hit.create_hit(row)
                # filtering by length
                if hit.length < length_cutoff:
                    continue  # go to next hit

                if hit.query_id != current_sequence_read_id:
                    # when new query ID reached, process collected hits,
                    # then start over with new query identifier.
                    # NOTE: on the first accepted row this runs for the
                    # initial placeholder list, to which no hit was added.
                    store_hits(current_sequence_read_id, hit_list)
                    current_sequence_read_id = hit.query_id
                    hit_list = DiamondHitList(current_sequence_read_id)
                hit_list.add_hit(hit)
            # when EOF reached, process collected hits
            store_hits(current_sequence_read_id, hit_list)
Example #2
0
    def parse_reference_output(self):
        """Reads and processes DIAMOND tabular output of the preselection
        DIAMOND search.

        Note: this function finds query sequences similar to reference
        proteins. Since a query sequence may have more than one area of
        similarity (for instance, in fusion proteins of two subunits or
        in multi-domain proteins), it will try to find as many such areas
        as possible.

        DIAMOND hits are filtered by two parameters: length of alignment
        and amino acid identity %, both obtained from the project config
        for the current collection. Hits that pass are grouped by
        consecutive runs of the same query (gene) ID, and overlapping hits
        within each group are removed.

        This function does not return anything. For every gene with at
        least one remaining hit, the annotated hit list is stored in the
        'hit_list' attribute of the corresponding gene in the assembly.
        """
        tsvfile = os.path.join(self.assembly_dir,
                               'all_contigs_' + self.project.options.ref_output_name)
        # All cutoffs belong to the same collection; resolve it once
        # instead of on every query change.
        collection = self.project.options.get_collection()
        identity_cutoff = self.project.config.get_identity_cutoff(collection)
        length_cutoff = self.project.config.get_length_cutoff(collection)
        print('Parse reference output: Identity cutoff: ',
              identity_cutoff,
              ', Length cutoff: ',
              length_cutoff)
        overlap_cutoff = self.project.config.get_overlap_cutoff(collection)

        def attach_hits(gene_id, collected_hits):
            """Filter hits of one gene and attach the survivors to that gene."""
            # filter list for overlapping hits
            collected_hits.filter_list(overlap_cutoff)
            if collected_hits.hits_number != 0:
                # annotate_hits
                collected_hits.annotate_hits(self.project.ref_data)
                # the gene ID encodes which function and contig hold the gene
                function_id, contig_id, _ = parse_gene_id(gene_id)
                contig = self.assembly.contigs[function_id][contig_id]
                contig.genes[gene_id].hit_list = collected_hits

        current_id = ''
        hit_list = DiamondHitList(current_id)
        with open(tsvfile, 'r', newline='') as infile:
            tsvin = csv.reader(infile, delimiter='\t')
            for row in tsvin:
                if not row:
                    # csv.reader yields an empty list for a blank line,
                    # which cannot describe a hit
                    continue
                hit = DiamondHit()
                hit.create_hit(row)
                # filtering by identity and length
                if hit.identity < identity_cutoff or hit.length < length_cutoff:
                    continue  # skip this line

                if hit.query_id != current_id:
                    # new query reached: finish the previous one, then start
                    # collecting hits for the new identifier
                    attach_hits(current_id, hit_list)
                    current_id = hit.query_id
                    hit_list = DiamondHitList(current_id)
                hit_list.add_hit(hit)
            # when EOF reached, process hits collected for the last query
            attach_hits(current_id, hit_list)