Beispiel #1
0
    def _parse_tx_infos(self, gtf_path):
        """Parse transcript infos from a gzipped GTF file or load from cache

        If the file ``_tx_cache.bin`` exists in the current working
        directory, its unpickled content is returned and ``gtf_path`` is not
        read at all.  Otherwise every ``transcript`` record of the GTF file
        is converted into a ``TranscriptInfo``, the resulting list is
        pickled to ``_tx_cache.bin`` and then returned.

        Comment lines (starting with ``#``) and lines with fewer than three
        tab-separated columns (e.g. blank lines) are skipped.

        NOTE(review): the cache file name does not depend on ``gtf_path``,
        so a cache written for one GTF file is silently reused for any
        other one; remove ``_tx_cache.bin`` when switching annotations.

        :param gtf_path: path to the gzip-compressed GTF file
        :return: list of ``TranscriptInfo`` objects, or whatever object the
            cache file contains
        """
        cache_path = '_tx_cache.bin'
        if os.path.exists(cache_path):
            # NOTE: unpickling can execute arbitrary code, so the cache file
            # must come from a trusted source (normally an earlier run).
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        result = []
        with gzip.open(gtf_path, 'rt') as f:
            for i, line in enumerate(f):
                if i % 1000 == 0:
                    print('processed {}'.format(i), file=sys.stderr)
                if line.startswith('#'):
                    continue
                # Cheap pre-filter on the third column before paying for a
                # full parse; the length check keeps blank or truncated
                # lines from raising IndexError.
                fields = line.split('\t', 3)
                if len(fields) < 3 or fields[2] != 'transcript':
                    continue
                record = GTFFeature.parse(line)
                if record.feature != 'transcript':
                    continue
                attrs = record.attrs
                result.append(
                    TranscriptInfo(attrs['gene_id'],
                                   attrs['transcript_id'],
                                   attrs['transcript_type'],
                                   record.seqname,
                                   record.start,
                                   record.end))
        # Write the cache to a temporary file first and move it into place
        # afterwards, so that an interrupted run never leaves a truncated
        # cache file behind that would break every later load.
        tmp_path = '{}.{}.tmp'.format(cache_path, os.getpid())
        try:
            with open(tmp_path, 'wb') as g:
                pickle.dump(result, g)
            os.replace(tmp_path, cache_path)
        except BaseException:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise
        print(len(result), file=sys.stderr)
        return result
Beispiel #2
0
    def handle_match(self, linc_tx, match, nm):
        """Handle one match of a lincRNA against the genome

        For each match, look at all overlapping exons and consider them
        as candidate lincRNA-to-coding gene interactions.

        The tabix index is queried with the reference region covered by the
        match.  One ``OutputRecord`` line is printed to
        ``self.args.output_tsv`` for every returned feature that

        * is an ``exon``,
        * has ``transcript_type`` equal to ``protein_coding``, and
        * lies on the strand opposite to the match.

        A ``tabix.TabixError`` raised while querying is swallowed, in which
        case nothing (more) is written for this match.

        :param linc_tx: passed through unchanged to ``OutputRecord``
        :param match: alignment record from ``self.sam_file``; the
            attributes ``flag``, ``reference_id``, ``pos``,
            ``reference_end``, ``query_sequence`` and ``cigar`` are read
        :param nm: value convertible with ``int()``, passed on to
            ``OutputRecord`` (presumably the alignment's NM tag -- confirm
            against the caller)
        """
        # CIGAR operation codes counted as aligned length: indices 0, 1, 7
        # and 8 of the operation string 'MIDNSHP=X', i.e. M, I, = and X.
        # Codes outside that string are ignored rather than raising.
        aligned_ops = frozenset((0, 1, 7, 8))

        def ali_length(cigar):
            """Sum the lengths of all M/I/=/X operations of ``cigar``."""
            return sum(num for op, num in cigar if op in aligned_ops)

        # bit 16 (0x10) of the SAM flag marks a reverse-strand alignment
        match_strand = ('-' if match.flag & 16 else '+')
        # look for exons overlapping with the lincRNA match
        region = Region(self.sam_file.getrname(match.reference_id),
                        match.pos, match.reference_end)
        try:
            for arr in self.tabix.query(*region.to_tuple()):
                exon = GTFFeature.parse(arr=arr)
                if exon.feature != 'exon':
                    continue  # only exon records are considered
                if exon.attrs['transcript_type'] != 'protein_coding':
                    continue  # we are only interested in these
                if match_strand == exon.strand:
                    continue  # must be on different strands
                # NOTE(review): the classification result is not used
                # anywhere below.  The call itself is kept because
                # classify_overlap is defined outside this block and may
                # validate its arguments; drop it once confirmed pure.
                self.classify_overlap(
                    [region.start, region.end],
                    [exon.start, exon.end])
                window_5 = exon.get_5_prime_window(region)
                window_3 = exon.get_3_prime_window(region)
                classes = self.compute_classes(exon, window_5, window_3)

                # generate and write out OutputRecord
                out = OutputRecord(
                        region, exon, linc_tx, len(match.query_sequence),
                        ali_length(match.cigar), int(nm),
                        window_5, window_3, classes)
                print(out.to_tsv(), file=self.args.output_tsv)
        except tabix.TabixError:
            # Deliberate best effort: the region is probably on an unplaced
            # sequence that is missing from the tabix index.
            pass