コード例 #1
0
ファイル: extract_chimeras.py プロジェクト: granek/aimhii
    def listFromCIGAR(cls, cigarstring, position_b0, refname, strand):
        """Build read fragments from the match ("M") operations of a CIGAR.

        The CIGAR string is parsed with ``HTSeq.parse_cigar``. The first "M"
        operation creates a fragment via ``cls(...)``. A later "M" that comes
        directly after a "D" or "I" operation which itself directly follows an
        "M" extends the most recent fragment through its ``extend`` method.
        Any other additional "M" is only reported in the debug log and is
        otherwise ignored.

        Args:
            cigarstring: CIGAR string of the alignment.
            position_b0: Start position handed to ``HTSeq.parse_cigar``
                (presumably 0-based, judging by the name -- confirm).
            refname: Reference name handed to ``HTSeq.parse_cigar``.
            strand: Strand of the alignment. When it equals ``MINUS`` the
                CIGAR operations are processed in reverse order.

        Returns:
            List of ``cls`` instances built from the CIGAR (may be empty).
        """
        if strand == MINUS:  # need to reverse the CIGAR
            logger.debug("Reversing CIGAR for minus strand read fragment")
            # Raw string: "\d" in a plain literal is an invalid escape sequence.
            cigar_ops = re.findall(r"\d+[MIDNSHP=X]", cigarstring)
            cigarstring = "".join(reversed(cigar_ops))

        read_parts = []
        op_type_list = []
        for op in HTSeq.parse_cigar(cigarstring, position_b0, refname, strand):
            op_fields = (op, op.query_from, op.query_to, op.ref_iv)
            # Log a real list: a bare map() object has no useful repr on Python 3.
            logger.debug([str(field) for field in op_fields])
            if op.type == "M":
                frag_args = (op.query_from, op.query_to,
                             op.ref_iv.start, op.ref_iv.end,
                             op.ref_iv.chrom, strand)
                if "M" not in op_type_list:
                    logger.debug(
                        [str(field) for field in ("appending:",) + op_fields])
                    read_parts.append(cls(*frag_args))
                elif (len(op_type_list) >= 2
                      and op_type_list[-1] in ("D", "I")
                      and op_type_list[-2] == "M"):
                    # M-D-M and M-I-M are handled identically: the gap is
                    # bridged by growing the previous fragment.
                    label = "extending ({0}):".format(op_type_list[-1])
                    logger.debug(
                        [str(field) for field in (label,) + op_fields])
                    read_parts[-1].extend(*frag_args)
                else:
                    logger.debug("CIGAR WARNING: Number of matches > 1: {0}".format(cigarstring))
            op_type_list.append(op.type)
        return read_parts
コード例 #2
0
ファイル: htseq_extra.py プロジェクト: shouldsee/htseq_ext
    def fromCigar(cls,
                  cigar,
                  drop=None,
                  dropEmpty=None,
                  start=None,
                  chrom=None,
                  strand=None):
        """Build a ``GenomicIntervalDeque`` from a CIGAR string.

        ``cigar`` is parsed with ``HTSeq.parse_cigar``; operations whose type
        is contained in ``drop`` are discarded and the reference intervals
        (``ref_iv``) of the remaining operations are collected.

        Args:
            cigar: CIGAR string; must be a ``basestring`` instance.
            drop: Operation type codes to discard, tested with ``in``.
                ``None`` selects the default ``'DNS'``.
            dropEmpty: Accepted but not used by this method.
            start: Optional second positional argument for
                ``HTSeq.parse_cigar``.
            chrom: Optional third positional argument; only forwarded when
                ``start`` is also given.
            strand: Optional fourth positional argument; only forwarded when
                ``start`` and ``chrom`` are both given.

        Returns:
            ``GenomicIntervalDeque`` of the kept intervals, constructed with
            ``cigarstring=cigar``.
        """
        drop = 'DNS' if drop is None else drop
        assert isinstance(cigar, basestring), cigar

        # The optional values are positional for parse_cigar, so forwarding
        # stops at the first one that was left unset.
        parse_args = [cigar]
        for optional_arg in (start, chrom, strand):
            if optional_arg is None:
                break
            parse_args.append(optional_arg)

        kept_intervals = [
            operation.ref_iv
            for operation in HTSeq.parse_cigar(*parse_args)
            if operation.type not in drop
        ]
        return GenomicIntervalDeque(kept_intervals, cigarstring=cigar)
コード例 #3
0
ファイル: extract_chimeras.py プロジェクト: WenjiaS/aimhii
    def listFromCIGAR(cls, cigarstring, position_b0, refname, strand):
        """Convert a CIGAR string into a list of aligned read fragments.

        Operations come from ``HTSeq.parse_cigar``. Only "M" operations
        contribute: the first one is turned into a new ``cls`` fragment, and a
        subsequent "M" extends the last fragment (via ``extend``) when the two
        preceding operations are "M" followed by "D" or "I". Every other
        additional "M" is only mentioned in the debug log.

        Args:
            cigarstring: CIGAR string of the alignment.
            position_b0: Start position passed through to
                ``HTSeq.parse_cigar`` (name suggests 0-based -- confirm).
            refname: Reference name passed through to ``HTSeq.parse_cigar``.
            strand: Alignment strand; if equal to ``MINUS`` the order of the
                CIGAR operations is reversed before parsing.

        Returns:
            The list of fragments created from the CIGAR; empty if it has no
            "M" operation.
        """
        if strand == MINUS:  # need to reverse the CIGAR
            logger.debug("Reversing CIGAR for minus strand read fragment")
            # Raw string avoids the invalid "\d" escape in a normal literal.
            cigarstring = "".join(
                reversed(re.findall(r"\d+[MIDNSHP=X]", cigarstring)))

        def describe(*items):
            # Materialise the strings so the log shows values, not a lazy
            # map object (which is what map() yields on Python 3).
            return [str(item) for item in items]

        read_parts = []
        op_type_list = []
        for op in HTSeq.parse_cigar(cigarstring, position_b0, refname, strand):
            logger.debug(
                describe(op, op.query_from, op.query_to, op.ref_iv))
            if op.type == "M":
                bridges_indel = (len(op_type_list) >= 2
                                 and op_type_list[-1] in ("D", "I")
                                 and op_type_list[-2] == "M")
                if "M" not in op_type_list:
                    logger.debug(
                        describe("appending:", op, op.query_from,
                                 op.query_to, op.ref_iv))
                    suppl_frag = cls(op.query_from, op.query_to,
                                     op.ref_iv.start, op.ref_iv.end,
                                     op.ref_iv.chrom, strand)
                    read_parts.append(suppl_frag)
                elif bridges_indel:
                    # Deletions and insertions are treated the same way:
                    # the match after the indel grows the previous fragment.
                    logger.debug(
                        describe(
                            "extending ({0}):".format(op_type_list[-1]), op,
                            op.query_from, op.query_to, op.ref_iv))
                    read_parts[-1].extend(op.query_from, op.query_to,
                                          op.ref_iv.start, op.ref_iv.end,
                                          op.ref_iv.chrom, strand)
                else:
                    logger.debug(
                        "CIGAR WARNING: Number of matches > 1: {0}".format(
                            cigarstring))
            op_type_list.append(op.type)
        return read_parts
コード例 #4
0
def _resolve_cigar_indices(cigar_col, col_names):
    """Translate CIGAR column names into indices into a data row.

    Returns one entry per name in ``cigar_col``: the position of the name in
    ``col_names``, or ``None`` when the name is 'false' (any case) and is not
    an actual column, meaning no CIGAR is available for that side. Writes an
    error and exits with status 1 for any other unknown name.
    """
    indices = []
    for name in cigar_col:
        if name in col_names:
            indices.append(col_names.index(name))
        elif name.lower() == 'false':
            indices.append(None)
        else:
            sys.stderr.write(
                'Cigar col names doesn\'t exist in .pairs file!\n')
            raise SystemExit(1)
    return indices


def _cigar_ref_length(cigar):
    """Return the summed ``ref_iv.length`` of all operations in ``cigar``."""
    if cigar == '*':
        # A missing CIGAR is treated as a single matched base.
        cigar = '1M'
    return sum(op.ref_iv.length for op in HTSeq.parse_cigar(cigar))


def _annotate_side(ant, cols, chrom_col, pos_col, strand_col, cigar_idx,
                   min_over, strand_type):
    """Annotate one side of a pair and return ``annotate_region``'s result."""
    if cigar_idx is None:
        match_length = 1
    else:
        match_length = _cigar_ref_length(cols[cigar_idx])
    return annotate_region(ant, cols[chrom_col], int(cols[pos_col]),
                           cols[strand_col], match_length, min_over,
                           strand_type)


def annotate_pairs(pairs_path, ant, ant_mode, ant_col, strand_type, min_over,
                   cigar_col, output, **kwargs):
    """Append annotation column(s) to every record of a .pairs file.

    The header is read from the input, a new PG entry is added and the last
    header row (which must start with '#columns:') is extended with the new
    column name(s). Each body line is then written back with the annotation
    string(s) returned by ``annotate_region`` appended.

    Args:
        pairs_path: Input path; a falsy value reads from ``sys.stdin``.
        ant: Annotation object, passed unchanged to ``annotate_region``.
        ant_mode: 'rna' annotates side 1 only, 'dna' annotates side 2 only,
            any other value annotates both sides (case-insensitive). Only the
            exact mode 'both' adds every name of ``ant_col`` to the header;
            all other modes add just the first name.
        ant_col: Comma-separated name(s) of the new annotation column(s);
            none of them may already exist in the file.
        strand_type: Sequence of per-side strand codes, each one of 's', 'r'
            or 'n'; entry 0 is used for single-side modes.
        min_over: Comma-separated integer(s) forwarded to ``annotate_region``
            as the minimum overlap, one per annotated side.
        cigar_col: Comma-separated name(s) of the column(s) holding CIGAR
            strings, or 'false' to use a match length of 1 for that side.
        output: Output path; a falsy value writes to ``sys.stdout``.
        **kwargs: ``nproc_in``, ``cmd_in``, ``nproc_out`` and ``cmd_out`` are
            forwarded to ``_fileio.auto_open``.

    Raises:
        SystemExit: With status 1 when the header is missing or malformed, an
            annotation column already exists, a strand code is invalid, or a
            CIGAR column is unknown.
    """
    instream = (_fileio.auto_open(pairs_path,
                                  mode='r',
                                  nproc=kwargs.get('nproc_in'),
                                  command=kwargs.get('cmd_in', None))
                if pairs_path else sys.stdin)

    outstream = (_fileio.auto_open(output,
                                   mode='w',
                                   nproc=kwargs.get('nproc_out'),
                                   command=kwargs.get('cmd_out', None))
                 if output else sys.stdout)

    # try/finally so the streams are released on validation errors too.
    try:
        header, body_stream = _headerops.get_header(instream)
        header = _headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)

        # NOTE(review): this runs after append_new_pg; if that helper always
        # adds a row, an originally empty header is reported by the
        # '#columns:' check below instead -- confirm.
        if len(header) == 0:
            sys.stderr.write('.pairs file doesn\'t have header rows!\n')
            raise SystemExit(1)

        col_names = header[-1].split(' ')
        if col_names[0] != '#columns:':
            sys.stderr.write(
                'The last row of .pairs header is not a valid col_names row (start with \'#columns:\')!\n'
            )
            raise SystemExit(1)
        col_names.pop(0)

        ant_col = ant_col.split(',')
        if any(name in col_names for name in ant_col):
            sys.stderr.write(
                'Annotation col names already exist in .pairs file!\n')
            raise SystemExit(1)

        if any(code not in ('s', 'r', 'n') for code in strand_type):
            sys.stderr.write('Invalid strand specific type for annotation!\n')
            raise SystemExit(1)

        mode = ant_mode.lower()
        if mode == 'both':
            header[-1] = header[-1] + ' ' + ' '.join(ant_col)
        else:
            header[-1] = header[-1] + ' ' + ant_col[0]

        min_over = [int(i) for i in min_over.split(',')]

        # 'false' entries become None so they can be recognised per record;
        # looking 'false' up in col_names would raise ValueError.
        cigar_idx = _resolve_cigar_indices(cigar_col.split(','), col_names)

        sep = _pairsam_format.PAIRSAM_SEP
        side1 = (_pairsam_format.COL_C1, _pairsam_format.COL_P1,
                 _pairsam_format.COL_S1)
        side2 = (_pairsam_format.COL_C2, _pairsam_format.COL_P2,
                 _pairsam_format.COL_S2)

        outstream.writelines(l + '\n' for l in header)
        for count_line, line in enumerate(body_stream, 1):
            if count_line % 1000000 == 0:
                # Progress goes to stderr: stdout may be the data stream.
                sys.stderr.write("%d records processed ...\n" % count_line)

            record = line.rstrip()
            cols = record.split(sep)

            if mode == 'rna':
                ant_str = _annotate_side(ant, cols, side1[0], side1[1],
                                         side1[2], cigar_idx[0], min_over[0],
                                         strand_type[0])
            elif mode == 'dna':
                ant_str = _annotate_side(ant, cols, side2[0], side2[1],
                                         side2[2], cigar_idx[0], min_over[0],
                                         strand_type[0])
            else:
                first = _annotate_side(ant, cols, side1[0], side1[1],
                                       side1[2], cigar_idx[0], min_over[0],
                                       strand_type[0])
                second = _annotate_side(ant, cols, side2[0], side2[1],
                                        side2[2], cigar_idx[1], min_over[1],
                                        strand_type[1])
                ant_str = first + sep + second

            outstream.write(sep.join([record, ant_str]))
            outstream.write('\n')
    finally:
        if instream is not sys.stdin:
            instream.close()
        if outstream is not sys.stdout:
            outstream.close()