def listFromCIGAR(cls, cigarstring, position_b0, refname, strand):
    """Build read fragments from the match ("M") operations of a CIGAR string.

    The CIGAR is parsed with ``HTSeq.parse_cigar`` and walked operation by
    operation:

    * the first "M" operation creates a new fragment via
      ``cls(query_from, query_to, ref_start, ref_end, chrom, strand)``;
    * a later "M" that directly follows an "M" + "D" or "M" + "I" pair
      extends the most recent fragment through its ``extend`` method;
    * any other additional "M" is skipped and only reported at debug level.

    Args:
        cls: Fragment class (or factory) called with the six values above.
        cigarstring: CIGAR string of the alignment.
        position_b0: Reference start position handed to ``HTSeq.parse_cigar``
            (presumably 0-based, judging by the name -- confirm with callers).
        refname: Reference/chromosome name handed to ``HTSeq.parse_cigar``.
        strand: Strand of the alignment; when it equals ``MINUS`` the order
            of the CIGAR operations is reversed before parsing.

    Returns:
        list: The fragments created from the CIGAR (empty if it contains no
        "M" operation).
    """
    if strand == MINUS:
        # need to reverse the CIGAR
        logger.debug("Reversing CIGAR for minus strand read fragment")
        # Raw string: "\d" in a normal literal is an invalid escape sequence.
        cigarstring = "".join(
            reversed(re.findall(r"\d+[MIDNSHP=X]", cigarstring)))

    read_parts = []
    op_type_list = []
    for op in HTSeq.parse_cigar(cigarstring, position_b0, refname, strand):
        # Values are passed as lazy %-args: logging a bare map() object would
        # only print its repr on Python 3, and the string conversion is now
        # skipped entirely when debug logging is disabled.
        logger.debug("%s %s %s %s",
                     op, op.query_from, op.query_to, op.ref_iv)
        if op.type == "M":
            fragment_args = (op.query_from, op.query_to, op.ref_iv.start,
                             op.ref_iv.end, op.ref_iv.chrom, strand)
            if "M" not in op_type_list:
                logger.debug("appending: %s %s %s %s",
                             op, op.query_from, op.query_to, op.ref_iv)
                read_parts.append(cls(*fragment_args))
            elif op_type_list[-2:] in (["M", "D"], ["M", "I"]):
                # A match separated from the previous match by a single
                # deletion/insertion belongs to the same fragment.
                logger.debug("extending (%s): %s %s %s %s",
                             op_type_list[-1], op, op.query_from,
                             op.query_to, op.ref_iv)
                read_parts[-1].extend(*fragment_args)
            else:
                logger.debug("CIGAR WARNING: Number of matches > 1: %s",
                             cigarstring)
        op_type_list.append(op.type)
    return read_parts
def fromCigar(cls, cigar, drop=None, dropEmpty=None, start=None, chrom=None,
              strand=None):
    """Create a ``GenomicIntervalDeque`` from the operations of a CIGAR string.

    The CIGAR is parsed with ``HTSeq.parse_cigar``; operations whose type is
    listed in ``drop`` are discarded and the reference intervals of the
    remaining operations are collected into the deque.

    Args:
        cls: Unused by the body; present because of the method signature.
        cigar: CIGAR string to parse.
        drop: Container (by default the string ``'DNS'``) of operation type
            codes to discard.
        dropEmpty: Currently unused; kept for backward compatibility.
        start: Optional reference start position for the first operation.
        chrom: Optional chromosome/reference name for the intervals.
        strand: Optional strand for the intervals.

    Returns:
        GenomicIntervalDeque: Built from the kept operations' ``ref_iv``
        intervals, with ``cigarstring`` set to the input CIGAR.

    Raises:
        AssertionError: If ``cigar`` is not a string.
    """
    if drop is None:
        drop = 'DNS'

    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3 has no ``basestring``
    assert isinstance(cigar, string_types), cigar

    # Pass the optional values by keyword. Appending them positionally shifts
    # them into the wrong slot whenever an earlier one is omitted (e.g. chrom
    # given without start would be taken as the start position).
    parse_kwargs = {}
    if start is not None:
        parse_kwargs['ref_left'] = start
    if chrom is not None:
        parse_kwargs['chrom'] = chrom
    if strand is not None:
        parse_kwargs['strand'] = strand

    kept_ops = [co for co in HTSeq.parse_cigar(cigar, **parse_kwargs)
                if co.type not in drop]
    return GenomicIntervalDeque([co.ref_iv for co in kept_ops],
                                cigarstring=cigar)
def listFromCIGAR(cls, cigarstring, position_b0, refname, strand):
    """Build read fragments from the match ("M") operations of a CIGAR string.

    The CIGAR is parsed with ``HTSeq.parse_cigar`` and walked operation by
    operation:

    * the first "M" operation creates a new fragment via
      ``cls(query_from, query_to, ref_start, ref_end, chrom, strand)``;
    * a later "M" that directly follows an "M" + "D" or "M" + "I" pair
      extends the most recent fragment through its ``extend`` method;
    * any other additional "M" is skipped and only reported at debug level.

    Args:
        cls: Fragment class (or factory) called with the six values above.
        cigarstring: CIGAR string of the alignment.
        position_b0: Reference start position handed to ``HTSeq.parse_cigar``
            (presumably 0-based, judging by the name -- confirm with callers).
        refname: Reference/chromosome name handed to ``HTSeq.parse_cigar``.
        strand: Strand of the alignment; when it equals ``MINUS`` the order
            of the CIGAR operations is reversed before parsing.

    Returns:
        list: The fragments created from the CIGAR (empty if it contains no
        "M" operation).
    """
    if strand == MINUS:
        # need to reverse the CIGAR
        logger.debug("Reversing CIGAR for minus strand read fragment")
        # Raw string: "\d" in a normal literal is an invalid escape sequence.
        cigarstring = "".join(
            reversed(re.findall(r"\d+[MIDNSHP=X]", cigarstring)))

    read_parts = []
    op_type_list = []
    for op in HTSeq.parse_cigar(cigarstring, position_b0, refname, strand):
        # Values are passed as lazy %-args: logging a bare map() object would
        # only print its repr on Python 3, and the string conversion is now
        # skipped entirely when debug logging is disabled.
        logger.debug("%s %s %s %s",
                     op, op.query_from, op.query_to, op.ref_iv)
        if op.type == "M":
            fragment_args = (op.query_from, op.query_to, op.ref_iv.start,
                             op.ref_iv.end, op.ref_iv.chrom, strand)
            if "M" not in op_type_list:
                logger.debug("appending: %s %s %s %s",
                             op, op.query_from, op.query_to, op.ref_iv)
                read_parts.append(cls(*fragment_args))
            elif op_type_list[-2:] in (["M", "D"], ["M", "I"]):
                # A match separated from the previous match by a single
                # deletion/insertion belongs to the same fragment.
                logger.debug("extending (%s): %s %s %s %s",
                             op_type_list[-1], op, op.query_from,
                             op.query_to, op.ref_iv)
                read_parts[-1].extend(*fragment_args)
            else:
                logger.debug("CIGAR WARNING: Number of matches > 1: %s",
                             cigarstring)
        op_type_list.append(op.type)
    return read_parts
def _cigar_ref_length(cols, cigar_index):
    """Return the summed reference length of the CIGAR in ``cols[cigar_index]``.

    ``cigar_index`` is ``None`` when CIGAR use was disabled ('false'); the
    length is then fixed to 1. A missing CIGAR ('*') is treated as '1M'.
    """
    if cigar_index is None:
        return 1
    cigar = cols[cigar_index]
    if cigar == '*':
        cigar = '1M'
    return sum(op.ref_iv.length for op in HTSeq.parse_cigar(cigar))


def _annotate_side(ant, cols, chrom_col, pos_col, strand_col, cigar_index,
                   min_over, strand_type):
    """Annotate one side of a pair record via ``annotate_region``."""
    return annotate_region(ant, cols[chrom_col], int(cols[pos_col]),
                           cols[strand_col],
                           _cigar_ref_length(cols, cigar_index),
                           min_over, strand_type)


def annotate_pairs(pairs_path, ant, ant_mode, ant_col, strand_type, min_over,
                   cigar_col, output, **kwargs):
    """Append annotation column(s) to every record of a .pairs stream.

    The header is read, extended with a PG entry and with the new column
    name(s), and written out. Each body record is then annotated through
    ``annotate_region`` for side 1 (``ant_mode`` 'rna'), side 2 ('dna') or
    both sides (any other value), and written with the annotation appended.

    Args:
        pairs_path: Input path; a falsy value reads from ``sys.stdin``.
        ant: Annotation object forwarded unchanged to ``annotate_region``.
        ant_mode: 'rna', 'dna' or 'both' (case-insensitive).
        ant_col: Comma-separated name(s) of the annotation column(s) to add.
        strand_type: Sequence of per-side strand modes, each one of
            's', 'r' or 'n'.
        min_over: Comma-separated integer(s) forwarded per side to
            ``annotate_region``.
        cigar_col: Comma-separated name(s) of the column(s) holding CIGAR
            strings; 'false' disables CIGAR use for that side (match length
            is then 1).
        output: Output path; a falsy value writes to ``sys.stdout``.
        **kwargs: ``nproc_in``, ``cmd_in``, ``nproc_out`` and ``cmd_out`` are
            forwarded to ``_fileio.auto_open``.

    Raises:
        SystemExit: With status 1 when the header is missing or malformed,
            an annotation column already exists, a strand type is invalid,
            or a CIGAR column is unknown.
    """
    instream = (_fileio.auto_open(pairs_path, mode='r',
                                  nproc=kwargs.get('nproc_in'),
                                  command=kwargs.get('cmd_in', None))
                if pairs_path else sys.stdin)
    outstream = (_fileio.auto_open(output, mode='w',
                                   nproc=kwargs.get('nproc_out'),
                                   command=kwargs.get('cmd_out', None))
                 if output else sys.stdout)

    # try/finally so the streams are also closed on validation failures and
    # on errors raised while processing records.
    try:
        header, body_stream = _headerops.get_header(instream)
        header = _headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
        if not header:
            sys.stderr.write('.pairs file doesn\'t have header rows!\n')
            raise SystemExit(1)

        col_names = header[-1].split(' ')
        if col_names[0] != '#columns:':
            sys.stderr.write(
                'The last row of .pairs header is not a valid col_names row (start with \'#columns:\')!\n'
            )
            raise SystemExit(1)
        # Drop the '#columns:' tag so list positions equal record columns.
        col_names.pop(0)

        ant_col = ant_col.split(',')
        if any(name in col_names for name in ant_col):
            sys.stderr.write(
                'Annotation col names already exist in .pairs file!\n')
            raise SystemExit(1)

        if any(kind not in ('s', 'r', 'n') for kind in strand_type):
            sys.stderr.write('Invalid strand specific type for annotation!\n')
            raise SystemExit(1)

        mode = ant_mode.lower()
        if mode == 'both':
            header[-1] = header[-1] + ' ' + ' '.join(ant_col)
        else:
            header[-1] = header[-1] + ' ' + ant_col[0]

        min_over = [int(i) for i in min_over.split(',')]

        # Resolve CIGAR column names to indices. 'false' is stored as None:
        # previously it was looked up with col_names.index(), which raised
        # ValueError, and later compared as an int against the string
        # 'false', so disabling CIGAR use never worked.
        cigar_idx = []
        for name in cigar_col.split(','):
            if name in col_names:
                cigar_idx.append(col_names.index(name))
            elif name.lower() == 'false':
                cigar_idx.append(None)
            else:
                sys.stderr.write(
                    'Cigar col names doesn\'t exist in .pairs file!\n')
                raise SystemExit(1)

        outstream.writelines(l + '\n' for l in header)

        sep = _pairsam_format.PAIRSAM_SEP
        side1 = (_pairsam_format.COL_C1, _pairsam_format.COL_P1,
                 _pairsam_format.COL_S1)
        side2 = (_pairsam_format.COL_C2, _pairsam_format.COL_P2,
                 _pairsam_format.COL_S2)

        for count_line, line in enumerate(body_stream, 1):
            record = line.rstrip()
            cols = record.split(sep)
            if mode == 'rna':
                ant_str = _annotate_side(ant, cols, side1[0], side1[1],
                                         side1[2], cigar_idx[0], min_over[0],
                                         strand_type[0])
            elif mode == 'dna':
                ant_str = _annotate_side(ant, cols, side2[0], side2[1],
                                         side2[2], cigar_idx[0], min_over[0],
                                         strand_type[0])
            else:
                ant_str = sep.join([
                    _annotate_side(ant, cols, side1[0], side1[1], side1[2],
                                   cigar_idx[0], min_over[0], strand_type[0]),
                    _annotate_side(ant, cols, side2[0], side2[1], side2[2],
                                   cigar_idx[1], min_over[1], strand_type[1]),
                ])

            outstream.write(sep.join([record, ant_str]))
            outstream.write('\n')

            if count_line % 1000000 == 0:
                # Progress goes to stderr: stdout may be the data stream.
                sys.stderr.write(
                    "%d records processed ...\n" % count_line)
    finally:
        if instream is not sys.stdin:
            instream.close()
        if outstream is not sys.stdout:
            outstream.close()