def trim4x(inFqFileName, outFqFilePrefix, trimLen):
    """Split every FASTQ read into two end fragments of ``trimLen`` bases.

    For each 4-line record, the first ``trimLen`` bases and qualities are
    written to ``<outFqFilePrefix>.1.fastq`` under the name ``@<name>/1``.
    The last ``trimLen`` bases and qualities are written to
    ``<outFqFilePrefix>.2.fastq`` under ``@<name>/2`` after being passed
    through ``mybasic.rc`` and ``mybasic.rev`` respectively.  A record is
    dropped from both outputs when either fragment contains an 'N'.

    Args:
        inFqFileName: path of the input FASTQ file, or the literal string
            'stdin' to read from standard input.
        outFqFilePrefix: prefix used to build the two output file names.
        trimLen: number of bases taken from each end of a read.
            NOTE(review): expected to be positive -- with 0, ``seq[-0:]``
            selects the whole read rather than an empty fragment.

    Raises:
        ValueError: if a record's header line does not start with '@'.
    """
    fromStdin = inFqFileName == 'stdin'
    inFqFile = sys.stdin if fromStdin else open(inFqFileName)
    try:
        # Context managers close both outputs even when a record is rejected.
        with open('%s.1.fastq' % outFqFilePrefix, 'w') as outFqFile1, \
                open('%s.2.fastq' % outFqFilePrefix, 'w') as outFqFile2:
            while True:
                line = inFqFile.readline()
                if not line:
                    break
                if line[0] != '@':
                    raise ValueError(
                        'FASTQ header does not start with "@": %r' % line)
                # The read name is the header up to the first space.
                seqN = line[1:].rstrip().split(' ')[0]
                # Strip only line terminators: slicing off the last character
                # would eat a real base/quality when the file does not end
                # with a newline.
                seq = inFqFile.readline().rstrip('\r\n')
                inFqFile.readline()  # third line of the record is ignored
                qual = inFqFile.readline().rstrip('\r\n')
                if 'N' in seq[:trimLen] or 'N' in seq[-trimLen:]:
                    continue
                outFqFile1.write(
                    '@%s/1\n%s\n+\n%s\n'
                    % (seqN, seq[:trimLen], qual[:trimLen]))
                outFqFile2.write(
                    '@%s/2\n%s\n+\n%s\n'
                    % (seqN, mybasic.rc(seq[-trimLen:]),
                       mybasic.rev(qual[-trimLen:])))
    finally:
        # Never close the interpreter's own stdin.
        if not fromStdin:
            inFqFile.close()
def trim4x(inFqFileName, outFqFilePrefix, trimLen):
    """Split every FASTQ read into two end fragments of ``trimLen`` bases.

    For each 4-line record, the first ``trimLen`` bases and qualities are
    written to ``<outFqFilePrefix>.1.fastq`` under the name ``@<name>/1``.
    The last ``trimLen`` bases and qualities are written to
    ``<outFqFilePrefix>.2.fastq`` under ``@<name>/2`` after being passed
    through ``mybasic.rc`` and ``mybasic.rev`` respectively.  A record is
    dropped from both outputs when either fragment contains an "N".

    Args:
        inFqFileName: path of the input FASTQ file, or the literal string
            "stdin" to read from standard input.
        outFqFilePrefix: prefix used to build the two output file names.
        trimLen: number of bases taken from each end of a read.
            NOTE(review): expected to be positive -- with 0, ``seq[-0:]``
            selects the whole read rather than an empty fragment.

    Raises:
        ValueError: if a record's header line does not start with "@".
    """
    fromStdin = inFqFileName == "stdin"
    inFqFile = sys.stdin if fromStdin else open(inFqFileName)
    try:
        # Context managers close both outputs even when a record is rejected.
        with open("%s.1.fastq" % outFqFilePrefix, "w") as outFqFile1, \
                open("%s.2.fastq" % outFqFilePrefix, "w") as outFqFile2:
            while True:
                line = inFqFile.readline()
                if not line:
                    break
                if line[0] != "@":
                    raise ValueError(
                        "FASTQ header does not start with '@': %r" % line)
                # The read name is the header up to the first space.
                seqN = line[1:].rstrip().split(" ")[0]
                # Strip only line terminators: slicing off the last character
                # would eat a real base/quality when the file does not end
                # with a newline.
                seq = inFqFile.readline().rstrip("\r\n")
                inFqFile.readline()  # third line of the record is ignored
                qual = inFqFile.readline().rstrip("\r\n")
                if "N" in seq[:trimLen] or "N" in seq[-trimLen:]:
                    continue
                outFqFile1.write(
                    "@%s/1\n%s\n+\n%s\n"
                    % (seqN, seq[:trimLen], qual[:trimLen]))
                outFqFile2.write(
                    "@%s/2\n%s\n+\n%s\n"
                    % (seqN, mybasic.rc(seq[-trimLen:]),
                       mybasic.rev(qual[-trimLen:])))
    finally:
        # Never close the interpreter's own stdin.
        if not fromStdin:
            inFqFile.close()
def make_samse(ifileN, ofileN):
    """Print SAM-style records for single-end GSNAP results to stdout.

    First prints every header line returned by ``make_header`` for the
    hard-coded hg19 chromosome-size file, then iterates the records of the
    hard-coded GSNAP result file and prints 12 tab-separated columns per
    record: qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq,
    qual and the optional tags.

    Three cases are handled per record:
      * ``r.nLoci > 1``: one line with flag bit 0x4 set, '*' as CIGAR and no
        optional tags.
      * ``r.pairRel == '(transloc)'``: one line per segment, using the CIGAR
        and clip offset returned by ``seg.toCIGAR_trans()``.
      * otherwise: one line using ``match.toCIGAR()`` and an ``NM`` tag built
        from the summed ``numSub`` of all segments.

    Args:
        ifileN: NOTE(review): currently unused -- the input path is
            hard-coded below.
        ofileN: NOTE(review): currently unused -- all output goes to stdout.
    """
    # Compiled once instead of per segment; the raw string avoids invalid
    # escape sequences.  Groups: strand sign, reference name, two coordinates.
    locus_re = re.compile(r'([\+\-])(.*):([0-9]+)\.\.([0-9]+)')
    sam_fmt = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'

    headerL = make_header('/data1/Sequence/ucsc_hg19/hg19.chrom.sizes')
    for header in headerL:
        print(header)

    result = mygsnap.gsnapFile(
        '/pipeline/test_ini_gsnap2sam/S022_single.gsnap', False)

    ## for unpaired
    for r in result:
        qname = r.rid()
        flag = 0x0
        rname = '*'
        pos = 0
        mapq = 0
        cigar = ''
        rnext = '*'  ## assume --npath=1 (maximum 1 alignment per read)
        pnext = 0  ## assume --npath=1 (maximum 1 alignment per read)
        tlen = 0  ## assume --npath=1 (maximum 1 alignment per read)
        seq = r.seq()
        qual = r.qual()
        extra = 'NH:i:1\tHI:i:1'  ## assume --npath=1 (maximum 1 alignment per read)

        if r.nLoci > 1:
            # Reads with more than one locus are written with flag bit 0x4,
            # a '*' CIGAR and no optional tags.
            flag = flag | 0x4
            new_cigar = '*'
            extra = ''
            print(sam_fmt % (qname, flag, rname, pos, mapq, new_cigar,
                             rnext, pnext, tlen, seq, qual, extra))
        elif r.pairRel == '(transloc)':
            match = r.matchL()[0]
            segL = match.getSegInfo()
            mapq = segL[0].mapq
            seq = r.seq()
            qual = r.qual()
            # One output line per segment, each carrying its part of the read.
            for seg in segL:
                flag = 0x0
                (strand, rname, pos1, pos2) = locus_re.search(
                    seg.seg[2]).groups()
                pos = min(int(pos1), int(pos2))
                (cigar, clip) = seg.toCIGAR_trans()
                if clip < 0:  ## first half
                    seq2 = seq[:clip]
                    qual2 = qual[:clip]
                else:  ## second half
                    seq2 = seq[clip:]
                    qual2 = qual[clip:]
                if strand == '-':
                    flag = flag | 0x10
                    seq2 = mybasic.rc(seq2)
                    qual2 = mybasic.rev(qual2)
                print(sam_fmt % (qname, flag, rname, pos, mapq, cigar,
                                 rnext, pnext, tlen, seq2, qual2, extra))
        else:
            match = r.matchL()[0]  ## assume --npath=1 (maximum 1 alignment per read)
            segL = match.getSegInfo()
            (strand, rname, pos1, pos2) = locus_re.search(
                segL[0].seg[2]).groups()
            pos = min(int(pos1), int(pos2))
            mapq = segL[0].mapq
            seg_nm = segL[0].numSub
            cigar2 = match.toCIGAR()

            # NOTE(review): `cigar` and `new_cigar` are still assembled below,
            # but only `cigar2` is printed.  The computation is kept unchanged
            # because the segment loop also updates `pos` and `seg_nm`.
            if segL[0].start != '' and segL[0].start != '0':
                cigar = str(segL[0].start) + 'S'
            if strand == '-':
                # Minus strand: operations are prepended.
                cigar = str(segL[0].numMatch + segL[0].numSub) + 'M' + cigar
                if segL[0].ins != '' and segL[0].ins != '0':
                    cigar = str(segL[0].ins) + 'I' + cigar
            else:
                cigar = cigar + str(segL[0].numMatch + segL[0].numSub) + 'M'
                if segL[0].ins != '' and segL[0].ins != '0':
                    cigar = cigar + str(segL[0].ins) + 'I'

            if len(segL) == 1:
                new_cigar = segL[0].toCIGAR(True)
            else:
                new_cigar = segL[0].toCIGAR()
            prev_cigar = new_cigar

            for index, seg in enumerate(segL[1:], 1):
                final = index == (len(segL) - 1)
                (_, _, seg_pos1, seg_pos2) = locus_re.search(
                    seg.seg[2]).groups()
                seg_start = min(int(seg_pos1), int(seg_pos2))
                # Named match_op so the alignment object `match` is not
                # clobbered by a string.
                match_op = str(seg.numMatch + seg.numSub) + 'M'
                if seg.ins != '' and seg.ins != '0':
                    ins = str(seg.ins) + 'I'
                else:
                    ins = ''
                # Keep the smallest coordinate seen over all segments.
                if pos == 0 or pos > seg_start:
                    pos = seg_start
                # dist is the gap between the previous segment's second
                # coordinate and this segment's first coordinate.
                if strand == '-':
                    dist = int(pos2) - int(seg_pos1) - 1
                    if dist > 0:
                        cigar = match_op + ins + str(dist) + 'N' + cigar
                    else:
                        cigar = match_op + ins + cigar
                else:
                    dist = int(seg_pos1) - int(pos2) - 1
                    if dist > 0:
                        cigar = cigar + str(dist) + 'N' + match_op + ins
                    else:
                        cigar = cigar + match_op + ins
                seg_nm = seg_nm + seg.numSub
                pos2 = seg_pos2
                cur_cigar = seg.toCIGAR(final)
                gap_allowed = (dist > 0 and 'D' not in prev_cigar
                               and 'I' not in prev_cigar)
                if strand == '-':
                    if gap_allowed:
                        new_cigar = cur_cigar + str(dist) + 'N' + new_cigar
                    else:
                        new_cigar = cur_cigar + new_cigar
                else:
                    if gap_allowed:
                        new_cigar = new_cigar + str(dist) + 'N' + cur_cigar
                    else:
                        new_cigar = new_cigar + cur_cigar
                prev_cigar = cur_cigar

            if segL[-1].end != '' and segL[-1].end != '0':  ## last segment
                if strand == '-':
                    cigar = str(segL[-1].end) + 'S' + cigar
                else:
                    cigar = cigar + str(segL[-1].end) + 'S'

            extra = extra + ('\tNM:i:%s' % seg_nm)
            if strand == '-':
                flag = flag | 0x10
                seq = mybasic.rc(seq)
                qual = mybasic.rev(qual)
            print(sam_fmt % (qname, flag, rname, pos, mapq, cigar2,
                             rnext, pnext, tlen, seq, qual, extra))
def make_samse(ifileN, ofileN):
    """Print SAM-style records for single-end GSNAP results to stdout.

    First prints every header line returned by ``make_header`` for the
    hard-coded hg19 chromosome-size file, then iterates the records of the
    hard-coded GSNAP result file and prints 12 tab-separated columns per
    record: qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq,
    qual and the optional tags.

    Three cases are handled per record:
      * ``r.nLoci > 1``: one line with flag bit 0x4 set, '*' as CIGAR and no
        optional tags.
      * ``r.pairRel == '(transloc)'``: one line per segment, using the CIGAR
        and clip offset returned by ``seg.toCIGAR_trans()``.
      * otherwise: one line using ``match.toCIGAR()`` and an ``NM`` tag built
        from the summed ``numSub`` of all segments.

    Args:
        ifileN: NOTE(review): currently unused -- the input path is
            hard-coded below.
        ofileN: NOTE(review): currently unused -- all output goes to stdout.
    """
    # Compiled once instead of per segment; the raw string avoids invalid
    # escape sequences.  Groups: strand sign, reference name, two coordinates.
    locus_re = re.compile(r'([\+\-])(.*):([0-9]+)\.\.([0-9]+)')
    sam_fmt = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'

    headerL = make_header('/data1/Sequence/ucsc_hg19/hg19.chrom.sizes')
    for header in headerL:
        print(header)

    result = mygsnap.gsnapFile(
        '/pipeline/test_ini_gsnap2sam/S022_single.gsnap', False)

    ## for unpaired
    for r in result:
        qname = r.rid()
        flag = 0x0
        rname = '*'
        pos = 0
        mapq = 0
        cigar = ''
        rnext = '*'  ## assume --npath=1 (maximum 1 alignment per read)
        pnext = 0  ## assume --npath=1 (maximum 1 alignment per read)
        tlen = 0  ## assume --npath=1 (maximum 1 alignment per read)
        seq = r.seq()
        qual = r.qual()
        extra = 'NH:i:1\tHI:i:1'  ## assume --npath=1 (maximum 1 alignment per read)

        if r.nLoci > 1:
            # Reads with more than one locus are written with flag bit 0x4,
            # a '*' CIGAR and no optional tags.
            flag = flag | 0x4
            new_cigar = '*'
            extra = ''
            print(sam_fmt % (qname, flag, rname, pos, mapq, new_cigar,
                             rnext, pnext, tlen, seq, qual, extra))
            continue

        if r.pairRel == '(transloc)':
            match = r.matchL()[0]
            segL = match.getSegInfo()
            mapq = segL[0].mapq
            seq = r.seq()
            qual = r.qual()
            # One output line per segment, each carrying its part of the read.
            for seg in segL:
                flag = 0x0
                (strand, rname, pos1, pos2) = locus_re.search(
                    seg.seg[2]).groups()
                pos = min(int(pos1), int(pos2))
                (cigar, clip) = seg.toCIGAR_trans()
                if clip < 0:  ## first half
                    seq2 = seq[:clip]
                    qual2 = qual[:clip]
                else:  ## second half
                    seq2 = seq[clip:]
                    qual2 = qual[clip:]
                if strand == '-':
                    flag = flag | 0x10
                    seq2 = mybasic.rc(seq2)
                    qual2 = mybasic.rev(qual2)
                print(sam_fmt % (qname, flag, rname, pos, mapq, cigar,
                                 rnext, pnext, tlen, seq2, qual2, extra))
            continue

        match = r.matchL()[0]  ## assume --npath=1 (maximum 1 alignment per read)
        segL = match.getSegInfo()
        (strand, rname, pos1, pos2) = locus_re.search(
            segL[0].seg[2]).groups()
        pos = min(int(pos1), int(pos2))
        mapq = segL[0].mapq
        seg_nm = segL[0].numSub
        cigar2 = match.toCIGAR()

        # NOTE(review): `cigar` and `new_cigar` are still assembled below, but
        # only `cigar2` is printed.  The computation is kept unchanged because
        # the segment loop also updates `pos` and `seg_nm`.
        if segL[0].start != '' and segL[0].start != '0':
            cigar = str(segL[0].start) + 'S'
        if strand == '-':
            # Minus strand: operations are prepended.
            cigar = str(segL[0].numMatch + segL[0].numSub) + 'M' + cigar
            if segL[0].ins != '' and segL[0].ins != '0':
                cigar = str(segL[0].ins) + 'I' + cigar
        else:
            cigar = cigar + str(segL[0].numMatch + segL[0].numSub) + 'M'
            if segL[0].ins != '' and segL[0].ins != '0':
                cigar = cigar + str(segL[0].ins) + 'I'

        if len(segL) == 1:
            new_cigar = segL[0].toCIGAR(True)
        else:
            new_cigar = segL[0].toCIGAR()
        prev_cigar = new_cigar

        for index, seg in enumerate(segL[1:], 1):
            final = index == (len(segL) - 1)
            (_, _, seg_pos1, seg_pos2) = locus_re.search(
                seg.seg[2]).groups()
            seg_start = min(int(seg_pos1), int(seg_pos2))
            # Named match_op so the alignment object `match` is not clobbered
            # by a string.
            match_op = str(seg.numMatch + seg.numSub) + 'M'
            if seg.ins != '' and seg.ins != '0':
                ins = str(seg.ins) + 'I'
            else:
                ins = ''
            # Keep the smallest coordinate seen over all segments.
            if pos == 0 or pos > seg_start:
                pos = seg_start
            # dist is the gap between the previous segment's second coordinate
            # and this segment's first coordinate.
            if strand == '-':
                dist = int(pos2) - int(seg_pos1) - 1
                if dist > 0:
                    cigar = match_op + ins + str(dist) + 'N' + cigar
                else:
                    cigar = match_op + ins + cigar
            else:
                dist = int(seg_pos1) - int(pos2) - 1
                if dist > 0:
                    cigar = cigar + str(dist) + 'N' + match_op + ins
                else:
                    cigar = cigar + match_op + ins
            seg_nm = seg_nm + seg.numSub
            pos2 = seg_pos2
            cur_cigar = seg.toCIGAR(final)
            gap_allowed = (dist > 0 and 'D' not in prev_cigar
                           and 'I' not in prev_cigar)
            if strand == '-':
                if gap_allowed:
                    new_cigar = cur_cigar + str(dist) + 'N' + new_cigar
                else:
                    new_cigar = cur_cigar + new_cigar
            else:
                if gap_allowed:
                    new_cigar = new_cigar + str(dist) + 'N' + cur_cigar
                else:
                    new_cigar = new_cigar + cur_cigar
            prev_cigar = cur_cigar

        if segL[-1].end != '' and segL[-1].end != '0':  ## last segment
            if strand == '-':
                cigar = str(segL[-1].end) + 'S' + cigar
            else:
                cigar = cigar + str(segL[-1].end) + 'S'

        extra = extra + ('\tNM:i:%s' % seg_nm)
        if strand == '-':
            flag = flag | 0x10
            seq = mybasic.rc(seq)
            qual = mybasic.rev(qual)
        print(sam_fmt % (qname, flag, rname, pos, mapq, cigar2,
                         rnext, pnext, tlen, seq, qual, extra))