def bwareflength(bwabin, reffile):
    """
    Collect reference sequence lengths from the SAM header printed by bwa mem.

    Runs ``<bwabin> mem <reffile> -`` through the shell, feeds a short dummy
    sequence on stdin and parses every ``@SQ`` header line of the output.

    :param bwabin: bwa bin path
    :param reffile: reference file given to ``bwa mem``
    :return: dict, sequence name (``SN:`` value) -> length (``LN:`` value, int)
    """
    # Both paths go through the project helper before being placed in the
    # shell command line, as everywhere else in this module.
    bwabin = subprocesspath.subprocesspath(bwabin)
    reffile = subprocesspath.subprocesspath(reffile)
    bwacmd = ' '.join([bwabin, 'mem', reffile, '-'])

    seqlength = dict()

    # The context manager closes the pipes and waits for the child, so no
    # zombie process or open descriptor is left behind (also on errors).
    with Popen(bwacmd, shell=True, stdout=PIPE, stdin=PIPE) as runbwaalign:
        runbwaalign.stdin.write('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'.encode('ascii'))
        # Closing stdin signals end of input so bwa can finish and exit.
        runbwaalign.stdin.close()

        for rawline in runbwaalign.stdout:
            line = rawline.decode("utf-8").rstrip('\n')

            # Only header lines that start with @SQ describe a sequence.
            if not line.startswith('@SQ'):
                continue

            # Look the tags up by prefix instead of by position, so headers
            # that carry additional tags do not break the parsing.
            seqname = None
            seqlen = None
            for field in line.split('\t')[1:]:
                if field.startswith('SN:'):
                    seqname = field[3:]
                elif field.startswith('LN:'):
                    seqlen = int(field[3:])

            if seqname is not None and seqlen is not None:
                seqlength[seqname] = seqlen

    return seqlength
def bamtobcf(bcfbin, reffile, bamfile, outbcf):
    """
    Build a BCF file from an alignment file and index it.

    Two shell commands are printed and then executed one after the other:
    ``mpileup`` piped into ``call`` (writing ``outbcf``), followed by
    ``index`` on ``outbcf``.

    :param bcfbin: path of the binary providing mpileup/call/index
                   (presumably bcftools -- confirm with callers)
    :param reffile: reference file passed to ``mpileup -f``
    :param bamfile: alignment file passed to ``mpileup``
    :param outbcf: output file passed to ``call -o`` and to ``index``
    :return: True
    """
    # Every path goes through the project helper, in the original order.
    bcfbin, reffile, bamfile, outbcf = [
        subprocesspath.subprocesspath(path)
        for path in (bcfbin, reffile, bamfile, outbcf)
    ]

    calling_parts = [
        bcfbin, ' mpileup -E -d 500 -L 500 -Ou -f', reffile, bamfile,
        '| ', bcfbin, ' call -cv -Ob -o', outbcf
    ]
    indexing_parts = [bcfbin, ' index', outbcf]

    # Variant calling first, indexing second; each command is echoed
    # before it is started and awaited before the next one begins.
    for parts in (calling_parts, indexing_parts):
        command = ' '.join(parts)
        print(command)
        Popen(command, shell=True).communicate()

    return True
def bwaalign(bwabin, reffile, inputfile, outfile, threadnumber=1):
    """
    Run a bwa mem alignment and redirect its output into a file.

    The command has the form
    ``bwa mem -O 0 -B 0 -E 0 -k 5 -t <threads> <ref> <input> > <outfile>``
    and is printed before it is executed through the shell.

    :param bwabin: bwa bin path
    :param reffile: reference file, made by bwa index
    :param inputfile: sequence or reads file
    :param outfile: sam file receiving the redirected output
    :param threadnumber: number of threads
    :return: True
    """
    # Convert the four paths with the project helper, keeping their order.
    converted = [
        subprocesspath.subprocesspath(path)
        for path in (bwabin, reffile, inputfile, outfile)
    ]
    bwabin, reffile, inputfile, outfile = converted

    # Option tokens are kept exactly as they were so the printed and
    # executed command line stays unchanged.
    options = ['mem', '-O', ' 0', ' -B', ' 0', ' -E', ' 0', ' -k', ' 5',
               '-t', str(threadnumber)]
    bwacmd = ' '.join([bwabin] + options + [reffile, inputfile, '>', outfile])

    print(bwacmd)

    Popen(bwacmd, shell=True).communicate()

    return True
def bwaloci(bwabin, reffile, inputfile, threadnumber=1):
    """
    Align sequences with bwa mem and report where each one was placed.

    :param bwabin: bwa bin path
    :param reffile: reference file given to ``bwa mem``
    :param inputfile: sequence or reads file
    :param threadnumber: number of threads
    :return: list of tab-joined strings ``sequence<TAB>refname<TAB>position``
             (SAM columns 10, 3 and 4), one per alignment record
    """
    bwabin = subprocesspath.subprocesspath(bwabin)
    reffile = subprocesspath.subprocesspath(reffile)
    inputfile = subprocesspath.subprocesspath(inputfile)

    bwacmd = ' '.join([bwabin, 'mem', '-O', ' 0', ' -B', ' 0', ' -E', ' 0', ' -k', ' 5',
                       '-t', str(threadnumber), reffile, inputfile])
    print(bwacmd)

    res = list()

    # The context manager closes stdout and waits for bwa to exit, which
    # the other bwa wrappers of this module do explicitly.
    with Popen(bwacmd, shell=True, stdout=PIPE) as runbwaalign:
        for rawline in runbwaalign.stdout:
            lin = rawline.decode('utf-8').rstrip('\n')

            # Header lines start with '@' and carry no alignment.
            if lin.startswith('@'):
                continue

            infor = lin.split('\t')

            # Blank or truncated lines cannot be indexed up to column 10.
            if len(infor) < 10:
                continue

            seqname = infor[2]
            start = infor[3]
            probeseq = infor[9]

            res.append('\t'.join([probeseq, seqname, start]))

    return res
def jfseqkmercountforfilter(jfpath, jfkmerfile, mer, sequence, bfcount=False):
    """
    Look up the jellyfish count of every k-mer window of a sequence.

    A single ``jellyfish query -i -l`` process is started; each window is
    written to its stdin as one line and one reply line is read back and
    converted to int.

    :param jfpath: jellyfish bin path
    :param jfkmerfile: jellyfish kmer count file
    :param mer: int, kmer
    :param sequence: string, sequence for kmerscore count
    :param bfcount: not used by this function
    :return: list, one raw count per window, in left-to-right order
    """
    seqlen = len(sequence)

    jfpath = subprocesspath.subprocesspath(jfpath)
    jfkmerfile = subprocesspath.subprocesspath(jfkmerfile)

    jfquerycommand = ' '.join([jfpath, 'query', '-i', '-l', jfkmerfile])
    print(jfquerycommand)

    mer = int(mer)
    jfkmercount = list()

    # The context manager guarantees the pipes are closed and the child is
    # reaped even when a reply cannot be parsed (e.g. jellyfish died).
    with subprocess.Popen(jfquerycommand, shell=True,
                          stdout=subprocess.PIPE, stdin=subprocess.PIPE) as kmerct:
        # Windows start at 0, 1, ... up to the last full-length window.
        for start in range(seqlen - mer + 1):
            subseq = sequence[start:start + mer] + '\n'

            kmerct.stdin.write(subseq.encode('ascii'))
            # Flush so the query reaches jellyfish before we block on the reply.
            kmerct.stdin.flush()

            lin = kmerct.stdout.readline().decode('utf-8').rstrip('\n')
            jfkmercount.append(int(lin))

        # End of input lets the query process terminate.
        kmerct.stdin.close()

    return jfkmercount
def jfcount(jfpath, mer, output, infile, threads=1, size='100M', lowercount=2):
    """
    Run ``jellyfish count --canonical`` on a sequence file.

    Original note: only k-mers seen at least twice are kept (a k-mer seen
    once scores 0); ``lowercount`` is what is passed to ``-L``.

    NOTE(review): this module defines ``jfcount`` a second time further
    down with the same signature; the later definition is the one callers
    get after import.

    :param jfpath: jellyfish bin path
    :param mer: k-mer length, passed to ``-m``
    :param output: output file, passed to ``-o``
    :param infile: input sequence file
    :param threads: number of threads, passed to ``-t``
    :param size: value passed to ``-s``
    :param lowercount: value passed to ``-L``
    :return: True when the command ran and exited with status 0, else False
    """
    jfpath = subprocesspath.subprocesspath(jfpath)
    output = subprocesspath.subprocesspath(output)
    infile = subprocesspath.subprocesspath(infile)

    jfcountcommand = ' '.join([
        jfpath, 'count', '--canonical', '-m',
        str(mer), '-L',
        str(lowercount), '-t',
        str(threads), '-o',
        str(output), '-s',
        str(size), infile
    ])
    print(jfcountcommand)

    p = subprocess.Popen(jfcountcommand, shell=True)

    try:
        p.communicate()
    except Exception:
        # Make sure the child is gone before reporting the failure.
        p.kill()
        p.communicate()
        print("Something wrong in jellyfish count")
        return False

    # A non-zero exit status means jellyfish itself failed; previously this
    # case was reported as success.
    if p.returncode != 0:
        print("Something wrong in jellyfish count")
        return False

    return True
def jfquerylist(jfpath, jfkmerfile, seqlist, bfcount=False):
    """
    Query jellyfish for the count of every sequence in a list.

    A single ``jellyfish query -i -l`` process is started; each sequence
    is written to its stdin as one line and one reply line is read back
    and converted to int.

    :param jfpath: jellyfish bin path
    :param jfkmerfile: jellyfish kmer count file
    :param seqlist: list of sequences
    :param bfcount: not used by this function
    :return: list of strings ``<sequence>,<count>``, in input order
    """
    jfpath = subprocesspath.subprocesspath(jfpath)
    jfkmerfile = subprocesspath.subprocesspath(jfkmerfile)

    jfquerycommand = ' '.join([jfpath, 'query', '-i', '-l', jfkmerfile])
    print(jfquerycommand)

    jfkmercount = list()

    # The context manager guarantees the pipes are closed and the child is
    # reaped even when a reply cannot be parsed (e.g. jellyfish died).
    with subprocess.Popen(jfquerycommand, shell=True,
                          stdout=subprocess.PIPE, stdin=subprocess.PIPE) as kmerct:
        for seq in seqlist:
            subseq = seq + '\n'

            kmerct.stdin.write(subseq.encode('ascii'))
            # Flush so the query reaches jellyfish before we block on the reply.
            kmerct.stdin.flush()

            lin = kmerct.stdout.readline().decode('utf-8').rstrip('\n')
            number = int(lin)

            # Trailing whitespace of the query is dropped in the result string.
            jfkmercount.append(subseq.rstrip() + ',' + str(number))

        # End of input lets the query process terminate.
        kmerct.stdin.close()

    return jfkmercount
def jfgeneratorscount(jfpath, mer, output, generators, threads=1, size='100M'):
    """
    Run ``jellyfish count --canonical`` with a generators file as input.

    :param jfpath: jellyfish bin path
    :param mer: k-mer length, passed to ``-m``
    :param output: output file, passed to ``-o``
    :param generators: file passed to ``-g`` (see the jellyfish
                       documentation for its expected content)
    :param threads: number of threads, passed to ``-t``
    :param size: value passed to ``-s``
    :return: True when the command ran and exited with status 0, else False
    """
    jfpath = subprocesspath.subprocesspath(jfpath)
    output = subprocesspath.subprocesspath(output)
    generators = subprocesspath.subprocesspath(generators)

    jfcountcommand = ' '.join([
        jfpath, 'count', '--canonical', '-m',
        str(mer), '-g', generators, '-t',
        str(threads), '-o',
        str(output), '-s',
        str(size)
    ])
    print(jfcountcommand)

    p = subprocess.Popen(jfcountcommand, shell=True)

    try:
        p.communicate()
    except Exception:
        # Make sure the child is gone before reporting the failure.
        p.kill()
        p.communicate()
        print("Something wrong in jellyfish count")
        return False

    # A non-zero exit status means jellyfish itself failed; previously this
    # case was reported as success.
    if p.returncode != 0:
        print("Something wrong in jellyfish count")
        return False

    return True
def jfcount(jfpath, mer, output, infile, threads=1, size='100M', lowercount=2):
    """
    Run ``jellyfish count --canonical`` on a sequence file.

    Original note: only k-mers seen at least twice are kept (a k-mer seen
    once scores 0); ``lowercount`` is what is passed to ``-L``.

    NOTE(review): ``jfcount`` is also defined earlier in this module with
    the same signature; this later definition replaces it at import time.

    :param jfpath: jellyfish bin path
    :param mer: k-mer length, passed to ``-m``
    :param output: output file, passed to ``-o``
    :param infile: input sequence file
    :param threads: number of threads, passed to ``-t``
    :param size: value passed to ``-s``
    :param lowercount: value passed to ``-L``
    :return: True when the command ran and exited with status 0, else False
    """
    jfpath = subprocesspath.subprocesspath(jfpath)
    output = subprocesspath.subprocesspath(output)
    infile = subprocesspath.subprocesspath(infile)

    jfcountcommand = ' '.join([jfpath, 'count', '--canonical', '-m', str(mer),
                               '-L', str(lowercount), '-t', str(threads),
                               '-o', str(output), '-s', str(size), infile])
    print(jfcountcommand)

    p = subprocess.Popen(jfcountcommand, shell=True)

    try:
        p.communicate()
    except Exception:
        # Make sure the child is gone before reporting the failure.
        p.kill()
        p.communicate()
        print("Something wrong in jellyfish count")
        return False

    # A non-zero exit status means jellyfish itself failed; previously this
    # case was reported as success.
    if p.returncode != 0:
        print("Something wrong in jellyfish count")
        return False

    return True
def bwamem_paired(bwabin, samtoolsbin, reffile, outfile, inputfile1, inputfile2, samplename, threadnumber=1):
    """
    Align paired reads with bwa mem, sort the result and index it.

    Runs ``bwa mem -M -R <read group> ... | samtools sort -o <outfile>``
    followed by ``samtools index <outfile>``; both commands are printed
    before they are executed through the shell.

    :param bwabin: bwa bin path
    :param samtoolsbin: samtools bin path
    :param reffile: reference file given to ``bwa mem``
    :param outfile: sorted output file written by ``samtools sort -o``
    :param inputfile1: first reads file
    :param inputfile2: second reads file
    :param samplename: used as both ID and SM of the ``@RG`` read group
    :param threadnumber: number of threads for bwa and for samtools sort
    :return: True
    """
    bwabin = subprocesspath.subprocesspath(bwabin)
    samtoolsbin = subprocesspath.subprocesspath(samtoolsbin)
    reffile = subprocesspath.subprocesspath(reffile)
    # Each read file keeps its own converted path. Previously both results
    # were stored in one unused variable and the command line received the
    # unconverted paths.
    inputfile1 = subprocesspath.subprocesspath(inputfile1)
    inputfile2 = subprocesspath.subprocesspath(inputfile2)
    outfile = subprocesspath.subprocesspath(outfile)

    # The read group is single-quoted for the shell; the doubled backslash
    # keeps a literal "\t" in the argument handed to bwa.
    samplestr = '\'@RG\\tID:' + samplename + '\\tSM:' + samplename + '\\tLB:WGS\\tPL:Illumina\''

    bwacmd = ' '.join([
        bwabin, 'mem', '-M', '-R', samplestr, '-t', str(threadnumber),
        reffile, inputfile1, inputfile2,
        '| ', samtoolsbin, 'sort -@', str(threadnumber), '-o', outfile
    ])
    print(bwacmd)

    runbwaalign = Popen(bwacmd, shell=True)
    runbwaalign.communicate()

    samidxcmd = ' '.join([samtoolsbin, 'index', outfile])
    print(samidxcmd)

    samidx = Popen(samidxcmd, shell=True)
    samidx.communicate()

    return True
def getconsensus(bcftoolspath, bcffile, chrom, start, end, seq, sample, strand='+'):
    """
    get consensus by using bcftools

    A one-record FASTA (``>chrom:start-end`` plus the sequence) is echoed
    into ``bcftools consensus -s <sample> <bcffile>`` and the output is
    scanned for a run of A/T/C/G characters.

    :param bcftoolspath: bcftools bin path
    :param bcffile: variant file given to ``bcftools consensus``
    :param chrom: string, sequence name used in the FASTA header
    :param start: string, start coordinate used in the FASTA header
    :param end: string, end coordinate used in the FASTA header
    :param seq: string, sequence to apply the variants to
    :param sample: sample name passed to ``-s``
    :param strand: '-' makes the sequence go through ``revcom.revcom``
                   before it is sent to bcftools
    :return: string, the last matching run found in the output, or
             ``'N' * len(seq)`` when nothing matched or the call failed
    """
    bcftoolspath = subprocesspath.subprocesspath(bcftoolspath)
    bcffile = subprocesspath.subprocesspath(bcffile)

    # Minimum run length accepted as a consensus: 10 shorter than the input,
    # unless that would go below 10, in which case the full length is used.
    mathlen = len(seq) - 10
    if mathlen < 10:
        mathlen = len(seq)

    pat = re.compile('[ATCG]{' + str(mathlen) + ',}')

    if strand == '-':
        seq = revcom.revcom(seq)

    # Single-quoted for the shell; the literal "\n" is left for echo to expand.
    fastring = '\'>' + chrom + ':' + start + '-' + end + '\\n' + seq + '\''

    bcfcon_command = ' '.join([
        'echo', fastring, '|' + bcftoolspath + ' consensus -s', sample,
        bcffile
    ])

    consensus = 'N' * len(seq)

    try:
        # The context manager closes both pipes and waits for the shell
        # pipeline, so the child is always reaped.
        with Popen(bcfcon_command, shell=True, stdin=PIPE, stdout=PIPE) as p:
            for rawline in p.stdout:
                hit = pat.search(rawline.decode('utf-8').rstrip('\n'))
                if hit:
                    consensus = hit.group(0)
    except Exception:
        # Best effort: keep the all-N fallback, but do not swallow
        # KeyboardInterrupt/SystemExit as a bare except would.
        print("warnning: ", bcfcon_command, " ##")

    return str(consensus)
def bwafilter(bwabin, reffile, inputfile, minas, maxxs ,threadnumber=1 ):
    """
    Align sequences with bwa mem and keep those passing AS/XS thresholds.

    A record is kept when its ``AS:i`` value is at least ``minas`` and its
    ``XS:i`` value is below ``maxxs``; records lacking either tag are
    skipped.

    NOTE(review): this module defines ``bwafilter`` a second time further
    down; the later definition is the one callers get after import.

    :param bwabin: bwa bin path
    :param reffile: reference file given to ``bwa mem``
    :param inputfile: sequence or reads file
    :param minas: minimum accepted AS score (inclusive)
    :param maxxs: XS score must be strictly below this value
    :param threadnumber: number of threads
    :return: list of tab-joined strings ``sequence<TAB>refname<TAB>position``
    """
    bwabin = subprocesspath.subprocesspath(bwabin)
    reffile = subprocesspath.subprocesspath(reffile)
    inputfile = subprocesspath.subprocesspath(inputfile)

    bwacmd = ' '.join([bwabin, 'mem', '-O', ' 0', ' -B', ' 0', ' -E', ' 0', ' -k', ' 5',
                       '-t', str(threadnumber), reffile, inputfile])
    print(bwacmd)

    # Raw strings avoid the invalid "\d" escape; "\d+" requires at least
    # one digit so int() never receives an empty string.
    aspat = re.compile(r'AS:i:(\d+)')
    xspat = re.compile(r'XS:i:(\d+)')

    res = list()

    # The context manager closes stdout and waits for bwa to exit.
    with Popen(bwacmd, shell=True, stdout=PIPE) as runbwaalign:
        for rawline in runbwaalign.stdout:
            lin = rawline.decode('utf-8').rstrip('\n')

            # Header lines start with '@' and carry no alignment.
            if lin.startswith('@'):
                continue

            infor = lin.split('\t')

            # Blank or truncated lines cannot be indexed up to column 10.
            if len(infor) < 10:
                continue

            asmatch = aspat.search(lin)
            xsmatch = xspat.search(lin)

            # Both scores are required to judge the record.
            if not asmatch or not xsmatch:
                continue

            asscore = int(asmatch.group(1))
            xsscore = int(xsmatch.group(1))

            if asscore >= minas and xsscore < maxxs:
                seqname = infor[2]
                start = infor[3]
                probeseq = infor[9]
                res.append('\t'.join([probeseq, seqname, start]))

    return res
def jfseqkmercount(jfpath, jfkmerfile, mer, sequence, bfcount=False):
    """
    Score every k-mer window of a sequence from its jellyfish count.

    A single ``jellyfish query -i -l`` process is started; each window is
    written to its stdin as one line and one reply line is read back.
    Counts are folded into a small score: 0 stays 0, 1 and 2 become 1,
    anything above 2 becomes 2.

    :param jfpath: jellyfish bin path
    :param jfkmerfile: jellyfish kmer count file
    :param mer: int, kmer
    :param sequence: string, sequence for kmerscore count
    :param bfcount: not used by this function
    :return: list, kmerscore list (one score per window, left to right)
    """
    seqlen = len(sequence)

    jfpath = subprocesspath.subprocesspath(jfpath)
    jfkmerfile = subprocesspath.subprocesspath(jfkmerfile)

    jfquerycommand = ' '.join([jfpath, 'query', '-i', '-l', jfkmerfile])
    print(jfquerycommand)

    mer = int(mer)
    jfkmercount = list()

    # The context manager guarantees the pipes are closed and the child is
    # reaped even when a reply cannot be parsed (e.g. jellyfish died).
    with subprocess.Popen(jfquerycommand, shell=True,
                          stdout=subprocess.PIPE, stdin=subprocess.PIPE) as kmerct:
        # Windows start at 0, 1, ... up to the last full-length window.
        for start in range(seqlen - mer + 1):
            subseq = sequence[start:start + mer] + '\n'

            kmerct.stdin.write(subseq.encode('ascii'))
            # Flush so the query reaches jellyfish before we block on the reply.
            kmerct.stdin.flush()

            lin = kmerct.stdout.readline().decode('utf-8').rstrip('\n')
            number = int(lin)

            # Fold the raw count into the 0/1/2 score.
            if number == 2:
                number = 1
            elif number > 2:
                number = 2

            jfkmercount.append(number)

        # End of input lets the query process terminate.
        kmerct.stdin.close()

    return jfkmercount
def jfprobekmerfilter(jfpbkfruner):
    """
    Decide whether a probe passes the k-mer count limits.

    Every k-mer window of the probe sequence is looked up through one
    ``jellyfish query -i -l`` process. The probe is rejected as soon as any
    count is ``>= maxk`` or ``<= mink``; all windows are still queried
    because the sum of the counts is reported as well.

    :param jfpbkfruner: object with the attributes read here:
        ``probe`` (tab-separated string: chromosome, start, end, sequence),
        ``jfpath`` (jellyfish bin path),
        ``jfkmerfile`` (jellyfish kmer count file),
        ``mer`` (int, kmer),
        ``maxk`` (max kmer score) and ``mink`` (min kmer score)
    :return: dict with keys ``chro``, ``start``, ``end``, ``seq``,
             ``keep`` (bool) and ``sumscore`` (sum of all window counts)
    """
    probeinfo = jfpbkfruner.probe.split('\t')
    sequence = probeinfo[3]
    seqlen = len(sequence)

    jfpath = subprocesspath.subprocesspath(jfpbkfruner.jfpath)
    jfkmerfile = subprocesspath.subprocesspath(jfpbkfruner.jfkmerfile)

    jfquerycommand = ' '.join([jfpath, 'query', '-i', '-l', jfkmerfile])

    mer = int(jfpbkfruner.mer)
    jfkmercount = list()
    keep = True

    # The context manager guarantees the pipes are closed and the child is
    # reaped even when a reply cannot be parsed (e.g. jellyfish died).
    with subprocess.Popen(jfquerycommand, shell=True,
                          stdout=subprocess.PIPE, stdin=subprocess.PIPE) as kmerct:
        # Windows start at 0, 1, ... up to the last full-length window.
        for start in range(seqlen - mer + 1):
            subseq = sequence[start:start + mer] + '\n'

            kmerct.stdin.write(subseq.encode('ascii'))
            # Flush so the query reaches jellyfish before we block on the reply.
            kmerct.stdin.flush()

            lin = kmerct.stdout.readline().decode('utf-8').rstrip('\n')
            number = int(lin)

            # Both limits are exclusive: hitting either one rejects the probe.
            if number >= jfpbkfruner.maxk or number <= jfpbkfruner.mink:
                keep = False

            jfkmercount.append(number)

        # End of input lets the query process terminate.
        kmerct.stdin.close()

    jfprobefileter = dict()
    jfprobefileter['chro'] = probeinfo[0]
    jfprobefileter['start'] = probeinfo[1]
    jfprobefileter['end'] = probeinfo[2]
    jfprobefileter['seq'] = probeinfo[3]
    jfprobefileter['keep'] = keep
    jfprobefileter['sumscore'] = sum(jfkmercount)

    return jfprobefileter
def bwafilter(bwabin, reffile, inputfile, minas, maxxs ,threadnumber=1 ):
    """
    Align sequences with bwa mem and keep those passing AS/XS thresholds.

    A record is kept when its ``AS:i`` value is at least ``minas`` and its
    ``XS:i`` value is below ``maxxs``; records lacking either tag are
    skipped.

    NOTE(review): ``bwafilter`` is also defined earlier in this module;
    this later definition replaces it at import time.

    :param bwabin: bwa bin path
    :param reffile: reference file given to ``bwa mem``
    :param inputfile: sequence or reads file
    :param minas: minimum accepted AS score (inclusive)
    :param maxxs: XS score must be strictly below this value
    :param threadnumber: number of threads
    :return: list of tab-joined strings ``sequence<TAB>refname<TAB>position``
    """
    bwabin = subprocesspath.subprocesspath(bwabin)
    reffile = subprocesspath.subprocesspath(reffile)
    inputfile = subprocesspath.subprocesspath(inputfile)

    bwacmd = ' '.join([bwabin, 'mem', '-O', ' 0', ' -B', ' 0', ' -E', ' 0', ' -k', ' 5',
                       '-t', str(threadnumber), reffile, inputfile])
    print(bwacmd)

    # The previous pattern "(\d.)" captured exactly two characters, which
    # truncated scores of three or more digits and missed a single-digit
    # score at the end of a line. "\d+" captures the whole number.
    aspat = re.compile(r'AS:i:(\d+)')
    xspat = re.compile(r'XS:i:(\d+)')

    res = list()

    # The context manager closes stdout and waits for bwa to exit.
    with Popen(bwacmd, shell=True, stdout=PIPE) as runbwaalign:
        for rawline in runbwaalign.stdout:
            lin = rawline.decode('utf-8').rstrip('\n')

            # Header lines start with '@' and carry no alignment.
            if lin.startswith('@'):
                continue

            infor = lin.split('\t')

            # Blank or truncated lines cannot be indexed up to column 10.
            if len(infor) < 10:
                continue

            asmatch = aspat.search(lin)
            xsmatch = xspat.search(lin)

            # Both scores are required to judge the record.
            if not asmatch or not xsmatch:
                continue

            asscore = int(asmatch.group(1))
            xsscore = int(xsmatch.group(1))

            if asscore >= minas and xsscore < maxxs:
                seqname = infor[2]
                start = infor[3]
                probeseq = infor[9]
                res.append('\t'.join([probeseq, seqname, start]))

    return res
def bwa_mem(bwabin, reffile, inputfile, threadnumber=1):
    """
    Align sequences with bwa mem and summarise each aligned record.

    Query names are expected to look like ``<chr>_<start>_<end>``; the
    last two underscore-separated parts are taken as start and end, so the
    chromosome part may itself contain underscores. The aligned length and
    the identity are derived from the record's ``MD:Z`` tag.

    :param bwabin: bwa bin path
    :param reffile: reference file given to ``bwa mem``
    :param inputfile: sequence or reads file
    :param threadnumber: number of threads
    :return: list of comma-joined strings with the columns
             record number, sequence, query chr, query start, query end,
             the constant ``0.99``, reference name, reference start,
             reference end, identity (two decimals).
             Records without an ``MD:Z`` tag (e.g. unaligned ones) are
             left out, but still advance the record number.
    :raises ValueError: when a query name has fewer than three
                        underscore-separated parts
    """
    bwabin = subprocesspath.subprocesspath(bwabin)
    reffile = subprocesspath.subprocesspath(reffile)
    inputfile = subprocesspath.subprocesspath(inputfile)

    bwacmd = ' '.join([
        bwabin, 'mem', '-O', ' 0', ' -B', ' 0', ' -E', ' 0', ' -k', ' 5', '-t',
        str(threadnumber), reffile, inputfile
    ])
    print(bwacmd)

    res = list()
    idx = 0

    # The context manager closes stdout and waits for bwa to exit.
    with Popen(bwacmd, shell=True, stdout=PIPE) as runbwaalign:
        for rawline in runbwaalign.stdout:
            lin = rawline.decode('utf-8').rstrip('\n')

            # Header lines start with '@' and carry no alignment.
            if lin.startswith('@'):
                continue

            # Record number counts every non-header line, kept or not.
            idx = idx + 1

            infor = lin.split('\t')

            # Blank or truncated lines cannot be indexed up to column 10.
            if len(infor) < 10:
                continue

            # Find the MD tag by name; its column is not fixed, and a
            # record without it previously led to a division by zero.
            md = None
            for tag in infor[11:]:
                if tag.startswith('MD:Z:'):
                    md = tag[5:]
                    break
            if md is None:
                continue

            # Numbers in MD are runs of matching bases; every letter is a
            # mismatched or deleted reference base ('^' only marks the
            # start of a deletion and is not counted).
            aln_matches = sum(int(run) for run in re.findall(r'\d+', md))
            aln_mismatches = len(re.findall(r'[A-Za-z]', md))
            aln_length = aln_matches + aln_mismatches

            if aln_length == 0:
                continue

            identity = aln_matches / aln_length

            query_name = infor[0]
            query_chr, query_st, query_ed = query_name.rsplit('_', 2)

            seqname = infor[2]
            start = infor[3]
            probeseq = infor[9]

            # Inclusive end coordinate on the reference.
            end = str(int(start) + aln_length - 1)

            res.append(','.join([
                str(idx), probeseq, query_chr, query_st, query_ed,
                # Constant column; its meaning is not evident from this
                # file -- NOTE(review): confirm with the consumer.
                '0.99',
                seqname, start, end, f'{identity:.2f}'
            ]))

    return res