예제 #1
0
파일: bwa.py 프로젝트: forrestzhang/Chorus
def bwareflength(bwabin, reffile):
    """
    Collect reference sequence lengths from the SAM header printed by bwa mem.

    Runs ``<bwabin> mem <reffile> -`` through the shell, writes a short dummy
    sequence to its stdin and parses every ``@SQ`` header line of the output.
    Both paths are passed through ``subprocesspath.subprocesspath`` before
    they are placed on the command line.

    :param bwabin: bwa bin path
    :param reffile: reference file, make by bwa index
    :return: dict, sequence name (``SN`` tag) -> sequence length (``LN`` tag, int)
    """

    bwabin = subprocesspath.subprocesspath(bwabin)

    reffile = subprocesspath.subprocesspath(reffile)

    bwacmd = ' '.join([bwabin, 'mem',  reffile, '-'])

    runbwaalign = Popen(bwacmd, shell=True, stdout=PIPE, stdin=PIPE)

    # Dummy input for the '-' (stdin) read file; only header lines of the
    # output are used below.
    runbwaalign.stdin.write('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'.encode('ascii'))

    runbwaalign.stdin.close()

    seqlength = dict()

    try:

        for rawline in runbwaalign.stdout:

            line = rawline.decode("utf-8").rstrip('\n')

            # Header records start with the tag; anchoring avoids matching
            # '@SQ' that merely occurs somewhere inside another line.
            if not line.startswith('@SQ'):

                continue

            # Read the TAG:value pairs by name so that extra tags or a
            # different tag order on the @SQ line do not break the parse.
            tags = dict()

            for field in line.split('\t')[1:]:

                key, sep, value = field.partition(':')

                if sep:

                    tags[key] = value

            if 'SN' in tags and 'LN' in tags:

                seqlength[tags['SN']] = int(tags['LN'])

    finally:

        # Always release the pipe and reap the child process.
        runbwaalign.stdout.close()

        runbwaalign.wait()

    return seqlength
예제 #2
0
def bamtobcf(bcfbin, reffile, bamfile, outbcf):
    """
    Call variants from a BAM file into a BCF file and index the result.

    Two shell commands are printed and run one after the other:
    ``mpileup`` piped into ``call`` writing ``outbcf``, then ``index`` on
    ``outbcf``. All four arguments are passed through
    ``subprocesspath.subprocesspath`` first.

    :param bcfbin: bcftools bin path
    :param reffile: reference fasta handed to ``mpileup -f``
    :param bamfile: input BAM file
    :param outbcf: output BCF file
    :return: True
    """
    bcfbin = subprocesspath.subprocesspath(bcfbin)
    reffile = subprocesspath.subprocesspath(reffile)
    bamfile = subprocesspath.subprocesspath(bamfile)
    outbcf = subprocesspath.subprocesspath(outbcf)

    pileup_stage = ' '.join(
        [bcfbin, ' mpileup -E -d 500 -L 500 -Ou -f', reffile, bamfile])
    call_stage = ' '.join([bcfbin, ' call -cv -Ob -o', outbcf])

    commands = (
        ' '.join([pileup_stage, '| ', call_stage]),
        ' '.join([bcfbin, ' index', outbcf]),
    )

    # Each command is echoed, then run to completion before the next starts.
    for shell_command in commands:
        print(shell_command)
        Popen(shell_command, shell=True).communicate()

    return True
예제 #3
0
파일: bwa.py 프로젝트: zhangtaolab/Chorus2
def bwaalign(bwabin, reffile, inputfile, outfile, threadnumber=1):
    """
    Run a bwa mem alignment and redirect its output into a file.

    The command is printed and then executed through the shell; it has the
    shape ``bwa mem -O 0 -B 0 -E 0 -k 5 -t N reference reads > outfile``.
    Every path argument is passed through ``subprocesspath.subprocesspath``.

    :param bwabin: bwa bin path
    :param reffile: reference file, make by bwa index
    :param inputfile: sequence or reads file
    :param outfile: samfile
    :param threadnumber: number of threads
    :return: True
    """
    bwabin = subprocesspath.subprocesspath(bwabin)
    reffile = subprocesspath.subprocesspath(reffile)
    inputfile = subprocesspath.subprocesspath(inputfile)
    outfile = subprocesspath.subprocesspath(outfile)

    # Fixed scoring/seed options, kept with their original spacing so the
    # printed command line is unchanged.
    scoring_options = '-O  0  -B  0  -E  0  -k  5'

    bwacmd = ' '.join([
        bwabin, 'mem', scoring_options,
        '-t', str(threadnumber),
        reffile, inputfile,
        '>', outfile,
    ])

    print(bwacmd)

    Popen(bwacmd, shell=True).communicate()

    return True
예제 #4
0
파일: bwa.py 프로젝트: forrestzhang/Chorus
def bwaalign(bwabin, reffile, inputfile, outfile, threadnumber=1):
    """
    Align reads with bwa mem and write the result to ``outfile``.

    Builds the shell command piece by piece, prints it, runs it and waits for
    it to finish. Example of the option part of the command:
    ``bwa mem -O 0 -B 0 -E 0 -k 5 ../DM_404.fa oligo_tmp2.fa``.

    :param bwabin: bwa bin path
    :param reffile: reference file, make by bwa index
    :param inputfile: sequence or reads file
    :param outfile: samfile
    :param threadnumber: number of threads
    :return: True
    """
    bwabin = subprocesspath.subprocesspath(bwabin)
    reffile = subprocesspath.subprocesspath(reffile)
    inputfile = subprocesspath.subprocesspath(inputfile)
    outfile = subprocesspath.subprocesspath(outfile)

    command_parts = [bwabin, 'mem']
    # Scoring/seed options (tokens keep their original embedded spaces).
    command_parts += ['-O', ' 0', ' -B', ' 0', ' -E', ' 0', ' -k', ' 5']
    command_parts += ['-t', str(threadnumber)]
    # Positional arguments followed by the shell redirection.
    command_parts += [reffile, inputfile, '>', outfile]

    bwacmd = ' '.join(command_parts)
    print(bwacmd)

    aligner = Popen(bwacmd, shell=True)
    aligner.communicate()

    return True
예제 #5
0
파일: bwa.py 프로젝트: zhangtaolab/Chorus2
def bwaloci(bwabin, reffile, inputfile, threadnumber=1):
    """
    Align sequences with bwa mem and report where each one was placed.

    The command is printed, run through the shell, and its SAM output is read
    from stdout. Header lines (starting with ``@``) are ignored. For every
    other line the tab-separated columns 10, 3 and 4 (1-based) are kept.

    :param bwabin: bwa bin path
    :param reffile: reference file, make by bwa index
    :param inputfile: sequence or reads file
    :param threadnumber: number of threads
    :return: list of ``'<sequence>\\t<reference name>\\t<start>'`` strings
    """

    bwabin = subprocesspath.subprocesspath(bwabin)
    reffile = subprocesspath.subprocesspath(reffile)
    inputfile = subprocesspath.subprocesspath(inputfile)

    bwacmd = ' '.join([bwabin, 'mem', '-O',' 0',' -B',' 0',' -E',' 0',' -k',' 5', '-t',str(threadnumber), reffile, inputfile])

    print(bwacmd)

    runbwaalign = Popen(bwacmd, shell=True, stdout=PIPE)

    res = list()

    try:

        # Stream the output line by line instead of buffering all of it.
        for rawline in runbwaalign.stdout:

            lin = rawline.decode('utf-8').rstrip('\n')

            # Skip blank lines and SAM header lines.
            if not lin or lin.startswith('@'):

                continue

            infor = lin.split('\t')

            seqname = infor[2]

            start = infor[3]

            probeseq = infor[9]

            res.append('\t'.join([probeseq, seqname, start]))

    finally:

        # Always release the pipe and reap the child process.
        runbwaalign.stdout.close()

        runbwaalign.wait()

    return res
예제 #6
0
파일: bwa.py 프로젝트: forrestzhang/Chorus
def bwaloci(bwabin, reffile, inputfile, threadnumber=1):
    """
    Run bwa mem and list the reported location of every aligned sequence.

    The shell command is printed and executed; its stdout is parsed as SAM
    text. Lines starting with ``@`` are headers and are skipped. From each
    remaining line the sequence (column 10), reference name (column 3) and
    start position (column 4) are taken.

    :param bwabin: bwa bin path
    :param reffile: reference file, make by bwa index
    :param inputfile: sequence or reads file
    :param threadnumber: number of threads
    :return: list of tab-joined ``sequence, reference name, start`` strings
    """

    bwabin = subprocesspath.subprocesspath(bwabin)
    reffile = subprocesspath.subprocesspath(reffile)
    inputfile = subprocesspath.subprocesspath(inputfile)

    bwacmd = ' '.join([bwabin, 'mem', '-O',' 0',' -B',' 0',' -E',' 0',' -k',' 5', '-t',str(threadnumber), reffile, inputfile])

    print(bwacmd)

    runbwaalign = Popen(bwacmd, shell=True, stdout=PIPE)

    res = list()

    try:

        # Iterating the pipe avoids holding the whole SAM output in memory.
        for rawline in runbwaalign.stdout:

            lin = rawline.decode('utf-8').rstrip('\n')

            if not lin or lin.startswith('@'):

                # Blank line or SAM header: nothing to report.
                continue

            infor = lin.split('\t')

            res.append('\t'.join([infor[9], infor[2], infor[3]]))

    finally:

        # Close the pipe and wait so the child process is not left behind,
        # even when a line fails to parse.
        runbwaalign.stdout.close()

        runbwaalign.wait()

    return res
예제 #7
0
파일: bwa.py 프로젝트: zhangtaolab/Chorus2
def bwareflength(bwabin, reffile):
    """
    Read reference sequence names and lengths from bwa mem's SAM header.

    ``<bwabin> mem <reffile> -`` is started through the shell with a short
    dummy sequence on stdin; every ``@SQ`` line of its output contributes one
    entry to the result. Both paths go through
    ``subprocesspath.subprocesspath`` before being used.

    :param bwabin: bwa bin path
    :param reffile: reference file, make by bwa index
    :return: dict mapping the ``SN`` value (str) to the ``LN`` value (int)
    """

    bwabin = subprocesspath.subprocesspath(bwabin)

    reffile = subprocesspath.subprocesspath(reffile)

    bwacmd = ' '.join([bwabin, 'mem',  reffile, '-'])

    runbwaalign = Popen(bwacmd, shell=True, stdout=PIPE, stdin=PIPE)

    # Placeholder input for the stdin read file; alignment records in the
    # output are ignored, only @SQ header lines are parsed.
    runbwaalign.stdin.write('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'.encode('ascii'))

    runbwaalign.stdin.close()

    seqlength = dict()

    try:

        for rawline in runbwaalign.stdout:

            line = rawline.decode("utf-8").rstrip('\n')

            # Only lines that *begin* with @SQ are sequence dictionary
            # records.
            if not line.startswith('@SQ'):

                continue

            # Look the tags up by name: a fixed three-column unpack fails as
            # soon as the line carries any additional tag.
            tags = dict()

            for field in line.split('\t')[1:]:

                key, sep, value = field.partition(':')

                if sep:

                    tags[key] = value

            if 'SN' in tags and 'LN' in tags:

                seqlength[tags['SN']] = int(tags['LN'])

    finally:

        # Close the pipe and reap the child even if parsing raised.
        runbwaalign.stdout.close()

        runbwaalign.wait()

    return seqlength
예제 #8
0
def jfseqkmercountforfilter(jfpath, jfkmerfile, mer, sequence, bfcount=False):
    """
    Look up the jellyfish count of every k-mer window of a sequence.

    A single ``jellyfish query -i -l`` process is started; each window of
    length ``mer`` is written to it on its own line and one count line is
    read back per window.

    :param jfpath: jellyfish bin path
    :param jfkmerfile: jellyfish kmer count file
    :param mer: int, kmer
    :param sequence: string, sequence for kmerscore count
    :param bfcount: not used by this function
    :return: list, one int count per window, in sequence order
    """

    total_length = len(sequence)

    jfpath = subprocesspath.subprocesspath(jfpath)
    jfkmerfile = subprocesspath.subprocesspath(jfkmerfile)

    jfquerycommand = ' '.join([jfpath, 'query', '-i', '-l', jfkmerfile])
    print(jfquerycommand)

    query_process = subprocess.Popen(
        jfquerycommand,
        shell=True,
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
    )

    mer = int(mer)

    window_counts = []

    # One window per start offset; there are total_length - mer + 1 of them.
    for window_start in range(total_length - mer + 1):
        kmer_line = sequence[window_start:window_start + mer] + '\n'

        # Write, flush, then read: the exchange is strictly one line out,
        # one line back.
        query_process.stdin.write(kmer_line.encode('ascii'))
        query_process.stdin.flush()

        answer = query_process.stdout.readline().decode('utf-8').rstrip('\n')
        window_counts.append(int(answer))

    query_process.stdin.close()
    query_process.stdout.close()
    query_process.wait()

    return window_counts
예제 #9
0
def jfcount(jfpath, mer, output, infile, threads=1, size='100M', lowercount=2):
    """
    Run ``jellyfish count --canonical`` on a sequence file.

    ``lowercount`` is handed to ``-L``; with the default of 2, only k-mers
    seen at least twice are kept (a k-mer seen once gets score 0).
    The command is printed before it is executed through the shell.

    :param jfpath: jellyfish bin path
    :param mer: k-mer length, passed to ``-m``
    :param output: output file, passed to ``-o``
    :param infile: input sequence file
    :param threads: number of threads, passed to ``-t``
    :param size: hash size, passed to ``-s``
    :param lowercount: lower count bound, passed to ``-L``
    :return: True when the command exits with status 0, otherwise False
    """

    jfpath = subprocesspath.subprocesspath(jfpath)

    output = subprocesspath.subprocesspath(output)

    infile = subprocesspath.subprocesspath(infile)

    jfcountcommand = ' '.join([
        jfpath, 'count', '--canonical', '-m',
        str(mer), '-L',
        str(lowercount), '-t',
        str(threads), '-o',
        str(output), '-s',
        str(size), infile
    ])

    print(jfcountcommand)

    p = subprocess.Popen(jfcountcommand, shell=True)

    try:

        p.communicate()

    except Exception:

        # Make sure the child is gone before reporting the failure.
        p.kill()

        p.communicate()

        print("Something wrong in jellyfish count")

        return False

    # A failing jellyfish run does not raise; it only shows in the exit code.
    if p.returncode != 0:

        print("Something wrong in jellyfish count")

        return False

    return True
예제 #10
0
def jfquerylist(jfpath, jfkmerfile, seqlist, bfcount=False):
    """
    Query jellyfish for the count of each sequence in a list.

    One ``jellyfish query -i -l`` process serves all queries: every sequence
    is written on its own line and one count line is read back for it.

    :param jfpath: jellyfish bin path
    :param jfkmerfile: jellyfish kmer count file
    :param seqlist: list of sequences
    :param bfcount: not used by this function
    :return: list of ``'<sequence>,<count>'`` strings, in input order
    """

    jfpath = subprocesspath.subprocesspath(jfpath)
    jfkmerfile = subprocesspath.subprocesspath(jfkmerfile)

    jfquerycommand = ' '.join([jfpath, 'query', '-i', '-l', jfkmerfile])
    print(jfquerycommand)

    query_process = subprocess.Popen(
        jfquerycommand,
        shell=True,
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
    )

    results = []

    for entry in seqlist:
        query_line = entry + '\n'

        # Strict request/response: send one line, flush, read one line.
        query_process.stdin.write(query_line.encode('ascii'))
        query_process.stdin.flush()

        reply = query_process.stdout.readline().decode('utf-8').rstrip('\n')
        count = int(reply)

        # Trailing whitespace of the query is dropped in the reported record.
        results.append(query_line.rstrip() + ',' + str(count))

    query_process.stdin.close()
    query_process.stdout.close()
    query_process.wait()

    return results
예제 #11
0
def jfgeneratorscount(jfpath, mer, output, generators, threads=1, size='100M'):
    """
    Run ``jellyfish count --canonical`` with a generators file (``-g``).

    The command is printed before it is executed through the shell.

    :param jfpath: jellyfish bin path
    :param mer: k-mer length, passed to ``-m``
    :param output: output file, passed to ``-o``
    :param generators: generators file, passed to ``-g``
    :param threads: number of threads, passed to ``-t``
    :param size: hash size, passed to ``-s``
    :return: True when the command exits with status 0, otherwise False
    """

    jfpath = subprocesspath.subprocesspath(jfpath)

    output = subprocesspath.subprocesspath(output)

    generators = subprocesspath.subprocesspath(generators)

    jfcountcommand = ' '.join([
        jfpath, 'count', '--canonical', '-m',
        str(mer), '-g', generators, '-t',
        str(threads), '-o',
        str(output), '-s',
        str(size)
    ])

    print(jfcountcommand)

    p = subprocess.Popen(jfcountcommand, shell=True)

    try:

        p.communicate()

    except Exception:

        # Make sure the child is gone before reporting the failure.
        p.kill()

        p.communicate()

        print("Something wrong in jellyfish count")

        return False

    # A failing jellyfish run does not raise; it only shows in the exit code.
    if p.returncode != 0:

        print("Something wrong in jellyfish count")

        return False

    return True
예제 #12
0
def jfcount(jfpath, mer, output, infile,threads=1,  size='100M', lowercount=2):
    """
    Count canonical k-mers of a sequence file with ``jellyfish count``.

    ``lowercount`` is handed to ``-L``; with the default of 2, only k-mers
    seen at least twice are kept (a k-mer seen once gets score 0).
    The command is printed, then run through the shell.

    :param jfpath: jellyfish bin path
    :param mer: k-mer length, passed to ``-m``
    :param output: output file, passed to ``-o``
    :param infile: input sequence file
    :param threads: number of threads, passed to ``-t``
    :param size: hash size, passed to ``-s``
    :param lowercount: lower count bound, passed to ``-L``
    :return: True when the command exits with status 0, otherwise False
    """

    jfpath = subprocesspath.subprocesspath(jfpath)

    output = subprocesspath.subprocesspath(output)

    infile = subprocesspath.subprocesspath(infile)

    jfcountcommand = ' '.join([jfpath, 'count', '--canonical', '-m', str(mer), '-L', str(lowercount),
                               '-t', str(threads), '-o', str(output),  '-s', str(size),  infile])

    print(jfcountcommand)

    p = subprocess.Popen(jfcountcommand, shell=True)

    try:

        p.communicate()

    except Exception:

        # Reap the child before reporting the failure.
        p.kill()

        p.communicate()

        print("Something wrong in jellyfish count")

        return False

    # communicate() does not raise when the tool itself fails, so the exit
    # status has to be checked explicitly.
    if p.returncode != 0:

        print("Something wrong in jellyfish count")

        return False

    return True
예제 #13
0
파일: bwa.py 프로젝트: zhangtaolab/Chorus2
def bwamem_paired(bwabin, samtoolsbin, reffile, outfile, inputfile1, inputfile2, samplename, threadnumber=1):
    """
    Align paired reads with bwa mem, sort the result with samtools and index it.

    Two shell commands are printed and run in turn:
    ``bwa mem -M -R <read group> -t N ref reads1 reads2 | samtools sort -@ N
    -o outfile`` and ``samtools index outfile``. The read group string uses
    ``samplename`` for both ``ID`` and ``SM``.

    :param bwabin: bwa bin path
    :param samtoolsbin: samtools bin path
    :param reffile: reference file, make by bwa index
    :param outfile: sorted output file written by ``samtools sort -o``
    :param inputfile1: first reads file
    :param inputfile2: second reads file
    :param samplename: sample name placed in the ``@RG`` line
    :param threadnumber: number of threads for both bwa and samtools sort
    :return: True
    """
    bwabin = subprocesspath.subprocesspath(bwabin)

    samtoolsbin = subprocesspath.subprocesspath(samtoolsbin)

    reffile = subprocesspath.subprocesspath(reffile)

    # Both read files get the same path conversion as every other path
    # argument, and the converted values are the ones used in the command.
    inputfile1 = subprocesspath.subprocesspath(inputfile1)

    inputfile2 = subprocesspath.subprocesspath(inputfile2)

    outfile = subprocesspath.subprocesspath(outfile)

    samplestr = '\'@RG\\tID:' + samplename + '\\tSM:' + samplename + '\\tLB:WGS\\tPL:Illumina\''

    bwacmd = ' '.join(
        [bwabin, 'mem', '-M', '-R', samplestr, '-t', str(threadnumber), reffile, inputfile1, inputfile2, '| ',
         samtoolsbin, 'sort -@', str(threadnumber), '-o', outfile])

    print(bwacmd)

    runbwaalign = Popen(bwacmd, shell=True)

    runbwaalign.communicate()

    samidxcmd = ' '.join([samtoolsbin, 'index', outfile])

    print(samidxcmd)

    samidx = Popen(samidxcmd, shell=True)

    samidx.communicate()

    return True
예제 #14
0
def getconsensus(bcftoolspath,
                 bcffile,
                 chrom,
                 start,
                 end,
                 seq,
                 sample,
                 strand='+'):
    """
    Get a consensus sequence by piping a one-record fasta into bcftools.

    The shell command ``echo '>chrom:start-end\\nSEQ' | bcftools consensus
    -s sample bcffile`` is run and its output is scanned for a run of
    A/T/C/G characters of at least ``len(seq) - 10`` bases (``len(seq)`` when
    that would be below 10). The last matching run wins.

    :param bcftoolspath: bcftools bin path
    :param bcffile: BCF file handed to ``bcftools consensus``
    :param chrom: chromosome name used in the fasta header
    :param start: start coordinate used in the fasta header (str or int)
    :param end: end coordinate used in the fasta header (str or int)
    :param seq: sequence to build the consensus for
    :param sample: sample name, passed to ``-s``
    :param strand: ``'-'`` reverse-complements ``seq`` via ``revcom.revcom``
    :return: str, the consensus, or ``'N' * len(seq)`` when nothing matched
        or the command could not be run
    """
    bcftoolspath = subprocesspath.subprocesspath(bcftoolspath)
    bcffile = subprocesspath.subprocesspath(bcffile)
    mathlen = len(seq) - 10
    if mathlen < 10:
        mathlen = len(seq)
    seqlen = str(mathlen)
    pat = re.compile('[ATCG]{' + seqlen + ',}')
    if strand == '-':
        seq = revcom.revcom(seq)
    # str() lets callers pass integer coordinates as well as strings.
    fastring = ('\'>' + chrom + ':' + str(start) + '-' + str(end) + '\\n' +
                seq + '\'')
    bcfcon_command = ' '.join([
        'echo', fastring, '|' + bcftoolspath + ' consensus -s', sample, bcffile
    ])

    consensus = 'N' * len(seq)

    p = None

    try:
        p = Popen(bcfcon_command, shell=True, stdin=PIPE, stdout=PIPE)

        for rawline in p.stdout:
            line = rawline.decode('utf-8').rstrip('\n')
            found = pat.search(line)
            if found:
                consensus = found.group(0)
    except Exception:
        # Best effort: keep the all-N placeholder, but let KeyboardInterrupt
        # and SystemExit propagate.
        print("warnning: ", bcfcon_command, " ##")
    finally:
        if p is not None:
            # Release both pipes and reap the child process.
            p.stdin.close()
            p.stdout.close()
            p.wait()

    return str(consensus)
예제 #15
0
파일: bwa.py 프로젝트: zhangtaolab/Chorus2
def bwafilter(bwabin, reffile, inputfile, minas, maxxs ,threadnumber=1 ):
    """
    Align sequences with bwa mem and keep those passing AS/XS score limits.

    The command is printed, run through the shell, and its SAM output is read
    line by line. Header lines (starting with ``@``) are ignored, and so are
    records that lack an ``AS:i:`` or an ``XS:i:`` tag. A record is kept when
    ``AS >= minas`` and ``XS < maxxs``.

    :param bwabin: bwa bin path
    :param reffile: reference file, make by bwa index
    :param inputfile: sequence or reads file
    :param minas: minimum accepted ``AS`` score (inclusive)
    :param maxxs: upper bound for the ``XS`` score (exclusive)
    :param threadnumber: number of threads
    :return: list of ``'<sequence>\\t<reference name>\\t<start>'`` strings
    """

    bwabin = subprocesspath.subprocesspath(bwabin)

    reffile = subprocesspath.subprocesspath(reffile)

    inputfile = subprocesspath.subprocesspath(inputfile)

    bwacmd = ' '.join([bwabin, 'mem', '-O',' 0',' -B',' 0',' -E',' 0',' -k',' 5', '-t',str(threadnumber), reffile, inputfile])

    print(bwacmd)

    # Raw strings avoid the invalid-escape warning for '\d'; requiring at
    # least one digit guarantees int() never receives an empty capture.
    aspat = re.compile(r'AS:i:(-?\d+)')

    xspat = re.compile(r'XS:i:(-?\d+)')

    runbwaalign = Popen(bwacmd, shell=True, stdout=PIPE)

    res = list()

    try:

        # Stream the output instead of buffering it all with readlines().
        for rawline in runbwaalign.stdout:

            lin = rawline.decode('utf-8').rstrip('\n')

            # Skip blank lines and SAM header lines.
            if not lin or lin.startswith('@'):

                continue

            infor = lin.split('\t')

            seqname = infor[2]

            start = infor[3]

            probeseq = infor[9]

            asmatch = aspat.search(lin)

            xsmatch = xspat.search(lin)

            # Records without both score tags cannot be judged; skip them.
            if not asmatch or not xsmatch:

                continue

            asscore = int(asmatch.group(1))

            xsscore = int(xsmatch.group(1))

            if asscore >= minas and xsscore < maxxs:

                res.append('\t'.join([probeseq, seqname, start]))

    finally:

        # Release the pipe and reap the child even if parsing raised.
        runbwaalign.stdout.close()

        runbwaalign.wait()

    return res
예제 #16
0
def jfseqkmercount(jfpath, jfkmerfile, mer, sequence, bfcount=False):
    """
    Score every k-mer window of a sequence from its jellyfish count.

    A single ``jellyfish query -i -l`` process is started; each window of
    length ``mer`` is sent on its own line and one count line is read back.
    Counts are then rescaled: a count of exactly 2 becomes 1, any count above
    2 becomes 2, and smaller counts are kept unchanged.

    :param jfpath: jellyfish bin path
    :param jfkmerfile: jellyfish kmer count file
    :param mer: int, kmer
    :param sequence: string, sequence for kmerscore count
    :param bfcount: not used by this function
    :return: list, kmerscore list (one int per window)
    """

    total_length = len(sequence)

    jfpath = subprocesspath.subprocesspath(jfpath)
    jfkmerfile = subprocesspath.subprocesspath(jfkmerfile)

    jfquerycommand = ' '.join([jfpath, 'query', '-i', '-l', jfkmerfile])
    print(jfquerycommand)

    query_process = subprocess.Popen(
        jfquerycommand,
        shell=True,
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
    )

    mer = int(mer)

    scores = []

    # One window per start offset; there are total_length - mer + 1 of them.
    for window_start in range(total_length - mer + 1):
        kmer_line = sequence[window_start:window_start + mer] + '\n'

        # Strict request/response: send one line, flush, read one line.
        query_process.stdin.write(kmer_line.encode('ascii'))
        query_process.stdin.flush()

        reply = query_process.stdout.readline().decode('utf-8').rstrip('\n')
        raw_count = int(reply)

        if raw_count > 2:
            score = 2
        elif raw_count == 2:
            score = 1
        else:
            score = raw_count

        scores.append(score)

    query_process.stdin.close()
    query_process.stdout.close()
    query_process.wait()

    return scores
예제 #17
0
def jfprobekmerfilter(jfpbkfruner):
    """
    Decide whether a probe passes the k-mer count limits.

    The probe record is a tab-separated string whose first four columns are
    chromosome, start, end and sequence. Every window of length ``mer`` of
    the sequence is queried with ``jellyfish query -i -l``; the probe is
    rejected as soon as any window count is ``>= maxk`` or ``<= mink``
    (all windows are still queried so the score sum is complete).

    :param jfpbkfruner: object providing the attributes ``probe``,
        ``jfpath``, ``jfkmerfile``, ``mer``, ``maxk`` and ``mink``
    :return: dict with keys ``chro``, ``start``, ``end``, ``seq``, ``keep``
        (bool) and ``sumscore`` (sum of all window counts)
    """

    probe_fields = jfpbkfruner.probe.split('\t')
    probe_sequence = probe_fields[3]
    total_length = len(probe_sequence)

    jfpath = subprocesspath.subprocesspath(jfpbkfruner.jfpath)
    jfkmerfile = subprocesspath.subprocesspath(jfpbkfruner.jfkmerfile)

    jfquerycommand = ' '.join([jfpath, 'query', '-i', '-l', jfkmerfile])

    query_process = subprocess.Popen(
        jfquerycommand,
        shell=True,
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
    )

    mer = int(jfpbkfruner.mer)

    window_counts = []
    passes = True

    # One window per start offset; there are total_length - mer + 1 of them.
    for window_start in range(total_length - mer + 1):
        kmer_line = probe_sequence[window_start:window_start + mer] + '\n'

        # Strict request/response: send one line, flush, read one line.
        query_process.stdin.write(kmer_line.encode('ascii'))
        query_process.stdin.flush()

        reply = query_process.stdout.readline().decode('utf-8').rstrip('\n')
        count = int(reply)

        # Both limits are evaluated for every window, in this order.
        above_limit = count >= jfpbkfruner.maxk
        below_limit = count <= jfpbkfruner.mink
        if above_limit or below_limit:
            passes = False

        window_counts.append(count)

    query_process.stdin.close()
    query_process.stdout.close()
    query_process.wait()

    return {
        'chro': probe_fields[0],
        'start': probe_fields[1],
        'end': probe_fields[2],
        'seq': probe_fields[3],
        'keep': passes,
        'sumscore': sum(window_counts),
    }
예제 #18
0
파일: bwa.py 프로젝트: forrestzhang/Chorus
def bwafilter(bwabin, reffile, inputfile, minas, maxxs ,threadnumber=1 ):
    """
    Run bwa mem and keep the alignments that satisfy the AS/XS score limits.

    The shell command is printed and executed; its SAM output is parsed line
    by line. Lines starting with ``@`` are headers and are skipped, as are
    records without an ``AS:i:`` or an ``XS:i:`` tag. A record is reported
    when ``AS >= minas`` and ``XS < maxxs``.

    :param bwabin: bwa bin path
    :param reffile: reference file, make by bwa index
    :param inputfile: sequence or reads file
    :param minas: minimum accepted ``AS`` score (inclusive)
    :param maxxs: upper bound for the ``XS`` score (exclusive)
    :param threadnumber: number of threads
    :return: list of tab-joined ``sequence, reference name, start`` strings
    """

    bwabin = subprocesspath.subprocesspath(bwabin)

    reffile = subprocesspath.subprocesspath(reffile)

    inputfile = subprocesspath.subprocesspath(inputfile)

    bwacmd = ' '.join([bwabin, 'mem', '-O',' 0',' -B',' 0',' -E',' 0',' -k',' 5', '-t',str(threadnumber), reffile, inputfile])

    print(bwacmd)

    # Capture the whole integer. A pattern of "one digit plus any single
    # character" truncates scores of three or more digits and cannot match a
    # one-digit score at the very end of the line.
    aspat = re.compile(r'AS:i:(-?\d+)')

    xspat = re.compile(r'XS:i:(-?\d+)')

    runbwaalign = Popen(bwacmd, shell=True, stdout=PIPE)

    res = list()

    try:

        # Stream the output instead of buffering it all with readlines().
        for rawline in runbwaalign.stdout:

            lin = rawline.decode('utf-8').rstrip('\n')

            # Skip blank lines and SAM header lines.
            if not lin or lin.startswith('@'):

                continue

            infor = lin.split('\t')

            seqname = infor[2]

            start = infor[3]

            probeseq = infor[9]

            asmatch = aspat.search(lin)

            xsmatch = xspat.search(lin)

            # Records without both score tags cannot be judged; skip them.
            if not asmatch or not xsmatch:

                continue

            asscore = int(asmatch.group(1))

            xsscore = int(xsmatch.group(1))

            if asscore >= minas and xsscore < maxxs:

                res.append('\t'.join([probeseq, seqname, start]))

    finally:

        # Release the pipe and reap the child even if parsing raised.
        runbwaalign.stdout.close()

        runbwaalign.wait()

    return res
예제 #19
0
def bwa_mem(bwabin, reffile, inputfile, threadnumber=1):
    """
    Align sequences with bwa mem and summarise each alignment as a CSV row.

    The command is printed, run through the shell, and its SAM output is read
    line by line; header lines (starting with ``@``) are ignored. Query names
    must consist of three ``_``-separated parts (unpacked as chromosome,
    start, end). Identity is computed from the ``MD:Z:`` tag as
    ``matches / (matches + mismatches)``, where the numbers in the tag are
    matches and the letters (mismatched or deleted reference bases) are
    mismatches. Records without an ``MD:Z:`` tag are skipped.

    :param bwabin: bwa bin path
    :param reffile: reference file, make by bwa index
    :param inputfile: sequence or reads file
    :param threadnumber: number of threads
    :return: list of comma-joined rows: running index, sequence, query
        chromosome, query start, query end, the constant ``0.99``, reference
        name, alignment start, alignment end, identity (two decimals)
    """

    bwabin = subprocesspath.subprocesspath(bwabin)

    reffile = subprocesspath.subprocesspath(reffile)

    inputfile = subprocesspath.subprocesspath(inputfile)

    bwacmd = ' '.join([
        bwabin, 'mem', '-O', ' 0', ' -B', ' 0', ' -E', ' 0', ' -k', ' 5', '-t',
        str(threadnumber), reffile, inputfile
    ])

    print(bwacmd)

    # MD is made of match-run lengths (digits) separated by reference bases
    # (letters, optionally preceded by '^' for deletions). Any capital letter
    # is accepted so that e.g. 'N' does not end up inside an int() call.
    md_numbers = re.compile(r'\d+')

    md_bases = re.compile(r'[A-Z]+')

    runbwaalign = Popen(bwacmd, shell=True, stdout=PIPE)

    res = list()
    idx = 0

    try:

        # Stream the output instead of buffering it all with readlines().
        for rawline in runbwaalign.stdout:

            lin = rawline.decode('utf-8').rstrip('\n')

            # Skip blank lines and SAM header lines.
            if not lin or lin.startswith('@'):

                continue

            infor = lin.split('\t')

            # idx counts every alignment line, including skipped ones.
            idx = idx + 1

            query_name = infor[0]
            query_chr, query_st, query_ed = query_name.split('_')

            seqname = infor[2]

            start = infor[3]

            probeseq = infor[9]

            # Find the MD tag by name among the optional fields rather than
            # trusting a fixed column position.
            md = None

            for tag in infor[11:]:

                if tag.startswith('MD:Z:'):

                    md = tag[len('MD:Z:'):]

                    break

            if md is None:

                continue

            aln_matches = sum(int(item) for item in md_numbers.findall(md))

            aln_mismatches = sum(len(item) for item in md_bases.findall(md))

            aligned_span = aln_matches + aln_mismatches

            if aligned_span == 0:

                # Nothing aligned: identity is undefined.
                continue

            identity = aln_matches / aligned_span

            end = str(int(start) + aligned_span - 1)

            res.append(','.join([
                str(idx), probeseq, query_chr, query_st, query_ed, '0.99',
                seqname, start, end,
                f'{identity:.2f}'
            ]))

    finally:

        # Release the pipe and reap the child even if parsing raised.
        runbwaalign.stdout.close()

        runbwaalign.wait()

    return res