Ejemplo n.º 1
0
def grep_reads(read, f_libs, direction):
    """Run grep_reads.py for ``read`` against every library and collect hits.

    One background invocation of the grep_reads.py script is queued per entry
    of ``f_libs`` (arguments: ``read``, the library, ``direction``), followed
    by a shell ``wait``. The commands are written to ``grep_read.job`` and
    executed with bash. Everything under ``grep_out/`` is then concatenated
    into ``all_grep_reads.txt``.

    Returns whatever ``cmn.getid`` reads back from ``all_grep_reads.txt``.
    """
    script = '/project/biophysics/Nick_lab/wli/sequencing/scripts/grep_reads.py'
    job_lines = [
        '%s %s %s %s &' % (script, read, lib, direction) for lib in f_libs
    ]
    # Block until every backgrounded search has finished.
    job_lines.append('\nwait;\n')

    job_file = 'grep_read.job'
    cmn.write_lines(job_lines, job_file)
    cmn.run('bash %s ' % job_file)

    # The per-library results land in grep_out/; merge them into one file.
    merged = 'all_grep_reads.txt'
    cmn.run('cat grep_out/* > %s' % merged)
    return cmn.getid(merged)
Ejemplo n.º 2
0
    #options=parse_options()
    try:
        fref, fqlist = sys.argv[1:3]
    except:
        print("Usage: *.py sampleInfo.baits fqlist", file=sys.stderr)
        sys.exit()

    #add primer if not added
    ref_seqs, toAddDict = read_baits(fref)
    #log the baits into the dataset
    log_newBaits_ifPossible(ref_seqs)

    #index ref here
    frefs = parse_ref(ref_seqs)

    fqlist = cmn.getid(fqlist)
    fq_groups = group_fq(fqlist)

    N = cmn.cpu_check()

    bwa_cmds = ['module add bwa']
    for reflabel in frefs:
        fref = frefs[reflabel]
        fnlabel = cmn.lastName(fref).replace('.fa', '')
        for sp in fq_groups:
            R1, R2, single = fq_groups[sp]
            cmd = 'bwa mem -t %s -B 2 -M %s %s %s | grep "%s" > %s_paired_%s_mapped.sam ' % (
                N, fnlabel, R1, R2, reflabel, sp, fnlabel)
            bwa_cmds.append(cmd)
            cmd = 'bwa mem -t %s -B 2 -M %s %s | grep "%s" > %s_single_%s_mapped.sam ' % (
                N, fnlabel, single, reflabel, sp, fnlabel)
Ejemplo n.º 3
0
import sys
python_lib = '/work/biophysics/mtang/SNP_calling/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn
import os

# Absolute paths of the VCF files named in the list file given as argv[1].
vcf_list = [os.path.abspath(fn) for fn in cmn.getid(sys.argv[1])]

# For every VCF, print (not run) an `mv` command that renames it to
# <sp>_<ref>_snp_step2.vcf, where <ref> is the reference with the highest
# mapped-read count found for that sample.
for fn in vcf_list:
    items = fn.split('/')
    # The count tables are looked up three path components above the VCF.
    parent_dir = '/'.join(items[:-3])
    step2_dir = '%s/step2_bwa_mapping/mapped_reads_count' % parent_dir
    # Sample label: text before the first '_' of the file's last name.
    sp = cmn.lastName(fn).split('_')[0]
    lines = cmn.cmd2lines('grep %s %s/*' % (sp, step2_dir))

    # Each matching line must split into exactly four fields; the second is
    # the reference name and the third the mapped-read count.
    maxRef = (None, 0)
    for line in lines:
        _, ref, mapN, _ = line.strip().split()
        if int(mapN) > maxRef[1]:
            maxRef = (ref, int(mapN))

    ref = maxRef[0]
    if ref is None:
        # No count line (or only zero counts): there is no reference to put
        # in the name, so report the file and skip it instead of emitting a
        # rename to "<sp>_None_snp_step2.vcf".
        print('Error for %s' % fn)
        continue

    # The renamed file stays in the VCF's own directory.
    parent_dir = '/'.join(items[:-1])
    new_vcf = '%s/%s_%s_snp_step2.vcf' % (parent_dir, sp, ref)
    cmd = 'mv %s %s' % (fn, new_vcf)
    print(cmd)
Ejemplo n.º 4
0
    #options=parse_options()
    try:
        fn, fadd = sys.argv[1:3]
    except:
        print("Usage: *.py aln repID.file", file=sys.stderr)
        sys.exit()

    #fID = '/work/biophysics/wli/introgression2/4_filterIntro/rep_sps'
    #fID = '/project/biophysics/Nick_lab/wli/sequencing/myAnalysis/clean_ref_bias/4_build_tree/pureIDs'
    #goodIDs = set([i.split()[0] for i in cmn.file2lines(fID)
    #       if i.strip() != ''])

    #fadd = 'added_sps'
    #if cmn.filexist(fadd):
    #    print 'found local list, add them in'
    goodIDs = set([each for each in cmn.getid(fadd) if each[0] != '#'])

    dn = cmn.lastName(fn).replace('.fasta', '').replace('.fa',
                                                        '') + '_taken.fa'
    dp = open(dn, 'w')

    new = []
    leftIDs = set(goodIDs)
    with open(fn) as fp:
        for line in fp:
            if line[0] == '>':
                #name = line[1:].strip().strip().split('_')[0].replace('flt', '').split('Dup')[0]
                name = line[1:].strip().split()[0]
                if name in goodIDs:
                    isGood = True
                    if name in leftIDs:
Ejemplo n.º 5
0
import sys
import os

python_lib = '/work/biophysics/mtang/SNP_calling/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

# 1. Read the list of inputs and the *m2s.fa files present in the current
#    directory; each such file name maps back to a '<x>.map' label.
fns = cmn.getid(sys.argv[1])
falist = cmn.cmd2lines('ls *m2s.fa')
finished_maps = {fa.replace('_m2s.fa', '.map') for fa in falist}

# 2. Queue one conversion command for every input whose label has no
#    matching *_m2s.fa file; labels containing 'MITO' use the mito script.
cmds = []
for fn in fns:
    label = cmn.lastName(fn)
    if label not in finished_maps:
        if 'MITO' in label:
            cmd = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta_mito.py %s' % fn
        else:
            cmd = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta.py %s' % fn
        cmds.append(cmd)

# True only when nothing had to be queued.
isGood = not cmds
Ejemplo n.º 6
0
    try:
        fn, fadd = sys.argv[1:3]
    except:
        print("Usage: *.py aln repID.file", file=sys.stderr)
        sys.exit()

    #fID = '/work/biophysics/wli/introgression2/4_filterIntro/rep_sps'
    #fID = '/project/biophysics/Nick_lab/wli/sequencing/myAnalysis/clean_ref_bias/4_build_tree/pureIDs'
    #goodIDs = set([i.split()[0] for i in cmn.file2lines(fID)
    #       if i.strip() != ''])

    #fadd = 'added_sps'
    #if cmn.filexist(fadd):
    #    print 'found local list, add them in'
    goodIDs = set(
        [each.split('_')[0] for each in cmn.getid(fadd) if each[0] != '#'])

    dn = cmn.lastName(fn).replace('.fasta', '').replace('.fa',
                                                        '') + '_taken.fa'
    dp = open(dn, 'w')

    new = []
    leftIDs = set(goodIDs)
    with open(fn) as fp:
        for line in fp:
            if line[0] == '>':
                name = line[1:].strip().strip().split('_')[0].replace(
                    'flt', '').split('Dup')[0].split('.Lere')[0]
                #name = line[1:].strip().split()[0]
                #name = line[1:].strip().split('.Lerema')[0]
                if name in goodIDs:
Ejemplo n.º 7
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    #options=parse_options()
    try:
        odir, f_ass = sys.argv[1:3]
    except:
        print("Usage: *.py filelist assembly_v0.fa", file=sys.stderr)
        print("you should index assembly_v0.fa first with -p assembly_v0", file=sys.stderr)
        print("using command /home2/wli/local/bwa-0.7.12/bwa index ", file=sys.stderr)
        sys.exit()

    #fns = cmn.cmd2lines('ls %s/*.fq' % odir)
    fns = cmn.getid(odir)

    group_dict = separate_by_label(fns)

    ass_label = cmn.find_between(cmn.lastName(f_ass), 'assembly_', '.fa')

    cmn.mkdir('job_files')
    cmn.mkdir('cmd_files')

    for plabel in group_dict:
        print('processing lib %s' % plabel)
        each = group_dict[plabel]
        #also parse the files inside this function
        #return the file name after parsing
        paired, unpaired = separate_by_pair(plabel, each)
        if paired == None:
Ejemplo n.º 8
0
    sys.path.append(python_lib)

import cmn
import os
import time

def get_current_jobs(label, user):
    """Return how many ``squeue`` lines match ``user`` and ``'g' + label``.

    The count comes from a shell pipeline (``squeue | grep | grep | wc -l``)
    run through ``cmn.cmd2info``; the first whitespace-separated token of its
    output is converted to ``int``.
    """
    pipeline = 'squeue| grep %s| grep g%s|wc -l' % (user, label)
    first_token = cmn.cmd2info(pipeline).split()[0]
    return int(first_token)


fn = 'forked_jobs.list'

jobs = cmn.getid(fn)

cores = int(sys.argv[1])

user = cmn.cmd2info('echo $USER').strip()
user_label = user[0]

currentN = get_current_jobs(user_label, user)

os.chdir('job_files')

todo = list(jobs)

while(len(todo) != 0):
    fjob = todo[0]
    currentN = get_current_jobs(user_label, user)
Ejemplo n.º 9
0
        for a0, a1, a2 in [aset, bset]:
            if a1 == None and a0 != None and a2 != None:
                return True
    return False  #no indel


if __name__ == '__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py samlist", file=sys.stderr)
        sys.exit()

    fns = cmn.getid(fn)
    cmn.run('rm hasDeletion 2> /dev/null')

    read_dict = {}
    bad_alignments = []
    seqDict = {}

    for fn in fns:
        print('parsing %s...' % fn)
        try:
            samfile = pysam.AlignmentFile(fn)
        except:
            print('skip empty file %s' % fn)
            continue

        for record in samfile:
    sys.path.append(python_lib)

import cmn
import os

def group_list(alist):
    """Group the entries of ``alist`` by sample label.

    The label is the text before the first '_' in ``cmn.lastName(entry)``.
    Returns a dict mapping each label to the list of entries carrying it, in
    their original order.
    """
    grouped = {}
    for path in alist:
        label = cmn.lastName(path).split('_')[0]
        grouped.setdefault(label, []).append(path)
    return grouped

# Positional arguments 1-5 each name a list file read through cmn.getid.
fq_list = cmn.getid(sys.argv[1])
vcf_list = cmn.getid(sys.argv[2])
samdir_list = cmn.getid(sys.argv[3])
# Only the first entry of the fourth list is used.
vcfCov_dir = cmn.getid(sys.argv[4])[0]

# Last-name component of every entry in the fifth list,
# e.g. 5737_3311_assembly_v1_stat.report
finished = [cmn.lastName(report) for report in cmn.getid(sys.argv[5])]

# A literal '-r' anywhere on the command line switches refresh on.
refresh = '-r' in sys.argv

# Group both lists by sample label (see group_list).
fq_groups = group_list(fq_list)
vcf_groups = group_list(vcf_list)

cmds = []
for sp in vcf_groups:
    vcf_fns = vcf_groups[sp]
Ejemplo n.º 11
0
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py gap_see", file=sys.stderr)
        sys.exit()

    pop_map = {}
    fpops = cmn.cmd2lines(
        'ls /project/biophysics/Nick_lab/wli/sequencing/general_info/P*IDs')
    for fpop in fpops:
        alist = cmn.getid(fpop)
        popname = cmn.lastName(fpop)[1:-3]
        for sp in alist:
            pop_map[sp] = popname

    #15101E04_snp.codeVcf_father 13629191 1986713 0.145768960168
    adict = {}
    for line in cmn.file2lines(fn):
        items = line.split()
        sp = items[0].split('_')[0]
        gapF = float(items[-1])
        try:
            pop = pop_map[sp]
        except:
            continue
Ejemplo n.º 12
0
            print('Error! can not decide the reference for %s' % sp)
            print('Please remove the duplications in ')
            print('\n'.join(bestFns))
            isbad = True

        ref = cmn.txt_read(bestFns[0])
        adict[sp] = ref
        rset.add(ref)
    
    if isbad:
        sys.exit()
    return rset, adict        


#1. read in data
fns = cmn.getid(sys.argv[1])

#bwa_dirs = [line.strip().rstrip('/') for line in cmn.getid(sys.argv[2])]

#2. check which reference they used
#sp is unique
vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns}
#6188_3842_assembly_v2_snp_step2.vcf
#vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns}
sps = list(vcf_dict.keys())

#ref_genomes, refmapping = detect_ref_genomes(sps, bwa_dirs)
ref_genomes, refmapping = set([]), {}
for fn in fns:
    #../../step3_gatk/5729_3614_assembly_v1/5729_3614_assembly_v1_snp_step2.vcf
    fnlabel = cmn.lastName(fn).replace('_snp_step2.vcf', '')
Ejemplo n.º 13
0
import sys
python_lib = '/work/biophysics/mtang/SNP_calling/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn
import os

# Job names: the last whitespace-separated field of every entry in the list
# file given as the first argument.
jobs = [line.strip().split()[-1] for line in cmn.getid(sys.argv[1])]

# Source directory (second argument) with trailing slashes removed; per the
# original note it is expected to end with "step3".
fromDir = sys.argv[2].rstrip('/')  #the dir ends with step3

# Remember where the script was started.
cwd = os.getcwd()

# Presumably creates these directories under the current one via cmn.mkdir;
# step3_gatk is the target of the per-job copies made further down.
cmn.mkdir('job_files')
cmn.mkdir('step3_gatk')

# Parent of fromDir (everything before its last '/'); its step2_bwa_mapping
# directory is symlinked here. `ln -s` is given no link name, so the link
# takes the target's base name.
fromPdir = '/'.join(fromDir.split('/')[:-1])
cmn.run('ln -s %s/step2_bwa_mapping' % fromPdir)

fjobs = []
#1. copy the directory to current
for job in jobs:
    wdir = job[4:-4]
    current = '%s/%s' % (fromDir, wdir)
    cmd = 'cp -r %s step3_gatk' % current
    print('forking data for %s' % current)
    cmn.run(cmd)
    new = '%s/step3_gatk/%s' % (cwd, wdir)
    user = cmn.cmd2info('echo $USER').strip()
    user_label = user[0]
Ejemplo n.º 14
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    #options=parse_options()
    try:
        fns = sys.argv[1:]
    except:
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    fqs = []
    for fn in fns:
        fqs += cmn.getid(fn)

    sequences = []
    for fq in fqs:
        print('reading %s' % fq)
        with open(fq) as fp:
            for i, line in enumerate(fp):
                if i % 4 == 1:
                    sequences.append(line.strip())

    maxlength = max(list(map(len, sequences)))

    for i in range(3, maxlength):
        print('checking %s' % i)
        sample_times = 10
        check_seeds = generate_random_sequence(i, sample_times)
Ejemplo n.º 15
0
            print('Error! can not decide the reference for %s' % sp)
            print('Please remove the duplications in ')
            print('\n'.join(bestFns))
            isbad = True

        ref = cmn.txt_read(bestFns[0])
        adict[sp] = ref
        rset.add(ref)
    
    if isbad:
        sys.exit()
    return rset, adict        


# 1. Read in data: the list of bwa directories and the fixed report names.
bwa_dirs = cmn.getid(sys.argv[1])
fgood = 'good_maps.txt'
fbad = 'bad_maps.txt'

# Split the sample labels (text before the first '_') of good_maps.txt into
# mitochondrial entries (line contains 'MITO') and all others.
mito, genome = set(), set()
for line in cmn.file2lines(fgood):
    sp = line.split('_')[0]
    (mito if 'MITO' in line else genome).add(sp)

# Only the non-MITO samples are passed to the reference detection.
refs, refdict = detect_ref_genomes(genome, bwa_dirs)
Ejemplo n.º 16
0
import cmn


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    #fn = 'coding.fasta'
    fn = sys.argv[1]
    sampleIDs = set(cmn.getid(sys.argv[2]))
    print(sampleIDs)

    gapped = set([])
    adict = {}
    with open(fn) as fp:
        for line in fp:
            if line[0] == '>':
                defline = line.strip()
            else:
                seq = line.strip()
                adict[defline] = seq
                if defline.split('_')[0][1:] in sampleIDs:
                    count = 0
                    for char in seq:
                        if char == '-' or char == 'N' or char == ',' or char == 'X':
Ejemplo n.º 17
0
            print('Error! can not decide the reference for %s' % sp)
            print('Please remove the duplications in ')
            print('\n'.join(bestFns))
            isbad = True

        ref = cmn.txt_read(bestFns[0])
        adict[sp] = ref
        rset.add(ref)

    if isbad:
        sys.exit()
    return rset, adict


#1. read in data
fns = cmn.getid(sys.argv[1])

bwa_dirs = [line.strip().rstrip('/') for line in cmn.getid(sys.argv[2])]

#2. check which reference they used
#sp is unique
vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns}
#6188_3842_assembly_v2_snp_step2.vcf
#vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns}
sps = list(vcf_dict.keys())

#ref_genomes, refmapping = detect_ref_genomes(sps, bwa_dirs)
ref_genomes, refmapping = set([]), {}
for fn in fns:
    #../../step3_gatk/5729_3614_assembly_v1/5729_3614_assembly_v1_snp_step2.vcf
    fnlabel = cmn.lastName(fn).replace('_snp_step2.vcf', '')
Ejemplo n.º 18
0
            adict[sp] = [fn]
    return adict


def check_NA(label):
    """Tell whether the last line of ``<label>_stat.report`` needs a warning.

    Returns True when that line does not split into exactly 10
    whitespace-separated fields, or when one of the fields is the string
    'NA'; returns False otherwise.
    """
    report = label + '_stat.report'
    fields = cmn.file2lines(report)[-1].strip().split()
    return len(fields) != 10 or 'NA' in fields


# Positional arguments 1-4 each name a list file read through cmn.getid.
fq_list = cmn.getid(sys.argv[1])
vcf_list = cmn.getid(sys.argv[2])
samdir_list = cmn.getid(sys.argv[3])
# Only the first entry of the fourth list is used.
vcfCov_dir = cmn.getid(sys.argv[4])[0]

# Labels that already have a *_stat.report file in the current directory.
report_files = cmn.cmd2lines('ls *_stat.report')
finished_labels = {
    cmn.lastName(report).replace('_stat.report', '')
    for report in report_files
}

# A literal '-r' anywhere on the command line switches refresh on.
refresh = '-r' in sys.argv

# Group both lists by sample label (see group_list).
fq_groups = group_list(fq_list)
vcf_groups = group_list(vcf_list)

isGood = True
Ejemplo n.º 19
0
    keys = list(adict.keys())
    for key in keys:
        each = adict[key]
        if len(each) != 2:
            print('Error! number of libs is wrong for %s' % key)
            print('below are the detected libs:')
            print('\n'.join(each))
            print('Please fix!')
            sys.exit()
        each.sort()
        adict[key] = each
    return adict


#---------------main--------------------
fastqs = cmn.getid(sys.argv[1])

findex = sys.argv[2]

ad_dict = make_ad_dict(findex)

fastq_dict = group_fastqs(fastqs)

cmds = []
for key in fastq_dict:
    R1, R2 = fastq_dict[key]
    try:
        ad1 = ad_dict[key]
    except KeyError:
        print('Error! missing data for %s' % key)
        continue
Ejemplo n.º 20
0
    #options=parse_options()
    try:
        freftable, mapdir, freq = sys.argv[1:4]
    except:
        print(
            "Usage: *.py ../step1_gather_data/mapping_info.txt ../step2_bwa_mapping ../step1_gather_data/require_SNPs.dict.pkl",
            file=sys.stderr)
        sys.exit()

    cwd = os.getcwd()

    if not os.path.exists('bad_vcf.list'):
        print('Error! can not find info for bad vcf files!')
        sys.exit()

    badones = set(cmn.getid('bad_vcf.list'))

    #1. read in info
    fsams = cmn.cmd2lines('ls %s/*/*/*.sam' % mapdir)
    #print fsams
    samdirs = set(['/'.join(fsam.split('/')[:-2]) for fsam in fsams])
    #print samdirs
    require_refs = cmn.pickle_read(freq)

    fq_dict = {}
    refdict = {}
    #1. tell by reftable
    #make the requirement by the reftable
    required = {}
    for line in cmn.file2lines(freftable):
        items = line.strip().split()
Ejemplo n.º 21
0
    #options=parse_options()
    try:
        fn, fadd = sys.argv[1:3]
    except:
        print("Usage: *.py aln repID.file", file=sys.stderr)
        sys.exit()

    #fID = '/work/biophysics/wli/introgression2/4_filterIntro/rep_sps'
    #fID = '/project/biophysics/Nick_lab/wli/sequencing/myAnalysis/clean_ref_bias/4_build_tree/pureIDs'
    #goodIDs = set([i.split()[0] for i in cmn.file2lines(fID)
    #       if i.strip() != ''])

    #fadd = 'added_sps'
    #if cmn.filexist(fadd):
    #    print 'found local list, add them in'
    goodIDs = set(cmn.getid(fadd))

    dn = cmn.lastName(fn).replace('.fasta', '').replace('.fa',
                                                        '') + '_taken.fa'
    dp = open(dn, 'w')

    new = []
    leftIDs = set(goodIDs)
    with open(fn) as fp:
        for line in fp:
            if line[0] == '>':
                name = line[1:].strip().split('_')[0].split('-')[0]
                if name in goodIDs:
                    isGood = True
                    if name in leftIDs:
                        leftIDs.remove(name)
Ejemplo n.º 22
0
# Abort when no fastq list was found; `fq` is set earlier in the script, and
# an empty string means nothing was found. Otherwise report the guess.
if fq == '':
    print('Error! can not find fastq list file!')
    sys.exit()
else:
    print('guessing fastq file to be %s' % fq)

# Same check for the reference table file name `fref`.
if fref == '':
    print('Error! can not find ref table file!')
    sys.exit()
else:
    print('guessing ref table file to be %s' % fref)

# Entries already recorded in the archived fastq list (empty set when the
# archive file does not exist yet).
fq_all = '/project/biophysics/Nick_lab/mtang/archive/step1_info/fastq.filelist'
if os.path.exists(fq_all):
    aset = set(cmn.getid(fq_all))
else:
    aset = set([])

# Union of the archived entries and the entries of the current fastq list.
bset = set(cmn.getid(fq))
newset = aset | bset

# NOTE(review): filter_best_fastq is defined outside this excerpt; presumably
# it keeps one preferred fastq per sample -- confirm against its definition.
newset = filter_best_fastq(newset)

# The archive file is overwritten in place with the merged, filtered list.
cmn.write_lines(newset, fq_all)

# Entries already recorded in the archived reference table; `aset` is reused
# for them (empty set when the archive file does not exist yet).
fref_all = '/project/biophysics/Nick_lab/mtang/archive/step1_info/refTable.txt'
if os.path.exists(fref_all):
    aset = set(cmn.getid(fref_all))
else:
    aset = set([])
import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Split an alignment table into lines whose first field is a wanted ID
    # (written to filtered_sam_aln.txt) and all other lines (written to
    # bad_sam_aln.txt).
    #
    # argv[1]: alignment table, one record per line, ID in the first
    #          whitespace-separated field.
    # argv[2]: list file with the wanted IDs (read through cmn.getid).
    try:
        fn, fg = sys.argv[1:3]
    except ValueError:
        # Fewer than two arguments: the slice cannot be unpacked.
        print("Usage: *.py 2nd_sam_aln.txt good_reads.txt", file=sys.stderr)
        sys.exit()

    goodIDs = set(cmn.getid(fg))

    # The input is opened first so that a missing input file does not leave
    # freshly truncated output files behind; the context managers close and
    # flush all three handles even if the loop raises.
    with open(fn) as fp, \
            open('filtered_sam_aln.txt', 'w') as dp, \
            open('bad_sam_aln.txt', 'w') as dbad:
        for line in fp:
            fields = line.split()
            if not fields:
                # Blank line: there is no ID to classify, so it is dropped
                # (indexing the first field used to raise IndexError here).
                continue
            if fields[0] in goodIDs:
                dp.write(line)
            else:
                dbad.write(line)
Ejemplo n.º 24
0
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py map pop1 pop2 ...", file=sys.stderr)
        sys.exit()

    poplist = sys.argv[2:]

    if len(poplist) == 0:
        print('please specify populations!')
        sys.exit()

    popdict = {}
    for fpop in poplist:
        IDs = cmn.getid(fpop)
        name = cmn.lastName(fpop)
        popdict[name] = IDs
        #for ID in IDs:
        #    popdict[ID] = name

    #seqDict = {}
    #with open(fn) as fp:
    #    for line in fp:
    #        if line[0] == '>':
    #            name = line[1:].split('_')[0]
    #        else:
    #            seq = line.strip()
    #            try:
    #                seqDict[name].append(seq)
    #            except KeyError:
Ejemplo n.º 25
0

if __name__ == '__main__':
    # Look up every sample named in the input list (first whitespace-separated
    # field of each entry) in the name table returned by get_names_4barcode()
    # and write the matching entries, with double quotes removed, to the file
    # 'sampleInfo', one per line.
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; anything else (for
        # example Ctrl-C) is no longer swallowed by a bare except.
        print("Usage: *.py RAxML_bestTree.noGap", file=sys.stderr)
        sys.exit()

    nameDict = get_names_4barcode()

    info = []
    # Samples without an entry in nameDict are collected here.
    missing = []
    lines = cmn.getid(fn)

    for line in lines:
        sp = line.strip().split()[0]
        try:
            info.append(nameDict[sp].replace('"', ''))
        except KeyError:
            missing.append(sp)

    # The trailing empty item makes the joined text end with a newline.
    info.append('')
    info = '\n'.join(info)

    cmn.write_file(info, 'sampleInfo')