def transfer_alea_files(fnlist):
    """Copy every remote path in *fnlist* into ``alea_transfer`` with rsync.

    The transfers run one after another, in the order given.

    Args:
        fnlist: iterable of paths on the archive host.

    Returns:
        A list with one local path per input, built as
        ``alea_transfer/<cmn.lastName(path)>`` (``cmn.lastName`` presumably
        yields the final path component -- TODO confirm).
    """
    transferDir = 'alea_transfer'
    cmn.mkdir(transferDir)

    def _fetch(remote_path):
        # Announce, pull recursively, then report where the copy landed.
        print('transfering %s from archive server ...' % remote_path)
        cmn.run('rsync -r [email protected]:%s %s' % (remote_path, transferDir))
        return '%s/%s' % (transferDir, cmn.lastName(remote_path))

    return [_fetch(remote_path) for remote_path in fnlist]
def parse_ref(seqDict):
    """Write each reference sequence to its own FASTA file and bwa-index it.

    Sequence number ``i`` (in iteration order of *seqDict*) is written to
    ``baits/bait<i>.fa`` and indexed with the prefix ``bait<i>``.

    Args:
        seqDict: mapping of sequence name -> sequence string.

    Returns:
        dict mapping the cleaned sequence name (``*`` removed, double quotes
        turned into single quotes) to the FASTA path written for it.
    """
    cmn.mkdir('baits')

    bait_paths = {}
    for idx, raw_name in enumerate(seqDict):
        sequence = seqDict[raw_name]
        prefix = 'bait%s' % idx
        fasta_path = 'baits/%s.fa' % prefix

        # Strip characters that are awkward in a FASTA defline / shell context.
        clean_name = raw_name.replace('*', '').replace('"', "'")
        cmn.write_file('>%s\n%s\n' % (clean_name, sequence), fasta_path)

        cmn.run('module add bwa; bwa index %s -p %s' % (fasta_path, prefix))
        bait_paths[clean_name] = fasta_path
    return bait_paths
def attempt_to_find_genus_by_abundence(ID, fqlist):
    """Run the auto_rebait script on *fqlist* and return the genus it picked.

    The work happens inside a scratch directory ``tmp_<ID>`` which is removed
    afterwards; its ``mapping_stat.info`` is first copied to
    ``tmpStat/<ID>_mapping_stat.info`` (the ``tmpStat`` directory is expected
    to exist in the current working directory).

    Args:
        ID: sample identifier used to name the scratch directory and the
            saved statistics file.
        fqlist: list of fastq paths, written one per line to ``fqlist``.

    Returns:
        The first whitespace-separated token of the text before the first
        ``_`` in ``picked_bait.txt``, or ``None`` when that file is missing
        or empty.
    """
    tmpdir = 'tmp_%s' % ID
    cmn.mkdir(tmpdir)

    # Remember where we started so the working directory is always restored,
    # even if one of the helpers below raises.
    start_dir = os.getcwd()
    os.chdir(tmpdir)
    try:
        cmn.write_lines(fqlist, 'fqlist')
        cmd = '/work/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist'
        cmn.run(cmd)

        dn = 'picked_bait.txt'
        genus = None
        if cmn.filexist(dn):
            tokens = cmn.txt_read(dn).strip().split('_')[0].split()
            # An empty result file yields no tokens; treat it as "not found"
            # instead of failing with IndexError.
            if tokens:
                genus = tokens[0]
    finally:
        os.chdir(start_dir)

    cmn.run('cp %s/mapping_stat.info tmpStat/%s_mapping_stat.info' % (tmpdir, ID))
    cmn.run('rm -r %s ' % tmpdir)
    return genus
Beispiel #4
0
        sys.exit()

    # Optional 4th positional argument: prefix for the job directory and job
    # files; falls back to 'x' when it is not supplied.
    try:
        pre = sys.argv[4]
    except IndexError:
        pre = 'x'

    #cores = 32 # in biohpc, it is 32

    # '-p <partition>' anywhere on the command line overrides the default
    # partition; a trailing '-p' without a value is ignored.
    partition = 'super'
    for i, arg in enumerate(sys.argv):
        if arg == '-p' and i + 1 < len(sys.argv):
            partition = sys.argv[i + 1]

    tmpdir = '%s_files' % pre
    cmn.mkdir(tmpdir)
    # Floor division: the result is used as a slice index below, and true
    # division would produce a float (TypeError when slicing) on Python 3.
    bundle = len(cmds) // N

    for i in range(N):
        cmd = cmds[i * bundle:(i + 1) * bundle]
        if i == N - 1:
            # The last job also takes the remainder that did not divide evenly.
            cmd += cmds[(i + 1) * bundle:]
        dn = '%s/%s_%s' % (tmpdir, pre, i)
        cmn.write_file('\n'.join(cmd), dn)
        submit_job(dn, Npara, pre, i, partition)

    # Collect a short provenance summary for this submission.
    cwd = os.getcwd()
    d_stat = '%s/stat.info' % tmpdir
    info = []
    info.append('commands are from: %s/%s' % (cwd, sys.argv[1]))
    info.append('split into %s jobs' % sys.argv[2])
        '/project/biophysics/Nick_lab/wli/sequencing/mapping/SNP_calling/2_gatk/template_gatk.job'
    )
    template = template.replace('assembly_v0', ass_label)

    fns = cmn.cmd2lines('ls ../1_bwa_align/*.sam | grep -v _v0_')
    fns = [os.path.abspath(i) for i in fns]

    #good_set = set('LEP18259 3318 3303'.split())
    finished = cmn.cmd2lines("ls */*.vcf|cut -d '/' -f 2|cut -d '_' -f 1")

    #group spiecies
    group_dict = group_by_species(fns)

    for slabel in group_dict:
        if slabel in finished:
            print('skip finished ' + slabel)
            continue
        cmn.mkdir(slabel)
        os.chdir(slabel)

        fns = group_dict[slabel]
        f_sam = merge_sams(slabel, fns)

        cmd = template.replace('3377', slabel)
        #cmd = cmd.replace('--job-name=gatk', '--job-name=%s' % slabel)

        os.chdir('..')

        cmn.write_file(cmd, 'gatk%s.job' % slabel)
        cmn.run('sbatch gatk%s.job' % slabel)
Beispiel #6
0
    try:
        odir, f_ass = sys.argv[1:3]
    except:
        print("Usage: *.py filelist assembly_v0.fa", file=sys.stderr)
        print("you should index assembly_v0.fa first with -p assembly_v0", file=sys.stderr)
        print("using command /home2/wli/local/bwa-0.7.12/bwa index ", file=sys.stderr)
        sys.exit()

    #fns = cmn.cmd2lines('ls %s/*.fq' % odir)
    fns = cmn.getid(odir)

    group_dict = separate_by_label(fns)

    ass_label = cmn.find_between(cmn.lastName(f_ass), 'assembly_', '.fa')

    cmn.mkdir('job_files')
    cmn.mkdir('cmd_files')

    for plabel in group_dict:
        print('processing lib %s' % plabel)
        each = group_dict[plabel]
        #also parse the files inside this function
        #return the file name after parsing
        paired, unpaired = separate_by_pair(plabel, each)
        if paired == None:
            continue
        label = '%s_%s' % (plabel, ass_label)
        #index_label = cmn.lastName(f_ass).replace('.fa', '')
        index_label = f_ass.replace('.fa', '')
        cmd = ''
        cmd += '/home2/wli/local/bwa-0.7.12/bwa mem -t 32 -M %s %s %s > %s_paired.sam;\n' % (index_label, paired[0], paired[1], label)
Beispiel #7
0
kept_spdir_files = 'realigned_reads_step2.bam snp_step2.vcf$'.split()

# For every species directory, copy its bam file and its vcf file into a
# same-named directory under ddir. Destinations that already exist are left
# untouched and reported so they can be resolved by hand.
for each in spdirs:
    print(each)
    wdir_label = cmn.lastName(each)
    dwdir = '%s/%s' % (ddir, wdir_label)
    if os.path.exists(dwdir):
        print(
            'the destination directory has already exists! please check manually to choose which one to keep:'
        )
        print('distination dir: %s' % dwdir)
        print('current dir: %s' % each)
        print('\n')
        continue

    cmn.mkdir(dwdir)

    # Prefer the step-2 bam and fall back to the plain realigned bam. The copy
    # runs only when a bam was actually found, so a missing file can no longer
    # hit an unbound `cmd` or re-run the previous iteration's command.
    for bam_name in ('realigned_reads_step2.bam', 'realigned_reads.bam'):
        fbam = '%s/%s' % (each, bam_name)
        if os.path.exists(fbam):
            cmd = 'cp %s %s' % (fbam, dwdir)
            cmn.run(cmd)
            break
    else:
        print('Error, can not find bam file!')

    #print cmd
    # spdirs maps each species directory to the vcf file to keep for it.
    fvcf = spdirs[each]
    cmd = 'cp %s %s' % (fvcf, dwdir)
    cmn.run(cmd)
Beispiel #8
0
        defline = lines[0]
        seq = ''.join(lines[1:])
        adict[defline] = seq
    return adict


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Split a multi-record FASTA file into one FASTA file per record, written
    # to splitS_<label>/<label>_<index>.fa.

    #fn = 'all_genomes_noGap.fa'
    #fn = 'all_genomes_charGap.fa'
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; anything else should
        # surface instead of being swallowed by a bare except.
        print('*.py all_genomes_charGap.fa ')
        sys.exit()

    adict = read_fa(fn)

    fnlabel = cmn.lastName(fn).replace('.fa', '')
    outdir = 'splitS_%s' % fnlabel
    cmn.mkdir(outdir)
    for i, (key, seq) in enumerate(adict.items()):
        fasta = '>%s\n%s\n' % (key, seq)
        dn = '%s/%s_%s.fa' % (outdir, fnlabel, i)
        cmn.write_file(fasta, dn)
Beispiel #9
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    #options=parse_options()
    try:
        read, fn, direction = sys.argv[1:4]
    except:
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    fished_reads = []
    cmn.mkdir('grep_out')
    if True:
        with open(fn) as fp:
            for lineN, line in enumerate(fp):
                if lineN % 4 != 1:  #only take the sequence
                    continue
                line = line.strip()
                if direction == 'backward':
                    line = line[::-1]

                #find match forward and + strand
                i1 = line.find(read)
                if i1 != -1:
                    fished_reads.append(line[i1:])

                #in the reverse strand
Beispiel #10
0
import sys
python_lib = '/work/00412/mtang/sequencing/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn
import os

# Source and destination roots, made absolute so the shell commands below do
# not depend on the current working directory.
fromDir = os.path.abspath(sys.argv[1])
toDir = os.path.abspath(sys.argv[2])

# Only entries of fromDir whose names start with "sampleRun_" are mirrored.
wdirs = cmn.cmd2lines('ls %s | grep ^sampleRun_' % fromDir)

# File names / shell glob patterns to keep from every run directory.
toKeep = [
    '*.txt',
    '*.report',
    '*_contig.fa',
    'denovo_barcode.fa',
    'bait0_denovo.br',
]


def _copy_kept_files(src_dir, dst_dir, patterns):
    """Echo and run one ``cp`` per pattern, from *src_dir* into *dst_dir*."""
    for pattern in patterns:
        cmd = 'cp %s/%s %s' % (src_dir, pattern, dst_dir)
        print(cmd)
        cmn.run(cmd)


for wdir in wdirs:
    eachFromDir = '%s/%s' % (fromDir, wdir)
    eachToDir = '%s/%s' % (toDir, wdir)
    cmn.mkdir(eachToDir)
    _copy_kept_files(eachFromDir, eachToDir, toKeep)
Beispiel #11
0
    # Group the input files by library key: the file name (as returned by
    # cmn.lastName) with its last '_'-separated field removed.
    pairDict = {}
    for fn in fns:
        key = '_'.join(cmn.lastName(fn).split('_')[:-1])
        pairDict.setdefault(key, []).append(fn)

    cmn.mkdir('logs')

    # One background re-pairing command per complete pair; anything that is
    # not exactly two files is reported and symlinked as-is.
    cmds = []
    for key in pairDict:
        each = pairDict[key]
        if len(each) != 2:
            print('cannot find pair for %s' % str(each))
            for unpaired_fn in each:
                cmn.run('ln -s %s' % unpaired_fn)
            continue

        each.sort()
        cmd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/re-pair-reads_wenlin %s %s %s >& logs/%s_run.log &' % (
            each[0], each[1], key, key)
        cmds.append(cmd)

    # Let the shell block until every backgrounded command has finished.
    cmds.append('wait')
Beispiel #12
0
import sys
python_lib = '/work/biophysics/mtang/SNP_calling/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn
import os

jobs = [line.strip().split()[-1] for line in cmn.getid(sys.argv[1])]

fromDir = sys.argv[2].rstrip('/')  #the dir ends with step3

cwd = os.getcwd()

cmn.mkdir('job_files')
cmn.mkdir('step3_gatk')

fromPdir = '/'.join(fromDir.split('/')[:-1])
cmn.run('ln -s %s/step2_bwa_mapping' % fromPdir)

fjobs = []
#1. copy the directory to current
for job in jobs:
    wdir = job[4:-4]
    current = '%s/%s' % (fromDir, wdir)
    cmd = 'cp -r %s step3_gatk' % current
    print('forking data for %s' % current)
    cmn.run(cmd)
    new = '%s/step3_gatk/%s' % (cwd, wdir)
    user = cmn.cmd2info('echo $USER').strip()
    user_label = user[0]
Beispiel #13
0
    #check if all the files has contains
    falist = cmn.file2lines(fn)
    bad_falist = [
        fa for fa in falist
        if not cmn.filexist(fa) and '/archive/butterfly/' not in fa
    ]

    if len(bad_falist) != 0:
        print('Error!')
        print('the following files are errorous:')
        print('\n'.join(bad_falist))
        sys.exit()

    transferDir = 'archiveTransfer'
    cmn.mkdir(transferDir)

    alea_list = [fa for fa in falist if '/archive/butterfly' in fa]

    biohpc_list = set(falist) - set(alea_list)

    newlist = transfer_alea_files(alea_list)

    newlist += list(biohpc_list)

    dn = 'new.falist'
    cmn.write_lines(newlist, dn)
    #backup this newlist
    cmn.mkdir('../falist_info')
    dirlabel = os.getcwd().rstrip('/').split('/')[-1]
    backFn = '../falist_info/%s.falist' % dirlabel
Beispiel #14
0
if __name__ == '__main__':
    #options=parse_options()
    try:
        fns = [os.path.abspath(each) for each in sys.argv[3:]]
        #KmerCut = int(sys.argv[1])
        KmerSize = int(sys.argv[1])
        Ncpu = int(sys.argv[2])
    except:
        print("Usage: *.py KmerCut KmerSize(19) Ncpu R1.fq R2.fq",
              file=sys.stderr)
        sys.exit()

    outlabel = cmn.lastName(fns[0]).split('_')[0]
    tmpDir = '%s_jf' % outlabel
    cmn.mkdir(tmpDir)
    #step1, run Jellyfish
    print('running Jellyfish to get Kmer count...')
    os.chdir(tmpDir)
    cmd = 'jellyfish count -m %s -t %s -s 10000000000 -c 8 --timing=jf.err --canonical ' % (
        KmerSize, Ncpu)
    cmd += ' '.join(fns)
    cmn.run(cmd)

    cmd = 'jellyfish histo mer_counts.jf > %smer_histo.txt' % KmerSize
    cmn.run(cmd)

    cmd = 'jellyfish dump -c mer_counts.jf > %smer_counts' % KmerSize
    cmn.run(cmd)

    #step2, filter out reads with high Kmers
    except:
        print('usage: *.py fsam fass', file=sys.stderr)
        sys.exit()

    cmd = 'module add samtools; samtools faidx %s' % fass
    cmn.run(cmd)
    cmd = 'module add picard/1.117; java -jar $PICARD/CreateSequenceDictionary.jar R=%s O=%s.dict' % (
        fass, fass[:-3])
    cmn.run(cmd)

    template = cmn.txt_read(
        '/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/template_gatk_bias_fromSam.job'
    )
    template = template.replace('[WL_ref]', fass)
    template = template.replace('[INPUT.sam]', fsam)

    sampleId = cmn.lastName(fsam).replace('highQ_', '').split('_')[0]

    dnlabel = '%s_%s' % (cmn.lastName(fsam).replace(
        '.sam', ''), cmn.lastName(fass).replace('.fa', ''))
    cmn.mkdir(dnlabel)
    os.chdir(dnlabel)

    cwd = os.getcwd()
    pre_cmds = 'cd %s\n' % cwd
    template = template.replace('5642', sampleId)
    template = template.replace('[WL_preprossing]', pre_cmds)

    cmn.write_file(template, 'gatk%s.job' % sampleId)
    #cmn.run('sbatch gatk%s.job' % sampleId)
Beispiel #16
0
        isIndexed = True
    print('###############################################')

    if not isIndexed:
        print('**********************************************')
        print('\nimportant!!!')
        print('please re-run this script after all references are indexed!\n')
        print('**********************************************')
    ###############################
    #all the steps below would put into the job files

    template = cmn.txt_read(
        '/work/biophysics/mtang/SNP_calling/scripts/templates/template_gatk_unbias4TACC.job'
    )

    cmn.mkdir('job_files')
    fjobs = []
    for sp in refdict:
        #if sp.split('_')[0] not in subsetIDs:
        #    continue
        snp_list = refdict[sp]
        for samdir, ref in snp_list:
            label = '%s_%s' % (sp, ref)
            if label not in subsetJobs:
                continue
            print('processing %s' % label)

            #a. make directory
            olabel = '%s_%s' % (sp, ref)
            wdir = '%s/%s' % (cwd, olabel)
            wdir4TACC = '../%s' % olabel
    cmn.run('cp %s/mapping_stat.info tmpStat/%s_mapping_stat.info' % (tmpdir, ID))
    cmn.run('rm -r %s ' % tmpdir)
    return genus



if __name__=='__main__':
    #options=parse_options()
    try:
        #fn, f_table = sys.argv[1:3]
        fn = sys.argv[1]
    except:
        print("Usage: *.py fqlist", file=sys.stderr)
        sys.exit()

    cmn.mkdir('tmpStat')

    IDlist = set([])
    fq_groups = {}
    for line in cmn.file2lines(fn):
        Id = cmn.lastName(line).split('_')[0]
        Id = Id.replace('NVG-', '').replace('11-BOA-','').replace('LEP-', 'LEP')
        IDlist.add(Id)
        fq = os.path.abspath(line)
        try:
            fq_groups[Id].append(fq)
        except KeyError:
            fq_groups[Id] = [fq]

    nameDict = get_names_4barcode()
    fqlist = cmn.file2lines(fn)
    groupDict = {}
    for fq in fqlist:
        ID = cmn.lastName(fq).split('_')[0]
        try:
            groupDict[ID].append(fq)
        except KeyError:
            groupDict[ID] = [fq]


    for sample in groupDict:
        fqlist = groupDict[sample]
        #fqlist = cmn.cmd2lines('ls /project/biophysics/Nick_lab/wli/sequencing/Eudamine/BEAST_timing/tmp_link_fastq/%s*.fastq' % sample)
        #fqlist = cmn.cmd2lines('ls /work/biophysics/wli/workspace/filtered_6313*q')
        wdir = 'mitoD_%s' % sample
        cmn.mkdir(wdir)
        os.chdir(wdir)
        cwd = os.getcwd()
        info = template.replace('[cwd]', cwd)
        info = info.replace('[fq_files]', ' '.join(fqlist))
        info = info.replace('[sample]', sample)

        #prepare quake infiles
        fqlist_local = []
        for fq in fqlist:
            cmn.run('ln -s ' + fq)
            fqlist_local.append(cmn.lastName(fq))
        cmn.write_lines(fqlist_local, 'fqlist')
        cmn.run('ln -s fqlist infiles')

        #make fq2fa comand
Beispiel #19
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Write `times` copies of a FASTA file with the record order shuffled.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    # label -> "<header>\n<sequence>\n".
    # NOTE(review): each sequence line replaces the previous one stored for
    # the current header, so the input is presumably one sequence line per
    # record -- confirm before using on wrapped FASTA.
    adict = {}
    label = None
    with open(fn) as fp:
        for line in fp:
            if line[0] == '>':
                label = line.strip()
                continue

            seq = line.strip()
            if not seq:
                # A blank line (e.g. at end of file) must not overwrite the
                # current record with an empty sequence.
                continue
            if label is None:
                print('Error: sequence data before the first header in %s' % fn,
                      file=sys.stderr)
                sys.exit(1)
            adict[label] = '%s\n%s\n' % (label, seq)

    times = 10
    keys = list(adict.keys())
    cmn.mkdir('shuffle_genome')
    for each in range(times):
        # shuffle() reorders `keys` in place; each round starts from the
        # previous round's order.
        random.shuffle(keys)
        new = [adict[key] for key in keys]
        dn = 'shuffle_genome/%s_shuffle%s' % (cmn.lastName(fn), each)
        cmn.write_file(''.join(new), dn)
Beispiel #20
0
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py filelist", file=sys.stderr)
        sys.exit()

    #step 1 including 1. making the pileup file, 2. correct bias, 3. sam map again 4. snp call till last step
    #step 2: just the snp call using multiple CPUs

    #only put 15 jobs in a node to avoid memmory problem
    f_ass = '/project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/assembly_v2.fa'
    #should not run for reference genome; this should just build by original snp call
    ref_sp = '3935'

    step1dir = 'step1_cmds'
    cmn.mkdir(step1dir)

    #fns = cmn.getid(fn)
    fns = cmn.cmd2lines(
        'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/*/realigned_reads.bam'
    )
    finished_pileups = cmn.cmd2lines(
        'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/*/*.pileup'
    )

    finished = set(
        [cmn.lastName(i).split('_')[0] for i in cmn.cmd2lines('ls */*.vcf')])
    step1_finished = set([
        cmn.lastName(i).split('/')[0]
        for i in cmn.cmd2lines('ls */realigned_reads.bam')
    ])
Beispiel #21
0
    cwd = os.getcwd()

    #1. read in info
    refs = set([])
    rdict = {}
    for line in cmn.file2lines(finfo):
        sp, fastq, ref = line.strip().split()
        try:
            rdict[sp].append((fastq, ref))
        except KeyError:
            rdict[sp] = [(fastq, ref)]
        refs.add(ref)

    #2. prepare reference jobs
    refdir = '/work/biophysics/mtang/SNP_calling/indexed_references'
    cmn.mkdir(refdir)
    os.chdir(refdir)
    index_cmds = ['cd %s' % refdir]
    for ref in refs:
        if not os.path.exists(cmn.lastName(ref)):
            #cmn.run('ln -s %s' % ref)
            cmn.run('cp %s %s/' % (ref, refdir))
        ref = cmn.lastName(ref)
        reflabel = ref.replace('.fa', '')
        checkFn = reflabel + '.pac'
        if cmn.filexist(checkFn):
            print('found finished ref for %s, skip it' % ref)
            continue
        cmd = '/home2/wli/local/bwa-0.7.12/bwa index %s -p %s &' % (ref,
                                                                    reflabel)
        index_cmds.append(cmd)