#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Entry point: group the FASTQ paths listed in the first argument by
    # sample ID and create one 'mitoRef_<ID>' working directory per sample.
    # Usage: *.py fqlist refMitoList
    try:
        fqlist, fmitolist = sys.argv[1:]
    except ValueError:
        # Unpacking fails unless exactly two arguments were given; report the
        # usage and exit with a failure status so callers can detect it.
        print("Usage: *.py fqlist refMitoList", file=sys.stderr)
        sys.exit(1)

    ftemplate = '/project/biophysics/Nick_lab/wli/sequencing/scripts/mito_scripts/mito_refDenovo.template'
    template = cmn.txt_read(ftemplate)
    fqlist = cmn.file2lines(fqlist)

    # Sample ID -> list of absolute FASTQ paths.  The ID is the text before
    # the first underscore of cmn.lastName(path).
    groupDict = {}
    for fq in fqlist:
        fq = os.path.abspath(fq)
        ID = cmn.lastName(fq).split('_')[0]
        groupDict.setdefault(ID, []).append(fq)

    fmitolist = os.path.abspath(fmitolist)
    for sample in groupDict:
        fqlist = groupDict[sample]
        wdir = 'mitoRef_%s' % sample
        cmn.mkdir(wdir)
Exemple #2
0
        sys.exit()

    f_table = '/project/biophysics/Nick_lab/wli/sequencing/scripts/name_table'

    nameDict = {}
    for line in cmn.file2lines(f_table):
        items = line.strip().split()
        if len(items) == 0:
            continue
        label = items[0]
        name = '_'.join(items)
        nameDict[label] = name.replace('-', '_')

    print(list(nameDict.keys()))

    t = ete3.Tree(cmn.txt_read(fn).replace('[&U]', ''))

    appear = {}
    for node in t:
        name = node.name
        sp = name.split('_')[0]
        if sp not in appear:
            appear[sp] = 1
        else:
            appear[sp] += 1

        new_name = '%s_cp%s' % (nameDict[sp], appear[sp])
        node.name = new_name

    info = t.write()
    print(info)
Exemple #3
0
        sys.exit()

    f_table = '/project/biophysics/Nick_lab/wli/sequencing/scripts/name_table'

    nameDict = {}
    for line in cmn.file2lines(f_table):
        items = line.strip().split()
        if len(items) == 0:
            continue
        label = items[0]
        name = '_'.join(items[1:])
        nameDict[label] = name.replace('-', '_')

    print(list(nameDict.keys()))

    info = cmn.txt_read(fn)
    #hasDash = ('_' in info)
    hasDash = False
    for label in nameDict:
        name = nameDict[label]
        #data_label = '%s_' % label
        if label.isdigit():
            if hasDash:
                data_label = '%s_' % label
                new_name = '%s_%s_' % (label, name)
            else:
                data_label = '%s' % label
                new_name = '%s_%s' % (label, name)

        else:
            data_label = '%s' % label
Exemple #4
0
        sys.exit()

    key_cmd = sys.argv[1]

    node = '1'
    part = 'super'
    time_hour = '200'
    for i, arg in enumerate(sys.argv):
        if arg == '-n':
            node = sys.argv[i + 1]
        elif arg == '-p':
            part = sys.argv[i + 1]
        elif arg == '-t':
            time_hour = sys.argv[i + 1]

    cwd = os.getcwd()

    aa = cmn.txt_read('/home2/wli/template/slurm.job')

    aa = aa.replace('NODE', node)
    aa = aa.replace('PART', part)
    aa = aa.replace('TIME_HOUR', time_hour)

    #aa+="#$ -pe %sway %s\n\n\n"  % (cpu, cpu)

    aa += 'cd %s\n\n' % cwd

    aa += key_cmd + '\n'

    print(aa)
Exemple #5
0
#input:
#output:
#algorithm:
#author:wenlin; Date:2012-

import sys
python_lib = '/home2/wli/my_programs/python_lib'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn
import ete3

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Entry point: load the tree text from the file given as first argument
    # and print it back using ete3's write(format=9).
    # Usage: *.py bestTree
    try:
        fn = sys.argv[1]
    except IndexError:
        # No tree file given: report the usage and exit with a failure status.
        print("Usage: *.py bestTree", file=sys.stderr)
        sys.exit(1)

    t = ete3.Tree(cmn.txt_read(fn))

    print(t.write(format=9))
        print('please submit %s to the queue for indexing ' % fjob)
    else:
        print('good news! all references have been indexed')
        isIndexed = True
    print('###############################################')

    if not isIndexed:
        print('**********************************************')
        print('\nimportant!!!')
        print('please re-run this script after all references are indexed!\n')
        print('**********************************************')
    ###############################
    #all the steps below would put into the job files

    template = cmn.txt_read(
        '/work/biophysics/mtang/SNP_calling/scripts/templates/template_gatk_unbias4TACC.job'
    )

    cmn.mkdir('job_files')
    fjobs = []
    for sp in refdict:
        print('processing %s' % sp)
        snp_list = refdict[sp]
        for samdir, ref in snp_list:

            #a. make directory
            olabel = '%s_%s' % (sp, ref)
            wdir = '%s/%s' % (cwd, olabel)
            wdir4TACC = '../%s' % olabel
            cmn.mkdir(wdir)
Exemple #7
0
def load_verified_barcodes():
    """Read the verified-barcodes FASTA and derive an ID mapping from it.

    Returns:
        A ``(IDmapping, seqDict)`` tuple.  ``seqDict`` is the result of
        ``read_fa`` on the verified-barcodes file; ``IDmapping`` is the
        result of ``names2IDs`` applied to the list of ``seqDict`` keys.
    """
    verified_path = '/archive/biophysics/Nick_lab/wli/archive/barcodes/auto_tables/verified_barcodes.fa'
    sequences = read_fa(verified_path)
    record_names = list(sequences.keys())
    return names2IDs(record_names), sequences



if __name__=='__main__':
    fn1 = 'sum_denovo.fa'
    fn2 = 'sum_barcodes.fa'
    #fn3 = 'compare.check'

    if 'Error' in cmn.txt_read('compare.check'):
        print('##########################################################################')
        print('Error in running barcode pipeline! please fix lines with "Error" in "compare.check" file!')
        print('##########################################################################')
        #sys.exit()

    replaceIDs = set(cmn.cmd2lines('grep takenD compare.check|grep -v same|cut -f 1'))

    seqDict1 = read_fa(fn1)
    seqDict2 = read_fa(fn2)

    dn = 'sum_hybrid.fa'
    #newDict = {}
    with open(dn, 'w') as fp:
        for name in seqDict2:
            if name in replaceIDs:
Exemple #8
0
    #output the phylip format file
    seqDict = {ID: ''.join(final[ID]) for ID in final}
    length = len(seqDict[ID])

    new = ['%s\t%s' % (len(seqDict), length)]
    for name in seqDict:
        new.append('%s        %s' % (name, seqDict[name]))

    dn = outlabel + '.phylip'
    cmn.write_lines(new, dn)

    #write out the partition file
    ftemplate = '/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/partition_finder.cfg.template'
    fcfg = 'partition_finder.cfg'
    info = cmn.txt_read(ftemplate)
    info = info.replace('[input_phylip]', dn)
    ##Gene3_pos3 = 1452-2208\3;
    print('assuming all are protein coding genes')
    blocks = []
    for name,i,j in setList:
        if True:
            pos = 0
            pLabel = '%s_%s' % (name, pos+1)
            for char in badchars:
                pLabel = pLabel.replace(char, '_')
            line = '%s = %s-%s;\n' % (pLabel, i+pos, j)
            blocks.append(line)
    info = info.replace('[data_block_input]', ''.join(blocks))
    cmn.write_file(info, fcfg)
def add_in_baits(fref):
    """Append bait barcodes missing from the reference and index the result.

    Reads the reference FASTA text from *fref*, applies the edits described
    in a local ``bait_insertion`` file when one exists, then appends every
    bait from ``sampleInfo.baits`` whose defline is not a case-insensitive
    substring of an existing reference ID.  The combined FASTA is written to
    ``species_barcodes_4mapping_withAddon.fa`` in the current directory and
    indexed with ``bwa index``.

    Args:
        fref: Path of the reference barcode FASTA file.

    Returns:
        The name of the combined FASTA file that was written.

    Raises:
        SystemExit: With status 1 when a bait that needs adding has a
            sequence length other than 698 or 658.
    """
    fbait = 'sampleInfo.baits'
    ref_info = cmn.txt_read(fref)
    if cmn.filexist('bait_insertion'):
        indel_dict = read_indel_info('bait_insertion')
    else:
        indel_dict = {}

    # The add-on list always starts empty; no previously saved add-on table
    # is loaded here.
    add_lines = []

    if indel_dict:
        ref_info = insert_in_ref_info(ref_info, indel_dict)
        add_lines = insert_in_lines(add_lines, indel_dict)

    # Upper-case the known IDs once so the per-bait substring checks below
    # do not redo the conversion for every bait.
    ref_ids_upper = [
        line[1:].upper() for line in ref_info.split('\n')
        if line.startswith('>')
    ]
    added_ids_upper = [line.split()[1].upper() for line in add_lines]

    # NOTE(review): baits appended in this loop are not added to
    # added_ids_upper, so a defline repeated inside sampleInfo.baits is
    # appended more than once -- confirm whether that is intended.
    for line in cmn.file2lines(fbait):
        sp, defline, seq = line.strip().split()
        key = defline.upper()
        if any(key in known for known in ref_ids_upper):
            continue  # already present in the reference
        if any(key in known for known in added_ids_upper):
            continue  # already present in the add-on list

        if len(seq) == 698:
            # Presumably already carries the primers -- keep the line as is.
            add_lines.append(line)
        elif len(seq) == 658:
            seq = add_primer(seq)
            add_lines.append('%s\t%s\t%s' % (sp, defline, seq))
        else:
            print('Error! length of bait barcode is wrong for %s %s'
                  % (sp, defline))
            # Non-zero status so a calling pipeline can see the failure.
            sys.exit(1)

    # Convert the add-on lines (species, defline, sequence) into FASTA records.
    add_fasta = []
    for line in add_lines:
        _sp, defline, seq = line.strip().split()
        add_fasta.append('>%s\n%s\n' % (defline, seq))

    ref_info += '\n'
    ref_info += ''.join(add_fasta)
    dn = 'species_barcodes_4mapping_withAddon.fa'
    cmn.write_file(ref_info, dn)

    # Index the combined reference for bwa.
    cmd = 'module add bwa; bwa index %s' % dn
    cmn.run(cmd)

    return dn
Exemple #10
0
    sys.path.append(python_lib)

import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Entry point: write one job file 'BTtree<N>.job' (N counted from 1) for
    # each entry returned by cmn.getid() on the list file.
    # Usage: *.py falist_file
    try:
        fn = sys.argv[1]
    except IndexError:
        # No list file given: report the usage and exit with a failure status.
        print("Usage: *.py falist_file", file=sys.stderr)
        sys.exit(1)

    fns = cmn.getid(fn)
    template = cmn.txt_read(
        '/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/RAxML_tree.job'
    )
    for count, fn in enumerate(fns, 1):
        # The output label is the file's last name with '.fa' removed.
        label = cmn.lastName(fn).replace('.fa', '')
        # Fill the two placeholders of the job template.
        info = template.replace('[FN]', fn)
        info = info.replace('[outlabel]', label)
        dn = 'BTtree%s.job' % count
        cmn.write_file(info, dn)
Exemple #11
0
# Fork the per-job directories under fromDir into ./step3_gatk and write a
# rewritten copy of every job file into ./job_files.
cwd = os.getcwd()

cmn.mkdir('job_files')
cmn.mkdir('step3_gatk')

# Link the step2_bwa_mapping directory that sits next to fromDir.
fromPdir = '/'.join(fromDir.split('/')[:-1])
cmn.run('ln -s %s/step2_bwa_mapping' % fromPdir)

# The user name is the same for every job, so ask the shell only once
# instead of once per loop iteration.
user = cmn.cmd2info('echo $USER').strip()
user_label = user[0]

fjobs = []
for job in jobs:
    # Directory name = job file name without its first and last 4 characters.
    wdir = job[4:-4]
    current = '%s/%s' % (fromDir, wdir)
    cmd = 'cp -r %s step3_gatk' % current
    print('forking data for %s' % current)
    cmn.run(cmd)

    # Point every fromDir path inside the job file at the forked copy.
    fjob = '%s/job_files/%s' % (fromDir, job)
    info = cmn.txt_read(fjob)
    info = info.replace(fromDir, '%s/step3_gatk' % cwd)

    # New job file name: 'g' + first letter of the user name + directory name.
    fjob = 'job_files/g%s%s.job' % (user_label, wdir)
    cmn.write_file(info, fjob)
    fjobs.append(cmn.lastName(fjob))

dn = 'forked_jobs.list'
cmn.write_lines(fjobs, dn)
Exemple #12
0
import sys
python_lib = '/home2/wli/my_programs/python_lib'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Entry point: print every whitespace-separated word of the input file
    # that equals the leading '_'-separated field of a defline in the
    # species barcode reference FASTA.
    # Usage: *.py alist
    try:
        fn = sys.argv[1]
    except IndexError:
        # No input file given: report the usage and exit with a failure status.
        print("Usage: *.py alist", file=sys.stderr)
        sys.exit(1)

    # Deflines of the reference FASTA with the leading '>' cut off.
    all_deflines = cmn.cmd2lines(
        'grep ">" /archive/biophysics/Nick_lab/wli/project/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa|cut -d ">" -f 2'
    )

    all_genus = {each.split('_')[0] for each in all_deflines}

    for word in cmn.txt_read(fn).strip().split():
        if word in all_genus:
            print(word)
    #print taken
    all_fa = sum(list(faDict.values()), [])
    if len(all_fa) == 0:
        print('we find nothing... Please ask Wenlin for help')
        cmn.write_lines(IDs, 'missingMITOonlys')
        sys.exit()

    ass_count = count_ass_appearance(all_fa)
    best_ass = max(list(ass_count.keys()), key=lambda x: ass_count[x])
    print(
        'the most common assembly is %s, only take fa mapped to this assembly'
        % best_ass)
    cmn.write_file(best_ass, 'best_assembly.txt')
    #cmd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/find_falist_mito_checkOnly.py %s %s' % (fn, ' '.join(words))
    #cmn.run(cmd)
    best_ass = cmn.txt_read('best_assembly.txt').strip()

    for ID in faDict:
        alist = faDict[ID]
        taken = [
            each for each in alist
            if best_ass in each.replace('_withMito', '')
        ]

        #print ID, taken

        if best_ass == 'cne' and len(taken) == 0:
            taken += [each for each in alist if '3574_assembly_v1' in each]

        if best_ass == '3574_assembly_v1' and len(taken) == 0:
            taken += [each for each in alist if 'cne' in each]
def read_genus_info_from_bait(fn):
    """Return a one-entry dict built from the first two tokens of file *fn*.

    The file text is split on whitespace; the first token becomes the key
    (ID) and the second the value (genus).  Any further tokens are ignored.

    Raises:
        ValueError: If the file holds fewer than two tokens.
    """
    tokens = cmn.txt_read(fn).strip().split()
    ID, genus = tokens[:2]
    return {ID: genus}
Exemple #15
0
if __name__ == '__main__':
    # Entry point: take a quoted command ("script.py args..."), rewrite the
    # script's source with reformat(), save it as 'profile_<script>' and run
    # that file with python.
    try:
        cmd = sys.argv[1]
    except IndexError:
        # No command given: report the usage and exit with a failure status.
        print("Usage: *.py 'seq2ref.py 254780193'", file=sys.stderr)
        print("the command must contain full python to read it",
              file=sys.stderr)
        sys.exit(1)

    import cmn

    # First token is the script to profile; the rest are its arguments.
    argvs = cmd.split()

    info = cmn.txt_read(argvs[0])

    # Exact-substring check: a script that writes the guard with spaces
    # around '==' is rejected as well.
    if "__name__=='__main__'" not in info:
        print("program doesn't contain the line: __name__=='__main__'",
              file=sys.stderr)
        print("exit! do nothing", file=sys.stderr)
        sys.exit(1)

    # Reformat the source so it is workable for the profiler.
    info = reformat(info, argvs[1:])

    dn = 'profile_%s' % argvs[0]
    cmn.write_file(info, dn)
    report = cmn.cmd2info('python %s' % dn)

    dn = '%s_report' % argvs[0]
Exemple #16
0
python_lib = '/work/00412/mtang/sequencing/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

if __name__ == '__main__':
    # Entry point: read a command file and the slurm job template, with
    # optional -n / -p / -t overrides taken from the command line.
    import cmn, os, sys
    k = len(sys.argv)

    # No arguments at all: print the usage line and stop.
    if k == 1:
        print('usage:make_job.py fn [-n 4 -p 128G -t 24]')
        sys.exit()

    # The first argument is a file whose text is read into key_cmd.
    fn = sys.argv[1]
    key_cmd = cmn.txt_read(fn)

    # Defaults, each overridden by the value following its flag.
    node = '1'
    part = 'normal'
    time_hour = '48'
    # NOTE(review): a flag given as the very last argument has no value after
    # it, so sys.argv[i + 1] raises IndexError.
    for i, arg in enumerate(sys.argv):
        if arg == '-n':
            node = sys.argv[i + 1]
        elif arg == '-p':
            part = sys.argv[i + 1]
        elif arg == '-t':
            time_hour = sys.argv[i + 1]

    cwd = os.getcwd()

    # Text of the slurm job template.
    aa = cmn.txt_read('/work/00412/mtang/sequencing/scripts/slurm.job')
Exemple #17
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Entry point: fill the BEAST XML template with the sequences read from
    # the input file and write the result to '<last name of input>.xml'.
    try:
        fn = sys.argv[1]
    except IndexError:
        # No input file given: report the usage and exit with a failure status.
        print("Usage: *.py", file=sys.stderr)
        sys.exit(1)

    seqDict, length = read_fa(fn)

    template = cmn.txt_read(
        '/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/beast_template.xml'
    )

    # One <sequence> element per entry of seqDict.
    info = [
        '<sequence taxon="%s">%s</sequence>' % (name, seqDict[name])
        for name in seqDict
    ]
    # The empty last item makes the joined text end with a newline.
    info.append('')

    new = template.replace('[WLdata]', '\n'.join(info))
    new = new.replace('[WLlabel]', cmn.lastName(fn))

    dn = cmn.lastName(fn) + '.xml'
    cmn.write_file(new, dn)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    #options=parse_options()
    try:
        fn, ftree = sys.argv[1:3]
    except:
        print("Usage: *.py table ftree", file=sys.stderr)
        sys.exit()


    #ftree = '/project/biophysics/Nick_lab/wli/sequencing/Eudamine/BEAST_timing/current_tree.newick'

    t = ete3.Tree(cmn.txt_read(ftree))

    order_list = []
    nameDict = {}
    for node in t:
        name = node.name.split('_')[0].lstrip("'")
        print(name)
        if '_cp1' in node.name:
            order_list.append(name)
            nameDict[name] = node.name


    #read in fasta
    #seqDict = read_fa(fn)
    table_dict = {}
    for line in cmn.file2lines(fn):
Exemple #19
0
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    # Entry point: create '<vcf name>_wdir', change into it and symlink the
    # VCF and BAM files there.
    # Usage: *.py *.vcf *.bam
    try:
        fvcf, fbam = [os.path.abspath(path) for path in sys.argv[1:]]
    except ValueError:
        # Unpacking fails unless exactly two paths were given; report the
        # usage and exit with a failure status.
        print("Usage: *.py *.vcf *.bam", file=sys.stderr)
        print("Generate the command to phase vcf", file=sys.stderr)
        sys.exit(1)

    template = cmn.txt_read('/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/template_readbackedPhasing.cmds')
    dnlabel = cmn.lastName(fvcf).replace('.vcf', '')
    outdir = '%s_wdir' % dnlabel
    cmn.mkdir(outdir)
    os.chdir(outdir)

    # Taken after the chdir, so cwd is the new working directory.
    cwd = os.getcwd()

    # Link each input into the working directory, then keep only the last
    # name of the path in the variable.
    cmd = 'ln -s %s' % fvcf
    cmn.run(cmd)
    fvcf = cmn.lastName(fvcf)

    cmd = 'ln -s %s' % fbam
    cmn.run(cmd)
    fbam = cmn.lastName(fbam)