def find_tree_file(wdir, name):
    """Return the first tree file listed for a project directory.

    Files matching ``<wdir>/*renamed`` are preferred; only when that listing
    is empty is ``<wdir>/<name>*tre`` consulted.  Both listings use ``ls -t``.
    If neither yields anything, an error is printed and the process exits.
    """
    candidates = cmn.cmd2lines('ls -t %s/*renamed' % wdir)
    if not candidates:
        # fall back to the raw tree named after the project
        candidates = candidates + cmn.cmd2lines(
            'ls -t %s/%s*tre' % (wdir, name))

    if candidates:
        return candidates[0]

    print('Error! can not find tree in %s' % wdir)
    sys.exit()
Esempio n. 2
0
def backup_finalStat(wdir):
    """Archive the ``*.report`` files of *wdir*, keeping the better copy.

    A report is copied into the archive directory when no archived copy
    exists, when it has fewer NA entries than the archived one, or when the
    NA counts tie and it has more data (both counts come from
    ``count_final_stat``).  Reports whose label has no second
    ``_``-separated field are skipped.  Finally all archived reports are
    concatenated into ``allstat.txt``.
    """
    ddir = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/final_stats/'
    reports = cmn.cmd2lines('ls %s/*.report| grep -v all_genome' % wdir)
    for report in reports:
        print('processing %s...' % report)
        base = cmn.lastName(report)

        # a label with a single field carries no species name: do not archive
        fields = base.replace('_stat.report', '').split('_')
        if len(fields) == 1:
            print('skip the fasta without sp for %s' % report)
            continue

        archived = '%s/%s' % (ddir, base)
        copy_cmd = 'cp %s %s' % (report, archived)

        if not os.path.exists(archived):
            cmn.run(copy_cmd)
            continue

        # both versions exist: keep the one with least NA, then most data
        print('merging new and old data for %s' % base)
        Nold_na, Nold_data = count_final_stat(archived)
        Nnew_na, Nnew_data = count_final_stat(report)
        if Nnew_na < Nold_na or (Nnew_na == Nold_na
                                 and Nnew_data > Nold_data):
            cmn.run(copy_cmd)

    cmn.run('cd %s; cat *.report > allstat.txt' % ddir)
Esempio n. 3
0
def old_log_newBaits_ifPossible(seqs):
    """Record previously unseen bait sequences and rebuild the verify database.

    Args:
        seqs: mapping of sequence name -> sequence string.

    Names that are neither headers of ``species_barcodes_4mapping.fa`` nor
    already present in ``addedBaits_fromPipeline.fa`` are added to the latter
    (trimmed to ``[20:678]``).  The added-baits file is rewritten, then its
    content is merged with ``all_barcodes_4verify.fa`` (entries from the
    verify file win on name clashes), that file is rewritten and its BLAST
    database is regenerated.
    """
    fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'
    takenNames = {
        each.strip()[1:] for each in cmn.cmd2lines('grep ">" %s' % fall)
    }

    fnew = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/addedBaits_fromPipeline.fa'
    seqDict = read_fa(fnew)

    # Remember which names are new so they can be reported while saving.
    # (The previous check ran against seqDict while iterating seqDict itself,
    # so the "saving" message could never be printed.)
    added = set()
    for name in seqs:
        if name not in takenNames and name not in seqDict:
            # NOTE(review): presumably 20 flanking bases followed by a
            # 658-base barcode -- confirm against the caller.
            seqDict[name] = seqs[name][20:678]
            added.add(name)

    with open(fnew, 'w') as dp:
        for name in seqDict:
            if name in added:
                print('saving %s into database...' % name)
            dp.write('>%s\n%s\n' % (name, seqDict[name]))

    fverify = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa'
    dict2 = read_fa(fverify)
    seqDict.update(dict2)
    with open(fverify, 'w') as dp:
        for name in seqDict:
            # clean the header and turn gaps into N before writing
            fasta = '>%s\n%s\n' % (name.replace(
                '(assembled)', '').strip('.'), seqDict[name].replace('-', 'N'))
            dp.write(fasta)
    cmd = 'module add blast;cd /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes; makeblastdb -in=all_barcodes_4verify.fa -dbtype=nucl; chmod a+w all_barcodes_4verify.*'
    cmn.run(cmd)
Esempio n. 4
0
def read_compare_file(fn, fcheck, ID):
    """Classify the line of *fn* that starts with ``ID<TAB>``.

    The first line returned by the grep is inspected for a series of
    keywords, tested in priority order.  Returns a ``(label, indices)``
    tuple.  *fcheck* is accepted but not used.  An ``IndexError`` is raised
    when the grep returns no line.
    """
    line = cmn.cmd2lines("grep -P '^%s\t' %s" % (ID, fn))[0]
    #TODO: need to rework
    if 'same' in line:
        return 'mostConfident', [0]
    if 'diffGenus' in line:
        return 'diffGenus', [0, 1]
    if 'takenD' in line:
        return 'confident', [0]

    denovo_complete = 'completeDenovo' in line
    good_cc = 'goodCC' in line
    if denovo_complete:
        return ('confident', [0]) if good_cc else ('denovoOnly', [0])
    if good_cc:
        return 'goodRef', [1]

    # everything below is a somewhat problematic sample
    if 'Error' in line:
        return 'Error', [0, 1]
    if isHighRatio(line):
        return 'Suspicius', [0, 1]
    if 'Gap0' not in line:
        return 'PoorSample', [1]
    if 'noDenovo' in line:
        return 'refOnly', [0]
    return 'TODO', [0, 1]
def read_baits():
    """Load every ``baits/bait*.fa`` file into a dict.

    Each file must yield exactly two lines (header, sequence).  The key is
    the header without its first character; the value is the sequence as a
    list of characters.
    """
    baits = {}
    for path in cmn.cmd2lines('ls baits/bait*.fa'):
        header, residues = cmn.file2lines(path)
        baits[header[1:]] = list(residues)
    return baits
Esempio n. 6
0
def search_for_old_fastq(label, wdirs):
    """Collect ``<label>*q`` entries listed in *wdirs*.

    Entries containing the module-level ``selfDir`` string are left out.
    Listing errors are discarded by the shell command itself.
    """
    global selfDir
    found = []
    for wdir in wdirs:
        listing = cmn.cmd2lines('ls %s/%s*q 2> /dev/null' % (wdir, label))
        for path in listing:
            if selfDir in path:
                continue
            found.append(path)
    return found
Esempio n. 7
0
def prune_tree(ftree, fseq):
    """Prune the tree in *ftree* to the IDs found in the fasta *fseq*.

    The IDs are whatever follows ``>`` on the header lines of *fseq*.  The
    pruned tree is written (ete3 newick ``format=1``) to ``prune_tree.tre``
    in the current directory, and that file name is returned.
    """
    tree = ete3.Tree(ftree)
    keep_ids = cmn.cmd2lines('grep ">" %s|cut -d ">" -f 2' % fseq)
    tree.prune(keep_ids)

    out_name = 'prune_tree.tre'
    newick = tree.write(format=1)
    cmn.write_file(newick, out_name)
    return out_name
Esempio n. 8
0
def parse_fastq_dir(wdir):
    """Group the ``<wdir>/*q`` entries by sample prefix.

    The prefix is the part of ``cmn.lastName(path)`` before the first ``_``.
    Returns a dict mapping prefix -> list of paths, in listing order.
    """
    grouped = {}
    for path in cmn.cmd2lines('ls %s/*q' % wdir):
        prefix = cmn.lastName(path).split('_')[0]
        grouped.setdefault(prefix, []).append(path)
    return grouped
Esempio n. 9
0
def do_barcode_blast(sequence):
    """BLAST a fasta record against the species barcode database.

    Args:
        sequence: fasta text whose first whitespace-delimited token is the
            ``>``-prefixed header.

    Returns:
        The lines produced by ``blastn`` with
        ``-outfmt '6 sseqid qlen slen length pident'``.

    The record is written to ``/tmp/<label>.fa``, where the label is derived
    from the header.  The query file is left in place afterwards (its removal
    is deliberately commented out).
    """
    fdb = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'
    #fdb = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_NoN_0.95.fasta'
    namelabel = sequence.split()[0][1:].split()[0].split('|')[0].replace('*','').split('[')[0].replace('"','').replace("'", '')
    # a '/' in the header would make the temp path point into a
    # non-existent sub-directory of /tmp
    namelabel = namelabel.replace('/', '_')
    fquery = '/tmp/%s.fa' % namelabel
    cmn.write_file(sequence, fquery)
    cmd = 'module add blast; blastn -query %s -db %s ' % (fquery, fdb)
    cmd += '-outfmt \'6 sseqid qlen slen length pident\''
    lines = cmn.cmd2lines(cmd)
    #cmn.run('rm %s' % fquery)
    return lines
Esempio n. 10
0
def find_annotation(CG):
    """Return the comma-joined annotations found for gene *CG*.

    ``'NA'`` is passed through unchanged.  Otherwise every line of
    ``gene_association.fb`` matched by ``grep CG`` contributes its 10th
    tab-separated field; duplicates are collapsed and the order of the
    joined values is that of set iteration.
    """
    if CG == 'NA':
        return 'NA'

    fn = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/gene_association.fb'
    hits = cmn.cmd2lines('grep %s %s' % (CG, fn))
    unique_annos = {hit.split('\t')[9] for hit in hits}
    return ','.join(unique_annos)
Esempio n. 11
0
def get_names():
    """Build a dict of sample key -> underscore-joined sample description.

    Every line of every ``*.sampleData`` file (listed with ``ls -tr``) is
    processed.  The key is the part of the first field after its last ``-``.
    The value is the whole line with the first field replaced by that key,
    ``-`` turned into ``_``, parentheses removed and whitespace runs joined
    by ``_``.  Later lines overwrite earlier ones with the same key.
    """
    names = {}
    listing = cmn.cmd2lines('ls -tr /project/biophysics/Nick_lab/wli/sequencing/scripts/data/*.sampleData')
    for path in listing:
        for raw in cmn.file2lines(path):
            text = raw.strip()
            first_field = text.split()[0]
            sp = first_field.split('-')[-1]
            cleaned = text.replace(first_field, sp)
            cleaned = cleaned.replace('-', '_')
            cleaned = cleaned.replace('(', '').replace(')', '')
            names[sp] = '_'.join(cleaned.split())
    return names
def do_barcode_blast(sequence, seqDict):
    """BLAST a fasta record against a database built from *seqDict*.

    The database path comes from ``makeBlastDatabase(seqDict)``.  The record
    is written to ``/tmp/<label>.fa`` (label derived from the header, with
    ``/`` replaced by ``_``), queried with ``blastn`` and then removed.
    Returns the blastn output lines
    (``-outfmt '6 sseqid qlen slen length pident'``).
    """
    fdb = makeBlastDatabase(seqDict)

    # derive a file-name-safe label from the first token of the header
    header = sequence.split()[0][1:]
    label = header.split()[0].split('|')[0].replace('*', '').split('[')[0]
    label = label.replace('"', '').replace("'", '').replace('/', '_')

    fquery = '/tmp/%s.fa' % label
    cmn.write_file(sequence, fquery)

    blast_cmd = (
        "module add blast; blastn -query %s -db %s "
        "-outfmt '6 sseqid qlen slen length pident'" % (fquery, fdb))
    hits = cmn.cmd2lines(blast_cmd)
    cmn.run('rm %s' % fquery)
    return hits
Esempio n. 13
0
def get_query_sequence(seqDict, genus, sp):
    """Choose a reference barcode for ``genus sp`` and return it as fasta.

    Args:
        seqDict: mapping of barcode name -> sequence string.
        genus: genus keyword looked for inside the barcode names.
        sp: species keyword; grepped in the Eudaminae reference file and
            looked for inside the barcode names.

    Returns:
        ``(fasta, qlen)`` where *fasta* is ``'>name\\nseq\\n'`` and *qlen* is
        the number of non-``N`` characters of the chosen sequence.

    Raises:
        KeyError: the single Eudaminae match is not a key of *seqDict*.
        ValueError: *seqDict* is empty, so there is nothing to choose from.

    Selection order:
      1. exactly one grep hit in the Eudaminae file -> first token of that line;
      2. otherwise names containing *genus* (all names if none match),
         narrowed to names containing *sp* when possible, then to names
         starting with ``*`` or, failing that, containing ``*``;
      3. among the remaining names, the one with the most non-``N`` bases.
    """
    #1. anything in Eudamine file has higher priority
    fEud = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/Eudaminae-barcode-reference.txt'
    lines = cmn.cmd2lines('grep %s %s' % (sp, fEud))
    if len(lines) == 1:
        name = lines[0].split()[0]
    else:
        #look it up in other files
        names = list(seqDict.keys())
        good_names = [each for each in names if genus in each]
        if not good_names:  #sp is just 'sp'
            print('can not find barcode for genus keyword "%s"' % genus)
            good_names = names

        if len(good_names) > 1:
            #try to refine it
            refined = [each for each in good_names if sp in each]
            if refined:
                good_names = refined

        #try to see if type species is there
        starred = [each for each in good_names if each.startswith('*')]
        if not starred:
            starred = [each for each in good_names if '*' in each]
        if starred:
            good_names = starred

        # Prefer the candidate with the most informative (non-N) bases.  The
        # previous key replaced 'N' by '-', which leaves the length unchanged
        # and therefore never penalised N-rich sequences.
        name = max(good_names, key=lambda x: len(seqDict[x].replace('N', '')))

    seq = seqDict[name]
    fasta = '>%s\n%s\n' % (name, seq)
    qlen = len(seq.replace('N', ''))
    print('pick %s for %s %s' % (name, genus, sp))
    return fasta, qlen
Esempio n. 14
0
def parse_info_file(wdir):
    """Group the lines of every ``<wdir>/fastq*`` file by sample prefix.

    Each stripped line is keyed by the part of ``cmn.lastName(line)`` before
    the first ``_``.

    Returns:
        dict mapping prefix -> set of the stripped lines sharing that prefix.
    """
    fns = cmn.cmd2lines(' ls %s/fastq*' % wdir)
    adict = {}
    for fn in fns:
        # the context manager closes the handle even if a line fails to parse
        with open(fn) as fp:
            for line in fp:
                line = line.strip()
                sp = cmn.lastName(line).split('_')[0]
                adict.setdefault(sp, set()).add(line)

    return adict
Esempio n. 15
0
def read_rep():
    """Return a dict of scaffold -> set of positions covered by repeats.

    If ``rep.dict.pkl`` exists it is loaded and returned.  Otherwise every
    ``annotation_repeats/*.gff3`` file is parsed: for each feature line the
    first column is the scaffold and columns 4-5 are read as integers
    ``i, j``; positions ``range(i, j)`` are added to the scaffold's set.  The
    result is pickled to ``rep.dict.pkl`` before being returned.
    """
    dn = 'rep.dict.pkl'
    if cmn.filexist(dn):
        print('loading repeats using precomputed data...')
        return cmn.pickle_read(dn)

    freps = cmn.cmd2lines('ls annotation_repeats/*.gff3')
    repdict = {}
    for frep in freps:
        # read the file being iterated (the loop previously referenced an
        # undefined name ``fn`` here)
        for line in cmn.file2lines(frep):
            line = line.strip()
            # GFF3 directives/comments start with '#'; they carry no coordinates
            if not line or line.startswith('#'):
                continue
            items = line.split()
            scaf = items[0]
            i, j = map(int, items[3:5])
            # NOTE(review): range(i, j) leaves out position j, while GFF3 end
            # coordinates are inclusive -- confirm this is intended.
            repdict.setdefault(scaf, set()).update(range(i, j))
    cmn.pickle_write(repdict, dn)
    return repdict
def split_and_order_sp_byTree(sampleIDs):
    """Split *sampleIDs* by tree project and order each subset by its tree.

    For every project returned by ``get_newest_tree_dir`` the project's
    ``NickList`` is read (``NVG-`` removed, ``LEP-`` turned into ``LEP``).
    When it shares IDs with *sampleIDs*, those IDs are ordered with
    ``order_ID_byTree`` on the tree located by ``find_tree_file``.

    Returns a dict mapping project name -> ordered IDs; projects without
    overlap are absent.
    """
    treeDir = '/project/biophysics/Nick_lab/mtang/building_trees'
    newest = get_newest_tree_dir(cmn.cmd2lines('ls %s' % treeDir))

    rdict = {}
    for projectName in newest:
        wdir = '%s/%s' % (treeDir, newest[projectName])
        print('pick %s for %s' % (wdir, projectName))

        listed = []
        for raw in cmn.file2lines('%s/NickList' % (wdir)):
            listed.append(raw.replace('NVG-', '').replace('LEP-', 'LEP'))

        shared = set(listed) & set(sampleIDs)
        if not shared:
            continue

        ftree = find_tree_file(wdir, projectName)
        rdict[projectName] = order_ID_byTree(ftree, shared)

    return rdict
Esempio n. 17
0
def backup_vcf_coverage(wdir):
    """Archive the ``*_vcf.cov`` files of *wdir*, keeping the higher coverage.

    Only files whose last line has six whitespace-separated fields are
    considered (others are reported as old format).  The compared value is
    the second-to-last field of the last line; a file replaces the archived
    copy only when its value is strictly greater.
    """
    ddir = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/check_vcf_coverage'

    #1. only back up the new version of cov file
    for cov_file in cmn.cmd2lines('ls %s/*_vcf.cov' % wdir):
        print('processing %s...' % cov_file)
        lines = cmn.file2lines(cov_file)
        last_fields = lines[-1].strip().split()
        if len(last_fields) != 6:
            print('skip old format file %s' % cov_file)
            continue

        fnlabel = cmn.lastName(cov_file)
        archived = '%s/%s' % (ddir, fnlabel)
        if os.path.exists(archived):
            print('merging new and old data for %s' % fnlabel)
            covOld = float(cmn.file2lines(archived)[-1].split()[-2])
            cov = float(lines[-1].split()[-2])
            if not cov > covOld:
                continue
        cmn.run('cp %s %s' % (cov_file, archived))
Esempio n. 18
0
def backup_fasta(wdir):
    """Archive the ``*_m2s.fa`` files of *wdir*, keeping the least gapped copy.

    Files whose label (with the ``_snp_step2[_MITO]_m2s.fa`` suffix removed)
    has no second ``_``-separated field are skipped.  An existing archived
    copy is replaced only when ``count_fasta_nonGap`` is strictly larger for
    the new file.
    """
    ddir = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/map2fasta'
    for fasta in cmn.cmd2lines('ls %s/*_m2s.fa| grep -v all_genome' % wdir):
        print('processing %s...' % fasta)
        fnlabel = cmn.lastName(fasta)

        # a label with a single field carries no species name: do not archive
        stem = fnlabel.replace('_snp_step2_MITO_m2s.fa', '')
        stem = stem.replace('_snp_step2_m2s.fa', '')
        if len(stem.split('_')) == 1:
            print('skip the fasta without sp for %s' % fasta)
            continue

        archived = '%s/%s' % (ddir, fnlabel)
        if os.path.exists(archived):
            print('merging new and old data for %s' % fnlabel)
            Nold = count_fasta_nonGap(archived)
            Nnew = count_fasta_nonGap(fasta)
            if not Nnew > Nold:
                continue
        cmn.run('cp %s %s' % (fasta, archived))
Esempio n. 19
0
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
        Ncores = int(sys.argv[2])
    except:
        print("Usage: *.py fa Ncores", file=sys.stderr)
        sys.exit()

    #if the nodes are less than 4 taxa, produce a random tree
    cmd = "grep '>' %s" % (fn)
    lines = [
        each[1:].strip() for each in cmn.cmd2lines(cmd) if each.strip() != ''
    ]

    N = len(lines)
    if N < 4:
        print('Warning: fastme can not make tree of less than 4 taxa')
        print('Warning: so I make a fake tree...')
        dn = '%s.phylip.fastme.tre' % cmn.lastName(fn)
        if N == 1:
            info = '(%s);\n' % lines[0]
        if N == 2:
            a, b = lines
            info = '(%s,%s);\n' % (a, b)
        elif N == 3:
            a, b, c = lines
            info = '((%s,%s),%s);\n' % (a, b, c)
Esempio n. 20
0
        sys.exit()

    #step 1 including 1. making the pileup file, 2. correct bias, 3. sam map again 4. snp call till last step
    #step 2: just the snp call using multiple CPUs

    #only put 15 jobs in a node to avoid memmory problem
    f_ass = '/project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/assembly_v2.fa'
    #should not run for reference genome; this should just build by original snp call
    ref_sp = '3935'

    step1dir = 'step1_cmds'
    cmn.mkdir(step1dir)

    #fns = cmn.getid(fn)
    fns = cmn.cmd2lines(
        'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/*/realigned_reads.bam'
    )
    finished_pileups = cmn.cmd2lines(
        'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/*/*.pileup'
    )

    finished = set(
        [cmn.lastName(i).split('_')[0] for i in cmn.cmd2lines('ls */*.vcf')])
    step1_finished = set([
        cmn.lastName(i).split('/')[0]
        for i in cmn.cmd2lines('ls */realigned_reads.bam')
    ])

    template = cmn.txt_read(
        '/project/biophysics/Nick_lab/wli/sequencing/myAnalysis/clean_ref_bias/1_gatk_runs/step1_job.template'
    )
        fn=sys.argv[1]
    except:
        print("Usage: *.py NickList requred_keywords", file=sys.stderr)
        sys.exit()

    words = sys.argv[2:]

    IDs = set(cmn.file2lines(fn))

    #note: cne is equal to 3574_assembly_v1
    wdirs = [
            '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/map2fasta',
            '/project/biophysics/Nick_lab/mtang/unbias_SNPs/*/step4_postprocessing/map2fasta'
            ]

    alea_list = cmn.cmd2lines('ssh [email protected] "ls /archive/butterfly/unbias_pipeline_info/step4_postprocessing/map2fasta/*MITO*.fa"')


    missing = []
    falist = []
    required = ''
    if len(words) != 0:
        required = '|' + '|'.join(['grep %s' % word for word in words])

    faDict = {}
    for ID in IDs:
        taken = []
        for wdir in wdirs:
            cmd = 'ls %s/%s*_m2s.fa  2> /dev/null| grep MITO %s' % (wdir, ID, required)
            taken += cmn.cmd2lines(cmd)
Esempio n. 22
0
    except:
        print("Usage: *.py NickList requred_keywords", file=sys.stderr)
        sys.exit()

    words = sys.argv[2:]

    IDs = set(cmn.file2lines(fn))

    #note: cne is equal to 3574_assembly_v1
    wdirs = [
        '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/vcf2map',
        '/project/biophysics/Nick_lab/mtang/unbias_SNPs/*/step4_postprocessing/vcf2map'
    ]

    alea_list = cmn.cmd2lines(
        'ssh [email protected] "ls /archive/butterfly/unbias_pipeline_info/step4_postprocessing/vcf2map/*.map| grep -v mitogenome"'
    )

    missing = []
    falist = []
    required = ''
    if len(words) != 0:
        for word in words:
            word = word.strip()
            if word == '3574_assembly_v1' or word == 'cne':
                word = '"3574_assembly_v1\|cne"'

            required += '| grep %s' % word

    words = set(words)
Esempio n. 23
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Count total and aligned records in the *.sam files of the best mapping
    # directory of <wdir>, then print: sample, best dir, aligned, total.
    #options=parse_options()
    try:
        wdir = sys.argv[1]
    except IndexError:
        # only a missing argument is a usage error
        print("Usage: *.py wdir", file=sys.stderr)
        sys.exit()

    total = 0
    unaligned = 0
    bestDir = cmn.txt_read('%s/best_mapping.txt' % wdir)
    fns = cmn.cmd2lines('ls %s/%s/*.sam' % (wdir, bestDir))
    # third path component from the end of the first sam file path
    sp = fns[0].split('/')[-3]
    for fn in fns:
        # the context manager closes the handle even if a record is malformed
        with open(fn) as fp:
            for line in fp:
                if line.startswith('@'):
                    # SAM header line
                    continue
                total += 1
                # third column '*' means the record has no reference name
                if line.strip().split()[2] == '*':
                    unaligned += 1
    print(sp, bestDir, (total - unaligned), total)
Esempio n. 24
0
import sys
import os

python_lib = '/work/biophysics/mtang/SNP_calling/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

#1. read in data
# entries obtained from the file given as first command-line argument
fns = cmn.getid(sys.argv[1])

# fasta outputs already present in the current directory
falist = cmn.cmd2lines('ls *m2s.fa')

# names of the .map files those fasta outputs correspond to
finished_maps = set([fn.replace('_m2s.fa', '.map') for fn in falist])

# stays True only if every input already has its fasta output
isGood = True

# one conversion command per input that still lacks a fasta output
cmds = []
for fn in fns:
    label = cmn.lastName(fn)
    if label in finished_maps:
        continue

    isGood = False
    # labels containing 'MITO' go through the dedicated mito script
    if 'MITO' in label:
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta_mito.py %s' % fn
    else:
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta.py %s' % fn
    cmds.append(cmd)
Esempio n. 25
0
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn
import os
from fullname_lib import get_names_4barcode
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Report every sampleRun_* directory whose good_read_assembled.txt does
    # not mention the expected genus within its first two lines.

    #infoLines = cmn.cmd2lines('head -n 1 sampleRun*/rescued_read_assembled_mis1*.txt')
    # second '_'-separated field of each sampleRun_* directory name,
    # leaving out names containing 'fake'
    IDlist = cmn.cmd2lines("ls -d sampleRun_* |grep -v fake|cut -d '_' -f 2")

    nameDict = get_names_4barcode()

    for ID in IDlist:
        items = nameDict[ID].replace('?', '').split()
        # NOTE: this rebinds the loop variable ID to the first field of the name entry
        ID, genus, sp = items[:3]
        print('sampleInfo', ID, genus, sp)

        fn = 'sampleRun_%s/good_read_assembled.txt' % ID
        #label = '%s_%s' % (genus, sp)
        cmd = 'head %s -n 2| grep %s' % (fn, genus)
        print(cmd)
        info = cmn.cmd2info(cmd).strip()
        # empty grep output: the genus is absent from the first two lines
        if info == '':
            print('please re-run', ID, genus, sp)
Esempio n. 26
0
def group_fastq(fns):
    """Group the paths in *fns* by sample prefix.

    The prefix is the part of ``cmn.lastName(path)`` before the first ``_``.
    Returns a dict mapping prefix -> list of paths, in input order.
    """
    grouped = {}
    for path in fns:
        prefix = cmn.lastName(path).split('_')[0]
        grouped.setdefault(prefix, []).append(path)
    return grouped


#~~~~~~~~main~~~~~~~~~~~~~~#
wdir = sys.argv[1].rstrip('/')
selfDir = os.path.abspath(wdir)

fastqs = cmn.cmd2lines('ls %s/*q' % wdir)
print(fastqs)

log_info = []
outdir = '%s/fastq_thisBatch' % wdir
cmn.mkdir(outdir)

hasCombined = False
for fastq in fastqs:
    label = '.'.join(cmn.lastName(fastq).split('.')[:-1])
    print('processing %s' % label)
    old_fastqs = search_for_old_fastq(label, search_dirs)
    dn = '%s/%s' % (outdir, cmn.lastName(fastq))
    if not os.path.exists(dn):
        cmn.run('mv %s %s' % (fastq, dn))
    else:
Esempio n. 27
0
    except:
        print(
            "Usage: *.py ../step1_gather_data/mapping_info.txt ../step2_bwa_mapping ../step1_gather_data/require_SNPs.dict.pkl TACC_IDs",
            file=sys.stderr)
        sys.exit()

    cwd = os.getcwd()

    #subsetIDs = set(cmn.getid(fsubset))
    subsetJobs = set([
        cmn.lastName(line.replace('sbatch', '').strip())[4:-4]
        for line in cmn.file2lines(fsubset)
    ])

    #1. read in info
    fsams = cmn.cmd2lines('ls %s/*/*/*.sam' % mapdir)
    #print fsams
    samdirs = set(['/'.join(fsam.split('/')[:-2]) for fsam in fsams])
    #print samdirs
    require_refs = cmn.pickle_read(freq)

    fq_dict = {}
    refdict = {}
    #1. tell by reftable
    #make the requirement by the reftable
    for line in cmn.file2lines(freftable):
        items = line.strip().split()
        sp = items[0]
        fastqs = items[1].split(',')
        fq_dict[sp] = fastqs
Esempio n. 28
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # For every file in <indir>, link it into split_run_<label>/ and write a
    # job file (job_files/sg<count>.job) that runs RAxML on it.
    #options=parse_options()
    try:
        indir, label = sys.argv[1:3]
    except ValueError:
        # fewer than two arguments: the unpacking above fails
        print("Usage: *.py ../1_process_alignment/noGap_splits noGap",
              file=sys.stderr)
        sys.exit()

    # absolute paths are needed because the working directory changes below
    fns = [os.path.abspath(i) for i in cmn.cmd2lines('ls %s/*' % indir)]

    wdir = 'split_run_%s' % label
    cmn.mkdir(wdir)
    os.chdir(wdir)
    cmn.mkdir('job_files')
    for count, fn in enumerate(fns):
        cmn.run('ln -s %s' % fn)
        fn_new = cmn.lastName(fn)
        # remove output of a previous run with the same run name, then run RAxML
        cmd = 'rm *%s_%s; /home2/wli/local/RAxML/raxmlHPC-PTHREADS-SSE3 -m GTRGAMMA -s %s -n %s_%s -p 7112 -T 48 ' % (
            label, count, fn_new, label, count)
        dn = 'job_files/sg%s.job' % count
        cmn.run('/home2/wli/my_programs/make_job.py "%s" -p 256GB -t 33 > %s' %
                (cmd, dn))
Esempio n. 29
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Tag the non-'same' lines of compare.check whose sample is absent from
    # the report file given as first argument.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py *.report", file=sys.stderr)
        sys.exit()

    # NOTE: fcomp is not used below; the grep command hard-codes the same name
    fcomp = 'compare.check'

    cmd = 'grep -v same compare.check'
    lines = cmn.cmd2lines(cmd)

    #exclude those with gap0 and nothing in report file
    # sample prefixes (text before the first '_' of the first column) of every
    # non-comment line of the report
    badSp = set([])
    for line in cmn.file2lines(fn):
        if line[0] == '#':
            continue
        sp = line.split()[0].split('_')[0]
        badSp.add(sp)

    for line in lines:
        items = line.strip().split()
        sp = items[0]
        # third column 'Gap0', sample not in the report, last column contains 'no'
        if sp not in badSp and items[2] == 'Gap0':
            if 'no' in items[-1]:
                line = 'goodReport:' + line
    stackSeq = parse_del_p(stackSeq, del_p)
    cleanSeq = parse_del_p(cleanSeq, del_p)
    threadSeq = parse_del_p(threadSeq, del_p)
    return threadSeq, stackSeq, cleanSeq


def parse_del_p(seq, del_p):
    """Return *seq* joined into a string without the positions in *del_p*.

    Args:
        seq: string (or sequence of strings) to filter.
        del_p: iterable of 0-based positions to drop.

    Returns:
        The remaining characters, in order, joined into one string.
    """
    # One-off conversion so each membership test is O(1) even when the
    # caller passes a list of positions.
    if isinstance(del_p, (set, frozenset, dict)):
        drop = del_p
    else:
        drop = set(del_p)
    return ''.join(char for i, char in enumerate(seq) if i not in drop)


if __name__ == '__main__':
    #olines = cmn.cmd2lines('grep "thread\|stack" sampleRun_*/good_read_assembled.txt')
    #lines = cmn.cmd2lines('grep -H "threaded_\|stack_\|clean_" sampleRun_*/rescued_read_assembled_mis1*.txt')
    wdirs = cmn.cmd2lines('ls -d sampleRun_*')

    #cmn.mkdir('sampleRun_fake')
    #cmn.run('touch sampleRun_fake/good_read_assembled.txt')
    #cmn.run('touch sampleRun_fake/rescued_read_assembled_mis1.txt')
    #time.sleep(2)

    cmn.run('rm cannot_fixed_indel.txt 2> /dev/null')
    #frecords = cmn.cmd2lines('ls sampleRun_*/pickingLog.txt')

    stack_lines = {}
    thread_lines = {}
    clean_lines = {}
    leftN = 20
    barcodeLength = 658
    for wdir in wdirs: