예제 #1
0
def compute_fileSize(alist):
    """Return the summed size of every path in ``alist``.

    A path containing ``'archive/butterfly'`` is sized by running a helper
    script over ssh and parsing its stripped output as an integer.  Any other
    path is sized with ``cmn.filesize`` and the result is divided by 1024
    twice.  An empty ``alist`` yields ``0``.
    """
    # NOTE(review): '[email protected]' looks like a scrubbed user@host value;
    # confirm the real ssh target before relying on the remote branch.
    remote_template = 'ssh [email protected] "python /home/wenlin/my_programs/filesize.py %s"'

    def _size_of(path):
        if 'archive/butterfly' not in path:
            return cmn.filesize(path) / 1024 / 1024
        return int(cmn.cmd2info(remote_template % path).strip())

    return sum(_size_of(path) for path in alist)
예제 #2
0
def update_SRNP_species(line2):
    """Return a ``species|SRNP`` name for ``line2``, or ``line2`` itself.

    The SRNP number is obtained with ``find_SRNPnumber`` and passed to the
    external ``updateSRNPnumber.py`` script (the command is printed first).
    When the script prints something, its whitespace-separated words are
    joined with ``_`` and suffixed with ``|<SRNP number>``.  When it prints
    nothing, ``line2`` is returned unchanged.
    """
    srnp_number = find_SRNPnumber(line2)
    command = '/home2/wli/anaconda/bin/python /archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/updateSRNPnumber.py %s' % srnp_number
    print(command)

    species = cmn.cmd2info(command).strip()
    if species == '':
        # Nothing came back from the script: keep the caller's name.
        return line2
    return '%s|%s' % ('_'.join(species.split()), srnp_number)
예제 #3
0
def remove_duplication(alist):
    """Split ``alist`` into files with a new line count and repeats.

    Each file's line count is read from ``wc -l``.  The first file seen with
    a given count is kept; any later file with the same count is reported as
    a duplicate.

    Returns:
        A tuple ``(kept, duplicates)`` of two lists of file names.
    """
    first_by_count = {}
    repeats = []
    for path in alist:
        wc_output = cmn.cmd2info('wc -l %s' % path)
        line_count = int(wc_output.strip().split()[0])
        if line_count not in first_by_count:
            first_by_count[line_count] = path
            continue
        repeats.append(path)
    return list(first_by_count.values()), repeats
예제 #4
0
def old_find_reference(fn):
    """Return the reference named on the bwa ``@PG`` header line of ``fn``.

    The header is read with ``samtools view -H`` and filtered down to the
    ``@PG`` lines that mention bwa.  The token right after the first ``-M``
    token is taken as the reference, and a trailing ``.fa`` is removed from
    it.  The result is printed before being returned.

    Args:
        fn: path handed to ``samtools view -H``.

    Returns:
        The reference string without a trailing ``.fa``.

    Raises:
        ValueError: if the filtered header has no ``-M`` token that is
            followed by another token.
    """
    cmd = 'samtools view -H %s| grep "@PG"| grep bwa' % fn
    items = cmn.cmd2info(cmd).strip().split()

    ref = None
    # Only look at tokens that have a successor, so a trailing '-M' cannot
    # index past the end of the list.
    for i, item in enumerate(items[:-1]):
        if item == '-M':
            ref = items[i + 1]
            break

    if ref is None:
        # Previously this fell through to an UnboundLocalError on ``ref``.
        raise ValueError(
            'no "-M <reference>" found in the bwa @PG header of %s' % fn)

    if ref.endswith('.fa'):
        ref = ref[:-3]

    print('found ref: %s' % ref)
    return ref
예제 #5
0
def check_difference(seq1, seq2):
    """Return the number of differences between ``seq1`` and ``seq2``.

    Both lengths are printed first.  Sequences of equal length are compared
    position by position, skipping any position where either character is in
    the module-level ``gapChars``.  Sequences of different length are written
    to ``tmpSeq1.fa`` / ``tmpSeq2.fa`` and compared with ``blastn``; the
    result is then the alignment total minus the identity count taken from
    the report, and the report is saved as ``checkTmp<ID>.br`` (``ID`` is a
    module-level name).
    """
    print(len(seq1), len(seq2))

    if len(seq1) != len(seq2):
        cmn.write_file(seq1, 'tmpSeq1.fa')
        cmn.write_file(seq2, 'tmpSeq2.fa')
        blast_report = cmn.cmd2info('blastn -query tmpSeq1.fa -subject tmpSeq2.fa')
        # The report carries a line such as: Identities = 656/656 (100%)
        fraction = cmn.find_between(blast_report, 'Identities = ', ' (')
        matched, aligned = [int(part) for part in fraction.split('/')]
        cmn.write_file(blast_report, 'checkTmp%s.br' % (ID))
        return aligned - matched

    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left in gapChars or right in gapChars:
            continue
        if left != right:
            mismatches += 1
    return mismatches
예제 #6
0
def read_barcode_inWdir(sampleID):
    """Collect the barcode sequences available in the current directory.

    The ``<sampleID>_threaded`` entry is always present: it is characters
    20..677 of the second whitespace-separated field that ``grep thread``
    returns for ``rescued_read_assembled_mis1*.txt``.  The
    ``<sampleID>_denovo`` entry (second line of ``denovo_barcode.fa``) and
    the ``<sampleID>_prot`` entry (looked up by ``sampleID`` in
    ``../all_protBarcodes_complete.fa``) are added only when reading them
    succeeds.

    Returns:
        A tuple ``(barcodes, keys)``: the dict and a list of its keys.
    """
    grep_output = cmn.cmd2info('grep thread rescued_read_assembled_mis1*.txt')
    threaded_seq = grep_output.strip().split()[1]

    barcodes = {'%s_threaded' % sampleID: threaded_seq[20:678]}

    # NOTE(review): both handlers below are bare ``except`` and therefore
    # also swallow KeyboardInterrupt/SystemExit; kept as-is because the
    # failure modes of the helpers are not visible from this function.
    try:
        barcodes['%s_denovo' % sampleID] = cmn.file2lines('denovo_barcode.fa')[1]
    except:
        pass

    try:
        barcodes['%s_prot' % sampleID] = read_fa('../all_protBarcodes_complete.fa')[sampleID]
    except:
        pass

    return barcodes, list(barcodes)
예제 #7
0
    sys.path.append(python_lib)

import cmn
import os
from fullname_lib import get_names_4barcode
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Script entry point: for every sampleRun_* directory, check that the first
# two lines of its good_read_assembled.txt mention the sample's genus and
# print a 're-run' notice when they do not.
if __name__ == '__main__':

    #infoLines = cmn.cmd2lines('head -n 1 sampleRun*/rescued_read_assembled_mis1*.txt')
    # IDs are the second '_'-separated field of each sampleRun_* entry whose
    # name does not contain 'fake'.
    IDlist = cmn.cmd2lines("ls -d sampleRun_* |grep -v fake|cut -d '_' -f 2")

    nameDict = get_names_4barcode()

    for ID in IDlist:
        # Drop '?' characters, then split the name record on whitespace.
        # A missing ID raises KeyError here.
        items = nameDict[ID].replace('?', '').split()
        # The first three fields rebind the loop variable ID and provide
        # genus and sp; fewer than three fields raises ValueError.
        ID, genus, sp = items[:3]
        print('sampleInfo', ID, genus, sp)

        fn = 'sampleRun_%s/good_read_assembled.txt' % ID
        #label = '%s_%s' % (genus, sp)
        # grep the first two lines of the file for the genus.
        cmd = 'head %s -n 2| grep %s' % (fn, genus)
        print(cmd)
        info = cmn.cmd2info(cmd).strip()
        # Empty output means the genus was not found in those two lines.
        if info == '':
            print('please re-run', ID, genus, sp)
예제 #8
0
def get_current_jobs(label, user):
    """Return how many ``squeue`` lines match ``user`` and ``g<label>``.

    The count is the first whitespace-separated field printed by the
    ``squeue | grep | grep | wc -l`` pipeline, converted to ``int``.
    """
    pipeline = 'squeue| grep %s| grep g%s|wc -l' % (user, label)
    count_field = cmn.cmd2info(pipeline).split()[0]
    return int(count_field)
예제 #9
0
import time

def get_current_jobs(label, user):
    """Return how many ``squeue`` lines match ``user`` and ``g<label>``.

    The count is the first whitespace-separated field printed by the
    ``squeue | grep | grep | wc -l`` pipeline, converted to ``int``.
    """
    pipeline = 'squeue| grep %s| grep g%s|wc -l' % (user, label)
    count_field = cmn.cmd2info(pipeline).split()[0]
    return int(count_field)


# Throttled submission of the job files listed in forked_jobs.list.
fn = 'forked_jobs.list'

# Job entries read from the list file through cmn.getid.
jobs = cmn.getid(fn)

# Upper bound on concurrently counted jobs, from the first CLI argument.
cores = int(sys.argv[1])

# Current user name and its first character; the latter is passed as the
# label argument of get_current_jobs.
user = cmn.cmd2info('echo $USER').strip()
user_label = user[0]

currentN = get_current_jobs(user_label, user)

# Entries in the job list are used relative to job_files/ from here on.
os.chdir('job_files')

# Work on a copy so that `jobs` itself is left untouched.
todo = list(jobs)

# NOTE(review): nothing in the visible part of this loop removes fjob from
# todo or runs cmd; the snippet appears to end mid-loop — confirm against
# the full file.
while(len(todo) != 0):
    fjob = todo[0]
    # Re-count on every pass so the comparison uses a fresh number.
    currentN = get_current_jobs(user_label, user)
    print(currentN)
    if currentN < cores:
        #submit
        cmd = 'sbatch %s' % fjob
예제 #10
0
# Add python_lib to the module search path once, without duplicating it.
# NOTE(review): presumably this is what makes `import cmn` below resolvable;
# python_lib is defined outside the visible snippet.
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Script entry point: print line-count statistics for one vcf file.
#
# Output fields, in order: the file's last name (cmn.lastName), total line
# count, number of lines containing 'HaplotypeScore', number of those lines
# that also contain 'LowQual', HaplotypeScore/total, and LowQual/HaplotypeScore.
# A ratio whose denominator is zero is printed as 'NA'.
if __name__ == '__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Indexing sys.argv can only fail with IndexError; catching just that
        # keeps unrelated errors and interrupts visible.
        print("Usage: *.py vcf", file=sys.stderr)
        sys.exit()

    total = cmn.cmd2info('wc -l %s' % fn).split()[0]

    # The HaplotypeScore lines are kept in <fn>.tmp so the LowQual count
    # below can be taken from that subset.
    SNPs = cmn.cmd2info('grep HaplotypeScore %s > %s.tmp; wc -l %s.tmp' %
                        (fn, fn, fn)).split()[0]

    # Count the LowQual lines within the subset, then remove the temp file.
    lowqual = cmn.cmd2info('grep LowQual %s.tmp|wc -l ; rm %s.tmp' %
                           (fn, fn)).split()[0]

    # Guard both divisions: a file with no lines or no HaplotypeScore lines
    # used to abort with ZeroDivisionError before anything was printed.
    snp_fraction = int(SNPs) / float(total) if int(total) else 'NA'
    lowqual_fraction = int(lowqual) / float(SNPs) if int(SNPs) else 'NA'

    print(cmn.lastName(fn), total, SNPs, lowqual,
          snp_fraction,
          lowqual_fraction)
def check_fastqlines(fn):
    """Return the line count that ``wc -l`` reports for ``fn`` as an int."""
    wc_output = cmn.cmd2info('wc -l %s' % fn)
    first_field = wc_output.strip().split()[0]
    return int(first_field)
예제 #12
0
              file=sys.stderr)
        sys.exit()

    import cmn

    # `cmd` is split on whitespace: the first token is the program file, the
    # remaining tokens are handed to reformat() together with its text.
    argvs = cmd.split()

    info = cmn.txt_read(argvs[0])

    # The check is an exact substring match, so a guard written with spaces
    # around '==' or with double quotes is reported as missing.
    if "__name__=='__main__'" not in info:
        print("program doesn't contain the line: __name__=='__main__'",
              file=sys.stderr)
        print("exit! do nothing", file=sys.stderr)
        sys.exit()

    #reformat to make it workable for profiler
    info = reformat(info, argvs[1:])

    # Write the reformatted program to profile_<program> and run it with
    # `python`, capturing what it prints.
    dn = 'profile_%s' % argvs[0]
    cmn.write_file(info, dn)
    report = cmn.cmd2info('python %s' % dn)

    # Save the captured output as <program>_report.
    dn = '%s_report' % argvs[0]
    cmn.write_file(report, dn)
    print('results in %s' % dn)

    # Also produce <program>_report_sorted: the report sorted numerically on
    # its 4th field, in reverse order. Note that `cmd` is rebound here.
    dn2 = "%s_sorted" % dn
    cmd = 'cat %s| sort -r -nk4 > %s' % (dn, dn2)
    os.system(cmd)
    print('sorted result by the accumuated time is in %s' % dn2)
예제 #13
0
    readSizeStr = format_readSize(readSize)
print(readSizeStr)

#2. step2, get the reference and its length
# The vcf file's last name, minus '_snp_step2.vcf', is split on '_': the
# first piece is the species label and the rest is the reference label.
vcf_label = cmn.lastName(vcf_fn).replace('_snp_step2.vcf', '')
items = vcf_label.split('_')
sp = items[0]
reflabel = '_'.join(items[1:])
ref_length = get_ref_length(reflabel)
print(ref_length)

#3. get percentage of mapping
#../../step2_bwa_mapping
#TODO: if sam data available, recompute it
# Look for the text in <samdir>/mapped_reads_count/* that mentions both the
# species label and the reference label, and split it on whitespace.
cmd = 'cat %s/mapped_reads_count/*| grep %s| grep %s' % (samdir, sp, reflabel)
info = cmn.cmd2info(cmd)
items = info.strip().split()

if len(items) == 0:
    print('Error! can not find map percentage for %s %s' % (sp, reflabel))
    mapPercentage = 'NA'
else:
    if len(items) == 4:  #old format, ignore the mapN
        mapPercentage = 'oldstat'
        mapN, totalN = list(map(int, items[2:4]))
    else:
        # Otherwise the last four fields are read as floats.
        mapN, totalN, halfN, pPercent = list(map(float, items[-4:]))
    #mapPercentage = float(mapN) / totalN
    # NOTE(review): this assignment runs for both branches above, so the
    # 'oldstat' value set for the old format is always overwritten — confirm
    # whether that is intended.
    mapPercentage = 'ready'

print(mapPercentage)
예제 #14
0
import os

# Compare the last line of every vcf file under the working directory with
# the last line of its reference's scaffold-length file, and list the labels
# that disagree in bad_vcf.list.

# Working directory from the first CLI argument, made absolute.
wdir = os.path.abspath(sys.argv[1].rstrip('/'))

# Every .vcf file exactly one directory level below wdir.
fvcfs = cmn.cmd2lines('ls %s/*/*.vcf' % wdir)

refdir = '/work/biophysics/mtang/SNP_calling/indexed_references'

# Labels whose vcf tail does not match the reference info.
badones = []
for fvcf in fvcfs:
    # The parent directory name is the label; everything after its first
    # '_' is used as the reference label.
    label = fvcf.split('/')[-2]
    reflabel = '_'.join(label.split('_')[1:])
    finfo = '%s/%s_scafLength.txt' % (refdir, reflabel)
    if not os.path.exists(finfo):
        # The info file is missing: run the scaffold-length script on the
        # reference fasta.
        # NOTE(review): presumably that script writes finfo — confirm.
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/assembly_scaf_length.py %s/%s.fa ' % (
            refdir, reflabel)
        cmn.run(cmd)

    # First two fields of the last line of the info file.
    infoline = cmn.cmd2info('tail -n 1 %s' % finfo).strip()
    Cscaf, Cindex = infoline.split()[:2]

    # First two fields of the last line of the vcf file.
    checkline = cmn.cmd2info('tail -n 1 %s' % fvcf).strip()
    scaf, index = checkline.split()[:2]

    # Both fields are compared as strings.
    if scaf != Cscaf or Cindex != index:
        print('Error! problematic vcf file for %s' % label)
        badones.append(label)

dn = 'bad_vcf.list'
cmn.write_lines(badones, dn)
예제 #15
0
#sps = mapF_dict.keys()

#ref_genomes, refmapping = detect_ref_genomes(sps, bwa_dirs)

#3. make the length check
# NOTE(review): ref_dir is not used in the visible part of this script, and
# the only visible assignment of ref_genomes / refmapping is commented out
# above — confirm where they are defined.
ref_dir = '/work/biophysics/mtang/SNP_calling/indexed_references'

print('validating map files...')
refNdict = read_refN(ref_genomes)

# Map files are sorted into these two lists, one species at a time.
good_maps = []
bad_maps = []
for sp in refmapping:
    ref = refmapping[sp]
    fmaps = mapF_dict[sp]
    refN = refNdict[ref]
    # Total line count over all map files of this species (via wc -l).
    mapN = 0
    for fmap in fmaps:
        N = int(cmn.cmd2info('wc -l %s' % fmap).split()[0])
        mapN += N
    # The summed count must equal the reference's count; otherwise all of
    # the species' map files are treated as bad.
    if refN != mapN:
        print('Error! the line of map doesn\'t agree with reference for %s' %
              sp)
        print('Nref vs Nmap: %s %s; ref is %s\n' % (refN, mapN, ref))
        bad_maps += fmaps
    else:
        good_maps += fmaps

cmn.write_lines(good_maps, 'good_maps.txt')
cmn.write_lines(bad_maps, 'bad_maps.txt')
예제 #16
0
    try:
        requires[sp].append(set(refs))
    except KeyError:
        requires[sp] = [set(refs)]

# Check that each reference genome exists.
# Check whether a reference conflicts with the copy we already have.
refdir = '/work/biophysics/mtang/SNP_calling/indexed_references'
for ref in allrefs:
    # A missing reference is only reported; the loop still goes on to the
    # comparison below for it.
    if not os.path.exists(ref):
        print('reference %s doesn\'t exist! please email to ask!' % ref)

    # Path of a same-named file inside refdir.
    oldref = '%s/%s' % (refdir, cmn.lastName(ref))
    #print oldref, ref
    if os.path.exists(oldref):
        # Number of lines that `diff` prints for the two files; anything
        # other than 0 is reported as a difference.
        check = cmn.cmd2info('diff %s %s| wc -l ' % (oldref, ref))
        if int(check) != 0:
            print('new ref is different from old ref! please email to ask!')
            print('old ref: %s' % oldref)
            print('new ref: %s' % ref)

# Add-on: check the fastq files to see if anything has been done before.
fdone = '/project/biophysics/Nick_lab/mtang/archive/submission_done'
info_wdir = '/project/biophysics/Nick_lab/mtang/archive/step1_info'
fastq_dir = '/project/biophysics/Nick_lab/mtang/archive/fastq_libs'

info_dict = parse_info_file(info_wdir)
done_dict = parse_done_file(fdone)
# fastq_dir is only referenced by the commented-out call below.
#current_fastqs = parse_fastq_dir(fastq_dir)

print('***********************************************************')