Esempio n. 1
0
def separate_by_pair_old(label, fns):
    """Split fastq paths into one read pair plus a merged single-read file.

    Paths containing '_pair' or '_R' are taken as the paired reads; exactly
    two such paths are required, otherwise a diagnostic is printed and the
    process exits. Each paired file is linked into the current directory
    with ``ln -s``; every other path is appended to '<label>_single.fq'.

    Args:
        label: prefix used to name the merged single-read file.
        fns: iterable of fastq paths.

    Returns:
        (newPaired, singleFn): the two local names (as produced by
        ``cmn.lastName``), ordered by their sorted original paths, and the
        name of the merged single-read file.
    """
    mates = sorted(path for path in fns if '_pair' in path or '_R' in path)
    if len(mates) != 2:
        print('error: wrong number of pairs as %s' % str(mates))
        print('from: %s' % str(fns))
        print('need to change the label criterion')
        sys.exit()

    leftovers = set(fns) - set(mates)

    # link each mate locally, dropping any stale entry of the same name first
    linked = []
    for path in mates:
        local_name = cmn.lastName(path)
        if os.path.exists(local_name):
            cmn.run('unlink %s;' % local_name)
        cmn.run('ln -s %s' % path)
        linked.append(local_name)

    # rebuild the single-read file from scratch
    singleFn = '%s_single.fq' % label
    if cmn.filexist(singleFn):
        cmn.run('rm %s' % singleFn)
    for path in leftovers:
        cmn.run('cat %s >> %s' % (path, singleFn))

    return linked, singleFn
Esempio n. 2
0
def find_genus_info():
    """Return the genus restriction for this run, or None when there is none.

    The third command-line token (``sys.argv[2]``) wins when present.
    Otherwise the stripped contents of 'restricted_genus.info' are used if
    that file exists in the current directory.
    """
    try:
        return sys.argv[2]
    except IndexError:
        # no genus on the command line; fall back to the info file
        pass

    fn = 'restricted_genus.info'
    if cmn.filexist(fn):
        return cmn.txt_read(fn).strip()
    return None
Esempio n. 3
0
def find_reference(fn):
    """Return the reference label (path without '.fa') to use for *fn*.

    If 'assembly_selfref_v2.fa' sits in the same directory as *fn*, the
    label is that file's path minus the extension. Otherwise a warning is
    printed and the label points into the shared indexed_references
    directory, named after the parent directory of *fn* with its first
    '_'-separated token removed.
    """
    parent = fn.rpartition('/')[0]
    if cmn.filexist('%s/assembly_selfref_v2.fa' % parent):
        return '%s/assembly_selfref_v2' % parent

    print('WARNING: can not find assembly_selfref_v2.fa, use the orginal one')
    parent_name = fn.split('/')[-2]
    reflabel = '_'.join(parent_name.split('_')[1:])
    return '/work/biophysics/mtang/SNP_calling/indexed_references/%s' % reflabel
Esempio n. 4
0
def separate_by_pair(fastqs, wdir):
    """Group fastq files into read pairs by shared name prefix; merge the rest.

    Each fastq is keyed by its ``cmn.lastName`` with the final '.'-suffix
    removed. The keys are then trimmed from the right, one more character
    per round, and any trimmed prefix shared by exactly two remaining keys
    is recorded as a pair. Keys that never pair up are treated as single
    libraries and concatenated into '<wdir>/single.fq'.

    Args:
        fastqs: fastq entries; ``max`` below raises ValueError if empty.
        wdir: working directory; printed, and used for the single-read file.

    Returns:
        (pdict, singleFn): dict mapping shared prefix -> list of the
        original fastq entries, and the path of the merged single file.
        Exits the process when no pair is found at all.
    """
    print(wdir)
    pdict = {}
    mapdict = {}
    for fastq in fastqs:
        # key = file name without its last extension
        key = '.'.join(cmn.lastName(fastq).split('.')[:-1])
        mapdict[key] = fastq

    names = list(mapdict.keys())
    length = max([len(name) for name in names])
    for i in range(length):
        if len(names) == 0:
            break

        # drop i+1 trailing characters from every still-unpaired name
        checks = [name[:-1 - i] for name in names]

        count_dict = Counter(checks)
        for key in count_dict:
            if count_dict[key] == 2:  # got paired
                # NOTE(review): membership is re-derived with startswith on the
                # current `names`, so it can differ from the two names counted
                # above (e.g. an empty prefix matches every name) — confirm
                paired_names = [each for each in names if each.startswith(key)]
                fns = [mapdict[name] for name in paired_names]
                pdict[key] = fns

                #remove it from the list
                for name in paired_names:
                    names.remove(name)

    if len(pdict) == 0:
        print('Error! fastq lib name not recognized, contact Wenlin for help!')
        sys.exit()

    # whatever was never paired is a single library
    singleLibs = [mapdict[name] for name in names]
    print(singleLibs)

    if len(singleLibs) > 1:
        print(
            'Warnning: more than one lib detected as single lib. below is the single list:'
        )
        print('\n'.join(singleLibs))
        print('Email Wenlin for help')

    #print 'paired libs are:'
    #for key in pdict:
    #    print pdict[key]

    #print '\nsingle libs are:'
    #print ' '.join(singleLibs)

    # rebuild the merged single-read file from scratch
    singleFn = '%s/single.fq' % wdir
    if cmn.filexist(singleFn):
        cmn.run('rm %s' % singleFn)
    for fn in singleLibs:
        # NOTE(review): wdir is prepended here, so entries of `fastqs` are
        # presumably paths relative to wdir — verify against the caller
        cmn.run('cat %s/%s >> %s' % (wdir, fn, singleFn))

    return pdict, singleFn
Esempio n. 5
0
def read_refN(ref_genomes):
    """Map each reference name to the integer stored in its line-count file.

    The count is read from '<ref_dir>/<ref>_scaf_header.lines'. When that
    file is missing it is first produced by running ``wc -l`` on
    '<ref_dir>/<ref>_scaf.header'. ``ref_dir`` is a module-level name.

    Returns:
        dict of reference name -> int (first whitespace token of the file).
    """
    counts = {}
    for ref in ref_genomes:
        count_file = '%s/%s_scaf_header.lines' % (ref_dir, ref)
        if not cmn.filexist(count_file):
            # cache the wc output so later runs can skip the count
            header_file = '%s/%s_scaf.header' % (ref_dir, ref)
            cmn.run('wc -l %s > %s' % (header_file, count_file))
        first_token = cmn.txt_read(count_file).split()[0]
        counts[ref] = int(first_token)
    return counts
Esempio n. 6
0
def separate_by_pair_vold(fastqs, wdir):
    """Pair fastq files by a common trimmed name prefix (older variant).

    Names (``cmn.lastName`` minus the last '.'-suffix) are trimmed from the
    right one character at a time. At the first trim length where the most
    frequent prefix occurs exactly twice, every file is grouped under its
    prefix. Groups that do not hold exactly two files are moved to the
    single-read list and concatenated into '<wdir>/single.fq'.

    Returns:
        (pdict, singleFn): dict of prefix -> two fastq entries, and the path
        of the merged single-read file. Exits the process if no trim length
        produces a pair.
    """
    by_stem = {}
    for path in fastqs:
        stem = '.'.join(cmn.lastName(path).split('.')[:-1])
        by_stem[stem] = path

    stems = list(by_stem)
    shortest = min([len(stem) for stem in stems])

    grouped = {}
    for trim in range(1, shortest + 1):
        prefixes = [stem[:-trim] for stem in stems]
        if max(Counter(prefixes).values()) != 2:
            continue
        # first trim length at which the most common prefix is a pair
        for stem, prefix in zip(stems, prefixes):
            grouped.setdefault(prefix, []).append(by_stem[stem])
        break

    if not grouped:
        print('Error! fastq lib name not recognized, contact Wenlin for help!')
        sys.exit()

    # anything that is not exactly a pair becomes a single library
    singles = []
    for prefix in list(grouped):
        if len(grouped[prefix]) != 2:
            singles += grouped.pop(prefix)

    singleFn = '%s/single.fq' % wdir
    if cmn.filexist(singleFn):
        cmn.run('rm %s' % singleFn)
    for path in singles:
        cmn.run('cat %s >> %s' % (path, singleFn))

    return grouped, singleFn
def parse_inserted_gap(ID, seq, label):
    """Try to repair *seq* with BLAST when the sample has a bait insertion.

    Nothing is done unless 'sampleRun_<ID>/bait_insertion' exists. Otherwise
    the sequence (gaps turned into N, outer N stripped) is blasted against
    the barcode verification database and the first hit with one of two
    expected coordinate layouts is used to rebuild the aligned region with
    the query characters that face a non-gap subject character.

    Args:
        ID: sample identifier used in the 'sampleRun_<ID>' path and messages.
        seq: sequence to repair.
        label: tag used for the BLAST output name and the failure log.

    Returns:
        The repaired sequence, or *seq* unchanged. Unrepaired cases are
        appended to 'cannot_fixed_indel.txt'.
    """
    fn = 'sampleRun_%s/bait_insertion' % ID
    #if cmn.filexist(fn) or ('N' in seq.replace('-', 'N').strip('N')):
    if cmn.filexist(fn):
        #lines = cmn.file2lines(fn)
        #lines = sorted(lines, key=lambda x: int(x.split(',')[0][1:]))
        #Ngap = 0
        #for line in lines:
        #    items = line.strip().split()
        #    Ngap += len(items[-1])

        #check what is the right range of sequence
        print('runing blast to fix %s' % ID)
        checkSeq = seq.replace('-', 'N').strip('N')
        # NOTE(review): the query file name is fixed, so concurrent runs in
        # the same directory would share it — confirm this is run serially
        fquery = 'tmpInput.fa'
        fasta = '>input\n%s\n' % checkSeq
        cmn.write_file(fasta, fquery)
        dn = 'tmpBr_%s.txt' % label
        cmd = 'blastn -query %s -db /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa ' % fquery
        cmd += '-task blastn-short -dust no -outfmt \'6 qseqid sseqid qstart qend sstart send evalue pident qseq sseq\' -out %s' % dn
        cmn.run(cmd)
        isFixed = False
        for line in cmn.file2lines(dn):
            items = line.strip().split()
            #print items
            # columns follow the -outfmt string above
            qstart, qend, sstart, send = list(map(int, items[2:6]))
            # layout 1: subject covered 1..658, query starting at 21
            if sstart == 1 and send == 658 and qstart == 21:
                qseq, sseq = items[-2:]
                # keep query characters only where the subject has no gap
                new = [
                    char1 for char1, char2 in zip(qseq, sseq) if char2 != '-'
                ]
                if len(new) == 658:
                    seq = seq[:qstart - 1] + ''.join(new) + seq[qend:]
                    print('solution found for %s' % ID)
                    isFixed = True
                # only the first hit with this layout is considered
                break
            # layout 2: subject covered 2..655, query starting at 22
            if sstart == 2 and send == 655 and qstart == 22:
                qseq, sseq = items[-2:]
                new = [
                    char1 for char1, char2 in zip(qseq, sseq) if char2 != '-'
                ]
                if len(new) == 654:
                    seq = seq[:qstart - 1] + ''.join(new) + seq[qend:]
                    print('solution found for %s' % ID)
                    isFixed = True
                break
        if not isFixed:
            cmn.append_file('%s\t%s\n' % (ID, label), 'cannot_fixed_indel.txt')
    return seq
Esempio n. 8
0
def parse_bait(fn):
    """Read a bait table and return its sequences keyed by name.

    Every line of *fn* must hold exactly three whitespace-separated fields:
    species, name and sequence. When a 'bait_insertion' file exists in the
    current directory and yields indel information, each sequence is passed
    through ``add_indel`` before being stored.

    Args:
        fn: path of the bait table.

    Returns:
        (adict, alist): dict of name -> sequence, and the names in file
        order (a repeated name appears once per line it occurs on).

    Raises:
        ValueError: if a line does not split into exactly three fields.
    """
    if cmn.filexist('bait_insertion'):
        indel_dict = read_indel_info('bait_insertion')
    else:
        indel_dict = {}

    alist = []
    adict = {}
    for line in cmn.file2lines(fn):
        _sp, name, seq = line.strip().split()
        # The old test `len(indel_dict) != []` compared an int with a list
        # and was therefore always true; only apply indels when there are any.
        if indel_dict:
            seq = add_indel(seq, indel_dict)

        adict[name] = seq
        alist.append(name)
    return adict, alist
Esempio n. 9
0
def merge_sams(dn, fns):
    """Merge several SAM files into *dn* and return *dn*.

    Any existing *dn* is removed first. The first input is written through
    ``filter_and_write_lines`` with ``header=True``; the remaining inputs
    are written with the helper's default.

    Args:
        dn: output path.
        fns: non-empty sequence of input SAM paths.

    Raises:
        IndexError: if *fns* is empty.
    """
    print('merging files: %s into %s' % (str(fns), dn))

    if cmn.filexist(dn):
        cmn.run('rm ' + dn)

    # the context manager closes the output even if a helper call raises
    with open(dn, "a") as fp_dn:
        filter_and_write_lines(fp_dn, fns[0], header=True)
        for fn in fns[1:]:
            filter_and_write_lines(fp_dn, fn)

    return dn
def attempt_to_find_genus_by_abundence(ID, fqlist):
    """Run auto_rebait.py on *fqlist* in a scratch directory; return a genus.

    A directory 'tmp_<ID>' is created and entered, *fqlist* is written to
    the file 'fqlist', and the external auto_rebait script is run on it.
    The genus is the text before the first '_' (then the first whitespace
    token) of 'picked_bait.txt', or None when that file is not produced.
    Afterwards the function steps back up one directory, copies the
    scratch 'mapping_stat.info' to 'tmpStat/<ID>_mapping_stat.info' and
    removes the scratch directory.
    """
    scratch = 'tmp_%s' % ID
    cmn.mkdir(scratch)
    os.chdir(scratch)

    cmn.write_lines(fqlist, 'fqlist')
    cmn.run(
        '/work/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist'
    )

    picked = 'picked_bait.txt'
    genus = None
    if cmn.filexist(picked):
        first_name = cmn.txt_read(picked).strip()
        genus = first_name.split('_')[0].split()[0]

    # leave the scratch directory before archiving its stats and deleting it
    os.chdir('..')
    stat_copy = 'cp %s/mapping_stat.info tmpStat/%s_mapping_stat.info' % (
        scratch, ID)
    cmn.run(stat_copy)
    cmn.run('rm -r %s ' % scratch)
    return genus
Esempio n. 11
0
def read_rep():
    """Return a dict of scaffold -> set of positions covered by repeats.

    A cached result in 'rep.dict.pkl' is returned when present. Otherwise
    every 'annotation_repeats/*.gff3' file is parsed: column 1 is the
    scaffold and columns 4-5 are the integer start/end of a repeat. The
    result is written to the cache before being returned.
    """
    dn = 'rep.dict.pkl'
    if cmn.filexist(dn):
        print('loading repeats using precomputed data...')
        return cmn.pickle_read(dn)

    freps = cmn.cmd2lines('ls annotation_repeats/*.gff3')
    repdict = {}
    for frep in freps:
        # the original read the undefined name `fn` here instead of `frep`
        for line in cmn.file2lines(frep):
            items = line.strip().split()
            # skip blank lines and GFF3 directives/comments ('#', '##...')
            if not items or items[0].startswith('#'):
                continue

            scaf = items[0]
            start, end = map(int, items[3:5])
            # NOTE(review): GFF3 end coordinates are inclusive, so
            # range(start, end) leaves out the last base — confirm intent
            # grow the set in place instead of rebuilding it for every line
            repdict.setdefault(scaf, set()).update(range(start, end))
    cmn.pickle_write(repdict, dn)
    return repdict
Esempio n. 12
0
def merge_sams(label, fns):
    """Merge SAM files into '<label>.sam' and return that file name.

    The first input is copied verbatim. From every further input only
    alignment lines are appended: lines starting with '@' or '[' are
    dropped, as are lines whose third whitespace-separated field is '*'.
    Lines with fewer than three fields are skipped instead of raising.

    Args:
        label: output name without the '.sam' extension.
        fns: non-empty sequence of input SAM paths.

    Raises:
        IndexError: if *fns* is empty.
    """
    dn = '%s.sam' % label

    print('merging files: %s into %s' % (str(fns), dn))

    if cmn.filexist(dn):
        cmn.run('rm ' + dn)

    cmn.run('cp %s %s' % (fns[0], dn))

    # context managers close both handles even if a read or write fails
    with open(dn, "a") as fp_dn:
        for fn in fns[1:]:
            with open(fn) as fp:
                for line in fp:
                    if line.startswith(('@', '[')):
                        continue
                    fields = line.split()
                    if len(fields) < 3 or fields[2] == "*":
                        continue
                    fp_dn.write(line)

    return dn
def combine_data(fn, label):
    """Load the reads of *fn* and merge them with earlier data if present.

    As a side effect the module-level ``sp`` is set to the file name of
    *fn* (per ``cmn.lastName``) with its last '_'-separated token removed.
    Earlier data is looked up at '<old_dir>/<sp>_<label>.fastq'.

    Args:
        fn: path of the new fastq file.
        label: one of 'R1', 'R2' or 'singleton'; anything else prints an
            error and exits the process.

    Returns:
        The result of ``read_fastq(fn)``, combined via ``combine_fastq``
        with the earlier data when that file exists.
    """
    global ischeck_badID, sp, old_dir
    sp = cmn.lastName(fn).rpartition('_')[0]

    valid_labels = ['R1', 'R2', 'singleton']
    if label not in valid_labels:
        print('Error! your indicated label are not accepted')
        print('accepted values are %s' % (','.join(valid_labels)))
        sys.exit()

    fresh = read_fastq(fn)

    previous_path = '%s/%s_%s.fastq' % (old_dir, sp, label)
    if not cmn.filexist(previous_path):
        return fresh

    print('combine new data with old data for %s' % fn)
    previous = read_fastq(previous_path)
    return combine_fastq(previous, fresh)
Esempio n. 14
0
def count_sam_align(fns):
    """Print summed alignment statistics for a group of SAM files.

    Counts from ``aligned_reads`` are summed over every existing file in
    *fns*. One line is printed: the last two directory components of the
    last path in *fns*, the aligned, total and more-than-half-aligned read
    counts, and the fraction of mapped positions.

    Args:
        fns: iterable of SAM paths; missing files are skipped. An empty
            iterable prints nothing.

    Returns:
        None.
    """
    totalN = 0
    alignN = 0
    half_alignN = 0  # more than half aligned
    total_pN = 0  # mapped positions
    total_ptN = 0  # total positions
    fn = None
    for fn in fns:
        if not cmn.filexist(fn):
            continue

        # pN and ptN are the counts by positions
        alnN, halfN, tN, pN, ptN = aligned_reads(fn)
        totalN += tN
        alignN += alnN
        half_alignN += halfN
        total_pN += pN
        total_ptN += ptN

    if fn is None:
        # nothing to report; previously this path raised an exception
        return

    # no positions counted (e.g. every file missing): report 0 instead of
    # dividing by zero
    pPercent = float(total_pN) / total_ptN if total_ptN else 0.0
    items = fn.split('/')
    sp, ref = items[-3:-1]
    print(sp, ref, alignN, totalN, half_alignN, pPercent)
Esempio n. 15
0
            rdict[sp] = [(fastq, ref)]
        refs.add(ref)

    #2. prepare reference jobs
    refdir = '/work/biophysics/mtang/SNP_calling/indexed_references'
    cmn.mkdir(refdir)
    os.chdir(refdir)
    index_cmds = ['cd %s' % refdir]
    for ref in refs:
        if not os.path.exists(cmn.lastName(ref)):
            #cmn.run('ln -s %s' % ref)
            cmn.run('cp %s %s/' % (ref, refdir))
        ref = cmn.lastName(ref)
        reflabel = ref.replace('.fa', '')
        checkFn = reflabel + '.pac'
        if cmn.filexist(checkFn):
            print('found finished ref for %s, skip it' % ref)
            continue
        cmd = '/home2/wli/local/bwa-0.7.12/bwa index %s -p %s &' % (ref,
                                                                    reflabel)
        index_cmds.append(cmd)

    index_cmds.append('\nwait\n')
    os.chdir(cwd)

    print('#################################################')
    if len(index_cmds) != 2:
        dn = 'index.cmds'
        cmn.write_lines(index_cmds, dn)
        fjob = 'index.job'
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/decorate_job.py %s -p 256GB > %s' % (
Esempio n. 16
0
    
    count = 1
    
    dnNew = '%s_%s.fa' % (dnlabel, count)
    dnlist.append(dnNew)
    dp = open(dnNew, 'w')
    with open(dn) as fp:
        for i, line in enumerate(fp):
            line = line.strip()
            if i % 2 == 0:
                defline = line
                continue
            elif i % 2 == 1:
                fasta = '%s\n%s\n' % (defline, line)
                dp.write(fasta)
            
            if (i+1) % each_pack == 0:
                count += 1
                dp.close()
                dnNew = '%s_%s.fa' % (dnlabel, count)
                dnlist.append(dnNew)
                dp = open(dnNew, 'w')
    
    dp.close()
    
    for each in dnlist:
        if not cmn.filexist(each):
            cmn.run('rm %s' % each)

    cmn.run('rm %s' % dn)
# Collect every SAM file three levels below wdir; wdir is not defined in the
# visible part of this script.
fns = cmn.cmd2lines('ls %s/*/*/*.sam' % wdir)

# directory two levels above each SAM file
dirs = set(['/'.join(fn.split('/')[:-2]) for fn in fns])
cov_files = cmn.cmd2lines('ls mapped_reads_count/*_cov.count 2> /dev/null')

# names that already have a coverage count file
finished_dirs = set(
    [cmn.lastName(fn).replace('_cov.count', '') for fn in cov_files])

cwd = os.getcwd()

# stays True only if every directory already has its count file
isGood = True

# build one background command per unfinished directory
cmds = ['cd %s' % wdir]
for dir in dirs:
    sp = cmn.lastName(dir)
    if sp in finished_dirs and cmn.filexist(
            'mapped_reads_count/%s_cov.count' % sp):
        continue
    isGood = False
    cmd = 'python /work/biophysics/mtang/SNP_calling/scripts/tell_best_mapping.py %s &' % dir
    cmds.append(cmd)

# shell 'wait' so the generated script blocks until all background jobs end
cmds.append('\nwait\n')

outdir = '%s/mapped_reads_count' % wdir
cmn.mkdir(outdir)

for dir in dirs:
    sp = cmn.lastName(dir.rstrip('/'))
    dn = '%s/%s_cov.count' % (outdir, sp)
    if sp in finished_dirs and cmn.filexist(dn):
        continue
Esempio n. 18
0
    cmds = ['cd %s' % refdir]
    cmds.append('module add picard/1.117')
    cmds.append('module load java/oracle/jdk1.8.0_65')

    todoref_count = 0
    taken_refs = set([])
    for sublist in list(refdict.values()):
        for samdir, ref in sublist:
            if ref in taken_refs:
                continue
            else:
                taken_refs.add(ref)
            fcheck = '%s/%s.dict' % (refdir, ref)
            #print refdir, ref
            if cmn.filexist(fcheck):
                print('skip finished indexed %s' % ref)
                continue
            # this has been finished in bwa mapping
            #/home2/wli/local/bwa-0.7.12/bwa index -p assembly_selfref assembly_selfref.fa > index.log &
            todoref_count += 1
            cmds.append(
                'java -jar $PICARD/CreateSequenceDictionary.jar R=%s.fa O=%s.dict &'
                % (ref, ref))

            cmds.append(
                '/home2/wli/local/samtools-1.2/samtools faidx %s.fa &' % ref)

    cmds.append('\nwait;\n')

    isIndexed = False
def get_query_sequence(seqDict, genus, sp):
    """Pick one barcode sequence from *seqDict* to use as the query.

    Selection order:
      1. a name equal to '<genus>_<sp>' ignoring case, returned at once;
      2. names whose first '_'-token equals *genus* (ignoring case), or all
         names when none match (logged as 'noGenus');
      3. when several remain, those having *sp* as one of their '_'-tokens
         (otherwise 'noSpecies' is logged);
      4. if anything was logged to 'pickingLog.txt', the candidates are
         replaced by the lines of 'picked_bait.txt' written by the external
         auto_rebait script;
      5. names starting with '*' are preferred, then names containing '*';
      6. the candidate with the longest sequence wins.

    Args:
        seqDict: dict of sequence name -> sequence string.
        genus: genus keyword.
        sp: species keyword.

    Returns:
        (fasta, qlen): a two-line fasta record for the chosen name, and the
        sequence length not counting 'N' characters.
    """
    #1. anything in Eudamine file has higher priority
    #fEud = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/Eudaminae-barcode-reference.txt'
    #cmd = 'grep %s %s' % (sp, fEud)
    #lines = cmn.cmd2lines(cmd)
    #if len(lines) == 1:
    #    name = lines[0].split()[0]
    #    seq = seqDict[name]
    #    fasta = '>%s\n%s\n' % (name, seq)
    #    qlen = len(seq.replace('N', ''))
    #    print 'pick %s for %s %s' % (name, genus, sp)
    #    return fasta, qlen

    names = list(seqDict.keys())
    #try to look up the exact match first
    expected_name = '%s_%s' % (genus, sp)
    tmp = [name for name in names
        if name.upper() == expected_name.upper()]

    if len(tmp) != 0:
        name = tmp[0]
        print('found exact match %s' % name)
        seq = seqDict[name]
        fasta = '>%s\n%s\n' % (name, seq)
        qlen = len(seq.replace('N', ''))
        return fasta, qlen


    #look it up in other files
    good_names = [name for name in names
    #        if genus.upper() in name.upper().split('_')]
            if genus.upper() == name.upper().split('_')[0]]

    # remember whether the genus keyword matched anything at all
    useGenus = False
    if len(good_names) > 0:
        useGenus = True

    # start from a clean log; its existence later triggers the automatic pick
    cmn.run('rm pickingLog.txt 2> /dev/null')
    if len(good_names) == 0:#sp is just 'sp'
        print('can not find barcode for genus keyword "%s"' % genus)
        good_names = names
        cmn.write_file('noGenus\n', 'pickingLog.txt')

    if len(good_names) > 1:
        #try to refine it
        tmp = [name for name in good_names
                if sp.upper() in name.upper().split('_')]
        if len(tmp) != 0:
            good_names = tmp
        else:
            cmn.append_file('noSpecies\n', 'pickingLog.txt')

    #############################################
    ####new here, auto pick sequences for those has no info
    #############################################
    if cmn.filexist('pickingLog.txt'):
        print('automatically pick bait by fastq similarity')
        fsp = 'restricted_genus.info'
        # pass the genus to the script only when it matched and no
        # restricted_genus.info file is present
        if useGenus and (not cmn.filexist(fsp)):
            cmd = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist %s' % genus
        else:
            cmd = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist '
        cmn.run(cmd)
        # NOTE(review): these lines are used as seqDict keys below, and max()
        # fails on an empty list — presumably the script always writes at
        # least one known name; verify
        good_names = cmn.file2lines('picked_bait.txt')
        cmn.write_file('pickClosed\n', 'pickingLog.txt')


    #############################################
    #############################################
    #############################################

    #try to see if type species is there
    tmp = [name for name in good_names
            if name[0] == '*']
    if len(tmp) != 0:
        good_names = tmp
    else:
        tmp = [name for name in good_names
            if '*' in name]
        if len(tmp) != 0:
            good_names = tmp

    #then randomly pick one, get the max length ones
    # NOTE(review): replacing 'N' with '-' does not change the length, so this
    # key is just len(sequence); replace('N', '') may have been intended
    name = max(good_names, key=lambda x: len(seqDict[x].replace('N', '-')))
    #name = name.replace('/', '_')
    seq = seqDict[name]
    fasta = '>%s\n%s\n' % (name, seq)
    qlen = len(seq.replace('N', ''))
    print('pick %s for %s %s' % (name, genus, sp))
    return fasta, qlen
Esempio n. 20
0
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    #options=parse_options()
    # the first command-line argument is a file listing fasta paths
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    #check if all the files has contains
    falist = cmn.file2lines(fn)
    # a listed file is bad when it does not exist, unless its path is under
    # '/archive/butterfly/'
    bad_falist = [
        fa for fa in falist
        if not cmn.filexist(fa) and '/archive/butterfly/' not in fa
    ]

    if len(bad_falist) != 0:
        print('Error!')
        print('the following files are errorous:')
        print('\n'.join(bad_falist))
        sys.exit()

    transferDir = 'archiveTransfer'
    cmn.mkdir(transferDir)

    # split the list by location: paths under /archive/butterfly vs the rest
    alea_list = [fa for fa in falist if '/archive/butterfly' in fa]

    biohpc_list = set(falist) - set(alea_list)
    thread_lines = {}
    clean_lines = {}
    leftN = 20
    barcodeLength = 658
    # NOTE(review): `wdirs` and `stack_lines` are not defined anywhere in the
    # visible part of this block — confirm they are set up elsewhere
    for wdir in wdirs:
        ID = wdir.split('sampleRun_')[-1]
        print('working on %s' % ID)
        # take the first matching assembled file; skip the sample if the
        # listing is empty
        try:
            fn = cmn.cmd2lines(
                'ls sampleRun_%s/rescued_read_assembled_mis1*.txt' % ID)[0]
        except:
            print('can not find assembled files for %s' % ID)
            continue
        #print fn
        findel = 'sampleRun_%s/bait_insertion' % ID
        if cmn.filexist(findel):
            print('prasing indel for %s' % ID)
            indel_positions = find_indel_from_reads(findel, fn)
            print('indel_positions', indel_positions)
        else:
            indel_positions = []

        threadSeq, stackSeq, cleanSeq = read_lineup_seq(fn, indel_positions)

        # store the three sequences per sample ID
        thread_lines[ID] = threadSeq
        stack_lines[ID] = stackSeq
        clean_lines[ID] = cleanSeq

    #1. if thread and stack show inconsistent, show an X
    #2. if thread has gap, show as lower case
    #3. if both are gap, show an N
def add_in_baits(fref):
    """Append missing baits to a reference fasta and index it with bwa.

    The reference text is read from *fref*. Every line of
    'sampleInfo.baits' (species, defline, sequence) whose defline is not a
    case-insensitive substring of any reference ID is added. A sequence of
    length 698 is added as is; a sequence of length 658 is first passed
    through ``add_primer``; any other length prints an error and exits.
    If 'bait_insertion' yields indel information, it is applied to the
    reference text first.

    Args:
        fref: path of the reference fasta.

    Returns:
        'species_barcodes_4mapping_withAddon.fa', the name of the combined
        fasta written to the current directory and indexed with bwa.
    """
    fbait = 'sampleInfo.baits'
    ref_info = cmn.txt_read(fref)
    if cmn.filexist('bait_insertion'):
        indel_dict = read_indel_info('bait_insertion')
    else:
        indel_dict = {}

    #baits added by customBaits
    #fadd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/added_from_customBaits.baitInfo'
    #if cmn.filexist(fadd):
    #    add_lines = cmn.file2lines(fadd)
    #else:
    #    add_lines = []
    add_lines = []

    if len(indel_dict) != 0:
        ref_info = insert_in_ref_info(ref_info, indel_dict)
        add_lines = insert_in_lines(add_lines, indel_dict)

    # IDs already present: the fasta deflines of the reference without '>'
    refIDs = [
        line[1:] for line in ref_info.split('\n')
        if line.strip() != '' and line[0] == '>'
    ]
    addedIDs = [line.split()[1] for line in add_lines]

    #new = []
    #when check a new line, need to check both the fref and the fadd
    #if the one is not in fadd, add it to fadd
    for line in cmn.file2lines(fbait):
        sp, defline, seq = line.strip().split()
        if all([defline.upper() not in refID.upper() for refID in refIDs]):
            #not in ref
            if all([
                    defline.upper() not in addedID.upper()
                    for addedID in addedIDs
            ]):
                # 698 is taken as is; 658 goes through add_primer first
                if len(seq) == 698:
                    add_lines.append(line)
                else:
                    if len(seq) != 658:
                        print(
                            'Error! length of bait barcode is wrong for %s %s'
                            % (sp, defline))
                        sys.exit()
                    else:
                        seq = add_primer(seq)
                        add_lines.append('%s\t%s\t%s' % (sp, defline, seq))

    #now get a new fadd, need to format it into fasta
    add_fasta = []
    for line in add_lines:
        sp, defline, seq = line.strip().split()
        fasta = '>%s\n%s\n' % (defline, seq)
        add_fasta.append(fasta)

    ref_info += '\n'
    ref_info += ''.join(add_fasta)
    dn = 'species_barcodes_4mapping_withAddon.fa'
    cmn.write_file(ref_info, dn)

    #index it
    cmd = 'module add bwa; bwa index %s' % dn
    cmn.run(cmd)

    #record the new fadd
    #cmn.write_lines(add_lines, fadd)

    return dn
Esempio n. 23
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    # all *.map files in the current directory except names containing
    # 'all', 'test' or 'concat'
    fns = cmn.cmd2lines('ls *.map| grep -v all| grep -v test| grep -v concat')
    #fns = fns[:1]

    f_label = '15101E05_snp.vcf'


    #make the scaffold index
    #cmd = "grep -v '^#' 15101E05_snp.vcf| cut -f 1,2 > index_header"
    # build the index only when 'index_header' is not already present
    if not cmn.filexist('index_header'):
    #    cmn.run(cmd)
        make_index_header('15101E05_snp.vcf', 'assembly_v2_length.txt')

    header = ['scaffold', 'index']

    # two columns per map file, named after the text before its first '_'
    for fn in fns:
        label = fn.split('_')[0]
        header.append(label+'_f')
        header.append(label+'_m')

    # record the file order that the header columns follow
    cmn.write_lines(fns, 'map_name_order')

    cmn.write_file('\t'.join(header)+'\n', 'table_header')

    cmd = 'cp table_header all_concat.map;'
Esempio n. 24
0
    if len(sys.argv) > 2:
        if sys.argv[2] == 'ignore':
            ignore_check = True

    outlabel = cmn.lastName(fn).replace('.fa', '')
    wdir = '%s_tmp' % cmn.lastName(fn)
    cmn.mkdir(wdir)
    os.chdir(wdir)

    fphy = cmn.lastName(fn) + '.phylip'
    fname = cmn.lastName(fn) + '.phylipNames.dict.pkl'

    fchecks = ['outfile', 'dist.Tree']
    isbad = False
    for fcheck in fchecks:
        if not ignore_check and cmn.filexist(fcheck):
            print('Erorr: file %s exists! running pipeline would overwrite the files, please either delete it or move it to another place' % fcheck)
            isbad = True
    if isbad:
        sys.exit()

    cmd = 'source /home2/wli/.bash_profile;/project/biophysics/Nick_lab/wli/sequencing/scripts/fasta2phylip4dnadist.py %s' % fn
    cmn.run(cmd)

    dnadistInfo = '%s\nY\n' % fphy
    cmn.write_file(dnadistInfo, 'input.dnadist')

    cmd = 'rm outfile 2> /dev/null;/home2/wli/local/phylip-3.696/exe/dnadist < input.dnadist > dnadist.log'
    #print cmd
    cmn.run(cmd)
Esempio n. 25
0
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py good_reads_assembled [allowed_mismatch=0]",
              file=sys.stderr)
        sys.exit()

    try:
        misN = int(sys.argv[2])
    except:
        misN = 1

    hasDelLabel = False
    #NOTE: new feature: rejecting reads that matched mostly to the extended ends
    #NOTE: change it such that bad reads are not rescued due to gaps
    if cmn.filexist('hasDeletion'):
        indel_list = set(cmn.file2lines('hasDeletion'))
    else:
        indel_list = set([])

    #add primer if not added
    bait_dict, ordered_baits, stack_seq, thread_seq, good_reads, junk_reads, sampleLabel = read_assembled_file(
        fn)
    print('junk1', len(junk_reads))

    #use stack to rescue
    all_rescued = {}
    while (True):
        #this means more reads are rescued
        #continue doing rescue
Esempio n. 26
0
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__ == '__main__':
    # Print the mapping statistics of one working directory, one line per
    # entry of '<wdir>/mapping_stat.txt', each prefixed with the directory
    # name. If the statistics file is missing, the counting script is run
    # for that directory and this script exits without printing anything.
    try:
        wdir = sys.argv[1]
    except IndexError:
        # only a missing argument is a usage error; a bare except would
        # also have swallowed KeyboardInterrupt and SystemExit
        print("Usage: *.py 3935", file=sys.stderr)
        sys.exit(1)

    fn = '%s/mapping_stat.txt' % wdir
    if not cmn.filexist(fn):
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/count_sam_aligns_byDir.py %s' % wdir
        cmn.run(cmd)
        sys.exit()

    sp = cmn.lastName(wdir)
    for line in cmn.file2lines(fn):
        print('%s\t%s' % (sp, line))



Esempio n. 27
0
# With a single mapping directory there is nothing to compare: record it as
# the best mapping and stop. `subdirs` and `wdir` are defined outside the
# visible part of this script.
if len(subdirs) == 1:
    print('only one directory, no need to tell who is best')
    cmn.write_file(subdirs[0], '%s/best_mapping.txt' % wdir)
    sys.exit()

else:
    # per-subdirectory totals summed over all of its SAM files
    sizeDict = {}
    for each in subdirs:
        fsams = cmn.cmd2lines('ls %s/%s/*.sam' % (wdir, each))
        total = 0
        mapped = 0
        halfmap = 0
        TpN = 0
        TptN = 0
        for fsam in fsams:
            if not cmn.filexist(fsam):
                continue
            mappedN, halfN, totalN, pN, ptN = aligned_reads(fsam)
            total += totalN
            mapped += mappedN
            halfmap += halfN
            TpN += pN
            TptN += ptN

        # NOTE(review): TptN stays 0 when a subdirectory has no readable SAM
        # file, which makes this division raise ZeroDivisionError
        sizeDict[each] = (mapped, total, halfmap, float(TpN) / TptN)

    dn = '%s/mapping_stat.txt' % wdir
    # one tab-separated line per subdirectory: name followed by its totals
    info = [
        '%s\t%s\n' % (name, '\t'.join(map(str, sizeDict[name])))
        for name in sizeDict
    ]
        Id = cmn.lastName(line).split('_')[0]
        Id = Id.replace('NVG-', '').replace('11-BOA-','').replace('LEP-', 'LEP')
        IDlist.add(Id)
        fq = os.path.abspath(line)
        try:
            fq_groups[Id].append(fq)
        except KeyError:
            fq_groups[Id] = [fq]

    nameDict = get_names_4barcode()

    fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'
    #fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes.fasta'
    seqDict = read_fa(fall)
    fadd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/addedBaits_fromPipeline.fa'
    if cmn.filexist(fadd):
        seqDict.update(read_fa(fadd))
    ftable = '/archive/biophysics/Nick_lab/wli/archive/barcodes/auto_tables/verified_barcodes.fa'
    seqDict.update(read_autoTable(ftable))
    all_genus = set([name.split('_')[0].lower() for name in seqDict])

    info = []
    missing = []
    notFound = []

    for sp in IDlist:
        try:
            fullname = nameDict[sp]
            genus = fullname.split()[1]
            if genus.lower() not in all_genus:
                notFound.append(sp)