Code Example #1
def attempt_to_find_genus_by_abundence(ID, fqlist):
    #run the bait picker on this sample's fastq files and return the genus
    #of the picked bait, or None if no bait was picked
    tmpdir = 'tmp_%s' % ID
    cmn.mkdir(tmpdir)
    os.chdir(tmpdir)

    cmn.write_lines(fqlist, 'fqlist')
    cmd = '/work/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist'
    cmn.run(cmd)

    dn = 'picked_bait.txt'
    if cmn.filexist(dn):
        genus = cmn.txt_read(dn).strip().split('_')[0].split()[0]
    else:
        genus = None
    os.chdir('..')
    cmn.run('cp %s/mapping_stat.info tmpStat/%s_mapping_stat.info' % (tmpdir, ID))
    cmn.run('rm -r %s ' % tmpdir)
    return genus
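Every snippet on this page calls a project-local utility module `cmn` that the excerpts never import or define. Below is a minimal sketch of those helpers, reconstructed purely from how the call sites use them; the names match the snippets, but the bodies are assumptions, not the project's actual implementation.

# cmn_sketch.py -- stand-in for the project's `cmn` helpers; bodies are guesses
import os
import subprocess

def mkdir(path):
    # create a directory, ignoring the error if it already exists
    os.makedirs(path, exist_ok=True)

def run(cmd):
    # run a shell command line (cp, rm, bwa, bash ... as the snippets do)
    subprocess.call(cmd, shell=True)

def cmd2lines(cmd):
    # run a shell command and return its stdout as a list of lines
    out = subprocess.check_output(cmd, shell=True, universal_newlines=True)
    return out.splitlines()

def write_lines(lines, fn):
    # write an iterable of items to fn, one per line
    with open(fn, 'w') as fp:
        fp.write('\n'.join(str(line) for line in lines))

def write_file(text, fn):
    with open(fn, 'w') as fp:
        fp.write(text)

def txt_read(fn):
    with open(fn) as fp:
        return fp.read()

def file2lines(fn):
    return txt_read(fn).splitlines()

def getid(fn):
    # the snippets use this to read ID/file lists; assume non-empty lines
    return [line.strip() for line in file2lines(fn) if line.strip()]

def filexist(fn):
    return os.path.exists(fn)

def filesize(fn):
    return os.path.getsize(fn)

def lastName(path):
    # basename of a path: '/a/b/c.fa' -> 'c.fa'
    return os.path.basename(path)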
Code Example #2
File: auto_extend.py Project: ATPs/xiaolongTools
def detect_dominated_reads(seqs, seed_i):
    global fraction
    global minN

    sequence_index = seed_i
    form_dict = {}  #record the supporting fraction for each dominant prefix
    forms = []  #used for output
    while True:
        subseqs = [
            seq[:sequence_index] for seq in seqs if len(seq) >= sequence_index
        ]
        if len(subseqs) == 0:
            break
        total = float(len(subseqs))

        adict = Counter(subseqs)
        #print 'seq', seqs
        #print 'sequence_index', sequence_index
        #print 'subseq', subseqs
        #print adict

        maxCount = 0
        maxSeq = ''
        for seq in adict:
            count = adict[seq]
            if count > maxCount:
                maxCount = count
                maxSeq = seq

        forms.append('%.3f\t%s\t%s' % (maxCount / total, len(subseqs), maxSeq))
        form_dict[maxSeq] = [
            maxCount / total, len(subseqs)
        ]  #fraction of reads and number of remaining reads

        if maxCount < (fraction * total):
            break
        if len(subseqs) < minN:
            break

        sequence_index += 1

    dn = 'form_stat.txt'
    cmn.write_lines(forms, dn)
    return form_dict
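A hypothetical driver for detect_dominated_reads, assuming the `cmn` sketch after Code Example #1 is importable (the function writes form_stat.txt through it); `fraction`, `minN`, and the reads below are made-up values, not from the project.

from collections import Counter  # used inside detect_dominated_reads

fraction = 0.8  # dominant prefix must cover at least 80% of the remaining reads
minN = 3        # stop extending once fewer than 3 reads reach the position

reads = ['ACGTACGTAA', 'ACGTACGTAT', 'ACGTACGTAA', 'ACGTACG']
stats = detect_dominated_reads(reads, seed_i=7)  # seed read was 7 bp long
for prefix, (frac, n) in stats.items():
    print('prefix %s: supported by %.2f of %d reads' % (prefix, frac, n))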
Code Example #3
File: auto_extend.py Project: ATPs/xiaolongTools
def grep_reads(read, f_libs, direction):
    #reverse the read
    #reverse = ''.join([rdict[i] for i in read[::-1]])

    cmds = []
    for fn in f_libs:
        cmd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/grep_reads.py %s %s %s &' % (
            read, fn, direction)
        cmds.append(cmd)

    cmds.append('\nwait;\n')

    f_job = 'grep_read.job'
    cmn.write_lines(cmds, f_job)

    cmn.run('bash %s ' % f_job)
    #the output dir is grep_out
    dn = 'all_grep_reads.txt'
    cmn.run('cat grep_out/* > %s' % dn)
    fished_reads = cmn.getid(dn)

    return fished_reads
Code Example #4
            qseq, sseq = items[-2:]
            if ('-' in qseq) or ('-' in sseq):
                print('detected gap for %s, skip' % sid, file=sys.stderr)
                continue
            
            for i in range(qhsp_length):
                qI = i + qstart
                sI = i
                codon = hsp[sI]
                try:
                    stack_dict[exon][sp][qI].append(codon)
                except KeyError:
                    stack_dict[exon][sp][qI] = [codon]

        dn = '%s_%s_endExtend.fa' % (sp, exon)
        cmn.write_lines(end_seqs, dn)
        
        print('endseq', consensus_seqs(end_seqs))
        maxLength = max([len(seq) for seq in start_seqs])
        pattern = '{:>%s}' % maxLength
        start_seqs = [pattern.format(seq) for seq in start_seqs]
        dn = '%s_%s_startExtend.fa' % (sp, exon)
        cmn.write_lines(start_seqs, dn)
        print('startseq', consensus_seqs(start_seqs))


            
#tell and output
dn = 'phased_assemblies.contigs'
new = []
cov_dict = {}
Code Example #5
    alea_files = []
    biohpc_files = []
    for each in falist:
        if '/archive/butterfly/' in each or ('jshen/h' in each):
            alea_files.append(each)
        else:
            biohpc_files.append(each)

    new_files = transfer_alea_files(alea_files)
    falist = biohpc_files + new_files

    #if len(missing) != 0:
        #try to look for refgenomes
    #    fns = cmn.cmd2lines('ls /work/biophysics/mtang/SNP_calling/indexed_references/mitogenomes/*.fa')
    #    addback = [fn for fn in fns if cmn.lastName(fn).split('_')[0] in missing]
    #    missing = set(missing) - set([cmn.lastName(fn).split('_')[0] for fn in addback])
    #    falist += addback

    if len(missing) != 0:
        print('ATTENTION! the following IDs are missing sequences!')
        print('\n'.join(missing))
        cmn.write_lines(missing, 'missingMITOs')

    falist.append('')
    cmn.write_lines(falist, 'falist.mito')


    #if len(alea_files) != 0:
    #    print 'the following files need to transfer from /archive server'
    #    print '\n'.join(alea_files)

Code Example #6
    header = lines[0].split()[2:]
    #cmn.write_lines(header, 'header_names')

    #new = [' '.join(['sp%s' % i for i in xrange(len(header))])]
    new = [' '.join(header)]
    for line in lines[1:]:
        firstChar = ''
        items = line.split()[2:]
        newline = []
        for item in items:
            chars = item.split()
            if firstChar == '':
                firstChar = chars[0]

            if len(chars) == 1:
                if chars[0] == '-':
                    newline.append('0,0')
                elif firstChar == chars[0]:
                    newline.append('1,0')
                else:
                    newline.append('0,1')

            else:  #have both char
                newline.append('1,1')
        new.append(' '.join(newline))

    dn = fn + '.tmix'
    cmn.write_lines(new, dn)

    cmn.run('gzip %s' % dn)
Code Example #7
    label = cmn.lastName(fn)
    if label in finished_maps:
        continue

    isGood = False
    if 'MITO' in label:
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta_mito.py %s' % fn
    else:
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta.py %s' % fn
    cmds.append(cmd)

if isGood:
    print('Good news! everything looks good!')

else:
    cmds.append('')
    dn = 'm2fadd.cmds'
    cmn.write_lines(cmds, dn)

    print('Error!!!!!')
    print('There are still %s fasta missing' % (len(cmds) - 1))
    print('please use following command to submit jobs')
    print(
        '\n>>> /work/biophysics/mtang/SNP_calling/scripts/submit_jobs.py %s [#node] m2fAdd -p 256GB\n'
        % dn)
    print('-p specifies the partition it submitted to')
    print(
        '[#node] is the number of nodes and should be adjusted according to number of lines in %s'
        % dn)
    print('\n[IMPORTANT] Please run this check again upon job completion.')
Code Example #8
            #/home2/wli/local/bwa-0.7.12/bwa index -p assembly_selfref assembly_selfref.fa > index.log &
            todoref_count += 1
            cmds.append(
                'java -jar $PICARD/CreateSequenceDictionary.jar R=%s.fa O=%s.dict &'
                % (ref, ref))

            cmds.append(
                '/home2/wli/local/samtools-1.2/samtools faidx %s.fa &' % ref)

    cmds.append('\nwait;\n')

    isIndexed = False
    print('###############################################')
    if todoref_count != 0:
        fcmd = 'gatkIndex.cmd'
        cmn.write_lines(cmds, fcmd)
        fjob = 'gatkIndex.job'
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/decorate_job.py %s -p 256GB > %s' % (
            fcmd, fjob)
        cmn.run(cmd)
        print('please submit %s to the queue for indexing ' % fjob)
    else:
        print('good news! all references have been indexed')
        isIndexed = True
    print('###############################################')

    if not isIndexed:
        print('**********************************************')
        print('\nimportant!!!')
        print('please re-run this script after all references are indexed!\n')
        print('**********************************************')
Code Example #9
            good_reads.append(name1)
        elif (misM2 + 1) >= misM1:
            #bad one has more mismatch, good one is good!
            #good one can be 1 bp more than the bad one
            good_reads.append(name1)
        else:
            if identity >= identity_cut:
                good_reads.append(name1)
            else:
                bad_reads.append((name2, aln1))

    print('further classify overlapping reads into:')
    print('%s good reads' % len(good_reads))
    print('%s bad reads' % len(bad_reads))
    #sp2 = name2sp(name2)

    #add back the previous IDs
    good_reads.append('#' * 100)
    for ID in good_IDs:
        name = ID1mapping[ID]
        good_reads.append(name)

    cmn.write_lines(good_reads, 'good_reads.txt')

    #bad_reads.append('#' * 100)
    for ID in bad_IDs:
        name = ID2mapping[ID]
        bad_reads.append((name, seqDict2[name]))
    bad_alignments = ['%s    %s\n' % (each[0], each[1]) for each in bad_reads]
    cmn.write_file(''.join(bad_alignments), 'bad_reads_alignment.txt')
Code Example #10
        try:
            clean_seq = clean_lines[ID]
            thread_seq = thread_lines[ID]
            length = min(len(clean_seq), len(thread_seq))
            N = sum([
                clean_seq[i + 20] != thread_seq[i + 20]
                for i in range(length - 20)
                if clean_seq[i + 20] not in gapChars and (
                    thread_seq[i + 20] not in gapChars)
            ])

            Ngap = sum([char in gapChars for char in clean_seq[20:678]])
            if Ngap == 0 and N == 0:
                difflabel += ',goodCC'
            else:
                difflabel += ',CC_%s' % N
                if Ngap != 0:
                    difflabel += '[g%s]' % Ngap
        except KeyError:
            difflabel += ',noCC'

        line.append(difflabel)
        try:
            line.append(conta_dict[ID])
        except KeyError:
            line.append('NA')
        line.append(recordLabel)
        new.append('\t'.join(line))
    new.append('')
    cmn.write_lines(new, 'compare.check')
Code Example #11
        line = '%s%s' % (Pname, good_reads[name])
        final.append(line)

    ### report those filtered by bwa mapping to other species
    final.append('#' * 700)
    names = sorted(list(bad_dict.keys()),
                   key=lambda x: spBased_badnames(x, bad_dict[x]))
    #collapsed_names = collapse_same_reads(names, bad_dict, True)
    for name in names:
        #try:
        #    Pname = collapsed_names[name]
        #except KeyError:
        #    continue
        Pname = format_name(strformat, name, indel_list)
        line = '%s%s' % (Pname, bad_dict[name])
        final.append(line)

    if hasDelLabel:
        dn = 'rescued_read_assembled_mis%s_withDeletion.txt' % misN
    else:
        dn = 'rescued_read_assembled_mis%s.txt' % misN

    cmn.write_lines(final, dn)

    Ngood = len(good_reads)
    Njunk = len(junk_reads)
    #print junk_reads
    statInfo = 'junk:good = %s:%s(%s)\n' % (Njunk, Ngood, float(Njunk) /
                                            (Njunk + Ngood))
    cmn.write_file(statInfo, 'rescued_ratio_mis%s.txt' % misN)
Code Example #12
File: auto_extend.py Project: ATPs/xiaolongTools
    read = seed_read
    Iter = 0
    extensions = []
    while Iter < upper_iter:
        Iter += 1
        print('running iteration %s' % Iter)

        #prepare wdir
        wdir = 'extend_iter%s' % Iter
        cmn.mkdir(wdir)
        os.chdir(wdir)
        cmn.write_file(read, 'seed_seq.txt')

        #grep the reads
        fished_reads = grep_reads(read, f_libs, direction)
        cmn.write_lines(fished_reads, 'fished_reads.txt')

        #make stat of the reads
        stat_dict = detect_dominated_reads(fished_reads, len(read))

        #get the longest set under cutoff
        extended_seq = get_extended(stat_dict)
        cmn.write_file(extended_seq, 'extension_seq.txt')
        os.chdir('..')

        if extended_seq == '':
            print('no extension can be found in iteration %s! exit!' % Iter)
            break
        read = extended_seq[-overlapN:]
        extensions.append(extended_seq)
Code Example #13
                        lastPhase = curPhase
                left = move_pointer(left, index + 1, count + 1)
            #print 'left', left

    scafLength = len_dict[scaf]
    if scafLength + 1 != expectIndex:
        gap = scafLength + 1 - expectIndex
        fillN = 'N' * gap
        seq1.append(fillN)
        seq2.append(fillN)

    #output the last
    phased_blocks.append(ouput_phased_blocks(lastPosition, right, 'lastOne'))

    dnlabel = cmn.lastName(fn).replace('.vcf', '')
    sp = dnlabel.split('_')[1]
    dn = dnlabel + '_phased.fa'
    with open(dn, 'w') as dp:
        dp.write('>%s_ref_or_phase1\n' % sp)
        dp.write(''.join(seq1))
        dp.write('\n')
        dp.write('>%s_called_or_phase2\n' % sp)
        dp.write(''.join(seq2))
        dp.write('\n')

    dn = dnlabel + '_phased.blocks'
    cmn.write_file(''.join(phased_blocks), dn)

    dn = dnlabel + '_phased.letters'
    cmn.write_lines(phased_letter, dn)
Code Example #14
    ### report those filtered by bwa mapping to other species
    final.append('#' * 700)
    names = sorted(list(bad_dict.keys()), key=lambda x: spBased_badnames(x, bad_dict[x]))
    collapsed_names = collapse_same_reads(names, bad_dict, True)
    for name in names:
        try:
            Pname = collapsed_names[name]
        except KeyError:
            continue
        Pname = format_name(strformat, Pname)
        line = '%s%s' % (Pname, bad_dict[name])
        final.append(line)

    dn = 'good_read_assembled.txt'
    cmn.write_lines(final, dn)


    #report those reads inconsistent with the consensus
    for name in bad_cc_names:
        #Pname = parse_br_name(name_dict, name)
        Pname = name
        Pname = format_name(strformat, Pname)
        line = '%s%s' % (Pname, seqDict[name])
        info.append(line)

    info.append('#' * 200)

    #report those reads mapped to other species
    names = sorted(bad_reads, key=lambda x: parse_bad_names(x, seqDict[x]))
    for name in names:
Code Example #15
File: find_fastqlist.py Project: ATPs/xiaolongTools
    checklist = list(IDs)
    for line in falist:
        sp = cmn.lastName(line).split('_')[0]
        if sp in IDs:
            tmp.append(line)
            try:
                checklist.remove(sp)
            except ValueError:
                pass

    #print falist
    falist = tmp
    #print falist
    missing = set(checklist) | set(missing)

    alea_files = [
        each for each in falist
        if '/archive/butterfly/' in each or ('jshen/h' in each)
    ]

    if len(alea_files) != 0:
        print('the following files need to be transferred from the /archive server')
        print('\n'.join(alea_files))

    cmn.write_lines(falist, 'attempt.fastqlist')

    if len(missing) != 0:
        print('ATTENTION! the following IDs are missing sequences!')
        print('\n'.join(missing))
        cmn.write_lines(missing, 'missingIDs')
Code Example #16
    #add primer if not added
    ref_seqs, toAddDict = read_baits(fref)
    #log the baits into the dataset
    log_newBaits_ifPossible(ref_seqs)

    #index ref here
    frefs = parse_ref(ref_seqs)

    fqlist = cmn.getid(fqlist)
    fq_groups = group_fq(fqlist)

    N = cmn.cpu_check()

    bwa_cmds = ['module add bwa']
    for reflabel in frefs:
        fref = frefs[reflabel]
        fnlabel = cmn.lastName(fref).replace('.fa', '')
        for sp in fq_groups:
            R1, R2, single = fq_groups[sp]
            cmd = 'bwa mem -t %s -B 2 -M %s %s %s | grep "%s" > %s_paired_%s_mapped.sam ' % (
                N, fnlabel, R1, R2, reflabel, sp, fnlabel)
            bwa_cmds.append(cmd)
            cmd = 'bwa mem -t %s -B 2 -M %s %s | grep "%s" > %s_single_%s_mapped.sam ' % (
                N, fnlabel, single, reflabel, sp, fnlabel)
            bwa_cmds.append(cmd)
            bwa_cmds.append('\nwait\n')

    dn = 'bwa.cmds'
    cmn.write_lines(bwa_cmds, dn)
Code Example #17
    print('guessing fastq file to be %s' % fq)

if fref == '':
    print('Error! cannot find the ref table file!')
    sys.exit()
else:
    print('guessing ref table file to be %s' % fref)

fq_all = '/project/biophysics/Nick_lab/mtang/archive/step1_info/fastq.filelist'
if os.path.exists(fq_all):
    aset = set(cmn.getid(fq_all))
else:
    aset = set([])

bset = set(cmn.getid(fq))
newset = aset | bset

newset = filter_best_fastq(newset)

cmn.write_lines(newset, fq_all)

fref_all = '/project/biophysics/Nick_lab/mtang/archive/step1_info/refTable.txt'
if os.path.exists(fref_all):
    aset = set(cmn.getid(fref_all))
else:
    aset = set([])

bset = set(cmn.getid(fref))
newset = aset | bset
cmn.write_lines(newset, fref_all)
Code Example #18
    mapInfo = ['#both indexes start at 0 (not 1)']
    mapDict = {}
    new = []
    for defline in adict:
        seq = adict[defline]
        label = defline[1:].split('_')[0]
        #goodSeq = [char for i, char in enumerate(seq)
        #        if i not in gapped]
        count = 0
        goodSeq = []
        for i, char in enumerate(seq):
            if i not in gapped:
                #mapInfo.append('%s\t%s\t%s' % (label, count, i))
                mapDict[count] = i
                count += 1
                goodSeq.append(char)

        goodSeq = ''.join(goodSeq)

        new.append('%s\n%s\n' % (defline, goodSeq))

    f_label = '%s_%s' % (cmn.lastName(fn).split('.')[0], cmn.lastName(sys.argv[2]))
    dn = f_label + '_noGap.fa'
    cmn.write_file(''.join(new), dn)

    mapInfo += ['%s\t%s' % (key, mapDict[key]) for key in mapDict]
    dn = f_label + '_noGap2coding_index.info'
    cmn.write_lines(mapInfo, dn)

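A toy re-run of the index-map logic above, to make the `mapDict` semantics concrete (hypothetical inputs): with all-gap columns `{1, 3}` removed from `A-C-G`, each ungapped position maps back to its original alignment column.

gapped = {1, 3}              # alignment columns that are all-gap
seq = 'A-C-G'
mapDict, goodSeq, count = {}, [], 0
for i, char in enumerate(seq):
    if i not in gapped:
        mapDict[count] = i   # ungapped index -> original column
        count += 1
        goodSeq.append(char)
print(''.join(goodSeq), mapDict)  # ACG {0: 0, 1: 2, 2: 4}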
Code Example #19

    seqDict = read_fa(fn)
    lengths = [len(seqDict[i]) for i in seqDict]
    if len(set(lengths)) != 1:
        print('alignments are not in the same length! below is the stat:')
        for i in seqDict:
            print(i, len(seqDict[i]))
        sys.exit()


    keys = list(seqDict.keys())

    info = [str(len(seqDict))]
    for name in keys:
        line = [name]
        for name2 in keys:
            if name2 == name:
                line.append(0.0)
            else:
                dist = compute_distance(name, name2, seqDict)
                line.append(dist)
        info.append('\t'.join(map(str, line)))

    info.append('')
    cmn.write_lines(info, fn + '.dist')
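compute_distance is defined elsewhere in the file and not shown. A plausible minimal version, stated as an assumption rather than the project's actual metric, is an uncorrected p-distance that skips gap and ambiguity characters:

def compute_distance(name1, name2, seqDict, gapChars=set('-NX')):
    # fraction of mismatches among columns where both sequences have a real base
    seq1, seq2 = seqDict[name1], seqDict[name2]
    compared = mismatch = 0
    for a, b in zip(seq1, seq2):
        if a in gapChars or b in gapChars:
            continue
        compared += 1
        if a != b:
            mismatch += 1
    return float(mismatch) / compared if compared else 0.0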
Code Example #20
        seqs += adict[name]

    for i in range(len(seqs[0])):
        chars = [seq[i] for seq in seqs]
        check_chars = [char for char in chars if (char != '-') and (char != 'N')]
        Nchar = len(check_chars)
        #N = len(set(check_chars))
        count_dict = Counter(check_chars)
        N = len(count_dict)

        if N == 0:
            #skip positions that are all gaps
            continue
        if any([count_dict[char] < Ncut for char in count_dict]):
            #skip non-info positions
            continue
        elif float(Nchar) / len(chars) < gap_cut:
            #skip gap positions
            continue
        else:
            line = chars2line(i, chars)
            hetero[N-1].append(line)


    for i, each in enumerate(hetero):
        dn = cmn.lastName(fn) + '.hetero%s' % (i + 1)
        cmn.write_lines(each, dn)
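`hetero` and `chars2line` come from earlier in this file and are not shown; a minimal reading consistent with the loop above (an assumption, not the project's code) is a set of buckets indexed by allele count and a simple column formatter:

hetero = [[] for _ in range(4)]  # hetero[N-1] collects positions with N distinct alleles

def chars2line(i, chars):
    # one line per kept alignment column: 0-based position plus the column itself
    return '%s\t%s' % (i, ''.join(chars))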
Code Example #21
f_list = 'falist'

cmd = 'ls %s/* > %s' % (wdir, f_list)
cmn.run(cmd)

falist = [os.path.abspath(fn) for fn in cmn.file2lines(f_list)]

Njob = 3
fa_size = cmn.filesize(falist[0]) / 1024 / 1024

Njob = max(Njob, int(50 * fa_size / 5000) + 1)

Ncores = int(48 * Njob / 100)  #keep the counts integral; plain '/' yields floats in Python 3
print('number of cores:', Ncores)
print('number of jobs:', Njob)

cmds = []
outdir = 'making_fastme_trees'
cmn.mkdir(outdir)
for fa in falist:
    cmd = 'cd %s; python /project/biophysics/Nick_lab/wli/sequencing/scripts/fasta2fastmeTree.py %s %s' % (
        outdir, fa, Ncores)
    cmds.append(cmd)

cmn.write_lines(cmds, 'fastme.cmds')

cmd = 'python /home2/wli/my_programs/submit_jobs.py fastme.cmds %s %s -p 256GB' % (
    Njob, project)
cmn.run(cmd)
Code Example #22
def fasta2chpInput(fn, fdonor, freceipt):
    ##############################
    gapCut = 0.1
    Napp = 4
    gapChars = set(list('N-X'))
    linkage = 10000000.0 #larger values mean stronger linkage (fewer Morgans per bp)
    ##############################
    #outlabel = 'chp_%s_%s_%s_gap%s_info%s' % (cmn.lastName(fn), cmn.lastName(fdonor), cmn.lastName(freceipt), gapCut, Napp)
    outlabel = cmn.lastName(fn)

    #parsing sequence
    seqDict, seqLength = read_fa(fn)
    seqGroups = group_seqDict(seqDict)
    included_IDs = set(seqGroups.keys())

    donor_dict = parse_popDef(fdonor, freceipt, included_IDs)
    donor_keys = list(donor_dict.keys())
    random.shuffle(donor_keys)
    print(donor_keys)
    donorIDs = []

    donorF = []
    key_groups = [] #used later to fill gaps within each group
    for key in donor_keys:#iterate the shuffled list to guarantee a consistent ordering
        IDs = donor_dict[key]
        key_groups.append(IDs)
        donorIDs += list(IDs)
        line = 'p%s %s\n' % (key, len(IDs)*2)
        donorF.append(line)

    dn = outlabel + '.donor'
    cmn.write_file(''.join(donorF), dn)

    receiptIDs = [line.split()[0] for line in cmn.file2lines(freceipt)]
    receiptIDs = list(set(receiptIDs) & included_IDs)
    receiptInfo = parse_receiptInfo(freceipt, receiptIDs)
    dn = outlabel + '_ind_record.list'
    cmn.write_lines(receiptInfo, dn)
    key_groups.append(receiptIDs)

    ids = []
    ordered_keys = donorIDs + receiptIDs

    phase = [str(len(donorIDs)*2)]
    phase.append(str(len(donorIDs) + len(receiptIDs)))

    Npos = 0
    Pline = ['P']
    phaseSeqs = [[] for _ in range(len(ordered_keys)*2)]

    for i in range(seqLength):
        chars = []
        isBad = False
        for keys in key_groups:
            subchars = take_position_chars(seqGroups, keys, i)

            #require at least half of the subchars to be non-gap
            nonGap_sub = [char for char in subchars
                    if char not in gapChars]
            #print i, subchars, nonGap_sub
            if len(nonGap_sub) < len(subchars) * 0.5:
                isBad = True
                break

            #randomly sample observed characters to fill in gaps
            newSubChars = []
            for char in subchars:
                if char in gapChars:
                    newChar = random.sample(nonGap_sub, 1)[0]
                else:
                    newChar = char
                newSubChars.append(newChar)

            chars += newSubChars
        #print 'chars', chars
        if isBad:
            continue
        nonGaps = [char for char in chars
                if char not in gapChars]
        #print i, chars
        #print i, nonGaps
        if len(nonGaps) > ((1 - gapCut) * len(chars)):
        #if len(nonGaps) == len(chars):
            #keep only informative biallelic positions (each allele seen >= Napp times)
            count_dict = Counter(nonGaps)
            if len(count_dict) != 2:
                continue
            if any([count_dict[key] < Napp for key in count_dict]):
                continue
            #print i, 'isGood'
            Pline.append(i)
            Npos += 1
            for j, char in enumerate(chars):
                if char in gapChars:
                    char = '0'

                phaseSeqs[j].append(char)

    dn = outlabel + '.hap'
    phase.append(str(Npos))
    phase.append(' '.join(map(str, Pline)))
    print('number of positions: %s' % (len(Pline) -1))
    phase.append('S' * Npos)
    phase += [''.join(map(str, each)) for each in phaseSeqs]
    phase.append('')
    cmn.write_lines(phase, dn)

    #parse out recomb
    linkage = float(linkage)
    positions = Pline[1:]
    recomb = ['pos morgan.dist']
    for i in range(len(positions) - 1):
        p1 = positions[i]
        p2 = positions[i+1]
        dist = p2 - p1
        morgan = dist / linkage
        recomb.append('%s %s' % (p1, morgan))
    recomb.append('%s 0' % (positions[-1]))
    recomb.append('')
    dn = outlabel + '.recomb'
    cmn.write_lines(recomb, dn)

    return outlabel
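The function writes four files sharing one label: `.donor` (population sizes in haplotypes), `_ind_record.list`, `.hap` (positions plus per-haplotype characters), and `.recomb` (a uniform map at 1/linkage Morgans per bp). The `chp` in the commented-out label suggests ChromoPainter-style input, though that is an inference. A hypothetical call, with made-up file names:

label = fasta2chpInput('aligned.fa', 'donor_pops.txt', 'recipient_ids.txt')
print('wrote %s.donor, %s_ind_record.list, %s.hap, %s.recomb'
      % (label, label, label, label))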
Code Example #23
    hasCombined = True
    if len(old_fastqs) == 0:  # no old data
        print('no old libs found for %s' % label)
        cmn.run('ln -s %s' % dn)
    else:  #has old data
        print('combining old libs for %s' % label)
        old_fastqs, dup_fastqs = remove_duplication(old_fastqs)
        cmn.run('cp %s %s' % (dn, wdir))
        log_info.append('%s\t%s\n' % (label, dn))
        comb_fn = '%s/%s' % (wdir, cmn.lastName(dn))
        for old_fastq in old_fastqs:
            cmn.run('cat %s >> %s' % (old_fastq, comb_fn))
            log_info.append('%s\t%s\n' % (label, old_fastq))

if hasCombined:
    cmn.write_file(''.join(log_info), '%s/combined_libs.log' % wdir)

#make statistics for data amount

fastq_groups = group_fastq(fastqs)

new = []
for key in fastq_groups:
    fns = fastq_groups[key]
    cmd = 'python /work/biophysics/mtang/SNP_calling/scripts/check_fastq_size.py %s %s' % (
        key, ','.join(fns))
    new.append(cmd)

new.append('')
cmn.write_lines(new, 'fastq_amount.cmds')
Code Example #24
    dn = outlabel + '.ids'
    cmn.write_file(''.join(ids), dn)

    phase = [str(Nhap)]
    Npos = 0
    Pline = ['P']
    phaseSeqs = [[] for _ in range(Nhap)]

    for i in range(seqLength):
        chars = []
        for key in ordered_keys:
            chars += [seq[i] for seq in seqGroups[key]]

        nonGaps = [char for char in chars if char not in gapChars]

        if len(nonGaps) > ((1 - gapCut) * len(chars)):
            Pline.append(i)
            Npos += 1
            for j, char in enumerate(chars):
                if char in gapChars:
                    char = '0'

                phaseSeqs[j].append(char)

    dn = outlabel + '.phase'
    phase.append(str(Npos))
    phase.append(' '.join(map(str, Pline)))
    phase += [''.join(map(str, each)) for each in phaseSeqs]
    phase.append('')
    cmn.write_lines(phase, dn)
Code Example #25
        #fqlist = cmn.cmd2lines('ls /project/biophysics/Nick_lab/wli/sequencing/Eudamine/BEAST_timing/tmp_link_fastq/%s*.fastq' % sample)
        #fqlist = cmn.cmd2lines('ls /work/biophysics/wli/workspace/filtered_6313*q')
        wdir = 'mitoD_%s' % sample
        cmn.mkdir(wdir)
        os.chdir(wdir)
        cwd = os.getcwd()
        info = template.replace('[cwd]', cwd)
        info = info.replace('[fq_files]', ' '.join(fqlist))
        info = info.replace('[sample]', sample)

        #prepare quake infiles
        fqlist_local = []
        for fq in fqlist:
            cmn.run('ln -s ' + fq)
            fqlist_local.append(cmn.lastName(fq))
        cmn.write_lines(fqlist_local, 'fqlist')
        cmn.run('ln -s fqlist infiles')

        #make fq2fa command
        quake_fqlist = [each.replace('.fastq', '.cor.fastq') for each in fqlist_local]
        fq2fa_cmds = ['rm %s.fa 2> /dev/null' % sample]
        for fq in quake_fqlist:
            cmd = 'fq2fa %s >> %s.fa;' % (fq, sample)
            fq2fa_cmds.append(cmd)

        cmn.write_lines(quake_fqlist, 'fqlist.cor')
        cmd = '\n'.join(fq2fa_cmds)
        info = info.replace('[fq2fa_commands]', cmd)

        noWolba_fqlist = ['noWolb_%s' % each for each in quake_fqlist]
        info = info.replace('[noWolba_fastq]', ' '.join(noWolba_fqlist))
Code Example #26
File: concat_maps.py Project: ATPs/xiaolongTools
    fns = cmn.cmd2lines('ls *.map| grep -v all| grep -v test| grep -v concat')
    #fns = fns[:1]

    f_label = '15101E05_snp.vcf'


    #make the scaffold index
    #cmd = "grep -v '^#' 15101E05_snp.vcf| cut -f 1,2 > index_header"
    if not cmn.filexist('index_header'):
    #    cmn.run(cmd)
        make_index_header('15101E05_snp.vcf', 'assembly_v2_length.txt')

    header = ['scaffold', 'index']

    for fn in fns:
        label = fn.split('_')[0]
        header.append(label+'_f')
        header.append(label+'_m')

    cmn.write_lines(fns, 'map_name_order')

    cmn.write_file('\t'.join(header)+'\n', 'table_header')

    cmd = 'cp table_header all_concat.map;'
    cmd += 'paste index_header %s >> all_concat.map;' % ' '.join(fns)
    if not cmn.filexist('all_concat.map'):
        print(cmd)
        cmn.run(cmd)
    else:
        print('the final file all_concat.map already exists, skip!')
Code Example #27
File: find_maplist.py Project: ATPs/xiaolongTools
        if len(taken) == 0:
            missing.append(ID)
        elif len(taken) == 1:
            falist.append(taken[0])
        else:
            falist.append(tell_best_fa(taken))
        print('checklog', ID, taken)
    alea_files = []
    biohpc_files = []
    for each in falist:
        if '/archive/butterfly/' in each or ('jshen/h' in each):
            alea_files.append(each)
        else:
            biohpc_files.append(each)

    new_files = transfer_alea_files(alea_files)
    falist = biohpc_files + new_files

    falist.append('')
    cmn.write_lines(falist, 'statlist')

    if len(missing) != 0:
        print('ATTENTION! the following IDs are missing sequences!')
        print('\n'.join(missing))
        cmn.write_lines(missing, 'missingIDs')

    #if len(alea_files) != 0:
    #    print 'the following files need to transfer from /archive server'
    #    print '\n'.join(alea_files)
Code Example #28
File: runSTRUCTURE.py Project: ATPs/xiaolongTools
            newConfig.append('#define OUTFILE structure.output')
        elif '#define MAXPOPS' in line:
            newConfig.append('#define MAXPOPS %s' % K)
        else:
            newConfig.append(line)

    newConfig.append('')

    cmds = []
    for eachtime in range(rep):
        outdir = 'structureK%s/r%s' % (K, eachtime)
        cmn.mkdir(outdir)
        os.chdir(outdir)

        dn = 'mainparams'
        cmn.write_lines(newConfig, dn)

        cmd = 'cd %s; touch extraparams; /home2/wli/local/Structure/bin/structure > runtime.log & cd %s' % (
            outdir, cwd)
        #cmn.run(cmd)

        cmds.append(cmd)
        cmd = 'sleep 55;'
        cmds.append(cmd)

        os.chdir(cwd)

    cmds.append('\nwait\n')
    dn = 'masterK%s.bash' % K
    cmn.write_lines(cmds, dn)
    cmd = 'bash %s' % dn
Code Example #29
        #fn = cmn.lastName(fn)

        fqs = cmn.cmd2lines(
            'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/1_bwa_align/%s*.fq'
            % sp)
        fqs += cmn.cmd2lines(
            'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/1_bwa_align/%s*.fastq'
            % sp)
        if len(fqs) == 0:
            print('cannot find fastq for %s!' % sp)
            os.chdir('..')
            continue

        cmd, fsams = make_bwa_cmds(fqs, fn)
        dn = 'sam_filelist'
        cmn.write_lines(fsams, dn)

        info = template.replace('assembly_selfref', asslabel)
        info = info.replace('[bwa_cmds]', cmd)
        info = info.replace('[WL_sam_filelist]', dn)
        info = info.replace('[WL_preprocessing]', '\n'.join(step1cmds))

        #make snp call cmds
        #f_sam = merge_sams(sp, fsams)

        info = info.replace('5328', sp)
        info = info.replace('[WL_cwd]', os.getcwd())

        info2 = template2.replace('assembly_selfref', asslabel)
        info2 = info2.replace('5328', sp)
        info2 = info2.replace('[WL_cwd]', os.getcwd())
Code Example #30
        fq = os.path.abspath(fq)
        ID = cmn.lastName(fq).split('_')[0]
        try:
            groupDict[ID].append(fq)
        except KeyError:
            groupDict[ID] = [fq]

    fmitolist = os.path.abspath(fmitolist)
    for sample in groupDict:
        fqlist = groupDict[sample]
        wdir = 'mitoRef_%s' % sample
        cmn.mkdir(wdir)
        os.chdir(wdir)
        cwd = os.getcwd()
        info = template.replace('[cwd]', cwd)
        cmn.write_lines(fqlist, 'fqlist')
        cmd = 'cat %s|xargs cat > ref_mito.fa; module add bwa; bwa index ref_mito.fa' % fmitolist
        cmn.run(cmd)

        bwa_commands = []
        if len(fqlist) == 3:
            R1 = [each for each in fqlist if '_R1' in each][0]
            R2 = [each for each in fqlist if '_R2' in each][0]
            single = [each for each in fqlist if '_single' in each][0]
            bwa_commands.append('bwa mem -M ref_mito.fa %s %s > paired.sam' %
                                (R1, R2))
            bwa_commands.append('bwa mem -M ref_mito.fa %s > singleton.sam' %
                                (single))
        elif len(fqlist) == 2:
            R1 = [each for each in fqlist if '_R1' in each][0]
            R2 = [each for each in fqlist if '_R2' in each][0]