bfa = '>%s\n%s\n' % (subjct, seq)
            difference = compare_itself(fasta, bfa)
            paired_fa.append(bfa)
            keys = list(difference.keys())
            keys.sort()
            for pos in keys:
                info.append('%s\t%s\t%s\t%s\n' %
                            (name, subjct, pos, difference[pos]))
            if isInBlast:
                #itself is found by blast
                pass
            else:
                #this is very special
                #the barcode is not in the blast result
                difference, subjct = compare_top_hit(br_result)
                bfa = '>%s(addBack_closest_barcode)\n%s\n' % (
                    subjct, barCodeDict[subjct])
                paired_fa.append(bfa)
                keys = list(difference.keys())
                keys.sort()
                for pos in keys:
                    info.append('%s\t%s\t%s\t%s\n' %
                                (name, subjct, pos, difference[pos]))

        info.append('#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#\n')
    dn = cmn.lastName(fn) + '.report'
    cmn.write_file(''.join(info), dn)

    dn = cmn.lastName(fn) + '_paired.fa'
    cmn.write_file(''.join(paired_fa), dn)
Exemple #2
0
if __name__=='__main__':
    # Script entry point.  Reads a list file whose rows start with
    # "sample genus species", and for each row calls get_query_sequence,
    # do_barcode_blast, pick_barcode_baits and format_baits in that order.
    # Everything format_baits returns is concatenated and written to
    # "<cmn.lastName(input)>.baits".
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'
    #fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes.fasta'
    seqDict = read_fa(fall)

    info = []
    for line in cmn.file2lines(fn):
        #5077    Autochton zarex
        items = line.split()
        if not items:
            # Blank rows (e.g. a trailing newline) would otherwise break the
            # three-way unpack below.
            continue
        sample, genus, sp = items[:3]
        query_sequence, qlen = get_query_sequence(seqDict, genus, sp)
        br_result = do_barcode_blast(query_sequence)
        print(br_result)
        #print '\n'.join(br_result)
        baits = pick_barcode_baits(br_result, qlen, seqDict)
        info += format_baits(sample, baits)

    dn = cmn.lastName(fn) + '.baits'
    cmn.write_file(''.join(info), dn)



Exemple #3
0
        fn, fadd = sys.argv[1:3]
    except:
        print("Usage: *.py aln repID.file", file=sys.stderr)
        sys.exit()

    #fID = '/work/biophysics/wli/introgression2/4_filterIntro/rep_sps'
    #fID = '/project/biophysics/Nick_lab/wli/sequencing/myAnalysis/clean_ref_bias/4_build_tree/pureIDs'
    #goodIDs = set([i.split()[0] for i in cmn.file2lines(fID)
    #       if i.strip() != ''])

    #fadd = 'added_sps'
    #if cmn.filexist(fadd):
    #    print 'found local list, add them in'
    goodIDs = set(cmn.getid(fadd))

    dn = cmn.lastName(fn).replace('.fasta', '').replace('.fa',
                                                        '') + '_taken.fa'
    dp = open(dn, 'w')

    new = []
    leftIDs = set(goodIDs)
    with open(fn) as fp:
        for line in fp:
            if line[0] == '>':
                name = line[1:].strip().split('_')[0].split('-')[0]
                if name in goodIDs:
                    isGood = True
                    if name in leftIDs:
                        leftIDs.remove(name)
                else:
                    isGood = False
Exemple #4
0
    for each in contig:
        scaf, position1, char1, char2, phase = each
        phase += '[swap]'
        newlist.append((scaf, position1, char2, char1, phase))
    return newlist


if __name__ == '__main__':
    #options=parse_options()
    try:
        fsam, fletter = sys.argv[1:]
    except:
        print("Usage: *.py *.sam *.letters", file=sys.stderr)
        sys.exit()

    outlabel = cmn.lastName(fletter)[:-8]
    print(outlabel)
    #{read_query_name: [record1, record2]}
    paired_samDict = read_samfile(fsam)
    #covDict[scaf][index][char]
    covDict = compute_coverage_from_sam(paired_samDict)
    cons_seq = make_cons_from_covDict(covDict)

    #adict = {'scaf': position1: [A, T, phase]}
    letter_dict, inconsistent_positions = read_letter_file(fletter)

    #still save inconsistent letter in letter_dict because we need to break contigs by them
    letter_dict, corrected_dict = correct_false_snp_call(
        letter_dict, covDict, inconsistent_positions)
    new = []
    for scaf in corrected_dict:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    # Script entry point.  Takes two read files (R1 and R2) and, for each
    # value in spacing_list, opens a fresh pair of output FASTQ files and
    # walks both inputs in lockstep.  The remainder of the inner loop is not
    # visible in this excerpt.
    #options=parse_options()
    try:
        fR1, fR2 = sys.argv[1:3]
    except:
        print("Usage: *.py R1 R2", file=sys.stderr)
        sys.exit()


    # Sample label = everything before the first '_' of the R1 file name.
    sample = cmn.lastName(fR1).split('_')[0]

    spacing_list = [2, 3, 5, 10]

    count = 0
    for spacing in spacing_list:
        # Both inputs are reopened for every spacing, so each pass starts
        # reading from the top of the files.
        fpR1 = open(fR1)
        fpR2 = open(fR2)
        dnlabel = '%s.spacing%s' % (sample, spacing)
        print('making %s' % dnlabel)

        # Outputs: <sample>.spacing<N>_R1.fastq and <sample>.spacing<N>_R2.fastq
        dnR1 = open('%s_R1.fastq' % dnlabel, 'w')
        dnR2 = open('%s_R2.fastq' % dnlabel, 'w')

        # R2 is advanced one line per R1 line so the two stay in step.
        for i, line1 in enumerate(fpR1):
            line2 = fpR2.readline()
Exemple #6
0
        adict[sp] = ref
        rset.add(ref)

    if isbad:
        sys.exit()
    return rset, adict


# Top-level script section: reads a list of VCF paths (argv[1]) and a list of
# directories (argv[2]), then derives the reference label of every VCF from
# its file name.
#1. read in data
fns = cmn.getid(sys.argv[1])

# Directory entries are normalised by dropping surrounding whitespace and any
# trailing '/'.
bwa_dirs = [line.strip().rstrip('/') for line in cmn.getid(sys.argv[2])]

#2. check which reference they used
#sp is unique
# Maps the file label (file name minus '_snp_step2.vcf') to the VCF path.
vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns}
#6188_3842_assembly_v2_snp_step2.vcf
#vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns}
sps = list(vcf_dict.keys())

#ref_genomes, refmapping = detect_ref_genomes(sps, bwa_dirs)
# The detect_ref_genomes call above is disabled; the reference is instead
# parsed from the file label: the first '_'-separated field is taken as the
# sample and everything after it as the reference name.
ref_genomes, refmapping = set([]), {}
for fn in fns:
    #../../step3_gatk/5729_3614_assembly_v1/5729_3614_assembly_v1_snp_step2.vcf
    fnlabel = cmn.lastName(fn).replace('_snp_step2.vcf', '')
    items = fnlabel.split('_')
    sp = items[0]
    ref = '_'.join(items[1:])
    ref_genomes.add(ref)
    # Keyed by the full label (not by sp alone).
    refmapping[fnlabel] = ref
Exemple #7
0
import sys
python_lib = '/work/00412/mtang/sequencing/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Script entry point.  Converts a three-column text file
    # (exon, species, sequence) into FASTA records named "<species>_<exon>",
    # where the species field is cut at its first '.'.  The result is written
    # to "<input name without '.sum'>.fa".
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py *.clw", file=sys.stderr)
        sys.exit()

    new = []
    with open(fn) as fp:
        for line in fp:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. at end of file) carry no record and would
                # break the three-way unpack below.
                continue
            exon, sp, seq = fields
            sp = sp.split('.')[0]
            new.append('>%s_%s\n%s\n' % (sp, exon, seq))

    dn = cmn.lastName(fn).replace('.sum', '') + '.fa'
    cmn.write_file(''.join(new), dn)
    adict = {}
    fastas = cmn.txt_read(fa).split('>')[1:]
    print(fastas)
    for each in fastas:
        lines = each.strip().split('\n')
        defline = lines[0]
        seq = ''.join([line.strip() for line in lines[1:]])
        #seq = seq.replace('N', '-')
        adict[defline] = seq
    return adict


if __name__ == '__main__':
    # Script entry point.  Cuts the column range "i-j" (Python slice
    # semantics: seq[i:j]) out of every sequence returned by read_fa and
    # writes the slices to "<input name>_<range>.fa".
    #options=parse_options()
    try:
        fn, Range = sys.argv[1:3]
        i, j = list(map(int, Range.split('-')))
    except ValueError:
        # ValueError covers every expected failure: too few arguments
        # (unpacking), a range without exactly two parts, or non-integer
        # bounds.  A bare ``except`` would also hide KeyboardInterrupt.
        print("Usage: *.py aln 0-10000", file=sys.stderr)
        sys.exit()

    seqDict = read_fa(fn)
    new = ['>%s\n%s\n' % (name, seqDict[name][i:j]) for name in seqDict]

    # NOTE(review): replace('.fa', '') also rewrites '.fasta' names
    # ('x.fasta' -> 'xsta'); left as is so existing output names do not change.
    dn = '%s_%s.fa' % (cmn.lastName(fn).replace('.fa', ''), Range)
    cmn.write_file(''.join(new), dn)
import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Script entry point.  Copies to "filtered<input name>" only those lines
    # of a SAM file that concern one scaffold: '@' header lines that mention
    # the scaffold name, and alignment lines whose third column equals it.
    #options=parse_options()
    try:
        fn = sys.argv[1]
        ref_scaf = sys.argv[2]
    except IndexError:
        # Only missing arguments are a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py samfile scaf", file=sys.stderr)
        sys.exit()

    dn = 'filtered' + cmn.lastName(fn)
    # ``with`` closes both handles even if a line fails to parse; opening the
    # input first avoids leaving an empty output behind when it is missing.
    with open(fn) as fp, open(dn, 'w') as dp:
        for line in fp:
            if line[0] == '@':
                # NOTE(review): substring test, so 'scaf1' also keeps header
                # lines for 'scaf10', and headers that do not name the
                # scaffold are dropped -- confirm this is intended.
                if ref_scaf in line:
                    dp.write(line)
                continue

            fields = line.split()
            # Lines with fewer than three columns (e.g. blank lines) cannot
            # match and are skipped instead of raising IndexError.
            if len(fields) > 2 and fields[2] == ref_scaf:
                dp.write(line)
if __name__ == '__main__':
    # Script entry point.  Groups the fastq paths listed in the first argument
    # by sample ID and prepares one working directory per sample from a
    # template.  The excerpt ends after the shell command string is built.
    #options=parse_options()
    try:
        fqlist, fmitolist = sys.argv[1:]
    except:
        print("Usage: *.py fqlist refMitoList", file=sys.stderr)
        sys.exit()

    #ftemplate = '/work/biophysics/wli/Eudamine/wholeMito_run2/mito_denovo.template'
    ftemplate = '/project/biophysics/Nick_lab/wli/sequencing/scripts/mito_scripts/mito_refDenovo.template'
    template = cmn.txt_read(ftemplate)
    # fqlist is rebound from the list-file path to its lines.
    fqlist = cmn.file2lines(fqlist)
    # sample ID (file name up to the first '_') -> absolute fastq paths
    groupDict = {}
    for fq in fqlist:
        fq = os.path.abspath(fq)
        ID = cmn.lastName(fq).split('_')[0]
        try:
            groupDict[ID].append(fq)
        except KeyError:
            groupDict[ID] = [fq]

    # Made absolute before the loop because the loop changes directory.
    fmitolist = os.path.abspath(fmitolist)
    for sample in groupDict:
        fqlist = groupDict[sample]
        wdir = 'mitoRef_%s' % sample
        cmn.mkdir(wdir)
        # NOTE(review): no chdir back is visible in this excerpt; with a
        # relative wdir the next sample would nest inside this one unless the
        # hidden remainder of the loop returns to the parent -- confirm.
        os.chdir(wdir)
        cwd = os.getcwd()
        # The template's '[cwd]' placeholder is filled with the sample dir.
        info = template.replace('[cwd]', cwd)
        cmn.write_lines(fqlist, 'fqlist')
        cmd = 'cat %s|xargs cat > ref_mito.fa; module add bwa; bwa index ref_mito.fa' % fmitolist
Exemple #11
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py *.fq", file=sys.stderr)
        sys.exit()

    taken = set([])

    dn = cmn.lastName(fn).split('.')[0] + '_unified.fastq'

    isGood = True
    #new = []
    dp = open(dn, 'w')
    with open(fn) as fp:
        for i, line in enumerate(fp):
            if i % 4 == 0:
                ID = line.strip()
                if ID not in taken:
                    isGood = True
                    taken.add(ID)
                else:
                    isGood = False
                    print('duplcated ID: %s' % ID)
            if isGood:
Exemple #12
0
import sys
python_lib = '/home2/wli/my_programs/python_lib'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Script entry point.  Converts a two-column text file (name, sequence)
    # into FASTA and writes it to "<input name without '.clw'>.fa".
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py *.clw", file=sys.stderr)
        sys.exit()

    new = []
    with open(fn) as fp:
        for line in fp:
            fields = line.split()
            if not fields:
                # Blank lines carry no record and would break the unpack.
                continue
            name, seq = fields
            new.append('>%s\n%s\n' % (name, seq))

    dn = cmn.lastName(fn).replace('.clw', '') + '.fa'
    cmn.write_file(''.join(new), dn)
Exemple #13
0
    ]
    fmis = cmn.cmd2lines('ls rescued_read_assembled_mis1*.txt')[0]
    fns.append(fmis)

    for fn in fns:
        cmd = 'chmod a+w %s' % fn
        cmn.run(cmd)

    cmd = "ssh [email protected] 'rm /data/www/wenlin/html/transfer/barcode_lineup_files/%s_rescued_read_assembled_mis1*.txt'" % sp
    cmn.run(cmd)

    cmd = 'rm /project/biophysics/Nick_lab/wli/archive/BWA_barcodes/lineup_files/%s_rescued_read_assembled_mis1*.txt' % sp
    cmn.run(cmd)

    ddirs = [
        '/project/biophysics/Nick_lab/wli/archive/BWA_barcodes/lineup_files',
        '[email protected]:/data/www/wenlin/html/transfer/barcode_lineup_files/other_data'
    ]

    cmd = 'rsync -av %s [email protected]:/data/www/wenlin/html/transfer/barcode_lineup_files/%s_%s' % (
        fmis, sp, fmis)
    cmn.run(cmd)

    for ddir in ddirs:
        for fn in fns:
            if 'rescued_read_assembled_mis1' in fn and '/other_data' in ddir:
                continue

            cmd = 'rsync -av %s %s/%s_%s' % (fn, ddir, sp, cmn.lastName(fn))
            cmn.run(cmd)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Script entry point.  For every scaffold returned by read_fa (in file
    # order) writes one "<scaffold>\t<1-based position>" line per base to a
    # "<assembly name>_scaf.header" file placed next to the assembly.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py assembly", file=sys.stderr)
        sys.exit()

    seqDict, order_scafs = read_fa(fn)

    #write in the same dict as assembly
    olabel = cmn.lastName(fn)
    # Drop the last extension and append the header suffix.
    dnlabel = '.'.join(olabel.split('.')[:-1]) + '_scaf.header'
    dn = fn.replace(olabel, dnlabel)

    # ``with`` guarantees the (potentially very large) output is flushed and
    # closed even if a scaffold lookup fails part-way through.
    with open(dn, 'w') as fdn:
        for name in order_scafs:
            # Only the first whitespace-delimited token of the defline is used.
            shortname = name.split()[0]
            length = len(seqDict[name])
            fdn.writelines('%s\t%s\n' % (shortname, pos)
                           for pos in range(1, length + 1))
    with open(fn) as fp:
        for line in fp:
            if line[0] == '>':
                defline = line.strip()
                if sampleID in defline:
                    isTaken = True
                else:
                    isTaken = False
            else:
                if isTaken:
                    takenSeq = line.strip()
                    print('take the base seq as %s  for %s' %
                          (defline, sampleID))
                    break

    goodP = [i for i in range(len(takenSeq)) if takenSeq[i] not in gapChars]

    f_label = cmn.lastName(fn).replace('.fasta', '').replace('.fa', '')
    dn = f_label + '_base%s.fa' % sampleID
    dp = open(dn, 'w')

    with open(fn) as fp:
        for line in fp:
            if line[0] == '>':
                defline = line.strip()
            else:
                goodSeq = ''.join([line[i] for i in goodP])
                dp.write('%s\n%s\n' % (defline, goodSeq))
    dp.close()
Exemple #16
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Script entry point.  Iterates over the mapped records of an alignment
    # file and accumulates per-scaffold statistics.  The rest of the loop body
    # and the writing of the two output files are not visible in this excerpt.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py sam", file=sys.stderr)
        sys.exit()

    # Output names derived from the input name with '.sam' removed.
    dnH = cmn.lastName(fn).replace('.sam', '') + '.HighQmapStat'
    dnL = cmn.lastName(fn).replace('.sam', '') + '.mapStat'

    # rdict: scaffold -> two-element counter list (initialised to [0, 0]).
    # hdict: presumably the high-quality counterpart written to dnH -- it is
    # not used within the visible lines; TODO confirm.
    rdict = {}
    hdict = {}
    samfile = pysam.AlignmentFile(fn)
    for record in samfile:
        if record.is_unmapped:
            continue

        scaf = record.reference_name
        aligns = record.get_aligned_pairs()
        # Count only pairs in which neither member is None.
        N = len([each for each in aligns if None not in each])

        if scaf not in rdict:
            rdict[scaf] = [0, 0]
Exemple #17
0
        adict[defline] = seq.upper()
    return adict


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    # Script entry point.  Reverses every sequence returned by read_fa, maps
    # each character through rdict, and writes the results to
    # "<input name>.reverse" with "_reverse" appended to each record name.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py aln.fa", file=sys.stderr)
        sys.exit()

    seqDict = read_fa(fn)

    # rdict is defined outside this block; a character missing from it
    # raises KeyError.
    newDict = {key: ''.join(rdict[char] for char in reversed(seqDict[key]))
               for key in seqDict}

    dn = cmn.lastName(fn) + '.reverse'
    fastas = ['>%s_reverse\n%s\n' % (name, newDict[name])
              for name in newDict]
    cmn.write_file(''.join(fastas), dn)


        adict[defline] = seq.upper()
    return adict


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    # Script entry point.  Maps every character of every sequence returned by
    # read_fa through rdict (order unchanged) and writes the results to
    # "<input name>.cmpl".
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py aln.fa", file=sys.stderr)
        sys.exit()

    seqDict = read_fa(fn)

    # rdict is defined outside this block; a character missing from it
    # raises KeyError.
    newDict = {key: ''.join(rdict[char] for char in seqDict[key])
               for key in seqDict}

    dn = cmn.lastName(fn) + '.cmpl'
    # NOTE(review): records are labelled "_reverse" although the sequence is
    # not reversed here; kept unchanged so downstream record names still
    # match -- confirm whether a different suffix was intended.
    fastas = ['>%s_reverse\n%s\n' % (name, newDict[name])
              for name in newDict]
    cmn.write_file(''.join(fastas), dn)


                    if copy_counts[0] > copy_counts[1]:
                        char2 = char1
                    else:
                        char1 = char2

            else:
                print('unrecognized line: %s' % line, file=sys.stderr)
                sys.exit()


            seq1.append(char1)
            seq2.append(char2)

    #output the last
    phased_blocks.append('%s\t%s\t%s\t%s\t%s\t%s\n' % (lastPhase, lastScaf, lastPosition[1], right[1], lastPosition[2], right[2]))

    dnlabel = cmn.lastName(fn).replace('.vcf', '')
    sp = dnlabel.split('_')[1]
    dn = dnlabel + '_phased.fa'
    with open(dn, 'w') as dp:
        dp.write('>%s_ref_or_phase1\n' % sp)
        dp.write(''.join(seq1))
        dp.write('\n')
        dp.write('>%s_called_or_phase2\n' % sp)
        dp.write(''.join(seq2))
        dp.write('\n')


    dn = dnlabel + 'phased.blocks'
    cmn.write_file(''.join(phased_blocks), dn)
Exemple #20
0
        sys.exit()

    fhead = '/work/biophysics/mtang/SNP_calling/indexed_references/Junonia_v2_scaf.header'
    #fhead = 'Calycopis_cecrops_assembly_V1.1_scaf.header'
    print('loading header info...')
    headDict = {}
    with open(fhead) as fp:
        for i, line in enumerate(fp):
            scaf, index = line.strip().split()
            try:
                headDict[scaf].append(i)
            except KeyError:
                headDict[scaf] = [i]

    print('finish loading header, begin parsing fasta...')
    outdir = '%s_scafs' % cmn.lastName(fn)
    cmn.mkdir(outdir)

    seqDict = read_fa(fn)
    for scaf in headDict:
        indexes = headDict[scaf]
        new = []
        for name in seqDict:
            seq = seqDict[name]
            newSeq = ''.join([seq[i] for i in indexes])
            fasta = '>%s\n%s\n' % (name, newSeq)
            new.append(fasta)

        dn = '%s/%s.fa' % (outdir, scaf)
        cmn.write_file(''.join(new), dn)
Exemple #21
0
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Script entry point.  Prints one summary row for a VCF file: file name,
    # total line count, number of lines containing 'HaplotypeScore', number of
    # those also containing 'LowQual', and the two ratios
    # HaplotypeScore/total and LowQual/HaplotypeScore.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py vcf", file=sys.stderr)
        sys.exit()

    total = cmn.cmd2info('wc -l %s' % fn).split()[0]

    # The grep output is parked in "<fn>.tmp" so the LowQual count below only
    # scans the already-filtered lines; the temp file is removed afterwards.
    SNPs = cmn.cmd2info('grep HaplotypeScore %s > %s.tmp; wc -l %s.tmp' %
                        (fn, fn, fn)).split()[0]

    lowqual = cmn.cmd2info('grep LowQual %s.tmp|wc -l ; rm %s.tmp' %
                           (fn, fn)).split()[0]

    # An empty file or a file without matching lines gives a zero
    # denominator; report NaN for that ratio instead of crashing.
    nan = float('nan')
    snp_fraction = int(SNPs) / float(total) if int(total) else nan
    lowqual_fraction = int(lowqual) / float(SNPs) if int(SNPs) else nan

    print(cmn.lastName(fn), total, SNPs, lowqual,
          snp_fraction,
          lowqual_fraction)
Exemple #22
0
            seq = []
        else:
            seq.append(line.strip())

#last seq
seqDict[sp].append(''.join(seq))

new = []

for sp in seqDict:
    seq1, seq2 = seqDict[sp]
    diffN = sum([seq1[i] != seq2[i]
            for i in range(len(seq1))])

    if diffN < cutoff:
        if diffN == 0:
            seq = seq1
            defline = '%s_unique' % sp
        else:    
            seq = collapse_seqs(seq1, seq2)
            defline = '%s_diff%s' % (sp, diffN)
        fasta = '>%s\n%s\n' % (defline, seq)
    else:            
        #need to keep both copy
        label = '%s_diff%s' % (sp, diffN)
        fasta = '>%s_cp1\n%s\n>%s_cp2\n%s\n' % (label, seq1, label, seq2)
    new.append(fasta)

dn = '%s_collapse_cut%s.fa' % (cmn.lastName(fn).replace('.fa', '') , cutoff)
cmn.write_file(''.join(new), dn)
Exemple #23
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    #options=parse_options()
    try:
        fnlist, outdir = sys.argv[1:]
    except:
        print("Usage: *.py falist outdirName", file=sys.stderr)
        sys.exit()

    outlabel = cmn.lastName(fnlist)
    #scan through to see the total IDlist
    IDs = set([])
    fns = cmn.file2lines(fnlist)
    for fn in fns:
        with open(fn) as fp:
            for line in fp:
                if line[0] == '>':
                    ID = name2ID(line[1:].strip())
                    IDs.add(ID)

    #read in sequence and partition
    shift = 0
    final = {}
    setList = []
    for fn in fns:
Exemple #24
0
    adict = {}
    alist = []
    fastas = cmn.txt_read(fa).split('>')[1:]
    for each in fastas:
        lines = each.strip().split('\n')
        defline = lines[0]
        alist.append(defline)
        seq = ''.join(lines[1:])
        adict[defline] = seq
    return adict, alist


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Script entry point.  Rewrites a FASTA file with every sequence
    # upper-cased, keeping the record order returned by read_fa, and saves it
    # as "<input name>_4tree.fa".
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    seqDict, orderlist = read_fa(fn)

    new = ['>%s\n%s\n' % (name, seqDict[name].upper()) for name in orderlist]

    # NOTE(review): replace('.fa', '') also rewrites '.fasta' names
    # ('x.fasta' -> 'xsta'); left as is so existing output names do not change.
    dn = cmn.lastName(fn).replace('.fa', '') + '_4tree.fa'
    cmn.write_file(''.join(new), dn)
            continue

        if N == 0 or N == 1:
            #skip the all gapped positions
            #also skip same character lines
            continue
        else:
            for i, char in enumerate(chars):
                if char in missing_data:
                    result[i].append(-9)
                    continue

                try:
                    code = char_label[char]
                except KeyError:
                    code = current_count
                    char_label[char] = current_count
                    current_count += 1

                result[i].append(code)

    dn = cmn.lastName(fn) + 'STRUCTUREinput.txt'
    new = ['\t'.join(map(str, line)) for line in result]
    new.append('')
    cmn.write_lines(new, dn)

    print('number of loci: %s' % (len(line) - 1))



if __name__=='__main__':
    #options=parse_options()
    try:
        #fn, f_table = sys.argv[1:3]
        fn = sys.argv[1]
    except:
        print("Usage: *.py fqlist", file=sys.stderr)
        sys.exit()

    cmn.mkdir('tmpStat')

    IDlist = set([])
    fq_groups = {}
    for line in cmn.file2lines(fn):
        Id = cmn.lastName(line).split('_')[0]
        Id = Id.replace('NVG-', '').replace('11-BOA-','').replace('LEP-', 'LEP')
        IDlist.add(Id)
        fq = os.path.abspath(line)
        try:
            fq_groups[Id].append(fq)
        except KeyError:
            fq_groups[Id] = [fq]

    nameDict = get_names_4barcode()

    fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'
    #fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes.fasta'
    seqDict = read_fa(fall)
    fadd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/addedBaits_fromPipeline.fa'
    if cmn.filexist(fadd):
Exemple #27
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Script entry point.  Groups the fastq paths listed in argv[1] by sample
    # and builds, per sample, a shell command string that appends the fq2fa
    # output of each file to "<sample>.fa".  What is done with the command
    # strings is not visible in this excerpt.
    #fns = cmn.cmd2lines('ls ../0_libs/*/*.fq')
    #fns += cmn.cmd2lines('ls ../0_libs/*/*.fastq')
    #fns = cmn.file2lines('../fqlist')
    #fns = cmn.cmd2lines('ls /project/biophysics/Nick_lab/wli/sequencing/Eudamine/BEAST_timing/tmp_link_fastq/*q')
    fns = cmn.file2lines(sys.argv[1])

    #skip_list = set(['5316', '5721'])

    # sample (file name up to the first '_') -> list of fastq paths
    gdict = {}
    for fn in fns:
        #items = fn.split('/')
        #sp = items[-2]
        sp = cmn.lastName(fn).split('_')[0]
        try:
            gdict[sp].append(fn)
        except:
            # First file seen for this sample.
            gdict[sp] = [fn]

    formatcmds = '\n\n'
    for sp in gdict:
        #if sp in skip_list:
        #    continue

        cmd = ''
        # fns is rebound here to the files of the current sample.
        fns = gdict[sp]
        for fn in fns:
            # '>>' appends, so all files of a sample end up in one .fa file.
            cmd += 'fq2fa %s >> %s.fa; ' % (fn, sp)
        seq = ''.join(lines[1:])
        adict[defline] = seq
    return adict, len(seq)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    # Script entry point.  Writes the sequences returned by read_fa as a
    # phylip-style file "<input name>.phylip": a first line with the number of
    # sequences and the length reported by read_fa, followed by one
    # "<name>        <sequence>" line per record.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    seqDict, length = read_fa(fn)

    new = ['%s\t%s' % (len(seqDict), length)]
    new.extend('%s        %s' % (name, seqDict[name]) for name in seqDict)

    dn = cmn.lastName(fn) + '.phylip'
    cmn.write_lines(new, dn)


Exemple #29
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Script entry point.  Reads a FASTA file and writes ten copies of it into
    # "shuffle_genome/", each with the records in a fresh random order
    # ("<input name>_shuffle0" ... "_shuffle9").
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    # header line -> list of sequence lines.  Collecting every line keeps
    # wrapped (multi-line) records intact; previously only the last sequence
    # line of a record survived, and a blank line wiped the record.
    seq_parts = {}
    label = None
    starts_record = False
    with open(fn) as fp:
        for line in fp:
            if line[0] == '>':
                label = line.strip()
                starts_record = True
                continue

            seq = line.strip()
            if not seq:
                continue
            if label is None:
                print('sequence data before the first header in %s' % fn,
                      file=sys.stderr)
                sys.exit(1)

            if starts_record:
                # A repeated header replaces the earlier record, as before.
                seq_parts[label] = [seq]
                starts_record = False
            else:
                seq_parts[label].append(seq)

    adict = {name: '%s\n%s\n' % (name, ''.join(parts))
             for name, parts in seq_parts.items()}

    times = 10
    keys = list(adict.keys())
    cmn.mkdir('shuffle_genome')
    for each in range(times):
        # Shuffles in place, so each round permutes the previous order.
        random.shuffle(keys)
        new = [adict[key] for key in keys]
        dn = 'shuffle_genome/%s_shuffle%s' % (cmn.lastName(fn), each)
        cmn.write_file(''.join(new), dn)
Exemple #30
0

if __name__ == '__main__':
    # Script entry point.  Splits an alignment into one FASTA file per gene:
    # for every gene in the range file, the columns [i:j] of each sequence are
    # written to "<input name>_gene_fasta/<gene>.fa".
    #options=parse_options()
    try:
        fn, frange = sys.argv[1:]
    except:
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    # gene -> (i, j) pair, used below as a Python slice.
    geneRange = read_gene_range(frange)

    seqDict, order_list = read_fa(fn)

    # One "<gene>\t<length>" line per gene; not consumed within the lines
    # visible here.
    stat = []
    outdir = '%s_gene_fasta' % cmn.lastName(fn)
    cmn.mkdir(outdir)
    for gene in geneRange:
        i, j = geneRange[gene]
        print(gene, i, j)
        stat.append('%s\t%s\n' % (gene, j - i))

        dn = '%s/%s.fa' % (outdir, gene)
        with open(dn, 'w') as dp:
            for name in order_list:
                seq = seqDict[name][i:j]
                # Skip slices that are empty once '-' and then 'N' have been
                # trimmed from both ends.
                if seq.strip('-').strip('N') == '':
                    continue
                fasta = '>%s\n%s\n' % (name, seq)
                dp.write(fasta)