Exemple #1
0
def parse_scaf_length(line):
    """Pull the scaffold name and its length out of one contig header line.

    Example input (taken from the original author's note):
        ##contig=<ID=scaffold1_len454574_cov98,length=458273>

    Returns a single-entry dict whose key is the text found between 'ID='
    and ',length' and whose value is the integer found between ',length='
    and '>'.
    """
    name = cmn.find_between(line, 'ID=', ',length')
    size = int(cmn.find_between(line, ',length=', '>'))
    return {name: size}
def check_difference(seq1, seq2):
    """Count the differences between two sequences.

    Both lengths are printed first. When the lengths match, the sequences
    are walked in lockstep and a column counts as a difference only if the
    two characters differ and neither one is in the module-level
    ``gapChars``. When the lengths differ, both sequences are written to
    tmpSeq1.fa / tmpSeq2.fa, compared with blastn, and the result is the
    denominator minus the numerator of the "Identities = a/b (" field in
    the blastn output.

    Relies on the names ``gapChars`` and ``ID`` being defined at module
    level by the surrounding script.
    """
    print(len(seq1), len(seq2))

    if len(seq1) != len(seq2):
        # Unequal lengths: hand the pair to blastn.
        cmn.write_file(seq1, 'tmpSeq1.fa')
        cmn.write_file(seq2, 'tmpSeq2.fa')
        blast_out = cmn.cmd2info('blastn -query tmpSeq1.fa -subject tmpSeq2.fa')

        # The field being parsed looks like: Identities = 656/656 (100%)
        ratio = cmn.find_between(blast_out, 'Identities = ', ' (')
        matched, aligned = [int(part) for part in ratio.split('/')]

        # Keep the raw blastn report next to the run, tagged with ID.
        cmn.write_file(blast_out, 'checkTmp%s.br' % (ID))
        return aligned - matched

    # Equal lengths: compare column by column, skipping gap columns.
    mismatches = 0
    for base1, base2 in zip(seq1, seq2):
        if base1 in gapChars or base2 in gapChars:
            continue
        if base1 != base2:
            mismatches += 1
    return mismatches
                    #different chars and not a gap
                    seq.append('X')
        fasta = '>%s\n%s\n' % (Id, ''.join(seq))
        refBaseDict[Id] = ''.join(seq)
        new.append(fasta)

    cmn.write_file(''.join(new), 'sum_barcodes.fa')

    #cmn.run('rm -r sampleRun_fake')

    #check denovo pipeline one
    # Gather the de novo barcode of every sampleRun_* directory, keyed by the
    # text between 'sampleRun_' and '/' in the path, and write them all to
    # sum_denovo.fa.
    fns = cmn.cmd2lines('ls sampleRun_*/denovo_barcode.fa')
    denovoDict = {}
    new = []
    for fn in fns:
        Id = cmn.find_between(fn, 'sampleRun_', '/')
        lines = cmn.file2lines(fn)
        # Everything after the first line is joined into one sequence string
        # (presumably the first line is the FASTA header -- confirm).
        seq = ''.join(lines[1:])
        # Compare the sequence LENGTH with 658. The previous `seq > 658`
        # compared a str with an int, which raises TypeError on Python 3.
        # 658 is presumably the expected barcode length -- confirm.
        if len(seq) > 658:
            # If dropping every 'N' lands exactly on 658, keep that version.
            tmp = seq.replace('N', '')
            if len(tmp) == 658:
                seq = tmp
        denovoDict[Id] = seq

        fasta = '>%s\n%s\n' % (Id, seq)
        new.append(fasta)

    cmn.write_file(''.join(new), 'sum_denovo.fa')

    new = []
    for Id in clean_lines:
Exemple #4
0
    #options=parse_options()
    # The single command-line argument is the directory the grep below
    # searches. Only a missing argument (IndexError) triggers the usage
    # message; a bare except would also swallow KeyboardInterrupt/SystemExit.
    try:
        wdir = sys.argv[1]
    except IndexError:
        print("Usage: *.py t100_highQ", file=sys.stderr)
        sys.exit()

    cmd = 'grep "Estimated Ln Prob of Data" %s/*/r*/*.log' % wdir

    lines = cmn.cmd2lines(cmd)
    print('\n'.join(lines))

    rdict = {}   # K -> list of the float found at the end of each line
    countK = {}  # K -> number of lines seen for that K
    for line in lines:
        # K is the integer between 'structureK' and the next '/' in the line.
        K = int(cmn.find_between(line, 'structureK', '/'))
        # The value of interest is the last whitespace-separated token.
        lnL = float(line.strip().split()[-1])
        rdict.setdefault(K, []).append(lnL)
        countK[K] = countK.get(K, 0) + 1

    # Sorted list of the K values that were seen.
    keys = sorted(rdict)
    for K in keys:
Exemple #5
0
if __name__=='__main__':
    #options=parse_options()
    # Two positional arguments are required. Unpacking the slice raises
    # ValueError when fewer than two were given, so catch exactly that
    # instead of using a bare except.
    try:
        odir, f_ass = sys.argv[1:3]
    except ValueError:
        print("Usage: *.py filelist assembly_v0.fa", file=sys.stderr)
        print("you should index assembly_v0.fa first with -p assembly_v0", file=sys.stderr)
        print("using command /home2/wli/local/bwa-0.7.12/bwa index ", file=sys.stderr)
        sys.exit()

    #fns = cmn.cmd2lines('ls %s/*.fq' % odir)
    fns = cmn.getid(odir)

    # Group the inputs by label; each group is processed on its own below.
    group_dict = separate_by_label(fns)

    # Assembly label: the text between 'assembly_' and '.fa' in the last
    # path component of the assembly file name.
    ass_label = cmn.find_between(cmn.lastName(f_ass), 'assembly_', '.fa')

    cmn.mkdir('job_files')
    cmn.mkdir('cmd_files')

    for plabel in group_dict:
        print('processing lib %s' % plabel)
        each = group_dict[plabel]
        #also parse the files inside this function
        #return the file name after parsing
        paired, unpaired = separate_by_pair(plabel, each)
        # separate_by_pair signals "nothing to do" with paired=None.
        if paired is None:
            continue
        label = '%s_%s' % (plabel, ass_label)
        #index_label = cmn.lastName(f_ass).replace('.fa', '')
        # NOTE(review): str.replace removes every '.fa' occurrence in the
        # path, not only the trailing extension.
        index_label = f_ass.replace('.fa', '')
Exemple #6
0
    # Read a file of links (first command-line argument), fetch each one and
    # pull the barcode sequence and the record name out of the returned text.
    # The species handling continues past the end of this excerpt.
    try:
        fn=sys.argv[1]
    except:
        print("Usage: *.py link.file", file=sys.stderr)
        sys.exit()


    dn = 'retrieved_barcodes.fa'

    # NOTE(review): opened without a with-block; no close() is visible in
    # this excerpt -- confirm the handle is closed further down.
    dp = open(dn, 'w')
    for link in cmn.file2lines(fn):
        # Lines starting with '#' are skipped.
        # NOTE(review): an empty line would raise IndexError on link[0] --
        # confirm cmn.file2lines never yields one.
        if link[0] == '#':
            continue
        print('processing ' + link)
        info = cmn.link2info(link)
        # Take the text inside the generateBarcode ('#barcodeImg_...'); call
        # and keep what follows its last single quote.
        seq = cmn.find_between(info, "generateBarcode ('#barcodeImg_", "');").split('\'')[-1]
        # Each wanted value sits on the line AFTER its label, so a flag is
        # raised on the label line and consumed on the following line.
        takeName = False
        takeSp = False
        for line in info.split('\n'):
            if 'Sequence ID' in line:
                takeName = True
                continue
            if '<td>Species:</td>' in line:
                takeSp = True
                continue

            if takeName:
                takeName = False
                # Example of the line being parsed:
                #<td style="width:160px;">ANICE505-10.COI-5P</td>
                # Keep the text between '>' and '<', cut at '.COI'.
                name = cmn.find_between(line, '>', '<').split('.COI')[0]
            if takeSp:
def scaf2numb(scaf):
    """Return, as an int, the text found between 'scaffold' and '_cov' in *scaf*."""
    return int(cmn.find_between(scaf, 'scaffold', '_cov'))
Exemple #8
0
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn
import requests as rq

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    #options=parse_options()
    # The single command-line argument is the voucher code to look up.
    # Only a missing argument (IndexError) triggers the usage message.
    try:
        ID = sys.argv[1]
    except IndexError:
        print("Usage: *.py SRNPnumber", file=sys.stderr)
        sys.exit()

    url = 'http://janzen.sas.upenn.edu/Wadults/resultsexpressVOUCHB.lasso'

    # Despite the variable name this is passed via data=, so requests sends
    # it as a form-encoded POST body, not as JSON.
    json = {'submitButtonName': 'SUBMIT', 'voucher': ID}

    r = rq.post(url, data=json)

    # Use the decoded text body: r.content is bytes on Python 3, and the
    # str markers here and the str membership test below do not work on it.
    sp = cmn.find_between(r.text, 'species:<b>', '</b>').strip()

    # Print only when the extracted text does not contain '<title>'
    # (presumably that marks a page without a species hit -- confirm
    # against cmn.find_between's behaviour on a missing marker).
    if '<title>' not in sp:
        print(sp)
Exemple #9
0
        print("Usage: *.py t100_highQ", file=sys.stderr)
        sys.exit()

    # List the files matching structure*f two directory levels below wdir.
    cmd = 'ls %s/*/r*/structure*f' % wdir

    fns = cmn.cmd2lines(cmd)
    print(fns)

    outdir = 'harvest_%s' % wdir
    cmn.mkdir(outdir)

    # Build one "cp" command per file and group the commands by K.
    cmd_dict = {}
    for fn in fns:
        # Path layout, e.g. cov_3/structureK10/r0/structure.output_f:
        # K comes from the 'structureK<K>/' component, the replicate name is
        # the directory that directly contains the file.
        K = cmn.find_between(fn, 'structureK', '/')
        rep = fn.split('/')[-2]
        dn = '%s/out_K%s_%s_f' % (outdir, K, rep)
        cmd = 'cp %s %s' % (fn, dn)
        cmd_dict.setdefault(K, []).append(cmd)

    # Copy only the K values that have at least three files.
    for K, cmds in cmd_dict.items():
        if len(cmds) >= 3:
            for cmd in cmds:
                cmn.run(cmd)
        else:
            print('insufficent replicates for K=%s, skip' % K)
    except:
        print("Usage: *.py vcf", file=sys.stderr)
        sys.exit()

    length_dict = {}

    current_scaf = ''
    seqDict = {}
    order_list = []
    with open(fn) as fp:
        for line in fp:
            line = line.strip()

            # ##contig=<ID=scaffold1_cov51,length=30279>
            if line.startswith('##contig='):
                scaf = cmn.find_between(line, '<ID=', ',')
                length = int(cmn.find_between(line, ',length=', '>'))
                length_dict[scaf] = length

            if line[0] != '#':
                items = line.strip().split()
                scaf = items[0]
                if scaf != current_scaf:
                    order_list.append(scaf)
                    #start a new scaffold
                    expect_index = 1
                    current_scaf = scaf
                else:
                    expect_index += 1

                index = int(items[1])