コード例 #1
0
ファイル: snp.py プロジェクト: unlhcc/staphb_toolkit
def snp(jobs, cpu_job, outdir, reference, tracker):
    """Run the SNP pipeline stages, skipping any stage the tracker marks done.

    Stages: hard-link trimmed reads into per-sample folders, run the CFSAN
    SNP pipeline, then build and export the SNP tree.
    NOTE: the status key 'initalize' is misspelled but kept as-is — it is
    persisted by the tracker, so renaming it would break resumed runs.
    """
    # Stage 1: per-sample subfolders of hard links for the CFSAN pipeline.
    if not tracker.check_status('initalize'):
        trimmed_dir = os.path.join(outdir, 'trimmed')
        checkexists(os.path.join(outdir, 'snp_reads'))
        for pair in list(getfiles(trimmed_dir))[0]:
            sample_id = os.path.basename(pair[0]).split('_')[0]
            sample_dir = os.path.join(outdir, 'snp_reads', sample_id)
            checkexists(sample_dir)
            # hard-link both mates into the sample's folder
            os.link(pair[0], os.path.join(sample_dir, os.path.basename(pair[0])))
            os.link(pair[1], os.path.join(sample_dir, os.path.basename(pair[1])))
        tracker.update_status_done('initalize')

    # Stage 2: CFSAN SNP pipeline; publish the distance matrix at top level.
    if not tracker.check_status('cfsan'):
        cfsan_snp(outdir, reference, jobs * cpu_job)
        matrix_src = os.path.join(outdir, 'cfsan', 'snp_distance_matrix.tsv')
        matrix_dst = os.path.join(outdir, 'snp_distance_matrix.tsv')
        shutil.copyfile(matrix_src, matrix_dst)
        tracker.update_status_done('cfsan')

    # Stage 3: phylogeny; copy the resulting tree file to a stable name.
    if not tracker.check_status('snp_tree'):
        tree_file = build_tree(outdir)
        shutil.copyfile(os.path.join(outdir, 'snp_tree', tree_file),
                        os.path.join(outdir, 'snp_tree.tree'))
        tracker.update_status_done('snp_tree')
コード例 #2
0
ファイル: core_genome.py プロジェクト: unlhcc/staphb_toolkit
def assemble_reads(jobs, cpu_job, outdir):
    """Assemble each trimmed read pair with shovill, `jobs` pairs in parallel.

    Reads trimmed fastq pairs from <outdir>/trimmed, writes one assembly per
    sample to <outdir>/assemblies/<sid>/<sid>.fa and appends container
    stdout to <outdir>/assembly.log. All read pairs must live in a single
    directory (it is bind-mounted into the container as /data); otherwise
    the process exits.
    """
    logfile = os.path.join(outdir, 'assembly.log')
    input_path = os.path.join(outdir, 'trimmed')

    # split the currently-available RAM (GB) evenly across parallel jobs
    free_ram = int(psutil.virtual_memory()[1] / 1000000000)
    ram_job = int(free_ram / jobs)

    assemblies_path = os.path.join(outdir, "assemblies")
    checkexists(assemblies_path)

    # get trimmed read pairs
    fastqs = list(getfiles(input_path))[0]
    cmds = []
    read_path = ''
    for read_pair in fastqs:
        read1 = os.path.basename(read_pair[0])
        read2 = os.path.basename(read_pair[1])
        # sample id = fastq basename up to the first underscore
        sid = read1.split('_')[0]

        cmds.append('bash -c \"shovill --R1 /data/{0} --R2 /data/{1} --outdir /output/{2} --cpus {3} --ram {4} && mv /output/{2}/contigs.fa /output/{2}/{2}.fa \"'.format(read1,read2,sid,cpu_job,ram_job))

        # all pairs must share one directory — it becomes the /data mount
        pair_dir = os.path.dirname(os.path.abspath(read_pair[1]))
        if read_path == '':
            read_path = pair_dir
        elif read_path != pair_dir:
            print("Reads cannot be in multiple locations. Exiting.")
            sys.exit()

    print("Beginning assembly of reads:\n Number of Jobs: {0}\n CPUs/Job: {1}".format(jobs, cpu_job))

    # denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Assembly\n')
        # context manager closes and joins the pool (the original leaked
        # worker processes by never calling close()/join())
        with mp.Pool(processes=jobs) as pool:
            results = pool.starmap_async(
                cd.call,
                [['staphb/shovill:1.0.4', cmd, '/data',
                  {read_path: "/data", assemblies_path: "/output"}]
                 for cmd in cmds])
            stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        # denote end of logs
        outlog.write('***********\n')
    print("Finished Assembling Reads")
コード例 #3
0
ファイル: snp.py プロジェクト: unlhcc/staphb_toolkit
def build_tree(outdir, model='GTR+G'):
    """Build a phylogeny from the CFSAN SNP alignment with iqtree.

    Copies <outdir>/cfsan/snpma.fasta into a fresh <outdir>/snp_tree
    directory, runs iqtree in a container (with bootstrapping when more
    than 3 isolates are present), logs to <outdir>/tree.log, and returns
    the name of the tree file produced inside snp_tree/.
    """
    logfile = os.path.join(outdir, 'tree.log')
    input_path = os.path.join(outdir, "cfsan")

    # start from a clean tree directory
    shutil.rmtree(os.path.join(outdir, 'snp_tree'), ignore_errors=True)
    cg_path = os.path.join(outdir, "snp_tree")
    checkexists(cg_path)
    alignment = os.path.join(cg_path, 'snpma.fasta')
    shutil.copyfile(os.path.join(input_path, 'snpma.fasta'), alignment)

    # count isolates (FASTA header lines); startswith() tolerates blank
    # lines, where the original `line[0]` indexing would raise IndexError
    with open(alignment, 'r') as infasta:
        isolate_counter = sum(1 for line in infasta if line.startswith('>'))

    # iqtree command: bootstrap (-bb 1000) only makes sense with >3 taxa
    if isolate_counter > 3:
        command = "sh -c 'iqtree -s snpma.fasta -m {0} -bb 1000 '".format(
            model)
        tree_file = 'snpma.fasta.contree'
    else:
        command = "sh -c 'iqtree -s snpma.fasta -m {0} '".format(model)
        tree_file = 'snpma.fasta.treefile'

    print("Building the Phylogeny")

    # denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Building Tree\n')
        stdout = cd.call('staphb/iqtree:1.6.7',
                         command,
                         '/data', {cg_path: "/data"},
                         sig_default=False)
        outlog.write('-----------\n')
        outlog.write(stdout)
        # denote end of logs
        outlog.write('***********\n')
    print("Finished Building Phylogeny")
    return tree_file
コード例 #4
0
ファイル: core_genome.py プロジェクト: unlhcc/staphb_toolkit
def annotate_assemblies(jobs, cpu_job, outdir):
    """Annotate each assembly with prokka, `jobs` assemblies in parallel.

    Reads assemblies from <outdir>/assemblies, writes prokka output to
    <outdir>/annotated/<sid>/, appends container stdout to
    <outdir>/annotation.log, then moves every .gff one directory up so
    they sit directly under <outdir>/annotated.
    """
    logfile = os.path.join(outdir, 'annotation.log')
    input_path = os.path.join(outdir, "assemblies")
    annotated_path = os.path.join(outdir, "annotated")
    checkexists(annotated_path)

    # get assemblies
    fastas = list(getfiles(input_path))[0]
    # one prokka command per assembly
    cmds = []
    for path in fastas:
        assembly_file = os.path.basename(path)
        assembly_dir = os.path.basename(os.path.dirname(path))
        # sample id = assembly basename up to the first dot
        sid = assembly_file.split('.')[0]
        cmds.append('prokka --compliant --outdir /output/{0} --prefix {1} --cpus {2} {3}/{4}'.format(sid,sid,cpu_job,assembly_dir,assembly_file))

    print("Beginning annotation of assemblies:\n Number of Jobs: {0}\n CPUs/Job: {1}".format(jobs, cpu_job))

    # denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Annotating\n')
        # context manager closes and joins the pool (the original leaked
        # worker processes by never calling close()/join())
        with mp.Pool(processes=jobs) as pool:
            results = pool.starmap_async(
                cd.call,
                [['staphb/prokka:1.14.0', cmd, '/data',
                  {input_path: "/data", annotated_path: "/output"}]
                 for cmd in cmds])
            stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        # denote end of logs
        outlog.write('***********\n')

    # move finished .gff files up one directory level
    for root, dirs, files in os.walk(annotated_path):
        for file in files:
            if '.gff' in file:
                current_path = os.path.join(root, file)
                destination_path = os.path.join(os.path.dirname(root), file)
                os.rename(current_path, destination_path)

    print("Finished Annotating Assemblies")
コード例 #5
0
def q_trim(reads, jobs, cpu, outdir, tracker):
    """Quality-trim read pairs with Trimmomatic, `jobs` pairs in parallel.

    Writes paired trimmed fastqs to <outdir>/trimmed (unpaired *U.fastq.gz
    outputs are deleted), appends container stdout to <outdir>/qtrim.log,
    and marks the 'trimmed' stage done on the tracker. All read pairs must
    live in a single directory (it is bind-mounted into the container as
    /data); otherwise the process exits.
    """
    # trimming parameters: sliding window of 4 bases at Q30, min length 100
    minlength = 100
    windowsize = 4
    qscore = 30
    logfile = os.path.join(outdir, 'qtrim.log')

    cmds = []
    read_path = ''
    for read_pair in reads:
        # main command
        main_cmd = 'java -jar /Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads {0}'.format(
            cpu)

        # sample id = fastq basename up to the first underscore
        sid = os.path.basename(read_pair[0]).split('_')[0]

        args = ' {read1} {read2} -baseout /output/{sid}.fastq.gz SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}'.format(
            minlength=minlength,
            windowsize=windowsize,
            qscore=qscore,
            read1=os.path.basename(read_pair[0]),
            read2=os.path.basename(read_pair[1]),
            sid=sid)

        cmds.append(main_cmd + args)

        # all pairs must share one directory — it becomes the /data mount
        pair_dir = os.path.dirname(os.path.abspath(read_pair[1]))
        if read_path == '':
            read_path = pair_dir
        elif read_path != pair_dir:
            print("Reads cannot be in multiple locations. Exiting.")
            sys.exit()
    checkexists(os.path.join(outdir, "trimmed"))

    print(
        "Beginning quality trimming of reads:\n Number of Jobs: {0}\n CPUs/Job: {1}"
        .format(jobs, cpu))
    # denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Trimmomatic\n')
        # context manager closes and joins the pool (the original leaked
        # worker processes by never calling close()/join())
        with mp.Pool(processes=jobs) as pool:
            results = pool.starmap_async(cd.call, [[
                'staphb/trimmomatic:0.39', cmd, '/data', {
                    read_path: "/data",
                    os.path.join(outdir, 'trimmed'): "/output"
                }
            ] for cmd in cmds])
            stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        # denote end of logs
        outlog.write('***********\n')

    # remove unpaired reads left behind by Trimmomatic's -baseout naming
    for root, dirs, files in os.walk(os.path.join(outdir, 'trimmed')):
        for file in files:
            if "U.fastq.gz" in file:
                os.remove(os.path.join(root, file))

    print("Finished Quality Trimming Reads")

    # update status
    tracker.update_status_done('trimmed')