ファイル: toolshed.py プロジェクト: WHaasNY/LegioCluster
def run_docker_pull():
    """
    Downloads (pulls) all docker images (programs) in the list, except some
      that will be used after all individual samples have been run (Parsnp)
      or that are memory intensive (Kraken).
    return: None
    output: one 'docker pull <image>' command is run per selected program
    """
    base_cmd = 'docker pull '
    DO_IMAGES = config.get_DO_IMAGES()

    # each entry maps a program name to a sequence whose first element is
    # the name of the docker image to pull
    for program, image_data in DO_IMAGES.items():
        if program not in ('Kraken', 'Parsnp'):
            command = base_cmd + image_data[0]
            run_subprocess('', command, use_logging=False)
ファイル: run_Parsnp.py プロジェクト: WHaasNY/LegioCluster
last update: 24 September 2020                                

import os
import shutil
import re
import config
import toolshed

BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']
OUTPUT_dir = config.get_DO_PATHS()['OUTPUT_dir']
GENOMES_dir = config.get_DO_PATHS()['GENOMES_dir']

Parsnp_image, Parsnp_WorkingDir = config.get_DO_IMAGES()['Parsnp']
NU_image, NU_WorkingDir = config.get_DO_IMAGES()['Newick_utils']

##### housekeeping ############################################################

def sort_input(lo_phylo_tree_data):
    Converts a list of (sp_abbr, isolate, work_dir, ref_name), collected by 
      pipeline_master.py, into a dict of pipeline/ref/ : [(work_dir, isolate)] 
    param: list lo_phylo_tree_data = list of (pipeline, work_dir, reference, 
    return: dict do_seeds = dictionary of pipeline/ref/ : [(work_dir, isolate)]
ファイル: run_Kraken.py プロジェクト: WHaasNY/LegioCluster
    Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser
    Wadsworth Center, New York State Department of Health
    120 New Scotland Ave., Albany, New York 12208
    [email protected]
last update: 24 September 2020                                

import numpy as np
import config
import toolshed

BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
OUTPUT_dir = config.get_DO_PATHS()['OUTPUT_dir']

Kraken_image, Kraken_WorkingDir = config.get_DO_IMAGES()['Kraken']

##### running Kraken ##########################################################

def run_Kraken(work_dir):
    Runs Minikraken to classify contigs by species. Output is a number for the 
      classification and kmer counts, which needs to be translated into human-
      readable form.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    return: ReturnCode, StdOut, StdErr
    output: 'kraken_out.txt' file

    print('\nrunning: Kraken')
    Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser
    Wadsworth Center, New York State Department of Health
    120 New Scotland Ave., Albany, New York 12208
    [email protected]
last update: 24 September 2020                                

import config
import toolshed

BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']

Qualimap_image, Qualimap_WorkingDir = config.get_DO_IMAGES()['Qualimap']

def run_qualimap(work_dir, suffix):
    Runs Qualimap in a Docker container to assess the read mapping of an
      isolate.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping
    return: ReturnCode, StdOut, StdErr       

    print('\nrunning: Qualimap')

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + work_dir\
ファイル: run_Quast.py プロジェクト: WHaasNY/LegioCluster
    Wadsworth Center, New York State Department of Health
    120 New Scotland Ave., Albany, New York 12208
    [email protected]
last update: 24 September 2020                                

import os
import config
import toolshed

BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']
REF_dir = config.get_DO_PATHS()['REF_dir']

Quast_image, Quast_WorkingDir = config.get_DO_IMAGES()['Quast']

def run_quast(work_dir, SS_dir, ref_fa_file, check_seq_file):
    Runs Quast, a quality assessment tool for assemblies.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    param: str check_seq_file = name of a sequence file to be QC'd
    output: Quast generates a number of files that will be deposited in the 
            new 'temp/Quast/' folder

    print('\nrunning: Quast')
import subprocess as sub
import os
import config
import toolshed

BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir  = config.get_DO_PATHS()['TEMP_dir']
REF_dir   = config.get_DO_PATHS()['REF_dir'] 

Mash_image, Mash_WorkingDir = config.get_DO_IMAGES()['Mash']

##### house-keeping functions #################################################

def make_lo_genomes(active_folder, isolate=''):
    Returns a list with the path and name of all files in a genomes subfolder
    param: str active_folder = path to the folder with the reference genomes
    param: str isolate = isolate name, e.g.: 'IDR001234'
    return: list lo_genomes = all genome names present in that folder
    lo_genomes = []
ファイル: run_Freebayes.py プロジェクト: WHaasNY/LegioCluster
import matplotlib
# sets the backend to anti-grain geometry for .png output
# prevents RuntimeError: Invalid DISPLAY variable in Linux
import matplotlib.pyplot as plt
import os
from numpy import ceil
import config
import toolshed

BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']
REF_dir = config.get_DO_PATHS()['REF_dir']

Samtools_image, Samtools_WorkingDir = config.get_DO_IMAGES()['Samtools']
Freebayes_image, Freebayes_WorkingDir = config.get_DO_IMAGES()['Freebayes']
VCFlib_image, VCFlib_WorkingDir = config.get_DO_IMAGES()['VCFlib']

def run_samtools_faidx(work_dir, SS_dir, ref_fa_file):
    Generates a FAI index file, required for FreeBayes.
      Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    return: ReturnCode, StdOut, StdErr
    output: index files
ファイル: run_SPAdes.py プロジェクト: WHaasNY/LegioCluster
    [email protected]
last update: 24 September 2020                                

import config
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import toolshed

BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']

SPAdes_image, SPAdes_WorkingDir = config.get_DO_IMAGES()['SPAdes']

##### runs SPAdes and housekeeping functions ##################################

def run_spades(work_dir, THREADS, MEMORY, max_read_len):
    de novo genome assembler
      usage: spades.py [options] -o <out_dir>
      -o <out_dir>      directory to store all the resulting files (required)
      -1 <filename>     file with forward paired-end reads
      -2 <filename>     file with reverse paired-end reads
      -t <int>          number of threads. [default: 16]
      -m <int>          RAM limit for SPAdes in Gb (terminates if exceeded). 
                        [default: 250]
      -k <int,int,...>  Comma-separated list of k-mer sizes to be used for 
ファイル: run_BWA.py プロジェクト: WHaasNY/LegioCluster

import os
import numpy as np
import config
import toolshed

BASE_PATH   = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir    = config.get_DO_PATHS()['TEMP_dir']
REF_dir     = config.get_DO_PATHS()['REF_dir'] 

# docker images 
BWA_image, BWA_WorkingDir           = config.get_DO_IMAGES()['BWA']
Samtools_image, Samtools_WorkingDir = config.get_DO_IMAGES()['Samtools']
Picard_image, Picard_WorkingDir     = config.get_DO_IMAGES()['Picard']
BCFtools_image, BCFtools_WorkingDir = config.get_DO_IMAGES()['BCFtools']

def run_bwa_index(work_dir, SS_dir, ref_fa_file):

    Creates an index ('.amb', '.ann', '.bwt', '.pac', '.sa') for a fasta file.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    return: ReturnCode, StdOut, StdErr
    output: index files
ファイル: run_FastQC.py プロジェクト: WHaasNY/LegioCluster
    Wadsworth Center, New York State Department of Health
    120 New Scotland Ave., Albany, New York 12208
    [email protected]
last update: 24 September 2020                                

import zipfile
import os
import toolshed
import config

BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']

FastQC_image, FastQC_WorkingDir = config.get_DO_IMAGES()['FastQC']

def run_fastqc(work_dir, proc_reads):
    Runs FastQC on a (processed) read file.
    -d DIR   directory for temporary files when generating report images 
             (default: '?')
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str proc_reads = name of file with forward or reverse reads 
           processed by Trimmomatic
    output: FastQC files 'read_file_fastqc.html' and 'read_file_fastqc.zip'

    print('\nrunning: FastQC')
    [email protected]
last update: 24 September 2020

import os
import toolshed
import shutil
import config
from numpy import mean, std

# paths to the project's base folder and its temp/ and reference subfolders
BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']
REF_dir = config.get_DO_PATHS()['REF_dir']

# docker image and working directory for Trimmomatic
# NOTE(review): the original line was an unterminated call; the key
# 'Trimmomatic' follows the naming used for the other programs - confirm
# against config.get_DO_IMAGES()
Trimmomatic_image, Trimmomatic_WorkingDir = config.get_DO_IMAGES()['Trimmomatic']

##### remove reads with too many Gs in a row #######################################

def get_header_symbol(file):
    Returns the first character of the first line, which identifies the 
      header of a read, usually a "@".
    helper function to remove_poly_Gs()
    param: str file = name of the read file
    return: str CHAR = first character of the header

    with open(file, 'r') as infile: