def cleanup():
    '''
    Removes the "nucmer.error" file (Mac only).
    nucmer or mummer, which is run as part of quast, creates an error file in
      the Py_code folder. This function deletes that file if it is present.
      That seems to be a problem only on the Mac, not on Linux.
    return: None
    output: deletes the file <BASE_PATH><PY_dir>nucmer.error if it exists
    '''
    # Build the full path exactly once. The file name must be appended only a
    # single time; appending it both to the folder name and again in the
    # exists()/remove() calls yields '...nucmer.errornucmer.error', which
    # never matches the real file.
    # NOTE: BASE_PATH and the 'PY_dir' entry are expected to end with a path
    # separator (e.g. 'Py_code/'), as plain concatenation is used here.
    error_file = BASE_PATH + config.get_DO_PATHS()['PY_dir'] + 'nucmer.error'

    # Only remove the file when it is actually there, so that the call is a
    # harmless no-op on systems where nucmer does not create it.
    if os.path.exists(error_file):
        os.remove(error_file)
Created on Thu Jul 5 16:07:55 2018 @authors: Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser Wadsworth Center, New York State Department of Health 120 New Scotland Ave., Albany, New York 12208 [email protected] last update: 24 September 2020 """ import pydot from heapq import heappop, heappush import config BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] ##### generate the minimum spanning tree from distance data ################### def make_graph(lo_concat_pairwise_diffs): ''' Turns a list of [(G1, G2, V1),(G2, G1, V1),...] tuples into a graph, which is in this case a dictionary of dictionaries. G1 and G2 are the Genomes (isolates), V1 (Value) is the number of either SNPs or mutation events. param: list lo_concat_pairwise_diffs = list of (G1, G2, V1) and its inverse, (G2, G1, V1), which are both needed by prim_mst() return: a graph (a dict of dict), input for prim_mst() '''
@authors: Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser Wadsworth Center, New York State Department of Health 120 New Scotland Ave., Albany, New York 12208 [email protected] last update: 24 September 2020 """ import os import shutil import re import config import toolshed BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] OUTPUT_dir = config.get_DO_PATHS()['OUTPUT_dir'] GENOMES_dir = config.get_DO_PATHS()['GENOMES_dir'] Parsnp_image, Parsnp_WorkingDir = config.get_DO_IMAGES()['Parsnp'] NU_image, NU_WorkingDir = config.get_DO_IMAGES()['Newick_utils'] ##### housekeeping ############################################################ def sort_input(lo_phylo_tree_data): ''' Converts a list of (sp_abbr, isolate, work_dir, ref_name), collected by pipeline_master.py, into a dict of pipeline/ref/ : [(work_dir, isolate)] items.
@authors: Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser Wadsworth Center, New York State Department of Health 120 New Scotland Ave., Albany, New York 12208 [email protected] last update: 24 September 2020 """ import numpy as np import config import toolshed BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] OUTPUT_dir = config.get_DO_PATHS()['OUTPUT_dir'] Kraken_image, Kraken_WorkingDir = config.get_DO_IMAGES()['Kraken'] ##### running Kraken ########################################################## def run_Kraken(work_dir): ''' Runs Minikraken to classify contigs by species. Output is a number for the classification and kmer counts, which needs to translated into human- readable form. param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/' return: ReturnCode, StdOut, StdErr output: 'kraken_out.txt' file
@authors: Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser Wadsworth Center, New York State Department of Health 120 New Scotland Ave., Albany, New York 12208 [email protected] last update: 24 September 2020 """ import LegioCluster_main import config import os import sys # set the paths to various folders BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] INPUT_dir = config.get_DO_PATHS()['INPUT_dir'] VCF_dir = config.get_DO_PATHS()['VCF_dir'] GENOMES_dir = config.get_DO_PATHS()['GENOMES_dir'] READS_dir = config.get_DO_PATHS()['READS_dir'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] OUTPUT_dir = config.get_DO_PATHS()['OUTPUT_dir'] REF_dir = config.get_DO_PATHS()['REF_dir'] ##### various checks ########################################################## # this is purely a check to see that these modules are available import matplotlib import numpy import pydot
(also <reference_SNP_cons.txt> if not already present) @authors: Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser Wadsworth Center, New York State Department of Health 120 New Scotland Ave., Albany, New York 12208 [email protected] last update: 24 September 2020 """ import csv import os import config BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] REF_dir = config.get_DO_PATHS()['REF_dir'] VCF_dir = config.get_DO_PATHS()['VCF_dir'] ##### reading the reference fasta file ######################################## def read_ref_file(SS_dir, ref_fa_file): ''' Reads a fasta file with the sequence of the reference genome (consisting of one or more contigs) and returns a list of headers and sequences. param: str SS_dir = species-specific directory, e.g.: 'Lpn/' param: str ref_fa_file = name of a reference strain's FASTA file return: list lo_contigs = list of headers and sequences, e.g.: [[NODE_1_length_6526_cov_26.4, 'ACTTGTACTAATTGGCTGATTGTTGACATAA...'],
import stat from numpy import median # dict of abbreviations : [species name, genome length] DO_SPECIES = config.get_DO_SPECIES() # list of species set up to run the pipeline LO_PIPELINES = config.get_LO_SP_ABBR() # folder where all files are located #BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] #BASE_PATH = os.path.dirname(os.getcwd()) # parent folder to current folder BASE_PATH = os.getcwd() + '/' # current folder GENOMES_dir = config.get_DO_PATHS()['GENOMES_dir'] # 'Genomes/', PY_dir = config.get_DO_PATHS()['PY_dir'] # 'Py_code/', REF_dir = config.get_DO_PATHS()['REF_dir'] # 'References/', VCF_dir = config.get_DO_PATHS()['VCF_dir'] # 'VCF_files/', READS_dir = config.get_DO_PATHS()['READS_dir'] # 'reads/' ##### 1. user input ########################################################### def get_species_name():