def _concatenate_lanes(first, second, destination, logger):
    """Concatenate two lane files into ``destination``; return True on success."""
    logger.info(
        f"Concatenating {first.filename} and {second.filename} as {destination}"
    )
    try:
        # Run cat synchronously with an argument list: the exit status is
        # checked before "...PASS" is logged, and no shell parses the paths.
        with open(str(destination), 'wb') as combined:
            subprocess.run(['cat', str(first), str(second)],
                           stdout=combined,
                           check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        logger.warning(f"...FAIL: {e}")
        return False
    logger.info("...PASS")
    return True


def combine_lanes(directory=None):
    """Concatenate the L001 and L002 lane files of every FASTQ pair.

    For each pair whose read 1 is on lane ``L001`` and whose ``L002``
    counterpart exists on disk, read 1 and read 2 are each concatenated
    (L001 followed by L002) into a file whose name has ``L001`` replaced by
    ``LCAT``. Output and the ``combine_lanes.log`` log file go to the ``tmp``
    subdirectory of ``BASEDIR`` named after the input directory.

    Args:
        directory: Directory holding the FASTQ files. Defaults to the working
            directory at call time.

    Failures are logged and never raised; nothing is returned.
    """
    # Author's original note, kept verbatim (meaning not evident from the code):
    # "REPORT: L001_R1 counts L002_R1 counts CF RULES 1e9 in one lane"
    directory = Dir(os.getcwd() if directory is None else directory)
    temp = BASEDIR.make_subdir('tmp', directory.dirname)
    logger = generic_logger(temp.join('combine_lanes.log'))
    pairs = Fastq.get_pairs(directory)
    logger.info(f"Combining lanes of {len(pairs)} pairs in {directory}")

    for pair in pairs:
        try:
            if pair.pair1.lane != "L001":
                continue
            if not os.path.isfile(pair.pair1.path.replace("L001", "L002")):
                continue

            # Resolve every input and output first, so that a missing L002
            # read-2 file aborts the pair before anything is written.
            jobs = []
            for lane1 in (pair.pair1, pair.pair2):
                lane2 = Fastq(lane1.path.replace("L001", "L002"))
                combined = temp.join(lane1.filename.replace("L001", "LCAT"))
                jobs.append((lane1, lane2, combined))

            for lane1, lane2, combined in jobs:
                _concatenate_lanes(lane1, lane2, combined, logger)
        except Exception as e:
            logger.warning(f'Combining Failed: {e}')
def build_filesystem():
    """Set up the pipeline directory objects and publish them as module globals.

    Assigns SCRATCH, BASEDIR, REFERENCES, DATA, TRIM_DIR, ASSEMBLY_DIR,
    ANNOTATION_DIR and MAP_DIR. The scratch location is derived from the
    ``USER`` environment variable.
    """
    global SCRATCH, BASEDIR, REFERENCES, DATA
    global TRIM_DIR, ASSEMBLY_DIR, ANNOTATION_DIR, MAP_DIR

    # Per-user scratch space.
    SCRATCH = Dir.make('/scratch/' + os.environ['USER'])

    # Project root and the reference genome library beneath it.
    BASEDIR = Dir('/Strong/proj/.data/Project_NTM')
    REFERENCES = Dir(BASEDIR.join("lib", "reference_genomes"))

    # One subdirectory of "data" per pipeline stage.
    DATA = BASEDIR.make_subdir("data")
    TRIM_DIR, ASSEMBLY_DIR, ANNOTATION_DIR, MAP_DIR = [
        DATA.make_subdir(stage)
        for stage in ('trimmed_reads', 'assemblies', 'annotations',
                      'mapped_reads')
    ]
class Config:
    """Central pipeline configuration: directories, naming, logging, references.

    All attributes are class-level constants. The directory attributes are
    evaluated when the class body runs, i.e. at import time.
    """

    # DIRECTORY STRUCTURE
    BASE_DIR = Dir('/Strong/proj/.data/Project_NTM')
    REFERENCES = Dir(BASE_DIR.join("lib", "reference_genomes"))
    DATA = BASE_DIR.make_subdir("data")
    TRIM_DIR = DATA.make_subdir("trimmed_reads")
    ASSEMBLY_DIR = DATA.make_subdir("assemblies")
    ANNOTATION_DIR = DATA.make_subdir("annotations")
    MAP_DIR = DATA.make_subdir("mapped_reads")

    # NAMING CONVENTIONS
    REPORTS = "reports"
    ARCHIVE = "archive"
    FAILED = "failed"
    LOGS = "logs"

    # REPORTS
    TRIM_STATS = "trim_stats.csv"
    ASSEMBLY_STATS = "assembly_stats.csv"

    # LOGGING
    LOG_LEVEL = logging.DEBUG
    LOG_FILE = BASE_DIR.join('log', 'pipeline.log')

    # REFERENCES
    # Maps a taxon code to its reference FASTA file name. Some codes share a
    # reference (MBOL uses the MAB file, MINT uses the MCHIM file).
    REFERENCE_GENOMES = {
        # Clinical
        'MAB': 'MAB.ATCC19977.fasta',
        'MBOL': 'MAB.ATCC19977.fasta',
        'MAV': 'MAV.HOM.H87.fasta',
        'MMAS': 'MMAS.BRAPA42FWDG01.fasta',
        'MCHIM': 'MCHIM.CDC2015-22-71.fasta',
        'MINT': 'MCHIM.CDC2015-22-71.fasta',
        'MCHE': 'MCHE.ATCC19237.fasta',
        'MTB': 'MTB.H37RV.fasta',
        # Environmental
        'MAROS': 'MAROS.DSM45069.fasta',
        'MASIA': 'MASIA.DSM44297.fasta',
        'MBOUCH': 'MBOUCH.DSM45439.fasta',
        'MBOV': 'MBOV.AF2122.fasta',
        'MCANE': 'MCANE.CIPT140070017.fasta',
        'MCHUB': 'MCHUB.NBB4.fasta',
        'MCOLOM': 'MCOLOM.CECT3035.fasta',
        'MELE': 'MELE.DSM44368.fasta',
        'MFORT': 'MFORT.CT6.fasta',
        'MFRANK': 'MFRANK.DSM45524.fasta',
        'MGILV': 'MGILV.SPYR1.fasta',
        'MGORD': 'MGORD.DSM44160.fasta',
        'MHAEM': 'MHAEM.DSM44634.fasta',
        'MIMMU': 'MIMMU.CCUG47286T.fasta',
        'MINDP': 'MINDP.MTCC9506.fasta',
        'MIRAN': 'MIRAN.DSM45541.fasta',
        'MKAN': 'MKAN.ATCC12478.fasta',
        'MKUB': 'MKUB.CIP106428.fasta',
        'MLENT': 'MLENT.CSURP1491.fasta',
        'MLEPR': 'MLEPR.TN.fasta',
        'MLIFL': 'MLIFL.128FXT.fasta',
        'MMANT': 'MMANT.DSM45255.fasta',
        'MMARI': 'MMARI.M.fasta',
        'MMARS': 'MMARS.DSM45437.fasta',
        'MMUCO': 'MMUCO.CSURP2099.fasta',
        'MNEOA': 'MNEOA.VKMAC-1815D.fasta',
        'MPORC': 'MPORC.CSURP1564.fasta',
        'MRHOD': 'MRHOD.NBB3.fasta',
        'MSALM': 'MSALM.D16Q15.fasta',
        'MSENE': 'MSENE.NCTC4524.fasta',
        'MSIMI': 'MSIMI.ATCC25275.fasta',
        'MSMEG': 'MSMEG.MC2155.fasta',
        'MTERR': 'MTERR.NCTC10856.fasta',
        'MTIM': 'MTIM.CCUG56329.fasta',
        'MTRIP': 'MTRIP.DSM44626.fasta',
        'MULCE': 'MULCE.AGY99.fasta',
        'MVANB': 'MVANB.PYR-1.fasta',
        'MVUL': 'MVUL.DSM45247T.fasta',
        'MXENO': 'MXENO.RIVM700367.fasta',
        'MYONG': 'MYONG.05-1390.fasta',
        'NFARC': 'NFARC.NCTC3000.fasta'
    }

    # Maps a group code to the taxon codes that belong to it.
    SPECIES_GROUPS = {'MAC': ['MAV', 'MCHIM', 'MINT']}

    @classmethod
    def declare_globals(cls):
        """Publish this configuration as the module-level globals.

        Sets BASEDIR, REFERENCES, REFERENCE_GENOMES and SPECIES_GROUPS from
        the corresponding class attributes. Previously the method only
        declared the names as global without assigning them, so calling it
        had no effect.
        """
        global BASEDIR, REFERENCES, REFERENCE_GENOMES, SPECIES_GROUPS
        BASEDIR = cls.BASE_DIR
        REFERENCES = cls.REFERENCES
        REFERENCE_GENOMES = cls.REFERENCE_GENOMES
        SPECIES_GROUPS = cls.SPECIES_GROUPS
def __init__(self):
    """Look up every reference FASTA under ``BASEDIR/lib/reference_genomes``.

    NOTE(review): the resulting listing is bound to a local only and is not
    stored on ``self`` -- confirm whether that is intended.
    """
    reference_dir = Dir(BASEDIR.join("lib", "reference_genomes"))
    references = Fasta.get_all(reference_dir)
def identify(isolate, delimiter="_", species_threshold=0.97, genus_threshold=0.80):
    """Assign a taxon to an isolate by comparing its assembly to each reference.

    Runs the external ANI perl script (with blastall/formatdb) once per
    reference FASTA found in ``REFERENCES``, records one row per reference,
    writes the rows sorted by descending ANI to
    ``<assembly dir>/ANI/<isolate>_ANI.csv`` and stores that file on
    ``isolate.files.ani``.

    Args:
        isolate: Isolate whose ``files.assembly`` is compared; its ``taxon``
            attribute is set on success.
        delimiter: Separator used to cut the taxon code off the front of a
            reference id.
        species_threshold: Minimum ANI (as a fraction) for the best-scoring
            reference's taxon to be assigned.
        genus_threshold: Minimum ANI (as a fraction) for the generic ``'NTM'``
            label when no reference reaches ``species_threshold``.

    Returns:
        The assigned taxon (a reference taxon code, ``'NTM'`` or
        ``'UNKNOWN'``), or ``None`` if identification failed; failures are
        logged on the isolate rather than raised.
    """
    isolate.log('Identifying Isolate')
    assembly = isolate.files.assembly

    ani_script = "/Strong/proj/.data/Morty/.config/software/ani-script/ANI.pl"
    blastall = "/software/cgeh/blast/2.2.22/bin/blastall"
    formatdb = "/software/cgeh/blast/2.2.22/bin/formatdb"

    try:
        # COMPARE TO REFERENCES
        matches = []
        references = Fasta.get_all(REFERENCES)
        isolate.log(
            f"{isolate}: IDENTIFYING TAXON USING {len(references)} REFERENCES",
            lvl='INFO')
        for reference in references:
            ref_id = reference.filename.split('.')[0]
            scratch = SCRATCH.make_subdir('ani', isolate.name,
                                          f"{isolate}_vs_{ref_id}")
            # Argument list instead of str.split(), so that a path containing
            # whitespace stays a single argument.
            command = [
                'perl', ani_script,
                '-bl', blastall,
                '-fd', formatdb,
                '-qr', str(assembly),
                '-sb', str(reference),
                '-od', str(scratch),
            ]
            output, _ = subprocess.Popen(
                command, stdout=subprocess.PIPE).communicate()
            try:
                # presumably the script prints a percentage -- TODO confirm
                ani = float(output) / 100
            except (ValueError, TypeError):
                # Unparseable or empty output counts as no similarity.
                ani = 0
            matches.append({
                'ani': ani,
                'sample': isolate.name,
                'reference': ref_id,
                'taxon': ref_id.split(delimiter)[0].split('-')[0]
            })

        # WRITE CSV
        ani_csv = Dir.make(assembly.dir.join("ANI")).join(f"{isolate}_ANI.csv")
        df = pd.DataFrame.from_records(matches)
        df = df.sort_values('ani', ascending=False)
        df.to_csv(ani_csv, index=False)
        isolate.files.ani = File(ani_csv)

        # ASSIGN TAXON
        taxon = 'UNKNOWN'
        possible_species = df[df['ani'] >= species_threshold]
        if len(possible_species) > 0:
            # Rows are sorted by descending ANI, so row 0 is the best hit.
            taxon = possible_species.iloc[0]['taxon']
        elif len(df[df['ani'] >= genus_threshold]) > 0:
            taxon = 'NTM'
        isolate.taxon = taxon
        isolate.log(f"taxon={isolate.taxon}", lvl='INFO')
        return taxon
    except Exception as e:
        isolate.log(f"Identification failed: {e}", lvl='WARNING')
    finally:
        # Remove the log files left in the working directory. This used to
        # sit after the early return and so never ran on success.
        for leftover in ("error.log", "formatdb.log"):
            try:
                os.remove(leftover)
            except OSError:
                pass
def process_directory(directory=None):
    """Submit one LSF pipeline job for every FASTQ pair in ``directory``.

    Args:
        directory: Directory to scan for FASTQ pairs. Defaults to the working
            directory at call time (the previous ``os.getcwd()`` default was
            evaluated once, when the module was imported).
    """
    if directory is None:
        directory = os.getcwd()
    # Plain loop: the submissions are side effects, not a list to keep.
    for pair in Fastq.get_pairs(Dir(directory)):
        LSF.bsub(f"{sys.argv[0]} pipeline {pair.pair1} {pair.pair2}")
def configure():
    """Initialise the module-level configuration globals.

    Sets BASEDIR, REFERENCES, REFERENCE_GENOMES (taxon code -> reference FASTA
    file name) and SPECIES_GROUPS (group code -> member taxon codes).
    """
    global BASEDIR, REFERENCES, REFERENCE_GENOMES, SPECIES_GROUPS

    clinical = {
        'MAB': 'MAB.ATCC19977.fasta',
        'MBOL': 'MAB.ATCC19977.fasta',
        'MAV': 'MAV.HOM.H87.fasta',
        'MMAS': 'MMAS.BRAPA42FWDG01.fasta',
        'MCHIM': 'MCHIM.CDC2015-22-71.fasta',
        'MINT': 'MCHIM.CDC2015-22-71.fasta',
        'MCHE': 'MCHE.ATCC19237.fasta',
        'MTB': 'MTB.H37RV.fasta',
    }
    environmental = {
        'MAROS': 'MAROS.DSM45069.fasta',
        'MASIA': 'MASIA.DSM44297.fasta',
        'MBOUCH': 'MBOUCH.DSM45439.fasta',
        'MBOV': 'MBOV.AF2122.fasta',
        'MCANE': 'MCANE.CIPT140070017.fasta',
        'MCHUB': 'MCHUB.NBB4.fasta',
        'MCOLOM': 'MCOLOM.CECT3035.fasta',
        'MELE': 'MELE.DSM44368.fasta',
        'MFORT': 'MFORT.CT6.fasta',
        'MFRANK': 'MFRANK.DSM45524.fasta',
        'MGILV': 'MGILV.SPYR1.fasta',
        'MGORD': 'MGORD.DSM44160.fasta',
        'MHAEM': 'MHAEM.DSM44634.fasta',
        'MIMMU': 'MIMMU.CCUG47286T.fasta',
        'MINDP': 'MINDP.MTCC9506.fasta',
        'MIRAN': 'MIRAN.DSM45541.fasta',
        'MKAN': 'MKAN.ATCC12478.fasta',
        'MKUB': 'MKUB.CIP106428.fasta',
        'MLENT': 'MLENT.CSURP1491.fasta',
        'MLEPR': 'MLEPR.TN.fasta',
        'MLIFL': 'MLIFL.128FXT.fasta',
        'MMANT': 'MMANT.DSM45255.fasta',
        'MMARI': 'MMARI.M.fasta',
        'MMARS': 'MMARS.DSM45437.fasta',
        'MMUCO': 'MMUCO.CSURP2099.fasta',
        'MNEOA': 'MNEOA.VKMAC-1815D.fasta',
        'MPORC': 'MPORC.CSURP1564.fasta',
        'MRHOD': 'MRHOD.NBB3.fasta',
        'MSALM': 'MSALM.D16Q15.fasta',
        'MSENE': 'MSENE.NCTC4524.fasta',
        'MSIMI': 'MSIMI.ATCC25275.fasta',
        'MSMEG': 'MSMEG.MC2155.fasta',
        'MTERR': 'MTERR.NCTC10856.fasta',
        'MTIM': 'MTIM.CCUG56329.fasta',
        'MTRIP': 'MTRIP.DSM44626.fasta',
        'MULCE': 'MULCE.AGY99.fasta',
        'MVANB': 'MVANB.PYR-1.fasta',
        'MVUL': 'MVUL.DSM45247T.fasta',
        'MXENO': 'MXENO.RIVM700367.fasta',
        'MYONG': 'MYONG.05-1390.fasta',
        'NFARC': 'NFARC.NCTC3000.fasta',
    }

    # NOTE(review): other definitions in this file spell the project root
    # "Project_NTM" (with an underscore) -- confirm which path is correct.
    BASEDIR = Dir('/Strong/proj/.data/ProjectNTM')
    REFERENCES = Dir(BASEDIR.join("lib", "reference_genomes"))

    # Clinical entries first, then environmental; the two key sets are disjoint.
    REFERENCE_GENOMES = {**clinical, **environmental}

    SPECIES_GROUPS = {
        'MAC': ['MAV', 'MCHIM', 'MINT', 'MTIM', 'MBOUCH', 'MMARS'],
        'MAB': ['MAB', 'MBOL', 'MMAS'],
    }
def _first_matching_path(listing, *fragments):
    """Return ``path`` of the first row whose filename contains every fragment."""
    selected = listing
    for fragment in fragments:
        # regex=False: fragments are literal text, not patterns.
        selected = selected[selected.filename.str.contains(fragment,
                                                           regex=False)]
    return selected.iloc[0]['path']


def update_taxa(species_threshold=0.97, genus_threshold=0.80):
    """Re-derive each sample's taxon from its ANI table and rename its files.

    The run is taken from the name of the current directory. For every CSV in
    the run's ``assemblies/<run>/ANI`` directory, the taxon is assigned with
    the same rule as ``identify`` (best reference at or above
    ``species_threshold``, else ``'NTM'`` if any reference reaches
    ``genus_threshold``, else ``'UNKNOWN'``). The sample's two trimmed-read
    files and its assembly are then renamed to carry that taxon.

    Args:
        species_threshold: Minimum ANI (as a fraction) to assign a reference's
            taxon.
        genus_threshold: Minimum ANI (as a fraction) to assign ``'NTM'``.

    Per-sample failures are logged as warnings and do not stop the loop.
    """
    logger = generic_logger('update_taxa.csv')
    run = Dir().dirname
    trim_dir = Dir(BASEDIR.join('data', 'trimmed_reads', run))
    assembly_dir = Dir(BASEDIR.join('data', 'assemblies', run))
    ani_dir = Dir(assembly_dir.join('ANI'))
    trimmed_reads = trim_dir.files(endswith="fq", dataframe=True)
    assemblies = assembly_dir.files(endswith='fna', dataframe=True)

    for file in ani_dir.files(endswith='.csv'):
        try:
            # Best hit first, so iloc[0] below is the highest-ANI reference.
            df = pd.read_csv(file.path).sort_values('ani', ascending=False)
            # Index by key: attribute access (.sample) resolves to the
            # Series.sample method rather than the "sample" column.
            sample_name = str(df.iloc[0]['sample'])

            trim1 = File(_first_matching_path(trimmed_reads, sample_name, '_R1'))
            trim2 = File(_first_matching_path(trimmed_reads, sample_name, '_R2'))
            assembly = File(_first_matching_path(assemblies, sample_name))

            # ASSIGN TAXON
            taxon = 'UNKNOWN'
            possible_species = df[df['ani'] >= species_threshold]
            if len(possible_species) > 0:
                taxon = possible_species.iloc[0]['taxon']
            elif len(df[df['ani'] >= genus_threshold]) > 0:
                taxon = 'NTM'

            trim1_filename = trim1.filename
            trim2_filename = trim2.filename
            assembly_filename = assembly.filename

            # Each read gets its own target name; previously both reads were
            # renamed to the same file, so read 2 replaced read 1.
            # NOTE(review): the "_R1"/"_R2" suffix placement is assumed --
            # confirm against the project's naming convention.
            trim1.rename(f'{sample_name}_{taxon}_R1.fq.gz')
            trim2.rename(f'{sample_name}_{taxon}_R2.fq.gz')
            assembly.rename(f'{sample_name}_{taxon}_000.fna')

            logger.info(f'renamed {trim1_filename} to {trim1.filename}')
            logger.info(f'renamed {trim2_filename} to {trim2.filename}')
            logger.info(f'renamed {assembly_filename} to {assembly.filename}')
        except Exception as e:
            logger.warning(e)
""" beagles[compbio] alma: grep , E-ALMA1.MAV_vs_NJH87.cf | more CP018363.1 83972 C T,G 0 34 34 SNP CP018363.1 148672 A G,C 0 14 14 SNP CP018363.1 1381815 G A,C 0 8 8 SNP CP018363.1 2334023 G A,T 0 30 30 SNP CP018363.1 2948951 T C,G 0 6 6 SNP CP018363.1 3621547 T C,G 0 35 35 SNP CP018363.1 4243092 A G,C 0 33 33 SNP CP018363.1 4562156 T C,G 0 15 15 SNP """ VCF_DIR = Dir("/Strong/proj/.data/alma") FASTA_DIR = VCF_DIR.make_subdir("fasta") MATRIX = FASTA_DIR.join("ALMA_matrix_N.fna") CSV_OUT = FASTA_DIR.join("ALMA_stats.csv") CSV_MUTATIONS = FASTA_DIR.join("ALMA_mutations.csv") POSITIONS = 5626623 files = VCF_DIR.files(endswith='.cf') records = [] multi_alleles = {} print(f"Building Matrix: {MATRIX} from {len(files)} files.") for i, file in enumerate(files): print(f"\t{i:02d} | {file.filename}") isolate = file.filename.split("_vs_")[0]