def build_filesystem():
    """Create/bind the project directory layout as module-level globals.

    Sets SCRATCH, BASEDIR, REFERENCES and DATA, plus one global per
    pipeline-stage subdirectory of DATA (TRIM_DIR, ASSEMBLY_DIR,
    ANNOTATION_DIR, MAP_DIR).

    Raises:
        KeyError: if the ``USER`` environment variable is not set.
    """
    global SCRATCH, BASEDIR, REFERENCES, DATA
    global TRIM_DIR, ASSEMBLY_DIR, ANNOTATION_DIR, MAP_DIR

    # Per-user scratch area.
    user = os.environ['USER']
    SCRATCH = Dir.make(f'/scratch/{user}')

    # Project root and the reference-genome library beneath it.
    # NOTE(review): configure() in this file uses '.../ProjectNTM' (no
    # underscore) as its base directory -- confirm which spelling is correct.
    BASEDIR = Dir('/Strong/proj/.data/Project_NTM')
    reference_path = BASEDIR.join("lib", "reference_genomes")
    REFERENCES = Dir(reference_path)

    # Data root and its per-stage subdirectories.
    DATA = BASEDIR.make_subdir("data")
    # A raw-reads subdirectory is currently disabled:
    # RAW_DIR = DATA.make_subdir("00_raw")
    TRIM_DIR = DATA.make_subdir('trimmed_reads')
    ASSEMBLY_DIR = DATA.make_subdir('assemblies')
    ANNOTATION_DIR = DATA.make_subdir('annotations')
    MAP_DIR = DATA.make_subdir('mapped_reads')
def _assign_taxon(df, species_threshold, genus_threshold):
    """Return the taxon label implied by one sample's ANI table.

    The best (highest-ANI) row at or above ``species_threshold`` supplies the
    taxon; otherwise 'NTM' if any row reaches ``genus_threshold``; otherwise
    'UNKNOWN'. Rows whose ``ani`` is NaN never satisfy either threshold.
    """
    species_hits = df[df['ani'] >= species_threshold]
    if not species_hits.empty:
        # Sort descending so the first row is the closest reference, not the
        # weakest match that merely clears the threshold.
        best = species_hits.sort_values('ani', ascending=False).iloc[0]
        return best['taxon']
    if (df['ani'] >= genus_threshold).any():
        return 'NTM'
    return 'UNKNOWN'


def _first_matching_path(files, *fragments):
    """Return ``path`` of the first row whose filename contains every fragment.

    Fragments are matched literally (not as regular expressions).

    Raises:
        LookupError: if no row matches all fragments.
    """
    matches = files
    for fragment in fragments:
        matches = matches[
            matches['filename'].str.contains(fragment, regex=False)]
    if matches.empty:
        raise LookupError(f'no file found matching {fragments}')
    return matches.iloc[0]['path']


def update_taxa(species_threshold=0.97, genus_threshold=0.80):
    """Rename a run's trimmed reads and assembly to include the assigned taxon.

    The run name is taken from ``Dir().dirname``. For every ``.csv`` file in
    the run's ``assemblies/<run>/ANI`` directory, the sample name is read from
    the first row's ``sample`` column (assumes every row of one file refers to
    the same sample -- TODO confirm), the matching R1/R2 trimmed reads and
    assembly are located by filename, a taxon is chosen from the ANI values,
    and the three files are renamed. Each rename is logged; any failure for a
    file is logged as a warning and processing continues with the next file.

    Args:
        species_threshold: minimum ANI for taking the taxon of the best match.
        genus_threshold: minimum ANI for falling back to the label 'NTM'.
    """
    logger = generic_logger('update_taxa.csv')
    run = Dir().dirname
    trim_dir = Dir(BASEDIR.join('data', 'trimmed_reads', run))
    assembly_dir = Dir(BASEDIR.join('data', 'assemblies', run))
    ani_dir = Dir(assembly_dir.join('ANI'))
    trimmed_reads = trim_dir.files(endswith="fq", dataframe=True)
    assemblies = assembly_dir.files(endswith='fna', dataframe=True)

    for file in ani_dir.files(endswith='.csv'):
        try:
            df = pd.read_csv(file.path)
            # Use item access: attribute access (`row.sample`) resolves to the
            # pandas `Series.sample` method instead of the 'sample' column.
            sample_name = str(df.iloc[0]['sample'])

            trim1 = File(_first_matching_path(
                trimmed_reads, sample_name, '_R1'))
            trim2 = File(_first_matching_path(
                trimmed_reads, sample_name, '_R2'))
            assembly = File(_first_matching_path(assemblies, sample_name))

            taxon = _assign_taxon(df, species_threshold, genus_threshold)

            # Keep the read-direction tag in the new names so R1 and R2 do not
            # collide on the same target filename (and remain discoverable by
            # the '_R1'/'_R2' lookup above on a later run).
            renames = (
                (trim1, f'{sample_name}_{taxon}_R1.fq.gz'),
                (trim2, f'{sample_name}_{taxon}_R2.fq.gz'),
                (assembly, f'{sample_name}_{taxon}_000.fna'),
            )
            for target, new_name in renames:
                old_name = target.filename
                target.rename(new_name)
                logger.info(f'renamed {old_name} to {target.filename}')
        except Exception as e:
            # Best-effort per file: record which ANI table failed and move on.
            logger.warning(f'{file.path}: {e}')
def configure():
    """Bind the project paths and reference-genome lookup tables as globals.

    Sets BASEDIR and REFERENCES (directory objects), REFERENCE_GENOMES
    (taxon code -> reference FASTA filename) and SPECIES_GROUPS (group code
    -> list of member taxon codes).
    """
    global BASEDIR, REFERENCES, REFERENCE_GENOMES, SPECIES_GROUPS

    # NOTE(review): build_filesystem() in this file uses '.../Project_NTM'
    # (with an underscore) as its base directory -- confirm which is correct.
    BASEDIR = Dir('/Strong/proj/.data/ProjectNTM')
    reference_path = BASEDIR.join("lib", "reference_genomes")
    REFERENCES = Dir(reference_path)

    # Clinical taxa. MBOL shares the MAB reference file and MINT shares the
    # MCHIM reference file.
    clinical = {
        "MAB": "MAB.ATCC19977.fasta",
        "MBOL": "MAB.ATCC19977.fasta",
        "MAV": "MAV.HOM.H87.fasta",
        "MMAS": "MMAS.BRAPA42FWDG01.fasta",
        "MCHIM": "MCHIM.CDC2015-22-71.fasta",
        "MINT": "MCHIM.CDC2015-22-71.fasta",
        "MCHE": "MCHE.ATCC19237.fasta",
        "MTB": "MTB.H37RV.fasta",
    }

    # Environmental taxa.
    environmental = {
        "MAROS": "MAROS.DSM45069.fasta",
        "MASIA": "MASIA.DSM44297.fasta",
        "MBOUCH": "MBOUCH.DSM45439.fasta",
        "MBOV": "MBOV.AF2122.fasta",
        "MCANE": "MCANE.CIPT140070017.fasta",
        "MCHUB": "MCHUB.NBB4.fasta",
        "MCOLOM": "MCOLOM.CECT3035.fasta",
        "MELE": "MELE.DSM44368.fasta",
        "MFORT": "MFORT.CT6.fasta",
        "MFRANK": "MFRANK.DSM45524.fasta",
        "MGILV": "MGILV.SPYR1.fasta",
        "MGORD": "MGORD.DSM44160.fasta",
        "MHAEM": "MHAEM.DSM44634.fasta",
        "MIMMU": "MIMMU.CCUG47286T.fasta",
        "MINDP": "MINDP.MTCC9506.fasta",
        "MIRAN": "MIRAN.DSM45541.fasta",
        "MKAN": "MKAN.ATCC12478.fasta",
        "MKUB": "MKUB.CIP106428.fasta",
        "MLENT": "MLENT.CSURP1491.fasta",
        "MLEPR": "MLEPR.TN.fasta",
        "MLIFL": "MLIFL.128FXT.fasta",
        "MMANT": "MMANT.DSM45255.fasta",
        "MMARI": "MMARI.M.fasta",
        "MMARS": "MMARS.DSM45437.fasta",
        "MMUCO": "MMUCO.CSURP2099.fasta",
        "MNEOA": "MNEOA.VKMAC-1815D.fasta",
        "MPORC": "MPORC.CSURP1564.fasta",
        "MRHOD": "MRHOD.NBB3.fasta",
        "MSALM": "MSALM.D16Q15.fasta",
        "MSENE": "MSENE.NCTC4524.fasta",
        "MSIMI": "MSIMI.ATCC25275.fasta",
        "MSMEG": "MSMEG.MC2155.fasta",
        "MTERR": "MTERR.NCTC10856.fasta",
        "MTIM": "MTIM.CCUG56329.fasta",
        "MTRIP": "MTRIP.DSM44626.fasta",
        "MULCE": "MULCE.AGY99.fasta",
        "MVANB": "MVANB.PYR-1.fasta",
        "MVUL": "MVUL.DSM45247T.fasta",
        "MXENO": "MXENO.RIVM700367.fasta",
        "MYONG": "MYONG.05-1390.fasta",
        "NFARC": "NFARC.NCTC3000.fasta",
    }

    # Clinical entries first, then environmental; the two sets share no keys.
    REFERENCE_GENOMES = {**clinical, **environmental}

    mac_members = ["MAV", "MCHIM", "MINT", "MTIM", "MBOUCH", "MMARS"]
    mab_members = ["MAB", "MBOL", "MMAS"]
    SPECIES_GROUPS = {"MAC": mac_members, "MAB": mab_members}