Esempio n. 1
0
def build_filesystem():
    """Set up the project's directory handles as module-level globals.

    Assigns SCRATCH, BASEDIR, REFERENCES, DATA, TRIM_DIR, ASSEMBLY_DIR,
    ANNOTATION_DIR and MAP_DIR. The per-user scratch location is derived
    from the ``USER`` environment variable, so a ``KeyError`` is raised when
    that variable is not set.

    NOTE(review): ``Dir.make`` / ``make_subdir`` presumably create the
    directory on disk when it is missing -- confirm against the ``Dir`` class.
    """
    global SCRATCH, BASEDIR, REFERENCES
    global DATA, TRIM_DIR, ASSEMBLY_DIR, ANNOTATION_DIR, MAP_DIR

    # Per-user scratch space.
    user = os.environ['USER']
    SCRATCH = Dir.make(f'/scratch/{user}')

    # Project root and the reference genomes stored beneath it.
    BASEDIR = Dir('/Strong/proj/.data/Project_NTM')
    reference_path = BASEDIR.join("lib", "reference_genomes")
    REFERENCES = Dir(reference_path)

    # Data tree: one sub-directory per pipeline stage.
    # (A "00_raw" sub-directory was once planned here but is disabled.)
    DATA = BASEDIR.make_subdir("data")
    TRIM_DIR = DATA.make_subdir('trimmed_reads')
    ASSEMBLY_DIR = DATA.make_subdir('assemblies')
    ANNOTATION_DIR = DATA.make_subdir('annotations')
    MAP_DIR = DATA.make_subdir('mapped_reads')
Esempio n. 2
0
def update_taxa(species_threshold=0.97, genus_threshold=0.80):
    """Rename each sample's trimmed reads and assembly to carry its taxon.

    For every ``.csv`` file in the current run's ``ANI`` directory, the best
    ANI hit decides the taxon label:

    * best hit ``>= species_threshold`` -> that hit's ``taxon`` value;
    * otherwise any hit ``>= genus_threshold`` -> ``'NTM'``;
    * otherwise ``'UNKNOWN'``.

    The sample's R1/R2 trimmed-read files and its assembly are then renamed
    to include that label, and each rename is logged. A failure on one ANI
    file is logged as a warning and does not stop the remaining files.

    Args:
        species_threshold: Minimum ANI for a species-level assignment.
        genus_threshold: Minimum ANI for the generic ``'NTM'`` assignment.

    NOTE(review): assumes ``Dir.files(..., dataframe=True)`` returns a pandas
    DataFrame with ``filename`` and ``path`` columns, and that
    ``File.rename`` updates ``File.filename`` -- confirm against those
    classes.
    """
    logger = generic_logger('update_taxa.csv')

    run = Dir().dirname
    trim_dir = Dir(BASEDIR.join('data', 'trimmed_reads', run))
    assembly_dir = Dir(BASEDIR.join('data', 'assemblies', run))
    ani_dir = Dir(assembly_dir.join('ANI'))

    trimmed_reads = trim_dir.files(endswith="fq", dataframe=True)
    assemblies = assembly_dir.files(endswith='fna', dataframe=True)
    ani = ani_dir.files(endswith='.csv')

    # The read-direction masks do not depend on the sample; compute them once.
    is_r1 = trimmed_reads['filename'].str.contains('_R1', regex=False)
    is_r2 = trimmed_reads['filename'].str.contains('_R2', regex=False)

    for file in ani:
        try:
            # Highest ANI first, so row 0 of any filtered view is the best hit.
            df = pd.read_csv(file.path).sort_values('ani', ascending=False)

            # Use item access: ``row.sample`` would return the bound
            # ``Series.sample`` method instead of the column value.
            sample_name = str(df.iloc[0]['sample'])

            # Literal (non-regex) matching so characters such as '.' or '+'
            # in a sample name are not interpreted as a pattern.
            reads_for_sample = trimmed_reads['filename'].str.contains(
                sample_name, regex=False)
            assembly_for_sample = assemblies['filename'].str.contains(
                sample_name, regex=False)

            # ``iloc[0]`` raises IndexError when nothing matches; the handler
            # below logs it and moves on to the next sample.
            trim1 = File(trimmed_reads[reads_for_sample & is_r1].iloc[0]['path'])
            trim2 = File(trimmed_reads[reads_for_sample & is_r2].iloc[0]['path'])
            assembly = File(assemblies[assembly_for_sample].iloc[0]['path'])

            # ASSIGN TAXON
            taxon = 'UNKNOWN'
            possible_species = df[df['ani'] >= species_threshold]
            if not possible_species.empty:
                taxon = possible_species.iloc[0]['taxon']
            elif (df['ani'] >= genus_threshold).any():
                taxon = 'NTM'

            trim1_filename = trim1.filename
            trim2_filename = trim2.filename
            assembly_filename = assembly.filename

            # Keep the read direction in the new names: giving both mates the
            # same name would make the second rename clobber the first.
            # NOTE(review): confirm the `_R1`/`_R2` placement matches the
            # naming convention used by downstream steps.
            trim1.rename(f'{sample_name}_{taxon}_R1.fq.gz')
            trim2.rename(f'{sample_name}_{taxon}_R2.fq.gz')
            assembly.rename(f'{sample_name}_{taxon}_000.fna')

            logger.info(f'renamed {trim1_filename} to {trim1.filename}')
            logger.info(f'renamed {trim2_filename} to {trim2.filename}')
            logger.info(f'renamed {assembly_filename} to {assembly.filename}')

        except Exception as e:
            # Deliberate best-effort: one bad sample must not abort the run.
            logger.warning(e)
Esempio n. 3
0
def configure():
    """Populate the module-level project configuration globals.

    Sets:
        BASEDIR: ``Dir`` for the project root.
        REFERENCES: ``Dir`` for ``lib/reference_genomes`` under BASEDIR.
        REFERENCE_GENOMES: taxon code -> reference FASTA file name. Clinical
            entries come first, followed by environmental ones. ``MBOL`` and
            ``MINT`` reuse the ``MAB`` and ``MCHIM`` reference files
            respectively.
        SPECIES_GROUPS: group name -> list of member taxon codes.
    """
    global BASEDIR, REFERENCES, REFERENCE_GENOMES, SPECIES_GROUPS

    clinical = {
        'MAB': 'MAB.ATCC19977.fasta',
        'MBOL': 'MAB.ATCC19977.fasta',
        'MAV': 'MAV.HOM.H87.fasta',
        'MMAS': 'MMAS.BRAPA42FWDG01.fasta',
        'MCHIM': 'MCHIM.CDC2015-22-71.fasta',
        'MINT': 'MCHIM.CDC2015-22-71.fasta',
        'MCHE': 'MCHE.ATCC19237.fasta',
        'MTB': 'MTB.H37RV.fasta',
    }
    environmental = {
        'MAROS': 'MAROS.DSM45069.fasta',
        'MASIA': 'MASIA.DSM44297.fasta',
        'MBOUCH': 'MBOUCH.DSM45439.fasta',
        'MBOV': 'MBOV.AF2122.fasta',
        'MCANE': 'MCANE.CIPT140070017.fasta',
        'MCHUB': 'MCHUB.NBB4.fasta',
        'MCOLOM': 'MCOLOM.CECT3035.fasta',
        'MELE': 'MELE.DSM44368.fasta',
        'MFORT': 'MFORT.CT6.fasta',
        'MFRANK': 'MFRANK.DSM45524.fasta',
        'MGILV': 'MGILV.SPYR1.fasta',
        'MGORD': 'MGORD.DSM44160.fasta',
        'MHAEM': 'MHAEM.DSM44634.fasta',
        'MIMMU': 'MIMMU.CCUG47286T.fasta',
        'MINDP': 'MINDP.MTCC9506.fasta',
        'MIRAN': 'MIRAN.DSM45541.fasta',
        'MKAN': 'MKAN.ATCC12478.fasta',
        'MKUB': 'MKUB.CIP106428.fasta',
        'MLENT': 'MLENT.CSURP1491.fasta',
        'MLEPR': 'MLEPR.TN.fasta',
        'MLIFL': 'MLIFL.128FXT.fasta',
        'MMANT': 'MMANT.DSM45255.fasta',
        'MMARI': 'MMARI.M.fasta',
        'MMARS': 'MMARS.DSM45437.fasta',
        'MMUCO': 'MMUCO.CSURP2099.fasta',
        'MNEOA': 'MNEOA.VKMAC-1815D.fasta',
        'MPORC': 'MPORC.CSURP1564.fasta',
        'MRHOD': 'MRHOD.NBB3.fasta',
        'MSALM': 'MSALM.D16Q15.fasta',
        'MSENE': 'MSENE.NCTC4524.fasta',
        'MSIMI': 'MSIMI.ATCC25275.fasta',
        'MSMEG': 'MSMEG.MC2155.fasta',
        'MTERR': 'MTERR.NCTC10856.fasta',
        'MTIM': 'MTIM.CCUG56329.fasta',
        'MTRIP': 'MTRIP.DSM44626.fasta',
        'MULCE': 'MULCE.AGY99.fasta',
        'MVANB': 'MVANB.PYR-1.fasta',
        'MVUL': 'MVUL.DSM45247T.fasta',
        'MXENO': 'MXENO.RIVM700367.fasta',
        'MYONG': 'MYONG.05-1390.fasta',
        'NFARC': 'NFARC.NCTC3000.fasta',
    }
    mac_members = ['MAV', 'MCHIM', 'MINT', 'MTIM', 'MBOUCH', 'MMARS']
    mab_members = ['MAB', 'MBOL', 'MMAS']

    BASEDIR = Dir('/Strong/proj/.data/ProjectNTM')
    reference_path = BASEDIR.join("lib", "reference_genomes")
    REFERENCES = Dir(reference_path)

    # The two tables share no keys, so merging keeps the clinical entries
    # first and the environmental entries after them, in declaration order.
    REFERENCE_GENOMES = {**clinical, **environmental}
    SPECIES_GROUPS = {'MAC': mac_members, 'MAB': mab_members}