def __init__(self, workbench, data_path="/home/moritz/DataBases/genomes/RefSeq/", clean=False):
    """Load RefSeq bacterial assembly metadata and register one Genome per entry.

    When ``self.metadata_file`` does not exist, or ``clean`` is true,
    ``assembly_summary.txt`` is downloaded from ``genomes/refseq/bacteria/``
    on the FTP host named by ``ncbi``, spaces in its ``assembly_level``
    column are replaced by underscores, and the table is written to
    ``self.metadata_file``. Otherwise the metadata is read back from that
    file. A ``Genome`` is then created for every metadata entry and added to
    ``self.genomes``.

    Args:
        workbench: forwarded to ``Database.__init__``.
        data_path: forwarded to ``Database.__init__``.
        clean: when true, download the metadata again even if a cached
            copy already exists.
    """
    # NOTE(review): metadata_file, data_path, taxDb, workbench and genomes are
    # read from ``self`` below; presumably Database.__init__ sets them - confirm.
    Database.__init__(self, workbench=workbench, data_path=data_path)

    if not os.path.exists(self.metadata_file) or clean:
        ftp = FTP(ncbi)
        # Parenthesised single-argument print behaves the same under the
        # Python 2 print statement and the Python 3 print function.
        print("Getting metadata from ncbi")
        info = StringIO.StringIO()
        try:
            ftp.login()
            ftp.cwd('genomes/refseq/bacteria/')
            ftp.retrbinary("RETR " + "assembly_summary.txt", info.write)
        finally:
            # Release the connection even when the transfer fails.
            ftp.close()
        info.seek(0)
        self.metadata = DataFrame.from_csv(info, sep="\t", header=1)
        self.metadata['assembly_level'] = self.metadata['assembly_level'].apply(
            lambda x: x.replace(" ", "_"))
        # One {column: value} dict per row label of the summary table.
        self.metadata = self.metadata.transpose().to_dict()
        DataFrame.from_dict(self.metadata).to_csv(self.metadata_file)
    else:
        print("Loading metadata")
        self.metadata = DataFrame.from_csv(self.metadata_file).to_dict()

    print("Loading genomes")
    for name, meta in tqdm(self.metadata.items()):
        # Layout: <data_path>/<assembly_level>/<name>/<name>.fna
        genome_path = pjoin(self.data_path, meta['assembly_level'].replace(" ", "_"), name)
        genome_file = pjoin(genome_path, name + ".fna")
        self.genomes.append(
            Genome(name, genome_path, ref=genome_file, manual_metadata=meta,
                   taxDb=self.taxDb, workbench=self.workbench))
# Manually curated metadata sheet. After the transpose, to_dict() gives one
# {column: value} dict per row label; it is indexed by genome name below.
google_data = pjoin(root, "OD1s and more - Sheet1.csv")
manual_metadata = DataFrame.from_csv(google_data).transpose().to_dict()
cpus = 1

# data_root is walked two levels deep: <data_root>/<group>/<genome>/.
# Each genome directory must hold exactly one file whose name contains
# ".fasta" but not ".fasta." (this skips names such as "x.fasta.something").
all_genomes = []
for group in os.listdir(data_root):
    group_dir = pjoin(data_root, group)
    for g in os.listdir(group_dir):
        g_dir = pjoin(group_dir, g)
        fasta = [f for f in os.listdir(g_dir) if ".fasta" in f and ".fasta." not in f]
        assert len(fasta) == 1
        all_genomes.append(Genome(g, g_dir, pjoin(g_dir, fasta[0]), manual_metadata[g]))

# Largest genomes first.
all_genomes.sort(key=lambda x: x.size, reverse=True)

# Genomes whose name equals their own ``cluster`` attribute.
all_clusters = [g for g in all_genomes if g.name == g.cluster]

# name_map: original name -> conv_name() where the two differ, plus an
# identity entry for every conv_name().
# rev_name_map: the inverse of those differing pairs, plus an identity entry
# for every original name.
name_map = {g.name: g.conv_name() for g in all_genomes if g.name != g.conv_name()}
rev_name_map = {v: k for k, v in name_map.items()}
name_map.update({g.conv_name(): g.conv_name() for g in all_genomes})
rev_name_map.update({g.name: g.name for g in all_genomes})

mcl = orthoMCL(pjoin(analyses_root, "orthoMCL/"), all_genomes, "big_clustering")
def __init__(self, workbench, data_path="/home/moritz/people/MoreData/genomes/TOBG/", clean=False):
    """Build genome metadata from two spreadsheets and register the genomes.

    Genome statistics are read from ``metadata/Table3_GenomeStats.xlsx``
    (sheet ``Sheet1``) and taxonomic assignments from
    ``metadata/Table4_Phylogeny.xlsx`` (sheet ``Hug set``). Each genome gets
    a ``species_taxid`` and a ``region`` column, the resulting table is
    stored in ``self.metadata``, and one ``Genome`` per entry is added to
    ``self.genomes``.

    Args:
        workbench: forwarded to ``Database.__init__``.
        data_path: forwarded to ``Database.__init__``.
        clean: accepted but not used by this method.

    Raises:
        subprocess.CalledProcessError: if unpacking ``TOBGGENOMES.tar.gz``
            fails; the archive is left in place in that case.
    """
    # Function-scope import so this method does not depend on the module
    # header already importing subprocess.
    import subprocess

    # NOTE(review): data_path, taxDb, workbench and genomes are read from
    # ``self`` below; presumably Database.__init__ sets them - confirm.
    Database.__init__(self, workbench=workbench, data_path=data_path)

    wb = load_workbook("metadata/Table3_GenomeStats.xlsx")
    # Read the sheet once: row index 1 holds the column names and the data
    # starts at row index 2 (row index 0 is ignored).
    rows = list(wb['Sheet1'].values)
    t_metadata = DataFrame(rows[2:], columns=rows[1])

    # Spreadsheet taxon labels mapped to the names used for the taxonomy lookup.
    corrected = {
        u'\xc2Gemmatimonadetes': 'Gemmatimonadetes',
        'marinegroup': 'Puniceicoccaceae',
        'Urania1B19': 'Phycisphaerae',
        'Thalassopira': 'Thalassospira',
        'SM1A02': 'Phycisphaerae',
        'SAR324cluster': 'SAR324 cluster',
        'unclassifiedAlphaproteobacteria': 'Alphaproteobacteria',
        'SAR202-2': 'SAR202 cluster',
        'SAR202-1': 'SAR202 cluster',
        'SAR116cluster': 'SAR116 cluster',
        'OPB35soil': 'unidentified Verrucomicrobium group OPB35',
        'Pla3': 'Planctomycetes',
        'OM190': 'Planctomycetes',
        'NovelClass_B': 'Ignavibacteriae',
        'Nitropelagicus': 'Candidatus Nitrosopelagicus',
        'Nanoarchaoeta': 'Nanoarchaeota',
        'Methylobacterum': 'Methylobacterium',
        'JL-ENTP-F27': 'Phycisphaerae',
        'FS140-16B-02marinegroup': 'Phycisphaerae',
        'Epsilonbacteraeota': 'Bacteria',
        'DEV007': 'Verrucomicrobiales',
        'CandidatusPuniceispirillum': 'Candidatus Puniceispirillum',
        'CandidatePhylaRadiation': 'Bacteria candidate phyla',
        'CaThioglobus': 'Candidatus Thioglobus',
        'CaAtelocyanobacterium': 'Candidatus Atelocyanobacterium',
        '0319-6G20': 'Bdellovibrionales',
        'Euryarcheota': 'Euryarchaeota',
        'SBR1093': 'Bacteria',
        'Euryarcheoata': 'Euryarchaeota',
    }

    # Region code embedded in the genome ID -> directory name of that region.
    regions = {
        'NP': 'North_Pacific',
        'NAT': 'North_Atlantic',
        'MED': 'Mediterranean',
        'ARS': 'Arabian_Sea',
        'RS': 'Red_Sea',
        'IN': 'Indian_Ocean',
        'EAC': 'East_Africa_Coastal',
        'SAT': 'South_Atlantic',
        'CPC': 'Chile_Peru_Coastal',
        'SP': 'South_Pacific',
    }

    wb2 = load_workbook("metadata/Table4_Phylogeny.xlsx")
    # For every row keep the last cell (final column excluded) that is
    # neither 'null' nor starts with "nove", keyed by the row's first cell,
    # then apply the corrections above.
    taxos = {}
    for row in wb2['Hug set'].values:
        usable = [v for v in row[:-1] if v != 'null' and not v[0:4] == "nove"]
        taxos[row[0]] = corrected.get(usable[-1], usable[-1])

    tax_2_id = self.taxDb.get_name_translator(list(taxos.values()))
    # NOTE(review): a name missing from the translator makes .get() return
    # None, so the [0] below raises TypeError; extend ``corrected`` if so.
    tax_ids = {
        g: tax_2_id.get(taxos[g])[0]
        for g in t_metadata['Genome ID'] if g in taxos
    }
    # Genomes without a taxonomic assignment fall back to taxid 131567
    # (presumably NCBI's "cellular organisms" node - confirm).
    t_metadata['species_taxid'] = [tax_ids.get(g, 131567) for g in t_metadata['Genome ID']]
    t_metadata.index = Index(t_metadata['Genome ID'])
    # The region code is the text between the first "_" and the following "-".
    t_metadata['region'] = [
        regions[g.split("_")[1].split("-")[0]] for g in t_metadata['Genome ID']
    ]
    self.metadata = t_metadata.transpose().to_dict()

    print("Loading genomes")
    archive = pjoin(self.data_path, 'TOBGGENOMES.tar.gz')
    if os.path.exists(archive):
        # check_call raises on a non-zero exit status, so the archive is
        # deleted only after tar succeeded. The argument list avoids shell
        # quoting problems with unusual paths.
        # NOTE(review): tar unpacks into the current working directory while
        # the loop below looks for the .fna files under data_path - confirm.
        subprocess.check_call(["tar", "xzvf", archive])
        os.remove(archive)

    for k, v in tqdm(self.metadata.items()):
        genome_path = pjoin(self.data_path, v['region'], k)
        genome_file = pjoin(genome_path, k + ".fna")
        originals = pjoin(genome_path, 'original_files')
        if not os.path.exists(genome_file):
            os.makedirs(originals)
            # pjoin (not "+") so a data_path without trailing slash works too.
            shutil.move(pjoin(self.data_path, k + ".fna"), originals)
        self.genomes.append(
            Genome(k, genome_path, ref=pjoin(originals, k + ".fna"),
                   manual_metadata=v, taxDb=self.taxDb, workbench=self.workbench))
# Smoke-test fragment: creates one Genome per entry of test_data/metadata.csv
# and, when `core` is falsy, runs annotation() on all of them with cpus=8.
# Steps, in the order the statements appear:
#   1. root/data_root/analyses_root/google_data are paths below "test_data/".
#   2. manual_metadata is the transposed CSV turned into a dict; its keys are
#      used as genome names.
#   3. all_files is the whitespace-split output of `find <data_root>`.
#   4. For each genome name g, exactly one path containing both ".fasta" and
#      g must exist (asserted); a Genome is built from its directory, the
#      path itself and manual_metadata[g].
#   5. Unless `core` is set, annotation() is tried; any exception is swallowed
#      by a bare `except:` that only calls printerr().
# NOTE(review): the filter ends with `".fasta." not in g`, which tests the
# genome name rather than the file path `f` - probably meant `not in f`; confirm.
# NOTE(review): `core` is not bound in this fragment, and the original line
# breaks/indentation are lost, so it is unclear whether the final
# "non-core functions work" print belongs inside the try, after it, or in an
# else branch - restore the layout from version control before editing.
root = "test_data/" data_root = pjoin(root, "data/") analyses_root = pjoin(root, "analyses/") google_data = pjoin(root, "metadata.csv") manual_metadata = DataFrame.from_csv(google_data).transpose().to_dict() cpus = 1 all_genomes = [] all_files = check_output(["find", data_root]).split() for g in manual_metadata.keys(): fasta = [ f for f in all_files if ".fasta" in f and g in f and ".fasta." not in g ] assert len(fasta) == 1 all_genomes += [ Genome(g, os.path.dirname(fasta[0]), fasta[0], manual_metadata[g]) ] if not core: try: print "testing genome clustering with NICsimilarity" # cluster_genomes(all_genomes,pjoin(analyses_root,"rifle_clusters.tsv"),cutoff=0.95) print "testing annotation" annotation( all_genomes, cpus=8, ) except: printerr("non-core functions broken") print "non-core functions work"
# Analysis script fragment for the genomes under /home/moritz/people/moritz/CDs/.
# Steps, in the order the statements appear:
#   1. manual_metadata is the transposed metadata CSV turned into a dict; its
#      keys are used as genome names.
#   2. An assert checks that data/raw contains "<name>.fasta" for every key.
#   3. One Genome per key is built from data/genomes/<name> and
#      data/raw/<name>.fasta.
#   4. compute_size() is called on genomes whose `size` is falsy.
#   5. Genomes are sorted by completness() (spelling as in the called method),
#      highest first, annotated with `cpus` workers, and then filtered down to
#      those for which is_good() is true.
# NOTE(review): the trailing cluster_genomes_mash( call is cut off in this
# view; its remaining arguments are not visible here.
from micompy.common.utils.iotl_annotations import * from dendropy import * import csv root = "/home/moritz/people/moritz/CDs/" data_root = pjoin(root, "data/") analyses_root = pjoin(root, "analyses") google_data = pjoin(root, "OD1s and more - Sheet1.csv") manual_metadata = DataFrame.from_csv(google_data).transpose().to_dict() cpus = 16 all_raws = set(os.listdir(pjoin(data_root, "raw"))) assert all([k + ".fasta" in all_raws for k in manual_metadata.keys()]) all_genomes = [ Genome(g, pjoin(data_root, "genomes", g), pjoin(data_root, 'raw', g + ".fasta"), manual_metadata[g]) for g, m in manual_metadata.iteritems() ] for g in tqdm(all_genomes): if not g.size: g.compute_size() all_genomes.sort(key=lambda x: x.completness(), reverse=True) annotation(all_genomes, cpus) all_genomes = [g for g in all_genomes if g.is_good()] derep_clusters = cluster_genomes_mash( genomes=all_genomes,
if not os.path.exists(checkm_dir):
    os.makedirs(checkm_dir)

cpus = 11

# File names found in the two input directories.
all_closed = os.listdir(pjoin(data_root, "closed"))
all_cultfree = os.listdir(pjoin(data_root, "cultivation_frees"))

# Tab-separated taxon table; the index is converted to strings so it can be
# compared with names derived from file names.
metadata = DataFrame.from_csv(pjoin(data_root, "taxontable18602_09-jun-2017.xls"), sep="\t")
metadata.index = Index(metadata.index.to_series().apply(str))
metadata['short_name'] = nan  # metadata['taxon_oid'].apply(str)

# check if all genomes are in the metadata sheet
# g[:-6] drops the last six characters of the file name (presumably a
# ".fasta" suffix - confirm against the directory contents).
assert all(g[:-6] in metadata.index for g in all_closed + all_cultfree)
metadata = metadata.transpose().to_dict()

all_closed_genomes = [
    Genome(g[:-6],
           pjoin(data_root, "processed_genomes", g[:-6]),
           pjoin(data_root, 'closed', g),
           metadata[g[:-6]])
    for g in all_closed
]
all_cultfree_genomes = [
    Genome(g[:-6],
           pjoin(data_root, "processed_genomes", g[:-6]),
           pjoin(data_root, 'cultivation_frees', g),
           metadata[g[:-6]])
    for g in all_cultfree
]
all_genomes = all_closed_genomes + all_cultfree_genomes


def run():
    """Run annotation, checkm and phylophlan, then write tree annotations.

    All outputs go to ``checkm_dir``: ``checkm_all.txt``, ``phyluo.txt``,
    ``phyluo_renamed.txt`` (leaves relabelled as
    ``Class|Order|Family|Genus``), ``phylophlan_class.txt`` and
    ``phylophlan_type.txt``.
    """
    annotation(all_genomes, cpus=cpus)
    checkm(all_genomes, cpus=cpus, output=pjoin(checkm_dir, "checkm_all.txt"))
    phylophlan(all_genomes, cpus=cpus, output=pjoin(checkm_dir, "phyluo.txt"))

    lineage_labels = {
        g.name: "|".join([g.metadata['Class'], g.metadata['Order'],
                          g.metadata['Family'], g.metadata['Genus']])
        for g in all_genomes
    }
    renaming_tree(pjoin(checkm_dir, "phyluo.txt"),
                  pjoin(checkm_dir, "phyluo_renamed.txt"),
                  lineage_labels)

    clade_data(all_genomes,
               {g.name: g.metadata['Class'] for g in all_genomes},
               pjoin(checkm_dir, "phylophlan_class.txt"))

    # A value that is not equal to itself (e.g. NaN) marks a missing
    # 'Culture Type'; those genomes are left out of the mapping.
    class_data(all_genomes,
               {g.name: g.metadata['Culture Type'] for g in all_genomes
                if g.metadata['Culture Type'] == g.metadata['Culture Type']},
               pjoin(checkm_dir, "phylophlan_type.txt"))
# Minimal end-to-end check: one genome goes through a Database and then
# through the HMMer Pfam presence search.
import os
import sys
from os.path import join as pjoin

from tqdm import tqdm

# The MiComPy checkout must be on sys.path before the micompy imports below.
home = os.environ['HOME']
sys.path.append(pjoin(home, "repos/moritz/MiComPy/"))

from micompy.databases.database import Database
from micompy.common.genome import Genome
from micompy.common.tools.workbench import WorkBench

# Workbench with its default configuration.
bench = WorkBench()
bench.default_bench()

# A single genome whose reference FASTA is expected in the current directory.
g = Genome("GCF_000005845.2", ".", "GCF_000005845.2.fna", workbench=bench)

Db = Database("test_db", [g])
Db.process()

test = bench['HMMer'].hmmsearch_pfam_presence(g)
from micompy.gene_clusterings.orthomcl.clustering import Clustering as MCLClustering
from micompy.gene_clusterings.clustering import Clustering
from micompy.gene_clusterings.pfam_clusters.clustering import PfamClustering
from itertools import groupby
from pylab import *
from micompy.common.utils.iotl_annotations import *

# Project layout: genomes live under all_AGs/, results go to the project root.
root = "/home/moritz/people/sarahi/all_enrichmentss/"
data_root = pjoin(root, "all_AGs/")
analyses_root = pjoin(root, "")
google_data = pjoin(root, "ag_metadata.csv")

# One {column: value} dict per row label of the metadata sheet.
manual_metadata = DataFrame.from_csv(google_data).transpose().to_dict()
cpus = 16

# The 'genomes' metadata column holds each genome's FASTA path relative to
# data_root; the whole metadata record is handed to the Genome as well.
all_genomes = [
    Genome(g, pjoin(data_root, g), pjoin(data_root, m['genomes']), m)
    for g, m in manual_metadata.iteritems()
]

# Compute the size only for genomes that do not have one yet.
for g in tqdm(all_genomes):
    if g.size:
        continue
    g.compute_size()

# Largest genomes first, then annotate them all.
all_genomes.sort(key=lambda x: x.size, reverse=True)
annotation(all_genomes, cpus)

# Concatenate the per-genome GFF files (same path as the proteome, with
# ".faa" replaced by ".gff") and keep only the lines mentioning "CDS".
gff_files = [g.proteom.replace(".faa", ".gff") for g in all_genomes]
sh.cat(*gff_files, _out="temp.gff")
sh.grep("CDS", "temp.gff", _out=pjoin(analyses_root, "all_gff.gff"))

#checkm(all_genomes, pjoin(analyses_root,"checkm"), cpus)