def get_retrieved_by_sample( biom_file, reference_by_obs_id, references_by_sample, uniq_id, uniq_id_by_sample ):
    """
    @summary: Counts by sample the detected observations and the distinct references retrieved.
    @param biom_file: [str] Path to the BIOM file to process.
    @param reference_by_obs_id: [dict] By observation ID the reference ID as string. A value containing "," is considered as a chimera and is never counted as retrieved.
    @param references_by_sample: [dict] By sample name the container of the reference IDs expected in the sample.
    @param uniq_id: [dict] By reference ID the value used to deduplicate the retrieved references (distinct values are counted).
    @param uniq_id_by_sample: [dict] By sample name, by reference ID, the value used to deduplicate the expected retrieved references.
    @return: [dict] By sample name the counts "detected", "retrieved" and "expected_retrieved".
    """
    counts_by_sample = {}
    biom = BiomIO.from_json( biom_file )
    for sample_name in biom.get_samples_names():
        nb_detected = 0
        retrieved_refs = set()
        expected_retrieved_refs = set()
        for obs in biom.get_observations_by_sample( sample_name ):
            nb_detected += 1
            ref_id = reference_by_obs_id[obs['id']]
            if "," in ref_id:  # Chimera: detected but never retrieved
                continue
            retrieved_refs.add( ref_id )
            if ref_id in references_by_sample[sample_name]:
                expected_retrieved_refs.add( ref_id )
        # Several reference IDs can share the same uniq value: count the distinct values only
        uniq_retrieved = {uniq_id[ref_id] for ref_id in retrieved_refs}
        uniq_expected_retrieved = {uniq_id_by_sample[sample_name][ref_id] for ref_id in expected_retrieved_refs}
        # Results
        counts_by_sample[sample_name] = {
            "detected": nb_detected,
            "retrieved": len(uniq_retrieved),
            "expected_retrieved": len(uniq_expected_retrieved)
        }
    return counts_by_sample
def process( in_biom, out_biom, out_metadata ):
    """
    @summary: Writes a BIOM where the observation metadata are flattened and exports the blast affiliations in a separate TSV file.
    @param in_biom: [str] Path to the BIOM file to process.
    @param out_biom: [str] Path to the output BIOM file. The metadata "blast_affiliations" is removed, the list/tuple metadata are joined with ";" and a "taxonomy" list is added when "blast_taxonomy" exists.
    @param out_metadata: [str] Path to the output TSV file. It contains one line per blast affiliation.
    """
    ordered_blast_keys = ["taxonomy", "subject", "evalue", "perc_identity", "perc_query_coverage", "aln_length"] # Keys in blast_affiliations metadata
    taxonomy_depth = 0
    unclassified_observations = list()

    # The context manager guarantees that the metadata file is flushed and closed, even on error
    with open( out_metadata, "w" ) as FH_metadata:
        FH_metadata.write( "#OTUID\t" + "\t".join(ordered_blast_keys) + "\n" )
        biom = BiomIO.from_json( in_biom )
        for observation in biom.get_observations():
            metadata = observation["metadata"]
            # Iterate on a snapshot of the keys because "blast_affiliations" is deleted in the loop
            for metadata_key in list(metadata.keys()):
                value = metadata[metadata_key]
                if metadata_key == "blast_affiliations": # Extract blast_affiliations metadata in metadata_file
                    if value is not None:
                        for current_affi in value:
                            if isinstance(current_affi["taxonomy"], (list, tuple)):
                                current_affi["taxonomy"] = ";".join( current_affi["taxonomy"] )
                            FH_metadata.write( observation["id"] + "\t" + "\t".join([str(current_affi[item]) for item in ordered_blast_keys]) + "\n" )
                    del metadata[metadata_key]
                elif isinstance(value, (list, tuple)): # All list are transformed in string
                    metadata[metadata_key] = ";".join( map(str, value) )
            if "blast_taxonomy" in metadata:
                if metadata["blast_taxonomy"] is None:
                    unclassified_observations.append( observation["id"] )
                    metadata["taxonomy"] = list()
                else:
                    # blast_taxonomy is a string at this point (lists were joined above)
                    taxonomy_ranks = metadata["blast_taxonomy"].split(";")
                    taxonomy_depth = len(taxonomy_ranks)
                    metadata["taxonomy"] = taxonomy_ranks

    # Add "Unclassified" ranks in unclassified observations
    if taxonomy_depth > 0:
        for observation_id in unclassified_observations:
            observation_metadata = biom.get_observation_metadata(observation_id)
            observation_metadata["taxonomy"] = ["Unclassified"] * taxonomy_depth
    BiomIO.write( out_biom, biom )
def excluded_obs_on_blastMetrics( input_biom, tag, cmp_operator, threshold, excluded_file ):
    """
    @summary: Writes the list of the observations with no affiliations with sufficient blast value.
    @param input_biom: [str] The path to the BIOM file to check.
    @param tag: [str] The metadata checked.
    @param cmp_operator: [str] The operator use in comparison (tag_value ">=" threshold or tag_value "<=" threshold ).
    @param threshold: [float] The limit for the tag value.
    @param excluded_file: [str] The path to the output file. One observation ID is written per line.
    @note: A KeyError is raised when cmp_operator is not ">=" or "<=". An observation without affiliation (empty list or None) is written in the excluded file.
    """
    valid_operators = {
        ">=": operator.__ge__,
        "<=": operator.__le__
    }
    cmp_func = valid_operators[cmp_operator]
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the output even if an observation raises an error
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation in biom.get_observations():
            alignments = observation["metadata"]["blast_affiliations"] or []
            # One alignment passing the threshold is enough to keep the observation
            is_kept = any( cmp_func(float(current_alignment[tag]), threshold) for current_alignment in alignments )
            if not is_kept:
                FH_excluded_file.write( str(observation["id"]) + "\n" )
def __init__( self, out_tsv, in_biom, in_fasta=None ):
    """
    @summary: Builds the biom2tsv.py command with the fields available in the BIOM.
    @param out_tsv: [str] Path to output TSV file.
    @param in_biom: [str] Path to BIOM file.
    @param in_fasta: [str] Path to the sequences file. When it is provided the option --input-fasta and the field '@seed_sequence' are added.
    """
    # Check the metadata
    biom = BiomIO.from_json( in_biom )
    fields = list()
    if biom.has_observation_metadata( 'rdp_taxonomy' ) and biom.has_observation_metadata( 'rdp_bootstrap' ):
        fields.append( "'@rdp_tax_and_bootstrap'" )
    if biom.has_observation_metadata( 'blast_taxonomy' ):
        fields.append( "'blast_taxonomy'" )
    if biom.has_observation_metadata( 'blast_affiliations' ):
        fields.extend([
            "'@blast_subject'",
            "'@blast_perc_identity'",
            "'@blast_perc_query_coverage'",
            "'@blast_evalue'",
            "'@blast_aln_length'"
        ])
    if biom.has_observation_metadata( 'seed_id' ):
        fields.append( "'seed_id'" )
    if in_fasta is not None:
        fields.append( "'@seed_sequence'" )
    # These fields are always requested
    fields.append( "'@observation_name' '@observation_sum' '@sample_count'" )
    conversion_tags = " ".join( fields )

    # Sequence file option
    if in_fasta is None:
        sequence_file_opt = ""
    else:
        sequence_file_opt = " --input-fasta " + in_fasta

    # Set command
    parameters = "--input-file " + in_biom + sequence_file_opt + " --output-file " + out_tsv + " --fields " + conversion_tags
    Cmd.__init__( self,
                  'biom2tsv.py',
                  'Converts a BIOM file in TSV file.',
                  parameters,
                  '--version' )
def getRealTaxByRefID( input_biom, taxonomy_key, duplication_groups ):
    """
    @summary: Return taxonomy by reference.
    @param input_biom: [str] Path to BIOM file.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param duplication_groups: [dict] By reference ID the list of references with the same sequence. None when there is no duplication to manage.
    @return: [dict] List of taxonomies by reference ID. A reference without duplication group has only its own taxonomy. A reference with a duplication group has the taxonomy of each member of the group.
             Example:
               {
                 "MVF01000012.1.1317": [
                   ["Root", "Bacteria", "Proteobacteria", "Gammaproteobacteria", "Enterobacteriales", "Enterobacteriaceae", "Cronobacter", "Escherichia coli BIDMC 73"]
                 ],
                 "JQ607252.1.1437": [
                   ["Root", "Bacteria", "Firmicutes", "Bacilli", "Bacillales", "Staphylococcaceae", "Staphylococcus", "bacterium NLAE-zl-P471"],
                   ["Root", "Bacteria", "Firmicutes", "Bacilli", "Bacillales", "Staphylococcaceae", "Staphylococcus", "Staphylococcus aureus M17299"]
                 ]
               }
    """
    # Cleaned taxonomy of each observation of the BIOM
    own_taxonomy = {}
    biom = BiomIO.from_json( input_biom )
    for observation in biom.get_observations():
        own_taxonomy[observation["id"]] = getCleanedTaxonomy( observation["metadata"][taxonomy_key] )
    # By default each reference has only its own taxonomy
    taxonomy_by_obs_id = {}
    for obs_id, taxonomy in own_taxonomy.items():
        taxonomy_by_obs_id[obs_id] = [taxonomy]
    # The references of a duplication group take the taxonomy of every member
    if duplication_groups is not None:
        for obs_id, group_members in duplication_groups.items():
            taxonomy_by_obs_id[obs_id] = [own_taxonomy[member_id] for member_id in group_members]
    return taxonomy_by_obs_id
def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to outputed BIOM.
    @param nb_sampled: [int] Number of sampled sequences by sample.
    @param sampled_ratio: [float] Ratio of sampled sequences by sample. The number of sampled sequences is int(sample count * sampled_ratio).
    @note: nb_sampled and sampled_ratio are mutually exclusive. An Exception is raised when a sample contains less sequences than the number to sample.
    """
    initial_biom = BiomIO.from_json( input_biom )
    new_biom = Biom(
        matrix_type="sparse",
        generated_by="Sampling " + (str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%" ) + " elements by sample from " + input_biom
    )
    observations_already_added = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        sample_nb_sampled = nb_sampled
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        # Compare with the number really sampled: nb_sampled is None in ratio mode
        if sample_seq < sample_nb_sampled:
            raise Exception( str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences." )
        for current_nb_iter in range(sample_nb_sampled):
            # Take an observation in initial BIOM (sampling without replacement)
            selected_observation = initial_biom.random_obs_by_sample(sample_name)
            selected_observation_id = selected_observation['id']
            initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
            # Put in new BIOM
            if selected_observation_id not in observations_already_added:
                new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                observations_already_added.add( selected_observation_id )
            new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
def get_checked( abund_file, checked_sample, taxonomy_key, expected_by_depth ):
    """
    @summary: Returns by taxonomic depth the count of each taxon found in the checked sample.
    @param abund_file: [str] Path to the BIOM file.
    @param checked_sample: [str] Name of the sample where the counts are read.
    @param taxonomy_key: [str] The metadata key for the taxonomy. An observation with a None taxonomy is counted as "unknown_taxa" on len(expected_by_depth) ranks.
    @param expected_by_depth: [dict/list] By depth the expected taxonomies (ranks joined by ";"). It is used to resolve the multi-affiliations.
    @return: [dict] By depth, by taxonomy (ranks joined by ";" until the depth) the count.
    @note: A taxonomy ended by "Multi-affiliation" is replaced by the blast affiliation expected at this depth when exactly one distinct affiliation is expected. Otherwise a warning is emitted and the taxonomy is kept.
    """
    checked_by_depth = {}
    biom = BiomIO.from_json(abund_file)
    for current_obs in biom.get_observations():
        obs_metadata = current_obs["metadata"]
        if obs_metadata[taxonomy_key] is not None:
            clean_taxonomy = getCleanedTaxonomy(obs_metadata[taxonomy_key])
        else:
            clean_taxonomy = ["unknown_taxa"] * len(expected_by_depth)
        count = biom.get_count(current_obs["id"], checked_sample)
        if not count > 0:  # The observation is missing from the checked sample
            continue
        last_depth = len(clean_taxonomy) - 1
        if clean_taxonomy[last_depth] == "Multi-affiliation":
            taxonomies = []  # Distinct affiliations in order of appearance
            matching = []  # Distinct affiliations that are expected at this depth
            expected_taxonomies = expected_by_depth[last_depth]
            for affiliation in obs_metadata["blast_affiliations"]:
                affi_ranks = getCleanedTaxonomy(affiliation["taxonomy"])
                affi_taxonomy = ";".join(affi_ranks)
                if affi_taxonomy in taxonomies:
                    continue
                taxonomies.append(affi_taxonomy)
                if affi_taxonomy in expected_taxonomies:
                    matching.append(affi_ranks)
            if len(matching) == 1:
                clean_taxonomy = matching[0]
            else:
                warnings.warn( "Multi-affiliation cannot be resolved for " + str((float(count)*100)/biom.get_total_count()) + "% sequences. Possible taxonomies: '" + "', '".join(taxonomies) + "'." )
        # Add the count to each rank of the lineage
        for rank_depth in range(len(clean_taxonomy)):
            rank_taxonomy = ";".join(clean_taxonomy[:rank_depth + 1])
            counts_at_depth = checked_by_depth.setdefault(rank_depth, {})
            counts_at_depth[rank_taxonomy] = counts_at_depth.get(rank_taxonomy, 0) + count
    return checked_by_depth
def biom_fasta_to_tsv( input_biom, input_fasta, output_tsv, fields, list_separator ):
    """
    @summary: Convert BIOM file to TSV file with sequence.
    @param input_biom: [str] Path to the BIOM file.
    @param input_fasta: [str] Path to the sequences of the observations.
    @param output_tsv: [str] Path to the output file (format : TSV).
    @param fields: [list] Columns and their order in output. Special columns : '@observation_name', '@observation_sum', '@sample_count', '@rdp_tax_and_bootstrap', '@seed_sequence'. The others columns must be metadata title. It must contain '@seed_sequence' (ValueError otherwise). The list is not modified.
    @param list_separator: [str] Separator for complex metadata.
    @note: One line is written for each record of the FASTA, in the order of the FASTA.
    """
    biom = BiomIO.from_json( input_biom )
    sequence_idx = fields.index("@seed_sequence")
    # Work on a copy: removing the sequence column must not alter the list of the caller
    fields_without_seq = list(fields)
    del fields_without_seq[sequence_idx]
    # The context manager closes the output even if a record raises an error
    with open( output_tsv, "w" ) as out_fh:
        # Header
        header_parts = header_line_parts( fields, biom )
        out_fh.write( "#" + "\t".join(header_parts) + "\n" )
        # Data
        FH_in = FastaIO( input_fasta )
        for record in FH_in:
            obs_idx = biom.find_idx("observation", record.id)
            count_by_sample = biom.data.get_row_array(obs_idx)
            observation_parts = observation_line_parts( biom.rows[obs_idx], count_by_sample, fields_without_seq, list_separator )
            # The sequence comes from the FASTA: it is put back at the position requested in fields
            observation_parts.insert( sequence_idx, record.string )
            out_fh.write( "\t".join(observation_parts) + "\n" )
def observations_depth( input_biom, output_depth ):
    """
    @summary : Write the depths of the observation in file.
    @param input_biom : [str] path to the biom file processed.
    @param output_depth : [str] path to the output file.
    @note : Every depth between 1 and the maximum depth is written, even if no observation has this depth.
            The percentage is computed on the observations with a count different from 0.
            Example of one output file
                #Depth<TAB>Nb_Observ_concerned<TAB>Prct_Observ_concerned
                1<TAB>65<TAB>65.000
                2<TAB>30<TAB>30.000
                3<TAB>0<TAB>0.000
                4<TAB>5<TAB>5.000
    """
    obs_depth = list() # obs_depth[depth] is the number of observations with this depth
    nb_observ = 0
    # Process depth calculation
    biom = BiomIO.from_json( input_biom )
    for observation_id, observation_count in biom.get_observations_counts():
        # Extend the histogram in one step until the index observation_count exists
        missing_slots = observation_count + 1 - len(obs_depth)
        if missing_slots > 0:
            obs_depth.extend( [0] * missing_slots )
        obs_depth[observation_count] += 1
        if observation_count != 0:
            nb_observ += 1
    del biom
    # Write output (the context manager closes the file even on error)
    with open( output_depth, 'w' ) as out_fh:
        out_fh.write( "#Depth\tNb_Observ_concerned\tPrct_Observ_concerned\n" )
        # nb_observ cannot be 0 in this loop: a depth >= 1 exists only if one observation has a count != 0
        for depth in range(1, len(obs_depth)):
            prct = (float(obs_depth[depth])/ nb_observ)*100
            out_fh.write( str(depth) + "\t" + str(obs_depth[depth]) + "\t" + ("%.3f" % prct) + "\n" )
def biom_fasta_to_tsv( input_biom, input_fasta, output_tsv, fields, list_separator ):
    """
    @summary: Convert BIOM file to TSV file with sequence.
    @param input_biom: [str] Path to the BIOM file.
    @param input_fasta: [str] Path to the sequences of the observations.
    @param output_tsv: [str] Path to the output file (format : TSV).
    @param fields: [list] Columns and their order in output. Special columns : '@observation_name', '@observation_sum', '@sample_count', '@rdp_tax_and_bootstrap', '@seed_sequence'. The others columns must be metadata title. It must contain '@seed_sequence' (ValueError otherwise). The list is not modified.
    @param list_separator: [str] Separator for complex metadata.
    @note: The records of the FASTA without observation in the BIOM are skipped. raise_exception is called when observations of the BIOM have no sequence in the FASTA.
    """
    biom = BiomIO.from_json( input_biom )
    observation_list = list( biom.get_observations_names() )
    known_ids = set( observation_list )
    found_ids = set()
    sequence_idx = fields.index("@seed_sequence")
    # Work on a copy: removing the sequence column must not alter the list of the caller
    fields_without_seq = list(fields)
    del fields_without_seq[sequence_idx]
    # The context manager closes the output even if a record raises an error
    with open( output_tsv, "wt" ) as out_fh:
        # Header
        header_parts = header_line_parts( fields, biom )
        out_fh.write( "#" + "\t".join(header_parts) + "\n" )
        # Data
        FH_in = FastaIO( input_fasta )
        for record in FH_in:
            # Only the sequences missing from the BIOM are skipped; any other error is no longer hidden.
            # NOTE(review): assumes biom.find_idx() knows exactly the names returned by get_observations_names() - confirm
            if record.id not in known_ids:
                continue
            obs_idx = biom.find_idx("observation", record.id)
            count_by_sample = biom.data.get_row_array(obs_idx)
            observation_parts = observation_line_parts( biom.rows[obs_idx], count_by_sample, fields_without_seq, list_separator )
            # The sequence comes from the FASTA: it is put back at the position requested in fields
            observation_parts.insert( sequence_idx, record.string )
            out_fh.write( "\t".join(observation_parts) + "\n" )
            found_ids.add( record.id )
    # Observations without sequence, in the order of the BIOM
    missing_observations = [name for name in observation_list if name not in found_ids]
    if len(missing_observations) > 0:
        raise_exception(Exception("\n\n##ERROR : your input fasta file (" + input_fasta + ") does not contain sequence for :" + ", ".join(missing_observations) + "\n"))
def get_checked( abund_file, checked_sample, taxonomy_key, expected_by_depth ):
    """
    @summary: Returns by taxonomic depth the count of each taxon found in the checked sample.
    @param abund_file: [str] Path to the BIOM file.
    @param checked_sample: [str] Name of the sample where the counts are read.
    @param taxonomy_key: [str] The metadata key for the taxonomy.
    @param expected_by_depth: [dict/list] By depth the expected taxonomies (ranks joined by ";"). It is used to resolve the multi-affiliations.
    @return: [dict] By depth, by taxonomy (ranks joined by ";" until the depth) the count.
    @note: A taxonomy ended by "Multi-affiliation" is replaced by the blast affiliation expected at this depth when exactly one distinct affiliation is expected. Otherwise a warning is emitted and the taxonomy is kept.
    """
    checked_by_depth = {}
    biom = BiomIO.from_json(abund_file)
    for current_obs in biom.get_observations():
        clean_taxonomy = getCleanedTaxonomy(current_obs["metadata"][taxonomy_key])
        count = biom.get_count(current_obs["id"], checked_sample)
        if not count > 0:  # The observation is missing from the checked sample
            continue
        if clean_taxonomy[len(clean_taxonomy) - 1] == "Multi-affiliation":
            expected_taxonomies = expected_by_depth[len(clean_taxonomy) - 1]
            taxonomies = []  # Distinct affiliations in order of appearance
            expected_affiliations = []  # Distinct affiliations that are expected at this depth
            for affiliation in current_obs["metadata"]["blast_affiliations"]:
                affiliation_ranks = getCleanedTaxonomy(affiliation["taxonomy"])
                affiliation_str = ";".join(affiliation_ranks)
                if affiliation_str not in taxonomies:
                    taxonomies.append(affiliation_str)
                    if affiliation_str in expected_taxonomies:
                        expected_affiliations.append(affiliation_ranks)
            if len(expected_affiliations) != 1:
                warnings.warn( "Multi-affiliation cannot be resolved for " + str((float(count)*100)/biom.get_total_count()) + "% sequences. Possible taxonomies: '" + "', '".join(taxonomies) + "'." )
            else:
                clean_taxonomy = expected_affiliations[0]
        # Walk down the lineage and add the count to each rank
        lineage = []
        for rank_depth, taxon in enumerate(clean_taxonomy):
            lineage.append(taxon)
            rank_taxonomy = ";".join(lineage)
            if rank_depth not in checked_by_depth:
                checked_by_depth[rank_depth] = {rank_taxonomy: count}
            elif rank_taxonomy not in checked_by_depth[rank_depth]:
                checked_by_depth[rank_depth][rank_taxonomy] = count
            else:
                checked_by_depth[rank_depth][rank_taxonomy] += count
    return checked_by_depth
def get_step_size(self, nb_step=35):
    """
    @summary: Returns the step size to obtain 'nb_step' steps or more in 3/4 of samples.
    @param nb_step: [int] The number of expected steps.
    @returns: [int] The step size. The minimum returned value is 1.
    @note: The BIOM read is self.in_biom. An IndexError is raised when it does not contain any sample.
    """
    counts = list()
    # Get the number of sequences by sample
    biom = BiomIO.from_json( self.in_biom )
    for sample_name in biom.get_samples_names():
        counts.append( biom.get_sample_count(sample_name) )
    del biom
    counts = sorted(counts)
    nb_samples = len(counts)
    # Finds the lower quartile number of sequences
    # Floor division: a list index must be an integer ("/" returns a float with Python 3)
    lower_quartile_idx = nb_samples // 4
    nb_seq = counts[lower_quartile_idx]
    # If lower quartile sample is empty the first non-empty sample after it is used
    if nb_seq == 0:
        idx = 1
        while (lower_quartile_idx + idx) < nb_samples and counts[lower_quartile_idx + idx] == 0:
            idx += 1
        if (lower_quartile_idx + idx) < nb_samples:
            nb_seq = counts[lower_quartile_idx + idx]
    step_size = int(nb_seq/nb_step)
    return max(1, step_size)
def get_step_size(self, nb_step=35):
    """
    @summary: Returns the step size to obtain 'nb_step' steps or more in 3/4 of samples.
    @param nb_step: [int] The number of expected steps.
    @returns: [int] The step size. The minimum returned value is 1.
    @note: The BIOM read is self.in_biom. An IndexError is raised when it does not contain any sample.
    """
    # Get the number of sequences by sample
    biom = BiomIO.from_json(self.in_biom)
    counts = sorted(
        biom.get_sample_count(sample_name)
        for sample_name in biom.get_samples_names()
    )
    del biom
    # Finds the lower quartile number of sequences.
    # Floor division: a list index must be an integer ("/" returns a float
    # with Python 3).
    lower_quartile_idx = len(counts) // 4
    nb_seq = counts[lower_quartile_idx]
    # If lower quartile sample is empty, take the first non-empty sample after
    # it (0 is kept when all the following samples are empty).
    if nb_seq == 0:
        following_counts = counts[lower_quartile_idx + 1:]
        nb_seq = next((count for count in following_counts if count != 0), 0)
    step_size = int(nb_seq / nb_step)
    return max(1, step_size)
def aff_to_metadata(reference_file, biom_in, biom_out, blast_files=None, rdp_files=None):
    """
    @summary: Add taxonomy metadata on biom file from blast and/or RDPClassifier results.
    @param reference_file: [str] The path to the reference file.
    @param biom_in: [str] The path to the Biom file to process.
    @param biom_out: [str] The path to the biom output file.
    @param blast_files: [list] the list of the path to the blast results in tabular format (outfmt 6 with NCBI Blast+).
    @param rdp_files: [list] the list of path to the RDPClassifier results.
    @note: With blast_files each observation receives the metadata "blast_affiliations" (empty list without match) and "blast_taxonomy" (None without match).
           With rdp_files each observation receives the metadata "rdp_taxonomy" and "rdp_bootstrap" (None without match).
    """
    # Build an hash with the taxonomy for each gene (key=gene_id ; value=gene_taxonomy)
    taxonomy_by_reference = get_tax_from_fasta(reference_file)

    # Retrieve blast clusters annotations
    cluster_blast_annot = dict()
    if blast_files is not None:
        cluster_blast_annot = get_bests_blast_affi(blast_files, taxonomy_by_reference)
    del taxonomy_by_reference

    # Retrieve rdp clusters annotations
    cluster_rdp_annot = dict()
    if rdp_files is not None:
        cluster_rdp_annot = get_rdp_affi(rdp_files)

    # Add metadata to biom
    biom = BiomIO.from_json(biom_in)
    for cluster in biom.get_observations():
        cluster_id = cluster["id"]
        # Blast
        if blast_files is not None:
            blast_taxonomy = None
            blast_affiliations = list()
            # "in" replaces dict.has_key() which does not exist with Python 3
            if cluster_id in cluster_blast_annot:  # Current observation has a match
                alignments = cluster_blast_annot[cluster_id]['alignments']
                blast_taxonomy = get_tax_consensus([alignment['taxonomy'] for alignment in alignments])
                blast_affiliations = alignments
            biom.add_metadata(cluster_id, "blast_affiliations", blast_affiliations, "observation")
            biom.add_metadata(cluster_id, "blast_taxonomy", blast_taxonomy, "observation")
        # RDP
        if rdp_files is not None:
            rdp_taxonomy = None
            rdp_bootstrap = None
            if cluster_id in cluster_rdp_annot:
                rdp_taxonomy = cluster_rdp_annot[cluster_id]['taxonomy']
                rdp_bootstrap = cluster_rdp_annot[cluster_id]['bootstrap']
            biom.add_metadata(cluster_id, "rdp_taxonomy", rdp_taxonomy, "observation")
            biom.add_metadata(cluster_id, "rdp_bootstrap", rdp_bootstrap, "observation")
    BiomIO.write(biom_out, biom)
def write_log(in_biom, out_biom, log):
    """
    @summary: Writes by sample and for all samples the number of OTU before and after the process.
    @param in_biom: [str] Path to the initial BIOM file.
    @param out_biom: [str] Path to the BIOM file produced by the process.
    @param log: [str] Path to the output log file.
    """
    # The context manager closes the log even if a BIOM cannot be read
    with open(log, "w") as FH_log:
        # NOTE(review): the header announces a 3 columns table but the records are written on several lines; the format is kept as is
        FH_log.write("#sample\tnb_otu_before\tnb_otu_after\n")
        initial_biom = BiomIO.from_json( in_biom )
        new_biom = BiomIO.from_json( out_biom )
        # The samples are taken from the initial BIOM
        for sample_name in initial_biom.get_samples_names():
            nb_otu_before = len(initial_biom.get_sample_obs(sample_name))
            nb_otu_after = len(new_biom.get_sample_obs(sample_name))
            FH_log.write("Sample name: "+sample_name+"\n\tnb initials OTU: "+str(nb_otu_before)+"\n\tnb normalized OTU: "+str(nb_otu_after)+"\n")
        nb_initial_otu = len(initial_biom.rows)
        nb_new_otu = len(new_biom.rows)
        FH_log.write("Sample name: all samples\n\tnb initials OTU: "+str(nb_initial_otu)+"\n\tnb normalized OTU: "+str(nb_new_otu)+"\n")
def samples_hclassification( input_biom, output_newick, distance_method, linkage_method, min_count=1 ):
    """
    @summary : Process and write an hierarchical classification from Biom.
    @param input_biom : [str] Path to the BIOM file to process.
    @param output_newick : [str] Path to the newick output file.
    @param distance_method : [str] Used distance method for classify. It is given to scipy pdist.
    @param linkage_method : [str] Used linkage method for classify. It is given to scipy linkage.
    @param min_count : [int] Samples with a count lower than this value are not processed.
    @note : An Exception is raised when all the samples are excluded. A log is printed on stdout.
    """
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage
    import scipy.cluster.hierarchy
    data_array = list()
    processed_samples = list()
    excluded_samples = list()
    nb_samples = None

    # Normalisation on count by sample
    biom = BiomIO.from_json( input_biom )
    for col_idx, current_sample in enumerate(biom.columns):
        sum_on_sample = biom.data.get_col_sum( col_idx )
        if sum_on_sample < min_count:
            excluded_samples.append( current_sample['id'] )
        else:
            processed_samples.append( current_sample['id'] )
            # Each count becomes the proportion of the sample
            OTUs_norm = list()
            for row_idx in range(len(biom.rows)):
                OTUs_norm.append( biom.data.nb_at(row_idx, col_idx)/float(sum_on_sample) )
            data_array.append( OTUs_norm )
    nb_samples = len(biom.columns)
    del biom

    # Process distance
    if len(processed_samples) < 1:
        raise Exception("All samples have a count lower than threshold (" + str(min_count) + ").")
    elif len(processed_samples) == 1:
        # Write newick (a tree cannot be computed with only one sample)
        with open( output_newick, "w" ) as out_fh:
            out_fh.write( "(" + processed_samples[0] + ");\n" )
    else:
        # Computing the distance and linkage
        data_dist = pdist( data_array, distance_method )
        data_link = linkage( data_dist, linkage_method )
        # Write newick
        scipy_hc_tree = scipy.cluster.hierarchy.to_tree( data_link , rd=False )
        id_2_name = dict( enumerate(processed_samples) )
        with open( output_newick, "w" ) as out_fh:
            out_fh.write( to_newick(scipy_hc_tree, id_2_name) + "\n" )

    # Display log
    # print() with only one argument has the same output with Python 2 and Python 3
    log_msg = "# Hierarchical clustering log:\n" + \
              "\tNumber of samples in BIOM: " + str(nb_samples) + "\n" + \
              "\tNumber of processed samples: " + str(len(processed_samples))
    print(log_msg)
    if nb_samples > len(processed_samples):
        print("\n\tExcluded samples (count < " + str(min_count) + "): " + ", ".join(sorted(excluded_samples)))
def samples_hclassification( input_biom, output_newick, distance_method, linkage_method, min_count=1 ):
    """
    @summary : Process and write an hierarchical classification from Biom.
    @param input_biom : [str] Path to the BIOM file to process.
    @param output_newick : [str] Path to the newick output file.
    @param distance_method : [str] Used distance method for classify. It is given to scipy pdist.
    @param linkage_method : [str] Used linkage method for classify. It is given to scipy linkage.
    @param min_count : [int] Samples with a count lower than this value are not processed.
    @note : raise_exception is called when all the samples are excluded. A log is printed on stdout.
    """
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage
    import scipy.cluster.hierarchy
    data_array = list()
    processed_samples = list()
    excluded_samples = list()
    nb_samples = None

    # Normalisation on count by sample
    biom = BiomIO.from_json( input_biom )
    for col_idx, current_sample in enumerate(biom.columns):
        sum_on_sample = biom.data.get_col_sum( col_idx )
        if sum_on_sample < min_count:
            excluded_samples.append( current_sample['id'] )
        else:
            processed_samples.append( current_sample['id'] )
            # Each count becomes the proportion of the sample
            OTUs_norm = [
                biom.data.nb_at(row_idx, col_idx)/float(sum_on_sample)
                for row_idx in range(len(biom.rows))
            ]
            data_array.append( OTUs_norm )
    nb_samples = len(biom.columns)
    del biom

    # Process distance
    if len(processed_samples) < 1:
        raise_exception( Exception("\n\n#ERROR :All samples have a count lower than threshold (" + str(min_count) + ").\n\n"))
    elif len(processed_samples) == 1:
        # Write newick (a tree cannot be computed with only one sample)
        with open( output_newick, "wt" ) as out_fh:
            out_fh.write( "(" + processed_samples[0] + ");\n" )
    else:
        # Computing the distance and linkage
        data_dist = pdist( data_array, distance_method )
        data_link = linkage( data_dist, linkage_method )
        # Write newick
        scipy_hc_tree = scipy.cluster.hierarchy.to_tree( data_link , rd=False )
        id_2_name = dict( enumerate(processed_samples) )
        with open( output_newick, "wt" ) as out_fh:
            out_fh.write( to_newick(scipy_hc_tree, id_2_name) + "\n" )

    # Display log
    print(
        "# Hierarchical clustering log:\n"
        + "\tNumber of samples in BIOM: " + str(nb_samples) + "\n"
        + "\tNumber of processed samples: " + str(len(processed_samples))
    )
    if nb_samples > len(processed_samples):
        print("\n\tExcluded samples (count < " + str(min_count) + "): " + ", ".join(sorted(excluded_samples)))
def filter_biom( removed_observations, in_biom, out_biom ):
    """
    @summary: Writes a copy of the BIOM without the specified observations.
    @param removed_observations: [dict] Each key is an observation name to remove.
    @param in_biom: [str] Path to the processed BIOM file.
    @param out_biom: [str] Path to the cleaned BIOM file.
    """
    cleaned_biom = BiomIO.from_json( in_biom )
    cleaned_biom.remove_observations( removed_observations )
    BiomIO.write( out_biom, cleaned_biom )
def remove_observations( removed_observations, input_biom, output_biom ):
    """
    @summary: Writes a copy of the BIOM without the specified observations.
    @param removed_observations: [list] The names of the observations to remove.
    @param input_biom: [str] The path to the input BIOM.
    @param output_biom: [str] The path to the output BIOM.
    """
    filtered_biom = BiomIO.from_json( input_biom )
    filtered_biom.remove_observations( removed_observations )
    BiomIO.write( output_biom, filtered_biom )
def remove_observations(removed_observations, input_biom, output_biom):
    """
    @summary: Loads the BIOM, drops the specified observations and writes the result.
    @param removed_observations: [list] The names of the observations to remove.
    @param input_biom: [str] The path to the input BIOM.
    @param output_biom: [str] The path to the output BIOM.
    """
    loaded_biom = BiomIO.from_json(input_biom)
    loaded_biom.remove_observations(removed_observations)
    BiomIO.write(output_biom, loaded_biom)
def rarefaction( input_biom, interval=10000, ranks=None, taxonomy_key="taxonomy" ):
    """
    @summary: Returns the rarefaction by ranks by samples.
    @param input_biom: [str] Path to the biom file processed.
    @param interval: [int] Size of first sampling.
    @param ranks: [list] The rank(s) level for the diversity. It must be provided: the default value None is not iterable.
                  Example :
                    Sampled set :
                      Bacteria; Proteobacteria; Alphaproteobacteria; Sphingomonadales; Sphingomonadaceae; Sphingomonas
                      Bacteria; Proteobacteria; Gammaproteobacteria; Vibrionales; Vibrionaceae; Vibrio; Vibrio halioticoli
                      Bacteria; Proteobacteria; Gammaproteobacteria; Legionellales; Coxiellaceae; Coxiella; Ornithodoros moubata symbiont A
                      Bacteria; Proteobacteria; Betaproteobacteria; Burkholderiales; Burkholderiaceae; Limnobacter; Limnobacter thiooxidans
                    Result for this set
                      With rank 1 or 2 : 1 group
                      With rank 3 : 3 different groups
                      With rank 4 or 5 or 6 : 4 different groups
    @param taxonomy_key : [str] The metadata title for the taxonomy in the input.
    @return: [dict] By ranks by samples the list of differents taxa for each steps. Each number is stored as string.
              Example :
                 {
                    1: {
                         "sampleA" : [10, 20, 22, 23, 24, 25, 25, 25 ],
                         "sampleB" : [15, 25, 28, 30, 32, 34, 35, 36, 37, 37, 37, 37]
                       }
                 }
    @warning: The taxa with name starting with unknown used as complete new name 'unknown'.
    """
    biom = BiomIO.from_json( input_biom )
    sample_rarefaction = {}
    for current_rank in ranks:
        sample_rarefaction[current_rank] = {}
    for sample in biom.get_samples_names():
        # Distinct taxa seen in the sample since the first step, by rank
        seen_taxa = {}
        for current_rank in ranks:
            sample_rarefaction[current_rank][sample] = []
            seen_taxa[current_rank] = set()
        nb_steps = int(biom.get_sample_count( sample )/interval)
        for current_step in range(nb_steps):
            for current_selected in biom.random_obs_extract_by_sample(sample, interval):
                selected_obs = current_selected['observation']
                taxonomy = []
                if taxonomy_key in selected_obs["metadata"] and selected_obs["metadata"][taxonomy_key] is not None:
                    taxonomy = biom.get_observation_taxonomy( selected_obs["id"], taxonomy_key )
                # The list is updated in place, exactly as the list returned by the BIOM
                for idx, taxon in enumerate(taxonomy):
                    if taxon.lower().startswith("unknown"):
                        taxonomy[idx] = "unknown"
                # Complete the short taxonomies up to max(ranks) elements
                taxonomy.extend( ["unknown"] * (max(ranks) - len(taxonomy)) )
                for current_rank in ranks:
                    seen_taxa[current_rank].add( ';'.join(taxonomy[0:current_rank+1]).lower() )
            # One point by step: the cumulated number of distinct taxa
            for current_rank in ranks:
                sample_rarefaction[current_rank][sample].append( str(len(seen_taxa[current_rank])) )
    return sample_rarefaction
def rarefaction( input_biom, interval=10000, ranks=None, taxonomy_key="taxonomy" ):
    """
    @summary: Returns the rarefaction by ranks by samples.
    @param input_biom: [str] Path to the biom file processed.
    @param interval: [int] Size of first sampling.
    @param ranks: [list] The rank(s) level for the diversity. It must be provided: the default value None is not iterable.
                  Example :
                    Sampled set :
                      Bacteria; Proteobacteria; Alphaproteobacteria; Sphingomonadales; Sphingomonadaceae; Sphingomonas
                      Bacteria; Proteobacteria; Gammaproteobacteria; Vibrionales; Vibrionaceae; Vibrio; Vibrio halioticoli
                      Bacteria; Proteobacteria; Gammaproteobacteria; Legionellales; Coxiellaceae; Coxiella; Ornithodoros moubata symbiont A
                      Bacteria; Proteobacteria; Betaproteobacteria; Burkholderiales; Burkholderiaceae; Limnobacter; Limnobacter thiooxidans
                    Result for this set
                      With rank 1 or 2 : 1 group
                      With rank 3 : 3 different groups
                      With rank 4 or 5 or 6 : 4 different groups
    @param taxonomy_key : [str] The metadata title for the taxonomy in the input.
    @return: [dict] By ranks by samples the list of differents taxa for each steps. Each number is stored as string.
              Example :
                 {
                    1: {
                         "sampleA" : [10, 20, 22, 23, 24, 25, 25, 25 ],
                         "sampleB" : [15, 25, 28, 30, 32, 34, 35, 36, 37, 37, 37, 37]
                       }
                 }
    @warning: The taxa with name starting with unknown used as complete new name 'unknown'.
    """
    sample_rarefaction = dict()
    biom = BiomIO.from_json( input_biom )
    for current_rank in ranks:
        sample_rarefaction[current_rank] = dict()
    for sample in biom.get_samples_names():
        taxa = dict()
        for current_rank in ranks:
            sample_rarefaction[current_rank][sample] = list()
            taxa[current_rank] = dict()
        sample_count = biom.get_sample_count( sample )
        # Floor division: range() needs an integer ("/" returns a float with Python 3)
        expected_nb_iter = int(sample_count // interval)
        for current_nb_iter in range(expected_nb_iter):
            selected_observations = biom.random_obs_extract_by_sample(sample, interval)
            for current_selected in selected_observations:
                taxonomy = list()
                selected_metadata = current_selected['observation']["metadata"]
                # "in" replaces dict.has_key() which does not exist with Python 3
                if taxonomy_key in selected_metadata and selected_metadata[taxonomy_key] is not None:
                    taxonomy = biom.get_observation_taxonomy( current_selected['observation']["id"], taxonomy_key )
                for idx, taxon in enumerate(taxonomy):
                    if taxon.lower().startswith("unknown"):
                        taxonomy[idx] = "unknown"
                # Complete the short taxonomies up to max(ranks) elements
                while len(taxonomy) < max(ranks):
                    taxonomy.append("unknown")
                for current_rank in ranks:
                    taxonomy_str = (';'.join(taxonomy[0:current_rank+1])).lower()
                    taxa[current_rank][taxonomy_str] = True
            # One point by step: the cumulated number of distinct taxa
            for current_rank in ranks:
                sample_rarefaction[current_rank][sample].append( str(len(taxa[current_rank])) )
    return sample_rarefaction
def write_log(in_biom, out_biom, log):
    """
    @summary: Writes by sample and for all samples the number of OTU before and after the process.
    @param in_biom: [str] Path to the initial BIOM file.
    @param out_biom: [str] Path to the BIOM file produced by the process.
    @param log: [str] Path to the output log file.
    """
    # The context manager closes the log even if a BIOM cannot be read
    with open(log, "w") as FH_log:
        FH_log.write("#sample\tnb_otu_before\tnb_otu_after\n")
        initial_biom = BiomIO.from_json(in_biom)
        new_biom = BiomIO.from_json(out_biom)
        # The samples are taken from the initial BIOM
        for sample_name in initial_biom.get_samples_names():
            nb_otu_before = len(initial_biom.get_sample_obs(sample_name))
            nb_otu_after = len(new_biom.get_sample_obs(sample_name))
            FH_log.write(
                "Sample name: " + sample_name
                + "\n\tnb initials OTU: " + str(nb_otu_before)
                + "\n\tnb normalized OTU: " + str(nb_otu_after) + "\n"
            )
        nb_initial_otu = len(initial_biom.rows)
        nb_new_otu = len(new_biom.rows)
        FH_log.write(
            "Sample name: all samples\n\tnb initials OTU: " + str(nb_initial_otu)
            + "\n\tnb normalized OTU: " + str(nb_new_otu) + "\n"
        )
def get_obs_from_biom( in_biom ):
    """
    @summary: Loads a BIOM file and returns the count of each observation.
    @param in_biom: [str] Path to the BIOM.
    @return: [dict] The count by observation name.
    """
    biom = BiomIO.from_json(in_biom)
    count_by_obs = dict(
        (obs_name, biom.get_observation_count(obs_name))
        for obs_name in biom.get_observations_names()
    )
    # The BIOM object is released before returning
    del biom
    return count_by_obs
def get_bootstrap_distrib(input_biom, bootstrap_tag, multiple_tag):
    """
    @summary: Counts, by taxonomic rank, the clusters and the sequences for each bootstrap value.
    @param input_biom: [str] The path to the processed BIOM.
    @param bootstrap_tag: [str] The metadata tag used in BIOM file to store the taxonomy bootstraps.
    @param multiple_tag: [str] The metadata tag used in BIOM file to store the list of possible taxonomies. With None the bootstraps are read directly in the observation metadata.
    @returns: [dict] By taxonomic rank the count for the different bootstrap categories.
              Example:
                {
                  "Phylum": {
                     "80": { "clstr": 1, "seq":100 },
                     "90": { "clstr": 2, "seq":400 },
                     "100": { "clstr": 50, "seq":20000 },
                  },
                  "Genus":{
                     "80":{ "clstr": 1, "seq":100 },
                     "90":{ "clstr": 2, "seq":400 },
                     "100":{ "clstr": 50, "seq":20000 },
                  }
                }
    @note: The ranks names come from the module-level variable args.taxonomic_ranks.
    """
    counts_by_rank = dict()
    biom = BiomIO.from_json(input_biom)
    for observation in biom.get_observations():
        metadata = observation['metadata']
        # Find the bootstraps of the observation
        bootstrap = None
        if multiple_tag is None:
            if bootstrap_tag in metadata:
                bootstrap = metadata[bootstrap_tag]
        elif multiple_tag in metadata:
            affiliations = metadata[multiple_tag]
            if affiliations is not None and len(affiliations) > 0:
                # Only the first affiliation is used
                bootstrap = affiliations[0][bootstrap_tag]
        if bootstrap is None:
            continue
        # Add the observation in each rank
        for depth, rank_value in enumerate(bootstrap):
            category = rank_value * 100
            rank_name = args.taxonomic_ranks[depth]
            rank_counts = counts_by_rank.setdefault(rank_name, dict())
            category_counts = rank_counts.setdefault(category, {"clstr": 0, "seq": 0})
            category_counts["clstr"] += 1
            category_counts["seq"] += biom.get_observation_count(observation['id'])
    del biom
    return counts_by_rank
def get_obs_from_biom(in_biom):
    """
    @summary: Returns, for a BIOM file, the count associated to each observation.
    @param in_biom: [str] Path to the BIOM.
    @return: [dict] The counts by observation name.
    """
    biom = BiomIO.from_json(in_biom)
    observations_names = biom.get_observations_names()
    counts = {name: biom.get_observation_count(name) for name in observations_names}
    del biom
    return counts
def excluded_obs_on_nBiggest( input_biom, nb_selected, excluded_file ):
    """
    @summary: Writes the list of all the observations without the n most abundant.
    @param input_biom: [str] The path to the BIOM file.
    @param nb_selected: [int] The number of the most abundant observations that will not be written in the excluded list.
    @param excluded_file: [str] The path to the output file (one observation name by line).
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the file even if an error occurs during the sort or the write
    with open( excluded_file, "w" ) as FH_excluded_file:
        # Elements are (name, count) pairs, sorted by descending count
        sorted_obs_counts = sorted( biom.get_observations_counts(), key=lambda observation: observation[1], reverse=True )
        for observation in sorted_obs_counts[nb_selected:]:
            FH_excluded_file.write( observation[0] + "\n" )
def getCheckedAbunByRank(real_tax, input_biom, sample, taxonomy_key, multi_affiliation, logfile):
    """
    @summary: Returns for one sample the taxonomies found and the counts by taxon for each taxonomic depth.
    @param real_tax: [dict] Taxonomy by reference IDs.
    @param input_biom: [str] Path to BIOM file.
    @param sample: [str] sample name.
    @param taxonomy_key: [str] The metadata key for taxonomy (used only without multi-affiliation).
    @param multi_affiliation: [bool] If true, the taxonomy is selected by selectOneMultiaffiliation() from the metadata "blast_affiliations".
    @param logfile: Transmitted to selectOneMultiaffiliation().
    @return: [tuple] - The number of observations with a count > 0 in the sample,
                     - the list of all the distinct possible taxonomies (joined by ";"),
                     - the list of the distinct selected taxonomies (joined by ";"),
                     - the list, by depth, of dictionaries of count by taxon.
    """
    abund_by_rank = list()
    tax_list = list()
    full_tax_list = list()
    nb_seq = 0
    biom = BiomIO.from_json(input_biom)
    for observation in biom.get_observations():
        count = biom.get_count(observation["id"], sample)
        if count > 0:
            nb_seq += 1
            # Get taxonomy
            if not multi_affiliation:  # Standard affiliation
                taxonomy_clean = getCleanedTaxonomy(observation["metadata"][taxonomy_key])
                if ";".join(taxonomy_clean) not in full_tax_list:
                    full_tax_list.append(";".join(taxonomy_clean))
            else:  # Multi-affiliation
                possible_taxonomies = [
                    getCleanedTaxonomy(affi["taxonomy"])
                    for affi in observation["metadata"]["blast_affiliations"]
                ]
                for taxonomy_clean in possible_taxonomies:
                    if ";".join(taxonomy_clean) not in full_tax_list:
                        full_tax_list.append(";".join(taxonomy_clean))
                taxonomy_clean = selectOneMultiaffiliation(real_tax, observation["id"], possible_taxonomies, logfile)
            if ";".join(taxonomy_clean) not in tax_list:
                tax_list.append(";".join(taxonomy_clean))
            # Store count
            for depth in range(len(taxonomy_clean)):
                if len(abund_by_rank) < depth + 1:
                    abund_by_rank.append(dict())
                taxon = ";".join(taxonomy_clean[:depth + 1])  # prevent bug with same sp name but with different ancestors
                # dict.get replaces has_key (removed in Python 3)
                abund_by_rank[depth][taxon] = abund_by_rank[depth].get(taxon, 0) + count
    return nb_seq, full_tax_list, tax_list, abund_by_rank
def get_results(biom_file):
    """
    @summary: Returns the results of the affiliation.
    @param biom_file: [str] Path to a BIOM file after affiliation.
    @return: [tuple] The global results and the sample results.
             The global results contain the number of clusters and sequences, the number
             with an affiliation ("blast_taxonomy" is not None) and, by taxonomic depth, the
             number of clusters and sequences with the taxon "Multi-affiliation".
             The samples results contain, by sample name, the number of clusters and
             sequences and the number with an affiliation.
    """
    global_results = {
        "nb_clstr": 0,
        "nb_seq": 0,
        "nb_clstr_with_affi": 0,
        "nb_seq_with_affi": 0,
        "nb_clstr_ambiguous": list(),
        "nb_seq_ambiguous": list(),
    }
    samples_results = dict()
    biom = BiomIO.from_json(biom_file)
    for cluster in biom.get_observations():
        nb_seq = biom.get_observation_count(cluster["id"])
        blast_taxonomy = cluster["metadata"]["blast_taxonomy"]
        # Global results
        global_results["nb_clstr"] += 1
        global_results["nb_seq"] += nb_seq
        if blast_taxonomy is not None:
            global_results["nb_clstr_with_affi"] += 1
            global_results["nb_seq_with_affi"] += nb_seq
            for depth, taxon in enumerate(blast_taxonomy):
                # The lists by depth grow with the deepest taxonomy seen
                if len(global_results["nb_clstr_ambiguous"]) < (depth + 1):
                    global_results["nb_clstr_ambiguous"].append(0)
                    global_results["nb_seq_ambiguous"].append(0)
                if taxon == "Multi-affiliation":
                    global_results["nb_clstr_ambiguous"][depth] += 1
                    global_results["nb_seq_ambiguous"][depth] += nb_seq
        # Samples results
        for sample in biom.get_samples_by_observation(cluster["id"]):
            sample_name = sample["id"]
            # "in" replaces has_key (removed in Python 3)
            if sample_name not in samples_results:
                samples_results[sample_name] = {
                    "nb_clstr": 0,
                    "nb_seq": 0,
                    "nb_clstr_with_affi": 0,
                    "nb_seq_with_affi": 0
                }
            count = biom.get_count(cluster["id"], sample_name)
            if count > 0:
                samples_results[sample_name]["nb_clstr"] += 1
                samples_results[sample_name]["nb_seq"] += count
                if blast_taxonomy is not None:
                    samples_results[sample_name]["nb_clstr_with_affi"] += 1
                    samples_results[sample_name]["nb_seq_with_affi"] += count
    return global_results, samples_results
def excluded_obs_on_samplePresence(input_biom, min_sample_presence, excluded_file):
    """
    @summary: Writes the list of the observations present in an insufficient number of samples.
    @param input_biom: [str] The path to the BIOM file to check.
    @param min_sample_presence: [int] The observations present in a number of samples inferior than this value are reported in the excluded file.
    @param excluded_file: [str] The path to the output file (one observation name by line).
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the file even if an error occurs during the scan
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation_name in biom.get_observations_names():
            # get_samples_by_observation() is only iterated: its elements are counted without building a list
            nb_samples = sum(1 for x in biom.get_samples_by_observation(observation_name))
            if nb_samples < min_sample_presence:
                FH_excluded_file.write( observation_name + "\n" )
def get_alignment_distrib( input_biom, identity_tag, coverage_tag, multiple_tag ):
    """
    @summary: Returns the count (seq and clstr) for the different couples identity/coverage.
              The observations without alignment information are not counted.
    @param input_biom: [str] The path to the processed BIOM.
    @param identity_tag: [str] The metadata tag used in BIOM file to store the alignment identity.
    @param coverage_tag: [str] The metadata tag used in BIOM file to store the alignment query coverage.
    @param multiple_tag: [str] The metadata tag used in BIOM file to store the list of possible taxonomies. With None the identity and the coverage are read directly in the observation metadata.
    @returns: [list] The count for the different identity/coverage.
              Example:
                [
                  [100, 100, { "clstr": 53, "seq": 20500 }],
                  [99, 100, { "clstr": 35, "seq": 18000 }],
                  [90, 95, { "clstr": 1, "seq": 10 }],
                ]
    """
    biom = BiomIO.from_json( input_biom )
    aln_results = list()
    aln_results_hash = dict()
    for observation in biom.get_observations():
        observation_metadata = observation['metadata']
        identity = None
        coverage = None
        # The parameter is used instead of the global args.multiple_tag
        if multiple_tag is not None:
            affiliations = observation_metadata.get(multiple_tag)
            if affiliations is not None and len(affiliations) > 0:
                # Only the first affiliation is used
                identity = affiliations[0][identity_tag]
                coverage = affiliations[0][coverage_tag]
        else:
            if identity_tag in observation_metadata and coverage_tag in observation_metadata:
                identity = observation_metadata[identity_tag]
                coverage = observation_metadata[coverage_tag]
        if identity is not None:
            coverage_counts = aln_results_hash.setdefault( identity, dict() )
            counts = coverage_counts.setdefault( coverage, { "clstr": 0, "seq": 0 } )
            counts["clstr"] += 1
            counts["seq"] += biom.get_observation_count( observation['id'] )
    # Flatten the two levels dictionary
    for ident in aln_results_hash:
        for cover in aln_results_hash[ident]:
            aln_results.append([ ident, cover, aln_results_hash[ident][cover] ])
    del biom
    return aln_results
def get_results( biom_file ):
    """
    @summary: Returns the results of the affiliation.
    @param biom_file: [str] Path to a BIOM file after affiliation.
    @return: [tuple] The global results and the sample results.
             Global results: number of clusters/sequences, number of clusters/sequences with
             an affiliation ("blast_taxonomy" is not None) and, by taxonomic depth, number of
             clusters/sequences with the taxon "Multi-affiliation".
             Samples results: by sample name the number of clusters/sequences and the number
             of clusters/sequences with an affiliation.
    """
    global_results = {
        "nb_clstr": 0,
        "nb_seq": 0,
        "nb_clstr_with_affi": 0,
        "nb_seq_with_affi": 0,
        "nb_clstr_ambiguous": list(),
        "nb_seq_ambiguous": list(),
    }
    samples_results = dict()
    biom = BiomIO.from_json( biom_file )
    for cluster in biom.get_observations():
        nb_seq = biom.get_observation_count( cluster["id"] )
        has_affiliation = cluster["metadata"]["blast_taxonomy"] is not None
        # Global results
        global_results["nb_clstr"] += 1
        global_results["nb_seq"] += nb_seq
        if has_affiliation:
            global_results["nb_clstr_with_affi"] += 1
            global_results["nb_seq_with_affi"] += nb_seq
            for depth, taxon in enumerate(cluster["metadata"]["blast_taxonomy"]):
                # Extend the counters by depth when a deeper rank is met
                if len(global_results["nb_clstr_ambiguous"]) < (depth + 1):
                    global_results["nb_clstr_ambiguous"].append( 0 )
                    global_results["nb_seq_ambiguous"].append( 0 )
                if taxon == "Multi-affiliation":
                    global_results["nb_clstr_ambiguous"][depth] += 1
                    global_results["nb_seq_ambiguous"][depth] += nb_seq
        # Samples results
        for sample in biom.get_samples_by_observation( cluster["id"] ):
            sample_name = sample["id"]
            # setdefault replaces the has_key test (has_key was removed in Python 3)
            sample_results = samples_results.setdefault( sample_name, {
                "nb_clstr": 0,
                "nb_seq": 0,
                "nb_clstr_with_affi": 0,
                "nb_seq_with_affi": 0
            })
            count = biom.get_count(cluster["id"], sample_name)
            if count > 0:
                sample_results["nb_clstr"] += 1
                sample_results["nb_seq"] += count
                if has_affiliation:
                    sample_results["nb_clstr_with_affi"] += 1
                    sample_results["nb_seq_with_affi"] += count
    return global_results, samples_results
def excluded_obs_on_nBiggest(input_biom, nb_selected, excluded_file):
    """
    @summary: Writes the list of all the observations without the n most abundant.
    @param input_biom: [str] The path to the BIOM file.
    @param nb_selected: [int] The number of the most abundant observations that will not be written in the excluded list.
    @param excluded_file: [str] The path to the output file (one observation name by line).
    """
    biom = BiomIO.from_json(input_biom)
    # "with" guarantees the close of the file, even after an exception
    with open(excluded_file, "w") as FH_excluded_file:
        # Sort the (name, count) pairs by descending count
        sorted_obs_counts = sorted(biom.get_observations_counts(),
                                   key=lambda observation: observation[1],
                                   reverse=True)
        # The nb_selected first observations are kept, the others are reported
        for excluded in sorted_obs_counts[nb_selected:]:
            FH_excluded_file.write(excluded[0] + "\n")
def get_bootstrap_distrib( input_biom, bootstrap_tag, multiple_tag ):
    """
    @summary: Returns by taxonomic rank the count (seq and clstr) for the different bootstrap categories.
    @param input_biom: [str] The path to the processed BIOM.
    @param bootstrap_tag: [str] The metadata tag used in BIOM file to store the taxonomy bootstraps.
    @param multiple_tag: [str] The metadata tag used in BIOM file to store the list of possible taxonomies. With None the bootstraps are read directly in the observation metadata.
    @returns: [dict] By taxonomic rank the count for the different bootstrap categories.
              Example:
                {
                  "Phylum": {
                     "80": { "clstr": 1, "seq":100 },
                     "90": { "clstr": 2, "seq":400 },
                     "100": { "clstr": 50, "seq":20000 },
                  },
                  "Genus":{
                     "80":{ "clstr": 1, "seq":100 },
                     "90":{ "clstr": 2, "seq":400 },
                     "100":{ "clstr": 50, "seq":20000 },
                  }
                }
    @note: The ranks names are read in the module-level variable args.taxonomic_ranks.
    """
    bootstrap_results = dict()
    biom = BiomIO.from_json( input_biom )
    for observation in biom.get_observations():
        observation_metadata = observation['metadata']
        bootstrap = None
        if multiple_tag is not None:
            # The list can be missing, None or empty for an observation without affiliation
            affiliations = observation_metadata.get(multiple_tag)
            if affiliations is not None and len(affiliations) > 0:
                bootstrap = affiliations[0][bootstrap_tag]
        else:
            if bootstrap_tag in observation_metadata:
                bootstrap = observation_metadata[bootstrap_tag]
        if bootstrap is not None:
            for taxonomy_depth, rank_bootstrap in enumerate( bootstrap ):
                rank_bootstrap = rank_bootstrap * 100
                rank = args.taxonomic_ranks[taxonomy_depth]
                # "in" replaces has_key (removed in Python 3)
                if rank not in bootstrap_results:
                    bootstrap_results[rank] = dict()
                if rank_bootstrap not in bootstrap_results[rank]:
                    bootstrap_results[rank][rank_bootstrap] = {
                        "clstr": 0,
                        "seq": 0
                    }
                bootstrap_results[rank][rank_bootstrap]["clstr"] += 1
                bootstrap_results[rank][rank_bootstrap]["seq"] += biom.get_observation_count( observation['id'] )
    del biom
    return bootstrap_results
def get_alignment_distrib( input_biom, identity_tag, coverage_tag, multiple_tag ):
    """
    @summary: Returns the count (seq and clstr) for the different couples identity/coverage.
              The observations without alignment information are counted with an identity of 0 and a coverage of 0.
    @param input_biom: [str] The path to the processed BIOM.
    @param identity_tag: [str] The metadata tag used in BIOM file to store the alignment identity.
    @param coverage_tag: [str] The metadata tag used in BIOM file to store the alignment query coverage.
    @param multiple_tag: [str] The metadata tag used in BIOM file to store the list of possible taxonomies. With None the identity and the coverage are read directly in the observation metadata.
    @returns: [list] The count for the different identity/coverage.
              Example:
                [
                  [100, 100, { "clstr": 53, "seq": 20500 }],
                  [99, 100, { "clstr": 35, "seq": 18000 }],
                  [90, 95, { "clstr": 1, "seq": 10 }],
                ]
    """
    biom = BiomIO.from_json( input_biom )
    aln_results = list()
    aln_results_hash = dict()
    for observation in biom.get_observations():
        observation_metadata = observation['metadata']
        # Default values for the observations without alignment
        identity = 0
        coverage = 0
        # The parameter is used instead of the global args.multiple_tag
        if multiple_tag is not None:
            affiliations = observation_metadata.get(multiple_tag)
            if affiliations is not None and len(affiliations) > 0:
                # Only the first affiliation is used
                identity = affiliations[0][identity_tag]
                coverage = affiliations[0][coverage_tag]
        else:
            if identity_tag in observation_metadata and coverage_tag in observation_metadata:
                identity = observation_metadata[identity_tag]
                coverage = observation_metadata[coverage_tag]
        # "in" replaces has_key (removed in Python 3)
        if identity not in aln_results_hash:
            aln_results_hash[identity] = dict()
        if coverage not in aln_results_hash[identity]:
            aln_results_hash[identity][coverage] = {
                "clstr": 0,
                "seq": 0
            }
        aln_results_hash[identity][coverage]["clstr"] += 1
        aln_results_hash[identity][coverage]["seq"] += biom.get_observation_count( observation['id'] )
    # Flatten the two levels dictionary
    for ident in aln_results_hash:
        for cover in aln_results_hash[ident]:
            aln_results.append([ ident, cover, aln_results_hash[ident][cover] ])
    del biom
    return aln_results
def excluded_obs_on_samplePresence(input_biom, min_sample_presence, excluded_file):
    """
    @summary: Writes the list of the observations present in an insufficient number of samples.
    @param input_biom: [str] The path to the BIOM file to check.
    @param min_sample_presence: [int] The observations present in a number of samples inferior than this value are reported in the excluded file.
    @param excluded_file: [str] The path to the output file (one observation name by line).
    """
    biom = BiomIO.from_json(input_biom)
    # "with" guarantees the close of the file, even after an exception
    with open(excluded_file, "w") as FH_excluded_file:
        for observation_name in biom.get_observations_names():
            # Count the elements returned for the observation without storing them
            nb_samples = sum(
                1 for x in biom.get_samples_by_observation(observation_name))
            if nb_samples < min_sample_presence:
                FH_excluded_file.write(observation_name + "\n")
def get_tree_with_count( input_biom, compress=False, taxonomy_key="taxonomy" ):
    """
    @summary: Builds from a BIOM file the tree of taxa with their counts by sample.
    @param input_biom: [str] Path to the BIOM file processed.
    @param compress: [bool] If true the index of the sample is transmitted to update_tree_for_sample() instead of None (the samples names are replaced by samples index).
    @param taxonomy_key: [str] The metadata title for the taxonomy in biom.
    @return: [list] The tree generated and the ordered list of samples names (useful to retrieve name by index if you use compress).
    """
    samples_names = list()
    root = Node("root")
    biom = BiomIO.from_json( input_biom )
    for sample_idx, sample_name in enumerate(biom.get_samples_names()):
        samples_names.append( sample_name )
        # The index in samples_names identifies the sample only in compress mode
        sample_id = sample_idx if compress else None
        update_tree_for_sample( biom, root, sample_name, taxonomy_key, sample_id )
    return root, samples_names
def __init__(self, out_tsv, in_biom, in_fasta=None):
    """
    @summary: Builds the command line of biom2tsv.py from the metadata found in the BIOM file.
    @param out_tsv: [str] Path to output TSV file.
    @param in_biom: [str] Path to BIOM file.
    @param in_fasta: [str] Path to the sequences file. If it is provided, the option --input-fasta and the field '@seed_sequence' are added to the command.
    """
    # Sequence file option
    if in_fasta is None:
        sequence_file_opt = ""
    else:
        sequence_file_opt = " --input-fasta " + in_fasta

    # Check the metadata
    biom = BiomIO.from_json(in_biom)
    first_obs = biom.rows[0]
    fields = list()
    if biom.has_observation_metadata('comment'):
        fields.append("'comment'")
    if biom.has_observation_metadata('rdp_taxonomy') and biom.has_observation_metadata('rdp_bootstrap'):
        fields.append("'@rdp_tax_and_bootstrap'")
    if biom.has_observation_metadata('blast_taxonomy'):
        fields.append("'blast_taxonomy'")
    if biom.has_observation_metadata('blast_affiliations'):
        fields.extend([
            "'@blast_subject'",
            "'@blast_perc_identity'",
            "'@blast_perc_query_coverage'",
            "'@blast_evalue'",
            "'@blast_aln_length'"
        ])
    if biom.has_observation_metadata('seed_id'):
        fields.append("'seed_id'")
    if in_fasta is not None:
        fields.append("'@seed_sequence'")

    # The metadata of the first observation that are not managed above are added as they are
    frogs_metadata = ["comment", "rdp_taxonomy", "rdp_bootstrap", "blast_taxonomy", "blast_affiliations", "seed_id"]
    first_obs_metadata = biom.get_observation_metadata(first_obs["id"])
    if first_obs_metadata is not None:
        for metadata in first_obs_metadata:
            if metadata not in frogs_metadata:
                fields.append("'" + metadata + "'")
    fields.append("'@observation_name' '@observation_sum' '@sample_count'")
    conversion_tags = " ".join(fields)

    # Set command
    parameters = "--input-file " + in_biom + sequence_file_opt + " --output-file " + out_tsv + " --fields " + conversion_tags
    Cmd.__init__(self, 'biom2tsv.py', 'Converts a BIOM file in TSV file.', parameters, '--version')
def getCheckedAbunByRank( real_tax, input_biom, sample, taxonomy_key, multi_affiliation, duplication_groups ):
    """
    @summary: Returns for one sample the count by taxon for each taxonomic depth. When the
              expected reference (metadata "grinder_source") is considered as retrieved, the
              taxonomy of the observation is replaced by the real taxonomy of this reference.
    @param real_tax: [dict] Taxonomy by reference IDs.
    @param input_biom: [str] Path to BIOM file.
    @param sample: [str] sample name.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param multi_affiliation: [bool] If true, the metadata "blast_affiliations" is used to manage the ambiguous affiliations.
    @param duplication_groups: [dict] By reference ID the list of IDs for references with the same sequence.
    @return: [list] By depth the dictionary of count by taxon (the taxon is the ";" joined lineage).
    """
    abund_by_rank = list()
    biom = BiomIO.from_json( input_biom )
    for observation in biom.get_observations():
        count = biom.get_count( observation["id"], sample )
        if count > 0:
            # Get taxonomy
            ref_id = observation["metadata"]["grinder_source"]
            taxonomy_clean = getCleanedTaxonomy(observation["metadata"][taxonomy_key])
            is_chimera = "," in ref_id  # A chimera has several sources separated by ","
            if not multi_affiliation:  # Standard affiliation
                if not is_chimera:
                    if taxIsRetrieved(real_tax[ref_id], [taxonomy_clean]):
                        taxonomy_clean = real_tax[ref_id][0]
            else:  # Multi-affiliation
                if not is_chimera:
                    subjects_ids = [affi["subject"] for affi in observation["metadata"]["blast_affiliations"]]
                    possible_taxonomies = [";".join(getCleanedTaxonomy(affi["taxonomy"])) for affi in observation["metadata"]["blast_affiliations"]]
                    # Manage ambiguity
                    if refIDIsRetrieved(ref_id, subjects_ids, duplication_groups):
                        taxonomy_clean = real_tax[ref_id][0]
                    elif len(subjects_ids) > 499 and taxIsRetrieved(real_tax[ref_id], possible_taxonomies):
                        # NOTE(review): 499 looks like a limit on the number of stored affiliations - confirm
                        taxonomy_clean = real_tax[ref_id][0]
                    elif "Multi-affiliation" in taxonomy_clean:
                        taxonomy_clean = getCleanedTaxonomy(observation["metadata"]["blast_affiliations"][0]["taxonomy"])  # Select one
                else:  # Chimera
                    if "Multi-affiliation" in taxonomy_clean:
                        taxonomy_clean = getCleanedTaxonomy(observation["metadata"]["blast_affiliations"][0]["taxonomy"])  # Select one
            # Store count
            for depth in range(len(taxonomy_clean)):
                if len(abund_by_rank) < depth+1:
                    abund_by_rank.append(dict())
                taxon = ";".join( taxonomy_clean[:depth+1] )  # prevent bug with same sp name but with different ancestors
                # dict.get replaces has_key (removed in Python 3)
                abund_by_rank[depth][taxon] = abund_by_rank[depth].get(taxon, 0) + count
    return abund_by_rank
def aff_to_metadata(reference_file, biom_in, biom_out, blast_files=None, rdp_files=None):
    """
    @summary: Add taxonomy metadata on biom file from a blast result and/or from RDPClassifier results.
              With blast_files the metadata "blast_affiliations" and "blast_taxonomy" are set on every
              observation; with rdp_files the metadata "rdp_taxonomy" and "rdp_bootstrap" are set on every
              observation. An observation without result gets None (an empty list for "blast_affiliations").
    @param reference_file: [str] The path to the reference file.
    @param biom_in: [str] The path to the Biom file to process.
    @param biom_out: [str] The path to the biom output file.
    @param blast_files: [list] the list of the path to the blast results in tabular format (outfmt 6 with NCBI Blast+).
    @param rdp_files: [list] the list of path to the RDPClassifier results.
    """
    # Build an hash with the taxonomy for each gene (key=gene_id ; value=gene_taxonomy)
    taxonomy_by_reference = get_tax_from_fasta( reference_file )

    # Retrieve blast clusters annotations
    cluster_blast_annot = dict()
    if blast_files is not None:
        cluster_blast_annot = get_bests_blast_affi( blast_files, taxonomy_by_reference )
    del taxonomy_by_reference

    # Retrieve rdp clusters annotations
    cluster_rdp_annot = dict()
    if rdp_files is not None:
        cluster_rdp_annot = get_rdp_affi( rdp_files )

    # Add metadata to biom
    biom = BiomIO.from_json(biom_in)
    for cluster in biom.get_observations():
        cluster_id = cluster["id"]
        # Blast
        if blast_files is not None:
            blast_taxonomy = None
            blast_affiliations = list()
            # "in" replaces has_key (removed in Python 3)
            if cluster_id in cluster_blast_annot:  # Current observation has a match
                blast_taxonomy = get_tax_consensus( [alignment['taxonomy'] for alignment in cluster_blast_annot[cluster_id]['alignments']] )
                blast_affiliations = cluster_blast_annot[cluster_id]['alignments']
            biom.add_metadata( cluster_id, "blast_affiliations", blast_affiliations, "observation" )
            biom.add_metadata( cluster_id, "blast_taxonomy", blast_taxonomy, "observation" )
        # RDP
        if rdp_files is not None:
            rdp_taxonomy = None
            rdp_bootstrap = None
            if cluster_id in cluster_rdp_annot:
                rdp_taxonomy = cluster_rdp_annot[cluster_id]['taxonomy']
                rdp_bootstrap = cluster_rdp_annot[cluster_id]['bootstrap']
            biom.add_metadata(cluster_id, "rdp_taxonomy", rdp_taxonomy, "observation")
            biom.add_metadata(cluster_id, "rdp_bootstrap", rdp_bootstrap, "observation")
    BiomIO.write(biom_out, biom)
def excluded_obs_on_rdpBootstrap(input_biom, taxonomic_depth, min_bootstrap, excluded_file):
    """
    @summary: Writes the list of the observations with an insufficient bootstrap on the specified taxonomic rank.
    @param input_biom: [str] The path to the BIOM file to check.
    @param taxonomic_depth: [int] The taxonomic rank depth to check (example: 6 for Species in system "Domain, Phylum, Class, Order, Family, Genus, Species").
    @param min_bootstrap: [float] The observations with a value inferior to this threshold at the specified taxonomic depth are reported in the excluded file.
    @param excluded_file: [str] The path to the output file (one observation ID by line).
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the file even if an error occurs during the scan
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation in biom.get_observations():
            bootstrap = observation["metadata"]["rdp_bootstrap"]
            # The bootstraps can be stored in a string with ";" as separator
            # (hasattr also accepts the unicode strings of Python 2)
            if hasattr(bootstrap, "split"):
                bootstrap = bootstrap.split(";")
            # After a split the values are strings: the comparison must be numeric
            if float(bootstrap[taxonomic_depth]) < min_bootstrap:
                FH_excluded_file.write( str(observation["id"]) + "\n" )
def excluded_obs_on_abundance(input_biom, min_abundance, excluded_file):
    """
    @summary: Writes the list of the observations with an insufficient abundance.
    @param input_biom: [str] The path to the BIOM file to check.
    @param min_abundance: [int/float] The observations with an abundance inferior than this value are reported in the excluded file. A float is used as a proportion of the total count of the BIOM, an int is used as a number of sequences.
    @param excluded_file: [str] The path to the output file (one observation ID by line).
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the file even if an error occurs during the scan
    with open( excluded_file, "w" ) as FH_excluded_file:
        min_nb_seq = min_abundance
        if isinstance(min_abundance, float):
            # Proportion: converted in number of sequences
            min_nb_seq = biom.get_total_count() * min_abundance
        for idx, count_by_sample in enumerate(biom.to_count()):
            observation = biom.rows[idx]
            abundance = sum(count_by_sample)
            if abundance < min_nb_seq:
                FH_excluded_file.write( str(observation["id"]) + "\n" )
def get_realTax(taxonomy_key, input_biom):
    """
    @summary: Returns the list of the distinct cleaned taxonomies of the observations of a BIOM file.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param input_biom: [str] Path to BIOM file.
    @return: [list] The distinct taxonomies (ranks joined by ";") in order of first appearance.
    """
    tax_list = list()
    seen_taxonomies = set()
    biom = BiomIO.from_json(input_biom)
    for observation in biom.get_observations():
        taxonomy_clean = getCleanedTaxonomy(observation["metadata"][taxonomy_key])
        # The comparison is done on the joined string: it is the form stored in the list
        taxonomy_str = ";".join(taxonomy_clean)
        if taxonomy_str not in seen_taxonomies:
            seen_taxonomies.add(taxonomy_str)
            tax_list.append(taxonomy_str)
    return tax_list
def excluded_obs_on_rdpBootstrap(input_biom, taxonomic_depth, min_bootstrap, excluded_file):
    """
    @summary: Writes the list of the observations with an insufficient bootstrap on the specified taxonomic rank.
    @param input_biom: [str] The path to the BIOM file to check.
    @param taxonomic_depth: [int] The taxonomic rank depth to check (example: 6 for Species in system "Domain, Phylum, Class, Order, Family, Genus, Species").
    @param min_bootstrap: [float] The observations with a value inferior to this threshold at the specified taxonomic depth are reported in the excluded file.
    @param excluded_file: [str] The path to the output file (one observation ID by line).
    """
    biom = BiomIO.from_json(input_biom)
    # "with" guarantees the close of the file, even after an exception
    with open(excluded_file, "w") as FH_excluded_file:
        for observation in biom.get_observations():
            bootstrap = observation["metadata"]["rdp_bootstrap"]
            # A string of ";" separated values is converted in list
            # (the test on the split method also accepts the unicode strings of Python 2)
            if hasattr(bootstrap, "split"):
                bootstrap = bootstrap.split(";")
            # The value can be a string after the split: it is converted before the comparison
            rank_bootstrap = float(bootstrap[taxonomic_depth])
            if rank_bootstrap < min_bootstrap:
                FH_excluded_file.write(str(observation["id"]) + "\n")
def biom_to_tsv( input_biom, output_tsv, fields, list_separator ):
    """
    @summary: Convert BIOM file to TSV file.
    @param input_biom: [str] Path to the BIOM file.
    @param output_tsv: [str] Path to the output file (format : TSV).
    @param fields: [list] Columns and their order in output. Special columns : '@observation_name', '@observation_sum', '@sample_count' '@rdp_tax_and_bootstrap' . The others columns must be metadata title.
    @param list_separator: [str] Separator for complex metadata.
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the file even if the build of a line fails
    with open( output_tsv, "w" ) as out_fh:
        # Header
        header_parts = header_line_parts( fields, biom )
        out_fh.write( "#" + "\t".join(header_parts) + "\n" )
        # Data
        for obs_idx, count_by_sample in enumerate(biom.to_count()):
            observation_parts = observation_line_parts( biom.rows[obs_idx], count_by_sample, fields, list_separator )
            out_fh.write( "\t".join(observation_parts) + "\n" )
def excluded_obs_on_abundance(input_biom, min_abundance, excluded_file):
    """
    @summary: Writes the list of the observations with an insufficient abundance.
    @param input_biom: [str] The path to the BIOM file to check.
    @param min_abundance: [int/float] The observations with an abundance inferior than this value are reported in the excluded file. A float is a proportion of the total count of the BIOM, an int is a number of sequences.
    @param excluded_file: [str] The path to the output file (one observation ID by line).
    """
    biom = BiomIO.from_json(input_biom)
    # "with" guarantees the close of the file, even after an exception
    with open(excluded_file, "w") as FH_excluded_file:
        # Threshold in number of sequences
        if isinstance(min_abundance, float):
            min_nb_seq = biom.get_total_count() * min_abundance
        else:
            min_nb_seq = min_abundance
        # The abundance of an observation is the sum of its counts in all the samples
        for idx, count_by_sample in enumerate(biom.to_count()):
            if sum(count_by_sample) < min_nb_seq:
                FH_excluded_file.write(str(biom.rows[idx]["id"]) + "\n")
def process(args):
    """
    @summary: Computes the rarefaction and the taxonomy tree of a BIOM file, then writes the summary.
              The temporary files are deleted at the end of the process except in debug mode.
    @param args: The parameters of the script. The attributes read here are: input_biom,
                 output_file, log_file, taxonomy_tag, multiple_tag, tax_consensus_tag,
                 taxonomic_ranks, rarefaction_ranks and debug.
    """
    out_dir = os.path.split(args.output_file)[0]
    tmp_files = TmpFiles(out_dir)
    try:
        # Add temp taxonomy if multiple and without consensus
        tmp_biom = args.input_biom
        used_taxonomy_tag = args.taxonomy_tag
        if args.multiple_tag is not None:
            used_taxonomy_tag = args.tax_consensus_tag
            if args.tax_consensus_tag is None:
                # The taxonomy of the first affiliation is copied in a temporary metadata
                used_taxonomy_tag = "Used_taxonomy_FROGS-affi"
                tmp_biom = tmp_files.add("tax.biom")
                biom = BiomIO.from_json(args.input_biom)
                for observation in biom.get_observations():
                    metadata = observation["metadata"]
                    affiliations = metadata[args.multiple_tag]
                    if affiliations is not None and len(affiliations) > 0:
                        metadata[used_taxonomy_tag] = affiliations[0][args.taxonomy_tag]
                BiomIO.write(tmp_biom, biom)
                del biom

        # Rarefaction
        tax_depth = list()
        for rank in args.rarefaction_ranks:
            tax_depth.append(args.taxonomic_ranks.index(rank))
        rarefaction_cmd = Rarefaction(tmp_biom, tmp_files, used_taxonomy_tag, tax_depth)
        rarefaction_cmd.submit(args.log_file)
        rarefaction_files = rarefaction_cmd.output_files

        # Taxonomy tree
        tree_count_file = tmp_files.add("taxCount.enewick")
        tree_ids_file = tmp_files.add("taxCount_ids.tsv")
        tree_cmd = TaxonomyTree(tmp_biom, used_taxonomy_tag, tree_count_file, tree_ids_file)
        tree_cmd.submit(args.log_file)

        # Writes summary
        write_summary(args.output_file, args.input_biom, tree_count_file, tree_ids_file, rarefaction_files, args)
    finally:
        if not args.debug:
            tmp_files.deleteAll()
def biom_to_tsv(input_biom, output_tsv, fields, list_separator):
    """
    @summary: Convert BIOM file to TSV file.
    @param input_biom: [str] Path to the BIOM file.
    @param output_tsv: [str] Path to the output file (format : TSV).
    @param fields: [list] Columns and their order in output. Special columns : '@observation_name', '@observation_sum', '@sample_count'. The others columns must be metadata title.
    @param list_separator: [str] Separator for complex metadata.
    """
    biom = BiomIO.from_json(input_biom)
    # "with" guarantees the close of the file, even after an exception
    with open(output_tsv, "w") as out_fh:
        # Header
        line = list()
        for current_field in fields:
            if current_field == '@observation_name':
                line.append("observation_name")
            elif current_field == '@sample_count':
                # One column by sample
                line.append("\t".join(biom.get_samples_names()))
            elif current_field == '@observation_sum':
                line.append("observation_sum")
            else:  # metadata
                line.append(str(current_field))
        out_fh.write("#" + "\t".join(line) + "\n")
        # Data
        for idx, count_by_sample in enumerate(biom.to_count()):
            observation = biom.rows[idx]
            line = list()
            for current_field in fields:
                if current_field == '@observation_name':
                    line.append(str(observation['id']))
                elif current_field == '@sample_count':
                    line.append("\t".join(map(str, count_by_sample)))
                elif current_field == '@observation_sum':
                    line.append(str(sum(count_by_sample)))
                else:  # metadata
                    metadata_value = observation['metadata'][current_field]
                    if isinstance(metadata_value, list):
                        # The elements are converted: join() fails on the lists of numbers
                        line.append(list_separator.join(map(str, metadata_value)))
                    else:
                        line.append(str(metadata_value))
            out_fh.write("\t".join(line) + "\n")
def biom_fasta_update(biom_in, fasta_in, fasta_out, log_file):
    """
    @summary: Writes in a new fasta file the sequences whose ID is an observation of the BIOM file and logs the number of sequences read and written.
    @param biom_in: [str] Path to the BIOM file.
    @param fasta_in: [str] Path to the fasta file to filter.
    @param fasta_out: [str] Path to the filtered fasta file.
    @param log_file: [str] Path to the log file.
    """
    seq_in = 0
    seq_out = 0
    FH_in = FastaIO( fasta_in )
    try:
        FH_out = FastaIO( fasta_out, "w" )
        try:
            biom = BiomIO.from_json( biom_in )
            for record in FH_in:
                seq_in += 1
                try:
                    biom.find_idx("observation", record.id)
                except ValueError:
                    # The sequence is not an observation of the BIOM: it is skipped
                    pass
                else:
                    FH_out.write(record)
                    seq_out += 1
        finally:
            FH_out.close()
    finally:
        FH_in.close()
    # The context manager closes the log
    with open(log_file, "w") as FH_log:
        FH_log.write("Number of sequence in :" + str(seq_in)+"\n" )
        FH_log.write("Number of sequence out :" + str(seq_out) +"\n")
def biom_to_tsv( input_biom, output_tsv, fields, list_separator ):
    """
    @summary: Convert BIOM file to TSV file.
    @param input_biom: [str] Path to the BIOM file.
    @param output_tsv: [str] Path to the output file (format : TSV).
    @param fields: [list] Columns and their order in output. Special columns : '@observation_name', '@observation_sum', '@sample_count'. The others columns must be metadata title.
    @param list_separator: [str] Separator for complex metadata.
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the file even if the build of a line fails
    with open( output_tsv, "w" ) as out_fh:
        # Header
        line = list()
        for current_field in fields:
            if current_field == '@observation_name':
                line.append( "observation_name" )
            elif current_field == '@sample_count':
                # One column by sample
                line.append( "\t".join(biom.get_samples_names()) )
            elif current_field == '@observation_sum':
                line.append( "observation_sum" )
            else: #metadata
                line.append( str(current_field) )
        out_fh.write( "#" + "\t".join(line) + "\n" )
        # Data
        for idx, count_by_sample in enumerate(biom.to_count()):
            observation = biom.rows[idx]
            line = list()
            for current_field in fields:
                if current_field == '@observation_name':
                    line.append( str(observation['id']) )
                elif current_field == '@sample_count':
                    line.append( "\t".join(map(str, count_by_sample)) )
                elif current_field == '@observation_sum':
                    line.append( str(sum(count_by_sample)) )
                else: #metadata
                    current_value = observation['metadata'][current_field]
                    if isinstance(current_value, list):
                        # str() on each element: join() only accepts strings
                        line.append( list_separator.join(map(str, current_value)) )
                    else:
                        line.append( str(current_value) )
            out_fh.write( "\t".join(line) + "\n" )
def samples_hclassification( input_biom, output_newick, distance_method, linkage_method ):
    """
    @summary : Computes the hierarchical classification of the samples of a BIOM and writes it in newick format.
               The counts are normalised by the sum of the sample before the computation of the distances.
    @param input_biom : [str] Path to the BIOM file to process.
    @param output_newick : [str] Path to the newick output file.
    @param distance_method : [str] Used distance method for classify (transmitted to scipy pdist).
    @param linkage_method : [str] Used linkage method for classify (transmitted to scipy linkage).
    """
    from scipy.spatial.distance import pdist, squareform
    from scipy.cluster.hierarchy import linkage, dendrogram
    import scipy.cluster.hierarchy

    # Normalisation on count by sample
    samples_names = list()
    normalised_counts = list()
    biom = BiomIO.from_json( input_biom )
    for sample_idx, sample in enumerate(biom.columns):
        samples_names.append( sample['id'] )
        sample_sum = float(biom.data.get_col_sum( sample_idx ))
        normalised_counts.append(
            [biom.data.nb_at(obs_idx, sample_idx)/sample_sum for obs_idx in range(len(biom.rows))]
        )
    del biom

    only_one_sample = len(samples_names) == 1
    if not only_one_sample:
        # Computing the distance and linkage
        distances = pdist( normalised_counts, distance_method )
        links = linkage( distances, linkage_method )
        hc_tree = scipy.cluster.hierarchy.to_tree( links, rd=False )
        name_by_id = dict( enumerate(samples_names) )

    # Write newick
    out_fh = open( output_newick, "w" )
    if only_one_sample:
        # A classification is not possible: the tree contains only the sample
        out_fh.write( "(" + samples_names[0] + ");\n" )
    else:
        out_fh.write( to_newick(hc_tree, name_by_id) + "\n" )
    out_fh.close()