def excluded_obs_on_blastMetrics( input_biom, tag, cmp_operator, threshold, excluded_file ):
    """
    @summary: Writes the list of the observations that have no affiliation with a sufficient blast value.
    @param input_biom: [str] The path to the BIOM file to check.
    @param tag: [str] The key of the checked value in each blast affiliation.
    @param cmp_operator: [str] The operator used in the comparison (tag_value ">=" threshold or tag_value "<=" threshold).
    @param threshold: [float] The limit for the tag value.
    @param excluded_file: [str] The path to the output file (one observation ID per line).
    @raise KeyError: If cmp_operator is neither ">=" nor "<=".
    """
    valid_operators = {
        ">=": operator.ge,
        "<=": operator.le
    }
    cmp_func = valid_operators[cmp_operator]
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the output even if an observation is malformed
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation in biom.get_observations():
            # A missing (None) affiliations list is handled like an empty one: the observation is excluded
            alignments = observation["metadata"]["blast_affiliations"] or []
            is_kept = any( cmp_func(float(current_alignment[tag]), threshold) for current_alignment in alignments )
            if not is_kept:
                FH_excluded_file.write( str(observation["id"]) + "\n" )
def process( in_biom, out_biom, out_metadata ):
    """
    @summary: Extracts the blast affiliations of a BIOM in a TSV file and writes a BIOM where the other list metadata are flattened and the taxonomy comes from 'blast_taxonomy'.
    @param in_biom: [str] The path to the input BIOM file.
    @param out_biom: [str] The path to the output BIOM file.
    @param out_metadata: [str] The path to the output TSV file (one line per blast affiliation).
    """
    ordered_blast_keys = ["taxonomy", "subject", "evalue", "perc_identity", "perc_query_coverage", "aln_length"] # Keys in blast_affiliations metadata
    taxonomy_depth = 0
    unclassified_observations = list()

    # The context manager guarantees that the metadata file is flushed and closed
    with open( out_metadata, "w" ) as FH_metadata:
        FH_metadata.write( "#OTUID\t" + "\t".join(ordered_blast_keys) + "\n" )
        biom = BiomIO.from_json( in_biom )
        for observation in biom.get_observations():
            metadata = observation["metadata"]
            # Iterate on a snapshot of the keys because 'blast_affiliations' is deleted in the loop
            for metadata_key in list(metadata.keys()):
                value = metadata[metadata_key]
                if metadata_key == "blast_affiliations": # Extract blast_affiliations metadata in metadata_file
                    if value is not None:
                        for current_affi in value:
                            if isinstance(current_affi["taxonomy"], (list, tuple)):
                                current_affi["taxonomy"] = ";".join( current_affi["taxonomy"] )
                            FH_metadata.write( observation["id"] + "\t" + "\t".join([str(current_affi[item]) for item in ordered_blast_keys]) + "\n" )
                    del metadata[metadata_key]
                elif value is not None and isinstance(value, (list, tuple)): # All lists are transformed in string
                    metadata[metadata_key] = ";".join( map(str, value) )
            if "blast_taxonomy" in metadata:
                if metadata["blast_taxonomy"] is None:
                    unclassified_observations.append( observation["id"] )
                    metadata["taxonomy"] = list()
                else:
                    taxonomy = metadata["blast_taxonomy"].split(";")
                    # The depth of the last classified observation is used for the unclassified ones
                    taxonomy_depth = len(taxonomy)
                    metadata["taxonomy"] = taxonomy

    # Add "Unclassified" ranks in unclassified observations
    if taxonomy_depth > 0:
        for observation_id in unclassified_observations:
            observation_metadata = biom.get_observation_metadata(observation_id)
            observation_metadata["taxonomy"] = ["Unclassified"] * taxonomy_depth
    BiomIO.write( out_biom, biom )
def remove_observations( removed_observations, input_biom, output_biom ):
    """
    @summary: Writes a copy of the input BIOM without the specified observations.
    @param removed_observations: [list] The names of the observations to remove.
    @param input_biom: [str] The path to the BIOM that is read.
    @param output_biom: [str] The path to the BIOM that is written.
    """
    filtered_biom = BiomIO.from_json( input_biom )
    filtered_biom.remove_observations( removed_observations )
    BiomIO.write( output_biom, filtered_biom )
def process(in_biom, out_biom, out_metadata):
    """
    @summary: Extracts the blast affiliations of a BIOM in a TSV file and writes a BIOM where the other list metadata are flattened and the taxonomy comes from 'blast_taxonomy'.
    @param in_biom: [str] The path to the input BIOM file.
    @param out_biom: [str] The path to the output BIOM file.
    @param out_metadata: [str] The path to the output TSV file (one line per blast affiliation).
    """
    ordered_blast_keys = [
        "taxonomy", "subject", "evalue", "perc_identity",
        "perc_query_coverage", "aln_length"
    ]  # Keys in blast_affiliations metadata
    taxonomy_depth = 0
    unclassified_observations = list()

    # The context manager guarantees that the metadata file is flushed and closed
    with open(out_metadata, "w") as FH_metadata:
        FH_metadata.write("#OTUID\t" + "\t".join(ordered_blast_keys) + "\n")
        biom = BiomIO.from_json(in_biom)
        for observation in biom.get_observations():
            metadata = observation["metadata"]
            # Iterate on a snapshot of the keys because 'blast_affiliations' is deleted in the loop
            for metadata_key in list(metadata.keys()):
                value = metadata[metadata_key]
                if metadata_key == "blast_affiliations":
                    # Extract blast_affiliations metadata in metadata_file
                    if value is not None:
                        for current_affi in value:
                            if isinstance(current_affi["taxonomy"], (list, tuple)):
                                current_affi["taxonomy"] = ";".join(
                                    current_affi["taxonomy"])
                            FH_metadata.write(observation["id"] + "\t" + "\t".join([
                                str(current_affi[item])
                                for item in ordered_blast_keys
                            ]) + "\n")
                    del metadata[metadata_key]
                elif value is not None and isinstance(value, (list, tuple)):
                    # All lists are transformed in string
                    metadata[metadata_key] = ";".join(map(str, value))
            if "blast_taxonomy" in metadata:
                if metadata["blast_taxonomy"] is None:
                    unclassified_observations.append(observation["id"])
                    metadata["taxonomy"] = list()
                else:
                    taxonomy = metadata["blast_taxonomy"].split(";")
                    # The depth of the last classified observation is used for the unclassified ones
                    taxonomy_depth = len(taxonomy)
                    metadata["taxonomy"] = taxonomy

    # Add "Unclassified" ranks in unclassified observations
    if taxonomy_depth > 0:
        for observation_id in unclassified_observations:
            observation_metadata = biom.get_observation_metadata(
                observation_id)
            observation_metadata["taxonomy"] = ["Unclassified"] * taxonomy_depth
    BiomIO.write(out_biom, biom)
def excluded_obs_on_nBiggest( input_biom, nb_selected, excluded_file ):
    """
    @summary: Writes the list of all the observations except the nb_selected most abundant.
    @param input_biom: [str] The path to the BIOM file.
    @param nb_selected: [int] The number of the most abundant observations that will not be written in the excluded list.
    @param excluded_file: [str] The path to the output file (one observation name per line).
    """
    biom = BiomIO.from_json( input_biom )
    # Each element is a pair (observation name, observation count); most abundant first
    sorted_obs_counts = sorted( biom.get_observations_counts(), key=lambda observation: observation[1], reverse=True )
    # The context manager closes the output even if a write fails
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation_name, observation_count in sorted_obs_counts[nb_selected:]:
            FH_excluded_file.write( observation_name + "\n" )
def excluded_obs_on_samplePresence(input_biom, min_sample_presence, excluded_file):
    """
    @summary: Writes the list of the observations present in an insufficient number of samples.
    @param input_biom: [str] The path to the BIOM file to check.
    @param min_sample_presence: [int] The observations present in a number of samples lower than this value are reported in the excluded file.
    @param excluded_file: [str] The path to the output file (one observation name per line).
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the output even if the BIOM traversal fails
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation_name in biom.get_observations_names():
            # Count the elements without building the list of samples
            nb_samples = sum(1 for x in biom.get_samples_by_observation(observation_name))
            if nb_samples < min_sample_presence:
                FH_excluded_file.write( observation_name + "\n" )
def get_alignment_distrib( input_biom, identity_tag, coverage_tag, multiple_tag ):
    """
    @summary: Returns the count (seq and clstr) for each pair identity/coverage found in the observations.
    @param input_biom: The path to the processed BIOM.
    @param identity_tag: The metadata tag used in BIOM file to store the alignment identity.
    @param coverage_tag: The metadata tag used in BIOM file to store the alignment query coverage.
    @param multiple_tag: The metadata tag used in BIOM file to store the list of possible taxonomies. If it is None the identity and the coverage are read directly in the observation metadata.
    @returns: [list] The count for the different identity/coverage.
              Example:
                [
                  [100, 100, { "clstr": 53, "seq": 20500 }],
                  [99, 100, { "clstr": 35, "seq": 18000 }],
                  [90, 95, { "clstr": 1, "seq": 10 }],
                ]
    """
    biom = BiomIO.from_json( input_biom )
    aln_results_hash = dict()
    for observation in biom.get_observations():
        observation_metadata = observation['metadata']
        identity = None
        coverage = None
        # The function parameter is used (not the global script arguments)
        if multiple_tag is not None:
            if multiple_tag in observation_metadata and len(observation_metadata[multiple_tag]) > 0:
                # Only the first element of the list is taken into account
                first_affiliation = observation_metadata[multiple_tag][0]
                identity = first_affiliation[identity_tag]
                coverage = first_affiliation[coverage_tag]
        elif identity_tag in observation_metadata and coverage_tag in observation_metadata:
            identity = observation_metadata[identity_tag]
            coverage = observation_metadata[coverage_tag]
        if identity is not None:
            by_coverage = aln_results_hash.setdefault( identity, dict() )
            counts = by_coverage.setdefault( coverage, { "clstr": 0, "seq": 0 } )
            counts["clstr"] += 1
            counts["seq"] += biom.get_observation_count( observation['id'] )
    del biom
    # Flatten the two levels dictionary in a list of [identity, coverage, counts]
    aln_results = list()
    for ident in aln_results_hash:
        for cover in aln_results_hash[ident]:
            aln_results.append([ ident, cover, aln_results_hash[ident][cover] ])
    return aln_results
def get_bootstrap_distrib( input_biom, bootstrap_tag, multiple_tag, taxonomic_ranks=None ):
    """
    @summary: Returns by taxonomic rank the count (seq and clstr) for the different bootstrap categories.
    @param input_biom: The path to the processed BIOM.
    @param bootstrap_tag: The metadata tag used in BIOM file to store the taxonomy bootstraps.
    @param multiple_tag: The metadata tag used in BIOM file to store the list of possible taxonomies. If it is None the bootstraps are read directly in the observation metadata.
    @param taxonomic_ranks: [list] The names of the ranks ordered by depth. If it is None the global 'args.taxonomic_ranks' is used.
    @returns: [dict] By taxonomic rank the count for the different bootstrap categories. The categories are the bootstrap values multiplied by 100.
              Example:
                {
                    "Phylum": {
                        "80": { "clstr": 1, "seq":100 },
                        "90": { "clstr": 2, "seq":400 },
                        "100": { "clstr": 50, "seq":20000 },
                    },
                    "Genus":{
                        "80":{ "clstr": 1, "seq":100 },
                        "90":{ "clstr": 2, "seq":400 },
                        "100":{ "clstr": 50, "seq":20000 },
                    }
                }
    """
    bootstrap_results = dict()
    biom = BiomIO.from_json( input_biom )
    for observation in biom.get_observations():
        observation_metadata = observation['metadata']
        bootstrap = None
        if multiple_tag is not None:
            if multiple_tag in observation_metadata and len(observation_metadata[multiple_tag]) > 0:
                # Only the first element of the list is taken into account
                bootstrap = observation_metadata[multiple_tag][0][bootstrap_tag]
        elif bootstrap_tag in observation_metadata:
            bootstrap = observation_metadata[bootstrap_tag]
        if bootstrap is not None:
            if taxonomic_ranks is None:
                # Historical behavior: the ranks come from the global script arguments
                taxonomic_ranks = args.taxonomic_ranks
            observation_count = biom.get_observation_count( observation['id'] )
            for taxonomy_depth, rank_bootstrap in enumerate( bootstrap ):
                rank_bootstrap = rank_bootstrap * 100
                rank = taxonomic_ranks[taxonomy_depth]
                by_bootstrap = bootstrap_results.setdefault( rank, dict() )
                counts = by_bootstrap.setdefault( rank_bootstrap, { "clstr": 0, "seq": 0 } )
                counts["clstr"] += 1
                counts["seq"] += observation_count
    del biom
    return bootstrap_results
def excluded_obs_on_rdpBootstrap(input_biom, taxonomic_depth, min_bootstrap, excluded_file):
    """
    @summary: Writes the list of the observations with an insufficient bootstrap on the specified taxonomic rank.
    @param input_biom: [str] The path to the BIOM file to check.
    @param taxonomic_depth: [int] The taxonomic rank depth to check (example: 6 for Species in system "Domain, Phylum, Class, Order, Family, Genus, Species").
    @param min_bootstrap: [float] The observations with a value lower than this threshold at the specified taxonomic depth are reported in the excluded file.
    @param excluded_file: [str] The path to the output file (one observation ID per line).
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the output even if an observation is malformed
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation in biom.get_observations():
            bootstrap = observation["metadata"]["rdp_bootstrap"]
            # Any string type (str or unicode) is split: the bootstraps can be stored as "b1;b2;..."
            if hasattr(bootstrap, "split"):
                bootstrap = bootstrap.split(";")
            # The value is converted because the elements coming from a string are not numbers
            if float(bootstrap[taxonomic_depth]) < min_bootstrap:
                FH_excluded_file.write( str(observation["id"]) + "\n" )
def excluded_obs_on_abundance(input_biom, min_abundance, excluded_file):
    """
    @summary: Writes the list of the observations with an insufficient abundance.
    @param input_biom: [str] The path to the BIOM file to check.
    @param min_abundance: [int/float] The observations with an abundance lower than this value are reported in the excluded file. A float is interpreted as a proportion of the total count of the BIOM.
    @param excluded_file: [str] The path to the output file (one observation ID per line).
    """
    biom = BiomIO.from_json( input_biom )
    min_nb_seq = min_abundance
    if isinstance(min_abundance, float):
        # Convert the proportion in a number of sequences
        min_nb_seq = biom.get_total_count() * min_abundance
    # The context manager closes the output even if the BIOM traversal fails
    with open( excluded_file, "w" ) as FH_excluded_file:
        for idx, count_by_sample in enumerate(biom.to_count()):
            observation = biom.rows[idx]
            abundance = sum(count_by_sample)
            if abundance < min_nb_seq:
                FH_excluded_file.write( str(observation["id"]) + "\n" )
def get_step_size(self, nb_step=35):
    """
    @summary: Returns the step size to obtain 'nb_step' steps or more in 3/4 of samples.
    @param nb_step: [int] The number of expected steps.
    @returns: [int] The step size.
    @raise IndexError: If the BIOM does not contain any sample.
    """
    # Get the number of sequences by sample
    biom = BiomIO.from_json( self.in_biom )
    counts = sorted( biom.get_sample_count(sample_name) for sample_name in biom.get_samples_names() )
    del biom
    nb_samples = len(counts)
    # Finds the lower quartile number of sequences (integer division: the result is an index)
    lower_quartile_idx = nb_samples // 4
    nb_seq = counts[lower_quartile_idx]
    # If lower quartile sample is empty the first non-empty sample after it is used
    if nb_seq == 0:
        idx = lower_quartile_idx
        while idx < nb_samples and counts[idx] == 0:
            idx += 1
        if idx < nb_samples:
            nb_seq = counts[idx]
    # NOTE(review): the result is 0 when nb_seq < nb_step (or when all the samples are empty); confirm that the caller accepts a null step
    return int(nb_seq // nb_step)
def process( args ):
    """
    @summary: Runs the rarefaction and the taxonomy tree commands on the BIOM and writes the summary of the results.
    @param args: The script arguments.
    """
    out_dir = os.path.split(args.output_file)[0]
    tmp_files = TmpFiles( out_dir )
    try:
        # Select the analysed BIOM and the metadata tag that contains the taxonomy
        analysed_biom = args.input_biom
        taxonomy_key = args.taxonomy_tag
        if args.multiple_tag is not None:
            taxonomy_key = args.tax_consensus_tag
            if taxonomy_key is None:
                # Multiple affiliations without consensus: the taxonomy of the first
                # affiliation is stored in a temporary tag of a temporary BIOM
                taxonomy_key = "Used_taxonomy_FROGS-affi"
                analysed_biom = tmp_files.add( "tax.biom" )
                work_biom = BiomIO.from_json( args.input_biom )
                for current_obs in work_biom.get_observations():
                    obs_metadata = current_obs["metadata"]
                    if len(obs_metadata[args.multiple_tag]) > 0:
                        first_affiliation = obs_metadata[args.multiple_tag][0]
                        obs_metadata[taxonomy_key] = first_affiliation[args.taxonomy_tag]
                BiomIO.write( analysed_biom, work_biom )
                del work_biom

        # Rarefaction
        ranks_depths = list()
        for rank_name in args.rarefaction_ranks:
            ranks_depths.append( args.taxonomic_ranks.index(rank_name) )
        rarefaction_step = Rarefaction(analysed_biom, tmp_files, taxonomy_key, ranks_depths)
        rarefaction_step.submit( args.log_file )
        rarefaction_outputs = rarefaction_step.output_files

        # Taxonomy tree
        counts_path = tmp_files.add( "taxCount.enewick" )
        ids_path = tmp_files.add( "taxCount_ids.tsv" )
        tree_step = TaxonomyTree(analysed_biom, taxonomy_key, counts_path, ids_path)
        tree_step.submit( args.log_file )

        # Writes summary
        write_summary( args.output_file, args.input_biom, counts_path, ids_path, rarefaction_outputs, args )
    finally:
        # The temporary files are kept only in debug mode
        if not args.debug:
            tmp_files.deleteAll()
def write_summary( summary_file, input_biom, tree_count_file, tree_ids_file, rarefaction_files, args ):
    """
    @summary: Writes the summary of results.
    @param summary_file: [str] The output file.
    @param input_biom: [str] Path to the input BIOM.
    @param tree_count_file: [str] Path to biomTools treeCount output.
    @param tree_ids_file: [str] Path to biomTools treeCount optional output.
    @param rarefaction_files: [list] Paths to biomTools rarefaction outputs (same order as args.rarefaction_ranks).
    @param args: The script arguments.
    """
    # Get taxonomy distribution
    with open( tree_count_file ) as FH_tree_count:
        newick_tree = FH_tree_count.readline()
    ordered_samples_names = list()
    with open( tree_ids_file ) as FH_tree_ids:
        for line in FH_tree_ids:
            sample_id, sample_name = line.strip().split( "\t", 1 )
            ordered_samples_names.append( sample_name )

    # Get bootstrap metrics
    bootstrap_results = None
    if args.bootstrap_tag is not None:
        bootstrap_results = get_bootstrap_distrib( input_biom, args.bootstrap_tag, args.multiple_tag )

    # Get alignment metrics
    aln_results = None
    if args.identity_tag is not None and args.coverage_tag is not None:
        aln_results = get_alignment_distrib( input_biom, args.identity_tag, args.coverage_tag, args.multiple_tag )

    # Get rarefaction data
    rarefaction_step_size = None
    rarefaction = None
    biom = BiomIO.from_json( input_biom )
    for rank_idx, current_file in enumerate(rarefaction_files):
        rank = args.rarefaction_ranks[rank_idx]
        with open( current_file ) as FH_rarefaction:
            for line in FH_rarefaction:
                # A real list is built: the fields are sliced and indexed
                fields = [field.strip() for field in line.split("\t")]
                if line.startswith('#'):
                    # Header: the samples names follow the first column
                    samples = fields[1:]
                    if rarefaction is None:
                        rarefaction = dict()
                        for sample in samples:
                            rarefaction[sample] = { 'nb_seq': biom.get_sample_count( sample ) }
                    for sample in samples:
                        rarefaction[sample][rank] = list()
                else:
                    # The first column of the first data line is used as step size
                    if rarefaction_step_size is None:
                        rarefaction_step_size = int(fields[0])
                    for idx, sample in enumerate(samples):
                        if fields[idx+1] != "":
                            rarefaction[sample][rank].append( int(fields[idx+1]) )
    del biom

    # Write
    # Only the first placeholder found on a line (in this order) is replaced
    replacements = [
        ( "###TAXONOMIC_RANKS###", args.taxonomic_ranks ),
        ( "###SAMPLES_NAMES###", ordered_samples_names ),
        ( "###TREE_DISTRIBUTION###", newick_tree ),
        ( "###DATA_RAREFACTION###", rarefaction ),
        ( "###RAREFACTION_STEP_SIZE###", rarefaction_step_size ),
        ( "###RAREFACTION_RANKS###", args.rarefaction_ranks ),
        ( "###ALIGNMENT_SCORES###", aln_results ),
        ( "###BOOTSTRAP_SCORES###", bootstrap_results )
    ]
    with open( os.path.join(CURRENT_DIR, "affiliations_stat_tpl.html") ) as FH_summary_tpl:
        with open( summary_file, "w" ) as FH_summary_out:
            for line in FH_summary_tpl:
                for placeholder, value in replacements:
                    if placeholder in line:
                        line = line.replace( placeholder, json.dumps(value) )
                        break
                FH_summary_out.write( line )
group_input.add_argument( '-i', '--input-biom', required=True, help="The input biom file." ) # Outputs group_output = parser.add_argument_group( 'Outputs' ) group_output.add_argument( '-o', '--output-file', default="affiliations_metrics.html", help="The output report." ) group_output.add_argument( '-l', '--log-file', default=sys.stdout, help='The list of commands executed.' ) args = parser.parse_args() prevent_shell_injections(args) Logger.static_write(args.log_file, "## Application\nSoftware: " + os.path.basename(sys.argv[0]) + " (version: " + str(__version__) + ")\nCommand: " + " ".join(sys.argv) + "\n\n") # Check parameters if args.multiple_tag is None and args.tax_consensus_tag is not None: raise Exception( "The parameter '--tax-consensus-tag' must be used only with the paameter '--multiple-tag'." ) if args.taxonomy_tag is None and args.tax_consensus_tag is None: raise Exception( "The parameter '--taxonomy-tag' or the parameter '--tax-consensus-tag' must be set." ) if (args.identity_tag is None and args.coverage_tag is not None) or (args.identity_tag is not None and args.coverage_tag is None): raise Exception( "The parameters '--identity-tag' and '--coverage-tag' must be setted together." ) for current_rank in args.rarefaction_ranks: if current_rank not in args.taxonomic_ranks: raise Exception( "'" + current_rank + "' is not in valid taxonomic ranks : " + ", ".join(args.taxonomic_ranks) ) biom = BiomIO.from_json( args.input_biom ) if args.multiple_tag is None: for param in [args.taxonomy_tag, args.bootstrap_tag, args.identity_tag, args.coverage_tag]: if param is not None and not biom.has_observation_metadata( param ): raise Exception( "The metadata '" + param + "' does not exist in the BIOM file." ) else: if args.tax_consensus_tag is not None and not biom.has_observation_metadata( args.tax_consensus_tag ): raise Exception( "The metadata '" + args.tax_consensus_tag + "' does not exist in the BIOM file." ) del biom # Process process( args )
def write_summary( summary_file, input_biom, output_biom, discards ):
    """
    @summary: Writes the process summary.
    @param summary_file: [str] The path to the output file.
    @param input_biom: [str] The path to the BIOM before program execution.
    @param output_biom: [str] The path to the BIOM after program execution.
    @param discards: [dict] By filter the path of the file that contains the list of the removed observations.
    """
    global_results = {
        'nb_clstr_kept': 0,
        'nb_clstr_ini': 0,
        'nb_seq_kept': 0,
        'nb_seq_ini': 0
    }
    samples_results = dict()
    filters_results = dict()

    # Global before filters
    in_biom = BiomIO.from_json( input_biom )
    for observation_name in in_biom.get_observations_names():
        global_results['nb_clstr_ini'] += 1
        global_results['nb_seq_ini'] += in_biom.get_observation_count( observation_name )
    for sample_name in in_biom.get_samples_names():
        samples_results[sample_name] = {
            'initial': sum( 1 for x in in_biom.get_observations_by_sample(sample_name) ),
            'filtered': dict(),
            'kept': 0
        }

    # By sample and by filters
    filters_intersections = dict() # By observation the set of the filters that discard it
    for filter_name in discards:
        with open( discards[filter_name] ) as FH_filter:
            for line in FH_filter:
                observation_name = line.strip()
                filters_intersections.setdefault( observation_name, set() ).add( filter_name )
    for observation_name in filters_intersections:
        # Removed intersection
        # A sorted list is stored: it is JSON serializable and its order is deterministic
        filters_names = sorted( filters_intersections[observation_name] )
        intersections_key = "--@@--".join( filters_names )
        if intersections_key not in filters_results:
            filters_results[intersections_key] = {
                'filters': filters_names,
                'count': 0
            }
        filters_results[intersections_key]['count'] += 1
        # Filters by samples
        for sample in in_biom.get_samples_by_observation(observation_name):
            filtered_by_filter = samples_results[sample['id']]['filtered']
            for filter_name in filters_names:
                filtered_by_filter[filter_name] = filtered_by_filter.get( filter_name, 0 ) + 1
    del in_biom

    # Global after filters
    out_biom = BiomIO.from_json( output_biom )
    for observation_name in out_biom.get_observations_names():
        global_results['nb_clstr_kept'] += 1
        global_results['nb_seq_kept'] += out_biom.get_observation_count( observation_name )
    for sample_name in out_biom.get_samples_names():
        samples_results[sample_name]['kept'] = sum( 1 for x in out_biom.get_observations_by_sample(sample_name) )
    del out_biom

    # Write
    # Only the first placeholder found on a line (in this order) is replaced.
    # The spelling of the placeholders is the one searched in the template lines.
    replacements = [
        ( "###PORCESSED_FILTERS###", list(discards) ),
        ( "###GLOBAL_RESULTS###", global_results ),
        ( "###SAMPLES_RESULTS###", samples_results ),
        ( "###FILTERS_RESULTS###", list(filters_results.values()) )
    ]
    with open( os.path.join(CURRENT_DIR, "filters_tpl.html") ) as FH_summary_tpl:
        with open( summary_file, "w" ) as FH_summary_out:
            for line in FH_summary_tpl:
                for placeholder, value in replacements:
                    if placeholder in line:
                        line = line.replace( placeholder, json.dumps(value) )
                        break
                FH_summary_out.write( line )