def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to outputed BIOM.
    @param nb_sampled: [int] Number of sampled sequences by sample.
    @param sampled_ratio: [float] Ratio of sampled sequences by sample.
    @note: nb_sampled and sampled_ratio are mutually exclusive. An error is
           reported through raise_exception() when a sample contains fewer
           sequences than the number to sample.
    """
    initial_biom = BiomIO.from_json( input_biom )
    new_biom = Biom(
        matrix_type="sparse",
        generated_by="Sampling " + (str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%" ) + " elements by sample from " + input_biom
    )
    observations_already_added = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        sample_nb_sampled = nb_sampled
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        # The check must use the effective number for this sample: nb_sampled
        # is None in ratio mode and cannot be compared with a count.
        if sample_seq < sample_nb_sampled:
            raise_exception( Exception( "\n\n#ERROR : " + str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences.\n\n" ))
        else:
            for current_nb_iter in range(sample_nb_sampled):
                # Take an observation in initial BIOM (sampling without replacement)
                selected_observation = initial_biom.random_obs_by_sample(sample_name)
                selected_observation_id = selected_observation['id']
                initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
                # Put in new BIOM
                if selected_observation_id not in observations_already_added:
                    new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                    observations_already_added.add( selected_observation_id )
                new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
def mask_observation(rdp_clusters_discards, blast_clusters_discards, input_biom, output_biom):
    """
    @summary : mask either rdp affiliations and/or blast affiliations
    @param rdp_clusters_discards : [list] of clusters with rdp affiliations to mask (None to mask nothing)
    @param blast_clusters_discards : [list] of clusters with blast consensus affiliations to mask (None to mask nothing)
    @param input_biom : [str] Path to input biom file
    @param output_biom : [str] Path to output biom file with affiliations masked
    """
    biom = BiomIO.from_json(input_biom)
    for observation in biom.get_observations():
        metadata = observation['metadata']
        # remove rdp taxonomic metadata
        if rdp_clusters_discards is not None and observation['id'] in rdp_clusters_discards:
            # The empty value keeps the type already used by the metadata
            if isinstance(metadata["rdp_taxonomy"], str):
                metadata["rdp_taxonomy"] = ""
                metadata["rdp_bootstrap"] = ""
            elif isinstance(metadata["rdp_taxonomy"], list):
                metadata["rdp_taxonomy"] = list()
                metadata["rdp_bootstrap"] = list()
        # remove blast metadata
        if blast_clusters_discards is not None and observation['id'] in blast_clusters_discards:
            metadata["blast_affiliations"] = list()
            metadata["blast_taxonomy"] = list()
    BiomIO.write(output_biom, biom)
def aff_to_metadata(reference_file, biom_in, biom_out, blast_files=None, rdp_files=None):
    """
    @summary: Add taxonomy metadata on biom file from blast and/or RDP results.
    @param reference_file: [str] The path to the reference file.
    @param biom_in: [str] The path to the Biom file to process.
    @param biom_out: [str] The path to the biom output file.
    @param blast_files: [list] the list of the path to the blast results in tabular format (outfmt 6 with NCBI Blast+).
    @param rdp_files: [list] the list of path to the RDPClassifier results.
    """
    with_blast = blast_files is not None
    with_rdp = rdp_files is not None

    # Taxonomy of each reference (key=gene_id ; value=gene_taxonomy); it is
    # released as soon as the blast annotations are built
    reference_taxonomies = get_tax_from_fasta(reference_file)
    blast_by_cluster = dict()
    if with_blast:
        blast_by_cluster = get_bests_blast_affi(blast_files, reference_taxonomies)
    del reference_taxonomies

    rdp_by_cluster = dict()
    if with_rdp:
        rdp_by_cluster = get_rdp_affi(rdp_files)

    # Add metadata to biom
    abundance = BiomIO.from_json(biom_in)
    for observation in abundance.get_observations():
        observation_id = observation["id"]
        if with_blast:
            consensus = list()
            affiliations = list()
            if observation_id in blast_by_cluster:  # Current observation has a match
                alignments = blast_by_cluster[observation_id]['alignments']
                consensus = get_tax_consensus([current_tax.split(';') for current_tax in alignments])
                for current_tax in alignments:
                    affiliations.extend(alignments[current_tax])
            abundance.add_metadata(observation_id, "blast_affiliations", affiliations, "observation")
            abundance.add_metadata(observation_id, "blast_taxonomy", consensus, "observation")
        if with_rdp:
            taxonomy = list()
            bootstrap = list()
            if observation_id in rdp_by_cluster:
                rdp_annot = rdp_by_cluster[observation_id]
                taxonomy = rdp_annot['taxonomy']
                bootstrap = rdp_annot['bootstrap']
            abundance.add_metadata(observation_id, "rdp_taxonomy", taxonomy, "observation")
            abundance.add_metadata(observation_id, "rdp_bootstrap", bootstrap, "observation")
    BiomIO.write(biom_out, abundance)
def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to outputed BIOM.
    @param nb_sampled: [int] Number of sampled sequences by sample.
    @param sampled_ratio: [float] Ratio of sampled sequences by sample.
    @note: nb_sampled and sampled_ratio are mutually exclusive. An Exception
           is raised when a sample contains fewer sequences than the number
           to sample.
    """
    initial_biom = BiomIO.from_json( input_biom )
    new_biom = Biom(
        matrix_type="sparse",
        generated_by="Sampling " + (str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%" ) + " elements by sample from " + input_biom
    )
    observations_already_added = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        sample_nb_sampled = nb_sampled
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        # The check must use the effective number for this sample: nb_sampled
        # is None in ratio mode and cannot be compared with a count.
        if sample_seq < sample_nb_sampled:
            raise Exception( str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences." )
        else:
            for current_nb_iter in range(sample_nb_sampled):
                # Take an observation in initial BIOM (sampling without replacement)
                selected_observation = initial_biom.random_obs_by_sample(sample_name)
                selected_observation_id = selected_observation['id']
                initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
                # Put in new BIOM ("in" replaces dict.has_key, which only exists in Python 2)
                if selected_observation_id not in observations_already_added:
                    new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                    observations_already_added.add( selected_observation_id )
                new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
def process( in_biom, out_biom, out_metadata ):
    """
    @summary: Writes a BIOM where list metadata are flattened in strings and where "taxonomy" is set from "blast_taxonomy". The "blast_affiliations" metadata are moved from the BIOM to a separate TSV file.
    @param in_biom: [str] Path to the BIOM file to process.
    @param out_biom: [str] Path to the output BIOM file.
    @param out_metadata: [str] Path to the output TSV file receiving one line per blast affiliation.
    """
    ordered_blast_keys = ["taxonomy", "subject", "evalue", "perc_identity", "perc_query_coverage", "aln_length"] # Keys in blast_affiliations metadata
    taxonomy_depth = 0
    unclassified_observations = list()

    # The context manager guarantees the TSV is flushed and closed, even on error
    with open( out_metadata, "w" ) as FH_metadata:
        FH_metadata.write( "#OTUID\t" + "\t".join(ordered_blast_keys) + "\n" )
        biom = BiomIO.from_json( in_biom )
        for observation in biom.get_observations():
            metadata = observation["metadata"]
            # Iterate on a copy of the keys: "blast_affiliations" is deleted in the loop
            for metadata_key in list(metadata.keys()):
                if metadata_key == "blast_affiliations": # Extract blast_affiliations metadata in metadata_file
                    if metadata[metadata_key] is not None:
                        for current_affi in metadata[metadata_key]:
                            if isinstance(current_affi["taxonomy"], (list, tuple)):
                                current_affi["taxonomy"] = ";".join( current_affi["taxonomy"] )
                            FH_metadata.write( observation["id"] + "\t" + "\t".join([str(current_affi[item]) for item in ordered_blast_keys]) + "\n" )
                    del metadata[metadata_key]
                elif metadata[metadata_key] is not None: # All list are transformed in string
                    if isinstance(metadata[metadata_key], (list, tuple)):
                        metadata[metadata_key] = ";".join( map(str, metadata[metadata_key]) )
            if "blast_taxonomy" in metadata:
                if metadata["blast_taxonomy"] is None:
                    unclassified_observations.append( observation["id"] )
                    metadata["taxonomy"] = list()
                else:
                    # blast_taxonomy has been flattened in a ";" separated string above
                    taxonomy_ranks = metadata["blast_taxonomy"].split(";")
                    taxonomy_depth = len(taxonomy_ranks)
                    metadata["taxonomy"] = taxonomy_ranks

    # Add "Unclassified" ranks in unclassified observations
    if taxonomy_depth > 0:
        for observation_id in unclassified_observations:
            observation_metadata = biom.get_observation_metadata(observation_id)
            observation_metadata["taxonomy"] = ["Unclassified"] * taxonomy_depth
    BiomIO.write( out_biom, biom )
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            preclusters_count[precluster_id] = count_str # For large dataset store count into a string consumes minus RAM than a sparse count

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count
    with open( clusters_file ) as clusters_fh:
        for cluster_idx, line in enumerate( clusters_fh, 1 ):
            line_fields = line.strip().split()
            seed_id = line_fields[0]
            if "FROGS_combined" in seed_id:
                cluster_name = "Cluster_" + str(cluster_idx) + "_FROGS_combined"
                comment = "WARNING"
            else:
                cluster_name = "Cluster_" + str(cluster_idx)
                comment = "na"
            cluster_count = {key:0 for key in samples}
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                preclusters_count[real_seq_id] = None # Release the count string once it is consumed
            # Add cluster on biom
            biom.add_observation( cluster_name, {'comment': comment, 'seed_id':seed_id.rsplit(size_separator, 1)[0]} )
            observation_idx = biom.find_idx("observation", cluster_name)
            for sample_idx, sample_name in enumerate(samples):
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )

    # Write
    BiomIO.write( output_biom, biom )
def to_biom(clusters_file, count_file, output_biom, size_separator):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom(generated_by='swarm', matrix_type="sparse")

    # Preclusters count by sample
    preclusters_count = dict()
    with open(count_file) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            line_fields = line.strip().split()
            count_by_sample = {}
            for idx, val in enumerate(line_fields[1:]):
                # Convert before comparing: the field is a string, so the
                # comparison with 0 must be done on its integer value.
                count = int(val)
                if count > 0:
                    count_by_sample[samples[idx]] = count
            preclusters_count[line_fields[0]] = count_by_sample

    # Add samples
    for sample_name in samples:
        biom.add_sample(sample_name)

    # Process count
    with open(clusters_file) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            cluster_name = "Cluster_" + str(cluster_idx)
            cluster_count = {key: 0 for key in samples}
            line_fields = line.strip().split()
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                for preclust_sample, preclust_count in preclusters_count[real_seq_id].items():
                    cluster_count[preclust_sample] += preclust_count
                preclusters_count[real_seq_id] = None  # Release the counts once they are consumed
            # Add cluster on biom
            biom.add_observation(cluster_name, {'seed_id': line_fields[0].rsplit(size_separator, 1)[0]})
            for sample_name in samples:
                if cluster_count[sample_name] > 0:
                    biom.add_count(cluster_name, sample_name, cluster_count[sample_name])

    # Write
    BiomIO.write(output_biom, biom)
def filter_biom( removed_observations, in_biom, out_biom ):
    """
    @summary: Writes a copy of the BIOM without the specified observations.
    @param removed_observations: [dict] Each key is the name of an observation to remove.
    @param in_biom: [str] Path to the processed BIOM file.
    @param out_biom: [str] Path to the cleaned BIOM file.
    """
    # Load, filter in memory, then save under the output path
    abundance = BiomIO.from_json( in_biom )
    abundance.remove_observations( removed_observations )
    BiomIO.write( out_biom, abundance )
def remove_observations( removed_observations, input_biom, output_biom ):
    """
    @summary: Writes a BIOM deprived of the specified observations.
    @param removed_observations: [list] The names of the observations to remove.
    @param input_biom: [str] The path to the input BIOM.
    @param output_biom: [str] The path to the output BIOM.
    """
    # The input file is left untouched: the filter is applied on the loaded copy
    filtered_biom = BiomIO.from_json( input_biom )
    filtered_biom.remove_observations( removed_observations )
    BiomIO.write( output_biom, filtered_biom )
def remove_observations(removed_observations, input_biom, output_biom):
    """
    @summary: Loads a BIOM, drops the listed observations and saves the result.
    @param removed_observations: [list] The names of the observations to remove.
    @param input_biom: [str] The path to the input BIOM.
    @param output_biom: [str] The path to the output BIOM.
    """
    loaded_biom = BiomIO.from_json(input_biom)
    # The removal is done in place on the loaded object
    loaded_biom.remove_observations(removed_observations)
    BiomIO.write(output_biom, loaded_biom)
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            preclusters_count[precluster_id] = count_str # For large dataset store count into a string consumes minus RAM than a sparse count

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count
    with open( clusters_file ) as clusters_fh:
        for cluster_idx, line in enumerate( clusters_fh, 1 ):
            cluster_name = "Cluster_" + str(cluster_idx)
            cluster_count = {key:0 for key in samples}
            line_fields = line.strip().split()
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                preclusters_count[real_seq_id] = None # Release the count string once it is consumed
            # Add cluster on biom
            biom.add_observation( cluster_name, {'seed_id':line_fields[0].rsplit(size_separator, 1)[0]} )
            observation_idx = biom.find_idx("observation", cluster_name)
            for sample_idx, sample_name in enumerate(samples):
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )

    # Write
    BiomIO.write( output_biom, biom )
def aff_to_metadata(reference_file, biom_in, biom_out, blast_files=None, rdp_files=None):
    """
    @summary: Add taxonomy metadata on biom file from blast and/or RDP results.
    @param reference_file: [str] The path to the reference file.
    @param biom_in: [str] The path to the Biom file to process.
    @param biom_out: [str] The path to the biom output file.
    @param blast_files: [list] the list of the path to the blast results in tabular format (outfmt 6 with NCBI Blast+).
    @param rdp_files: [list] the list of path to the RDPClassifier results.
    @note: Observations without blast match get None as blast_taxonomy and an
           empty list as blast_affiliations. Observations without RDP result
           get None as rdp_taxonomy and rdp_bootstrap.
    """
    # Build an hash with the taxonomy for each gene (key=gene_id ; value=gene_taxonomy)
    taxonomy_by_reference = get_tax_from_fasta( reference_file )

    # Retrieve blast clusters annotations
    cluster_blast_annot = dict()
    if blast_files is not None:
        cluster_blast_annot = get_bests_blast_affi( blast_files, taxonomy_by_reference )
    del taxonomy_by_reference

    # Retrieve rdp clusters annotations
    cluster_rdp_annot = dict()
    if rdp_files is not None:
        cluster_rdp_annot = get_rdp_affi( rdp_files )

    # Add metadata to biom
    biom = BiomIO.from_json(biom_in)
    for cluster in biom.get_observations():
        cluster_id = cluster["id"]
        # Blast
        if blast_files is not None:
            blast_taxonomy = None
            blast_affiliations = list()
            # "in" replaces dict.has_key, which only exists in Python 2
            if cluster_id in cluster_blast_annot: # Current observation has a match
                blast_affiliations = cluster_blast_annot[cluster_id]['alignments']
                blast_taxonomy = get_tax_consensus( [alignment['taxonomy'] for alignment in blast_affiliations] )
            biom.add_metadata( cluster_id, "blast_affiliations", blast_affiliations, "observation" )
            biom.add_metadata( cluster_id, "blast_taxonomy", blast_taxonomy, "observation" )
        # RDP
        if rdp_files is not None:
            rdp_taxonomy = None
            rdp_bootstrap = None
            if cluster_id in cluster_rdp_annot:
                rdp_taxonomy = cluster_rdp_annot[cluster_id]['taxonomy']
                rdp_bootstrap = cluster_rdp_annot[cluster_id]['bootstrap']
            biom.add_metadata(cluster_id, "rdp_taxonomy", rdp_taxonomy, "observation")
            biom.add_metadata(cluster_id, "rdp_bootstrap", rdp_bootstrap, "observation")
    BiomIO.write(biom_out, biom)
def process(args):
    """
    @summary: Runs the rarefaction and the taxonomy tree steps on the input BIOM then writes the summary.
    @param args: [Namespace] The parsed command line arguments (input_biom, output_file, log_file, taxonomy_tag, multiple_tag, tax_consensus_tag, taxonomic_ranks, rarefaction_ranks, debug).
    @note: The temporary files are created in the folder of args.output_file and are deleted at the end unless args.debug is set.
    """
    tmp_files = TmpFiles(os.path.dirname(args.output_file))
    try:
        # Select the taxonomy tag and the BIOM used by the sub-commands
        processed_biom = args.input_biom
        if args.multiple_tag is None:
            tax_tag = args.taxonomy_tag
        elif args.tax_consensus_tag is not None:
            tax_tag = args.tax_consensus_tag
        else:
            # Multiple affiliations without consensus: the taxonomy of the
            # first affiliation is stored in a temporary BIOM
            tax_tag = "Used_taxonomy_FROGS-affi"
            processed_biom = tmp_files.add("tax.biom")
            biom = BiomIO.from_json(args.input_biom)
            for observation in biom.get_observations():
                obs_metadata = observation["metadata"]
                affiliations = obs_metadata[args.multiple_tag]
                if affiliations is not None and len(affiliations) > 0:
                    obs_metadata[tax_tag] = affiliations[0][args.taxonomy_tag]
            BiomIO.write(processed_biom, biom)
            del biom

        # Rarefaction
        rank_indexes = [args.taxonomic_ranks.index(rank) for rank in args.rarefaction_ranks]
        rarefaction_cmd = Rarefaction(processed_biom, tmp_files, tax_tag, rank_indexes)
        rarefaction_cmd.submit(args.log_file)
        rarefaction_files = rarefaction_cmd.output_files

        # Taxonomy tree
        tree_count_file = tmp_files.add("taxCount.enewick")
        tree_ids_file = tmp_files.add("taxCount_ids.tsv")
        tree_cmd = TaxonomyTree(processed_biom, tax_tag, tree_count_file, tree_ids_file)
        tree_cmd.submit(args.log_file)

        # Writes summary
        write_summary(args.output_file, args.input_biom, tree_count_file, tree_ids_file, rarefaction_files, args)
    finally:
        if not args.debug:
            tmp_files.deleteAll()
def impacted_obs_by_undesired_taxon(input_biom, undesired_taxon_list, in_all_or_in_consensus, biom_out, impacted_file):
    """
    @summary : write the list of observation with affiliations including undesired taxon.
    @param input_biom: [str] The path to the BIOM file to check.
    @param undesired_taxon_list: [list] list of string to look for
    @param in_all_or_in_consensus: [bool] if True, an observation is reported when all its affiliations contain an undesired taxon.
                                   If False, an observation is reported when its consensus changes after the removal of the affiliations containing an undesired taxon.
    @param biom_out: [str] path to biom with removed undesired taxonomy
    @param impacted_file: [str] The path to the output file (one observation ID by line).
    """
    biom = BiomIO.from_json(input_biom)
    # The context manager guarantees the list is flushed and closed, even on error
    with open(impacted_file, "w") as FH_impacted_file:
        for observation in biom.get_observations():
            metadata = observation['metadata']
            # update blast_affiliations without ignored taxon and recompute de blast_taxonomy
            new_blast_affi = [
                affiliation for affiliation in metadata['blast_affiliations']
                if not any(t in ";".join(affiliation["taxonomy"]) for t in undesired_taxon_list)
            ]
            # if some affi are masked, update blast_affiliations and blast_taxonomy
            if len(new_blast_affi) != len(metadata['blast_affiliations']):
                metadata['blast_affiliations'] = new_blast_affi
                new_consensus = get_tax_consensus([affi['taxonomy'] for affi in new_blast_affi])
                # delete mode if all affiliations belongs to one of undesired taxon
                if in_all_or_in_consensus and len(new_blast_affi) == 0:
                    FH_impacted_file.write(str(observation["id"]) + "\n")
                # masking mode if the new consensus is changed because of ignoring undesired taxon
                elif not in_all_or_in_consensus and new_consensus != metadata['blast_taxonomy']:
                    FH_impacted_file.write(str(observation["id"]) + "\n")
                metadata['blast_taxonomy'] = new_consensus
    BiomIO.write(biom_out, biom)
def process( args ):
    """
    @summary: Runs the rarefaction and the taxonomy tree steps on the input BIOM then writes the summary.
    @param args: [Namespace] The parsed command line arguments (input_biom, output_file, log_file, taxonomy_tag, multiple_tag, tax_consensus_tag, taxonomic_ranks, rarefaction_ranks, debug).
    @note: The temporary files are created in the folder of args.output_file and are deleted at the end unless args.debug is set.
    """
    tmp_files = TmpFiles( os.path.split(args.output_file)[0] )

    try:
        # Add temp taxonomy if multiple and without consensus
        tmp_biom = args.input_biom
        used_taxonomy_tag = args.taxonomy_tag
        if args.multiple_tag is not None:
            used_taxonomy_tag = args.tax_consensus_tag
            if args.tax_consensus_tag is None:
                used_taxonomy_tag = "Used_taxonomy_FROGS-affi"
                tmp_biom = tmp_files.add( "tax.biom" )
                biom = BiomIO.from_json( args.input_biom )
                for observation in biom.get_observations():
                    metadata = observation["metadata"]
                    affiliations = metadata[args.multiple_tag]
                    # The metadata can be None for observations without affiliation
                    if affiliations is not None and len(affiliations) > 0:
                        metadata[used_taxonomy_tag] = affiliations[0][args.taxonomy_tag]
                BiomIO.write( tmp_biom, biom )
                del biom

        # Rarefaction
        tax_depth = [args.taxonomic_ranks.index(rank) for rank in args.rarefaction_ranks]
        rarefaction_cmd = Rarefaction(tmp_biom, tmp_files, used_taxonomy_tag, tax_depth)
        rarefaction_cmd.submit( args.log_file )
        rarefaction_files = rarefaction_cmd.output_files

        # Taxonomy tree
        tree_count_file = tmp_files.add( "taxCount.enewick" )
        tree_ids_file = tmp_files.add( "taxCount_ids.tsv" )
        TaxonomyTree(tmp_biom, used_taxonomy_tag, tree_count_file, tree_ids_file).submit( args.log_file )

        # Writes summary
        write_summary( args.output_file, args.input_biom, tree_count_file, tree_ids_file, rarefaction_files, args )
    finally:
        if not args.debug:
            tmp_files.deleteAll()
# Build the expected BIOM from the grinder ranks files
# NOTE(review): the commands are built as shell strings; sample names and
# paths containing a quote would break them - confirm the inputs are trusted.
cmd_grinder2biom = os.path.join(os.path.dirname(os.path.abspath(__file__)), "grinder2biom.py") + \
    " --affiliation " + os.path.abspath(args.databank) + \
    " --output " + real_biom + \
    " --samples"
for current_sample in samples:
    cmd_grinder2biom += " '" + current_sample['name'] + ":" + current_sample['path'] + "'"
subprocess.check_call( cmd_grinder2biom, shell=True )

# Add reference id in checked BIOM
biom = BiomIO.from_json( args.checked_biom )
fasta = FastaIO( args.checked_fasta )
try:
    for record in fasta:
        reference_match = re.search(r"reference=([^\s]+)", record.description)
        if reference_match is None:
            raise Exception("The reference cannot be retrieved from the description of '" + record.id + "'")
        biom.add_metadata( record.id, "grinder_source", reference_match.group(1), "observation" )
finally:
    fasta.close()
BiomIO.write( checked_biom, biom )
del(biom)

# Compare expected to obtained
for current_sample in samples:
    # print() with one argument behaves the same in Python 2 and Python 3
    print(current_sample['name'])
    cmd_compareSample = os.path.join(os.path.dirname(os.path.abspath(__file__)), "biomCmpTax.py") \
        + " --real-biom " + os.path.abspath(real_biom) \
        + " --real-tax-key 'real_taxonomy'" \
        + " --checked-biom " + os.path.abspath(checked_biom) \
        + " --checked-tax-key '" + args.taxonomy_key + "'" \
        + (" --multi-affiliations" if args.multi_affiliations else "") \
        + (" --uniq-groups " + args.uniq_groups if args.uniq_groups is not None else "") \
        + " --sample " + current_sample['name']
    # universal_newlines makes check_output return text instead of bytes in Python 3
    print(subprocess.check_output( cmd_compareSample, shell=True, universal_newlines=True ))
    print("")
# ##################################################################################################################################################
if __name__ == "__main__":
    # Manage parameters
    parser = argparse.ArgumentParser(description="Add taxonomy from UTAX result in BIOM file.")
    parser.add_argument( '-t', '--taxonomy-tag', default="taxonomy", help="The taxonomy tag in BIOM file. [Default: taxonomy]")
    parser.add_argument( '-v', '--version', action='version', version=__version__)
    # Inputs
    group_input = parser.add_argument_group('Inputs')
    group_input.add_argument('-f', '--input-fasta', required=True, help='Path to the sequence file outputed by UTAX (format: fasta).')
    group_input.add_argument('-b', '--input-biom', required=True, help='Path to the abundance file (format: BIOM).')
    # Outputs
    group_output = parser.add_argument_group('Outputs')
    group_output.add_argument('-o', '--output-biom', required=True, help='Path to the abundance file with taxonomy (format: BIOM).')
    args = parser.parse_args()

    # Process
    biom = BiomIO.from_json( args.input_biom )
    fasta = FastaIO( args.input_fasta )
    # Raw string: the pattern value is unchanged but no longer relies on invalid escape sequences
    id_pattern = re.compile(r"^([^\;]+)\;size\=\d+\;tax=(.+)$")
    try:
        for record in fasta:
            # record.id example: Cluster_1;size=19714;tax=d:Bacteria(1.0000),p:"Proteobacteria"(0.9997),c:Alphaproteobacteria(0.9903),o:Rhodospirillales(0.9940),f:Acetobacteraceae(0.9887),g:Humitalea(0.9724);
            match = id_pattern.search(record.id)
            if match is None:
                raise Exception("ID and taxonomy cannot be retrieved from '" + record.id + "'")
            record.id = match.group(1)
            record.description = match.group(2)
            biom.add_metadata( record.id, args.taxonomy_tag, record.description, "observation" )
    finally:
        # The sequence file is closed whatever the outcome of the parsing
        fasta.close()
    BiomIO.write( args.output_biom, biom )
def remove_chimera_biom(samples, chimera_files, in_biom_file, out_biom_file, lenient_filter, global_report, bySample_report, log_file):
    """
    @summary: Removes the chimera observation from BIOM.
    @param samples: [list] samples name list
    @param chimera_files : [list] samples chimera files (one observation name by line)
    @param in_biom_file: [str] The path to the BIOM file to filter.
    @param out_biom_file: [str] The path to the BIOM after filter.
    @param lenient_filter: [bool] True: removes one sequence in all samples
                           only if it is detected as chimera in all samples
                           where it is present. With False removes one
                           sequence in all samples if it is detected as chimera
                           in at least one sample.
    @param global_report: [dict] This dictionary is updated with the global
                          number of removed observations, the global removed
                          abundance, ... The counters must already be initialised.
    @param bySample_report: [dict] This dictionary is updated to add by sample the
                            number of removed observations, the removed
                            abundance, ...
    @param log_file : [path] Path to general log output file
    """
    FH_log = Logger(log_file)
    try:
        FH_log.write("## Removes the chimera observation from BIOM.\n")
        nb_sample_by_chimera = dict()

        # Init bySample_report
        for sample_name in samples:
            bySample_report[sample_name] = {
                'nb_kept': 0,
                'kept_abundance': 0,
                'nb_removed': 0,
                'removed_abundance': 0,
                'removed_max_abundance': 0
            }

        # Retrieve chimera: count in how many samples each observation is flagged
        for chimera_file in chimera_files:
            with open(chimera_file) as chimera_fh:
                for line in chimera_fh:
                    observation_name = line.strip()
                    # "in" replaces dict.has_key, which only exists in Python 2
                    if observation_name not in nb_sample_by_chimera:
                        nb_sample_by_chimera[observation_name] = 0
                    nb_sample_by_chimera[observation_name] += 1

        # Remove chimera
        removed_chimera = list()
        biom = BiomIO.from_json(in_biom_file)
        for chimera_name in nb_sample_by_chimera:
            is_always_chimera = True
            nb_sample_with_obs = sum(1 for sample in biom.get_samples_by_observation(chimera_name))
            observation_abundance = biom.get_observation_count(chimera_name)
            if nb_sample_with_obs != nb_sample_by_chimera[chimera_name]:
                is_always_chimera = False
                global_report['nb_ambiguous'] += 1
                global_report['abundance_ambiguous'] += observation_abundance
                FH_log.write("'" + chimera_name + "' is not interpreted as chimera in all samples where it is present.\n")
            if not lenient_filter or is_always_chimera:
                removed_chimera.append(chimera_name)
                # Global metrics
                global_report['nb_removed'] += 1
                global_report['abundance_removed'] += observation_abundance
                # By sample metrics
                for sample in biom.get_samples_by_observation(chimera_name):
                    sample_report = bySample_report[sample['id']]
                    sample_count = biom.get_count(chimera_name, sample['id'])
                    sample_report['nb_removed'] += 1
                    sample_report['removed_abundance'] += sample_count
                    sample_report['removed_max_abundance'] = max(sample_report['removed_max_abundance'], sample_count)
        biom.remove_observations(removed_chimera)

        # Nb non-chimera
        for observation_name in biom.get_observations_names():
            global_report['nb_kept'] += 1
            global_report['abundance_kept'] += biom.get_observation_count(observation_name)
            # By sample metrics
            for sample in biom.get_samples_by_observation(observation_name):
                sample_report = bySample_report[sample['id']]
                sample_count = biom.get_count(observation_name, sample['id'])
                sample_report['nb_kept'] += 1
                sample_report['kept_abundance'] += sample_count
        BiomIO.write(out_biom_file, biom)
    finally:
        # The log is closed whatever the outcome of the filter
        FH_log.close()
help='Path to the sequence file outputed by UTAX (format: fasta).') group_input.add_argument('-b', '--input-biom', required=True, help='Path to the abundance file (format: BIOM).') # Outputs group_output = parser.add_argument_group('Outputs') group_output.add_argument( '-o', '--output-biom', required=True, help='Path to the abundance file with taxonomy (format: BIOM).') args = parser.parse_args() # Process biom = BiomIO.from_json(args.input_biom) fasta = FastaIO(args.input_fasta) for record in fasta: # record.id example: Cluster_1;size=19714;tax=d:Bacteria(1.0000),p:"Proteobacteria"(0.9997),c:Alphaproteobacteria(0.9903),o:Rhodospirillales(0.9940),f:Acetobacteraceae(0.9887),g:Humitalea(0.9724); match = re.search("^([^\;]+)\;size\=\d+\;tax=(.+)$", record.id) if match is None: fasta.close() raise Exception("ID and taxonomy cannot be retrieved from '" + record.id + "'") record.id = match.group(1) record.description = match.group(2) biom.add_metadata(record.id, args.taxonomy_tag, record.description, "observation") fasta.close() BiomIO.write(args.output_biom, biom)
biom.add_sample(sample_name) fh_abund = open(args.samples[sample_name]) for line in fh_abund: # Content format: "# rank<TAB>seq_id<TAB>rel_abund_perc" if not line.startswith('#'): fields = line.strip().split() try: biom.add_observation(fields[1]) except: # already exist pass biom.change_count( fields[1], sample_name, int(float(fields[2]) * 100000000000000 )) ################## depend de la precision grinder fh_abund.close() # Set taxonomy metadata fh_classif = FastaIO(args.affiliation) for record in fh_classif: try: metadata = biom.get_observation_metadata(record.id) if metadata is None or not metadata.has_key(taxonomy_key): taxonomy = getCleanedTaxonomy(record.description) biom.add_metadata(record.id, taxonomy_key, taxonomy, "observation") except ValueError: # is not in BIOM pass fh_classif.close() # Write BIOM BiomIO.write(args.output, biom)
def remove_chimera_biom( samples, in_biom_file, out_biom_file, lenient_filter, global_report, bySample_report ):
    """
    @summary: Removes the chimera observations from a BIOM file and updates
              the global and by-sample reports.
    @param samples: [dict] The chimera observations by sample. Example for
                    sample splA: sample['splA']['chimera_path'] where the
                    value is the path to the file containing the list of the
                    chimera observations names (one name per line).
    @param in_biom_file: [str] The path to the BIOM file to filter.
    @param out_biom_file: [str] The path to the BIOM after filter.
    @param lenient_filter: [bool] True: removes one sequence in all samples
                           only if it is detected as chimera in all samples
                           where it is present. With False removes one
                           sequence in all samples if it is detected as
                           chimera in at least one sample.
    @param global_report: [dict] Updated in place. The keys 'nb_kept',
                          'abundance_kept', 'nb_removed', 'abundance_removed',
                          'nb_ambiguous' and 'abundance_ambiguous' are
                          incremented, so they must already exist.
    @param bySample_report: [dict] Updated in place. The entry of each sample
                            listed in samples is (re)initialised, then filled
                            with the number of kept/removed observations and
                            the kept/removed abundances.
    @note: Blank lines in the chimera files are ignored.
    """
    # Init bySample_report
    for sample_name in samples:
        bySample_report[sample_name] = {
            'nb_kept': 0,
            'kept_abundance': 0,
            'nb_removed': 0,
            'removed_abundance': 0,
            'removed_max_abundance': 0
        }

    # Retrieve chimera: number of samples where each observation is tagged as chimera
    nb_sample_by_chimera = dict()
    for sample_name in samples:
        with open( samples[sample_name]['chimera_path'] ) as chimera_fh:
            for line in chimera_fh:
                observation_name = line.strip()
                if observation_name == "":
                    continue  # a blank line is not an observation name
                nb_sample_by_chimera[observation_name] = nb_sample_by_chimera.get(observation_name, 0) + 1

    # Remove chimera
    removed_chimera = list()
    biom = BiomIO.from_json(in_biom_file)
    for chimera_name in nb_sample_by_chimera:
        # Materialised once: used for the ambiguity check and for the by sample metrics
        samples_with_obs = list( biom.get_samples_by_observation(chimera_name) )
        observation_abundance = biom.get_observation_count(chimera_name)
        is_always_chimera = ( len(samples_with_obs) == nb_sample_by_chimera[chimera_name] )
        if not is_always_chimera:
            global_report['nb_ambiguous'] += 1
            global_report['abundance_ambiguous'] += observation_abundance
            # Single parenthesised argument: same output with the print statement and the print function
            print( "'" + chimera_name + "' is not interpreted as chimera in all samples where it is present." )
        if not lenient_filter or is_always_chimera:
            removed_chimera.append(chimera_name)
            # Global metrics
            global_report['nb_removed'] += 1
            global_report['abundance_removed'] += observation_abundance
            # By sample metrics
            for sample in samples_with_obs:
                sample_report = bySample_report[sample['id']]
                sample_count = biom.get_count(chimera_name, sample['id'])
                sample_report['nb_removed'] += 1
                sample_report['removed_abundance'] += sample_count
                sample_report['removed_max_abundance'] = max(sample_report['removed_max_abundance'], sample_count)
    biom.remove_observations(removed_chimera)

    # Nb non-chimera
    for observation_name in biom.get_observations_names():
        global_report['nb_kept'] += 1
        global_report['abundance_kept'] += biom.get_observation_count(observation_name)
        # By sample metrics
        for sample in biom.get_samples_by_observation(observation_name):
            sample_count = biom.get_count(observation_name, sample['id'])
            bySample_report[sample['id']]['nb_kept'] += 1
            bySample_report[sample['id']]['kept_abundance'] += sample_count
    BiomIO.write(out_biom_file, biom)
biom = Biom( generated_by="grinder", matrix_type="sparse" )

# Set observations count
# biom is created just above, so the observations it contains are exactly the
# ones added here: tracking their IDs replaces the former bare "except: pass"
# around add_observation(), which also hid unrelated errors.
added_observations = set()
for sample_name in args.samples:
    biom.add_sample( sample_name )
    with open( args.samples[sample_name] ) as fh_abund:
        for line in fh_abund:
            # Content format: "# rank<TAB>seq_id<TAB>rel_abund_perc"
            if line.startswith('#'):
                continue
            fields = line.strip().split()
            if not fields:
                continue  # blank line
            seq_id = fields[1]
            if seq_id not in added_observations:
                biom.add_observation( seq_id )
                added_observations.add( seq_id )
            # The relative abundance is scaled to an integer count; the factor
            # depends on the precision of the grinder output.
            biom.change_count( seq_id, sample_name, int(float(fields[2])*100000000000000) )

# Set taxonomy metadata
fh_classif = FastaIO( args.affiliation )
try:
    for record in fh_classif:
        try:
            metadata = biom.get_observation_metadata( record.id )
            # Only set the taxonomy when the observation does not have one yet
            if metadata is None or taxonomy_key not in metadata:
                taxonomy = getCleanedTaxonomy(record.description)
                biom.add_metadata( record.id, taxonomy_key, taxonomy, "observation" )
        except ValueError: # is not in BIOM
            pass
finally:
    fh_classif.close()

# Write BIOM
BiomIO.write( args.output, biom )
" --samples" for current_sample in samples: cmd_grinder2biom += " '" + current_sample[ 'name'] + ":" + current_sample['path'] + "'" subprocess.check_call(cmd_grinder2biom, shell=True) # Add reference id in checked BIOM biom = BiomIO.from_json(args.checked_biom) fasta = FastaIO(args.checked_fasta) for record in fasta: reference = re.search("reference=([^\s]+)", record.description).group(1) biom.add_metadata(record.id, "grinder_source", reference, "observation") fasta.close() BiomIO.write(checked_biom, biom) del (biom) # Compare expected to obtained for current_sample in samples: print current_sample['name'] cmd_compareSample = os.path.join(os.path.dirname(os.path.abspath(__file__)), "biomCmpTax.py") \ + " --real-biom " + os.path.abspath(real_biom) \ + " --real-tax-key 'real_taxonomy'" \ + " --checked-biom " + os.path.abspath(checked_biom) \ + " --checked-tax-key '" + args.taxonomy_key + "'" \ + (" --multi-affiliations" if args.multi_affiliations else "") \ + (" --uniq-groups " + args.uniq_groups if args.uniq_groups is not None else "") \ + " --sample " + current_sample['name'] print subprocess.check_output(cmd_compareSample, shell=True) print ""
def tsv_to_biom( input_tsv, multi_hit_dict, fields, samples_names, output_biom, output_fasta ):
    """
    @summary: Convert TSV file to Biom file.
    @param input_tsv: [str] Path to the TSV file.
    @param multi_hit_dict: [dict] Dictionary describing equivalent multi blast
                           hit: dict[observation_name]=[ {"blast_taxonomy":taxonomy,
                           "blast_subject":subject, "blast_perc_identity": per_id,
                           "blast_perc_query_coverage":per_cov, "blast_evalue":eval,
                           "blast_aln_length":aln}]. Can be None.
    @param fields: [list] column name to include as metadata (must at least
                   contain observation_name): observation_sum and seed_sequence
                   will be excluded, rdp_tax_and_bootstrap will be split in two
                   metadata.
    @param samples_names: [list] list of sample names.
    @param output_biom: [str] Path to the output file (format : BIOM).
    @param output_fasta: [str] Path to the output file (format : fasta). With
                         None the seed sequences are not extracted.
    @raise Exception: if the seed sequences are requested but cannot be found
                      in the TSV, or if the multi-hit information is missing
                      or inconsistent for a "multi-subject" observation.
    """
    biom = Biom( matrix_type="sparse" )
    clusters_count = dict()
    clusters_metadata = dict()

    fasta_fh = None
    in_fh = open( input_tsv )
    try:
        if output_fasta is not None:
            fasta_fh = FastaIO( output_fasta, "w" )

        # parse header and store column index
        header = in_fh.readline()
        if header.startswith("#"):
            header = header[1:]
        header = header.strip()
        seed_seq_idx, metadata_index, sample_index = header_line_dict(fields, header, samples_names)
        if output_fasta is not None and seed_seq_idx == -1:
            raise Exception("\nYou want to extract seed fasta sequence but there is no seed_sequence column in your TSV file\n\n")

        # count by sample, and metadata
        for line in in_fh:
            cluster_name = ""
            seed_seq = None  # reset for each row: never reuse the sequence of the previous one
            count_by_sample = {}
            metadata_dict = {}

            # parse columns
            for idx, val in enumerate(line.strip().split("\t")):
                # recover metadata
                if idx in metadata_index:
                    if metadata_index[idx] == "observation_name":
                        cluster_name = val
                    else:
                        metadata_dict[metadata_index[idx]] = val
                # recover samples count (the value is converted before the
                # comparison: comparing the raw string to 0 is meaningless)
                elif idx in sample_index:
                    count = int(val)
                    if count > 0:
                        count_by_sample[sample_index[idx]] = count
                # recover seed sequence
                elif idx == seed_seq_idx:
                    seed_seq = val

            # if fasta output file => store the seed sequence
            if fasta_fh is not None:
                if seed_seq is None:
                    raise Exception("\nThe seed sequence of '" + cluster_name + "' cannot be retrieved from your TSV file\n\n")
                fasta_fh.write( Sequence(cluster_name, seed_seq) )

            if "taxonomy" in metadata_dict:
                metadata_dict["taxonomy"] = metadata_dict["taxonomy"].split(";")

            # format rdp taxonomy to fit BIOM format: the column alternates
            # taxon and "(bootstrap)" separated by ";"
            if "rdp_tax_and_bootstrap" in metadata_dict:
                tax = metadata_dict.pop("rdp_tax_and_bootstrap").rstrip(";").split(";")
                metadata_dict["rdp_taxonomy"] = []
                metadata_dict["rdp_bootstrap"] = []
                for i in range(0, len(tax), 2):
                    metadata_dict["rdp_taxonomy"].append(tax[i])
                    metadata_dict["rdp_bootstrap"].append(tax[i+1].replace("(","").replace(")",""))

            # format blast taxonomy to fit BIOM format (one consensus
            # blast_taxonomy and possibly multiple detailed blast_affiliations)
            if "blast_taxonomy" in metadata_dict:
                metadata_dict["blast_taxonomy"] = metadata_dict["blast_taxonomy"].split(";")

                # check multihit blast : filter non consistent taxonomy hit with
                # blast_taxonomy (if TSV modified), and compute consensus tax
                # (if multihit line suppressed)
                if metadata_dict["blast_subject"] == "multi-subject" and multi_hit_dict is not None:
                    if cluster_name not in multi_hit_dict:
                        raise Exception("\n"+cluster_name+" has multi-subject tag but is not present in your multi-hit TSV file. Please, provide the original multi-hit TSV file.\n\n")
                    metadata_dict["blast_taxonomy"], metadata_dict["blast_affiliations"] = observation_blast_parts(metadata_dict, multi_hit_dict[cluster_name])
                    if metadata_dict["blast_affiliations"] == []:
                        raise Exception("\nyour multihit TSV file is no more consistent with your abundance TSV file for (at least) "+cluster_name+"\n\n")
                # no multi tag = blast affiliation is equal to blast_taxonomy
                else:
                    blast_dict = {key.replace("blast_",""):metadata_dict[key] for key in metadata_dict if key.startswith("blast")}
                    metadata_dict["blast_affiliations"] = [blast_dict]

                # filter blast metadata which are moved to blast_affiliations
                for metadata in metadata_dict["blast_affiliations"][0]:
                    if metadata != "taxonomy":
                        metadata_dict.pop("blast_"+metadata)

            # add cluster count and metadata to their dict
            clusters_count[cluster_name] = count_by_sample
            clusters_metadata[cluster_name] = metadata_dict
    finally:
        # Handles are released even when a row is rejected
        if fasta_fh is not None:
            fasta_fh.close()
        in_fh.close()

    # add samples to biom
    for sample_name in samples_names:
        biom.add_sample( sample_name )

    # add clusters to biom
    for cluster_name in clusters_count:
        biom.add_observation( cluster_name, clusters_metadata[cluster_name] )
        cluster_counts = clusters_count[cluster_name]
        for sample_name in samples_names:
            # Only the positive counts are stored for a cluster
            count = cluster_counts.get(sample_name, 0)
            if count > 0:
                biom.add_count( cluster_name, sample_name, count )

    # Write
    BiomIO.write( output_biom, biom )