# Imports assumed from the FROGS library modules (not shown in this excerpt):
from frogsUtils import raise_exception
from frogsBiom import Biom, BiomIO
from frogsSequenceIO import FastaIO, Sequence


def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to the outputted BIOM.
    @param nb_sampled: [int] Number of sampled sequences by sample.
    @param sampled_ratio: [float] Ratio of sampled sequences by sample.
    @note: nb_sampled and sampled_ratio are mutually exclusive.
    """
    initial_biom = BiomIO.from_json( input_biom )
    new_biom = Biom(
        matrix_type="sparse",
        generated_by="Sampling " + (str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%") + " elements by sample from " + input_biom
    )
    observations_already_added = dict()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        sample_nb_sampled = nb_sampled
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        # Compare against the per-sample target: nb_sampled is None in ratio mode
        if sample_seq < sample_nb_sampled:
            raise_exception( Exception( "\n\n#ERROR : " + str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences.\n\n" ))
        else:
            for current_nb_iter in range(sample_nb_sampled):
                # Take an observation from the initial BIOM (sampling without replacement)
                selected_observation = initial_biom.random_obs_by_sample(sample_name)
                selected_observation_id = selected_observation['id']
                initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
                # Put it in the new BIOM
                if selected_observation_id not in observations_already_added:
                    new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                    observations_already_added[selected_observation_id] = True
                new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
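# A minimal usage sketch for sampling_by_sample (the file paths are hypothetical examples):
#     sampling_by_sample( "abundance.biom", "rarefied.biom", nb_sampled=500 )    # 500 sequences per sample
#     sampling_by_sample( "abundance.biom", "rarefied.biom", sampled_ratio=0.1 ) # 10% of each sample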
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary: Writes a BIOM file from swarm results.
    @param clusters_file: [str] path to the '.clstr' file.
    @param count_file: [str] path to the count file. It contains the count of sequences by sample for each precluster.
                       Line format: "Precluster_id nb_in_sampleA nb_in_sampleB"
    @param output_biom: [str] path to the output file.
    @param size_separator: [str] the pre-cluster abundance separator.
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )
    # Preclusters count by sample
    preclusters_count = dict()
    count_fh = open( count_file )
    samples = count_fh.readline().strip().split()[1:]
    for line in count_fh:
        precluster_id, count_str = line.strip().split(None, 1)
        preclusters_count[precluster_id] = count_str  # On large datasets, storing the counts as a raw string consumes less RAM than a parsed sparse structure
    count_fh.close()
    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )
    # Process count
    cluster_idx = 1
    clusters_fh = open( clusters_file )
    for line in clusters_fh:
        seed_id = line.strip().split()[0]
        if "FROGS_combined" in seed_id:
            cluster_name = "Cluster_" + str(cluster_idx) + "_FROGS_combined"
            comment = "WARNING"
        else:
            cluster_name = "Cluster_" + str(cluster_idx)
            comment = "na"
        cluster_count = {key: 0 for key in samples}
        line_fields = line.strip().split()
        # Retrieve count by sample
        for seq_id in line_fields:
            real_seq_id = seq_id.rsplit(size_separator, 1)[0]
            sample_counts = preclusters_count[real_seq_id].split()
            for sample_idx, sample_name in enumerate(samples):
                cluster_count[sample_name] += int(sample_counts[sample_idx])
            preclusters_count[real_seq_id] = None  # Free the entry once consumed: each precluster belongs to exactly one cluster
        # Add the cluster to the BIOM
        biom.add_observation( cluster_name, {'comment': comment, 'seed_id': line_fields[0].rsplit(size_separator, 1)[0]} )
        observation_idx = biom.find_idx("observation", cluster_name)
        for sample_idx, sample_name in enumerate(samples):
            if cluster_count[sample_name] > 0:
                biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )
        # Next cluster
        cluster_idx += 1
    clusters_fh.close()
    # Write
    BiomIO.write( output_biom, biom )
def to_biom(clusters_file, count_file, output_biom, size_separator):
    """
    @summary: Writes a BIOM file from swarm results.
    @param clusters_file: [str] path to the '.clstr' file.
    @param count_file: [str] path to the count file. It contains the count of sequences by sample for each precluster.
                       Line format: "Precluster_id nb_in_sampleA nb_in_sampleB"
    @param output_biom: [str] path to the output file.
    @param size_separator: [str] the pre-cluster abundance separator.
    """
    biom = Biom(generated_by='swarm', matrix_type="sparse")
    # Preclusters count by sample
    preclusters_count = dict()
    count_fh = open(count_file)
    samples = count_fh.readline().strip().split()[1:]
    for line in count_fh:
        line_fields = line.strip().split()
        count_by_sample = {}
        for idx, val in enumerate(line_fields[1:]):
            if int(val) > 0:  # values are strings from split(); cast before comparing
                count_by_sample[samples[idx]] = int(val)
        preclusters_count[line_fields[0]] = count_by_sample
    count_fh.close()
    # Add samples
    for sample_name in samples:
        biom.add_sample(sample_name)
    # Process count
    cluster_idx = 1
    clusters_fh = open(clusters_file)
    for line in clusters_fh:
        cluster_name = "Cluster_" + str(cluster_idx)
        cluster_count = {key: 0 for key in samples}
        line_fields = line.strip().split()
        # Retrieve count by sample
        for seq_id in line_fields:
            real_seq_id = seq_id.rsplit(size_separator, 1)[0]
            for preclust_sample in preclusters_count[real_seq_id]:
                cluster_count[preclust_sample] += preclusters_count[real_seq_id][preclust_sample]
            preclusters_count[real_seq_id] = None  # Free the entry once consumed
        # Add the cluster to the BIOM
        biom.add_observation(cluster_name, {'seed_id': line_fields[0].rsplit(size_separator, 1)[0]})
        for sample_name in samples:
            if cluster_count[sample_name] > 0:
                biom.add_count(cluster_name, sample_name, cluster_count[sample_name])
        # Next cluster
        cluster_idx += 1
    clusters_fh.close()
    # Write
    BiomIO.write(output_biom, biom)
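# A minimal usage sketch for to_biom (hypothetical file names; "_" is assumed as
# the swarm abundance separator, e.g. "seq10021_512" for 512 copies of seq10021):
#     to_biom( "swarm_clusters.clstr", "preclusters_count.tsv", "clusters.biom", "_" )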
parser.add_argument( '-v', '--version', action='version', version=__version__ )
# Inputs
group_input = parser.add_argument_group( 'Inputs' )
group_input.add_argument( '-s', '--samples', type=str, action=SampleParameter, metavar=("SAMPLE_NAME:SAMPLE_PATH"), nargs='+', help="Sample names and their grinder rank files." )
group_input.add_argument( '-a', '--affiliation', required=True, help='Path to the databank used as source for the simulated sequences (format: fasta). The description of each sequence must be its taxonomy.' )
# Outputs
group_output = parser.add_argument_group( 'Outputs' )
group_output.add_argument( '-o', '--output', required=True, help='The output BIOM (format: BIOM).' )
args = parser.parse_args()

taxonomy_key = "real_taxonomy"
biom = Biom( generated_by="grinder", matrix_type="sparse" )
# Set observations count
for sample_name in args.samples:
    biom.add_sample( sample_name )
    fh_abund = open( args.samples[sample_name] )
    for line in fh_abund:
        # Content format: "# rank<TAB>seq_id<TAB>rel_abund_perc"
        if not line.startswith('#'):
            fields = line.strip().split()
            try:
                biom.add_observation( fields[1] )
            except:  # the observation already exists
                pass
            biom.change_count( fields[1], sample_name, int(float(fields[2])*100000000000000) )  # NOTE: the scaling factor depends on grinder's precision
    fh_abund.close()
# Set taxonomy metadata
fh_classif = FastaIO( args.affiliation )
for record in fh_classif:
    try:
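# SampleParameter is not defined in this excerpt. A minimal sketch, assuming it
# is an argparse action that collects "SAMPLE_NAME:SAMPLE_PATH" tokens into a
# dict mapping each sample name to its grinder rank file, which matches how
# args.samples is iterated and indexed above:
import argparse

class SampleParameter(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        samples = dict()
        for item in values:  # with nargs='+', values is a list of tokens
            name, path = item.split(":", 1)  # split once so paths may contain ':'
            samples[name] = path
        setattr(namespace, self.dest, samples)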
def tsv_to_biom( input_tsv, multi_hit_dict, fields, samples_names, output_biom, output_fasta ):
    """
    @summary: Converts a TSV file to a BIOM file.
    @param input_tsv: [str] Path to the TSV file.
    @param multi_hit_dict: [dict] Dictionary describing equivalent multiple blast hits:
                           dict[observation_name] = [ {"blast_taxonomy": taxonomy, "blast_subject": subject, "blast_perc_identity": per_id, "blast_perc_query_coverage": per_cov, "blast_evalue": eval, "blast_aln_length": aln} ]
    @param fields: [list] Column names to include as metadata (must at least contain observation_name): observation_sum and seed_sequence will be excluded, rdp_tax_and_bootstrap will be split in two metadata.
    @param samples_names: [list] List of sample names.
    @param output_biom: [str] Path to the output file (format: BIOM).
    @param output_fasta: [str] Path to the output file (format: fasta).
    """
    biom = Biom( matrix_type="sparse" )
    seed_seq_idx = -1
    metadata_index = dict()
    sample_index = dict()
    clusters_count = dict()
    clusters_metadata = dict()
    in_fh = open( input_tsv )
    if output_fasta is not None:
        Fasta_fh = FastaIO( output_fasta, "w" )
    # Parse the header and store the column indices
    header = in_fh.readline()
    if header.startswith("#"):
        header = header[1:]
    header = header.strip()
    seed_seq_idx, metadata_index, sample_index = header_line_dict(fields, header, samples_names)
    if output_fasta is not None and seed_seq_idx == -1:
        raise Exception("\nYou want to extract the seed fasta sequences but there is no seed_sequence column in your TSV file.\n\n")
    # Count by sample, and metadata
    for line in in_fh:
        cluster_name = ""
        line_list = line.strip().split("\t")
        count_by_sample = {}
        metadata_dict = {}
        # Parse columns
        for idx, val in enumerate(line_list):
            # Recover metadata
            if idx in metadata_index:
                if metadata_index[idx] == "observation_name":
                    cluster_name = val
                else:
                    metadata_dict[metadata_index[idx]] = val
            # Recover samples count
            elif idx in sample_index and int(val) > 0:  # values are strings from split(); cast before comparing
                count_by_sample[sample_index[idx]] = int(val)
            # Recover seed sequence
            elif idx == seed_seq_idx:
                seed_seq = val
        # If a fasta output file is requested, store the seed sequence
        if output_fasta is not None:
            seq = Sequence( cluster_name, seed_seq )
            Fasta_fh.write(seq)
        if "taxonomy" in metadata_dict:
            metadata_dict["taxonomy"] = metadata_dict["taxonomy"].split(";")
        # Format the RDP taxonomy to fit the BIOM format
        if "rdp_tax_and_bootstrap" in metadata_dict:
            metadata_dict["rdp_taxonomy"] = []
            metadata_dict["rdp_bootstrap"] = []
            tax = metadata_dict["rdp_tax_and_bootstrap"].rstrip(";").split(";")
            for i in range(0, len(tax), 2):
                metadata_dict["rdp_taxonomy"].append(tax[i])
                metadata_dict["rdp_bootstrap"].append(tax[i+1].replace("(", "").replace(")", ""))
            metadata_dict.pop("rdp_tax_and_bootstrap")
        # Format the blast taxonomy to fit the BIOM format (one consensus blast_taxonomy and possibly multiple detailed blast_affiliations)
        if "blast_taxonomy" in metadata_dict:
            metadata_dict["blast_taxonomy"] = metadata_dict["blast_taxonomy"].split(";")
            # Check multi-hit blast: filter taxonomy hits that are inconsistent with blast_taxonomy (if the TSV was modified), and compute the consensus taxonomy (if a multi-hit line was suppressed)
            if metadata_dict["blast_subject"] == "multi-subject" and multi_hit_dict is not None:
                if cluster_name not in multi_hit_dict:
                    raise Exception("\n" + cluster_name + " has the multi-subject tag but is not present in your multi-hit TSV file. Please, provide the original multi-hit TSV file.\n\n")
                else:
                    metadata_dict["blast_taxonomy"], metadata_dict["blast_affiliations"] = observation_blast_parts(metadata_dict, multi_hit_dict[cluster_name])
                    if metadata_dict["blast_affiliations"] == []:
                        raise Exception("\nYour multi-hit TSV file is no longer consistent with your abundance TSV file for (at least) " + cluster_name + "\n\n")
            # No multi tag: the blast affiliation is equal to blast_taxonomy
            else:
                blast_dict = {key.replace("blast_", ""): metadata_dict[key] for key in metadata_dict if key.startswith("blast")}
                metadata_dict["blast_affiliations"] = [blast_dict]
            # Remove the blast metadata which were moved into blast_affiliations
            for metadata in metadata_dict["blast_affiliations"][0]:
                if metadata != "taxonomy":
                    metadata_dict.pop("blast_" + metadata)
        # Add the cluster count to the clusters_count dict
        clusters_count[cluster_name] = count_by_sample
        # Add the cluster metadata to the clusters_metadata dict
        clusters_metadata[cluster_name] = metadata_dict
    if output_fasta is not None:
        Fasta_fh.close()
    in_fh.close()
    # Add samples to the BIOM
    for sample_name in samples_names:
        biom.add_sample( sample_name )
    # Add clusters to the BIOM
    for cluster_name in clusters_count:
        biom.add_observation( cluster_name, clusters_metadata[cluster_name] )
        for sample_name in samples_names:
            if clusters_count[cluster_name].get(sample_name, 0) > 0:  # only samples with a positive count were stored
                biom.add_count( cluster_name, sample_name, clusters_count[cluster_name][sample_name] )
    # Write
    BiomIO.write( output_biom, biom )
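# A minimal usage sketch for tsv_to_biom (hypothetical paths; header_line_dict
# and observation_blast_parts are companion helpers not shown in this excerpt):
#     tsv_to_biom( "abundance.tsv", None,
#                  ["observation_name", "seed_sequence"],
#                  ["sampleA", "sampleB"],
#                  "abundance.biom", "seeds.fasta" )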