def write_subset(in_path, out_path, selected):
    """
    Write to out_path the FASTA records of in_path whose id belongs to selected.

    @param in_path: [str] Path of the input FASTA file.
    @param out_path: [str] Path of the output FASTA file (overwritten).
    @param selected: [iterable] Record ids to keep.
    """
    # Hoist membership into a set: O(1) per record instead of O(n) when
    # 'selected' is a list.
    selected_ids = set(selected)
    FH_in = FastaIO(in_path)
    FH_out = FastaIO(out_path, "w")
    try:
        for record in FH_in:
            if record.id in selected_ids:
                FH_out.write(record)
    finally:
        # Close handles even if iteration raises (original leaked them on error).
        FH_in.close()
        FH_out.close()
def biom_fasta_update(biom_in, fasta_in, fasta_out, log_file):
    """
    Keep in fasta_out only the fasta_in records that exist as observations in
    the BIOM file, and log the number of sequences read and written.

    @param biom_in: [str] Path of the BIOM file (JSON).
    @param fasta_in: [str] Path of the input FASTA file.
    @param fasta_out: [str] Path of the filtered output FASTA file.
    @param log_file: [str] Path of the log file (overwritten).
    """
    FH_in = FastaIO(fasta_in)
    FH_out = FastaIO(fasta_out, "w")
    biom = BiomIO.from_json(biom_in)
    seq_in = 0
    seq_out = 0
    try:
        for record in FH_in:
            seq_in += 1
            try:
                biom.find_idx("observation", record.id)
            except ValueError:
                # Record absent from the BIOM observations: drop it.
                pass
            else:
                FH_out.write(record)
                seq_out += 1
    finally:
        FH_in.close()
        FH_out.close()
    # Fix: the original opened the log file and never closed it; 'with'
    # guarantees the buffer is flushed and the descriptor released.
    with open(log_file, "w") as FH_log:
        FH_log.write("Number of sequence in :" + str(seq_in) + "\n")
        FH_log.write("Number of sequence out :" + str(seq_out) + "\n")
# NOTE(review): this is a duplicate definition of biom_fasta_update (the file
# contains two identical copies); at module scope the later definition shadows
# the earlier one. Consider removing one copy — confirm with the file owner.
def biom_fasta_update(biom_in, fasta_in, fasta_out, log_file):
    """
    Keep in fasta_out only the fasta_in records that exist as observations in
    the BIOM file, and log the number of sequences read and written.

    @param biom_in: [str] Path of the BIOM file (JSON).
    @param fasta_in: [str] Path of the input FASTA file.
    @param fasta_out: [str] Path of the filtered output FASTA file.
    @param log_file: [str] Path of the log file (overwritten).
    """
    FH_in = FastaIO(fasta_in)
    FH_out = FastaIO(fasta_out, "w")
    biom = BiomIO.from_json(biom_in)
    seq_in = 0
    seq_out = 0
    try:
        for record in FH_in:
            seq_in += 1
            try:
                biom.find_idx("observation", record.id)
            except ValueError:
                # Record absent from the BIOM observations: drop it.
                pass
            else:
                FH_out.write(record)
                seq_out += 1
    finally:
        FH_in.close()
        FH_out.close()
    # Fix: the original opened the log file and never closed it; 'with'
    # guarantees the buffer is flushed and the descriptor released.
    with open(log_file, "w") as FH_log:
        FH_log.write("Number of sequence in :" + str(seq_in) + "\n")
        FH_log.write("Number of sequence out :" + str(seq_out) + "\n")
def process(params):
    """
    Aggregate affiliated OTUs: parse OTUs from most to least abundant, keep an
    OTU as-is when its affiliation is poor (%id or %cov under thresholds) or
    when its taxonomy is new, otherwise merge it into the first OTU seen with
    the same taxonomy. Writes the output BIOM, the filtered FASTA and the
    aggregation composition file, and logs simple statistics.

    @param params: [Namespace] Expects input_biom, input_fasta, output_biom,
        output_fasta, output_compo, log_file, identity, coverage, taxon_ignored.
    """
    biom_in = BiomIO.from_json(params.input_biom)

    # check if biom_in has blast_taxonomy affiliations
    if not biom_in.has_metadata("blast_affiliations"):
        raise_exception(Exception("\n\n#ERROR : Your input biom file, " + os.path.basename(params.input_biom) + ", does not contain any blast_affiliations metadata.\n\n"))

    biom_out = Biom(generated_by='FROGS_aggregate_affiliated_otu', matrix_type="sparse")
    # add samples in biom_out
    for sample_name in biom_in.get_samples_names():
        biom_out.add_sample(sample_name)

    # compute observation sum, to iterate from most to least abundant OTU
    otu_sums = {}
    for otu_name, count_sum in biom_in.get_observations_counts():
        otu_sums[otu_name] = count_sum

    # save "confident" taxonomy -> name of the first OTU carrying it
    otu_by_tax = dict()
    # save aggregated_otu composition (kept OTU -> list of merged OTU names)
    aggregated_otu = OrderedDict()
    otu_in = 0
    otu_out = 0
    otu_aggregated = 0

    # parse otu from most abundant to less ones
    for otu_name in sorted(otu_sums, key=lambda i: int(otu_sums[i]), reverse=True):
        otu_in += 1
        observation = biom_in.get_observations_by_name(otu_name)
        # is this OTU poorly affiliated? Track the worst %id / %cov over all
        # (non-ignored) blast affiliations, and collect distinct taxonomies.
        min_id = 100
        min_cov = 100
        tax = list()
        for affiliation in observation["metadata"]["blast_affiliations"]:
            if params.taxon_ignored and any(t in ";".join(affiliation["taxonomy"]) for t in params.taxon_ignored):
                continue
            if not affiliation["taxonomy"] in tax:
                tax.append(affiliation["taxonomy"])
            percent_id = affiliation["perc_identity"]
            percent_cov = affiliation["perc_query_coverage"]
            if percent_id < min_id:
                min_id = percent_id
            if percent_cov < min_cov:
                min_cov = percent_cov

        # Add otu because of poor affiliations stat
        if min_id < params.identity or min_cov < params.coverage:
            otu_out += 1
            biom_out.add_observation(otu_name, observation["metadata"])
            for sample_name in biom_in.get_samples_names():
                count = biom_in.get_count(otu_name, sample_name)
                # Fix: guard zero counts like the new-taxonomy branch below —
                # the original stored explicit zeros in the sparse matrix here.
                if count > 0:
                    biom_out.add_count(otu_name, sample_name, count)
            aggregated_otu[otu_name] = list()
        # for confident taxonomy
        else:
            # check if all taxonomies are new
            is_new_tax = True
            equivalent_otu_name = ""
            for taxonomy in tax:
                if isinstance(taxonomy, list):
                    taxonomy = ";".join(taxonomy)
                if taxonomy in otu_by_tax:
                    is_new_tax = False
                    if equivalent_otu_name == "":
                        equivalent_otu_name = otu_by_tax[taxonomy]
                    elif otu_by_tax[taxonomy] != equivalent_otu_name:
                        # Fix: balanced parenthesis and typo in warning message.
                        Logger.static_write(params.log_file, '\tWarning: observation ' + otu_name + ' shares taxonomy (' + taxonomy + ') with another OTU : ' + otu_by_tax[taxonomy] + ', first detected OTU will be kept : ' + equivalent_otu_name + '\n')
            # if new tax, add OTU and save taxonomies
            if is_new_tax:
                otu_out += 1
                biom_out.add_observation(otu_name, observation["metadata"])
                for sample_name in biom_in.get_samples_names():
                    count = biom_in.get_count(otu_name, sample_name)
                    if count > 0:
                        biom_out.add_count(otu_name, sample_name, count)
                aggregated_otu[otu_name] = list()
                for taxonomy in tax:
                    if isinstance(taxonomy, list):
                        taxonomy = ";".join(taxonomy)
                    otu_by_tax[taxonomy] = otu_name
            # else aggregation of OTU into the first OTU with the same taxonomy
            else:
                otu_aggregated += 1
                equivalent_otu = biom_out.get_observations_by_name(equivalent_otu_name)
                # add blast_affiliations
                aggregated_blast_affi = equivalent_otu["metadata"]["blast_affiliations"] + observation["metadata"]["blast_affiliations"]
                biom_out.add_metadata(equivalent_otu_name, "blast_affiliations", aggregated_blast_affi, subject_type="observation", erase_warning=False)
                # update consensus tax
                consensus_tax = get_tax_consensus([affi["taxonomy"] for affi in aggregated_blast_affi])
                biom_out.add_metadata(equivalent_otu_name, "blast_taxonomy", consensus_tax, subject_type="observation", erase_warning=False)
                # update counts
                for sample_name in biom_in.get_samples_names():
                    count = biom_out.get_count(equivalent_otu_name, sample_name) + biom_in.get_count(otu_name, sample_name)
                    biom_out.change_count(equivalent_otu_name, sample_name, count)
                # save aggregated composition
                aggregated_otu[equivalent_otu_name].append(otu_name)
                # update known taxonomies
                for taxonomy in tax:
                    if isinstance(taxonomy, list):
                        taxonomy = ";".join(taxonomy)
                    if not taxonomy in otu_by_tax:
                        otu_by_tax[taxonomy] = equivalent_otu_name

    # write biom output file
    BiomIO.write(params.output_biom, biom_out)

    # update fasta: keep only the OTUs retained in biom_out
    FH_in = FastaIO(params.input_fasta)
    FH_out = FastaIO(params.output_fasta, "wt")
    try:
        for record in FH_in:
            if record.id in aggregated_otu:
                FH_out.write(record)
    finally:
        FH_in.close()
        FH_out.close()

    # write otu composition (kept OTU followed by the OTUs merged into it)
    with open(params.output_compo, "wt") as FH_compo:
        for OTU in aggregated_otu:
            FH_compo.write(OTU + " " + " ".join(aggregated_otu[OTU]) + "\n")

    # simple log stat
    Logger.static_write(params.log_file, "# nb OTU in : " + str(otu_in) + "\n")
    Logger.static_write(params.log_file, "# nb OTU out : " + str(otu_out) + "\n")
    Logger.static_write(params.log_file, "# nb OTU aggregated : " + str(otu_aggregated) + "\n")
if record.string in observation_id_by_seq: observation_id = observation_id_by_seq[record.string] reference_id = re.search("reference=([^\s]+)", record.description).group(1) if observation_id not in reference_by_observation_id: reference_by_observation_id[observation_id] = reference_id elif len(reference_by_observation_id[observation_id].split( ",")) > len(reference_id.split(",")): reference_by_observation_id[observation_id] = reference_id FH_reads.close() if len(observation_id_by_seq) != len(reference_by_observation_id): missing = list() for seed_seq in observation_id_by_seq: if observation_id_by_seq[ seed_seq] not in reference_by_observation_id: missing.append(observation_id_by_seq[seed_seq]) raise Exception( "All the centroids sequences cannot be retrieved in reads files. Centroids without read: '" + "' '".join(missing) + "'.") # Write seeds fasta with reference information FH_seeds = FastaIO(args.seeds_fasta) FH_seeds_with_ref = FastaIO(args.annotated_fasta, "w") for record in FH_seeds: record.id = record.id.split(";size=")[0] record.description = "reference=" + reference_by_observation_id[ record.id] FH_seeds_with_ref.write(record) FH_seeds.close() FH_seeds_with_ref.close()
record_seq = record.string.replace("-", "").replace(".", "") if record_seq in observation_ids_by_seq: observation_ids_by_centroid_id[record.id] = observation_ids_by_seq[record_seq] FH_reads.close() # Get reference by observation reference_by_observation_id = dict() for file in args.reads: FH_reads = SequenceFileReader.factory(file) for record in FH_reads: if record.id in observation_ids_by_centroid_id: observation_ids = observation_ids_by_centroid_id[record.id] reference_id = re.search("reference=([^\s]+)", record.description).group(1) for current_obs_id in observation_ids: if current_obs_id not in reference_by_observation_id: reference_by_observation_id[current_obs_id] = reference_id elif len(reference_by_observation_id[current_obs_id].split(",")) > len(reference_id.split(",")): reference_by_observation_id[current_obs_id] = reference_id FH_reads.close() if nb_observations != len(reference_by_observation_id): raise Exception("All the centroids sequences cannot be retrieved in reads files.") # Write seeds fasta with reference information FH_seeds = FastaIO(args.input) FH_seeds_with_ref = FastaIO(args.output, "w") for record in FH_seeds: record.description = "reference=" + reference_by_observation_id[record.id] FH_seeds_with_ref.write(record) FH_seeds.close() FH_seeds_with_ref.close()