Esempio n. 1
0
def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to outputed BIOM.
    @param nb_sampled: [int] Number of sampled sequences by sample.
    @param sampled_ratio: [float] Ratio of sampled sequences by sample. It is
                          only used when nb_sampled is None.
    @note: nb_sampled and sampled_ratio are mutually exclusive. An error is
           reported through raise_exception() when a sample contains fewer
           sequences than the number of sequences to sample.
    """
    initial_biom = BiomIO.from_json( input_biom )
    sampling_label = str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%"
    new_biom = Biom(
                    matrix_type="sparse",
                    generated_by="Sampling " + sampling_label + " elements by sample from " + input_biom
    )
    observations_already_added = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        sample_nb_sampled = nb_sampled
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        # The check uses the effective number of draws: nb_sampled is None in
        # ratio mode and cannot be compared with the sample size.
        if sample_seq < sample_nb_sampled:
            raise_exception( Exception( "\n\n#ERROR : " + str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences.\n\n" ))
        else:
            for _ in range(sample_nb_sampled):
                # Take an observation in initial BIOM and decrement its count
                # so the same sequence is not counted twice
                selected_observation = initial_biom.random_obs_by_sample(sample_name)
                selected_observation_id = selected_observation['id']
                initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
                # Put in new BIOM (the observation is declared only once)
                if selected_observation_id not in observations_already_added:
                    new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                    observations_already_added.add( selected_observation_id )
                new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
Esempio n. 2
0
def mask_observation(rdp_clusters_discards, blast_clusters_discards,
                     input_biom, output_biom):
    """
    @summary : mask either rdp affiliations and/or blast affiliations
    @param rdp_clusters_discards : [list] of clusters with rdp affiliations to mask (None to mask nothing)
    @param blast_clusters_discards : [list] of clusters with blast consensus affiliations to mask (None to mask nothing)
    @param input_biom : [str] Path to input biom file
    @param output_biom : [str] Path to output biom file with affiliations masked
    """

    biom = BiomIO.from_json(input_biom)
    for observation in biom.get_observations():
        metadata = observation['metadata']

        # remove rdp taxonomic metadata, keeping the type of the stored value
        if rdp_clusters_discards is not None and observation[
                'id'] in rdp_clusters_discards:
            if isinstance(metadata["rdp_taxonomy"], str):
                metadata["rdp_taxonomy"] = ""
                metadata["rdp_bootstrap"] = ""
            elif isinstance(metadata["rdp_taxonomy"], list):
                metadata["rdp_taxonomy"] = list()
                metadata["rdp_bootstrap"] = list()

        # remove blast metadata
        if blast_clusters_discards is not None and observation[
                'id'] in blast_clusters_discards:
            metadata["blast_affiliations"] = list()
            metadata["blast_taxonomy"] = list()

    BiomIO.write(output_biom, biom)
Esempio n. 3
0
def aff_to_metadata(reference_file,
                    biom_in,
                    biom_out,
                    blast_files=None,
                    rdp_files=None):
    """
    @summary: Adds the blast and/or RDP affiliations as observation metadata in a BIOM file.
    @param reference_file: [str] The path to the reference file.
    @param biom_in: [str] The path to the Biom file to process.
    @param biom_out: [str] The path to the biom output file.
    @param blast_files: [list] the list of the path to the blast results in tabular format (outfmt 6 with NCBI Blast+). None to skip the blast metadata.
    @param rdp_files: [list] the list of path to the RDPClassifier results. None to skip the RDP metadata.
    @note: For each requested method, an observation without annotation receives empty lists as metadata.
    """
    with_blast = blast_files is not None
    with_rdp = rdp_files is not None

    # Taxonomy by reference ID; released as soon as the blast annotations are built
    taxonomy_by_reference = get_tax_from_fasta(reference_file)
    blast_annot_by_cluster = dict()
    if with_blast:
        blast_annot_by_cluster = get_bests_blast_affi(blast_files,
                                                      taxonomy_by_reference)
    del taxonomy_by_reference

    rdp_annot_by_cluster = dict()
    if with_rdp:
        rdp_annot_by_cluster = get_rdp_affi(rdp_files)

    # Store the annotations on each observation
    biom = BiomIO.from_json(biom_in)
    for observation in biom.get_observations():
        obs_id = observation["id"]

        if with_blast:
            consensus = list()
            affiliations = list()
            if obs_id in blast_annot_by_cluster:
                # Alignments are indexed by their taxonomy string
                alignments = blast_annot_by_cluster[obs_id]['alignments']
                consensus = get_tax_consensus(
                    [tax_str.split(';') for tax_str in alignments])
                for tax_str in alignments:
                    affiliations.extend(alignments[tax_str])
            biom.add_metadata(obs_id, "blast_affiliations", affiliations,
                              "observation")
            biom.add_metadata(obs_id, "blast_taxonomy", consensus,
                              "observation")

        if with_rdp:
            taxonomy = list()
            bootstrap = list()
            if obs_id in rdp_annot_by_cluster:
                rdp_annot = rdp_annot_by_cluster[obs_id]
                taxonomy = rdp_annot['taxonomy']
                bootstrap = rdp_annot['bootstrap']
            biom.add_metadata(obs_id, "rdp_taxonomy", taxonomy, "observation")
            biom.add_metadata(obs_id, "rdp_bootstrap", bootstrap,
                              "observation")

    BiomIO.write(biom_out, biom)
Esempio n. 4
0
def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to outputed BIOM.
    @param nb_sampled: [int] Number of sampled sequences by sample.
    @param sampled_ratio: [float] Ratio of sampled sequences by sample. It is
                          only used when nb_sampled is None.
    @note: nb_sampled and sampled_ratio are mutually exclusive. An Exception
           is raised when a sample contains fewer sequences than the number
           of sequences to sample.
    """
    initial_biom = BiomIO.from_json( input_biom )
    sampling_label = str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%"
    new_biom = Biom(
                    matrix_type="sparse",
                    generated_by="Sampling " + sampling_label + " elements by sample from " + input_biom
    )
    observations_already_added = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        sample_nb_sampled = nb_sampled
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        # The check uses the effective number of draws: nb_sampled is None in
        # ratio mode and cannot be compared with the sample size.
        if sample_seq < sample_nb_sampled:
            raise Exception( str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences." )
        for _ in range(sample_nb_sampled):
            # Take an observation in initial BIOM and decrement its count
            # so the same sequence is not counted twice
            selected_observation = initial_biom.random_obs_by_sample(sample_name)
            selected_observation_id = selected_observation['id']
            initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
            # Put in new BIOM (the observation is declared only once)
            if selected_observation_id not in observations_already_added:
                new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                observations_already_added.add( selected_observation_id )
            new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
Esempio n. 5
0
def process( in_biom, out_biom, out_metadata ):
    """
    @summary: Moves the 'blast_affiliations' metadata of each observation into
              a TSV file and writes a BIOM where the other list metadata are
              joined into strings and where a 'taxonomy' metadata is built
              from 'blast_taxonomy'.
    @param in_biom: [str] Path to the processed BIOM.
    @param out_biom: [str] Path to the outputed BIOM.
    @param out_metadata: [str] Path to the outputed TSV file. It contains one
                         line by blast affiliation.
    @note: Observations with a None 'blast_taxonomy' receive a 'taxonomy'
           filled with "Unclassified" ranks. The number of ranks is the depth
           of the last classified observation met.
    """
    ordered_blast_keys = ["taxonomy", "subject", "evalue", "perc_identity", "perc_query_coverage", "aln_length"] # Keys in blast_affiliations metadata
    taxonomy_depth = 0
    unclassified_observations = list()

    # The context manager guarantees that the metadata file is flushed and closed
    with open( out_metadata, "w" ) as FH_metadata:
        FH_metadata.write( "#OTUID\t" + "\t".join(ordered_blast_keys) + "\n" )
        biom = BiomIO.from_json( in_biom )
        for observation in biom.get_observations():
            metadata = observation["metadata"]
            # Iterate on a copy of the keys: entries are deleted in the loop
            for metadata_key in list(metadata.keys()):
                if metadata_key == "blast_affiliations": # Extract blast_affiliations metadata in metadata_file
                    if metadata[metadata_key] is not None:
                        for current_affi in metadata[metadata_key]:
                            if isinstance(current_affi["taxonomy"], (list, tuple)):
                                current_affi["taxonomy"] = ";".join( current_affi["taxonomy"] )
                            FH_metadata.write( observation["id"] + "\t" + "\t".join([str(current_affi[item]) for item in ordered_blast_keys]) + "\n" )
                    del metadata[metadata_key]
                elif metadata[metadata_key] is not None: # All list are transformed in string
                    if isinstance(metadata[metadata_key], (list, tuple)):
                        metadata[metadata_key] = ";".join( map(str, metadata[metadata_key]) )
            if "blast_taxonomy" in metadata:
                if metadata["blast_taxonomy"] is None:
                    unclassified_observations.append( observation["id"] )
                    metadata["taxonomy"] = list()
                else:
                    taxonomy_ranks = metadata["blast_taxonomy"].split(";")
                    taxonomy_depth = len(taxonomy_ranks)
                    metadata["taxonomy"] = taxonomy_ranks
    # Add "Unclassified" ranks in unclassified observations
    if taxonomy_depth > 0:
        for observation_id in unclassified_observations:
            observation_metadata = biom.get_observation_metadata(observation_id)
            observation_metadata["taxonomy"] = ["Unclassified"] * taxonomy_depth
    BiomIO.write( out_biom, biom )
Esempio n. 6
0
def process( in_biom, out_biom, out_metadata ):
    """
    @summary: Moves the 'blast_affiliations' metadata of each observation into
              a TSV file and writes a BIOM where the other list metadata are
              joined into strings and where a 'taxonomy' metadata is built
              from 'blast_taxonomy'.
    @param in_biom: [str] Path to the processed BIOM.
    @param out_biom: [str] Path to the outputed BIOM.
    @param out_metadata: [str] Path to the outputed TSV file. It contains one
                         line by blast affiliation.
    @note: Observations with a None 'blast_taxonomy' receive a 'taxonomy'
           filled with "Unclassified" ranks. The number of ranks is the depth
           of the last classified observation met.
    """
    ordered_blast_keys = ["taxonomy", "subject", "evalue", "perc_identity", "perc_query_coverage", "aln_length"] # Keys in blast_affiliations metadata
    taxonomy_depth = 0
    unclassified_observations = list()

    # The context manager guarantees that the metadata file is flushed and closed
    with open( out_metadata, "w" ) as FH_metadata:
        FH_metadata.write( "#OTUID\t" + "\t".join(ordered_blast_keys) + "\n" )
        biom = BiomIO.from_json( in_biom )
        for observation in biom.get_observations():
            obs_metadata = observation["metadata"]
            # Iterate on a copy of the keys: entries are deleted in the loop
            for metadata_key in list(obs_metadata.keys()):
                metadata_value = obs_metadata[metadata_key]
                if metadata_key == "blast_affiliations": # Extract blast_affiliations metadata in metadata_file
                    if metadata_value is not None:
                        for current_affi in metadata_value:
                            if isinstance(current_affi["taxonomy"], (list, tuple)):
                                current_affi["taxonomy"] = ";".join( current_affi["taxonomy"] )
                            FH_metadata.write( observation["id"] + "\t" + "\t".join([str(current_affi[item]) for item in ordered_blast_keys]) + "\n" )
                    del obs_metadata[metadata_key]
                elif metadata_value is not None: # All list are transformed in string
                    if isinstance(metadata_value, (list, tuple)):
                        obs_metadata[metadata_key] = ";".join( map(str, metadata_value) )
            if "blast_taxonomy" in obs_metadata:
                if obs_metadata["blast_taxonomy"] is None:
                    unclassified_observations.append( observation["id"] )
                    obs_metadata["taxonomy"] = list()
                else:
                    taxonomy_ranks = obs_metadata["blast_taxonomy"].split(";")
                    taxonomy_depth = len(taxonomy_ranks)
                    obs_metadata["taxonomy"] = taxonomy_ranks
    # Add "Unclassified" ranks in unclassified observations
    if taxonomy_depth > 0:
        for observation_id in unclassified_observations:
            observation_metadata = biom.get_observation_metadata(observation_id)
            observation_metadata["taxonomy"] = ["Unclassified"] * taxonomy_depth
    BiomIO.write( out_biom, biom )
Esempio n. 7
0
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line lists
                            the IDs of the sequences of one cluster, the first
                            one being the seed.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    @note : Clusters whose seed ID contains "FROGS_combined" are named with
            the "_FROGS_combined" suffix and tagged with the comment "WARNING".
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            preclusters_count[precluster_id] = count_str # For large dataset store count into a string consumes minus RAM than a sparse count

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count
    with open( clusters_file ) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            line_fields = line.strip().split()
            seed_id = line_fields[0]
            if "FROGS_combined" in seed_id:
                cluster_name = "Cluster_" + str(cluster_idx) + "_FROGS_combined"
                comment = "WARNING"
            else:
                cluster_name = "Cluster_" + str(cluster_idx)
                comment = "na"
            cluster_count = {key:0 for key in samples}
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                preclusters_count[real_seq_id] = None # The count is no longer needed: release the string
            # Add cluster on biom
            biom.add_observation( cluster_name, {'comment': comment, 'seed_id':seed_id.rsplit(size_separator, 1)[0]} )
            observation_idx = biom.find_idx("observation", cluster_name)
            for sample_idx, sample_name in enumerate(samples):
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )

    # Write
    BiomIO.write( output_biom, biom )
Esempio n. 8
0
def to_biom(clusters_file, count_file, output_biom, size_separator):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line lists
                            the IDs of the sequences of one cluster, the first
                            one being the seed.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom(generated_by='swarm', matrix_type="sparse")

    # Preclusters count by sample (only the non-null counts are stored)
    preclusters_count = dict()
    with open(count_file) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            line_fields = line.strip().split()
            count_by_sample = {}
            for idx, val in enumerate(line_fields[1:]):
                # Fields are read as strings: convert before the comparison
                count = int(val)
                if count > 0:
                    count_by_sample[samples[idx]] = count
            preclusters_count[line_fields[0]] = count_by_sample

    # Add samples
    for sample_name in samples:
        biom.add_sample(sample_name)

    # Process count
    with open(clusters_file) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            cluster_name = "Cluster_" + str(cluster_idx)
            cluster_count = {key: 0 for key in samples}
            line_fields = line.strip().split()
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                precluster_count = preclusters_count[real_seq_id]
                for preclust_sample in precluster_count:
                    cluster_count[preclust_sample] += precluster_count[
                        preclust_sample]
                # The count is no longer needed: release it
                preclusters_count[real_seq_id] = None
            # Add cluster on biom
            biom.add_observation(
                cluster_name,
                {'seed_id': line_fields[0].rsplit(size_separator, 1)[0]})
            for sample_name in samples:
                if cluster_count[sample_name] > 0:
                    biom.add_count(cluster_name, sample_name,
                                   cluster_count[sample_name])

    # Write
    BiomIO.write(output_biom, biom)
Esempio n. 9
0
def filter_biom( removed_observations, in_biom, out_biom ):
    """
    @summary: Writes a copy of the BIOM without the specified observations.
    @param removed_observations: [dict] Each key is the name of an observation to remove.
    @param in_biom: [str]: Path to the BIOM file to clean.
    @param out_biom: [str]: Path to the BIOM file written after the removal.
    """
    cleaned_biom = BiomIO.from_json( in_biom )
    cleaned_biom.remove_observations( removed_observations )
    BiomIO.write( out_biom, cleaned_biom )
Esempio n. 10
0
def filter_biom( removed_observations, in_biom, out_biom ):
    """
    @summary: Loads a BIOM, drops the specified observations and writes the result.
    @param removed_observations: [dict] Each key is the name of an observation to drop.
    @param in_biom: [str]: Path to the BIOM file to load.
    @param out_biom: [str]: Path to the BIOM file to write.
    """
    filtered = BiomIO.from_json( in_biom )
    filtered.remove_observations( removed_observations )
    BiomIO.write( out_biom, filtered )
Esempio n. 11
0
def remove_observations( removed_observations, input_biom, output_biom ):
    """
    @summary: Writes a copy of the input BIOM without the listed observations.
    @param removed_observations: [list] The names of the observations to remove.
    @param input_biom: [str] The path to the BIOM to load.
    @param output_biom: [str] The path to the BIOM to write.
    """
    cleaned_biom = BiomIO.from_json(input_biom)
    cleaned_biom.remove_observations(removed_observations)
    BiomIO.write(output_biom, cleaned_biom)
Esempio n. 12
0
def remove_observations(removed_observations, input_biom, output_biom):
    """
    @summary: Loads a BIOM, drops the listed observations and writes the result.
    @param removed_observations: [list] The names of the observations to drop.
    @param input_biom: [str] The path to the BIOM to load.
    @param output_biom: [str] The path to the BIOM to write.
    """
    filtered = BiomIO.from_json( input_biom )
    filtered.remove_observations( removed_observations )
    BiomIO.write( output_biom, filtered )
Esempio n. 13
0
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line lists
                            the IDs of the sequences of one cluster, the first
                            one being the seed.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            preclusters_count[precluster_id] = count_str # For large dataset store count into a string consumes minus RAM than a sparse count

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count
    with open( clusters_file ) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            cluster_name = "Cluster_" + str(cluster_idx)
            cluster_count = {key:0 for key in samples}
            line_fields = line.strip().split()
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                preclusters_count[real_seq_id] = None # The count is no longer needed: release the string
            # Add cluster on biom
            biom.add_observation( cluster_name, {'seed_id':line_fields[0].rsplit(size_separator, 1)[0]} )
            observation_idx = biom.find_idx("observation", cluster_name)
            for sample_idx, sample_name in enumerate(samples):
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )

    # Write
    BiomIO.write( output_biom, biom )
def aff_to_metadata(reference_file, biom_in, biom_out, blast_files=None, rdp_files=None):
    """
    @summary: Add taxonomy metadata on biom file from a blast result.
    @param reference_file: [str] The path to the reference file.
    @param biom_in: [str] The path to the Biom file to process.
    @param biom_out: [str] The path to the biom output file.
    @param blast_files: [list] the list of the path to the blast results in tabular format (outfmt 6 with NCBI Blast+). None to skip the blast metadata.
    @param rdp_files: [list] the list of path to the RDPClassifier results. None to skip the RDP metadata.
    @note: Observations without blast annotation receive a None
           'blast_taxonomy' and an empty 'blast_affiliations'. Observations
           without RDP annotation receive None for both RDP metadata.
    """
    # Build an hash with the taxonomy for each gene (key=gene_id ; value=gene_taxonomy)
    taxonomy_by_reference = get_tax_from_fasta( reference_file )

    # Retrieve blast clusters annotations
    cluster_blast_annot = dict()
    if blast_files is not None:
        cluster_blast_annot = get_bests_blast_affi( blast_files, taxonomy_by_reference )
    del taxonomy_by_reference

    # Retrieve rdp clusters annotations
    cluster_rdp_annot = dict()
    if rdp_files is not None:
        cluster_rdp_annot = get_rdp_affi( rdp_files )

    # Add metadata to biom
    biom = BiomIO.from_json(biom_in)
    for cluster in biom.get_observations():
        cluster_id = cluster["id"]
        # Blast
        if blast_files is not None:
            blast_taxonomy = None
            blast_affiliations = list()
            if cluster_id in cluster_blast_annot: # Current observation has a match
                blast_affiliations = cluster_blast_annot[cluster_id]['alignments']
                blast_taxonomy = get_tax_consensus( [alignment['taxonomy'] for alignment in blast_affiliations] )
            biom.add_metadata( cluster_id, "blast_affiliations", blast_affiliations, "observation" )
            biom.add_metadata( cluster_id, "blast_taxonomy", blast_taxonomy, "observation" )
        # RDP
        if rdp_files is not None:
            rdp_taxonomy = None
            rdp_bootstrap = None
            if cluster_id in cluster_rdp_annot:
                rdp_taxonomy = cluster_rdp_annot[cluster_id]['taxonomy']
                rdp_bootstrap = cluster_rdp_annot[cluster_id]['bootstrap']
            biom.add_metadata(cluster_id, "rdp_taxonomy", rdp_taxonomy, "observation")
            biom.add_metadata(cluster_id, "rdp_bootstrap", rdp_bootstrap, "observation")
    BiomIO.write(biom_out, biom)
Esempio n. 15
0
def process(args):
    """
    @summary: Runs the rarefaction and the taxonomy tree commands on the input
              BIOM then writes the summary file.
    @param args: [Namespace] The script parameters. The attributes read here
                 are: output_file, input_biom, taxonomy_tag, multiple_tag,
                 tax_consensus_tag, taxonomic_ranks, rarefaction_ranks,
                 log_file and debug.
    @note: The temporary files are created in the directory of the output
           file and are deleted at the end except when args.debug is set.
    """
    out_dir = os.path.split(args.output_file)[0]
    tmp_files = TmpFiles(out_dir)

    try:
        analysed_biom = args.input_biom
        taxonomy_tag = args.taxonomy_tag
        if args.multiple_tag is not None:
            taxonomy_tag = args.tax_consensus_tag
            if taxonomy_tag is None:
                # Multiple affiliations without consensus: the taxonomy of the
                # first affiliation is copied in a temporary metadata
                taxonomy_tag = "Used_taxonomy_FROGS-affi"
                analysed_biom = tmp_files.add("tax.biom")
                biom = BiomIO.from_json(args.input_biom)
                for observation in biom.get_observations():
                    metadata = observation["metadata"]
                    affiliations = metadata[args.multiple_tag]
                    if affiliations is None:
                        continue
                    if len(affiliations) > 0:
                        metadata[taxonomy_tag] = affiliations[0][
                            args.taxonomy_tag]
                BiomIO.write(analysed_biom, biom)
                del biom

        # Rarefaction on the depths of the selected ranks
        rarefaction_depths = list()
        for rank in args.rarefaction_ranks:
            rarefaction_depths.append(args.taxonomic_ranks.index(rank))
        rarefaction_cmd = Rarefaction(analysed_biom, tmp_files, taxonomy_tag,
                                      rarefaction_depths)
        rarefaction_cmd.submit(args.log_file)
        rarefaction_files = rarefaction_cmd.output_files

        # Taxonomy tree
        tree_count_file = tmp_files.add("taxCount.enewick")
        tree_ids_file = tmp_files.add("taxCount_ids.tsv")
        tree_cmd = TaxonomyTree(analysed_biom, taxonomy_tag, tree_count_file,
                                tree_ids_file)
        tree_cmd.submit(args.log_file)

        # Summary
        write_summary(args.output_file, args.input_biom, tree_count_file,
                      tree_ids_file, rarefaction_files, args)
    finally:
        if not args.debug:
            tmp_files.deleteAll()
Esempio n. 16
0
def impacted_obs_by_undesired_taxon(input_biom, undesired_taxon_list,
                                    in_all_or_in_consensus, biom_out,
                                    impacted_file):
    """
    @summary : Removes the blast affiliations that include an undesired taxon,
               recomputes the blast consensus of the modified observations and
               writes the list of the impacted observations.
    @param input_biom: [str] The path to the BIOM file to check.
    @param undesired_taxon_list: [list] list of string to look for in the
                                 taxonomy (joined with ';') of each affiliation.
    @param in_all_or_in_consensus: [bool] if True, an observation is reported
                                   when all its affiliations have been removed.
                                   If False, it is reported when the removal
                                   changes its blast consensus.
    @param biom_out: [str] path to biom with removed undesired taxonomy
    @param impacted_file: [str] The path to the output file. It contains one
                          observation ID by line.
    """
    biom = BiomIO.from_json(input_biom)

    # The context manager guarantees that the impacted list is flushed and closed
    with open(impacted_file, "w") as FH_impacted_file:
        for observation in biom.get_observations():
            metadata = observation['metadata']
            initial_blast_affi = metadata['blast_affiliations']

            # keep only the affiliations without undesired taxon
            new_blast_affi = list()
            for affiliation in initial_blast_affi:
                taxonomy_str = ";".join(affiliation["taxonomy"])
                if not any(t in taxonomy_str for t in undesired_taxon_list):
                    new_blast_affi.append(affiliation)

            # if some affi are masked, update blast_affiliations and blast_taxonomy
            if len(new_blast_affi) != len(initial_blast_affi):
                metadata['blast_affiliations'] = new_blast_affi
                new_consensus = get_tax_consensus(
                    [affi['taxonomy'] for affi in new_blast_affi])
                # delete mode if all affiliations belong to one of undesired taxon
                if in_all_or_in_consensus and len(new_blast_affi) == 0:
                    FH_impacted_file.write(str(observation["id"]) + "\n")
                # masking mode if the new consensus is changed because of ignoring undesired taxon
                elif not in_all_or_in_consensus and new_consensus != metadata[
                        'blast_taxonomy']:
                    FH_impacted_file.write(str(observation["id"]) + "\n")
                metadata['blast_taxonomy'] = new_consensus

    BiomIO.write(biom_out, biom)
Esempio n. 17
0
def process( args ):
    """
    @summary: Runs the rarefaction and the taxonomy tree commands on the input
              BIOM then writes the summary file.
    @param args: [Namespace] The script parameters. The attributes read here
                 are: output_file, input_biom, taxonomy_tag, multiple_tag,
                 tax_consensus_tag, taxonomic_ranks, rarefaction_ranks,
                 log_file and debug.
    @note: The temporary files are created in the directory of the output
           file and are deleted at the end except when args.debug is set.
    """
    tmp_files = TmpFiles( os.path.split(args.output_file)[0] )

    try:
        # Add temp taxonomy if multiple and without consensus
        tmp_biom = args.input_biom
        used_taxonomy_tag = args.taxonomy_tag
        if args.multiple_tag is not None:
            used_taxonomy_tag = args.tax_consensus_tag
            if args.tax_consensus_tag is None:
                used_taxonomy_tag = "Used_taxonomy_FROGS-affi"
                tmp_biom = tmp_files.add( "tax.biom" )
                biom = BiomIO.from_json( args.input_biom )
                for observation in biom.get_observations():
                    metadata = observation["metadata"]
                    affiliations = metadata[args.multiple_tag]
                    # The metadata can be None for an observation without
                    # affiliation: len(None) would raise a TypeError
                    if affiliations is not None and len(affiliations) > 0:
                        metadata[used_taxonomy_tag] = affiliations[0][args.taxonomy_tag]
                BiomIO.write( tmp_biom, biom )
                del biom

        # Rarefaction
        tax_depth = [args.taxonomic_ranks.index(rank) for rank in args.rarefaction_ranks]
        rarefaction_cmd = Rarefaction(tmp_biom, tmp_files, used_taxonomy_tag, tax_depth)
        rarefaction_cmd.submit( args.log_file )
        rarefaction_files = rarefaction_cmd.output_files

        # Taxonomy tree
        tree_count_file = tmp_files.add( "taxCount.enewick" )
        tree_ids_file = tmp_files.add( "taxCount_ids.tsv" )
        TaxonomyTree(tmp_biom, used_taxonomy_tag, tree_count_file, tree_ids_file).submit( args.log_file )

        # Writes summary
        write_summary( args.output_file, args.input_biom, tree_count_file, tree_ids_file, rarefaction_files, args )
    finally:
        if not args.debug:
            tmp_files.deleteAll()
    cmd_grinder2biom = os.path.join(os.path.dirname(os.path.abspath(__file__)), "grinder2biom.py") + \
        " --affiliation " + os.path.abspath(args.databank) + \
        " --output " + real_biom + \
        " --samples"
    for current_sample in samples:
        cmd_grinder2biom += " '" + current_sample['name'] + ":" + current_sample['path'] + "'"
    subprocess.check_call( cmd_grinder2biom, shell=True )

    # Add reference id in checked BIOM
    biom = BiomIO.from_json( args.checked_biom )
    fasta = FastaIO( args.checked_fasta )
    for record in fasta:
        reference = re.search("reference=([^\s]+)", record.description).group(1)
        biom.add_metadata( record.id, "grinder_source", reference, "observation" )
    fasta.close()
    BiomIO.write( checked_biom, biom )
    del(biom)

    # Compare expected to obtained
    for current_sample in samples:
        print current_sample['name']
        cmd_compareSample = os.path.join(os.path.dirname(os.path.abspath(__file__)), "biomCmpTax.py") \
            + " --real-biom " + os.path.abspath(real_biom) \
            + " --real-tax-key 'real_taxonomy'" \
            + " --checked-biom " + os.path.abspath(checked_biom) \
            + " --checked-tax-key '" + args.taxonomy_key + "'" \
            + (" --multi-affiliations" if args.multi_affiliations else "") \
            + (" --uniq-groups " + args.uniq_groups if args.uniq_groups is not None else "") \
            + " --sample " + current_sample['name']
        print subprocess.check_output( cmd_compareSample, shell=True )
        print ""
Esempio n. 19
0
#
##################################################################################################################################################
if __name__ == "__main__":
    # Script entry point: for each record of the fasta file, the taxonomy
    # found in the record ID is stored as observation metadata in the BIOM.

    # Manage parameters
    parser = argparse.ArgumentParser(description="Add taxonomy from UTAX result in BIOM file.")
    parser.add_argument( '-t', '--taxonomy-tag', default="taxonomy", help="The taxonomy tag in BIOM file. [Default: taxonomy]")
    parser.add_argument( '-v', '--version', action='version', version=__version__)
    # Inputs
    group_input = parser.add_argument_group('Inputs')
    group_input.add_argument('-f', '--input-fasta', required=True, help='Path to the sequence file outputed by UTAX (format: fasta).')
    group_input.add_argument('-b', '--input-biom', required=True, help='Path to the abundance file (format: BIOM).')
    # Outputs
    group_output = parser.add_argument_group('Outputs')
    group_output.add_argument('-o', '--output-biom', required=True, help='Path to the abundance file with taxonomy (format: BIOM).')
    args = parser.parse_args()

    # Process
    biom = BiomIO.from_json( args.input_biom )
    fasta = FastaIO( args.input_fasta )
    for record in fasta:
        # record.id example: Cluster_1;size=19714;tax=d:Bacteria(1.0000),p:"Proteobacteria"(0.9997),c:Alphaproteobacteria(0.9903),o:Rhodospirillales(0.9940),f:Acetobacteraceae(0.9887),g:Humitalea(0.9724);
        # Group 1 is the text before the first ';' (observation ID) and
        # group 2 is everything after 'tax=' (taxonomy).
        match = re.search("^([^\;]+)\;size\=\d+\;tax=(.+)$", record.id)
        if match is None:
            # Close the fasta before aborting on an unexpected ID format
            fasta.close()
            raise Exception("ID and taxonomy cannot be retrieved from '" + record.id + "'")
        record.id = match.group(1)
        record.description = match.group(2)
        # The taxonomy is stored under the tag selected with --taxonomy-tag
        biom.add_metadata( record.id, args.taxonomy_tag, record.description, "observation" )
    fasta.close()
    BiomIO.write( args.output_biom, biom )
Esempio n. 20
0
def remove_chimera_biom(samples, chimera_files, in_biom_file, out_biom_file,
                        lenient_filter, global_report, bySample_report,
                        log_file):
    """
    @summary: Removes the chimera observations from a BIOM file and updates
              the global and by sample reports.
    @param samples: [list] The samples names. One entry of bySample_report is
                    (re)initialised for each of them.
    @param chimera_files: [list] The paths to the files listing the chimera
                          observations names (one name per line, blank lines
                          are ignored).
    @param in_biom_file: [str] The path to the BIOM file to filter.
    @param out_biom_file: [str] The path to the BIOM after filter.
    @param lenient_filter: [bool] True: removes one sequence in all samples
                           only if it is detected as chimera in all samples
                           where it is present. With False removes one
                           sequence in all samples if it is detected as chimera
                           in at least one sample.
    @param global_report: [dict] Updated in place. The counters 'nb_kept',
                          'abundance_kept', 'nb_removed', 'abundance_removed',
                          'nb_ambiguous' and 'abundance_ambiguous' are
                          incremented: they must already exist.
    @param bySample_report: [dict] Updated in place with, for each sample,
                            'nb_kept', 'kept_abundance', 'nb_removed',
                            'removed_abundance' and 'removed_max_abundance'.
    @param log_file: [str] Path to the general log output file.
    @note: Every sample of the BIOM must be listed in samples, otherwise the
           update of bySample_report raises a KeyError.
    """
    FH_log = Logger(log_file)
    try:
        FH_log.write("## Removes the chimera observation from BIOM.\n")

        # Init bySample_report
        for sample_name in samples:
            bySample_report[sample_name] = {
                'nb_kept': 0,
                'kept_abundance': 0,
                'nb_removed': 0,
                'removed_abundance': 0,
                'removed_max_abundance': 0
            }

        # Retrieve chimera: number of times each observation is listed
        nb_sample_by_chimera = dict()
        for chimera_file in chimera_files:
            with open(chimera_file) as chimera_fh:
                for line in chimera_fh:
                    observation_name = line.strip()
                    if not observation_name:
                        # A blank line does not name any observation
                        continue
                    nb_sample_by_chimera[observation_name] = \
                        nb_sample_by_chimera.get(observation_name, 0) + 1

        # Remove chimera
        removed_chimera = list()
        biom = BiomIO.from_json(in_biom_file)
        for chimera_name, nb_detections in nb_sample_by_chimera.items():
            nb_sample_with_obs = sum(
                1 for sample in biom.get_samples_by_observation(chimera_name))
            observation_abundance = biom.get_observation_count(chimera_name)
            # Ambiguous: present in a number of samples different from the
            # number of times it has been reported as chimera.
            is_always_chimera = (nb_sample_with_obs == nb_detections)
            if not is_always_chimera:
                global_report['nb_ambiguous'] += 1
                global_report['abundance_ambiguous'] += observation_abundance
                FH_log.write(
                    "'" + chimera_name +
                    "' is not interpreted as chimera in all samples where it is present.\n"
                )
            if not lenient_filter or is_always_chimera:
                removed_chimera.append(chimera_name)
                # Global metrics
                global_report['nb_removed'] += 1
                global_report['abundance_removed'] += observation_abundance
                # By sample metrics
                for sample in biom.get_samples_by_observation(chimera_name):
                    sample_report = bySample_report[sample['id']]
                    sample_count = biom.get_count(chimera_name, sample['id'])
                    sample_report['nb_removed'] += 1
                    sample_report['removed_abundance'] += sample_count
                    sample_report['removed_max_abundance'] = max(
                        sample_report['removed_max_abundance'], sample_count)
        biom.remove_observations(removed_chimera)

        # Nb non-chimera
        for observation_name in biom.get_observations_names():
            global_report['nb_kept'] += 1
            global_report['abundance_kept'] += biom.get_observation_count(
                observation_name)
            # By sample metrics
            for sample in biom.get_samples_by_observation(observation_name):
                sample_report = bySample_report[sample['id']]
                sample_report['nb_kept'] += 1
                sample_report['kept_abundance'] += biom.get_count(
                    observation_name, sample['id'])
        BiomIO.write(out_biom_file, biom)
    finally:
        # The log is closed even if the filter fails.
        FH_log.close()
Esempio n. 21
0
        help='Path to the sequence file outputed by UTAX (format: fasta).')
    # Abundance file in which the taxonomy is added.
    group_input.add_argument('-b',
                             '--input-biom',
                             required=True,
                             help='Path to the abundance file (format: BIOM).')
    # Outputs
    group_output = parser.add_argument_group('Outputs')
    group_output.add_argument(
        '-o',
        '--output-biom',
        required=True,
        help='Path to the abundance file with taxonomy (format: BIOM).')
    args = parser.parse_args()

    # Process: for each FASTA record, split its ID into the observation ID and
    # the taxonomy, then store the taxonomy as observation metadata in the BIOM.
    biom = BiomIO.from_json(args.input_biom)
    fasta = FastaIO(args.input_fasta)
    for record in fasta:
        # record.id example: Cluster_1;size=19714;tax=d:Bacteria(1.0000),p:"Proteobacteria"(0.9997),c:Alphaproteobacteria(0.9903),o:Rhodospirillales(0.9940),f:Acetobacteraceae(0.9887),g:Humitalea(0.9724);
        # Group 1 is the text before the first ';', group 2 is everything after 'tax='.
        match = re.search("^([^\;]+)\;size\=\d+\;tax=(.+)$", record.id)
        if match is None:
            # Release the FASTA handle before aborting.
            fasta.close()
            raise Exception("ID and taxonomy cannot be retrieved from '" +
                            record.id + "'")
        record.id = match.group(1)
        record.description = match.group(2)
        # The metadata key is the value of --taxonomy-tag.
        biom.add_metadata(record.id, args.taxonomy_tag, record.description,
                          "observation")
    # NOTE(review): the handle is not closed if add_metadata raises; a
    # try/finally would cover that path.
    fasta.close()
    BiomIO.write(args.output_biom, biom)
Esempio n. 22
0
        # Register the sample, then read its abundance file (the path is the
        # value stored for this sample in args.samples).
        biom.add_sample(sample_name)
        fh_abund = open(args.samples[sample_name])
        for line in fh_abund:  # Content format: "# rank<TAB>seq_id<TAB>rel_abund_perc"
            if not line.startswith('#'):
                # fields[1] is used as the observation ID and fields[2] as its abundance.
                fields = line.strip().split()
                try:
                    biom.add_observation(fields[1])
                except:  # the observation already exists
                    # NOTE(review): the bare except also hides any other
                    # failure of add_observation.
                    pass
                # The abundance is stored as an integer: the float value is
                # multiplied by 1e14 then truncated by int().
                biom.change_count(
                    fields[1], sample_name,
                    int(float(fields[2]) * 100000000000000
                        ))  # NOTE: this factor depends on the precision of the grinder output
        fh_abund.close()

    # Set taxonomy metadata: each record of the affiliation FASTA gives the
    # taxonomy (cleaned from its description) of the observation named record.id.
    fh_classif = FastaIO(args.affiliation)
    for record in fh_classif:
        try:
            metadata = biom.get_observation_metadata(record.id)
            # Only observations without a value for taxonomy_key are annotated.
            # NOTE(review): dict.has_key() only exists in Python 2.
            if metadata is None or not metadata.has_key(taxonomy_key):
                taxonomy = getCleanedTaxonomy(record.description)
                biom.add_metadata(record.id, taxonomy_key, taxonomy,
                                  "observation")
        except ValueError:  # the record is not an observation of the BIOM
            # NOTE(review): a ValueError raised by getCleanedTaxonomy or
            # add_metadata is silenced as well.
            pass
    fh_classif.close()

    # Write BIOM
    BiomIO.write(args.output, biom)
def remove_chimera_biom( samples, in_biom_file, out_biom_file, lenient_filter, global_report, bySample_report ):
    """
    @summary: Removes the chimera observations from a BIOM file and updates
              the global and by sample reports.
    @param samples: [dict] The chimera observations by sample. Example for
                    sample splA: sample['splA']['chimera_path'] where the value
                    is the path to the file containing the list of the chimera
                    observations names (one name per line, blank lines are
                    ignored).
    @param in_biom_file: [str] The path to the BIOM file to filter.
    @param out_biom_file: [str] The path to the BIOM after filter.
    @param lenient_filter: [bool] True: removes one sequence in all samples
                           only if it is detected as chimera in all samples
                           where it is present. With False removes one
                           sequence in all samples if it is detected as chimera
                           in at least one sample.
    @param global_report: [dict] Updated in place. The counters 'nb_kept',
                          'abundance_kept', 'nb_removed', 'abundance_removed',
                          'nb_ambiguous' and 'abundance_ambiguous' are
                          incremented: they must already exist.
    @param bySample_report: [dict] Updated in place with, for each sample,
                            'nb_kept', 'kept_abundance', 'nb_removed',
                            'removed_abundance' and 'removed_max_abundance'.
    @note: Every sample of the BIOM must be a key of samples, otherwise the
           update of bySample_report raises a KeyError.
    """
    # Init bySample_report
    for sample_name in samples:
        bySample_report[sample_name] = {
            'nb_kept': 0,
            'kept_abundance': 0,
            'nb_removed': 0,
            'removed_abundance': 0,
            'removed_max_abundance': 0
        }

    # Retrieve chimera: number of times each observation is listed
    nb_sample_by_chimera = dict()
    for sample_name in samples:
        with open( samples[sample_name]['chimera_path'] ) as chimera_fh:
            for line in chimera_fh:
                observation_name = line.strip()
                if not observation_name:
                    # A blank line does not name any observation
                    continue
                nb_sample_by_chimera[observation_name] = nb_sample_by_chimera.get(observation_name, 0) + 1

    # Remove chimera
    removed_chimera = list()
    biom = BiomIO.from_json(in_biom_file)
    for chimera_name, nb_detections in nb_sample_by_chimera.items():
        nb_sample_with_obs = sum( 1 for sample in biom.get_samples_by_observation(chimera_name) )
        observation_abundance = biom.get_observation_count(chimera_name)
        # Ambiguous: present in a number of samples different from the number
        # of times it has been reported as chimera.
        is_always_chimera = (nb_sample_with_obs == nb_detections)
        if not is_always_chimera:
            global_report['nb_ambiguous'] += 1
            global_report['abundance_ambiguous'] += observation_abundance
            # Single parenthesised argument: same output with the Python 2
            # print statement and with the print function.
            print("'" + chimera_name + "' is not interpreted as chimera in all samples where it is present.")
        if not lenient_filter or is_always_chimera:
            removed_chimera.append(chimera_name)
            # Global metrics
            global_report['nb_removed'] += 1
            global_report['abundance_removed'] += observation_abundance
            # By sample metrics
            for sample in biom.get_samples_by_observation(chimera_name):
                sample_report = bySample_report[sample['id']]
                sample_count = biom.get_count(chimera_name, sample['id'])
                sample_report['nb_removed'] += 1
                sample_report['removed_abundance'] += sample_count
                sample_report['removed_max_abundance'] = max(sample_report['removed_max_abundance'], sample_count)
    biom.remove_observations(removed_chimera)

    # Nb non-chimera
    for observation_name in biom.get_observations_names():
        global_report['nb_kept'] += 1
        global_report['abundance_kept'] += biom.get_observation_count(observation_name)
        # By sample metrics
        for sample in biom.get_samples_by_observation(observation_name):
            sample_report = bySample_report[sample['id']]
            sample_report['nb_kept'] += 1
            sample_report['kept_abundance'] += biom.get_count(observation_name, sample['id'])
    BiomIO.write(out_biom_file, biom)
Esempio n. 24
0
    # BIOM filled below from the grinder abundance files.
    biom = Biom( generated_by="grinder", matrix_type="sparse" )

    # Set observations count
    # args.samples maps each sample name to the path of its abundance file.
    for sample_name in args.samples:
        biom.add_sample( sample_name )
        fh_abund = open( args.samples[sample_name] )
        for line in fh_abund: # Content format: "# rank<TAB>seq_id<TAB>rel_abund_perc"
            if not line.startswith('#'):
                # fields[1] is used as the observation ID and fields[2] as its abundance.
                fields = line.strip().split()
                try:
                    biom.add_observation( fields[1] )
                except: # the observation already exists
                    # NOTE(review): the bare except also hides any other
                    # failure of add_observation.
                    pass
                # The abundance is stored as an integer: the float value is
                # multiplied by 1e14 then truncated by int().
                biom.change_count( fields[1], sample_name, int(float(fields[2])*100000000000000) ) # NOTE: this factor depends on the precision of the grinder output
        fh_abund.close()

    # Set taxonomy metadata: each record of the affiliation FASTA gives the
    # taxonomy (cleaned from its description) of the observation named record.id.
    fh_classif = FastaIO( args.affiliation )
    for record in fh_classif:
        try:
            metadata = biom.get_observation_metadata( record.id )
            # Only observations without a value for taxonomy_key are annotated.
            # NOTE(review): dict.has_key() only exists in Python 2.
            if metadata is None or not metadata.has_key( taxonomy_key ):
                taxonomy = getCleanedTaxonomy(record.description)
                biom.add_metadata( record.id, taxonomy_key, taxonomy, "observation" )
        except ValueError: # the record is not an observation of the BIOM
            # NOTE(review): a ValueError raised by getCleanedTaxonomy or
            # add_metadata is silenced as well.
            pass
    fh_classif.close()

    # Write BIOM
    BiomIO.write( args.output, biom )
Esempio n. 25
0
        " --samples"
    # Append one quoted 'name:path' argument by sample, then run the command.
    # NOTE(review): names and paths are inserted in a shell command without
    # escaping (shell=True); a quote in a value breaks the command.
    for current_sample in samples:
        cmd_grinder2biom += " '" + current_sample[
            'name'] + ":" + current_sample['path'] + "'"
    subprocess.check_call(cmd_grinder2biom, shell=True)

    # Add reference id in checked BIOM
    # The value following "reference=" in each record description is stored
    # as the "grinder_source" metadata of the observation named record.id.
    biom = BiomIO.from_json(args.checked_biom)
    fasta = FastaIO(args.checked_fasta)
    for record in fasta:
        # NOTE(review): re.search() returns None when the description has no
        # "reference=" tag, in which case .group(1) raises AttributeError.
        reference = re.search("reference=([^\s]+)",
                              record.description).group(1)
        biom.add_metadata(record.id, "grinder_source", reference,
                          "observation")
    fasta.close()
    # NOTE(review): the BIOM is read from args.checked_biom but written to
    # checked_biom, a name assigned outside this fragment - confirm that this
    # is the intended target.
    BiomIO.write(checked_biom, biom)
    del (biom)

    # Compare expected to obtained
    # biomCmpTax.py (located next to this script) is run once by sample and
    # its standard output is printed after the sample name.
    for current_sample in samples:
        print current_sample['name']
        cmd_compareSample = os.path.join(os.path.dirname(os.path.abspath(__file__)), "biomCmpTax.py") \
            + " --real-biom " + os.path.abspath(real_biom) \
            + " --real-tax-key 'real_taxonomy'" \
            + " --checked-biom " + os.path.abspath(checked_biom) \
            + " --checked-tax-key '" + args.taxonomy_key + "'" \
            + (" --multi-affiliations" if args.multi_affiliations else "") \
            + (" --uniq-groups " + args.uniq_groups if args.uniq_groups is not None else "") \
            + " --sample " + current_sample['name']
        print subprocess.check_output(cmd_compareSample, shell=True)
        print ""
Esempio n. 26
0
def tsv_to_biom( input_tsv, multi_hit_dict, fields, samples_names, output_biom, output_fasta ):
    """
    @summary: Convert TSV file to Biom file.
    @param input_tsv: [str] Path to the TSV file.
    @param multi_hit_dict: [dict] Dictionnary describing equivalent multi blast hit :
    dict[observation_name]=[ {"blast_taxonomy":taxonomy, "blast_subject":subject, "blast_perc_identity": per_id, "blast_perc_query_coverage":per_cov, "blast_evalue":eval, "blast_aln_length":aln}]
    @param fields: [list] column name to include as metadata (must at least contain observation_name): observation_sum and seed_sequence will be excluded, rdp_tax_and_bootstrap will be split in two metadata
    @param samples_names: [list] list of sample names.
    @param output_biom: [str] Path to the output file (format : BIOM).
    @param output_fasta: [str] Path to the output file (format : fasta). With None the seed sequences are not written.
    @note: Only the strictly positive counts are stored in the BIOM. Blank
           lines of the TSV are ignored.
    """
    biom = Biom( matrix_type="sparse" )

    clusters_count = dict()
    clusters_metadata = dict()

    fasta_fh = None
    in_fh = open( input_tsv )
    try:
        if output_fasta is not None:
            fasta_fh = FastaIO( output_fasta, "w" )

        # parse header and store column index
        header = in_fh.readline()
        if header.startswith("#"):
            header = header[1:]
        header = header.strip()
        # metadata_index and sample_index map a column index to, respectively,
        # a metadata name and a sample name
        seed_seq_idx, metadata_index, sample_index = header_line_dict(fields, header, samples_names)
        if output_fasta is not None and seed_seq_idx == -1:
            raise Exception("\nYou want to extract seed fasta sequence but there is no seed_sequence column in your TSV file\n\n")

        # count by sample, and metadata
        for line in in_fh:
            if not line.strip():
                # A blank line does not describe any cluster
                continue

            cluster_name = ""
            seed_seq = None  # reset for each line: never reuse the sequence of the previous cluster
            count_by_sample = {}
            metadata_dict = {}
            # parse columns
            for idx, val in enumerate(line.strip().split("\t")):
                # recover metadata
                if idx in metadata_index:
                    if metadata_index[idx] == "observation_name":
                        cluster_name = val
                    else:
                        metadata_dict[metadata_index[idx]] = val
                # recover samples count
                elif idx in sample_index:
                    # The count must be converted before the comparison: a str
                    # is always greater than an int in Python 2 and the
                    # comparison raises a TypeError in Python 3.
                    count = int(val)
                    if count > 0:
                        count_by_sample[sample_index[idx]] = count
                # recover seed sequence
                elif idx == seed_seq_idx:
                    seed_seq = val

            # if fasta output file => store the seed sequence
            if fasta_fh is not None:
                if seed_seq is None:
                    raise Exception("\nThe seed sequence of " + cluster_name + " cannot be retrieved from your TSV file\n\n")
                fasta_fh.write( Sequence(cluster_name, seed_seq) )

            if "taxonomy" in metadata_dict:
                metadata_dict["taxonomy"] = metadata_dict["taxonomy"].split(";")

            # format rdp taxonomy to fit BIOM format
            if "rdp_tax_and_bootstrap" in metadata_dict:
                metadata_dict["rdp_taxonomy"] = []
                metadata_dict["rdp_bootstrap"] = []
                # the field alternates taxon and "(bootstrap)" items
                tax = metadata_dict["rdp_tax_and_bootstrap"].rstrip(";").split(";")
                for i in range(0, len(tax), 2):
                    metadata_dict["rdp_taxonomy"].append(tax[i])
                    metadata_dict["rdp_bootstrap"].append(tax[i+1].replace("(", "").replace(")", ""))
                metadata_dict.pop("rdp_tax_and_bootstrap")

            # format blast taxonomy to fit BIOM format (one consensus blast_taxonomy and possible multiples blast_affiliation detailed
            if "blast_taxonomy" in metadata_dict:
                metadata_dict["blast_taxonomy"] = metadata_dict["blast_taxonomy"].split(";")

                # check multihit blast : filter non consistent taxonomy hit with blast_taxonomy (if TSV modified), and compute consensus tax (if multihit line suppressed)
                if metadata_dict["blast_subject"] == "multi-subject" and multi_hit_dict is not None:
                    if cluster_name not in multi_hit_dict:
                        raise Exception("\n"+cluster_name+" has multi-subject tag but is not present in your multi-hit TSV file. Please, provide the original multi-hit TSV file.\n\n")
                    metadata_dict["blast_taxonomy"], metadata_dict["blast_affiliations"] = observation_blast_parts(metadata_dict, multi_hit_dict[cluster_name])
                    if metadata_dict["blast_affiliations"] == []:
                        raise Exception("\nyour multihit TSV file is no more consistent with your abundance TSV file for (at least) "+cluster_name+"\n\n")
                # no multi tag= blast affiliation is equal to blast_taxonomy
                else:
                    blast_dict = {key.replace("blast_", ""): metadata_dict[key] for key in metadata_dict if key.startswith("blast")}
                    metadata_dict["blast_affiliations"] = [blast_dict]

                # filter blast metadata which are moved to blast_affiliations
                for metadata in metadata_dict["blast_affiliations"][0]:
                    if metadata != "taxonomy":
                        metadata_dict.pop("blast_"+metadata)

            # add cluster and count to clusters_count dict
            clusters_count[cluster_name] = count_by_sample
            # add cluster and metadata to clusters_metadata dict
            clusters_metadata[cluster_name] = metadata_dict
    finally:
        # The handles are released even if the parsing fails
        if fasta_fh is not None:
            fasta_fh.close()
        in_fh.close()

    # add samples to biom
    for sample_name in samples_names:
        biom.add_sample( sample_name )

    # add to cluster to biom
    for cluster_name in clusters_count:
        biom.add_observation( cluster_name, clusters_metadata[cluster_name] )
        for sample_name in samples_names:
            # A sample without stored count has a null abundance for this cluster
            count = clusters_count[cluster_name].get(sample_name, 0)
            if count > 0:
                biom.add_count( cluster_name, sample_name, count )

    # Write
    BiomIO.write( output_biom, biom )