Example #1
def get_tree_from_fasta(in_fasta):
    """
    @warning: The root node must be present
    """
    databank_tree = None
    FH_databank = FastaIO(in_fasta)
    for record in FH_databank:
        if record.description.endswith(";"):
            record.description = record.description[:-1]
        taxonomy = record.description.split(";")
        if databank_tree is None:
            databank_tree = Node(taxonomy[0])
        parent = databank_tree
        for rank_depth, taxa in enumerate(taxonomy[1:]):
            if not parent.has_child(taxa):
                taxa_node = Node(taxa, parent)
                if (rank_depth + 1) == (len(taxonomy) - 1):  # Current node is leaf
                    taxa_node.metadata["seq_ids"] = [record.id]
            else:
                if (rank_depth + 1) == (len(taxonomy) - 1):  # Current node is leaf
                    taxa_node = parent.get_child(taxa)
                    taxa_node.metadata["seq_ids"].append(record.id)
            parent = parent.get_child(taxa)
    FH_databank.close()
    return databank_tree
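For clarity, a minimal sketch of the input this function expects: each record description is a ';'-separated taxonomy from root to leaf (the rank values below are hypothetical).

# Hedged sketch of the expected description format (hypothetical ranks).
description = "Root;Bacteria;Proteobacteria;Alphaproteobacteria;"
taxonomy = description.rstrip(";").split(";")
print(taxonomy)  # ['Root', 'Bacteria', 'Proteobacteria', 'Alphaproteobacteria']
# taxonomy[0] becomes the root node; the last rank is a leaf whose
# metadata["seq_ids"] collects the IDs of the sequences ending there.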
Example #2
def write_subset(in_path, out_path, selected):
    """Write to out_path only the records from in_path whose ID is in selected."""
    FH_in = FastaIO(in_path)
    FH_out = FastaIO(out_path, "w")
    for record in FH_in:
        if record.id in selected:
            FH_out.write(record)
    FH_in.close()
    FH_out.close()
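A minimal usage sketch (the file names are hypothetical); passing `selected` as a set keeps the `record.id in selected` test constant-time:

# Keep only two hypothetical cluster IDs.
kept_ids = {"Cluster_1", "Cluster_8"}
write_subset("all_otu.fasta", "subset_otu.fasta", kept_ids)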
Example #3
def biom_fasta_update(biom_in, fasta_in, fasta_out, log_file):
    FH_in = FastaIO(fasta_in)
    FH_out = FastaIO(fasta_out, "w")
    biom = BiomIO.from_json(biom_in)
    seq_in = 0
    seq_out = 0

    for record in FH_in:
        seq_in += 1
        try:
            biom.find_idx("observation", record.id)
        except ValueError:
            pass
        else:
            FH_out.write(record)
            seq_out += 1
    FH_in.close()
    FH_out.close()
    with open(log_file, "w") as FH_log:
        FH_log.write("Number of sequences in: " + str(seq_in) + "\n")
        FH_log.write("Number of sequences out: " + str(seq_out) + "\n")
Example #4
            if filename.endswith( "_ranks.txt" ):
                sample_name = filename.split("_ranks.txt")[0].split(args.sample_sep)[0]
                samples.append( {'name':sample_name, 'path':os.path.join(dirname, filename)} )

    # Grinder to BIOM
    cmd_grinder2biom = os.path.join(os.path.dirname(os.path.abspath(__file__)), "grinder2biom.py") + \
        " --affiliation " + os.path.abspath(args.databank) + \
        " --output " + real_biom + \
        " --samples"
    for current_sample in samples:
        cmd_grinder2biom += " '" + current_sample['name'] + ":" + current_sample['path'] + "'"
    subprocess.check_call( cmd_grinder2biom, shell=True )

    # Add reference id in checked BIOM
    biom = BiomIO.from_json( args.checked_biom )
    fasta = FastaIO( args.checked_fasta )
    for record in fasta:
        reference = re.search(r"reference=([^\s]+)", record.description).group(1)
        biom.add_metadata( record.id, "grinder_source", reference, "observation" )
    fasta.close()
    BiomIO.write( checked_biom, biom )
    del biom

    # Compare expected to obtained
    for current_sample in samples:
        print(current_sample['name'])
        cmd_compareSample = os.path.join(os.path.dirname(os.path.abspath(__file__)), "biomCmpTax.py") \
            + " --real-biom " + os.path.abspath(real_biom) \
            + " --real-tax-key 'real_taxonomy'" \
            + " --checked-biom " + os.path.abspath(checked_biom) \
            + " --checked-tax-key '" + args.taxonomy_key + "'" \
Example #5
        help='Path to the sequence file output by UTAX (format: fasta).')
    group_input.add_argument('-b',
                             '--input-biom',
                             required=True,
                             help='Path to the abundance file (format: BIOM).')
    # Outputs
    group_output = parser.add_argument_group('Outputs')
    group_output.add_argument(
        '-o',
        '--output-biom',
        required=True,
        help='Path to the abundance file with taxonomy (format: BIOM).')
    args = parser.parse_args()

    # Process
    biom = BiomIO.from_json(args.input_biom)
    fasta = FastaIO(args.input_fasta)
    for record in fasta:
        # record.id example: Cluster_1;size=19714;tax=d:Bacteria(1.0000),p:"Proteobacteria"(0.9997),c:Alphaproteobacteria(0.9903),o:Rhodospirillales(0.9940),f:Acetobacteraceae(0.9887),g:Humitalea(0.9724);
        match = re.search(r"^([^;]+);size=\d+;tax=(.+)$", record.id)
        if match is None:
            fasta.close()
            raise Exception("ID and taxonomy cannot be retrieved from '" +
                            record.id + "'")
        record.id = match.group(1)
        record.description = match.group(2)
        biom.add_metadata(record.id, args.taxonomy_tag, record.description,
                          "observation")
    fasta.close()
    BiomIO.write(args.output_biom, biom)
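A quick check of the ID/taxonomy split, using the example ID from the comment above (truncated here for readability):

import re

example_id = 'Cluster_1;size=19714;tax=d:Bacteria(1.0000),p:"Proteobacteria"(0.9997)'
match = re.search(r"^([^;]+);size=\d+;tax=(.+)$", example_id)
print(match.group(1))  # Cluster_1
print(match.group(2))  # d:Bacteria(1.0000),p:"Proteobacteria"(0.9997)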
Example #6
def process(params):

    biom_in = BiomIO.from_json(params.input_biom)
    # check if biom_in has blast_taxonomy affiliations
    if not biom_in.has_metadata("blast_affiliations"):
        raise_exception(
            Exception(
                "\n\n#ERROR : Your input biom file, " +
                os.path.basename(params.input_biom) +
                ", does not contain any blast_affiliations metadata.\n\n"))

    biom_out = Biom(generated_by='FROGS_aggregate_affiliated_otu',
                    matrix_type="sparse")

    # add samples in biom_out
    for sample_name in biom_in.get_samples_names():
        biom_out.add_sample(sample_name)

    # parse biom from the most abundant OTU to the least abundant one
    # save taxonomy
    # add OTU to biom_out if its taxonomy has a poor %id or %cov, or if its taxonomy is not already saved
    # aggregate OTU with a previous one if %id and %cov are high enough and it shares a taxonomy with that OTU

    # compute observation sum
    otu_sums = {}
    for otu_name, count_sum in biom_in.get_observations_counts():
        otu_sums[otu_name] = count_sum

    # save "confident" taxonomy
    otu_by_tax = dict()
    # save aggregated_otu_composition
    aggregated_otu = OrderedDict()
    otu_in = 0
    otu_out = 0
    otu_aggregated = 0

    # parse OTUs from the most abundant to the least abundant
    for otu_name in sorted(otu_sums,
                           key=lambda i: int(otu_sums[i]),
                           reverse=True):
        otu_in += 1
        observation = biom_in.get_observations_by_name(otu_name)

        # is this OTU poorly affiliated
        min_id = 100
        min_cov = 100
        tax = list()
        for affiliation in observation["metadata"]["blast_affiliations"]:
            if params.taxon_ignored and any(
                    t in ";".join(affiliation["taxonomy"])
                    for t in params.taxon_ignored):
                continue
            if not affiliation["taxonomy"] in tax:
                tax.append(affiliation["taxonomy"])
            percent_id = affiliation["perc_identity"]
            percent_cov = affiliation["perc_query_coverage"]
            if percent_id < min_id:
                min_id = percent_id
            if percent_cov < min_cov:
                min_cov = percent_cov

        # Add otu because of poor affiliations stat
        if min_id < params.identity or min_cov < params.coverage:
            otu_out += 1
            biom_out.add_observation(otu_name, observation["metadata"])
            for sample_name in biom_in.get_samples_names():
                count = biom_in.get_count(otu_name, sample_name)
                biom_out.add_count(otu_name, sample_name, count)
            aggregated_otu[otu_name] = list()
        # for confident taxonomy
        else:
            # check if all taxonomies are new
            is_new_tax = True
            equivalent_otu_name = ""

            for taxonomy in tax:
                if isinstance(taxonomy, list):
                    taxonomy = ";".join(taxonomy)
                if taxonomy in otu_by_tax:
                    is_new_tax = False
                    if equivalent_otu_name == "":
                        equivalent_otu_name = otu_by_tax[taxonomy]
                    elif otu_by_tax[taxonomy] != equivalent_otu_name:
                        Logger.static_write(
                            params.log_file, '\tWarning: observation ' +
                            otu_name + ' shares taxonomy (' + taxonomy +
                            ') with another OTU: ' + otu_by_tax[taxonomy] +
                            ', the first detected OTU will be kept: ' +
                            equivalent_otu_name + '\n')

            # if new tax, add OTU and save taxonomies
            if is_new_tax:
                otu_out += 1
                biom_out.add_observation(otu_name, observation["metadata"])
                for sample_name in biom_in.get_samples_names():
                    count = biom_in.get_count(otu_name, sample_name)
                    if count > 0:
                        biom_out.add_count(otu_name, sample_name, count)
                aggregated_otu[otu_name] = list()
                for taxonomy in tax:
                    if isinstance(taxonomy, list):
                        taxonomy = ";".join(taxonomy)
                    otu_by_tax[taxonomy] = otu_name
            # else aggregation of OTU
            else:
                otu_aggregated += 1
                equivalent_otu = biom_out.get_observations_by_name(
                    equivalent_otu_name)
                # add blast_affiliations
                aggregated_blast_affi = equivalent_otu["metadata"][
                    "blast_affiliations"] + observation["metadata"][
                        "blast_affiliations"]
                biom_out.add_metadata(equivalent_otu_name,
                                      "blast_affiliations",
                                      aggregated_blast_affi,
                                      subject_type="observation",
                                      erase_warning=False)
                # update consensus tax
                consensus_tax = get_tax_consensus(
                    [affi["taxonomy"] for affi in aggregated_blast_affi])
                biom_out.add_metadata(equivalent_otu_name,
                                      "blast_taxonomy",
                                      consensus_tax,
                                      subject_type="observation",
                                      erase_warning=False)
                # update counts
                for sample_name in biom_in.get_samples_names():
                    count = biom_out.get_count(
                        equivalent_otu_name, sample_name) + biom_in.get_count(
                            otu_name, sample_name)
                    biom_out.change_count(equivalent_otu_name, sample_name,
                                          count)
                # save aggregated composition
                aggregated_otu[equivalent_otu_name].append(otu_name)
                # update known taxonomies
                for taxonomy in tax:
                    if isinstance(taxonomy, list):
                        taxonomy = ";".join(taxonomy)
                    if taxonomy not in otu_by_tax:
                        otu_by_tax[taxonomy] = equivalent_otu_name

    # write biom output file
    BiomIO.write(params.output_biom, biom_out)

    # update fasta
    FH_in = FastaIO(params.input_fasta)
    FH_out = FastaIO(params.output_fasta, "wt")
    for record in FH_in:
        if record.id in aggregated_otu:
            FH_out.write(record)
    FH_in.close()
    FH_out.close()

    # write otu composition
    FH_compo = open(params.output_compo, "wt")
    for OTU in aggregated_otu:
        FH_compo.write(OTU + " " + " ".join(aggregated_otu[OTU]) + "\n")
    FH_compo.close()

    # simple log stat
    Logger.static_write(params.log_file, "# nb OTU in : " + str(otu_in) + "\n")
    Logger.static_write(params.log_file,
                        "# nb OTU out : " + str(otu_out) + "\n")
    Logger.static_write(params.log_file,
                        "# nb OTU aggregated : " + str(otu_aggregated) + "\n")
Example #7
        'The joined reads used in the UPARSE pipeline (format: fasta or fastq). The ID of the original sequence used to create each read must be given in the read description as "reference=ID".'
    )
    # Outputs
    group_output = parser.add_argument_group('Outputs')
    group_output.add_argument(
        '-a',
        '--annotated-fasta',
        required=True,
        help=
        'The sequence of the OTU seeds with reference id in description (format: fasta).'
    )
    args = parser.parse_args()

    # Get observation sequences
    observation_id_by_seq = dict()
    FH_seeds = FastaIO(args.seeds_fasta)
    for record in FH_seeds:
        if record.string in observation_id_by_seq:
            raise Exception("The OTU '" +
                            observation_id_by_seq[record.string] + "' and '" +
                            record.id + "' have the same sequence.")
        observation_id_by_seq[record.string] = record.id.split(";size=")[0]
    FH_seeds.close()

    # Get centroids of observation
    reference_by_observation_id = dict()
    for file in args.reads:
        FH_reads = SequenceFileReader.factory(file)
        for record in FH_reads:
            if record.string in observation_id_by_seq:
                observation_id = observation_id_by_seq[record.string]
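This fragment relies on exact sequence identity: seeds are keyed by their full sequence string, so any read with an identical sequence resolves to its OTU in constant time. A minimal sketch with hypothetical sequences:

observation_id_by_seq = {"ACGTACGT": "Cluster_1"}
read_seq = "ACGTACGT"
if read_seq in observation_id_by_seq:
    print(observation_id_by_seq[read_seq])  # Cluster_1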
Example #8
#
##################################################################################################################################################
if __name__ == "__main__":
    # Manage parameters
    parser = argparse.ArgumentParser( description='Add the original reference ID to each Mothur seed.' )
    parser.add_argument( '-v', '--version', action='version', version=__version__ )
    parser.add_argument( '-i', '--input', required=True, help='Sequences file from mothur get.oturep (format: fasta).' )
    parser.add_argument( '-t', '--trimmed-reads', required=True, nargs="+", help='Reads after all sequence modifications, e.g. alignment (format: fasta or fastq). Used to find the IDs of cluster centroids by exact comparison between OTU sequences and read sequences.' )
    parser.add_argument( '-r', '--reads', required=True, nargs="+", help='Simulated reads used as input in mothur pipeline (format: fasta or fastq). These reads are used to retrieve simulation reference of the centroids. The link between centroids and reads is the sequence ID. The description of reads must contain the tag "reference=<REF_ID>".' )
    parser.add_argument( '-o', '--output', required=True, help='Output file (format: fasta).' )
    args = parser.parse_args()

    # Get observation sequences
    nb_observations = 0
    observation_ids_by_seq = dict()
    FH_seeds = FastaIO(args.input)
    for record in FH_seeds:
        nb_observations += 1
        if record.string not in observation_ids_by_seq:
            observation_ids_by_seq[record.string] = list()
        observation_ids_by_seq[record.string].append(record.id)
    FH_seeds.close()

    # Get centroid IDs (the real centroid and identical sequences) by observation
    observation_ids_by_centroid_id = dict()
    for file in args.trimmed_reads:
        FH_reads = SequenceFileReader.factory(file)
        for record in FH_reads:
            record_seq = record.string.replace("-", "").replace(".", "")
            if record_seq in observation_ids_by_seq:
                observation_ids_by_centroid_id[record.id] = observation_ids_by_seq[record_seq]
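A hypothetical invocation, based on the options declared above (the script and file names are assumptions):

# python mothur_add_seed_ref.py -i otu_rep.fasta \
#     -t trimmed_reads.fasta -r simulated_reads.fastq -o annotated_seeds.fasta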