Example #1
0
def scan_markers(genes, marker_genes_map_file):
    """Return (gene_id, marker_id) pairs from the marker map for genes of interest.

    Only rows of `marker_genes_map_file` whose gene_id is contained in
    `genes` are kept; the pairs are returned in file order.
    """
    map_schema = {
        "species_id": str,
        "genome_id": str,
        "gene_id": str,
        "gene_len": int,
        "marker_id": str,
    }
    wanted_columns = ["gene_id", "marker_id"]
    with InputStream(marker_genes_map_file) as map_stream:
        matched = [
            (gene_id, marker_id)
            for gene_id, marker_id in select_from_tsv(map_stream, wanted_columns, map_schema)
            if gene_id in genes
        ]
    return matched
Example #2
0
def midas_run_species(args):
    """Estimate species abundance for one sample and write it under args.outdir.

    Steps, in order: reset the temp directory, download the phyeco marker
    database files, align reads against the markers, keep the best hits per
    read, assign reads to species, normalize the counts by total marker gene
    length, and write the resulting abundance table.
    """
    tempdir = f"{args.outdir}/species/temp/"

    # Start from an empty scratch directory.
    for shell_cmd in (f"rm -rf {tempdir}", f"mkdir -p {tempdir}"):
        command(shell_cmd)

    # The marker FASTA comes first, followed by its index files; the map file is last.
    index_extensions = ["", ".bwt", ".header", ".sa", ".sequence"]
    marker_db_urls = [
        f"s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.fa{ext}.lz4"
        for ext in index_extensions
    ]
    marker_db_urls.append("s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.map.lz4")
    markers_db_files = multithreading_map(download_reference, marker_db_urls)

    species_info = UHGG().species

    marker_info = read_marker_info_repgenomes(markers_db_files[-1])

    with TimedSection("aligning reads to marker-genes database"):
        m8_file = map_reads_hsblast(
            tempdir,
            args.r1,
            args.r2,
            args.word_size,
            markers_db_files[0],
            args.max_reads,
        )

    cutoff_columns = {"marker_id": str, "marker_cutoff": float}
    with InputStream(params.inputs.marker_genes_hmm_cutoffs) as cutoff_stream:
        marker_cutoffs = dict(select_from_tsv(cutoff_stream, selected_columns=cutoff_columns))

    with TimedSection("classifying reads"):
        best_hits = find_best_hits(args, marker_info, m8_file, marker_cutoffs)
        unique_alns = assign_unique(best_hits, species_info, marker_info)
        species_alns = assign_non_unique(best_hits, unique_alns, marker_info)

    with TimedSection("estimating species abundance"):
        marker_length_total = sum_marker_gene_lengths(marker_info)
        species_abundance = normalize_counts(species_alns, marker_length_total)

    write_abundance(args.outdir, species_abundance)
Example #3
0
def init(args):
    """
    Build the genomes table of contents at outputs.genomes.

    Input spec: https://github.com/czbiohub/iggtools/wiki#inputs
    Output spec: https://github.com/czbiohub/iggtools/wiki#target-layout-in-s3

    If the destination already exists, nothing is done unless args.force is set.
    """
    already_built = find_files(outputs.genomes)
    if already_built and not args.force:
        tsprint(
            f"Destination {outputs.genomes} already exists.  Specify --force to overwrite."
        )
        return
    if already_built:
        tsprint(f"Rebuilding {outputs.genomes}.")
    else:
        tsprint(f"Building {outputs.genomes}.")

    # Map each original species id to its alternative species id.
    with InputStream(inputs.alt_species_ids) as ids:
        id_remap = {
            old_id: new_id
            for new_id, old_id in select_from_tsv(
                ids, selected_columns=["alt_species_id", "species_id"])
        }

    header = ["genome", "species", "representative", "genome_is_representative"]
    seen_genomes = set()
    seen_species = set()
    with OutputStream(outputs.genomes) as out:
        out.write("\t".join(header) + "\n")

        with InputStream(inputs.genomes2species) as g2s:
            mapping_rows = select_from_tsv(
                g2s, selected_columns=["MAG_code", "Species_id"])
            for genome, representative in mapping_rows:
                species = id_remap[representative]
                # "1" when the genome is its own species representative, else "0".
                is_rep_flag = str(int(genome == representative))
                fields = (genome, species, representative, is_rep_flag)
                out.write("\t".join(fields) + "\n")
                seen_genomes.add(genome)
                seen_species.add(species)

    tsprint(
        f"Emitted {len(seen_genomes)} genomes and {len(seen_species)} species to {outputs.genomes}."
    )
Example #4
0
def parse_species_profile(outdir):
    """Return a dict mapping species_id to coverage.

    The values are read from `<outdir>/species/species_profile.txt`.
    """
    profile_path = f"{outdir}/species/species_profile.txt"
    wanted_columns = {"species_id": str, "coverage": float}
    with InputStream(profile_path) as profile:
        coverage_by_species = dict(select_from_tsv(profile, wanted_columns))
    return coverage_by_species
Example #5
0
def parse_uclust(uclust_file, select_columns):
    """Yield the requested columns of each row in a uclust TSV file.

    uclust output carries no header line, so the full column layout is
    hardcoded here and handed to the TSV reader as the schema.
    """
    uclust_schema = [
        'type',
        'cluster_id',
        'size',
        'pid',
        'strand',
        'skip1',
        'skip2',
        'skip3',
        'gene_id',
        'centroid_id',
    ]
    with InputStream(uclust_file) as uclust_stream:
        yield from select_from_tsv(uclust_stream, select_columns, uclust_schema)
Example #6
0
def _UHGG_load(toc_tsv, deep_sort=False):
    """Load the genomes table of contents into three lookup tables.

    Returns a tuple (species, representatives, genomes) where
      * species maps species_id -> {genome_id -> table-of-contents row},
      * representatives maps species_id -> representative genome id,
      * genomes maps genome_id -> species_id.

    With deep_sort=True, both the outer species mapping and each inner
    per-species mapping are passed through sorted_dict.
    """
    toc_columns = ["genome", "species", "representative", "genome_is_representative"]
    species = defaultdict(dict)
    representatives = {}
    genomes = {}
    with InputStream(toc_tsv) as toc_stream:
        for toc_row in select_from_tsv(toc_stream, selected_columns=toc_columns):
            genome_id, species_id, representative_id, _ = toc_row
            genomes[genome_id] = species_id
            representatives[species_id] = representative_id
            species[species_id][genome_id] = toc_row
    if not deep_sort:
        return species, representatives, genomes
    # Replace values in place; the key set is untouched, so iterating is safe.
    for species_id, genome_rows in species.items():
        species[species_id] = sorted_dict(genome_rows)
    return sorted_dict(species), representatives, genomes
Example #7
0
def find_best_hits(args, marker_info, m8_file, marker_cutoffs):
    """Find the top scoring alignment(s) for each read.

    Args:
        args: options object; reads `aln_mapid` (identity cutoff applied to
            every alignment, or None to use per-marker cutoffs) and `aln_cov`
            (minimum value of query_coverage(aln)).
        marker_info: mapping from alignment target to a record that has a
            'marker_id' entry; consulted only when args.aln_mapid is None.
        m8_file: path of the alignment file, parsed with BLAST_M8_SCHEMA.
        marker_cutoffs: mapping marker_id -> cutoff; consulted only when
            args.aln_mapid is None.

    Returns:
        A list with one entry per read that has at least one passing
        alignment. Each entry is the list of that read's alignments (dicts)
        sharing the highest score seen.

    Raises:
        KeyError: if a per-marker cutoff is needed but the alignment target
            or its marker_id is missing from the lookup tables.
    """
    best_hits = {}
    total_alns = 0
    # Loop-invariant: a single user-supplied cutoff, or None for per-marker cutoffs.
    global_cutoff = args.aln_mapid
    with InputStream(m8_file) as m8_stream:
        for aln in select_from_tsv(m8_stream, schema=BLAST_M8_SCHEMA, result_structure=dict):
            total_alns += 1
            cutoff = global_cutoff
            if cutoff is None:
                # Fall back to the cutoff of the marker gene family of the target.
                marker_id = marker_info[aln['target']]['marker_id']
                cutoff = marker_cutoffs[marker_id]
            if aln['pid'] < cutoff:  # does not meet marker cutoff
                continue
            if query_coverage(aln) < args.aln_cov:  # filter local alignments
                continue
            query = aln['query']
            current = best_hits.get(query)
            if current is None:  # first passing alignment for this read
                best_hits[query] = [aln]
            elif current[0]['score'] == aln['score']:  # tie: keep both
                current.append(aln)
            elif current[0]['score'] < aln['score']:  # strictly better: replace
                best_hits[query] = [aln]
    tsprint(f"  total alignments: {total_alns}")
    return list(best_hits.values())
Example #8
0
def read_marker_info_repgenomes(map_file):
    """Return a dict mapping gene_id to its full row (as a dict) from the marker map file.

    If a gene_id occurs more than once, the last row read wins.
    """
    map_schema = ["species_id", "genome_id", "gene_id", "gene_length", "marker_id"]
    marker_info = {}
    with InputStream(map_file) as map_stream:
        for record in select_from_tsv(map_stream, schema=map_schema, result_structure=dict):
            marker_info[record['gene_id']] = record
    return marker_info