Code example #1
    def _post_align(self, sam_file: str) -> pd.DataFrame:
        logger.debug("Beginning post align with aligner %s", self._name)
        # Stream alignments from the SAM file and collapse each read's hits
        # to the lowest common ancestor (LCA) in the taxonomy tree.
        align_gen = yield_alignments_from_sam_inf(sam_file)
        lca_map = build_lca_map(align_gen, self.tree)
        # Tally LCA assignments per sample; the sample name is everything
        # before the last underscore-delimited field of the read name.
        samples_lca_map = defaultdict(Counter)
        for key, value in valfilter(lambda x: x is not None, lca_map).items():
            samples_lca_map['_'.join(key.split('_')[:-1])].update([value])

        df = pd.DataFrame(samples_lca_map, dtype=int)
        return df
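
For context, the per-sample aggregation step of _post_align can be exercised in isolation. A minimal sketch with a toy LCA map, assuming the read-name convention above (sample name plus one trailing underscore-delimited read index); valfilter here is toolz's, standing in for whatever the snippet imports:

from collections import defaultdict, Counter

import pandas as pd
from toolz import valfilter

# Toy LCA map: read name -> assigned taxon (None means unassigned).
lca_map = {
    'sampleA_1': 'g__Escherichia',
    'sampleA_2': 'g__Bacteroides',
    'sampleA_3': None,
    'sampleB_1': 'g__Escherichia',
    'sampleB_2': 'g__Bacteroides',
}

samples_lca_map = defaultdict(Counter)
for key, value in valfilter(lambda x: x is not None, lca_map).items():
    # Everything before the last underscore field names the sample.
    samples_lca_map['_'.join(key.split('_')[:-1])].update([value])

df = pd.DataFrame(samples_lca_map, dtype=int)
print(df)  # one column per sample, one row per taxon, integer counts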
Code example #2
def build_lca_df(sam_file: str,
                 tree: LCATaxonomy,
                 confidence_threshold: float = 1.0,
                 samples_iter: int = 50) -> pd.DataFrame:
    align_gen = yield_alignments_from_sam_inf(sam_file)
    # At full confidence use the plain LCA generator; otherwise use the
    # confidence-weighted variant.
    if confidence_threshold == 1.0:
        lca_map_gen = gen_lowest_common_ancestor(align_gen, tree)
    else:
        lca_map_gen = gen_confidence_lowest_common_ancestor(
            align_gen, tree, confidence_threshold)

    # The counts matrix starts with samples_iter columns and grows by
    # another block whenever a new sample would overflow it.
    sample_names_to_ix = dict()
    ix = 0
    mat_counts = np.zeros((tree.num_nodes, samples_iter), dtype=int)
    max_samples = samples_iter
    for rname, node_id in lca_map_gen:
        sample_name = rname.split('_')[0]
        if sample_name in sample_names_to_ix:
            c_ix = sample_names_to_ix[sample_name]
            mat_counts[node_id, c_ix] += 1
        else:
            if ix >= max_samples:
                # Grow by samples_iter columns; the original omitted
                # dtype=int here, silently promoting the counts to float.
                b = np.zeros((tree.num_nodes, max_samples + samples_iter),
                             dtype=int)
                b[:, :-samples_iter] = mat_counts
                mat_counts = b
                max_samples += samples_iter
            sample_names_to_ix[sample_name] = ix
            mat_counts[node_id, ix] += 1
            ix += 1

    # Order the column names by the index each sample was assigned.
    sample_names = [
        k for k, v in sorted(sample_names_to_ix.items(),
                             key=lambda item: item[1])
    ]

    df = pd.DataFrame(mat_counts[:, :ix], dtype=int, columns=sample_names)
    # Drop node ids whose rows are all zeros, then map ids to taxa names.
    df = df.loc[~(df == 0).all(axis=1)].copy()
    df.index = [tree.node_id_to_taxa_name[node_id] for node_id in df.index]
    df.drop("root", axis=0, errors="ignore", inplace=True)
    return df
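
The part of build_lca_df worth calling out is the column-block growth: rather than sizing one column per sample up front, the counts matrix is widened by samples_iter columns whenever a new sample would overflow it. A standalone sketch of that pattern with toy dimensions (all names here are illustrative):

import numpy as np

num_nodes = 4   # rows: taxonomy nodes (toy value)
block = 2       # grow by this many columns at a time

counts = np.zeros((num_nodes, block), dtype=int)
max_cols = block
names_to_ix = {}

def column_for(sample_name):
    """Return the column for sample_name, growing counts if needed."""
    global counts, max_cols
    if sample_name not in names_to_ix:
        ix = len(names_to_ix)
        if ix >= max_cols:
            grown = np.zeros((num_nodes, max_cols + block), dtype=int)
            grown[:, :max_cols] = counts
            counts, max_cols = grown, max_cols + block
        names_to_ix[sample_name] = ix
    return names_to_ix[sample_name]

for sample, node in [('s1', 0), ('s2', 1), ('s3', 2), ('s1', 0)]:
    col = column_for(sample)  # may grow and rebind counts, so call it first
    counts[node, col] += 1

print(counts[:, :len(names_to_ix)])  # trim unused columns, as build_lca_df does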
Code example #3
# Assumes module-level imports (os, csv, Counter, pandas as pd, toolz's
# valmap/valfilter) plus the project helpers referenced below.
def shogun_functional(input, output, bt2_indx, extract_ncbi_tid, threads,
                      depth, run_lca, annotate_lineage):
    # depth, run_lca, and annotate_lineage are used in the body but were
    # missing from the original signature; they are added here so the
    # snippet is self-consistent.
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4]
                 for filename in os.listdir(input)
                 if filename.endswith('.fna')]

    # Create a SAM file for each input FASTA file
    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    img_map = IMGMap()

    for basename in basenames:
        sam_inf = os.path.join(output, basename + '.sam')
        # The original used the placeholder ``step_outf = 'test'``; this
        # path is inferred from the skip message below.
        step_outf = os.path.join(output, basename + '.kegg.csv')
        if os.path.isfile(step_outf):
            print("Found \"%s\". Skipping the LCA phase for this file." % step_outf)
        else:
            # The original call was missing its second argument; img_map is
            # the most plausible candidate. Note the result is never written
            # out in this snippet.
            lca_map = build_img_ncbi_map(
                yield_alignments_from_sam_inf(sam_inf), img_map)

    # The original referenced an undefined ``args`` namespace from here on;
    # the local names below are substituted for it.
    sam_files = [os.path.join(output, filename)
                 for filename in os.listdir(output)
                 if filename.endswith('.sam')]

    ncbi_tree = NCBITree()
    lca = LCA(ncbi_tree, depth)

    # Per-read LCA table; the original wrote to ``args.output`` or stdout,
    # so a CSV inside the output directory is an assumption.
    with open(os.path.join(output, 'alignments.csv'), 'w') as outf:
        csv_outf = csv.writer(outf, quoting=csv.QUOTE_ALL, lineterminator='\n')
        csv_outf.writerow(['sample_id', 'sequence_id', 'ncbi_tid', 'img_id'])
        for sam_path in sam_files:
            with open(sam_path) as inf:
                lca_map = build_lca_map(yield_alignments_from_sam_inf(inf), lca, img_map)
                for key in lca_map:
                    img_ids, ncbi_tid = lca_map[key]
                    csv_outf.writerow([os.path.basename(sam_path).split('.')[0], key, ncbi_tid, ','.join(img_ids)])

    if run_lca:
        tree = NCBITree()
        ranks = list(tree.lineage_ranks.keys())
        # Validate depth before indexing; the original indexed first, so an
        # out-of-range depth raised IndexError instead of this ValueError.
        if not 0 < depth <= len(ranks):
            raise ValueError('Depth must be between 1 and %d, it was %d' % (len(ranks), depth))
        rank_name = ranks[depth - 1]

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                # Extract the NCBI taxon id from the reference name and fold
                # it into the running LCA for this query.
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid and current_ncbi_tid != ncbi_tid:
                        lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
            taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
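
The inner loop over alignments is an incremental LCA: each new hit for a query is folded pairwise into the running ancestor, so only one taxon id per query is ever held in memory. A self-contained sketch with a toy parent table standing in for NCBITree (the taxon ids and helper names are illustrative):

# Toy taxonomy: child -> parent; 1 is the root.
parents = {562: 561, 561: 543, 543: 1, 287: 286, 286: 1}

def ancestors(tid):
    chain = [tid]
    while tid in parents:
        tid = parents[tid]
        chain.append(tid)
    return chain

def lowest_common_ancestor(a, b):
    seen = set(ancestors(a))
    for tid in ancestors(b):
        if tid in seen:
            return tid
    return None

# Alignments: (query name, taxon id of the reference it hit).
hits = [('read1', 562), ('read1', 561), ('read2', 562), ('read2', 287)]

lca_map = {}
for qname, tid in hits:
    if qname in lca_map:
        current = lca_map[qname]
        if current and current != tid:
            lca_map[qname] = lowest_common_ancestor(tid, current)
    else:
        lca_map[qname] = tid

print(lca_map)  # read1 folds to 561; read2 falls back to the root, 1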
Code example #4
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta,
                          reference_map, extract_ncbi_tid, depth, threads):
    verify_make_dir(output)

    fna_files = [
        os.path.join(input, filename) for filename in os.listdir(input)
        if filename.endswith('.fna')
    ]

    for fna_file in fna_files:
        sam_outf = os.path.join(
            output,
            '.'.join(os.path.basename(fna_file).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [
        os.path.join(output, filename) for filename in os.listdir(output)
        if filename.endswith('.sam')
    ]
    lca_maps = {}
    for sam_file in sam_files:
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid:
                    if current_ncbi_tid != ncbi_tid:
                        lca_map[qname] = tree.lowest_common_ancestor(
                            ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        # Collapse each read's taxon id to a lineage string at the requested
        # depth, then invert the map (lineage -> list of read names); null
        # lineages are filtered out later, when lca_map_2 is built.
        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth),
                         lca_map)
        lca_maps['.'.join(os.path.basename(sam_file).split('.')
                          [:-1])] = reverse_collision_dict(lca_map)

    for basename in lca_maps.keys():
        lca_maps[basename] = valmap(lambda val: (basename, val),
                                    lca_maps[basename])

    lca_map_2 = defaultdict(list)
    for basename in lca_maps.keys():
        for key, val in lca_maps[basename].items():
            if key:
                lca_map_2[key].append(val)

    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    # Index the reference FASTA so per-lineage reference sequences can be
    # pulled out for embalmer below.
    references_faidx = pyfaidx.Fasta(reference_fasta)

    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(output, 'embalmer_out.txt'), 'w') as embalmer_cat:
        for key in lca_map_2.keys():

            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(queries_fna_filename, 'w') as queries_fna:
                for basename, headers in lca_map_2[key]:
                    for header in headers:
                        record = fna_faidx[basename][header][:]
                        queries_fna.write('>filename|%s|%s\n%s\n' %
                                          (basename, record.name, record.seq))

            with open(references_fna_filename, 'w') as references_fna:
                for i in dict_reference_map[key]:
                    record = references_faidx[i][:]
                    references_fna.write('>%s\n%s\n' %
                                         (record.name, record.seq))

            embalmer_align(queries_fna_filename, references_fna_filename,
                           output_filename)

            with open(output_filename) as embalmer_out:
                for line in embalmer_out:
                    embalmer_cat.write(line)

            os.remove(queries_fna_filename)
            os.remove(references_fna_filename)
            os.remove(output_filename)

    os.rmdir(tmpdir)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(os.path.join(output, 'embalmer_out.txt')) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            # np.int/np.float were removed from NumPy; plain builtins suffice.
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
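
The last step builds the query-by-taxon matrix with pandas' from_dict over a nested mapping; missing (query, taxon) pairs become NaN. A minimal sketch with toy values:

from collections import defaultdict

import pandas as pd

# query name -> {ncbi_tid: percent identity against that taxon's reference}
sparse_ncbi_dict = defaultdict(dict)
sparse_ncbi_dict['read1'][562] = 99.2
sparse_ncbi_dict['read1'][561] = 97.5
sparse_ncbi_dict['read2'][287] = 88.0

# Columns are queries, rows are taxon ids; unobserved pairs are NaN.
df = pd.DataFrame.from_dict(sparse_ncbi_dict)
print(df)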