Example No. 1
def shogun_utree_lca(input, output, utree_indx, threads):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_file = os.path.join(input, basename + '.fna')
        tsv_outf = os.path.join(output, basename + '.utree.tsv')
        if not os.path.isfile(tsv_outf):
            print(utree_search(utree_indx, fna_file, tsv_outf))
        else:
            print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

    counts = []
    utree_outf = os.path.join(output, 'taxon_counts.txt')
    # Tabulate taxon counts per sample
    if not os.path.isfile(utree_outf):
        for basename in basenames:
            lcas = []
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        lcas.append(';'.join(line[1].split('; ')))
            counts.append(Counter(filter(None, lcas)))

    df = pd.DataFrame(counts, index=basenames)
    df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
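Every example in this collection starts by calling a verify_make_dir helper that is not shown. A minimal sketch of what such a helper could look like, together with a purely hypothetical call to shogun_utree_lca using made-up paths:

import os

def verify_make_dir(path):
    # Plausible stand-in: create the directory if it does not already exist.
    # The real SHOGUN helper may behave differently (e.g. raise on name clashes).
    os.makedirs(path, exist_ok=True)

# Hypothetical usage (paths are placeholders): align every .fna under ./queries
# against a UTree index and write taxon_counts.csv into ./utree_out.
# shogun_utree_lca('./queries', './utree_out', './db/img.genes.ctr', threads=4)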
Example No. 2
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth, depth_force):
    verify_make_dir(output)
    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with open(outf_fasta, 'w') as output_fna:
            with open(outf_map, 'w') as output_map:
                with open(input) as inf:
                    inf_fasta = FASTA(inf)
                    for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                        output_fna.write(lines_fna)
                        output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
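The annotater classes are used only through their annotate() generator, which the writing loop expects to yield a pair of strings per record: FASTA text and mapping text. A toy stand-in illustrating that contract; the real RefSeqAnnotater, NTAnnotater and GIAnnotater are far more involved, and the (header, sequence) record shape is an assumption about what FASTA.read() yields:

class ToyAnnotater:
    # Hypothetical minimal annotater: tags every header with one fixed lineage.
    def __init__(self, lineage='k__Bacteria'):
        self.lineage = lineage

    def annotate(self, records):
        # records is assumed to yield (header, sequence) pairs.
        for header, sequence in records:
            yield '>%s\n%s\n' % (header, sequence), '%s\t%s\n' % (header, self.lineage)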
Example No. 3
def shogun_embalmer_lca(input_dir, output_dir, embalmer_db, threads, pct_id, mincount, taxa_ncbi):
    if output_dir is None:
        output_dir = input_dir
    verify_make_dir(output_dir)

    inputfiles = [filename for filename in os.listdir(input_dir) if filename.endswith('.fna') or filename.endswith('fasta')]
    basenames = [os.path.splitext(filename)[0] for filename in inputfiles]
    outputfps = []

    for i,filename in enumerate(inputfiles):
        input_fp = os.path.join(input_dir, filename)
        tsv_outf = os.path.join(output_dir, basenames[i] + '.embalmer.tsv')
        outputfps.append(tsv_outf)
        if not os.path.isfile(tsv_outf):
            print("Did not file the output file \"%s\". Running the alignment phase for this file." % tsv_outf)
            print(embalmer_search(input_fp, tsv_outf, embalmer_db+".edb", embalmer_db+".tax", embalmer_db+".acc", threads, pct_id, taxa_ncbi))
        else:
            print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

    counts = []

    # Tabulating
    print("Tabulating and filtering hits...")

    # print a row of "-" for every 10 samples
    if len(inputfiles) >= 100:
        for i in range(floor(len(basenames)/10)):
            sys.stdout.write('-')
        sys.stdout.write('\n')
        sys.stdout.flush()

    taxon_outf = os.path.join(output_dir, 'taxon_counts.tsv')
    if os.path.isfile(taxon_outf):
        print("Skipping tabulation step, output file %s already exists." %(taxon_outf))
    else:
        for outputfp in outputfps:
            with open(outputfp) as output_file:
                tsv_parser = csv.reader(output_file, delimiter='\t')
                taxon_counts = Counter()
                for line in tsv_parser:
                    taxon = line[12]
                    # drop trailing t__ in redistribute
                    taxon = re.sub('; t__$','',taxon)
                    taxon = re.sub('; t__None$','',taxon)
                    taxon_counts[taxon] += 1
            counts.append(taxon_counts)
        df = pd.DataFrame(counts, index=basenames)
        # filter by mincount
        df[df < mincount] = 0
        # drop spaces in column
        df.columns = [colname.replace('; ',';') for colname in df.columns]
        # drop columns that sum to zero
        df = df.loc[:,(df.sum(axis=0) != 0)]
        df.T.to_csv(taxon_outf,
                index_label='Taxon',na_rep='0',sep='\t')

        get_rank_specific_taxonomy_tables(df,output_dir)
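The tabulation step trims empty strain-level fields from each taxonomy and zeroes out low-abundance assignments before writing the table. A small self-contained illustration of those two transformations with made-up lineages and counts, using one regex in place of the two substitutions above:

import re
from collections import Counter
import pandas as pd

lineages = ['k__Bacteria;p__Firmicutes; t__', 'k__Bacteria;p__Firmicutes; t__None']
cleaned = [re.sub('; t__(None)?$', '', t) for t in lineages]
# Both collapse to 'k__Bacteria;p__Firmicutes'

counts = [Counter({'k__Bacteria;p__Firmicutes': 1, 'k__Bacteria;p__Proteobacteria': 9})]
df = pd.DataFrame(counts, index=['sample1'])
df[df < 8] = 0                          # mimic the mincount filter
df = df.loc[:, df.sum(axis=0) != 0]     # drop taxa that sum to zero, as above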
Example No. 4
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth,
                  depth_force):
    verify_make_dir(output)
    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id,
                                              prefixes,
                                              db,
                                              tree,
                                              depth=depth,
                                              depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id,
                                          prefixes,
                                          db,
                                          tree,
                                          depth=depth,
                                          depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id,
                                          db,
                                          tree,
                                          depth=depth,
                                          depth_force=depth_force)

        with open(outf_fasta, 'w') as output_fna:
            with open(outf_map, 'w') as output_map:
                with open(input) as inf:
                    inf_fasta = FASTA(inf)
                    for lines_fna, lines_map in annotater_class.annotate(
                            inf_fasta.read()):
                        output_fna.write(lines_fna)
                        output_map.write(lines_map)
    else:
        print(
            "Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file."
            % (outf_fasta, outf_map))

    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
Example No. 5
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    verify_make_dir(output)
    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'ncbi':
            annotater_class = NCBIAnnotater(extract_id, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with open(outf_fasta, 'w') as output_fna:
            with open(outf_map, 'w') as output_map:
                with open(input) as inf:
                    inf_fasta = FASTA(inf)
                    for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                        output_fna.write(lines_fna)
                        output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')
    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
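A purely hypothetical invocation of shogun_utree_db, with placeholder paths and parameter values, tracing what the code above writes: an annotated FASTA and map in the output directory, then a compressed UTree database under output/utree:

# Everything below is illustrative; the paths, the extract_id pattern and prefixes
# are placeholders, not values taken from the SHOGUN documentation.
# shogun_utree_db(input='refseq_bacteria.fna', output='db_out', annotater='refseq',
#                 extract_id='<header-id-pattern>', threads=8, prefixes=None,
#                 depth=7, depth_force=True)
# Expected outputs, per the code above:
#   db_out/refseq_bacteria.annotated.fna
#   db_out/refseq_bacteria.annotated.map
#   db_out/utree/refseq_bacteria.ctr   (the intermediate .utr file is removed)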
Example No. 6
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))
    
    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth-1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
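The LCA phase leans on two helpers that are not shown: find_between, which pulls the NCBI taxon id out of a reference name using the begin/end markers supplied in extract_ncbi_tid, and yield_alignments_from_sam_inf, which streams (query, reference) pairs out of a SAM file. Minimal plausible stand-ins; the real library versions may differ, and skipping '*' references is an assumption:

import csv

def find_between(s, begin, end):
    # Return the substring of s between the first occurrence of begin and the
    # next occurrence of end (assumed behaviour).
    start = s.index(begin) + len(begin)
    return s[start:s.index(end, start)]

def yield_alignments_from_sam_inf(sam_path):
    # Yield (qname, rname) for every aligned record, skipping header lines
    # and unaligned reads (assumed behaviour).
    with open(sam_path) as inf:
        for row in csv.reader(inf, delimiter='\t'):
            if row and not row[0].startswith('@') and row[2] != '*':
                yield row[0], row[2]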
Example No. 7
def shogun_bugbase(input, output, img_database_folder):
    verify_make_dir(output)
    utree_outf = os.path.join(output, 'taxa_counts.txt')
    # Run UTree searches and map hits to IMG OIDs
    if not os.path.isfile(utree_outf):

        utree_indx = os.path.join(img_database_folder, 'img.genes.ctr')
        with open(os.path.join(img_database_folder, 'img_map.pkl'),
                  'rb') as inf:
            gg2img_oid = pickle.load(inf)

        basenames = [
            os.path.basename(filename)[:-4] for filename in os.listdir(input)
            if filename.endswith('.fna')
        ]

        for basename in basenames:
            fna_file = os.path.join(input, basename + '.fna')
            tsv_outf = os.path.join(output, basename + '.utree.tsv')
            if not os.path.isfile(tsv_outf):
                print(utree_search(utree_indx, fna_file, tsv_outf))
            else:
                print(
                    "Found the output file \"%s\". Skipping the alignment phase for this file."
                    % tsv_outf)

        counts = []

        for basename in basenames:
            lcas = []
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        taxon = line[1].replace('; ', ';')
                        if taxon in gg2img_oid:
                            lcas.append(gg2img_oid[taxon])
            counts.append(Counter(filter(None, lcas)))

        df = pd.DataFrame(counts, index=basenames).fillna(0).astype(int).T
        df.to_csv(utree_outf, sep='\t', index_label='#OTU ID')
    else:
        print("Found the output file \"%s\". Skipping all steps." % utree_outf)
Example No. 8
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    verify_make_dir(output)
    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with open(outf_fasta, 'w') as output_fna:
            with open(outf_map, 'w') as output_map:
                with open(input) as inf:
                    inf_fasta = FASTA(inf)
                    for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                        output_fna.write(lines_fna)
                        output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')
    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
Example No. 9
def shogun_bugbase(input, output, img_database_folder):
    verify_make_dir(output)
    utree_outf = os.path.join(output, 'taxa_counts.txt')
    # Run UTree searches and map hits to IMG OIDs
    if not os.path.isfile(utree_outf):

        utree_indx = os.path.join(img_database_folder, 'img.genes.ctr')
        with open(os.path.join(img_database_folder, 'img_map.pkl'), 'rb') as inf:
            gg2img_oid = pickle.load(inf)

        basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

        for basename in basenames:
            fna_file = os.path.join(input, basename + '.fna')
            tsv_outf = os.path.join(output, basename + '.utree.tsv')
            if not os.path.isfile(tsv_outf):
                print(utree_search(utree_indx, fna_file, tsv_outf))
            else:
                print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

        counts = []

        for basename in basenames:
            lcas = []
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        taxon = line[1].replace('; ', ';')
                        if taxon in gg2img_oid:
                            lcas.append(gg2img_oid[taxon])
            counts.append(Counter(filter(None, lcas)))

        df = pd.DataFrame(counts, index=basenames).fillna(0).astype(int).T
        df.to_csv(utree_outf, sep='\t', index_label='#OTU ID')
    else:
        print("Found the output file \"%s\". Skipping all steps." % utree_outf)
Example No. 10
def annotate_fasta(input, output, extract_refseq_id, prefixes, depth,
                   depth_force):
    verify_make_dir(output)

    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    with open(input, 'r') if input != '-' else sys.stdin as inf:
        with open(os.path.join(output, output_fn + '.annotated.fna'),
                  'w') as output_fna:
            with open(os.path.join(output, output_fn + '.annotated.map'),
                      'w') as output_map:
                inf_fasta = FASTA(inf)
                annotater = refseq_annotater(inf_fasta.read(),
                                             prefixes,
                                             extract_refseq_id,
                                             depth=depth,
                                             depth_force=depth_force)
                for lines_fna, lines_map in annotater:
                    output_fna.write(lines_fna)
                    output_map.write(lines_map)
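This example, like the database builders above, wraps the input handle in a FASTA reader and consumes its read() method as an iterable of records. A hypothetical minimal parser consistent with that usage; the library's actual FASTA class may well differ:

class FASTA:
    # Hypothetical minimal FASTA parser: read() yields (header, sequence)
    # tuples with the leading '>' stripped from the header.
    def __init__(self, handle):
        self.handle = handle

    def read(self):
        header, seq = None, []
        for line in self.handle:
            line = line.strip()
            if line.startswith('>'):
                if header is not None:
                    yield header, ''.join(seq)
                header, seq = line[1:], []
            elif line:
                seq.append(line)
        if header is not None:
            yield header, ''.join(seq)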
Example No. 11
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth-1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')

            lca_map = build_lca_map(sam_file, lambda x: int(find_between(x, begin, end)), tree)

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
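Example No. 11 is the same routine as Example No. 6 with the inner loop factored into build_lca_map. A plausible stand-in for that helper, inferred from the inline loop in Example No. 6 and from the call site here (its signature is an assumption, and it reuses the yield_alignments_from_sam_inf stand-in sketched after Example No. 6):

def build_lca_map(sam_file, extract_ncbi_tid, tree):
    # Collapse multiple alignments of the same query to their lowest common
    # ancestor, mirroring the inline loop of Example No. 6.
    lca_map = {}
    for qname, rname in yield_alignments_from_sam_inf(sam_file):
        ncbi_tid = extract_ncbi_tid(rname)
        if qname in lca_map:
            current_ncbi_tid = lca_map[qname]
            if current_ncbi_tid and current_ncbi_tid != ncbi_tid:
                lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
        else:
            lca_map[qname] = ncbi_tid
    return lca_map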
Example No. 12
def shogun_utree_lca(input, output, utree_indx, threads, confidence, support,
                     mincount):
    verify_make_dir(output)

    basenames = [
        os.path.basename(filename)[:-4] for filename in os.listdir(input)
        if filename.endswith('.fna')
    ]

    for basename in basenames:
        fna_file = os.path.join(input, basename + '.fna')
        tsv_outf = os.path.join(output, basename + '.utree.tsv')
        if not os.path.isfile(tsv_outf):
            print(utree_search(utree_indx, fna_file, tsv_outf))
        else:
            print(
                "Found the output file \"%s\". Skipping the alignment phase for this file."
                % tsv_outf)

    counts = []
    utree_outf = os.path.join(output, 'taxon_counts.txt')
    # Tabulating
    print("Tabulating and filtering hits...")

    # print a row of "-" for every 10 samples
    if len(basenames) >= 100:
        for i in range(floor(len(basenames) / 10)):
            sys.stdout.write('-')
        sys.stdout.write('\n')
        sys.stdout.flush()
    if not os.path.isfile(utree_outf):
        n_fail_confidence_only = 0
        n_fail_support_only = 0
        n_fail_both = 0
        n = 0
        n_pass = 0
        for i, basename in enumerate(basenames):
            if len(basenames) >= 100:
                if (i + 1) % 10 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
            lcas = []  # list of tuples [redistribute, confidence, support]
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        taxonomy = line[1]
                        is_confident = float(line[2]) >= confidence
                        is_supported = int(line[3]) >= support
                        n += 1
                        if not is_confident and not is_supported:
                            n_fail_both += 1
                        elif not is_confident:
                            n_fail_confidence_only += 1
                        elif not is_supported:
                            n_fail_support_only += 1
                        else:
                            n_pass += 1
                            lcas.append(taxonomy)
            counts.append(Counter(lcas))
        print(
            '%d total assignments\n%d failed confidence only\n%d failed support only\n%d failed both\n%d remaining'
            % (n, n_fail_confidence_only, n_fail_support_only, n_fail_both,
               n_pass))
    sys.stdout.write('\n')
    sys.stdout.flush()

    df = pd.DataFrame(counts, index=basenames)
    # filter by mincount
    df[df < mincount] = 0
    # drop spaces in column
    df.columns = [colname.replace('; ', ';') for colname in df.columns]
    # drop trailing t__ in redistribute
    df.columns = [re.sub(';t__$', '', colname) for colname in df.columns]
    df.T.to_csv(os.path.join(output, 'taxon_counts.csv'),
                index_label='Taxon',
                na_rep='0',
                sep='\t')
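Each UTree TSV row here is read as query name, taxonomy, confidence and support in columns 0 to 3. A toy filter with made-up rows and thresholds, mirroring the pass/fail bookkeeping above:

from collections import Counter

# Made-up UTree rows: [query, taxonomy, confidence, support].
rows = [
    ['read1', 'k__Bacteria;p__Firmicutes', '0.95', '12'],
    ['read2', 'k__Bacteria;p__Firmicutes', '0.40', '12'],   # fails confidence
    ['read3', 'k__Bacteria;p__Firmicutes', '0.95', '1'],    # fails support
]
confidence, support = 0.8, 5
kept = [row[1] for row in rows if float(row[2]) >= confidence and int(row[3]) >= support]
print(Counter(kept))   # Counter({'k__Bacteria;p__Firmicutes': 1})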
Example No. 13
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta, reference_map, extract_ncbi_tid, depth, threads):
    verify_make_dir(output)

    fna_files = [os.path.join(input, filename) for filename in os.listdir(input) if filename.endswith('.fna')]

    for fna_file in fna_files:
        sam_outf = os.path.join(output, '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [os.path.join(output, filename) for filename in os.listdir(output) if filename.endswith('.sam')]
    lca_maps = {}
    for sam_file in sam_files:
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid:
                    if current_ncbi_tid != ncbi_tid:
                        lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        # filter out null values
        lca_maps['.'.join(os.path.basename(sam_file).split('.')[:-1])] = reverse_collision_dict(lca_map)

    for basename in lca_maps.keys():
        lca_maps[basename] = valmap(lambda val: (basename, val), lca_maps[basename])

    lca_map_2 = defaultdict(list)
    for basename in lca_maps.keys():
        for key, val in lca_maps[basename].items():
            if key:
                lca_map_2[key].append(val)

    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    # reverse the dict to feed into embalmer
    references_faidx = pyfaidx.Fasta(reference_fasta)

    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(output, 'embalmer_out.txt'), 'w') as embalmer_cat:
        for key in lca_map_2.keys():

            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(queries_fna_filename, 'w') as queries_fna:
                for basename, headers in lca_map_2[key]:
                    for header in headers:
                        record = fna_faidx[basename][header][:]
                        queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

            with open(references_fna_filename, 'w') as references_fna:
                for i in dict_reference_map[key]:
                    record = references_faidx[i][:]
                    references_fna.write('>%s\n%s\n' % (record.name, record.seq))

            embalmer_align(queries_fna_filename, references_fna_filename, output_filename)

            with open(output_filename) as embalmer_out:
                for line in embalmer_out:
                    embalmer_cat.write(line)

            os.remove(queries_fna_filename)
            os.remove(references_fna_filename)
            os.remove(output_filename)

    os.rmdir(tmpdir)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(os.path.join(output, 'embalmer_out.txt')) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
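reverse_collision_dict is not shown, but from the way its result is consumed (every lineage key maps to the query headers assigned to it) it needs to invert a read-to-lineage dict, grouping reads that collide on the same lineage. A plausible stand-in consistent with that usage:

from collections import defaultdict

def reverse_collision_dict(d):
    # Invert {key: value} into {value: [keys]} so each lineage points at the
    # list of query names assigned to it (assumed behaviour).
    inverted = defaultdict(list)
    for key, value in d.items():
        inverted[value].append(key)
    return dict(inverted)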
Example No. 14
def shogun_functional(input, output, bt2_indx, extract_ncbi_tid, threads):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    # Create a SAM file for each input FASTA file
    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    img_map = IMGMap()

    for basename in basenames:
        sam_inf = os.path.join(output, basename + '.sam')
        step_outf = 'test'
        if os.path.isfile(step_outf):
            print("Found the \"%s.kegg.csv\". Skipping the LCA phase for this file." % step_outf)
        else:
            lca_map = build_img_ncbi_map(yield_alignments_from_sam_inf(sam_inf), )

    sam_files = [os.path.join(args.input, filename) for filename in os.listdir(args.input) if filename.endswith('.sam')]

    img_map = IMGMap()

    ncbi_tree = NCBITree()
    lca = LCA(ncbi_tree, args.depth)

    with open(args.output, 'w') if args.output else sys.stdout as outf:
        csv_outf = csv.writer(outf, quoting=csv.QUOTE_ALL, lineterminator='\n')
        csv_outf.writerow(['sample_id', 'sequence_id', 'ncbi_tid', 'img_id'])
        for file in sam_files:
            with open(file) as inf:
                lca_map = build_lca_map(yield_alignments_from_sam_inf(inf), lca, img_map)
                for key in lca_map:
                    img_ids, ncbi_tid = lca_map[key]
                    csv_outf.writerow([os.path.basename(file).split('.')[0], key, ncbi_tid, ','.join(img_ids)])

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
Example No. 15
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta,
                          reference_map, extract_ncbi_tid, depth, threads):
    verify_make_dir(output)

    fna_files = [
        os.path.join(input, filename) for filename in os.listdir(input)
        if filename.endswith('.fna')
    ]

    for fna_file in fna_files:
        sam_outf = os.path.join(
            output,
            '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [
        os.path.join(output, filename) for filename in os.listdir(output)
        if filename.endswith('.sam')
    ]
    lca_maps = {}
    for sam_file in sam_files:
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid:
                    if current_ncbi_tid != ncbi_tid:
                        lca_map[qname] = tree.lowest_common_ancestor(
                            ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth),
                         lca_map)
        # filter out null values
        lca_maps['.'.join(os.path.basename(sam_file).split('.')
                          [:-1])] = reverse_collision_dict(lca_map)

    for basename in lca_maps.keys():
        lca_maps[basename] = valmap(lambda val: (basename, val),
                                    lca_maps[basename])

    lca_map_2 = defaultdict(list)
    for basename in lca_maps.keys():
        for key, val in lca_maps[basename].items():
            if key:
                lca_map_2[key].append(val)

    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    # reverse the dict to feed into embalmer
    references_faidx = pyfaidx.Fasta(reference_fasta)

    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(output, 'embalmer_out.txt'), 'w') as embalmer_cat:
        for key in lca_map_2.keys():

            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(queries_fna_filename, 'w') as queries_fna:
                for basename, headers in lca_map_2[key]:
                    for header in headers:
                        record = fna_faidx[basename][header][:]
                        queries_fna.write('>filename|%s|%s\n%s\n' %
                                          (basename, record.name, record.seq))

            with open(references_fna_filename, 'w') as references_fna:
                for i in dict_reference_map[key]:
                    record = references_faidx[i][:]
                    references_fna.write('>%s\n%s\n' %
                                         (record.name, record.seq))

            embalmer_align(queries_fna_filename, references_fna_filename,
                           output_filename)

            with open(output_filename) as embalmer_out:
                for line in embalmer_out:
                    embalmer_cat.write(line)

            os.remove(queries_fna_filename)
            os.remove(references_fna_filename)
            os.remove(output_filename)

    os.rmdir(tmpdir)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(os.path.join(output, 'embalmer_out.txt')) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
Example No. 16
def shogun_utree_capitalist(input, output, utree_indx, reference_fasta,
                            reference_map, extract_ncbi_tid, threads):
    verify_make_dir(output)

    basenames = [
        os.path.basename(filename)[:-4] for filename in os.listdir(input)
        if filename.endswith('.fna')
    ]

    for basename in basenames:
        fna_file = os.path.join(input, basename + '.fna')
        tsv_outf = os.path.join(output, basename + '.utree.tsv')
        if not os.path.isfile(tsv_outf):
            print(utree_search(utree_indx, fna_file, tsv_outf))
        else:
            print(
                "Found the output file \"%s\". Skipping the alignment phase for this file."
                % tsv_outf)

    embalmer_outf = os.path.join(output, 'embalmer_out.txt')
    # Indexing for embalmer
    if not os.path.isfile(embalmer_outf):
        lca_maps = defaultdict(lambda: defaultdict(list))
        for basename in basenames:
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        lca_maps[';'.join(
                            line[1].split('; '))][basename].append(line[0])

        fna_faidx = {}
        for basename in basenames:
            fna_faidx[basename] = pyfaidx.Fasta(
                os.path.join(input, basename + '.fna'))

        dict_reference_map = defaultdict(list)

        with open(reference_map) as inf:
            tsv_in = csv.reader(inf, delimiter='\t')
            for line in tsv_in:
                dict_reference_map[';'.join(line[1].split('; '))].append(
                    line[0])

        # reverse the dict to feed into embalmer
        references_faidx = pyfaidx.Fasta(reference_fasta)

        tmpdir = tempfile.mkdtemp()
        print(tmpdir)
        with open(embalmer_outf, 'w') as embalmer_cat:
            for species in lca_maps.keys():

                queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
                references_fna_filename = os.path.join(tmpdir, 'reference.fna')
                output_filename = os.path.join(tmpdir, 'output.txt')

                with open(queries_fna_filename, 'w') as queries_fna:
                    for basename in lca_maps[species].keys():
                        for header in lca_maps[species][basename]:
                            record = fna_faidx[basename][header][:]
                            queries_fna.write(
                                '>filename|%s|%s\n%s\n' %
                                (basename, record.name, record.seq))

                with open(references_fna_filename, 'w') as references_fna:
                    for i in dict_reference_map[species]:
                        record = references_faidx[i][:]
                        references_fna.write('>%s\n%s\n' %
                                             (record.name, record.seq))

                print(
                    embalmer_align(queries_fna_filename,
                                   references_fna_filename, output_filename))

                with open(output_filename) as embalmer_out:
                    for line in embalmer_out:
                        embalmer_cat.write(line)

                os.remove(queries_fna_filename)
                os.remove(references_fna_filename)
                os.remove(output_filename)

        os.rmdir(tmpdir)
    else:
        print(
            "Found the output file \"%s\". Skipping the strain alignment phase for this file."
            % embalmer_outf)

    # Convert the results from embalmer into CSV
    sparse_ncbi_dict = defaultdict(dict)

    begin, end = extract_ncbi_tid.split(',')
    # build query by NCBI_TID DataFrame
    with open(embalmer_outf) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
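Both capitalist routines finish by turning a nested {query: {ncbi_tid: percent identity}} dict into a query-by-taxon table. A tiny illustration with made-up alignment rows:

from collections import defaultdict
import pandas as pd

sparse_ncbi_dict = defaultdict(dict)
for qname, ncbi_tid, pct in [('read1', 562, 99.1), ('read1', 573, 97.4), ('read2', 562, 100.0)]:
    sparse_ncbi_dict[qname][ncbi_tid] = pct

df = pd.DataFrame.from_dict(sparse_ncbi_dict)
# Columns are query names, rows are NCBI taxon ids, NaN where a pair never aligned.
print(df)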
Example No. 17
def shogun_utree_capitalist(input, output, utree_indx, reference_fasta, reference_map, extract_ncbi_tid, threads):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_file = os.path.join(input, basename + '.fna')
        tsv_outf = os.path.join(output, basename + '.utree.tsv')
        if not os.path.isfile(tsv_outf):
            print(utree_search(utree_indx, fna_file, tsv_outf))
        else:
            print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

    embalmer_outf = os.path.join(output, 'embalmer_out.txt')
    # Indexing for embalmer
    if not os.path.isfile(embalmer_outf):
        lca_maps = defaultdict(lambda: defaultdict(list))
        for basename in basenames:
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        lca_maps[';'.join(line[1].split('; '))][basename].append(line[0])


        fna_faidx = {}
        for basename in basenames:
            fna_faidx[basename] = pyfaidx.Fasta(os.path.join(input, basename + '.fna'))

        dict_reference_map = defaultdict(list)

        with open(reference_map) as inf:
            tsv_in = csv.reader(inf, delimiter='\t')
            for line in tsv_in:
                dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

        # reverse the dict to feed into embalmer
        references_faidx = pyfaidx.Fasta(reference_fasta)

        tmpdir = tempfile.mkdtemp()
        print(tmpdir)
        with open(embalmer_outf, 'w') as embalmer_cat:
            for species in lca_maps.keys():

                queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
                references_fna_filename = os.path.join(tmpdir, 'reference.fna')
                output_filename = os.path.join(tmpdir, 'output.txt')

                with open(queries_fna_filename, 'w') as queries_fna:
                    for basename in lca_maps[species].keys():
                        for header in lca_maps[species][basename]:
                            record = fna_faidx[basename][header][:]
                            queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

                with open(references_fna_filename, 'w') as references_fna:
                    for i in dict_reference_map[species]:
                        record = references_faidx[i][:]
                        references_fna.write('>%s\n%s\n' % (record.name, record.seq))

                print(embalmer_align(queries_fna_filename, references_fna_filename, output_filename))

                with open(output_filename) as embalmer_out:
                    for line in embalmer_out:
                        embalmer_cat.write(line)

                os.remove(queries_fna_filename)
                os.remove(references_fna_filename)
                os.remove(output_filename)

        os.rmdir(tmpdir)
    else:
        print("Found the output file \"%s\". Skipping the strain alignment phase for this file." % embalmer_outf)


    # Convert the results from embalmer into CSV
    sparse_ncbi_dict = defaultdict(dict)

    begin, end = extract_ncbi_tid.split(',')
    # build query by NCBI_TID DataFrame
    with open(embalmer_outf) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))