Code example #1
def refseq_get_ftp_links_from_file(input, output):
    db = RefSeqDatabase()
    tree = NCBITree()

    # Collect the NCBI taxonomy IDs for every organism name in the input.
    ncbi_tid_set = set()
    for line in input:
        line = line.replace(' unclassified', '')
        line = line.replace('cf', '')
        for row in db.yield_ncbi_tid_row_from_name(line.strip()):
            ncbi_tid_set.add(row[0])

    # How many total strains are there in HMP?
    ncbi_tid_successors = set()
    # TODO Switch the tree around - predecessors and successors
    for ncbi_tid in ncbi_tid_set:
        for tid in tree.tree.predecessors_iter(ncbi_tid):
            if tid not in ncbi_tid_set:
                ncbi_tid_successors.add(tid)

    ncbi_tid_set = ncbi_tid_set | ncbi_tid_successors
    output.write('ncbi_tid,gg_lineage,ftp_link\n')
    for ncbi_tid in ncbi_tid_set:
        for ftp_link in db.yield_ftp_links(ncbi_tid):
            output.write('%s,%s,%s\n' %
                         (ncbi_tid, tree.gg_lineage(ncbi_tid), ftp_link))
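A minimal usage sketch (the file names here are hypothetical): the function only assumes a line-iterable input of organism names and a writable output handle.

if __name__ == '__main__':
    # Hypothetical file names; any readable/writable text handles work.
    with open('organism_names.txt') as inf, open('ftp_links.csv', 'w') as outf:
        refseq_get_ftp_links_from_file(inf, outf)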
Code example #2
File: test_ncbi_tree.py Project: knights-lab/DOJO
import unittest

class TestGGLineage(unittest.TestCase):
    ncbi_tree = NCBITree()

    def test(self):
        # 11788
        self._check_strains(11788)
        self._check_strains(391904)
        self._check_strains(-10)

    # Renamed from test_strains: a method whose name starts with "test" is
    # collected by the unittest runner and would be called without a taxid.
    def _check_strains(self, taxid):
        strain_name_1 = self.ncbi_tree.green_genes_lineage(taxid,
                                                           depth=5,
                                                           depth_force=True)
        strain_name_2 = self.ncbi_tree.green_genes_lineage(taxid,
                                                           depth_force=True)

        strain_name_3 = self.ncbi_tree.green_genes_lineage(taxid,
                                                           depth=5,
                                                           depth_force=False)
        strain_name_4 = self.ncbi_tree.green_genes_lineage(taxid,
                                                           depth_force=False)

        strain_name_5 = self.ncbi_tree.green_genes_lineage(taxid,
                                                           depth=8,
                                                           depth_force=True)
        strain_name_6 = self.ncbi_tree.green_genes_lineage(taxid,
                                                           depth=8,
                                                           depth_force=False)

        # Test the null pointer
        strain_name_7 = self.ncbi_tree.green_genes_lineage(taxid, depth=8)
Code example #3
import csv
import os
import sys

# make_arg_parser, IMGMap, NCBITree, build_lca_map and
# yield_alignments_from_sam_inf come from the DOJO package.
def main():
    parser = make_arg_parser()
    args = parser.parse_args()

    sam_files = [
        os.path.join(args.input, filename)
        for filename in os.listdir(args.input) if filename.endswith('.sam')
    ]

    img_map = IMGMap()

    ncbi_tree = NCBITree()

    # Caveat: when args.output is unset this closes sys.stdout on exit (see
    # the nullcontext sketch after this example for a safer variant).
    with open(args.output, 'w') if args.output else sys.stdout as outf:
        csv_outf = csv.writer(outf, quoting=csv.QUOTE_ALL, lineterminator='\n')
        csv_outf.writerow(['sample_id', 'sequence_id', 'ncbi_tid', 'img_id'])
        for file in sam_files:
            with open(file) as inf:
                lca_map = build_lca_map(yield_alignments_from_sam_inf(inf),
                                        ncbi_tree, img_map)
                for key in lca_map:
                    img_ids, ncbi_tid = lca_map[key]
                    # [:-4] strips the '.sam' extension from the sample id.
                    csv_outf.writerow([
                        os.path.basename(file)[:-4], key, ncbi_tid,
                        ','.join(img_ids)
                    ])
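When args.output is unset, the with-statement above closes sys.stdout on exit. One defensive alternative (my suggestion, not part of the DOJO source) wraps the fallback stream in contextlib.nullcontext, whose __exit__ leaves the stream open:

import contextlib
import sys

# nullcontext (Python 3.7+) is a no-op context manager, so exiting the
# with-block closes the file when one was opened but never closes stdout.
outf_ctx = open(args.output, 'w') if args.output else contextlib.nullcontext(sys.stdout)
with outf_ctx as outf:
    ...  # write CSV rows as above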
Code example #4
import csv

def add_green_genes_tax_to_gb_accession(input, output):
    # Load the taxonomy
    nt = NCBITree()

    # Skip the input header line
    next(input)

    output_csv = csv.writer(output, delimiter="\t")
    # Write the output header
    output_csv.writerow(["gb_accession", "taxid", "green_genes_taxonomy"])
    csv_input = csv.reader(input, delimiter="\t")
    for row in csv_input:
        # Append a placeholder third column (the input is assumed to have two
        # columns, gb_accession and taxid), then fill it with the lineage.
        out_row = row + [0]
        taxid = int(row[1])
        out_row[2] = nt.green_genes_lineage(taxid, depth=8, depth_force=True)
        output_csv.writerow(out_row)
Code example #5
import re

# Note: the nt=NCBITree() default is evaluated once, at definition time, so a
# single shared tree serves every call (see the note after example #12).
def get_tids(gbk, gbk_file, outf, nt=NCBITree()):
    dict_list = []
    with open(gbk_file, 'r') as inf:
        definition_re = re.compile(r"^(DEFINITION)\s\s(.*)$")
        taxon_re = re.compile(r"^(\s+)/db_xref=\"taxon:(\d+)\"")
        for line in inf:
            if line.startswith('DEFINITION'):
                def_match = definition_re.search(line)
                prod = str(def_match.group(2))
                prod = '_'.join(prod.split(' '))
                continue
            if line.startswith('                     /db_xref="taxon:'):
                taxon_match = taxon_re.search(line)
                ncbi_tid = int(taxon_match.group(2))
                organism = nt.green_genes_lineage(ncbi_tid,
                                                  depth=8,
                                                  depth_force=True)
                dict_list = [ncbi_tid, organism]
                outf.write('ncbi_tid|' + str(ncbi_tid) + '|mibig|' + gbk +
                           '.1_cluster001' + '|organism|' + organism + '\t' +
                           prod + '\n')
                break
        if not dict_list:
            print(gbk + ' failed to find tid')
            return ['None', 'None']
        else:
            return dict_list
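For reference, the two regexes above target GenBank flat-file lines shaped like the following (an illustrative fragment, not taken from this page):

DEFINITION  Escherichia coli str. K-12 substr. MG1655, complete genome.
                     /db_xref="taxon:511145"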
Code example #6
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth,
                  depth_force):
    verify_make_dir(output)
    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id,
                                              prefixes,
                                              db,
                                              tree,
                                              depth=depth,
                                              depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id,
                                          prefixes,
                                          db,
                                          tree,
                                          depth=depth,
                                          depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id,
                                          db,
                                          tree,
                                          depth=depth,
                                          depth_force=depth_force)

        with open(outf_fasta, 'w') as output_fna:
            with open(outf_map, 'w') as output_map:
                with open(input) as inf:
                    inf_fasta = FASTA(inf)
                    for lines_fna, lines_map in annotater_class.annotate(
                            inf_fasta.read()):
                        output_fna.write(lines_fna)
                        output_map.write(lines_map)
    else:
        print(
            "Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file."
            % (outf_fasta, outf_map))

    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
Code example #7
def main():
	parser = make_arg_parser()
	args = parser.parse_args()
	nt_cat = args.nt_cat
	gbkpath = args.input
	outpath = args.output
	if args.just_compile:
		compile_files(outpath)
		sys.exit()
	if not os.path.isdir(outpath):
		try:
			os.mkdir(outpath)
		except OSError:
			print('\nError creating output directory; check given path and try again\n')
			sys.exit()
	logfile = os.path.join(outpath, 'scrapelog.log')
	logging.basicConfig(filename=logfile, level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
	gbks = os.listdir(gbkpath)
	gbks = [f for f in gbks if f.endswith('gbk')]
	with open(nt_cat, 'r') as nt_catalog:
		gbk_dd = defaultdict(list)
		reader = csv.reader(nt_catalog, delimiter='\t')
		next(reader)
		nt = NCBITree()
		gbk_set = set()
		for gbk_file in gbks:
			gbk_id = gbk_file.split('.cluster')[0]
			gbk_set.add(gbk_id)
		for line in reader:
			if line[1] in gbk_set:
				tid = line[2]
				organism = tid_to_name(tid, nt=nt)
				# print(line[1] + tid + organism)
				gbk_dd[line[1]] = [tid, organism]
	i = 0
	for gbk_file in gbks:
		gbk_id = gbk_file.split('.cluster')[0]
		tid_org = gbk_dd[gbk_id]
		if not tid_org:
			print('Error getting taxonomy for %s for cluster file %s' % (gbk_id, gbk_file))
			logging.warning('Error getting taxonomy for %s for cluster file %s' % (gbk_id, gbk_file))
			tid_org = ['na', 'k__None;p__None;c__None;o__None;f__None;g__None;s__None;t__None']
			i += 1
		# print(tid_org)
		# ncbi_tid = str(tid_org[0])
		# organism = str(tid_org[1])
		gbk_filepath = os.path.join(gbkpath, gbk_file)
		parse_aa_seqs(gbk_file, tid_org, gbk_filepath, outpath)
		parse_dna_seqs(gbk_file, tid_org, gbk_filepath, outpath)
	parse_cluster_types(gbkpath, outpath, gbk_dd)
	if not args.no_compile:
		compile_files(outpath)
	logging.warning('DOJO could not acquire NCBI tid information for %s clusters' % i)
Code example #8
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        # Look up the rank name for the requested depth; the original tested
        # rank_name after indexing, so an out-of-range depth raised IndexError
        # before the friendlier ValueError could fire.
        try:
            rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        except IndexError:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')

            lca_map = build_lca_map(sam_file, lambda x: int(find_between(x, begin, end)), tree)

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
            taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
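find_between is defined elsewhere in the DOJO source and not shown on this page; judging from its call sites, where begin and end come from splitting extract_ncbi_tid on a comma, it extracts the substring between two markers. A hedged sketch of that behavior:

# Assumed semantics, reconstructed from the call sites above; the real DOJO
# implementation may differ.
def find_between(s, begin, end):
    start = s.index(begin) + len(begin)
    return s[start:s.index(end, start)]

# e.g. find_between('ncbi_tid|562|ref|NC_000913.3|', 'ncbi_tid|', '|') -> '562'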
Code example #9
import multiprocessing

def download_refseq_all(verbose):
    pool = multiprocessing.Pool(processes=4)
    rf = RefSeqDatabase()
    data = rf.get_blaze()
    tree = NCBITree()
    specified_kingdoms = {'k__Bacteria', 'k__Viruses', 'k__Archaea'}

    # Use the elementwise & operator: Python's `and` cannot express an
    # elementwise conjunction over array-like columns, so the original
    # expression did not filter on both conditions.
    ftp_view = data.tree[(data.tree.ftp != '')
                         & (data.tree.refseq_version != '')]
    ftp_links = yield_ftp_links(ftp_view, specified_kingdoms, tree)
    # ftp_test = [next(ftp_links) for _ in range(10)]

    pool.map(download_ftp_link, ftp_links)
    print('Done')
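The & fix above matters because Python's `and` cannot be overloaded for elementwise use; with pandas, for example, truth-testing a whole Series raises outright (an illustration, not DOJO code):

import pandas as pd

df = pd.DataFrame({'ftp': ['ftp://x', ''], 'refseq_version': ['90', '']})
mask = (df.ftp != '') & (df.refseq_version != '')   # elementwise AND
print(df[mask])
# (df.ftp != '') and (df.refseq_version != '') would raise
# ValueError: The truth value of a Series is ambiguous.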
Code example #10
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    verify_make_dir(output)
    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'ncbi':
            annotater_class = NCBIAnnotater(extract_id, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with open(outf_fasta, 'w') as output_fna:
            with open(outf_map, 'w') as output_map:
                with open(input) as inf:
                    inf_fasta = FASTA(inf)
                    for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                        output_fna.write(lines_fna)
                        output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (
            outf_fasta, outf_map))

    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')
    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
Code example #11
File: test_ncbi_tree.py Project: knights-lab/DOJO
def test(self):
    ncbi_tree = NCBITree()
    # Try LCA with a null-pointer
    lca = ncbi_tree.lowest_common_ancestor(391904, -10)
    print(lca)
Code example #12
def tid_to_name(tid, nt=NCBITree()):
	tid = int(tid)
	organism = nt.green_genes_lineage(tid, depth=8, depth_force=True)
	return organism
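Both this helper and get_tids in example #5 use nt=NCBITree() as a default argument. Because defaults are evaluated once at definition time, the tree is built a single time and shared by every call, which acts as a deliberate cache here. The conventional idiom, for contrast (a sketch that trades that caching away for explicitness):

def tid_to_name(tid, nt=None):
    # Without the shared default, a new tree is built per call unless the
    # caller passes one in explicitly.
    if nt is None:
        nt = NCBITree()
    return nt.green_genes_lineage(int(tid), depth=8, depth_force=True)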
Code example #13
def shogun_functional(input, output, bt2_indx, extract_ncbi_tid, threads):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    # Create a SAM file for each input FASTA file
    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    img_map = IMGMap()

    # NOTE: this block appears unfinished: step_outf is a placeholder string,
    # and the lca_map computed here is never used.
    for basename in basenames:
        sam_inf = os.path.join(output, basename + '.sam')
        step_outf = 'test'
        if os.path.isfile(step_outf):
            print("Found the \"%s.kegg.csv\". Skipping the LCA phase for this file." % step_outf)
        else:
            lca_map = build_img_ncbi_map(yield_alignments_from_sam_inf(sam_inf))

    # NOTE: the remainder of this function references names (args, run_lca,
    # annotate_lineage, depth) that are not parameters or locals here; it
    # looks like code pasted in from the other examples on this page.
    sam_files = [os.path.join(args.input, filename) for filename in os.listdir(args.input) if filename.endswith('.sam')]

    img_map = IMGMap()

    ncbi_tree = NCBITree()
    lca = LCA(ncbi_tree, args.depth)

    with open(args.output, 'w') if args.output else sys.stdout as outf:
        csv_outf = csv.writer(outf, quoting=csv.QUOTE_ALL, lineterminator='\n')
        csv_outf.writerow(['sample_id', 'sequence_id', 'ncbi_tid', 'img_id'])
        for file in sam_files:
            with open(file) as inf:
                lca_map = build_lca_map(yield_alignments_from_sam_inf(inf), lca, img_map)
                for key in lca_map:
                    img_ids, ncbi_tid = lca_map[key]
                    csv_outf.writerow([os.path.basename(file).split('.')[0], key, ncbi_tid, ','.join(img_ids)])

    if run_lca:
        tree = NCBITree()
        # As in example #8, validate via try/except: the original tested
        # rank_name only after indexing, so a bad depth raised IndexError.
        try:
            rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        except IndexError:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
            taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
Code example #14
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta,
                          reference_map, extract_ncbi_tid, depth, threads):
    verify_make_dir(output)

    fna_files = [
        os.path.join(input, filename) for filename in os.listdir(input)
        if filename.endswith('.fna')
    ]

    for fna_file in fna_files:
        sam_outf = os.path.join(
            output,
            '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [
        os.path.join(output, filename) for filename in os.listdir(output)
        if filename.endswith('.sam')
    ]
    lca_maps = {}
    for sam_file in sam_files:
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid:
                    if current_ncbi_tid != ncbi_tid:
                        lca_map[qname] = tree.lowest_common_ancestor(
                            ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth),
                         lca_map)
        # Reverse the map so each lineage keys the list of query names that
        # collided on it; null lineages are filtered out below via `if key:`.
        lca_maps['.'.join(os.path.basename(sam_file).split('.')
                          [:-1])] = reverse_collision_dict(lca_map)

    for basename in lca_maps.keys():
        lca_maps[basename] = valmap(lambda val: (basename, val),
                                    lca_maps[basename])

    lca_map_2 = defaultdict(list)
    for basename in lca_maps.keys():
        for key, val in lca_maps[basename].items():
            if key:
                lca_map_2[key].append(val)

    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    # Index the reference FASTA so records can be fetched by name below.
    references_faidx = pyfaidx.Fasta(reference_fasta)

    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(output, 'embalmer_out.txt'), 'w') as embalmer_cat:
        for key in lca_map_2.keys():

            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(queries_fna_filename, 'w') as queries_fna:
                for basename, headers in lca_map_2[key]:
                    for header in headers:
                        record = fna_faidx[basename][header][:]
                        queries_fna.write('>filename|%s|%s\n%s\n' %
                                          (basename, record.name, record.seq))

            with open(references_fna_filename, 'w') as references_fna:
                for i in dict_reference_map[key]:
                    record = references_faidx[i][:]
                    references_fna.write('>%s\n%s\n' %
                                         (record.name, record.seq))

            embalmer_align(queries_fna_filename, references_fna_filename,
                           output_filename)

            with open(output_filename) as embalmer_out:
                for line in embalmer_out:
                    embalmer_cat.write(line)

            os.remove(queries_fna_filename)
            os.remove(references_fna_filename)
            os.remove(output_filename)

    os.rmdir(tmpdir)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(os.path.join(output, 'embalmer_out.txt')) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            # np.int and np.float were removed from NumPy (1.24+); the
            # builtin int/float behave identically here.
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
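reverse_collision_dict is another DOJO helper not shown on this page. From its use above, where lca_map maps query names to lineage strings and the result is later iterated as lineage -> list of query names, it appears to invert a dict while grouping keys that collide on the same value. A sketch of those assumed semantics:

from collections import defaultdict

# Assumed behavior, reconstructed from the call site; the real DOJO helper
# may differ in detail.
def reverse_collision_dict(d):
    inverted = defaultdict(list)
    for key, value in d.items():
        inverted[value].append(key)
    return dict(inverted)

# e.g. {'r1': 'k__Bacteria', 'r2': 'k__Bacteria'} -> {'k__Bacteria': ['r1', 'r2']}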