def main():
    """Rewrite FASTA headers with NCBI taxonomy ID and species name.

    Reads FASTA records from ``args.input`` (``'-'`` means stdin) and writes
    them to ``args.output`` (``'-'`` means stdout). Any ``.cluster`` in a
    header is rewritten to ``_cluster``; the text before ``_cluster`` is looked
    up as a RefSeq accession.version. When the lookup returns a hit the new
    header is ``>ncbi_tid|<tid>|ref|<header>|organism|<name>|``, where
    ``<name>`` is the last ``;``-separated field of ``nt.gg_lineage(tid)``
    with ``s__`` removed. Otherwise the header is ``>ref|<header>|``.

    Note: when stdin/stdout are used they are closed on exit from the
    ``with`` blocks, exactly as a regular file would be.
    """
    parser = make_arg_parser()
    args = parser.parse_args()

    db = RefSeqDatabase()
    nt = NCBITree()
    with open(args.input, 'r') if args.input != '-' else sys.stdin as inf:
        fasta_gen = FASTA(inf)
        with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
            for header, sequence in fasta_gen.read():
                # str.replace is a no-op when '.cluster' is absent, so no
                # membership test is needed first.
                header = header.replace('.cluster', '_cluster')
                accession = header.split('_cluster')[0]
                ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(accession)
                if ncbi_tid:
                    ncbi_tid = ncbi_tid[0]
                    organism = nt.gg_lineage(ncbi_tid)
                    genus_species = organism.split(';')[-1].replace('s__', '')
                    outf.write('>ncbi_tid|%d|ref|%s|organism|%s|\n' % (ncbi_tid, header, genus_species))
                else:
                    # Lookup found nothing: keep the accession-only header.
                    outf.write('>ref|%s|\n' % header)
                outf.write(sequence + '\n')
def main():
    """Rewrite FASTA headers with NCBI taxonomy ID and full lineage.

    Reads FASTA records from ``args.input`` (``'-'`` means stdin) and writes
    them to ``args.output`` (``'-'`` means stdout). Any ``.cluster`` in a
    header is rewritten to ``_cluster``; the text before ``_cluster`` is looked
    up as a RefSeq accession.version. When the lookup returns a hit the new
    header is ``>ncbi_tid|<tid>|ref|<header>|organism|<lineage>|``, where
    ``<lineage>`` is the unmodified string returned by ``nt.gg_lineage(tid)``.
    Otherwise the header is ``>ref|<header>|``.

    Note: when stdin/stdout are used they are closed on exit from the
    ``with`` blocks, exactly as a regular file would be.
    """
    parser = make_arg_parser()
    args = parser.parse_args()

    db = RefSeqDatabase()
    nt = NCBITree()
    with open(args.input, 'r') if args.input != '-' else sys.stdin as inf:
        fasta_gen = FASTA(inf)
        with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
            for header, sequence in fasta_gen.read():
                # Normalise '.cluster' to '_cluster' (no-op when absent).
                header = header.replace('.cluster', '_cluster')
                hits = db.get_ncbi_tid_from_refseq_accession_version(header.split('_cluster')[0])
                if hits:
                    ncbi_tid = hits[0]
                    organism = nt.gg_lineage(ncbi_tid)
                    outf.write('>ncbi_tid|%d|ref|%s|organism|%s|\n' % (ncbi_tid, header, organism))
                else:
                    # Lookup found nothing: keep the accession-only header.
                    outf.write('>ref|%s|\n' % header)
                outf.write(sequence + '\n')
def main():
    """Prefix every FASTA header with its NCBI taxonomy ID and assembly name.

    Input comes from ``args.input`` and output goes to ``args.output``; a
    value of ``'-'`` selects stdin / stdout respectively. The assembly name is
    the base name of the input path up to the first ``_genomic``. The first
    whitespace-delimited token of each header is looked up as a RefSeq
    accession.version and the first returned taxonomy ID is used.
    """
    args = make_arg_parser().parse_args()

    refseq_db = RefSeqDatabase()
    source = sys.stdin if args.input == '-' else open(args.input, 'r')
    with source as inf:
        records = FASTA(inf)
        assembly_version = os.path.basename(args.input).split('_genomic')[0]
        sink = sys.stdout if args.output == '-' else open(args.output, 'w')
        with sink as outf:
            for header, sequence in records.read():
                accession = header.split()[0]
                ncbi_tid = refseq_db.get_ncbi_tid_from_refseq_accession_version(accession)[0]
                new_header = '>ncbi_tid|%d|assembly|%s|ref|%s\n' % (ncbi_tid, assembly_version, header)
                outf.write(new_header)
                outf.write(sequence+'\n')
Exemple #4
0
def refseq_to_tid(refseq_id, db=RefSeqDatabase()):
    """Return the NCBI taxonomy ID for a RefSeq accession.

    Args:
        refseq_id: RefSeq accession to look up.
        db: database handle used for the lookup. The default instance is
            created once, when this function is defined.

    Returns:
        The first taxonomy ID returned by the lookup, or ``refseq_id`` itself
        when the lookup returns nothing.
    """
    hits = db.get_ncbi_tid_from_refseq_accession(refseq_id)
    # Fall back to the accession so callers always get a usable label.
    return hits[0] if hits else refseq_id
Exemple #5
0
def list_organisms(ofus, hclus, nt_cat, typetable, outpath, cut_h):
    """Write one organism table per requested OFU and return the OFU map.

    Args:
        ofus: comma-separated OFU identifiers. Each may be a bare key or carry
            an ``ofu`` / ``ofu_`` prefix, which is stripped.
        hclus: table iterated with ``itertuples(index=True)``; each row must
            unpack to ``(bgc_label, cluster_number)``.
        nt_cat: NT catalog passed to ``genbank_id_to_tid``; ``'-'`` means none
            was supplied.
        typetable: ``False`` to skip type annotation, otherwise a table whose
            row labels are matched against the BGC label with
            ``filter(like=...)``; the first cell of the first match is used.
        outpath: directory that receives the ``ofu<key>_id<cut_h>.txt`` files.
        cut_h: string inserted into each output file name.

    Returns:
        ``defaultdict(list)`` mapping zero-padded five-digit cluster keys to
        the BGC labels in that cluster.

    Raises:
        SystemExit: a ``|genbank|`` BGC label is met while ``nt_cat`` is ``'-'``.
    """
    bgc_dd = defaultdict(list)
    for value, key in hclus.itertuples(index=True):
        bgc_dd['%05d' % key].append(value)
    ofu_list = ofus.split(',')
    i = 0
    # Preload the Database and Tree
    db = RefSeqDatabase()
    nt = NCBITree()
    for ofu in ofu_list:
        ofu = str(ofu)
        # Test the longer prefix first: 'ofu_' also starts with 'ofu', so the
        # shorter test would otherwise leave a stray '_' on the key.
        if ofu.startswith('ofu_'):
            ofu_n = ofu[len('ofu_'):]
        elif ofu.startswith('ofu'):
            ofu_n = ofu[len('ofu'):]
        else:
            ofu_n = ofu
        bgcs = bgc_dd[ofu_n]
        name_dict = defaultdict(list)
        with suppress_stdout():
            for bgc in bgcs:
                if bgc.startswith('ncbi_tid'):
                    ncbi_tid = bgc.split('|')[1]
                    if ncbi_tid == 'na':
                        name = bgc.split('|')[3]
                    else:
                        ncbi_tid = int(ncbi_tid)
                        name = nt.green_genes_lineage(ncbi_tid,
                                                      depth=8,
                                                      depth_force=True)
                elif '|genbank|' in bgc:
                    gbk_id = bgc.split('|')[3].split('_cluster')[0]
                    if nt_cat == '-':
                        sys.exit(
                            'Genbank ID BGC headers require an NT Catalog for annotation... see --help'
                        )
                    tid, organism = genbank_id_to_tid(gbk_id, nt_cat)
                    name = organism
                else:
                    refseqid = '_'.join(bgc.split('_')[:2])
                    name = refseq_to_name(refseqid, db=db, nt=nt)
                if typetable is not False:
                    ctype = typetable.filter(like=bgc, axis=0)
                    ctype = str(ctype.iloc[0, 0])
                else:
                    ctype = 'NA'
                # `name` already equals the accession when the RefSeq lookup
                # falls back, so it is always the right label; this avoids
                # reading `refseqid` on paths where it was never assigned.
                name_dict[bgc] = [ctype, name]
        ofu_file = ''.join(['ofu', ofu_n, '_id', cut_h, '.txt'])
        with open(os.path.join(outpath, ofu_file), 'w') as outf:
            outdf = pd.DataFrame.from_dict(name_dict, orient='index')
            outdf.columns = ['predicted_type', 'organism']
            outdf.to_csv(outf, sep='\t')
        i += 1
    print('\nOrganism information for %d OFUs written to file.\n' % i)
    return bgc_dd
Exemple #6
0
def refseq_to_name(refseq_id, db=RefSeqDatabase(), nt=NCBITree()):
    """Return the lineage string for a RefSeq accession.

    Args:
        refseq_id: RefSeq accession to look up.
        db: database handle used to map the accession to a taxonomy ID.
        nt: tree used to turn the taxonomy ID into a lineage string.
            Both defaults are created once, when this function is defined.

    Returns:
        ``nt.green_genes_lineage(tid, depth=8, depth_force=True)`` for the
        first taxonomy ID found, or ``refseq_id`` itself when the lookup
        returns nothing.
    """
    hits = db.get_ncbi_tid_from_refseq_accession(refseq_id)
    if not hits:
        # Lookup failed: hand the accession back unchanged.
        return refseq_id
    return nt.green_genes_lineage(hits[0], depth=8, depth_force=True)
def main():
    """Tag each FASTA record with its taxonomy ID and source assembly.

    ``args.input`` / ``args.output`` name the files to read and write, with
    ``'-'`` standing for stdin / stdout. Each output header has the form
    ``>ncbi_tid|<tid>|assembly|<assembly>|ref|<original header>``, where
    ``<assembly>`` is the input file's base name cut at ``_genomic`` and
    ``<tid>`` is the first hit for the header's first whitespace-separated
    token.
    """
    parser = make_arg_parser()
    options = parser.parse_args()

    database = RefSeqDatabase()
    header_template = '>ncbi_tid|%d|assembly|%s|ref|%s\n'
    with open(options.input, 'r') if options.input != '-' else sys.stdin as inf:
        reader = FASTA(inf)
        assembly_version = os.path.basename(options.input).split('_genomic')[0]
        with open(options.output, 'w') if options.output != '-' else sys.stdout as outf:
            for header, sequence in reader.read():
                first_token = header.split()[0]
                tid_hits = database.get_ncbi_tid_from_refseq_accession_version(first_token)
                ncbi_tid = tid_hits[0]
                outf.write(header_template % (ncbi_tid, assembly_version, header))
                outf.write(sequence + '\n')
def main():
    """Write a single FASTA-style header line for one organism.

    The taxonomy ID is taken from the first option that is not ``'-'``, in
    this order: ``args.assembly`` (assembly accession.version lookup),
    ``args.refseq`` (RefSeq accession lookup), ``args.tid`` (used directly as
    an integer). The header ``>ncbi_tid|<tid>|organism|<lineage>`` followed by
    an empty line is written to ``args.output`` (``'-'`` means stdout).

    Raises:
        SystemExit: none of the three identifier options was supplied.
    """
    parser = make_arg_parser()
    args = parser.parse_args()

    db = RefSeqDatabase()
    nt = NCBITree()
    # Resolve the taxonomy ID before touching the output so a failed lookup
    # does not leave a truncated output file behind.
    if args.assembly != '-':
        ncbi_tid = db.get_ncbi_tid_from_assembly_accession_version(args.assembly)[0]
    elif args.refseq != '-':
        ncbi_tid = db.get_ncbi_tid_from_refseq_accession(args.refseq)[0]
    elif args.tid != '-':
        ncbi_tid = int(args.tid)
    else:
        # Previously this fell through and failed on an unbound local.
        sys.exit('No identifier given: supply an assembly accession, a RefSeq accession or an NCBI tid... see --help')
    organism = nt.green_genes_lineage(ncbi_tid)
    with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
        outf.write('>ncbi_tid|%d|organism|%s\n' % (ncbi_tid, organism))
        outf.write('\n')
Exemple #9
0
def list_organism_ofus(orgs, nt_cat, hclus, height, outpath):
    """Report which OFUs contain each requested organism identifier.

    Args:
        orgs: comma-separated identifiers; each is matched as a substring of
            the BGC labels in ``hclus``.
        nt_cat: passed through to ``identify_organism``.
        hclus: table iterated with ``itertuples(index=True)``; each row must
            unpack to ``(bgc_label, cluster_number)``.
        height: value appended to the output file name.
        outpath: directory that receives the output file.

    Returns:
        ``defaultdict(list)`` mapping an organism name to the ``ofuNNNNN``
        keys it was found in. Identifiers starting with ``BGC`` are used as
        the name directly; others go through ``identify_organism``.
    """
    bgc_dd = defaultdict(list)
    for label, cluster in hclus.itertuples(index=True):
        bgc_dd['ofu' + '%05d' % cluster].append(label)

    orgs_list = orgs.split(',')
    n_requested = len(orgs_list)
    ofu_dict = defaultdict(list)
    # Preload the Database and Tree
    db = RefSeqDatabase()
    nt = NCBITree()
    for org in orgs_list:
        seen_ofus = set()  # OFUs already recorded for this identifier
        for ofu_num, members in bgc_dd.items():
            for member in members:
                if org not in member:
                    continue
                if org.startswith('BGC'):
                    name = org
                else:
                    name = identify_organism(org, nt_cat, db=db, nt=nt)
                if ofu_num in seen_ofus:
                    continue
                seen_ofus.add(ofu_num)
                ofu_dict[name].append(ofu_num)

    ofu_file = 'OFUs_from_similarity_level' + str(height) + '.txt'
    outdf = pd.DataFrame.from_dict(ofu_dict, orient='index')
    if outdf.empty:
        print(
            '\nNo OFU assignments found; check RefSeq / Genbank / MIBiG identifier and format'
        )
    else:
        with open(os.path.join(outpath, ofu_file), 'w') as outf:
            outdf.to_csv(outf, sep='\t', header=False)
        print(
            '\nOFU assigments written to file for the %d organism ID entries given.\n'
            % n_requested)
    return ofu_dict
Exemple #10
0
def assembly_to_tid(assembly):
    """Return the first NCBI taxonomy ID found for an assembly accession.version.

    A new ``RefSeqDatabase`` is created on every call. The lookup result is
    indexed with ``[0]``, so a lookup that returns nothing raises rather than
    returning a fallback.
    """
    matches = RefSeqDatabase().get_ncbi_tid_from_assembly_accession_version(assembly)
    return matches[0]
Exemple #11
0
def main():
    """Parse, optionally compile, and optionally annotate cluster types.

    Steps, all relative to the current working directory:

    1. Call ``parse_products`` on every entry whose name starts with ``GCF``.
       A note is printed first when the directory is not named
       ``antismash_results``.
    2. With ``args.compile``: pass every ``GCF*`` entry that contains
       ``cluster_types`` to ``compile_types``, writing into
       ``compiled_cluster_types/compiled_cluster_types.csv``.
    3. With ``args.annotate`` (requires ``args.compile``): replace the row
       labels of the compiled table with
       ``ncbi_tid|<tid>|ref|<accession>|organism|<lineage>`` (or the bare
       accession when the lookup falls back) and write
       ``compiled_cluster_types/annotated_cluster_types.csv``.
    """
    parser = make_arg_parser()
    args = parser.parse_args()

    # Step 1: the scan is the same in either directory; only the note differs.
    if os.getcwd().split('/')[-1] != 'antismash_results':
        print(
            '\nNOTE:\nProgram may fail if you are not in a proper antismash_results directory\n'
        )
    for rdir in os.listdir('.'):
        if rdir.startswith('GCF'):
            parse_products(rdir)

    # Step 2: compile per-genome results into one table.
    if args.compile:
        if "compiled_cluster_types" not in os.listdir('.'):
            os.mkdir("compiled_cluster_types")
        compiled_path = os.path.join("compiled_cluster_types",
                                     'compiled_cluster_types.csv')
        with open(compiled_path, 'w') as outf:
            for cdir in os.listdir('.'):
                if cdir.startswith('GCF') and "cluster_types" in os.listdir(cdir):
                    outf = compile_types(cdir, outf)
            outf.close()

    # Step 3: annotation.
    if not args.annotate:
        return
    if not args.compile:
        print(
            '\nSORRY:\nAnnotation only available with compiled option (-c)\n'
        )
        quit()
    else:
        # Preload the Database and Tree
        db = RefSeqDatabase()
        nt = NCBITree()
        strain_label = []
        compiled_table = os.path.join('compiled_cluster_types',
                                      'compiled_cluster_types.csv')
        with open(compiled_table) as intab:
            odf = pd.read_csv(intab, index_col=0, header=None)
            for refseq_id in list(odf.index):
                organism = refseq_to_name(refseq_id, db=db, nt=nt)
                ncbi_tid = str(refseq_to_tid(refseq_id, db=db))
                if ncbi_tid == organism:
                    # Both helpers fell back to the accession: lookup failed.
                    label = refseq_id
                else:
                    label = 'ncbi_tid|%s|ref|%s|organism|%s' % (
                        ncbi_tid, refseq_id, organism)
                strain_label.append(label)
            odf.index = strain_label
            odf.columns = ['cluster_type']
        annotated_table = os.path.join('compiled_cluster_types',
                                       'annotated_cluster_types.csv')
        with open(annotated_table, 'w') as an_outf:
            odf.to_csv(an_outf)
def main():
    """Build an OFU profile table, optionally clustering and annotating it.

    Command-line options read here:

    * ``args.input``: input table path.
    * ``args.clusterme``: when set, the input is passed to
      ``process_hierarchy`` together with ``args.method`` and the cut height
      ``1 - args.height / 100``; otherwise the input is read as a CSV with a
      header row and the first column as index.
    * ``args.annotate``: when set, row labels of the profile are replaced by
      organism labels. ``args.taxonomy`` / ``args.ncbitid`` select the label
      form for rows that do not start with ``ncbi_tid``.
    * ``args.output``: output CSV path, ``'-'`` for stdout.

    The temporary file ``hcsv_temp.csv`` is written to the current directory
    only when ``args.clusterme`` is set, and removed at the end.
    """
    parser = make_arg_parser()
    args = parser.parse_args()

    # Parse command line
    method = args.method
    height = 1 - (args.height / 100)
    with open(args.input, 'r') as inf:
        if args.clusterme:
            print('...performing hierarchical clustering, tree cut at height of %s...\n' % args.height)
            hclus = process_hierarchy(inf, height, method)
        else:
            hclus = pd.read_csv(inf, sep=',', header=0, index_col=0)
        # Largest value in the first column; presumably the highest OFU
        # number (depends on the height cut). `.iloc[0]` is explicitly
        # positional, unlike `[0]`, which pandas treats as a label.
        size = hclus.max(0).iloc[0]
        print('\n...Preparing OFU profile for %s OFUs...\n' % size)
        size += 1
        fill = outer(size)
        dd = defaultdict(fill)  # Initialize the dict with all zeros
        # Collapse into an OFU reference table, strains vs OFUs
        if args.clusterme:
            hclus.to_csv('hcsv_temp.csv')
            with open('hcsv_temp.csv', 'r') as inf2:
                df = cluster_ofus(inf2, dd)
        else:
            with open(args.input, 'r') as inf2:
                df = cluster_ofus(inf2, dd)

    j = 0  # rows whose label carries no NCBI tid ('na')
    k = 0  # rows whose tid resolved to an entirely empty lineage
    if args.annotate:
        # Preload the Database and Tree
        db = RefSeqDatabase()
        nt = NCBITree()
        strain_label = []
        label_fmt = 'ncbi_tid|%s|ref|%s|organism|%s'
        for refseq_id in list(df.index):
            if refseq_id.startswith('ncbi_tid'):
                ncbi_tid = refseq_id.split('|')[1]
                if ncbi_tid == 'na':
                    # No tid to resolve: label the row with fields taken from
                    # the original label. Handled first so `organism` is
                    # never read before it has been assigned.
                    genbank = '|'.join(refseq_id.split('_')[1].split('|')[2:4])
                    strain_label.append(genbank)
                    j += 1
                    continue
                ncbi_tid = int(ncbi_tid)
                organism = nt.green_genes_lineage(ncbi_tid, depth=8, depth_force=True)
                if organism == 'k__;p__;c__;o__;f__;g__;s__;t__':
                    strain_label.append('|'.join(['ncbi_tid', str(ncbi_tid)]))
                    k += 1
                else:
                    strain_label.append(organism)
                continue

            # TODO: Finish the regex for refseq id
            organism = refseq_to_name(refseq_id, db=db, nt=nt)
            ncbi_tid = str(refseq_to_tid(refseq_id, db=db))
            if ncbi_tid == organism:
                # sometimes DOJO can't look up the refseq accession; in this
                # case both helpers return the accession, so just keep it.
                strain_label.append(refseq_id)
            elif args.taxonomy:
                strain_label.append(organism)
            elif args.ncbitid:
                strain_label.append(ncbi_tid)
            elif organism.endswith(('None', 't__')):
                # Last lineage field is empty: use the one before it.
                genus_species = organism.split(';')[-2]
                strain_label.append(label_fmt % (ncbi_tid, refseq_id, genus_species))
            else:
                strain = organism.split(';')[-1]
                strain_label.append(label_fmt % (ncbi_tid, refseq_id, strain))
        df.index = strain_label
        df.sort_index(axis=0, inplace=True)
        if j > 0 or k > 0:
            print('Note: Organism information was not obtained for all clusters:\n')
            if j > 0:
                print('%s clusters had no NCBI tid...\n' % j)
            if k > 0:
                print('%s clusters did not match a full named taxonomy annotation\n' % k)

    # Do not use sys.stdout as a context manager: that would close it and
    # make the print below fail.
    if args.output == '-':
        df.to_csv(sys.stdout)
    else:
        with open(args.output, 'w') as outf:
            df.to_csv(outf)
    print('...all done, cleaning up...\n')
    # The temporary file only exists when this run did the clustering.
    if args.clusterme:
        os.remove('hcsv_temp.csv')