Exemple #1
0
def analyze():
    """Run the OTU analysis on an alignment database and export the results.

    Command-line arguments come from ``get_aln_option_parser()``; the
    function reads ``db_file`` (an SQLite database) and ``output_dir``.

    Files written into ``output_dir``:
      * ``no_alignment_reads.fa``, ``basic_alignment_stats.txt`` and
        ``phylum_composition.csv`` -- their paths are handed to
        ``analysis.perform_OTU_analysis_on_db``;
      * ``no_OTU_assignment_reads.fa`` -- reads whose assigned taxon is -1;
      * ``OTU_assignment.csv`` / ``OTU_assignment.json`` -- the read-to-taxon
        mapping exported by ``export_OTU_to_csv`` / ``export_OTU_to_json``.

    The database connection is closed even when one of the steps raises.
    """
    parser = get_aln_option_parser()
    namespace = parser.parse_args()

    def out_path(file_name):
        # os.path.join copes with a trailing separator or an empty
        # output_dir, unlike joining the parts with os.path.sep by hand.
        return os.path.join(namespace.output_dir, file_name)

    empty_reads_file = out_path('no_alignment_reads.fa')
    basic_stats_file = out_path('basic_alignment_stats.txt')
    phylum_composition_file = out_path('phylum_composition.csv')
    no_otu_reads_file = out_path('no_OTU_assignment_reads.fa')
    otu_csv = out_path('OTU_assignment.csv')
    otu_json = out_path('OTU_assignment.json')

    db = sqlite3.connect(namespace.db_file)
    try:
        cursor = db.cursor()

        with timeit('LOADING TAX TREE'):
            tt = TaxTree()
        # The second return value (low-count taxa) is not used here.
        read2tax, _low_count_taxa = analysis.perform_OTU_analysis_on_db(
            db, cursor, tt, empty_reads_file, basic_stats_file,
            phylum_composition_file)

        # A real list (not a lazy filter object) so the helper can iterate
        # it more than once regardless of the Python version.
        no_otu_reads = [read for read, tax in read2tax.items() if tax == -1]
        save_reads_to_fasta(db, cursor, no_otu_reads, no_otu_reads_file)

        export_OTU_to_csv(read2tax, tt, otu_csv)
        export_OTU_to_json(read2tax, tt, otu_json)
    finally:
        db.close()
Exemple #2
0
def metasim_analysis():
    """Compare a metagenomix result with a metasim dataset profile.

    Arguments are taken from ``get_metasim_eval_parser()``. The function
    profiles ``metasim_fasta`` into a JSON file, builds a read-level and a
    transcript-level comparison JSON against ``metagenomix_json`` and renders
    each comparison into an HTML page. All outputs go to ``output_dir``.
    """
    args = get_metasim_eval_parser().parse_args()
    out_dir, sep = args.output_dir, os.path.sep

    tax_tree = TaxTree()

    # Profile of the metasim dataset.
    species_json = '{0}{1}metasim-species-reads.json'.format(out_dir, sep)
    synth_dataset_profile(args.metasim_fasta, tax_tree, species_json)

    # Read-level comparison.
    reads_json = '{0}{1}metagenomix-metasim-reads-comparison.json'.format(
        out_dir, sep)
    out.read_comparison_json(
        args.metagenomix_json, species_json, reads_json, tax_tree)

    # Transcript-level comparison.
    transcripts_json = (
        '{0}{1}metagenomix-metasim-transcript-comparison.json'.format(
            out_dir, sep))
    out.transcript_comparison_json(
        args.metagenomix_json, species_json, transcripts_json, tax_tree)

    # NOTE(review): absolute path into one developer's home directory; this
    # only works on a machine with the same layout -- consider resolving it
    # relative to the package.
    template = '/home/abulovic/BINNER/tools/metagenomix/metagenomix/visualization/metasim-metagenomix-template.html'

    reads_html = '{0}{1}metagenomix-metasim-reads-comparison.html'.format(
        out_dir, sep)
    out.default_d3_plot(reads_json, template, reads_html, 2)

    transcripts_html = (
        '{0}{1}metagenomix-metasim-transcript-comparison.html'.format(
            out_dir, sep))
    out.default_d3_plot(transcripts_json, template, transcripts_html, 2)
Exemple #3
0
def lca():
	"""Assign reads to taxa with ``otu.lca_read_assign`` and report the result.

	Arguments come from ``get_OTU_assign_option_parser()``; the function reads
	``input_file``, ``db_type`` and ``output_dir`` (created when missing).

	Steps, each timed through the ``Report`` object:
	  1. parse the alignment file (blast tabular, SAM or BAM);
	  2. look up a tax id for the gi of every target sequence, using -1 when
	     none is found;
	  3. load the taxonomy tree;
	  4. run the read assignment and write the tax2reads JSON and the
	     species rank distribution.

	Raises:
		ValueError: if the alignment file type is not blast, sam or bam.
	"""
	parser = get_OTU_assign_option_parser()
	args = parser.parse_args()
	if not os.path.exists(args.output_dir):
		os.makedirs(args.output_dir)

	with Report(args.output_dir, 'host-microbe') as report:
		file_type = utils.get_file_type(args.input_file)

		# The timing label must describe the file that is actually parsed
		# (args.input_file); the rest of this function never reads any
		# other alignment attribute from the parsed arguments.
		parsing_label = 'Parsing %s <%s>' % (
			args.input_file, utils.get_appropriate_file_size(args.input_file))
		with report.timeit(parsing_label):
			if file_type == 'blast':
				read_alns, target_seqs = blast.parse_tab_delimited(args.input_file, args.db_type)
			elif file_type in ('sam', 'bam'):
				binary = file_type == 'bam'
				read_alns, target_seqs = sam.parse_cds_sam(args.input_file, args.db_type, binary)
			else:
				raise ValueError('%s alignment format not supported!' % file_type)

		data_access = DataAccess()
		with report.timeit('GI2TAX database querying'):
			gis = set(target.gi for target in target_seqs.values())
			missing_gis = set()
			tax_ids = data_access.get_taxids(gis)
			for target in target_seqs.values():
				# -1 marks a target whose gi has no known tax id.
				target.tax_id = tax_ids.get(target.gi, -1)
				if target.tax_id == -1:
					missing_gis.add(target.gi)
			report.mark('gi count : %d' % len(gis))
			report.mark('tax count: %d' % len(tax_ids))
			report.mark('Number of gis for which no TAXID could be found: %d' % len(missing_gis))

		with report.timeit('Loading tax tree'):
			tt = TaxTree()
			report.mark('Loaded %d nodes.' % len(tt.nodes))

		with report.timeit('lca read assignment'):
			tax2reads = otu.lca_read_assign(target_seqs, read_alns, tt, max_alns=50)

		with report.timeit('tax stats'):
			species2reads = utils.get_rank_read_distribution(tax2reads, tt, 'species')
			report.tax2reads(tax2reads, tt, 'lca', 'json')
			report.rank_distribution(species2reads, tt, 'lca')
Exemple #4
0
def parse():
    """Parse a CDS alignment file and print/plot per-species coverage.

    Arguments come from ``get_parse_option_parser()``; the function reads
    ``type``, ``input_file`` and ``output_dir``.

    For every species with a positive id that has at least one transcript
    with ``total_coverage > 0.9`` it prints the organism name, the total
    number of transcripts of that species and the
    ``(total_coverage, coverage_fold)`` pair of each well-covered
    transcript, followed by an empty line. Finally ``plot.png`` and
    ``plot2.png`` are written into ``output_dir``.

    Raises:
        NotImplementedError: if ``type`` is not ``'cds'``.
        ValueError: if the alignment file type is not blast, sam or bam.
    """
    parser = get_parse_option_parser()
    namespace = parser.parse_args()
    if namespace.type != 'cds':
        raise NotImplementedError('Genome parsing will be up in a jiffy.')

    file_type = get_file_type(namespace.input_file)
    with timeit('Parsing alignment file'):
        if file_type == 'blast':
            read_alns, transcripts = parse_cds_megablast(
                namespace.input_file, namespace.output_dir)
        elif file_type in ('sam', 'bam'):
            read_alns, transcripts = parse_cds_sam(
                namespace.input_file, namespace.output_dir,
                binary=(file_type == 'bam'))
        else:
            # Without this branch an unknown format surfaced later as an
            # unbound-variable error on `transcripts`.
            raise ValueError(
                '%s alignment format not supported!' % file_type)

    with timeit('Loading tax tree'):
        tax_tree = TaxTree()
    spec2trans = get_species_transcript_distribution(transcripts, tax_tree)
    for spec, trans in spec2trans.items():
        if spec <= 0:
            continue
        well_covered = [t for t in trans if t.total_coverage > 0.9]
        if not well_covered:
            continue
        # Single-argument print calls produce the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('%s %d' % (tax_tree.nodes[spec].organism_name, len(trans)))
        print(' '.join(
            '(%.3f, %.3f)' % (t.total_coverage, t.coverage_fold)
            for t in well_covered))
        print('')
    get_read_aln_distribution(
        read_alns, os.path.sep.join([namespace.output_dir, 'plot.png']))
    get_spec_transcript_distribution(
        spec2trans, os.path.sep.join([namespace.output_dir, 'plot2.png']))
Exemple #5
0
def greedy():
	"""Run the greedy transcript assignment pipeline and write its reports.

	Arguments come from ``get_OTU_assign_option_parser()``; the function reads
	``original_fasta``, ``input_file``, ``db_type`` and ``output_dir``.

	Steps, each timed and logged through the ``Report`` object:
	  1. count and parse the alignment entries (blast tabular, SAM/BAM or
	     blast XML);
	  2. retrieve tax ids for the target sequences and load the tax tree;
	  3. greedily assign reads to transcripts and filter the transcripts by
	     the coverage and fold thresholds;
	  4. report species distributions and tax trees before and after the
	     assignment;
	  5. for a ``'cds'`` database only: remove orthologue strains and output
	     transcript stats, tax2reads JSON and gene expression;
	  6. write the summary.

	Raises:
		ValueError: if the alignment file type is not supported.
	"""
	parser = get_OTU_assign_option_parser()
	args = parser.parse_args()

	# NOTE(review): the result is only referenced by the commented-out
	# profiling calls further down; the call is kept because it may be
	# relied upon to fail early on an unreadable FASTA -- confirm and drop.
	all_reads = utils.reads_from_fasta(args.original_fasta)

	with Report(args.output_dir, 'microbe') as report:

		file_type = utils.get_file_type(args.input_file)

		with report.timeit('Parsing %s <%s>' % (args.input_file, utils.get_appropriate_file_size(args.input_file))):
			if file_type == 'blast':
				count_entries = blast.get_entry_cnt_tab
				parse_func = blast.parse_tab_delimited
			elif file_type in ('sam', 'bam'):
				# NOTE(review): no binary flag is passed for 'bam' input
				# here -- verify that sam.parse_cds_sam detects it itself.
				count_entries = sam.get_entry_cnt_sam
				parse_func = sam.parse_cds_sam
			elif file_type == 'xml':
				count_entries = blast.get_entry_cnt_xml
				parse_func = blast.parse_xml
			else:
				raise ValueError('%s alignment format not supported!' % file_type)

			with utils.timeit('Retrieving entry count'):
				entry_cnt = count_entries(args.input_file)
				report.mark('\tTotal entries: %d' % entry_cnt)
			with utils.timeit('File parsing'):
				read_alns, target_seqs = parse_func(args.input_file, args.db_type, entry_cnt=entry_cnt, detailed=True)
				report.mark('\tTarget sequences: %d' % len(target_seqs))

		with report.timeit('GI2TAX database querying'):
			utils.retrieve_tax_ids(target_seqs, args.db_type)

		with report.timeit('loading tax tree'):
			tt = TaxTree()
			report.mark('Loaded %d nodes.' % len(tt.nodes))

		#profiling.get_read_overlap(target_seqs, read_alns, tt, all_reads)
		#profiling.sequential_read_set_analysis(read_alns, target_seqs, tt)

		coverage_limit = 0.6
		fold_limit = 1.
		with report.timeit('greedy transcript assignment'):
			report.mark('Coverage threshold: %.2f' % coverage_limit)
			report.mark('Fold threshold: %.2f' % fold_limit)
			report.mark('#Transcripts (before read assignment)  : %d' % len(target_seqs))
			prefilt_transcripts = otu.greedy_transcript_assign(target_seqs, read_alns)
			report.mark('#Transcripts (after read assignemnt)   : %d' % len(prefilt_transcripts))
			# Use the same thresholds that were just written to the report,
			# so the logged values cannot drift from the applied ones.
			# The filter is applied for every db_type.
			final_transcripts = otu.filter_by_coverage_fold(prefilt_transcripts, coverage_limit, fold_limit)
			report.mark('#Transcripts (after cov-fold filtering): %d' % len(final_transcripts))

		with report.timeit('Species assignment stats'):
			s2t_nofilt = utils.get_species_transcript_distribution(target_seqs, tt)
			s2t_greedy = utils.get_species_transcript_distribution(final_transcripts, tt)
			report.mark('#Species (pre-read-assignment) : %d' % len(s2t_nofilt))
			report.mark('#Species (post-read-assignment): %d' % len(s2t_greedy))
			report.rank_distribution(s2t_nofilt, tt, 'nofilt')
			report.rank_distribution(s2t_greedy, tt, 'greedy')
			report.tax_tree(s2t_nofilt, tt, 'nofilt')
			report.tax_tree(s2t_greedy, tt, 'greedy')

		if args.db_type == 'cds':
			# Strain filtering and the transcript / gene-expression reports
			# are produced for CDS databases only.
			new_s2t = otu.remove_orthologue_strains(s2t_greedy)

			with report.timeit('Transcript stats'):
				report.transcript_stats(s2t_nofilt, tt, 'nofilt')
				report.transcript_stats(new_s2t, tt, 'greedy', assigned=True)
				report.tax2reads(new_s2t, tt, 'greedy', 'json')

			with report.timeit('Outputing gene expression'):
				report.gene_expression(s2t_nofilt, tt, 'gene_expression_nofilt', assigned=False)
				report.gene_expression(new_s2t, tt, 'gene_expression', assigned=True)
		else:
			new_s2t = s2t_greedy

		report.summary(read_alns, s2t_nofilt, new_s2t, args.original_fasta)
Exemple #6
0
def host_greedy():
	"""Run the config-driven host/microbe greedy assignment pipeline.

	The command line (``get_config_run_parser()``) supplies
	``metagenomix_config``; the parsed config supplies the output directory,
	the input sequence files, the host alignments and the microorganism
	alignments together with their types.

	In order, the function: loads the tax tree and the original FASTA files,
	extracts reads from the host alignments, parses and annotates the
	microorganism alignments, resolves tax ids, performs the greedy
	transcript assignment (split into hypothetical and non-hypothetical
	transcripts), writes the species / transcript / strain / gene-expression
	reports and finally writes the read-name lists (all, host, common to
	host and microbe, without alignment).

	Raises:
		utils.NotSupportedError: if the config entry ``host_separated`` is
			falsy.
		ValueError: if the microorganism alignment type is not blast, sam
			or bam.
	"""
	def stripped_lines(path):
		# One whitespace-stripped entry per line of the file.
		with open(path) as fin:
			return [line.strip() for line in fin]

	args = get_config_run_parser().parse_args()

	config = parse_config(args.metagenomix_config)
	out_dir = config["output_dir"]

	data_access = DataAccess()

	with Report(out_dir, 'host-microbe') as report:
		if not config["host_separated"]:
			raise utils.NotSupportedError("Host separation currently not supported")

		with report.timeit('loading tax tree'):
			tt = TaxTree()
			report.mark('Loaded %d nodes.' % len(tt.nodes))

		with report.timeit("Loading original fastas"):
			for orig_fasta in config["input_seq_files"]:
				report.load_original_fasta(orig_fasta)

		host_aln_type = config["host_aln_type"].lower()
		for host_aln_file in config["host_alignments"]:
			with report.timeit("Extracting read from host alignment"):
				report.extract_reads_from_aln(host_aln_file, host_aln_type)

		db_type = config["microorganism_db_type"].lower()
		microbe_aln_type = config["microorganism_aln_type"].lower()

		# The attribute holding the gi depends on the database type.
		gi_type = 'nucl_gi' if db_type == 'cds' else 'gi'

		def get_gi(target):
			return getattr(target, gi_type)

		# Alignments of all files are accumulated into the same containers.
		read_alns = defaultdict(list)
		target_seqs = {}
		for microbe_aln_file in config["microorganism_alignments"]:
			with report.timeit('File parsing'):
				if microbe_aln_type == 'blast':
					read_alns, target_seqs = blast.parse_tab_delimited(
						microbe_aln_file, db_type, 1e5, read_alns, target_seqs,
						filter_low_scoring=False, annotate=False)
				elif microbe_aln_type in ('sam', 'bam'):
					binary = microbe_aln_type == 'bam'
					read_alns, target_seqs = sam.parse_cds_sam(
						microbe_aln_file, db_type, binary, annotate=False,
						read_alns=read_alns, target_seqs=target_seqs)
				else:
					raise ValueError('%s alignment format not supported!' % microbe_aln_type)

		# Write out every read that has an alignment to a microbe; the file
		# is read back further down.
		report.output_reads(read_alns.keys(), 'microbe', 'reads.txt')
		if microbe_aln_type == 'blast':
			read_alns, target_seqs = blast.annotate_targets(read_alns, target_seqs)
		elif microbe_aln_type in ('sam', 'bam'):
			read_alns, target_seqs = sam.annotate_targets(read_alns, target_seqs)

		with report.timeit('GI2TAX database querying'):
			gis = set(get_gi(target) for target in target_seqs.itervalues())
			tax_ids = data_access.get_taxids(gis)
			missing_gis = set()
			for target in target_seqs.itervalues():
				# -1 marks a target whose gi has no known tax id.
				target.tax_id = tax_ids.get(get_gi(target), -1)
				if target.tax_id == -1:
					missing_gis.add(get_gi(target))
			report.mark('gi count : %d' % len(gis))
			report.mark('tax count: %d' % len(set(tax_ids.values())))
			report.mark('Number of gis for which no TAXID could be found: %d' % len(missing_gis))

		with report.timeit('greedy transcript assignment'):
			report.mark('#Transcripts (before read assignment) : %d' % len(target_seqs))
			prefilt_transcripts = otu.greedy_transcript_assign(target_seqs, read_alns)
			report.mark('#Transcripts (after read assignemnt)  : %d' % len(prefilt_transcripts))
			final_transcripts = otu.filter_by_coverage_fold(prefilt_transcripts, 0.5, 1.)
			hypothetical_ids = otu.find_hypothetical(final_transcripts)
			report.mark('#Transcripts (after cov-fold filtering: %d' % len(final_transcripts))
			# Split the surviving transcripts into the two groups.
			hypothetical = dict(
				(_id, final_transcripts[_id]) for _id in hypothetical_ids)
			non_hypothetical_ids = set(final_transcripts) - set(hypothetical)
			non_hypothetical = dict(
				(_id, final_transcripts[_id]) for _id in non_hypothetical_ids)

		with report.timeit('Species assignment stats'):
			s2t_nofilt = utils.get_species_transcript_distribution(target_seqs, tt)
			s2t_greedy_nh = utils.get_species_transcript_distribution(non_hypothetical, tt)
			s2t_ss = otu.remove_orthologue_strains(s2t_greedy_nh)
			s2t_greedy_h = utils.get_species_transcript_distribution(hypothetical, tt)
			report.mark('#Species (pre-read-assignment)   : %d' % len(s2t_nofilt))
			report.mark('#Species (post-read-assignment)  : %d' % len(s2t_greedy_nh))
			report.mark('#Species (after-strain-filtering): %d' % len(s2t_ss))
			report.rank_distribution(s2t_nofilt, tt, 'nofilt')
			report.rank_distribution(s2t_ss, tt, 'greedy')
			report.tax_tree(s2t_nofilt, tt, 'nofilt')
			report.tax_tree(s2t_greedy_nh, tt, 'greedy')

		with report.timeit('Transcript stats'):
			report.transcript_stats(s2t_nofilt, tt, 'nofilt')
			report.transcript_stats(s2t_ss, tt, 'greedy', assigned=True)
			report.transcript_stats(s2t_greedy_h, tt, 'hypothetical', assigned=True)

		with report.timeit('Outputing gene expression'):
			report.gene_expression(s2t_nofilt, tt, 'gene_expression/nofilt', assigned=False)
			report.gene_expression(s2t_ss, tt, 'gene_expression/greedy', assigned=True)
			report.gene_expression(s2t_greedy_h, tt, 'gene_expression/hypothetical', assigned=True)

		# NOTE(review): the two sections below repeat calls already made
		# above (with other output names for gene expression) -- confirm
		# whether the repetition is intended.
		with report.timeit('Transcript stats'):
			report.transcript_stats(s2t_nofilt, tt, 'nofilt')
			report.transcript_stats(s2t_ss, tt, 'greedy', assigned=True)
			report.tax2reads(s2t_ss, tt, 'greedy', 'json')

		with report.timeit('Different strain stats'):
			report.strains(s2t_ss, tt, 'greedy')
			report.strains(s2t_greedy_h, tt, 'hypothetical')

		with report.timeit('Outputing gene expression'):
			report.gene_expression(s2t_nofilt, tt, 'microbe/gene_expression_nofilt', assigned=False)
			report.gene_expression(s2t_ss, tt, 'microbe/gene_expression', assigned=True)

		# Every line of every file below <out_dir>/orig_seqs.
		total_reads = set()
		for f in os.listdir(out_dir + '/orig_seqs'):
			total_reads.update(stripped_lines(out_dir + '/orig_seqs/' + f))
		report.mark('TOTAL')
		report.output_reads(total_reads, 'orig_seqs', 'reads.txt')

		# Every line of every file below <out_dir>/host.
		host_reads = set()
		host_dir = out_dir + os.path.sep + 'host' + os.path.sep
		for f in os.listdir(host_dir):
			host_reads.update(stripped_lines(host_dir + os.path.sep + f))
		report.mark('HOST')
		report.output_reads(host_reads, 'host', 'reads.txt')

		# The microbial read list written right after parsing.
		microbial_reads = set(stripped_lines(out_dir + '/microbe/reads.txt'))
		report.mark('MICROBE')
		report.mark('Read count: %d' % len(microbial_reads))

		common_reads = host_reads & microbial_reads
		report.mark('common_reads')
		report.output_reads(common_reads, '', 'common_reads.txt')

		reads_with_no_aln = total_reads - (host_reads | microbial_reads)
		report.mark('NO ALN')
		report.output_reads(reads_with_no_aln, '', 'no_aln_reads.txt')
0
def extract_subtaxa():
	"""Extract the FASTA records that belong to the requested taxa.

	Arguments come from ``get_extract_subtaxa_parser()``; the function reads
	``db_type``, ``parent_tax``, ``file``, ``merge``, ``output_dir``,
	``input_db`` and ``gene_count``.

	The taxa of interest are either the single ``parent_tax`` or the
	comma-separated integers stored in ``file``. Records are written either
	into one merged ``db-extract.fa`` (``merge``) or into one
	``<organism name>.fa`` per taxon.

	Without ``gene_count`` every record whose taxon equals, or is a child
	of, a requested taxon is written. With ``gene_count`` at most that many
	records with distinct, non-hypothetical products are written per
	requested taxon, and reading stops once every taxon is saturated.

	All output files are closed even when the extraction fails.

	Raises:
		ValueError: if neither ``parent_tax`` nor ``file`` is given.
	"""
	parser = get_extract_subtaxa_parser()
	args = parser.parse_args()

	if args.db_type == "cds":
		parse_header = db_utils.parse_cds_header
		def get_taxid(data, data_access):
			return int(data["taxon"])
	elif args.db_type == "genome":
		parse_header = db_utils.parse_genome_header
		def get_taxid(data, data_access):
			taxid = data_access.get_taxids((int(data["gi"]),), format=list)
			if len(taxid) == 0:
				return None
			return taxid[0]
	else:
		# NOTE(review): no get_taxid is defined for this db type, so the
		# first record processed below fails with a NameError -- decide how
		# tax ids should be resolved for these headers.
		parse_header = db_utils.parse_nt_header

	if args.parent_tax:
		taxa = [args.parent_tax]
	elif args.file:
		with open(args.file) as fin:
			# A real list: taxa is iterated once per record further down.
			taxa = [int(t) for t in fin.read().strip().split(',')]
	else:
		raise ValueError('Either a parent taxon or a file with taxa has to be supplied.')

	tt = TaxTree()

	# Every handle opened below is registered here and closed in `finally`.
	open_handles = []
	try:
		if args.merge:
			single_file = '{0}{1}db-extract.fa'.format(args.output_dir, os.path.sep)
			merged_handle = open(single_file, 'w')
			open_handles.append(merged_handle)

			def write(tax, record):
				SeqIO.write(record, merged_handle, 'fasta')
		else:
			tax2handle = {}
			for tax in taxa:
				fname = '{0}{1}{2}.fa'.format(args.output_dir, os.path.sep, tt.get_org_name(tax))
				tax2handle[tax] = open(fname, 'w')
				open_handles.append(tax2handle[tax])

			# Defined outside the loop so it exists even for an empty taxa list.
			def write(tax, record):
				SeqIO.write(record, tax2handle[tax], 'fasta')

		data_access = DataAccess()

		if args.gene_count is None:
			with open(args.input_db, 'r') as fin:
				for record in SeqIO.parse(fin, 'fasta'):
					data = parse_header(record.name)
					taxid = get_taxid(data, data_access)
					if taxid is None:
						continue
					for pt in taxa:
						if tt.is_child(taxid, pt) or taxid == pt:
							write(pt, record)
		else:
			tax2genes = defaultdict(set)
			# Taxa that still need more genes; emptied as they saturate.
			interesting_taxa = set(taxa)
			with open(args.input_db) as fin:
				for record in SeqIO.parse(fin, 'fasta'):
					if not interesting_taxa:
						break
					data = parse_header(record.name)
					taxid = get_taxid(data, data_access)
					if taxid is None:
						continue
					lineage = tt.get_lineage(taxid)
					overlap = set(lineage) & interesting_taxa
					if not overlap:
						continue
					# When several requested taxa are in the lineage an
					# arbitrary one of them receives the record.
					parent_tax = overlap.pop()
					if 'product' not in data:
						continue
					product = data['product']
					if product != 'hypothetical protein' and product not in tax2genes[parent_tax]:
						write(parent_tax, record)
						tax2genes[parent_tax].add(product)
						if len(tax2genes[parent_tax]) >= args.gene_count:
							interesting_taxa.remove(parent_tax)
	finally:
		# An explicit loop: a lazy map() would never call close() on Python 3.
		for handle in open_handles:
			handle.close()
Exemple #8
0
def msim_profile():
    """Profile a metasim FASTA file into a JSON file.

    Reads ``metasim_fasta`` and ``output_json`` from the arguments parsed by
    ``get_msim_profile_parser()`` and passes them, together with a freshly
    built ``TaxTree``, to ``synth_dataset_profile``.
    """
    args = get_msim_profile_parser().parse_args()
    tax_tree = TaxTree()
    synth_dataset_profile(args.metasim_fasta, tax_tree, args.output_json)