def analyze():
    """Run the OTU analysis on an alignment database and export the results.

    Command-line arguments are taken from ``get_aln_option_parser()``; this
    function reads ``db_file`` and ``output_dir`` from the parsed namespace.

    Paths inside ``output_dir`` handed to the analysis / export helpers:

    * ``no_alignment_reads.fa``, ``basic_alignment_stats.txt`` and
      ``phylum_composition.csv`` are passed to
      ``analysis.perform_OTU_analysis_on_db``;
    * ``no_OTU_assignment_reads.fa`` receives the reads mapped to ``-1``;
    * ``OTU_assignment.csv`` / ``OTU_assignment.json`` receive the
      read-to-taxon mapping.

    The database connection is always closed, even when a step fails.
    """
    parser = get_aln_option_parser()
    namespace = parser.parse_args()

    def in_output_dir(file_name):
        # Same path construction as before: <output_dir><sep><file_name>.
        return os.path.sep.join([namespace.output_dir, file_name])

    empty_reads_file = in_output_dir('no_alignment_reads.fa')
    basic_stats_file = in_output_dir('basic_alignment_stats.txt')
    phylum_composition_file = in_output_dir('phylum_composition.csv')
    no_otu_reads_file = in_output_dir('no_OTU_assignment_reads.fa')
    otu_csv = in_output_dir('OTU_assignment.csv')
    otu_json = in_output_dir('OTU_assignment.json')

    db = sqlite3.connect(namespace.db_file)
    try:
        cursor = db.cursor()

        with timeit('LOADING TAX TREE'):
            tt = TaxTree()

        # The second returned value (low-count taxa) is not used here.
        read2tax, _low_count_taxa = analysis.perform_OTU_analysis_on_db(
            db, cursor, tt, empty_reads_file, basic_stats_file,
            phylum_composition_file)

        # Reads whose assigned taxon is -1 are saved separately.
        no_otu_reads = [read for read in read2tax.keys()
                        if read2tax[read] == -1]
        save_reads_to_fasta(db, cursor, no_otu_reads, no_otu_reads_file)

        export_OTU_to_csv(read2tax, tt, otu_csv)
        export_OTU_to_json(read2tax, tt, otu_json)
    finally:
        # Release the connection regardless of how the analysis ended.
        db.close()
def metasim_analysis():
    """Compare a metagenomix result with a MetaSim dataset profile.

    Arguments come from ``get_metasim_eval_parser()``; the function reads
    ``output_dir``, ``metasim_fasta`` and ``metagenomix_json``.  It profiles
    the MetaSim FASTA, writes read- and transcript-level comparison JSON
    files, and renders one D3 HTML page for each of them.  All outputs are
    placed directly under ``output_dir``.
    """
    args = get_metasim_eval_parser().parse_args()
    tt = TaxTree()

    def output_path(file_name):
        # <output_dir><sep><file_name>, without any path normalisation.
        return '{0}{1}{2}'.format(args.output_dir, os.path.sep, file_name)

    # NOTE(review): absolute, machine-specific template location.
    template_html = '/home/abulovic/BINNER/tools/metagenomix/metagenomix/visualization/metasim-metagenomix-template.html'

    # Profile of the synthetic dataset.
    metasim_json = output_path('metasim-species-reads.json')
    synth_dataset_profile(args.metasim_fasta, tt, metasim_json)

    # JSON comparisons (reads first, then transcripts).
    reads_comp_json = output_path('metagenomix-metasim-reads-comparison.json')
    out.read_comparison_json(
        args.metagenomix_json, metasim_json, reads_comp_json, tt)

    transcript_comp_json = output_path(
        'metagenomix-metasim-transcript-comparison.json')
    out.transcript_comparison_json(
        args.metagenomix_json, metasim_json, transcript_comp_json, tt)

    # HTML plots, rendered in the same order as the JSON files were written.
    plots = (
        (reads_comp_json,
         output_path('metagenomix-metasim-reads-comparison.html')),
        (transcript_comp_json,
         output_path('metagenomix-metasim-transcript-comparison.html')),
    )
    for comparison_json, output_html in plots:
        out.default_d3_plot(comparison_json, template_html, output_html, 2)
def lca():
    """Assign reads to taxa with the LCA strategy and write the reports.

    Arguments come from ``get_OTU_assign_option_parser()``; the function
    reads ``input_file``, ``db_type`` and ``output_dir``.  ``output_dir`` is
    created when it does not exist.

    Steps, each timed through the ``Report`` object:

    1. parse the alignment file (``blast``, ``sam`` or ``bam``);
    2. look up a tax id for every target sequence gi (``-1`` when missing);
    3. load the taxonomy tree;
    4. run ``otu.lca_read_assign`` with ``max_alns=50``;
    5. report the tax-to-reads mapping and the species rank distribution.

    Raises:
        ValueError: the detected alignment file type is not supported.
    """
    parser = get_OTU_assign_option_parser()
    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    with Report(args.output_dir, 'host-microbe') as report:
        file_type = utils.get_file_type(args.input_file)

        # The label uses input_file, the same attribute that is parsed below
        # (it used to read args.aln_file, which nothing else here refers to).
        parsing_label = 'Parsing %s <%s>' % (
            args.input_file,
            utils.get_appropriate_file_size(args.input_file))
        with report.timeit(parsing_label):
            if file_type == 'blast':
                read_alns, target_seqs = blast.parse_tab_delimited(
                    args.input_file, args.db_type)
            elif file_type in ('sam', 'bam'):
                binary = file_type == 'bam'
                read_alns, target_seqs = sam.parse_cds_sam(
                    args.input_file, args.db_type, binary)
            else:
                raise ValueError(
                    '%s alignment format not supported!' % file_type)

        data_access = DataAccess()

        with report.timeit('GI2TAX database querying'):
            gis = set(target.gi for target in target_seqs.itervalues())
            missing_gis = set()
            tax_ids = data_access.get_taxids(gis)
            for target in target_seqs.itervalues():
                # -1 marks a target whose gi has no known tax id.
                target.tax_id = tax_ids.get(target.gi, -1)
                if target.tax_id == -1:
                    missing_gis.add(target.gi)
            report.mark('gi count : %d' % len(gis))
            report.mark('tax count: %d' % len(tax_ids))
            report.mark('Number of gis for which no TAXID could be found: %d'
                        % len(missing_gis))

        with report.timeit('Loading tax tree'):
            tt = TaxTree()
            report.mark('Loaded %d nodes.' % len(tt.nodes))

        with report.timeit('lca read assignment'):
            tax2reads = otu.lca_read_assign(
                target_seqs, read_alns, tt, max_alns=50)

        with report.timeit('tax stats'):
            species2reads = utils.get_rank_read_distribution(
                tax2reads, tt, 'species')
            report.tax2reads(tax2reads, tt, 'lca', 'json')
            report.rank_distribution(species2reads, tt, 'lca')
def parse():
    """Parse a CDS alignment file, print well-covered species, draw plots.

    Arguments come from ``get_parse_option_parser()``; the function reads
    ``type``, ``input_file`` and ``output_dir``.

    For every species with a positive id that has at least one transcript
    with ``total_coverage > 0.9`` it prints the organism name with the
    species' transcript count, then the ``(total_coverage, coverage_fold)``
    pairs of the well-covered transcripts, then an empty line.  Afterwards
    the read-alignment and species-transcript distributions are written to
    ``plot.png`` and ``plot2.png`` inside ``output_dir``.

    Raises:
        NotImplementedError: ``type`` is anything other than ``'cds'``.
        ValueError: the detected alignment file type is not supported.
    """
    parser = get_parse_option_parser()
    namespace = parser.parse_args()

    if namespace.type != 'cds':
        raise NotImplementedError('Genome parsing will be up in a jiffy.')

    file_type = get_file_type(namespace.input_file)

    with timeit('Parsing alignment file'):
        if file_type == 'blast':
            read_alns, transcripts = parse_cds_megablast(
                namespace.input_file, namespace.output_dir)
        elif file_type in ('sam', 'bam'):
            read_alns, transcripts = parse_cds_sam(
                namespace.input_file, namespace.output_dir,
                binary=(file_type == 'bam'))
        else:
            # Fail here instead of with a NameError on read_alns further down.
            raise ValueError('%s alignment format not supported!' % file_type)

    with timeit('Loading tax tree'):
        tax_tree = TaxTree()

    spec2trans = get_species_transcript_distribution(transcripts, tax_tree)

    for spec, trans in spec2trans.iteritems():
        if spec <= 0:
            continue
        # Computed once per species; only well-covered transcripts are listed.
        well_covered = [t for t in trans if t.total_coverage > 0.9]
        if not well_covered:
            continue
        # Single-argument print() calls behave identically as Python 2 print
        # statements, so the output layout is unchanged.
        print('%s %d' % (tax_tree.nodes[spec].organism_name, len(trans)))
        print(' '.join(
            "(%.3f, %.3f)" % (t.total_coverage, t.coverage_fold)
            for t in well_covered))
        print('')

    get_read_aln_distribution(
        read_alns, os.path.sep.join([namespace.output_dir, 'plot.png']))
    get_spec_transcript_distribution(
        spec2trans, os.path.sep.join([namespace.output_dir, 'plot2.png']))
def greedy():
    """Greedily assign reads to transcripts and report species-level results.

    Arguments come from ``get_OTU_assign_option_parser()``; the function
    reads ``input_file``, ``db_type``, ``output_dir`` and ``original_fasta``.

    Steps, timed and logged through the ``Report`` object:

    1. count and parse the alignment entries (``blast``, ``sam``/``bam`` or
       ``xml``);
    2. retrieve tax ids for the target sequences and load the taxonomy tree;
    3. run ``otu.greedy_transcript_assign`` and filter the result with
       ``otu.filter_by_coverage_fold`` using the thresholds below;
    4. report species distributions before and after the assignment;
    5. for ``db_type == 'cds'`` additionally remove orthologue strains and
       output transcript statistics and gene expression;
    6. write the tax-to-reads JSON and the summary.

    Raises:
        ValueError: the detected alignment file type is not supported.
    """
    parser = get_OTU_assign_option_parser()
    args = parser.parse_args()

    # Thresholds are defined once so the logged values and the values that
    # are actually applied by the filter cannot drift apart.
    coverage_limit = 0.6
    fold_limit = 1.
    is_cds = args.db_type == 'cds'

    with Report(args.output_dir, 'microbe') as report:
        file_type = utils.get_file_type(args.input_file)

        parsing_label = 'Parsing %s <%s>' % (
            args.input_file,
            utils.get_appropriate_file_size(args.input_file))
        with report.timeit(parsing_label):
            if file_type == 'blast':
                count_entries = blast.get_entry_cnt_tab
                parse_func = blast.parse_tab_delimited
            elif file_type in ('sam', 'bam'):
                # NOTE(review): other callers in this file pass a `binary`
                # flag to sam.parse_cds_sam for BAM input; none is passed
                # here - confirm BAM files are handled correctly.
                count_entries = sam.get_entry_cnt_sam
                parse_func = sam.parse_cds_sam
            elif file_type == 'xml':
                count_entries = blast.get_entry_cnt_xml
                parse_func = blast.parse_xml
            else:
                raise ValueError(
                    '%s alignment format not supported!' % file_type)

            with utils.timeit('Retrieving entry count'):
                entry_cnt = count_entries(args.input_file)
            report.mark('\tTotal entries: %d' % entry_cnt)

            with utils.timeit('File parsing'):
                read_alns, target_seqs = parse_func(
                    args.input_file, args.db_type,
                    entry_cnt=entry_cnt, detailed=True)
            report.mark('\tTarget sequences: %d' % len(target_seqs))

        with report.timeit('GI2TAX database querying'):
            utils.retrieve_tax_ids(target_seqs, args.db_type)

        with report.timeit('loading tax tree'):
            tt = TaxTree()
            report.mark('Loaded %d nodes.' % len(tt.nodes))

        with report.timeit('greedy transcript assignment'):
            report.mark('Coverage threshold: %.2f' % coverage_limit)
            report.mark('Fold threshold: %.2f' % fold_limit)
            report.mark('#Transcripts (before read assignment) : %d'
                        % len(target_seqs))
            prefilt_transcripts = otu.greedy_transcript_assign(
                target_seqs, read_alns)
            report.mark('#Transcripts (after read assignemnt) : %d'
                        % len(prefilt_transcripts))
            # The coverage/fold filter is applied for every db_type.
            final_transcripts = otu.filter_by_coverage_fold(
                prefilt_transcripts, coverage_limit, fold_limit)
            report.mark('#Transcripts (after cov-fold filtering): %d'
                        % len(final_transcripts))

        with report.timeit('Species assignment stats'):
            s2t_nofilt = utils.get_species_transcript_distribution(
                target_seqs, tt)
            s2t_greedy = utils.get_species_transcript_distribution(
                final_transcripts, tt)
            report.mark('#Species (pre-read-assignment) : %d'
                        % len(s2t_nofilt))
            report.mark('#Species (post-read-assignment): %d'
                        % len(s2t_greedy))
            report.rank_distribution(s2t_nofilt, tt, 'nofilt')
            report.rank_distribution(s2t_greedy, tt, 'greedy')
            report.tax_tree(s2t_nofilt, tt, 'nofilt')
            report.tax_tree(s2t_greedy, tt, 'greedy')

        # Strain de-duplication only happens for CDS databases.
        if is_cds:
            new_s2t = otu.remove_orthologue_strains(s2t_greedy)
        else:
            new_s2t = s2t_greedy

        if is_cds:
            with report.timeit('Transcript stats'):
                report.transcript_stats(s2t_nofilt, tt, 'nofilt')
                report.transcript_stats(new_s2t, tt, 'greedy', assigned=True)

        report.tax2reads(new_s2t, tt, 'greedy', 'json')

        if is_cds:
            with report.timeit('Outputing gene expression'):
                report.gene_expression(
                    s2t_nofilt, tt, 'gene_expression_nofilt', assigned=False)
                report.gene_expression(
                    new_s2t, tt, 'gene_expression', assigned=True)

        report.summary(read_alns, s2t_nofilt, new_s2t, args.original_fasta)
def host_greedy():
    """Run the config-driven host/microbe pipeline with greedy assignment.

    The configuration file named by ``args.metagenomix_config`` is parsed
    with ``parse_config``.  Keys read from it here: ``output_dir``,
    ``host_separated``, ``input_seq_files``, ``host_aln_type``,
    ``host_alignments``, ``microorganism_db_type``,
    ``microorganism_aln_type`` and ``microorganism_alignments``.

    Raises:
        utils.NotSupportedError: ``host_separated`` is falsy in the config.
        ValueError: the microorganism alignment type is not blast/sam/bam.
    """
    parser = get_config_run_parser()
    args = parser.parse_args()
    config = parse_config(args.metagenomix_config)
    out_dir = config["output_dir"]
    data_access = DataAccess()

    with Report(out_dir, 'host-microbe') as report:
        if not config["host_separated"]:
            raise utils.NotSupportedError("Host separation currently not supported")

        with report.timeit('loading tax tree'):
            tt = TaxTree()
            report.mark('Loaded %d nodes.' % len(tt.nodes))

        with report.timeit("Loading original fastas"):
            # The enumerate index is not used.
            for i, orig_fasta in enumerate(config["input_seq_files"], 1):
                report.load_original_fasta(orig_fasta)

        host_aln_type = config["host_aln_type"].lower()
        for i, aln_file in enumerate(config["host_alignments"]):
            with report.timeit("Extracting read from host alignment"):
                report.extract_reads_from_aln(aln_file, host_aln_type)

        db_type = config["microorganism_db_type"].lower()
        microbe_aln_type = config["microorganism_aln_type"].lower()
        # CDS targets are looked up through their `nucl_gi` attribute, all
        # other database types through `gi`.
        gi_type = 'nucl_gi' if db_type == 'cds' else 'gi'
        get_gi = lambda t: getattr(t, gi_type)

        # Both containers are passed into and re-bound from every parse call,
        # so results accumulate over all microorganism alignment files.
        read_alns = defaultdict(list)
        target_seqs = {}
        for i, aln_file in enumerate(config["microorganism_alignments"]):
            with report.timeit('File parsing'):
                if microbe_aln_type == 'blast':
                    read_alns, target_seqs = blast.parse_tab_delimited(aln_file, db_type, 1e5, read_alns, target_seqs, filter_low_scoring=False, annotate=False)
                elif microbe_aln_type in ('sam', 'bam'):
                    binary = True if microbe_aln_type == 'bam' else False
                    read_alns, target_seqs = sam.parse_cds_sam(aln_file, db_type, binary, annotate=False, read_alns=read_alns, target_seqs=target_seqs)
                else:
                    raise ValueError('%s alignment format not supported!'
                                     % microbe_aln_type)

        # Write to file all the reads with alignments to microbes
        report.output_reads(read_alns.keys(), 'microbe', 'reads.txt')

        # Parsing ran with annotate=False; annotation is done once, here,
        # after all files have been read.
        if microbe_aln_type == 'blast':
            read_alns, target_seqs = blast.annotate_targets(read_alns, target_seqs)
        elif microbe_aln_type in ('sam', 'bam'):
            read_alns, target_seqs = sam.annotate_targets(read_alns, target_seqs)

        with report.timeit('GI2TAX database querying'):
            gis = set(map(lambda t: get_gi(t), target_seqs.itervalues()))
            tax_ids = data_access.get_taxids(gis)
            missing_gis = set()
            for target in target_seqs.itervalues():
                # -1 marks a target whose gi has no tax id in the lookup.
                target.tax_id = tax_ids.get(get_gi(target), -1)
                if target.tax_id == -1:
                    missing_gis.add(get_gi(target))
            report.mark('gi count : %d' % len(gis))
            report.mark('tax count: %d' % len(set(tax_ids.values())))
            report.mark('Number of gis for which no TAXID could be found: %d' % len(missing_gis))

        with report.timeit('greedy transcript assignment'):
            report.mark('#Transcripts (before read assignment) : %d' % len(target_seqs))
            prefilt_transcripts = otu.greedy_transcript_assign(target_seqs, read_alns)
            report.mark('#Transcripts (after read assignemnt) : %d' % len(prefilt_transcripts))
            final_transcripts = otu.filter_by_coverage_fold(prefilt_transcripts, 0.5, 1.)
            hypothetical_ids = otu.find_hypothetical(final_transcripts)
            report.mark('#Transcripts (after cov-fold filtering: %d' % len(final_transcripts))

        # Split the filtered transcripts into the ids returned by
        # otu.find_hypothetical and all remaining ids.
        hypothetical = {_id: final_transcripts[_id] for _id in hypothetical_ids}
        non_hypothetical_ids = set(final_transcripts) - set(hypothetical)
        non_hypothetical = {_id: final_transcripts[_id] for _id in non_hypothetical_ids}

        with report.timeit('Species assignment stats'):
            s2t_nofilt = utils.get_species_transcript_distribution(target_seqs, tt)
            s2t_greedy_nh = utils.get_species_transcript_distribution(non_hypothetical, tt)
            # Strain filtering is applied to the non-hypothetical set only.
            s2t_ss = otu.remove_orthologue_strains(s2t_greedy_nh)
            s2t_greedy_h = utils.get_species_transcript_distribution(hypothetical, tt)
            report.mark('#Species (pre-read-assignment) : %d' % len(s2t_nofilt))
            report.mark('#Species (post-read-assignment) : %d' % len(s2t_greedy_nh))
            report.mark('#Species (after-strain-filtering): %d' % len(s2t_ss))
            report.rank_distribution(s2t_nofilt, tt, 'nofilt')
            report.rank_distribution(s2t_ss, tt, 'greedy')
            report.tax_tree(s2t_nofilt, tt, 'nofilt')
            report.tax_tree(s2t_greedy_nh, tt, 'greedy')

        with report.timeit('Transcript stats'):
            report.transcript_stats(s2t_nofilt, tt, 'nofilt')
            report.transcript_stats(s2t_ss, tt, 'greedy', assigned=True)
            report.transcript_stats(s2t_greedy_h, tt, 'hypothetical', assigned=True)

        with report.timeit('Outputing gene expression'):
            report.gene_expression(s2t_nofilt, tt, 'gene_expression/nofilt', assigned=False)
            report.gene_expression(s2t_ss, tt, 'gene_expression/greedy', assigned=True)
            report.gene_expression(s2t_greedy_h, tt, 'gene_expression/hypothetical', assigned=True)

        # NOTE(review): the 'nofilt' and 'greedy' transcript stats were
        # already reported above with the same arguments - confirm that the
        # repetition is intended.
        with report.timeit('Transcript stats'):
            report.transcript_stats(s2t_nofilt, tt, 'nofilt')
            report.transcript_stats(s2t_ss, tt, 'greedy', assigned=True)
            report.tax2reads(s2t_ss, tt, 'greedy', 'json')

        with report.timeit('Different strain stats'):
            report.strains(s2t_ss, tt, 'greedy')
            report.strains(s2t_greedy_h, tt, 'hypothetical')

        # Second gene-expression export, this time under 'microbe/...'.
        with report.timeit('Outputing gene expression'):
            report.gene_expression(s2t_nofilt, tt, 'microbe/gene_expression_nofilt', assigned=False)
            report.gene_expression(s2t_ss, tt, 'microbe/gene_expression', assigned=True)

        # Collect the stripped lines of every file under <out_dir>/orig_seqs.
        # NOTE(review): if report.output_reads writes 'reads.txt' into this
        # same directory, a re-run would read that file back in - verify.
        total_reads = list()
        for f in os.listdir(out_dir + '/orig_seqs'):
            fname = out_dir + '/orig_seqs/' + f
            with open(fname) as fin:
                total_reads.extend([line.strip() for line in fin])
        total_reads = set(total_reads)
        report.mark('TOTAL')
        report.output_reads(total_reads, 'orig_seqs', 'reads.txt')

        # Let's load all the host-reads
        host_reads = list()
        host_dir = out_dir + os.path.sep + 'host' + os.path.sep
        for f in os.listdir(host_dir):
            # host_dir already ends with a separator, so the path contains a
            # doubled separator here.
            with open(host_dir + os.path.sep + f) as fin:
                host_reads.extend([line.strip() for line in fin])
        host_reads = set(host_reads)
        report.mark('HOST')
        report.output_reads(host_reads, 'host', 'reads.txt')

        # Let's load all the microbial reads
        with open(out_dir + '/microbe/reads.txt') as fin:
            microbial_reads = set([line.strip() for line in fin])
        report.mark('MICROBE')
        report.mark('Read count: %d' % len(microbial_reads))

        # Reads present in both the host and the microbial sets.
        common_reads = host_reads & microbial_reads
        report.mark('common_reads')
        report.output_reads(common_reads, '', 'common_reads.txt')

        # Reads that appear in neither the host nor the microbial set.
        reads_with_no_aln = total_reads - (host_reads | microbial_reads)
        report.mark('NO ALN')
        report.output_reads(reads_with_no_aln, '', 'no_aln_reads.txt')
def extract_subtaxa():
    """Extract the records of selected taxa from a FASTA database.

    Arguments come from ``get_extract_subtaxa_parser()``; the function reads
    ``db_type``, ``parent_tax``, ``file``, ``merge``, ``output_dir``,
    ``input_db`` and ``gene_count``.

    * The taxa of interest are ``[parent_tax]`` or, failing that, the
      comma-separated integers stored in ``file``.
    * With ``merge`` set, every selected record goes to ``db-extract.fa``;
      otherwise each taxon gets its own ``<organism name>.fa`` file.
    * Without ``gene_count`` a record is written for every taxon of interest
      that equals its tax id or for which ``tt.is_child`` holds.
    * With ``gene_count`` a record is written for one taxon of interest found
      in its lineage, skipping ``'hypothetical protein'`` products and
      products already written for that taxon; a taxon is dropped once
      ``gene_count`` distinct products were written, and reading stops when
      no taxon is left.

    All output files are closed even when processing fails midway.

    Raises:
        NotImplementedError: ``db_type`` is neither ``'cds'`` nor
            ``'genome'`` (no tax id extraction exists for other headers).
        ValueError: neither ``parent_tax`` nor ``file`` was supplied.
    """
    parser = get_extract_subtaxa_parser()
    args = parser.parse_args()

    if args.db_type == "cds":
        parse_header = db_utils.parse_cds_header

        def get_taxid(data, data_access):
            return int(data["taxon"])
    elif args.db_type == "genome":
        parse_header = db_utils.parse_genome_header

        def get_taxid(data, data_access):
            taxid = data_access.get_taxids((int(data["gi"]),), format=list)
            if len(taxid) == 0:
                return None
            return taxid[0]
    else:
        # No get_taxid was ever defined for this case, so processing used to
        # die with a NameError on the first record, after the output files
        # had already been created.  Fail before touching anything instead.
        raise NotImplementedError(
            'Tax id extraction is not implemented for db_type %r'
            % (args.db_type,))

    if args.parent_tax:
        taxa = [args.parent_tax]
    elif args.file:
        with open(args.file) as fin:
            taxa = [int(t) for t in fin.read().strip().split(',')]
    else:
        raise ValueError(
            'No taxa to extract: supply either a parent taxon or a taxa file')

    tt = TaxTree()

    open_handles = []  # everything opened below, closed in `finally`
    try:
        if args.merge:
            single_file = '{0}{1}db-extract.fa'.format(
                args.output_dir, os.path.sep)
            merged_handle = open(single_file, 'w')
            open_handles.append(merged_handle)

            def write(tax, record):
                SeqIO.write(record, merged_handle, 'fasta')
        else:
            tax2handle = {}
            for tax in taxa:
                fname = '{0}{1}{2}.fa'.format(
                    args.output_dir, os.path.sep, tt.get_org_name(tax))
                tax_handle = open(fname, 'w')
                open_handles.append(tax_handle)
                tax2handle[tax] = tax_handle

            def write(tax, record):
                SeqIO.write(record, tax2handle[tax], 'fasta')

        data_access = DataAccess()

        if args.gene_count is None:
            with open(args.input_db, 'r') as fin:
                for record in SeqIO.parse(fin, 'fasta'):
                    data = parse_header(record.name)
                    taxid = get_taxid(data, data_access)
                    if taxid is None:
                        continue
                    for pt in taxa:
                        if tt.is_child(taxid, pt) or taxid == pt:
                            write(pt, record)
        else:
            tax2genes = defaultdict(set)
            interesting_taxa = set(taxa)
            with open(args.input_db) as fin:
                for record in SeqIO.parse(fin, 'fasta'):
                    # Every taxon already reached its gene quota.
                    if not interesting_taxa:
                        break
                    data = parse_header(record.name)
                    taxid = get_taxid(data, data_access)
                    if taxid is None:
                        continue
                    overlap = set(tt.get_lineage(taxid)) & interesting_taxa
                    if not overlap:
                        continue
                    # If several taxa of interest are in the lineage, an
                    # arbitrary one of them receives the record.
                    parent_tax = overlap.pop()
                    if 'product' not in data:
                        continue
                    product = data['product']
                    seen_products = tax2genes[parent_tax]
                    if (product != 'hypothetical protein'
                            and product not in seen_products):
                        write(parent_tax, record)
                        seen_products.add(product)
                        if len(seen_products) >= args.gene_count:
                            interesting_taxa.remove(parent_tax)
    finally:
        for open_handle in open_handles:
            open_handle.close()
def msim_profile():
    """Write the profile of a MetaSim FASTA dataset to a JSON file.

    Arguments come from ``get_msim_profile_parser()``; ``metasim_fasta`` is
    the input and ``output_json`` the destination, both handed to
    ``synth_dataset_profile`` together with a freshly built ``TaxTree``.
    """
    cli_args = get_msim_profile_parser().parse_args()
    tax_tree = TaxTree()
    synth_dataset_profile(
        cli_args.metasim_fasta,
        tax_tree,
        cli_args.output_json,
    )