def main():
    '''
    Run the CDS coverage analysis followed by the read assignment analysis.

    Steps, in the order they are executed:
      1. Parse arguments and create the data access object.
      2. Build the taxonomy tree and load the alignment file.
      3. Optionally (args.remove_host) filter out host reads/alignments.
      4. Load the referenced CDS records and map alignments onto them.
      5. Select CDSs with a product, then those accepted by is_ribosomal(),
         then the ribosomal CDSs whose mean coverage is above the ribosomal
         average; each stage is handed to export_CDS_stats_data().
      6. Count the selected CDSs per species-level taxon and call
         assignment_analysis().

    Counters gathered along the way are written to "CDSs_summary.txt"
    inside args.export_folder.
    '''
    # Input arguments
    argparser = ArgParser()
    args = argparser.parse_args()

    # Access database
    dataAccess = DataAccess(args)

    # ------------------ #
    print('1. Loading tax tree...')
    start = time.time()
    tax_tree = TaxTree()
    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------ #
    print('2. Loading alignment file...')
    start = time.time()
    read_container = ReadContainer()
    read_container.load_alignment_data(args.alignment_file)
    # Set taxids for all alignments
    read_container.set_taxids(dataAccess)
    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------ #
    # Create the export folder if it does not exist
    if not os.path.exists(args.export_folder):
        os.makedirs(args.export_folder)

    # File for data analysis summary. The context manager guarantees the
    # handle is closed even if one of the pipeline steps below raises.
    summary_path = os.path.join(args.export_folder, "CDSs_summary.txt")
    with open(summary_path, 'w') as cds_summary:

        if args.remove_host:
            print("Removing host...")
            start = time.time()

            # ------- Filter host reads ------- #
            new_reads = host_filter.filter_potential_host_reads(
                read_container.fetch_all_reads(format=list),
                tax_tree.tax2relevantTax,
                tax_tree.potential_hosts,
                host_filter.perc_of_host_alignments_larger_than)

            dataAccess.clear_cache()  # deletes gi2taxid cache

            reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
                new_reads,
                tax_tree.tax2relevantTax,
                tax_tree.potential_hosts,
                True,   # delete host alignments
                True,   # filter unassigned
                -1)     # unassigned taxid

            read_count = len(read_container.fetch_all_reads(format=list))
            host_read_count = read_count - len(reads_with_no_host_alignments)
            non_host_read_count = read_count - host_read_count

            # Guard the ratios: an alignment file without reads must not
            # abort the run with a ZeroDivisionError.
            if read_count:
                host_fraction = host_read_count / float(read_count)
                non_host_fraction = non_host_read_count / float(read_count)
            else:
                host_fraction = 0.0
                non_host_fraction = 0.0

            cds_summary.write("total   : {0:8d}\n".format(read_count))
            cds_summary.write("host    : {0:8d} {1:.2f}\n".format(
                host_read_count, host_fraction))
            cds_summary.write("non-host: {0:8d} {1:.2f}\n".format(
                non_host_read_count, non_host_fraction))

            # Set host-free reads
            read_container.set_new_reads(reads_with_no_host_alignments)
            end = time.time()
            print("done: {0:.2f} sec".format(end - start))

        # ------- Load all records ------- #
        print('4. Loading referenced records...')
        start = time.time()
        record_container = RecordContainer()
        record_container.set_db_access(dataAccess)
        record_container.populate(
            read_container.fetch_all_reads_versions(), table='cds')
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        # -- Map alignments to genes ----- #
        print('5. Mapping alignments to genes...')
        start = time.time()
        read_container.populate_cdss(record_container)
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        # - Record all alignments to gene - #
        print('6. Populating CDS container...')
        start = time.time()
        cds_aln_container = CdsAlnContainer()
        cds_aln_container.populate(read_container.fetch_all_reads(format=list))
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        # ------------------------------- #
        print('Sorting CDSs ...DISABLED')
        start = time.time()
        # Sorting by cds_aln.get_std_over_mean() is disabled; the CDS
        # alignments are used in the order the container returns them.
        cds_alns = cds_aln_container.fetch_all_cds_alns(format=list)
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        # ------------------------------- #
        # The "phase 0" export of all CDSs ("0_all_CDSs.txt") is disabled.

        # Count Nones in cds_alns
        nones = count_nones(cds_alns)
        cds_summary.write("\n")
        cds_summary.write("gene None      : {0}\n".format(nones['gene']))
        cds_summary.write("protein_id None: {0}\n".format(nones['protein_id']))
        cds_summary.write("product None   : {0}\n".format(nones['product']))
        cds_summary.write("\n")

        cds_summary.write("CDSs all: {0}\n".format(len(cds_alns)))

        print('Filtering valid CDSs...')
        start = time.time()
        # Keep CDSs above the minimum length and mean coverage that also
        # have a product (CDSs without a gene name are deliberately kept).
        min_mean_coverage = 0
        min_length = 0
        cds_alns_targeted = [
            cds_aln for cds_aln in cds_alns
            if cds_aln.get_cds_length() > min_length
            and cds_aln.get_mean_coverage() > min_mean_coverage
            and cds_aln.cds.product is not None]
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        # All valid CDSs - output coverage/length histogram data
        print("Exporting phase 1 - all CDSs...")
        start = time.time()
        export_CDS_stats_data(cds_alns_targeted, args.export_folder,
                              "1_all_valid_CDSs.txt")
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        # ------- CDSs filtered and ready to be analyzed ------- #
        print('Extracting ribosomal CDSs...')
        # Number of targeted CDSs
        cds_summary.write("CDSs valid: {0}\n".format(len(cds_alns_targeted)))

        # Keep the CDSs whose product is accepted by is_ribosomal()
        cds_alns_ribosomal = [cds_aln for cds_aln in cds_alns_targeted
                              if is_ribosomal(cds_aln.cds.product)]
        print('done')

        # ------- Ribosomal CDSs acquired! ------- #
        print('Analysing ribosomals...')
        # Average and maximum of the per-CDS mean coverage
        mean_coverages = [cds_aln.get_mean_coverage()
                          for cds_aln in cds_alns_ribosomal]
        mm_cov = sum(mean_coverages)
        max_cov = max([0] + mean_coverages)
        if mm_cov > 0:
            # float() forces true division so the average is not floored
            # under Python 2 should the coverages be integers.
            mm_cov /= float(len(cds_alns_ribosomal))

        cds_summary.write("ribosomals all {0}\n".format(len(cds_alns_ribosomal)))
        cds_summary.write("mean coverage: {0}\n".format(mm_cov))
        cds_summary.write("max coverage : {0}\n".format(max_cov))
        print('done')

        # Ribosomal CDSs only - output coverage/length histogram
        print("Exporting phase 2 - ribosomal CDSs only...")
        export_CDS_stats_data(cds_alns_ribosomal, args.export_folder,
                              "2_ribosomal_CDSs.txt")
        print("done")

        # ------- Making biological sense - choosing CDSs ------- #
        print('Filtering under-average ribosomals...')
        # NOTE: take length into consideration?
        cds_alns_ribosomal = [cds_aln for cds_aln in cds_alns_ribosomal
                              if cds_aln.get_mean_coverage() > mm_cov]
        print('done')

        cds_summary.write("ribosomals over-mean: {0}\n".format(
            len(cds_alns_ribosomal)))

    print('Phase 3 - filtered ribosomal CDSs...')
    export_CDS_stats_data(cds_alns_ribosomal, args.export_folder,
                          "3_ribosomal_CDSs_filtered.txt")
    print('done')

    # Store charts coverage data - if selected so
    if args.export_charts:
        print("Exporting chart coverage data...")
        export_CDS_graph_data(cds_alns_ribosomal, args.export_charts)
        print("done.")

    # ------- CDSs chosen - determine species and analyse ------- #
    # Species level resolution
    # See which species are present - dump ones with not enough CDSs
    # NOTE: So far done in determine_species_by_ribosomals.py
    CDS_count = {}       # Number of selected CDSs per species-level taxon
    species_set = set()
    for cds_aln in cds_alns_ribosomal:
        tax_id = cds_aln.cds.taxon
        # Put each tax_id up to the "species" level
        tax_id_species = tax_tree.get_parent_with_rank(tax_id, 'species')
        species_set.add(tax_id_species)
        CDS_count[tax_id_species] = CDS_count.get(tax_id_species, 0) + 1

    # ------------ Read assignment analysis -------------- #
    print("Read assignment analysis...")
    reads = read_container.fetch_all_reads(format=list)
    assignment_analysis(species_set, reads, tax_tree, args.export_folder,
                        CDS_count)
def main():
    '''
    Script to run binner in one of the most common usage scenarios.
    * load taxonomy data
    * load alignment data
    * do basic alignment data filtering (remove host reads etc.)
    * annotate and bin the remaining reads
    * write the resulting organisms (plus a "Host" entry) as XML
    '''

    #----------------------------------#
    #------ INPUT ARGUMENTS -----------#
    argparser = TestRunArgParser()
    args = argparser.parse_args()

    #----------------------------------#
    #------- STATIC DATA SOURCE -------#
    # CDS - GI2TAXID -- NAMES -- NODES #
    dataAccess = DataAccess(args)

    #----------------------------------#
    #-------- TAXONOMY TREE -----------#
    print('1. Loading tax tree...')
    tax_tree = TaxTree()
    print('done.')

    #----------------------------------#
    #------- ALIGNMENT DATA SOURCE ----#
    print('2. Loading alignment file...')
    read_container = ReadContainer()
    read_container.load_alignment_data(args.input)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)
    # Remember total number of reads (taken before host filtering)
    total_read_num = read_container.get_read_count()
    print('done')

    #------- FILTER HOST READS -------#
    print('3. Filtering host reads & alignments...')
    new_reads = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        host_filter.perc_of_host_alignments_larger_than)

    dataAccess.clear_cache()  # deletes gi2taxid cache

    reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
        new_reads,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1)     # unassigned taxid

    host_read_count = (len(read_container.fetch_all_reads(format=list))
                       - len(reads_with_no_host_alignments))
    read_container.set_new_reads(reads_with_no_host_alignments)
    print('done')

    #----------------------------------#
    #------- LOAD ALL RECORDS   -------#
    print('4. Loading referenced records...')
    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    record_container.populate(
        read_container.fetch_all_reads_versions(), table='cds')
    print('done')

    #----------------------------------#
    #-- MAP ALIGNMENTS TO GENES   -----#
    print('5. Mapping alignments to genes...')
    read_container.populate_cdss(record_container)

    #----------------------------------#
    #- RECORD ALL ALIGNMENTS TO GENE --#
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    print('done')

    print('6. Estimating organisms present in sample...')
    # NOTE(review): nothing is estimated here - the target organisms are a
    # hard-coded list (presumably taxonomy ids; confirm where it comes from).
    target_organisms = [633, 632, 263, 543, 86661, 1392, 55080, 1386]
    print('done.')

    print('7. Annotating reads...')
    annotated_reads = rstate.annotate_reads(
        read_container.fetch_all_reads(format=list),
        cds_aln_container.read2cds,
        tax_tree,
        target_organisms)
    read_container.set_new_reads(annotated_reads)
    print('done')

    print('8. Binning reads...')
    orgs = bin_reads(
        read_container.fetch_all_reads(format=list),
        cds_aln_container.cds_repository,
        cds_aln_container.read2cds,
        tax_tree,
        target_organisms,
        None,
        None,
        False)
    # (A debug loop printing each organism's name, read count and number of
    # identified coding regions used to live here; it is disabled.)

    print("total_read_num: " + str(total_read_num))

    print('9. Generating XML...')
    dataset = Dataset(args.xml_description_file)

    # An alignment file without reads must not abort the run with a
    # ZeroDivisionError; report a host share of 0.0 in that case.
    if total_read_num:
        host_fraction = host_read_count / float(total_read_num)
    else:
        host_fraction = 0.0

    # The host is reported as the first pseudo-organism, then one entry per
    # binned organism; the list is finally ordered by descending amount_count.
    host = Organism(host_read_count, host_fraction, None, None, "Host",
                    None, None, [], [], [], is_host=True)
    xml_organisms = [host]
    for org in orgs.values():
        xml_organisms.append(org.to_xml_organism(tax_tree, total_read_num))
    xml_organisms.sort(key=operator.attrgetter("amount_count"), reverse=True)

    xml = XMLOutput(dataset, xml_organisms, args.output)
    xml.xml_output()
def main():
    '''
    Load and pre-process alignment data for the binner.

    Performs, in order: argument parsing, data access setup, taxonomy tree
    creation, alignment loading (with taxid assignment), host read and
    host alignment filtering, loading of the referenced 'cds' and 'rrna'
    records, and mapping of the alignments onto those records.
    '''

    # --- Input arguments --- #
    args = PickleParser().parse_args()

    # --- Static data source (CDS, GI2TAXID, NAMES, NODES) --- #
    dataAccess = DataAccess(args)

    # --- Taxonomy tree --- #
    print('1. Loading tax tree...')
    tax_tree = TaxTree()
    print('done.')

    # --- Alignment data source --- #
    print('2. Loading alignment file...')
    read_container = ReadContainer()
    read_container.load_alignment_data(args.input)
    # Every alignment gets its taxid resolved through the data access layer
    read_container.set_taxids(dataAccess)
    print('done')

    # --- Host filtering --- #
    print('3. Filtering host reads & alignments...')
    new_reads = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        host_filter.perc_of_host_alignments_larger_than,
    )

    # Drops the gi2taxid cache
    dataAccess.clear_cache()

    delete_host_alignments = True
    filter_unassigned = True
    unassigned_taxid = -1
    reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
        new_reads,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        delete_host_alignments,
        filter_unassigned,
        unassigned_taxid,
    )

    # Host reads = reads held by the container minus the surviving ones
    host_read_count = (
        len(read_container.fetch_all_reads(format=list))
        - len(reads_with_no_host_alignments)
    )
    read_container.set_new_reads(reads_with_no_host_alignments)
    print('done')

    # --- Referenced records --- #
    print('4. Loading referenced records...')
    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    # Same read versions are looked up in both tables, 'cds' first
    for table_name in ('cds', 'rrna'):
        record_container.populate(
            read_container.fetch_all_reads_versions(), table=table_name)
    print('done')

    # --- Map alignments to genes --- #
    print('5. Mapping alignments to genes...')
    read_container.populate_cdss(record_container)

    # --- Record all alignments per gene --- #
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    print('done')