def main():
    '''
    Run LCA binning on a read alignment file and print the assignment.

    Steps: parse the command line, create the database access object,
    build the taxonomy tree, load the alignment file given by
    ``args.alignment_file``, set tax IDs for all alignments, bin the
    reads with LCABinner and print the resulting solution.
    '''
    # Command line -> database access
    args = TestRunArgParser().parse_args()
    db = DataAccess(args)

    print('1. Loading tax tree...')
    taxonomy = TaxTree()
    print('done.')

    print('2. Loading alignment file...')
    reads = ReadContainer()
    reads.load_alignment_data(args.alignment_file)
    # Tax IDs are set for every alignment before binning
    reads.set_taxids(db)
    print('done')

    # NOTE: the step numbering jumps from 2 to 4 in the original output
    print('4. Creating LCA solution...')
    solution = LCABinner(taxonomy).bin_reads(reads)
    print('done')

    print('5. Nice output of read assignment here')
    solution.print_nicely(taxonomy)
class RecordContainerTest(unittest.TestCase):
    '''Tests for populating and querying a RecordContainer.'''

    def setUp(self):
        '''Create a fresh read container and record container per test.'''
        self.read_container = ReadContainer()
        self.record_container = RecordContainer()

    def tearDown(self):
        # Previously spelled ``tearUp``, a name unittest never calls;
        # ``tearDown`` is the hook that runs after each test.
        pass

    def testFillRecordContainer(self):
        '''
        Method to test whether record container populating works.
        Uses mock database access to test whether record container has
        correct number of items.
        '''
        aln_file = './test/solver/read2cds/.test_data/lisa.in'
        cds_fasta = './test/solver/read2cds/.test_data/cds.fa'

        db_access = MockDbQuery(cds_fasta)
        self.record_container.set_db_access(db_access)
        self.read_container.populate_from_aln_file(aln_file)
        self.record_container.populate(
            self.read_container.fetch_all_reads_versions())

        records = self.record_container.fetch_all_records(format=list)
        # Every record held by the mock database should have been fetched
        self.assertEqual(len(db_access.records), len(records))

    def testReturnsNoneForNonexistentRecord(self):
        '''Looking up a version that was never loaded must yield None.'''
        record = self.record_container.fetch_existing_record("XXX")
        self.assertIsNone(record, "No record with version XXX should be found")
class RecordContainerTest(unittest.TestCase):
    '''Tests for populating and querying a RecordContainer.'''

    def setUp(self):
        '''Create a fresh read container and record container per test.'''
        self.read_container = ReadContainer()
        self.record_container = RecordContainer()

    def tearDown(self):
        # Previously spelled ``tearUp``, a name unittest never calls;
        # ``tearDown`` is the hook that runs after each test.
        pass

    def testFillRecordContainer(self):
        '''
        Method to test whether record container populating works.
        Uses mock database access to test whether record container has
        correct number of items.
        '''
        aln_file = './test/solver/read2cds/.test_data/lisa.in'
        cds_fasta = './test/solver/read2cds/.test_data/cds.fa'

        db_access = MockDbQuery(cds_fasta)
        self.record_container.set_db_access(db_access)
        self.read_container.populate_from_aln_file(aln_file)
        self.record_container.populate(
            self.read_container.fetch_all_reads_versions())

        records = self.record_container.fetch_all_records(format=list)
        # Every record held by the mock database should have been fetched
        self.assertEqual(len(db_access.records), len(records))

    def testReturnsNoneForNonexistentRecord(self):
        '''Looking up a version that was never loaded must yield None.'''
        record = self.record_container.fetch_existing_record("XXX")
        self.assertIsNone(record, "No record with version XXX should be found")
def setUp(self):
    '''
    Build the test fixture and run the greedy solver on it.

    Attributes set on the test instance:
    mock_db_fpath   (str) path to synthetically created CDSs which fill
                    the mock database of records
    input_aln_fpath (str) path to input alignment file
    results_fpath   (str) path to file with the results the greedy
                    solver is expected to generate
    '''
    # Fixture file locations
    self.mock_db_fpath = './test/solver/read2cds/.test_data/cds.fa'
    self.input_aln_fpath = './test/solver/read2cds/.test_data/lisa.in'
    self.results_fpath = './test/solver/read2cds/.test_data/cds_ordering.txt'

    # Reads are loaded straight from the alignment file
    reads = self.read_cont = ReadContainer()
    reads.populate_from_aln_file(self.input_aln_fpath)

    # Records are served by the mock database
    mock_db = self.db_query = MockDbQuery(self.mock_db_fpath)
    records = self.record_cont = RecordContainer()
    records.set_db_access(mock_db)
    records.populate(reads.fetch_all_reads_versions())
    reads.populate_cdss(records)

    # CDS alignment container is filled from all loaded reads
    cds_alns = self.cds_aln_cont = CdsAlnContainer()
    cds_alns.populate(reads.fetch_all_reads())

    # Solver under test
    solver = self.greedy_solver = GreedySolver()
    solver.map_reads_2_cdss(cds_alns)
def setUp(self):
    '''
    Prepare containers from the test data and let the greedy solver map
    reads to CDSs.

    Attributes set on the test instance:
    mock_db_fpath   (str) path to synthetically created CDSs which fill
                    the mock database of records
    input_aln_fpath (str) path to input alignment file
    results_fpath   (str) path to file with the results the greedy
                    solver is expected to generate
    '''
    self.mock_db_fpath = './test/solver/read2cds/.test_data/cds.fa'
    self.input_aln_fpath = './test/solver/read2cds/.test_data/lisa.in'
    self.results_fpath = './test/solver/read2cds/.test_data/cds_ordering.txt'

    # 1. Read container, populated from the alignment file
    read_container = self.read_cont = ReadContainer()
    read_container.populate_from_aln_file(self.input_aln_fpath)

    # 2. Record container, backed by the mock database
    self.db_query = MockDbQuery(self.mock_db_fpath)
    record_container = self.record_cont = RecordContainer()
    record_container.set_db_access(self.db_query)
    versions = read_container.fetch_all_reads_versions()
    record_container.populate(versions)
    read_container.populate_cdss(record_container)

    # 3. CDS alignment container, populated from all reads
    aln_container = self.cds_aln_cont = CdsAlnContainer()
    aln_container.populate(read_container.fetch_all_reads())

    # 4. Run the greedy solver on the populated container
    self.greedy_solver = GreedySolver()
    self.greedy_solver.map_reads_2_cdss(aln_container)
class ReadContainerTest(unittest.TestCase):
    '''Tests loading of alignment files into a ReadContainer.'''

    def setUp(self):
        '''Create an empty read container and remember the example file.'''
        self.read_cont = ReadContainer()
        self.aln_file = './test/data/containers/.data/example.in'

    def tearDown(self):
        # Previously spelled ``tearUp``, a name unittest never calls.
        pass

    def testReadCount(self):
        '''
        Tests whether the number of reads in the read container is
        consistent with the number of reads in the alignment file.
        '''
        self.read_cont.populate_from_aln_file(self.aln_file)
        reads_cont = self.read_cont.read_repository.keys()
        reads_from_file = self._load_read_ids()
        for read_id in reads_from_file:
            self.assertIn(read_id, reads_cont,
                          msg="Read ID %s not in read container." % read_id)
        self.assertEqual(len(reads_cont), 100)

    def testCorrectAlignmentNumber(self):
        '''
        Test the loader for correct number of alignments.
        Test file organized so that read ID specifies number of alignments.
        '''
        aln_file = './test/data/containers/.data/aln_num.in'
        self.read_cont.populate_from_aln_file(aln_file)
        for (read_id, read) in self.read_cont.read_repository.items():
            self.assertEqual(int(read_id), len(read.get_alignments()))

    def _load_read_ids(self):
        '''
        Read the read IDs directly from ``self.aln_file``.

        The ID is the text before the first comma of each line, with one
        leading '@' removed if present.

        @return list of read IDs (str), one per line of the file
        '''
        read_ids = []
        # ``with`` closes the file even if parsing a line raises
        with open(self.aln_file, 'r') as aln_fhandle:
            for line in aln_fhandle:
                read_id = line.split(',')[0]
                if read_id.startswith('@'):
                    read_id = read_id[1:]
                read_ids.append(read_id)
        return read_ids
class ReadContainerTest(unittest.TestCase):
    '''Tests loading of alignment files into a ReadContainer.'''

    def setUp(self):
        '''Create an empty read container and remember the example file.'''
        self.read_cont = ReadContainer()
        self.aln_file = './test/data/containers/.data/example.in'

    def tearDown(self):
        # Previously spelled ``tearUp``, a name unittest never calls.
        pass

    def testReadCount(self):
        '''
        Tests whether the number of reads in the read container is
        consistent with the number of reads in the alignment file.
        '''
        self.read_cont.populate_from_aln_file(self.aln_file)
        reads_cont = self.read_cont.read_repository.keys()
        reads_from_file = self._load_read_ids()
        for read_id in reads_from_file:
            self.assertIn(read_id, reads_cont,
                          msg="Read ID %s not in read container." % read_id)
        self.assertEqual(len(reads_cont), 100)

    def testCorrectAlignmentNumber(self):
        '''
        Test the loader for correct number of alignments.
        Test file organized so that read ID specifies number of alignments.
        '''
        aln_file = './test/data/containers/.data/aln_num.in'
        self.read_cont.populate_from_aln_file(aln_file)
        for (read_id, read) in self.read_cont.read_repository.items():
            self.assertEqual(int(read_id), len(read.get_alignments()))

    def _load_read_ids(self):
        '''
        Read the read IDs directly from ``self.aln_file``.

        The ID is the text before the first comma of each line, with one
        leading '@' removed if present.

        @return list of read IDs (str), one per line of the file
        '''
        read_ids = []
        # ``with`` closes the file even if parsing a line raises
        with open(self.aln_file, 'r') as aln_fhandle:
            for line in aln_fhandle:
                read_id = line.split(',')[0]
                if read_id.startswith('@'):
                    read_id = read_id[1:]
                read_ids.append(read_id)
        return read_ids
class BestScoreSolverTest(unittest.TestCase):
    '''Tests for BestScoreSolver run on the read-to-CDS test data.'''

    # setUp is executed before each test method
    def setUp(self):
        '''
        Build the containers from the test data and run the best score
        solver on them.

        Attributes set on the test instance:
        mock_db_fpath   (str) path to synthetically created CDSs which
                        fill the mock database of records
        input_aln_fpath (str) path to input alignment file
        results_fpath   (str) path to a results file; set here but not
                        read by the tests visible in this class
        '''
        self.mock_db_fpath = './test/solver/read2cds/.test_data/cds.fa'
        self.input_aln_fpath = './test/solver/read2cds/.test_data/lisa.in'
        self.results_fpath = './test/solver/read2cds/.test_data/cds_ordering.txt'

        # Initialize read container
        self.read_cont = ReadContainer()
        self.read_cont.populate_from_aln_file(self.input_aln_fpath)

        # Initialize and fill record container
        self.db_query = MockDbQuery(self.mock_db_fpath)
        self.record_cont = RecordContainer()
        self.record_cont.set_db_access(self.db_query)
        self.record_cont.populate(self.read_cont.fetch_all_reads_versions())
        self.read_cont.populate_cdss(self.record_cont)

        # Initialize and fill up cds aln container
        self.cds_aln_cont = CdsAlnContainer()
        self.cds_aln_cont.populate(self.read_cont.fetch_all_reads())

        self.bs_solver = BestScoreSolver()
        self.bs_solver.map_reads_2_cdss(self.cds_aln_cont)

    def testCdsAlignmentContainerConsistency(self):
        '''The container must pass the consistency check after solving.'''
        # assertTrue instead of a bare ``assert``: it is not stripped
        # under ``python -O`` and reports through unittest.
        self.assertTrue(
            Read2CDSSolver.test_cds_alignment_container_consistency(
                self.cds_aln_cont))
def main():
    '''
    Script to extract reads without any alignments.

    Loads the alignment file given by ``args.alignment_file``, writes
    the ID of every read that has no alignments to ``args.read_ids_out``
    (one ID per line) and prints the total read count, the number of
    reads without alignments and their percentage.
    '''
    # Input arguments
    argparser = ArgParser()
    args = argparser.parse_args()

    # Access database
    dataAccess = DataAccess(args)

    # ------------------ #

    print('1. Loading tax tree...')
    start = time.time()
    # The tree itself is not used below; it is still constructed so the
    # step and its timing output stay as before.
    TaxTree()
    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------ #

    print('2. Loading alignment file...')
    start = time.time()
    read_container = ReadContainer()
    read_container.load_alignment_data(args.alignment_file)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)
    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------ #

    # Loop through reads and take those with no alignments.
    # ``with`` guarantees the output file is closed even if a read fails.
    no_aln_count = 0
    with open(args.read_ids_out, 'w') as out_file:
        for read in read_container.fetch_all_reads(format=list):
            if not read.has_alignments():
                out_file.write("{0}\n".format(read.id))
                no_aln_count += 1

    total_read_count = read_container.get_read_count()

    # An empty container used to raise ZeroDivisionError here
    if total_read_count:
        no_aln_percentage = no_aln_count * 100 / float(total_read_count)
    else:
        no_aln_percentage = 0.0

    print("total number of reads : {0}".format(total_read_count))
    print("reads without alignments: {0}".format(no_aln_count))
    print('')
    print("no aln percentage: {0:.2f}%".format(no_aln_percentage))
def main():
    '''
    Experiment with the binner: bin reads with LCABinner and evaluate
    the result against a reference solution.

    The reference solution is built from the metasim FASTA given by
    ``args.metasim_fasta``; the alignment file comes from ``args.input``.
    The evaluation data is printed at the end.
    '''
    print("Hello world!")

    # Command line -> database access
    args = TestRunArgParser().parse_args()
    db = DataAccess(args)

    print('1. Loading tax tree...')
    taxonomy = TaxTree()
    print('done.')

    print('2. Loading alignment file...')
    reads = ReadContainer()
    reads.load_alignment_data(args.input)
    # Tax IDs are set for every alignment before binning
    reads.set_taxids(db)
    print('done')

    print('3. Loading correct solution from FASTA with reads produced by metasim...')
    reference = Solution.from_metasim_fasta(args.metasim_fasta, db)
    print('done')

    print('4. Creating LCA solution...')
    binned = LCABinner(taxonomy).bin_reads(reads)
    print('done')

    print('5. Evaluating LCA solution...')
    accuracy = RankAccuracy(taxonomy, reference, binned)
    print('done')

    # Print test results
    accuracy.print_data()
def fill_containers(alignment_file, db_access):
    '''
    Create and populate the read, record and CDS alignment containers.

    @param alignment_file alignment file handed to
                          ReadContainer.populate_from_aln_file
    @param db_access      database access object set on the record
                          container
    @return tuple(ReadContainer, RecordContainer, CdsAlnContainer)
    '''
    reads = ReadContainer()
    records = RecordContainer()
    records.set_db_access(db_access)
    cds_alns = CdsAlnContainer()

    # Step 1: everything the alignment file provides goes into the reads
    reads.populate_from_aln_file(alignment_file)

    # Step 2: fetch the records the alignment file refers to
    versions = reads.fetch_all_reads_versions()
    records.populate(versions)

    # Step 3: work out which coding sequences the reads map to
    reads.populate_cdss(records)

    # Step 4: fill the CDS alignment container from all reads
    all_reads = reads.fetch_all_reads()
    cds_alns.populate(all_reads)

    return (reads, records, cds_alns)
class StatisticsTest(unittest.TestCase):
    '''Tests the statistics helpers on the mock read-to-CDS data set.'''

    # setUp is executed before each test method
    def setUp(self):
        '''
        Build read, record and CDS alignment containers from test data.

        Attributes set on the test instance:
        mock_db_fpath   (str) path to synthetically created CDSs which
                        fill the mock database of records
        input_aln_fpath (str) path to input alignment file
        '''
        self.mock_db_fpath = './test/statistics/.test_data/cds.fa'
        self.input_aln_fpath = './test/statistics/.test_data/lisa.in'

        # Initialize read container
        self.read_cont = ReadContainer()
        self.read_cont.populate_from_aln_file(self.input_aln_fpath)

        # Initialize and fill record container
        self.db_query = MockDbQuery(self.mock_db_fpath)
        self.record_cont = RecordContainer()
        self.record_cont.set_db_access(self.db_query)
        self.record_cont.populate(self.read_cont.fetch_all_reads_versions())
        self.read_cont.populate_cdss(self.record_cont)

        # Initialize and fill up cds aln container
        self.cds_aln_cont = CdsAlnContainer()
        self.cds_aln_cont.populate(self.read_cont.fetch_all_reads())

    def testStatistics(self):
        '''Check the statistics before and after running the solver.'''
        # unittest assertions are used instead of bare ``assert`` so the
        # checks survive ``python -O`` and report both values on failure.

        # Before solving
        self.assertEqual(num_read_alns(self.read_cont), 22)
        self.assertEqual(num_active_aligned_regions(self.cds_aln_cont), 22)
        self.assertEqual(num_inactive_read_alns(self.read_cont), 0)

        self.bs_solver = BestScoreSolver()
        self.bs_solver.map_reads_2_cdss(self.cds_aln_cont)

        records_stats = count_alns_to_record_and_cds(self.read_cont)
        print("Number of records for which we have stats: %d\n" % len(
            records_stats))
        for rec_stat in records_stats.values():
            rec_stat.print_data()

        # After solving
        self.assertEqual(num_active_aligned_regions(self.cds_aln_cont), 16)
        self.assertEqual(num_cdss(self.cds_aln_cont), 4)
        self.assertEqual(num_cdss_with_no_alns(self.cds_aln_cont), 0)
class StatisticsTest(unittest.TestCase):
    '''Tests the statistics helpers on the mock read-to-CDS data set.'''

    # setUp is executed before each test method
    def setUp(self):
        '''
        Build read, record and CDS alignment containers from test data.

        Attributes set on the test instance:
        mock_db_fpath   (str) path to synthetically created CDSs which
                        fill the mock database of records
        input_aln_fpath (str) path to input alignment file
        '''
        self.mock_db_fpath = './test/statistics/.test_data/cds.fa'
        self.input_aln_fpath = './test/statistics/.test_data/lisa.in'

        # Initialize read container
        self.read_cont = ReadContainer()
        self.read_cont.populate_from_aln_file(self.input_aln_fpath)

        # Initialize and fill record container
        self.db_query = MockDbQuery(self.mock_db_fpath)
        self.record_cont = RecordContainer()
        self.record_cont.set_db_access(self.db_query)
        self.record_cont.populate(self.read_cont.fetch_all_reads_versions())
        self.read_cont.populate_cdss(self.record_cont)

        # Initialize and fill up cds aln container
        self.cds_aln_cont = CdsAlnContainer()
        self.cds_aln_cont.populate(self.read_cont.fetch_all_reads())

    def testStatistics(self):
        '''Check the statistics before and after running the solver.'''
        # unittest assertions are used instead of bare ``assert`` so the
        # checks survive ``python -O`` and report both values on failure.

        # Before solving
        self.assertEqual(num_read_alns(self.read_cont), 22)
        self.assertEqual(num_active_aligned_regions(self.cds_aln_cont), 22)
        self.assertEqual(num_inactive_read_alns(self.read_cont), 0)

        self.bs_solver = BestScoreSolver()
        self.bs_solver.map_reads_2_cdss(self.cds_aln_cont)

        records_stats = count_alns_to_record_and_cds(self.read_cont)
        print("Number of records for which we have stats: %d\n" % len(records_stats))
        for rec_stat in records_stats.values():
            rec_stat.print_data()

        # After solving
        self.assertEqual(num_active_aligned_regions(self.cds_aln_cont), 16)
        self.assertEqual(num_cdss(self.cds_aln_cont), 4)
        self.assertEqual(num_cdss_with_no_alns(self.cds_aln_cont), 0)
def setUp(self):
    '''Set the example alignment file path and create an empty read container.'''
    # The path is a plain constant, so it can be set first
    self.aln_file = './test/data/containers/.data/example.in'
    self.read_cont = ReadContainer()
# Example script: load two reads, map them to CDSs with the best score
# solver and print the result of the container consistency check.
from solver.read2cds.Read2CDSSolver import Read2CDSSolver
from data.containers.record import RecordContainer
from data.containers.read import ReadContainer
from data.containers.cdsaln import CdsAlnContainer
from utils.logger import Logger

# NOTE(review): DbQuery and BestScoreSolver are used below but are not
# imported in the lines visible here - confirm they are imported elsewhere.
Logger("log")
db_query = DbQuery()

# create containers
record_container = RecordContainer()
record_container.set_db_access(db_query)
read_container = ReadContainer()
read_container.populate_from_aln_file("example_data/2reads.in")
# NOTE(review): the container itself is passed to populate() here and to
# cds_aln_container.populate() below - verify this matches the current
# populate() signatures.
record_container.populate(read_container)
read_container.populate_cdss(record_container)
cds_aln_container = CdsAlnContainer()
cds_aln_container.populate(read_container)

# Print the container before the solver runs
print cds_aln_container

r2c_solver = BestScoreSolver()
r2c_solver.map_reads_2_cdss(cds_aln_container)

# The label is printed before the consistency check is evaluated
print "Consistency test result: ", Read2CDSSolver.test_cds_alignment_container_consistency(cds_aln_container)
class GreedySolverTest(unittest.TestCase):
    '''Tests for GreedySolver run on the read-to-CDS test data.'''

    # setUp is executed before each test method
    def setUp(self):
        '''
        Build the containers from the test data and run the greedy solver.

        Attributes set on the test instance:
        mock_db_fpath   (str) path to synthetically created CDSs which
                        fill the mock database of records
        input_aln_fpath (str) path to input alignment file
        results_fpath   (str) path to file with the results the greedy
                        solver is expected to generate
        '''
        self.mock_db_fpath = './test/solver/read2cds/.test_data/cds.fa'
        self.input_aln_fpath = './test/solver/read2cds/.test_data/lisa.in'
        self.results_fpath = './test/solver/read2cds/.test_data/cds_ordering.txt'

        # Initialize read container
        self.read_cont = ReadContainer()
        self.read_cont.populate_from_aln_file(self.input_aln_fpath)

        # Initialize and fill record container
        self.db_query = MockDbQuery(self.mock_db_fpath)
        self.record_cont = RecordContainer()
        self.record_cont.set_db_access(self.db_query)
        self.record_cont.populate(self.read_cont.fetch_all_reads_versions())
        self.read_cont.populate_cdss(self.record_cont)

        # Initialize and fill up cds aln container
        self.cds_aln_cont = CdsAlnContainer()
        self.cds_aln_cont.populate(self.read_cont.fetch_all_reads())

        self.greedy_solver = GreedySolver()
        self.greedy_solver.map_reads_2_cdss(self.cds_aln_cont)

    def testAlignmentsCorrectlyInactivated(self):
        '''
        Loads correct results from results file and checks whether all
        the reads for a CDS listed in the file are active and whether all
        the other reads are inactive.
        '''
        cds2read = self._load_active_reads()
        for (cds, cds_aln) in self.cds_aln_cont.cds_repository.items():
            accession = cds.record_id
            mapped_reads = cds2read[accession]
            for cds_aln_subloc in cds_aln.aligned_regions.values():
                # unittest assertions survive ``python -O`` and show the
                # offending read ID on failure.
                if cds_aln_subloc.active:
                    self.assertIn(cds_aln_subloc.read_id, mapped_reads)
                else:
                    self.assertNotIn(cds_aln_subloc.read_id, mapped_reads)

    def testCdsAlignmentContainerConsistency(self):
        '''The container must pass the consistency check after solving.'''
        self.assertTrue(
            Read2CDSSolver.test_cds_alignment_container_consistency(
                self.cds_aln_cont))

    def _load_active_reads(self):
        '''
        Parse the expected results file ``self.results_fpath``.

        The file is read as pairs of lines: a CDS identifier line
        followed by a line of ';'-separated read IDs.

        @return dict mapping CDS identifier (str) to list of read IDs
        @raise ValueError if a non-blank CDS line has no read line after it
        '''
        cds2read_map = {}
        # ``with`` closes the file even if parsing fails
        with open(self.results_fpath) as results_fhandle:
            lines = iter(results_fhandle)
            for cds_line in lines:
                read_line = next(lines, None)
                if read_line is None:
                    # Odd number of lines: a trailing blank line is
                    # tolerated, anything else is a malformed file
                    # (this used to fail with an AttributeError).
                    if cds_line.strip():
                        raise ValueError(
                            "No read list for CDS '%s' in %s"
                            % (cds_line.strip(), self.results_fpath))
                    break
                cds2read_map[cds_line.strip()] = read_line.strip().split(';')
        return cds2read_map
def main(): ''' Script to analyse genes expressed for given tax id. ''' # Input arguments argparser = ArgParser() args = argparser.parse_args() # Access database dataAccess = DataAccess(args) print '1. Loading tax tree...' tax_tree = TaxTree() print 'done.' print '2. Loading alignment file...' read_container = ReadContainer() read_container.load_alignment_data(args.alignment_file) #---SET TAXIDS FOR ALL ALIGNMENTS--# read_container.set_taxids(dataAccess) print 'done' # TODO: Here i should recognize host reads! # ------------------------------------- # #----------------------------------# #------- LOAD ALL RECORDS -------# print '4. Loading referenced records...' record_container = RecordContainer() record_container.set_db_access(dataAccess) record_container.populate(read_container.fetch_all_reads_versions(), table='cds') print 'done' #----------------------------------# #-- MAP ALIGNMENTS TO GENES -----# print '5. Mapping alignments to genes...' read_container.populate_cdss(record_container) #----------------------------------# #- RECORD ALL ALIGNEMENTS TO GENE -# print '6. Populating CDS container...' 
cds_aln_container = CdsAlnContainer() cds_aln_container.populate(read_container.fetch_all_reads(format=list)) print 'done' print("Loaded CDS container") # Take only CDSs of given tax_id # Remove CDSs with too low mean coverage value min_mean_coverage = 10 min_length = 20 cds_alns = cds_aln_container.fetch_all_cds_alns(format=list) print ( "All CDSs (all organisms): " + str(len(cds_alns)) ) cds_alns_targeted = [cds_aln for cds_aln in cds_alns if cds_aln.get_tax_id() == args.tax_id # Filters and cds_aln.get_cds_length() > min_length and cds_aln.get_mean_coverage() > min_mean_coverage] # Remove CDSs with no gene/product cds_alns_targeted = [cds_aln for cds_aln in cds_alns_targeted if cds_aln.cds.gene != None and cds_aln.cds.product != None] # ------------------- CDSs filtered and ready to be analyzed ------------------- # # Number of targeted CDSs print ( "Targeted CDSs: " + str(len(cds_alns_targeted)) ) ''' print ("Sorting CDSs: stddev/mean") cds_alns_sorted = sorted(cds_alns_targeted, key=lambda cds_aln: cds_aln.get_std_over_mean(), reverse=False) # TODO: Here I should somehow determine which CDSs are "expressed", and which are not? 
# Write to file stuff(gene, protein_id) for each cds_aln print("Writing data to file") path = args.export_path f = open(path, 'w') for cds_aln in cds_alns_targeted: gene = cds_aln.cds.gene product = cds_aln.cds.product protein_id = cds_aln.cds.protein_id f.write("{0} {1}\n".format(gene, protein_id)) f.close() print("Done") ''' # -------------------- # ''' # Analyse those CDSs print ( "Targeted CDSs: " + str(len(cds_alns_targeted)) ) # See the mean length of CDS mean_cds_length = 0 no_locs_num = 0 for cds_aln in cds_alns_targeted: try: loc_length = cds_aln.get_cds_length() mean_cds_length += loc_length except: no_locs_num += 1 # Get mean mean_cds_length /= float(len(cds_alns_targeted)) print("---------------------------------------------") print("Mean CDS length: " + str(mean_cds_length)) print("Nones: " + str(no_locs_num)) print("---------------------------------------------") ''' ''' # Create folder where data about CDSs will be stored if not os.path.exists(args.export_path): os.makedirs(args.export_path) # Export some amount of best CDSs i = 1 for cds_aln in cds_alns_sorted: filename = "cds_" + str(i) + ".txt" coverage_path = os.path.join(args.export_path, filename) print(str(i) + ": " + str(cds_aln.get_std_over_mean())) cds_aln.coverage_to_file(coverage_path) if i == 50: # TODO: Define this somehow as parameter break i += 1 # Load CDS container ''' # Analyse stuff # print("Analysing stuff!") '''
def setUp(self):
    '''Give each test its own empty read container and record container.'''
    # Both containers are constructed (reads first) and then bound
    self.read_container, self.record_container = (
        ReadContainer(), RecordContainer())
class GreedySolverTest(unittest.TestCase):
    '''Tests for GreedySolver run on the read-to-CDS test data.'''

    # setUp is executed before each test method
    def setUp(self):
        '''
        Build the containers from the test data and run the greedy solver.

        Attributes set on the test instance:
        mock_db_fpath   (str) path to synthetically created CDSs which
                        fill the mock database of records
        input_aln_fpath (str) path to input alignment file
        results_fpath   (str) path to file with the results the greedy
                        solver is expected to generate
        '''
        self.mock_db_fpath = './test/solver/read2cds/.test_data/cds.fa'
        self.input_aln_fpath = './test/solver/read2cds/.test_data/lisa.in'
        self.results_fpath = './test/solver/read2cds/.test_data/cds_ordering.txt'

        # Initialize read container
        self.read_cont = ReadContainer()
        self.read_cont.populate_from_aln_file(self.input_aln_fpath)

        # Initialize and fill record container
        self.db_query = MockDbQuery(self.mock_db_fpath)
        self.record_cont = RecordContainer()
        self.record_cont.set_db_access(self.db_query)
        self.record_cont.populate(self.read_cont.fetch_all_reads_versions())
        self.read_cont.populate_cdss(self.record_cont)

        # Initialize and fill up cds aln container
        self.cds_aln_cont = CdsAlnContainer()
        self.cds_aln_cont.populate(self.read_cont.fetch_all_reads())

        self.greedy_solver = GreedySolver()
        self.greedy_solver.map_reads_2_cdss(self.cds_aln_cont)

    def testAlignmentsCorrectlyInactivated(self):
        '''
        Loads correct results from results file and checks whether all
        the reads for a CDS listed in the file are active and whether all
        the other reads are inactive.
        '''
        cds2read = self._load_active_reads()
        for (cds, cds_aln) in self.cds_aln_cont.cds_repository.items():
            accession = cds.record_id
            mapped_reads = cds2read[accession]
            for cds_aln_subloc in cds_aln.aligned_regions.values():
                # unittest assertions survive ``python -O`` and show the
                # offending read ID on failure.
                if cds_aln_subloc.active:
                    self.assertIn(cds_aln_subloc.read_id, mapped_reads)
                else:
                    self.assertNotIn(cds_aln_subloc.read_id, mapped_reads)

    def testCdsAlignmentContainerConsistency(self):
        '''The container must pass the consistency check after solving.'''
        self.assertTrue(
            Read2CDSSolver.test_cds_alignment_container_consistency(
                self.cds_aln_cont))

    def _load_active_reads(self):
        '''
        Parse the expected results file ``self.results_fpath``.

        The file is read as pairs of lines: a CDS identifier line
        followed by a line of ';'-separated read IDs.

        @return dict mapping CDS identifier (str) to list of read IDs
        @raise ValueError if a non-blank CDS line has no read line after it
        '''
        cds2read_map = {}
        # ``with`` closes the file even if parsing fails
        with open(self.results_fpath) as results_fhandle:
            lines = iter(results_fhandle)
            for cds_line in lines:
                read_line = next(lines, None)
                if read_line is None:
                    # Odd number of lines: a trailing blank line is
                    # tolerated, anything else is a malformed file
                    # (this used to fail with an AttributeError).
                    if cds_line.strip():
                        raise ValueError(
                            "No read list for CDS '%s' in %s"
                            % (cds_line.strip(), self.results_fpath))
                    break
                cds2read_map[cds_line.strip()] = read_line.strip().split(';')
        return cds2read_map
# Create database access db_access = DbQuery() tax_tree = TaxTree(args.tax_tree) host_determinator = HostDeterminator(dbquery=db_access, tax_tree=tax_tree) log.info("Started.") processing_start = timing.start() solver = Solver(host_determinator, read2cds_solver, tax_solver) # Populate read container # The read container type can now be determined from the input parameters # and injected into the Solver start = timing.start() read_container = ReadContainer() read_container.populate_from_aln_file(read_alignment_file=args.input) elapsed_time = timing.end(start) log.info("Populate read container - elapsed time: %s", timing.humanize(elapsed_time)) # Populate record container # The record container type can now be determine from the input parameters # and injected into the Solver start = timing.start() record_container = RecordContainer() record_container.set_db_access(db_access) # Extract all records from database record_container.populate(read_container.fetch_all_reads_versions()) elapsed_time = timing.end(start) log.info("Populate record container - elapsed time: %s",
def main():
    '''
    Run the binner in one of the most common usage scenarios.

    * load taxonomy data
    * load alignment data
    * do basic alignment data filtering (remove host reads etc.)
    * load referenced records and map alignments to genes
    * annotate and bin reads
    * write the organisms, host included, as XML to ``args.output``
    '''
    # ---------------- input arguments / static data source ------------- #
    args = TestRunArgParser().parse_args()
    data_access = DataAccess(args)

    # ---------------- taxonomy tree ------------------------------------- #
    print('1. Loading tax tree...')
    tax_tree = TaxTree()
    print('done.')

    # ---------------- alignment data ------------------------------------ #
    print('2. Loading alignment file...')
    read_container = ReadContainer()
    read_container.load_alignment_data(args.input)
    # Tax IDs are set for all alignments
    read_container.set_taxids(data_access)
    # Total number of reads is remembered before any filtering happens
    total_read_num = read_container.get_read_count()
    print('done')

    # ---------------- host filtering ------------------------------------ #
    print('3. Filtering host reads & alignments...')
    non_host_candidates = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        host_filter.perc_of_host_alignments_larger_than)
    data_access.clear_cache()  # deletes gi2taxid cache
    host_free_reads = host_filter.filter_potential_hosts_alignments(
        non_host_candidates,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1)     # unassigned taxid
    reads_before_filtering = len(read_container.fetch_all_reads(format=list))
    host_read_count = reads_before_filtering - len(host_free_reads)
    read_container.set_new_reads(host_free_reads)
    print('done')

    # ---------------- referenced records -------------------------------- #
    print('4. Loading referenced records...')
    record_container = RecordContainer()
    record_container.set_db_access(data_access)
    record_container.populate(
        read_container.fetch_all_reads_versions(), table='cds')
    print('done')

    # ---------------- alignments -> genes ------------------------------- #
    print('5. Mapping alignments to genes...')
    read_container.populate_cdss(record_container)
    # All alignments to genes are recorded in the CDS alignment container
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    print('done')

    print('6. Estimating organisms present in sample...')
    # Hard-coded list of target tax IDs ("What is this part?" in the
    # original source)
    target_organisms = [633, 632, 263, 543, 86661, 1392, 55080, 1386]
    print('done.')

    print('7. Annotating reads...')
    annotated_reads = rstate.annotate_reads(
        read_container.fetch_all_reads(format=list),
        cds_aln_container.read2cds,
        tax_tree,
        target_organisms)
    read_container.set_new_reads(annotated_reads)
    print('done')

    print('8. Binning reads...')
    organisms = bin_reads(
        read_container.fetch_all_reads(format=list),
        cds_aln_container.cds_repository,
        cds_aln_container.read2cds,
        tax_tree,
        target_organisms,
        None,
        None,
        False)

    print("total_read_num: " + str(total_read_num))

    # ---------------- XML output ---------------------------------------- #
    print('9. Generating XML...')
    dataset = Dataset(args.xml_description_file)
    # The host is reported as a pseudo-organism, listed first before sorting
    host_fraction = host_read_count / float(total_read_num)
    host = Organism(host_read_count, host_fraction, None, None, "Host",
                    None, None, [], [], [], is_host=True)
    xml_organisms = [host]
    for organism in organisms.values():
        xml_organisms.append(
            organism.to_xml_organism(tax_tree, total_read_num))
    xml_organisms.sort(key=operator.attrgetter("amount_count"), reverse=True)

    xml_writer = XMLOutput(dataset, xml_organisms, args.output)
    xml_writer.xml_output()
def main():
    '''
    Analyse the ribosomal CDSs hit by an alignment file and export the results.

    Steps, in order:
    * parse arguments (ArgParser), open database access, build the tax tree
    * load the alignment file and set taxids on its alignments
    * if args.remove_host is set, run the host filter and record the
      total / host / non-host read counts in the summary
    * load the referenced CDS records and map the alignments onto them
    * keep CDSs with positive length and mean coverage and a product,
      select those for which is_ribosomal(product) holds, then keep the
      ribosomal CDSs whose mean coverage is above the ribosomal average
    * write "CDSs_summary.txt" and the per-phase CDS statistics files into
      args.export_folder; export chart data when args.export_charts is set
    * run assignment_analysis for the species of the remaining CDSs

    Returns nothing; progress and timings are printed to stdout.

    The summary file is held in a ``with`` block so it is closed even when
    one of the intermediate steps raises, and the host ratios are reported
    as 0.0 instead of raising ZeroDivisionError when there are no reads.
    '''
    # NOTE: every print below uses a single parenthesised argument, which
    # prints the same text under the Python 2 print statement.

    # Input arguments
    argparser = ArgParser()
    args = argparser.parse_args()

    # Access database
    dataAccess = DataAccess(args)

    # ------------------ #
    print('1. Loading tax tree...')
    start = time.time()
    tax_tree = TaxTree()
    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------ #
    print('2. Loading alignment file...')
    start = time.time()
    read_container = ReadContainer()
    read_container.load_alignment_data(args.alignment_file)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)
    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------ #
    # Create folder if does not exist
    if not os.path.exists(args.export_folder):
        os.makedirs(args.export_folder)

    # File for data analysis summary
    summary_path = os.path.join(args.export_folder, "CDSs_summary.txt")
    with open(summary_path, 'w') as cds_summary:

        if args.remove_host:
            print("Removing host...")
            start = time.time()

            #------- FILTER HOST READS -------#
            new_reads = host_filter.filter_potential_host_reads(
                read_container.fetch_all_reads(format=list),
                tax_tree.tax2relevantTax,
                tax_tree.potential_hosts,
                host_filter.perc_of_host_alignments_larger_than)
            dataAccess.clear_cache()  # deletes gi2taxid cache
            reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
                new_reads,
                tax_tree.tax2relevantTax,
                tax_tree.potential_hosts,
                True,   # delete host alignments
                True,   # filter unassigned
                -1)     # unassigned taxid

            read_count = len(read_container.fetch_all_reads(format=list))
            host_read_count = read_count - len(reads_with_no_host_alignments)
            non_host_read_count = read_count - host_read_count

            # Guard the ratios: an empty input has read_count == 0.
            if read_count:
                host_ratio = host_read_count / float(read_count)
                non_host_ratio = non_host_read_count / float(read_count)
            else:
                host_ratio = non_host_ratio = 0.0

            cds_summary.write("total : {0:8d}\n".format(read_count))
            cds_summary.write("host : {0:8d} {1:.2f}\n".format(
                host_read_count, host_ratio))
            cds_summary.write("non-host: {0:8d} {1:.2f}\n".format(
                non_host_read_count, non_host_ratio))

            # Set host-free reads
            read_container.set_new_reads(reads_with_no_host_alignments)
            end = time.time()
            print("done: {0:.2f} sec".format(end - start))

        #------- LOAD ALL RECORDS -------#
        print('4. Loading referenced records...')
        start = time.time()
        record_container = RecordContainer()
        record_container.set_db_access(dataAccess)
        record_container.populate(read_container.fetch_all_reads_versions(),
                                  table='cds')
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        #-- MAP ALIGNMENTS TO GENES -----#
        print('5. Mapping alignments to genes...')
        start = time.time()
        read_container.populate_cdss(record_container)
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        #- RECORD ALL ALIGNMENTS TO GENE -#
        print('6. Populating CDS container...')
        start = time.time()
        cds_aln_container = CdsAlnContainer()
        cds_aln_container.populate(read_container.fetch_all_reads(format=list))
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        # ------------------------------- #
        # Sorting by get_std_over_mean() is disabled; the CDS alignments
        # are only fetched here, in container order.
        print('Sorting CDSs ...DISABLED')
        start = time.time()
        cds_alns = cds_aln_container.fetch_all_cds_alns(format=list)
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        # ------------------------------- #
        # (Export of "phase 0 - all CDSs" is disabled.)

        # Count Nones in cds_alns
        nones = count_nones(cds_alns)
        cds_summary.write("\n")
        cds_summary.write("gene None : {0}\n".format(nones['gene']))
        cds_summary.write("protein_id None: {0}\n".format(nones['protein_id']))
        cds_summary.write("product None : {0}\n".format(nones['product']))
        cds_summary.write("\n")
        cds_summary.write("CDSs all: {0}\n".format(len(cds_alns)))

        print('Filtering valid CDSs...')
        start = time.time()
        # Remove CDSs with too low mean coverage value or length
        min_mean_coverage = 0
        min_length = 0
        cds_alns_targeted = [cds_aln for cds_aln in cds_alns
                             if cds_aln.get_cds_length() > min_length
                             and cds_aln.get_mean_coverage() > min_mean_coverage]
        # Remove CDSs with no product (a missing gene is tolerated)
        cds_alns_targeted = [cds_aln for cds_aln in cds_alns_targeted
                             if cds_aln.cds.product is not None]
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        # All valid CDSs - Output coverage/length histogram data
        print("Exporting phase 1 - all CDSs...")
        start = time.time()
        export_CDS_stats_data(cds_alns_targeted, args.export_folder,
                              "1_all_valid_CDSs.txt")
        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

        # ---------- CDSs filtered and ready to be analyzed ---------- #
        print('Extracting ribosomal CDSs...')
        # Number of targeted CDSs
        cds_summary.write("CDSs valid: {0}\n".format(len(cds_alns_targeted)))
        # Selection is delegated to is_ribosomal() on the CDS product
        cds_alns_ribosomal = [cds_aln for cds_aln in cds_alns_targeted
                              if is_ribosomal(cds_aln.cds.product)]
        print('done')

        # ---------------- Ribosomal CDSs acquired! ---------------- #
        print('Analysing ribosomals...')
        # Mean of the per-CDS mean coverages, and the largest of them
        mm_cov = 0
        max_cov = 0
        for cds_aln in cds_alns_ribosomal:
            mean_cov = cds_aln.get_mean_coverage()
            mm_cov += mean_cov
            max_cov = max(max_cov, mean_cov)
        # A positive sum implies a non-empty list, so the division is safe
        if mm_cov > 0:
            mm_cov /= len(cds_alns_ribosomal)

        cds_summary.write("ribosomals all {0}\n".format(len(cds_alns_ribosomal)))
        cds_summary.write("mean coverage: {0}\n".format(mm_cov))
        cds_summary.write("max coverage : {0}\n".format(max_cov))
        print('done')

        # Ribosomal CDSs only - Output coverage/length histogram
        print("Exporting phase 2 - ribosomal CDSs only...")
        export_CDS_stats_data(cds_alns_ribosomal, args.export_folder,
                              "2_ribosomal_CDSs.txt")
        print("done")

        # ------- Making biological sense - choosing CDSs ------- #
        print('Filtering under-average ribosomals...')
        # NOTE: take length into consideration?
        cds_alns_ribosomal = [cds_aln for cds_aln in cds_alns_ribosomal
                              if cds_aln.get_mean_coverage() > mm_cov]
        print('done')

        cds_summary.write("ribosomals over-mean: {0}\n".format(
            len(cds_alns_ribosomal)))

    print('Phase 3 - filtered ribosomal CDSs...')
    export_CDS_stats_data(cds_alns_ribosomal, args.export_folder,
                          "3_ribosomal_CDSs_filtered.txt")
    print('done')

    # Store charts cov data - if selected so
    if args.export_charts:
        print("Exporting chart coverage data...")
        export_CDS_graph_data(cds_alns_ribosomal, args.export_charts)
        print("done.")

    # ------ I have chosen CDSs - determine species and analyse ------ #
    # Species level resolution
    # NOTE: So far done in determine_species_by_ribosomals.py
    CDS_count = {}          # Count CDSs of each species
    species_set = set()     # Estimated tax_ids, lifted to "species" rank
    for cds_aln in cds_alns_ribosomal:
        tax_id = cds_aln.cds.taxon
        # Put each tax_id up to the "species" level
        tax_id_species = tax_tree.get_parent_with_rank(tax_id, 'species')
        species_set.add(tax_id_species)
        CDS_count[tax_id_species] = CDS_count.get(tax_id_species, 0) + 1

    # ------------ Read assignment analysis -------------- #
    print("Read assignment analysis...")
    reads = read_container.fetch_all_reads(format=list)
    assignment_analysis(species_set, reads, tax_tree, args.export_folder,
                        CDS_count)
def main():
    '''
    Prepare alignment data for later processing.

    Parses the command line with PickleParser, then:
    * builds the database access object and the tax tree
    * loads the alignment file (args.input) and sets taxids on it
    * runs both host_filter passes and keeps the surviving reads
    * populates the record container from the 'cds' and 'rrna' tables
    * maps the alignments onto the records and fills a CdsAlnContainer

    Returns nothing; progress is printed to stdout.
    '''
    # NOTE: every print below uses a single parenthesised argument, which
    # prints the same text under the Python 2 print statement.

    # --- input arguments ---
    args = PickleParser().parse_args()

    # --- static data source (CDS, GI2TAXID, NAMES, NODES) ---
    data_access = DataAccess(args)

    # --- taxonomy tree ---
    print('1. Loading tax tree...')
    taxonomy = TaxTree()
    print('done.')

    # --- alignment data source ---
    print('2. Loading alignment file...')
    reads = ReadContainer()
    reads.load_alignment_data(args.input)
    reads.set_taxids(data_access)   # set taxids for all alignments
    print('done')

    # --- filter host reads ---
    print('3. Filtering host reads & alignments...')
    prefiltered_reads = host_filter.filter_potential_host_reads(
        reads.fetch_all_reads(format=list),
        taxonomy.tax2relevantTax,
        taxonomy.potential_hosts,
        host_filter.perc_of_host_alignments_larger_than,
    )
    data_access.clear_cache()       # deletes gi2taxid cache

    delete_host_alignments = True
    filter_unassigned = True
    unassigned_taxid = -1
    host_free_reads = host_filter.filter_potential_hosts_alignments(
        prefiltered_reads,
        taxonomy.tax2relevantTax,
        taxonomy.potential_hosts,
        delete_host_alignments,
        filter_unassigned,
        unassigned_taxid,
    )

    # Computed as in the original; the value is not used further below.
    reads_before_filtering = reads.fetch_all_reads(format=list)
    host_read_count = len(reads_before_filtering) - len(host_free_reads)

    reads.set_new_reads(host_free_reads)
    print('done')

    # --- load all records ---
    print('4. Loading referenced records...')
    records = RecordContainer()
    records.set_db_access(data_access)
    for table_name in ('cds', 'rrna'):
        records.populate(reads.fetch_all_reads_versions(), table=table_name)
    print('done')

    # --- map alignments to genes ---
    print('5. Mapping alignments to genes...')
    reads.populate_cdss(records)

    # --- record all alignments to gene ---
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(reads.fetch_all_reads(format=list))
    print('done')
def main(): ''' Script to identify analyse ribosomal genes expressed. ''' # Input arguments argparser = ArgParser() args = argparser.parse_args() # Access database dataAccess = DataAccess(args) #print '1. Loading tax tree...' tax_tree = TaxTree() #print 'done.' #print '2. Loading alignment file...' read_container = ReadContainer() read_container.load_alignment_data(args.alignment_file) #---SET TAXIDS FOR ALL ALIGNMENTS--# read_container.set_taxids(dataAccess) #print 'done' ''' # TODO: Here i should recognize host reads! #------- FILTER HOST READS -------# #print '3. Filtering host reads & alignments...' new_reads = host_filter.filter_potential_host_reads( read_container.fetch_all_reads(format=list), tax_tree.tax2relevantTax, tax_tree.potential_hosts, #delete_host_alignments = True, #filter_unassigned = True, #unassigned_taxid= -1, host_filter.perc_of_host_alignments_larger_than) dataAccess.clear_cache() # deletes gi2taxid cache reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments( new_reads, tax_tree.tax2relevantTax, tax_tree.potential_hosts, True, # delete host alignments True, # filter unassigned -1) # unassigned taxid read_count = len(read_container.fetch_all_reads(format=list)) host_read_count = read_count - len(reads_with_no_host_alignments) non_host_read_count = read_count - host_read_count print ("total : {0:8d}".format(read_count)) print ("host : {0:8d} {1:.2f}".format(host_read_count, host_read_count / float(read_count) )) print ("non-host: {0:8d} {1:.2f}".format(non_host_read_count, non_host_read_count / float(read_count) )) print read_container.set_new_reads(reads_with_no_host_alignments) ''' # ------------------------------------- # #----------------------------------# #------- LOAD ALL RECORDS -------# #print '4. Loading referenced records...' 
record_container = RecordContainer() record_container.set_db_access(dataAccess) record_container.populate(read_container.fetch_all_reads_versions(), table='cds') #print 'done' #----------------------------------# #-- MAP ALIGNMENTS TO GENES -----# #print '5. Mapping alignments to genes...' read_container.populate_cdss(record_container) #----------------------------------# #- RECORD ALL ALIGNEMENTS TO GENE -# #print '6. Populating CDS container...' cds_aln_container = CdsAlnContainer() cds_aln_container.populate(read_container.fetch_all_reads(format=list)) #print 'done' #print("Loaded CDS container") # Take only CDSs of given tax_id # Remove CDSs with too low mean coverage value min_mean_coverage = 0 min_length = 0 cds_alns = cds_aln_container.fetch_all_cds_alns(format=list) print ( "CDSs all : " + str(len(cds_alns)) ) cds_alns_targeted = [cds_aln for cds_aln in cds_alns # Filters if cds_aln.get_cds_length() > min_length and cds_aln.get_mean_coverage() > min_mean_coverage] # Remove CDSs with no gene/product cds_alns_targeted = [cds_aln for cds_aln in cds_alns_targeted if cds_aln.cds.gene != None and cds_aln.cds.product != None] # ------------------- CDSs filtered and ready to be analyzed ------------------- # # Number of targeted CDSs print ( "CDSs valid: " + str(len(cds_alns_targeted)) ) print cds_alns_ribosomal = [] for cds_aln in cds_alns_targeted: # If has word "ribosomal" in name, store coverage data for graph gene = cds_aln.cds.gene product = cds_aln.cds.product protein_id = cds_aln.cds.protein_id if "ribosomal" in product: #print("{0} {1} {2}\n".format(gene, protein_id, product)) cds_alns_ribosomal.append(cds_aln) # ------------------- Ribosomal CDSs acquired! --------------------- # # Sort it! 
cds_alns_ribosomal = sorted(cds_alns_ribosomal, key=lambda cds_aln: cds_aln.get_std_over_mean(), reverse=False) # Extract interesting data # Mean coverage, max coverage mm_cov = 0 max_cov = 0 for cds_aln in cds_alns_ribosomal: mean_cov = cds_aln.get_mean_coverage() mm_cov += mean_cov max_cov = max(max_cov, mean_cov) if mm_cov > 0: mm_cov /= len(cds_alns_ribosomal) # Print print("ribosomals: " + str(len(cds_alns_ribosomal))) print("mean coverage: " + str(mm_cov)) print("max coverage : {0}".format(max_cov)) print for cds_aln in cds_alns_ribosomal: gene = cds_aln.cds.gene product = cds_aln.cds.product protein_id = cds_aln.cds.protein_id taxon = cds_aln.cds.taxon name = tax_tree.nodes[taxon].organism_name print("{0:4} {1:10} {2:50} {3:10d} {4:60}".format(gene, protein_id, product, taxon, name)) # Store graph data export_CDS_graph_data(cds_alns_ribosomal, args.export_path) '''