Example #1
0
class BestScoreSolverTest(unittest.TestCase):

    # Runs before every test method.
    def setUp(self):
        '''
        Build the read, record and CDS-alignment containers from the
        synthetic test data, then map reads to CDSs with a
        BestScoreSolver so each test starts from a solved container.

        Fixture attributes:
        mock_db_fpath (str): path to synthetically created CDSs used to
        fill up the mock database of records.
        input_aln_fpath (str): path to the input alignment file.
        results_fpath (str): path to the file with the correct expected
        results.
        '''
        self.mock_db_fpath = './test/solver/read2cds/.test_data/cds.fa'
        self.input_aln_fpath = './test/solver/read2cds/.test_data/lisa.in'
        self.results_fpath = './test/solver/read2cds/.test_data/cds_ordering.txt'

        self._build_containers()

        # Run the solver under test over the freshly built containers.
        self.bs_solver = BestScoreSolver()
        self.bs_solver.map_reads_2_cdss(self.cds_aln_cont)

    def _build_containers(self):
        # Order matters: reads first, then the records they reference,
        # then the CDS alignments derived from both.
        self.read_cont = ReadContainer()
        self.read_cont.populate_from_aln_file(self.input_aln_fpath)
        self.db_query = MockDbQuery(self.mock_db_fpath)
        self.record_cont = RecordContainer()
        self.record_cont.set_db_access(self.db_query)
        self.record_cont.populate(self.read_cont.fetch_all_reads_versions())
        self.read_cont.populate_cdss(self.record_cont)
        self.cds_aln_cont = CdsAlnContainer()
        self.cds_aln_cont.populate(self.read_cont.fetch_all_reads())

    def testCdsAlignmentContainerConsistency(self):
        consistent = Read2CDSSolver.test_cds_alignment_container_consistency(
            self.cds_aln_cont)
        assert consistent == True
Example #2
0
class RecordContainerTest(unittest.TestCase):

    def setUp(self):
        '''Create fresh, empty containers before each test.'''
        self.read_container = ReadContainer()
        self.record_container = RecordContainer()

    def tearDown(self):
        # Fixed: was misspelled 'tearUp', a hook unittest never invokes.
        pass

    def testFillRecordContainer(self):
        '''Method to test whether record container populating works.
        Uses mock database access to test whether record container
        has correct number of items.'''
        aln_file = './test/solver/read2cds/.test_data/lisa.in'
        cds_fasta = './test/solver/read2cds/.test_data/cds.fa'
        db_access = MockDbQuery(cds_fasta)
        self.record_container.set_db_access(db_access)

        self.read_container.populate_from_aln_file(aln_file)
        self.record_container.populate(
            self.read_container.fetch_all_reads_versions())
        records = self.record_container.fetch_all_records(format=list)
        # Every record present in the mock database must be fetched.
        self.assertEqual(len(db_access.records), len(records))

    def testReturnsNoneForNonexistentRecord(self):
        '''An unknown accession version must yield None, not raise.'''
        record = self.record_container.fetch_existing_record("XXX")
        self.assertIsNone(record, "No record with version XXX should be found")
class RecordContainerTest(unittest.TestCase):

    def setUp(self):
        '''Create fresh, empty containers before each test.'''
        self.read_container = ReadContainer()
        self.record_container = RecordContainer()

    def tearDown(self):
        # Fixed: was misspelled 'tearUp', a hook unittest never invokes.
        pass

    def testFillRecordContainer(self):
        '''Method to test whether record container populating works.
        Uses mock database access to test whether record container
        has correct number of items.'''
        aln_file = './test/solver/read2cds/.test_data/lisa.in'
        cds_fasta = './test/solver/read2cds/.test_data/cds.fa'
        db_access = MockDbQuery(cds_fasta)
        self.record_container.set_db_access(db_access)

        self.read_container.populate_from_aln_file(aln_file)
        self.record_container.populate(
            self.read_container.fetch_all_reads_versions())
        records = self.record_container.fetch_all_records(format=list)
        # Every record present in the mock database must be fetched.
        self.assertEqual(len(db_access.records), len(records))

    def testReturnsNoneForNonexistentRecord(self):
        '''An unknown accession version must yield None, not raise.'''
        record = self.record_container.fetch_existing_record("XXX")
        self.assertIsNone(record, "No record with version XXX should be found")
Example #4
0
class StatisticsTest (unittest.TestCase):

    # setUp is executed before each test method
    def setUp(self):
        '''
        Build and populate the read, record and CDS-alignment containers
        from the synthetic statistics test data.

        Fixture attributes:
        mock_db_fpath (str): path to synthetically created CDSs which serve
        to fill up the mock database of records.
        input_aln_fpath (str): path to the input alignment file.
        '''
        self.mock_db_fpath = './test/statistics/.test_data/cds.fa'
        self.input_aln_fpath = './test/statistics/.test_data/lisa.in'
        # Initialize read container from the alignment file.
        self.read_cont = ReadContainer()
        self.read_cont.populate_from_aln_file(self.input_aln_fpath)
        # Initialize and fill the record container via the mock database.
        self.db_query = MockDbQuery (self.mock_db_fpath)
        self.record_cont = RecordContainer()
        self.record_cont.set_db_access(self.db_query)
        self.record_cont.populate(self.read_cont.fetch_all_reads_versions())
        self.read_cont.populate_cdss(self.record_cont)
        # Initialize and fill up the CDS alignment container.
        self.cds_aln_cont = CdsAlnContainer()
        self.cds_aln_cont.populate(self.read_cont.fetch_all_reads())


    def testStatistics(self):
        # Expected counts below are tied to the fixture data under
        # ./test/statistics/.test_data/ (22 read alignments before solving).
        assert(num_read_alns(self.read_cont) == 22)
        assert(num_active_aligned_regions(self.cds_aln_cont) == 22)
        assert(num_inactive_read_alns(self.read_cont) == 0)
        
        self.bs_solver = BestScoreSolver()
        self.bs_solver.map_reads_2_cdss(self.cds_aln_cont)
        
        records_stats = count_alns_to_record_and_cds(self.read_cont)
        print "Number of records for which we  have stats: %d\n" % len(records_stats)
        for rec_stat in records_stats.values():
            rec_stat.print_data()

        # After the solver has run, the active-region count drops (22 -> 16
        # for this fixture), as asserted here.
        assert(num_active_aligned_regions(self.cds_aln_cont) == 16)
        
        assert(num_cdss(self.cds_aln_cont) == 4)
        assert(num_cdss_with_no_alns(self.cds_aln_cont) == 0)
Example #5
0
class StatisticsTest(unittest.TestCase):

    # setUp is executed before each test method
    def setUp(self):
        '''
        Build and populate the read, record and CDS-alignment containers
        from the synthetic statistics test data.

        Fixture attributes:
        mock_db_fpath (str): path to synthetically created CDSs which serve
        to fill up the mock database of records.
        input_aln_fpath (str): path to the input alignment file.
        '''
        self.mock_db_fpath = './test/statistics/.test_data/cds.fa'
        self.input_aln_fpath = './test/statistics/.test_data/lisa.in'
        #       Initialize read container
        self.read_cont = ReadContainer()
        self.read_cont.populate_from_aln_file(self.input_aln_fpath)
        #       Initialize and fill record container
        self.db_query = MockDbQuery(self.mock_db_fpath)
        self.record_cont = RecordContainer()
        self.record_cont.set_db_access(self.db_query)
        self.record_cont.populate(self.read_cont.fetch_all_reads_versions())
        self.read_cont.populate_cdss(self.record_cont)
        #       Initialize and fill up cds aln container
        self.cds_aln_cont = CdsAlnContainer()
        self.cds_aln_cont.populate(self.read_cont.fetch_all_reads())

    def testStatistics(self):
        # Expected counts are tied to the fixture data under
        # ./test/statistics/.test_data/ (22 read alignments before solving).
        assert (num_read_alns(self.read_cont) == 22)
        assert (num_active_aligned_regions(self.cds_aln_cont) == 22)
        assert (num_inactive_read_alns(self.read_cont) == 0)

        self.bs_solver = BestScoreSolver()
        self.bs_solver.map_reads_2_cdss(self.cds_aln_cont)

        records_stats = count_alns_to_record_and_cds(self.read_cont)
        print "Number of records for which we  have stats: %d\n" % len(
            records_stats)
        for rec_stat in records_stats.values():
            rec_stat.print_data()

        # After the solver has run, the active-region count drops
        # (22 -> 16 for this fixture), as asserted here.
        assert (num_active_aligned_regions(self.cds_aln_cont) == 16)

        assert (num_cdss(self.cds_aln_cont) == 4)
        assert (num_cdss_with_no_alns(self.cds_aln_cont) == 0)
Example #6
0
def fill_containers(alignment_file, db_access):
    '''
    Populates read, record and CDS alignment containers from an
    alignment file and a database-access object.

    @param alignment_file path to the read alignment file
    @param db_access database access object used to fetch records
    @return tuple(ReadContainer, RecordContainer, CdsAlnContainer)
    '''
    reads = ReadContainer()
    records = RecordContainer()
    records.set_db_access(db_access)
    cds_alns = CdsAlnContainer()

    # Load everything reported in the alignment file, fetch the records
    # it references, find which coding sequences each read maps to, and
    # finally register those mappings in the CDS alignment container.
    reads.populate_from_aln_file(alignment_file)
    records.populate(reads.fetch_all_reads_versions())
    reads.populate_cdss(records)
    cds_alns.populate(reads.fetch_all_reads())

    return (reads, records, cds_alns)
Example #7
0
def main():
    '''
    Script to run binner in one of the most common
    usage scenarios.
    * load alignment data
    * load taxonomy data
    * do basic alignment data filtering (remove host reads ecc)
    * map alignments to genes, bin reads and emit the XML report
    '''

    #----------------------------------#
    #------ INPUT ARGUMENTS -----------#
    argparser = TestRunArgParser()
    args  = argparser.parse_args()

    #----------------------------------#
    #------- STATIC DATA SOURCE -------#
    # CDS - GI2TAXID -- NAMES -- NODES #
    dataAccess = DataAccess(args)
    #raw_input('Data access created')
    #----------------------------------#

    #-------- TAXONOMY TREE -----------#
    print '1. Loading tax tree...'
    tax_tree = TaxTree()
    # tax_tree.load_taxonomy_data(dataAccess)  # NOTE(review): load disabled here
    print 'done.'

    #----------------------------------#
    #------- ALIGNMENT DATA SOURCE ----#
    print '2. Loading alignment file...'
    read_container = ReadContainer()
    read_container.load_alignment_data(args.input)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)
    # Remember total number of reads (before host filtering) for reporting.
    total_read_num = read_container.get_read_count()
    print 'done'

    #------- FILTER HOST READS -------#
    print '3. Filtering host reads & alignments...'
    new_reads = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        #delete_host_alignments =
        True,
        #filter_unassigned =
        True,
        #unassigned_taxid=
        -1,
        host_filter.perc_of_host_alignments_larger_than)

    dataAccess.clear_cache()    # deletes gi2taxid cache

    reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
        new_reads,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1)     # unassigned taxid

    # Host reads = everything removed by the two filtering passes above.
    host_read_count = len(read_container.fetch_all_reads(format=list)) - len(reads_with_no_host_alignments)
    read_container.set_new_reads(reads_with_no_host_alignments)
    print 'done'

    #----------------------------------#
    #------- LOAD ALL RECORDS   -------#
    print '4. Loading referenced records...'
    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    record_container.populate(read_container.fetch_all_reads_versions(), table='cds')
    print 'done'
    #----------------------------------#
    #-- MAP ALIGNMENTS TO GENES   -----#
    print '5. Mapping alignments to genes...'
    read_container.populate_cdss(record_container)
    #----------------------------------#
    #- RECORD ALL ALIGNEMENTS TO GENE -#
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    print 'done'

    print '6. Estimating organisms present in sample...'
    # NOTE(review): hard-coded taxon id list -- origin unclear; the original
    # inline comment below raises the same question. Confirm before changing.
    target_organisms = [633, 632, 263, 543, 86661, 1392, 55080, 1386] # What is this part?
    print 'done.'
   
    print '7. Annotating reads...' 
    annotated_reads = rstate.annotate_reads(
                    read_container.fetch_all_reads(format=list),
                    cds_aln_container.read2cds, 
                    tax_tree, 
                    target_organisms)
    read_container.set_new_reads(annotated_reads)
    print 'done'
   
    print '8. Binning reads...' 
    orgs = bin_reads(
        read_container.fetch_all_reads(format=list),
        cds_aln_container.cds_repository,
        cds_aln_container.read2cds, 
        tax_tree,
        target_organisms,
        None,
        None,
        False) 

    '''
    for org in orgs.values():
        print org.name
        print len(set(org.get_reads()))
        print len(org.identified_coding_regions)
    print 'done.'
    '''

    print ("total_read_num: " + str(total_read_num))

    print '9. Generating XML...'
    dataset = Dataset(args.xml_description_file)
    xml_organisms = []
    # The host appears as a pseudo-organism entry in the XML output.
    host = Organism (host_read_count, host_read_count/float(total_read_num), None, None, "Host",
                 None, None, [], [], [], is_host=True)
    xml_organisms.append(host)
    for org in orgs.values():
        xml_organisms.append(org.to_xml_organism(tax_tree, total_read_num))
    xml_organisms.sort(key=operator.attrgetter("amount_count"), reverse=True)
    xml = XMLOutput(dataset, xml_organisms, args.output) 
    xml.xml_output();
Example #8
0
class GreedySolverTest(unittest.TestCase):

    # setUp is executed before each test method
    def setUp(self):
        '''
        @param mock_db_fpath (str) path to syntheticaly created CDSs which serves
        to fill up mock database of records
        @param input_aln_fpath (str) path to input alignment file 
        @param results_fpath (str) path to file with generated correct results 
        greedy solver should generate
        '''
        self.mock_db_fpath = './test/solver/read2cds/.test_data/cds.fa'
        self.input_aln_fpath = './test/solver/read2cds/.test_data/lisa.in'
        self.results_fpath = './test/solver/read2cds/.test_data/cds_ordering.txt'
        #       Initialize read container
        self.read_cont = ReadContainer()
        self.read_cont.populate_from_aln_file(self.input_aln_fpath)
        #       Initialize and fill record container
        self.db_query = MockDbQuery(self.mock_db_fpath)
        self.record_cont = RecordContainer()
        self.record_cont.set_db_access(self.db_query)
        self.record_cont.populate(self.read_cont.fetch_all_reads_versions())
        self.read_cont.populate_cdss(self.record_cont)
        #       Initialize and fill up cds aln container
        self.cds_aln_cont = CdsAlnContainer()
        self.cds_aln_cont.populate(self.read_cont.fetch_all_reads())

        self.greedy_solver = GreedySolver()
        self.greedy_solver.map_reads_2_cdss(self.cds_aln_cont)

    def testAlignmentsCorrectlyInactivated(self):
        '''
        Loads correct results from results file and checks whether 
        all the reads for a CDS listed in the file are active and
        whether all the other reads are inactive.
        '''
        cds2read = self._load_active_reads()

        for (cds, cds_aln) in self.cds_aln_cont.cds_repository.items():
            accession = cds.record_id
            mapped_reads = cds2read[accession]
            for cds_aln_subloc in cds_aln.aligned_regions.values():
                if cds_aln_subloc.active:
                    assert (cds_aln_subloc.read_id in mapped_reads)
                else:
                    assert (cds_aln_subloc.read_id not in mapped_reads)

    def testCdsAlignmentContainerConsistency(self):
        assert (Read2CDSSolver.test_cds_alignment_container_consistency(
            self.cds_aln_cont) == True)

    def _load_active_reads(self):
        results_fhandle = open(self.results_fpath)
        lines = iter(results_fhandle.readlines())
        cds2read_map = {}
        while (True):
            cds_id = next(lines, None)
            read_ids = next(lines, None)
            if not cds_id: break
            cds2read_map[cds_id.strip()] = read_ids.strip().split(';')
        results_fhandle.close()
        return cds2read_map
Example #9
0
class GreedySolverTest (unittest.TestCase):

    # setUp is executed before each test method
    def setUp(self):
        '''
        @param mock_db_fpath (str) path to syntheticaly created CDSs which serves
        to fill up mock database of records
        @param input_aln_fpath (str) path to input alignment file 
        @param results_fpath (str) path to file with generated correct results 
        greedy solver should generate
        '''        
        self.mock_db_fpath = './test/solver/read2cds/.test_data/cds.fa'
        self.input_aln_fpath = './test/solver/read2cds/.test_data/lisa.in'
        self.results_fpath = './test/solver/read2cds/.test_data/cds_ordering.txt'
#       Initialize read container
        self.read_cont = ReadContainer()
        self.read_cont.populate_from_aln_file(self.input_aln_fpath)
#       Initialize and fill record container
        self.db_query = MockDbQuery (self.mock_db_fpath)
        self.record_cont = RecordContainer()
        self.record_cont.set_db_access(self.db_query)
        self.record_cont.populate(self.read_cont.fetch_all_reads_versions())
        self.read_cont.populate_cdss(self.record_cont)
#       Initialize and fill up cds aln container
        self.cds_aln_cont = CdsAlnContainer()
        self.cds_aln_cont.populate(self.read_cont.fetch_all_reads())

        self.greedy_solver = GreedySolver()
        self.greedy_solver.map_reads_2_cdss(self.cds_aln_cont)


    def testAlignmentsCorrectlyInactivated(self):
        '''
        Loads correct results from results file and checks whether 
        all the reads for a CDS listed in the file are active and
        whether all the other reads are inactive.
        '''
        cds2read = self._load_active_reads()

        for (cds, cds_aln) in self.cds_aln_cont.cds_repository.items():
            accession = cds.record_id
            mapped_reads = cds2read[accession]
            for cds_aln_subloc in cds_aln.aligned_regions.values():
                if cds_aln_subloc.active:
                    assert (cds_aln_subloc.read_id in mapped_reads)
                else:
                    assert (cds_aln_subloc.read_id not in mapped_reads)

    def testCdsAlignmentContainerConsistency(self):
        assert(Read2CDSSolver.test_cds_alignment_container_consistency(self.cds_aln_cont) == True)

    def _load_active_reads (self):
        results_fhandle = open(self.results_fpath)
        lines = iter(results_fhandle.readlines())
        cds2read_map = {}
        while (True):
            cds_id = next(lines, None)
            read_ids = next(lines, None)
            if not cds_id: break
            cds2read_map[cds_id.strip()] = read_ids.strip().split(';')
        results_fhandle.close()
        return cds2read_map
Example #10
0
  # and injected into the Solver
  # Populate the read container from the input alignment file, timing it.
  start = timing.start()
  read_container = ReadContainer()
  read_container.populate_from_aln_file(read_alignment_file=args.input)
  elapsed_time = timing.end(start)
  log.info("Populate read container - elapsed time: %s", 
           timing.humanize(elapsed_time))    
  
  # Populate record container
  # The record container type can now be determined from the input parameters
  # and injected into the Solver
  start = timing.start()
  record_container = RecordContainer()
  record_container.set_db_access(db_access)
  # Extract all records from database
  record_container.populate(read_container.fetch_all_reads_versions())
  elapsed_time = timing.end(start)
  log.info("Populate record container - elapsed time: %s", 
           timing.humanize(elapsed_time)) 
 
  # Delegate solution generation (and output writing) to the solver.
  solver.generateSolutionXML(read_container=read_container,
                             record_container=record_container,
                             dataset_xml_file=args.descr,
                             output_solution_filename=args.output,
                             stats_dir=args.stats_dir,
                             solution_file=args.solution_file)
  
  processing_delta = timing.end(processing_start)
  log.info("Processing done in %s", 
      timing.humanize(processing_delta))
  
def main():
    '''
    Analyse ribosomal CDSs in an alignment file: load reads and records,
    optionally remove host reads, map alignments to genes, filter CDSs,
    extract ribosomal ones and run a read-assignment analysis per
    estimated species. Writes a summary file and export data files into
    args.export_folder.
    '''

    # Input arguments
    argparser   = ArgParser()
    args        = argparser.parse_args()

    # Access database
    dataAccess = DataAccess(args)

    # ------------------ #

    print '1. Loading tax tree...'
    start = time.time()

    tax_tree = TaxTree()

    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------ #

    print '2. Loading alignment file...'
    start = time.time()

    read_container = ReadContainer()
    read_container.load_alignment_data(args.alignment_file)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)

    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------ #

    # Create folder if does not exist
    if not os.path.exists(args.export_folder):
        os.makedirs(args.export_folder)
    # File for data analysis summary
    summary_path = os.path.join(args.export_folder, "CDSs_summary.txt")
    cds_summary = open(summary_path, 'w')

    if args.remove_host:
        print "Removing host..."
        start = time.time()

        #------- FILTER HOST READS -------#
        #print '3. Filtering host reads & alignments...'
        new_reads = host_filter.filter_potential_host_reads(
            read_container.fetch_all_reads(format=list),
            tax_tree.tax2relevantTax,
            tax_tree.potential_hosts,
            #delete_host_alignments =
            True,
            #filter_unassigned =
            True,
            #unassigned_taxid=
            -1,
            host_filter.perc_of_host_alignments_larger_than)

        dataAccess.clear_cache()    # deletes gi2taxid cache

        reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
            new_reads,
            tax_tree.tax2relevantTax,
            tax_tree.potential_hosts,
            True,   # delete host alignments
            True,   # filter unassigned
            -1)     # unassigned taxid

        # Host reads = everything removed by the two filtering passes above.
        read_count          = len(read_container.fetch_all_reads(format=list))
        host_read_count     = read_count - len(reads_with_no_host_alignments)
        non_host_read_count = read_count - host_read_count
        
        cds_summary.write("total   : {0:8d}\n".format(read_count))
        cds_summary.write("host    : {0:8d} {1:.2f}\n".format(host_read_count, 
                                      host_read_count / float(read_count)
                                      ))
        cds_summary.write("non-host: {0:8d} {1:.2f}\n".format(non_host_read_count, 
                                      non_host_read_count / float(read_count)
                                      ))
        # Set host-free reads
        read_container.set_new_reads(reads_with_no_host_alignments)

        end = time.time()
        print("done: {0:.2f} sec".format(end - start))

    #------- LOAD ALL RECORDS   -------#

    print '4. Loading referenced records...'
    start = time.time()

    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    record_container.populate(read_container.fetch_all_reads_versions(), table='cds')

    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    #-- MAP ALIGNMENTS TO GENES   -----#

    print '5. Mapping alignments to genes...'
    start = time.time()

    read_container.populate_cdss(record_container)

    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    #- RECORD ALL ALIGNEMENTS TO GENE -#

    print '6. Populating CDS container...'
    start = time.time()

    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))

    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------------------- #

    print 'Sorting CDSs ...DISABLED'
    start = time.time()

    # Sort CDSs by their "good looks"!
    cds_alns = cds_aln_container.fetch_all_cds_alns(format=list)
    '''
    cds_alns = sorted(cds_alns,
                    key=lambda cds_aln: cds_aln.get_std_over_mean(),
                    reverse=False)
    '''
    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------------------- #

    '''
    print "Exporting phase 0 - all CDSs..."
    export_CDS_stats_data(cds_alns, args.export_folder, "0_all_CDSs.txt")
    print "done"
    '''

    # Count Nones in cds_alns
    nones = count_nones(cds_alns)
    cds_summary.write("\n")
    cds_summary.write("gene None       : {0}\n".format(nones['gene']))
    cds_summary.write("protein_id  None: {0}\n".format(nones['protein_id']))
    cds_summary.write("product  None   : {0}\n".format(nones['product']))
    cds_summary.write("\n")

    cds_summary.write("CDSs all: {0}\n".format(len(cds_alns)))

    print 'Filtering valid CDSs...'
    start = time.time()

    # Remove CDSs with too low mean coverage value or length
    # NOTE(review): both thresholds are 0 here, so only CDSs with length or
    # coverage of exactly 0 are dropped by this first filter.
    min_mean_coverage   = 0
    min_length          = 0
    cds_alns_targeted = [cds_aln for cds_aln in cds_alns 
                         # Filters
                         if cds_aln.get_cds_length() > min_length
                         and cds_aln.get_mean_coverage() > min_mean_coverage]

    # Remove CDSs with no gene/product
    cds_alns_targeted = [cds_aln for cds_aln in cds_alns_targeted
                         if cds_aln.cds.product is not None]
                         #if  cds_aln.cds.gene != None
                         #and cds_aln.cds.product != None]

    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # All valid CDSs - Output coverage/length histogram data
    print "Exporting phase 1 - all CDSs..."
    start = time.time()

    export_CDS_stats_data(cds_alns_targeted, args.export_folder, "1_all_valid_CDSs.txt")

    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------- CDSs filtered and ready to be analyzed ------------------- #

    print 'Extracting ribosomal CDSs...'
    # Number of targeted CDSs
    cds_summary.write("CDSs valid: {0}\n".format(len(cds_alns_targeted)))

    cds_alns_ribosomal = []
    for cds_aln in cds_alns_targeted:

        # If has word "ribosomal" in name, store coverage data for graph
        gene        = cds_aln.cds.gene
        product     = cds_aln.cds.product
        protein_id  = cds_aln.cds.protein_id

        if is_ribosomal(product):
            #print("{0} {1} {2}\n".format(gene, protein_id, product))
            cds_alns_ribosomal.append(cds_aln)

    print 'done'
    # ------------------- Ribosomal CDSs acquired! --------------------- #

    print 'Analysing ribosomals...'

    # Extract interesting data
    # Mean coverage, max coverage
    # mm_cov ends up as the mean of the per-CDS mean coverages; max_cov is
    # the largest per-CDS mean coverage.
    mm_cov  = 0
    max_cov = 0
    for cds_aln in cds_alns_ribosomal:
        mean_cov = cds_aln.get_mean_coverage()
        mm_cov += mean_cov
        max_cov = max(max_cov, mean_cov)
    if mm_cov > 0:
        mm_cov /= len(cds_alns_ribosomal)

    cds_summary.write("ribosomals all {0}\n".format(len(cds_alns_ribosomal)))
    cds_summary.write("mean coverage: {0}\n".format(mm_cov))
    cds_summary.write("max coverage : {0}\n".format(max_cov))
    print 'done'

    # Ribosomal CDSs only - Output coverage/length histogram
    print "Exporting phase 2 - ribosomal CDSs only..."
    export_CDS_stats_data(cds_alns_ribosomal, args.export_folder, "2_ribosomal_CDSs.txt")
    print "done"

    # ------------------- Making biological sense - choosing CDSs -------------------- #

    print 'Filtering under-average ribosomals...'
    # NOTE: take length into consideration?
    cds_alns_ribosomal = [cds_aln for cds_aln in cds_alns_ribosomal
                         # Filters
                         if cds_aln.get_mean_coverage() > mm_cov]
    print 'done'
    cds_summary.write("ribosomals over-mean: {0}\n".format(len(cds_alns_ribosomal)))
    cds_summary.close()

    print 'Phase 3 - filtered ribosomal CDSs...'
    export_CDS_stats_data(cds_alns_ribosomal, args.export_folder, "3_ribosomal_CDSs_filtered.txt")
    print 'done'
    
    # Store charts cov data - if selected so
    if args.export_charts:
        print "Exporting chart coverage data..."
        export_CDS_graph_data(cds_alns_ribosomal, args.export_charts)
        print "done."

    # --------------------- I have chosen CDSs - determine species and analyse ------------------------ #

    # Species level resolution
    # See which species are present - dump ones with not enough CDSs
    # NOTE: So far done in determine_species_by_ribosomals.py

    CDS_count   = {}    # Count CDSs of each species
    species_set = set() # Get estimated tax_ids
    for cds_aln in cds_alns_ribosomal:
        tax_id = cds_aln.cds.taxon

        # Put each tax_id up to the "species" level
        tax_id_species = tax_tree.get_parent_with_rank(tax_id, 'species')

        species_set.add(tax_id_species)
        CDS_count[tax_id_species] = CDS_count.get(tax_id_species, 0) + 1

    # Get reported CDSs ids
    reported_CDS_ids = set()
    for cds_aln in cds_alns_ribosomal:
        reported_CDS_ids.add(cds_aln.cds.id)

    # ------------ Read assignment analysis -------------- #

    print "Read assignment analysis..."

    reads = read_container.fetch_all_reads(format=list)
    assignment_analysis(species_set, reads, tax_tree, args.export_folder, CDS_count)
def main():
    '''
    Script to run binner in one of the most common
    usage scenarios.
    * load alignment data
    * load taxonomy data
    * do basic alignment data filtering (remove host reads ecc)
    * map the remaining alignments to genes
    '''

    #----------------------------------#
    #------ INPUT ARGUMENTS -----------#
    argparser = PickleParser()
    args  = argparser.parse_args()

    #----------------------------------#
    #------- STATIC DATA SOURCE -------#
    # CDS - GI2TAXID -- NAMES -- NODES #
    dataAccess = DataAccess(args)
    #raw_input('Data access created')
    #----------------------------------#

    #-------- TAXONOMY TREE -----------#
    print '1. Loading tax tree...'
    tax_tree = TaxTree()
    # tax_tree.load_taxonomy_data(dataAccess)  # NOTE(review): load disabled here
    print 'done.'

    #----------------------------------#
    #------- ALIGNMENT DATA SOURCE ----#
    print '2. Loading alignment file...'
    read_container = ReadContainer()
    read_container.load_alignment_data(args.input)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)
    print 'done'

    #------- FILTER HOST READS -------#
    print '3. Filtering host reads & alignments...'
    new_reads = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        #delete_host_alignments =
        True,
        #filter_unassigned =
        True,
        #unassigned_taxid=
        -1,
        host_filter.perc_of_host_alignments_larger_than)
    dataAccess.clear_cache()    # deletes gi2taxid cache
    reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
        new_reads,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1)     # unassigned taxid
    # Host reads = everything removed by the two filtering passes above.
    host_read_count = len(read_container.fetch_all_reads(format=list)) - len(reads_with_no_host_alignments)
    read_container.set_new_reads(reads_with_no_host_alignments)
    print 'done'

    #----------------------------------#
    #------- LOAD ALL RECORDS   -------#
    print '4. Loading referenced records...'
    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    # Unlike the other run scripts, this one also pulls rRNA records in
    # addition to CDS records.
    record_container.populate(read_container.fetch_all_reads_versions(), table='cds')
    record_container.populate(read_container.fetch_all_reads_versions(), table='rrna')
    print 'done'
    #----------------------------------#
    #-- MAP ALIGNMENTS TO GENES   -----#
    print '5. Mapping alignments to genes...'
    read_container.populate_cdss(record_container)
    #----------------------------------#
    #- RECORD ALL ALIGNEMENTS TO GENE -#
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    print 'done'
def main():
    '''
    Analyse the genes expressed for a given tax id.

    Pipeline: load the taxonomy tree and the read alignments, resolve a
    taxid for every alignment, load the referenced CDS records from the
    database, map alignments onto genes, then keep only the CDS
    alignments of ``args.tax_id`` that pass the length, coverage and
    annotation filters below.
    '''

    # Input arguments
    argparser   = ArgParser()
    args        = argparser.parse_args()

    # Access database
    dataAccess = DataAccess(args)

    print '1. Loading tax tree...'
    tax_tree = TaxTree()
    print 'done.'

    print '2. Loading alignment file...'
    read_container = ReadContainer()
    read_container.load_alignment_data(args.alignment_file)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)
    print 'done'

    # TODO: Here i should recognize host reads!

    # ------------------------------------- #

    #----------------------------------#
    #------- LOAD ALL RECORDS   -------#
    # Only the protein-coding (cds) table is loaded here; the rRNA table
    # is not needed for this analysis.
    print '4. Loading referenced records...'
    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    record_container.populate(read_container.fetch_all_reads_versions(), table='cds')
    print 'done'
    #----------------------------------#
    #-- MAP ALIGNMENTS TO GENES   -----#
    print '5. Mapping alignments to genes...'
    read_container.populate_cdss(record_container)
    #----------------------------------#
    #- RECORD ALL ALIGNEMENTS TO GENE -#
    print '6. Populating CDS container...'
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    print 'done'

    print("Loaded CDS container")

    # Take only CDSs of given tax_id
    # Remove CDSs with too low mean coverage value
    min_mean_coverage   = 10
    min_length          = 20

    cds_alns = cds_aln_container.fetch_all_cds_alns(format=list)
    print ( "All CDSs (all organisms): " + str(len(cds_alns)) )

    # Keep only the CDS alignments of the requested organism that are
    # long and covered enough to be worth analysing.
    cds_alns_targeted = [cds_aln for cds_aln in cds_alns 
                         if  cds_aln.get_tax_id() == args.tax_id
                         # Filters
                         and cds_aln.get_cds_length() > min_length
                         and cds_aln.get_mean_coverage() > min_mean_coverage]

    # Remove CDSs with no gene/product
    cds_alns_targeted = [cds_aln for cds_aln in cds_alns_targeted
                         if  cds_aln.cds.gene != None
                         and cds_aln.cds.product != None]

    # ------------------- CDSs filtered and ready to be analyzed ------------------- #

    # Number of targeted CDSs
    print ( "Targeted CDSs: " + str(len(cds_alns_targeted)) )

    '''
    print ("Sorting CDSs: stddev/mean")
    cds_alns_sorted = sorted(cds_alns_targeted, 
                             key=lambda cds_aln: cds_aln.get_std_over_mean(),
                             reverse=False)

    # TODO: Here I should somehow determine which CDSs are "expressed", and which are not?

    # Write to file stuff(gene, protein_id) for each cds_aln
    print("Writing data to file")
    path = args.export_path
    f = open(path, 'w')
    
    for cds_aln in cds_alns_targeted:
        gene        = cds_aln.cds.gene
        product     = cds_aln.cds.product
        protein_id  = cds_aln.cds.protein_id

        f.write("{0} {1}\n".format(gene, protein_id))
    f.close()

    print("Done")
    '''
    
    # -------------------- #
    

    '''
    # Analyse those CDSs
    print ( "Targeted CDSs: " + str(len(cds_alns_targeted)) )

    # See the mean length of CDS
    mean_cds_length = 0
    no_locs_num = 0
    for cds_aln in cds_alns_targeted:
        try:
            loc_length = cds_aln.get_cds_length()
            mean_cds_length += loc_length
        except:
            no_locs_num += 1
    # Get mean
    mean_cds_length /= float(len(cds_alns_targeted))

    print("---------------------------------------------")
    print("Mean CDS length: " + str(mean_cds_length))
    print("Nones: " + str(no_locs_num))
            
    print("---------------------------------------------")
    '''


    '''
    # Create folder where data about CDSs will be stored
    if not os.path.exists(args.export_path):
        os.makedirs(args.export_path)

    # Export some amount of best CDSs
    i = 1
    for cds_aln in cds_alns_sorted:
        filename = "cds_" + str(i) + ".txt"
        coverage_path = os.path.join(args.export_path, filename)

        print(str(i) + ": " + str(cds_aln.get_std_over_mean()))

        cds_aln.coverage_to_file(coverage_path)

        if i == 50: # TODO: Define this somehow as parameter
            break

        i += 1

    # Load CDS container
    '''

    # Analyse stuff
    # print("Analysing stuff!")

    '''
Exemple #14
0
from data.containers.cdsaln import CdsAlnContainer

from utils.logger import Logger

# Example: populate the record, read and CDS-alignment containers from a
# small example alignment file, then run the best-score read-to-CDS
# solver and check container consistency.
Logger("log")

db_query = DbQuery()

# create containers
record_container = RecordContainer()
record_container.set_db_access(db_query)

read_container = ReadContainer()
read_container.populate_from_aln_file("example_data/2reads.in")

# NOTE(review): other call sites in this file pass
# read_container.fetch_all_reads_versions() / fetch_all_reads(...) to
# these populate() methods rather than the container itself -- confirm
# these signatures are still accepted.
record_container.populate(read_container)
read_container.populate_cdss(record_container)

cds_aln_container = CdsAlnContainer()
cds_aln_container.populate(read_container)

print cds_aln_container


# Reassign each read to a single best-scoring CDS.
r2c_solver = BestScoreSolver()
r2c_solver.map_reads_2_cdss(cds_aln_container)

print "Consistency test result: ", Read2CDSSolver.test_cds_alignment_container_consistency(cds_aln_container)

print "------------------------------------------------"
print cds_aln_container
Exemple #15
0
    # and injected into the Solver
    # Time how long populating the read container from the input
    # alignment file takes.
    start = timing.start()
    read_container = ReadContainer()
    read_container.populate_from_aln_file(read_alignment_file=args.input)
    elapsed_time = timing.end(start)
    log.info("Populate read container - elapsed time: %s",
             timing.humanize(elapsed_time))

    # Populate record container
    # The record container type can now be determined from the input parameters
    # and injected into the Solver
    start = timing.start()
    record_container = RecordContainer()
    record_container.set_db_access(db_access)
    # Extract all records from database
    record_container.populate(read_container.fetch_all_reads_versions())
    elapsed_time = timing.end(start)
    log.info("Populate record container - elapsed time: %s",
             timing.humanize(elapsed_time))

    # Hand both populated containers to the solver, which writes the
    # solution XML (plus optional statistics) to the supplied paths.
    solver.generateSolutionXML(read_container=read_container,
                               record_container=record_container,
                               dataset_xml_file=args.descr,
                               output_solution_filename=args.output,
                               stats_dir=args.stats_dir,
                               solution_file=args.solution_file)

    # Overall wall-clock time for the whole processing run
    # (processing_start is set earlier, outside this view).
    processing_delta = timing.end(processing_start)
    log.info("Processing done in %s", timing.humanize(processing_delta))

    log.info("Finished.")
def main():
    '''
    Identify and analyse the expressed ribosomal genes.

    Maps the read alignments onto database CDS records, keeps the CDSs
    whose product annotation contains the word "ribosomal", prints
    summary coverage statistics for them, and exports their coverage
    graph data to ``args.export_path``.
    '''

    # Input arguments
    argparser   = ArgParser()
    args        = argparser.parse_args()

    # Access database
    dataAccess = DataAccess(args)

    #print '1. Loading tax tree...'
    tax_tree = TaxTree()
    #print 'done.'

    #print '2. Loading alignment file...'
    read_container = ReadContainer()
    read_container.load_alignment_data(args.alignment_file)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)
    #print 'done'

    '''
    # TODO: Here i should recognize host reads!
    #------- FILTER HOST READS -------#
    #print '3. Filtering host reads & alignments...'
    new_reads = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        #delete_host_alignments =
        True,
        #filter_unassigned =
        True,
        #unassigned_taxid=
        -1,
        host_filter.perc_of_host_alignments_larger_than)

    dataAccess.clear_cache()    # deletes gi2taxid cache

    reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
        new_reads,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1)     # unassigned taxid

    read_count          = len(read_container.fetch_all_reads(format=list))
    host_read_count     = read_count - len(reads_with_no_host_alignments)
    non_host_read_count = read_count - host_read_count
    
    print ("total   : {0:8d}".format(read_count))
    print ("host    : {0:8d} {1:.2f}".format(host_read_count, 
                                  host_read_count / float(read_count)
                                  ))
    print ("non-host: {0:8d} {1:.2f}".format(non_host_read_count, 
                                  non_host_read_count / float(read_count)
                                  ))
    print
    read_container.set_new_reads(reads_with_no_host_alignments)
    '''

    # ------------------------------------- #

    #----------------------------------#
    #------- LOAD ALL RECORDS   -------#
    #print '4. Loading referenced records...'
    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    record_container.populate(read_container.fetch_all_reads_versions(), table='cds')
    #print 'done'
    #----------------------------------#
    #-- MAP ALIGNMENTS TO GENES   -----#
    #print '5. Mapping alignments to genes...'
    read_container.populate_cdss(record_container)
    #----------------------------------#
    #- RECORD ALL ALIGNEMENTS TO GENE -#
    #print '6. Populating CDS container...'
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    #print 'done'

    #print("Loaded CDS container")
    
    # Take only CDSs of given tax_id
    # Remove CDSs with too low mean coverage value
    # NOTE(review): both thresholds are 0 here, i.e. effectively disabled
    # (the tax-id analysis variant of this script uses 10 and 20) --
    # confirm this is intentional.
    min_mean_coverage   = 0
    min_length          = 0

    cds_alns = cds_aln_container.fetch_all_cds_alns(format=list)
    print ( "CDSs all  : " + str(len(cds_alns)) )

    cds_alns_targeted = [cds_aln for cds_aln in cds_alns 
                         # Filters
                         if cds_aln.get_cds_length() > min_length
                         and cds_aln.get_mean_coverage() > min_mean_coverage]

    # Remove CDSs with no gene/product
    cds_alns_targeted = [cds_aln for cds_aln in cds_alns_targeted
                         if  cds_aln.cds.gene != None
                         and cds_aln.cds.product != None]

    # ------------------- CDSs filtered and ready to be analyzed ------------------- #

    # Number of targeted CDSs
    print ( "CDSs valid: " + str(len(cds_alns_targeted)) )
    print

    # Collect the CDSs annotated as ribosomal. The product-is-not-None
    # filter above guarantees the substring test below cannot fail on None.
    cds_alns_ribosomal = []
    for cds_aln in cds_alns_targeted:

        # If has word "ribosomal" in name, store coverage data for graph
        gene        = cds_aln.cds.gene
        product     = cds_aln.cds.product
        protein_id  = cds_aln.cds.protein_id

        if "ribosomal" in product:
            #print("{0} {1} {2}\n".format(gene, protein_id, product))
            cds_alns_ribosomal.append(cds_aln)

    # ------------------- Ribosomal CDSs acquired! --------------------- #
    # Sort it!
    # Ascending by stddev-over-mean coverage, i.e. most uniformly
    # covered CDSs first.
    cds_alns_ribosomal = sorted(cds_alns_ribosomal, 
                             key=lambda cds_aln: cds_aln.get_std_over_mean(),
                             reverse=False)

    # Extract interesting data
    # Mean coverage, max coverage
    mm_cov  = 0
    max_cov = 0
    for cds_aln in cds_alns_ribosomal:
        mean_cov = cds_aln.get_mean_coverage()
        mm_cov += mean_cov
        max_cov = max(max_cov, mean_cov)
    # Average only when something accumulated; avoids dividing by zero
    # when the ribosomal list is empty (mm_cov stays 0 then).
    if mm_cov > 0:
        mm_cov /= len(cds_alns_ribosomal)

    # Print
    print("ribosomals: " + str(len(cds_alns_ribosomal)))
    print("mean coverage: " + str(mm_cov))
    print("max coverage : {0}".format(max_cov))
    print
    for cds_aln in cds_alns_ribosomal:
        gene        = cds_aln.cds.gene
        product     = cds_aln.cds.product
        protein_id  = cds_aln.cds.protein_id

        # Look up the organism name for this CDS's taxon in the tax tree.
        taxon       = cds_aln.cds.taxon
        name        = tax_tree.nodes[taxon].organism_name
        print("{0:4} {1:10} {2:50} {3:10d} {4:60}".format(gene, protein_id, product, taxon, name))

    # Store graph data
    export_CDS_graph_data(cds_alns_ribosomal, args.export_path)

    '''