コード例 #1
0
def _report_elapsed(start):
    """Print the wall-clock time elapsed since *start* (a ``time.time()`` value)."""
    print("done: {0:.2f} sec".format(time.time() - start))


def _fraction(part, whole):
    """Return ``part / whole`` as a float, or 0.0 when *whole* is zero."""
    if not whole:
        return 0.0
    return part / float(whole)


def _remove_host_reads(read_container, tax_tree, data_access, cds_summary):
    """Replace the reads in *read_container* with the host-filtered ones.

    Runs both ``host_filter`` passes, writes the total / host / non-host read
    counts to the open file *cds_summary*, and stores the surviving reads
    back into *read_container*.
    """
    print("Removing host...")
    start = time.time()

    new_reads = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1,     # unassigned taxid
        host_filter.perc_of_host_alignments_larger_than)

    data_access.clear_cache()    # deletes gi2taxid cache

    reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
        new_reads,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1)     # unassigned taxid

    read_count = len(read_container.fetch_all_reads(format=list))
    non_host_read_count = len(reads_with_no_host_alignments)
    host_read_count = read_count - non_host_read_count

    # _fraction() keeps an empty alignment file from raising ZeroDivisionError.
    cds_summary.write("total   : {0:8d}\n".format(read_count))
    cds_summary.write("host    : {0:8d} {1:.2f}\n".format(
        host_read_count, _fraction(host_read_count, read_count)))
    cds_summary.write("non-host: {0:8d} {1:.2f}\n".format(
        non_host_read_count, _fraction(non_host_read_count, read_count)))

    # Set host-free reads
    read_container.set_new_reads(reads_with_no_host_alignments)

    _report_elapsed(start)


def _select_ribosomal_cdss(cds_alns, export_folder, cds_summary):
    """Narrow *cds_alns* down to the over-average ribosomal CDS alignments.

    Writes the per-phase counts to the open file *cds_summary* and exports
    phases 1 and 2 into *export_folder* via ``export_CDS_stats_data``.

    Returns the list of ribosomal CDS alignments whose mean coverage is
    strictly greater than the mean over all ribosomal CDS alignments.
    """
    # Count Nones in cds_alns
    nones = count_nones(cds_alns)
    cds_summary.write("\n")
    cds_summary.write("gene None       : {0}\n".format(nones['gene']))
    cds_summary.write("protein_id  None: {0}\n".format(nones['protein_id']))
    cds_summary.write("product  None   : {0}\n".format(nones['product']))
    cds_summary.write("\n")

    cds_summary.write("CDSs all: {0}\n".format(len(cds_alns)))

    print('Filtering valid CDSs...')
    start = time.time()

    # Keep CDSs that are long enough, covered enough and have a product.
    min_mean_coverage = 0
    min_length = 0
    cds_alns_targeted = [cds_aln for cds_aln in cds_alns
                         if cds_aln.get_cds_length() > min_length
                         and cds_aln.get_mean_coverage() > min_mean_coverage
                         and cds_aln.cds.product is not None]

    _report_elapsed(start)

    # All valid CDSs - Output coverage/length histogram data
    print("Exporting phase 1 - all CDSs...")
    start = time.time()
    export_CDS_stats_data(cds_alns_targeted, export_folder, "1_all_valid_CDSs.txt")
    _report_elapsed(start)

    print('Extracting ribosomal CDSs...')
    cds_summary.write("CDSs valid: {0}\n".format(len(cds_alns_targeted)))
    cds_alns_ribosomal = [cds_aln for cds_aln in cds_alns_targeted
                          if is_ribosomal(cds_aln.cds.product)]
    print('done')

    print('Analysing ribosomals...')
    mean_covs = [cds_aln.get_mean_coverage() for cds_aln in cds_alns_ribosomal]
    # Both values stay 0 when there are no ribosomal CDSs.
    mm_cov = 0
    max_cov = max([0] + mean_covs)
    if mean_covs:
        # float() avoids Python 2 integer truncation of the mean.
        mm_cov = sum(mean_covs) / float(len(mean_covs))

    cds_summary.write("ribosomals all {0}\n".format(len(cds_alns_ribosomal)))
    cds_summary.write("mean coverage: {0}\n".format(mm_cov))
    cds_summary.write("max coverage : {0}\n".format(max_cov))
    print('done')

    # Ribosomal CDSs only - Output coverage/length histogram
    print("Exporting phase 2 - ribosomal CDSs only...")
    export_CDS_stats_data(cds_alns_ribosomal, export_folder, "2_ribosomal_CDSs.txt")
    print("done")

    print('Filtering under-average ribosomals...')
    # NOTE: take length into consideration?
    cds_alns_ribosomal = [cds_aln for cds_aln in cds_alns_ribosomal
                          if cds_aln.get_mean_coverage() > mm_cov]
    print('done')
    cds_summary.write("ribosomals over-mean: {0}\n".format(len(cds_alns_ribosomal)))

    return cds_alns_ribosomal


def main():
    """Select over-average ribosomal CDSs and run the read assignment analysis.

    Loads the alignment file named by the command-line arguments, optionally
    removes host reads, maps alignments to CDSs, and filters the CDSs down to
    ribosomal ones with above-average coverage. Per-phase statistics go to
    ``CDSs_summary.txt`` and the phase export files inside
    ``args.export_folder``; finally ``assignment_analysis`` is run on the
    species the selected CDSs belong to.
    """
    # Input arguments
    argparser = ArgParser()
    args = argparser.parse_args()

    # Access database
    data_access = DataAccess(args)

    print('1. Loading tax tree...')
    start = time.time()
    tax_tree = TaxTree()
    _report_elapsed(start)

    print('2. Loading alignment file...')
    start = time.time()
    read_container = ReadContainer()
    read_container.load_alignment_data(args.alignment_file)
    # Set taxids for all alignments
    read_container.set_taxids(data_access)
    _report_elapsed(start)

    # Create folder if does not exist
    if not os.path.exists(args.export_folder):
        os.makedirs(args.export_folder)
    # File for data analysis summary
    summary_path = os.path.join(args.export_folder, "CDSs_summary.txt")

    # The context manager closes (and flushes) the summary even if a later
    # phase raises.
    with open(summary_path, 'w') as cds_summary:
        if args.remove_host:
            _remove_host_reads(read_container, tax_tree, data_access, cds_summary)

        print('4. Loading referenced records...')
        start = time.time()
        record_container = RecordContainer()
        record_container.set_db_access(data_access)
        record_container.populate(read_container.fetch_all_reads_versions(), table='cds')
        _report_elapsed(start)

        print('5. Mapping alignments to genes...')
        start = time.time()
        read_container.populate_cdss(record_container)
        _report_elapsed(start)

        print('6. Populating CDS container...')
        start = time.time()
        cds_aln_container = CdsAlnContainer()
        cds_aln_container.populate(read_container.fetch_all_reads(format=list))
        _report_elapsed(start)

        # Sorting is disabled; the CDS alignments are used in container order.
        print('Sorting CDSs ...DISABLED')
        start = time.time()
        cds_alns = cds_aln_container.fetch_all_cds_alns(format=list)
        _report_elapsed(start)

        cds_alns_ribosomal = _select_ribosomal_cdss(
            cds_alns, args.export_folder, cds_summary)

    print('Phase 3 - filtered ribosomal CDSs...')
    export_CDS_stats_data(cds_alns_ribosomal, args.export_folder, "3_ribosomal_CDSs_filtered.txt")
    print('done')

    # Store charts cov data - if selected so
    if args.export_charts:
        print("Exporting chart coverage data...")
        export_CDS_graph_data(cds_alns_ribosomal, args.export_charts)
        print("done.")

    # Species level resolution
    # See which species are present - dump ones with not enough CDSs
    # NOTE: So far done in determine_species_by_ribosomals.py
    CDS_count = {}    # Count CDSs of each species
    for cds_aln in cds_alns_ribosomal:
        # Put each tax_id up to the "species" level
        tax_id_species = tax_tree.get_parent_with_rank(cds_aln.cds.taxon, 'species')
        CDS_count[tax_id_species] = CDS_count.get(tax_id_species, 0) + 1
    species_set = set(CDS_count)    # Estimated tax_ids

    print("Read assignment analysis...")

    reads = read_container.fetch_all_reads(format=list)
    assignment_analysis(species_set, reads, tax_tree, args.export_folder, CDS_count)
コード例 #2
0
ファイル: binner.py プロジェクト: matijaSos/tiny-binner
def main():
    """
    Run binner in one of the most common usage scenarios.

    * load taxonomy data
    * load alignment data
    * do basic alignment data filtering (remove host reads etc.)
    * map alignments to CDSs, annotate and bin the reads
    * write the result as XML to ``args.output``
    """

    #------ INPUT ARGUMENTS -----------#
    argparser = TestRunArgParser()
    args = argparser.parse_args()

    #------- STATIC DATA SOURCE -------#
    # CDS - GI2TAXID -- NAMES -- NODES #
    dataAccess = DataAccess(args)

    #-------- TAXONOMY TREE -----------#
    print('1. Loading tax tree...')
    tax_tree = TaxTree()
    print('done.')

    #------- ALIGNMENT DATA SOURCE ----#
    print('2. Loading alignment file...')
    read_container = ReadContainer()
    read_container.load_alignment_data(args.input)
    # Set taxids for all alignments
    read_container.set_taxids(dataAccess)
    # Remember total number of reads (before host filtering)
    total_read_num = read_container.get_read_count()
    print('done')

    #------- FILTER HOST READS -------#
    print('3. Filtering host reads & alignments...')
    new_reads = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1,     # unassigned taxid
        host_filter.perc_of_host_alignments_larger_than)

    dataAccess.clear_cache()    # deletes gi2taxid cache

    reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
        new_reads,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1)     # unassigned taxid

    host_read_count = (len(read_container.fetch_all_reads(format=list))
                       - len(reads_with_no_host_alignments))
    read_container.set_new_reads(reads_with_no_host_alignments)
    print('done')

    #------- LOAD ALL RECORDS   -------#
    print('4. Loading referenced records...')
    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    record_container.populate(read_container.fetch_all_reads_versions(), table='cds')
    print('done')

    #-- MAP ALIGNMENTS TO GENES   -----#
    print('5. Mapping alignments to genes...')
    read_container.populate_cdss(record_container)
    #- RECORD ALL ALIGNMENTS TO GENE --#
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    print('done')

    print('6. Estimating organisms present in sample...')
    # NOTE(review): nothing is estimated here - the target taxids are
    # hard-coded. Presumably a placeholder for a real estimation step;
    # confirm where these ids come from.
    target_organisms = [633, 632, 263, 543, 86661, 1392, 55080, 1386]
    print('done.')

    print('7. Annotating reads...')
    annotated_reads = rstate.annotate_reads(
        read_container.fetch_all_reads(format=list),
        cds_aln_container.read2cds,
        tax_tree,
        target_organisms)
    read_container.set_new_reads(annotated_reads)
    print('done')

    print('8. Binning reads...')
    orgs = bin_reads(
        read_container.fetch_all_reads(format=list),
        cds_aln_container.cds_repository,
        cds_aln_container.read2cds,
        tax_tree,
        target_organisms,
        None,
        None,
        False)

    print("total_read_num: " + str(total_read_num))

    print('9. Generating XML...')
    dataset = Dataset(args.xml_description_file)

    # An empty alignment file would otherwise raise ZeroDivisionError here.
    if total_read_num:
        host_fraction = host_read_count / float(total_read_num)
    else:
        host_fraction = 0.0

    host = Organism(host_read_count, host_fraction, None, None, "Host",
                    None, None, [], [], [], is_host=True)
    xml_organisms = [host]
    for org in orgs.values():
        xml_organisms.append(org.to_xml_organism(tax_tree, total_read_num))
    # Most abundant organisms first
    xml_organisms.sort(key=operator.attrgetter("amount_count"), reverse=True)

    xml = XMLOutput(dataset, xml_organisms, args.output)
    xml.xml_output()
コード例 #3
0
def main():
    '''
    Script to run binner in one of the most common
    usage scenarios.
    * load alignment data
    * load taxonomy data
    * do basic alignment data filtering (remove host reads ecc)
    '''

    #----------------------------------#
    #------ INPUT ARGUMENTS -----------#
    argparser = PickleParser()
    args  = argparser.parse_args()

    #----------------------------------#
    #------- STATIC DATA SOURCE -------#
    # CDS - GI2TAXID -- NAMES -- NODES #
    dataAccess = DataAccess(args)
    #raw_input('Data access created')
    #----------------------------------#

    #-------- TAXONOMY TREE -----------#
    print '1. Loading tax tree...'
    tax_tree = TaxTree()
    # tax_tree.load_taxonomy_data(dataAccess)
    print 'done.'

    #----------------------------------#
    #------- ALIGNMENT DATA SOURCE ----#
    print '2. Loading alignment file...'
    read_container = ReadContainer()
    read_container.load_alignment_data(args.input)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)
    print 'done'

    #------- FILTER HOST READS -------#
    print '3. Filtering host reads & alignments...'
    new_reads = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        #delete_host_alignments =
        True,
        #filter_unassigned =
        True,
        #unassigned_taxid=
        -1,
        host_filter.perc_of_host_alignments_larger_than)
    dataAccess.clear_cache()    # deletes gi2taxid cache
    reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
        new_reads,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1)     # unassigned taxid
    host_read_count = len(read_container.fetch_all_reads(format=list)) - len(reads_with_no_host_alignments)
    read_container.set_new_reads(reads_with_no_host_alignments)
    print 'done'

    #----------------------------------#
    #------- LOAD ALL RECORDS   -------#
    print '4. Loading referenced records...'
    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    record_container.populate(read_container.fetch_all_reads_versions(), table='cds')
    record_container.populate(read_container.fetch_all_reads_versions(), table='rrna')
    print 'done'
    #----------------------------------#
    #-- MAP ALIGNMENTS TO GENES   -----#
    print '5. Mapping alignments to genes...'
    read_container.populate_cdss(record_container)
    #----------------------------------#
    #- RECORD ALL ALIGNEMENTS TO GENE -#
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    print 'done'