Example #1
0
def pruneReferenceTree_Nmicrobiol201648(taxa):
    """Annotate the reference tree with NCBI tax-ids and prune it to `taxa`.

    Processing steps:

    1. The newick text is read from ``nmicrobiol201648_s6_PATHd8``, passed
       through ``stripTreeInternalSupport`` and parsed into a ``PhyloTree``.
    2. Every leaf whose label has an entry in ``readTranslationMap()`` is
       renamed to its tax-id (the original label is kept in ``node.label``)
       and receives ``taxId`` and ``lineageItems`` features.  All other
       leaves are renamed ``"n/a"``.  The matched / unmatched labels are
       written to ``matched_names.txt`` / ``unmatched_names.txt``.
    3. Every internal node receives, as ``lineageItems``, the common leading
       part of its annotated children's lineages; the last shared item is
       stored as ``testK``.  For each such item, the last node (in post-order)
       carrying it is additionally tagged with ``testL``.
    4. Each requested tax-id is searched in the tree, first by exact
       ``taxId`` and then by its containing species-level taxon.
    5. A copy of the tree is pruned (via ``pruneTree``) to the nodes found.

    Args:
        taxa: iterable of integer NCBI tax-ids that should remain in the
            pruned tree.

    Returns:
        Tuple ``(tree, tree2)``: the fully annotated tree and its pruned copy.
        When none of the requested taxa is found the copy is left unpruned.
    """
    with open(nmicrobiol201648_s6_PATHd8, "r") as f:
        treefmt = f.read()

    # Strip internal support, and parse the resulting string
    tree = PhyloTree(stripTreeInternalSupport(treefmt))

    labelToTaxId = readTranslationMap()

    unmatched = []
    matched = []
    numProcessed = 0
    numMatched = 0

    # Annotate tree leaves
    for node in tree.traverse():
        if not node.is_leaf():
            continue

        # Check if the label has a mapping in the id-conversion table
        matchingTaxId = labelToTaxId.get(node.name)

        if matchingTaxId is not None:  # match found
            node.label = node.name
            node.name = str(matchingTaxId)
            node.add_features(taxId=matchingTaxId,
                              lineageItems=ncbiTaxa.get_lineage(matchingTaxId))
            matched.append("%s [%d]" % (node.label, matchingTaxId))
            numMatched += 1
        else:  # no match found
            unmatched.append(node.name)
            node.name = "n/a"

        numProcessed += 1
        # NOTE(review): rl() is defined elsewhere; it presumably rate-limits
        # the progress output - confirm.
        if rl():
            print("(processed %d matched %d)" % (numProcessed, numMatched))

    # Save unmatched names to file (for examination)
    with open("unmatched_names.txt", "w") as f:
        f.writelines(["%s\n" % x for x in sorted(unmatched)])

    # Save matched names to file (for examination)
    with open("matched_names.txt", "w") as f:
        f.writelines(["%s\n" % x for x in sorted(matched)])

    for _ in range(4):
        print("//"*30)

    # Maps each shared taxon to the last internal node (in post-order) that
    # was annotated with it.
    lastNodeForTaxon = {}

    # Try to annotate non-leaf nodes with common taxonomic group
    for node in tree.traverse(strategy='postorder'):  # children first
        if node.is_leaf():
            continue

        # Children without a lineage (unmatched leaves, or internal nodes
        # for which no common lineage was found) are ignored.
        childLineages = [c.lineageItems for c in node.children
                         if hasattr(c, "lineageItems")]

        print(">>" * 20)
        print(len(childLineages))
        print(childLineages)

        if not childLineages:
            continue

        # Works for any number of children, so multifurcating nodes are
        # annotated instead of aborting the whole run.
        common = _commonLineagePrefix(childLineages)
        if not common:
            continue

        taxon = common[-1]
        print("//"*20)
        print(taxon)
        node.add_features(lineageItems=common, testK=taxon)
        lastNodeForTaxon[taxon] = node
        print(">>> %s" % taxon)

        # Debug dump of the contributing lineages when the shared taxon
        # is tax-id 2.
        if taxon == 2:
            print("- - "*5)
            print(childLineages)
            print("- - "*5)

    # Tag, for every shared taxon, the last node that carries it
    for taxon, node in lastNodeForTaxon.items():
        if taxon is not None:
            node.add_features(testL=taxon)

    for _ in range(4):
        print("//"*30)

    # Now we have our annotated reference phylogenetic tree

    # Debug: show whether a few hard-coded tax-ids are present in the tree
    for x in (45157, 4896, 44056):
        print(x)
        print(tree.search_nodes(taxId=x))

    foundTaxa = set()
    foundNodes = []
    notFoundTaxa = set()
    for x in taxa:
        print("=="*20)
        print("Searching for %d" % x)
        found = tree.search_nodes(taxId=x)

        if found:
            print("Exact match found")
        else:
            # Fall back to the species that contains this taxon
            containingSpeciesLevelTaxon = getContainingSpeciesLevelTaxon(x)
            if x != containingSpeciesLevelTaxon:
                found = tree.search_nodes(taxId=containingSpeciesLevelTaxon)

                if not found:
                    # NOTE(review): no node in this function is given a
                    # speciesLevelTaxon feature, so this lookup can only
                    # succeed if the feature is set elsewhere - confirm.
                    found = tree.search_nodes(
                        speciesLevelTaxon=containingSpeciesLevelTaxon)

                print("Found" if found else "Not found at all...")

        if found:
            foundTaxa.add(x)
            foundNodes.append(found[0])
        else:
            notFoundTaxa.add(x)

    print("Found (%d): %s" % (len(foundTaxa), foundTaxa))
    print(ncbiTaxa.get_rank(list(foundTaxa)))

    print("Couldn't find (%d): %s" % (len(notFoundTaxa), notFoundTaxa))
    print(list(ncbiTaxa.get_taxid_translator(list(notFoundTaxa)).values()))
    print(len(foundNodes))

    tree2 = tree.copy()

    print("Before pruning: %d" % len(tree2))
    if foundNodes:
        pruneTree(tree2, foundNodes)
    print("After pruning: %d" % len(tree2))

    return (tree, tree2)


def _commonLineagePrefix(lineages):
    """Return the longest leading run of items shared by all `lineages`."""
    prefix = []
    for items in zip(*lineages):
        first = items[0]
        if any(item != first for item in items[1:]):
            break
        prefix.append(first)
    return prefix
            def nodeLayoutFunc(node):
                """Decorate `node` with its taxon name, effect-size bars and N.

                Nodes whose name (parsed as an integer tax-id) is not in
                ``taxidsToKeep`` are left untouched.  For the others the
                following faces are added:

                * column 0: the taxon name, followed by one coloured bar per
                  entry of ``ranges`` whose width is proportional to the
                  absolute effect size and whose label shows the signed value
                  plus a marker when the p-value is below
                  ``significanceLevel``;
                * column 1: the ``NumSpecies`` value as ``N=...``.

                Finally the node marker itself is hidden (size 0).

                The function reads ``taxidsToKeep``, ``ranges``, ``df``,
                ``var``, ``significanceLevel``, ``barScale`` and
                ``baseFontSize`` from the enclosing scope, which is not
                visible here.

                Args:
                    node: tree node handed to the layout callback; its
                        ``name`` must be convertible with ``int``.

                Raises:
                    AssertionError: if ``ranges`` is empty or the expected
                        single data row is not found.
                """
                taxid = int(node.name)

                if taxid not in taxidsToKeep:
                    return

                taxGroupName = ncbiTaxa.get_taxid_translator([taxid])[taxid]

                print(len(ranges))

                # All rows of df for this explanatory variable and taxon
                rowsForTaxon = df[(df['ExplanatoryVar'] == var)
                                  & (df['TaxGroup'] == taxid)]
                rangeRows = None

                if len(ranges) == 1:
                    row = rowsForTaxon[rowsForTaxon['Range'] == ranges[0]]
                    assert (len(row) == len(ranges))
                elif len(ranges) > 1:
                    # With several ranges the summary row is the one stored
                    # under Range == 0.
                    row = rowsForTaxon[rowsForTaxon['Range'] == 0]
                    assert (len(row) == 1)
                    rangeRows = rowsForTaxon[rowsForTaxon['Range'].isin(
                        set(ranges))]
                else:
                    # Raised explicitly so the check survives "python -O"
                    raise AssertionError("ranges must not be empty")

                name = TextFace("%s" % taxGroupName,
                                fsize=baseFontSize * 2.5)
                name.tight_text = True
                name.margin_left = 20
                name.margin_right = 0
                name.margin_top = 40
                name.margin_bottom = 12
                faces.add_face_to_node(name, node, column=0)

                # For each range to be included in this plot, add a bar
                for rangeId in ranges:
                    if len(ranges) == 1:
                        rowForThisRange = row
                    else:
                        rowForThisRange = rangeRows[rangeRows['Range'] ==
                                                    rangeId]

                    assert (len(rowForThisRange) == 1)

                    # Extract p-value and "effect-size" (signed R^2)
                    effectSize = float(
                        rowForThisRange['EffectSize'].values[0])
                    pval = float(rowForThisRange['Pvalue'].values[0])
                    isNegative = effectSize < 0

                    # Set bar-graph color and significance markers
                    # NOTE(review): unichr is the Python 2 builtin; under
                    # Python 3 this needs a compatibility alias defined
                    # elsewhere - confirm.
                    if pval < significanceLevel:
                        significanceMarker = " %s" % unichr(0x2731)
                        barColor = "#1133ff" if isNegative else "#ff3311"
                    else:  # not significant
                        significanceMarker = ""
                        barColor = "#b0b0f0" if isNegative else "#ccb090"

                    # Add the minus sign if needed (U+2212 is more legible
                    # than a hyphen)
                    signChar = unichr(0x2212) if isNegative else ""

                    v = RectFace(width=abs(effectSize) * barScale,
                                 height=baseFontSize * 3.5,
                                 fgcolor=barColor,
                                 bgcolor=barColor,
                                 label={
                                     "text":
                                     "%s%.2g %s" %
                                     (signChar, abs(effectSize),
                                      significanceMarker),
                                     "fontsize":
                                     baseFontSize * 1.8,
                                     "color":
                                     "black"
                                 })
                    v.margin_top = 1
                    v.margin_left = 30
                    v.margin_right = 8
                    v.margin_bottom = 12
                    faces.add_face_to_node(v, node, column=0)

                # Read the scalar cell explicitly instead of formatting the
                # one-row Series itself with %d.
                details = TextFace(
                    "N=%d" % int(row['NumSpecies'].values[0]),
                    fsize=baseFontSize * 1.5)
                details.background.color = "#dfdfdf"
                details.margin_left = 6
                details.margin_right = 20
                faces.add_face_to_node(details, node, column=1)

                nstyle = NodeStyle()
                nstyle["size"] = 0

                node.set_style(nstyle)
Example #3
0
def speciesByPhylaTable():
    """Build and save a per-phylum summary of the species in the data-set.

    The table starts with one row per phylum returned by ``parseReport()``,
    plus an ``[Unknown]`` and a ``[Total]`` row for every domain and a single
    ``[All]``/``[Total]`` row.  The special rows are keyed by small sequential
    dummy tax-ids starting at 1.

    Every species yielded by ``allSpeciesSource()`` that is not listed in
    ``speciesToExclude`` is counted under its domain (rank ``superkingdom``,
    falling back to ``kingdom``) and under its phylum.  The distinct orders,
    families and genera seen under each phylum are collected as well.
    Species without a ``phylum`` rank in their lineage are reported on
    stdout and counted in the ``[Unknown]`` row of their domain.

    Output files (written to the current directory):

    * ``phyla_report.html``, ``phyla_report.xlsx``, ``phyla_report.rst`` -
      rows with at least one species;
    * ``phyla_report_missing.html``, ``phyla_report_missing.xlsx`` - rows
      with no species.

    Returns:
        None.

    Raises:
        IndexError: if a species has neither a ``superkingdom`` nor a
            ``kingdom`` rank in its lineage, or if a counted domain has no
            matching special row in the table.
    """
    columns = [
        'Domain',       # Bacteria, Eukaryota, Archaea
        'Phylum',       # Phylum name (string)
        'TaxId',        # Phylum TaxId
        'ParentTaxId',  # Parent TaxId
        'NumSpecies',   # Species count for this phylum
        'NumOrders',    # Orders count for this phylum
        'NumFamilies',  # Families count for this phylum
        'NumGenuses',   # Genuses count for this phylum
        'RowType',      # Phylum count or total
    ]
    intColumns = [
        'TaxId', 'ParentTaxId', 'NumSpecies', 'NumOrders', 'NumFamilies',
        'NumGenuses'
    ]

    def makeRow(domain, phylum, taxId, parentTaxId, rowType):
        # One table row with all tallies initialised to zero
        return {
            'Domain': domain,
            'Phylum': phylum,
            'TaxId': taxId,
            'ParentTaxId': parentTaxId,
            'NumSpecies': 0,
            'NumOrders': 0,
            'NumFamilies': 0,
            'NumGenuses': 0,
            'RowType': rowType,
        }

    allPhyla = parseReport()  # get all existing phyla

    domainCounts = Counter()
    phylaCounts = Counter()
    skippedCounts = Counter()
    # Tallying by class is disabled, since classes are not used for many taxons
    ordersByPhyla = {}
    familiesByPhyla = {}
    genusesByPhyla = {}

    # Rows are collected in a plain list and turned into a data-frame once;
    # DataFrame.append() no longer exists in pandas 2.x and copied the whole
    # table on every call.
    rows = []

    # Add item for each phylum
    for group, phyla in allPhyla.items():
        for phylum, record in phyla.items():
            taxId = record['taxId']
            rows.append(
                makeRow(group, phylum, taxId, record['parentTaxId'], 'Phylum'))
            ordersByPhyla[taxId] = set()
            familiesByPhyla[taxId] = set()
            genusesByPhyla[taxId] = set()

    # Create "special" items, keyed by sequential dummy tax-ids
    pid = 1
    for group in allPhyla.keys():
        # Add "Unknown phylum" tally for each domain
        rows.append(makeRow(group, '[Unknown]', pid, 0, 'Total'))
        pid += 1
        # Add totals tally for each domain
        rows.append(makeRow(group, '[Total]', pid, 0, 'Total'))
        pid += 1
    # Add overall totals item
    rows.append(makeRow('[All]', '[Total]', pid, 0, 'Total'))

    phylaDf = pd.DataFrame(rows, columns=columns)
    phylaDf = phylaDf.astype({c: 'int' for c in intColumns})
    phylaDf.set_index('TaxId', inplace=True)

    def basketRowId(domain, phylum):
        # Index (dummy tax-id) of the special row for this domain/phylum pair
        return phylaDf[(phylaDf.Domain == domain)
                       & (phylaDf.Phylum == phylum)].index[0]

    skippedSpecies = []

    # Count the number of species under each phylum
    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue
        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)
        ranks = ncbiTaxa.get_rank(lineage)

        def taxIdsWithRank(wantedRank):
            # Tax-ids in this species' lineage that have the given rank
            return [t for t, rank in ranks.items() if rank == wantedRank]

        # Determine kingdom/domain
        kingdomTaxId = taxIdsWithRank('superkingdom')
        if not kingdomTaxId:
            kingdomTaxId = taxIdsWithRank('kingdom')
        domain = names[kingdomTaxId[0]]
        domainCounts.update([domain])

        # Determine phylum
        phylumTaxIds = taxIdsWithRank('phylum')
        if not phylumTaxIds:
            # This table is structured by phylum; a species missing a phylum
            # is only included in the "species missing phylum" ([Unknown])
            # row of its domain.
            skippedSpecies.append(taxId)
            skippedCounts.update([domain])
            print("Skipping %d: (%s) missing phylum" % (taxId, names[taxId]))
            continue
        phylumTaxId = phylumTaxIds[0]

        # tally this species under the specified phylum
        phylaCounts.update([phylumTaxId])

        # Record the order / family / genus of this species, when present
        for rankName, seenByPhyla in (('order', ordersByPhyla),
                                      ('family', familiesByPhyla),
                                      ('genus', genusesByPhyla)):
            rankTaxIds = taxIdsWithRank(rankName)
            if rankTaxIds:
                seenByPhyla[phylumTaxId].add(rankTaxIds[0])

    assert (sum(skippedCounts.values()) == len(skippedSpecies))

    # Update the phyla counts
    for phylaTaxId, counts in phylaCounts.items():
        phylaDf.loc[phylaTaxId, 'NumOrders'] = len(ordersByPhyla[phylaTaxId])
        phylaDf.loc[phylaTaxId,
                    'NumFamilies'] = len(familiesByPhyla[phylaTaxId])
        phylaDf.loc[phylaTaxId, 'NumGenuses'] = len(genusesByPhyla[phylaTaxId])
        phylaDf.loc[phylaTaxId, 'NumSpecies'] = counts

    # Update the "Unknown phyla" count for each domain
    for group, countMissing in skippedCounts.items():
        phylaDf.loc[basketRowId(group, '[Unknown]'),
                    'NumSpecies'] = countMissing

    # Update the total for each domain
    for group, totalCount in domainCounts.items():
        phylaDf.loc[basketRowId(group, '[Total]'), 'NumSpecies'] = totalCount

    # Update the overall total count
    overallRowId = basketRowId("[All]", '[Total]')
    phylaDf.loc[overallRowId, 'NumSpecies'] = sum(domainCounts.values())
    phylaDf.loc[overallRowId,
                'NumOrders'] = sum(len(x) for x in ordersByPhyla.values())
    phylaDf.loc[overallRowId,
                'NumFamilies'] = sum(len(x) for x in familiesByPhyla.values())
    phylaDf.loc[overallRowId,
                'NumGenuses'] = sum(len(x) for x in genusesByPhyla.values())

    # Prepare and save the final table
    phylaReportDf = phylaDf[phylaDf['NumSpecies'] > 0]  # remove "empty" items
    phylaReportDf = phylaReportDf.sort_values(
        by=['Domain', 'RowType', 'Phylum'])  # sort rows
    print(phylaReportDf)
    phylaReportDf.to_html('phyla_report.html',
                          columns=[
                              'Phylum', 'NumOrders', 'NumFamilies',
                              'NumGenuses', 'NumSpecies', 'Domain'
                          ])
    phylaReportDf.to_excel('phyla_report.xlsx', sheet_name='Phyla Summary')

    with open("phyla_report.rst", "w") as f:
        f.write(
            phylaReportDf.drop([
                'RowType', 'NumFamilies', 'NumGenuses', 'NumOrders',
                'ParentTaxId'
            ],
                               axis=1).pipe(tabulate,
                                            headers='keys',
                                            tablefmt='rst'))

    # Prepare the "Missing phyla" report
    missingPhylaReportDf = phylaDf[phylaDf['NumSpecies'] == 0]
    missingPhylaReportDf = missingPhylaReportDf.sort_values(
        by=['Domain', 'RowType', 'Phylum'])  # sort rows
    missingPhylaReportDf.to_html('phyla_report_missing.html',
                                 columns=['Phylum', 'NumSpecies', 'Domain'])
    missingPhylaReportDf.to_excel('phyla_report_missing.xlsx',
                                  sheet_name='Missing Phyla Summary')

    # print counts
    print(domainCounts)

    # Display "skipped items" warning
    if skippedSpecies:
        print("=" * 50)
        print("Warning: Skipped %d species" % len(skippedSpecies))
        print(skippedCounts)
        print("=" * 50)
    },
    index=pd.Index([], name='tax_id', dtype='int'))

# Add kingdom data to the data-frame: for every tax-id in taxidToKingdom,
# store its kingdom, its full species name and its short name in df.
# NOTE(review): df is created above this excerpt; it appears to be indexed
# by 'tax_id' - confirm.
for k, v in taxidToKingdom.items():
    df.loc[k, 'kingdom'] = v
    df.loc[k, 'full.name'] = getSpeciesName(k)
    df.loc[k, 'short.name'] = shortNames[k]
    # Sanity check: the value read back must equal the value just stored
    assert (df.loc[k, 'kingdom'] == v)

# Get list of large taxonomic groups (based on the lineages of all species)
majorGroups = getMajorTaxonomicGroups(taxidToLineage)

# Add a binary membership column for each major group.
# Only the first element of each majorGroups entry (the group's tax-id) is
# used here.
for groupTaxId, _ in majorGroups:
    # Column name: "Member_<group name>_<group tax-id>", with spaces, slashes
    # and hyphens in the group name replaced by underscores
    groupName = ncbiTaxa.get_taxid_translator([groupTaxId])[groupTaxId]
    groupName = "Member_%s_%d" % (groupName.replace(" ", "_").replace(
        "/", "_").replace("-", "_"), groupTaxId)

    # Single-column frame sharing the index values of df
    groupDf = pd.DataFrame({groupName: pd.Series(dtype='bool')},
                           index=pd.Index(df.index.values,
                                          name='tax_id',
                                          dtype='int'))

    # Store 1 when the group's tax-id appears in the species' lineage,
    # 0 otherwise
    for taxId, lineage in taxidToLineage.items():
        isMember = int(groupTaxId in lineage)
        groupDf.loc[taxId, groupName] = isMember

    # The inner join on the index keeps only the tax-ids present in both frames
    df = pd.merge(df, groupDf, how='inner', left_index=True, right_index=True
                  )  # Add the new column (is there an easier way to do this?)
Example #5
0
def _lookupDomainAndPhylum(taxId):
    """Return the (domain, phylum) names found in the NCBI lineage of taxId.

    The domain is the name of the lineage entry ranked 'superkingdom', or,
    when there is none, the entry ranked 'kingdom'. Each element of the
    returned pair is "" when the lineage has no entry of the wanted rank.
    """
    lineage = ncbiTaxa.get_lineage(taxId)
    names = ncbiTaxa.get_taxid_translator(lineage)
    ranks = ncbiTaxa.get_rank(lineage)

    def firstNameWithRank(wantedRank):
        # Name of the first lineage member having the given rank (or None)
        for memberTaxId, rank in ranks.items():
            if rank == wantedRank:
                return names[memberTaxId]
        return None

    domain = firstNameWithRank('superkingdom')
    if domain is None:
        domain = firstNameWithRank('kingdom')

    phylum = firstNameWithRank('phylum')

    return (domain or "", phylum or "")


def speciesStatisticsAndValidityReport(args):
    """Collect per-species statistics and write the species reports.

    For every species yielded by allSpeciesSource() (except those in
    speciesToExclude, and restricted to args.taxid when it is non-empty) a
    report row is created, and delayed dask tasks are submitted through the
    _distributed scheduler: calcNativeSequencesStatistics once per fraction
    of roughly 1000 CDSs, and countShuffledProfiles once per species. The
    gathered results are folded into the report rows.

    Args:
        args: object with a `taxid` attribute; when truthy it is used as a
            whitelist of tax-ids to process.

    Side effects:
        Writes 'species_report.html', 'species_report_simple.rst',
        'species_report_simple.html' and 'species_report.xlsx' to the
        current directory, and prints progress and error messages.
    """
    import _distributed

    # Empty frame fixing the columns (and dtypes) of the report
    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),  # Species TaxId
        'Species': pd.Series([], dtype='str'),  # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),  # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),  # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),  # CDS count for this species
        # Num seqs with 20 shuffled profiles for this species
        'NumCDSsInProfile': pd.Series([], dtype='int'),
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),
        'CDSDifference': pd.Series([], dtype='float'),
        'NumNativeSeqs': pd.Series([], dtype='int'),
        'GCContentInCDS': pd.Series([], dtype='float'),
        'AnnotatedGCContent': pd.Series([], dtype='float'),
        'RowType': pd.Categorical([]),  # Species count or total
        'Warnings': pd.Series([], dtype='str'),
        'CDSWarnings': pd.Series([], dtype='int'),
        'CDSWarnings_': pd.Series([], dtype='str'),
        'FirstAA': pd.Series([], dtype='str'),
        'LastAA': pd.Series([], dtype='str'),
        # Present in every per-species row below, so declared here as well
        'Source': pd.Series([], dtype='str')
    })

    scheduler = _distributed.open()

    delayedCalls_native = []
    delayedCalls_shuffledProfiles = []

    fractionSize = 1000  # How many sequences (roughly) to process in each task

    # One single-row frame per species; concatenated once after the loop
    # (DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas >= 2.0)
    speciesRows = [speciesDf]

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist
        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        speciesWarnings = []

        cdsCountInRedis = countSpeciesCDS(taxId)
        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]
        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        # Relative difference (in percent) between the annotated protein
        # count and the number of CDSs actually stored
        proteinDifference = None
        if annotatedProteinCount is not None:
            proteinDifference = (1.0 - float(cdsCountInRedis) /
                                 float(annotatedProteinCount)) * 100.0

            if abs(proteinDifference) > 9.9:
                speciesWarnings.append("CDS_count")
        else:
            speciesWarnings.append("No_CDS_count")

        # Determine kingdom/domain and phylum
        domain, phylumName = _lookupDomainAndPhylum(taxId)
        if not domain:
            # Previously this case raised IndexError and aborted the report
            speciesWarnings.append("No_domain")

        speciesRows.append(
            pd.DataFrame({
                'TaxId': pd.Series([taxId], dtype='int'),
                'Species': pd.Series([getSpeciesName(taxId)], dtype='str'),
                'Nickname': pd.Series([shortNames[taxId]], dtype='str'),
                'Domain': pd.Categorical([domain]),
                'Phylum': pd.Categorical([phylumName]),
                'NumCDSs': pd.Series([cdsCountInRedis], dtype='int'),
                # Placeholder; filled in from the countShuffledProfiles task
                'NumCDSsInProfile': pd.Series([0], dtype='int'),
                'AnnotatedNumCDSs': pd.Series([
                    0
                    if annotatedProteinCount is None else annotatedProteinCount
                ],
                                              dtype='int'),
                'CDSDifference': pd.Series([proteinDifference],
                                           dtype='float'),
                # The following placeholders are filled in from the
                # native-sequences statistics tasks
                'NumNativeSeqs': pd.Series([0], dtype='int'),
                'GCContentInCDS': pd.Series([0.0], dtype='float'),
                'AnnotatedGCContent': pd.Series([annotatedGCContent],
                                                dtype='float'),
                'RowType': pd.Categorical(["species"]),
                'Warnings': pd.Series([", ".join(speciesWarnings)],
                                      dtype='str'),
                'CDSWarnings': pd.Series([0], dtype='int'),
                'CDSWarnings_': pd.Series([""], dtype='str'),
                'FirstAA': pd.Series([""], dtype='str'),
                'LastAA': pd.Series([""], dtype='str'),
                'Source': pd.Series([""], dtype='str')
            }))

        # Integer division: range() below needs an int, and species with
        # fewer than fractionSize CDSs must still get one task
        numFractions = max(1, cdsCountInRedis // fractionSize)

        for i in range(numFractions):
            call = dask.delayed(calcNativeSequencesStatistics)(taxId, i,
                                                               numFractions)
            delayedCalls_native.append(call)

        call = dask.delayed(countShuffledProfiles)(taxId,
                                                   (310, 10, "begin", 0), 102,
                                                   11)
        delayedCalls_shuffledProfiles.append(call)

    speciesDf = pd.concat(speciesRows)
    speciesDf.set_index('TaxId', inplace=True)

    print("Starting {} calls...".format(
        len(delayedCalls_native) + len(delayedCalls_shuffledProfiles)))

    futures = scheduler.compute(
        delayedCalls_native + delayedCalls_shuffledProfiles
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # A failure of the progress display is reported but not fatal
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}  # taxId -> totals accumulated over all fractions
    shuffledCounts = {}  # taxId -> number of shuffled sequences

    errorsCount = 0
    for f in futures:
        try:
            ret = scheduler.gather(f)
            if len(ret) == 9:
                # Statistics for one fraction of the native sequences
                (taxId, fraction, cdsCount, gcCounts, totalCounts, cdsWarnings,
                 warningCounts, firstAA, lastAA) = ret

                current = results.get(
                    taxId, (0, 0, 0, 0, Counter(), Counter(), Counter()))

                results[taxId] = (current[0] + cdsCount,
                                  current[1] + gcCounts,
                                  current[2] + totalCounts,
                                  current[3] + cdsWarnings,
                                  current[4] + warningCounts,
                                  current[5] + firstAA, current[6] + lastAA)

            elif len(ret) == 2:
                # Shuffled-profiles count for one species
                (taxId, numShuffledSeqs) = ret
                shuffledCounts[taxId] = numShuffledSeqs

            else:
                raise ValueError(
                    "Unexpected task result with {} elements".format(
                        len(ret)))

        except Exception as e:
            # A failed task must not prevent reporting the remaining results
            print(e)
            errorsCount += 1

    if errorsCount:
        print("Warning: {} tasks failed".format(errorsCount))

    for taxId, result in results.items():
        (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warningCounts,
         firstAA, lastAA) = result
        speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs

        if totalCounts:
            gcContent = round(float(gcCounts) / float(totalCounts) * 100.0, 1)
        else:
            gcContent = float('nan')  # nothing was counted; GC% is undefined
        speciesDf.at[taxId, 'GCContentInCDS'] = gcContent

        speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings

        speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warningCounts)
        speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA)
        speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA)

    for taxId, result in shuffledCounts.items():
        speciesDf.at[taxId, 'NumCDSsInProfile'] = result

    speciesDf = speciesDf.sort_values(by=['Domain', 'Species'])  # sort rows

    # Columns of the "simple" report; the full report adds the diagnostics
    simpleColumns = [
        'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
        'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
        'GCContentInCDS', 'AnnotatedGCContent', 'Phylum', 'Domain'
    ]
    diagnosticColumns = [
        'Warnings', 'CDSWarnings', 'CDSWarnings_', 'FirstAA', 'LastAA'
    ]

    speciesDf.to_html('species_report.html',
                      float_format='{0:.1f}'.format,
                      columns=simpleColumns + diagnosticColumns)

    with open("species_report_simple.rst", "w") as f:
        f.write(
            speciesDf.drop([
                'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                'FirstAA', 'LastAA', 'CDSDifference'
            ],
                           axis=1).pipe(tabulate,
                                        headers='keys',
                                        tablefmt='rst'))

    speciesDf.to_html('species_report_simple.html',
                      float_format='{0:.1f}'.format,
                      columns=simpleColumns)

    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')