def plotProfile(taxId, data):
    """Plot the mean-LFE and GC% profiles of one species and save them to disk.

    Two vertically stacked panels sharing the x axis are drawn: the upper
    (twice as tall) shows the 'native' and 'shuffled' columns of `data`, the
    lower shows its 'gc' column. The figure is written to the current
    working directory as both PDF and SVG, then closed.

    Args:
        taxId: Species identifier, passed to getSpeciesName() and
            getSpeciesFileName() for the title and the file names.
        data: Object indexable by 'native', 'shuffled' and 'gc' whose
            selections expose .plot(ax=...) (presumably a pandas DataFrame
            indexed by window position -- confirm against the caller).

    Note:
        Reads the module-level `args.profile` (ProfileReference, ProfileId).
    """
    panelLayout = dict(height_ratios=[2, 1])
    fig, (lfeAxes, gcAxes) = plt.subplots(2, sharex=True, gridspec_kw=panelLayout)

    # Upper panel: native vs. shuffled profile
    data[['native', 'shuffled']].plot(ax=lfeAxes)

    # The x label goes through the pyplot state machine (current axes)
    xAxisLabel = 'Position (nt, window start, from cds %s)' % args.profile.ProfileReference
    plt.xlabel(xAxisLabel)

    lfeAxes.set_title("Mean LFE for %s" % getSpeciesName(taxId))
    lfeAxes.set_ylabel('Mean LFE')
    lfeAxes.legend()
    lfeAxes.grid(True)

    # Lower panel: GC content
    data['gc'].plot(ax=gcAxes)
    gcAxes.set_title("GC%")
    gcAxes.set_ylabel('GC% (in window)')
    gcAxes.grid(True)

    # ':' is replaced so the profile id can be embedded in a file name
    profileId = str(args.profile.ProfileId).replace(':', '-')
    for fileNameTemplate in ("mfe_40nt_cds_%s_%s.pdf", "mfe_40nt_cds_%s_%s.svg"):
        plt.savefig(fileNameTemplate % (profileId, getSpeciesFileName(taxId)))
    plt.close(fig)
def processGenome(taxId, args, outputPositions=OutputPositions, displayPositions=DisplayPositions):
    """Summarise the native LFE profiles of one species as a one-row DataFrame.

    All profiles yielded by getGeneNativeLFEProfiles(taxId, args) are stacked
    along a new first axis, and the mean and standard deviation are computed
    along that axis with NaN entries left out.

    Args:
        taxId: Species identifier; also used as the index label of the result.
        args: Passed through unchanged to getGeneNativeLFEProfiles().
        outputPositions: Indices into the mean/std arrays to report.
        displayPositions: Labels used in the column names, paired one-to-one
            with `outputPositions`.

    Returns:
        pandas.DataFrame with a single row (index `taxId`) holding 'Species',
        'Domain' and a 'Mean at <p>' / 'Std at <p>' pair per display position.
    """
    def _meanWithoutNaN(column):
        return column[~np.isnan(column)].mean()

    def _stdWithoutNaN(column):
        return column[~np.isnan(column)].std()

    # assumes every yielded profile has the same shape (np.stack requires it)
    allLFEs = np.stack(list(getGeneNativeLFEProfiles(taxId, args)))

    mean = np.apply_along_axis(_meanWithoutNaN, axis=0, arr=allLFEs)
    std = np.apply_along_axis(_stdWithoutNaN, axis=0, arr=allLFEs)

    # (column label, value, dtype) triples, in output order
    vals = [
        ('Species', getSpeciesName(taxId), 'str'),
        ('Domain', getDomainForSpecies(taxId), 'str'),
    ]
    for pos, displayPos in zip(outputPositions, displayPositions):
        vals.extend([
            ('Mean at {}'.format(displayPos), mean[pos], 'float'),
            ('Std at {}'.format(displayPos), std[pos], 'float'),
        ])

    columns = {}
    for label, value, dtype in vals:
        columns[label] = pd.Series([value], index=[taxId], dtype=dtype)

    return pd.DataFrame(columns, index=[taxId])
Exemple #3
0
def outputNodeExistenceInRnafoldDB():
    """Annotate the tree-node identifier table with a per-node inclusion flag.

    Reads `nodeIdentifiersMappingTable_csv`, marks each row with 1 when
    getSpeciesName() returns a name for its TaxId (0 otherwise), drops the
    identifier columns and writes the result (without the index) to
    `nodeIdentifiersMappingTable_with_inclusion_csv`. A summary of how many
    rows were / were not included is printed.

    Returns:
        0 (always).

    Raises:
        KeyError: if one of the columns to drop is missing from the input.
    """
    from data_helpers import getSpeciesName
    treeNodeIdentifiersDf = pd.read_csv(nodeIdentifiersMappingTable_csv, dtype= {'NodeLabel': 'string', 'DBIdentifier': 'string', 'DBIdentifierType': 'category', 'TaxId': 'int32'} )

    # One boolean per row, in row order. Collected in a single pass instead of
    # growing a Series label by label, which reallocates on every row and
    # leaves the resulting dtype to pandas' enlargement rules.
    inclusionFlags = [getSpeciesName(row.TaxId) is not None
                      for row in treeNodeIdentifiersDf.itertuples()]

    values = Counter(inclusionFlags)

    # Built on the frame's own index so the assignment aligns row-for-row.
    treeNodeIdentifiersDf['Included'] = pd.Series(
        [1 if isIncluded else 0 for isIncluded in inclusionFlags],
        index=treeNodeIdentifiersDf.index,
        dtype='int32')

    # 'Unnamed: 0' is the unnamed first column of the input CSV
    # (presumably an index saved by an earlier to_csv -- confirm).
    treeNodeIdentifiersDf = treeNodeIdentifiersDf.drop(
        columns=['DBIdentifier', 'DBIdentifierType', 'TaxId', 'Unnamed: 0'])

    treeNodeIdentifiersDf.to_csv( nodeIdentifiersMappingTable_with_inclusion_csv, index=False )

    print("Output values summary: %s" % values)

    return 0
def taxIdToGenomeId(taxId):
    """Look up the Entrez "genome" database ids matching a species.

    The species name for `taxId` is searched as an exact-phrase organism
    ([ORGN]) term; at most 10 ids are requested. The call sleeps for
    `requestDelaySeconds` before contacting Entrez.

    Args:
        taxId: Species identifier understood by getSpeciesName().

    Returns:
        list of int: the genome ids reported by Entrez (possibly empty).
    """
    speciesName = getSpeciesName(taxId)

    sleep(requestDelaySeconds)
    handle = Entrez.esearch(db="genome",
                            retmax=10,
                            term="\"%s\"[ORGN]" % speciesName)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed
        handle.close()

    # A concrete list (not a lazy map object) so the result can be indexed,
    # measured and iterated more than once on Python 3 as well.
    return [int(genomeId) for genomeId in record['IdList']]
# Establish DB connections
# (both connections are currently disabled and kept only for reference)
#r = redis.StrictRedis(host=config.host, port=config.port, db=config.db)
#session = db.Session()

# Run-wide tallies, all starting from zero
skipped = selected = alreadyCompleted = totalMissingResults = 0

# Starts empty; how it is filled is not visible in this part of the file
queuedDelayedCalls = list()

for taxIdForProcessing in species:
    print("Processing %d sequences for tax-id %d (%s)..." %
          (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
           getSpeciesName(taxIdForProcessing)))

    stats = Counter()

    # Iterate over all CDS entries for this species
    # TODO - preloading all sequences and results should optimize this
    for protId in SpeciesCDSSource(taxIdForProcessing):

        stats['all-sequences'] += 1

        #protId = codecs.decode(protId)
        # Filtering

        # Only process 1/N of the sequences, selected randomly (N=randomFraction)
        # (if randomFraction==1, all sequences will be processed)
        if (randint(1, randomFraction) != 1):
Exemple #6
0
    if (encPropValue[0] is None) or (encPrimePropValue[0] is
                                     None) or overwrite:
        ENc, ENc_prime = calculateENcPrimeForSpecies(taxId)
        assert (
            ENc < 75.0 and ENc > 10.0
        )  # The actual extreme values for ENc are not clear to me, but let's do a sanity check
        assert (ENc_prime < 75.0 and ENc_prime > 10.0)

        setSpeciesProperty(taxId, 'ENc', str(ENc), "ENCprime (custom version)")
        setSpeciesProperty(taxId, 'ENc-prime', str(ENc_prime),
                           "ENCprime (custom version)")

    else:
        return (
            taxId, encPropValue[0], encPrimePropValue[0], False
        )  # return old values (last value indicates this value are old)

    return (taxId, ENc, ENc_prime, True
            )  # return values (last value indicates new values)


if __name__ == "__main__":
    # Command-line check: the first argument is the numeric taxid to inspect.
    import sys
    taxId = int(sys.argv[1])

    speciesName = getSpeciesName(taxId)
    print("Name: {}".format(speciesName))

    # Annotate the species (or fetch the stored values) and show the outcome
    annotationResult = annotateENcPrime(taxId)
    print(annotationResult)

    # Show the calculation result for both settings of `orig`
    for useOrig in (True, False):
        print(calculateENcPrimeForSpecies(taxId, orig=useOrig))
def loadSpeciesMapping():
    """Fill the module-level `speciesMapping` with species-name -> taxid entries.

    Every taxid produced by allSpeciesSource() is stored under the name
    returned by getSpeciesName() for it; a later taxid with the same name
    overwrites an earlier one.
    """
    for speciesTaxId in allSpeciesSource():
        speciesMapping[getSpeciesName(speciesTaxId)] = speciesTaxId
Exemple #8
0
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import sys
from time import sleep
from data_helpers import SpeciesCDSSource, CDSHelper, getSpeciesName, countSpeciesCDS, matchCDSKeyNamesSource, r
from rate_limit import RateLimit

# The species to operate on is given as the first command-line argument
# (a numeric taxid).
taxId = int(sys.argv[1])

# NOTE(review): the meaning of the argument (10) is defined by RateLimit in
# rate_limit.py and is not visible here -- confirm before changing it.
rl = RateLimit(10)

# Nothing stored for this species: report it and exit successfully.
if (countSpeciesCDS(taxId) == 0):
    print("Species %d (%s) doesn't have any proteins..." %
          (taxId, getSpeciesName(taxId)))
    print("Nothing left to do...")
    sys.exit(0)

# Announce what is about to happen, then pause for 10 seconds before the
# code that follows runs, leaving the operator time to interrupt the script.
print("Species %d (%s) has %d proteins stored." %
      (taxId, getSpeciesName(taxId), countSpeciesCDS(taxId)))
print("Will delete it in 10 seconds...")
sleep(10)

# Starts at zero; how it is updated is not visible in this part of the file.
count = 0

for protId in SpeciesCDSSource(taxId):
    print(protId)
    cds = CDSHelper(taxId, protId)
    try:
        cds.dropShuffledSeqs()
Exemple #9
0
def ingestGenome(args):
    """Ingest a new genome: collect its files, register the species, load its CDSs.

    Steps, in order:
      1. Obtain the input files, either from the Ensembl FTP site (when
         args.variant == "Ensembl" and args.fetch_ftp_files is set) or from
         the paths given in args.gff3 / args.cds.
      2. Run processGff3() to produce the list of acceptable coding genes.
      3. Store the species name, name->taxid and translation-table keys.
      4. Load the CDS sequences, preceded by a dry run.

    Args:
        args: Parsed command-line options. Attributes read here: short_name,
            taxid, full_name, nuclear_genetic_code, variant, fetch_ftp_files,
            local_name, remote_name, release, section, subsection, gff3, cds.

    Returns:
        0 on success; -1 when the dry-run load returned None (in which case
        the actual load is not attempted).

    Raises:
        AssertionError: if the genes-list file already exists, or a fetched
            file is missing / not a regular file.
        Exception: if the species already exists with a CDS list, or fewer
            than 400 genes came out of the gff3 file.
    """
    # Sanity test 1 -- the genes list file must not exist yet (if it does,
    # the short name may have been reused)
    genesListFilename = "%s/coding_genes_list.%s.list" % (ensembl_data_dir,
                                                          args.short_name)
    print(genesListFilename)
    assert not os.path.exists(genesListFilename)

    # Sanity test 2 -- the species must not already be present in the DB.
    # A name key without a CDS list is tolerated: an earlier run of this
    # script may have been interrupted.
    if r.exists(speciesNameKey % args.taxid) and r.exists(speciesCDSList % args.taxid):
        raise Exception("Species with taxid=%d already exists as '%s'" %
                        (args.taxid, getSpeciesName(args.taxid)))

    # Step 1 - get the input files (from the Ensembl FTP site, or as supplied)
    fetchFromFtp = args.variant == "Ensembl" and args.fetch_ftp_files
    if not fetchFromFtp:
        gff3fn = args.gff3
        cdsfn = args.cds
    else:
        ftp = EnsemblFTP(args.local_name,
                         args.remote_name,
                         release=args.release,
                         section=args.section,
                         subsection=args.subsection)
        genomefn, cdsfn, gff3fn = ftp.fetchAll()
        ftp.close()

        for fetchedPath in (genomefn, cdsfn, gff3fn):
            assert os.path.exists(fetchedPath) and os.path.isfile(fetchedPath)

    # Step 2 - parse the GFF3 file to yield the list of acceptable CDS genes
    processGff3(gff3fn, genesListFilename, args)

    with open(genesListFilename, "r") as genesListFile:
        numGenesReturnedFromGff3 = sum(1 for _ in genesListFile)

    if numGenesReturnedFromGff3 < 400:
        raise Exception(
            "Processing gff3 file only yielded %d results; aborting" %
            numGenesReturnedFromGff3)
    print("%d protein-coding genes found in gff3" % numGenesReturnedFromGff3)

    # Step 3 - add the required annotations for this species to the redis DB
    # (taxid -> name, name -> taxid, taxid -> genomic translation table)
    r.set(speciesNameKey % args.taxid, args.full_name)
    r.set(speciesTaxIdKey % args.full_name, args.taxid)
    r.set(speciesTranslationTableKey % args.taxid, args.nuclear_genetic_code)

    # Step 4 - load CDS sequences to the DB, after a trial run
    print("Doing trial run for gene loading...")
    trialOutcome = loadCDSSequences(cdsfn, genesListFilename, args, dryRun=True)
    if trialOutcome is None:
        print(
            "Dry-run loading may have encountered errors; aborting without actual load..."
        )
        return -1

    print("Trial run succeeded.")
    print("Performing actual gene loading...")
    loadOutcome = loadCDSSequences(cdsfn, genesListFilename, args, dryRun=False)
    cdsLoadedCount, cdsIds, skippedGenes = loadOutcome
    print("Loaded %d CDS genes..." % cdsLoadedCount)
    if skippedGenes:
        print("Skipped genes: %s" % skippedGenes)

    # Step 5 - Generate randomized sequences
    # TODO

    return 0
        genomeSizeMb = float(genomeSizeMb)

    growthTimeHours = getSpeciesProperty(taxId, 'growth-time-hours-v2')[0]
    if not growthTimeHours is None:
        growthTimeHours = float(growthTimeHours)

    inPhyloTree = taxId in speciesInTree
    if inPhyloTree:
        stats.update(['tree'])

    speciesDf = speciesDf.append(
        pd.DataFrame({
            'TaxId':
            pd.Series([taxId], dtype='int'),
            'Species':
            pd.Series([getSpeciesName(taxId)], dtype='str'),
            'Nickname':
            pd.Series([shortNames[taxId]], dtype='str'),
            'Source':
            pd.Series([''], dtype='str'),
            'TranslationTbl':
            pd.Series([getSpeciesTranslationTable(taxId)], dtype='int'),
            'InPhyloTree':
            pd.Series([inPhyloTree], dtype='bool'),
            'GenomicGC%':
            pd.Series([genomicGC], dtype='float'),
            'GenomicENc\'':
            pd.Series([genomicENcprime], dtype='float'),
            'GrowthTempC':
            pd.Series([optimumTemp], dtype='float'),
            'GenomeSizeMb':
Exemple #11
0
def readSeriesResultsForSpecies(seriesSourceNumber,
                                species,
                                minShuffledGroups=20,
                                maxShuffledGroups=20,
                                shuffleType=db.Sources.ShuffleCDSv2_python,
                                cdsFilter=None,
                                returnCDS=True):
    """Yield the decoded series results of every sufficiently-computed CDS.

    This is a generator: argument validation and all work happen lazily,
    starting at the first iteration.

    Args:
        seriesSourceNumber: Identifies the computed series; passed to
            getAllComputedSeqsForSpecies().
        species: A single taxid, or a non-string iterable of taxids.
        minShuffledGroups: A CDS is skipped unless at least this many of its
            shuffled sequences have computed results.
        maxShuffledGroups: At most this many shuffled sequences are returned
            per CDS (in addition to the native sequence).
        shuffleType: Which family of shuffled sequences to use.
        cdsFilter: Optional predicate taking a CDSHelper; CDSs for which it
            returns a false value are ignored (and not counted as skipped).
        returnCDS: When true, each yielded dict also carries the CDSHelper
            under "cds".

    Yields:
        dict with "taxid" and "content" (and "cds" when `returnCDS`).
        "content" is a list with the decoded native record first, followed by
        the shuffled ones; entries without a computed result are None.

    Raises:
        Exception: if `species` is a string.
        AssertionError: if minShuffledGroups > maxShuffledGroups.
    """
    if isinstance(species, Iterable):
        # usually, species will be a sequence of numeric taxid values
        if isinstance(species, basestring):
            raise Exception("species cannot be string")
    else:
        species = (species, )  # assume we got a single (numeric) taxid value
    assert (minShuffledGroups <= maxShuffledGroups)

    for taxIdForProcessing in species:
        print("Procesing %d sequences for tax-id %d (%s)..." %
              (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
               getSpeciesName(taxIdForProcessing)))

        computed = getAllComputedSeqsForSpecies(seriesSourceNumber,
                                                taxIdForProcessing,
                                                maxShuffledGroups,
                                                shuffleType=shuffleType)
        computedIds = frozenset(computed.keys())
        print("Collecting data from %d computation results..." % len(computed))

        skipped = 0
        selected = 0

        # Iterate over all CDS entries for this species
        for protId in SpeciesCDSSource(taxIdForProcessing):
            cds = CDSHelper(taxIdForProcessing, protId)

            if cdsFilter is not None and not cdsFilter(cds):
                continue

            # Fetched once per CDS and reused below for both the coverage
            # check and the result lookup.
            cdsSeqId = cds.seqId()
            shuffledIds = list(cds.shuffledSeqIds(shuffleType=shuffleType))

            # How many shuffles (for this cds) exist in the data we found?
            computedShufflesCount = len(computedIds.intersection(shuffledIds))

            if (computedShufflesCount < minShuffledGroups
                    or cdsSeqId not in computedIds):
                skipped += 1
                continue

            # Native sequence first, then the shuffled ones, capped at
            # maxShuffledGroups shuffles.
            seqIds = ([cdsSeqId] + shuffledIds)[:maxShuffledGroups + 1]
            results = [computed.get(seqId) for seqId in seqIds]

            # The cap above may have cut off some of the computed shuffles,
            # so re-check the selection that will actually be returned.
            # NOTE(review): this count includes the native record -- confirm
            # that comparing it against minShuffledGroups is intended.
            availableCount = sum(1 for record in results if record is not None)
            if availableCount < minShuffledGroups:
                print("Not enough results found for %s" % protId)
                skipped += 1
                continue

            # Decode the results (missing entries stay None)
            results = [
                None if record is None else decodeJsonSeriesRecord(
                    decompressSeriesRecord(record)) for record in results
            ]

            if returnCDS:
                yield {
                    "taxid": taxIdForProcessing,
                    "content": results,
                    "cds": cds
                }
            else:
                yield {"taxid": taxIdForProcessing, "content": results}

            # Drop our references once the consumer resumes us, so the
            # generator does not keep the yielded data alive.
            del results
            del cds
            selected += 1

            if rl():
                print("# %s - %d records included, %d records skipped" %
                      (datetime.now().isoformat(), selected, skipped))
# Lookup used below as shortNames[taxid] to obtain each species' short name.
shortNames = getSpeciesShortestUniqueNamesMapping()

# Create an empty data-frame for csv output: three string columns, with an
# (initially empty) integer index named 'tax_id'.
df = pd.DataFrame(
    {
        'kingdom': pd.Series(dtype="string"),
        'full.name': pd.Series(dtype="string"),
        'short.name': pd.Series(dtype="string")
    },
    index=pd.Index([], name='tax_id', dtype='int'))

# Add kingdom data to the data-frame: one row per entry of taxidToKingdom.
# Assigning through .loc with a key that is not in the index yet creates the
# row; the remaining two assignments then fill in the same row.
for k, v in taxidToKingdom.items():
    df.loc[k, 'kingdom'] = v
    df.loc[k, 'full.name'] = getSpeciesName(k)
    df.loc[k, 'short.name'] = shortNames[k]
    # Read the value back to verify that the row was stored as intended
    assert (df.loc[k, 'kingdom'] == v)

# Get list of large taxonomic groups (based on the lineages of all species)
majorGroups = getMajorTaxonomicGroups(taxidToLineage)

# Add a binary membership column for each major group
for groupTaxId, _ in majorGroups:
    groupName = ncbiTaxa.get_taxid_translator([groupTaxId])[groupTaxId]
    groupName = "Member_%s_%d" % (groupName.replace(" ", "_").replace(
        "/", "_").replace("-", "_"), groupTaxId)

    groupDf = pd.DataFrame({groupName: pd.Series(dtype='bool')},
                           index=pd.Index(df.index.values,
                                          name='tax_id',
Exemple #13
0
def speciesStatisticsAndValidityReport(args):
    """Build the per-species statistics / validity report and export it.

    For every species from allSpeciesSource() (minus `speciesToExclude`, and
    restricted to `args.taxid` when that is non-empty) a row is created from
    stored annotations and NCBI taxonomy. Per-sequence statistics and the
    shuffled-profile counts are then computed as distributed tasks and merged
    into the table, which is written to:

        species_report.html, species_report_simple.rst,
        species_report_simple.html, species_report.xlsx

    Args:
        args: Parsed options; only `args.taxid` (optional whitelist of
            taxids) is read here.

    Note:
        Failures of individual tasks are printed and counted, not raised.
    """
    import _distributed

    # Empty frame that fixes the column set and dtypes of the report
    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),  # Species TaxId
        'Species': pd.Series([], dtype='str'),  # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),  # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),  # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),  # CDS count for this species
        'NumCDSsInProfile':
        pd.Series([], dtype='int'
                  ),  # Num seqs with 20 shuffled profiles for this species
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),  #
        'CDSDifference': pd.Series([], dtype='float'),  #
        'NumNativeSeqs': pd.Series([], dtype='int'),  #
        'GCContentInCDS': pd.Series([], dtype='float'),  #
        'AnnotatedGCContent': pd.Series([], dtype='float'),  #
        'RowType': pd.Categorical([]),  # Species count or total
        'Warnings': pd.Series([], dtype='str'),  #
        'CDSWarnings': pd.Series([], dtype='int'),  #
        'CDSWarnings_': pd.Series([], dtype='str'),  #
        'FirstAA': pd.Series([], dtype='str'),  #
        'LastAA': pd.Series([], dtype='str')  #
    })

    scheduler = _distributed.open()

    # One single-row frame per species; concatenated once after the loop
    # (DataFrame.append copied the whole table on every call and no longer
    # exists in pandas 2.x).
    speciesRows = []

    delayedCalls_native = []

    shuffledCounts = {}
    delayedCalls_shuffledProfiles = []

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist
        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        warnings = []

        cdsCountInRedis = countSpeciesCDS(taxId)

        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]

        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        # Relative difference (in percent) between stored and annotated counts
        proteinDifference = None
        if annotatedProteinCount is not None:
            proteinDifference = (1.0 - float(cdsCountInRedis) /
                                 float(annotatedProteinCount)) * 100.0

            if abs(proteinDifference) > 9.9:
                warnings.append("CDS_count")
        else:
            warnings.append("No_CDS_count")

        # Taxonomy lookups for this species
        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)
        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain ('superkingdom' rank, else 'kingdom')
        kingdomTaxId = [
            t for t, rank in ranks.items() if rank == 'superkingdom'
        ]
        if not kingdomTaxId:
            kingdomTaxId = [
                t for t, rank in ranks.items() if rank == 'kingdom'
            ]
        domain = names[kingdomTaxId[0]]

        # Determine phylum (left empty when the lineage has none)
        phylumName = ""
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if phylumTaxId:
            phylumName = names[phylumTaxId[0]]

        # Columns that are computed by the distributed tasks start out with
        # placeholder values (0 / 0.0 / "") and are filled in further down.
        speciesRows.append(
            pd.DataFrame({
                'TaxId':
                pd.Series([taxId], dtype='int'),  # Species TaxId
                'Species':
                pd.Series([getSpeciesName(taxId)], dtype='str'),
                'Nickname':
                pd.Series([shortNames[taxId]], dtype='str'),
                'Domain':
                pd.Categorical([domain]),  # Bacteria, Eukaryota, Archaea
                'Phylum':
                pd.Categorical([phylumName]),  # Phylum name (string)
                'NumCDSs':
                pd.Series([cdsCountInRedis],
                          dtype='int'),  # CDS count for this species
                'NumCDSsInProfile':
                pd.Series([0],
                          dtype='int'),  # Num seqs with 20 shuffled profiles
                'AnnotatedNumCDSs':
                pd.Series([
                    0
                    if annotatedProteinCount is None else annotatedProteinCount
                ],
                          dtype='int'),  #
                'CDSDifference':
                pd.Series([proteinDifference], dtype='float'),  #
                'NumNativeSeqs':
                pd.Series([0], dtype='int'),  #
                'GCContentInCDS':
                pd.Series([0.0], dtype='float'),  #
                'AnnotatedGCContent':
                pd.Series([annotatedGCContent], dtype='float'),  #
                'RowType':
                pd.Categorical(["species"]),  # Species count or total
                'Warnings':
                pd.Series([", ".join(warnings)], dtype='str'),  #
                'CDSWarnings':
                pd.Series([0], dtype='int'),
                'CDSWarnings_':
                pd.Series([""], dtype='str'),
                'FirstAA':
                pd.Series([""], dtype='str'),
                'LastAA':
                pd.Series([""], dtype='str'),
                'Source':
                pd.Series([""], dtype='str')
            }))

        fractionSize = 1000  # How many sequences (roughly) to process in each task
        # Floor division: the result is fed to range(), which needs an int
        numFractions = cdsCountInRedis // fractionSize
        if numFractions == 0:
            numFractions = 1

        for i in range(numFractions):
            call = dask.delayed(calcNativeSequencesStatistics)(taxId, i,
                                                               numFractions)
            delayedCalls_native.append(call)

        call = dask.delayed(countShuffledProfiles)(taxId,
                                                   (310, 10, "begin", 0), 102,
                                                   11)
        delayedCalls_shuffledProfiles.append(call)

    speciesDf = pd.concat([speciesDf] + speciesRows)
    speciesDf.set_index('TaxId', inplace=True)

    print("Starting {} calls...".format(
        len(delayedCalls_native) + len(delayedCalls_shuffledProfiles)))

    futures = scheduler.compute(
        delayedCalls_native + delayedCalls_shuffledProfiles
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress display is best-effort; completion is awaited below anyway
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    # taxId -> (numNativeSeqs, gcCounts, totalCounts, cdsWarnings,
    #           warnings Counter, firstAA Counter, lastAA Counter),
    # summed over all fractions of that species
    results = {}

    errorsCount = 0
    for f in futures:
        try:
            ret = scheduler.gather(f)
            # The two task kinds are told apart by the length of their result
            if (len(ret) == 9):
                (taxId, fraction, cdsCount, gcCounts, totalCounts, cdsWarnings,
                 warnings, firstAA, lastAA) = ret

                current = results.get(
                    taxId, (0, 0, 0, 0, Counter(), Counter(), Counter()))

                results[taxId] = (current[0] + cdsCount,
                                  current[1] + gcCounts,
                                  current[2] + totalCounts,
                                  current[3] + cdsWarnings,
                                  current[4] + warnings,
                                  current[5] + firstAA,
                                  current[6] + lastAA)

            elif (len(ret) == 2):
                (taxId, numShuffledSeqs) = ret
                shuffledCounts[taxId] = numShuffledSeqs

            else:
                raise ValueError(
                    "Unexpected task result with {} items".format(len(ret)))

        except Exception as e:
            # A failed task must not abort the whole report
            print(e)
            errorsCount += 1

    if errorsCount:
        print("{} task(s) failed; their results are missing from the report".
              format(errorsCount))

    for taxId, result in results.items():
        (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warnings, firstAA,
         lastAA) = result
        speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs

        speciesDf.at[taxId, 'GCContentInCDS'] = round(
            float(gcCounts) / float(totalCounts) * 100.0, 1)

        speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings

        speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warnings)
        speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA)
        speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA)

    for taxId, result in shuffledCounts.items():
        speciesDf.at[taxId, 'NumCDSsInProfile'] = result

    speciesDf = speciesDf.sort_values(by=['Domain', 'Species'])  # sort rows
    speciesDf.to_html('species_report.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                          'FirstAA', 'LastAA'
                      ])

    with open("species_report_simple.rst", "w") as f:
        f.write(
            speciesDf.drop([
                'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                'FirstAA', 'LastAA', 'CDSDifference'
            ],
                           axis=1).pipe(tabulate,
                                        headers='keys',
                                        tablefmt='rst'))

    speciesDf.to_html('species_report_simple.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain'
                      ])

    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')