Example #1
def wordCounter(db):
    """Count how often each word occurs across all genes in a database.

    Args:
        db: Dictionary whose values each expose a ``words`` iterable
            (the words associated with that gene).

    Returns:
        A list of ``WordFreq`` objects, one per distinct word, ordered from
        the most to the least frequent word. Words with the same frequency
        are ordered alphabetically so the result is deterministic.
    """
    from collections import Counter

    from genewordsearch.Classes import WordFreq

    # Tally with a Counter so that repeated words are counted correctly
    # regardless of the order in which the genes list them.
    counts = Counter()
    for gene in db.values():
        counts.update(gene.words)

    # Highest frequency first; alphabetical order breaks ties.
    ranked = sorted(counts.items(), key=lambda pair: (-pair[1], pair[0]))
    return [WordFreq(word, freq) for word, freq in ranked]
Example #2
def bookkeeper(species, geneDB, countList):
	# Writes the gene database and the total word counts of a species to files
	# inside this package's 'databases/<species>/' resource folder.
	#   species   - str naming the species; lowercased to form the folder name
	#   geneDB    - dictionary whose values expose a .gene attribute
	#   countList - list of objects with .word and .freq attributes; it is
	#               modified in place: a 'Total Count' WordFreq is inserted
	#               at index 0
	# NOTE(review): this listing looks truncated. The body of the `if` inside
	# the gene loop and the pickle dumps announced by the comments are not
	# present, so the block does not parse as shown. Neither opened file is
	# closed in the visible code. Confirm against the original source.
	import os
	import pickle
	import pkg_resources
	from genewordsearch.Classes import WordFreq
	from genewordsearch.Classes import GeneNote
	# Find the total word count, add it to the list
	total = 0
	for word in countList:
		total += word.freq
	# The grand total is stored as a pseudo-word at the front of the list
	countList.insert(0,WordFreq('Total Count',total))
	# Determine outfile locations
	dbFolder = 'databases/' + species.lower() + '/'
	# Resolve the folder relative to this package and create it if it is missing
	os.makedirs(pkg_resources.resource_filename(__name__, dbFolder), exist_ok=True)
	folder = pkg_resources.resource_filename(__name__, dbFolder)
	# --------------Save the gene database files-------------------
	# Make a text version for posterity (and error checking)
	printList = list(geneDB.values())
	# newline='' turns off newline translation when writing the file
	geneFile = open(folder+'geneNotes.tsv','w',newline='')
	for gene in printList:
		# Entries with an empty gene id are skipped
		# NOTE(review): the statement that writes the gene is missing here
		if not(gene.gene == ''):
	# Pickle that stuff! (for geneWordSearch function)
	# NOTE(review): no pickle call for geneDB is visible in this listing
	# ---------------Save the total word count files----------------
	# Make a text version for posterity (and error checking)
	countFile = open(folder+'totalWordCounts.tsv','w')
	# One line per entry: frequency, a tab, then the word
	for word in countList:
		countFile.write(str(word.freq) + '\t' + str(word.word) + '\n')
	# Pickle a dictionary of that stuff! (for geneWordSearch function)
	# Maps each word (including 'Total Count') to its frequency
	countDB = dict()
	for word in countList:
		countDB[word.word] = word.freq
	# NOTE(review): the pickle call for countDB is not visible in this listing
Example #3
def bookkeeper(species, geneDB, countList):
    # Internal Method
    # Make all of the necessary files
    #   species   - passed to getPath() to obtain the output folder
    #   geneDB    - dictionary whose values expose a .gene attribute; it is
    #               pickled as a whole to 'geneNotes.p'
    #   countList - list of objects with .word and .freq attributes; it is
    #               modified in place: a 'Total Count' WordFreq is inserted
    #               at index 0
    # Relies on os, pickle, WordFreq and getPath being available at module
    # level (not shown in this listing).
    # NOTE(review): this listing looks truncated. The body of the `if` inside
    # the gene loop is missing and the final pickle.dump call is cut off, so
    # the block does not parse as shown. None of the opened files is closed
    # explicitly in the visible code. Confirm against the original source.

    # Find the total word count, add it to the list
    total = 0
    for word in countList:
        total += word.freq
    # The grand total is stored as a pseudo-word at the front of the list
    countList.insert(0, WordFreq('Total Count', total))

    # Determine outfile locations
    # presumably getPath returns the database folder of the species - verify
    dbFolder = getPath(species)
    os.makedirs(dbFolder, exist_ok=True)

    # --------------Save the gene database files-------------------

    # Make a text version for posterity (and error checking)
    printList = list(geneDB.values())
    # newline='' turns off newline translation when writing the file
    geneFile = open(os.path.join(dbFolder, 'geneNotes.tsv'), 'w', newline='')
    for gene in printList:
        # Entries with an empty gene id are skipped
        # NOTE(review): the statement that writes the gene is missing here
        if not (gene.gene == ''):

    # Pickle that stuff! (for geneWordSearch function)
    pickle.dump(geneDB, open(os.path.join(dbFolder, 'geneNotes.p'), 'wb'))

    # ---------------Save the total word count files----------------

    # Make a text version for posterity (and error checking)
    countFile = open(os.path.join(dbFolder, 'totalWordCounts.tsv'), 'w')
    # One line per entry: frequency, a tab, then the word
    for word in countList:
        countFile.write(str(word.freq) + '\t' + str(word.word) + '\n')

    # Pickle a dictionary of that stuff! (for geneWordSearch function)
    # Maps each word (including 'Total Count') to its frequency
    countDB = dict()
    for word in countList:
        countDB[word.word] = word.freq
    # NOTE(review): the call below is cut off in this listing; the file mode
    # argument and the closing parentheses are missing
    pickle.dump(countDB, open(os.path.join(dbFolder, 'totalWordCounts.p'),

def geneWordSearch(genes, species, minChance=0.05, corrected=False):
    # Looks up the words stored for a set of genes and scores them using the
    # P values computed by the WordFreq objects.
    # Input: Takes in a list of genes, the species, and the probability cutoff.
    #   genes     - iterable of gene id strings; each is lowercased for lookup
    #   species   - str appended to 'databases/' to locate the pickled files
    #   minChance - largest value of a word's p attribute that is still returned
    #   corrected - if true, the results are ordered by pCor instead of p
    # Output: Returns tuple of words and links. Only returns the words that have a
    #         chance probability of at most the minChance variable.
    # NOTE(review): this listing looks truncated in several places (marked
    # below) and does not parse as shown. Confirm against the original source.
    import re
    import pickle
    import pkg_resources
    from genewordsearch.Classes import WordFreq
    from genewordsearch.Classes import GeneNote

    # Unpickle the database of words
    dbFolder = 'databases/' + species
    if pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
        # NOTE(review): the open() call below is missing part of its argument
        # list, and the raise that follows presumably belonged to an `else`
        # branch that is not visible; its closing parenthesis is missing too
        dbfile = open(
                                            dbFolder + '/geneNotes.p'), 'rb')
        raise ValueError(
            'There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.'
    # NOTE(review): pickle.load must only be used on trusted files
    db = pickle.load(dbfile)

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    links = WordFreq('Web Links', 0)
    for item in genes:
        # Make the input all lowercase to match the database
        gene = item.lower()
        i = 1

        # Get the object from the DB
        # A gene id that is not in the database raises KeyError here
        geneData = db[gene]

        # Adding words related to the gene in db to the overall list
        # Each entry is a [word, gene id] pair
        for word in geneData.words:
            words.append([word, geneData.gene])

        # Dealing with the websites
        # NOTE(review): the body of this loop is missing in this listing, so
        # webSites and links are never updated in the visible code
        for link in geneData.links:

    # Sort to put words in alphabetical order for counting
    # NOTE(review): no sort statement is visible in this listing

    # Adding the web link counts to the list
    wordList = []

    # Counting the words
    # A new WordFreq is pushed to the front whenever the word differs from the
    # one at the front of the list
    # NOTE(review): the branch that counts a repeated word is not visible
    for item in words:
        if (wordList == [] or wordList[0].word != item[0]):
            wordList.insert(0, WordFreq(item[0], 1))
    del words

    # Getting rid of words that don't happen in enough genes to matter
    wordListRaw = wordList[:]
    wordList = []
    # length accumulates the frequencies of the words that reach the cutoff of 3
    length = 0
    for word in wordListRaw:
        # NOTE(review): the statement that puts the word back into wordList is
        # not visible, so wordList stays empty in the code as shown
        if (word.freq >= 3):
            length += word.freq
    del wordListRaw

    # Finding the respective P values
    # NOTE(review): the open() call below is missing part of its argument list
    pickleDict = dbfile = open(
                                        dbFolder + '/totalWordCounts.p'), 'rb')
    wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)
    del wordCounts

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Finding corrected P Values using Holm–Bonferroni method
    # Each word is given the number of words and its 1-based rank by p
    count = len(wordList)
    for i in range(0, count):
        wordList[i].pCorrect(count, (i + 1))

    # Sort by corrected P Value instead of original P value if desired
    if (corrected):
        wordList = sorted(wordList, key=lambda item: item.pCor)

    # Filtering out results that are higher than the minimum chance threshold
    # NOTE(review): the filter uses the uncorrected p even when corrected is true
    wordList = filter(lambda x: x.p <= minChance, wordList)

    return (list(wordList), list(webSites))
def geneWordSearch(genes,
    # NOTE(review): the signature is cut off in this listing; the parameters
    # described below (species, minChance, minWordFreq, corrected) are used in
    # the body but their declarations and defaults are not visible. Several
    # other statements are missing as well (marked below), so the block does
    # not parse as shown. Confirm against the original source.
    # Does the analysis work of looking at the genes and doing the statistics
    #   genes - list of strings of the gene ids in the set to be analysed
    #   species - str of the species these genes belong to
    #   minChance - the maximum probability that is acceptable for the word to be included in the results
    #   minWordFreq - the minimum number of genes in the set the word must appear in to be counted
    #   corrected - boolean saying whether the results should be cut off using the corrected p value or the
    #               original p; if true, results are more reliable, but less numerous
    # Returns a tuple: the list of words that pass the cutoff and the list of web sites.
    # Relies on os, pickle, WordFreq and getPath being available at module
    # level (not shown in this listing).

    # Unpickle the database of words
    # presumably getPath returns the database folder of the species - verify
    dbFolder = getPath(species)
        # NOTE(review): the condition guarding this open() and the `else` that
        # presumably guarded the raise are not visible; the raise is also
        # missing its closing parenthesis
        dbfile = open(os.path.join(dbFolder, 'geneNotes.p'), 'rb')
        raise ValueError(
            'There is no database associated with ' + species +
            ', please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.'
    # NOTE(review): pickle.load must only be used on trusted files
    db = pickle.load(dbfile)

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    badGenes = []
    links = WordFreq('Web Links', 0)
    for item in genes:
        # Make the input all lowercase to match the database
        gene = item.lower()
        i = 1

        # Get the object from the DB, skip term if it is not there
        # NOTE(review): the `try:` line and the body of the except handler
        # are missing in this listing; badGenes is never filled in the
        # visible code
            geneData = db[gene]
        except KeyError:

        # Adding words related to the gene in db to the overall list
        # Each entry is a [word, gene id] pair
        for word in geneData.words:
            words.append([word, geneData.gene])

        # Dealing with the websites
        # NOTE(review): the body of this loop is missing in this listing, so
        # webSites and links are never updated in the visible code
        for link in geneData.links:

    # Sort to put words in alphabetical order for counting
    # NOTE(review): no sort statement is visible in this listing

    # Adding the web link counts to the list
    wordList = []

    # Counting the words
    # A new WordFreq is pushed to the front whenever the word differs from the
    # one at the front of the list
    # NOTE(review): the branch that counts a repeated word is not visible
    for item in words:
        if (wordList == [] or wordList[0].word != item[0]):
            wordList.insert(0, WordFreq(item[0], 1))
    del words

    # Getting rid of words that don't happen in enough genes to matter
    wordListRaw = wordList[:]
    wordList = []
    # length accumulates the frequencies of the words that reach minWordFreq
    length = 0
    for word in wordListRaw:
        # NOTE(review): the statement that puts the word back into wordList is
        # not visible, so wordList stays empty in the code as shown
        if (word.freq >= minWordFreq):
            length += word.freq
    del wordListRaw

    # Finding the respective P values
    # NOTE(review): the open() call below is cut off; the file mode argument
    # and the closing parenthesis are missing
    pickleDict = dbfile = open(os.path.join(dbFolder, 'totalWordCounts.p'),
    wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)
    del wordCounts

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Finding corrected P Values using Holm–Bonferroni method
    # Each word is given the number of words and its 1-based rank by p
    count = len(wordList)
    for i in range(0, count):
        wordList[i].pCorrect(count, (i + 1))

    # Sort by corrected P Value instead of original P value if desired
    if (corrected):
        wordList = sorted(wordList, key=lambda item: item.pCor)

    # Filtering out results that are higher than the minimum chance threshold
    # NOTE(review): the filter uses the uncorrected p even when corrected is
    # true, which does not match the description of `corrected` above
    wordList = filter(lambda x: x.p <= minChance, wordList)

    return (list(wordList), list(webSites))