Code Example #1
def geneWordSearch(genes, species, minChance=0.05, corrected=False):
    # Input: a list of gene IDs, the species, and the probability cutoff.
    # Output: a tuple of (words, links). Only words whose chance probability is
    #         below minChance are returned.
    import re
    import pickle
    import pkg_resources
    from genewordsearch.Classes import WordFreq
    from genewordsearch.Classes import GeneNote

    # Unpickle the database of words
    dbFolder = 'databases/' + species
    if pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
        dbfile = open(
            pkg_resources.resource_filename(__name__,
                                            dbFolder + '/geneNotes.p'), 'rb')
    else:
        raise ValueError(
            'There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.'
        )
    db = pickle.load(dbfile)
    dbfile.close()

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    links = WordFreq('Web Links', 0)
    for item in genes:
        # Make the input all lowercase to match the database
        gene = item.lower()

        # Get the object from the DB
        geneData = db[gene]

        # Adding words related to the gene in db to the overall list
        for word in geneData.words:
            words.append([word, geneData.gene])

        # Dealing with the websites
        for link in geneData.links:
            links.addGene(geneData.gene)
            webSites.append(link)

    # Sort to put words in alphabetical order for counting
    words.sort()

    # List that will collect the per-word counts
    wordList = []

    # Counting the words
    for item in words:
        if (wordList == [] or wordList[0].word != item[0]):
            wordList.insert(0, WordFreq(item[0], 1))
            wordList[0].addGene(item[1])
        else:
            wordList[0].increment()
            wordList[0].addGene(item[1])
    del words

    # Getting rid of words that don't happen in enough genes to matter
    wordListRaw = wordList[:]
    wordList = []
    length = 0
    for word in wordListRaw:
        if (word.freq >= 3):
            wordList.append(word)
            length += word.freq
    del wordListRaw

    # Finding the respective P values
    pickleDict = open(
        pkg_resources.resource_filename(__name__,
                                        dbFolder + '/totalWordCounts.p'), 'rb')
    wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)
    pickleDict.close()
    del wordCounts

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Finding corrected P Values using Holm–Bonferroni method
    count = len(wordList)
    for i in range(0, count):
        wordList[i].pCorrect(count, (i + 1))

    # Sort by corrected P Value instead of original P value if desired
    if (corrected):
        wordList = sorted(wordList, key=lambda item: item.pCor)

    # Filtering out results that are higher than the minimum chance threshold
    wordList = filter(lambda x: x.p <= minChance, wordList)

    return (list(wordList), list(webSites))
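
A minimal usage sketch (not part of the original listing): the gene IDs are placeholders and the call assumes the packaged 'ath' database referenced above is installed. It relies only on the word, freq, p, and pCor attributes that the function itself uses on each WordFreq result.

# Hypothetical call; the gene IDs below are placeholders, not real loci.
enriched, urls = geneWordSearch(['gene1', 'gene2', 'gene3'], 'ath', minChance=0.01)
for w in enriched:
    print(w.word, w.freq, w.p, w.pCor)
for url in urls:
    print(url)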
Code Example #2
File: GeneWordSearch.py  Project: mrG7/geneWordSearch
def geneWordSearch(genes,species,minChance=0.05,minWordFreq=3,corrected=False):
# Does the analysis work: looks up the genes and runs the statistics.
#	genes - list of gene ID strings in the set to be analysed
#	species - name of the species these genes belong to
#	minChance - p value cutoff; only words with a chance probability at or below this are returned
#	minWordFreq - minimum number of genes in the set a word must appear in to be counted
#	corrected - if True, the cutoff uses the Holm-Bonferroni corrected p value instead of the
#	            original p; results are more reliable but less numerous

	# Imports used by this excerpt (module-level in the original file)
	import pickle
	import pkg_resources
	from genewordsearch.Classes import WordFreq

	# Unpickle the database of words
	dbFolder = 'databases/'+ species
	if pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
		dbfile = open(pkg_resources.resource_filename(__name__, dbFolder + '/geneNotes.p'),'rb')
	else:
		raise ValueError('There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.')
	db = pickle.load(dbfile)
	dbfile.close()

	# Build the word list up for all of the genes provided.
	words = []
	webSites = []
	badGenes = []
	links = WordFreq('Web Links',0)
	for item in genes:
		# Make the input all lowercase to match the database
		gene = item.lower()

		# Get the object from the DB, skip term if it is not there
		try:
			geneData = db[gene]
		except KeyError:
			badGenes.append(gene)
			continue

		# Adding words related to the gene in db to the overall list
		for word in geneData.words:
			words.append([word,geneData.gene])

		# Dealing with the websites
		for link in geneData.links:
			links.addGene(geneData.gene)
			webSites.append(link)

	# Sort to put words in alphabetical order for counting
	words.sort()

	# List that will collect the per-word counts
	wordList = []

	# Counting the words
	for item in words:
		if(wordList == [] or wordList[0].word != item[0]):
			wordList.insert(0, WordFreq(item[0],1))
			wordList[0].addGene(item[1])
		else:
			wordList[0].increment()
			wordList[0].addGene(item[1])
	del words

	# Getting rid of words that don't happen in enough genes to matter
	wordListRaw = wordList[:]
	wordList = []
	length = 0
	for word in wordListRaw:
		if(word.freq >= minWordFreq):
			wordList.append(word)
			length += word.freq
	del wordListRaw

	# Finding the respective P values
	pickleDict = open(pkg_resources.resource_filename(__name__, dbFolder + '/totalWordCounts.p'),'rb')
	wordCounts = pickle.load(pickleDict)
	totalWords = wordCounts['Total Count']
	for word in wordList:
		word.computeP(wordCounts,length,totalWords)
	pickleDict.close()
	del wordCounts

	# Sorting now by P Value instead of alphabetical
	wordList = sorted(wordList, key=lambda item: item.p)

	# Finding corrected P Values using Holm–Bonferroni method
	count = len(wordList)
	for i in range(0,count):
		wordList[i].pCorrect(count,(i+1))

	# Sort by corrected P Value instead of original P value if desired
	if(corrected):
		wordList = sorted(wordList, key=lambda item: item.pCor)

	# Filter out results above the chance cutoff; use the corrected p value when requested
	wordList = filter(lambda x: (x.pCor if corrected else x.p) <= minChance, wordList)

	return (list(wordList),list(webSites))
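
The snippets above lean on genewordsearch.Classes.WordFreq without showing it. The rough sketch below is inferred from the calls made in this listing and is not the library's actual implementation: computeP is only a placeholder, while pCorrect mirrors the Holm-Bonferroni step-down adjustment named in the comments.

class WordFreq:
    # Illustrative stand-in inferred from usage in the examples above;
    # not the real genewordsearch.Classes.WordFreq.
    def __init__(self, word, freq):
        self.word = word
        self.freq = freq
        self.genes = []
        self.p = None
        self.pCor = None

    def increment(self):
        # Count one more occurrence of this word in the query set.
        self.freq += 1

    def addGene(self, gene):
        # Track which genes contributed this word, without duplicates.
        if gene not in self.genes:
            self.genes.append(gene)

    def computeP(self, wordCounts, length, totalWords):
        # Placeholder statistic: the real test is defined in the package.
        # Here the word's background rate in the database stands in for p.
        self.p = wordCounts.get(self.word, 0) / totalWords

    def pCorrect(self, count, rank):
        # Holm-Bonferroni adjustment for the rank-th smallest of count p values.
        self.pCor = min(1.0, self.p * (count - rank + 1))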
Code Example #3
def geneWordSearch(genes,
                   species,
                   minChance=0.05,
                   minWordFreq=3,
                   corrected=False):
    # Does the analysis work: looks up the genes and runs the statistics.
    #	genes - list of gene ID strings in the set to be analysed
    #	species - name of the species these genes belong to
    #	minChance - p value cutoff; only words with a chance probability at or below this are returned
    #	minWordFreq - minimum number of genes in the set a word must appear in to be counted
    #	corrected - if True, the cutoff uses the Holm-Bonferroni corrected p value instead of the
    #	            original p; results are more reliable but less numerous

    # Imports used by this excerpt (module-level in the original file);
    # getPath is a project helper that returns the database folder for a species.
    import os
    import pickle
    from genewordsearch.Classes import WordFreq

    # Unpickle the database of words
    dbFolder = getPath(species)
    try:
        dbfile = open(os.path.join(dbFolder, 'geneNotes.p'), 'rb')
    except OSError:
        raise ValueError(
            'There is no database associated with ' + species +
            ', please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.'
        )
    db = pickle.load(dbfile)
    dbfile.close()

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    badGenes = []
    links = WordFreq('Web Links', 0)
    for item in genes:
        # Make the input all lowercase to match the database
        gene = item.lower()

        # Get the object from the DB, skip term if it is not there
        try:
            geneData = db[gene]
        except KeyError:
            badGenes.append(gene)
            continue

        # Adding words related to the gene in db to the overall list
        for word in geneData.words:
            words.append([word, geneData.gene])

        # Dealing with the websites
        for link in geneData.links:
            links.addGene(geneData.gene)
            webSites.append(link)

    # Sort to put words in alphabetical order for counting
    words.sort()

    # List that will collect the per-word counts
    wordList = []

    # Counting the words
    for item in words:
        if (wordList == [] or wordList[0].word != item[0]):
            wordList.insert(0, WordFreq(item[0], 1))
            wordList[0].addGene(item[1])
        else:
            wordList[0].increment()
            wordList[0].addGene(item[1])
    del words

    # Getting rid of words that don't happen in enough genes to matter
    wordListRaw = wordList[:]
    wordList = []
    length = 0
    for word in wordListRaw:
        if (word.freq >= minWordFreq):
            wordList.append(word)
            length += word.freq
    del wordListRaw

    # Finding the respective P values
    pickleDict = open(os.path.join(dbFolder, 'totalWordCounts.p'), 'rb')
    wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)
    pickleDict.close()
    del wordCounts

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Finding corrected P Values using Holm–Bonferroni method
    count = len(wordList)
    for i in range(0, count):
        wordList[i].pCorrect(count, (i + 1))

    # Sort by corrected P Value instead of original P value if desired
    if (corrected):
        wordList = sorted(wordList, key=lambda item: item.pCor)

    # Filter out results above the chance cutoff; use the corrected p value when requested
    wordList = filter(lambda x: (x.pCor if corrected else x.p) <= minChance,
                      wordList)

    return (list(wordList), list(webSites))
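
To make the correction step concrete, here is a standalone sketch of the Holm-Bonferroni adjustment that the loop over pCorrect(count, i + 1) appears to apply (an assumption about the method, not code from the project): the rank-i smallest p value is multiplied by (count - i + 1) and capped at 1, and the textbook form also takes a running maximum so the adjusted values stay monotone.

def holm_bonferroni(p_values):
    # p_values must already be sorted ascending, matching the sort above.
    m = len(p_values)
    adjusted = []
    running_max = 0.0
    for i, p in enumerate(p_values, start=1):
        running_max = max(running_max, min(1.0, p * (m - i + 1)))
        adjusted.append(running_max)
    return adjusted

# Example: three sorted p values from m = 3 tests.
print(holm_bonferroni([0.001, 0.02, 0.04]))  # [0.003, 0.04, 0.04]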