Example #1
def custom_db_analysis():
	# Deal with a custom database file
	import os
	import re
	import glob
	import shutil
	from genewordsearch.DBBuilder import geneWordBuilder

	# Prep the database files for processing
	ip = str(request.environ['REMOTE_ADDR'])
	folder = os.path.join(app.config['UPLOAD_FOLDER'], ip)
	os.makedirs(folder, exist_ok=True)
	dbFiles = request.files.getlist('geneDBs')
	fileCount = len(dbFiles)
	for fileNum, db in enumerate(dbFiles):
		filename = secure_filename(db.filename)
		# Preserve the real extension so the parsing settings below still apply
		db.save(os.path.join(folder, str(fileNum) + os.path.splitext(filename)[1]))
	fileList = glob.glob(folder+'/*')
	fileList.sort()
	
	# Pull and organize the rest of the database info
	headers = []
	headtxt = []
	delimiters = []
	geneCols = []
	desCols = []
	for fileNum in range(fileCount):
		headtxt.append(str(request.form['header' + str(fileNum)]))
		delimiters.append(str(request.form['delimiter' + str(fileNum)]))
		geneCols.append(str(request.form['geneCol' + str(fileNum)]))
		desCols.append(str(request.form['desCols' + str(fileNum)]))
	for header in headtxt:
		headers.append(header == 'y')
	geneWordBuilder(ip, fileList, geneCols, desCols, delimiters, headers)
	shutil.rmtree(folder + '/')
	
	# Run the enrichment analysis
	genes = str(request.form['geneList'])
	probCutoff = float(request.form['probCut'])
	genes = re.split(r'\r| |,|\t|\n', genes)
	genes = list(filter(lambda x: x != '', genes))
	try:
		results = geneWordSearch(genes,ip,minChance=probCutoff)
	except KeyError:
		abort(400)
	ans = WordFreq.to_JSON_array(results[0])
	shutil.rmtree('genewordsearch/databases/'+ip+'/')
	return jsonify(result=ans)
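
For reference, a client call against this handler might look like the sketch below. It is hypothetical: the route and host are assumptions (only the view function is shown above), but the form-field names (geneDBs, header0, delimiter0, geneCol0, desCols0, geneList, probCut) match what the handler reads, and the column values are guesses at the expected format.

# Hypothetical client for the handler above; URL, route, and column values are assumptions.
import requests

files = [('geneDBs', open('notes.tsv', 'rb'))]
data = {
	'header0': 'y',       # first row of the upload is a header
	'delimiter0': '\t',   # tab-separated upload
	'geneCol0': '1',      # column holding the gene id
	'desCols0': '2',      # column(s) holding the descriptions
	'geneList': 'gene1 gene2 gene3',
	'probCut': '0.05',
}
resp = requests.post('http://localhost:5000/custom_db_analysis', files=files, data=data)
print(resp.json()['result'])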
Example #2
def gene_word_search():
    cob = networks[str(request.form['network'])]
    pCutoff = safeOpts('pCutoff',float(request.form['pCutoff']))
    geneList = str(request.form['geneList'])
    geneList = list(filter(lambda x: x != '', re.split(r'\r| |,|;|\t|\n', geneList)))
    
    # Run the analysis and return the JSONified results
    if cob._global('parent_refgen') in func_data_db:
        results = geneWordSearch(geneList, cob._global('parent_refgen'), minChance=pCutoff)
    else:
        abort(405)
    if len(results[0]) == 0:
        abort(400)
    results = WordFreq.to_JSON_array(results[0])
    return jsonify(result=results)
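
The split regex above accepts any mix of spaces, commas, semicolons, tabs, carriage returns, and newlines as separators, and the filter drops the empty strings that adjacent separators produce. A quick standalone check (the gene ids are made up):

import re

raw = 'grmzm2g000001, grmzm2g000002;grmzm2g000003\tgrmzm2g000004'
genes = list(filter(lambda x: x != '', re.split(r'\r| |,|;|\t|\n', raw)))
print(genes)  # ['grmzm2g000001', 'grmzm2g000002', 'grmzm2g000003', 'grmzm2g000004']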
Example #3
def gene_analysis():
	# Run the genes through genewordsearch
	# Sanitize the input
	species = str(request.form['species'])
	genes = str(request.form['geneList'])
	probCutoff = float(request.form['probCut'])
	genes = re.split(r'\r| |,|\t|\n', genes)
	genes = list(filter(lambda x: x != '', genes))

	# Run the analysis and return the JSONified results
	try:
		results = geneWordSearch(genes,species,minChance=probCutoff)
	except KeyError:
		abort(400)
	ans = WordFreq.to_JSON_array(results[0])
	return jsonify(result=ans)
Example #4
def geneWordSearch(genes,species,minChance=0.05,minWordFreq=3,corrected=False):
# Does the analysis work of looking at the genes and computing the statistics
#	genes - list of strings of the gene ids in the set to be analysed
#	species - str of the species these genes belong to
#	minChance - the maximum chance probability a word may have and still be included in the results
#	minWordFreq - the minimum number of genes in the set a word must appear in to be counted
#	corrected - boolean saying whether the results should be cut off using the corrected p value or the
#	            original p; if true, results are more reliable but less numerous

	# Unpickle the database of words
	dbFolder = 'databases/'+ species
	if pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
		dbfile = open(pkg_resources.resource_filename(__name__, dbFolder + '/geneNotes.p'),'rb')
	else:
		raise ValueError('There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.')
	db = pickle.load(dbfile)
	dbfile.close()

	# Build the word list up for all of the genes provided.
	words = []
	webSites = []
	badGenes = []
	links = WordFreq('Web Links',0)
	for item in genes:
		# Make the input all lowercase to match the database
		gene = item.lower()

		# Get the object from the DB, skip term if it is not there
		try:
			geneData = db[gene]
		except KeyError:
			badGenes.append(gene)
			continue

		# Adding words related to the gene in db to the overall list
		for word in geneData.words:
			words.append([word,geneData.gene])

		# Dealing with the websites
		for link in geneData.links:
			links.addGene(geneData.gene)
			webSites.append(link)

	# Sort to put words in alphabetical order for counting
	words.sort()

	# Build a list of WordFreq objects from the sorted word/gene pairs
	wordList = []

	# Counting the words
	for item in words:
		if(wordList == [] or wordList[0].word != item[0]):
			wordList.insert(0, WordFreq(item[0],1))
			wordList[0].addGene(item[1])
		else:
			wordList[0].increment()
			wordList[0].addGene(item[1])
	del words

	# Getting rid of words that don't happen in enough genes to matter
	wordListRaw = wordList[:]
	wordList = []
	length = 0
	for word in wordListRaw:
		if(word.freq >= minWordFreq):
			wordList.append(word)
			length += word.freq
	del wordListRaw

	# Finding the respective P values
	pickleDict = open(pkg_resources.resource_filename(__name__, dbFolder + '/totalWordCounts.p'),'rb')
	wordCounts = pickle.load(pickleDict)
	totalWords = wordCounts['Total Count']
	for word in wordList:
		word.computeP(wordCounts,length,totalWords)
	pickleDict.close()
	del wordCounts

	# Sorting now by P Value instead of alphabetical
	wordList = sorted(wordList, key=lambda item: item.p)

	# Finding corrected P Values using Holm–Bonferroni method
	count = len(wordList)
	for i in range(0,count):
		wordList[i].pCorrect(count,(i+1))

	# Sort and cut off by corrected P Value instead of original P value if desired
	if(corrected):
		wordList = sorted(wordList, key=lambda item: item.pCor)
		wordList = filter(lambda x: x.pCor <= minChance, wordList)
	else:
		# Filter out results with p values above the chance threshold
		wordList = filter(lambda x: x.p <= minChance, wordList)

	return (list(wordList),list(webSites))
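
The correction step above applies the Holm-Bonferroni adjustment: with m tests, the word with the i-th smallest p value is scaled by (m - i + 1). The pCorrect implementation itself is not shown in this listing, so the following is only a minimal standalone sketch of the same adjustment on plain floats, including the monotonicity step the full method calls for:

# Holm-Bonferroni sketch on plain floats; not the WordFreq.pCorrect API.
def holm_bonferroni(p_values):
	m = len(p_values)
	adjusted = []
	running_max = 0.0
	for rank, p in enumerate(sorted(p_values), start=1):
		# Scale the rank-th smallest p value by the number of remaining tests,
		# then keep adjusted values non-decreasing
		running_max = max(running_max, min(1.0, p * (m - rank + 1)))
		adjusted.append(running_max)
	return adjusted

print(holm_bonferroni([0.01, 0.04, 0.03]))  # [0.03, 0.06, 0.06]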
Example #5
def geneWordSearch(genes,species,minChance=0.05,corrected=False):
# Input: Takes in a list of genes, the species, and the probability cutoff.
# Output: Returns a tuple of words and links. Only returns the words whose
#         chance probability is below the minChance cutoff.
	import re
	import pickle
	import pkg_resources
	from genewordsearch.Classes import WordFreq
	from genewordsearch.Classes import GeneNote
	
	# Unpickle the database of words
	dbFolder = 'databases/'+ species
	if pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
		dbfile = open(pkg_resources.resource_filename(__name__, dbFolder + '/geneNotes.p'),'rb')
	else:
		raise ValueError('There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.')
	db = pickle.load(dbfile)
	dbfile.close()
	
	# Build the word list up for all of the genes provided.
	words = []
	webSites = []
	links = WordFreq('Web Links',0)
	for item in genes:
		# Make the input all lowercase to match the database
		gene = item.lower()
		
		# Get the object from the DB; raises KeyError if the gene is not in the database
		geneData = db[gene]
		
		# Adding words related to the gene in db to the overall list
		for word in geneData.words:
			words.append([word,geneData.gene])
		
		# Dealing with the websites
		for link in geneData.links:
			links.addGene(geneData.gene)
			webSites.append(link)

	# Sort to put words in alphabetical order for counting
	words.sort()
	
	# Build a list of WordFreq objects from the sorted word/gene pairs
	wordList = []
	
	# Counting the words
	for item in words:
		if(wordList == [] or wordList[0].word != item[0]):
			wordList.insert(0, WordFreq(item[0],1))
			wordList[0].addGene(item[1])
		else:
			wordList[0].increment()
			wordList[0].addGene(item[1])
	del words
	
	# Getting rid of words that don't happen in enough genes to matter
	wordListRaw = wordList[:]
	wordList = []
	length = 0
	for word in wordListRaw:
		if(word.freq >= 3):
			wordList.append(word)
			length += word.freq
	del wordListRaw
	
	# Finding the respective P values
	pickleDict = open(pkg_resources.resource_filename(__name__, dbFolder + '/totalWordCounts.p'),'rb')
	wordCounts = pickle.load(pickleDict)
	totalWords = wordCounts['Total Count']
	for word in wordList:
		word.computeP(wordCounts,length,totalWords)
	pickleDict.close()
	del wordCounts
	
	# Sorting now by P Value instead of alphabetical
	wordList = sorted(wordList, key=lambda item: item.p)
	
	# Finding corrected P Values using Holm–Bonferroni method
	count = len(wordList)
	for i in range(0,count):
		wordList[i].pCorrect(count,(i+1))
	
	# Sort and cut off by corrected P Value instead of original P value if desired
	if(corrected):
		wordList = sorted(wordList, key=lambda item: item.pCor)
		wordList = filter(lambda x: x.pCor <= minChance, wordList)
	else:
		# Filter out results with p values above the chance threshold
		wordList = filter(lambda x: x.p <= minChance, wordList)
	
	return (list(wordList),list(webSites))
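
Called directly as a library function, usage might look like the sketch below. The module path and gene ids are assumptions; 'maize' is one of the two bundled species named in the error message above, and the word, freq, p, and pCor attributes are the ones the code above relies on.

# Hypothetical direct call; module path and gene ids are assumptions.
from genewordsearch.GeneWordSearch import geneWordSearch

words, links = geneWordSearch(['grmzm2g000001', 'grmzm2g000002'], 'maize', minChance=0.05)
for wf in words:
	print(wf.word, wf.freq, wf.p, wf.pCor)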