def queryPageRank(query, subSet, independentRun=False, topN=None): """Generic function to set up the datasets, and create normalized scoring (0 to 1) for a specific module technique. Also used when independent running of the module technique (no preprocessing of query) is wanted. Keyword arguments: query -- (list/str) the processed query word list (pass a string if independentRun = True) subSet -- (str) supply a subset (based on globalsubset please) to the function, this is also used in retrieving/saving datasets. independentRun -- (bool) if a independentRun is desired this allows a string query (default False) topN -- (int) return the highest N results Returns -- sorted list: [pmid: score] of topN results (highest first). """ citations = sliceDict(datasets.CITATIONS, subSet) # Debug # citations = {1:[2,3,4,5],2:[],3:[],4:[], 5:[]} citedby = dataloader.loadProcessed("citedby", "citations", subSet) if citedby is None: citedby = collections.defaultdict(list) print "PageRank is generating the inverse subset of citations" for ref, papersCitingRef in citations.iteritems(): for pmid in papersCitingRef: citedby[pmid].append(ref) dataloader.saveProcessed(citedby, "citedby", "citations", subSet, saveBZ2=True) if independentRun: query = searches.prepareQuery(query) print "Querying PageRank with: %r" % query results = dataloader.loadProcessed("resultsPageRank", "citations-citedby", subSet) if results is None: results = pageRank(citations, citedby) dataloader.saveProcessed(results, "resultsPageRank", "citations-citedby", subSet, saveBZ2=True) normalizeScore(results) if independentRun: results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)[:topN] return results
def queryTFIDF(query, subSet, independentRun=False, topN=None): """Generic function to set up the datasets, and create normalized scoring (0 to 1) for a specific module technique. Also used when independent running of the module technique (no preprocessing of query) is wanted. Keyword arguments: query -- (list/str) the processed query word list (pass a string if independentRun = True) subSet -- (str) supply a subset (based on globalsubset please) to the function, this is also used in retrieving/saving datasets. independentRun -- (bool) if a independentRun is desired this allows a string query (default False) topN -- (int) return the highest N results Returns -- sorted list: [pmid: score] of topN results (highest first). """ abstracts = {} titles = {} merged = {} titles = dataloader.loadProcessed("normalized", "titles", subSet) if titles is None: print "TFIDF is generating the processed subset of SUMMARIES" titles = {pmid: paper_info.title for pmid, paper_info in datasets.SUMMARIES.iteritems()} titles = sliceDict(titles, subSet) titles = normalizeDocuments(titles, datasets.STOPWORDS, lemmatization) dataloader.saveProcessed(titles, "normalized", "titles", subSet, saveBZ2=True) abstracts = dataloader.loadProcessed("normalized", "abstracts", subSet) if abstracts is None: print "TFIDF is generating the processed subset of ABSTRACTS" abstracts = sliceDict(datasets.ABSTRACTS, subSet) abstracts = normalizeDocuments(abstracts, datasets.STOPWORDS, lemmatization) dataloader.saveProcessed(abstracts, "normalized", "abstracts", subSet, saveBZ2=True) if independentRun: query = searches.prepareQuery(query) print "Querying TFIDF with: %r" % query merged = dataloader.loadProcessed("mergedTFIDF", "titles-abstracts", subSet) if merged is None: merged = {} for pmid, doc in titles.iteritems(): merged[pmid] = doc + abstracts[pmid] # Finally run tfidf merged = TFIDF(merged) dataloader.saveProcessed(merged, "mergedTFIDF", "titles-abstracts", subSet, saveBZ2=True) results = {} for pmid, 
tfidfscores in merged.iteritems(): score = 0.0 for term, termscore in tfidfscores.iteritems(): for qword in query: if qword in term: score += termscore results[pmid] = score normalizeScore(results) if independentRun: results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)[:topN] return results
# NOTE(review): an exact, byte-for-byte duplicate definition of queryTFIDF
# was removed here. At import time the duplicate silently re-bound (shadowed)
# the first definition above; since both bodies were identical the removal
# changes no behaviour, but it eliminates the confusing double definition.