Example #1
import collections
import operator

def query(query, subSet, useFeatures, usePageRank, useTFIDF, useClustering, useRecommend, topN=100):
    results = {}

    if not (useFeatures or usePageRank or useTFIDF or useClustering or useRecommend):
        return None

    query = normalizeString(query, datasets.STOPWORDS, lemmatization).split()
    print("Querying with: %r" % query)

    if useFeatures:
        results['IF'] = features.queryFeatures(query, subSet)
    if usePageRank:
        results['PR'] = pagerank.queryPageRank(query, subSet)
    if useTFIDF:
        results['TI'] = TFIDF.queryTFIDF(query, subSet)
    if useClustering:
        results['CL'] = clustering.queryClustering(query, subSet)
    if useRecommend:
        results['RE'] = recommend.queryRecommend(query, subSet)

    # Per-technique weight modifiers, adjust if required
    weights = {'IF': 1.0, 'PR': 1.0, 'TI': 1.0, 'CL': 1.0, 'RE': 1.0}

    endresults = collections.defaultdict(float)
    # Loop over all pmids in the ID set, sliced as subSet prescribes
    for pmid in (pmid for pmid in datasets.IDS if str(pmid).startswith(subSet)):
        hits = 0
        for method, scores in results.items():
            if pmid in scores:
                hits += 1
                endresults[pmid] += scores[pmid] * weights[method]
        if hits > 0:
            endresults[pmid] /= hits  # Average over the techniques that scored this pmid

    normalizeScore(endresults)
    # Return the topN results, sorted descending by score
    return sorted(endresults.items(), key=operator.itemgetter(1), reverse=True)[:topN]
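For context, a hypothetical invocation could look like the sketch below; the subset string "9" and the query text are made up for illustration and do not come from the repository:

# Combine PageRank and TF-IDF scores for papers whose PMID starts with "9",
# keeping the 10 best-scoring results.
top = query("breast cancer treatment", "9",
            useFeatures=False, usePageRank=True, useTFIDF=True,
            useClustering=False, useRecommend=False, topN=10)
for pmid, score in top:
    print(pmid, score)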
Example #2
import collections
import operator

def queryPageRank(query, subSet, independentRun=False, topN=None):
    """Generic function to set up the datasets and create normalized scores (0 to 1) for this module's technique.
    Also used when the technique should be run independently (i.e. on a query that has not been preprocessed).

    Keyword arguments:
    query -- (list/str) the processed query word list (pass a string if independentRun=True)
    subSet -- (str) the subset to operate on (based on the global subset); also used when retrieving/saving datasets
    independentRun -- (bool) set True to allow a raw string query and get sorted topN results (default False)
    topN -- (int) return the highest N results

    Returns -- dict of {pmid: score}, or, if independentRun, a sorted list of (pmid, score) tuples (highest first, at most topN).
    """
    citations = sliceDict(datasets.CITATIONS, subSet)
    # Debug
    # citations = {1: [2, 3, 4, 5], 2: [], 3: [], 4: [], 5: []}

    citedby = dataloader.loadProcessed("citedby", "citations", subSet)
    if citedby is None:
        citedby = collections.defaultdict(list)
        print("PageRank is generating the inverse subset of citations")
        for ref, papersCitingRef in citations.items():
            for pmid in papersCitingRef:
                citedby[pmid].append(ref)
        dataloader.saveProcessed(citedby, "citedby", "citations", subSet, saveBZ2=True)

    if independentRun:
        query = searches.prepareQuery(query)
        print("Querying PageRank with: %r" % query)

    results = dataloader.loadProcessed("resultsPageRank", "citations-citedby", subSet)
    if results is None:
        results = pageRank(citations, citedby)
        dataloader.saveProcessed(results, "resultsPageRank", "citations-citedby", subSet, saveBZ2=True)

    normalizeScore(results)

    if independentRun:
        results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)[:topN]

    return results
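Two helpers called above, pageRank and normalizeScore, are not shown on this page. The sketches below are assumptions, not the repository's code: a plain power-iteration PageRank (the damping factor and iteration count are guesses) and an in-place min-max rescaling matching the "0 to 1" scoring the docstring promises. The edge directions follow the inversion loop above, which implies citations[p] lists the papers citing p and citedby[p] the papers p cites.

def pageRank(citations, citedby, damping=0.85, iterations=30):
    """Sketch: power-iteration PageRank over the citation graph.
    Assumes citations[p] = papers citing p, citedby[p] = papers p cites."""
    nodes = set(citations) | set(citedby)
    if not nodes:
        return {}
    n = len(nodes)
    rank = dict.fromkeys(nodes, 1.0 / n)
    for _ in range(iterations):
        newrank = {}
        for p in nodes:
            # Rank flows in from each paper citing p, split over that paper's out-links
            incoming = sum(rank.get(src, 0.0) / max(len(citedby.get(src, ())), 1)
                           for src in citations.get(p, ()))
            newrank[p] = (1.0 - damping) / n + damping * incoming
        rank = newrank
    return rank

def normalizeScore(scores):
    """Sketch: rescale a {pmid: score} dict in place to the 0-1 range (min-max scaling assumed)."""
    if not scores:
        return
    low, high = min(scores.values()), max(scores.values())
    span = (high - low) or 1.0  # all-equal scores would otherwise divide by zero
    for pmid in scores:
        scores[pmid] = (scores[pmid] - low) / span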
Example #3
import operator

def queryTFIDF(query, subSet, independentRun=False, topN=None):
    """Generic function to set up the datasets and create normalized scores (0 to 1) for this module's technique.
    Also used when the technique should be run independently (i.e. on a query that has not been preprocessed).

    Keyword arguments:
    query -- (list/str) the processed query word list (pass a string if independentRun=True)
    subSet -- (str) the subset to operate on (based on the global subset); also used when retrieving/saving datasets
    independentRun -- (bool) set True to allow a raw string query and get sorted topN results (default False)
    topN -- (int) return the highest N results

    Returns -- dict of {pmid: score}, or, if independentRun, a sorted list of (pmid, score) tuples (highest first, at most topN).
    """
    titles = dataloader.loadProcessed("normalized", "titles", subSet)
    if titles is None:
        print("TFIDF is generating the processed subset of SUMMARIES")
        titles = {pmid: paper_info.title for pmid, paper_info in datasets.SUMMARIES.items()}
        titles = sliceDict(titles, subSet)
        titles = normalizeDocuments(titles, datasets.STOPWORDS, lemmatization)
        dataloader.saveProcessed(titles, "normalized", "titles", subSet, saveBZ2=True)

    abstracts = dataloader.loadProcessed("normalized", "abstracts", subSet)
    if abstracts is None:
        print("TFIDF is generating the processed subset of ABSTRACTS")
        abstracts = sliceDict(datasets.ABSTRACTS, subSet)
        abstracts = normalizeDocuments(abstracts, datasets.STOPWORDS, lemmatization)
        dataloader.saveProcessed(abstracts, "normalized", "abstracts", subSet, saveBZ2=True)

    if independentRun:
        query = searches.prepareQuery(query)
        print("Querying TFIDF with: %r" % query)

    merged = dataloader.loadProcessed("mergedTFIDF", "titles-abstracts", subSet)
    if merged is None:
        # Concatenate each title with its abstract, then run TF-IDF on the result
        merged = {pmid: doc + abstracts[pmid] for pmid, doc in titles.items()}
        merged = TFIDF(merged)
        dataloader.saveProcessed(merged, "mergedTFIDF", "titles-abstracts", subSet, saveBZ2=True)

    # Score each document: sum the TF-IDF scores of every term that contains a query word
    results = {}
    for pmid, tfidfscores in merged.items():
        score = 0.0
        for term, termscore in tfidfscores.items():
            for qword in query:
                if qword in term:  # substring match, so e.g. "rank" also matches "pagerank"
                    score += termscore
        results[pmid] = score

    normalizeScore(results)

    if independentRun:
        results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)[:topN]

    return results
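The TFIDF(...) call above is also external to this page. Judging from how merged is consumed afterwards, it takes {pmid: token list} and returns {pmid: {term: score}}. Below is a minimal sketch under that assumption; the raw-count TF and log(N/df) IDF weighting are guesses, not necessarily the repository's exact formula.

import collections
import math

def TFIDF(documents):
    """Sketch: documents is {pmid: [token, ...]}; returns {pmid: {term: tf-idf}}."""
    n_docs = len(documents)
    # Document frequency: in how many documents does each term occur?
    df = collections.Counter()
    for tokens in documents.values():
        df.update(set(tokens))

    scores = {}
    for pmid, tokens in documents.items():
        tf = collections.Counter(tokens)
        scores[pmid] = {term: count * math.log(n_docs / df[term])
                        for term, count in tf.items()}
    return scores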