예제 #1
0
def searchDisease(M_lil, M_csc, queryString, top=20):
    """
    Rank labels for a free-text query (still a work in progress).

    The query is sanitized, optionally stemmed (module flag ``_stemmer``)
    and scored with the heuristic selected by the module flag
    ``_cosineMeasure``. The ``top`` best-scoring hits are then mapped to
    their label through ``_labelHash``.

    Parameters:
        M_lil, M_csc: passed unchanged to the ``SearchInterface``
            heuristic (presumably LIL and CSC forms of the same
            matrix -- TODO confirm against SearchInterface).
        queryString: raw query text.
        top: number of best-scoring hits to keep (default 20).

    Returns:
        A list of ``(label, score)`` tuples sorted by score, highest
        first; ties are broken by label, also in descending order. The
        list holds at most ``top`` entries.
    """

    # Replace everything the sanitizer pattern matches with a space.
    sanitizer = TextCleaner.sanitizeString()
    queryString = sanitizer.sub(' ', queryString)

    # OPTIONAL: stem the query
    if _stemmer:
        queryString = FilterInterface.porterStemmer(queryString)

    # CHOOSE HEURISTIC used to retrieve the list of results.
    # (Both branches used to call sumMeasure, which made the flag a no-op.)
    if _cosineMeasure:
        results = SearchInterface.cosineMeasure(M_lil, M_csc, queryString)
    else:
        results = SearchInterface.sumMeasure(M_lil, M_csc, queryString)

    # Highest score first.
    results.sort(reverse=True)

    # Each hit is indexed as (score, pmid).
    # NOTE(review): when several of the top hits share a label, the score
    # of the last (i.e. lowest-scoring) one overwrites the earlier ones.
    # Kept as-is; confirm whether the best score should win instead.
    resultDic = {}
    for item in results[:top]:
        score, pmid = item[0], item[1]
        resultDic[_labelHash[pmid]] = score

    # Sort on (score, label) descending. A single-argument lambda is used
    # because tuple parameters are not valid syntax on Python 3.
    resultList = sorted(resultDic.items(),
                        key=lambda pair: (pair[1], pair[0]),
                        reverse=True)

    # resultDic was built from at most `top` hits, so no further slicing
    # is needed (the previous hard-coded [:20] ignored `top`).
    return resultList
예제 #2
0
def search(M_lil, M_csc, queryString, top=20):
    """
    Rank labels for a free-text query with three consensus schemes
    (still a work in progress).

    The query is sanitized, optionally stemmed (module flag ``_stemmer``)
    and scored with the cosine or the sum heuristic (module flag
    ``_cosineMeasure``). The scores of the ``top`` best hits are grouped
    by the labels linked to each hit's PMID in ``_labelHash`` (one PMID
    can carry several labels) and reduced per label in three ways.

    Parameters:
        M_lil, M_csc: passed unchanged to the ``SearchInterface``
            heuristic (presumably LIL and CSC forms of the same
            matrix -- TODO confirm against SearchInterface).
        queryString: raw query text.
        top: number of best-scoring hits to aggregate (default 20).

    Returns:
        ``[resultList_mean, resultList_median, resultList_max]``. Each
        entry is a list of ``(label, score)`` tuples sorted by score,
        highest first, with ties broken by label in descending order.

        * ``resultList_mean``: despite the name, the plain *sum* of the
          label's scores; it is not divided by the number of hits.
        * ``resultList_median``: for labels with more than two scores,
          the element at index ``n // 2`` of the ascending-sorted scores;
          for labels with one or two scores, the smallest score.
        * ``resultList_max``: the label's highest score.
    """

    # Replace everything the sanitizer pattern matches with a space.
    sanitizer = TextCleaner.sanitizeString()
    queryString = sanitizer.sub(' ', queryString)

    # OPTIONAL: stem the query
    if _stemmer:
        queryString = FilterInterface.porterStemmer(queryString)

    # CHOOSE HEURISTIC used to retrieve the list of results
    if _cosineMeasure:
        results = SearchInterface.cosineMeasure(M_lil, M_csc, queryString)
    else:
        results = SearchInterface.sumMeasure(M_lil, M_csc, queryString)

    # Highest score first.
    results.sort(reverse=True)

    # Group the scores of the best hits by label in a single pass.
    # Each hit is indexed as (score, pmid).
    scoresByLabel = {}
    for item in results[:top]:
        score, pmid = item[0], item[1]
        for label in _labelHash[pmid]:
            scoresByLabel.setdefault(label, []).append(score)

    resultDic1 = {}  # 1: "mean" (actually the un-normalized sum)
    resultDic2 = {}  # 2: median
    resultDic3 = {}  # 3: max
    for label, scores in scoresByLabel.items():
        # Accumulate in hit order, starting from the first score, so the
        # floating point result matches a running `+=` exactly.
        total = scores[0]
        for score in scores[1:]:
            total += score
        resultDic1[label] = total

        # Floor division keeps the index an int on Python 3 as well.
        numOfScores = len(scores)
        medianIndex = numOfScores // 2 if numOfScores > 2 else 0
        resultDic2[label] = sorted(scores)[medianIndex]

        resultDic3[label] = max(scores)

    # Sort on (score, label) descending. A single-argument lambda is used
    # because tuple parameters are not valid syntax on Python 3.
    def byScoreThenLabel(pair):
        return (pair[1], pair[0])

    resultList_mean = sorted(resultDic1.items(), key=byScoreThenLabel, reverse=True)
    resultList_median = sorted(resultDic2.items(), key=byScoreThenLabel, reverse=True)
    resultList_max = sorted(resultDic3.items(), key=byScoreThenLabel, reverse=True)

    return [resultList_mean, resultList_median, resultList_max]