コード例 #1
0
ファイル: SearchCases.py プロジェクト: hmbachelor/bachelor
def analyseDiseaseTerms(M_coo):

    """
    Print, for two hard-coded diseases, the highest-scoring terms of the
    disease's matrix row and the ranks that the terms of a matching case
    description have within that row.

    M_coo is a sparse matrix offering tolil(); it is converted to LIL format
    so that single rows and cells can be read.

    For every disease two things are printed:
      * up to 20 of the highest-scoring terms that are longer than 7
        characters, and
      * (rank, term) pairs for every processed symptom term that occurs in
        the row, where rank is the 0-based position in the complete
        score-sorted term list (not the length-filtered one).

    Each disease is paired with the case description at the same position in
    listOfSymptoms.

    The function returns nothing; its only output is what it prints.
    """

    listOfDiseases=["Adrenoleukodystrophy  autosomal  neonatal form","Kleine Levin Syndrome"]
    listOfSymptoms=["Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions",
                    "Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"]

    sanitizer = TextCleaner.sanitizeString()

    M_lil=M_coo.tolil()

    # Walk diseases and case descriptions in lockstep so that every disease
    # is compared against its own description.
    for disease, symptoms in zip(listOfDiseases, listOfSymptoms):
        rowIndex=_diseaseHash[disease]

        # The first non-zero column is skipped; presumably it holds the row's
        # identifier rather than a term score -- TODO confirm.
        termIndices=M_lil.getrow(rowIndex).nonzero()[1][1:]

        # (score, term) pairs, highest score first.
        termList=[(M_lil[rowIndex,colIndex],revTermHashTable[colIndex])
                  for colIndex in termIndices]
        termList.sort(reverse=True)

        # Only terms longer than 7 characters are shown in the top list.
        printout1=[term for (score, term) in termList if len(term)>7][:20]

        print('Top 20 terms:')
        print('---------------------')
        print(printout1)
        print("=====================")

        # Run the case description through the same clean-up steps as a
        # search query; the result is iterated term by term below.
        symptoms = sanitizer.sub(' ', symptoms)
        symptoms = FilterInterface.stopwordRemover(symptoms)
        symptoms = FilterInterface.porterStemmer(symptoms)
        symptoms = SearchTermDoc._modifySearchString(symptoms)

        # Ranks refer to the unfiltered, score-sorted term list.
        printout2=[]
        for symptom in symptoms:
            for rank, (score, term) in enumerate(termList):
                if term==symptom: printout2.append((rank,symptom))

        print('Ranks of searched symptoms:')
        print('---------------------')
        print(printout2)
        print("=====================")
        print('')
コード例 #2
0
ファイル: TermDoc.py プロジェクト: hmbachelor/bachelor
def _gatherMatrixData(filename):

    """
    This function utilizes the RecordHandler module to create and structure the
    data to populate the term-doc matrices. It also removes stopwords from the
    gathered text and, if _stemmer is set, stems it.

    It takes the records' file name to gather data from. The file is loaded
    from the directory given by _medlineDir.

    For every record the title ('TI'), the abstract ('AB') and all MeSH terms
    ('MH') are concatenated, sanitized and filtered before being handed to
    _wordCounter together with the record's key. A missing title or abstract
    is reported on stdout and otherwise ignored.

    It returns a list with one _wordCounter result per record; this is
    intended to be a doc-term list on the form:
    [[PMID,[(term1,count1),...],...]
    """

    medlineDir=_medlineDir

    # Get the regex pattern that sanitizes strings.
    sanitizer = sanitizeString()

    l = []
    records = RecordHandler.loadMedlineRecords(medlineDir, filename)
    fields = RecordHandler.readMedlineFields(records, ['AB','TI','MH'])
    for recordId, record in fields.items():
        information=''
        # Get the title if any. Only a missing or non-string field is treated
        # as "no title"; anything else (e.g. Ctrl-C) is allowed to propagate.
        try:
            information=' '+record['TI']
        except (KeyError, TypeError):
            print('Unable to find title in %s' % (recordId,))
        # Get the abstract if any
        try:
            information+=' '+record['AB']
        except (KeyError, TypeError):
            print('Unable to find abstract in %s' % (recordId,))
        # Get all the mesh terms if any
        if 'MH' in record:
            information+=''.join(' '+meshterm for meshterm in record['MH'])

        # Sanitize the information
        information=sanitizer.sub(' ', information)
        # Remove english stopwords from the information
        information=FilterInterface.stopwordRemover(information)

        # OPTIONAL:
        # Stem the information
        if _stemmer: information=FilterInterface.porterStemmer(information)

        l.append(_wordCounter(recordId,information))

    return l
コード例 #3
0
ファイル: SearchCases.py プロジェクト: hmbachelor/bachelor
def searchDisease(M_lil, M_csc, queryString, top=20):

    """
    Search the matrix for queryString and return the best scoring labels.

    This function is still a work in progress..

    M_lil and M_csc are passed unchanged to the search heuristic.
    queryString is sanitized and, if _stemmer is set, stemmed before the
    search. The heuristic is SearchInterface.cosineMeasure when
    _cosineMeasure is set, otherwise SearchInterface.sumMeasure.

    The 'top' highest scoring results are translated to labels through
    _labelHash.

    It returns a list of at most 20 (label, score) tuples, sorted by score
    and then label, highest first.
    """

    sanitizer = TextCleaner.sanitizeString()
    queryString = sanitizer.sub(' ', queryString)

    # OPTIONAL:
    # Stem the query
    if _stemmer:
        queryString = FilterInterface.porterStemmer(queryString)

    # CHOOSE HEURISTIC:
    # Search-heuristic used to retrieve the list of results
    if _cosineMeasure:
        results = SearchInterface.cosineMeasure(M_lil, M_csc, queryString)
    else:
        results = SearchInterface.sumMeasure(M_lil, M_csc, queryString)

    # Highest score first
    results.sort(reverse=True)

    # Each result is indexed as (score, id); the id is looked up in
    # _labelHash and the value found there is used as a dictionary key.
    # NOTE(review): when several of the top results share a label, the score
    # stored last (the lowest of them) wins -- confirm this is intended.
    resultDic = {}
    for item in results[:top]:
        pmid = item[1]
        label = _labelHash[pmid]
        resultDic[label] = item[0]

    def byScoreThenLabel(pair):
        # Order primarily on the score, using the label as tie-breaker.
        return (pair[1], pair[0])

    resultList = sorted(resultDic.items(), key=byScoreThenLabel, reverse=True)

    # NOTE(review): the cap of 20 is independent of 'top'; it is kept as-is
    # because callers may rely on it.
    return resultList[:20]
コード例 #4
0
ファイル: TermDoc.py プロジェクト: hmbachelor/bachelor
def createTermAndPmidHashes():

    """
    This function creates two hash tables of the PMID's and terms to be used
    for the term-doc matrix.

    Every file listed for the directory _medlineDir is loaded through
    RecordHandler. Each PMID and each term found in a record's abstract
    ('AB'), title ('TI') and MeSH terms ('MH') is given a running number,
    starting at 1, in order of first appearance. A missing abstract or title
    is reported on stdout and otherwise ignored.

    Note that the terms are sanitized by the pattern from
    TextCleaner.sanitizeString(), that stop words are removed, and that the
    terms are stemmed if _stemmer is set.

    Both tables are pickled to the _hashTablesDir directory under the names
    _termHash and _pmidHash, and a progress line is printed after each file.

    It returns the tuple (termHashTable, pmidHashTable).
    """

    medlineDir = _medlineDir
    hashTables = _hashTablesDir
    termHashTable={}
    pmidHashTable={}
    termCounter = 0
    pmidCounter = 0

    files = IOmodule.getSortedFilelist(medlineDir+'/')

    # Get the regex pattern that sanitizes strings.
    sanitizer = TextCleaner.sanitizeString()

    for medlineFile in files:
        records = RecordHandler.loadMedlineRecords(medlineDir, medlineFile)

        for diseaseRecords in records.values():
            for record in diseaseRecords:
                # Hash the PMID
                pmid=record[0]
                if pmid not in pmidHashTable:
                    pmidCounter+=1
                    pmidHashTable[pmid]=pmidCounter

                information=''
                # Get the abstract. Only a missing or non-string field is
                # treated as absent; anything else is allowed to propagate.
                try:
                    information=' '+record[1]['AB']
                except (KeyError, TypeError):
                    print('Unable to get abstract %s' % (record[0],))
                # Get the title
                try:
                    information+=' '+record[1]['TI']
                except (KeyError, TypeError):
                    print('Unable to get title for %s' % (record[0],))

                # Missing MeSH terms are not reported, as most of the
                # records do not have MeSH.
                if 'MH' in record[1]:
                    for meshterm in record[1]['MH']:
                        information+=' '+meshterm

                # Sanitize the information
                information=sanitizer.sub(' ', information)
                # Remove stopwords from the information
                information=FilterInterface.stopwordRemover(information)

                # OPTIONAL:
                # Stem the information
                if _stemmer: information=FilterInterface.porterStemmer(information)

                # Hash the terms; empty strings left by consecutive spaces
                # are skipped.
                for term in information.split(' '):
                    if term != '' and term not in termHashTable:
                        termCounter+=1
                        termHashTable[term]=termCounter

        print(str(termCounter)+" terms hashed. "+str(pmidCounter)+" pmids hashed.")

    IOmodule.pickleOut(hashTables, _termHash,"btd", termHashTable)
    IOmodule.pickleOut(hashTables, _pmidHash,"btd", pmidHashTable)

    return termHashTable, pmidHashTable
コード例 #5
0
ファイル: SearchCases.py プロジェクト: hmbachelor/bachelor
def search(M_lil, M_csc, queryString, top=20):

    """
    Search the matrix for queryString and score the labels linked to the
    best matching results in three different ways.

    This function is still a work in progress..

    M_lil and M_csc are passed unchanged to the search heuristic.
    queryString is sanitized and, if _stemmer is set, stemmed before the
    search. The heuristic is SearchInterface.cosineMeasure when
    _cosineMeasure is set, otherwise SearchInterface.sumMeasure.

    Only the 'top' highest scoring results are considered. Each of them is
    mapped through _labelHash to its labels (several labels can be linked to
    one result) and the result's score is credited to every one of them.

    It returns a list of three lists of (label, score) tuples, each sorted by
    score and then label, highest first:
      [0] "mean":   the SUM of the label's scores (the normalization by the
                    number of scores is currently disabled),
      [1] "median": the middle score of the label; for labels with one or
                    two scores the lowest one is used,
      [2] "max":    the highest score of the label.
    """

    sanitizer = TextCleaner.sanitizeString()
    queryString = sanitizer.sub(' ', queryString)

    # OPTIONAL:
    # Stem the query
    if _stemmer:
        queryString = FilterInterface.porterStemmer(queryString)

    # CHOOSE HEURISTIC:
    # Search-heuristic used to retrieve the list of results
    if _cosineMeasure:
        results = SearchInterface.cosineMeasure(M_lil, M_csc, queryString)
    else:
        results = SearchInterface.sumMeasure(M_lil, M_csc, queryString)

    # Highest score first
    results.sort(reverse=True)

    # Collect, in one pass, the running sum and the individual scores of
    # every label linked to one of the top results.
    summedScores = {}
    scoresByLabel = {}
    for item in results[:top]:
        score = item[0]
        pmid = item[1]
        for label in _labelHash[pmid]:
            if label in summedScores:
                summedScores[label] = summedScores[label] + score
                scoresByLabel[label].append(score)
            else:
                summedScores[label] = score
                scoresByLabel[label] = [score]

    # Derive the median and the max from the collected scores.
    medianScores = {}
    maxScores = {}
    for label, scores in scoresByLabel.items():
        numOfScores = len(scores)
        # Floor division keeps the index an integer; with one or two scores
        # the lowest one is picked.
        if numOfScores > 2:
            medianIndex = numOfScores // 2
        else:
            medianIndex = 0
        medianScores[label] = sorted(scores)[medianIndex]
        maxScores[label] = max(scores)

    def byScoreThenLabel(pair):
        # Order primarily on the score, using the label as tie-breaker.
        return (pair[1], pair[0])

    resultList_mean = sorted(summedScores.items(), key=byScoreThenLabel, reverse=True)
    resultList_median = sorted(medianScores.items(), key=byScoreThenLabel, reverse=True)
    resultList_max = sorted(maxScores.items(), key=byScoreThenLabel, reverse=True)

    return [resultList_mean, resultList_median, resultList_max]