Ejemplo n.º 1
0
def analyseDiseaseTerms(M_coo):
    """Print the top-weighted terms of two sample diseases and symptom ranks.

    For each hard-coded (disease, case description) pair the function:

    1. looks up the disease's row in the term-document matrix via
       ``_diseaseHash``,
    2. collects ``(weight, term)`` for the row's non-zero columns and sorts
       them by descending weight,
    3. prints the first 20 terms longer than 7 characters, and
    4. prints ``(rank, symptom)`` for every processed symptom token that
       equals a term of that row, where ``rank`` is the term's position in
       the full sorted list (not the length-filtered one).

    Parameters:
        M_coo: matrix object providing ``tolil()``; the result must support
            ``getrow(i).nonzero()`` and ``[row, col]`` indexing.

    Returns:
        None. All results are written to stdout.

    Raises:
        KeyError: if a disease name is missing from ``_diseaseHash`` or a
            column index is missing from ``revTermHashTable``.
    """
    # Each disease is paired with its own case description so the two can
    # never get out of step (the earlier counter-based lookup was reset on
    # every iteration and always used the first description).
    cases = [
        ("Adrenoleukodystrophy  autosomal  neonatal form",
         "Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions"),
        ("Kleine Levin Syndrome",
         "Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"),
    ]

    sanitizer = TextCleaner.sanitizeString()

    M_lil = M_coo.tolil()

    for disease, description in cases:
        rowIndex = _diseaseHash[disease]

        # The first non-zero column of the row is dropped.
        # NOTE(review): presumably that column holds the row's hash/label
        # rather than a term weight -- confirm against the matrix layout.
        termIndices = M_lil.getrow(rowIndex).nonzero()[1][1:]

        termList = [(M_lil[rowIndex, colIndex], revTermHashTable[colIndex])
                    for colIndex in termIndices]
        # Highest weight first; ties fall back to the term itself.
        termList.sort(reverse=True)

        # Only terms longer than 7 characters are shown in the top list.
        printout1 = [term for _, term in termList if len(term) > 7][:20]

        # Single-argument print(...) calls produce the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('Top 20 terms:')
        print('---------------------')
        print(printout1)
        print("=====================")

        # Run the description through the same pipeline as before:
        # sanitise, drop stopwords, stem, then split into search tokens.
        symptoms = sanitizer.sub(' ', description)
        symptoms = FilterInterface.stopwordRemover(symptoms)
        symptoms = FilterInterface.porterStemmer(symptoms)
        symptoms = SearchTermDoc._modifySearchString(symptoms)

        # enumerate() yields each term's real position, avoiding a linear
        # termList.index() search for every match.
        printout2 = [(rank, symptom)
                     for symptom in symptoms
                     for rank, (_, term) in enumerate(termList)
                     if term == symptom]

        print('Ranks of searched symptoms:')
        print('---------------------')
        print(printout2)
        print("=====================")
        print('')
Ejemplo n.º 2
0
def getSemanticKeywords(top=20):
    """Print, per sample disease, the rank of each symptom token among its terms.

    For every hard-coded (disease, case description) pair the function reads
    the disease's term-document matrix from ``matrixDir``, sums each term
    column, sorts the terms by descending sum and records the position of
    every term that also occurs among the stemmed symptom tokens. Finally it
    prints, for each disease, the total number of term columns and the
    ``(term, rank)`` pairs found.

    Parameters:
        top: currently unused; kept for backward compatibility (the only
            statement that used it, ``return termSum[:top]``, was disabled).

    Returns:
        None. All results are written to stdout.
    """
    diseaseList = [
        ("Fibrodysplasia ossificans progressiva","Boy, normal birth, deformity of both big toes (missing joint), quick development of bone tumor near spine and osteogenesis at biopsy"),
        ("Adrenoleukodystrophy  autosomal  neonatal form","Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions"),
        ("Papillon Lefevre syndrome","Boy age 14, yellow keratotic plaques on the skin of palms and soles going up onto the dorsal side. Both hands and feet are affected. swollen vulnerable gums, loss of permanent teeth"),
        ("Kleine Levin Syndrome","Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"),
        ("Schinzel Giedion syndrome","Male child, malformations at birth, midfacial retraction with a deep groove under the eyes, and hypertelorism, short nose with a low nasal bridge and large low-set ears, wide mouth and retrognathia. Hypertrichosis with bright reddish hair and a median frontal cutaneous angioma, short neck with redundant skin, Bilateral inguinal hernias, hypospadias with a megameatus, and cryptorchidism"),
    ]

    matrixDir = "/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_90"
    # Alternative matrix sets that have been used with this function:
    #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_10"
    #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_tfidf_stemmed_reduced90_outlierRemoved5"

    scoreDic = {}
    totalTermDic = {}
    for filename, description in diseaseList:

        # Materialise the stemmed tokens as a list: the membership test
        # below runs once per term, so a one-shot iterator would not do.
        symptoms = [FilterInterface.porterStemmer(token)
                    for token in SearchTermDoc._modifySearchString(description)]

        M_coo = IOmodule.readInTDM(matrixDir, filename)
        # Column 0 is not counted as a term column.
        totalTermDic[filename] = (M_coo.shape[1] - 1)

        M_csc = M_coo.tocsc()

        termSum = []
        for col in range(1, M_coo.shape[1]):
            # Row 0 holds the key used to look the term up in revTermHashTable.
            term = revTermHashTable[M_csc[0, col]]
            # NOTE(review): data[:-1] drops the LAST stored value of the
            # column; if the intent is to exclude the row-0 key this relies
            # on the storage order of the column's entries -- confirm.
            termSum.append((sum(M_csc.getcol(col).data[:-1]), term))

        # Highest column sum first; ties fall back to the term itself.
        termSum.sort(reverse=True)

        # enumerate() gives each term's own position directly instead of a
        # linear termSum.index() search for every matching term.
        ranks = {}
        for rank, (_, term) in enumerate(termSum):
            if term in symptoms:
                ranks[term] = rank
        scoreDic[filename] = ranks

    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    for filename, ranks in scoreDic.items():
        print("Total number of terms for the disease: %s" % (totalTermDic[filename],))
        print(str(filename) + '\t' + str(list(ranks.items())))
        print('')