Example #1
def runOutlierDetector(dir, distance=cosine_dense, removePercent=0.05, output=False, time_log=False):

    files = IO.getSortedFilelist(dir+'/')

    if output:
        counter = 0

    for f in files:
        diseaseName = f[0:f.find('.mtx')]
        subTermDoc = IO.readInTDM(dir, diseaseName)
        if output:
            counter += 1
            print 'Count:', counter

        # If sub term document matrix is empty, just skip it.
        if subTermDoc.shape[0]==1 or subTermDoc.shape[1]==1:
            continue

        if time_log:
            t1 = time.time()
        subTermDoc = outlierDetector(subTermDoc, distance, removePercent, output, time_log)
        if time_log:
            print 'Time for outlier detection on', diseaseName, ':', str(time.time() - t1)[:4]

        if output:
            print 'Writing',

        subTermDoc = sparse.coo_matrix(subTermDoc)

        IO.writeOutTDM(_subFolder+'/'+outlierRemoved+str(int(removePercent*100)), diseaseName, subTermDoc)
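A minimal call sketch, assuming the module globals referenced above (_subFolder, outlierRemoved, cosine_dense) are configured as in the project; the directory path is only a placeholder.

# Hypothetical invocation for illustration only; the path is a placeholder.
runOutlierDetector('/path/to/diseaseMatrices', removePercent=0.05, output=True, time_log=True)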
Example #2
def runTFIDF():

    """
    Create a normalized log-transformed TFIDF-matrix from a sparse coo_matrix.
    """

    files = IOmodule.getSortedFilelist(_matrixDir + "/")

    #    files = sorted([f for f in os.listdir(_matrixDir+"/") if os.path.isfile(_matrixDir+"/" + f)])

    for file in files:

        file = file[:-4]

        subM_coo = IOmodule.readInTDM(_matrixDir, file)

        t1 = time.time()
        dense, lil = _generateLogTFIDF(subM_coo)
        t2 = time.time()
        print "Generated log_TFIDF in " + str(t2 - t1)

        t1 = time.time()
        _normalizeVectorLengths(dense, lil, file)
        t2 = time.time()
        print "Normalized vector lengths in " + str(t2 - t1)

        print "Done with: " + file + "\n"
Example #3
def createDiseaseLabelHash():

    """
    Create and save a hash that connects every PMID with one or more diseases.
    """

    t1 = time.time()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

    labelHash={}

    fileCount=0
    for file in files:
        subMatrix=IOmodule.readInTDM(_subMatrixDir, file)
        colMatrix=subMatrix.tocsc()

        pmids=colMatrix.getcol(0)[1:].data

        for pmid in pmids:
            try:
                labelHash[pmid].append(file[:-4])
            except KeyError:
                labelHash[pmid] = [file[:-4]]
            
        fileCount+=1
        print "Remaining:",(len(files)-fileCount),"Completed",file[:-4]

    t2 = time.time()

    print 'Created disease label hash in:',str(t2-t1)

    IOmodule.pickleOut(_hashTablesDir, _labelHash, "btd", labelHash)
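The try/except above can also be written more compactly with dict.setdefault; an equivalent sketch of that inner loop:

        for pmid in pmids:
            labelHash.setdefault(pmid, []).append(file[:-4])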
Example #4
def createTermDoc(refreshHash=False):

    """
    This function creates a large term-doc matrix from a directory of
    sub term-doc matrices.

    It returns a matrix with dimensions given by the specified hash tables.

    It also saves the matrix for later use as a MatrixMarket .mtx file.
    """

    t1 = time.time()

    if refreshHash:
        createTermAndPmidHashes()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

#    files = sorted([f for f in os.listdir(_subMatrixDir+"/") if os.path.isfile(_subMatrixDir+"/" + f)])
    
    termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash)
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash)


    # Need to add one because row/column 0 hold the index hashes (data indexing starts at 1)
    m=len(pmidHashTable)+1
    n=len(termHashTable)+1

    termDoc = sparse.lil_matrix((m,n))

    # Insert values representing hashes
    for i in range(m): termDoc[i,0]=i
    termDoc[0,:]=range(n)

    for file in files:
        subMatrix=IOmodule.readInTDM(_subMatrixDir, file)
        subMCopy=subMatrix.todok()
        for i,j,v in zip(subMatrix.row, subMatrix.col, subMatrix.data):
            m = subMCopy[i,0]
            n = subMCopy[0,j]

            # Make sure not to add the index row/column entries themselves
            if m==0 or n==0:
                continue

            termDoc[m,n] += v
        print "Added",file

    IOmodule.writeOutTDM(_termDocDir, _termDoc, termDoc)

    t2 = time.time()

    print 'Time elapsed:',str(t2-t1)

    return termDoc
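A hedged sketch of how the finished matrix could be queried, assuming the pickled hash tables map PMIDs and terms to their row and column indices; the PMID and term literals are placeholders.

termDoc = createTermDoc(refreshHash=False)
termHashTable = IOmodule.pickleIn(_hashTablesDir, _termHash)
pmidHashTable = IOmodule.pickleIn(_hashTablesDir, _pmidHash)

row = pmidHashTable['12345678']      # placeholder PMID
col = termHashTable['fibrosis']      # placeholder term
print 'Count:', termDoc[row, col]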
Example #5
def loadMatrix(dirPath,filename):

    t1=time.time()

    M = IOmodule.readInTDM(dirPath, filename)

    t2=time.time()

    print str(t2-t1)

    return M
Example #6
def getSemanticKeywords(top=20):

    diseaseList=[("Fibrodysplasia ossificans progressiva","Boy, normal birth, deformity of both big toes (missing joint), quick development of bone tumor near spine and osteogenesis at biopsy"),
                ("Adrenoleukodystrophy  autosomal  neonatal form","Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions"),
                ("Papillon Lefevre syndrome","Boy age 14, yellow keratotic plaques on the skin of palms and soles going up onto the dorsal side. Both hands and feet are affected. swollen vulnerable gums, loss of permanent teeth"),
                ("Kleine Levin Syndrome","Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"),
                ("Schinzel Giedion syndrome","Male child, malformations at birth, midfacial retraction with a deep groove under the eyes, and hypertelorism, short nose with a low nasal bridge and large low-set ears, wide mouth and retrognathia. Hypertrichosis with bright reddish hair and a median frontal cutaneous angioma, short neck with redundant skin, Bilateral inguinal hernias, hypospadias with a megameatus, and cryptorchidism")]

    matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_90"
    #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_10"
    #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_tfidf_stemmed_reduced90_outlierRemoved5"

    scoreDic={}
    totalTermDic={}
    for disease in diseaseList:

        filename=disease[0]
        symptoms=disease[1]

        symptoms=SearchTermDoc._modifySearchString(symptoms)
        symptoms=map(FilterInterface.porterStemmer,symptoms)

        M_coo=IOmodule.readInTDM(matrixDir,filename)
        totalTermDic[filename]=(M_coo.shape[1]-1)

        M_csc=M_coo.tocsc()
    
        termSum=[]
        for col in range(1,M_coo.shape[1]):
            term=revTermHashTable[M_csc[0,col]]
            termSum.append((sum(M_csc.getcol(col).data[:-1]),term))

        termSum.sort()
        termSum.reverse()

        scoreDic[filename]={}
        for item in termSum:
            if item[1] in symptoms:
                scoreDic[filename][item[1]]=termSum.index(item)

    #return termSum[:top]

    for score in scoreDic.items():
        print "Total number of terms for the disease:",totalTermDic[score[0]]
        print str(score[0])+'\t'+str(score[1].items())
        print ''
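termSum.index(item) performs a linear scan for every matching term; a hedged sketch of the same rank lookup with a precomputed dictionary, intended to replace the inner loop over termSum inside the per-disease loop (terms are unique per matrix, so the resulting ranks are identical):

        rank = dict((term, position) for position, (weight, term) in enumerate(termSum))
        scoreDic[filename] = dict((t, rank[t]) for t in symptoms if t in rank)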
Example #7
def runAndSaveMatrices():

    """
    Transform a directory of matrices to a directory of decomposed matrices.
    """

    files = IOmodule.getSortedFilelist(_oldMatrixDir+'/')

    for file in files:

        M_coo=IOmodule.readInTDM(_oldMatrixDir,file)

        # Make sure the matrix contains information (a first dimension of 1 means only the index row, i.e. an empty matrix)
        if M_coo.shape[0]==1:
            continue

        print "Shape:"+str(M_coo.shape)

        # SVD does not run well on single-dimension matrices
        ## (remembering that the first dimension is indices and does not count)
        if M_coo.shape[0]>2:
            M_dense=M_coo.todense()

            # Run SVD
            U,Sig,Vt=_svd(M_dense)

            # Get the reduced semantic space
            S= _semanticSpace(U,Sig,Vt,_reduceBy)

            # Recombine the indices and the reduced matrix
            M_dense[1:,1:]=S.todense()

            # Save the matrix
            M_coo=sparse.coo_matrix(M_dense)

            IOmodule.writeOutTDM(_newMatrixDir, file, M_coo)
            print ''
        else:
            print "Dimensionality too low for svd"
            IOmodule.writeOutTDM(_newMatrixDir, file, M_coo)
            print ''
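The helpers _svd and _semanticSpace are not shown; below is a minimal sketch of the truncated-SVD reconstruction they presumably perform, assuming _reduceBy expresses the fraction of singular values to drop. It is a hypothetical stand-in, not the project's actual code.

import numpy as np
from scipy import sparse

def _reducedSpaceSketch(M_dense, reduceBy):
    # Hypothetical stand-in for _svd + _semanticSpace, for illustration only.
    U, sig, Vt = np.linalg.svd(np.asarray(M_dense, dtype=float), full_matrices=False)
    k = max(1, int(len(sig) * (1.0 - reduceBy)))     # number of singular values to keep
    S = np.dot(U[:, :k] * sig[:k], Vt[:k, :])        # rank-k reconstruction
    return sparse.coo_matrix(S)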
Example #8
def createDiseaseHash(dir,output=False):

    """
    Receives a directory containing files to be hashed. It uses the
    filename as the key and requires the files to be in .mtx format. The
    hashes run from 1 ... number_of_files.
    """

    diseaseHashes={}

    files = IO.getSortedFilelist(dir)
    counter=0
    for f in files:
        diseaseName=f[0:f.find('.mtx')]
        stdm=IO.readInTDM(dir,f)
        if stdm.shape[0]==1:
            continue
        if diseaseName not in diseaseHashes:
            counter+=1
            if output:
                print 'Created', diseaseName, 'with hash', counter
            diseaseHashes[diseaseName]=counter

    IO.pickleOut(_hashTablesDir, diseaseHash, "btd", diseaseHashes)
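A hedged usage sketch; the directory is a placeholder and diseaseHash is assumed to be the module-level pickle name used in the call above.

# Hypothetical invocation for illustration only; the path is a placeholder.
createDiseaseHash('/path/to/subMatrices/', output=True)
diseaseHashTable = IO.pickleIn(_hashTablesDir, diseaseHash)
print 'First entries:', diseaseHashTable.items()[:5]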
Example #9
def constructDiseaseMatrix(subMatrixDir, avg=False, output=False, time_log=False):

    """
    Receives a subMatrixDir, goes through all the files and sums up the
    columns of each, creating a single row vector containing the sum of
    all columns in the sub term-doc matrix. It then proceeds to build a
    disease term doc based on these row vectors.

    Optional flags are:

    avg, takes the average over the columns of the sub matrices
    instead of the sum.

    output, makes the function produce additional output.

    time_log, makes the function print out how much time is spent on
    what.
    """

    if output:
        print 'Initialising...'

    if time_log:
        t1 = time.time()

    files = IO.getSortedFilelist(subMatrixDir)

    termHashTable = IO.pickleIn(_hashTablesDir, _termHash)
    diseaseHashTable = IO.pickleIn(_hashTablesDir, diseaseHash)

    diseaseMatrix=sparse.lil_matrix((len(files)+1,len(termHashTable)+1))

    # Initialize subTermSum to something
    subTermSum = sparse.lil_matrix((1,1))

    if output:
        print 'Done initialising disease matrix of size', str((len(files)+1,len(termHashTable)+1))
        count = 0

    if time_log:
        print 'Time for initialization:', str(time.time() - t1)[:4]

    for f in files:
        if time_log:
            t2 = time.time()
        diseaseName = f[0:f.find('.mtx')]
        if output:
            print 'Processing', diseaseName
            count+=1
            print 'Numbers remaining', len(files)-count

        subTermDoc = IO.readInTDM(subMatrixDir, diseaseName)
        subTermDoc = subTermDoc.tolil()

        # If the subTermDoc contains nothing, just skip it
        if(subTermDoc.shape[0] == 1 and subTermDoc.shape[1] == 1):
            continue
        
        subTermSum = getColumnSum(subTermDoc,avg)
        subTermSum[0,0] = diseaseHashTable[diseaseName]
        subTermSum[0,:] = subTermDoc.getrow(0)

        diseaseMatrix[diseaseHashTable[diseaseName],0] = diseaseHashTable[diseaseName]
        
        if time_log:
            print 'Time for', diseaseName, str(time.time() - t2)[:4]
            t3 = time.time()

        if output:
            print 'Filling in values in disease matrix for', diseaseName
        for columnIndex in range(1,subTermSum.shape[1]):
            diseaseMatrix[diseaseHashTable[diseaseName],subTermSum[0,columnIndex]] = subTermSum[1,columnIndex]
        if time_log:
            print 'Values filled into disease matrix in', str(time.time() - t3)[:4]
        if output:
            print 'Completed filling in values.'

    # Quick way of filling the term hash indices into row 0
    diseaseMatrix[0,:] = range(len(termHashTable)+1)
    
    if output:
        print 'Done making disease matrix, writing to disk'

    IO.writeOutTDM(_termDocDir, label, diseaseMatrix)

    if output:
        print 'Done writing disease matrix.'
        
    return diseaseMatrix
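A minimal call sketch under the same assumptions (the pickled hash tables and module globals exist); the directory is a placeholder.

# Hypothetical invocation for illustration only; the path is a placeholder.
diseaseMatrix = constructDiseaseMatrix('/path/to/subMatrices/', avg=True, output=True, time_log=True)
print 'Disease matrix shape:', diseaseMatrix.shape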