Exemple #1
0
def createTermDoc(refreshHash=False):

    """
    This function creates a large term-doc martix from a directory of sub term-
    doc matrices.

    It returns a matrix with dimensions given by the specified hash tables.

    It also saves the matrix for later use as a MatrixMarket .mtx file.
    """

    t1 = time.time()

    if refreshHash:
        createTermAndPmidHashes()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

#    files = sorted([f for f in os.listdir(_subMatrixDir+"/") if os.path.isfile(_subMatrixDir+"/" + f)])
    
    termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash)
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash)


    # Need to add one due to non zero indexing
    m=len(pmidHashTable)+1
    n=len(termHashTable)+1

    termDoc = sparse.lil_matrix((m,n))

    # Insert values representing hashes
    for i in range(m): termDoc[i,0]=i
    termDoc[0,:]=range(n)

    for file in files:
        subMatrix=IOmodule.readInTDM(_subMatrixDir, file)
        subMCopy=subMatrix.todok()
        for i,j,v in zip(subMatrix.row, subMatrix.col, subMatrix.data):
            m = subMCopy[i,0]
            n = subMCopy[0,j]

            # Make sure not to add index's
            if m==0 or n==0:
                continue

            termDoc[m,n] += v
        print "Added",file

    IOmodule.writeOutTDM(_termDocDir, _termDoc, termDoc)

    t2 = time.time()

    print 'Time elapsed:',str(t2-t1)

    return termDoc
Exemple #2
0
def medlineDir2MatrixDir():

    """
    This function converts a directory of MedLine records to a new directory of
    corresponding term-doc matrices.

    It takes the matrix dimensions (row: m, col: n).

    It creates a directory (in the home folder) named 'diseaseMatrices' and
    stores the matrices as 'MatrixMarket' .mtx files, named by the disease name.
    """

    termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash)
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash)

    files = IOmodule.getSortedFilelist(_medlineDir+'/')

#    files = sorted([f for f in os.listdir(_medlineDir+"/") if os.path.isfile(_medlineDir+"/" + f)])

    counter = 0
    for file in files:
        data = _gatherMatrixData(file)

        # Get matrix dimensions (+1 for the [0,0] field)
        ## (Here follows a small 0.0001 sec. hack to get n = total number of terms)
        temp={}
        for pmid in data:
            for term in pmid[1]:
                temp[term[0]]=0
        m=len(data)+1
        n=len(temp)+1

        M = _populateMatrix(m, n, data,termHashTable, pmidHashTable)
        diseaseName = file[0:file.find('.txt')]
        IOmodule.writeOutTDM(_subMatrixDir, diseaseName, M)
        counter += 1
        print str(counter),"matrices made. Total number of terms:",len(M.getrow(0).nonzero()[0])
Exemple #3
0
def constructDiseaseMatrix(subMatrixDir, avg=False, output=False, time_log=False):

    """
    Receives a subMatrixDir, goes through all the files and sums up the
    columns of each, creating a single row vector containing the sum of
    all columns in the sub term-doc matrix. It then proceeds to making
    a disease term doc, based on these row vectors.

    Optional flags are:

    avg: take the average over the columns of the sub matrices
    instead of the sum.

    output: make the function produce additional progress output.

    time_log: make the function print out how much time is spent on
    what.
    """

    if output:
        print 'Initialising...'

    if time_log:
        t1 = time.time()

    files = IO.getSortedFilelist(subMatrixDir)

    # NOTE(review): `diseaseHash` (unlike `_termHash`) is not defined in this
    # excerpt — presumably a module-level global; verify.
    termHashTable = IO.pickleIn(_hashTablesDir, _termHash)
    diseaseHashTable = IO.pickleIn(_hashTablesDir, diseaseHash)

    # One row per disease file, one column per term (+1 for header row/col).
    diseaseMatrix=sparse.lil_matrix((len(files)+1,len(termHashTable)+1))

    # Initialize subTermSum to something
    subTermSum = sparse.lil_matrix((1,1))

    if output:
        print 'Done initialising disease matrix of size', str((len(files)+1,len(termHashTable)+1))
        count = 0

    if time_log:
        print 'Time for initialization:', str(time.time() - t1)[:4]

    for f in files:
        if time_log:
            t2 = time.time()
        # Disease name = filename minus the '.mtx' extension.
        diseaseName = f[0:f.find('.mtx')]
        if output:
            print 'Processing', diseaseName
            count+=1
            print 'Numbers remaining', len(files)-count

        subTermDoc = IO.readInTDM(subMatrixDir, diseaseName)
        subTermDoc = subTermDoc.tolil()

        # If the subTermDoc contains nothing, just skip it
        if(subTermDoc.shape[0] == 1 and subTermDoc.shape[1] == 1):
            continue

        subTermSum = getColumnSum(subTermDoc,avg)
        # NOTE(review): the row assignment on the next line overwrites the
        # [0,0] value set here — confirm this ordering is intended.
        subTermSum[0,0] = diseaseHashTable[diseaseName]
        subTermSum[0,:] = subTermDoc.getrow(0)

        # Put the disease hash into the header column of its own row.
        diseaseMatrix[diseaseHashTable[diseaseName],0] = diseaseHashTable[diseaseName]

        if time_log:
            print 'Time for', diseaseName, str(time.time() - t2)[:4]
            t3 = time.time()

        if output:
            print 'Filling in values in disease matrix for', diseaseName
        # NOTE(review): indexing row 1 of subTermSum assumes getColumnSum
        # returns a 2-row matrix (row 0 = term hashes, row 1 = sums) — verify
        # against getColumnSum, which is defined outside this excerpt.
        for columnIndex in range(1,subTermSum.shape[1]):
            diseaseMatrix[diseaseHashTable[diseaseName],subTermSum[0,columnIndex]] = subTermSum[1,columnIndex]
        if time_log:
            print 'Values filled into disease matrix in', str(time.time() - t3)[:4]
        if output:
            print 'Completed filling in values.'

    # Hack way of making term hashes
    # NOTE(review): range length is len(termHashTable) but the matrix has
    # len(termHashTable)+1 columns — confirm the off-by-one is intended.
    diseaseMatrix[0,:] = range(0,len(termHashTable))

    if output:
        print 'Done making disease matrix, writing to'

    # NOTE(review): `label` is not defined in this excerpt — presumably a
    # module-level global naming the output file; verify.
    IO.writeOutTDM(_termDocDir, label, diseaseMatrix)

    if output:
        print 'Done writing disease matrix.'

    return diseaseMatrix
Exemple #4
0
 # Load the precomputed length of each column in the stemmed term-doc matrix
#_termSum = IOmodule.pickleIn(_hashTablePath,_CLHash)


#######################################################################################
#### Use stopword-removed, Porter-stemmed (english) and TFIDF-prefiltered TermDoc: ####
#######################################################################################

 # TFIDF-matrix file name
_tfidfName = "label_TFIDFMatrix_reduced_90"
 # Vector-norm hash for the TFIDF matrix (kept commented out for reference)
#_RLHash = "RLHash_tfidf_stemmed"
 # Hash for the number of documents each term occurs in
_CLHash = "svdlabel_CLHash"
 # Load the precomputed length of each column in the stemmed term-doc matrix.
 # NOTE(review): IOmodule and _hashTablePath are defined outside this excerpt.
_termSum = IOmodule.pickleIn(_hashTablePath,_CLHash)

####################################################################

print "Hashes loaded."


def _generateLogTFIDF(M_coo):

    """
    Create a Term-Frequency Inverse-Document-Frequency matrix from a sparse
    coo_matrix, using log-transformation on both TF and IDF.

    Returns a sparse lil_matrix to be used for vector-normalization.
    """
    # NOTE(review): no implementation is visible in this excerpt — the body
    # appears to have been truncated; confirm against the full source file.
Exemple #5
0
# Main folder
_path = os.getenv("HOME") + "/" + "The_Hive"
# Sub folder
_subFolder = _path + "/" + "term_doc"
# Hashtable directory
_hashTablePath = _subFolder + "/" + "hashTables"
# Set True for Porter-stemming
_stemmer = True
# For term-document matrix set to True; for disease matrix set to False.
_termdocumentmatrix = False
# Similarity measure: cosine when True, plain sum when False.
_cosineMeasure = False
############
if _termdocumentmatrix:
    # Disease label hash (for pmid lookup)
    _labelHash = IOmodule.pickleIn(_hashTablePath, "labelHash")
    print "Label hash loaded"
else:
    # Disease label hash (for label lookup): invert diseaseHash so hash
    # values map back to disease names.
    _diseaseHash = IOmodule.pickleIn(_hashTablePath, "diseaseHash") #_reduced")
    _labelHash = dict(zip(_diseaseHash.values(), _diseaseHash.keys()))
    print "Disease hash loaded"
############

def search(M_lil, M_csc, queryString, top=20):

    """
    This function is still a work in progress..
    """
    
    sanitizer = TextCleaner.sanitizeString()
Exemple #6
0
# Term- and PMID-hash directory
_hashTablesDir=_path+'/'+'term_doc'+'/'+"hashTables"

# Create the sub folder if it does not exist.
# NOTE(review): `subFolder` (no leading underscore) is not defined in this
# excerpt; if the module only defines `_subFolder`, these two lines raise a
# NameError — verify against the full source file.
if not os.path.isdir(_path+'/'+subFolder):
    os.mkdir(_path+'/'+subFolder)

# Set True for Porter-stemming
_stemmer=True

####################################################################
#### Use stopword-removed TermDoc ##################################
####################################################################

if not _stemmer:
    # Hashes
    termHashTable=IOmodule.pickleIn(_hashTablesDir, "termHash")
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, "pmidHash")
    # Reverse map: hash value -> PMID
    revPmidHashTable=dict(zip(pmidHashTable.values(),pmidHashTable.keys()))

####################################################################
#### Use stopword-removed and Porter-stemmed (english) TermDoc: ####
####################################################################
else:
    # Stemmed hashes
    termHashTable=IOmodule.pickleIn(_hashTablesDir, "termHash_stemmed")
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, "pmidHash_stemmed")
    # Reverse map: hash value -> PMID
    revPmidHashTable=dict(zip(pmidHashTable.values(),pmidHashTable.keys()))


print "Hashes loaded"