Python IOmodule.pickleOut 예제들

프로그래밍 언어: Python

클래스/타입: IOmodule

메소드/함수: pickleOut

hotexamples.com에서의 예제들: 6

Python IOmodule.pickleOut - 6개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 IOmodule.pickleOut에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

getSortedFilelist(13)

readInTDM(9)

writeOutTDM(9)

pickleIn(6)

pickleOut(6)

writeOutTxt(3)

read_2col(2)

예제 #1

파일 보기

파일: TermDoc.py 프로젝트: hmbachelor/bachelor

def createDiseaseLabelHash():

    """
    Create and save a hash that connects every PMID with one or more diseases.
    """

    t1 = time.time()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

    labelHash={}

    fileCount=0
    for file in files:
        subMatrix=IOmodule.readInTDM(_subMatrixDir, file)
        colMatrix=subMatrix.tocsc()

        pmids=colMatrix.getcol(0)[1:].data

        for pmid in pmids:
            try:
                labelHash[pmid].append(file[:-4])
            except:
                labelHash[pmid]=[]
                labelHash[pmid].append(file[:-4])
            
        fileCount+=1
        print "Remaining:",(len(files)-fileCount),"Completed",file[:-4]

    t2 = time.time()

    print 'Created disease label hash in:',str(t2-t1)

    IOmodule.pickleOut(_hashTablesDir, _labelHash,"btd", labelHash)

예제 #2

파일 보기

파일: testModule.py 프로젝트: hmbachelor/bachelor

def createVLHash(M_lil):

    """
    Compute one norm per row of M_lil and pickle the resulting hash.

    For every row index (starting at 0) the first stored value of the row is
    skipped and linalg.norm is taken over the remaining stored values. The
    dictionary (row index -> norm) is written with IOmodule.pickleOut under
    the name "VLHash". Nothing is returned.
    """

    def rowNorm(rowIndex):
        # data[0] holds the stored values of the single extracted row
        # (assumes M_lil is a LIL sparse matrix - confirm with the caller).
        storedValues=M_lil.getrow(rowIndex).data[0]
        return linalg.norm(storedValues[1:])

    rowCount=M_lil.shape[0]
    norms=dict((rowIndex, rowNorm(rowIndex)) for rowIndex in range(rowCount))

    IOmodule.pickleOut("/root/The_Hive/term_doc/hashTables", "VLHash","btd", norms)

예제 #3

파일 보기

파일: SearchTermDoc.py 프로젝트: hmbachelor/bachelor

def createCLHash(M_coo,filename,save_file=True):

    """
    Precompute and save the length of each column vector in the term-doc matrix.
    Here the length refers to the number of elements.
    """

    t1=time.time()

    if not os.path.isdir(_hashTablesDir):
        os.mkdir(_hashTablesDir)

    M_lil=(M_coo.transpose()).tolil()

    CLHash={}
    count=0
    for termHash in range(1,M_coo.shape[1]):
        termVectorLength=len((M_lil.getrow(termHash).nonzero()[0])[1:])
        CLHash[termHash]=termVectorLength
        count+=1
        if save_file: print "Hashes created: "+str(count)+". Length:"+str(termVectorLength)

    t2=time.time()    

    if save_file:
        print "Created and saved ColumnLength-hash in: "+str(t2-t1)
        IOmodule.pickleOut(_hashTablesDir, filename,"btd", CLHash)
    else:
        return CLHash

예제 #4

파일 보기

파일: SearchTermDoc.py 프로젝트: hmbachelor/bachelor

def createRLHash(M_lil,filename,save_file=True):

    """
    Precompute and save the norm of each row vector in the term-doc matrix.
    """

    t1=time.time()

    if not os.path.isdir(_hashTablesDir):
        os.mkdir(_hashTablesDir)

    RLHash={}
    count=0
    for pmidHash in range(1,M_lil.shape[0]):
        RLHash[pmidHash]=linalg.norm((M_lil.getrow(pmidHash).data[0])[1:])
        count+=1
        if save_file: print "Hashes created: "+str(count)

    if save_file:
        IOmodule.pickleOut(_hashTablesDir, filename,"btd", RLHash)
    else:
        return RLHash

    t2=time.time()
    print "Created and saved RowLength-hash in: "+str(t2-t1)

예제 #5

파일 보기

파일: DiseaseMatrix.py 프로젝트: hmbachelor/bachelor

def createDiseaseHash(dir,output=False):

    """
    Recieves a directory containing files to be hashed. It uses the
    filename as a key. It requires the files to be in .mtx format. The
    hashes starts from 1 ... number_of_files
    """

    diseaseHashes={}

    files = IO.getSortedFilelist(dir)
    counter=0
    for f in files:
        diseaseName=f[0:f.find('.mtx')]
        stdm=IO.readInTDM(dir,f)
        if stdm.shape[0]==1:
            continue
        if diseaseName not in diseaseHashes.keys():
            counter+=1
            if output:
                print 'Created', diseaseName, 'with hash', counter
            diseaseHashes[diseaseName]=counter

    IO.pickleOut(_hashTablesDir, diseaseHash,"btd", diseaseHashes)

예제 #6

파일 보기

파일: TermDoc.py 프로젝트: hmbachelor/bachelor

def createTermAndPmidHashes():

    """
    This function creates two hash tables of the PMID's and terms to be used
    for the term-doc matrix.

    Note that the terms are sanitized for any non-alphanumerical characters.
    And it is default to remove stop words.
    """

    medlineDir = _medlineDir
    hashTables = _hashTablesDir
    termHashTable={}
    pmidHashTable={}
    termCounter = 0
    pmidCounter = 0

    files = IOmodule.getSortedFilelist(medlineDir+'/')
#    files = sorted([f for f in os.listdir(medlineDir+"/") if os.path.isfile(medlineDir+"/"+f)])

    # Get the regex pattern that sanitizeses strings.
    sanitizer = TextCleaner.sanitizeString()

    for file in files:
        records = RecordHandler.loadMedlineRecords(medlineDir, file)

        # *Note*
        # Parts of the following loops could be optimized by using dictionaries
        # for direct loopkups instead of linear lookups, but since it's not
        # important, optimization will have to wait for another day.

        # Hash PMID's
        for diseaseRecords in records.values():
            for record in diseaseRecords:
                pmid=record[0]
                if pmid not in pmidHashTable:
                    pmidCounter+=1
                    pmidHashTable[pmid]=pmidCounter

                information=''
                # Get the abstract
		try:
			information=' '+record[1]['AB']
		except:
			print 'Unable to get abstract', record[0]
		try:
			information+=' '+record[1]['TI']
		except:
			print 'Unable to get title for', record[0]

		if 'MH' in record[1]:
			for meshterm in record[1]['MH']:
				information+=' '+meshterm
		# We do not want to print this, as most of the
		# records do not have MeSH.
		# print 'Unable to get MeSH terms for', record[0]
		
                # Sanitize the information
                information=sanitizer.sub(' ', information)
                # remove stopwords from the abstract
                information=FilterInterface.stopwordRemover(information)

                # OPTIONAL:
                # Stem the abstract
                if _stemmer: information=FilterInterface.porterStemmer(information)

                termList = [word for word in information.split(' ') if word != '']
                for term in termList:
                    if term not in termHashTable:
                        termCounter+=1
                        termHashTable[term]=termCounter
                    else: continue
                
        print str(termCounter)+" terms hashed. "+str(pmidCounter)+" pmids hashed."

    IOmodule.pickleOut(hashTables, _termHash,"btd", termHashTable)
    IOmodule.pickleOut(hashTables, _pmidHash,"btd", pmidHashTable)

    return termHashTable, pmidHashTable