Ejemplo n.º 1
0
def _gatherMatrixData(filename):

    """
    This function utilizes the RecordHandler module to create and structure the
    data to populate the term-doc matrices.

    For every record the title ('TI'), the abstract ('AB') and the MeSH terms
    ('MH') that are present are joined into one string, each part prefixed by
    a single space. That string is sanitized, passed through the stopword
    remover and, when the module-level _stemmer flag is set, through the
    Porter stemmer. A missing title or abstract is reported on stdout and
    skipped; missing MeSH terms are skipped silently.

    It takes the records' file name to gather data from; the file is looked
    up in the module-level _medlineDir.

    It returns a list holding one _wordCounter(PMID, text) result per record,
    documented as a doc-term list of the form:
    [[PMID,[(term1,count1),...]],...]
    """

    medlineDir = _medlineDir

    # Get the regex pattern that sanitizes strings.
    sanitizer = sanitizeString()

    docTermList = []
    records = RecordHandler.loadMedlineRecords(medlineDir, filename)
    fields = RecordHandler.readMedlineFields(records, ['AB', 'TI', 'MH'])
    for pmid, record in fields.items():
        # Collect the available text pieces in order: title, abstract, MeSH.
        parts = []

        # Get the title if any. Only a missing key is tolerated; any other
        # error is a real problem and must not be hidden.
        try:
            parts.append(record['TI'])
        except KeyError:
            print('Unable to find title in %s' % (pmid,))

        # Get the abstract if any
        try:
            parts.append(record['AB'])
        except KeyError:
            print('Unable to find abstract in %s' % (pmid,))

        # Get all the mesh terms if any
        if 'MH' in record:
            parts.extend(record['MH'])

        # Every piece is prefixed by a space, so the result is '' when the
        # record has none of the fields.
        information = ''.join(' ' + part for part in parts)

        # Sanitize the collected text
        information = sanitizer.sub(' ', information)
        # Remove english stopwords from the information
        information = FilterInterface.stopwordRemover(information)

        # OPTIONAL:
        # Stem the information
        if _stemmer:
            information = FilterInterface.porterStemmer(information)

        docTermList.append(_wordCounter(pmid, information))

    return docTermList
Ejemplo n.º 2
0
def testRH(path="/root/The_Hive/data_acquisition/medline_records",
           disease="Winkelman Bethge Pfeiffer syndrome.txt"):

    """
    Small manual check of the RH record loading and field extraction.

    Loads the records stored in the file 'disease' under 'path', reads the
    'TI', 'AB' and 'MH' fields and builds one text string per record holding
    the title, the abstract and every MeSH term, separated by single spaces.

    A record that lacks any of the three fields is reported on stdout and
    left out of the result.

    Both parameters default to the values that used to be hard-coded, so
    calling testRH() without arguments behaves as before.

    It returns a list of (record key, text) tuples, where the record key is
    the key under which RH.readMedlineFields returned the record.
    """

    records = RH.loadMedlineRecords(path, disease)
    fields = RH.readMedlineFields(records, ['TI', 'MH', 'AB'])

    collected = []

    for recordKey, record in fields.items():
        # Get the title
        try:
            title = record['TI']
        except KeyError:
            print('Unable to find title in %s' % (recordKey,))
            continue

        # Get the abstract
        try:
            abstract = record['AB']
        except KeyError:
            print('Unable to find abstract in %s' % (recordKey,))
            continue

        # Get the MeSH terms
        try:
            meshterms = record['MH']
        except KeyError:
            print('Unable to find MeSH in %s' % (recordKey,))
            continue

        # Join with single spaces so that the last word of the title and the
        # first word of the abstract do not get fused into one token.
        information = ' '.join([title, abstract] + list(meshterms))

        collected.append((recordKey, information))

    return collected
Ejemplo n.º 3
0
def createTermAndPmidHashes():

    """
    This function creates two hash tables of the PMID's and terms to be used
    for the term-doc matrix.

    Every file found in the module-level _medlineDir is loaded through
    RecordHandler. Each previously unseen PMID and each previously unseen
    term is given the next running number, starting at 1, in the order in
    which it is first encountered. The terms of a record are taken from its
    abstract ('AB'), title ('TI') and MeSH terms ('MH'), in that order.

    Note that the terms are sanitized for any non-alphanumerical characters.
    And it is default to remove stop words. Stemming is applied only when the
    module-level _stemmer flag is set.

    A progress line is printed after each file, and both tables are written
    out through IOmodule.pickleOut into _hashTablesDir under the names
    _termHash and _pmidHash.

    It returns the tuple (termHashTable, pmidHashTable).
    """

    medlineDir = _medlineDir
    hashTables = _hashTablesDir
    termHashTable = {}
    pmidHashTable = {}
    termCounter = 0
    pmidCounter = 0

    files = IOmodule.getSortedFilelist(medlineDir + '/')

    # Get the regex pattern that sanitizes strings.
    sanitizer = TextCleaner.sanitizeString()

    for filename in files:
        records = RecordHandler.loadMedlineRecords(medlineDir, filename)

        for diseaseRecords in records.values():
            for record in diseaseRecords:
                pmid = record[0]
                recordFields = record[1]

                # Hash PMID's
                if pmid not in pmidHashTable:
                    pmidCounter += 1
                    pmidHashTable[pmid] = pmidCounter

                # Collect the text pieces. The order (abstract, title, MeSH)
                # matters: term numbers are handed out in order of first
                # appearance.
                parts = []

                # Get the abstract. Only a missing key is tolerated; any
                # other error is a real problem and must not be hidden.
                try:
                    parts.append(recordFields['AB'])
                except KeyError:
                    print('Unable to get abstract %s' % (pmid,))

                # Get the title
                try:
                    parts.append(recordFields['TI'])
                except KeyError:
                    print('Unable to get title for %s' % (pmid,))

                # Missing MeSH terms are not reported, as most of the
                # records do not have MeSH.
                if 'MH' in recordFields:
                    parts.extend(recordFields['MH'])

                # Every piece is prefixed by a space, so the result is ''
                # when the record has none of the fields.
                information = ''.join(' ' + part for part in parts)

                # Sanitize the information
                information = sanitizer.sub(' ', information)
                # Remove stopwords from the information
                information = FilterInterface.stopwordRemover(information)

                # OPTIONAL:
                # Stem the information
                if _stemmer:
                    information = FilterInterface.porterStemmer(information)

                # Hash terms; empty strings produced by consecutive spaces
                # are ignored.
                for term in information.split(' '):
                    if term != '' and term not in termHashTable:
                        termCounter += 1
                        termHashTable[term] = termCounter

        print(str(termCounter) + " terms hashed. " + str(pmidCounter) + " pmids hashed.")

    IOmodule.pickleOut(hashTables, _termHash, "btd", termHashTable)
    IOmodule.pickleOut(hashTables, _pmidHash, "btd", pmidHashTable)

    return termHashTable, pmidHashTable