def _gatherMatrixData(filename):
    """
    This function utilizes the RecordHandler module to create and structure
    the data to populate the term-doc matrices.

    For every record in the given file the title ('TI'), the abstract ('AB')
    and the MeSH terms ('MH') are concatenated into one string. That string
    is sanitized with the pattern returned by sanitizeString(), stripped of
    english stopwords and, if the module-level _stemmer flag is set, stemmed.
    The result is handed to _wordCounter together with the record's key.

    A record without a title or without an abstract is still processed: a
    notice is printed and the missing part is left out.

    It takes the records' file name to gather data from (the file is looked
    up in the module-level _medlineDir).

    It returns a list holding one _wordCounter result per record; as stated
    by the original documentation, a doc-term list on the form:
    [[PMID,[(term1,count1),...]],...]
    """

    # Get the regex pattern that sanitize strings.
    sanitizer = sanitizeString()

    records = RecordHandler.loadMedlineRecords(_medlineDir, filename)
    fields = RecordHandler.readMedlineFields(records, ['AB', 'TI', 'MH'])

    docTermList = []
    for pmid, record in fields.items():
        information = ''

        # Get the title if any. Only a missing key (KeyError) or a value
        # that cannot be appended to a string (TypeError) is treated as
        # "no title"; anything else, e.g. KeyboardInterrupt, propagates.
        try:
            information = ' ' + record['TI']
        except (KeyError, TypeError):
            print('Unable to find title in %s' % (pmid,))

        # Get the abstract if any (same error policy as for the title)
        try:
            information += ' ' + record['AB']
        except (KeyError, TypeError):
            print('Unable to find abstract in %s' % (pmid,))

        # Get all the mesh terms if any, each one preceded by a space
        if 'MH' in record:
            information += ''.join(' ' + meshterm for meshterm in record['MH'])

        # Sanitize the collected text (title, abstract and MeSH terms)
        information = sanitizer.sub(' ', information)

        # Remove english stopwords from the information
        information = FilterInterface.stopwordRemover(information)

        # OPTIONAL:
        # Stem the information
        if _stemmer:
            information = FilterInterface.porterStemmer(information)

        docTermList.append(_wordCounter(pmid, information))

    return docTermList
def testRH(path="/root/The_Hive/data_acquisition/medline_records",
           disease="Winkelman Bethge Pfeiffer syndrome.txt"):
    """
    Ad-hoc check of the RH record reader: loads one MEDLINE record file and
    builds, for every record, a single string made of its title ('TI'), its
    abstract ('AB') and its MeSH terms ('MH'), separated by single spaces.

    Unlike _gatherMatrixData, a record lacking any of the three fields is
    skipped entirely (a notice is printed), and the text is returned raw:
    no sanitizing, stopword removal or stemming is applied.

    It takes the directory holding the record files ('path') and the name of
    the record file to read ('disease'). Both default to the values that
    were previously hard-coded, so calling testRH() behaves as before.

    It returns a list of (record key, combined text) tuples.
    """
    records = RH.loadMedlineRecords(path, disease)
    fields = RH.readMedlineFields(records, ['TI', 'MH', 'AB'])

    collected = []
    for pmid, record in fields.items():
        # Get the title. Only a missing key (KeyError) or an unusable value
        # (TypeError) counts as "not found"; other exceptions propagate.
        try:
            information = record['TI']
        except (KeyError, TypeError):
            print('Unable to find title in %s' % (pmid,))
            continue

        # Get the abstract. The separating space keeps the last word of the
        # title from fusing with the first word of the abstract.
        try:
            information += ' ' + record['AB']
        except (KeyError, TypeError):
            print('Unable to find abstract in %s' % (pmid,))
            continue

        # Get all the mesh terms, each one preceded by a space
        try:
            for meshterm in record['MH']:
                information += ' ' + meshterm
        except (KeyError, TypeError):
            print('Unable to find MeSH in %s' % (pmid,))
            continue

        collected.append((pmid, information))

    return collected