def resetSystem(self): #constants #variables categoryEntity = dataClasses.Category() categoryTerm = dataClasses.CategoryTerm() document = dataClasses.Document() documentTerm = dataClasses.DocumentTerm() documentCategory = dataClasses.DocumentCategory() term = dataClasses.Term() #Reset persistent data categoryTerm.delete() #Delete Category Term Instances documentCategory.delete() #Delete Document Category Instances documentTerm.delete() #Delete Document Term Instances categoryEntity.delete() #Delete Category Instances document.delete() #Delete Document Instances term.delete() #Delete Term Instances
def indexDocument(self, documentLocation, documentType, wrkFileLoc): #constants NO = "no" YES = "yes" DELIM = "~" dbAppend = 'mysqlimport -u root -p --lines-terminated-by="\\r\\n"' + \ ' --fields-terminated-by="~" --password="******" --local hj801 ' lineEnd = '\n' #variables categoryDict = dict() categoryEntity = dataClasses.Category() classesExist = NO doc = etree.parse(documentLocation) document = dataClasses.Document() documentCategory = dataClasses.DocumentCategory() documentCategoryCount = int() documentCategoryDict = dict() documentCategoryFile = open(wrkFileLoc + 'document_category.txt', 'w') documentTerm = dataClasses.DocumentTerm() documentTermCount = int() documentTermFile = open(wrkFileLoc + 'document_term.txt', 'w') documentFile = open(wrkFileLoc + 'document.txt', 'w') termOccurenceDict = dict() firstTimeThru = YES stemmer = PorterStemmer() stopChar = dataClasses.StopChar() stopCharacter = list() stopWord = dataClasses.StopWord() stopWords = list() termEntity = dataClasses.Term() normalizedTermFrequencyDict = dict() termDict = dict() word = str() #get current key values documentId = document.getLastKeyValue() documentCategoryId = documentCategory.getLastKeyValue() documentTermId = documentTerm.getLastKeyValue() #load categories categoryEntity = dataClasses.Category() categoryList = categoryEntity.get() if categoryList is not None: for c in categoryList: categoryDict[c.categoryName] = c.categoryId #load terms termEntity = dataClasses.Term() termList = termEntity.get() if termList is not None: for t in termList: termDict[t.term] = t.termId normalizedTermFrequencyDict[ t.termId] = t.normalizedTermFrequency #load Stop Words stopWordList = stopWord.get() for s in stopWordList: #decode list of tuples into simple stop char list stopWords.append(s.stopWord) stopCharList = stopChar.get() for s in stopCharList: #decode list of tuples into simple stop char list stopCharacter.append(s.stopChar) #main processing of XML Document xpathString = "/documents/document[type='" + documentType + "']" for d in doc.xpath( xpathString): #read through XML document and parse nodes if len(d) > 0: for e in d: if e.tag == "title": #title & new document if firstTimeThru == NO: #write a new record for previous document & reset variables #only write document record if classes exist for document if classesExist == YES: tmpStr = str(documentId) + DELIM + str(documentName) + \ DELIM + str(documentType) + DELIM + str(documentCategoryCount) + \ lineEnd documentFile.write(tmpStr) #write document classes if documentCategoryDict is not None: for dc in documentCategoryDict.iteritems(): documentCategoryId = documentCategoryId + 1 tmpStr = str(documentCategoryId) + DELIM + \ str(documentId) + DELIM + str(dc[0]) + \ DELIM + str(dc[1]) + DELIM + '1' + DELIM + '1' + lineEnd documentCategoryFile.write(tmpStr) #write document terms if termOccurenceDict is not None: for t in termOccurenceDict.iteritems(): documentTermId = documentTermId + 1 termId = t[0] termFrequency = int(t[1]) if int(t[1]) > 0: normalizedTermFrequency = ( float(termFrequency) / documentTermCount) else: normalizedTermFrequency = 0 #calculate tf-idf tfIdf = float() inverseDocumentFrequency = \ (1 / float(normalizedTermFrequencyDict.get(termId, 0))) tfIdf = normalizedTermFrequency * inverseDocumentFrequency tmpStr = str(documentTermId) + DELIM + \ str(termId) + DELIM + str(documentId) + \ DELIM + str(normalizedTermFrequency) + DELIM + str(tfIdf) + \ DELIM + str(termFrequency) + lineEnd documentTermFile.write(tmpStr) #reset variables documentName = "" documentType = "" documentTermCount = 0 documentCategoryCount = 0 documentCategoryDict.clear() termOccurenceDict.clear() classesExist = NO documentId = documentId + 1 else: firstTimeThru = NO documentId = documentId + 1 #document name if e.text is not None: titleText = e.text documentName = e.text.strip() documentName = documentName.replace("'", "") documentName = documentName.replace('"', '') documentName = documentName.replace("\n", '') else: documentName = "" titleText = "" elif e.tag == "type": #document type if e.text is not None: documentType = e.text.strip() else: documentType = "" elif e.tag == "classification": #possible multiple classifications if e.text is not None: #get class id if e.text.strip() in categoryDict: categoryId = categoryDict.get(e.text.strip()) if categoryId not in documentCategoryDict: documentCategoryDict[categoryId] = int(1) documentCategoryCount = documentCategoryCount + 1 classesExist = YES elif e.tag == "text": #text likely to be split over several lines #this bit takes each line and strips out individual terms if e.text is not None: totalText = titleText + " " + e.text else: totalText = titleText if totalText is not None: for s in totalText: if ord( s ) > 64: #65 is the start of the alphabet if s not in stopCharacter: word = word + s.lower() else: if word not in stopWords: word = stemmer.stem_word(word) termId = termDict.get(word, 0) if termId > 0: documentTermCount = documentTermCount + 1 termOccurence = termOccurenceDict.get( termId, 0) termOccurenceDict[ termId] = termOccurence + 1 word = "" #reset word to empty #process last word if word not in stopWords: word = stemmer.stem_word(word) termId = termDict.get(word, 0) word = str() if termId > 0: documentTermCount = documentTermCount + 1 termOccurence = termOccurenceDict.get( termId, 0) termOccurenceDict[ termId] = termOccurence + 1 #write last header record if firstTimeThru == NO: if classesExist == YES: tmpStr = str(documentId) + DELIM + str(documentName) + DELIM + \ str(documentType) + DELIM + str(documentCategoryCount) + lineEnd documentFile.write(tmpStr) #write document classes if documentCategoryDict is not None: for dc in documentCategoryDict.iteritems(): documentCategoryId = documentCategoryId + 1 tmpStr = str(documentCategoryId) + DELIM + str(documentId) + \ DELIM + str(dc[0]) + DELIM + str(dc[1]) + DELIM + '1' + \ DELIM + '1' + lineEnd documentCategoryFile.write(tmpStr) #write document terms if termOccurenceDict is not None: for t in termOccurenceDict.iteritems(): documentTermId = documentTermId + 1 termId = t[0] termFrequency = int(t[1]) if documentTermCount > 0: normalizedTermFrequency = (float(termFrequency) / documentTermCount) else: normalizedTermFrequency = 0 #calculate tf-idf inverseDocumentFrequency = \ (1 / float(normalizedTermFrequencyDict.get(termId, 0))) tfIdf = normalizedTermFrequency * inverseDocumentFrequency tmpStr = str(documentTermId) + DELIM + str(termId) + \ DELIM + str(documentId) + DELIM + str(normalizedTermFrequency) + \ DELIM + str(tfIdf) + DELIM + str(termFrequency) + lineEnd documentTermFile.write(tmpStr) #close load files documentFile.close() documentCategoryFile.close() documentTermFile.close() #Load data print("Load documents.....") callStr = dbAppend + wrkFileLoc + 'document.txt' call(callStr, shell=True) print("Load document categories.....") callStr = dbAppend + wrkFileLoc + 'document_category.txt' call(callStr, shell=True) print("Load document terms.....") callStr = dbAppend + wrkFileLoc + 'document_term.txt' call(callStr, shell=True)
def checkResults(self, wrkFileLoc): #constants TEST_DOCUMENT = "TEST" NO = "no" YES = "yes" DELIM = "~" GIVEN = 1 CALCULATED = 2 dbReplace = 'mysqlimport -u root -p --lines-terminated-by="\\r\\n"' + \ ' --fields-terminated-by="~" --delete --password="******" --local hj801 ' lineEnd = '\n' #variable document = dataClasses.Document() documentAuditFile = open(wrkFileLoc + 'document_audit.txt', 'w') documentCategory = dataClasses.DocumentCategory() documentDict = dict() firstTimeThru = YES givenCategorySet = set() svDocumentID = int() documentAuditId = int() truePositive = int() falsePositive = int() #read document category count into memory documentType = TEST_DOCUMENT documentList = document.getDocumentByType(documentType) if documentList is not None: for d in documentList: documentDict[d.documentId] = d.documentCategoryCount #read current given document classes into memory documentType = TEST_DOCUMENT categoryWeightTypeId = GIVEN documentCategoryList = documentCategory.getByDocumentType(documentType, \ categoryWeightTypeId) if documentCategoryList is not None: for dc in documentCategoryList: docCategoryStr = (str(dc.documentId) + ':' + str(dc.categoryId)) givenCategorySet.add(docCategoryStr) documentType = TEST_DOCUMENT categoryWeightTypeId = CALCULATED categoryForDocumentList = documentCategory.getByDocumentType(documentType, \ categoryWeightTypeId) if categoryForDocumentList is not None: for cd in categoryForDocumentList: documentId = cd.documentId categoryId = cd.categoryId positiveAssignment = cd.positiveAssignment categoryWeightTypeId = cd.categoryWeightTypeId #have any control fields changed if (svDocumentID != documentId): if firstTimeThru == NO: #write new audit record documentCategoryCount = documentDict.get( svDocumentID, 0) documentAuditId = documentAuditId + 1 tmpStr = str(documentAuditId) + DELIM + str(svDocumentID) + DELIM + \ str(categoryWeightTypeId) + DELIM + str(truePositive) + DELIM + \ str(falsePositive) + DELIM + str(documentCategoryCount) + lineEnd documentAuditFile.write(tmpStr) else: firstTimeThru = NO svDocumentID = documentId truePositive = 0 falsePositive = 0 #check is assigned class is part of given class set docCategoryStr = (str(documentId) + ':' + str(categoryId)) if positiveAssignment == 1: truePositive = truePositive + 1 else: falsePositive = falsePositive + 1 #remember to write last record if firstTimeThru == NO: #write new audit record documentCategoryCount = documentDict.get(svDocumentID, 0) documentAuditId = documentAuditId + 1 tmpStr = str(documentAuditId) + DELIM + str(documentId) + DELIM + \ str(categoryWeightTypeId) + DELIM + str(truePositive) + DELIM + \ str(falsePositive) + DELIM + str(documentCategoryCount) + lineEnd documentAuditFile.write(tmpStr) documentAuditFile.close() callStr = dbReplace + wrkFileLoc + 'document_audit.txt' call(callStr, shell=True)
def calcCategoryTermFrequency(self, wrkFileLoc): #constants DELIM = "~" NO = "no" YES = "yes" dbReplace = 'mysqlimport -u root -p --lines-terminated-by="\\r\\n"' + \ ' --fields-terminated-by="~" --delete --password="******" --local hj801 ' lineEnd = '\n' #variables categoryEntity = dataClasses.Category() categoryTerm = dataClasses.CategoryTerm() categoryTermFile = open(wrkFileLoc + 'category_term.txt', 'w') documentCategory = dataClasses.DocumentCategory() documentTermDict = dict() newDocumentTermDict = dict() termEntity = dataClasses.Term() #get current key value categoryTermId = categoryTerm.getLastKeyValue() #load starter terms dict termList = termEntity.get() if termList is not None: for t in termList: documentTermDict[t.termId] = 0 #calculate the frequency (probability) of a term #given a particular class categoryList = categoryEntity.get() #get all categories if categoryList is not None: for c in categoryList: categoryId = c.categoryId #read all documents for this class totalTermsForCategoryCount = 0 #reset documentCategory = dataClasses.DocumentCategory() documentCategoryList = documentCategory.getDocumentByCategory( categoryId) if documentCategoryList is not None: for dc in documentCategoryList: #now get all terms for document list documentTerm = dataClasses.DocumentTerm() documentTermList = documentTerm.getByDocument( dc.documentId) if documentTermList is not None: for df in documentTermList: #accumulate term frequency totalTermsForCategoryCount = totalTermsForCategoryCount + \ df.termFrequency categoryTermCount = documentTermDict.get( df.termId, 0) documentTermDict[df.termId] = categoryTermCount + \ df.termFrequency #end of terms for this category #check for zero frequency records, if found apply Laplace correction #(if requested) laplaceCorrectionRequired = NO #assume not required if totalTermsForCategoryCount > 0: if documentTermDict is not None: for terms in documentTermDict.items(): if terms[1] == 0: #check for any zero count terms laplaceCorrectionRequired = YES break #leave loop if found if laplaceCorrectionRequired == YES: totalTermsForCategoryCount = 0 #reset for terms in documentTermDict.items(): #add 1 to all term counts #this ensures that this is no zero counts newDocumentTermDict[terms[0]] = terms[1] + 1 totalTermsForCategoryCount = totalTermsForCategoryCount + \ terms[1] + 1 documentTermDict = newDocumentTermDict #now read back dictionary of terms and write category term record if totalTermsForCategoryCount > 0: if documentTermDict is not None: for terms in documentTermDict.items(): categoryTermId = categoryTermId + 1 termId = terms[0] normalizedCategoryTermFrequency = float( terms[1]) / totalTermsForCategoryCount tmpStr = str(categoryTermId) + DELIM + str(categoryId) + \ DELIM + str(termId) + DELIM + str(normalizedCategoryTermFrequency) + \ lineEnd categoryTermFile.write(tmpStr) #reset dict for next category if documentTermDict is not None: for t in documentTermDict.iterkeys(): documentTermDict[t] = 0 categoryTermFile.close() callStr = dbReplace + wrkFileLoc + 'category_term.txt' call(callStr, shell=True) #batch import into RDBMS
def categorizeDocument(self, documentType, wrkFileLoc, numberOfCategoriesToAssign=1): #constants TEST_DOCUMENT = "TEST" GIVEN = 1 CALCULATED = 2 dbAppend = 'mysqlimport -u root -p --lines-terminated-by="\\r\\n"' + \ ' --fields-terminated-by="~" --password="******" --local hj801 ' lineEnd = '\n' DELIM = "~" #variables categoryDict = dict() categoryEntity = dataClasses.Category() categoryTerm = dataClasses.CategoryTerm() categoryTermDict = dict() categoryTermProbDict = dict() document = dataClasses.Document() documentCategory = dataClasses.DocumentCategory() documentCategoryFile = open(wrkFileLoc + 'document_category.txt', 'w') documentTerm = dataClasses.DocumentTerm() givenCategorySet = set() #get last key value for Document Category entity documentCategoryId = documentCategory.getLastKeyValue() #read all category records into memory categoryList = categoryEntity.get() if categoryList is not None: for c in categoryList: categoryDict[c.categoryId] = c.normalizedCategoryFrequency #read all category term records into memory categoryTermList = categoryTerm.get() if categoryTermList is not None: for ct in categoryTermList: catTermStr = str(ct.categoryId) + ":" + str(ct.termId) categoryTermDict[ catTermStr] = ct.normalizedCategoryTermFrequency #read current given document classes into memory documentCategoryList = documentCategory.getByDocumentType( TEST_DOCUMENT, GIVEN) if documentCategoryList is not None: for dc in documentCategoryList: docCategoryStr = (str(dc.documentId) + ':' + str(dc.categoryId)) givenCategorySet.add(docCategoryStr) #Set loop to read all TEST Documents documentList = document.getDocumentByType(documentType) if documentList is not None: for d in documentList: documentId = d.documentId documentTermList = documentTerm.getByDocument(documentId) #compare each category for this document as per Naive Bayes #first P(Category) if categoryDict is not None: for c in categoryDict.iteritems(): categoryId = c[0] categoryTermProbDict[categoryId] = math.log(c[1]) #now calculate P(Term|Category) if documentTermList is not None: for dt in documentTermList: #now retrieve P(Term|Category) catTermStr = str(categoryId) + ":" + str( dt.termId) if catTermStr in categoryTermDict: categoryTermProbDict[categoryId] = \ categoryTermProbDict.get(categoryId,0) + \ math.log(categoryTermDict.get(catTermStr,0)) #read back classes and write document class record ds = 0 categoriesToAssign = numberOfCategoriesToAssign if categoryTermProbDict is not None: for ct in sorted(categoryTermProbDict.iteritems(), \ key=lambda (k,v): (v,k), reverse=True): if ds < categoriesToAssign: ds = ds + 1 documentCategoryId = documentCategoryId + 1 categoryId = ct[0] categoryWeight = ct[1] categoryWeightTypeId = CALCULATED positive_assignment = 0 #check if assigned class is part of given class set docCategoryStr = (str(documentId) + ':' + str(categoryId)) if docCategoryStr in givenCategorySet: givenCategorySet.discard(docCategoryStr) positive_assignment = 1 tmpStr = str(documentCategoryId) + DELIM + str(documentId) + \ DELIM + str(categoryId) + DELIM + str(categoryWeight) + \ DELIM + str(categoryWeightTypeId) + DELIM + \ str(positive_assignment) + lineEnd documentCategoryFile.write(tmpStr) #end documentCategoryFile.close() print("Load document classes.....") callStr = dbAppend + wrkFileLoc + 'document_category.txt' call(callStr, shell=True)
####################################################### import dataClasses import task from lxml import etree if __name__ == "__main__": #constants TEST_DOCUMENT = "TEST" TRAINING_DOCUMENT = "TRAIN" documentLocation = 'C:/Users/paul/Documents/Education/MRes/Project/HJ801/data/reuters_full.xml' #variables doc = etree.parse(documentLocation) documentCategorizer = task.CategorizeDocument() document = dataClasses.Document() documentCategory = dataClasses.DocumentCategory() documentTerm = dataClasses.DocumentTerm() documentIndexer = task.IndexDocument() featureReducer = task.DocumentTermReduction() numberOfCategoriesToAssign = 1 resultChecker = task.CheckResults() thresholdPercentage = 100 wrkFileLoc = "C:/Users/paul/Documents/Education/MRes/Project/HJ801/data/" #Remove existing Test Documents from previous runs print("Remove existing Test Documents from previous runs.....") documentType = TEST_DOCUMENT documentCategory.deleteByDocumentType(documentType) documentTerm.deleteByDocumentType(documentType) document.deleteByDocumentType(documentType)