def PrepareJSandJM(nodes):
    nodes.head.ApplyFeature(utils.FeatureID_JS2)
    JSnode = Tokenization.SentenceNode('')
    JSnode.ApplyFeature(utils.FeatureID_JS)
    JSnode.ApplyFeature(utils.FeatureID_JS2)
    nodes.insert(JSnode, 0)

    p = nodes.head.next
    while p.next:
        if utils.FeatureID_SYM not in p.features:
            p.ApplyFeature(utils.FeatureID_JS2)
            break
        p.ApplyFeature(utils.FeatureID_JS2)
        p = p.next

    PUNCSet = {".", "?", "!", ";", "...", ":", "。"}
    if utils.FeatureID_SYM not in nodes.tail.features and \
            nodes.tail.text not in PUNCSet:
        JMnode = Tokenization.SentenceNode('')
        JMnode.StartOffset = nodes.tail.EndOffset
        JMnode.EndOffset = nodes.tail.EndOffset
        JMnode.ApplyFeature(utils.FeatureID_punc)
        nodes.append(JMnode)

    nodes.tail.ApplyFeature(utils.FeatureID_JM)
    nodes.tail.ApplyFeature(utils.FeatureID_JM2)
    p = nodes.tail.prev
    while p.prev:
        if utils.FeatureID_SYM not in p.features:
            # first one that is not punc: the real JM2.
            p.ApplyFeature(utils.FeatureID_JM2)
            break
        p.ApplyFeature(utils.FeatureID_JM2)
        p = p.prev
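# A minimal usage sketch for PrepareJSandJM, assuming Tokenization.Tokenize
# returns a SentenceLinkedList as in the tests further below. A JS start node
# is inserted at the front and, if needed, a JM punctuation node is appended.
NodeList = Tokenization.Tokenize("John likes apples.")
PrepareJSandJM(NodeList)
assert utils.FeatureID_JS in NodeList.head.features
assert utils.FeatureID_JM in NodeList.tail.features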
def test_LogicOr(self):
    """Logic Or"""
    node = Tokenization.SentenceNode('being')
    strtokenlist = Tokenization.SentenceLinkedList()
    strtokenlist.append(node)
    self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "being|getting", [Rules.RuleToken()], 0))
def test_LogicNotOr(self):
    """Logic Not/Or"""
    node = Tokenization.SentenceNode('d')
    strtokenlist = Tokenization.SentenceLinkedList()
    strtokenlist.append(node)
    RuleTokenList = [Rules.RuleToken()]
    self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "!c|d|e", RuleTokenList, 0))

    node.text = "f"
    self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "!c|d|e", RuleTokenList, 0))

    node.text = "e"
    self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "!c d|e", RuleTokenList, 0))

    node.text = "f"
    self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "!c d|e", RuleTokenList, 0))

    node.text = "c"
    self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "c|d !d|e", RuleTokenList, 0))

    node.text = "d"
    self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d !d|e", RuleTokenList, 0))

    node.text = "e"
    self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|e !d|f|g|e", RuleTokenList, 0))

    node.text = "e"
    self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d !d|c", RuleTokenList, 0))

    node.text = "f"
    self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d !d|e", RuleTokenList, 0))
def test_simple(self):
    """exact match"""
    node = Tokenization.SentenceNode('')
    node.features.add(FeatureOntology.GetFeatureID('NN'))
    strtokenlist = Tokenization.SentenceLinkedList()
    strtokenlist.append(node)
    self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "NN", None, 0))
def test_LogicAnd(self):
    """Logic And"""
    node = Tokenization.SentenceNode("c")
    strtokenlist = Tokenization.SentenceLinkedList()
    strtokenlist.append(node)
    ruletokenlist = [Rules.RuleToken()]
    self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c d", ruletokenlist, 0))
    self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "c c", ruletokenlist, 0))
def test_LogicAndOr(self):
    """Logic And/Or"""
    node = Tokenization.SentenceNode('d')
    strtokenlist = Tokenization.SentenceLinkedList()
    strtokenlist.append(node)
    ruletokenlist = [Rules.RuleToken()]
    self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d c", ruletokenlist, 0))

    node.text = "c"
    self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "c|d c", ruletokenlist, 0))
def test_And(self):
    node = Tokenization.SentenceNode("abc")
    node.features.add(FeatureOntology.GetFeatureID('NN'))
    strtokenlist = Tokenization.SentenceLinkedList()
    strtokenlist.append(node)
    self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "NN percent", [Rules.RuleToken()], 0))

    node.features.add(FeatureOntology.GetFeatureID('percent'))
    self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "NN percent", [Rules.RuleToken()], 0))
def searchQueryUnion(query, dictionary):
    result = []
    IDresult = []
    postings = []
    DocID = []
    # Tokenize the query for later processing
    tokens = Tokenization.tokenization(query)
    # Normalize the tokens for later processing
    for index in range(len(tokens)):
        tokens[index] = Normalization.cleanStopWords150(tokens[index])
        tokens[index] = Normalization.cleanedTokens(tokens[index])
        tokens[index] = Normalization.caseFoldedTokens(tokens[index])
        tokens[index] = Normalization.stemmedTokens(tokens[index])
        if tokens[index] in dictionary:
            DocID = DocID + dictionary.get(tokens[index])
    if DocID != []:
        # change docID from string to int
        for index in range(len(DocID)):
            DocID[index] = DocID[index].split(',')
            DocID[index] = DocID[index][0].split()
            DocID[index] = map(int, DocID[index])
        # Getting the union of the DocIDs for the different tokens in the query
        DocID = [set(id) for id in DocID]
        #print DocID
        finalID = sorted(set.union(*DocID))
        return finalID
    else:
        finalID = DocID
        return finalID
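# A minimal sketch (Python 2, matching this module) on toy posting lists keyed
# by the already-normalized token; the posting-string format is assumed from
# the split(',') parsing above. With real data the dictionary comes from
# getInvertedIndexTokens below.
if __name__ == '__main__':
    toyDictionary = {'oil': ['1'], 'gold': ['3']}
    print searchQueryUnion('oil gold', toyDictionary)   # -> [1, 3]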
def getInvertedIndexTokens(fileNum):
    invertedIndex = {}
    # loading the original documents for tokenization and normalization later
    fileNum = '%0*d' % (3, fileNum)
    resourcepath = 'reuters/reut2-' + fileNum + '.sgm'
    file = open(resourcepath)
    soup = BeautifulSoup(file, 'html.parser')
    allReuters = soup.find_all('reuters')
    for reuters in allReuters:
        if reuters.body is not None:
            # Tokenize the text inside the [body] tag in the files
            tokenslist = reuters.body.text.rsplit('reuters', 1)
            for e in tokenslist:
                tokens = ''.join(e).encode('utf8')
                tokens = Tokenization.tokenization(tokens)
                for token in tokens:
                    # Normalization after getting the tokens
                    token = Normalization.cleanedTokens(token)
                    token = Normalization.caseFoldedTokens(token)
                    token = Normalization.cleanStopWords150(token)
                    token = Normalization.stemmedTokens(token)
                    # Construct the inverted index for tokens
                    if token != '':
                        if invertedIndex.has_key(token):
                            if reuters['newid'] not in invertedIndex[token]:
                                invertedIndex[token].append(reuters['newid'])
                        else:
                            invertedIndex[token] = [reuters['newid']]
    return invertedIndex
def test_ApplyLexicon(self):
    node = Tokenization.SentenceNode('0')
    ApplyLexicon(node)
    CDFeatureID = GetFeatureID('CD')
    self.assertTrue(CDFeatureID in node.features)

    LoadLexicon(dir_path + '/../../../fsa/X/LexX-ChinesePunctuate.txt')
    node = Tokenization.SentenceNode(':')
    ApplyLexicon(node)
    self.assertTrue(utils.FeatureID_SYM in node.features)
    self.assertFalse(utils.FeatureID_OOV in node.features)

    node = Tokenization.SentenceNode('：')  # full-width (Chinese) colon
    ApplyLexicon(node)
    self.assertTrue(utils.FeatureID_SYM in node.features)
    self.assertFalse(utils.FeatureID_OOV in node.features)
def test_LexiconLookup(self):
    LoadLexicon(dir_path + '/../../../fsa/X/defLexX.txt', lookupSource=LexiconLookupSource.defLex)
    LoadLexicon(dir_path + '/../../../fsa/X/defPlus.txt', lookupSource=LexiconLookupSource.defLex)

    Sentence = "喝不惯"
    NodeList = Tokenization.Tokenize(Sentence)
    import ProcessSentence
    ProcessSentence.PrepareJSandJM(NodeList)
    LexiconLookup(NodeList, LexiconLookupSource.defLex)
    self.assertEqual(NodeList.size, 3)

    Sentence = "李四"
    NodeList = Tokenization.Tokenize(Sentence)
    ProcessSentence.PrepareJSandJM(NodeList)
    LexiconLookup(NodeList, LexiconLookupSource.defLex)
    self.assertEqual(NodeList.size, 3)
    self.assertFalse(utils.FeatureID_OOV in NodeList.head.features)
def walkOverContribution(contributionPath, ignoreFunc=None, endsWith='.tokens.json'):
    result = {}
    for (path, dirs, files) in os.walk(contributionPath):
        # ignoreFunc defaults to None, so guard before calling it
        if ignoreFunc is not None and ignoreFunc(path):
            continue
        for file in files:
            if file.endswith(endsWith):
                if ignoreFunc is not None and ignoreFunc(file):
                    continue
                result[file] = Tokenization.tokenizeFile(os.path.join(path, file))
    return result
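# Usage sketch (hypothetical path): collect the token map for every
# *.tokens.json file under a contribution, skipping hidden entries via the
# ignoreFunc hook.
tokensByFile = walkOverContribution('contribution', lambda p: p.startswith('.'))
print len(tokensByFile)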
def purrify(query):
    # purify the query: tokenize, then normalize each token
    normalizedTokens = []
    tokens = Tokenization.tokenization(query)
    for token in tokens:
        token = Normalization.cleanedTokens(token)
        token = Normalization.caseFoldedTokens(token)
        token = Normalization.cleanStopWords150(token)
        token = Normalization.stemmedTokens(token)
        if token != "":
            normalizedTokens.append(token)
    #print normalizedTokens
    return normalizedTokens
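# Usage sketch (Python 2): query terms go through the same normalization
# chain as the index, so they line up with indexed terms. This assumes
# cleanStopWords150 drops 'and' as a stop word.
if __name__ == '__main__':
    print purrify('Gold and Oil')   # expected: ['gold', 'oil']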
def main():
    """
    The program must accept two command line arguments:
    -train.json
    -test.json
    """
    # first handle user input
    trainJSONData, testJSONData = command_parser()

    # import the text processing modules after checking user input
    import Normalization
    import Tokenization

    # init text processing classes
    global normalization, tokenization
    normalization = Normalization.Normalizer()
    tokenization = Tokenization.Tokenizer()

    print("Pre-processing begin >>>>>>>>")
    # Perform data pre-processing (text processing and get each document's terms)
    Document_vectors, corpus, number_of_document, corpus_count = pre_processing(
        trainJSONData)
    print("<<<<<<<< Pre-processing done")

    # apply kNN
    best_accuracy = -1
    best_k = -1
    decrease = 0
    k_parameter_accuracy = []
    # try different values of the parameter k; stop once the accuracy
    # decreases twice consecutively
    for k in range(1, number_of_document):
        print("Apply kNN begin with K=%d >>>>>>>>" % (k))
        accuracy = apply_kNN_on_test_documents(testJSONData, Document_vectors,
                                               corpus, number_of_document,
                                               corpus_count, k)
        k_parameter_accuracy.append(accuracy)
        print("<<<<<<<< Apply kNN done with K=%d" % (k))
        print("Accuracy: " + str(accuracy) + " with K=%d" % (k))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k
        if k > 1 and accuracy < k_parameter_accuracy[k - 2]:
            decrease += 1
            if decrease == 2:
                # two consecutive decreases: stop searching
                print("Two consecutively decreasing accuracies! Stop here")
                break
        else:
            decrease = 0  # reset so only *consecutive* decreases count
        print("")
    print("Best Accuracy: %f with parameter K=%d" % (best_accuracy, best_k))
def refineTokens(data, debug=False, force=True):
    # find all .tokens.json files
    files = Helper.derivedFiles(Helper.relevantFiles(data['data']), inputFileExt)
    if not force:
        files = Helper.disregardFiles(files, inputFileExt, outputFileExt)
    for file in files:
        tokenized = Tokenization.tokenizeFile(file)
        if debug:
            json.dump(tokenized, open(file.replace(inputFileExt, outputDebugFileExt), 'w'))
        map = createMap(tokenized)
        json.dump(map, open(file.replace('.tokens.json', outputFileExt), 'w'))
        Helper.incProgress()
    print ''
def cleantxt(line, cleanfile):
    word_list = tk.TokenizeMultiWord(line, Taglist)
    for word in word_list:
        if (not word.isnumeric() and len(word) > 1 and word in words.words()) or (word in DiseaseName):
            cleanfile.write(word)
            cleanfile.write(" ")
        else:
            if p.singular_noun(word):
                cleanfile.write(word)
                cleanfile.write(" ")
    cleanfile.write("\n")
def LexicalAnalyzeTask(SubSentence, schema):
    NodeList = Tokenization.Tokenize(SubSentence)
    if not NodeList or NodeList.size == 0:
        return None, None, None  # match the three-value return below
    Lexicon.ApplyLexiconToNodes(NodeList)
    # print("after ApplyLexiconToNodes" + OutputStringTokens_oneliner(NodeList))
    PrepareJSandJM(NodeList)
    # Lexicon.LexiconoQoCLookup(NodeList)

    NodeList, Dag, WinningRules = DynamicPipeline(NodeList, schema)
    # t = Thread(target=Cache.WriteSentenceDB, args=(SubSentence, NodeList))
    # t.start()
    return NodeList, Dag, WinningRules
def test_LogicCombined(self):
    """Logic Combined"""
    blocks = SeparateOrBlocks("a|b|c")
    self.assertEqual(len(blocks), 3)

    blocks = SeparateOrBlocks("a")
    self.assertEqual(len(blocks), 1)

    blocks = SeparateOrBlocks("'a|b'|c")
    self.assertEqual(len(blocks), 2)

    strtokenlist = Tokenization.Tokenize('d')
    RuleTokenList = [Rules.RuleToken()]
    self.assertTrue(LogicMatch(strtokenlist, 0, 'd', RuleTokenList, 0))

    #strtokenlist = Tokenization.Tokenize("notfeature|'d'|notfeature2")
    self.assertTrue(LogicMatch(strtokenlist, 0, "notfeature|'d'|notfeature2", RuleTokenList, 0))
def getAllTokens(fileNum):
    # Create inverted index, loop through all articles in one file
    invertedIndex = {}
    tokensLength = open('invertedIndex/tokensLength', 'a')
    Content = open('invertedIndex/Content', 'a')
    # loading the original documents for tokenization and normalization later
    fileNum = '%0*d' % (3, fileNum)
    resourcepath = 'reuters/reut2-' + fileNum + '.sgm'
    file = open(resourcepath)
    soup = BeautifulSoup(file, 'html.parser')
    for doc in soup.find_all('reuters'):
        docId = int(doc['newid'].encode('utf8'))
        tokenCounter = 0
        if doc.body is not None:
            content = doc.body.text
            length = len(content)
            Content.write(str(docId) + ' Start ' + content.encode('utf8') + ' End ')
            tokens = Tokenization.tokenization(content)
            for token in tokens:
                # Normalization
                token = Normalization.cleanedTokens(token)
                token = Normalization.caseFoldedTokens(token)
                token = Normalization.cleanStopWords150(token)
                token = Normalization.stemmedTokens(token)
                if token != '':
                    tokenCounter += 1
                    # Add to the postings list if the word exists
                    if invertedIndex.has_key(token):
                        if invertedIndex[token].has_key(docId):
                            tf = invertedIndex[token][docId]
                            invertedIndex[token][docId] = tf + 1
                        else:
                            invertedIndex[token][docId] = 1
                    else:
                        invertedIndex[token] = {docId: 1}
        tokensLength.write(str(docId) + ':' + str(tokenCounter) + '\n')
    tokensLength.close()
    Content.close()
    return invertedIndex
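# Usage sketch (Python 2): build the term-frequency index for reuters file
# 000; requires reuters/reut2-000.sgm and an invertedIndex/ directory on disk.
if __name__ == '__main__':
    idx = getAllTokens(0)
    print idx.get('oil', {})   # {docId: termFrequency, ...}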
def refineTokens(debug=False, force=True, fragments=False):
    # find all .tokens.json or fragments.tokens.json files
    if fragments:
        ending = inputFragFileExt
        outEnding = outputFragFileExt
        files = Helper.derivedFiles(Helper.relevantFiles(), inputFragFileExt)
        if not force:
            files = Helper.disregardFiles(files, inputFragFileExt, outputFragFileExt)
    else:
        ending = inputFileExt
        outEnding = outputFileExt
        files = Helper.derivedFiles(Helper.relevantFiles(), inputFileExt)
        if not force:
            files = Helper.disregardFiles(files, inputFileExt, outputFileExt)
    for file in files:
        tokenized = Tokenization.tokenizeFile(file, fragments=fragments)
        if debug:
            json.dump(tokenized, open(file.replace(ending, outputDebugFileExt), 'w'))
        map = createMap(tokenized, fragments=fragments)
        json.dump(map, open(file.replace(ending, outEnding), 'w'))
        Helper.incProgress()
    print ''
def ApplyLexicon(node, lex=None, stemming_version="stem"):
    global _SuffixList

    if not C1ID:
        InitLengthSet()
    OOVFeatureSet = {
        utils.FeatureID_JM, utils.FeatureID_JM2,
        utils.FeatureID_JS, utils.FeatureID_JS2
    }
    OOVFeatureSet |= LengthSet
    if not lex:
        lex = SearchLexicon(node.text)
    # if not node.lexicon:  # If lexicon is assigned before, then don't do the search
    #                       # because the node.word is not as reliable as stem.
    #     node.lexicon = SearchLexicon(node.word)

    # attempt stemming if lexicon lookup fails (O.O)
    word = node.text.lower()
    if lex is None and len(word) >= 4:
        if stemming_version == "stem":
            start = len(word) - 1
            stop = 2
            step = -1
        else:
            start = 3
            stop = len(word)
            step = 1
        for stem_length in range(start, stop, step):
            stem_word = word[:stem_length]
            lex_copy = SearchStem(stem_word)
            suffix = word[stem_length:].lower()
            if lex_copy is not None and suffix in _SuffixList:
                # both the stem_word and the suffix exist
                lex = LexiconNode(word)
                lex.atom = lex_copy.atom
                lex.norm = lex_copy.norm
                lex.features.update(lex_copy.features)

                # set the node essentially equal to lex, so it technically
                # sends lex into MatchAndApplyRuleFile
                o_norm = node.norm
                o_atom = node.atom
                o_text = node.text
                node.norm = lex.norm
                node.atom = lex.atom
                node.text = suffix
                if utils.FeatureID_NEW in lex.features:
                    node.features = set()
                    node.features.update(lex.features)
                    node.features.remove(utils.FeatureID_NEW)
                else:
                    node.features.update(lex.features)
                orig_feature = len(node.features)

                SingleNodeList = Tokenization.SentenceLinkedList()
                SingleNodeList.append(node)
                ProcessSentence.MatchAndApplyRuleFile(SingleNodeList, _InfFile)
                node = SingleNodeList.head  # all we want is the updated features
                lex.features = set()
                lex.features.update(node.features)
                new_feature = len(node.features)

                node.norm = o_norm
                node.atom = o_atom
                node.text = o_text
                node.features = set()

                # if the features didn't change, it didn't match, thus stemming failed
                if orig_feature != new_feature:
                    break
                else:
                    lex = None
                    if stemming_version == "stem":
                        # failing on small suffixes could still work for longer ones
                        continue
                    else:
                        # starting from longer suffixes: if matching failed,
                        # it would fail for everything
                        break

    if lex is None:
        if utils.IsCD(node.text):
            node.ApplyFeature(utils.FeatureID_CD)
        elif node.text in string.punctuation:
            node.ApplyFeature(utils.FeatureID_SYM)
        elif node.norm == " ":
            node.ApplyFeature(utils.FeatureID_CM)  # not to apply NNP/OOV to space.
        else:
            node.ApplyFeature(utils.FeatureID_NNP)
            node.ApplyFeature(utils.FeatureID_OOV)
    else:
        node.norm = lex.norm
        # to have the correct stem, e.g. carries -> carrie -> carry
        if lex.norm in _StemDict:
            stem_lex = SearchStem(lex.norm)
            if stem_lex.norm:
                node.norm = stem_lex.norm
        node.atom = lex.atom
        if utils.FeatureID_NEW in lex.features:
            node.features = set()
            node.features.update(lex.features)
            node.features.remove(utils.FeatureID_NEW)
        else:
            node.features.update(lex.features)
        # _ApplyWordStem(node, lex)  (o.o)

    if len(node.features) == 0 or \
            len(node.features - OOVFeatureSet) == 0:
        node.ApplyFeature(utils.FeatureID_OOV)
        # node.features.add(utils.FeatureID_OOV)

    ApplyWordLengthFeature(node)
    node.ApplyFeature(utils.FeatureID_0)
    return node
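# A hedged usage sketch: stemming only kicks in for words of length >= 4 that
# miss the lexicon but split into a known stem plus a known suffix. The result
# depends on the loaded lexicon and stem dictionary.
node = Tokenization.SentenceNode('carries')
node = ApplyLexicon(node)
# node.norm is expected to end up as the stem ('carry') when the stem
# dictionary resolves carries -> carrie -> carry, per the comment above.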
def runTest(self):
    token = 'StyledEditorKit'
    expected = ['Styled', 'Editor', 'Kit']
    self.assertTrue(isEqual(Tokenization.tokenizeToken(token), expected),
                    token + ' is not correctly tokenized')
def runTest(self):
    token = 'HTMLEditor'
    expected = ['HTML', 'Editor']
    self.assertTrue(isEqual(Tokenization.tokenizeToken(token), expected),
                    token + ' is not correctly tokenized')
import Normalization
import Tokenization


def cleanText(text, tokenization, normalization):
    """
    Input: string of text
    Return: a list of terms/vocabulary after tokenization and normalization
    """
    # perform tokenization
    tokens = tokenization.tokenize(text)
    # perform normalization
    tokens = normalization.lemmatize(tokens)
    # get rid of non-meaningful characters left over from tokenization
    tokens = tokenization.getRidPuncuation(tokens)
    return tokens


normalization = Normalization.Normalizer()
tokenization = Tokenization.Tokenizer()
dd = cleanText(
    "adad.adad ada...adad..ad 1941.http u.s.a. #Dadad #Rats sgsgs...",
    tokenization, normalization)
print(dd)
def runTest(self):
    token = 'isOSGiCompatible'
    expected = ['is', 'OSGi', 'Compatible']
    self.assertTrue(isEqual(Tokenization.tokenizeToken(token), expected),
                    token + ' is not correctly tokenized')
def test_ApplyWordLengthFeature(self):
    Sentence = "李四abc456,sab98中文"
    NodeList = Tokenization.Tokenize(Sentence)
    ApplyLexiconToNodes(NodeList)
    self.assertTrue(C1ID in NodeList.head.features)
    self.assertTrue(D1ID in NodeList.get(1).features)
def main():
    """
    The program must accept two command line arguments:
    the first is the directory containing the documents to be indexed,
    and the second must be the directory where the index will be stored.
    """
    # first handle user input
    if len(sys.argv) != 3:
        # number of arguments is not correct
        print("Two arguments are needed:")
        print("1. the directory containing the documents to be indexed")
        print("2. the directory where the index will be stored")
        return
    docDir = sys.argv[1]
    indexDir = sys.argv[2]
    if not os.path.isdir(docDir) or not os.path.isdir(indexDir):
        # the given input dirs are invalid
        print("The given directory is invalid")
        return
    # append / if not present in the directory
    if docDir[-1] != "/":
        docDir += "/"
    if indexDir[-1] != "/":
        indexDir += "/"
    if indexDir == "/":
        indexDir = "." + indexDir
    if docDir == "/":
        docDir = "." + docDir

    # retrieve all documents in the given directory
    allDoc = []
    for subDir in os.walk(docDir):
        # recursively retrieve all files in each subDir
        # (docDir is also a subDir of itself)
        for doc in subDir[2]:  # all documents in subDir
            allDoc.append(doc)

    #######################################################################################################################
    # initialization for building the index
    import Normalization
    import Tokenization
    import SQLite3database

    # init text processing classes
    normalization = Normalization.Normalizer()
    tokenization = Tokenization.Tokenizer()

    # create a SQLite3 database
    indexDatabase = SQLite3database.Database(indexDir + "index.db")
    # create title index database
    titleDatabase = SQLite3database.Database(indexDir + "title.db")
    # create tables
    createTable(indexDatabase)
    createTable(titleDatabase)
    # init final insert string
    indexDatabase.initInsertString()
    indexDatabase.addBeginTransactionString()
    titleDatabase.initInsertString()
    titleDatabase.addBeginTransactionString()

    # initializing insert strings
    insertDocument = "INSERT INTO document VALUES"
    insertDictionary = "INSERT INTO dictionary VALUES"
    insertTermPosition = "INSERT INTO termPosition VALUES"
    insertDocumentFrequency = "INSERT INTO documentFrequency VALUES"
    insertTermFrequency = "INSERT INTO termFrequency VALUES"
    insertDocumentTitle = "INSERT INTO document VALUES"
    insertDictionaryTitle = "INSERT INTO dictionary VALUES"
    insertTermPositionTitle = "INSERT INTO termPosition VALUES"
    insertDocumentFrequencyTitle = "INSERT INTO documentFrequency VALUES"
    insertTermFrequencyTitle = "INSERT INTO termFrequency VALUES"

    # store the document frequency of each vocabulary item
    dictionary = {}  # vocabulary as key, document frequency as value, over all documents
    titleDic = {}
    for doc in allDoc:
        # First read and process text from the current document
        text = open(docDir + doc, "r").read()
        # note: rstrip(".txt") would strip characters, not the suffix
        noTxt = doc[:-len(".txt")] if doc.endswith(".txt") else doc
        title = " ".join(noTxt.split("_")[2:])
        # process raw text from the document:
        # returns a list of terms/vocabulary after tokenization and normalization
        tokens = cleanText(text, tokenization, normalization)
        titleTokens = cleanText(title.lower(), tokenization, normalization)

        # Then traverse the term/vocabulary list and record the information:
        # -position
        # -count
        termFrequency = {}  # (vocabulary, documentID) as key, term frequency as value
        titleTermFrequency = {}
        documentID = int(doc.split("_")[1])  # extract document ID
        insertDocument += """
        ({docID},"{docName}",{docLength}),""".format(docID=documentID, docName=doc, docLength=len(tokens))
        insertDocumentTitle += """
        ({docID},"{docName}",{docLength}),""".format(docID=documentID, docName=doc, docLength=len(titleTokens))

        alreadyIncrement = {}  # tracks whether the document frequency was already incremented for this document
        alreadyIncrementTitle = {}
        for index, token in enumerate(tokens):
            # insert position of this token in the document
            insertTermPosition += """
            ("{word}",{docID},{position}),""".format(word=token, docID=documentID, position=index + 1)
            if token not in dictionary:
                dictionary[token] = 1
                alreadyIncrement[token] = None
                # insert if this token is encountered for the first time overall
                insertDictionary += """
                ("{word}"),""".format(word=token)
            elif token not in alreadyIncrement:
                dictionary[token] += 1
                alreadyIncrement[token] = None
            if token not in termFrequency:
                termFrequency[token] = 1
            else:
                termFrequency[token] += 1
        for key, val in termFrequency.items():
            insertTermFrequency += """
            ("{word}",{docID},{termFreq}),""".format(word=key, docID=documentID, termFreq=val)

        for index, token in enumerate(titleTokens):
            # insert position of this token in the title
            insertTermPositionTitle += """
            ("{word}",{docID},{position}),""".format(word=token, docID=documentID, position=index + 1)
            if token not in titleDic:
                titleDic[token] = 1
                alreadyIncrementTitle[token] = None
                # insert if this token is encountered for the first time overall
                insertDictionaryTitle += """
                ("{word}"),""".format(word=token)
            elif token not in alreadyIncrementTitle:
                titleDic[token] += 1
                alreadyIncrementTitle[token] = None
            if token not in titleTermFrequency:
                titleTermFrequency[token] = 1
            else:
                titleTermFrequency[token] += 1
        for key, val in titleTermFrequency.items():
            insertTermFrequencyTitle += """
            ("{word}",{docID},{termFreq}),""".format(word=key, docID=documentID, termFreq=val)

    # insert the document frequencies
    for key, val in dictionary.items():
        insertDocumentFrequency += """
        ("{word}",{docFrequency}),""".format(word=key, docFrequency=val)
    for key, val in titleDic.items():
        insertDocumentFrequencyTitle += """
        ("{word}",{docFrequency}),""".format(word=key, docFrequency=val)

    # get rid of the ',' at the end of each insert string
    # and replace it with ';'
    insertDocument = insertDocument[:-1] + ";"
    insertDictionary = insertDictionary[:-1] + ";"
    insertTermPosition = insertTermPosition[:-1] + ";"
    insertTermFrequency = insertTermFrequency[:-1] + ";"
    insertDocumentFrequency = insertDocumentFrequency[:-1] + ";"
    insertDocumentTitle = insertDocumentTitle[:-1] + ";"
    insertDictionaryTitle = insertDictionaryTitle[:-1] + ";"
    insertTermPositionTitle = insertTermPositionTitle[:-1] + ";"
    insertTermFrequencyTitle = insertTermFrequencyTitle[:-1] + ";"
    insertDocumentFrequencyTitle = insertDocumentFrequencyTitle[:-1] + ";"

    # add all insert strings to the final insert string
    indexDatabase.addInsertString(insertDocument)
    indexDatabase.addInsertString(insertDictionary)
    indexDatabase.addInsertString(insertTermPosition)
    indexDatabase.addInsertString(insertTermFrequency)
    indexDatabase.addInsertString(insertDocumentFrequency)
    indexDatabase.addCommitString()
    indexDatabase.execute(indexDatabase.getInsertString())
    createBtreeIndex(indexDatabase)
    indexDatabase.close()

    titleDatabase.addInsertString(insertDocumentTitle)
    titleDatabase.addInsertString(insertDictionaryTitle)
    titleDatabase.addInsertString(insertTermPositionTitle)
    titleDatabase.addInsertString(insertTermFrequencyTitle)
    titleDatabase.addInsertString(insertDocumentFrequencyTitle)
    titleDatabase.addCommitString()
    titleDatabase.execute(titleDatabase.getInsertString())
    createBtreeIndex(titleDatabase)
    titleDatabase.close()
for i in range(0, 22):
    if i < 10:
        filename = 'file\\reut2-00%i.sgm' % (i)
    else:
        filename = 'file\\reut2-0%i.sgm' % (i)
    f = open(filename, 'r', errors='ignore')
    s = f.read()
    '''parse the file and output the results'''
    parser = Preprocess.Preprocess()
    parser.parse(s)

'''create another text directory'''
if not os.path.isdir('tokenization'):
    os.makedirs('tokenization')
tokenization = Tokenization.Tokenization()
for j in range(1, 21579):
    filename_txt = 'text\\%d.txt' % (j)
    f = open(filename_txt, 'r')
    s = f.read()
    s = tokenization.case_folding(s)
    # print('******case folding *****\n%s' % (s))
    s = tokenization.remove_num(s)
    # print('******num*****\n%s' % (s))
    s = tokenization.remove_punctuation(s)
    s = tokenization.toke_nize(s)
    # print('*******removal punctuation*******\n%s' % s)
def main():
    # First of all check the user input
    indexFilePath, k, printScore, queryTermString = checkInput()

    # open the database file that is given
    indexDatabase = SQLite3database.Database(sys.argv[1])  # this also handles file errors
    # cursor
    cursor = indexDatabase.getCursor()

    # check if the tables needed exist in the index storage file
    tablesNeeded = [
        "dictionary", "document", "termPosition", "documentFrequency",
        "termFrequency"
    ]
    if checkIfTableNeedExist(indexDatabase, cursor, tablesNeeded) == False:
        print("The given index storage file does not contain the required tables.")
        indexDatabase.close()
        return

    # last check for k
    cursor.execute("SELECT COUNT(*) FROM document;")
    NumberOfDocument = cursor.fetchall()[0][0]
    if k > int(NumberOfDocument):
        print("The second argument k is larger than the number of documents in the input collection.")
        print("Argument k should be less than or equal to: %d" % (int(NumberOfDocument)))
        indexDatabase.close()
        sys.exit(-1)

    ##################################################################################################################
    """
    At this point, all input has been validated and the database file has been opened.
    The database file has all the information representing each document's language model:
    -tf (term frequency) in each of the documents
    -document length for each document
    along with some other extra information
    """
    # First of all, do text processing (clean text) on the query terms
    # (the same way it is done to the input data document terms)
    import Normalization
    import Tokenization
    normalization = Normalization.Normalizer()
    tokenization = Tokenization.Tokenizer()
    queryTermsList = cleanText(queryTermString, tokenization, normalization)
    print("Query Terms:")
    print(queryTermsList)

    # Compute the probability of generating the query terms under each document model
    topKdocument = ComputeProbabilityGeneratingQueryTerms(queryTermsList, cursor, k)

    if printScore == "y":
        print(" %4s %63s" % ("Document Name:", "Query Likelihood:"))
        for index, document in enumerate(topKdocument):
            print("%4d. %-60s" % (index + 1, document[0]), end="")
            print(document[1])
    else:
        print(" %4s" % ("Document Name:"))
        for index, document in enumerate(topKdocument):
            print("%4d. %-60s" % (index + 1, document[0]))

    # close the database file afterwards
    indexDatabase.close()