def main(): """ The program must accept two command line arguments: -train.json -test.json """ # first handle user input trainJSONData, testJSONData = command_parser() # import the text process after checking user input import Normalization import Tokenization # init text processing classes global normalization, tokenization normalization = Normalization.Normalizer() tokenization = Tokenization.Tokenizer() print("Pre-processing begin >>>>>>>>") # Perform Data pre-processing (text processing and get each document terms) Document_vectors, corpus, number_of_document, corpus_count = pre_processing( trainJSONData) print("<<<<<<<< Pre-processing done") # apply the kNN best_accuary = -1 best_k = -1 decrease = 0 k_parameter_accuracy = [] # try all different parameter k # until if there are two consectively decreases # then stop for k in range(1, number_of_document): print("Apply kNN begin with K=%d >>>>>>>>" % (k)) accuracy = apply_kNN_on_test_documents(testJSONData, Document_vectors, corpus, number_of_document, corpus_count, k) k_parameter_accuracy.append(accuracy) print("<<<<<<<< Apply kNN done with K=%d" % (k)) print("Accuracy: " + str(accuracy) + " with K=%d" % (k)) if accuracy > best_accuary: best_accuary = accuracy best_k = k if k > 1 and accuracy < k_parameter_accuracy[k - 2]: decrease += 1 if decrease == 2: # if consectively decreasing break print("Two consectively decreasing accuracy! Stop here") break print("") print("Best Accuracy: %f with parameter K=%d" % (best_accuary, best_k))
def main(): """ The program must accept two command line arguments: the first is the directory containing the documents to be indexed, and the second must be the directory where the index will be stored. """ # first handle user input if len(sys.argv) != 3: # number of argument is not correct print("Two arguments are needed:") print("1. the directory containing the documents to be indexed") print("2. the directory where the index will be stored") return docDir = sys.argv[1] indexDir = sys.argv[2] if not os.path.isdir(docDir) or not os.path.isdir(indexDir): # the given input dir are invalid print("The given directory is invalid") return # append / if not present in the directory if docDir[-1] != "/": docDir += "/" if indexDir[-1] != "/": indexDir += "/" if indexDir == "/": indexDir = "." + indexDir if docDir == "/": docDir = "." + docDir # retrieve all documents in the given directory allDoc = [] for subDir in os.walk(docDir): # recursively retrieve all files in each subDir # docDir is also a subDir of itself for doc in subDir[2]: # all documents in subDir allDoc.append(doc) ####################################################################################################################### # intialization for building index import Normalization import Tokenization import SQLite3database # init text processing classes normalization = Normalization.Normalizer() tokenization = Tokenization.Tokenizer() # create a SQLite3 database indexDatabase = SQLite3database.Database(indexDir+"index.db") # create title index database titleDatabase = SQLite3database.Database(indexDir+"title.db") # create table createTable(indexDatabase) createTable(titleDatabase) # init final insert string indexDatabase.initInsertString() indexDatabase.addBeginTransactionString() titleDatabase.initInsertString() titleDatabase.addBeginTransactionString() # intializing insert string insertDocument = "INSERT INTO document VALUES" insertDictionary = "INSERT INTO dictionary VALUES" insertTermPosition = "INSERT INTO termPosition VALUES" insertDocumentFrequency = "INSERT INTO documentFrequency VALUES" insertTermFrequency = "INSERT INTO termFrequency VALUES" insertDocumentTitle = "INSERT INTO document VALUES" insertDictionaryTitle = "INSERT INTO dictionary VALUES" insertTermPositionTitle = "INSERT INTO termPosition VALUES" insertDocumentFrequencyTitle = "INSERT INTO documentFrequency VALUES" insertTermFrequencyTitle = "INSERT INTO termFrequency VALUES" # store document frequency of each vocabulary dictionary = {} # contain all vocabulary over all (vocabulary as key, document frequncy as value) titleDic = {} for doc in allDoc: # First read and process text from the current document # open file to read text = open(docDir+doc,"r").read() noTxt = doc.rstrip(".txt") title = " ".join(noTxt.split("_")[2:]) # process raw text from document tokens = cleanText(text, tokenization, normalization) # return a list of term/vocabulary after tokenization and normalization titleTokens = cleanText(title.lower(), tokenization, normalization) # Then # Traverse the term/vocabulary list and record the information # -position # -count # init termFrequency = {} # (vocabulary and documentID as key, term frequency as value) titleTermFrequency = {} documentID = int(doc.split("_")[1]) # extract document ID insertDocument += """ ({docID},"{docName}",{docLength}),""".format(docID=documentID, docName=doc, docLength=len(tokens)) insertDocumentTitle += """ ({docID},"{docName}",{docLength}),""".format(docID=documentID, docName=doc, docLength=len(titleTokens)) 
        # used to check whether a term's document frequency has already been
        # incremented for this document
        alreadyIncrement = {}
        alreadyIncrementTitle = {}

        for index, token in enumerate(tokens):
            # insert the position of this token in the document
            insertTermPosition += """ ("{word}",{docID},{position}),""".format(
                word=token, docID=documentID, position=index + 1)
            if token not in dictionary:
                dictionary[token] = 1
                alreadyIncrement[token] = None
                # insert only if this token is encountered for the first time overall
                insertDictionary += """ ("{word}"),""".format(word=token)
            elif token not in alreadyIncrement:
                dictionary[token] += 1
                alreadyIncrement[token] = None
            if token not in termFrequency:
                termFrequency[token] = 1
            else:
                termFrequency[token] += 1
        for key, val in termFrequency.items():
            insertTermFrequency += """ ("{word}",{docID},{termFreq}),""".format(
                word=key, docID=documentID, termFreq=val)

        for index, token in enumerate(titleTokens):
            # insert the position of this token in the title
            insertTermPositionTitle += """ ("{word}",{docID},{position}),""".format(
                word=token, docID=documentID, position=index + 1)
            if token not in titleDic:
                titleDic[token] = 1
                alreadyIncrementTitle[token] = None
                # insert only if this token is encountered for the first time overall
                insertDictionaryTitle += """ ("{word}"),""".format(word=token)
            elif token not in alreadyIncrementTitle:
                titleDic[token] += 1
                alreadyIncrementTitle[token] = None
            if token not in titleTermFrequency:
                titleTermFrequency[token] = 1
            else:
                titleTermFrequency[token] += 1
        for key, val in titleTermFrequency.items():
            insertTermFrequencyTitle += """ ("{word}",{docID},{termFreq}),""".format(
                word=key, docID=documentID, termFreq=val)

    # insert the document frequency of every term
    for key, val in dictionary.items():
        insertDocumentFrequency += """ ("{word}",{docFrequency}),""".format(
            word=key, docFrequency=val)
    for key, val in titleDic.items():
        insertDocumentFrequencyTitle += """ ("{word}",{docFrequency}),""".format(
            word=key, docFrequency=val)

    # get rid of the ',' at the end of each insert string
    # and replace it with ';'
    insertDocument = insertDocument[:-1] + ";"
    insertDictionary = insertDictionary[:-1] + ";"
    insertTermPosition = insertTermPosition[:-1] + ";"
    insertTermFrequency = insertTermFrequency[:-1] + ";"
    insertDocumentFrequency = insertDocumentFrequency[:-1] + ";"
    insertDocumentTitle = insertDocumentTitle[:-1] + ";"
    insertDictionaryTitle = insertDictionaryTitle[:-1] + ";"
    insertTermPositionTitle = insertTermPositionTitle[:-1] + ";"
    insertTermFrequencyTitle = insertTermFrequencyTitle[:-1] + ";"
    insertDocumentFrequencyTitle = insertDocumentFrequencyTitle[:-1] + ";"

    # add all insert strings to the final insert string, then commit
    indexDatabase.addInsertString(insertDocument)
    indexDatabase.addInsertString(insertDictionary)
    indexDatabase.addInsertString(insertTermPosition)
    indexDatabase.addInsertString(insertTermFrequency)
    indexDatabase.addInsertString(insertDocumentFrequency)
    indexDatabase.addCommitString()
    indexDatabase.execute(indexDatabase.getInsertString())
    createBtreeIndex(indexDatabase)
    indexDatabase.close()

    titleDatabase.addInsertString(insertDocumentTitle)
    titleDatabase.addInsertString(insertDictionaryTitle)
    titleDatabase.addInsertString(insertTermPositionTitle)
    titleDatabase.addInsertString(insertTermFrequencyTitle)
    titleDatabase.addInsertString(insertDocumentFrequencyTitle)
    titleDatabase.addCommitString()
    titleDatabase.execute(titleDatabase.getInsertString())
    createBtreeIndex(titleDatabase)
    titleDatabase.close()
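# For reference, minimal sketches of the createTable and createBtreeIndex
# helpers used above. The column layout is inferred from the INSERT strings
# built in main(), and getCursor() is the accessor used by the query program;
# the real schema and index choices may differ.
def createTable(database):
    cursor = database.getCursor()
    cursor.executescript("""
        CREATE TABLE document (docID INTEGER, docName TEXT, docLength INTEGER);
        CREATE TABLE dictionary (word TEXT);
        CREATE TABLE termPosition (word TEXT, docID INTEGER, position INTEGER);
        CREATE TABLE documentFrequency (word TEXT, docFrequency INTEGER);
        CREATE TABLE termFrequency (word TEXT, docID INTEGER, termFreq INTEGER);
    """)


def createBtreeIndex(database):
    # SQLite indexes are B-trees; index the lookup columns used at query time
    cursor = database.getCursor()
    cursor.executescript("""
        CREATE INDEX idxTermFrequency ON termFrequency (word, docID);
        CREATE INDEX idxTermPosition ON termPosition (word, docID);
        CREATE INDEX idxDocumentFrequency ON documentFrequency (word);
    """)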
import Normalization
import Tokenization


def cleanText(text, tokenization, normalization):
    """
    Input: a string of text
    Return: a list of terms/vocabulary after tokenization and normalization
    """
    # perform tokenization
    tokens = tokenization.tokenize(text)
    # perform normalization
    tokens = normalization.lemmatize(tokens)
    # get rid of non-meaningful characters left over after tokenization
    tokens = tokenization.getRidPuncuation(tokens)
    return tokens


normalization = Normalization.Normalizer()
tokenization = Tokenization.Tokenizer()
dd = cleanText(
    "adad.adad ada...adad..ad 1941.http u.s.a. #Dadad #Rats sgsgs...",
    tokenization, normalization)
print(dd)
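# For reference, a minimal sketch (assuming NLTK) of the interfaces that the
# Normalization and Tokenization modules are expected to expose, based only
# on how cleanText calls them; the real modules are not shown here and may be
# implemented quite differently.
import string

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


class Tokenizer:
    def tokenize(self, text):
        # split raw text into word tokens
        return word_tokenize(text)

    def getRidPuncuation(self, tokens):
        # drop tokens made up entirely of punctuation characters
        return [t for t in tokens
                if not all(c in string.punctuation for c in t)]


class Normalizer:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def lemmatize(self, tokens):
        # reduce each token to its dictionary lemma
        return [self.lemmatizer.lemmatize(t) for t in tokens]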
import sys

import SQLite3database


def main():
    # first of all, check the user input
    indexFilePath, k, printScore, queryTermString = checkInput()

    # open the given database file (this also handles file errors)
    indexDatabase = SQLite3database.Database(sys.argv[1])
    # cursor
    cursor = indexDatabase.getCursor()

    # check that the required tables exist in the index storage file
    tablesNeeded = [
        "dictionary", "document", "termPosition", "documentFrequency",
        "termFrequency"
    ]
    if not checkIfTableNeedExist(indexDatabase, cursor, tablesNeeded):
        print("The given index storage file does not contain the required tables.")
        indexDatabase.close()
        return

    # last check for k
    cursor.execute("SELECT COUNT(*) FROM document;")
    NumberOfDocument = cursor.fetchall()[0][0]
    if k > int(NumberOfDocument):
        print("The second argument k is larger than the number of documents in the input collection.")
        print("Argument k should be less than or equal to: %d" % (int(NumberOfDocument)))
        indexDatabase.close()
        sys.exit(-1)

    ##################################################################################################################################
    """
    At this point all input has been validated and the database file is open.
    The database holds the information representing each document's language model:
        -tf (term frequency) in each document
        -document length of each document
    along with some other extra information.
    """
    # first, run text processing (clean text) on the query terms,
    # the same way it was done on the input data document terms
    import Normalization
    import Tokenization
    normalization = Normalization.Normalizer()
    tokenization = Tokenization.Tokenizer()
    queryTermsList = cleanText(queryTermString, tokenization, normalization)
    print("Query Terms:")
    print(queryTermsList)

    # compute the probability of each document model generating the query terms
    topKdocument = ComputeProbabilityGeneratingQueryTerms(queryTermsList,
                                                          cursor, k)

    if printScore == "y":
        print(" %4s %63s" % ("Document Name:", "Query Likelihood:"))
        for index, document in enumerate(topKdocument):
            print("%4d. %-60s" % (index + 1, document[0]), end="")
            print(document[1])
    else:
        print(" %4s" % ("Document Name:"))
        for index, document in enumerate(topKdocument):
            print("%4d. %-60s" % (index + 1, document[0]))

    # close the database file afterwards
    indexDatabase.close()
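# For reference, a minimal sketch of the scoring step, assuming a unigram
# query-likelihood model with Jelinek-Mercer smoothing against the collection
# model and a non-empty collection; the real
# ComputeProbabilityGeneratingQueryTerms may smooth differently. It returns
# (docName, log-likelihood) pairs, matching how main() prints the results.
import math


def ComputeProbabilityGeneratingQueryTerms(queryTermsList, cursor, k, lam=0.5):
    # fetch every document's ID, name, and length
    cursor.execute("SELECT docID, docName, docLength FROM document;")
    documents = cursor.fetchall()
    collectionLength = sum(row[2] for row in documents)

    scores = []
    for docID, docName, docLength in documents:
        logProb = 0.0
        for term in queryTermsList:
            # term frequency in this document
            cursor.execute(
                "SELECT termFreq FROM termFrequency WHERE word=? AND docID=?;",
                (term, docID))
            row = cursor.fetchone()
            tf = row[0] if row else 0
            # term frequency over the whole collection
            cursor.execute(
                "SELECT SUM(termFreq) FROM termFrequency WHERE word=?;",
                (term,))
            cf = cursor.fetchone()[0] or 0
            # mix the document model with the collection model
            pDoc = (tf / docLength) if docLength else 0.0
            p = (1 - lam) * pDoc + lam * (cf / collectionLength)
            if p == 0:
                p = 1e-12  # term unseen everywhere: avoid log(0)
            logProb += math.log(p)
        scores.append((docName, logProb))

    # highest query likelihood first; keep the top k documents
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores[:k]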