Code Example #1
def main():
    """
	The program must accept two command line arguments: 
	-train.json
	-test.json
	"""
    # first handle user input
    trainJSONData, testJSONData = command_parser()

    # import the text process after checking user input
    import Normalization
    import Tokenization

    # init text processing classes
    global normalization, tokenization
    normalization = Normalization.Normalizer()
    tokenization = Tokenization.Tokenizer()

    print("Pre-processing begin >>>>>>>>")
    # Perform data pre-processing (text processing; extract each document's terms)
    Document_vectors, corpus, number_of_document, corpus_count = pre_processing(
        trainJSONData)
    print("<<<<<<<< Pre-processing done")
    # apply the kNN
    best_accuracy = -1
    best_k = -1
    decrease = 0
    k_parameter_accuracy = []
    # try increasing values of the parameter k and stop
    # after two consecutive decreases in accuracy
    for k in range(1, number_of_document):
        print("Apply kNN begin with K=%d  >>>>>>>>" % (k))
        accuracy = apply_kNN_on_test_documents(testJSONData, Document_vectors,
                                               corpus, number_of_document,
                                               corpus_count, k)
        k_parameter_accuracy.append(accuracy)
        print("<<<<<<<< Apply kNN done with K=%d" % (k))
        print("Accuracy: " + str(accuracy) + "  with K=%d" % (k))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k
        if k > 1 and accuracy < k_parameter_accuracy[k - 2]:
            decrease += 1
        else:
            # reset so that only consecutive decreases are counted
            decrease = 0
        if decrease == 2:
            # two consecutive decreases: stop searching
            print("Two consecutively decreasing accuracies! Stopping here")
            break
    print("")
    print("Best Accuracy: %f  with parameter K=%d" % (best_accuary, best_k))
Code Example #2
import os
import sys


def main():
	"""
	The program must accept two command line arguments:
	the first is the directory containing the documents to be indexed,
	and the second must be the directory where the index will be stored.
	"""
	# first handle user input
	if len(sys.argv) != 3:
		# the number of arguments is not correct
		print("Two arguments are needed:")
		print("1. the directory containing the documents to be indexed")
		print("2. the directory where the index will be stored")
		return 
	docDir = sys.argv[1]
	indexDir = sys.argv[2]
	if not os.path.isdir(docDir) or not os.path.isdir(indexDir):
		# at least one of the given directories is invalid
		print("The given directory is invalid")
		return 
	# append / if not present in the directory
	if docDir[-1] != "/":
		docDir += "/"
	if indexDir[-1] != "/":
		indexDir += "/"
	if indexDir == "/":
		indexDir = "." + indexDir
	if docDir == "/":
		docDir = "." + docDir
	# retrieve all documents in the given directory
	allDoc = []
	for dirpath, dirnames, filenames in os.walk(docDir):
		# walk docDir recursively; docDir itself is visited first
		for doc in filenames:
			# note: only the bare filename is kept, so documents are
			# expected to sit directly in docDir
			allDoc.append(doc)

	#######################################################################################################################

	# initialization for building index
	import Normalization
	import Tokenization
	import SQLite3database
	# init text processing classes
	normalization = Normalization.Normalizer()
	tokenization = Tokenization.Tokenizer()
	# create a SQLite3 database
	indexDatabase = SQLite3database.Database(indexDir+"index.db")
	# create title index database
	titleDatabase = SQLite3database.Database(indexDir+"title.db")
	# create table
	createTable(indexDatabase)
	createTable(titleDatabase)
	# init final insert string
	indexDatabase.initInsertString()
	indexDatabase.addBeginTransactionString()
	titleDatabase.initInsertString()
	titleDatabase.addBeginTransactionString()
	# initialize the per-table insert statements
	insertDocument = "INSERT INTO document VALUES"
	insertDictionary = "INSERT INTO dictionary VALUES"
	insertTermPosition = "INSERT INTO termPosition VALUES"
	insertDocumentFrequency = "INSERT INTO documentFrequency VALUES"
	insertTermFrequency = "INSERT INTO termFrequency VALUES"

	insertDocumentTitle = "INSERT INTO document VALUES"
	insertDictionaryTitle = "INSERT INTO dictionary VALUES"
	insertTermPositionTitle = "INSERT INTO termPosition VALUES"
	insertDocumentFrequencyTitle = "INSERT INTO documentFrequency VALUES"
	insertTermFrequencyTitle = "INSERT INTO termFrequency VALUES" 

	# store the document frequency of each vocabulary term
	dictionary = {} # all terms seen anywhere (term as key, document frequency as value)
	titleDic = {}
	for doc in allDoc:
		# First read and process text from the current document
		# open the file and read its raw text
		with open(docDir + doc, "r") as docFile:
			text = docFile.read()

		# strip the ".txt" extension; note that str.rstrip(".txt") would strip
		# a *set of characters* from the end, not the suffix, and can eat
		# trailing letters of the name
		noTxt = doc[:-len(".txt")] if doc.endswith(".txt") else doc
		# filenames follow the pattern <prefix>_<docID>_<title words>.txt
		title = " ".join(noTxt.split("_")[2:])

		# process raw text from document
		tokens = cleanText(text, tokenization, normalization) # list of terms after tokenization and normalization
		titleTokens = cleanText(title.lower(), tokenization, normalization) 
		# Then traverse the term list and record, for each term,
		# -its positions
		# -its counts
		termFrequency = {} # (term and documentID as key, term frequency as value)
		titleTermFrequency = {}
		documentID = int(doc.split("_")[1]) # extract document ID
		insertDocument += """ ({docID},"{docName}",{docLength}),""".format(docID=documentID, docName=doc, docLength=len(tokens))
		insertDocumentTitle += """ ({docID},"{docName}",{docLength}),""".format(docID=documentID, docName=doc, docLength=len(titleTokens))
		alreadyIncrement = {} # tracks terms whose document frequency was already incremented for this document
		alreadyIncrementTitle = {}
		for index,token in enumerate(tokens):
			# insert position of this token in the document
			insertTermPosition += """ ("{word}",{docID},{position}),""".format(word=token, docID=documentID, position=index+1)
			if token not in dictionary:
				dictionary[token] = 1
				alreadyIncrement[token] = None
				# first time this token is encountered anywhere: add it to the dictionary
				insertDictionary += """ ("{word}"),""".format(word=token)
			elif token not in alreadyIncrement:
				dictionary[token] += 1
				alreadyIncrement[token] = None
			if token not in termFrequency:
				termFrequency[token] = 1
			else:
				termFrequency[token] += 1
		for key,val in termFrequency.items():
			insertTermFrequency += """ ("{word}",{docID},{termFreq}),""".format(word=key, docID=documentID, termFreq=val)

		for index,token in enumerate(titleTokens):
			# insert position of this token in the document
			insertTermPositionTitle += """ ("{word}",{docID},{position}),""".format(word=token, docID=documentID, position=index+1)
			if token not in titleDic:
				titleDic[token] = 1
				alreadyIncrementTitle[token] = None
				# first time this token is encountered in any title: add it to the title dictionary
				insertDictionaryTitle += """ ("{word}"),""".format(word=token)
			elif token not in alreadyIncrementTitle:
				titleDic[token] += 1
				alreadyIncrementTitle[token] = None
			if token not in titleTermFrequency:
				titleTermFrequency[token] = 1
			else:
				titleTermFrequency[token] += 1
		for key,val in titleTermFrequency.items():
			insertTermFrequencyTitle += """ ("{word}",{docID},{termFreq}),""".format(word=key, docID=documentID, termFreq=val)


	# insert the document frequency
	for key,val in dictionary.items():
		insertDocumentFrequency += """ ("{word}",{docFrequency}),""".format(word=key, docFrequency=val)

	for key,val in titleDic.items():
		insertDocumentFrequencyTitle += """ ("{word}",{docFrequency}),""".format(word=key, docFrequency=val)

	# get rid of the ',' at the end of each insert string
	# replace it with ';'
	insertDocument = insertDocument[:-1] + ";"
	insertDictionary = insertDictionary[:-1] + ";"
	insertTermPosition = insertTermPosition[:-1] + ";"
	insertTermFrequency = insertTermFrequency[:-1] + ';'
	insertDocumentFrequency = insertDocumentFrequency[:-1] + ";"


	insertDocumentTitle = insertDocumentTitle[:-1] + ";"
	insertDictionaryTitle = insertDictionaryTitle[:-1] + ";"
	insertTermPositionTitle = insertTermPositionTitle[:-1] + ";"
	insertTermFrequencyTitle = insertTermFrequencyTitle[:-1] + ';'
	insertDocumentFrequencyTitle = insertDocumentFrequencyTitle[:-1] + ";"

	# add all insert string to the final insert string
	indexDatabase.addInsertString(insertDocument)
	indexDatabase.addInsertString(insertDictionary)
	indexDatabase.addInsertString(insertTermPosition)
	indexDatabase.addInsertString(insertTermFrequency)
	indexDatabase.addInsertString(insertDocumentFrequency)
	indexDatabase.addCommitString()
	indexDatabase.execute(indexDatabase.getInsertString())
	createBtreeIndex(indexDatabase)
	indexDatabase.close()

	titleDatabase.addInsertString(insertDocumentTitle)
	titleDatabase.addInsertString(insertDictionaryTitle)
	titleDatabase.addInsertString(insertTermPositionTitle)
	titleDatabase.addInsertString(insertTermFrequencyTitle)
	titleDatabase.addInsertString(insertDocumentFrequencyTitle)
	titleDatabase.addCommitString()
	titleDatabase.execute(titleDatabase.getInsertString())
	createBtreeIndex(titleDatabase)
	titleDatabase.close()
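One thing worth noting about the example above: the INSERT statements are assembled by string formatting, so any term containing a double quote would break the generated SQL. A minimal sketch of the batched, parameterized alternative using Python's standard sqlite3 module (illustrative table and rows only; the real schema is created by createTable and accessed through the project's SQLite3database wrapper):

import sqlite3

conn = sqlite3.connect(":memory:")  # stand-in for indexDir + "index.db"
cur = conn.cursor()
cur.execute("CREATE TABLE termFrequency (term TEXT, docID INTEGER, tf INTEGER);")

rows = [("storm", 1, 3), ('he said "hi"', 1, 1)]  # quotes are handled safely
with conn:  # one transaction around the whole batch, like the BEGIN/COMMIT above
    cur.executemany("INSERT INTO termFrequency VALUES (?, ?, ?);", rows)
print(cur.execute("SELECT COUNT(*) FROM termFrequency;").fetchone()[0])
conn.close()
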
Code Example #3
import Normalization
import Tokenization


def cleanText(text, tokenization, normalization):
    """
    Input: string of text
    Return: a list of terms after tokenization and normalization
    """
    # perform tokenization
    tokens = tokenization.tokenize(text)
    # perform normalization
    tokens = normalization.lemmatize(tokens)
    # get rid of non-meaningful characters left over after tokenization
    tokens = tokenization.getRidPuncuation(tokens)
    return tokens


# quick smoke test of cleanText on some messy input
normalization = Normalization.Normalizer()
tokenization = Tokenization.Tokenizer()

dd = cleanText(
    "adad.adad ada...adad..ad 1941.http u.s.a. #Dadad #Rats sgsgs...",
    tokenization, normalization)
print(dd)
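Normalization and Tokenization are project modules that are not shown here. A stand-in sketch of the interface cleanText relies on, with placeholder behaviour (the method names follow the calls above, including the getRidPuncuation spelling; a real lemmatizer would do far more than lower-casing):

import re

class Tokenizer:
    def tokenize(self, text):
        # naive whitespace split; keeps word-internal dots (u.s.a., 1941.http)
        return text.split()

    def getRidPuncuation(self, tokens):
        # drop tokens with no alphanumeric character, trim edge punctuation
        return [t.strip(".,#") for t in tokens if re.search(r"\w", t)]

class Normalizer:
    def lemmatize(self, tokens):
        # placeholder: a real implementation would map each token to its lemma
        return [t.lower() for t in tokens]
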
Code Example #4
import sys

import SQLite3database


def main():
    # First of all check the user input
    indexFilePath, k, printScore, queryTermString = checkInput()
    # open the validated database file (Database also handles file errors)
    indexDatabase = SQLite3database.Database(indexFilePath)
    # cursor
    cursor = indexDatabase.getCursor()
    # check if the tables needed exists in the index storage file
    tablesNeeded = [
        "dictionary", "document", "termPosition", "documentFrequency",
        "termFrequency"
    ]
    if not checkIfTableNeedExist(indexDatabase, cursor, tablesNeeded):
        print(
            "The given index storage file does not contain the required Tables."
        )
        indexDatabase.close()
        return
    # last check for k
    cursor.execute("SELECT COUNT(*) FROM document;")
    NumberOfDocument = cursor.fetchall()[0][0]
    if k > int(NumberOfDocument):
        print(
            "The second argument k is larger than the number of documents in the input collection."
        )
        print("Argument k should be less than or equal to: %d" %
              (int(NumberOfDocument)))
        indexDatabase.close()
        sys.exit(-1)

    ##################################################################################################################################
    """
	At this point, all input should be all validated,
	and database file has opened,
	The database file has all the information represent the each document language model
	-tf (term frequency) in each of the document
	-document length for each document
	and along with some other extra information
	"""

    # First of all, do text processing(clean text) on the query term
    # (The same way that is done to the input data document terms)
    import Normalization
    import Tokenization

    normalization = Normalization.Normalizer()
    tokenization = Tokenization.Tokenizer()
    queryTermsList = cleanText(queryTermString, tokenization, normalization)
    print("Query Terms:")
    print(queryTermsList)
    # Perform the computation of probability of generating the query terms on the document model
    topKdocument = ComputeProbabilityGeneratingQueryTerms(
        queryTermsList, cursor, k)
    if printScore == "y":
        print(" %4s %63s" % ("Document Name:", "Query Likelyhood:"))
        for index, document in enumerate(topKdocument):
            print("%4d. %-60s" % (index + 1, document[0]), end="")
            print(document[1])
    else:
        print(" %4s" % ("Document Name:"))
        for index, document in enumerate(topKdocument):
            print("%4d. %-60s" % (index + 1, document[0]))
    # close the database file after
    indexDatabase.close()
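ComputeProbabilityGeneratingQueryTerms is not shown above. A minimal sketch of the standard unigram query-likelihood scoring it presumably implements, with add-one smoothing, i.e. log P(q|d) = sum over query terms t of log((tf(t,d) + 1) / (|d| + |V|)). The in-memory dicts here are illustrative; the real code would read tf and document length from the SQLite index:

import math

def query_likelihood(query_terms, term_freq, doc_length, vocab_size, k):
    scores = []
    for doc, length in doc_length.items():
        # add-one smoothed unigram log-probability of the whole query
        log_p = sum(math.log((term_freq.get((t, doc), 0) + 1) /
                             (length + vocab_size))
                    for t in query_terms)
        scores.append((doc, log_p))
    # highest log-likelihood first; keep the top k documents
    return sorted(scores, key=lambda s: s[1], reverse=True)[:k]

# illustrative data only
term_freq = {("rain", "d1"): 3, ("rain", "d2"): 1, ("wind", "d1"): 2}
doc_length = {"d1": 10, "d2": 8}
print(query_likelihood(["rain", "wind"], term_freq, doc_length, vocab_size=50, k=2))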