def loadTextFromFile(self, InputFile):
    """Load UTF-8 text from *InputFile*, segment it with the THULAC-style
    tokenizer held in ``self.thu``, and store the resulting token list as a
    NaturalLanguageObject in ``self._nloTextData``.

    Fix: the input file is now opened with a context manager so the handle
    is closed even if segmentation raises (the original relied on the GC).
    """
    ConsoleOutput.printGreen("Loading text data from: (" + InputFile + ")")
    # Convert to natural language object
    sentence = []
    with open(InputFile, 'r', encoding='UTF-8') as inFile:
        for line in inFile:
            # NOTE(review): assumes self.thu.cut(..., text=True) returns a
            # space-separated string of tokens (THULAC convention) -- confirm
            # against the segmenter the project actually injects.
            line = self.thu.cut(line.strip(), text=True)
            sentence.extend(line.split())
    ConsoleOutput.printGreen("Data load successful. WordCount: " + str(len(sentence)))
    self._nloTextData = NaturalLanguageObject(sentence)
def loadTextFromFile(self, InputFile):
    """Load text from *InputFile*, strip quotes, split punctuation into
    separate tokens, and store the token list as a NaturalLanguageObject in
    ``self._nloTextData``.

    Fixes: the file is now opened with an explicit UTF-8 encoding (matching
    the sibling loaders, which pass ``encoding='UTF-8'``) and is closed via
    a context manager instead of being leaked.
    """
    ConsoleOutput.printGreen("Loading text data from: (" + InputFile + ")")
    sentence = []
    # Convert to natural language object
    with open(InputFile, 'r', encoding='UTF-8') as inFile:
        for line in inFile:
            # Quote characters are removed entirely rather than tokenised.
            line = line.replace('"', '')
            line = line.replace("'", '')
            # separate punctuation from each other so they have separate tokens
            line = re.sub(r'(.)([,.!?:;"()\'\"])', r'\1 \2', line)
            # separate from both directions
            line = re.sub(r'([,.!?:;"()\'\"])(.)', r'\1 \2', line)
            sentence.extend(line.split())
    ConsoleOutput.printGreen("Data load successful. WordCount: " + str(len(sentence)))
    self._nloTextData = NaturalLanguageObject(sentence)
def loadTextFromFile_backup(self, InputFile):
    """Backup loader: like ``loadTextFromFile`` but splits *fullwidth
    (Chinese) punctuation* away from adjacent characters so each mark gets
    its own token, then stores the tokens in ``self._nloTextData``.

    Fix: the file handle is now closed via a context manager.
    """
    ConsoleOutput.printGreen("Loading text data from: (" + InputFile + ")")
    sentence = []
    # Convert to natural language object
    with open(InputFile, 'r', encoding='UTF-8') as inFile:
        for line in inFile:
            line = line.replace('"', '')
            line = line.replace("'", '')
            # re.sub(pattern, replacement, source): detach each fullwidth
            # punctuation mark from the character before it...
            line = re.sub(r'(.)([,。!?:“()‘”“’])', r'\1 \2', line)
            # ...and from the character after it, so it becomes its own token.
            line = re.sub(r'([,。!?:;”()“”‘’])(.)', r'\1 \2', line)
            sentence.extend(line.split())
    ConsoleOutput.printGreen("Data load successful. WordCount: " + str(len(sentence)))
    self._nloTextData = NaturalLanguageObject(sentence)
def Main():
    """Entry point: parse CLI flags, train the sentence-structure and
    vocabulary networks, then optionally run unit-test hooks and/or generate
    text from a seed sentence (to console or to an output file).

    Flags:
      -utss       run sentence-structure unit tests
      -utv        run vocabulary unit tests
      -td FILE    training-data location
      -ts "W..."  seed sentence (must contain _TrainRangeSS words)
      -tsc N      number of words to generate after the seed
      -of FILE    write generated text to FILE instead of the console

    Fixes vs. original: option values are no longer re-parsed as flags
    (previously e.g. the filename after -td fell through to the
    "Un-recognized console argument" error), a trailing option with no value
    no longer raises IndexError, and the output file handle is closed via
    try/finally.
    """
    _isUnitTestingSS = False
    _isUnitTestingV = False
    _recursiveInput = False
    _TrainingDataInputFile = "Datasets/Sstt.utf8.txt"
    _TestSentence = ""
    _TestSequenceGenSize = 30
    _OutputFile = None
    consoleInArgs = sys.argv[1:]
    # check input arguments
    skipNext = False  # True when the current token was consumed as an option value
    for index, val in enumerate(consoleInArgs):
        if skipNext:
            skipNext = False
            continue
        # Runs the unit testing module on initiation
        if val == "-utss":
            _isUnitTestingSS = True
        # Unit testing for the vocabulary network
        elif val == "-utv":
            _isUnitTestingV = True
        # options below consume the following token as their value
        elif val in ("-td", "-ts", "-tsc", "-of") and len(consoleInArgs) >= index + 2:
            skipNext = True
            # specify training data location
            if val == "-td":
                _TrainingDataInputFile = consoleInArgs[index + 1]
                ConsoleOutput.printGreen("Training data load locaiton changed to: \"" + _TrainingDataInputFile + "\"")
            # give a generation sentence input
            elif val == "-ts":
                _TestSentence = consoleInArgs[index + 1]
                if len(_TestSentence.split()) != _TrainRangeSS:
                    raise ValueError('Test sequence must be the same length as the vector training size. (' + str(_TrainRangeSS) + ')')
            # set the amount of words generated after input
            elif val == "-tsc":
                _TestSequenceGenSize = int(consoleInArgs[index + 1])
                ConsoleOutput.printGreen("Test sequence generation size changed to: " + str(_TestSequenceGenSize))
            # set the output file for the generated data to be printed to
            elif val == "-of":
                _OutputFile = str(consoleInArgs[index + 1])
                ConsoleOutput.printGreen("Output generation location changed to: (" + consoleInArgs[index + 1] + ")")
        else:
            raise ValueError('Un-recognized console argument: ' + str(val))
    # Initialise colorama cross-platform console logging
    init()
    MLNetworkSS = NNSentenceStructure()
    MLNetworkV = NNVocabulary()
    # Network trainer converts text data into normalized vectors that
    # can be passed into the networks
    networkTrainer = NetworkTrainer(_TrainRangeSS, _TrainRangeV)
    networkTrainer.loadTextFromFile(_TrainingDataInputFile)
    # Trainer parses the structure into vector normal arrays of size (_TrainRangeSS);
    # the next word of the sequence is used as the target, example
    # ["Harry", "sat", "on", "his"] - ["broomstick"] <-- target
    networkTrainer.loadSentenceStructureNormals()
    networkTrainer.loadVocabularyNormals(MLNetworkV)
    # Pass the vectors into the network
    MLNetworkSS.loadVectorsIntoNetwork(networkTrainer._TrainingSequenceSS,
                                       networkTrainer._TrainingTargetsSS)
    # Fit data
    MLNetworkSS.FitNetwork()
    MLNetworkV.FitNetwork()
    # Use console argument "-utss" to activate testing (unit-test calls are
    # currently stubbed out to debug prints in this version).
    uTester = None
    if _isUnitTestingSS:
        print("_isUnitTestingSS is true")
    # use console argument "-utv" to activate
    if _isUnitTestingV:
        print("_isUnitTestingV is true")
    if _TestSentence != "":
        print("_TestSentence is true")
        printToFile = False
        f = None
        # user has specified output location
        if _OutputFile is not None:
            printToFile = True
            f = open(_OutputFile, 'w')
        try:
            genSize = _TestSequenceGenSize  # number of words to generate
            initialInput = _TestSentence
            if printToFile:
                f.write(initialInput + " ")
            else:
                print(initialInput + " ", end="")
            initialInput = initialInput.split()  # split the seed sentence into words
            # generate a sentence of genSize
            for index in range(0, genSize):
                nlo = NaturalLanguageObject(initialInput)
                # Work around ambiguous Chinese word segmentation: drop leading
                # tokens so the test vector's dimension matches the training
                # dimension, otherwise the KNN classifier raises an error.
                diff = len(nlo.sentenceNormalised) - _TrainRangeSS
                if diff > 0:
                    nlo.sentenceNormalised = nlo.sentenceNormalised[diff:]
                # since nlo will always be the right size, we can use that variable
                predToke = MLNetworkSS.getPrediction([nlo.sentenceNormalised])
                nextToke = nlo.tokeniseNormals([predToke])
                # now we have the next toke in the sentence, convert that to word
                word = MLNetworkV.getPredictedWord(nlo.sentenceNormalised[-1], nextToke[0])
                # decide whether to print to file or console
                if printToFile:
                    f.write(str(word) + " ")
                else:
                    print(str(word) + " ", end="")
                initialInput.append(word)
                # maintain a size of 'genSize'
                del initialInput[0]
        finally:
            # Fix: the output file was never closed in the original.
            if f is not None:
                f.close()
    print("\n")
    # Reset console back to original state
    deinit()
class UnitTester:
    """Exercises the trained networks against a reference paragraph and
    buckets each prediction into passed / non-fatal / failed lists, which
    are then printed with colour-coded console output."""

    # Injected by __init__: the sentence-structure and vocabulary networks.
    neuralNetworkSS = None
    neuralNetworkV = None
    # Window sizes; overwritten by __init__ with the caller's training ranges.
    VectorSizeSS = 3
    VectorSizeV = 1
    # Reference paragraph (module-level constant) and its parsed form,
    # built once at class-definition time.
    _TestingPara = testingParaHarryPotter
    _TestingParaNlo = NaturalLanguageObject(_TestingPara)

    def TestVocabulary(self):
        """For each word of the test paragraph, ask the vocabulary network to
        predict it from the previous word's normalised tag, then bucket the
        result by correctness and prediction probability
        (< 0.2 fail, > 0.6 pass, otherwise non-fatal)."""
        #testingPara = testingParaHarryPotter
        testingPara = self._TestingPara
        passedTests = []
        nonFatalTests = []
        failedTests = []
        # Build a test sequence form each word
        for index, val in enumerate(self._TestingParaNlo.sentenceTokenList[1:]):
            # NOTE(review): enumerate over [1:] starts index at 0, so
            # [index - 1] is -1 on the first pass (i.e. the *last* element)
            # and thereafter lags val by two positions; likewise
            # sentenceNormalised[index] is the element *before* val.
            # Looks like an off-by-one -- confirm before relying on it.
            prevWord = self._TestingParaNlo.sentenceTokenList[index - 1][0]
            prevWordToken = self._TestingParaNlo.sentenceTokenList[index - 1][1]
            prevWordTokenNormal = self._TestingParaNlo.sentenceNormalised[index - 1]
            curWord = val[0]
            curToken = val[1]
            curNormal = self._TestingParaNlo.sentenceNormalised[index]
            prediction = self.neuralNetworkV.getPredictedWord(
                prevWordTokenNormal, curToken)
            probList = self.neuralNetworkV.getPredictionProbability(
                prevWordTokenNormal, curToken)
            # Take the highest class probability. (This inner loop reuses the
            # name 'val'; harmless because the outer loop reassigns it.)
            prob = 0
            for val in probList[0]:
                if (val > prob):
                    prob = val
            if (str(curWord.lower()) == str(prediction).lower()):
                passedTests.append("(" + str(prevWord) + ", " + str(prevWordToken) +
                                   ") Target: " + str(curWord) + " Pred: " + str(prediction) +
                                   " " + str(prob * 100) + "%")
            else:
                # Misprediction: low confidence is a hard fail, high confidence
                # still counts as a pass, the middle band is non-fatal.
                if (prob < 0.2):
                    failedTests.append("(" + str(prevWord) + ", " + str(prevWordToken) +
                                       ") Target: " + str(curWord) + " Pred: " + str(prediction) +
                                       " " + str(prob * 100) + "%")
                elif (prob > 0.6):
                    passedTests.append("(" + str(prevWord) + ", " + str(prevWordToken) +
                                       ") Target: " + str(curWord) + " Pred: " + str(prediction) +
                                       " " + str(prob * 100) + "%")
                else:
                    nonFatalTests.append("(" + str(prevWord) + ", " + str(prevWordToken) +
                                         ") Target: " + str(curWord) + " Pred: " + str(prediction) +
                                         " " + str(prob * 100) + "%")
        # print results
        print("\n")
        # NOTE(review): banner says TestSentenceStructuring() but this is
        # TestVocabulary() -- copy/paste artifact; runtime text left as-is.
        print("********** TestSentenceStructuring() **********")
        print("\n")
        ConsoleOutput.printUnderline("Failed Tests: (" + str(len(failedTests)) + "/" +
                                     str(len(testingPara)) + ")")
        for val in failedTests:
            ConsoleOutput.printRed(val)
        print("\n")
        ConsoleOutput.printUnderline("Non-Fatal failed Tests: (" + str(len(nonFatalTests)) + "/" +
                                     str(len(testingPara)) + ")")
        for val in nonFatalTests:
            ConsoleOutput.printYellow(val)
        print("\n")
        ConsoleOutput.printUnderline("Passed Tests: (" + str(len(passedTests)) + "/" +
                                     str(len(testingPara)) + ")")
        for val in passedTests:
            ConsoleOutput.printGreen(val)
        print("\n")
        ConsoleOutput.printYellow("Passed: " + str(len(passedTests)) +
                                  " Non-Fatals: " + str(len(nonFatalTests)) +
                                  " Fails: " + str(len(failedTests)))
        print("\n")

    def TestSentenceStructuring(self):
        """Slide a VectorSizeSS-word window over the test paragraph, predict
        the tag of the following word with the sentence-structure network,
        bucket each prediction by probability (< 0.3 fail, > 0.6 pass,
        otherwise non-fatal), and finally report overall accuracy."""
        #testingPara = testingParaHarryPotter
        testingPara = self._TestingPara
        passedTests = []
        nonFatalTests = []
        failedTests = []
        # used to predict accuracy of the network
        acTestPred = []
        acTestTrue = []
        # Build a test sequence form each word
        for index, val in enumerate(testingPara):
            tmpTestSeq = []
            target = None
            # grab the next 3 words after (skip windows that would run off the end)
            if (index < len(testingPara) - (self.VectorSizeSS + 1)):
                for index2 in range(0, self.VectorSizeSS):
                    tmpTestSeq.append(testingPara[index + index2])
                target = testingPara[index + self.VectorSizeSS]
                # convert to natural language object
                nloTester = NaturalLanguageObject(tmpTestSeq)
                nloTarget = NaturalLanguageObject([target])
                # get nerual network prediction
                normalPred = self.neuralNetworkSS.getPrediction(
                    nloTester.sentenceNormalised)
                prediction = str(nloTester.tokeniseNormals([normalPred]))
                comp = str(nloTarget.sentenceTags)
                cTrue = nloTarget.sentenceNormalised[0]
                # Scale normals so the later int cast for accuracy_score
                # preserves some resolution.
                acTestTrue.append(cTrue * 100)
                acTestPred.append(normalPred * 100)
                #if first letters match, this means 'NN' will match with 'NNS'
                # (prediction/comp are str() of lists, so [2] is the first
                # character of the first tag)
                if (prediction[2] == comp[2]):
                    #filter for probability
                    probList = self.neuralNetworkSS.getPredictionProbability(
                        nloTester.sentenceNormalised)
                    prob = 0
                    for val in probList[0]:
                        if (val > prob):
                            prob = val
                    passedTests.append(
                        str(nloTester.sentenceTokenList) + " Target: " +
                        str(nloTarget.sentenceTokenList) + " Prediction: " +
                        prediction + " " + str(prob * 100) + "%")
                else:
                    probList = self.neuralNetworkSS.getPredictionProbability(
                        nloTester.sentenceNormalised)
                    prob = 0
                    for val in probList[0]:
                        if (val > prob):
                            prob = val
                    # if accuracy s less than 30% add to failed list
                    if (prob < 0.3):
                        failedTests.append(
                            str(nloTester.sentenceTokenList) + " Target: " +
                            str(nloTarget.sentenceTokenList) + " Prediction: " +
                            prediction + " " + str(prob * 100) + "%")
                    else:
                        # if probability is more than 60% its probably passed
                        if (prob > 0.6):
                            passedTests.append(
                                str(nloTester.sentenceTokenList) + " Target: " +
                                str(nloTarget.sentenceTokenList) + " Prediction: " +
                                prediction + " " + str(prob * 100) + "%")
                        else:
                            nonFatalTests.append(
                                str(nloTester.sentenceTokenList) + " Target: " +
                                str(nloTarget.sentenceTokenList) + " Prediction: " +
                                prediction + " " + str(prob * 100) + "%")
        # print results
        print("\n")
        print("********** TestSentenceStructuring() **********")
        print("\n")
        ConsoleOutput.printUnderline("Failed Tests: (" + str(len(failedTests)) + "/" +
                                     str(len(testingPara)) + ")")
        for val in failedTests:
            ConsoleOutput.printRed(val)
        print("\n")
        ConsoleOutput.printUnderline("Non-Fatal failed Tests: (" + str(len(nonFatalTests)) + "/" +
                                     str(len(testingPara)) + ")")
        for val in nonFatalTests:
            ConsoleOutput.printYellow(val)
        print("\n")
        ConsoleOutput.printUnderline("Passed Tests: (" + str(len(passedTests)) + "/" +
                                     str(len(testingPara)) + ")")
        for val in passedTests:
            ConsoleOutput.printGreen(val)
        print("\n")
        # Compare int-scaled true vs predicted normals for a rough accuracy.
        nnAccuracy = accuracy_score(
            np.array(acTestTrue).astype(int), np.array(acTestPred).astype(int))
        ConsoleOutput.printYellow("Passed: " + str(len(passedTests)) +
                                  " Non-Fatals: " + str(len(nonFatalTests)) +
                                  " Fails: " + str(len(failedTests)))
        ConsoleOutput.printYellow("NeuralNetork accuracy: " +
                                  str(round(nnAccuracy * 100, 1)) + "%")
        print("\n")

    def __init__(self, inNeuralNetworkSS, inNeuralNetworkV, inVectorSizeSS,
                 inVectorSizeV):
        """Store references to the two trained networks and the vector sizes
        used when building test windows."""
        self.neuralNetworkSS = inNeuralNetworkSS
        self.neuralNetworkV = inNeuralNetworkV
        self.VectorSizeSS = inVectorSizeSS
        self.VectorSizeV = inVectorSizeV
def Main():
    """Entry point: parse CLI flags, train both networks, optionally run the
    UnitTester suites, and optionally generate text from a seed sentence.

    Fixes vs. original: option values (e.g. the filename after -td) are
    skipped by the argument loop instead of being re-parsed as unrecognised
    flags, a trailing option with no value no longer raises IndexError, and
    ``uTester == None`` is replaced with the identity check ``is None``.
    """
    _isUnitTestingSS = False
    _isUnitTestingV = False
    _recursiveInput = False
    _TrainingDataInputFile = "Datasets/HarryPotter(xxlarge).txt"
    _TestSentence = ""
    _TestSequenceGenSize = 30
    consoleInArgs = sys.argv[1:]
    # check input arguments
    skipNext = False  # True when the current token was consumed as an option value
    for index, val in enumerate(consoleInArgs):
        if skipNext:
            skipNext = False
            continue
        # Runs the unit testing module on initiation
        if (val == "-utss"):
            _isUnitTestingSS = True
        # Unit testing for the vocabulary network
        elif (val == "-utv"):
            _isUnitTestingV = True
        # options below consume the following token as their value
        elif (val in ("-td", "-ts", "-tsc") and len(consoleInArgs) >= index + 2):
            skipNext = True
            # specify training data location
            if (val == "-td"):
                _TrainingDataInputFile = consoleInArgs[index + 1]
                ConsoleOutput.printGreen(
                    "Training data load locaiton changed to: \"" +
                    _TrainingDataInputFile + "\"")
            # give a generation sentence input
            elif (val == "-ts"):
                _TestSentence = consoleInArgs[index + 1]
                if (len(_TestSentence.split()) != _TrainRangeSS):
                    raise ValueError(
                        'Test sequence must be the same length as the vector training size. ('
                        + str(_TrainRangeSS) + ')')
            # set the amount of words generated after input
            elif (val == "-tsc"):
                _TestSequenceGenSize = int(consoleInArgs[index + 1])
        else:
            raise ValueError('Un-recognized console argument: ' + str(val))
    # Initialise colorama cross-platform console logging
    init()
    MLNetworkSS = NNSentenceStructure()
    MLNetworkV = NNVocabulary()
    # Network trainer converts text data into normalized vectors that
    # can be passed into the networks
    networkTrainer = NetworkTrainer(_TrainRangeSS, _TrainRangeV)
    networkTrainer.loadTextFromFile(_TrainingDataInputFile)
    # Trainer parses the structure into vector normal arrays of size (_TrainRangeSS);
    # the next word of the sequence is used as the target, example
    # ["Harry", "sat", "on", "his"] - ["broomstick"] <-- target
    networkTrainer.loadSentenceStructureNormals()
    networkTrainer.loadVocabularyNormals(MLNetworkV)
    # Pass the vectors into the network
    MLNetworkSS.loadVectorsIntoNetwork(networkTrainer._TrainingSequenceSS,
                                       networkTrainer._TrainingTargetsSS)
    # Fit data
    MLNetworkSS.FitNetwork()
    MLNetworkV.FitNetwork()
    # Use console argument "-utss" to activate testing
    uTester = None
    if (_isUnitTestingSS):
        if (uTester is None):
            uTester = UnitTester(MLNetworkSS, MLNetworkV, _TrainRangeSS,
                                 _TrainRangeV)
        uTester.TestSentenceStructuring()
    # use console argument "-utv" to activate
    if (_isUnitTestingV):
        if (uTester is None):
            uTester = UnitTester(MLNetworkSS, MLNetworkV, _TrainRangeSS,
                                 _TrainRangeV)
        uTester.TestVocabulary()
    if (_TestSentence != ""):
        genSize = _TestSequenceGenSize
        initialInput = _TestSentence
        print(initialInput + " ", end="")
        initialInput = initialInput.split()
        # generate a sentence of genSize
        for index in range(0, genSize):
            nlo = NaturalLanguageObject(initialInput)
            # since nlo will always be the right size, we can use that variable
            predToke = MLNetworkSS.getPrediction(nlo.sentenceNormalised)
            nextToke = nlo.tokeniseNormals([predToke])
            # now we have the next toke in the sentence, convert that to word
            word = MLNetworkV.getPredictedWord(nlo.sentenceNormalised[-1],
                                               nextToke[0])
            print(str(word) + " ", end="")
            initialInput.append(word)
            # maintain a size of 'genSize'
            del initialInput[0]
    print("\n")
    # Reset console back to original state
    deinit()
def Main():
    """Entry point (earlier revision using neuralNetworkSS/neuralNetworkV
    names): parse CLI flags, train both networks, optionally run the
    UnitTester suites, and optionally generate text from a seed sentence.

    Fixes vs. original: option values (e.g. the filename after -td) are
    skipped by the argument loop instead of being re-parsed as unrecognised
    flags, a trailing option with no value no longer raises IndexError, and
    ``uTester == None`` is replaced with the identity check ``is None``.
    """
    _isUnitTestingSS = False
    _isUnitTestingV = False
    _recursiveInput = False
    _TrainingDataInputFile = "Datasets/HarryPotter(xxlarge).txt"
    _TestSentence = ""
    _TestSequenceGenSize = 30
    consoleInArgs = sys.argv[1:]
    # check input arguments
    skipNext = False  # True when the current token was consumed as an option value
    for index, val in enumerate(consoleInArgs):
        if skipNext:
            skipNext = False
            continue
        # Runs the unit testing module on initiation
        if(val == "-utss"):
            _isUnitTestingSS = True
        # Unit testing for the vocabulary network
        elif(val == "-utv"):
            _isUnitTestingV = True
        # options below consume the following token as their value
        elif(val in ("-td", "-ts", "-tsc") and len(consoleInArgs) >= index+2):
            skipNext = True
            # specify training data location
            if(val == "-td"):
                _TrainingDataInputFile = consoleInArgs[index+1]
                ConsoleOutput.printGreen("Training data load locaiton changed to: \"" + _TrainingDataInputFile + "\"")
            # give a generation sentence input
            elif(val == "-ts"):
                _TestSentence = consoleInArgs[index+1]
                if(len(_TestSentence.split()) != _TrainRangeSS):
                    raise ValueError('Test sequence must be the same length as the vector training size. (' + str(_TrainRangeSS) + ')')
            # set the amount of words generated after input
            elif(val == "-tsc"):
                _TestSequenceGenSize = int(consoleInArgs[index+1])
        else:
            raise ValueError('Un-recognized console argument: ' + str(val))
    # Initialise colorama cross-platform console logging
    init()
    neuralNetworkSS = NNSentenceStructure()
    neuralNetworkV = NNVocabulary()
    # Network trainer converts text data into normalized vectors that
    # can be passed into the networks
    networkTrainer = NetworkTrainer(_TrainRangeSS, _TrainRangeV)
    networkTrainer.loadTextFromFile(_TrainingDataInputFile)
    # Trainer parses the structure into vector normal arrays of size (_TrainRangeSS);
    # the next word of the sequence is used as the target, example
    # ["Harry", "sat", "on", "his"] - ["broomstick"] <-- target
    networkTrainer.loadSentenceStructureNormals()
    networkTrainer.loadVocabularyNormals(neuralNetworkV)
    # Pass the vectors into the network
    neuralNetworkSS.loadVectorsIntoNetwork(networkTrainer._TrainingSequenceSS,
                                           networkTrainer._TrainingTargetsSS)
    # Fit data
    neuralNetworkSS.FitNetwork()
    neuralNetworkV.FitNetwork()
    # Use console argument "-utss" to activate testing
    uTester = None
    if(_isUnitTestingSS):
        if(uTester is None):
            uTester = UnitTester(neuralNetworkSS, neuralNetworkV, _TrainRangeSS, _TrainRangeV)
        uTester.TestSentenceStructuring()
    # use console argument "-utv" to activate
    if(_isUnitTestingV):
        if(uTester is None):
            uTester = UnitTester(neuralNetworkSS, neuralNetworkV, _TrainRangeSS, _TrainRangeV)
        uTester.TestVocabulary()
    if(_TestSentence != ""):
        genSize = _TestSequenceGenSize
        initialInput = _TestSentence
        print(initialInput + " ", end="")
        initialInput = initialInput.split()
        # generate a sentence of genSize
        for index in range(0, genSize):
            nlo = NaturalLanguageObject(initialInput)
            # since nlo will always be the right size, we can use that variable
            predToke = neuralNetworkSS.getPrediction(nlo.sentenceNormalised)
            nextToke = nlo.tokeniseNormals([predToke])
            # now we have the next toke in the sentence, convert that to word
            word = neuralNetworkV.getPredictedWord(nlo.sentenceNormalised[-1], nextToke[0])
            print(str(word) + " ", end="")
            initialInput.append(word)
            # maintain a size of 'genSize'
            del initialInput[0]
    print("\n")
    # Reset console back to original state
    deinit()
def TestSentenceStructuring(self):
    """Slide a VectorSizeSS-word window over the test paragraph, predict the
    tag of the following word with the sentence-structure network, bucket
    each prediction by probability (< 0.3 fail, > 0.6 pass, otherwise
    non-fatal), and finally report overall accuracy."""
    #testingPara = testingParaHarryPotter
    testingPara = self._TestingPara
    passedTests = []
    nonFatalTests = []
    failedTests = []
    # used to predict accuracy of the network
    acTestPred = []
    acTestTrue = []
    # Build a test sequence form each word
    for index, val in enumerate(testingPara):
        tmpTestSeq = []
        target = None
        # grab the next 3 words after (skip windows that would run off the end)
        if(index < len(testingPara)-(self.VectorSizeSS+1)):
            for index2 in range(0, self.VectorSizeSS):
                tmpTestSeq.append(testingPara[index+index2])
            target = testingPara[index+self.VectorSizeSS]
            # convert to natural language object
            nloTester = NaturalLanguageObject(tmpTestSeq)
            nloTarget = NaturalLanguageObject([target])
            # get nerual network prediction
            normalPred = self.neuralNetworkSS.getPrediction(nloTester.sentenceNormalised)
            prediction = str(nloTester.tokeniseNormals([normalPred]))
            comp = str(nloTarget.sentenceTags)
            cTrue = nloTarget.sentenceNormalised[0]
            # Scale normals so the later int cast for accuracy_score
            # preserves some resolution.
            acTestTrue.append(cTrue*100)
            acTestPred.append(normalPred*100)
            #if first letters match, this means 'NN' will match with 'NNS'
            # (prediction/comp are str() of lists, so [2] is the first
            # character of the first tag)
            if(prediction[2] == comp[2]):
                #filter for probability
                probList = self.neuralNetworkSS.getPredictionProbability(nloTester.sentenceNormalised)
                prob = 0
                for val in probList[0]:
                    if(val > prob):
                        prob = val
                passedTests.append(str(nloTester.sentenceTokenList) + " Target: " +
                                   str(nloTarget.sentenceTokenList) + " Prediction: " +
                                   prediction + " " +str(prob*100) + "%")
            else:
                probList = self.neuralNetworkSS.getPredictionProbability(nloTester.sentenceNormalised)
                prob = 0
                for val in probList[0]:
                    if(val > prob):
                        prob = val
                # if accuracy s less than 30% add to failed list
                if(prob < 0.3):
                    failedTests.append(str(nloTester.sentenceTokenList) + " Target: " +
                                       str(nloTarget.sentenceTokenList) + " Prediction: " +
                                       prediction + " " +str(prob*100) + "%")
                else:
                    # if probability is more than 60% its probably passed
                    if(prob > 0.6):
                        passedTests.append(str(nloTester.sentenceTokenList) + " Target: " +
                                           str(nloTarget.sentenceTokenList) + " Prediction: " +
                                           prediction + " " +str(prob*100) + "%")
                    else:
                        nonFatalTests.append(str(nloTester.sentenceTokenList) + " Target: " +
                                             str(nloTarget.sentenceTokenList) + " Prediction: " +
                                             prediction + " " +str(prob*100) + "%")
    # print results
    print("\n")
    print("********** TestSentenceStructuring() **********")
    print("\n")
    ConsoleOutput.printUnderline("Failed Tests: (" + str(len(failedTests)) + "/" +
                                 str(len(testingPara)) + ")")
    for val in failedTests:
        ConsoleOutput.printRed(val)
    print("\n")
    ConsoleOutput.printUnderline("Non-Fatal failed Tests: (" + str(len(nonFatalTests)) + "/" +
                                 str(len(testingPara)) + ")")
    for val in nonFatalTests:
        ConsoleOutput.printYellow(val)
    print("\n")
    ConsoleOutput.printUnderline("Passed Tests: (" + str(len(passedTests)) + "/" +
                                 str(len(testingPara)) + ")")
    for val in passedTests:
        ConsoleOutput.printGreen(val)
    print("\n")
    # Compare int-scaled true vs predicted normals for a rough accuracy.
    nnAccuracy = accuracy_score(np.array(acTestTrue).astype(int),
                                np.array(acTestPred).astype(int))
    ConsoleOutput.printYellow("Passed: " + str(len(passedTests)) +
                              " Non-Fatals: " + str(len(nonFatalTests)) +
                              " Fails: " + str(len(failedTests)))
    ConsoleOutput.printYellow("NeuralNetork accuracy: " + str(round(nnAccuracy*100,1)) + "%")
    print("\n")