def predict(parser_params, metric: int) -> List[int]:
    """Parse the reviews file and predict each review's quality.

    Runs the parser to obtain the list of reviews, extracts a feature
    vector per review, and feeds the batch to the ANN for the requested
    metric.

    :param parser_params: parameters forwarded to ``_parse_file``.
    :param metric: which trained metric to use (1 or 2).
    :Return List[int]: the quality predicted for each review
        (0 - insufficient; 1 - sufficient; 2 - good; 3 - excellent),
        or an empty list on failure.
    """
    # runs the parser
    parsed_data = _parse_file(parser_params)
    if not parsed_data:
        # Bail out early: the original fell through and fed an empty batch
        # to the ANN, leaving `predictions` as None.
        print('Failed to parse reviews')
        return []

    input_prediction = []
    for entry in parsed_data:
        text_features = []

        # enter author's reputation (fixed placeholder value)
        text_features.append(1)

        # finding patterns: count the tuples produced by every pattern.
        # NOTE(review): the original summed patt3[1] three times and ignored
        # patt4/patt5 (copy-paste bug); models trained with the old feature
        # must be retrained after this fix.
        tagsTokens = taggerManager.TaggerComment(entry)
        tags = taggerManager.TagsDict(tagsTokens)
        patt1, patt3, patt4, patt5 = patternsManager.GetPatternsDict(tags)
        number_tuples = (len(patt1[1]) + len(patt3[1]) + len(patt4[1])
                         + len(patt5[1]))
        text_features.append(number_tuples)

        # correctness (percentage of tokens found in the user dictionary)
        correctness = correctnessManager.Correctness(entry)
        text_features.append(correctness)

        # enter features
        input_prediction.append(text_features)

    predictions = annManager.AnnPredict(input_prediction, metric)
    if not predictions.any():
        print(
            "Prediction failed. Make sure the requested model was trained and the given metric is valid(1 or 2)."
        )
        return []
    return predictions
def Correctness(comment):
    """Return the percentage of the comment's tokens that appear in the
    Portuguese user dictionary.

    :param comment: raw comment text to tokenize and check.
    :return: percentage in [0.0, 100.0]; 0.0 when no tokens are found.
    """
    tokens = tokensManager.GetTokens(comment, 1)
    if not tokens:
        # no tokens found
        return 0.0
    with open('UserDictionary_pt.txt', encoding="utf8") as list1:
        # Build a set once: O(1) membership tests instead of scanning a
        # list for every token.
        dictionary = set(list1.read().split("\n"))
    correct = sum(1 for w in tokens if w in dictionary)
    return (correct / len(tokens)) * 100
def train_model(parsed_data: List, metric: int):
    """Train the model using the specified metric (1 or 2), or 0 to train both.

    :param parsed_data: sequence of ``(comment, classification)`` pairs.
    :param metric: metric selector forwarded to ``annManager.AnnTraining``.
    """
    input_text = []
    output_text = []
    for entry in parsed_data:
        text_features = []

        # enter author's reputation (fixed placeholder value)
        text_features.append(1)

        # finding patterns: count the tuples produced by every pattern.
        # NOTE(review): the original summed patt3[1] three times and ignored
        # patt4/patt5 (copy-paste bug); keep this in sync with predict().
        tagsTokens = taggerManager.TaggerComment(entry[0])
        tags = taggerManager.TagsDict(tagsTokens)
        patt1, patt3, patt4, patt5 = patternsManager.GetPatternsDict(tags)
        number_tuples = (len(patt1[1]) + len(patt3[1]) + len(patt4[1])
                         + len(patt5[1]))
        text_features.append(number_tuples)

        # correctness (percentage of tokens found in the user dictionary)
        correctness = correctnessManager.Correctness(entry[0])
        text_features.append(correctness)

        # enter features
        input_text.append(text_features)
        # enter classification (expected output for this entry)
        output_text.append(entry[1])

    print("The features vector (input) is", input_text,
          "and the expected value (output) is", output_text, "\n")

    annManager.AnnTraining(input_text, output_text, metric)
output_text = []

# From here on you must load your texts to extract their features. It can be
# an SQL file, txt files, csv... anything — what matters is the plain text!
# Use a context manager so the file is always closed (the original leaked the
# file handle).
with open("examples.txt", encoding="utf8") as file_examples:
    for text in file_examples:
        # list with the text features.
        text_features = []

        # author reputation (fixed placeholder value)
        author = 1
        print("Author Reputation:", author)
        text_features.append(author)

        # number of tuples found by the patterns.
        # NOTE(review): the original summed patt3[1] three times and ignored
        # patt4/patt5 (copy-paste bug); keep this in sync with the trainer.
        tagsTokens = taggerManager.TaggerComment(text)
        tags = taggerManager.TagsDict(tagsTokens)
        patt1, patt3, patt4, patt5 = patternsManager.GetPatternsDict(tags)
        number_tuples = (len(patt1[1]) + len(patt3[1]) + len(patt4[1])
                         + len(patt5[1]))
        print("Number of Tuples:", number_tuples)
        text_features.append(number_tuples)

        # correctness (percentage of tokens found in the user dictionary)
        correctness = correctnessManager.Correctness(text)
        print("Correctness: ", correctness, "\n-----\n")
        text_features.append(correctness)

        # in the end, add this list to the input text list, for training purposes.
def TaggerComment(comment):
    """Tokenize a comment and return its part-of-speech tags.

    :param comment: raw comment text.
    :return: tags produced by the offline tagger over the comment's tokens.
    """
    comment_tokens = tokensManager.GetTokens(comment, 0)
    # Offline tagging is used; an online tagger (TaggerOnline) is the
    # alternative backend.
    return TaggerOffline(comment_tokens)