# Neural Dependency Parser
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse('What is the airspeed of an unladen swallow ?'.split())
print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()]
       for parse in parses])
print("\nExpected: [[(('What', 'WP'), 'cop', ('is', 'VBZ')), "
      "(('What', 'WP'), 'nsubj', ('airspeed', 'NN')), "
      "(('airspeed', 'NN'), 'det', ('the', 'DT')), "
      "(('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), "
      "(('swallow', 'VB'), 'case', ('of', 'IN')), "
      "(('swallow', 'VB'), 'det', ('an', 'DT')), "
      "(('swallow', 'VB'), 'amod', ('unladen', 'JJ')), "
      "(('What', 'WP'), 'punct', ('?', '.'))]]\n")

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
print(list(parser.tokenize('What is the airspeed of an unladen swallow?')))
print("\nExpected: ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', "
      "'swallow', '?']\n")

# POS Tagger
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
print(list(pos_tagger.tag('What is the airspeed of an unladen swallow ?'.split())))
print("\nExpected: [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), "
      "('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), "
      "('swallow', 'VB'), ('?', '.')]\n")

# NER Tagger
ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
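# A minimal availability check to run before the calls above, modeled on the
# connection test that appears later in this collection; the URL and port are
# assumptions, not part of the original snippet.
import requests

def corenlp_is_up(url='http://localhost:9000'):
    """Return True if a CoreNLP server answers at `url`."""
    try:
        requests.get(url)
        return True
    except requests.exceptions.ConnectionError:
        return False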
# Parse each line of an input file into constituency trees; sentences the
# parser rejects get a dummy tree so the output stays aligned with the input.
# `args` comes from the surrounding script's argument parsing.
from nltk import Tree
from nltk.parse import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9001')
print('parser generated!')
exception_sen = []

f_input = open(args.input, mode='r', encoding='utf-8')
f_output = open(args.output, mode='w', encoding='utf-8')
for senid, line in enumerate(f_input):
    print(senid)
    try:
        p_parse_trees = list(parser.parse(parser.tokenize(line)))
    except ValueError:
        print('parsing fail')
        exception_sen.append(senid)
        p_parse_trees = [Tree.fromstring('(S (NULL ERROR))')]  # dummy tree
    f_output.write('%d\n' % len(p_parse_trees))
    for sub_tree in p_parse_trees:
        f_output.write(str(sub_tree))
        f_output.write('\n|||\n')
f_input.close()
f_output.close()
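# Illustrative sketch, not part of the original script: the file written above
# stores a tree count per sentence, with each tree followed by a '|||' line,
# so it can be read back roughly like this (the path argument is an assumption).
from nltk import Tree

def read_trees(path):
    trees = []
    with open(path, encoding='utf-8') as f:
        for block in f.read().split('|||'):
            lines = [ln for ln in block.strip().split('\n') if ln.strip()]
            if lines and lines[0].strip().isdigit():
                lines = lines[1:]  # drop the tree-count line
            if lines:
                trees.append(Tree.fromstring('\n'.join(lines)))
    return trees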
import os
import sys

import nltk
from gensim.models import KeyedVectors
from nltk.parse import CoreNLPParser


class custom_parse_handler:
    corenlp_host = 'http://localhost:9000'  # CoreNLP server host
    main_categories = ['geography', 'music', 'movies']  # Categories used in the project

    def __init__(self, input_file, output_file, dbConnector):
        self.ip_file = input_file  # Input statements
        self.op_file = output_file  # Output is streamed to this file as well as to the console
        self.parser = CoreNLPParser(url=self.corenlp_host)  # Connection to the CoreNLP parser
        self.testParserConnection()
        # Set up the word2vec model
        corpusFilePath = os.path.dirname(os.path.realpath(__file__)) + \
            os.path.sep + "tools" + os.path.sep + "word2vec"
        corpusFileName = "GoogleNews-vectors-negative300.bin"
        self.filePath = corpusFilePath
        self.fileName = corpusFilePath + os.path.sep + corpusFileName  # Full path to the embeddings file
        self.model = KeyedVectors.load_word2vec_format(self.fileName, binary=True)
        self.stopWords = nltk.corpus.stopwords.words('english')
        self.fileNewLine = "\n"
        self.dbConnector = dbConnector

    def testParserConnection(self):
        test_sentence = "This is a test statement"  # renamed from `str` to avoid shadowing the builtin
        try:
            list(self.parser.parse(test_sentence.split()))
        except Exception:
            print("Error while connecting to CoreNLP server. Exiting.")
            sys.exit()

    def getParseTree(self, sentence):
        return list(self.parser.parse(sentence.split()))

    def displayConstructedParseTree(self, parseTree, fileObj=None):
        for entry in parseTree:
            if fileObj is None:
                entry.pretty_print()
            else:
                entry.pretty_print(stream=fileObj)

    def updatePredictedCategoryForWord(self, entry, category_sum):
        for i in range(len(self.main_categories)):
            try:
                sim_val = self.model.similarity(entry, self.main_categories[i])
                category_sum[self.main_categories[i]] += sim_val
            except KeyError:
                pass
        return category_sum

    def getCategoryWithMaxVoting(self, categoryMap):
        max_val = None
        max_category = None
        for entry in categoryMap:
            val = categoryMap[entry]
            if max_val is None or val > max_val:
                max_val = val
                max_category = entry
        return max_category

    def assignCategory(self, statement):
        # Special case: geography statements beginning with "where is" are
        # otherwise misclassified
        if statement.lower().startswith('where is'):
            return 'geography'
        tokens = list(self.parser.tokenize(statement))
        filtered_words = [w for w in tokens if w not in self.stopWords]
        filtered_words_lower = [w.lower() for w in filtered_words]
        category_sum = {}
        for entry in self.main_categories:
            # Drop non-geography categories when 'capital' appears, and drop
            # 'geography' when birth-related words appear
            if (entry != 'geography' and 'capital' not in filtered_words_lower) or \
                    (entry == 'geography' and not ('born' in filtered_words_lower or
                                                   'birth' in filtered_words_lower)):
                category_sum[entry] = 0
        for entry in filtered_words:
            category_sum = self.updatePredictedCategoryForWord(entry, category_sum)
        return self.getCategoryWithMaxVoting(category_sum)

    # Direct the output to the default output stream and an output file.
    def outputGenerator(self, statement, query, answer, opFileObj=None):
        if opFileObj is not None:
            opFileObj.write("<QUESTION> " + statement)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            if query is not None:
                opFileObj.write("<QUERY> " + query)
            else:
                opFileObj.write("<QUERY> ")
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            opFileObj.write("<ANSWER> " + answer)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
        print("<QUESTION> ", statement, "\n")
        if query is not None:
            print("<QUERY> ", query, "\n")
        else:
            print("<QUERY>\n")
        print("<ANSWER> ", answer, "\n\n")

    # Process the parse tree to extract projections and translate them into SQL queries
    def extractProjections(self, parseTree, queryObj, category):
        # Kicks off the tree recursion
        for entry in parseTree:
            self.processRecurse(None, entry, queryObj, category)

    # Recursively traverse the parse tree (DFS) and generate transitions for
    # each parent/child node pair
    def processRecurse(self, parent, treeObj, queryObj, category):
        if not isinstance(treeObj, nltk.tree.Tree):  # leaf node
            transition_obj = transition(parent, treeObj, None, queryObj, category)
            return treeObj, transition_obj
        if treeObj.label() in (".", "DT"):  # skip determiners and punctuation
            return "", None
        str_transition = treeObj.label() + " " + "->"
        current_children = []
        for i in range(len(treeObj)):
            label, transition_obj_inter = self.processRecurse(
                treeObj.label(), treeObj[i], queryObj, category)
            if transition_obj_inter is not None:
                current_children.append(transition_obj_inter)
            str_transition += " " + label
        if treeObj.label() != 'ROOT':
            transition_obj_fin = transition(parent, str_transition,
                                            current_children, queryObj, category)
        else:  # root of the tree
            transition_obj_fin = None
        return treeObj.label(), transition_obj_fin

    # Parse the statements in the input file sequentially and perform the
    # semantic transformation
    def parseInputFile(self):
        ipFileObj = open(self.ip_file, "r")
        opFileObj = open(self.op_file, "w")
        try:
            for entry in ipFileObj:
                question = entry.strip()
                if not question.startswith('--'):
                    queryObj = queryForm()
                    parseTree = self.getParseTree(question)  # Generate parse tree
                    category = self.assignCategory(question)  # Assign probable category
                    # Extract the projections and populate the query object
                    self.extractProjections(parseTree, queryObj, category)
                    queryObj.constructQuery()  # Construct the final query
                    # Execute the query against the database
                    results = self.dbConnector.getResults(queryObj, category)
                    self.outputGenerator(question, queryObj.getQueryStr(),
                                         results, opFileObj)
        except Exception as e:
            print("Error while processing.")
            print(e)
        finally:
            ipFileObj.close()
            opFileObj.close()
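# Hypothetical driver for the handler above; the file names and the
# dbConnector object are assumptions, not from the original project.
#
#   handler = custom_parse_handler('questions.txt', 'output.txt', db)
#   handler.parseInputFile()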
for category in categoryNames:
    # Set up the lists of responses for this category
    wordListResponse = []
    responseLengthList = []
    completeWordList = []
    responseList = []
    # Loop through each response for the category
    for response in d.loc[d["image"] == category, "nameing_response"]:
        # Clean up the response (remove punctuation, emoji, etc.) ...
        response_cleaned = clean_response(response)
        # ... then break it into a list of words
        curWordList = list(parser.tokenize(response_cleaned))
        # Record the word list for this response
        wordListResponse.append(curWordList)
        # Track the number of words in each response
        responseLengthList.append(len(curWordList))
        # Accumulate all individual word responses
        completeWordList = completeWordList + curWordList
        responseList.append(".".join(curWordList))
    # Number of responses to this category
    number_responses.append(len(responseLengthList))
# Set up the lists of responses for the first category
wordListResponse_1 = []
lemmaListResponse_1 = []
responseLengthList_1 = []
completeWordList_1 = []
completeLemmaList_1 = []
responseList_1 = []
# Loop through each response for that category
for response in d.loc[d["angle"] == category1, "nameing_response"]:
    # Clean up the response (remove punctuation, emoji, etc.) ...
    response_cleaned = clean_response(response)
    # ... then tokenize the cleaned response
    curWordList = list(parser.tokenize(response_cleaned))
    # Record the word list for this response
    wordListResponse_1.append(curWordList)
    # Track the number of words in each response
    responseLengthList_1.append(len(curWordList))
    # Accumulate all individual word responses
    completeWordList_1 = completeWordList_1 + curWordList
    responseList_1.append(".".join(curWordList))

# Set up the lists of responses for the second category
wordListResponse_2 = []
lemmaListResponse_2 = []
class SingleSentencePlot(SentencePlot):
    def __init__(self, config, nl_model):
        super().__init__(config)
        self.nl_model = nl_model
        self.server_url = 'http://localhost:9000'
        self.parser = CoreNLPParser(url=self.server_url)

    def plot(self, max_relevance_words_in_plot, sentence_id="1004293:0"):
        results_file = self.config.get_file_of_results(
            self.nl_model.config.name_of_model)
        if not os.path.isfile(results_file):
            raise FileNotFoundError("[!] Data %s not found" % results_file)

        index = -1
        original_sentences = self.nl_model.internal_data_loader.original_sentence_training

        with open(results_file, 'r') as file:
            for line in file:
                sentences = json.loads(line)
                sentences.pop(0)
                for sentence in sentences:
                    index += 1
                    if sentence['sentence_id'] == sentence_id:
                        sentence_index = sentence['sentence_index']
                        tokenized_sentence = list(self.parser.tokenize(
                            original_sentences[sentence_index]))
                        print("tokenized_sentence ", tokenized_sentence)

                        # Keep the top-scoring words per class for the linear
                        # regression relevances.
                        max_word_relevance = np.full((3, max_relevance_words_in_plot), -1.)
                        relevant_words = np.empty([3, max_relevance_words_in_plot],
                                                  dtype=object)

                        for attribute_dict in sentence['subsets_word_relevance_linear_regression']:
                            for i in range(3):
                                min_value = max_word_relevance[i].min()
                                min_index = max_word_relevance[i].argmin()
                                if attribute_dict[str(i)] > min_value:
                                    max_word_relevance[i][min_index] = attribute_dict[str(i)]
                                    n_indices = len(attribute_dict['indices_attribute'])
                                    word = str(tokenized_sentence[
                                        attribute_dict['indices_attribute'][0]])
                                    for w in range(1, n_indices):
                                        word = word + ' ' + str(tokenized_sentence[
                                            attribute_dict['indices_attribute'][w]])
                                    relevant_words[i][min_index] = word

                        intercept = sentence['intercepts_slr']
                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['prediction']), "OWN",
                            sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)
                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['aspect_polarity_matrix']), "OWN",
                            sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)

                        # Same selection for the prediction-difference relevances.
                        max_word_relevance = np.full((3, max_relevance_words_in_plot), -1.)
                        relevant_words = np.empty([3, max_relevance_words_in_plot],
                                                  dtype=object)

                        for attribute_dict in sentence['subsets_word_relevance_pred_difference']:
                            for i in range(3):
                                min_value = max_word_relevance[i].min()
                                min_index = max_word_relevance[i].argmin()
                                if attribute_dict[str(i)] > min_value:
                                    max_word_relevance[i][min_index] = attribute_dict[str(i)]
                                    indices = attribute_dict['indices_attribute']
                                    if type(indices) == list:
                                        n_indices = len(indices)
                                        word = str(tokenized_sentence[indices[0]])
                                        for w in range(1, n_indices):
                                            word = word + ' ' + str(tokenized_sentence[indices[w]])
                                    else:
                                        word = str(tokenized_sentence[indices])
                                    relevant_words[i][min_index] = word

                        intercept = np.zeros(len(sentence['prediction']))
                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['prediction']), "LACE",
                            sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)
                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['aspect_polarity_matrix']), "LACE",
                            sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)

                        # Reset for the LIME relevances below.
                        max_word_relevance = np.full((3, max_relevance_words_in_plot), -1.)
                        relevant_words = np.empty([3, max_relevance_words_in_plot],
                                                  dtype=object)

                        for attribute_dict in sentence['word_relevance_linear_regression']:
                            for i in range(3):
                                min_value = max_word_relevance[i].min()
                                min_index = max_word_relevance[i].argmin()
                                if attribute_dict[str(i)] > min_value:
                                    max_word_relevance[i][min_index] = attribute_dict[str(i)]
                                    relevant_words[i][min_index] = str(tokenized_sentence[
                                        attribute_dict['indices_attribute'][0]])

                        intercept = sentence['intercepts_slr']
                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['prediction']), "LIME",
                            sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)
                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['aspect_polarity_matrix']), "LIME",
                            sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)
class SentenceExplanationPlot:
    def __init__(self, neural_language_model):
        self.neural_language_model = neural_language_model
        self.server_url = 'http://localhost:9000'
        self.parser = CoreNLPParser(url=self.server_url)

    def run(self, sentence_id):
        # `file_name` renamed from `file` to avoid shadowing by the handle below.
        file_name = self.neural_language_model.config.get_explanation_file(
            self.neural_language_model.config.name_of_model, sentence_id)

        with open(file_name, 'r') as file:
            for line in file:
                sentences = json.loads(line)
                for sentence in sentences:
                    lemmatized_sentence = sentence['lemmatized_sentence']
                    original_sentence = sentence['original_sentence']
                    tokenized_sentence = list(self.parser.tokenize(original_sentence))
                    aspect_indices = sentence['aspects']
                    sentence_id = sentence['sentence_id']
                    sentence_index = sentence['sentence_index']
                    argmax_pred = np.argmax(sentence['prediction'])

                    lace = []
                    lime = []
                    own = []
                    relation_yes = {}
                    aspect_sentiment_positive = {}
                    aspect_sentiment_negative = {}
                    word_sentiment_positive = {}
                    word_sentiment_negative = {}
                    attention_score = {}
                    x = []

                    # The hop model keeps one set of scores per hop iteration;
                    # every other model keeps a single set under key 0.
                    is_hop_model = (self.neural_language_model.config.name_of_model
                                    == "LCR_Rot_hop_model")
                    n_weight_sets = (self.neural_language_model.config.n_iterations_hop
                                     if is_hop_model else 1)
                    for i in range(n_weight_sets):
                        relation_yes[i] = []
                        aspect_sentiment_positive[i] = []
                        aspect_sentiment_negative[i] = []
                        word_sentiment_positive[i] = []
                        word_sentiment_negative[i] = []
                        attention_score[i] = []

                    for index in range(len(lemmatized_sentence)):
                        if index in aspect_indices:
                            continue
                        original_word = tokenized_sentence[index]
                        x.append(original_word)
                        lemma = lemmatized_sentence[index]
                        word_info = sentence[lemma]
                        lime.append(word_info['relevance_linear_regression'][argmax_pred])
                        lace.append(word_info['subset_pred_dif'][argmax_pred])
                        own.append(word_info['subset_linear_reg'][argmax_pred])

                        if is_hop_model:
                            for i in range(n_weight_sets):
                                attention_score[i].append(
                                    word_info['attention_score_' + str(i)])
                                aspect_sentiment_positive[i].append(word_info[
                                    'weighted_states_pred_aspect_sentiments_' + str(i)][0])
                                aspect_sentiment_negative[i].append(word_info[
                                    'weighted_states_pred_aspect_sentiments_' + str(i)][1])
                                relation_yes[i].append(word_info[
                                    'weighted_states_pred_relations_' + str(i)][0])
                                word_sentiment_positive[i].append(word_info[
                                    'weighted_states_pred_word_sentiments_' + str(i)][0])
                                word_sentiment_negative[i].append(word_info[
                                    'weighted_states_pred_word_sentiments_' + str(i)][1])
                        else:
                            attention_score[0].append(word_info['attention_score'])
                            aspect_sentiment_positive[0].append(
                                word_info['weighted_states_pred_aspect_sentiments'][0])
                            aspect_sentiment_negative[0].append(
                                word_info['weighted_states_pred_aspect_sentiments'][1])
                            relation_yes[0].append(
                                word_info['weighted_states_pred_relations'][0])
                            word_sentiment_positive[0].append(
                                word_info['weighted_states_pred_word_sentiments'][0])
                            word_sentiment_negative[0].append(
                                word_info['weighted_states_pred_word_sentiments'][1])

                    # Normalize each relevance vector by its L1 norm.
                    sum_lace = np.sum(np.abs(lace))
                    sum_lime = np.sum(np.abs(lime))
                    sum_own = np.sum(np.abs(own))
                    average_lace = np.array(lace) / sum_lace
                    print("average_lace ", average_lace)
                    average_lime = np.array(lime) / sum_lime
                    print("average_lime ", average_lime)
                    average_own = np.array(own) / sum_own
                    print("own ", own)
                    print("average_own ", average_own)

                    for i in range(n_weight_sets):
                        self.plot(x, average_lime, average_lace, average_own,
                                  attention_score[i],
                                  aspect_sentiment_positive[i],
                                  aspect_sentiment_negative[i],
                                  word_sentiment_positive[i],
                                  word_sentiment_negative[i],
                                  relation_yes[i],
                                  sentence_id, sentence_index, i)

    def plot(self, x, lime, lace, own, attention_score,
             aspect_sentiment_positive, aspect_sentiment_negative,
             word_sentiment_positive, word_sentiment_negative, relation_yes,
             sentence_id, index_number, weight_number):
        fig, ax = plt.subplots()
        fig.set_size_inches(40.5, 14.5)
        ax.tick_params(length=15, axis='x', width=3, labelsize=58)
        ax.tick_params(length=15, axis='y', width=3, labelsize=40)
        plt.subplots_adjust(bottom=0.55)
        ax.set_ylim([-0.1, 1.1])
        ax.axhline(0, color='grey', alpha=0.50)

        index = np.array([2 + i * 1.25 for i in range(len(x))])
        print(index)
        bar_width = 0.10
        opacity = 0.8

        plt.bar(index, relation_yes, bar_width, alpha=opacity,
                align='center', color='C0', label='ARC', edgecolor='black')
        plt.bar(index + bar_width, aspect_sentiment_positive, bar_width,
                alpha=opacity, align='center', color='C1',
                label='ARWSC positive', edgecolor='black')
        plt.bar(index + bar_width * 2, aspect_sentiment_negative, bar_width,
                alpha=opacity, align='center', color='C2',
                label='ARWSC negative', edgecolor='black')
        plt.bar(index + bar_width * 3, word_sentiment_positive, bar_width,
                alpha=opacity, align='center', color='C3',
                label='WSC positive', edgecolor='black')
        plt.bar(index + bar_width * 4, word_sentiment_negative, bar_width,
                alpha=opacity, align='center', color='C4',
                label='WSC negative', edgecolor='black')
        plt.bar(index + bar_width * 5, attention_score, bar_width,
                alpha=opacity, align='center', color='C9',
                label='Attention score', edgecolor='black')
        plt.bar(index + bar_width * 6, lime, bar_width, alpha=opacity,
                align='center', color='C6', label='A-LIME', edgecolor='black')
        plt.bar(index + bar_width * 7, lace, bar_width, alpha=opacity,
                align='center', color='C7', label='A-LACE', edgecolor='black')
        plt.bar(index + bar_width * 8, own, bar_width, alpha=opacity,
                align='center', color='C8', label='LETA', edgecolor='black')

        # Only the last xticks call takes effect, so set the labels once,
        # centred under the group of bars.
        plt.xticks(index + bar_width * 4, x)
        plt.legend(loc='upper left', prop={'size': 36})
        plt.tight_layout()

        model_name = self.neural_language_model.config.name_of_model
        file = self.neural_language_model.config.get_plot_entire_sentence(
            model_name, sentence_id, index_number, weight_number)
        plt.savefig(file)
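# Hypothetical usage of the two plot classes above; the config and model
# objects are assumptions, and the sentence id mirrors the default seen earlier.
#
#   SingleSentencePlot(config, nl_model).plot(10, sentence_id="1004293:0")
#   SentenceExplanationPlot(nl_model).run(sentence_id="1004293:0")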
senid += 1
fe_alignment, ef_alignment = get_alignments(fe_phrase, ef_phrase)
alignment = do_alignment(fe_alignment, ef_alignment,
                         len(ef_phrase[0]), len(fe_phrase[0]))
# fe_phrase[0] is the e-side (English) sentence
BP, BP_pos = phrase_extraction(fe_phrase[0], ef_phrase[0], alignment)

f_sen = ' '.join(ef_phrase[0])
try:
    p_parse_trees = list(parser.parse(parser.tokenize(f_sen)))
except ValueError:
    print('parsing fail')
    exception_sen.append(senid)
    p_parse_trees = [Tree.fromstring('(S (NULL ERROR))')]  # dummy tree

# Keep every phrase, bucketed by category tag
p_phrase_dict = {}
for tag in phrase_tag:
    p_phrase_dict[tag] = []
for one_tree in p_parse_trees:
    traverse(one_tree, p_phrase_dict, phrase_tag)
if last_two in categories:
    next_category = last_two
elif last_three in categories:
    next_category = last_three

# Make sure every entry has each of the categories above as a key
for c in category_english:
    if c not in chengyu_data:
        chengyu_data[c] = ""

translation_help = dict()
# Fields that support translation
for field in to_translate:
    to_segment = chengyu_data[field]
    # Substitute out '·' to help with segmentation
    to_segment = re.sub(r'·', ' ', to_segment)
    if to_segment:
        new_words = list(chengyu_segmenter.tokenize(to_segment))
    else:
        new_words = []
    # Add the extra fields
    translation_help[field + '_Segmentation'] = new_words
    translations, sent_dict = lookup(new_words, zh_en_simp_dict)
    chengyu_data[field + '_Translations'] = translations
    translation_help[field + '_Sentence_Code'] = sent_dict

chengyu_english[chengyu_number] = translation_help
chengyu_index[chengyu_number] = chengyu_data
chengyu_number += 1

# Write the necessary JSON files (chengyu index, translations, simplified Chinese dictionary)
corpus_from_dict(chengyu_index, chengyu_json_file)
corpus_from_dict(chengyu_english, translation_json_file)
from nltk.parse import CoreNLPParser

# Chinese CoreNLP server (note the separate port, 9001)
parser = CoreNLPParser('http://localhost:9001')
ner_tagger = CoreNLPParser(url='http://localhost:9001', tagtype='ner')
# "As of 24:00 on January 20, a cumulative total of 291 confirmed cases of
# pneumonia caused by the novel coronavirus had been reported within China
# (270 in Hubei, 5 in Beijing, 14 in Guangdong, 2 in Shanghai)."
segs = list(parser.tokenize(
    u'截至1月20日24时,中国境内累计报告新型冠状病毒感染的肺炎确诊病例291例(湖北270例,北京5例,广东14例,上海2例)。'))
print(list(ner_tagger.tag(segs)))
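# The snippet above talks to a second CoreNLP instance on port 9001, which is
# presumably running the Chinese models; a typical launch command (the jar
# directory, memory size, and timeout are assumptions) would look like:
#
#   java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
#       -serverProperties StanfordCoreNLP-chinese.properties \
#       -port 9001 -timeout 15000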
import json
import os
import xml.etree.ElementTree as elementTree

import numpy as np
import requests

import nltk
from autocorrect import spell  # old-style autocorrect API
from nltk.corpus import sentiwordnet, wordnet
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser


class ExternalDataLoader:
    def __init__(self, config):
        self.ontology_tagging = OntologyTagging()  # project-local class
        self.config = config
        self.word_dictionary = self.compute_all_embeddings()
        self.server_url = 'http://localhost:9000'
        self.parser = CoreNLPParser(url=self.server_url)
        self.core_nlp_dependency_parser = CoreNLPDependencyParser(url=self.server_url)

    def load_external_data(self, load_external_file_name, write_internal_file_name):
        if not os.path.isfile(load_external_file_name):
            raise FileNotFoundError("[!] Data %s not found" % load_external_file_name)

        xml_tree = elementTree.parse(load_external_file_name)
        root = xml_tree.getroot()

        opinion_counter = 0
        total_counter = 0
        all_sentences = []

        for sentence in root.iter('sentence'):
            sentence_id = sentence.get('id')
            original_sentence = sentence.find('text').text
            tokenized_sentence = list(self.parser.tokenize(original_sentence))

            aspects = []
            aspect_indices = []
            polarities = []
            polarity_matrix = []
            categories = []
            category_matrix = []

            for opinions in sentence.iter('Opinions'):
                for opinion in opinions.findall('Opinion'):
                    total_counter += 1
                    aspect = opinion.get('target')
                    if aspect != "NULL":
                        opinion_counter += 1
                        aspects.append(aspect)
                        category = opinion.get('category')
                        polarity = opinion.get('polarity')
                        categories.append(category)
                        polarities.append(polarity)
                        tokenized_aspect = list(self.parser.tokenize(aspect))
                        aspect_indices.append(self.get_aspect_indices(
                            tokenized_aspect, tokenized_sentence))
                        polarity_matrix.append(self.get_polarity_number(polarity))
                        category_matrix.append(self.get_category_number(category))

            if len(aspects) != 0:
                print("opinion_counter ", opinion_counter)
                sentiment_distribution = self.annotate(
                    original_sentence,
                    properties={"annotators": "sentiment",
                                "outputFormat": "json"})
                processed_sentence = self.process_characters(tokenized_sentence)
                lemmatized_sentence, part_of_speech_sentence, aspect_dependencies, \
                    sentence_negation, sentiments = self.lemmatize_and_pos_tagging(
                        processed_sentence, aspect_indices)
                ontology_classes_sentence = self.ontology_tagging.ontology_classes_tagging(
                    lemmatized_sentence)
                mentions = self.ontology_tagging.mention_tagging(ontology_classes_sentence)
                ont_sentiments_sentence, aspect_sentiments_sentence, sentiments_sentence, \
                    relations_sentence = self.ontology_tagging.polarity_and_aspect_relation_tagging(
                        ontology_classes_sentence, aspect_indices, categories,
                        aspect_dependencies, sentiments)
                word_embedding_sentence = self.compute_word_embeddings(lemmatized_sentence)

                dict_sentence = {
                    'sentence_id': sentence_id,
                    'original_sentence': original_sentence,
                    'lemmatized_sentence': lemmatized_sentence,
                    'sentiment_distribution': sentiment_distribution,
                    'part_of_speech_tags': part_of_speech_sentence,
                    'negation_in_sentence': sentence_negation,
                    'word_polarities': ont_sentiments_sentence,
                    'aspect_sentiments': aspect_sentiments_sentence,
                    'word_sentiments': sentiments_sentence,
                    'word_mentions': mentions,
                    'aspect_relations': relations_sentence,
                    'aspects': aspects,
                    'aspect_indices': aspect_indices,
                    'polarities': polarities,
                    'polarity_matrix': polarity_matrix,
                    'categories': categories,
                    'category_matrix': category_matrix,
                    'word_embeddings': word_embedding_sentence
                }
                all_sentences.append(dict_sentence)

        with open(write_internal_file_name, 'w') as outfile:
            json.dump(all_sentences, outfile, ensure_ascii=False)

    def get_polarity_number(self, polarity):
        # One-hot encode the polarity label.
        if polarity == "positive":
            return [1, 0, 0]
        elif polarity == "neutral":
            return [0, 1, 0]
        elif polarity == "negative":
            return [0, 0, 1]
        else:
            raise Exception("Polarity ", polarity, " is not a valid label.")
    # The 13 aspect-category labels, in one-hot order.
    CATEGORIES = [
        "AMBIENCE#GENERAL", "DRINKS#PRICES", "DRINKS#QUALITY",
        "DRINKS#STYLE_OPTIONS", "FOOD#GENERAL", "FOOD#PRICES", "FOOD#QUALITY",
        "FOOD#STYLE_OPTIONS", "LOCATION#GENERAL", "RESTAURANT#GENERAL",
        "RESTAURANT#MISCELLANEOUS", "RESTAURANT#PRICES", "SERVICE#GENERAL"
    ]

    def get_category_number(self, category):
        # One-hot encode the category label over the 13 known categories.
        if category not in self.CATEGORIES:
            raise Exception("Category ", category, " is not a valid label.")
        one_hot = [0] * len(self.CATEGORIES)
        one_hot[self.CATEGORIES.index(category)] = 1
        return one_hot

    @staticmethod
    def get_aspect_indices(aspect, sentence):
        number_words_in_aspect = len(aspect)
        number_words_in_sentence = len(sentence)
        for i in range(number_words_in_sentence):
            if aspect[0] == sentence[i]:
                return list(range(i, i + number_words_in_aspect))
        raise Exception("Aspect ", aspect, " is not in the sentence ", sentence)

    def compute_all_embeddings(self):
        # Load the GloVe vectors into a word -> 300-d vector dictionary.
        word_dictionary = {}
        with open(self.config.glove_embeddings, 'r', encoding="utf8") as f:
            for line in f:
                word_embedding = line.strip().split()
                word_dictionary[word_embedding[0]] = list(map(float, word_embedding[1:]))
        return word_dictionary

    def compute_word_embeddings(self, sentence):
        # Unknown words keep a small random vector.
        number_words_in_sentence = len(sentence)
        word_embeddings = np.random.normal(0, 0.05, [number_words_in_sentence, 300])
        for word_index in range(number_words_in_sentence):
            if sentence[word_index] in self.word_dictionary:
                word_embeddings[word_index] = self.word_dictionary[sentence[word_index]]
        return word_embeddings.tolist()

    @staticmethod
    def process_characters(sentence):
        number_words_in_sentence = len(sentence)
        processed_sentence = []
        punctuation_and_numbers = [
            '(', ')', '?', ':', ';', ',', '.', '!', '/', '"', '*', '$', '&',
            '%', '@', '#', '^', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
        ]
        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ''
        ]
        punctuation_to_be_replaced = {'–': '-', '’': '\''}

        for word_index in range(number_words_in_sentence):
            list_of_word = list(sentence[word_index].lower())
            for char_index in range(len(list_of_word) - 1):
                if list_of_word[char_index] in punctuation_to_be_replaced:
                    list_of_word[char_index] = punctuation_to_be_replaced[
                        list_of_word[char_index]]
                # Drop punctuation or digits that directly touch a letter.
                if list_of_word[char_index] in alphabet and \
                        list_of_word[char_index + 1] in punctuation_and_numbers:
                    list_of_word[char_index + 1] = ''
                elif list_of_word[char_index] in punctuation_and_numbers and \
                        list_of_word[char_index + 1] in alphabet:
                    list_of_word[char_index] = ''
            word = "".join(list_of_word)
            if word == '.' and sentence[word_index - 1] == '.':
                pass
            else:
                # Normalize runs of dots to an ellipsis.
                if word in ('..', '....', '.....', '......', '.......'):
                    word = '...'
                processed_sentence.append(word)
        return processed_sentence

    def lemmatize_and_pos_tagging(self, sentence, aspect_indices):
        punctuations = [
            '–', '(', ')', '?', ':', ';', ',', '.', '!', '/', '"', '’', '*',
            '$', '&', '%', '@', '#', '^', '\'', '-'
        ]
        parses = self.core_nlp_dependency_parser.parse(sentence)
        dependencies = [[(governor, dep, dependent)
                         for governor, dep, dependent in parse.triples()]
                        for parse in parses][0]

        wordnet_lemmatizer = nltk.WordNetLemmatizer()
        part_of_speech_sentence = list(range(len(sentence)))
        lemmatized_sentence = list(range(len(sentence)))
        sentiments = list(range(len(sentence)))
        aspects_dependencies = [['no'] * len(sentence)
                                for i in range(len(aspect_indices))]

        backup_sentence = sentence.copy()
        interesting_translates = {
            '-LRB-': '(', '-RRB-': ')', '2\xa01/2': '2 1/2', "''": '"',
            ':-RRB-': ':)'
        }
        sentence_negations = []

        for dependency in dependencies:
            words = [dependency[0][0], dependency[2][0]]
            part_of_speech = [dependency[0][1], dependency[2][1]]

            if words[0] in interesting_translates:
                words[0] = interesting_translates[words[0]]
            if words[1] in interesting_translates:
                words[1] = interesting_translates[words[1]]

            range_list = [0, 1]
            if words[0] in sentence:
                index_of_word1 = sentence.index(words[0])
                sentence[index_of_word1] = ''
            else:
                index_of_word1 = backup_sentence.index(words[0])
                range_list = [1]
            if words[1] in sentence:
                index_of_word2 = sentence.index(words[1])
                sentence[index_of_word2] = ''
            else:
                index_of_word2 = backup_sentence.index(words[1])
                range_list = [0]

            word_indices = [index_of_word1, index_of_word2]
            if dependency[1] == 'neg':
                sentence_negations.append(word_indices)

            for aspect_index in range(len(aspect_indices)):
                if index_of_word1 in aspect_indices[aspect_index] and \
                        index_of_word2 not in aspect_indices[aspect_index]:
                    aspects_dependencies[aspect_index][index_of_word2] = dependency[1]
                elif index_of_word1 not in aspect_indices[aspect_index] and \
                        index_of_word2 in aspect_indices[aspect_index]:
                    aspects_dependencies[aspect_index][index_of_word1] = dependency[1]
                elif index_of_word1 in aspect_indices[aspect_index] and \
                        index_of_word2 in aspect_indices[aspect_index]:
                    if aspects_dependencies[aspect_index][index_of_word1] == 'no':
                        aspects_dependencies[aspect_index][index_of_word1] = dependency[1]
                    else:
                        aspects_dependencies[aspect_index][index_of_word2] = dependency[1]

            for i in range_list:
                if part_of_speech[i].startswith('V'):  # Verb
                    part_of_speech_sentence[word_indices[i]] = [1, 0, 0, 0, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.VERB)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.VERB)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                elif part_of_speech[i].startswith('J'):  # Adjective
                    part_of_speech_sentence[word_indices[i]] = [0, 1, 0, 0, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.ADJ)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.ADJ)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                elif part_of_speech[i].startswith('R'):  # Adverb
                    part_of_speech_sentence[word_indices[i]] = [0, 0, 1, 0, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.ADV)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.ADV)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                elif part_of_speech[i].startswith('N'):  # Noun
                    part_of_speech_sentence[word_indices[i]] = [0, 0, 0, 1, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.NOUN)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.NOUN)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                else:  # Everything else
                    part_of_speech_sentence[word_indices[i]] = [0, 0, 0, 0, 1]
                    if words[i] not in punctuations:
                        words[i] = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(words[i])
                    sentiments[word_indices[i]] = [0, 0, 1]
                    lemmatized_sentence[word_indices[i]] = lemma.lower()

        return lemmatized_sentence, part_of_speech_sentence, aspects_dependencies, \
            sentence_negations, sentiments

    @staticmethod
    def get_sentiment_of_word(word, lemma, pos):
        synsets = wordnet.synsets(word, pos=pos)
        if len(synsets) != 0:
            memorized_synset_01 = None
            check_boolean_01 = False
            memorized_synset_rest = None
            check_boolean_rest = False
            list_of_numbers = ['04', '02', '03', '05', '06', '07', '08', '09',
                               '10', '11', '12']
            for synset in synsets:
                synset_split = synset.name().split(".")
                if synset_split[0] == lemma:
                    swn_synset = sentiwordnet.senti_synset(synset.name())
                    pos_score = swn_synset.pos_score()
                    neg_score = swn_synset.neg_score()
                    if pos_score > neg_score:
                        return [1, 0, 0]
                    elif neg_score > pos_score:
                        return [0, 1, 0]
                    else:
                        return [0, 0, 1]
                if synset_split[2] == '01' and not check_boolean_01:
                    memorized_synset_01 = synset
                    check_boolean_01 = True
                elif synset_split[2] in list_of_numbers and not check_boolean_rest:
                    memorized_synset_rest = synset
                    check_boolean_rest = True
            if check_boolean_01:
                synset = memorized_synset_01
            else:
                synset = memorized_synset_rest
            swn_synset = sentiwordnet.senti_synset(synset.name())
            pos_score = swn_synset.pos_score()
            neg_score = swn_synset.neg_score()
            if pos_score > neg_score:
                return [1, 0, 0]
            elif neg_score > pos_score:
                return [0, 1, 0]
            else:
                return [0, 0, 1]
        return [0, 0, 1]

    def annotate(self, text, properties=None):
        assert isinstance(text, str)
        if properties is None:
            properties = {}
        else:
            assert isinstance(properties, dict)

        # Check that the Stanford CoreNLP server is started.
        try:
            requests.get(self.server_url)
        except requests.exceptions.ConnectionError:
            raise Exception(
                'Check whether you have started the CoreNLP server e.g.\n'
                '$ cd stanford-corenlp-full-2018-02-27/ \n'
                '$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer')

        data = text.encode()
        r = requests.post(self.server_url,
                          params={'properties': str(properties)},
                          data=data,
                          headers={'Connection': 'close'})
        output = r.text

        # Cut the sentimentDistribution values out of the raw response text.
        char_index1 = output.index("sentimentDistribution")
        char_index2 = output.index("sentimentTree")
        distribution = output[(char_index1 - 1):(char_index2 - 2)]
        new_distribution = []
        word = []
        for char_index in range(len(distribution)):
            if distribution[char_index].isnumeric():
                word.append(distribution[char_index])
            elif distribution[char_index] == ',' and len(word) == 1:
                word.append('.')
            elif (distribution[char_index] == ',' or
                  distribution[char_index] == ']') and len(word) != 1:
                number = float("".join(word))
                new_distribution.append(number)
                word = []
        return new_distribution
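# Hypothetical driver for ExternalDataLoader; the config object and the file
# names are assumptions, not from the original project.
#
#   loader = ExternalDataLoader(config)
#   loader.load_external_data(load_external_file_name='restaurants_train.xml',
#                             write_internal_file_name='internal_train.json')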
def formatted_entities(classified_paragraphs_list):
    # Collect the values tagged LOCATION from the NER output.
    # NB: this function head is assumed; the original fragment starts inside
    # the loop body.
    entities = []
    for paragraph in classified_paragraphs_list:
        for entry in paragraph:
            entry_value = entry[0]
            entry_type = entry[1]
            if entry_type == 'LOCATION':
                entities.append(entry_value)
    return entities


currentDT = datetime.datetime.now()
print(str(currentDT))

count = 0
passed = 0
for i, city in enumerate(cities['City'].unique()):
    try:
        city_ = list(parser.tokenize(city))
        classified_paragraphs_list = ner_tagger.tag_sents([city_])
        formatted_result = formatted_entities(classified_paragraphs_list)
        if len(formatted_result) > 0:
            count += 1
    except Exception as e:
        passed += 1
        print(i, city, 'error:', e)
    if i % 100 == 0:
        print(i, count, passed, city, city_, 'result:', ' '.join(formatted_result))

print(f'Stanford knows {count} out of {cities.City.unique().shape[0]}')
print("couldn't process:", passed)
currentDT = datetime.datetime.now()
print("\nRaw string") # Parse raw string. print(list(parser.raw_parse(sentence))) # [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])] # Neural Dependency Parser print("\nNeural Dependency Parser") from nltk.parse.corenlp import CoreNLPDependencyParser dep_parser = CoreNLPDependencyParser(url='http://localhost:9000') parses = dep_parser.parse(sentence.split()) # [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses] # [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]] # Tokenizer parser = CoreNLPParser(url='http://localhost:9000') print("\nTokenizer") print(list(parser.tokenize(sentence))) # ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?'] # POS Tagger print("\nPOS Tagger") pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos') print(list(pos_tagger.tag(sentence.split()))) # [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] # NER Tagger print("\nNER Tagger") ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner') print(list(ner_tagger.tag((sentence.split())))) # [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]