def _calc_a_file(self, files_to_compare, debug):
    """
    Compare the test answers of a single document with its etalon answers.

    files_to_compare: FilesToCompare
        supplies test_filepath, etalon_filepath and index
    debug: flag
        when truthy, progress messages are printed
    returns: value of self._calcPrecisionAndRecall for the comparison

    Side effect: the comparison is written with results.to_csv into
    "<self.user_answers>/art<index>.comp.txt".
    """
    assert isinstance(files_to_compare, FilesToCompare)

    def load(filepath):
        # Test and etalon answers are read with the same synonyms and stemmer.
        return OpinionCollection.from_file(
            filepath, self.synonyms_filepath, stemmer=self.stemmer)

    # Reading test answers, then etalon answers.
    test_opins = load(files_to_compare.test_filepath)
    etalon_opins = load(files_to_compare.etalon_filepath)

    if debug:
        print("{} <-> {}, {}".format(
            files_to_compare.test_filepath,
            files_to_compare.etalon_filepath,
            files_to_compare.index))

    # Comparing test and etalon results.
    results = self._check(etalon_opins, test_opins)

    # Save result comparison into file.
    # TODO. remove path declaration from here.
    comparison_file = "{}/art{}.comp.txt".format(
        self.user_answers, str(files_to_compare.index))

    if debug:
        print("Save comparison file: {}".format(comparison_file))

    results.to_csv(comparison_file)

    return self._calcPrecisionAndRecall(results)
def to_opinion_collections(self, news_indices, synonyms):
    """
    Build one OpinionCollection per requested news id from self.relations.

    news_indices: list
        news ids to build collections for (order is kept in the result)
    synonyms: SynonymsCollection
        passed to every created OpinionCollection
    returns: list of (news_ID, OpinionCollection) pairs

    Note: every news id triggers a full pass over self.relations, O(N^2).
    """
    assert isinstance(news_indices, list)
    assert isinstance(synonyms, SynonymsCollection)

    def collect(doc_id):
        collected = OpinionCollection(None, synonyms)
        for relation in self.relations:
            assert isinstance(relation, ExtractedRelation)

            if relation.text_position.news_ID != doc_id:
                continue

            if relation.label == NeutralLabel():
                # ignore neutral labels
                continue

            opinion = relation.create_opinion()
            if collected.has_opinion_by_synonyms(opinion):
                # an equivalent (by synonyms) opinion is already present
                continue
            collected.add_opinion(opinion)
        return collected

    return [(doc_id, collect(doc_id)) for doc_id in news_indices]
def predict(self, dest_data_type=DataType.Test):
    """
    Label the relations of `dest_data_type` with the network output, save
    per-document opinion collections and evaluate them.

    dest_data_type: key of self.relation_collections to run prediction for
    returns: value of self._evaluate(dest_data_type, self.Settings.Stemmer)
    """
    relations = self.relation_collections[dest_data_type]
    relations.reset_labels()

    groups_iter = relations.iter_by_linked_relations_groups(
        self.Settings.BatchSize)

    for relation_groups in groups_iter:
        batch = Batch(relation_groups, self.Settings.GroupSize)
        feed_dict = self.create_feed_dict(batch, dest_data_type)

        # Only the labels tensor is requested, hence the single-item result.
        uint_labels = self.sess.run([self.network.Labels],
                                    feed_dict=feed_dict)[0]

        # One predicted label per group; applied to each relation of it.
        for group_index, group in enumerate(batch.iter_groups):
            for relation in group:
                assert isinstance(relation, ExtractedRelation)
                relations.apply_label(
                    label=Label.from_uint(int(uint_labels[group_index])),
                    relation_id=relation.relation_id)

    model_root_of = self.io.get_model_root
    for news_ID in self.io.get_data_indices(dest_data_type):
        collection = OpinionCollection(None,
                                       self.synonyms,
                                       self.Settings.Stemmer)

        # The first label of a relation's label list is taken as its label.
        relations.fill_opinion_collection(
            collection,
            news_ID,
            lambda labels: labels[0],
            debug_check_collection=False)

        collection.save(
            self.io.get_opinion_output_filepath(
                news_ID, model_root_of(dest_data_type)))

    return self._evaluate(dest_data_type, self.Settings.Stemmer)
def read_opinions(filepath, synonyms, custom_opin_ends_iter=None,
                  read_sentiment=True, skip_non_added=True):
    """
    Read opinions into a new OpinionCollection.

    filepath: source passed to the default opinion iterator
    synonyms: SynonymsCollection
    custom_opin_ends_iter: callable or None
        when given, called as custom_opin_ends_iter(read_sentiment) and used
        instead of the default file iterator
    read_sentiment: bool
    skip_non_added: bool
        True  -- a rejected opinion is reported with a printed warning;
        False -- a rejected opinion raises Exception with the same text.
    returns: OpinionCollection
    raises: Exception when an opinion is rejected and skip_non_added is False
    """
    assert isinstance(synonyms, SynonymsCollection)
    assert callable(custom_opin_ends_iter) or custom_opin_ends_iter is None
    assert isinstance(read_sentiment, bool)
    assert isinstance(skip_non_added, bool)

    opinions = OpinionCollection(opinions=[], synonyms=synonyms)

    if custom_opin_ends_iter is None:
        it = __iter_opinion_end_values(filepath, read_sentiment)
    else:
        it = custom_opin_ends_iter(read_sentiment)

    for left_value, right_value, sentiment in tqdm(it, "Reading opinions:"):

        o = Opinion(value_left=left_value,
                    value_right=right_value,
                    sentiment=Label.from_int(sentiment))

        add_result = opinions.try_add_opinion(o)
        if add_result is not False:
            continue

        # The message is composed only for rejected opinions: formatting it
        # for every opinion is wasted work, and a formatting failure would
        # otherwise interrupt reading of opinions that were added fine.
        msg = "Warning: opinion '{}->{}' was skipped!".format(
            o.value_left, o.value_right)

        if not skip_non_added:
            raise Exception(msg)

        print(msg)

    return opinions
def _process_into_collections(self, indices, data_type):
    """
    Read the documents listed in `indices` and gather their terms and
    extracted relations.

    indices: list of int
        news indices to process
    data_type: compared with DataType.Train; sentiment opinions are read
        in addition to the neutral ones only for the train type
    returns: (NewsTermsCollection, ExtractedRelationsCollection)
    """
    def find_feature_vector_for_opinion(opinion_vector_collections, opinion):
        # First collection containing the opinion wins; None if none has it.
        assert isinstance(opinion_vector_collections, list)
        for vectors in opinion_vector_collections:
            assert isinstance(vectors, OpinionVectorCollection)
            if vectors.has_opinion(opinion):
                return vectors.find_by_opinion(opinion)
        return None

    assert isinstance(indices, list)

    relations_collection = ExtractedRelationsCollection()
    terms_collection = NewsTermsCollection()

    for news_index in indices:
        assert isinstance(news_index, int)

        entity_filepath = self.io.get_entity_filepath(news_index)
        news_filepath = self.io.get_news_filepath(news_index)
        opin_filepath = self.io.get_opinion_input_filepath(news_index)
        neutral_filepath = self.io.get_neutral_filepath(news_index, data_type)

        entities = EntityCollection.from_file(entity_filepath,
                                              self.Settings.Stemmer)
        news = News.from_file(news_filepath,
                              entities,
                              stemmer=self.Settings.Stemmer)

        # Neutral opinions are always used.
        opinions_collections = [
            OpinionCollection.from_file(
                neutral_filepath,
                self.io.get_synonyms_collection_filepath(),
                self.Settings.Stemmer)
        ]

        if data_type == DataType.Train:
            opinions_collections.append(
                OpinionCollection.from_file(
                    opin_filepath,
                    self.io.get_synonyms_collection_filepath(),
                    self.Settings.Stemmer))

        news_terms = NewsTerms.create_from_news(
            news_index, news, keep_tokens=self.Settings.KeepTokens)

        extracted = self._extract_relations(opinions_collections,
                                            news,
                                            news_terms)
        for relations, opinion in extracted:
            feature_vector = find_feature_vector_for_opinion(
                self.get_opinion_vector_collection(news_index, data_type),
                opinion)

            relations_collection.add_news_relations(relations,
                                                    opinion,
                                                    news_terms,
                                                    news_index,
                                                    feature_vector)

        terms_collection.add_news_terms(news_terms)

    return terms_collection, relations_collection
def __read_collection(self, io, data_type, settings):
    """
    Read all documents of `data_type` and gather terms, relations and
    entities.

    io: RuSentRelNetworkIO
    data_type: unicode
    settings: CommonModelSettings
    returns: tuple of
        NewsTermsCollection,
        ExtractedRelationsCollection,
        list of EntityCollection (one per processed document),
        sum of the values returned by add_news_relations (missed relations)
    """
    assert isinstance(io, RuSentRelNetworkIO)
    assert isinstance(data_type, unicode)
    assert isinstance(settings, CommonModelSettings)

    def can_create_sample(r):
        # Relation filter handed over to add_news_relations.
        return Sample.check_ability_to_create_sample(
            window_size=settings.TermsPerContext,
            relation=r)

    relations_collection = ExtractedRelationsCollection()
    terms_collection = NewsTermsCollection()
    entities_list = []
    missed_relations_total = 0

    for news_index in io.get_data_indices(data_type):
        assert isinstance(news_index, int)

        entity_filepath = io.get_entity_filepath(news_index)
        news_filepath = io.get_news_filepath(news_index)
        opin_filepath = io.get_etalon_doc_opins_filepath(news_index)
        neutral_filepath = io.get_neutral_filepath(news_index, data_type)

        entities = EntityCollection.from_file(entity_filepath,
                                              settings.Stemmer,
                                              self.__synonyms)

        news = News.from_file(news_filepath, entities)

        # Neutral opinions are always read; etalon ones only for training.
        opinions_collections = [
            OpinionCollection.from_file(neutral_filepath, self.__synonyms)
        ]
        if data_type == DataType.Train:
            opinions_collections.append(
                OpinionCollection.from_file(opin_filepath, self.__synonyms))

        news_terms = NewsTerms.create_from_news(
            news_index, news, keep_tokens=settings.KeepTokens)
        news_terms_helper = NewsTermsHelper(news_terms)

        if DebugKeys.NewsTermsStatisticShow:
            news_terms_helper.debug_statistics()
        if DebugKeys.NewsTermsShow:
            news_terms_helper.debug_show_terms()

        extracted = self.__extract_relations(opinions_collections,
                                             news,
                                             news_terms)
        for relations, opinion, opinions in extracted:

            backward_opinion = \
                ContextModelInitHelper.__find_or_create_reversed_opinion(
                    opinion, opinions_collections)

            label = self.__labels_helper.create_label_from_opinions(
                forward=opinion,
                backward=backward_opinion)

            missed_relations_total += relations_collection.add_news_relations(
                relations=relations,
                label=label,
                news_terms=news_terms,
                news_index=news_index,
                check_relation_is_correct=can_create_sample)

        terms_collection.add_news_terms(news_terms)
        entities_list.append(entities)

    return (terms_collection,
            relations_collection,
            entities_list,
            missed_relations_total)
def __clone_with_different_label(self, opinions, label):
    """
    Copy `opinions` into a new collection, replacing every sentiment.

    opinions: OpinionCollection
        source opinions; only value_left / value_right are taken from them
    label: Label
        sentiment assigned to every copied opinion
    returns: OpinionCollection built over self.Synonyms
    """
    assert isinstance(opinions, OpinionCollection)
    assert isinstance(label, Label)

    relabeled = OpinionCollection(opinions=[], synonyms=self.Synonyms)

    for source in opinions:
        assert isinstance(source, Opinion)
        relabeled.add_opinion(Opinion(value_left=source.value_left,
                                      value_right=source.value_right,
                                      sentiment=label))

    return relabeled
def calc_a_file(self, files_to_compare, debug):
    """
    Read the test and etalon opinion collections of a single document.

    files_to_compare: FilesToCompare
        supplies TestFilepath, EtalonFilepath and index
    debug: flag
        when truthy, the compared file paths and index are printed
    returns: (test OpinionCollection, etalon OpinionCollection)
    """
    assert isinstance(files_to_compare, FilesToCompare)

    test_path = files_to_compare.TestFilepath
    etalon_path = files_to_compare.EtalonFilepath

    # Reading test answers.
    test_opins = OpinionCollection.from_file(filepath=test_path,
                                             synonyms=self.__synonyms)

    # Reading etalon answers.
    etalon_opins = OpinionCollection.from_file(filepath=etalon_path,
                                               synonyms=self.__synonyms)

    if debug:
        print("{} <-> {}, {}".format(files_to_compare.TestFilepath,
                                     files_to_compare.EtalonFilepath,
                                     files_to_compare.index))

    return test_opins, etalon_opins
def __save_etalon(self, relation_collection_helper):
    """
    Save the relations of the helper as etalon opinion collections.

    relation_collection_helper: ExtractedRelationsCollectionHelper
        performs the saving; it receives a factory of empty collections,
        a news_id -> filepath resolver and the FIRST_APPEARED label mode.
    """
    assert isinstance(relation_collection_helper,
                      ExtractedRelationsCollectionHelper)

    def new_collection():
        return OpinionCollection(opinions=None,
                                 synonyms=self.ReadOnlySynonymsCollection)

    def etalon_filepath(news_id):
        return self.IO.get_etalon_doc_opins_filepath(news_id)

    relation_collection_helper.save_into_opinion_collections(
        create_opinion_collection=new_collection,
        create_filepath_by_news_id=etalon_filepath,
        label_calculation_mode=LabelCalculationMode.FIRST_APPEARED)
def __extract_sentence_opinion_refs(text_objects_collection, title_opinions, synonyms):
    """
    Find the object pairs of a sentence whose opinion is present in the title.

    text_objects_collection: TextObjectsCollection
        objects of the sentence; every ordered pair of distinct objects
        is considered
    title_opinions: collection of opinions extracted from the title
    synonyms: synonyms used for tagging and for the local duplicate check
    returns: (list of RefOpinion, list of matched title opinions);
        both lists are aligned, one item per accepted pair
    """
    assert isinstance(text_objects_collection, TextObjectsCollection)

    refs = []
    matched = []
    seen = OpinionCollection(opinions=None, synonyms=synonyms)

    TextProcessor.__setup_tags(text_objects_collection=text_objects_collection,
                               synonyms=synonyms)

    for l_obj in text_objects_collection:
        for r_obj in text_objects_collection:

            if l_obj.CollectionInd == r_obj.CollectionInd:
                continue

            # Probe opinion: only its ends matter for the synonymous lookup.
            candidate = Opinion(value_left=l_obj.get_value(),
                                value_right=r_obj.get_value(),
                                sentiment=NeutralLabel())

            in_title = title_opinions.has_synonymous_opinion(candidate)
            already_seen = seen.has_synonymous_opinion(candidate)

            if already_seen or not in_title:
                continue

            # The title opinion carries the sentiment to be referenced.
            title_opinion = title_opinions.get_synonymous_opinion(candidate)

            refs.append(RefOpinion(left_index=l_obj.CollectionInd,
                                   right_index=r_obj.CollectionInd,
                                   sentiment=title_opinion.sentiment))
            matched.append(title_opinion)

            was_added = seen.try_add_opinion(title_opinion)
            assert was_added

    return refs, matched
def create_test_opinions(test_collections, labels, synonyms_filepath, stemmer):
    """
    Assign predicted labels to opinion vectors and gather non-neutral
    opinions into one OpinionCollection per input collection.

    test_collections: list
        iterables of opinion vectors; vectors are labeled in place
    labels: np.ndarray
        consumed sequentially, one item per opinion vector
    synonyms_filepath: source for SynonymsCollection.from_file
    stemmer: Stemmer
    returns: list of OpinionCollection, aligned with test_collections
    """
    assert isinstance(test_collections, list)
    assert isinstance(labels, np.ndarray)
    assert isinstance(stemmer, Stemmer)

    synonyms = SynonymsCollection.from_file(synonyms_filepath, stemmer=stemmer)

    collected = []
    cursor = 0  # position in `labels`, shared across all collections

    for vectors in test_collections:
        opinions = OpinionCollection(None, synonyms, stemmer)

        for vector in vectors:
            label = Label.from_int(int(labels[cursor]))
            cursor += 1

            vector.set_label(label)
            opinion = opinions.create_opinion(vector.value_left,
                                              vector.value_right,
                                              vector.label)

            already_present = opinions.has_opinion_by_synonyms(opinion)

            if isinstance(label, NeutralLabel):
                # neutral opinions are neither stored nor reported
                continue

            if already_present:
                print("Failed for o={}".format(
                    opinion.to_unicode().encode('utf-8')))
            else:
                opinions.add_opinion(opinion)

        collected.append(opinions)

    return collected
def _process_into_collections(self, indices, entity_indices, word_embedding,
                              window_size_in_words, is_train_collection):
    """
    Read the documents listed in `indices` and gather their words and
    extracted relations.

    indices: list of int
    entity_indices: passed to NewsWordsCollection
    word_embedding: Embedding
    window_size_in_words: passed to add_news_relations
    is_train_collection: bool
        when True, sentiment opinions are read in addition to neutral ones
    returns: (NewsWordsCollection, ExtractedRelationsCollection)
    """
    assert isinstance(indices, list)
    assert isinstance(word_embedding, Embedding)
    assert isinstance(is_train_collection, bool)

    relations_collection = ExtractedRelationsCollection()
    words_collection = NewsWordsCollection(entity_indices, word_embedding)

    for n in indices:
        assert type(n) == int

        entity_filepath = self.io.get_entity_filepath(n)
        news_filepath = self.io.get_news_filepath(n)
        opin_filepath = self.io.get_opinion_input_filepath(n)
        neutral_filepath = self.io.get_neutral_filepath(
            n, is_train_collection)

        news = News.from_file(news_filepath,
                              EntityCollection.from_file(entity_filepath))

        # Neutral opinions first; sentiment ones follow for train data only.
        opinion_sources = [neutral_filepath]
        if is_train_collection:
            opinion_sources.append(opin_filepath)

        opinions_collections = [
            OpinionCollection.from_file(source, self.synonyms_filepath)
            for source in opinion_sources
        ]

        news_words = NewsWords(n, news)
        news_descriptor = self.create_news_descriptor(
            n, news, news_words, opinions_collections, is_train_collection)

        relations_collection.add_news_relations(news_descriptor,
                                                self.synonyms,
                                                window_size_in_words,
                                                is_train_collection)
        words_collection.add_news(news_words)

    return words_collection, relations_collection
def get_method_statistic(files_to_compare_list, synonyms_filepath, stemmer):
    """
    Calculate statistic based on result files.

    files_to_compare_list: list
        list of FilesToCompare objects
    synonyms_filepath: str
    stemmer: Stemmer
    returns: pd.DataFrame
        one row per compared document (indexed by files_to_compare.index)
        with columns t_all/t_pos/t_neg (opinions found in test answers) and
        e_all/e_pos/e_neg (opinions in etalon answers), followed by:
          'sum'   -- column totals;
          'found' -- t_* totals divided by the matching e_* totals
                     (e_* cells of this row stay empty).
    """
    assert isinstance(stemmer, Stemmer)

    columns = ["t_all", "t_pos", "t_neg", "e_all", "e_pos", "e_neg"]

    df = pd.DataFrame(columns=columns)
    for files_to_compare in files_to_compare_list:

        assert isinstance(files_to_compare, FilesToCompare)
        test_opins = OpinionCollection.from_file(
            files_to_compare.test_filepath, synonyms_filepath, stemmer=stemmer)
        etalon_opins = OpinionCollection.from_file(
            files_to_compare.etalon_filepath, synonyms_filepath, stemmer=stemmer)

        df.loc[files_to_compare.index] = [
            MethodStatistic.founded_opins(test_opins, etalon_opins),
            MethodStatistic.founded_opins(test_opins, etalon_opins, PositiveLabel()),
            MethodStatistic.founded_opins(test_opins, etalon_opins, NegativeLabel()),
            len(etalon_opins),
            len(list(etalon_opins.iter_sentiment(PositiveLabel()))),
            len(list(etalon_opins.iter_sentiment(NegativeLabel())))]

    df.loc['sum'] = [float(df[c].sum()) for c in columns]

    df.loc['found'] = None
    # Cells are written through a single .loc[row, column] access.
    # The chained form df.loc[row][column] = value may assign into a
    # temporary copy of the row and leave the frame itself untouched.
    # NOTE(review): a zero etalon total is not handled here (as before).
    for found_column, etalon_column in (("t_all", "e_all"),
                                        ("t_pos", "e_pos"),
                                        ("t_neg", "e_neg")):
        df.loc['found', found_column] = \
            float(df.loc['sum', found_column]) / df.loc['sum', etalon_column]

    return df
def process_news_content(self, news_info, title_opinions, synonyms):
    """
    Perform sentences parsing, excluding news title.

    news_info: NewsInfo
        source of sentences (news_info.sentences_count() of them)
    title_opinions: opinions of the title; sentence pairs are matched
        against them
    synonyms: synonyms passed to the created collection and to the
        sentence-level extraction
    returns: (list of ContextDescriptor, OpinionCollection)
        descriptors exist only for sentences with at least one opinion
        reference; the collection holds the distinct opinions met in them.
    """
    assert isinstance(news_info, NewsInfo)

    text_opinions = OpinionCollection(opinions=None, synonyms=synonyms)
    descriptors = []

    for sentence_index in range(news_info.sentences_count()):

        _, parsed_sentence, s_objects, s_frames = \
            self._process_sentence_core(news_info, s_ind=sentence_index)

        s_opinion_refs, s_opinions_list = \
            self.__extract_sentence_opinion_refs(
                text_objects_collection=s_objects,
                title_opinions=title_opinions,
                synonyms=synonyms)

        if len(s_opinion_refs) == 0:
            # nothing to describe for this sentence
            continue

        for opinion in s_opinions_list:
            if text_opinions.has_synonymous_opinion(opinion):
                continue
            was_added = text_opinions.try_add_opinion(opinion)
            assert was_added

        descriptors.append(ContextDescriptor(
            sentence_index=sentence_index,
            parsed_text=parsed_sentence,
            opinion_refs=s_opinion_refs,
            objects_collection=s_objects,
            text_frames=s_frames,
            frames=self.Settings.Frames))

    return descriptors, text_opinions
def predict_core(self, dest_data_type, rc_labeling_callback):
    """
    Label relations with the callback, save opinion collections built
    from them and evaluate the result.

    dest_data_type: unicode
    rc_labeling_callback: callable
        called as rc_labeling_callback(relations_collection, dest_data_type);
        expected to label every relation, its return value is passed back
    returns: (evaluation result, value returned by the callback)

    Labels are reset both before labeling and after evaluation.
    """
    assert isinstance(dest_data_type, unicode)
    assert callable(rc_labeling_callback)

    relations = self.get_relations_collection(dest_data_type)
    helper = self.get_relations_collection_helper(dest_data_type)

    assert isinstance(relations, ExtractedRelationsCollection)
    assert isinstance(helper, ExtractedRelationsCollectionHelper)

    relations.reset_labels()
    assert relations.check_all_relations_without_labels()

    predict_log = rc_labeling_callback(relations, dest_data_type)

    assert relations.check_all_relations_has_labels()

    helper.debug_labels_statistic()

    def new_collection():
        return OpinionCollection(opinions=None,
                                 synonyms=self.ReadOnlySynonymsCollection)

    def model_filepath(news_id):
        return self.IO.get_model_doc_opins_filepath(doc_id=news_id,
                                                    data_type=dest_data_type)

    helper.save_into_opinion_collections(
        create_opinion_collection=new_collection,
        create_filepath_by_news_id=model_filepath,
        label_calculation_mode=self.Settings.RelationLabelCalculationMode)

    eval_result = self.get_eval_helper().evaluate_model(
        data_type=dest_data_type,
        io=self.IO,
        indices=helper.iter_unique_news_ids(),
        synonyms=self.ReadOnlySynonymsCollection)

    relations.reset_labels()

    return eval_result, predict_log
def opinions_between_entities(E, diff, news, synonyms, sentiment_opins=None):
    """
    Relations that had the same difference.

    E: two-dimensional table (E.shape, E[i][j]) indexed by entity indices
    diff: value; pairs (i, j) with E[i][j] == diff are considered
    news: provides news.entities.get_entity_by_index
    synonyms: synonyms collection; values of considered entities that are
        missing in it are ADDED to it (the argument is modified)
    sentiment_opins: optional opinion collection; pairs already present in
        it (by synonyms) are not added to the result
    returns: OpinionCollection of neutral opinions, in both directions,
        between the selected entity pairs
    """

    def try_add_opinion(o, added, neutral_opins):
        assert isinstance(o, Opinion)
        assert isinstance(neutral_opins, OpinionCollection)

        # Filter if there is a sentiment relation
        if sentiment_opins is not None:
            if sentiment_opins.has_opinion_by_synonyms(o):
                return

        if neutral_opins.has_opinion_by_synonyms(o):
            return

        # NOTE(review): `added` is filled but not read in this function.
        added.add(o.create_value_id())
        neutral_opins.add_opinion(o)

    def is_ignored(entity):
        # TODO. Move ignored entities into core.
        return env.stemmer.lemmatize_to_str(entity.value) in IGNORED_ENTITIES

    def get_entity_synonyms(entity):
        return synonyms.get_synonyms_list(entity.value), \
               synonyms.get_synonym_group_index(entity.value)

    added = set()
    c = OpinionCollection(opinions=None, synonyms=synonyms)

    for i in range(E.shape[0]):
        for j in range(E.shape[1]):

            if E[i][j] != diff:
                continue

            e1 = news.entities.get_entity_by_index(i)
            e2 = news.entities.get_entity_by_index(j)

            if is_ignored(e1) or is_ignored(e2):
                continue

            # Make sure both values can be resolved to a synonym group.
            if not synonyms.has_synonym(e1.value):
                synonyms.add_synonym(e1.value)

            if not synonyms.has_synonym(e2.value):
                synonyms.add_synonym(e2.value)

            sl1, g1 = get_entity_synonyms(e1)
            sl2, g2 = get_entity_synonyms(e2)

            # The first synonym of a group represents the entity.
            r_left = sl1[0]
            r_right = sl2[0]

            # Filter the same groups: both entities are a part of the same
            # synonym group. (A message string used to be formatted here and
            # immediately discarded; that dead expression is removed.)
            if g1 == g2:
                continue

            try_add_opinion(Opinion(r_left, r_right, NeutralLabel()), added, c)
            try_add_opinion(Opinion(r_right, r_left, NeutralLabel()), added, c)

    return c
#
# Train
#
# For every train document: read entities, news and sentiment opinions,
# build neutral opinions with make_neutrals and save them next to the
# document as "art<n>.neut.txt".
root = io_utils.train_root()
for n in io_utils.train_indices():
    # Paths are composed with path.join, the same way as in the Test
    # section below, so the result does not depend on `root` ending
    # with a path separator.
    entity_filepath = path.join(root, "art{}.ann".format(n))
    news_filepath = path.join(root, "art{}.txt".format(n))
    opin_filepath = path.join(root, "art{}.opin.txt".format(n))
    neutral_filepath = path.join(root, "art{}.neut.txt".format(n))

    print(neutral_filepath)

    entities = EntityCollection.from_file(entity_filepath)
    news = News.from_file(news_filepath, entities)
    opinions = OpinionCollection.from_file(opin_filepath,
                                           io_utils.get_synonyms_filepath())

    neutral_opins = make_neutrals(news, synonyms, opinions)
    neutral_opins.save(neutral_filepath)

#
# Test
#
root = io_utils.test_root()
for n in io_utils.test_indices():
    entity_filepath = path.join(root, "art{}.ann".format(n))
    news_filepath = path.join(root, "art{}.txt".format(n))
    neutral_filepath = path.join(root, "art{}.neut.txt".format(n))

    print(neutral_filepath)
def predict(self, dest_data_type=DataType.Test):
    """
    Label the relations of `dest_data_type` with the network output, save
    relations, raw predictions and per-document opinion collections, then
    evaluate the result.

    dest_data_type: unicode
        key of self._relations_collections / self.bags_collection
    returns: value of self._evaluate(dest_data_type, self.Settings.Stemmer)
    """

    def calculate_label(relation_labels):
        # Reduce all labels of one relation to a single label, according
        # to self.Settings.RelationLabelCalculationMode:
        #   FIRST_APPEARED -- the first label of the list;
        #   AVERAGE        -- the sign of the sum of integer labels.
        # NOTE(review): any other mode leaves the label as None.
        assert isinstance(relation_labels, list)

        label = None
        if self.Settings.RelationLabelCalculationMode == LabelCalculationMode.FIRST_APPEARED:
            label = relation_labels[0]
        if self.Settings.RelationLabelCalculationMode == LabelCalculationMode.AVERAGE:
            label = Label.from_int(np.sign(sum([l.to_int() for l in relation_labels])))

        if DebugKeys.PredictLabel:
            print([l.to_int() for l in relation_labels])
            print("Result: {}".format(label.to_int()))

        return label

    assert isinstance(dest_data_type, unicode)

    relations = self._relations_collections[dest_data_type]
    relations.reset_labels()
    prediction_collection = RelationPredictionResultCollection(len(relations))

    for bags_group in self.bags_collection[dest_data_type].iter_by_groups(self.Settings.BagsPerMinibatch):

        minibatch = MiniBatch(bags_group)
        feed_dict = self.create_feed_dict(minibatch, data_type=dest_data_type)

        log_names, log_params = self.network.Log
        # Result layout: [labels, output, <log params...>]
        result = self.sess.run([self.network.Labels, self.network.Output] + log_params,
                               feed_dict=feed_dict)
        uint_labels = result[0]
        output = result[1]

        if DebugKeys.PredictBatchDisplayLog:
            self._display_log(log_names, result[2:])

        # apply labels
        sample_indices_count = 0
        for sample_index, sample in enumerate(minibatch.iter_by_samples()):
            label = Label.from_uint(int(uint_labels[sample_index]))
            relations.apply_label(label, sample.RelationID)
            prediction_collection.add(sample.RelationID,
                                      RelationPredictionResult(output[sample_index]))
            sample_indices_count += 1

        # Every predicted label must correspond to exactly one sample.
        assert sample_indices_count == len(uint_labels)

    assert relations.debug_check_all_relations_has_labels()

    relations.debug_labels_statistic(dest_data_type)

    # Compose Result
    relations.save(
        self.io.get_relations_filepath(data_type=dest_data_type,
                                       epoch=self._last_fit_epoch_index))

    prediction_collection.save(
        self.io.get_relations_prediction_filepath(data_type=dest_data_type,
                                                  epoch=self._last_fit_epoch_index))

    for news_ID in self.io.get_data_indices(dest_data_type):
        # The stemmer is taken from self.Settings, the same accessor that
        # the rest of this method (including the final evaluation) uses.
        collection = OpinionCollection(None, self.synonyms, self.Settings.Stemmer)
        relations.fill_opinion_collection(collection, news_ID, calculate_label)

        collection.save(self.io.get_opinion_output_filepath(news_ID, self.io.get_model_root(dest_data_type)))

    return self._evaluate(dest_data_type, self.Settings.Stemmer)
def _extract_opinions_from_title(self, title_terms, title_objects, title_frames, synonyms):
    """
    Extract opinions between the ordered pairs of title objects.

    title_terms: list
    title_objects: TextObjectsCollection
    title_frames: TextFrameVariantsCollection
    synonyms: SynonymsCollection
    returns: (list of RefOpinion, OpinionCollection)
        a RefOpinion is produced for every accepted pair; the collection
        keeps one opinion per group of synonymous opinions.

    Rejected pairs are counted in the __debug_opinions_* counters.
    """
    assert isinstance(title_terms, list)
    assert isinstance(title_objects, TextObjectsCollection)
    assert isinstance(title_frames, TextFrameVariantsCollection)
    assert isinstance(synonyms, SynonymsCollection)

    opinion_refs = []
    title_opinions = OpinionCollection(opinions=None, synonyms=synonyms)

    TextProcessor.__setup_tags(text_objects_collection=title_objects,
                               synonyms=synonyms)

    for l_obj in title_objects:
        for r_obj in title_objects:

            l_bound = l_obj.get_bound()
            r_bound = r_obj.get_bound()

            # Only pairs where the left object starts strictly before the
            # right one are considered (same position or reversed: skip).
            if l_bound.TermIndex >= r_bound.TermIndex:
                continue

            i = l_obj.CollectionInd
            j = r_obj.CollectionInd

            if not self.__check_auth_correctness(i=i, j=j, objects=title_objects):
                continue

            label = self.decide_label_of_pair_in_title_optional(
                i=i, j=j,
                title_objects=title_objects,
                title_frames=title_frames)

            if label is None:
                # Considered by pair-base processor
                continue

            opinion = Opinion(value_left=l_obj.get_value(),
                              value_right=r_obj.get_value(),
                              sentiment=label)

            self.__debug_opinions_created += 1

            if self.__check_obj_preposition_in_title:
                if self.__reject_by_russian_prepositions(l_obj=l_obj,
                                                         r_obj=r_obj,
                                                         title_terms=title_terms):
                    self.__debug_opinions_rejected_by_preps += 1
                    continue

            # Left end is checked first; the right one only if left passed.
            synonyms_present = (
                self.__guarantee_synonyms_presence(synonyms=synonyms,
                                                   obj_value=opinion.value_left)
                and
                self.__guarantee_synonyms_presence(synonyms=synonyms,
                                                   obj_value=opinion.value_right))
            if not synonyms_present:
                self.__debug_opinions_with_missed_synonyms += 1
                continue

            lg_ind = synonyms.get_synonym_group_index(opinion.value_left)
            rg_ind = synonyms.get_synonym_group_index(opinion.value_right)

            if lg_ind == rg_ind:
                # both ends belong to the same synonym group
                self.__debug_opinions_looped += 1
                continue

            if title_opinions.has_synonymous_opinion(opinion):
                self.__debug_opinions_title_synonymous_existed += 1
            else:
                # OK, adding
                self.__debug_opinions_total_extracted_from_titles += 1
                add_result = title_opinions.try_add_opinion(opinion)
                assert add_result

            opinion_refs.append(RefOpinion(left_index=i,
                                           right_index=j,
                                           sentiment=opinion.sentiment))

    return opinion_refs, title_opinions
# Train collection # root = io_utils.train_root() for n in io_utils.train_indices(): entity_filepath = root + "art{}.ann".format(n) opin_filepath = root + "art{}.opin.txt".format(n) neutral_filepath = root + "art{}.neut.txt".format(n) news_filepath = root + "art{}.txt".format(n) vector_output = root + "art{}.vectors.txt".format(n) print vector_output entities = EntityCollection.from_file(entity_filepath) news = News.from_file(news_filepath, entities) sentiment_opins = OpinionCollection.from_file( opin_filepath, io_utils.get_synonyms_filepath()) neutral_opins = OpinionCollection.from_file( neutral_filepath, io_utils.get_synonyms_filepath()) # filter_neutral(neutral_opins) vectors = vectorize_opinions( news, entities, [sentiment_opins, neutral_opins]) vectors.save(vector_output) # # Test collection # root = io_utils.test_root() for n in io_utils.test_indices():