    for e2 in entities:
        i = e1.get_int_ID()
        j = e2.get_int_ID()
        E[i - 1][j - 1] = sentences_between(e1, e2, news)

    return opinions_between_entities(E, 0, news, synonyms_collection,
                                     sentiment_opins=opinions)


#
# Main
#
synonyms = SynonymsCollection.from_file(io_utils.get_synonyms_filepath())

#
# Train
#
root = io_utils.train_root()
for n in io_utils.train_indices():
    entity_filepath = root + "art{}.ann".format(n)
    news_filepath = root + "art{}.txt".format(n)
    opin_filepath = root + "art{}.opin.txt".format(n)
    neutral_filepath = root + "art{}.neut.txt".format(n)

    print neutral_filepath

    entities = EntityCollection.from_file(entity_filepath)
    news = News.from_file(news_filepath, entities)
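# --- Illustrative sketch, not part of the project code ---
# The fragment above fills E with the number of sentences between every pair of
# entities and then appears to keep pairs whose distance does not exceed the
# given limit (0 here, i.e. entities mentioned in the same sentence) as
# candidate neutral opinions. A minimal self-contained version of that idea;
# sentence_ids and same_sentence_pairs are assumed names, not the project API.
def same_sentence_pairs(sentence_ids):
    """sentence_ids: sentence index of each entity; position k corresponds to entity ID k + 1."""
    n = len(sentence_ids)
    E = [[abs(sentence_ids[i] - sentence_ids[j]) for j in range(n)] for i in range(n)]
    return [(i + 1, j + 1) for i in range(n) for j in range(n)
            if i != j and E[i][j] == 0]

print same_sentence_pairs([0, 0, 2])  # [(1, 2), (2, 1)] -- entities 1 and 2 co-occur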
def __init__(self, io, word_embedding, train_indices, test_indices,
             synonyms_filepath, bag_size, words_per_news,
             bags_per_minibatch, callback):
    assert(isinstance(io, NetworkIO))
    assert(isinstance(word_embedding, Embedding))
    assert(isinstance(callback, Callback))

    self.io = io
    self.sess = None
    self.train_indices = train_indices
    self.test_indices = test_indices
    self.words_per_news = words_per_news
    self.synonyms_filepath = synonyms_filepath
    self.synonyms = SynonymsCollection.from_file(self.synonyms_filepath)

    # Compute embedding vectors for entities of the train and test collections
    # and initialize the entities embedding.
    # words_per_news is the size of the window that should include a relation
    # (both of its entities); relations that do not fit are filtered out:
    #   len([ ... entity_1 ... entity_2 ... ]) = window_size_in_words
    # TODO: window size should be unchanged.
    all_indices = train_indices + test_indices
    entities_collections = [EntityCollection.from_file(self.io.get_entity_filepath(n))
                            for n in all_indices]
    entity_indices = EntityIndices(entities_collections)

    # Train collection.
    train_news_words_collection, train_relations_collection = \
        self._process_into_collections(
            train_indices, entity_indices, word_embedding, words_per_news, True)

    # Test collection.
    test_news_words_collection, test_relations_collection = \
        self._process_into_collections(
            test_indices, entity_indices, word_embedding, words_per_news, False)

    # Shrink the window to the smallest value observed across both collections.
    words_per_news = min(train_news_words_collection.get_min_words_per_news_count(),
                         test_news_words_collection.get_min_words_per_news_count(),
                         words_per_news)

    self.train_relations_collection = train_relations_collection
    self.test_relations_collection = test_relations_collection
    self.test_news_words_collection = test_news_words_collection

    # Group relations into fixed-size bags and then into minibatches.
    train_bags_collection = BagsCollection(train_relations_collection.relations, bag_size)
    test_bags_collection = BagsCollection(test_relations_collection.relations, bag_size)
    train_bags_collection.shuffle()
    test_bags_collection.shuffle()

    self.test_minibatches = test_bags_collection.to_minibatches(bags_per_minibatch)
    self.train_minibatches = train_bags_collection.to_minibatches(bags_per_minibatch)

    self.train_news_words_collection = train_news_words_collection
    self.E = train_news_words_collection.get_embedding_matrix()  # the test collection has the same matrix.

    self.network = None
    self.callback = callback
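# --- Illustrative sketch, not part of the project code ---
# Assumed behaviour of BagsCollection above: relations are grouped into bags of
# bag_size items, and bags are then grouped into minibatches of
# bags_per_minibatch bags. to_bags and to_minibatches below are hypothetical
# stand-ins for that grouping.
def to_bags(relations, bag_size):
    return [relations[i:i + bag_size] for i in range(0, len(relations), bag_size)]

def to_minibatches(bags, bags_per_minibatch):
    return [bags[i:i + bags_per_minibatch] for i in range(0, len(bags), bags_per_minibatch)]

bags = to_bags(range(10), bag_size=2)   # 5 bags, 2 relations each
minibatches = to_minibatches(bags, 3)   # 2 minibatches: 3 bags + 2 bags
print len(bags), len(minibatches)       # 5 2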
def __init__(self, io, settings, callback):
    assert(isinstance(io, ContextLevelNetworkIO))
    assert(isinstance(settings, CommonModelSettings))
    assert(isinstance(callback, Callback) or callback is None)
    super(ContextLevelTensorflowModel, self).__init__(io, callback)

    def iter_all_entity_collections():
        all_indices = io.get_data_indices(DataType.Train) + \
                      io.get_data_indices(DataType.Test)
        for news_index in all_indices:
            yield EntityCollection.from_file(self.io.get_entity_filepath(news_index),
                                             self.Settings.Stemmer)

    self.settings = settings
    self._last_fit_epoch_index = None
    self.synonyms = SynonymsCollection.from_file(io.get_synonyms_collection_filepath(),
                                                 stemmer=self.Settings.Stemmer)
    self.all_existed_entity_indices = EntityIndices.from_entities_collections(
        iter_all_entity_collections(), self.Settings.Stemmer)

    # Train collection.
    train_news_terms_collection, train_relations_collection = \
        self._process_into_collections(io.get_data_indices(DataType.Train), DataType.Train)

    # Test collection.
    test_news_terms_collection, test_relations_collection = \
        self._process_into_collections(io.get_data_indices(DataType.Test), DataType.Test)

    self.Settings.update_terms_per_context(
        min(train_news_terms_collection.calculate_min_terms_per_context(),
            test_news_terms_collection.calculate_min_terms_per_context()))

    self.Settings.set_term_embedding(
        create_term_embedding_matrix(self.Settings.WordEmbedding,
                                     self.all_existed_entity_indices))

    self.bags_collection = {
        DataType.Test: BagsCollection(
            test_relations_collection,
            self.Settings.BagSize,
            shuffle=False,
            create_sample_func=lambda relation: Sample.from_relation(
                relation, self.all_existed_entity_indices,
                test_news_terms_collection, self.settings)),
        DataType.Train: BagsCollection(
            train_relations_collection,
            self.Settings.BagSize,
            shuffle=True,
            create_sample_func=lambda relation: Sample.from_relation(
                relation, self.all_existed_entity_indices,
                train_news_terms_collection, self.settings))
    }

    self._relations_collections = {
        DataType.Test: test_relations_collection,
        DataType.Train: train_relations_collection
    }

    self.news_terms_collections = {
        DataType.Test: test_news_terms_collection,
        DataType.Train: train_news_terms_collection
    }

    neu, pos, neg = self._relations_collections[DataType.Train].get_statistic()
    self.settings.set_class_weights([100.0 / neu, 100.0 / pos, 100.0 / neg])

    # log
    keys, values = self.settings.get_parameters()
    self._display_log(keys, values)
    self._relations_collections[DataType.Train].debug_labels_statistic(DataType.Train)
    self._relations_collections[DataType.Train].debug_unique_relations_statistic()
    self._relations_collections[DataType.Test].debug_labels_statistic(DataType.Test)
    self._relations_collections[DataType.Test].debug_unique_relations_statistic()
def __init__(self, io, settings, callback):
    assert(isinstance(io, TextLevelNetworkIO))
    assert(isinstance(settings, TextModelSettings))
    super(TextLevelTensorflowModel, self).__init__(io, callback)

    self._settings = settings
    self.synonyms = SynonymsCollection.from_file(io.get_synonyms_collection_filepath(),
                                                 stemmer=self.Settings.Stemmer)

    contextSettings = CommonModelSettings(load_embedding=False)

    # Relations and prediction results saved for the chosen epoch (EpochToUse).
    self.relation_collections = {
        DataType.Train: ExtractedRelationsCollection.load(
            io.get_relations_filepath(self._settings.EpochToUse, DataType.Train)),
        DataType.Test: ExtractedRelationsCollection.load(
            io.get_relations_filepath(self._settings.EpochToUse, DataType.Test))
    }

    self.predicted_collections = {
        DataType.Train: RelationPredictionResultCollection.load(
            io.get_relations_predictions_filepath(self._settings.EpochToUse, DataType.Train)),
        DataType.Test: RelationPredictionResultCollection.load(
            io.get_relations_predictions_filepath(self._settings.EpochToUse, DataType.Test))
    }

    assert(len(self.predicted_collections[DataType.Train]) ==
           len(self.relation_collections[DataType.Train]))
    assert(len(self.predicted_collections[DataType.Test]) ==
           len(self.relation_collections[DataType.Test]))

    settings.set_embedding_shape(
        (settings.GroupSize * settings.BatchSize + 1, settings.ClassesCount))

    # Inverse-frequency class weights; +1 guards against division by zero for empty classes.
    neu, pos, neg = self.relation_collections[DataType.Train].get_statistic()
    settings.set_class_weights([100.0 / (neu + 1), 100.0 / (pos + 1), 100.0 / (neg + 1)])

    # Log parameters and collection statistics.
    keys, values = settings.get_parameters()
    self._display_log(keys, values)

    self._sentences_in_news = self._get_sentences_in_news()

    self.relation_collections[DataType.Train].debug_labels_statistic(DataType.Train)
    self.relation_collections[DataType.Train].debug_unique_relations_statistic()
    self.relation_collections[DataType.Test].debug_labels_statistic(DataType.Test)
    self.relation_collections[DataType.Test].debug_unique_relations_statistic()
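# --- Illustrative sketch, not part of the project code ---
# Both models above derive class weights as an inverse of the per-class
# relation counts (neutral, positive, negative); the text-level variant adds 1
# to each count so an empty class cannot cause a ZeroDivisionError.
# inverse_frequency_weights is an assumed helper name.
def inverse_frequency_weights(counts, smooth=0):
    return [100.0 / (c + smooth) for c in counts]

print inverse_frequency_weights([800, 200, 100])          # [0.125, 0.5, 1.0]
print inverse_frequency_weights([800, 200, 0], smooth=1)  # rare or empty classes get larger weights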
def __init__(self, io, settings):
    assert(isinstance(io, RuSentRelNetworkIO))
    assert(isinstance(settings, CommonModelSettings))

    print "Loading word embedding: {}".format(io.get_word_embedding_filepath())
    word_embedding = RusvectoresEmbedding.from_file(
        filepath=io.get_word_embedding_filepath(),
        binary=True,
        stemmer=settings.Stemmer,
        pos_tagger=settings.PosTagger)
    settings.set_word_embedding(word_embedding)

    self.__synonyms = SynonymsCollection.from_file(
        filepath=io.get_synonyms_collection_filepath(),
        stemmer=settings.Stemmer,
        is_read_only=True)

    self.__labels_helper = SingleLabelsHelper() if settings.ClassesCount == 3 \
        else PairedLabelsHelper()

    print "Reading train collection ..."
    train_news_terms_collection, train_relations_collection, train_entities, train_relations_missed = \
        self.__read_collection(io=io, data_type=DataType.Train, settings=settings)

    print "Reading test collection ..."
    test_news_terms_collection, test_relations_collection, test_entities, test_relations_missed = \
        self.__read_collection(io=io, data_type=DataType.Test, settings=settings)

    print "Relations rejected (train): {}".format(train_relations_missed)
    print "Relations rejected (test): {}".format(test_relations_missed)

    static_embedding = StaticEmbedding(settings.WordEmbedding.VectorSize)
    self.__fill_static_embedding_from_ntc(static_embedding=static_embedding,
                                          word_embedding=settings.WordEmbedding,
                                          ntc=train_news_terms_collection)
    self.__fill_static_embedding_from_ntc(static_embedding=static_embedding,
                                          word_embedding=settings.WordEmbedding,
                                          ntc=test_news_terms_collection)
    static_embedding.create_and_add_embedding(word=utils.ENTITY_MASK)
    settings.set_static_embedding(static_embedding)

    settings.set_term_embedding(
        utils.create_term_embedding_matrix(word_embedding=settings.WordEmbedding,
                                           static_embedding=settings.StaticWordEmbedding))

    self.__bags_collection = {
        DataType.Test: self.create_bags_collection(
            relations_collection=test_relations_collection,
            news_terms_collection=test_news_terms_collection,
            data_type=DataType.Test,
            settings=settings),
        DataType.Train: self.create_bags_collection(
            relations_collection=train_relations_collection,
            news_terms_collection=train_news_terms_collection,
            data_type=DataType.Train,
            settings=settings)
    }

    self.__bags_collection_helpers = {
        DataType.Train: BagsCollectionHelper(bags_collection=self.__bags_collection[DataType.Train],
                                             name=DataType.Train),
        DataType.Test: BagsCollectionHelper(bags_collection=self.__bags_collection[DataType.Test],
                                            name=DataType.Test)
    }

    self.__relations_collections = {
        DataType.Test: test_relations_collection,
        DataType.Train: train_relations_collection
    }

    self.__relations_collection_helpers = {
        DataType.Test: ExtractedRelationsCollectionHelper(test_relations_collection,
                                                          labels_helper=self.__labels_helper,
                                                          name=DataType.Test),
        DataType.Train: ExtractedRelationsCollectionHelper(train_relations_collection,
                                                           labels_helper=self.__labels_helper,
                                                           name=DataType.Train)
    }

    self.__news_terms_collections = {
        DataType.Test: test_news_terms_collection,
        DataType.Train: train_news_terms_collection
    }

    norm, _ = self.__relations_collection_helpers[DataType.Train].get_statistic()
    settings.set_class_weights(norm)
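# --- Illustrative sketch, not part of the project code ---
# The ENTITY_MASK vector added to the static embedding above suggests that
# entity mentions are replaced by a single shared mask token before embedding
# lookup; mask_entities and ENTITY_MASK_TOKEN below are hypothetical names
# showing that idea.
ENTITY_MASK_TOKEN = "<ENTITY>"

def mask_entities(terms, entity_positions):
    return [ENTITY_MASK_TOKEN if i in entity_positions else t
            for i, t in enumerate(terms)]

print mask_entities(["USA", "supports", "the", "sanctions"], {0})
# ['<ENTITY>', 'supports', 'the', 'sanctions']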