Example #1
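Builds a matrix E holding the number of sentences between every pair of entities in a news article and converts it into opinions with opinions_between_entities; the main part loads the synonyms collection and iterates over the train articles, reading the entities and news of each.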
    for e1 in entities:
        for e2 in entities:
            # Store the number of sentences between each entity pair.
            i = e1.get_int_ID()
            j = e2.get_int_ID()
            E[i - 1][j - 1] = sentences_between(e1, e2, news)

    return opinions_between_entities(E,
                                     0,
                                     news,
                                     synonyms_collection,
                                     sentiment_opins=opinions)


#
# Main
#
synonyms = SynonymsCollection.from_file(io_utils.get_synonyms_filepath())

#
# Train
#
root = io_utils.train_root()
for n in io_utils.train_indices():
    entity_filepath = root + "art{}.ann".format(n)
    news_filepath = root + "art{}.txt".format(n)
    opin_filepath = root + "art{}.opin.txt".format(n)
    neutral_filepath = root + "art{}.neut.txt".format(n)

    print neutral_filepath

    entities = EntityCollection.from_file(entity_filepath)
    news = News.from_file(news_filepath, entities)
Example #2
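Model constructor that loads the entity collections of all train and test news, builds an entity index, processes both collections into news-words and relations collections, and packs the relations into fixed-size bags split into minibatches.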
    def __init__(self, io, word_embedding, train_indices, test_indices,
                 synonyms_filepath, bag_size, words_per_news,
                 bags_per_minibatch, callback):
        assert (isinstance(io, NetworkIO))
        assert (isinstance(word_embedding, Embedding))
        assert (isinstance(callback, Callback))

        self.io = io
        self.sess = None
        self.train_indices = train_indices
        self.test_indices = test_indices
        self.words_per_news = words_per_news
        self.synonyms_filepath = synonyms_filepath
        self.synonyms = SynonymsCollection.from_file(self.synonyms_filepath)

        # Compute embedding vectors for entities of the train and test
        # collections and initialize the entities embedding.

        # words_per_news defines the size of the window that both includes
        # relations and filters them:
        # len([ ... entity_1 ... entity_2 ... ]) = window_size_in_words
        # TODO: the window size should remain unchanged.

        all_indices = train_indices + test_indices
        entities_collections = [
            EntityCollection.from_file(self.io.get_entity_filepath(n))
            for n in all_indices
        ]
        entity_indices = EntityIndices(entities_collections)

        # Train collection
        train_news_words_collection, train_relations_collection = self._process_into_collections(
            train_indices, entity_indices, word_embedding, words_per_news,
            True)

        # Test collection
        test_news_words_collection, test_relations_collection = self._process_into_collections(
            test_indices, entity_indices, word_embedding, words_per_news,
            False)

        words_per_news = min(
            train_news_words_collection.get_min_words_per_news_count(),
            test_news_words_collection.get_min_words_per_news_count(),
            words_per_news)
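        # Note: this recomputed value is local; self.words_per_news (assigned
        # above) still holds the original constructor argument.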

        self.train_relations_collection = train_relations_collection
        self.test_relations_collection = test_relations_collection
        self.test_news_words_collection = test_news_words_collection

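        # Group relations into fixed-size bags; the shuffled bags are later
        # split into minibatches for training and evaluation.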
        train_bags_collection = BagsCollection(
            train_relations_collection.relations, bag_size)
        test_bags_collection = BagsCollection(
            test_relations_collection.relations, bag_size)

        train_bags_collection.shuffle()
        test_bags_collection.shuffle()

        self.test_minibatches = test_bags_collection.to_minibatches(
            bags_per_minibatch)
        self.train_minibatches = train_bags_collection.to_minibatches(
            bags_per_minibatch)
        self.train_news_words_collection = train_news_words_collection

        # The test collection shares the same embedding matrix.
        self.E = train_news_words_collection.get_embedding_matrix()

        self.network = None
        self.callback = callback
Example #3
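Constructor of ContextLevelTensorflowModel: it reads the train and test collections, tightens the terms-per-context limit, builds the term embedding matrix, creates per-data-type bags collections, and derives class weights from the train label statistic.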
    def __init__(self, io, settings, callback):
        assert(isinstance(io, ContextLevelNetworkIO))
        assert(isinstance(settings, CommonModelSettings))
        assert(isinstance(callback, Callback) or callback is None)
        super(ContextLevelTensorflowModel, self).__init__(io, callback)

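        # Lazily iterate the entity collections of all train and test news.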
        def iter_all_entity_collections():
            all_indices = io.get_data_indices(DataType.Train) + \
                          io.get_data_indices(DataType.Test)
            for news_index in all_indices:
                yield EntityCollection.from_file(self.io.get_entity_filepath(news_index),
                                                 self.Settings.Stemmer)

        self.settings = settings
        self._last_fit_epoch_index = None

        self.synonyms = SynonymsCollection.from_file(io.get_synonyms_collection_filepath(),
                                                     stemmer=self.Settings.Stemmer)

        self.all_existed_entity_indices = EntityIndices.from_entities_collections(
            iter_all_entity_collections(),
            self.Settings.Stemmer)

        # Train collection
        train_news_terms_collection, train_relations_collection = \
            self._process_into_collections(io.get_data_indices(DataType.Train), DataType.Train)

        # Test collection
        test_news_terms_collection, test_relations_collection = \
            self._process_into_collections(io.get_data_indices(DataType.Test), DataType.Test)

        self.Settings.update_terms_per_context(
            min(train_news_terms_collection.calculate_min_terms_per_context(),
                test_news_terms_collection.calculate_min_terms_per_context()))

        self.Settings.set_term_embedding(
            create_term_embedding_matrix(self.Settings.WordEmbedding, self.all_existed_entity_indices))

        self.bags_collection = {
            DataType.Test: BagsCollection(
                test_relations_collection,
                self.Settings.BagSize,
                shuffle=False,
                create_sample_func=lambda relation: Sample.from_relation(relation,
                                                                         self.all_existed_entity_indices,
                                                                         test_news_terms_collection,
                                                                         self.settings)),

            DataType.Train: BagsCollection(
                train_relations_collection,
                self.Settings.BagSize,
                shuffle=True,
                create_sample_func=lambda relation: Sample.from_relation(relation,
                                                                         self.all_existed_entity_indices,
                                                                         train_news_terms_collection,
                                                                         self.settings))
        }

        self._relations_collections = {
            DataType.Test: test_relations_collection,
            DataType.Train: train_relations_collection
        }

        self.news_terms_collections = {
            DataType.Test: test_news_terms_collection,
            DataType.Train: train_news_terms_collection
        }

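        # Inverse-frequency class weights; this assumes every class (neu, pos,
        # neg) occurs at least once in the train collection.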
        neu, pos, neg = self._relations_collections[DataType.Train].get_statistic()
        self.settings.set_class_weights([100.0 / neu, 100.0 / pos, 100.0 / neg])

        # Log the model parameters.
        keys, values = self.settings.get_parameters()
        self._display_log(keys, values)

        self._relations_collections[DataType.Train].debug_labels_statistic(DataType.Train)
        self._relations_collections[DataType.Train].debug_unique_relations_statistic()
        self._relations_collections[DataType.Test].debug_labels_statistic(DataType.Test)
        self._relations_collections[DataType.Test].debug_unique_relations_statistic()
Example #4
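Constructor of TextLevelTensorflowModel: it loads the relations extracted at a chosen epoch together with their prediction results, checks that both collections match in size, and configures the embedding shape and smoothed class weights.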
    def __init__(self, io, settings, callback):
        assert (isinstance(io, TextLevelNetworkIO))
        assert (isinstance(settings, TextModelSettings))
        super(TextLevelTensorflowModel, self).__init__(io, callback)

        self._settings = settings
        self.synonyms = SynonymsCollection.from_file(
            io.get_synonyms_collection_filepath(),
            stemmer=self.Settings.Stemmer)

        contextSettings = CommonModelSettings(load_embedding=False)

        self.relation_collections = {
            DataType.Train:
            ExtractedRelationsCollection.load(
                io.get_relations_filepath(self._settings.EpochToUse,
                                          DataType.Train)),
            DataType.Test:
            ExtractedRelationsCollection.load(
                io.get_relations_filepath(self._settings.EpochToUse,
                                          DataType.Test))
        }

        self.predicted_collections = {
            DataType.Train:
            RelationPredictionResultCollection.load(
                io.get_relations_predictions_filepath(
                    self._settings.EpochToUse, DataType.Train)),
            DataType.Test:
            RelationPredictionResultCollection.load(
                io.get_relations_predictions_filepath(
                    self._settings.EpochToUse, DataType.Test))
        }

        assert (len(self.predicted_collections[DataType.Train]) == len(
            self.relation_collections[DataType.Train]))
        assert (len(self.predicted_collections[DataType.Test]) == len(
            self.relation_collections[DataType.Test]))

        settings.set_embedding_shape(
            (settings.GroupSize * settings.BatchSize + 1,
             settings.ClassesCount))

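        # The +1 smoothing keeps the weights finite when a class is absent.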
        neu, pos, neg = self.relation_collections[
            DataType.Train].get_statistic()
        settings.set_class_weights(
            [100.0 / (neu + 1), 100.0 / (pos + 1), 100.0 / (neg + 1)])

        # Log the model parameters.
        keys, values = settings.get_parameters()
        self._display_log(keys, values)

        self._sentences_in_news = self._get_sentences_in_news()

        self.relation_collections[DataType.Train].debug_labels_statistic(
            DataType.Train)
        self.relation_collections[
            DataType.Train].debug_unique_relations_statistic()
        self.relation_collections[DataType.Test].debug_labels_statistic(
            DataType.Test)
        self.relation_collections[
            DataType.Test].debug_unique_relations_statistic()
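Example #5
Constructor of a RuSentRel-based model: it loads the word embedding and synonyms collection, reads the train and test collections, builds a static embedding (including a vector for the entity mask), and prepares the bags, relation, and news-terms collections together with their helpers.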
    def __init__(self, io, settings):
        assert(isinstance(io, RuSentRelNetworkIO))
        assert(isinstance(settings, CommonModelSettings))

        print "Loading word embedding: {}".format(io.get_word_embedding_filepath())
        word_embedding = RusvectoresEmbedding.from_file(
            filepath=io.get_word_embedding_filepath(),
            binary=True,
            stemmer=settings.Stemmer,
            pos_tagger=settings.PosTagger)
        settings.set_word_embedding(word_embedding)

        self.__synonyms = SynonymsCollection.from_file(filepath=io.get_synonyms_collection_filepath(),
                                                       stemmer=settings.Stemmer,
                                                       is_read_only=True)

        self.__labels_helper = SingleLabelsHelper() if settings.ClassesCount == 3 else PairedLabelsHelper()

        print "Reading train collection ..."
        train_news_terms_collection, train_relations_collection, train_entities, train_relations_missed = \
            self.__read_collection(io=io, data_type=DataType.Train, settings=settings)

        print "Reading test collection ..."
        test_news_terms_collection, test_relations_collection, test_entities, test_relations_missed = \
            self.__read_collection(io=io, data_type=DataType.Test, settings=settings)

        print "Relations rejected (train): {}".format(train_relations_missed)
        print "Relations rejected (test):  {}".format(test_relations_missed)

        static_embedding = StaticEmbedding(settings.WordEmbedding.VectorSize)
        self.__fill_static_embedding_from_ntc(static_embedding=static_embedding,
                                              word_embedding=settings.WordEmbedding,
                                              ntc=train_news_terms_collection)
        self.__fill_static_embedding_from_ntc(static_embedding=static_embedding,
                                              word_embedding=settings.WordEmbedding,
                                              ntc=test_news_terms_collection)
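        # Register a dedicated vector for the entity mask token.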
        static_embedding.create_and_add_embedding(word=utils.ENTITY_MASK)

        settings.set_static_embedding(static_embedding)

        settings.set_term_embedding(
            utils.create_term_embedding_matrix(word_embedding=settings.WordEmbedding,
                                               static_embedding=settings.StaticWordEmbedding))

        self.__bags_collection = {
            DataType.Test: self.create_bags_collection(
                relations_collection=test_relations_collection,
                news_terms_collection=test_news_terms_collection,
                data_type=DataType.Test,
                settings=settings),
            DataType.Train: self.create_bags_collection(
                relations_collection=train_relations_collection,
                news_terms_collection=train_news_terms_collection,
                data_type=DataType.Train,
                settings=settings)
        }

        self.__bags_collection_helpers = {
            DataType.Train: BagsCollectionHelper(bags_collection=self.__bags_collection[DataType.Train],
                                                 name=DataType.Train),
            DataType.Test: BagsCollectionHelper(bags_collection=self.__bags_collection[DataType.Test],
                                                name=DataType.Test)
        }

        self.__relations_collections = {
            DataType.Test: test_relations_collection,
            DataType.Train: train_relations_collection
        }

        self.__relations_collection_helpers = {
            DataType.Test: ExtractedRelationsCollectionHelper(test_relations_collection,
                                                              labels_helper=self.__labels_helper,
                                                              name=DataType.Test),
            DataType.Train: ExtractedRelationsCollectionHelper(train_relations_collection,
                                                               labels_helper=self.__labels_helper,
                                                               name=DataType.Train)
        }

        self.__news_terms_collections = {
            DataType.Test: test_news_terms_collection,
            DataType.Train: train_news_terms_collection
        }

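        # Use the train-collection label statistic as class weights.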
        norm, _ = self.__relations_collection_helpers[DataType.Train].get_statistic()

        settings.set_class_weights(norm)