class Word_Embeddings_Differential_Feature_Generator_Unittests(
        unittest.TestCase):
    """Tests for Word_Embedding_Differential_Feature_Generator on a test DB.

    Each test stores one post and one target article for a single author,
    runs the GloVe model creator followed by the feature generator, and
    compares the author features stored in the DB with values recomputed
    from the word-embedding dictionary read back from the DB.
    """

    # Shared prefix of the per-dimension "sum" differential feature names;
    # the dimension index is appended to it.
    _SUM_FEATURE_PREFIX = (
        u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d"
    )

    def setUp(self):
        """Create the test DB, a generator bound to it, and the test author."""
        self._posts = []
        self._author = None

        self._config_parser = getConfig()
        self._db = DB()
        self._db.setUp()
        self._model = Word_Embedding_Differential_Feature_Generator(self._db)

        self._set_author(u'test_user')

    def tearDown(self):
        """Close the session, delete the DB, then close the session again."""
        self._db.session.close()
        self._db.deleteDB()
        self._db.session.close()

    def test_simple_case(self):
        """Check the sum, mean and euclidean-distance features.

        The expected differential is the 'was' value minus the 'is' value.
        """
        self._add_post(u'of to a for', u'of is')
        self._add_target_article(u'0', u'of was ', u'am that was')
        self._setup_test()

        post_word_d0 = self._get_word_dimension(u'is', 0)
        article_word_d0 = self._get_word_dimension(u'was', 0)
        difference_d0 = article_word_d0 - post_word_d0

        # The same expected value is used for the sum and the mean features.
        self._assert_feature_almost_equal(
            u"word_embeddings_differential_sum_target_articles_title_to_posts_content_d0",
            difference_d0)
        self._assert_feature_almost_equal(
            u"word_embeddings_differential_np.mean_target_articles_title_to_posts_content_d0",
            difference_d0)

        post_word_vector = self._words[u'is']
        article_word_vector = self._words['was']
        distance = commons.euclidean_distance(post_word_vector,
                                              article_word_vector)
        self._assert_feature_almost_equal(
            u"word_embeddings_differential_distance_function_euclidean_distance_target_articles_title_np.mean_TO_posts_content_np.mean",
            distance)

    def test_few_words(self):
        """Check the sum differential at dimensions 0 and 140.

        Expected value: ('was' + 'that') minus ('is' + 'on').
        """
        self._add_post(u'of to a for', u'of is on')
        self._add_target_article(u'0', u'of was that', u'am that was')
        self._setup_test()

        for dimension in (0, 140):
            posts_total = self._pair_total(u'is', u'on', dimension)
            articles_total = self._pair_total(u'was', u'that', dimension)
            self._assert_feature_almost_equal(
                self._SUM_FEATURE_PREFIX + str(dimension),
                articles_total - posts_total)

    def test_opposite(self):
        """Swap the post and article texts of test_few_words.

        Expected value at dimension 0: ('is' + 'on') minus ('was' + 'that').
        """
        self._add_post(u'am that was', u'of was that')
        self._add_target_article(u'0', u'of is on', u'of to a for')
        self._setup_test()

        dimension = 0
        articles_total = self._pair_total(u'is', u'on', dimension)
        posts_total = self._pair_total(u'was', u'that', dimension)
        self._assert_feature_almost_equal(
            self._SUM_FEATURE_PREFIX + str(dimension),
            articles_total - posts_total)

    def test_empty_word(self):
        """Run the pipeline on a post with empty content.

        The only assertion is trivially true, so the test passes as long as
        the pipeline does not raise.
        """
        self._add_post(u'of to a for', u'')
        self._add_target_article(u'0', u'of was that', u'am that was')
        self._setup_test()
        self.assertTrue(True)

    def _assert_feature_almost_equal(self, feature_name, expected_value):
        """Assert the author's stored feature equals expected_value to 4 places."""
        stored_feature = self._db.get_author_feature(self._author.author_guid,
                                                     feature_name)
        self.assertAlmostEqual(
            float(stored_feature.attribute_value), expected_value, places=4)

    def _pair_total(self, first_word, second_word, dimension):
        """Return the sum of two words' embedding values at one dimension."""
        first_value = self._get_word_dimension(first_word, dimension)
        second_value = self._get_word_dimension(second_word, dimension)
        return first_value + second_value

    def _add_post(self, title, content):
        """Store a post by the test author; its id is its index in self._posts."""
        next_id = len(self._posts)
        new_post = Post()
        new_post.author = self._author.author_full_name
        new_post.author_guid = self._author.author_guid
        new_post.content = content
        new_post.title = title
        new_post.domain = u'test'
        new_post.post_id = next_id
        new_post.guid = next_id
        self._db.addPost(new_post)
        self._posts.append(new_post)

    def _set_author(self, author_guid):
        """Store an author with the given guid and keep it as self._author."""
        display_name = u'name' + author_guid
        new_author = Author()
        new_author.author_guid = author_guid
        new_author.author_full_name = display_name
        new_author.name = display_name
        new_author.domain = u'test'
        self._db.add_author(new_author)
        self._author = new_author

    def _setup_test(self):
        """Commit, run the GloVe model creator and the generator, load words."""
        self._db.session.commit()

        model_creator = GloveWordEmbeddingModelCreator(self._db)
        self._word_embedding_model_creator = model_creator
        model_creator.execute(None)

        generator = Word_Embedding_Differential_Feature_Generator(
            self._db, authors=[self._author], posts=self._posts)
        self._model = generator
        generator.execute()

        self._words = self._db.get_word_embedding_dictionary()

    def _get_word_dimension(self, word, dimension):
        """Return one component of a word's vector from self._words."""
        return self._words[word][dimension]

    def _add_target_article(self, post_id, title, description):
        """Store a target article owned by u'test_user'."""
        article = Target_Article()
        article.author_guid = u'test_user'
        article.post_id = post_id
        article.title = title
        article.description = description
        article.keywords = 'temp, lala, fafa'
        self._db.add_target_articles([article])

    def _add_target_article_item(self, post_id, type, content, author_guid):
        """Store a target article item with item_number 3 via addPosts."""
        item = Target_Article_Item()
        item.post_id = post_id
        item.type = type
        item.item_number = 3
        item.content = content
        item.author_guid = author_guid
        self._db.addPosts([item])
class TestGensimWordEmbeddingsModelTrainer(TestCase):
    """Tests for GensimWordEmbeddingsModelTrainer on a test DB.

    The tests store posts for a single author, run the trainer, and inspect
    the CSV found at the trainer's ``_saved_models_path + _table_name``
    location.
    """

    # Length of the all-zero tuple returned when there are no vectors to
    # aggregate.
    _EMPTY_RESULT_SIZE = 300

    def setUp(self):
        """Create the test DB, default settings, and the test author."""
        self._config_parser = getConfig()
        self._db = DB()
        self._db.setUp()

        self._is_load_wikipedia_300d_glove_model = True
        self._wikipedia_model_file_path = "data/input/glove/test_glove.6B.300d_small.txt"
        self._table_name = "wikipedia_model_300d"
        self._word_vector_dict_full_path = "data/output/word_embedding/"
        self._word_vector_dict = {}

        self._author = None
        self._set_author(u'test_user')
        self._counter = 0
        self._posts = []

    def tearDown(self):
        """Close the DB session."""
        # NOTE(review): the DB is not deleted here, unlike the tearDown of
        # Word_Embeddings_Differential_Feature_Generator_Unittests - confirm
        # this is intentional.
        self._db.session.close()

    def test_add_additional_fields_to_existing_table(self):
        """Run the trainer twice and expect both 'sum' and 'np.mean' rows.

        The aggregation functions are restricted to 'sum' before the second
        run; rows of both types must be present for the author afterwards.
        """
        self._add_post(u'was', u'is')
        self._add_post(u'is', u'was')
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)

        self._word_embedding_model_creator.execute(None)
        self._word_embedding_model_creator._aggregation_functions_names = [
            'sum'
        ]
        self._word_embedding_model_creator.execute(None)

        word_embedding_results = self._load_posts_content_results('test_user')
        embedding_types = word_embedding_results[u'word_embedding_type']
        sum_value_df = word_embedding_results.loc[embedding_types == u'sum']
        mean_value_df = word_embedding_results.loc[embedding_types ==
                                                   u'np.mean']

        # Assert directly: a bare ``except`` around ``self.fail()`` would
        # also swallow unrelated errors and hide the real failure reason.
        self.assertGreater(
            len(sum_value_df), 0, "no 'sum' word embedding rows were found")
        self.assertGreater(
            len(mean_value_df), 0,
            "no 'np.mean' word embedding rows were found")

    def test_case_post_represent_by_posts(self):
        """Represent a claim post by the posts connected to it.

        The trainer is configured to go from posts through
        claim_tweet_connection to the connected posts' content; the min, max
        and mean rows stored for u'post1' are compared with values
        recomputed from the trained model's word vectors.
        """
        self._add_post(u'post1', u'the claim', u'Claim')
        self._add_post(u'post2', u'dog cat pig man')  # 2
        self._add_post(u'post3', u'TV is the best guys')  # 1
        self._add_claim_tweet_connection(u'post1', u'post2')
        self._add_claim_tweet_connection(u'post1', u'post3')
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)
        self._word_embedding_model_creator._targeted_fields_for_embedding = [{
            'source': {
                'table_name': 'posts',
                'id': 'post_id'
            },
            'connection': {
                'table_name': 'claim_tweet_connection',
                'source_id': 'claim_id',
                'target_id': 'post_id'
            },
            'destination': {
                'table_name': 'posts',
                'id': 'post_id',
                'target_field': 'content',
                "where_clauses": []
            }
        }]

        self._word_embedding_model_creator.execute(None)
        model_name_path = self._word_embedding_model_creator._prepare_model_name_path(
        )
        model = Word2Vec.load(model_name_path)
        word_vector_dict = self._word_embedding_model_creator._get_word_embedding_dict(
            model)
        self._words = word_vector_dict
        self._words_vectors = self._get_posts_val()
        expected_val = self._calc_results()
        self._generic_test(expected_val, u'post1')

    def _setup_test(self):
        """Commit, run the trainer, and load the word vectors from the DB."""
        self._db.session.commit()
        self._word_embedding_model_creator = GensimWordEmbeddingsModelTrainer(
            self._db)
        self._word_embedding_model_creator.execute(None)

        self._words = self._db.get_word_embedding_dictionary()
        self._words_vectors = self._get_posts_val()

    def _load_posts_content_results(self, source_id):
        """Return the CSV rows for source_id on the posts/content field.

        The CSV is read from the trainer's ``_saved_models_path`` joined with
        its ``_table_name`` and a ".csv" suffix.
        """
        creator = self._word_embedding_model_creator
        file_output_path = creator._saved_models_path + creator._table_name + ".csv"
        # pandas documents this call as the replacement for the removed
        # DataFrame.from_csv (same defaults: first column is the index).
        data = pd.read_csv(file_output_path, index_col=0, parse_dates=True)
        return data.loc[(data['author_id'] == source_id)
                        & (data['table_name'] == u'posts')
                        & (data['targeted_field_name'] == u'content')]

    def _generic_test(self, expected_value, source_id=u""):
        """Compare the stored min, max and mean rows with expected_value.

        :param expected_value: dict keyed by u'min', u'max' and u'np.mean'.
        :param source_id: value matched against the 'author_id' column;
            defaults to the test author's guid when empty.
        """
        if source_id == u"":
            source_id = self._author.author_guid

        word_embedding_results = self._load_posts_content_results(source_id)

        for embedding_type in (u'min', u'max', u'np.mean'):
            self.assert_word_embedding(word_embedding_results, expected_value,
                                       embedding_type)

    def assert_word_embedding(self, db_results, expected_value, type):
        """Assert the first row of the given type equals expected_value[type].

        Only the columns from '0' onwards are compared.
        """
        matching_rows = db_results.loc[db_results[u'word_embedding_type'] ==
                                       type, '0':].values.tolist()
        # Fail with a readable message instead of an IndexError below.
        self.assertTrue(matching_rows,
                        "no '%s' word embedding row was found" % type)
        # NOTE(review): this is an exact float comparison on values that
        # were read back from a CSV - confirm that is reliable here.
        self.assertEqual(list(expected_value[type]), matching_rows[0])

    def _generic_non_equal_test(self, expected_value):
        """Assert the author's stored min, max and mean differ from expected_value."""
        db_results = self._db.get_author_word_embedding(
            self._author.author_guid, u'posts', u'content')
        for embedding_type in (u'min', u'max', u'np.mean'):
            self.assertNotEqual(expected_value[embedding_type],
                                db_results[embedding_type])

    def _set_author(self, author_guid):
        """Store an author with the given guid and keep it as self._author."""
        author = Author()
        author.author_guid = author_guid
        author.author_full_name = u'name' + author_guid
        author.name = u'name' + author_guid
        author.domain = u'test'
        self._db.add_author(author)
        self._author = author

    def _add_post(self, title, content, _domain=u'Microblog'):
        """Store a post by the test author; the title is also its id and guid."""
        post = Post()
        post.author = self._author.author_full_name
        post.author_guid = self._author.author_guid
        post.content = content
        post.title = title
        post.domain = _domain
        post.post_id = title
        post.guid = title
        self._db.addPost(post)
        self._posts.append(post)

    def _get_posts_val(self):
        """Return the vectors of every known word in the added posts.

        Each distinct word contributes one vector; words missing from
        self._words are skipped.
        """
        vals = {}
        for post in self._posts:
            for word in post.content.split():
                # Test membership on the dict itself; ``.keys()`` builds a
                # list on every lookup under Python 2.
                if word in self._words:
                    vals[word] = self._words[word]
        return list(vals.values())

    def _aggregate(self, vectors, aggregation_function):
        """Apply aggregation_function to each dimension across vectors.

        :param vectors: iterable of equally sized vectors; when None, the
            vectors of the added posts are used instead.
        :param aggregation_function: callable taking the values of one
            dimension and returning a single value.
        :return: tuple with one aggregated value per dimension, or a tuple
            of _EMPTY_RESULT_SIZE zeros when there are no vectors.
        """
        if vectors is None:
            vectors = self._get_posts_val()
        vectors = list(vectors)
        if not vectors:
            return (0, ) * self._EMPTY_RESULT_SIZE
        return tuple(
            aggregation_function(dimension_values)
            for dimension_values in zip(*vectors))

    def _calc_mean(self, vectors):
        """Return the per-dimension mean of vectors (see _aggregate)."""
        return self._aggregate(vectors, np.mean)

    def _calc_min(self, vectors):
        """Return the per-dimension minimum of vectors (see _aggregate)."""
        return self._aggregate(vectors, min)

    def _calc_max(self, vectors):
        """Return the per-dimension maximum of vectors (see _aggregate)."""
        return self._aggregate(vectors, max)

    def _calc_results(self):
        """Return the min, max and mean of self._words_vectors as a dict."""
        vectors = self._words_vectors
        return {
            u'min': self._calc_min(vectors),
            u'max': self._calc_max(vectors),
            u'np.mean': self._calc_mean(vectors),
        }

    def _add_target_article(self, post_id, title, description, author_guid):
        """Store a target article for the given author."""
        target_article = Target_Article()
        target_article.author_guid = author_guid
        target_article.post_id = post_id
        target_article.title = title
        target_article.description = description
        target_article.keywords = 'temp, lala, fafa'
        self._db.add_target_articles([target_article])

    def _add_target_article_item(self, post_id, type, content, author_guid):
        """Store a target article item with item_number 3 via addPosts."""
        article_item = Target_Article_Item()
        article_item.post_id = post_id
        article_item.type = type
        article_item.item_number = 3
        article_item.content = content
        article_item.author_guid = author_guid
        self._db.addPosts([article_item])

    def _add_claim_tweet_connection(self, claim_id, post_id):
        """Store a claim-to-post connection."""
        connection = Claim_Tweet_Connection()
        connection.claim_id = claim_id
        connection.post_id = post_id
        self._db.add_claim_connections([connection])