def create_feature(self):
        """Create and save the per-engager reply-engagement count feature.

        Counts, on the TRAIN dataset, how many tweets each engager replied
        to, then maps every engager id of ``self.dataset_id`` to that count
        (0 for engagers never seen replying in the train set) and saves the
        resulting single-column dataframe.
        """
        # The counts are always computed on the train set; for a test/val
        # dataset id resolve the corresponding train set first.  This
        # collapses the two previously duplicated branches into one path.
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(self.dataset_id)
        else:
            train_dataset_id = self.dataset_id

        engager_id_feature = MappedFeatureEngagerId(train_dataset_id)
        engagement_feature = TweetFeatureEngagementIsReply(train_dataset_id)

        # Engager id and is-reply flag, aligned row by row.
        dataframe = pd.concat([
            engager_id_feature.load_or_create(),
            engagement_feature.load_or_create(),
        ],
            axis=1
        )

        # Keep only the reply engagements, then count rows per engager.
        dataframe = dataframe[dataframe[engagement_feature.feature_name]]
        # groupby(...).size() yields a Series keyed by engager id; its
        # to_dict() is the {engager_id: reply_count} mapping directly.
        dictionary = dataframe.groupby(engager_id_feature.feature_name).size().to_dict()

        # Map the train-set counts onto the engager ids of the target
        # dataset (identical to the train set when self.dataset_id is a
        # train set); unseen engagers default to 0.
        target_engager_id_feature = MappedFeatureEngagerId(self.dataset_id)
        target_engager_id_df = target_engager_id_feature.load_or_create()

        engagement_count_df = pd.DataFrame(
            target_engager_id_df[engager_id_feature.feature_name].map(lambda x: dictionary.get(x, 0)))
        self.save_feature(engagement_count_df)
    def create_feature(self):
        """Build the number-of-previous-engagements feature for both the
        train dataset and its paired test/val dataset, saving whichever of
        the two is not already present on disk.
        """
        # Resolve the (train, test) dataset pair regardless of which id
        # this instance was constructed with.
        if is_test_or_val_set(self.dataset_id):
            test_dataset_id = self.dataset_id
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        # Train-set inputs: tweet creation timestamp + engager id.
        timestamps = RawFeatureTweetTimestamp(train_dataset_id)
        engagers = MappedFeatureEngagerId(train_dataset_id)
        eng_col = engagers.feature_name

        train_df = pd.concat(
            [timestamps.load_or_create(), engagers.load_or_create()],
            axis=1)
        # Chronological order matters: each row's value must count only
        # engagements that happened before it.
        train_df.sort_values(timestamps.feature_name, inplace=True)

        counters = np.zeros(train_df[eng_col].max() + 1, dtype=int)

        # find_and_increase reads the engager's current count and bumps it,
        # so this comprehension is deliberately run in timestamp order.
        train_result = pd.DataFrame(
            [find_and_increase(engager, counters) for engager in train_df[eng_col]],
            index=train_df.index)

        train_feature = EngagerFeatureNumberOfPreviousEngagement(
            train_dataset_id)
        if not train_feature.has_feature():
            train_result.sort_index(inplace=True)
            train_feature.save_feature(train_result)

        test_feature = EngagerFeatureNumberOfPreviousEngagement(
            test_dataset_id)
        if not test_feature.has_feature():
            # Test-set inputs.
            timestamps = RawFeatureTweetTimestamp(test_dataset_id)
            engagers = MappedFeatureEngagerId(test_dataset_id)

            test_df = pd.concat(
                [timestamps.load_or_create(), engagers.load_or_create()],
                axis=1)

            # Engager ids unseen at train time may exceed the counter
            # array; grow it with zeros so the lookup cannot go out of
            # range.
            needed = test_df[engagers.feature_name].max() + 1
            if needed > counters.size:
                counters = np.pad(counters,
                                  pad_width=(0, needed - counters.size),
                                  mode='constant',
                                  constant_values=0)

            # Test rows just read the final train-time counts.
            test_result = pd.DataFrame(
                test_df[eng_col].map(lambda x: counters[x]),
                index=test_df.index)
            test_feature.save_feature(test_result)
    def create_feature(self):
        """Create the number-of-previous-negative-engagements-with-language
        feature for the train dataset and its paired test/val dataset.

        Builds a running dictionary over the train set in timestamp order,
        saves the train-side counts if missing, then replays the final
        dictionary over the test/val rows and saves those counts too.
        """
        # Check if the dataset id is train or test; always compute the
        # running counts on the train set and carry them over to the test.
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        # Load features (train-set instances).
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            train_dataset_id)
        engagers_feature = MappedFeatureEngagerId(train_dataset_id)
        creators_feature = MappedFeatureCreatorId(train_dataset_id)
        language_feature = MappedFeatureTweetLanguage(train_dataset_id)
        engagement_feature = TweetFeatureEngagementIsNegative(train_dataset_id)

        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            engagers_feature.load_or_create(),
            engagement_feature.load_or_create(),
            creators_feature.load_or_create(),
            language_feature.load_or_create()
        ],
                              axis=1)

        # Chronological order: each row must only see engagements that
        # happened strictly before it.
        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)

        # KEY: a tuple (creator, engager)
        # VALUE: the number of time the engager has engaged with the creator
        # If key does not exists -> 0 times.
        # NOTE(review): the non-engagement branch below looks keys up as
        # (eng_id, lang), which disagrees with the (creator, engager) key
        # described here — confirm the actual key layout against
        # find_and_increase_engager's definition.
        engagement_dict = {}

        # Negative-engagement rows update the dictionary via
        # find_and_increase_engager (side-effecting, so order matters);
        # other rows just read the current count, defaulting to 0.
        result = pd.DataFrame([
            find_and_increase_engager(eng_id, cre_id, lang, engagement_dict)
            if engagement else engagement_dict.get((eng_id, lang), 0)
            for eng_id, cre_id, lang, engagement in zip(
                dataframe[engagers_feature.feature_name], dataframe[
                    creators_feature.feature_name], dataframe[
                        language_feature.feature_name], dataframe[
                            engagement_feature.feature_name])
        ],
                              index=dataframe.index)

        if not EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                train_dataset_id).has_feature():
            # Restore the original row order before persisting.
            result.sort_index(inplace=True)
            EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                train_dataset_id).save_feature(result)
        if not EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                test_dataset_id).has_feature():
            # Load features (test-set instances; no engagement column —
            # the test side only reads the dictionary built above).
            creation_timestamps_feature = RawFeatureTweetTimestamp(
                test_dataset_id)
            engagers_feature = MappedFeatureEngagerId(test_dataset_id)
            language_feature = MappedFeatureTweetLanguage(test_dataset_id)
            creators_feature = MappedFeatureCreatorId(test_dataset_id)

            dataframe = pd.concat([
                creation_timestamps_feature.load_or_create(),
                engagers_feature.load_or_create(),
                creators_feature.load_or_create(),
                language_feature.load_or_create()
            ],
                                  axis=1)

            dataframe.sort_values(creation_timestamps_feature.feature_name,
                                  inplace=True)

            # NOTE(review): the test side calls find_and_increase_creator
            # while the train side used find_and_increase_engager — verify
            # this asymmetry is intentional and that the test pass does not
            # mutate engagement_dict.
            result = pd.DataFrame([
                find_and_increase_creator(eng_id, cre_id, lang,
                                          engagement_dict)
                for eng_id, cre_id, lang in zip(
                    dataframe[engagers_feature.feature_name], dataframe[
                        creators_feature.feature_name], dataframe[
                            language_feature.feature_name])
            ],
                                  index=dataframe.index)
            result.sort_index(inplace=True)

            EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                test_dataset_id).save_feature(result)
    def create_feature(self):
        """Compute the engager main-grouped-language feature for the train
        dataset and its paired test/val dataset, saving whichever of the
        two is not already on disk.
        """
        # Resolve the (train, test) dataset pair from whichever id this
        # instance was built with.
        if is_test_or_val_set(self.dataset_id):
            test_dataset_id = self.dataset_id
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        # Train-set inputs.
        timestamps = RawFeatureTweetTimestamp(train_dataset_id)
        creators = MappedFeatureCreatorId(train_dataset_id)
        engagers = MappedFeatureEngagerId(train_dataset_id)
        languages = MappedFeatureGroupedTweetLanguage(train_dataset_id)
        engagements = TweetFeatureEngagementIsPositive(train_dataset_id)

        df = pd.concat([
            timestamps.load_or_create(),
            creators.load_or_create(),
            engagers.load_or_create(),
            languages.load_or_create(),
            engagements.load_or_create()
        ],
                       axis=1)

        # Chronological order: the per-user counters must only reflect
        # engagements that happened before each row.
        df.sort_values(timestamps.feature_name, inplace=True)

        # One row per user id, one column per grouped language (70 slots).
        counters = np.zeros((data.DataStats.get_max_user_id() + 1, 70),
                            dtype=np.uint16)

        # find_and_increase_engager mutates the counter matrix, so this
        # comprehension must run in the sorted (timestamp) order.
        train_rows = zip(df[engagers.feature_name],
                         df[creators.feature_name],
                         df[languages.feature_name],
                         df[engagements.feature_name])
        result = pd.DataFrame(
            [find_and_increase_engager(eng, cre, lang, pos, counters)
             for eng, cre, lang, pos in train_rows],
            index=df.index)

        train_feature = EngagerMainGroupedLanguage(train_dataset_id)
        if not train_feature.has_feature():
            result.sort_index(inplace=True)
            train_feature.save_feature(result)

        test_feature = EngagerMainGroupedLanguage(test_dataset_id)
        if not test_feature.has_feature():
            # Test-set inputs (no engagement column; the engagement flag
            # is passed as False below).
            timestamps = RawFeatureTweetTimestamp(test_dataset_id)
            creators = MappedFeatureCreatorId(test_dataset_id)
            engagers = MappedFeatureEngagerId(test_dataset_id)
            languages = MappedFeatureGroupedTweetLanguage(test_dataset_id)

            df = pd.concat([
                timestamps.load_or_create(),
                creators.load_or_create(),
                engagers.load_or_create(),
                languages.load_or_create()
            ],
                           axis=1)

            df.sort_values(timestamps.feature_name, inplace=True)

            test_rows = zip(df[engagers.feature_name],
                            df[creators.feature_name],
                            df[languages.feature_name])
            result = pd.DataFrame(
                [find_and_increase_engager(eng, cre, lang, False, counters)
                 for eng, cre, lang in test_rows],
                index=df.index)

            result.sort_index(inplace=True)

            test_feature.save_feature(result)
    def create_feature(self):
        """Count, per creator, previous engagements of the type returned by
        ``self._get_engagement_feature`` and save the feature for both the
        train dataset and its paired test/val dataset.
        """
        # Resolve the (train, test) dataset pair from whichever id this
        # instance was built with.
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        start_time = time.time()

        # Load the train-set features.
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            train_dataset_id)
        creator_id_feature = MappedFeatureCreatorId(train_dataset_id)
        engager_id_feature = MappedFeatureEngagerId(train_dataset_id)
        engagement_feature = self._get_engagement_feature(train_dataset_id)

        # Save the column names; they are reused for the test dataframe.
        creator_id_col = creator_id_feature.feature_name
        engager_id_col = engager_id_feature.feature_name
        engagement_col = engagement_feature.feature_name

        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            creator_id_feature.load_or_create(),
            engager_id_feature.load_or_create(),
            engagement_feature.load_or_create()
        ],
                              axis=1)

        # Chronological order: each row must only count engagements that
        # happened before it.
        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)

        # The counter array is sized to cover both creator and engager ids.
        max_id = max(dataframe[creator_id_col].max(),
                     dataframe[engager_id_col].max())
        counter_array = np.zeros(max_id + 1, dtype=int)

        # Engagement rows go through find_and_increase (side-effecting, so
        # the comprehension must run in timestamp order); other rows just
        # read the creator's current count.
        result = pd.DataFrame([
            find_and_increase(creator_id=creator_id,
                              engager_id=engager_id,
                              counter_array=counter_array)
            if engagement else counter_array[creator_id]
            for creator_id, engager_id, engagement in zip(
                dataframe[creator_id_col], dataframe[engager_id_col],
                dataframe[engagement_col])
        ],
                              index=dataframe.index)
        self._save_train_result_if_not_present(result, train_dataset_id)

        if not self._exists_test_feature(test_dataset_id):
            # Load the test-set features (no engagement column: the
            # counters are only read on this side).
            creation_timestamps_feature = RawFeatureTweetTimestamp(
                test_dataset_id)
            creator_id_feature = MappedFeatureCreatorId(test_dataset_id)
            engager_id_feature = MappedFeatureEngagerId(test_dataset_id)

            # Refresh the column names from the test-set instances.
            creator_id_col = creator_id_feature.feature_name
            engager_id_col = engager_id_feature.feature_name

            dataframe = pd.concat([
                creator_id_feature.load_or_create(),
                engager_id_feature.load_or_create(),
                creation_timestamps_feature.load_or_create(),
            ],
                                  axis=1)

            dataframe.sort_values(creation_timestamps_feature.feature_name,
                                  inplace=True)

            # Ids unseen at train time may exceed the counter array; pad
            # with zeros so the lookup below cannot go out of range.
            max_id = max(dataframe[creator_id_col].max(),
                         dataframe[engager_id_col].max())
            if max_id + 1 > counter_array.size:
                counter_array = np.pad(counter_array,
                                       pad_width=(0, max_id + 1 -
                                                  counter_array.size),
                                       mode='constant',
                                       constant_values=0)

            # Test rows read the final train-time count for their creator.
            result = pd.DataFrame(
                dataframe[creator_id_col].map(lambda x: counter_array[x]),
                index=dataframe.index)

            result.sort_index(inplace=True)

            # Single timing line (replaces the two leftover debug prints).
            print(f"time: {time.time() - start_time}")

            self._save_test_result(result, test_dataset_id)