예제 #1
0
    def create_feature(self):
        """Compute this feature for ``self.dataset_id`` and save the result.

        Test/validation dataset: ``compute`` is called once, with the rows of
        the paired training set whose engagement column is True as first
        argument and the test rows as second argument; the output is sorted by
        index and saved with ``_save_test_result``.

        Training dataset: the rows are shuffled and split into
        ``self.number_of_folds`` folds. Each fold is passed to ``compute``
        together with the True-labelled rows of all the *other* folds, and the
        per-fold outputs are concatenated and saved with
        ``_save_train_result_if_not_present``.
        """
        # Check if the dataset id is train or test
        dataset_is_test = is_test_or_val_set(self.dataset_id)
        if dataset_is_test:
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        import Utils.Data.Data as data

        # Name of the boolean engagement column used to filter the rows.
        label_col = f"tweet_feature_engagement_is_{self._get_suffix()}"

        train_df = data.get_dataset([
            "mapped_feature_creator_id", "mapped_feature_engager_id",
            label_col
        ], train_dataset_id)

        if dataset_is_test:
            test_df = data.get_dataset(
                ["mapped_feature_creator_id", "mapped_feature_engager_id"],
                test_dataset_id)
            train_df = train_df[train_df[label_col] == True]
            res = compute(train_df, test_df)
            res.sort_index(inplace=True)
            self._save_test_result(res, test_dataset_id)
        else:
            # Compute the folds (rows are shuffled first)
            X_train_folds = np.array_split(train_df.sample(frac=1),
                                           self.number_of_folds)

            fold_results = []
            for held_out in range(self.number_of_folds):
                # Fold positions are integers: compare them by value (`!=`),
                # never by identity (`is not`).
                local_train = pd.concat([
                    fold for position, fold in enumerate(X_train_folds)
                    if position != held_out
                ])
                local_train = local_train[local_train[label_col] == True]
                local_test = X_train_folds[held_out]

                fold_results.append(compute(local_train, local_test))

            # One concatenation at the end instead of re-copying the
            # accumulated result on every iteration.
            result = pd.concat(fold_results)

            self._save_train_result_if_not_present(result, train_dataset_id)
    def create_feature(self):
        """Create the previous-engagement counter feature for train and test.

        Train rows are visited in timestamp order and each engager id is handed
        to ``find_and_increase`` together with a per-engager counter array; the
        returned values form the train feature (presumably the count seen
        before the increment - confirm against the helper). Test rows only
        read the counters left behind by the train pass.

        Each dataset's feature is saved only when it is not already present.
        """
        # Work out which id is the training set and which is the test set.
        if not is_test_or_val_set(self.dataset_id):
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)
        else:
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id

        # --- Training pass -------------------------------------------------
        timestamp_feature = RawFeatureTweetTimestamp(train_dataset_id)
        train_engagers = MappedFeatureEngagerId(train_dataset_id)
        eng_col = train_engagers.feature_name

        timestamps_df = timestamp_feature.load_or_create()
        engagers_df = train_engagers.load_or_create()
        train_frame = pd.concat([timestamps_df, engagers_df], axis=1)
        train_frame.sort_values(timestamp_feature.feature_name, inplace=True)

        # One slot per engager id, indexed directly by the id.
        counters = np.zeros(train_frame[eng_col].max() + 1, dtype=int)

        previous_counts = []
        for engager_id in train_frame[eng_col]:
            previous_counts.append(find_and_increase(engager_id, counters))
        train_result = pd.DataFrame(previous_counts, index=train_frame.index)

        if not EngagerFeatureNumberOfPreviousEngagement(
                train_dataset_id).has_feature():
            # Restore the original row order before persisting.
            train_result.sort_index(inplace=True)
            EngagerFeatureNumberOfPreviousEngagement(
                train_dataset_id).save_feature(train_result)

        # --- Test pass -----------------------------------------------------
        if EngagerFeatureNumberOfPreviousEngagement(
                test_dataset_id).has_feature():
            return

        timestamp_feature = RawFeatureTweetTimestamp(test_dataset_id)
        test_engagers = MappedFeatureEngagerId(test_dataset_id)
        test_frame = pd.concat([
            timestamp_feature.load_or_create(),
            test_engagers.load_or_create(),
        ],
                               axis=1)

        # Engagers that only appear in the test set need a zeroed slot too.
        required_size = test_frame[test_engagers.feature_name].max() + 1
        if required_size > counters.size:
            counters = np.pad(counters,
                              pad_width=(0, required_size - counters.size),
                              mode='constant',
                              constant_values=0)

        def previous_count(engager_id):
            return counters[engager_id]

        test_result = pd.DataFrame(test_frame[eng_col].map(previous_count),
                                   index=test_frame.index)
        EngagerFeatureNumberOfPreviousEngagement(
            test_dataset_id).save_feature(test_result)
    def create_feature(self):
        """Compute and save the creator token-length feature for train and test.

        Training rows are visited in timestamp order. For each row
        ``find_ratio_and_update`` receives the creator id, the two per-creator
        accumulator arrays (one slot per creator id) and the tweet's entries
        from the token-length and unique-token-length dictionaries; the values
        it returns form the feature. The test rows are then processed the same
        way, continuing from the accumulator state left by the training pass.

        Each dataset's feature is saved only when it is not already present.
        """
        # Check if the dataset id is train or test
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            # Use the dataset this object was created for, not whatever set
            # the training id maps back to.
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        # Load features
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            train_dataset_id)
        creators_feature = MappedFeatureCreatorId(train_dataset_id)
        tweet_id_feature = MappedFeatureTweetId(train_dataset_id)

        # Save the column name
        creators_col = creators_feature.feature_name
        tweet_id_col = tweet_id_feature.feature_name

        length_dict = TweetTokenLengthFeatureDictArray().load_or_create()
        length_unique_dict = TweetTokenLengthUniqueFeatureDictArray(
        ).load_or_create()

        dataframe = pd.concat([
            creators_feature.load_or_create(),
            creation_timestamps_feature.load_or_create(),
            tweet_id_feature.load_or_create()
        ],
                              axis=1)

        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)

        # Accumulators indexed directly by creator id.
        number_of_creators = dataframe[creators_col].max() + 1
        creator_length_array = np.zeros(number_of_creators, dtype=int)
        creator_length_unique_array = np.zeros(number_of_creators, dtype=int)

        result = pd.DataFrame([
            find_ratio_and_update(
                creator_id, creator_length_array, creator_length_unique_array,
                length_dict[tweet_id], length_unique_dict[tweet_id])
            for creator_id, tweet_id in zip(dataframe[creators_col],
                                            dataframe[tweet_id_col])
        ],
                              index=dataframe.index)

        if not CreatorFrequencyUniqueTokens(train_dataset_id).has_feature():
            result.sort_index(inplace=True)
            CreatorFrequencyUniqueTokens(train_dataset_id).save_feature(result)

        if not CreatorFrequencyUniqueTokens(test_dataset_id).has_feature():

            # Load features
            creation_timestamps_feature = RawFeatureTweetTimestamp(
                test_dataset_id)
            creators_feature = MappedFeatureCreatorId(test_dataset_id)
            tweet_id_feature = MappedFeatureTweetId(test_dataset_id)

            # Save the column name
            creators_col = creators_feature.feature_name
            tweet_id_col = tweet_id_feature.feature_name

            dataframe = pd.concat([
                creation_timestamps_feature.load_or_create(),
                creators_feature.load_or_create(),
                tweet_id_feature.load_or_create(),
            ],
                                  axis=1)
            dataframe.sort_values(creation_timestamps_feature.feature_name,
                                  inplace=True)

            # If there are new creators in the test set, grow BOTH arrays.
            # Each array is padded from its own contents and its own size so
            # the sums accumulated during the training pass are preserved.
            required_size = dataframe[creators_col].max() + 1
            if required_size > creator_length_array.size:
                creator_length_array = np.pad(
                    creator_length_array,
                    pad_width=(0, required_size - creator_length_array.size),
                    mode='constant',
                    constant_values=0)
            if required_size > creator_length_unique_array.size:
                creator_length_unique_array = np.pad(
                    creator_length_unique_array,
                    pad_width=(0, required_size -
                               creator_length_unique_array.size),
                    mode='constant',
                    constant_values=0)

            result = pd.DataFrame([
                find_ratio_and_update(
                    creator_id, creator_length_array,
                    creator_length_unique_array, length_dict[tweet_id],
                    length_unique_dict[tweet_id]) for creator_id, tweet_id in
                zip(dataframe[creators_col], dataframe[tweet_id_col])
            ],
                                  index=dataframe.index)

            result.sort_index(inplace=True)

            CreatorFrequencyUniqueTokens(test_dataset_id).save_feature(result)
예제 #4
0
    def create_feature(self):
        """Create the per-creator engagement counter feature for train and test.

        Train rows are visited in timestamp order. When the row's engagement
        flag is truthy the creator id goes through ``find_and_increase``;
        otherwise the creator's current counter is read unchanged. Test rows
        only read the counters left by the train pass. The elapsed time is
        printed just before the test result is saved.
        """
        # Work out which id is the training set and which is the test set.
        if not is_test_or_val_set(self.dataset_id):
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)
        else:
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id

        start_time = time.time()

        # --- Training pass -------------------------------------------------
        timestamp_feature = RawFeatureTweetTimestamp(train_dataset_id)
        creator_feature = MappedFeatureCreatorId(train_dataset_id)
        label_feature = self._get_engagement_feature(train_dataset_id)

        creator_id_col = creator_feature.feature_name
        engagement_col = label_feature.feature_name

        timestamps_df = timestamp_feature.load_or_create()
        creators_df = creator_feature.load_or_create()
        labels_df = label_feature.load_or_create()
        train_frame = pd.concat([timestamps_df, creators_df, labels_df],
                                axis=1)
        train_frame.sort_values(timestamp_feature.feature_name, inplace=True)

        # One slot per creator id, indexed directly by the id.
        creator_counter_array = np.zeros(train_frame[creator_id_col].max() + 1,
                                         dtype=int)

        train_values = []
        for creator_id, engagement in zip(train_frame[creator_id_col],
                                          train_frame[engagement_col]):
            if engagement:
                value = find_and_increase(creator_id=creator_id,
                                          counter_array=creator_counter_array)
            else:
                value = creator_counter_array[creator_id]
            train_values.append(value)

        train_result = pd.DataFrame(train_values, index=train_frame.index)
        self._save_train_result_if_not_present(train_result, train_dataset_id)

        # --- Test pass -----------------------------------------------------
        if self._exists_test_feature(test_dataset_id):
            return

        timestamp_feature = RawFeatureTweetTimestamp(test_dataset_id)
        creator_feature = MappedFeatureCreatorId(test_dataset_id)
        creator_id_col = creator_feature.feature_name

        creators_df = creator_feature.load_or_create()
        timestamps_df = timestamp_feature.load_or_create()
        test_frame = pd.concat([creators_df, timestamps_df], axis=1)
        test_frame.sort_values(timestamp_feature.feature_name, inplace=True)

        # Creators that only appear in the test set need a zeroed slot too.
        required_size = test_frame[creator_id_col].max() + 1
        if required_size > creator_counter_array.size:
            creator_counter_array = np.pad(
                creator_counter_array,
                pad_width=(0, required_size - creator_counter_array.size),
                mode='constant',
                constant_values=0)

        def current_count(creator_id):
            return creator_counter_array[creator_id]

        test_result = pd.DataFrame(
            test_frame[creator_id_col].map(current_count),
            index=test_frame.index)
        test_result.sort_index(inplace=True)

        print("time:")
        print(time.time() - start_time)

        self._save_test_result(test_result, test_dataset_id)
def analyze_cold_creator(dataset_id: str):
    """Print cold-start statistics about the creators of a test set.

    A user is "known" when it appears as creator or engager in a row of the
    training set selected by the ``TweetFeatureEngagementIsPositive`` mask.
    A test creator is "cold" when it is not a known user.

    Args:
        dataset_id: id of the training set; the paired test/validation set is
            resolved with ``get_test_or_val_set_id_from_train``.

    Prints the number of unique and cold test creators, the number of test
    rows, how many test rows belong to cold creators, the largest number of
    rows of a single cold creator, and the share of test rows that belong to
    cold creators.
    """
    train_dataset = dataset_id
    test_dataset = get_test_or_val_set_id_from_train(dataset_id)

    is_positive = TweetFeatureEngagementIsPositive(train_dataset)
    positive_mask = is_positive.load_or_create()[is_positive.feature_name]

    train_creator_id = MappedFeatureCreatorId(train_dataset)
    train_engager_id = MappedFeatureEngagerId(train_dataset)

    train_creator_id_df = train_creator_id.load_or_create()[positive_mask]
    train_engager_id_df = train_engager_id.load_or_create()[positive_mask]

    # Every user id seen (as creator or engager) in the selected train rows.
    known_users = pd.concat([
        train_creator_id_df[train_creator_id.feature_name],
        train_engager_id_df[train_engager_id.feature_name]
    ],
                            axis=0).unique()

    test_creator_id = MappedFeatureCreatorId(test_dataset)
    test_creators = test_creator_id.load_or_create()[
        test_creator_id.feature_name]

    total_number_of_engagements = len(test_creators)

    # Count rows per creator on the raw column: counting after
    # de-duplication would give 1 for every creator.
    engagements_per_creator = test_creators.value_counts()
    is_cold = ~engagements_per_creator.index.isin(known_users)
    cold_counts = engagements_per_creator[is_cold]

    cold_engagements = cold_counts.sum()
    if total_number_of_engagements:
        cold_probability = cold_engagements / total_number_of_engagements
    else:
        # Undefined on an empty test set; avoid a ZeroDivisionError.
        cold_probability = float("nan")

    print("------------------------")
    print(dataset_id)
    print(f"Unique test creators are: {len(engagements_per_creator)}")
    print(f"Unique test cold creators are: {len(cold_counts)}")
    print(f"Engagements in test set are: {total_number_of_engagements}")
    print(
        f"Engagements of cold users in test set are: {cold_engagements}"
    )
    print(
        f"Max number of engagement per cold creator in test set are: {cold_counts.max()}"
    )
    print(
        f"Probability that an engagement is engaged by a cold creators: {cold_probability}"
    )
    print("------------------------")
    def create_feature(self):
        """Create the engager/language negative-engagement counter feature.

        Train rows are visited in timestamp order. When the row's
        ``TweetFeatureEngagementIsNegative`` flag is truthy the row goes
        through ``find_and_increase_engager``; otherwise the current value
        stored under ``(engager id, language)`` is read, defaulting to 0.

        Test rows carry no engagement label, so they only read the value
        stored under ``(engager id, language)`` and never modify the
        dictionary.

        Each dataset's feature is saved only when it is not already present.
        """
        # Check if the dataset id is train or test
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        # Load features
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            train_dataset_id)
        engagers_feature = MappedFeatureEngagerId(train_dataset_id)
        creators_feature = MappedFeatureCreatorId(train_dataset_id)
        language_feature = MappedFeatureTweetLanguage(train_dataset_id)
        engagement_feature = TweetFeatureEngagementIsNegative(train_dataset_id)

        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            engagers_feature.load_or_create(),
            engagement_feature.load_or_create(),
            creators_feature.load_or_create(),
            language_feature.load_or_create()
        ],
                              axis=1)

        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)

        # KEY: a tuple (engager, language) - this is the key read below;
        #      find_and_increase_engager is presumed to write the same key
        #      (TODO confirm against the helper).
        # VALUE: the counter for that pair.
        # If key does not exist -> 0.
        engagement_dict = {}

        result = pd.DataFrame([
            find_and_increase_engager(eng_id, cre_id, lang, engagement_dict)
            if engagement else engagement_dict.get((eng_id, lang), 0)
            for eng_id, cre_id, lang, engagement in zip(
                dataframe[engagers_feature.feature_name], dataframe[
                    creators_feature.feature_name], dataframe[
                        language_feature.feature_name], dataframe[
                            engagement_feature.feature_name])
        ],
                              index=dataframe.index)

        if not EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                train_dataset_id).has_feature():
            result.sort_index(inplace=True)
            EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                train_dataset_id).save_feature(result)
        if not EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                test_dataset_id).has_feature():
            # Load features. A read-only lookup depends neither on the row
            # order nor on the creator, so only these two columns are needed.
            engagers_feature = MappedFeatureEngagerId(test_dataset_id)
            language_feature = MappedFeatureTweetLanguage(test_dataset_id)

            dataframe = pd.concat([
                engagers_feature.load_or_create(),
                language_feature.load_or_create()
            ],
                                  axis=1)

            # Same (engager, language) key as the train pass, read-only:
            # unlabeled test rows must not be counted as engagements.
            result = pd.DataFrame([
                engagement_dict.get((eng_id, lang), 0)
                for eng_id, lang in zip(
                    dataframe[engagers_feature.feature_name],
                    dataframe[language_feature.feature_name])
            ],
                                  index=dataframe.index)
            result.sort_index(inplace=True)

            EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                test_dataset_id).save_feature(result)
    def create_feature(self):
        """Create the engager grouped-language feature for train and test.

        Train rows are visited in timestamp order and each one is handed to
        ``find_and_increase_engager`` with its engagement flag and a
        ``(max user id + 1, 70)`` ``uint16`` counter matrix (70 is presumably
        an upper bound on the number of grouped languages - confirm). Test
        rows go through the same helper with the flag fixed to ``False``.

        Each dataset's feature is saved only when it is not already present.
        """
        # Work out which id is the training set and which is the test set.
        if not is_test_or_val_set(self.dataset_id):
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)
        else:
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id

        # --- Training pass -------------------------------------------------
        timestamp_feature = RawFeatureTweetTimestamp(train_dataset_id)
        creator_feature = MappedFeatureCreatorId(train_dataset_id)
        engager_feature = MappedFeatureEngagerId(train_dataset_id)
        lang_feature = MappedFeatureGroupedTweetLanguage(train_dataset_id)
        label_feature = TweetFeatureEngagementIsPositive(train_dataset_id)

        timestamps_df = timestamp_feature.load_or_create()
        creators_df = creator_feature.load_or_create()
        engagers_df = engager_feature.load_or_create()
        languages_df = lang_feature.load_or_create()
        labels_df = label_feature.load_or_create()
        train_frame = pd.concat(
            [timestamps_df, creators_df, engagers_df, languages_df, labels_df],
            axis=1)
        train_frame.sort_values(timestamp_feature.feature_name, inplace=True)

        counter_shape = (data.DataStats.get_max_user_id() + 1, 70)
        engager_counter_array = np.zeros(counter_shape, dtype=np.uint16)

        train_rows = zip(train_frame[engager_feature.feature_name],
                         train_frame[creator_feature.feature_name],
                         train_frame[lang_feature.feature_name],
                         train_frame[label_feature.feature_name])
        train_values = []
        for engager_id, creator_id, language, engagement in train_rows:
            train_values.append(
                find_and_increase_engager(engager_id, creator_id, language,
                                          engagement, engager_counter_array))
        train_result = pd.DataFrame(train_values, index=train_frame.index)

        if not EngagerMainGroupedLanguage(train_dataset_id).has_feature():
            # Restore the original row order before persisting.
            train_result.sort_index(inplace=True)
            EngagerMainGroupedLanguage(train_dataset_id).save_feature(
                train_result)

        # --- Test pass -----------------------------------------------------
        if EngagerMainGroupedLanguage(test_dataset_id).has_feature():
            return

        timestamp_feature = RawFeatureTweetTimestamp(test_dataset_id)
        creator_feature = MappedFeatureCreatorId(test_dataset_id)
        engager_feature = MappedFeatureEngagerId(test_dataset_id)
        lang_feature = MappedFeatureGroupedTweetLanguage(test_dataset_id)

        timestamps_df = timestamp_feature.load_or_create()
        creators_df = creator_feature.load_or_create()
        engagers_df = engager_feature.load_or_create()
        languages_df = lang_feature.load_or_create()
        test_frame = pd.concat(
            [timestamps_df, creators_df, engagers_df, languages_df], axis=1)
        test_frame.sort_values(timestamp_feature.feature_name, inplace=True)

        test_rows = zip(test_frame[engager_feature.feature_name],
                        test_frame[creator_feature.feature_name],
                        test_frame[lang_feature.feature_name])
        test_values = []
        for engager_id, creator_id, language in test_rows:
            # The engagement flag is always False for test rows.
            test_values.append(
                find_and_increase_engager(engager_id, creator_id, language,
                                          False, engager_counter_array))
        test_result = pd.DataFrame(test_values, index=test_frame.index)
        test_result.sort_index(inplace=True)

        EngagerMainGroupedLanguage(test_dataset_id).save_feature(test_result)