def create_feature(self):
    """Create the engagement feature via fold-out predictions.

    For a test/val dataset id: fit on the positive train interactions and
    score the test set directly. For a train dataset id: split the train
    set into folds and score each fold with the remaining folds, so every
    row gets an out-of-fold prediction.
    """
    # Resolve the (train, test) dataset-id pair for self.dataset_id.
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(train_dataset_id)
    # Local import, presumably to avoid a circular dependency at module load.
    import Utils.Data.Data as data
    train_df = data.get_dataset([
        "mapped_feature_creator_id",
        "mapped_feature_engager_id",
        f"tweet_feature_engagement_is_{self._get_suffix()}"
    ], train_dataset_id)
    if is_test_or_val_set(self.dataset_id):
        test_df = data.get_dataset(
            ["mapped_feature_creator_id", "mapped_feature_engager_id"],
            test_dataset_id)
        # Keep only the positive interactions of the tracked engagement kind.
        train_df = train_df[
            train_df[f"tweet_feature_engagement_is_{self._get_suffix()}"] == True]
        res = compute(train_df, test_df)
        res.sort_index(inplace=True)
        self._save_test_result(res, test_dataset_id)
    else:
        # Shuffle, then split the train set into folds.
        X_train_folds = np.array_split(train_df.sample(frac=1),
                                       self.number_of_folds)
        partial_results = []
        for i in range(self.number_of_folds):
            # BUG FIX: the fold exclusion previously used `x is not i`
            # (identity comparison), which only happens to work for small
            # interned ints; use value inequality instead.
            local_train = pd.concat([
                X_train_folds[x]
                for x in range(self.number_of_folds)
                if x != i
            ])
            # Fit only on positive interactions; score the held-out fold.
            local_train = local_train[local_train[
                f"tweet_feature_engagement_is_{self._get_suffix()}"] == True]
            local_test = X_train_folds[i]
            partial_results.append(compute(local_train, local_test))
        # Concatenate once at the end instead of re-concatenating inside the
        # loop (avoids quadratic copying).
        result = pd.concat(partial_results)
        self._save_train_result_if_not_present(result, train_dataset_id)
def create_feature(self):
    """Count, per engager, the number of reply engagements in the train data.

    The counts are always learned from the train set: for a test/val dataset
    id they come from the corresponding train set; for a train dataset id
    from the dataset itself. Each row of the current dataset is then mapped
    to its engager's count (0 for unseen engagers).
    """
    # Pick the dataset the counts are learned from.
    if is_test_or_val_set(self.dataset_id):
        source_dataset_id = get_train_set_id_from_test_or_val_set(self.dataset_id)
    else:
        source_dataset_id = self.dataset_id

    engager_id_feature = MappedFeatureEngagerId(source_dataset_id)
    engagement_feature = TweetFeatureEngagementIsReply(source_dataset_id)
    engager_id_df = engager_id_feature.load_or_create()
    engagement_df = engagement_feature.load_or_create()
    dataframe = pd.concat([
        engager_id_df,
        engagement_df,
    ], axis=1)
    # Keep only the rows where the engagement actually happened.
    dataframe = dataframe[dataframe[engagement_feature.feature_name]]
    # engager_id -> number of reply engagements.
    counts_df = pd.DataFrame({
        self.feature_name:
            dataframe.groupby(engager_id_feature.feature_name).size()
    })
    dictionary = counts_df.to_dict()[self.feature_name]

    # Map the counts onto the engagers of the current dataset. When the
    # source IS the current dataset, reuse the dataframe already loaded.
    if source_dataset_id == self.dataset_id:
        target_engager_id_df = engager_id_df
    else:
        target_engager_id_df = MappedFeatureEngagerId(self.dataset_id).load_or_create()
    engagement_count_df = pd.DataFrame(
        target_engager_id_df[engager_id_feature.feature_name].map(
            lambda x: dictionary.get(x, 0)))
    self.save_feature(engagement_count_df)
def create_feature(self):
    """Flag tweets whose hashtags are discriminative for the 'comment' class.

    Produces two int (0/1) columns: one for hashtags discriminative of the
    positive class and one for the negative class.
    """
    kind = "comment"
    # Load the hashtags column.
    feature = MappedFeatureTweetHashtags(self.dataset_id)
    feature_df = feature.load_or_create()
    # Load the discriminative hashtag lists: computed from the data for a
    # train set, loaded from the precomputed lists for a test/val set.
    if not is_test_or_val_set(self.dataset_id):
        kind_pos, kind_neg = self.loadDiscriminative(
            kind, self.dataset_id, feature_df, feature.feature_name, 3, 3)
    else:
        # BUG FIX: this was `elif is_test_or_val_set(...)`, which re-evaluated
        # the predicate and left a (theoretical) path where kind_pos/kind_neg
        # were never bound; a plain `else` is exhaustive.
        kind_pos, kind_neg = loadPosAndNegLists(kind)
    # Create the feature: rows with no hashtags (None) are always False.
    kind_disc_df = pd.DataFrame()
    kind_disc_df[self.feature_name + "pos"] = feature_df[feature.feature_name].progress_map(
        lambda x: containsHashtag(x, kind_pos) if x is not None else False)
    kind_disc_df[self.feature_name + "neg"] = feature_df[feature.feature_name].progress_map(
        lambda x: containsHashtag(x, kind_neg) if x is not None else False)
    # Store as 0/1 ints rather than booleans.
    kind_disc_df = kind_disc_df.astype(int)
    self.save_feature(kind_disc_df)
def create_feature(self):
    """For each interaction, count the engager's previous engagements.

    Builds a running per-engager counter over the train set in timestamp
    order (each row sees the count *before* itself — presumably
    `find_and_increase` returns the old value then increments; confirm in
    the helper), then maps the final counters onto the test set.
    """
    # Check if the dataset id is train or test
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)
    # Load features
    creation_timestamps_feature = RawFeatureTweetTimestamp(
        train_dataset_id)
    engagers_feature = MappedFeatureEngagerId(train_dataset_id)
    # Save the column name
    eng_col = engagers_feature.feature_name
    dataframe = pd.concat([
        creation_timestamps_feature.load_or_create(),
        engagers_feature.load_or_create(),
    ], axis=1)
    # Chronological order is essential: the counter array below is mutated
    # row by row, so the iteration order defines the feature values.
    dataframe.sort_values(creation_timestamps_feature.feature_name,
                          inplace=True)
    # One slot per engager id (ids are assumed dense, 0..max).
    engager_counter_array = np.zeros(
        dataframe[engagers_feature.feature_name].max() + 1, dtype=int)
    result = pd.DataFrame([
        find_and_increase(engager_id, engager_counter_array)
        for engager_id in dataframe[eng_col]
    ], index=dataframe.index)
    if not EngagerFeatureNumberOfPreviousEngagement(
            train_dataset_id).has_feature():
        # Restore original row order before persisting.
        result.sort_index(inplace=True)
        EngagerFeatureNumberOfPreviousEngagement(
            train_dataset_id).save_feature(result)
    if not EngagerFeatureNumberOfPreviousEngagement(
            test_dataset_id).has_feature():
        # Load features
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            test_dataset_id)
        engagers_feature = MappedFeatureEngagerId(test_dataset_id)
        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            engagers_feature.load_or_create(),
        ], axis=1)
        # Engagers unseen in train would index past the end of the counter
        # array: grow it with zeros so they map to a count of 0.
        if dataframe[engagers_feature.feature_name].max(
        ) + 1 > engager_counter_array.size:
            engager_counter_array = np.pad(
                engager_counter_array,
                pad_width=(0,
                           dataframe[engagers_feature.feature_name].max() +
                           1 - engager_counter_array.size),
                mode='constant',
                constant_values=0)
        # Test rows get the engager's final train-time count (read-only).
        result = pd.DataFrame(
            dataframe[eng_col].map(lambda x: engager_counter_array[x]),
            index=dataframe.index)
        EngagerFeatureNumberOfPreviousEngagement(
            test_dataset_id).save_feature(result)
def create_feature(self):
    """Create XGB ensembling predictions for this dataset.

    Train dataset: 4-fold out-of-fold scheme — each fold is scored by a
    model trained on a bootstrap subsample of the other folds, so every
    row receives a prediction from a model that did not see it.
    Test/val dataset: a single model trained on a 5% sample of the train
    set scores the whole test set.
    """
    if not is_test_or_val_set(self.dataset_id):
        train_dataset_id = self.dataset_id
        # Local import, presumably to avoid a circular dependency at module load.
        import Utils.Data.Data as data
        X_train = data.get_dataset(features=self.features,
                                   dataset_id=train_dataset_id,
                                   nthread=64)
        Y_train = data.get_dataset(features=self.label,
                                   dataset_id=train_dataset_id,
                                   nthread=64)
        # Out-of-fold score dataframes, one per fold.
        scores = []
        kf = KFold(n_splits=4, shuffle=True, random_state=8)
        for train_index, test_index in kf.split(X_train):
            # Bootstrap-subsample 1/20 of the training fold (with replacement).
            # NOTE(review): np.random.choice is unseeded, so the fold training
            # subsets are not reproducible between runs — confirm intended.
            train_index = np.random.choice(train_index,
                                           int(len(train_index) / 20),
                                           replace=True)
            local_X_train = X_train.iloc[train_index]
            local_Y_train = Y_train.iloc[train_index]
            # The held-out fold to be scored.
            local_X_test = X_train.iloc[test_index]
            # Unique id per fold so each sub-feature is cached separately.
            fold_dataset_id = f"{self.feature_name}_{self.dataset_id}_fold_{len(scores)}"
            feature = XGBEnsembling(fold_dataset_id, local_X_train,
                                    local_Y_train, local_X_test,
                                    self.param_dict)
            scores.append(
                pd.DataFrame(feature.load_or_create(),
                             index=local_X_test.index))
        # Reassemble the folds in original row order and persist.
        result = pd.concat(scores).sort_index()
        self.save_feature(result)
    else:
        test_dataset_id = self.dataset_id
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            test_dataset_id)
        import Utils.Data.Data as data
        # A 5% sample of the train set keeps memory bounded for the model.
        X_train = data.get_dataset_batch(features=self.features,
                                         dataset_id=train_dataset_id,
                                         total_n_split=1,
                                         split_n=0,
                                         sample=0.05)
        Y_train = data.get_dataset_batch(features=self.label,
                                         dataset_id=train_dataset_id,
                                         total_n_split=1,
                                         split_n=0,
                                         sample=0.05)
        X_test = data.get_dataset(features=self.features,
                                  dataset_id=test_dataset_id,
                                  nthread=64)
        fold_dataset_id = f"{self.feature_name}_{self.dataset_id}"
        feature = XGBEnsembling(fold_dataset_id, X_train, Y_train, X_test,
                                self.param_dict)
        result = pd.DataFrame(feature.load_or_create(), index=X_test.index)
        self.save_feature(result)
def create_feature(self):
    """Per-creator running token-length statistics (unique vs total ratio).

    Iterates the train set chronologically, letting `find_ratio_and_update`
    maintain two per-creator accumulator arrays, then replays the final
    state over the test set.
    """
    # Resolve the (train, test) dataset-id pair for self.dataset_id.
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        # BUG FIX: this was get_test_or_val_set_id_from_train(train_dataset_id),
        # which can map a val set to its sibling test set; the current
        # dataset IS the test/val set (consistent with the sibling features).
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)
    # Load features
    creation_timestamps_feature = RawFeatureTweetTimestamp(
        train_dataset_id)
    creators_feature = MappedFeatureCreatorId(train_dataset_id)
    tweet_id_feature = MappedFeatureTweetId(train_dataset_id)
    # Save the column names
    creators_col = creators_feature.feature_name
    tweet_id_col = tweet_id_feature.feature_name
    # tweet_id -> token length lookup tables (shared across datasets).
    length_dict = TweetTokenLengthFeatureDictArray().load_or_create()
    length_unique_dict = TweetTokenLengthUniqueFeatureDictArray(
    ).load_or_create()
    dataframe = pd.concat([
        creators_feature.load_or_create(),
        creation_timestamps_feature.load_or_create(),
        tweet_id_feature.load_or_create()
    ], axis=1)
    # Chronological order matters: the accumulator arrays are updated row by row.
    dataframe.sort_values(creation_timestamps_feature.feature_name,
                          inplace=True)
    creator_length_array = np.zeros(
        dataframe[creators_col].max() + 1, dtype=int)
    creator_length_unique_array = np.zeros(
        dataframe[creators_col].max() + 1, dtype=int)
    result = pd.DataFrame([
        find_ratio_and_update(
            creator_id, creator_length_array, creator_length_unique_array,
            length_dict[tweet_id], length_unique_dict[tweet_id])
        for creator_id, tweet_id in zip(dataframe[creators_col],
                                        dataframe[tweet_id_col])
    ], index=dataframe.index)
    if not CreatorFrequencyUniqueTokens(train_dataset_id).has_feature():
        result.sort_index(inplace=True)
        CreatorFrequencyUniqueTokens(train_dataset_id).save_feature(result)
    if not CreatorFrequencyUniqueTokens(test_dataset_id).has_feature():
        # Load the test-side features.
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            test_dataset_id)
        creators_feature = MappedFeatureCreatorId(test_dataset_id)
        tweet_id_feature = MappedFeatureTweetId(test_dataset_id)
        creators_col = creators_feature.feature_name
        tweet_id_col = tweet_id_feature.feature_name
        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            creators_feature.load_or_create(),
            tweet_id_feature.load_or_create(),
        ], axis=1)
        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)
        # If there are new creators in the test set, pad the arrays.
        if dataframe[creators_col].max() + 1 > creator_length_array.size:
            creator_length_array = np.pad(
                creator_length_array,
                pad_width=(0, dataframe[creators_col].max() + 1 -
                           creator_length_array.size),
                mode='constant',
                constant_values=0)
            # BUG FIX: this previously padded creator_length_array a second
            # time (copy-paste), leaving creator_length_unique_array at its
            # old size — out-of-range creator ids would raise / corrupt state.
            creator_length_unique_array = np.pad(
                creator_length_unique_array,
                pad_width=(0, dataframe[creators_col].max() + 1 -
                           creator_length_unique_array.size),
                mode='constant',
                constant_values=0)
        result = pd.DataFrame([
            find_ratio_and_update(
                creator_id, creator_length_array,
                creator_length_unique_array, length_dict[tweet_id],
                length_unique_dict[tweet_id])
            for creator_id, tweet_id in zip(dataframe[creators_col],
                                            dataframe[tweet_id_col])
        ], index=dataframe.index)
        result.sort_index(inplace=True)
        CreatorFrequencyUniqueTokens(test_dataset_id).save_feature(result)
def create_feature(self):
    """For each interaction, count the creator's previous engagements.

    Iterates the train set in timestamp order maintaining a per-creator
    counter (incremented only when the engagement flag is set), then maps
    the final counters onto the test set.
    """
    # Resolve the (train, test) dataset-id pair for self.dataset_id.
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)
    # Load features
    creation_timestamps_feature = RawFeatureTweetTimestamp(
        train_dataset_id)
    creator_id_feature = MappedFeatureCreatorId(train_dataset_id)
    engagement_feature = self._get_engagement_feature(train_dataset_id)
    # Save column names
    creator_id_col = creator_id_feature.feature_name
    engagement_col = engagement_feature.feature_name
    dataframe = pd.concat([
        creation_timestamps_feature.load_or_create(),
        creator_id_feature.load_or_create(),
        engagement_feature.load_or_create()
    ], axis=1)
    # Chronological order matters: the counter array is mutated row by row.
    dataframe.sort_values(creation_timestamps_feature.feature_name,
                          inplace=True)
    creator_counter_array = np.zeros(dataframe[creator_id_col].max() + 1,
                                     dtype=int)
    # Engaged rows increment the creator's counter; others just read it.
    result = pd.DataFrame([
        find_and_increase(creator_id=creator_id,
                          counter_array=creator_counter_array)
        if engagement else creator_counter_array[creator_id]
        for creator_id, engagement in zip(dataframe[creator_id_col],
                                          dataframe[engagement_col])
    ], index=dataframe.index)
    self._save_train_result_if_not_present(result, train_dataset_id)
    if not self._exists_test_feature(test_dataset_id):
        # Load the test-side features.
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            test_dataset_id)
        creator_id_feature = MappedFeatureCreatorId(test_dataset_id)
        creator_id_col = creator_id_feature.feature_name
        dataframe = pd.concat([
            creator_id_feature.load_or_create(),
            creation_timestamps_feature.load_or_create(),
        ], axis=1)
        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)
        # Creators unseen in train map to a count of 0 via zero padding.
        if dataframe[creator_id_col].max(
        ) + 1 > creator_counter_array.size:
            creator_counter_array = np.pad(
                creator_counter_array,
                pad_width=(0, dataframe[creator_id_col].max() + 1 -
                           creator_counter_array.size),
                mode='constant',
                constant_values=0)
        result = pd.DataFrame(dataframe[creator_id_col].map(
            lambda x: creator_counter_array[x]),
                              index=dataframe.index)
        result.sort_index(inplace=True)
        self._save_test_result(result, test_dataset_id)
def create_feature(self):
    """Count previous negative engagements of an engager with a language.

    Builds `engagement_dict` over the train set in timestamp order via
    `find_and_increase_engager`, then replays it over the test set via
    `find_and_increase_creator` (helpers defined elsewhere).
    """
    # Check if the dataset id is train or test
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)
    # Load features
    creation_timestamps_feature = RawFeatureTweetTimestamp(
        train_dataset_id)
    engagers_feature = MappedFeatureEngagerId(train_dataset_id)
    creators_feature = MappedFeatureCreatorId(train_dataset_id)
    language_feature = MappedFeatureTweetLanguage(train_dataset_id)
    engagement_feature = TweetFeatureEngagementIsNegative(train_dataset_id)
    dataframe = pd.concat([
        creation_timestamps_feature.load_or_create(),
        engagers_feature.load_or_create(),
        engagement_feature.load_or_create(),
        creators_feature.load_or_create(),
        language_feature.load_or_create()
    ], axis=1)
    # Chronological order matters: engagement_dict is mutated row by row.
    dataframe.sort_values(creation_timestamps_feature.feature_name,
                          inplace=True)
    # NOTE(review): the fallback below reads key (eng_id, lang), so the dict
    # key is presumably (engager, language) -> count, not (creator, engager)
    # as a previous comment here claimed — confirm against
    # find_and_increase_engager's implementation.
    engagement_dict = {}
    result = pd.DataFrame([
        find_and_increase_engager(eng_id, cre_id, lang, engagement_dict)
        if engagement else engagement_dict.get((eng_id, lang), 0)
        for eng_id, cre_id, lang, engagement in zip(
            dataframe[engagers_feature.feature_name], dataframe[
                creators_feature.feature_name], dataframe[
                    language_feature.feature_name], dataframe[
                        engagement_feature.feature_name])
    ], index=dataframe.index)
    if not EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
            train_dataset_id).has_feature():
        # Restore original row order before persisting.
        result.sort_index(inplace=True)
        EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
            train_dataset_id).save_feature(result)
    if not EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
            test_dataset_id).has_feature():
        # Load features
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            test_dataset_id)
        engagers_feature = MappedFeatureEngagerId(test_dataset_id)
        language_feature = MappedFeatureTweetLanguage(test_dataset_id)
        creators_feature = MappedFeatureCreatorId(test_dataset_id)
        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            engagers_feature.load_or_create(),
            creators_feature.load_or_create(),
            language_feature.load_or_create()
        ], axis=1)
        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)
        # NOTE(review): the test pass calls find_and_increase_creator, which
        # by its name may further mutate engagement_dict — verify it is a
        # read-only lookup, otherwise test rows leak into the state.
        result = pd.DataFrame([
            find_and_increase_creator(eng_id, cre_id, lang, engagement_dict)
            for eng_id, cre_id, lang in zip(
                dataframe[engagers_feature.feature_name], dataframe[
                    creators_feature.feature_name], dataframe[
                        language_feature.feature_name])
        ], index=dataframe.index)
        result.sort_index(inplace=True)
        EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
            test_dataset_id).save_feature(result)
def create_feature(self):
    """Flag whether the engager is known to speak the tweet's language.

    A (user, language) pair counts as "known" if the user appears as
    creator or engager in at least one positively-engaged (like) train
    interaction in that language. Each row of the current dataset is then
    looked up by its (engager, language) pair; unknown pairs map to False.
    """
    def _known_language_dict(dataset_id):
        """Build the (user, language) -> known mapping from dataset_id.

        Returns (dictionary_user, engager_col, language_col, dataframe),
        where dataframe is the full concatenated frame of dataset_id.
        """
        creator_id_feature = MappedFeatureCreatorId(dataset_id)
        engager_id_feature = MappedFeatureEngagerId(dataset_id)
        language_id_feature = MappedFeatureTweetLanguage(dataset_id)
        engagement_feature = TweetFeatureEngagementIsLike(dataset_id)
        dataframe = pd.concat([
            creator_id_feature.load_or_create(),
            engager_id_feature.load_or_create(),
            language_id_feature.load_or_create(),
            engagement_feature.load_or_create()
        ], axis=1)
        # Keep only positive (liked) interactions.
        positive_dataframe = dataframe[dataframe[
            engagement_feature.feature_name]]
        language_col = language_id_feature.feature_name
        engagement_col = engagement_feature.feature_name

        def _first_per_pair(user_col):
            # One entry per (user, language) pair seen in positive rows.
            grouped = pd.DataFrame(positive_dataframe[[
                user_col, language_col, engagement_col
            ]].groupby([user_col, language_col]).first())
            grouped.columns = ['users']
            return grouped.to_dict()['users']

        # Merge the creator-side and engager-side views of "knows language".
        dictionary_user = {
            **_first_per_pair(creator_id_feature.feature_name),
            **_first_per_pair(engager_id_feature.feature_name),
        }
        return (dictionary_user, engager_id_feature.feature_name,
                language_col, dataframe)

    if is_test_or_val_set(self.dataset_id):
        # Learn the dictionary from the corresponding train set, then look
        # up the test rows.
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        dictionary_user, engager_col, language_col, _ = \
            _known_language_dict(train_dataset_id)
        target_dataframe = pd.concat([
            MappedFeatureEngagerId(self.dataset_id).load_or_create(),
            MappedFeatureTweetLanguage(self.dataset_id).load_or_create()
        ], axis=1)
    else:
        # Learn the dictionary from this dataset and look up its own rows.
        dictionary_user, engager_col, language_col, target_dataframe = \
            _known_language_dict(self.dataset_id)

    # Apply the dictionary: unknown (engager, language) pairs -> False.
    result_df = pd.DataFrame(target_dataframe[[
        engager_col, language_col
    ]].apply(lambda x: dictionary_user.get((x[0], x[1]), False), axis=1))
    self.save_feature(result_df)
def create_feature(self):
    """Track the engager's main (grouped) tweet language.

    Maintains a per-engager histogram over 70 grouped languages, updated in
    timestamp order by `find_and_increase_engager` (defined elsewhere) over
    the train set, then replayed over the test set with engagement=False —
    presumably meaning "read without counting"; confirm in the helper.
    """
    # Check if the dataset id is train or test
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)
    # Load features
    creation_timestamps_feature = RawFeatureTweetTimestamp(
        train_dataset_id)
    creators_feature = MappedFeatureCreatorId(train_dataset_id)
    engagers_feature = MappedFeatureEngagerId(train_dataset_id)
    language_feature = MappedFeatureGroupedTweetLanguage(train_dataset_id)
    engagement_feature = TweetFeatureEngagementIsPositive(train_dataset_id)
    dataframe = pd.concat([
        creation_timestamps_feature.load_or_create(),
        creators_feature.load_or_create(),
        engagers_feature.load_or_create(),
        language_feature.load_or_create(),
        engagement_feature.load_or_create()
    ], axis=1)
    # Chronological order matters: the counter matrix is mutated row by row.
    dataframe.sort_values(creation_timestamps_feature.feature_name,
                          inplace=True)
    # Rows: all user ids; columns: 70 grouped languages. uint16 keeps the
    # (max_user_id+1) x 70 matrix compact; counts above 65535 would wrap.
    engager_counter_array = np.zeros(
        (data.DataStats.get_max_user_id() + 1, 70), dtype=np.uint16)
    result = pd.DataFrame([
        find_and_increase_engager(engager_id, creator_id, language,
                                  engagement, engager_counter_array)
        for engager_id, creator_id, language, engagement in zip(
            dataframe[engagers_feature.feature_name], dataframe[
                creators_feature.feature_name], dataframe[
                    language_feature.feature_name], dataframe[
                        engagement_feature.feature_name])
    ], index=dataframe.index)
    if not EngagerMainGroupedLanguage(train_dataset_id).has_feature():
        # Restore original row order before persisting.
        result.sort_index(inplace=True)
        EngagerMainGroupedLanguage(train_dataset_id).save_feature(result)
    if not EngagerMainGroupedLanguage(test_dataset_id).has_feature():
        # Load features
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            test_dataset_id)
        creators_feature = MappedFeatureCreatorId(test_dataset_id)
        engagers_feature = MappedFeatureEngagerId(test_dataset_id)
        language_feature = MappedFeatureGroupedTweetLanguage(
            test_dataset_id)
        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            creators_feature.load_or_create(),
            engagers_feature.load_or_create(),
            language_feature.load_or_create()
        ], axis=1)
        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)
        # Test pass: engagement is hard-coded False so the train-time state
        # is only read, not updated (see note in the docstring).
        result = pd.DataFrame([
            find_and_increase_engager(engager_id, creator_id, language,
                                      False, engager_counter_array)
            for engager_id, creator_id, language in zip(
                dataframe[engagers_feature.feature_name], dataframe[
                    creators_feature.feature_name], dataframe[
                        language_feature.feature_name])
        ], index=dataframe.index)
        result.sort_index(inplace=True)
        EngagerMainGroupedLanguage(test_dataset_id).save_feature(result)