def __init__(self, label_file_path):
    """Set up config, ground-truth labels, and the ensembling model.

    :param label_file_path: path to the label file; converted to a
        numpy array via ``path_to_numpy_array``.
    """
    self.global_config = StaticConfig()
    # Materialize the labels once at construction time.
    self.label = self.path_to_numpy_array(label_file_path)
    self.existing_predicts = []
    self.model = FeedforwardEnsemblingModel()
def __init__(self):
    """Create the static and dynamic configuration objects."""
    self.global_config = StaticConfig()
    self.dynamic_config = DynamicConfig()
def __init__(self):
    """Hold one static and one dynamic configuration instance."""
    self.global_config = StaticConfig()
    self.dynamic_config = DynamicConfig()
def __init__(self, tokenizer=None):
    """Store configuration plus an optional pre-fitted tokenizer.

    :param tokenizer: a fitted tokenizer, or None if one will be
        supplied later.
    """
    self.global_config = StaticConfig()
    self.tokenizer = tokenizer
def __init__(self):
    """Initialize with empty test data and no preprocessor yet."""
    self.x_test = None
    self.global_config = StaticConfig()
    self.preprocessor = None
def prepare_data_folder(self, train_input_path, output_folder_path,
                        train_test_factor=0.9, debug_factor=1.0):
    """Split the raw training CSV into per-label train folders plus a test file.

    The raw data is shuffled, a tokenizer is fitted on the comment text and
    pickled into *output_folder_path*, then the rows are divided into a train
    and a test portion by *train_test_factor*.  One sub-folder per label name
    in ``self.global_config.model_names`` ('toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate') is created, each receiving a copy of
    the training split; the test rows are written to ``test.csv``.

    :param train_input_path: file path of the original training CSV
    :param output_folder_path: output folder for test.csv, the pickled
        tokenizer, and the per-label training folders
    :param train_test_factor: ratio used to split train vs. test rows
    :param debug_factor: fraction of the raw data to use; 1.0 means no debug
    :return: None
    """
    print(
        "##################### preprocessor starts ########################"
    )
    create_folder(output_folder_path)
    label_cols = [
        'id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
        'insult', 'identity_hate'
    ]
    raw_data = pd.read_csv(train_input_path)
    # Shuffle so the train/test split is not order-dependent.
    raw_data = raw_data.sample(frac=1)

    self.tokenizer = text.Tokenizer(
        num_words=self.global_config.max_features)
    list_sentences_raw_data = raw_data["comment_text"].fillna(
        "CVxTz").values
    self.tokenizer.fit_on_texts(list(list_sentences_raw_data))
    # FIX: use a context manager — the original leaked the file handle
    # passed straight into pickle.dump.
    with open(
            '{}/{}'.format(output_folder_path,
                           self.global_config.tokenizer_save_name),
            "wb") as tokenizer_file:
        pickle.dump(self.tokenizer, tokenizer_file)

    train_data_size = int(
        raw_data.shape[0] * train_test_factor * debug_factor)
    train = pd.DataFrame(raw_data[:train_data_size], columns=label_cols)
    if self.global_config.use_raw_for_test:
        test = pd.DataFrame(raw_data)
    else:
        # In debug mode keep only the last 100 rows as a quick test set.
        test = pd.DataFrame(raw_data[train_data_size:]
                            if debug_factor == 1.0 else raw_data[-100:])
    test_name = "{}/{}".format(output_folder_path, "test.csv")
    test.to_csv(test_name)

    # CONSISTENCY FIX: the original created a throwaway local
    # `global_config = StaticConfig()` and mixed it with
    # `self.global_config`; use the instance attribute throughout.
    for label_name in self.global_config.model_names:
        label_output = output_folder_path + "/" + label_name
        create_folder(label_output)
        sub_train_output_file_path = '{}/tr_train_{}.csv'.format(
            label_output, label_name)
        train.to_csv(sub_train_output_file_path)  # , index=False)
        # BUG FIX: the original format string had a single placeholder for
        # two arguments, so the output file path was never printed.
        print('output train for No. {} subset to file {}'.format(
            label_name, sub_train_output_file_path))
# NOTE(review): this span duplicates the tail of prepare_data_folder above
# (train/test split, test.csv output, per-label train folders) fused with the
# script entry point.  It references `self`, `raw_data`, and `label_cols`
# which are undefined at top level, so it appears to be a repeated /
# mis-extracted chunk rather than intentional code — confirm against the
# original file before deduplicating.  Left byte-identical here.
train_data_size = int(raw_data.shape[0] * train_test_factor * debug_factor) train = pd.DataFrame(raw_data[:train_data_size], columns=label_cols) if self.global_config.use_raw_for_test: test = pd.DataFrame(raw_data) else: test = pd.DataFrame(raw_data[train_data_size:] if debug_factor == 1.0 else raw_data[-100:]) test_name = "{}/{}".format(output_folder_path, "test.csv") test.to_csv(test_name) for label_name in global_config.model_names: label_output = output_folder_path + "/" + label_name create_folder(label_output) sub_train_output_file_path = '{}/tr_train_{}.csv'.format( label_output, label_name) train.to_csv(sub_train_output_file_path) # , index=False) print('output train for No. {} subset to file '.format( label_name, sub_train_output_file_path)) if __name__ == "__main__": wrapper = SeqProcessor() wrapper.prepare_data_folder( './input/train.csv', './preprocessing_wrapper_demo_output', train_test_factor=StaticConfig().train_test_factor, debug_factor=1.0)
def __init__(self, label_file_path):
    """Load ground-truth labels from a CSV file.

    :param label_file_path: path to the CSV holding the labels.
    """
    self.global_config = StaticConfig()
    self.label = pd.read_csv(label_file_path)
def __init__(self):
    """Start with no data sets loaded and no preprocessor attached."""
    self.data_sets = []
    self.global_config = StaticConfig()
    self.preprocessor = None
def __init__(self):
    """Create the configuration and the predictor used for inference."""
    self.global_config = StaticConfig()
    self.predictor = Predictor()