def train_w2v_hyperparam():
    """ Build *ordered* bag of words from text corpus."""
    tc = TopCoder()
    req = tc.get_filtered_requirements() # no overview extraction
    sentences = [tokenize_str(remove_stop_words_from_str(remove_punctuation(remove_digits(r.lower())))) for cha_id, r in req.itertuples()]

    for epochs in range(5, 51, 5): # [5, 10, 15, ..., 45, 50]
        for window in range(5, 21, 5): # [5, 10, 15, 20]
            for init_lr in (0.025, 0.02, 0.01, 0.002): # candidate initial learning rates
                print('Hyper param:')
                pprint({'epochs': epochs, 'window': window, 'initial_learning_rate': init_lr})

                print('Training Word2Vec model', end='|', flush=True)
                model = Word2Vec(sentences=sentences, alpha=init_lr, window=window, min_count=10, iter=epochs, sg=1, hs=1, seed=42, min_alpha=2e-5, workers=8)
                
                print('Decomposing vectors', end='|', flush=True)
                vectors = np.asarray([model.wv[word] for word in model.wv.vocab])
                labels = np.asarray([word for word in model.wv.vocab])

                tsne = TSNE(n_components=2, init='pca', random_state=42, perplexity=50, n_iter=5000)
                reduced_vec = tsne.fit_transform(vectors)

                print('Saving decomposed vectors')
                fp = os.path.join(os.curdir, 'result', 'word2vec', f'w2v-epochs{epochs}-window{window}-init_lr{init_lr}.json')
                pd.DataFrame.from_dict({'label': labels, 'x': reduced_vec[:, 0], 'y': reduced_vec[:, 1]}, orient='columns').to_json(fp, orient='index')
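
# A minimal inspection sketch (not part of the original pipeline): load one of the t-SNE JSON
# files written above and scatter-plot the 2D word embedding. It assumes matplotlib is
# available; the hyperparameter values in the default arguments are only examples.
def plot_w2v_tsne_sketch(epochs=10, window=5, init_lr=0.002):
    """ Load one decomposed-vector JSON and scatter-plot the reduced word vectors."""
    import matplotlib.pyplot as plt

    fp = os.path.join(os.curdir, 'result', 'word2vec', f'w2v-epochs{epochs}-window{window}-init_lr{init_lr}.json')
    df = pd.read_json(fp, orient='index')  # columns: label, x, y

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.scatter(df['x'], df['y'], s=2, alpha=0.5)
    ax.set_title(f'Word2Vec t-SNE (epochs={epochs}, window={window}, init_lr={init_lr})')
    fig.savefig(os.path.join(os.curdir, 'result', 'word2vec', f'tsne-epochs{epochs}-window{window}-init_lr{init_lr}.png'))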
def run_bert_meta_regression_tfmodel():
    """ Run self defined combined model."""
    timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.getenv('OUTPUT_DIR'), timestamp)
    model_plot = f'regression_model_{timestamp}.png'

    tokenizer = AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))
    config = AutoConfig.from_pretrained(os.getenv('MODEL_NAME'), num_labels=1)
    distilbert_model = TFDistilBertModel.from_pretrained(
        os=os.getenv('MODEL_NAME') if False else os.getenv('MODEL_NAME'), config=config) if False else TFDistilBertModel.from_pretrained(
        os.getenv('MODEL_NAME'), config=config)

    print(config, tokenizer, sep='\n')
    # tf.keras.utils.plot_model(distilbert_model, to_file=model_plot, show_shapes=True)

    tc = TopCoder()
    encoded_text = tc.get_bert_encoded_txt_features(tokenizer)
    metadata = tc.get_meta_data_features(encoded_tech=True, softmax_tech=True)
    target = tc.get_target()

    split = int((4 / 5) * len(target))
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(**encoded_text, meta_input=metadata), target))
    dataset = dataset.shuffle(len(target))
    train_ds, test_ds = dataset.take(split).batch(16), dataset.skip(
        split).batch(8)

    print(train_ds, test_ds, sep='\n')
    # for i in train_ds.take(2):
    #     pprint(i)
    # print()
    # for i in test_ds.take(2):
    #     pprint(i)

    # model = TCPMDistilBertRegression.from_pretrained(os.getenv('MODEL_NAME'), config=config)
    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                    histogram_freq=1)
    model = build_tcpm_model_distilbert_regression(distilbert_model)
    model.summary()
    model.compile(optimizer=tf.keras.optimizers.Adam(2e-6),
                  loss='mse',
                  metrics=['mae', 'mse', mre])
    history = model.fit(
        train_ds,
        epochs=12,
        callbacks=[tensorboard_cb],
    )
    result = model.evaluate(
        test_ds,
        return_dict=True,
    )

    pprint(result)

    history_df = pd.DataFrame(history.history)
    history_df.to_json(os.path.join(log_dir, 'train_history.json'),
                       orient='index',
                       indent=4)
    with open(os.path.join(log_dir, 'result.json'), 'w') as f:
        json.dump(result, f, indent=4)
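
# `build_tcpm_model_distilbert_regression` is not defined in this snippet. Below is a minimal
# sketch of what such a combined text + metadata regression model could look like, assuming
# 512-token encoded inputs and a 4-dimensional `meta_input` (as in `build_dataset` below).
# The dense head is illustrative, not the original architecture.
def build_tcpm_model_distilbert_regression_sketch(distilbert_model, seq_len=512, meta_dim=4):
    """ Concatenate the DistilBERT first-token representation with metadata and regress to one value."""
    input_ids = tf.keras.Input(shape=(seq_len,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.Input(shape=(seq_len,), dtype=tf.int32, name='attention_mask')
    meta_input = tf.keras.Input(shape=(meta_dim,), dtype=tf.float32, name='meta_input')

    hidden_states = distilbert_model(input_ids, attention_mask=attention_mask)[0]  # (batch, seq_len, hidden)
    cls_token = hidden_states[:, 0, :]  # first-token representation as the sequence summary

    combined = tf.keras.layers.Concatenate()([cls_token, meta_input])
    x = tf.keras.layers.Dense(64, activation='relu')(combined)
    output = tf.keras.layers.Dense(1, name='prize_output')(x)

    return tf.keras.Model(inputs=[input_ids, attention_mask, meta_input], outputs=output)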
def build_dataset(tokenizer):
    """ Build td.data.Dataset out of text and prize range."""
    # Load TopCoder data
    tc = TopCoder()
    tc_req = tc.get_filtered_requirements()
    tc_meta = tc.get_filtered_challenge_info()
    metadata_cols = [
        'number_of_platforms', 'number_of_technologies', 'project_id',
        'challenge_duration'
    ]

    # Convert float prize into categorical prize range
    interval = np.linspace(
        0, 2500,
        51)[:-1]  # this will be modified manually when the dataset is changed
    tc_prz_range = tc_meta['total_prize'].apply(
        lambda prz: np.searchsorted(interval, prz, side='right') - 1)
    tc_prz_range.name = 'prize_cat'

    # use this df to ensure the index of text and metadata and label is aligned
    req_prz_df = pd.concat([
        tc_req['requirements'],
        tc_meta.reindex(metadata_cols, axis=1), tc_prz_range
    ],
                           axis=1)

    dataset_size = len(req_prz_df)
    num_labels = len(req_prz_df['prize_cat'].unique()) + 1

    # batched encode the str to `input_ids` and `attention_mask`
    batched_encoded = tokenizer(req_prz_df['requirements'].to_list(),
                                padding='max_length',
                                truncation=True)

    # List((encoded_str, metadata, prize_cat), ...)
    features = [({k: batched_encoded[k][i]
                  for k in batched_encoded},
                 req_prz_df.reindex(metadata_cols, axis=1).iloc[i],
                 req_prz_df['prize_cat'].iloc[i])
                for i in range(len(req_prz_df))]

    def gen():
        """ generator used in `tf.data.Dataset.from_generator`."""
        for encoded_str, metadata, label in features:
            yield dict(
                **encoded_str, meta_input=metadata
            ), label  # NOTE: the key must be named "meta_input" to match the input layer's name in the model

    dataset = tf.data.Dataset.from_generator(
        gen,
        (dict(**{k: tf.int32
                 for k in batched_encoded}, meta_input=tf.float32), tf.int32),
        (dict(**{k: tf.TensorShape([512])
                 for k in batched_encoded},
              meta_input=tf.TensorShape([4])), tf.TensorShape([])))

    return (dataset, dataset_size, num_labels)
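
# A brief usage sketch for `build_dataset` (assumptions: a DistilBERT-style tokenizer from
# MODEL_NAME and an 80/20 split with the batch sizes used in the other snippets here).
def build_dataset_usage_sketch():
    tokenizer = AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))
    dataset, dataset_size, num_labels = build_dataset(tokenizer)

    split = int((4 / 5) * dataset_size)
    dataset = dataset.shuffle(dataset_size)
    train_ds = dataset.take(split).batch(16)
    test_ds = dataset.skip(split).batch(8)
    return train_ds, test_ds, num_labels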
def train_selected_w2v_model():
    """ Select the hyper param
        epochs = 10, window = 5, learning rate = 0.002
    """
    tc = TopCoder()
    req = tc.get_filtered_requirements() # no overview extraction
    sentences = [tokenize_str(remove_stop_words_from_str(remove_punctuation(remove_digits(r.lower())))) for cha_id, r in req.itertuples()]

    model = Word2Vec(sentences=sentences, alpha=0.002, window=5, min_count=10, iter=10, sg=1, hs=1, seed=42, min_alpha=2e-5, workers=8)
    model.wv.save(os.path.join(os.curdir, 'result', 'word2vec', 'selected_model'))
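
# A small sanity-check sketch for the saved vectors: reload the KeyedVectors written above and
# print the nearest neighbours of a few words (the query words are only illustrative).
def inspect_selected_w2v_model():
    wv = KeyedVectors.load(os.path.join(os.curdir, 'result', 'word2vec', 'selected_model'))
    for word in ('api', 'design', 'code'):
        if word in wv.vocab:
            print(word)
            pprint(wv.most_similar(word, topn=5))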
def run_bert_regression_tfmodel():
    """ Run BERT for regression as a tfmodel."""
    print('START TRAINING FOR REGRESSION')

    # Initialize BERT model
    tokenizer = AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))
    config = AutoConfig.from_pretrained(os.getenv('MODEL_NAME'), num_labels=1)
    model = TFDistilBertForSequenceClassification.from_pretrained(
        os.getenv('MODEL_NAME'), config=config)

    print('\nModel Config:')
    print(config)
    print('Tokenizer: ', tokenizer)
    print('Model: ', model)

    # Preparing training data
    tc = TopCoder()
    encoded_text = tc.get_bert_encoded_txt_features(tokenizer)
    target = tc.get_target()

    print(f'\nSize of dataset: {len(target)}')

    dataset = tf.data.Dataset.from_tensor_slices((encoded_text, target))
    dataset = dataset.shuffle(len(target))
    train_ds, test_ds = dataset.take(int(
        (4 / 5) * len(target))), dataset.skip(int((4 / 5) * len(target)))
    train_ds = train_ds.batch(16)
    test_ds = test_ds.batch(8)

    print('\nTrain dataset samples:')
    for el in train_ds.take(3):
        pprint(el)
    print('\nTest dataset samples:')
    for el in test_ds.take(3):
        pprint(el)

    # TF-Fashioned training model
    log_dir = os.path.join(os.getenv('OUTPUT_DIR'), 'logs',
                           datetime.now().strftime('%Y%m%d-%H%M%S'))
    tensorboard_cb = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir, histogram_freq=1)  # fancy visualization :)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-6),
                  loss='mse',
                  metrics=['mae', 'mse', mre])
    history = model.fit(train_ds, epochs=6, callbacks=[tensorboard_cb])
    result = model.evaluate(test_ds, return_dict=True)

    pprint(result)

    history_df = pd.DataFrame(history.history)
    history_df.to_json(os.path.join(log_dir, 'train_history.json'),
                       orient='index',
                       indent=4)
    with open(os.path.join(log_dir, 'result.json'), 'w') as f:
        json.dump(result, f, indent=4)
def build_new_docvec():
    """ Build new document vector from newly trained word2vec model."""
    tc = TopCoder()
    req = tc.get_filtered_requirements()
    sentences = {cha_id: tokenize_str(remove_stop_words_from_str(remove_punctuation(remove_digits(r.lower())))) for cha_id, r in req.itertuples()}

    wv = KeyedVectors.load(os.path.join(os.curdir, 'result', 'word2vec', 'selected_model'))
    sentences = {cha_id: [w for w in tokens if w in wv.vocab] for cha_id, tokens in sentences.items()}

    docvec = {cha_id: (sum([wv[token] for token in tokens]) / len(tokens)).tolist() for cha_id, tokens in sentences.items()}
    pprint(list(docvec.items())[:2])
    with open(os.path.join(os.curdir, 'data', 'new_docvec.json'), 'w') as fwrite:
        json.dump(docvec, fwrite)
def run_bert_regression_trainer():
    """ Run bert single class classification(a.k.a regression) model."""
    print('START TRAINING FOR REGRESSION')
    log_dir = os.path.join(os.getenv('OUTPUT_DIR'), 'hf_trainer')

    tokenizer = AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))
    config = AutoConfig.from_pretrained(os.getenv('MODEL_NAME'), num_labels=1)

    training_args = TFTrainingArguments(
        output_dir=log_dir,
        logging_dir=log_dir,
        logging_first_step=True,
        logging_steps=1,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        learning_rate=2e-5,
        debug=True,
    )

    with training_args.strategy.scope():
        # model = TFDistilBertForSequenceClassification.from_pretrained(os.getenv('MODEL_NAME'), config=config)
        model = TCPMDistilBertRegression.from_pretrained(
            os.getenv('MODEL_NAME'), config=config)

    print('\nModel Config:')
    print(config)
    print('Tokenizer: ', tokenizer)
    print('Model: ', model)
    print('\nTFTraingArguments:')
    print(training_args)

    tc = TopCoder()
    encoded_text = tc.get_bert_encoded_txt_features(tokenizer)
    metadata = tc.get_meta_data_features(encoded_tech=True, softmax_tech=True)
    target = tc.get_target()

    split = int((4 / 5) * len(target))
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(**encoded_text, meta_input=metadata), target))
    dataset = dataset.shuffle(len(target))
    train_ds, test_ds = dataset.take(split), dataset.skip(split)

    print('\nTrain dataset samples:')
    for el in train_ds.take(3):
        pprint(el)
    print('\nTest dataset samples:')
    for el in test_ds.take(3):
        pprint(el)

    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.save_model()
    tokenizer.save_pretrained(log_dir)

    result = trainer.evaluate()
    print('\nTraining eval:')
    pprint(result)
    with open(os.path.join(log_dir, 'eval_results.json'), 'w') as fwrite:
        json.dump(result, fwrite, indent=4)
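
# `compute_metrics` is passed to the TFTrainer above but not defined in this snippet. A minimal
# sketch for the single-output regression case; the metric names mirror the ones used elsewhere
# and `mre` here means mean relative error.
def compute_metrics_sketch(eval_prediction):
    preds = eval_prediction.predictions.reshape(-1)
    labels = eval_prediction.label_ids.reshape(-1)
    return {
        'mae': float(np.mean(np.abs(preds - labels))),
        'mse': float(np.mean((preds - labels) ** 2)),
        'mre': float(np.mean(np.abs(preds - labels) / np.abs(labels))),
    }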
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import LabelEncoder

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import WordPunctTokenizer

from gensim.models import LdaModel
from gensim.corpora import Dictionary

from tc_data import TopCoder
import preprocessing_util as P

TC = TopCoder()


def clean_and_tokenize(doc):
    """ Clean and tokenize an input document."""
    lemmatizer = WordNetLemmatizer()
    tokenizer = WordPunctTokenizer()

    tc_stopwords = set(stopwords.words('english'))
    tc_stopwords.update(
        ('project', 'overview', 'final', 'submission', 'documentation',
         'provid', 'submission', 'deliverables'))

    word_only_doc = P.remove_digits(
        P.remove_punctuation(P.remove_url(doc.lower())))
    lemmatized_doc = ' '.join(
        [lemmatizer.lemmatize(word) for word in word_only_doc.split()])

    # Tokenize the lemmatized document and drop the (extended) stop words.
    return [
        token for token in tokenizer.tokenize(lemmatized_doc)
        if token not in tc_stopwords
    ]
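
# The gensim imports above suggest an LDA topic model over the cleaned documents. A minimal
# sketch of that step, assuming `docs` is an iterable of requirement texts (e.g. values from
# `TC.get_filtered_requirements()`); `num_topics` and the filtering thresholds are illustrative.
def fit_lda_sketch(docs, num_topics=10):
    tokenized_docs = [clean_and_tokenize(doc) for doc in docs]

    dictionary = Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=10, no_above=0.5)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)
    pprint(lda.print_topics())
    return lda, dictionary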
def build_learning_dataset(tc: TopCoder):
    """ Build learning dataset for prediction of 
        - avg_score
        - number_of_registration
        - sub_reg_ratio

        I assume that these target data are regressionally imbalanced, thus we should resample it before learning.
        The threshold are set as followed:
        - avg_score: 90
        - number_of_registration: 30
        - sub_reg_ratio: 0.25

        :param contain_docvec: Boolean: Whether include document vector in the feature. Default as False
        :param normalize: Boolean: Whether to normalzie the X data.
    """
    # manually set data resampling threshold
    target_resamp_info = {
        'avg_score': {
            'threshold': 90,
            'extreme': 'low',
            'upper_bound': 100
        },
        'number_of_registration': {
            'threshold': 30,
            'extreme': 'high',
            'lower_bound': 0
        },
        'sub_reg_ratio': {
            'threshold': 0.25,
            'extreme': 'high',
            'upper_bound': 1
        },
    }
    test_size = 954  # len(feature_df) * 0.2 ~= 953.8, use 20% of the data for testing
    storage_path = os.path.join(os.curdir, 'result', 'boosting_learn',
                                'learning_data')

    # get the raw data from TopCoder data object
    cha_info = tc.get_filtered_challenge_info()
    feature_df = tc\
        .get_meta_data_features(encoded_tech=True, softmax_tech=True, return_df=True)\
        .join(cha_info.reindex(['total_prize'], axis=1))
    docvec_df = pd.read_json(os.path.join(os.curdir, 'data',
                                          'new_docvec.json'),
                             orient='index')
    target_df = cha_info.reindex(list(target_resamp_info.keys()), axis=1)
    if not (target_df.index == feature_df.index).all():
        raise ValueError(
            'Check index of target_df and feature_df, it\'s not equal.')

    for col, info in target_resamp_info.items():
        print(f'Building dataset for {col}')
        target_sr = target_df[col]
        test_index = util_stratified_split_regression(target_sr,
                                                      info['threshold'],
                                                      info['extreme'],
                                                      test_size)

        X_train_raw = feature_df.loc[~feature_df.index.isin(test_index
                                                            )].sort_index()
        X_test_raw = feature_df.loc[feature_df.index.isin(
            test_index)].sort_index()
        y_train_raw = target_sr[~target_sr.index.isin(test_index)].sort_index()
        y_test_raw = target_sr[target_sr.index.isin(test_index)].sort_index()
        if not ((X_train_raw.index == y_train_raw.index).all() and
                (X_test_raw.index == y_test_raw.index).all()):
            raise ValueError('Check X, y test index, they are not equal.')

        for dv in True, False:
            print(f'Resampling with dv={dv}...')
            test_data_fn = os.path.join(storage_path,
                                        f'{col}_test_dv{int(dv)}.json')
            train_data_original_fn = os.path.join(
                storage_path, f'{col}_train_original_dv{int(dv)}.json')
            train_data_resample_fn = os.path.join(
                storage_path, f'{col}_train_resample_dv{int(dv)}.json')
            X_train, X_test, y_train, y_test = X_train_raw.copy(
            ), X_test_raw.copy(), y_train_raw.copy(), y_test_raw.copy()

            if dv:
                X_train = X_train.join(docvec_df)
                X_test = X_test.join(docvec_df)

            # From now on it's pure numpy till storage ;-)
            X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
            y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

            scaler = StandardScaler().fit(X_train)
            normalizer = Normalizer().fit(X_train)

            X_train, X_test = scaler.transform(X_train), scaler.transform(
                X_test)
            X_train, X_test = normalizer.transform(
                X_train), normalizer.transform(X_test)

            print(
                f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}'
            )
            print(
                f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

            test_data = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)
            test_data_df = pd.DataFrame(test_data)
            test_data_df.columns = [
                *[f'x{i}' for i in range(X_test.shape[1])], 'y'
            ]
            test_data_df.to_json(test_data_fn, orient='index')
            print(f'Test data DataFrame shape: {test_data_df.shape}')

            train_data_original = np.concatenate(
                (X_train, y_train.reshape(-1, 1)), axis=1)
            train_data_original_df = pd.DataFrame(train_data_original)
            train_data_original_df.columns = [
                *[f'x{i}' for i in range(X_test.shape[1])], 'y'
            ]
            train_data_original_df.to_json(train_data_original_fn,
                                           orient='index')
            print(
                f'Training data original shape: {train_data_original_df.shape}'
            )

            attempt = 0
            while True:
                print(f'Attempt #{attempt}...')
                try:
                    train_data_resample_df = smoter(
                        data=train_data_original_df,
                        y='y',
                        samp_method='extreme',
                        rel_xtrm_type=info['extreme']).reset_index(
                            drop=True
                        )  # just use the default setting for SMOGN
                except ValueError as e:
                    print(f'Encountered error: "{e}", rerunning SMOGN...')
                    continue
                else:
                    print(
                        f'Training data resample shape: {train_data_resample_df.shape} - before boundary filtering'
                    )
                    if 'upper_bound' in info:
                        train_data_resample_df = train_data_resample_df.loc[
                            train_data_resample_df['y'] <= info['upper_bound']]

                    if 'lower_bound' in info:
                        train_data_resample_df = train_data_resample_df.loc[
                            train_data_resample_df['y'] >= info['lower_bound']]

                    train_data_resample_df.to_json(train_data_resample_fn,
                                                   orient='index')
                    print(
                        f'Training data resample shape: {train_data_resample_df.shape} - after boundary filtering'
                    )
                    print('Data stored\n\n')
                    break
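
# `util_stratified_split_regression` is used above but not defined in this snippet. A minimal
# sketch of what such a helper could do: binarize the target at the threshold and sample a test
# index that preserves the proportion of the rare ("extreme") side.
def util_stratified_split_regression_sketch(target_sr, threshold, extreme, test_size, seed=42):
    rng = np.random.default_rng(seed)
    rare_mask = target_sr < threshold if extreme == 'low' else target_sr > threshold

    rare_idx = target_sr.index[rare_mask]
    common_idx = target_sr.index[~rare_mask]

    n_rare_test = round(test_size * len(rare_idx) / len(target_sr))
    test_index = [
        *rng.choice(rare_idx, size=n_rare_test, replace=False),
        *rng.choice(common_idx, size=test_size - n_rare_test, replace=False),
    ]
    return pd.Index(test_index)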
def random_search_top_tiers():
    """ Perform random search to find the best hyperparameters for the top-tier regressors."""
    tc = TopCoder()

    model_dct = {
        'BayesianRidge': BayesianRidge,
        'DecisionTreeRegressor': DecisionTreeRegressor,
        'GaussianProcessRegressor': GaussianProcessRegressor,
        'GradientBoostingRegressor': GradientBoostingRegressor,
        'KNeighborsRegressor': KNeighborsRegressor,
        'RandomForestRegressor': RandomForestRegressor,
        'SVR': SVR,
    }

    model_args_dct = {
        'BayesianRidge': {
            'fixed_args': dict(n_iter=1000),
            'tuned_args': dict(tol=[1e-3, 1e-4, 1e-5, 1e-6], ),
        },
        'DecisionTreeRegressor': {
            'fixed_args':
            dict(random_state=42),
            'tuned_args':
            dict(criterion=['mse', 'mae', 'friedman_mse'],
                 max_depth=[None, 3, 5, 10]),
        },
        'GaussianProcessRegressor': {
            'fixed_args':
            dict(),
            'tuned_args':
            dict(kernel=[
                1.0 * RBF(), 1.0 * RationalQuadratic(),
                ConstantKernel() * (DotProduct()**2),
                DotProduct() * WhiteKernel()
            ]),
        },
        'GradientBoostingRegressor': {
            'fixed_args':
            dict(random_state=42, n_iter_no_change=5),
            'tuned_args':
            dict(
                loss=['ls', 'lad'],
                n_estimators=[200, 500, 1000, 1500],
                learning_rate=[0.01, 0.001, 1e-4],
                tol=[0.01, 0.001, 1e-4, 1e-5, 2e-5, 1e-6],
            ),
        },
        'KNeighborsRegressor': {
            'fixed_args':
            dict(n_jobs=-1),
            'tuned_args':
            dict(
                n_neighbors=[5, 10, 15, 20],
                weights=['uniform', 'distance'],
                algorithm=['ball_tree', 'kd_tree'],
                leaf_size=[30, 60, 100],
            ),
        },
        'RandomForestRegressor': {
            'fixed_args':
            dict(n_jobs=-1, verbose=1, random_state=42, bootstrap=True),
            'tuned_args':
            dict(
                n_estimators=[100, 200, 500, 1000],
                max_features=['auto', 'sqrt', 0.333],
                criterion=['mae', 'mse'],
            ),
        },
        'SVR': {
            'fixed_args':
            dict(cache_size=15000),
            'tuned_args': [
                dict(kernel=['rbf'],
                     gamma=['scale', 'auto'],
                     C=[1, 10, 100, 1000]),
                dict(kernel=['linear'], C=[1, 10, 100, 1000]),
                dict(kernel=['poly'],
                     degree=[2, 3, 5],
                     coef0=[0, 0.5, 5, 50, 100],
                     C=[1, 10, 100, 1000]),
            ],
        },
    }

    scoring = {
        'mae': make_scorer(mean_absolute_error, greater_is_better=False),
        'mre': make_scorer(mre, greater_is_better=False),
    }

    rs_path = os.path.join(os.curdir, 'result', 'random_search_res')

    with open(
            os.path.join(os.curdir, 'result', 'simple_regression',
                         'top4_reg_dct.json')) as f:
        top_regs_dct = {
            target: list(metrics.keys())
            for target, metrics in json.load(f).items() if target != 'price'
        }

    for target, reg_lst in top_regs_dct.items():
        print(f'{target} | Random Searching....')
        X, y = tc.build_final_dataset(target)
        Xnp, ynp = X.to_numpy(), y.to_numpy()
        X_train, X_test, y_train, y_test = train_test_split(Xnp,
                                                            ynp,
                                                            test_size=0.3,
                                                            random_state=42)

        for reg_name in reg_lst:
            print(f'RS on {reg_name}...')

            rs_res_path = os.path.join(rs_path, f'{target}_{reg_name}_rs.json')
            if os.path.isfile(rs_res_path):
                continue

            reg = model_dct[reg_name]
            args = model_args_dct[reg_name]

            rs = RandomizedSearchCV(
                reg(**args['fixed_args']),
                param_distributions=args['tuned_args'],
                n_iter=6,
                scoring=scoring,
                refit='mre',
                n_jobs=-1,
                cv=10,
                random_state=42,
            )
            rs.fit(X_train, y_train)

            rs_res = {
                'regressor': reg_name,
                'best_params': rs.best_params_,
                'best_score_in_rs': rs.best_score_,
            }

            with open(rs_res_path, 'w') as f:
                json.dump(rs_res, f, indent=4)
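
# `mre` (mean relative error) is used both as a Keras metric and inside `make_scorer` above,
# but is not defined in this snippet. A minimal numpy-based sketch that fits the scikit-learn
# scorer; the Keras usage would need a tensor-based equivalent (e.g. via tf.reduce_mean).
def mre_sketch(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return float(np.mean(np.abs(y_true - y_pred) / np.abs(y_true)))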