Example 1
def run(n_init,
        max_features,
        umap_n_components,
        dataset,
        val_dataset,
        labels,
        result_dir,
        random_state
        ):
    # Set random states
    np.random.seed(random_state)

    target_names = fetch_20newsgroups().target_names
    idx = [target_names.index(l) for l in labels]

    # load data
    train_df = pd.read_csv(dataset)
    train_df = train_df.query("labels in @idx")
    print(train_df.shape)
    train_texts = train_df['texts'].to_numpy()
    train_labels = train_df['labels'].to_numpy()
    print(train_texts.shape)

    val_df = pd.read_csv(val_dataset)
    val_df = val_df.query("labels in @idx")
    print(val_df.shape)
    val_texts = val_df['texts'].to_numpy()
    val_labels = val_df['labels'].to_numpy()
    print(val_texts.shape)


    tfidf = TfidfVectorizer(max_features=max_features, stop_words='english')
    X_train = tfidf.fit_transform(train_texts)
    X_test = tfidf.transform(val_texts)

    # UMAP is intentionally disabled in this variant; umap_n_components is
    # kept in the signature so the config stays compatible with the UMAP runs.
    # umap = UMAP(n_components=umap_n_components)
    # X_train = umap.fit_transform(X_train.toarray())
    # X_test = umap.transform(X_test.toarray())

    print(len(np.unique(train_labels)))
    kmeans = KMeans(n_init=n_init, n_clusters=len(np.unique(train_labels)))
    kmeans.fit(X_train)
    predicted_labels = kmeans.predict(X_test)

    best_matching, accuracy = cluster_accuracy(val_labels, predicted_labels)
    ari = adjusted_rand_score(val_labels, predicted_labels)
    nmi = normalized_mutual_info_score(val_labels, predicted_labels)
    purity = purity_score(y_true=val_labels, y_pred=predicted_labels)

    run_results = {}
    run_results['best_matching'] = best_matching
    run_results['accuracy'] = accuracy
    run_results['ari'] = ari
    run_results['nmi'] = nmi
    run_results['purity'] = purity  # use purity to compare with microsoft paper

    os.makedirs(result_dir, exist_ok=True)
    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, '20newsgroups_samples-kmeans.csv'), index=False)
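The examples call cluster_accuracy and purity_score but never define them. A plausible sketch, assuming the usual Hungarian-matching and contingency-matrix formulations; the signatures follow the call sites above, but this is not necessarily the original implementation:

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.cluster import contingency_matrix


def cluster_accuracy(y_true, y_pred):
    # Contingency matrix: rows = true labels, columns = predicted cluster ids.
    cm = contingency_matrix(y_true, y_pred)
    # The Hungarian algorithm finds the cluster -> label mapping that
    # maximizes the number of correctly assigned samples.
    row_ind, col_ind = linear_sum_assignment(-cm)
    true_vals, pred_vals = np.unique(y_true), np.unique(y_pred)
    best_matching = {pred_vals[c]: true_vals[r] for r, c in zip(row_ind, col_ind)}
    accuracy = cm[row_ind, col_ind].sum() / cm.sum()
    return best_matching, accuracy


def purity_score(y_true, y_pred):
    # Fraction of samples that fall into the majority true class of their cluster.
    cm = contingency_matrix(y_true, y_pred)
    return cm.max(axis=0).sum() / cm.sum()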
Example 2
def run(n_init, max_features, umap_n_components, dataset, result_dir,
        random_state, do_umap):
    # Set random states
    np.random.seed(random_state)

    # load data
    train_df = pd.read_csv(dataset)

    texts = train_df['texts'].to_numpy()
    labels = train_df['labels'].to_numpy()

    le = LabelEncoder()
    labels = le.fit_transform(labels)

    tfidf = TfidfVectorizer(max_features=max_features, stop_words='english')
    X_train = tfidf.fit_transform(texts)

    if do_umap:
        umap = UMAP(n_components=umap_n_components)
        X_train = umap.fit_transform(X_train.toarray())

    kmeans = KMeans(n_init=n_init, n_clusters=len(np.unique(labels)))
    predicted_labels = kmeans.fit_predict(X_train)

    best_matching, accuracy = cluster_accuracy(labels, predicted_labels)
    ari = adjusted_rand_score(labels, predicted_labels)
    nmi = normalized_mutual_info_score(labels, predicted_labels)
    purity = purity_score(y_true=labels, y_pred=predicted_labels)

    run_results = {}
    run_results['best_matching'] = best_matching
    run_results['accuracy'] = accuracy
    run_results['ari'] = ari
    run_results['nmi'] = nmi
    # use purity to compare with the Microsoft paper
    run_results['purity'] = purity

    os.makedirs(result_dir, exist_ok=True)
    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, 'trec6-kmeans.csv'),
                     index=False)
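A hypothetical invocation of the TREC-6 baseline above; the file paths and parameter values are placeholders, not taken from the original configuration:

run(n_init=20,
    max_features=20000,
    umap_n_components=16,
    dataset='data/trec6_train.csv',
    result_dir='results/trec6-kmeans',
    random_state=42,
    do_umap=True)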
Example 3
def run(n_init, models, embedding_extractor, batch_size, dataset,
        train_idx_file, val_idx_file, result_dir, random_state, device):

    # Set random states
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    os.makedirs(result_dir, exist_ok=True)

    # load data
    df = pd.read_csv(dataset)

    with open(train_idx_file, 'r') as f:
        train_idx = np.array(list(map(int, f.readlines())))

    with open(val_idx_file, 'r') as f:
        val_idx = np.array(list(map(int, f.readlines())))

    all_idx = np.concatenate((train_idx, val_idx))

    df_train = df.iloc[all_idx].copy()

    train_texts = df_train['texts'].to_numpy()
    train_labels = df_train['labels'].to_numpy()

    train_data = TextDataset(train_texts, train_labels)
    train_data_loader = DataLoader(dataset=train_data,
                                   batch_size=batch_size,
                                   shuffle=False)

    df_val = df.iloc[val_idx].copy()

    val_texts = df_val['texts'].to_numpy()
    val_labels = df_val['labels'].to_numpy()

    val_data = TextDataset(val_texts, val_labels)
    val_data_loader = DataLoader(dataset=val_data,
                                 batch_size=batch_size,
                                 shuffle=False)

    results = []
    for model in models:
        # init lm model & tokenizer
        lm_model = AutoModel.from_pretrained(model,
                                             return_dict=True,
                                             output_hidden_states=True)
        tokenizer = AutoTokenizer.from_pretrained(model,
                                                  return_dict=True,
                                                  output_hidden_states=True)
        lm_model.to(device)

        train_embeddings = []
        train_labels = []
        for batch_texts, batch_labels in tqdm(
                train_data_loader, desc="Extracting train embeddings"):
            inputs = tokenizer(list(batch_texts),
                               return_tensors='pt',
                               padding=True,
                               truncation=True)
            inputs = inputs.to(device)
            with torch.no_grad():
                outputs = lm_model(**inputs)
            extracted_embeddings = embedding_extractor(
                outputs).cpu().detach().numpy()
            train_embeddings.append(extracted_embeddings)
            train_labels.extend(batch_labels.numpy().astype('int'))

        X_train = np.vstack(train_embeddings)
        train_labels = np.array(train_labels)

        test_embeddings = []
        val_labels = []
        for batch_texts, batch_labels in tqdm(
                val_data_loader, desc="Extracting val embeddings"):
            inputs = tokenizer(list(batch_texts),
                               return_tensors='pt',
                               padding=True,
                               truncation=True)
            inputs = inputs.to(device)
            with torch.no_grad():
                outputs = lm_model(**inputs)
            extracted_embeddings = embedding_extractor(
                outputs).cpu().detach().numpy()
            test_embeddings.append(extracted_embeddings)
            val_labels.extend(batch_labels.numpy().astype('int'))

        X_test = np.vstack(test_embeddings)
        val_labels = np.array(val_labels)

        kmeans = KMeans(n_init=n_init, n_clusters=len(np.unique(train_labels)))
        kmeans.fit(X_train)
        predicted_labels = kmeans.predict(X_test)

        best_matching, accuracy = cluster_accuracy(val_labels,
                                                   predicted_labels)
        ari = adjusted_rand_score(val_labels, predicted_labels)
        nmi = normalized_mutual_info_score(val_labels, predicted_labels)
        purity = purity_score(y_true=val_labels, y_pred=predicted_labels)

        run_results = {}
        run_results['model'] = model
        run_results['best_matching'] = best_matching
        run_results['accuracy'] = accuracy
        run_results['ari'] = ari
        run_results['nmi'] = nmi
        # use purity to compare with the Microsoft paper
        run_results['purity'] = purity
        results.append(run_results)

        # persist the extracted embeddings and labels for this model
        embeddings_file = os.path.join(
            result_dir, f'{model.replace("/", "_")}_embeddings.h')
        with open(embeddings_file, 'wb') as f:
            pickle.dump([X_train, train_labels, X_test, val_labels], f)

    result_df = pd.DataFrame.from_records(results)
    result_df.to_csv(os.path.join(
        result_dir, 'ag_news_subset5-sbert-embeddings-kmeans.csv'),
                     index=False)
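TextDataset and embedding_extractor are assumed helpers here. A minimal sketch consistent with how they are used above; mean pooling over the last hidden layer is an assumption, not necessarily the extractor used in the original experiments:

from torch.utils.data import Dataset


class TextDataset(Dataset):
    # Pairs raw texts with integer labels; the default collate function then
    # yields a batch of strings and a LongTensor of labels.
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], int(self.labels[idx])


def mean_pooling_extractor(outputs):
    # Average the last hidden layer over the sequence dimension
    # (note: this simple version averages padding positions too).
    return outputs.hidden_states[-1].mean(dim=1)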
Example 4
def run(n_epochs, lr, train_batch_size, val_batch_size, base_model,
        clustering_loss_weight, embedding_extractor, annealing_alphas, dataset,
        val_dataset, result_dir, early_stopping, early_stopping_tol, device,
        random_state):
    # Set random states
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    # load data
    train_df = pd.read_csv(dataset)

    train_texts = train_df['texts'].to_numpy()
    train_labels = train_df['labels'].to_numpy()

    train_data = TextDataset(train_texts, train_labels)
    train_data_loader = DataLoader(dataset=train_data,
                                   batch_size=train_batch_size,
                                   shuffle=False)

    val_df = pd.read_csv(val_dataset)

    val_texts = val_df['texts'].to_numpy()
    val_labels = val_df['labels'].to_numpy()

    val_data = TextDataset(val_texts, val_labels)
    val_data_loader = DataLoader(dataset=val_data,
                                 batch_size=val_batch_size,
                                 shuffle=False)

    # init lm model & tokenizer
    lm_model = AutoModelForMaskedLM.from_pretrained(base_model,
                                                    return_dict=True,
                                                    output_hidden_states=True)
    tokenizer = AutoTokenizer.from_pretrained(base_model,
                                              return_dict=True,
                                              output_hidden_states=True)

    lm_model.to(device)

    # init clustering model
    model, initial_centroids, initial_embeddings = init_model(
        lm_model=lm_model,
        tokenizer=tokenizer,
        data_loader=train_data_loader,
        embedding_extractor=embedding_extractor,
        n_clusters=np.unique(train_labels).shape[0],
        device=device)

    # init optimizer & scheduler
    opt = torch.optim.RMSprop(
        params=model.parameters(),
        lr=lr,  # e.g. 2e-5 or 5e-7
        eps=1e-8)

    total_steps = len(train_data_loader) * n_epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer=opt,
        num_warmup_steps=int(len(train_data_loader) * 0.5),
        num_training_steps=total_steps)

    # train the model
    hist = train(n_epochs=n_epochs,
                 model=model,
                 optimizer=opt,
                 scheduler=scheduler,
                 annealing_alphas=annealing_alphas,
                 train_data_loader=train_data_loader,
                 eval_data_loader=val_data_loader,
                 clustering_loss_weight=clustering_loss_weight,
                 early_stopping=early_stopping,
                 early_stopping_tol=early_stopping_tol,
                 verbose=True)
    # do eval
    run_results = {}

    predicted_labels, true_labels = evaluate(model=model,
                                             eval_data_loader=val_data_loader,
                                             verbose=True)

    best_matching, accuracy = cluster_accuracy(true_labels, predicted_labels)
    ari = adjusted_rand_score(true_labels, predicted_labels)
    nmi = normalized_mutual_info_score(true_labels, predicted_labels)
    purity = purity_score(y_true=true_labels, y_pred=predicted_labels)

    run_results['best_matching'] = best_matching
    run_results['accuracy'] = accuracy
    run_results['ari'] = ari
    run_results['nmi'] = nmi
    # use purity to compare with the Microsoft paper
    run_results['purity'] = purity

    # save results
    os.makedirs(result_dir, exist_ok=True)

    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, '20_newsgroups-distilbert.csv'),
                     index=False)

    # save train hist & model
    with open(os.path.join(result_dir, 'train_hist.h'), 'wb') as f:
        pickle.dump(hist, file=f)

    torch.save(model, os.path.join(result_dir, 'model.bin'))
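The init_model, train and evaluate helpers are not shown. Given the annealing_alphas and clustering_loss_weight arguments, the clustering objective is presumably a DEC-style KL divergence between soft cluster assignments and a sharpened target distribution; the following is a sketch of that standard formulation, not the repository's actual code:

import torch
import torch.nn.functional as F


def soft_assignments(embeddings, centroids, alpha=1.0):
    # Student's t kernel q_ij between embeddings and cluster centroids.
    dist_sq = torch.cdist(embeddings, centroids) ** 2
    q = (1.0 + dist_sq / alpha) ** (-(alpha + 1.0) / 2.0)
    return q / q.sum(dim=1, keepdim=True)


def target_distribution(q):
    # Sharpened auxiliary distribution p_ij that emphasizes confident assignments.
    weight = q ** 2 / q.sum(dim=0)
    return weight / weight.sum(dim=1, keepdim=True)


def clustering_loss(q):
    # KL(p || q), treating the target distribution as a constant.
    p = target_distribution(q).detach()
    return F.kl_div(q.log(), p, reduction='batchmean')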
Example 5
def run(hyperparam_grid, dataset, train_idx_file, val_idx_file, result_dir,
        random_state):
    # Set random states
    np.random.seed(random_state)

    # load data
    df = pd.read_csv(dataset)

    with open(train_idx_file, 'r') as f:
        train_idx = np.array(list(map(int, f.readlines())))
    with open(val_idx_file, 'r') as f:
        val_idx = np.array(list(map(int, f.readlines())))

    all_idx = np.append(train_idx, val_idx)

    df_train = df.iloc[all_idx]
    train_texts = df_train['texts'].to_numpy()
    train_labels = df_train['labels'].to_numpy()

    df_val = df.iloc[val_idx]
    val_texts = df_val['texts'].to_numpy()
    val_labels = df_val['labels'].to_numpy()

    tfidf = TfidfVectorizer(max_features=20000, stop_words='english')
    X_train = tfidf.fit_transform(train_texts).toarray()
    X_val = tfidf.transform(val_texts).toarray()

    # note: KMeans' n_jobs argument was removed in scikit-learn 1.0
    pipeline = Pipeline([('umap', UMAP()),
                         ('kmeans',
                          KMeans(n_clusters=len(np.unique(train_labels)),
                                 n_init=20))])

    # grid search over the UMAP / KMeans hyperparameters
    results = []
    param_grid = ParameterGrid(hyperparam_grid)
    for run_idx, params in enumerate(param_grid):
        print(f'Run: {run_idx+1}/{len(param_grid)}')
        print("Running with params:")
        pprint(params)

        pipeline.set_params(**params)

        pipeline.fit(X_train)
        predicted_labels = pipeline.predict(X_val)

        # do eval
        run_results = {f'param_{key}': value for key, value in params.items()}

        best_matching, accuracy = cluster_accuracy(val_labels,
                                                   predicted_labels)
        ari = adjusted_rand_score(val_labels, predicted_labels)
        nmi = normalized_mutual_info_score(val_labels, predicted_labels)

        run_results['best_matching'] = best_matching
        run_results['accuracy'] = accuracy
        run_results['ari'] = ari
        run_results['nmi'] = nmi

        # save (partial) results after every run
        os.makedirs(result_dir, exist_ok=True)

        results.append(run_results)
        result_df = pd.DataFrame.from_records(results)
        result_df.to_csv(os.path.join(
            result_dir, 'opt_results_ag_news_subset5_kmeans.csv'),
                         index=False)
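A plausible hyperparam_grid for the UMAP -> KMeans pipeline above; the step-name prefixes follow scikit-learn's <step>__<parameter> convention, while the concrete values are illustrative only:

hyperparam_grid = {
    'umap__n_components': [16, 64, 128],
    'umap__n_neighbors': [15, 50],
    'umap__metric': ['cosine'],
    'kmeans__n_init': [20],
}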
Example 6
def run(n_init,
        max_features,
        umap_n_components,
        dataset,
        train_idx_file,
        val_idx_file,
        result_dir,
        random_state,
        do_umap=True):
    # Set random states
    np.random.seed(random_state)

    # load data
    df = pd.read_csv(dataset)

    with open(train_idx_file, 'r') as f:
        train_idx = np.array(list(map(int, f.readlines())))

    with open(val_idx_file, 'r') as f:
        val_idx = np.array(list(map(int, f.readlines())))

    all_idx = np.concatenate((train_idx, val_idx))

    df_train = df.iloc[all_idx].copy()

    train_texts = df_train['texts'].to_numpy()
    train_labels = df_train['labels'].to_numpy()

    df_val = df.iloc[val_idx].copy()

    val_texts = df_val['texts'].to_numpy()
    val_labels = df_val['labels'].to_numpy()

    tfidf = TfidfVectorizer(max_features=max_features, stop_words='english')
    X_train = tfidf.fit_transform(train_texts)
    X_test = tfidf.transform(val_texts)
    if do_umap:
        umap = UMAP(n_components=umap_n_components)
        X_train = umap.fit_transform(X_train.toarray())
        X_test = umap.transform(X_test.toarray())

    kmeans = KMeans(n_init=n_init, n_clusters=len(np.unique(train_labels)))
    kmeans.fit(X_train)
    predicted_labels = kmeans.predict(X_test)

    best_matching, accuracy = cluster_accuracy(val_labels, predicted_labels)
    ari = adjusted_rand_score(val_labels, predicted_labels)
    nmi = normalized_mutual_info_score(val_labels, predicted_labels)
    purity = purity_score(y_true=val_labels, y_pred=predicted_labels)

    run_results = {}
    run_results['best_matching'] = best_matching
    run_results['accuracy'] = accuracy
    run_results['ari'] = ari
    run_results['nmi'] = nmi
    # use purity to compare with the Microsoft paper
    run_results['purity'] = purity

    os.makedirs(result_dir, exist_ok=True)
    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, 'ag_news_subset5-kmeans.csv'),
                     index=False)
Example 7
def run(n_epochs, hyperparam_grid, lr, batch_size, val_batch_size, base_model,
        dataset, train_idx_file, val_idx_file, result_dir, early_stopping,
        early_stopping_tol, device, random_state):
    # Set random states
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    # load data
    df = pd.read_csv(dataset)

    with open(train_idx_file, 'r') as f:
        train_idx = np.array(list(map(int, f.readlines())))
    with open(val_idx_file, 'r') as f:
        val_idx = np.array(list(map(int, f.readlines())))

    all_idx = np.append(train_idx, val_idx)

    df_train = df.iloc[all_idx]
    train_texts = df_train['texts'].to_numpy()
    train_labels = df_train['labels'].to_numpy()

    train_data = TextDataset(train_texts, train_labels)
    train_data_loader = DataLoader(dataset=train_data,
                                   batch_size=batch_size,
                                   shuffle=False)

    df_val = df.iloc[val_idx]
    val_texts = df_val['texts'].to_numpy()
    val_labels = df_val['labels'].to_numpy()

    val_data = TextDataset(val_texts, val_labels)
    val_data_loader = DataLoader(dataset=val_data,
                                 batch_size=val_batch_size,
                                 shuffle=False)

    # grid search over the deep-clustering hyperparameters
    results = []
    param_grid = ParameterGrid(hyperparam_grid)
    for run_idx, params in enumerate(param_grid):
        print(f'Run: {run_idx+1}/{len(param_grid)}')
        print("Running with params:")
        pprint(params)

        # init lm model & tokenizer
        lm_model = AutoModelForMaskedLM.from_pretrained(
            base_model, return_dict=True, output_hidden_states=True)
        tokenizer = AutoTokenizer.from_pretrained(base_model,
                                                  return_dict=True,
                                                  output_hidden_states=True)

        lm_model.to(device)

        # init clustering model
        model, initial_centroids, initial_embeddings = init_model(
            lm_model=lm_model,
            tokenizer=tokenizer,
            data_loader=train_data_loader,
            embedding_extractor=params['embedding_extractor'],
            n_clusters=np.unique(train_labels).shape[0],
            device=device)

        # init optimizer & scheduler
        opt = torch.optim.RMSprop(params=model.parameters(), lr=lr, eps=1e-8)

        total_steps = len(train_data_loader) * n_epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer=opt,
            num_warmup_steps=int(len(train_data_loader) * 0.5),
            num_training_steps=total_steps)

        # train the model
        hist = train(n_epochs=n_epochs,
                     model=model,
                     optimizer=opt,
                     scheduler=scheduler,
                     annealing_alphas=params['annealing_alphas'],
                     train_data_loader=train_data_loader,
                     eval_data_loader=val_data_loader,
                     clustering_loss_weight=params['clustering_loss_weight'],
                     early_stopping=early_stopping,
                     early_stopping_tol=early_stopping_tol,
                     verbose=True)

        # do eval
        run_results = {f'param_{key}': value for key, value in params.items()}

        predicted_labels, true_labels = evaluate(
            model=model, eval_data_loader=val_data_loader, verbose=True)

        best_matching, accuracy = cluster_accuracy(true_labels,
                                                   predicted_labels)
        ari = adjusted_rand_score(true_labels, predicted_labels)
        nmi = normalized_mutual_info_score(true_labels, predicted_labels)

        run_results['best_matching'] = best_matching
        run_results['accuracy'] = accuracy
        run_results['ari'] = ari
        run_results['nmi'] = nmi

        # save (partial) results after every run
        os.makedirs(result_dir, exist_ok=True)

        results.append(run_results)
        result_df = pd.DataFrame.from_records(results)
        result_df.to_csv(os.path.join(result_dir,
                                      'opt_results_ag_news_subset5.csv'),
                         index=False)

        with open(os.path.join(result_dir, f'train_hist_run{run_idx}.h'),
                  'wb') as f:
            pickle.dump(hist, file=f)
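An illustrative grid for the sweep above; the keys match the params[...] lookups in the loop, while the values (including the CLS-token extractor lambda) are placeholders rather than the settings used in the original experiments:

import numpy as np

hyperparam_grid = {
    # extractor takes the model outputs and returns one embedding per text;
    # here: the last hidden state at the [CLS] position (an assumption)
    'embedding_extractor': [lambda outputs: outputs.hidden_states[-1][:, 0]],
    # one alpha per training epoch, annealed upward (assumed convention)
    'annealing_alphas': [np.arange(1, 21)],
    'clustering_loss_weight': [0.5, 1.0],
}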