Example #1
def main():
    # Load the preprocessed content data
    content_data = data_pro.main()

    # Title features
    title_feats = title_transform(content_data)

    # One-hot encode the source column
    source_feats = pd.get_dummies(content_data['source']).values

    # One-hot encode the reporter column
    reporter_feats = pd.get_dummies(content_data['reporter']).values

    # One-hot encode the week column
    week_feats = pd.get_dummies(content_data['week']).values

    # Topology feature: reuse the cached L1 norm if available
    try:
        content_data['l1_norm'] = pd.read_csv('l1_norm.csv')['l1_norm']
    except FileNotFoundError:
        print('l1_norm.csv not found, recalculating')
        content_data['l1_norm'] = content_topology(content_data)
    
    # Author-level statistics are computed from the training split only
    train_data, _, _ = utils.split(content_data)
    sh_ratio_dict, total_count_dict = utils.compute_author(train_data)
    content_data['sh_ratio_range'] = content_data['author'].apply(lambda x: fill_way(x, sh_ratio_dict))
    content_data['total_count_range'] = content_data['author'].apply(lambda x: fill_way(x, total_count_dict))
    return title_feats, source_feats, reporter_feats, week_feats, content_data
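
For context, here is a minimal standalone sketch of the one-hot encoding step used above for the categorical columns; the toy DataFrame is purely illustrative and not part of the project's data.

import pandas as pd

# Toy example (illustrative only): one indicator column per distinct source
toy = pd.DataFrame({'source': ['wire', 'blog', 'wire']})
source_feats = pd.get_dummies(toy['source']).values
print(source_feats.shape)  # (3, 2)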
Example #2
def main():
    data_praw.main()
    print("Downloaded data!")
    data_processing.main()
    print("Extracted features from training!")
    data_processing.test(TEST_IN_PATHS, TEST_OUT_PATH)
    print("Extracted features from test!")
    with open('data_preprocessed.pickle', 'rb') as data_file:
        training_data = pickle.load(data_file)
    with open('test_preprocessed.pickle', 'rb') as data_file:
        test_data = pickle.load(data_file)
    with open('classifier_model.pickle', 'rb') as model_file:
        model = pickle.load(model_file)
    training_labels = training_data['labels']
    new_training_data = remove_extra_features(training_data)
    results = classify(test_data['data'], new_training_data, training_labels)
    score = 0
    for idx, res in enumerate(results):
        if res == GROUND_TRUTH[idx]:
            score += 1
    print("Accuracy: ", score/len(GROUND_TRUTH))
Example #3
def main():
    DIR = "../TaxiBJ"
    X_hour, X_day, X_week, X_extra, y, timeslots = data_processing.main(
        DIR, True)

    num_epochs = 1
    batch_size = 16
    learning_rate = 1e-3

    model = Attention()
    # summary(model, input_size = [(18,32,32),(1,1,21)])
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=1e-5)
    train_dataset, test_dataset = train_test_split(X_hour, X_day, X_week,
                                                   X_extra, y, timeslots)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=batch_size,
                                 shuffle=True)
    for epoch in range(num_epochs):
        for X1, X2, y, timeslot in train_dataloader:
            # Variable is a no-op since PyTorch 0.4; plain tensors suffice
            X1 = X1.float()
            X2 = X2.float()
            y = y.float()
            output = model(X1, X2)
            # m = output[0][0].detach().numpy()
            # path = '../data/{}_{}.original.png'.format(2016021515, epoch)
            # print('drawing fig')
            # draw_fig(m, path)
            loss = criterion(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print('epoch [{}/{}], loss: {:.6f}'.format(epoch + 1, num_epochs,
                                                       loss.item()))
    torch.save(model, "model")
    # Evaluate on the held-out set without tracking gradients
    model.eval()
    with torch.no_grad():
        for X1, X2, y, timeslot in test_dataloader:
            X1 = X1.float()
            X2 = X2.float()
            y = y.float()
            output = model(X1, X2)
            loss = criterion(output, y)
            print(loss.item())
    return model
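
Since the example persists the whole model object with torch.save(model, "model"), a later script could reload it roughly as follows; this is only a sketch, and it assumes the Attention class definition is importable when unpickling.

import torch

# Reload the full model object saved above; recent PyTorch versions may
# require weights_only=False when loading full-model pickles.
model = torch.load("model")
model.eval()  # switch to inference mode before evaluating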
Example #4
def main():
    # Get data from data_engineering step
    df = dp.main()
    df.cache()
    # Per-label sampling fractions used to correct the class imbalance
    sample_fractions = {
        "ChurnedInTimeBin": 0.065,
        "WillChurnInNextBin": 0.05,
        "WillChurnSoon": 0.12
    }
    # Use areaUnderPR as evaluation metric
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    # Loop over the labels we want to build models for
    for label, sample_fraction in sample_fractions.items():
        print("Building model for predicting this label: ", label)
        # Define features and labels into a dataframe for modeling
        dataset = define_features_and_label(df, label)
        # Down-sample the majority class (label 0) because of the class imbalance
        sampled_data = dataset.sampleBy('label',
                                        fractions={
                                            0: sample_fraction,
                                            1: 1.0
                                        })
        dataset.unpersist()
        sampled_data.unpersist()
        # Do 80/20 split of the dataset for training/testing
        train, test = sampled_data.randomSplit([0.8, 0.2], seed=147309)
        train.cache()
        test.cache()
        # Train, CrossValidate, and Save models
        # Logistic Regression
        bestLR = lr_model(train, test, evaluator)
        bestLR.write().overwrite().save("./Model/" + label + ".LRmodel")
        del bestLR
        # Random Forest
        bestRF = rf_model(train, test, evaluator)
        bestRF.write().overwrite().save("./Model/" + label + ".RFmodel")
        del bestRF
        # Gradient-Boosted Trees
        bestGBT = gbt_model(train, test, evaluator)
        bestGBT.write().overwrite().save("./Model/" + label + ".GBTmodel")
        del bestGBT
        train.unpersist()
        test.unpersist()
    df.unpersist()
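
A persisted model can later be reloaded by the matching pyspark.ml class. A sketch, assuming lr_model returns a fitted LogisticRegressionModel (the exact returned type is not shown above); new_data is a hypothetical DataFrame with the same feature column used during training.

from pyspark.ml.classification import LogisticRegressionModel

# Hypothetical reload of one of the saved models
churn_lr = LogisticRegressionModel.load("./Model/ChurnedInTimeBin.LRmodel")
predictions = churn_lr.transform(new_data)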
Example #5
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam

import data_processing
from data_post_process import bin_to_families, get_family_membership_idx, \
    get_train_test_idx
from helper_functions import plot_parity



processed_data = data_processing.main()
X, y = processed_data['X'], processed_data['y']

bin_bounds = [2, 5]
n_bins = len(bin_bounds) + 1
descriptor_names = processed_data['descriptor_names']
family_one_hotx = processed_data['family_one_hotx']
y, y_ordinal = bin_to_families(y, bin_upper_bounds=bin_bounds)
# Get class membership indices so train/test splits can be made independently within each family
class_membership_idx = get_family_membership_idx(y_ordinal)
# pyplot parameters
plt.rcParams['svg.fonttype'] = 'none'
plt.rc('xtick', labelsize=16)
plt.rc('ytick', labelsize=16)
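
The unused imports above (train_test_split, StandardScaler) suggest a split-and-scale step follows this excerpt; the project's own split goes through get_train_test_idx per family, so the stratified version below is only a generic sketch with assumed settings.

# Generic stratified split and feature scaling (illustrative settings only)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_ordinal, test_size=0.2, random_state=0, stratify=y_ordinal)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)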

Example #6
def main():
    # Filter warnings that pollute the project's stdout.
    filter_warnings()
    # Rationale: produce cleaner results.

    # Set the random seed for the entire project.
    du.set_random_seed(0)
    # Rationale: ensure reproducibility of the results.

    # Flush previous runs.
    # constants.flush_project_results(constants.TMP_PATH,
    #                                 constants.OUTPUT_PATH)
    # Rationale: provide a clean state for the project to run in and enforce
    # reproducibility of the results.

    # Download, load and save data.
    data_loading.main()
    dataframe = data_loading.load_data(constants.DATASET_PATH,
                                       constants.TMP_PATH)
    data_loading.save_data(dataframe, constants.TMP_PATH,
                           constants.DATASET_PATH)
    # Rationale: *Loading*: load data in the main module and pass it as the
    # first argument to every other function that relates to the data set,
    # thus saving precious time on data loading. *Saving*: for big data sets,
    # writing the dataset to a fast-to-read format (such as HDF5) saves time.

    # Load and combine data processing pipelines.
    data_processing.main(dataframe, nan_strategy='drop')
    # Rationale: prepare data to be fed into the models.
    # Different algorithms make use of different data structures. For instance,
    # XGBoost allows NaNs, while data transformations usually don't.

    # Perform exploratory data analyses.
    data_exploration.main(dataframe)
    # Rationale: conduct exploratory data analyses.

    # Data split.
    # Removed.
    # Rationale: module 'models' should execute this.

    # Perform grid search.
    # Iteration over processed data sets may occur here since they are model
    # dependent.
    grid_search.main(constants.MODELS, constants.GRIDS)
    best_combination_of_datasets_and_grids = (
        grid_search.dict_of_best_datasets_and_grids(constants.MODELS,
                                                    constants.GRIDS))
    best_datasets = best_combination_of_datasets_and_grids['best_datasets']
    best_grids = best_combination_of_datasets_and_grids['best_grids']
    # Rationale: perform grid search as part of machine learning best
    # practices.

    # Summary of what was executed so far:
    # 1) Setting of the random seed for reproducibility.
    # 2) Flushing of intermediate results for a clean run.
    # 3) Data loading and data saving.
    # 4) Exploratory data analyses.
    # 5) Grid search for the best model hyperparameters.
    # To conclude our project we need the grand finale: model selection and
    # evaluation/comparison.
    models.main(constants.MODELS, best_datasets, best_grids,
                constants.MODEL_FITTING_PARAMETERS)
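
filter_warnings is not shown in this excerpt; below is a minimal sketch of what such a helper commonly does, offered as an assumption rather than the project's actual code.

import warnings

def filter_warnings():
    # Silence noisy but typically harmless warning categories so the
    # project's stdout stays readable (hypothetical implementation).
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=DeprecationWarning)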
Example #7
#####
# Runs API and data processing modules
#####

from api_connect import url_dict, callAPI
import data_processing as dp

if __name__ == "__main__":
    callAPI(url_dict)  # connects to API
    dp.main()  # processes raw tables
Example #8
def main():
    # import the data
    processed_data = data_processing.main()
    X, y = processed_data['X'], processed_data['y']
    feature_names = processed_data['descriptor_names']
    X = torch.tensor(X, device=device)
    y = torch.tensor(y.reshape(-1, 1), device=device)
    # normalize X and y
    y_scale = y.std(dim=0)
    y_mean = y.mean(dim=0)
    X = (X - X.mean(dim=0)) / X.std(dim=0)
    y = (y - y_mean) / y_scale
    # 40% of the data is used as initial training points; the rest are test data
    initial_data_size = int(0.4 * X.shape[0])
    initial_idx = list(np.random.choice(X.shape[0], initial_data_size,
                                        replace=False))
    X_test, X_train = tensor_pop(X, to_pop=initial_idx)
    y_test, y_train = tensor_pop(y, to_pop=initial_idx)
    # set up GPR model
    gpr_model, gpr_mll = get_gpr_model(X=X_train, y=y_train)
    
    # plot model performance
    plot_testing(gpr_model, X_test=X, X_train=X_train, y_train=y_train,
                 y_test=y, x_dim=13, feature_names=feature_names)

    # do some optimization!
    N_OPT_STEPS = 25
    opt_bounds = torch.stack([X.min(dim=0).values, X.max(dim=0).values])
    max_val, upper_confidence, lower_confidence = [], [], []
    for _ in range(N_OPT_STEPS):
        # get the point which has the maximum posterior and the variance to it
        print('X_train ', len(X_train), 'X_test ', len(X_test),
              'max y_train', y_train.max() * y_scale + y_mean)
        gpr_model.eval()
        posterior = gpr_model.posterior(X)
        lower, upper = posterior.mvn.confidence_region()
        max_posterior, index = posterior.mean.max(dim=0)
        print(max_posterior * y_scale + y_mean, 'max post')
        max_val.append(float(max_posterior * y_scale + y_mean))
        upper_confidence.append(float(upper[index] * y_scale + y_mean))
        lower_confidence.append(float(lower[index] * y_scale + y_mean))

        updated_model = optimize_loop(model=gpr_model, loss=gpr_mll,
                                      X_train=X_train, y_train=y_train,
                                      X_test=X_test, y_test=y_test,
                                      bounds=opt_bounds)
        gpr_model, gpr_mll = updated_model['model'], updated_model['loss']
        X_train, y_train = updated_model['X_train'], updated_model['y_train']
        X_test, y_test = updated_model['X_test'], updated_model['y_test']
        X_new, y_new = updated_model['X_new'], updated_model['y_new']
        # plot model performance
        plot_testing(gpr_model, X_test=X, X_train=X_train, y_train=y_train,
                     y_test=y, x_dim=13, feature_names=feature_names,
                     X_new=X_new, y_new=y_new)

    steps = list(range(N_OPT_STEPS))
    plt.plot(steps, max_val, 'go--', linewidth=2, markersize=12)
    plt.fill_between(steps, lower_confidence, upper_confidence,
                     alpha=0.5, label='95% credibility interval')
    plt.legend()
    plt.show()
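
tensor_pop is not defined in this excerpt; judging from the call sites above it returns (remaining_rows, popped_rows) for the given row indices. A hypothetical sketch consistent with that usage, not the project's implementation:

import torch

def tensor_pop(t, to_pop):
    # Split a tensor row-wise: rows not in `to_pop` first, popped rows second
    # (hypothetical reconstruction for illustration).
    mask = torch.ones(t.shape[0], dtype=torch.bool, device=t.device)
    mask[to_pop] = False
    return t[mask], t[~mask]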
Example #9
    def launch_process_data(self):
        print("Launching data processing program")
        data_processing.main()