Example No. 1
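The snippet below is truncated at the top: the imports and the hyperparameter search space it fits over are not shown. A minimal sketch of what that missing preamble might look like (the random_grid names and value ranges are assumptions, not taken from the original):

from pprint import pprint

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Assumed search space over common random forest hyperparameters
random_grid = {
    'n_estimators': [int(x) for x in np.linspace(200, 2000, num=10)],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}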
# The original example resumes here mid-call: the two keyword arguments below
# close a RandomizedSearchCV over a RandomForestClassifier (the preceding
# arguments are not visible in the snippet and are reconstructed).
rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(),
                               param_distributions=random_grid,
                               random_state=42,
                               n_jobs=-1)
# Fit the random search model
rf_random.fit(features, labels)

pprint(rf_random.best_params_)


def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    accuracy = accuracy_score(test_labels, predictions)
    print('Model Performance')
    print('Accuracy = {:0.2f}%.'.format(accuracy * 100))

    return accuracy


# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.25, random_state=43)

base_model = RandomForestClassifier(**get_rf_parameters())
base_model.fit(train_features, train_labels)

base_accuracy = evaluate(base_model, test_features, test_labels)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_features, test_labels)

print('Improvement of {:0.2f}%.'.format(
    100 * (random_accuracy - base_accuracy) / base_accuracy))
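All three examples build the forest from a project-level get_rf_parameters() helper that is not shown on this page. A minimal sketch of what such a helper might return (the values are illustrative assumptions; the 1000-tree default matches the comment in Example No. 2):

def get_rf_parameters():
    # Assumed defaults; the real helper is defined elsewhere in the project
    return {
        'n_estimators': 1000,
        'random_state': 42,
        'n_jobs': -1,
    }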
Example No. 2
# The original snippet is truncated above; raw_data, MODEL_PATH and
# get_rf_parameters() are defined in the omitted part. The list below is the
# tail of the feature whitelist consumed a few lines further down.
import pickle

import numpy as np
from sklearn.ensemble import RandomForestClassifier

used_features = [
    'ch.uzh.ciclassifier.features.configuration.UseCache',
    'ch.uzh.ciclassifier.features.configuration.UseDeploy',
    # 'ch.uzh.ciclassifier.features.github.OwnerType',
    # 'ch.uzh.ciclassifier.features.github.PrimaryLanguage',
    # 'ch.uzh.ciclassifier.features.repository.CommitsUntilConfigAdded',
    # 'ch.uzh.ciclassifier.features.repository.ConfigChangeFrequency',
    # 'ch.uzh.ciclassifier.features.repository.DaysUntilConfigAdded',
    # 'ch.uzh.ciclassifier.features.repository.NumberOfConfigurationFileChanges',
    # 'ch.uzh.ciclassifier.features.repository.NumberOfContributorsOnConfigurationFile',
    # 'ch.uzh.ciclassifier.features.repository.ProjectName',
    # 'ch.uzh.ciclassifier.features.travisci.BuildSuccessRatio',
    # 'ch.uzh.ciclassifier.features.travisci.BuildTimeAverage',
    # 'ch.uzh.ciclassifier.features.travisci.BuildTimeLatestAverage',
    # 'ch.uzh.ciclassifier.features.travisci.ManualInteractionRatio',
    # 'ch.uzh.ciclassifier.features.travisci.PullRequestRatio',
    # 'ch.uzh.ciclassifier.features.travisci.TimeToFixAverage',
    # 'ch.uzh.ciclassifier.features.travisci.TimeToFixLatestAverage',
]

# Labels are the values we want to predict
labels = np.array(raw_data['actual'])
features = raw_data[used_features]

# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(**get_rf_parameters())

# Train the model on training data
rf.fit(features, labels)

# Serialize the trained model to disk
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(rf, model_file)
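For reuse, the serialized model can be loaded back with pickle; a minimal sketch assuming the same MODEL_PATH and feature layout:

with open(MODEL_PATH, 'rb') as model_file:
    loaded_rf = pickle.load(model_file)

predictions = loaded_rf.predict(features)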
Example No. 3
# get_features() and get_rf_parameters() are project helpers defined elsewhere
# (not shown in this truncated snippet).
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold, cross_val_score

FEATURES_FILE = 'data/truth.csv'
LANGUAGES = ['Ruby', 'JavaScript', 'Python', 'Java', 'C++', 'PHP']

results = []

NUMBER_OF_RUNS = 10

for language in LANGUAGES:
    raw_data = pd.read_csv(FEATURES_FILE)
    subset = raw_data.loc[raw_data['language'] == language]
    features = subset[get_features()]
    labels = np.array(subset['actual'])
    features = np.array(features)

    cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
    rf = RandomForestClassifier(**get_rf_parameters())
    scores = cross_val_score(rf,
                             features,
                             labels,
                             scoring='accuracy',
                             cv=cv,
                             n_jobs=-1)

    accuracies = []
    precisions = []
    recalls = []
    for run in range(NUMBER_OF_RUNS):
        raw_data = pd.read_csv(FEATURES_FILE)
        subset_train = raw_data.loc[raw_data['language'] != language]
        subset_test = raw_data.loc[raw_data['language'] == language]