# Tail of the feature-name list — presumably AVAILABLE_FEATURES, which the loop
# below iterates; the list's opening bracket is above this chunk — TODO confirm.
'ch.uzh.ciclassifier.features.travisci.BuildTimeAverage',
'ch.uzh.ciclassifier.features.travisci.BuildSuccessRatio',
'ch.uzh.ciclassifier.features.travisci.BuildTimeLatestAverage',
'ch.uzh.ciclassifier.features.travisci.ManualInteractionRatio',
'ch.uzh.ciclassifier.features.travisci.PullRequestRatio',
'ch.uzh.ciclassifier.features.travisci.TimeToFixAverage',
'ch.uzh.ciclassifier.features.travisci.TimeToFixLatestAverage',
]

# Ground-truth dataset and the three feature groups whose combinations are tested.
FEATURES_FILE = 'data/truth.csv'
TYPES = ['configuration', 'repository', 'travisci']

results = []
# Evaluate every non-empty combination of feature groups:
# i + 1 is the combination size (1, 2, then all 3 groups).
for i in range(len(TYPES)):
    for permutation in list(itertools.combinations(TYPES, i + 1)):
        raw_data = pd.read_csv(FEATURES_FILE)
        features = raw_data[get_features()]
        # Keep only the columns whose feature group is part of this
        # permutation; `feature in features` tests DataFrame column membership.
        for feature in AVAILABLE_FEATURES:
            featureType = helper.type_from_feature_name(feature)
            if featureType not in permutation and feature in features:
                features = features.drop(feature, axis=1)
        labels = np.array(raw_data['actual'])
        features = np.array(features)
        # 10-fold CV repeated 10 times; fixed seed for reproducibility.
        cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
        rf = RandomForestClassifier(**get_rf_parameters())
        # NOTE: this call is truncated here — its remaining arguments
        # (e.g. cv=cv, n_jobs=...) continue past the visible chunk.
        scores = cross_val_score(rf, features, labels, scoring='accuracy',
import csv
import pickle
import pandas as pd
from parameters import get_features

# Input: raw feature CSV; output: same rows plus a model 'score' column.
TARGET = 'results/ciclassifier_raw_new.csv'
EXPORT = 'results/ciclassifier_new.csv'
MODEL_PATH = '../models/classifier_configuration.sav'

raw_data = pd.read_csv(TARGET)
# The model is fed only the feature columns, not the project identifier.
to_predict = raw_data.drop('project', axis=1)
features = get_features()
# This model was trained on configuration features only.
features = list(filter(lambda f: "configuration" in f, features))
to_predict = to_predict[features]

# NOTE(review): pickle.load is only safe on trusted, locally-produced model
# files; also the file handle is never closed explicitly — a `with` block
# would be preferable.
model = pickle.load(open(MODEL_PATH, 'rb'))
predictions = model.predict(to_predict)

projects = raw_data.to_dict('records')
with open(EXPORT, 'w', newline='') as csvfile:
    # Output columns: project id, all selected features, then the score.
    fieldnames = ['project']
    fieldnames.extend(features)
    fieldnames.append('score')
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # predictions[i] corresponds row-for-row to projects[i].
    # NOTE(review): zip(projects, predictions) would be more idiomatic than
    # a manual index counter.
    index = 0
    for project in projects:
        prediction = predictions[index]
        # Loop body is truncated here — the row write and index increment
        # presumably follow past the visible chunk.
        project['score'] = prediction
# NOTE(review): `from scipy.stats import stats` imports the legacy
# scipy.stats.stats submodule (removed in modern SciPy); `from scipy import
# stats` is the supported form — only the commented-out z-score lines below
# use it, so this currently has no runtime effect.
from scipy.stats import stats
from helper import short_name
from parameters import get_features, beautify_feature_name

# Ground-truth dataset split into positive/negative classes by label.
DATA = '../data/truth.csv'
# `pd`, `plt` (and `np` for the commented z-score filters) are imported
# above this chunk — TODO confirm.
raw_data = pd.read_csv(DATA)
good = raw_data.loc[raw_data['actual'] == 1]
bad = raw_data.loc[raw_data['actual'] == 0]

# One subplot per feature on a 6x4 grid.
fig, axs = plt.subplots(6, 4, figsize=(10, 12))
index = 0
row = 0
for feature in get_features():
    beautiful_name = beautify_feature_name(feature)
    # Column cycles 0..3; row advances presumably every 4 features,
    # past the visible chunk — TODO confirm.
    col = index % 4
    print(row, col)
    tmp_good = good[feature]
    # Experiments with outlier trimming, kept for reference:
    #tmp_good = tmp_good[tmp_good.between(tmp_good.quantile(.00), tmp_good.quantile(.95))]
    # tmp_good = tmp_good[(np.abs(stats.zscore(tmp_good)) < 3)]
    tmp_bad = bad[feature]
    # tmp_bad = tmp_bad[tmp_bad.between(tmp_bad.quantile(.00), tmp_bad.quantile(.95))]
    # tmp_bad = tmp_bad[(np.abs(stats.zscore(tmp_bad)) < 3)]
    #axs[row, col].hist([good[feature],bad[feature]], 20, alpha=0.5)
    #axs[row, col].hist(, 20, facecolor='r', alpha=0.5)
# Tail of the TravisCI feature-name list; its opening bracket is above this
# chunk — TODO confirm the list's name.
'ch.uzh.ciclassifier.features.travisci.PullRequestRatio',
'ch.uzh.ciclassifier.features.travisci.TimeToFixAverage',
'ch.uzh.ciclassifier.features.travisci.TimeToFixLatestAverage',
]

# Ground-truth dataset; the classifier is evaluated separately per language.
FEATURES_FILE = 'data/truth.csv'
# NOTE(review): 'LANGAUGES' is misspelled (should be LANGUAGES); it is used
# consistently below, so behavior is unaffected.
LANGAUGES = ['Ruby', 'JavaScript', 'Python', 'Java', 'C++', 'PHP']

results = []
# NOTE: not referenced in the visible chunk — presumably used further down.
NUMBER_OF_RUNS = 10

for language in LANGAUGES:
    raw_data = pd.read_csv(FEATURES_FILE)
    # Restrict the dataset to projects written in this language.
    subset = raw_data.loc[raw_data['language'] == language]
    features = subset[get_features()]
    labels = np.array(subset['actual'])
    features = np.array(features)
    # 10-fold CV repeated 10 times; fixed seed for reproducibility;
    # n_jobs=-1 parallelizes across all cores.
    cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
    rf = RandomForestClassifier(**get_rf_parameters())
    scores = cross_val_score(rf, features, labels, scoring='accuracy',
                             cv=cv, n_jobs=-1)
    # Per-run metric accumulators; filled past the visible chunk.
    accuracies = []
    precisions = []
    recalls = []