def fit(ml_algorithm, train_files, train_labels, name):
    """Extract training features and fit the selected algorithm on them.

    Args:
        ml_algorithm: Key used to look up the algorithm class in ``ALGORITHMS``.
        train_files: Training inputs handed to
            ``FeatureExtractor.get_features``.
        train_labels: Labels passed to the algorithm's ``fit`` method.
        name: Forwarded unchanged to the algorithm's ``fit`` method
            (presumably an identifier for the fitted model -- confirm
            against the algorithm classes).

    Returns:
        None. When ``ml_algorithm`` is not a key of ``ALGORITHMS`` a message
        is printed and nothing is fitted.
    """
    # The extractor returns a (train, test) pair; with an empty test list
    # only the first element is of interest.
    extracted = FeatureExtractor.get_features(train_files, [])
    features, _ = extracted

    try:
        algorithm_cls = ALGORITHMS[ml_algorithm]
    except KeyError:
        print('Algorithm type not valid!')
        return None

    model = algorithm_cls()
    model.fit(features, train_labels, name)
def get_probs(ml_algorithm, train_files, test_files, name):
    """Extract test features and return the selected algorithm's prediction.

    Args:
        ml_algorithm: Key used to look up the algorithm class in ``ALGORITHMS``.
        train_files: Training inputs; passed to the feature extractor
            alongside ``test_files`` (only the test features are used here).
        test_files: Inputs whose features are handed to ``predict``.
        name: Forwarded unchanged to the algorithm's ``predict`` method
            (presumably identifies a previously fitted model -- confirm
            against the algorithm classes).

    Returns:
        The ``(prob_ind, highest_score)`` pair produced by ``predict``, or
        None (after printing a message) when ``ml_algorithm`` is not a key of
        ``ALGORITHMS``.
    """
    extracted = FeatureExtractor.get_features(train_files, test_files)
    _, features = extracted

    try:
        algorithm_cls = ALGORITHMS[ml_algorithm]
    except KeyError:
        print('Algorithm type not valid!')
        return None

    model = algorithm_cls()
    prob_ind, highest_score = model.predict(features, name)
    return prob_ind, highest_score
def train_and_predict(ml_algorithm, train_files, train_labels, test_files, test_labels):
    """Fit the selected algorithm and evaluate it on the test set in one go.

    Args:
        ml_algorithm: Key used to look up the algorithm class in ``ALGORITHMS``.
        train_files: Training inputs for the feature extractor.
        train_labels: Labels matching ``train_files``.
        test_files: Test inputs for the feature extractor.
        test_labels: Labels matching ``test_files``.

    Returns:
        The ``(indices, probs)`` pair taken from the algorithm's
        ``fit_and_predict`` result; the accompanying score is printed rather
        than returned. Returns None (after printing a message) when
        ``ml_algorithm`` is not a key of ``ALGORITHMS``.
    """
    extracted = FeatureExtractor.get_features(train_files, test_files)
    fit_features, eval_features = extracted

    try:
        algorithm_cls = ALGORITHMS[ml_algorithm]
    except KeyError:
        print('Algorithm type not valid!')
        return None

    model = algorithm_cls()
    outcome = model.fit_and_predict(
        fit_features, eval_features, train_labels, test_labels)
    score, indices, probs = outcome
    print('Score: {}'.format(score))
    return indices, probs
def train_kfold(ml_algorithm, source_code, labels, ast_nodes=None):
    """Run a 10-fold evaluation of the selected algorithm and print the scores.

    Fold ``i`` holds out the samples at positions ``i, i + 10, i + 20, ...``
    and trains on all remaining samples. This layout presumably relies on
    ``source_code`` being ordered in consecutive groups of 10 samples per
    author -- confirm with the caller.

    Args:
        ml_algorithm: Key used to look up the algorithm class in ``ALGORITHMS``.
        source_code: Sequence of samples; converted to a NumPy array so it
            can be indexed with lists of positions.
        labels: Sequence of labels aligned with ``source_code``.
        ast_nodes: Accepted for interface compatibility; not used here.

    Returns:
        None. Per-fold scores, the mean score and the elapsed time are
        printed. If ``ml_algorithm`` is not a key of ``ALGORITHMS`` a message
        is printed and nothing is evaluated.
    """
    source_code = np.array(source_code)
    labels = np.array(labels)

    try:
        algorithm_type = ALGORITHMS[ml_algorithm]
    except KeyError:
        print('Algorithm type not valid!')
        return

    start = time.time()

    k = 10
    code_per_author = 10
    # Floor division keeps this an int under both Python 2 and Python 3,
    # which range() below requires.
    per_fold = code_per_author // k
    n_samples = len(source_code)

    n_trees = [400]
    for nt in n_trees:
        # One model instance is reused across all folds for this setting.
        mla = algorithm_type(n_trees=nt)
        print('nt: {}'.format(nt))

        accuracy = 0
        for i in range(k):
            test_indices = []
            for j in range(per_fold):
                test_indices.extend(
                    range(i * per_fold + j, n_samples, code_per_author))

            # A set makes the membership test O(1) per sample instead of
            # scanning the test list for every index.
            held_out = set(test_indices)
            train_indices = [idx for idx in range(n_samples)
                             if idx not in held_out]

            train_features, test_features = FeatureExtractor.get_features(
                source_code[train_indices], source_code[test_indices])
            result = mla.fit_and_predict(
                train_features, test_features,
                labels[train_indices], labels[test_indices])
            # train_and_predict unpacks fit_and_predict as
            # (score, indices, probs); adding that tuple to the running
            # total would raise TypeError, so keep only the score. A bare
            # scalar result is still accepted as-is.
            score = result[0] if isinstance(result, tuple) else result

            print('Score after {}th fold is {}'.format(i, score))
            accuracy += score

        print('Final score: {}'.format(accuracy / float(k)))
        # Elapsed time is measured from before the first configuration.
        end = time.time() - start
        print('Execution time: {}'.format(end))
        print('-----------------------------------------------')