Ejemplo n.º 1
0
def fit(ml_algorithm, train_files, train_labels, name):
    """Extract training features and fit the selected algorithm on them.

    Args:
        ml_algorithm: Key used to look up the algorithm class in ``ALGORITHMS``.
        train_files: Training inputs handed to ``FeatureExtractor.get_features``.
        train_labels: Labels forwarded to the algorithm's ``fit``.
        name: Forwarded unchanged to the algorithm's ``fit``; its meaning is
            defined by the algorithm class, not by this function.

    Returns:
        None. When ``ml_algorithm`` is not a key of ``ALGORITHMS`` a message
        is printed and nothing is fitted.
    """
    # Only the first returned value is needed for fitting, so an empty list is
    # passed as the second argument and the second result is discarded.
    extracted = FeatureExtractor.get_features(train_files, [])
    features, _ignored = extracted

    try:
        model_cls = ALGORITHMS[ml_algorithm]
    except KeyError:
        print('Algorithm type not valid!')
        return None

    model = model_cls()
    model.fit(features, train_labels, name)
Ejemplo n.º 2
0
def get_probs(ml_algorithm, train_files, test_files, name):
    """Extract test features and return the selected algorithm's prediction.

    Args:
        ml_algorithm: Key used to look up the algorithm class in ``ALGORITHMS``.
        train_files: Passed to ``FeatureExtractor.get_features`` alongside the
            test files; the first value it returns is not used here.
        test_files: Inputs whose extracted features are passed to ``predict``.
        name: Forwarded unchanged to the algorithm's ``predict``; its meaning
            is defined by the algorithm class, not by this function.

    Returns:
        The two values produced by the algorithm's ``predict`` as a tuple, or
        None (after printing a message) when ``ml_algorithm`` is not a key of
        ``ALGORITHMS``.
    """
    extracted = FeatureExtractor.get_features(train_files, test_files)
    _ignored, features = extracted

    try:
        model_cls = ALGORITHMS[ml_algorithm]
    except KeyError:
        print('Algorithm type not valid!')
        return None

    model = model_cls()

    # Unpack first so a result of the wrong arity fails here, as before.
    first, second = model.predict(features, name)
    return first, second
Ejemplo n.º 3
0
def train_and_predict(ml_algorithm, train_files, train_labels, test_files, test_labels):
    """Fit the selected algorithm and predict on the test files in one call.

    Args:
        ml_algorithm: Key used to look up the algorithm class in ``ALGORITHMS``.
        train_files: Training inputs for ``FeatureExtractor.get_features``.
        train_labels: Labels forwarded to the algorithm's ``fit_and_predict``.
        test_files: Test inputs for ``FeatureExtractor.get_features``.
        test_labels: Labels forwarded to the algorithm's ``fit_and_predict``.

    Returns:
        The second and third values returned by ``fit_and_predict`` as a
        tuple. The first value (the score) is printed rather than returned.
        Returns None (after printing a message) when ``ml_algorithm`` is not
        a key of ``ALGORITHMS``.
    """
    extracted = FeatureExtractor.get_features(train_files, test_files)
    fit_features, eval_features = extracted

    try:
        model_cls = ALGORITHMS[ml_algorithm]
    except KeyError:
        print('Algorithm type not valid!')
        return None

    model = model_cls()
    outcome = model.fit_and_predict(fit_features, eval_features, train_labels, test_labels)
    score, indices, probs = outcome

    print('Score: {}'.format(score))

    return indices, probs
Ejemplo n.º 4
0
def train_kfold(ml_algorithm, source_code, labels, ast_nodes=None):
    """Run a hard-coded 10-fold evaluation of the selected algorithm.

    For every entry in the (currently single-element) list of tree counts, an
    algorithm instance is created with ``n_trees`` set to that value and
    evaluated over ``k`` folds. Per-fold scores, the mean score and the
    elapsed time are printed; nothing is returned.

    Fold ``i`` uses as its test set every index of the form
    ``i * it + j + m * code_per_author`` (``j`` in ``range(it)``, ``m >= 0``),
    where ``it = code_per_author // k``. All remaining indices form the
    training set.
    NOTE(review): this presumably relies on the samples being ordered so that
    each author contributes ``code_per_author`` consecutive entries -- confirm
    against the caller.

    Args:
        ml_algorithm: Key used to look up the algorithm class in ``ALGORITHMS``.
        source_code: Sequence of samples; converted to a NumPy array so it can
            be indexed with index lists.
        labels: Sequence of labels parallel to ``source_code``; converted to a
            NumPy array as well.
        ast_nodes: Accepted for interface compatibility; not used by this
            function.

    Returns:
        None. When ``ml_algorithm`` is not a key of ``ALGORITHMS`` a message
        is printed and no evaluation is run.
    """
    source_code = np.array(source_code)
    labels = np.array(labels)

    try:
        algorithm_type = ALGORITHMS[ml_algorithm]
    except KeyError:
        print('Algorithm type not valid!')
        return

    # The clock starts before the n_trees loop, so the reported execution time
    # is cumulative across all tree-count configurations.
    start = time.time()

    n_trees = [400]

    # Fold layout constants; they never change, so compute them once.
    k = 10
    code_per_author = 10
    # Explicit floor division: the value feeds range(), so it must be an int
    # regardless of whether true division is in effect.
    it = code_per_author // k
    n_samples = len(source_code)

    for nt in n_trees:
        mla = algorithm_type(n_trees=nt)
        print('nt: {}'.format(nt))
        accuracy = 0

        for i in range(k):
            test_indices = []
            for j in range(it):
                test_indices.extend(range(i * it + j, n_samples, code_per_author))

            # Set lookup keeps building the complement linear in n_samples
            # instead of scanning the test list for every index.
            held_out = set(test_indices)
            train_indices = [idx for idx in range(n_samples) if idx not in held_out]

            train_features, test_features = FeatureExtractor.get_features(
                source_code[train_indices], source_code[test_indices])

            # NOTE(review): train_and_predict unpacks three values from
            # fit_and_predict, while here the result is accumulated as a
            # single numeric score -- confirm that algorithms accepting
            # n_trees return a scalar.
            score = mla.fit_and_predict(train_features, test_features,
                                        labels[train_indices], labels[test_indices])
            print('Score after {}th fold is {}'.format(i, score))
            accuracy += score

        print('Final score: {}'.format(accuracy / float(k)))
        end = time.time() - start
        print('Execution time: {}'.format(end))
        print('-----------------------------------------------')