Ejemplo n.º 1
0
def grab_tree(app_data, instruction_data, kind, features, interactive=True, keep_features=False):
    """Fit a small decision tree (max 4 leaves) and return generated code for it.

    Parameters
    ----------
    app_data : DataFrame-like of per-application samples.
    instruction_data : DataFrame-like of per-instruction samples.
    kind : pipeline kind forwarded to get_pipeline_steps / CodeGenerator.
    features : when keep_features is True, an iterable of feature names to
        keep; otherwise the name of an attribute on the ``apollo`` module
        listing features to drop.
    interactive : accepted for signature parity with sibling commands;
        not used here.
    keep_features : selects which interpretation of *features* applies.

    Returns
    -------
    The code produced by ``CodeGenerator(kind).get_code(...)`` for the
    fitted tree.
    """
    if keep_features:
        # Drop every column that was not explicitly requested.
        candidates = list(app_data) + list(instruction_data)
        dropped = [name for name in candidates if name not in features]
        steps = get_pipeline_steps(
            kind=kind, data=instruction_data, dropped_features=dropped)
    else:
        steps = get_pipeline_steps(
            kind=kind, data=instruction_data,
            dropped_features=getattr(apollo, features))

    df_pipeline = DataframePipeline(steps)
    X, y = df_pipeline.fit_transform(app_data)
    labelencoder = df_pipeline['y']

    mapper = AutoDataFrameMapper()
    tree_pipeline = Pipeline([
        ('mapper', mapper),
        ('clf', DecisionTreeClassifier(max_leaf_nodes=4)),
    ])
    features = mapper.get_feature_list(X)
    tree_pipeline.fit(X, y)

    return CodeGenerator(kind).get_code(
        tree_pipeline.steps[-1][1], features, labelencoder.get_labels())
Ejemplo n.º 2
0
def run_regression(app_data,
                   instruction_data,
                   kind,
                   features,
                   interactive=True,
                   keep_features=False,
                   **kwargs):
    """Cross-validated Lasso regression over the preprocessed app data.

    Parameters
    ----------
    app_data : DataFrame-like of per-application samples.
    instruction_data : DataFrame-like of per-instruction samples.
    kind : pipeline kind forwarded to get_pipeline_steps.
    features : feature-keep list or apollo attribute name, depending on
        *keep_features* (see sibling commands).
    interactive, **kwargs : accepted for signature parity; unused here.

    Returns
    -------
    (y, predictions) — the target vector and the cross-validated
    predictions from ``cross_val_predict``.
    """
    if keep_features:
        # Keep only the explicitly requested features.
        candidates = list(app_data) + list(instruction_data)
        dropped = [name for name in candidates if name not in features]
    else:
        dropped = getattr(apollo, features)

    steps = get_pipeline_steps(kind=kind,
                               data=instruction_data,
                               dropped_features=dropped)

    # The 'threads' step is excluded for the regression path.
    steps = [step for step in steps if step[0] != 'threads']

    X, y = DataframePipeline(steps).fit_transform(app_data)

    estimator = Pipeline([('mapper', AutoDataFrameMapper()),
                          ('clf', Lasso())])

    predictions = cross_val_predict(estimator, X, y)

    return y, predictions
Ejemplo n.º 3
0
def do_time(app_data,
            instruction_data,
            kind,
            features,
            interactive=True,
            keep_features=False):
    """Compute the optimal values and timing results for *app_data*.

    Parameters
    ----------
    app_data : DataFrame-like of per-application samples.
    instruction_data : DataFrame-like of per-instruction samples.
    kind : pipeline kind forwarded to get_pipeline_steps / get_time.
    features : when keep_features is True, an iterable of feature names to
        keep; otherwise the name of an attribute on the ``apollo`` module
        listing features to drop.
    interactive : when True, print the results; when False, return them.
    keep_features : selects which interpretation of *features* applies.

    Returns
    -------
    (optimal, times) when interactive is False; otherwise None (the
    results are printed instead).
    """
    if keep_features:
        # Drop every feature that was not explicitly requested.
        all_features = list(app_data) + list(instruction_data)
        steps = get_pipeline_steps(
            kind=kind,
            data=instruction_data,
            dropped_features=[x for x in all_features if x not in features])
    else:
        steps = get_pipeline_steps(kind=kind,
                                   data=instruction_data,
                                   dropped_features=getattr(apollo, features))

    pipeline = DataframePipeline(steps)

    X, y = pipeline.fit_transform(app_data)
    optimal = pipeline.get_x('y')
    times = get_time(pipeline.get_x('duplicates'),
                     pipeline.get_x('drop features'),
                     X,
                     y,
                     pipeline['y'],
                     kind=kind)

    if interactive:
        # Bug fix: the original printed the undefined name `optmal`
        # (a guaranteed NameError on this path) and mislabeled the
        # second section "Timel".
        print("Optimal")
        print(optimal)
        print("Time")
        print(times)
    else:
        return (optimal, times)
Ejemplo n.º 4
0
def get_features(app_data, instruction_data, kind, features):
    """Rank features by random-forest importance.

    Fits a random forest on the preprocessed data and returns up to the
    15 most important feature names together with their importance
    scores, in descending order of importance.
    """
    steps = get_pipeline_steps(kind=kind,
                               data=instruction_data,
                               dropped_features=getattr(apollo, features))

    # The 'threads' step is excluded here.
    steps = [step for step in steps if step[0] != 'threads']

    X, y = DataframePipeline(steps).fit_transform(app_data)

    model = Pipeline([('mapper', AutoDataFrameMapper()),
                      ('clf', RandomForestClassifier())])
    model.fit(X, y)

    names = AutoDataFrameMapper().get_feature_list(X)
    forest = model.steps[-1][1]
    importances = forest.feature_importances_
    # Per-tree spread of the importances; computed but not returned
    # (kept for parity with the original implementation).
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    order = np.argsort(importances)[::-1]

    # Report at most the top 15 features.
    top = min(15, len(names))
    top_names = [names[order[i]] for i in range(top)]
    top_scores = [importances[order[i]] for i in range(top)]

    return top_names, top_scores
Ejemplo n.º 5
0
def confusion(parser, args):
    """Print a row-normalized confusion matrix for the first classifier.

    Expects ``args.files`` to hold the application-sample file followed
    by the instruction-sample file; exits with -1 when no files were
    given. ``args.predict`` selects the pipeline kind and
    ``args.features`` names the apollo attribute of features to drop.
    """
    warnings.simplefilter("ignore")

    if not args.files:
        sys.stderr.write("install requires two files of application samples\n")
        sys.exit(-1)

    app_name, inst_name = args.files[0], args.files[1]
    app_data, instruction_data = apollo.util.loader.load(app_name, inst_name)

    steps = get_pipeline_steps(kind=args.predict,
                               data=instruction_data,
                               dropped_features=getattr(apollo, args.features))

    # The 'threads' step is excluded here.
    steps = [step for step in steps if step[0] != 'threads']

    X, y = DataframePipeline(steps).fit_transform(app_data)

    model = Pipeline([('mapper', AutoDataFrameMapper()),
                      ('clf', clfs[0])])

    train_inds, test_inds = get_train_test_inds(y)

    model.fit(X[train_inds], y[train_inds])
    predictions = model.predict(X[test_inds])

    cm = confusion_matrix(y[test_inds], predictions)
    # Normalize each row so entries are per-class prediction fractions.
    print(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis])
Ejemplo n.º 6
0
def run_cross_app_model(train_app, train_inst, test_app, test_inst, kind, features):
    """Train on one application's samples and score on another's.

    The instruction frames' column sets are aligned (missing columns
    zero-filled on either side) and only application features common to
    both frames are kept. A decision tree is then fit on the training
    application and evaluated on the test application.

    Returns
    -------
    float — the fraction of correct predictions over the evaluated
    samples (0.0 when none match).
    """
    train_inst_set = set(list(train_inst))
    test_inst_set = set(list(test_inst))

    train_app_set = set(list(train_app))
    test_app_set = set(list(test_app))

    # Zero-fill instruction columns present in one frame but not the
    # other so both frames share an identical column set.
    for inst in train_inst_set - test_inst_set:
        test_inst[inst] = 0
    for inst in test_inst_set - train_inst_set:
        train_inst[inst] = 0

    # Keep only the application features both datasets provide.
    feats_in_both = train_app_set & test_app_set

    train_app = train_app[list(feats_in_both)]
    test_app = test_app[list(feats_in_both)]

    steps = get_pipeline_steps(
        kind=kind, data=train_inst,
        dropped_features=getattr(apollo, features))
    pipeline = DataframePipeline(steps)

    X_train, y_train = pipeline.fit_transform(train_app)

    steps = get_pipeline_steps(
        kind=kind, data=test_inst,
        dropped_features=getattr(apollo, features))
    pipeline = DataframePipeline(steps)

    X_test, y_test = pipeline.fit_transform(test_app)

    pipeline = Pipeline([
        ('mapper', AutoDataFrameMapper()),
        ('clf', DecisionTreeClassifier())])

    # Bug fix: the original called pipeline.fit_transform(), but the final
    # step is a classifier with no transform(), so Pipeline.fit_transform
    # raises; the transformed output was never used anyway. fit() is the
    # correct call here.
    pipeline.fit(X_train, y_train)

    # Only evaluate on the overlap of the two sample sets.
    test_size = min(len(X_train), len(X_test))

    y_pred = pipeline.predict(X_test[:test_size])

    results = (y_pred == y_test[:test_size])
    if True in results:
        return pd.value_counts(results)[True] / float(len(results))
    else:
        return 0.0
Ejemplo n.º 7
0
def gen_code(app_data, instruction_data, kind, features, interactive=True, keep_features=False, instructions=False):
    """Fit a depth-3 decision tree and emit generated code for it.

    Parameters
    ----------
    app_data : DataFrame-like of per-application samples.
    instruction_data : DataFrame-like of per-instruction samples.
    kind : pipeline kind forwarded to get_pipeline_steps / CodeGenerator.
    features : when keep_features is True, an iterable of feature names to
        keep; otherwise the name of an attribute on the ``apollo`` module
        listing features to drop.
    interactive : when True, print the generated code; otherwise return it.
    keep_features : selects which interpretation of *features* applies.
    instructions : when True (and keep_features), additionally drop all
        instruction-derived columns.

    Returns
    -------
    The generated code when interactive is False; otherwise None.
    """
    if keep_features:
        print(features)
        every_feature = list(app_data) + list(instruction_data)
        if instructions:
            # Drop unrequested features and all instruction columns.
            dropped = [f for f in every_feature
                       if f not in features or f in list(instruction_data)]
        else:
            dropped = [f for f in every_feature if f not in features]
    else:
        dropped = getattr(apollo, features)

    steps = get_pipeline_steps(
        kind=kind, data=instruction_data, dropped_features=dropped)

    df_pipeline = DataframePipeline(steps)

    X, y = df_pipeline.fit_transform(app_data)
    labelencoder = df_pipeline['y']

    mapper = AutoDataFrameMapper()
    tree_pipeline = Pipeline([
        ('mapper', mapper),
        ('clf', DecisionTreeClassifier(max_depth=3))])
    features = mapper.get_feature_list(X)
    tree_pipeline.fit(X, y)

    model = CodeGenerator(kind).get_code(
        tree_pipeline.steps[-1][1],
        features,
        labelencoder.get_labels(),
        list(instruction_data),
    )

    if interactive:
        print(model)
    else:
        return model
Ejemplo n.º 8
0
def run_model(app_data, instruction_data, kind, features, interactive=True, keep_features=False, **kwargs):
    """Cross-validated decision-tree score on the preprocessed data.

    Runs 5-fold cross validation with a decision tree whose depth is
    bounded by ``kwargs.get('depth')`` (unbounded when absent). Prints
    the mean score when interactive, otherwise returns it.
    """
    if keep_features:
        available = list(app_data) + list(instruction_data)
        dropped = [name for name in available if name not in features]
    else:
        print(features)
        dropped = getattr(apollo, features)

    steps = get_pipeline_steps(
        kind=kind, data=instruction_data, dropped_features=dropped)

    # The 'threads' step is excluded from this model.
    steps = [step for step in steps if step[0] != 'threads']

    X, y = DataframePipeline(steps).fit_transform(app_data)

    estimator = Pipeline([
        ('mapper', AutoDataFrameMapper()),
        ('clf', DecisionTreeClassifier(max_depth=kwargs.get('depth')))])

    scores = cross_val_score(estimator, X, y, cv=5)

    mean_score = scores.mean()
    if interactive:
        print(mean_score)
    else:
        return mean_score