コード例 #1
0
ファイル: train.py プロジェクト: lhmtriet/PUMiner_MSR
import pandas as pd
import sanalytics.estimators.d2vestimator as sed
import logging
logging.basicConfig(level=logging.DEBUG)
from time import time
from progressbar import progressbar
import sanalytics.evaluation.evaluation_metric as see
import sanalytics.algorithms.utils as sau

## Import Data
# Load the train and test splits, and also build their union so that a
# second model can be trained on every available row.
X_train = pd.read_parquet("datasets/rq3_data/sec1.0_train.parquet")
X_test = pd.read_parquet("datasets/rq3_data/sec1.0_test.parquet")
X_all = pd.concat([X_train, X_test], sort=False)

# Keep only the rows labelled "security" (the "posonly" in the model names).
X_train_90 = X_train[X_train.label == "security"]    # positives from train only
X_train_100 = X_all[X_all.label == "security"]       # positives from train + test

## Train D2V
d2v_90 = sed.D2VEstimator().fit(X_train_90)
# The "all" model must be fitted on the train + test positives; fitting it on
# X_train_90 would only produce a second fit on the train-only positives.
d2v_100 = sed.D2VEstimator().fit(X_train_100)

## Save both models
d2v_90.model.save("datasets/rq3_d2v/sec1.0_posonly.model")
d2v_100.model.save("datasets/rq3_d2v/sec1.0_all_posonly.model")
コード例 #2
0
ファイル: train_d2v.py プロジェクト: lhmtriet/PUMiner_MSR
import pandas as pd
import sanalytics.estimators.d2vestimator as sed
import logging
logging.basicConfig(level=logging.DEBUG)
from time import time

import sys  # sys.argv is read below; the import block above does not provide it

## Read the training split named by the first command-line argument
filename = sys.argv[1]
X = pd.read_parquet("datasets/rq3_data/{}_train.parquet".format(filename))
print(filename)
print(len(X))

## Train Doc2Vec on the training split and time the fit
start = time()
d2v = sed.D2VEstimator().fit(X)
end = time()
model = d2v.model
model.save("datasets/rq3_d2v/{}.model".format(filename))

# Record the elapsed fit time (in seconds, as returned by time()) as a
# one-row CSV keyed by the dataset name.
pd.DataFrame([["{}".format(filename), end - start]],
             columns=["set", "training_time"]).to_csv(
                 "outputcsvs/d2v_training_times/{}.csv".format(filename),
                 index=False)

## Read the training and test splits together
X = pd.concat([
    pd.read_parquet("datasets/rq3_data/{}_train.parquet".format(filename)),
    pd.read_parquet("datasets/rq3_data/{}_test.parquet".format(filename))
])
print(filename)
print(len(X))
コード例 #3
0
import dask.dataframe as dd
import sanalytics.estimators.d2vestimator as sed
import logging
logging.basicConfig(level=logging.DEBUG)
from time import time
import pandas as pd

import sys  # sys.argv selects the branch to run; the import block above does not provide it

# Runs when the first command-line argument is 1: fit a model on the
# train and test parquet files together.
if int(sys.argv[1]) == 1:
    # "+"-separated list of parquet files to load.
    filename = "X100_train.parquet+X100_test.parquet"
    files = [
        "datasets/model_selection_CV/{}".format(i) for i in filename.split("+")
    ]
    # Read every file via dask, replace missing values with empty strings,
    # and materialise the result with compute().
    df = dd.read_parquet(files).fillna(
        '').compute()  # generated using analysis/job_array_processing

    # Time only the estimator fit.
    start = time()
    d2vest = sed.D2VEstimator().fit(df)
    end = time()
    d2vest.model.save("datasets/rq3_d2v/sec1.0R100_all.model")

    # Reuse `filename` as the dataset label for the timing CSV.
    filename = "sec1.0R100_all"
    pd.DataFrame([["{}".format(filename), end - start]],
                 columns=["set", "training_time"]).to_csv(
                     "outputcsvs/d2v_training_times/{}.csv".format(filename),
                     index=False)

# Runs when the first command-line argument is 2: load only the training
# parquet file.
if int(sys.argv[1]) == 2:
    filename = "X100_train.parquet"
    # The name contains no "+", so the split below yields a single path.
    files = [
        "datasets/model_selection_CV/{}".format(i) for i in filename.split("+")
    ]
    # Read via dask, replace missing values with empty strings, and
    # materialise the result with compute().
    df = dd.read_parquet(files).fillna(
        '').compute()  # generated using analysis/job_array_processing