def create_dataset(lookoutvision_client, s3_resource, bucket, project_name,
                   dataset_images, dataset_type):
    """
    Creates a manifest from images in the supplied bucket and then creates
    a dataset.

    :param lookoutvision_client: A Boto3 Lookout for Vision client.
    :param s3_resource: A Boto3 Amazon S3 resource.
    :param bucket: The bucket that stores the manifest file.
    :param project_name: The project in which to create the dataset.
    :param dataset_images: The location of the images referenced by the dataset.
    :param dataset_type: The type of dataset to create (train or test).
    """
    print(f"Creating {dataset_type} dataset...")

    manifest_file = f"s3://{bucket}/{project_name}/manifests/{dataset_type}.manifest"

    logger.info("Creating %s manifest file in %s.", dataset_type,
                manifest_file)
    Datasets.create_manifest_file_s3(s3_resource, dataset_images,
                                     manifest_file)

    logger.info("Create %s dataset for project %s", dataset_type, project_name)
    Datasets.create_dataset(lookoutvision_client, project_name, manifest_file,
                            dataset_type)
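
A minimal usage sketch for the helper above; the client objects are standard Boto3 constructions, while the bucket, project, and image-prefix names are hypothetical placeholders.

import boto3

lookoutvision_client = boto3.client('lookoutvision')
s3_resource = boto3.resource('s3')
# 'doc-example-bucket' and 'my-project' are illustrative names only.
create_dataset(lookoutvision_client, s3_resource, 'doc-example-bucket',
               'my-project', 's3://doc-example-bucket/images/train/', 'train')
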
def test_create_dataset(make_stubber, error_code):
    lookoutvision_client = boto3.client('lookoutvision')
    lookoutvision_stubber = make_stubber(lookoutvision_client)
    project_name = 'test-project_name'
    bucket = 'test-bucket'
    object_key = 'test-object'
    manifest_file = f'{bucket}/{object_key}'
    dataset_type = 'train'
    status = 'CREATE_COMPLETE'
    message = 'Test message'

    lookoutvision_stubber.stub_create_dataset(project_name,
                                              dataset_type,
                                              bucket,
                                              object_key,
                                              status,
                                              message,
                                              error_code=error_code)
    if error_code is None:
        lookoutvision_stubber.stub_describe_dataset(project_name, dataset_type,
                                                    status, message)

    if error_code is None:
        Datasets.create_dataset(lookoutvision_client, project_name,
                                manifest_file, dataset_type)
    else:
        with pytest.raises(ClientError) as exc_info:
            Datasets.create_dataset(lookoutvision_client, project_name,
                                    manifest_file, dataset_type)
        assert exc_info.value.response['Error']['Code'] == error_code
Example #3
def test_update_dataset_entries(make_stubber, error_code):
    lookoutvision_client = boto3.client('lookoutvision')
    lookoutvision_stubber = make_stubber(lookoutvision_client)
    project_name = 'test-project_name'
    updates_file = 'test/test_manifests/updates.manifest'
    dataset_type = 'train'
    status_complete = 'UPDATE_COMPLETE'
    status_running = 'UPDATE_IN_PROGRESS'
    message = 'Test message'
    changes = ""

    with open(updates_file) as f:
        changes = f.read()

    lookoutvision_stubber.stub_update_dataset_entries(project_name,
                                                      dataset_type,
                                                      changes,
                                                      status_running,
                                                      error_code=error_code)
    if error_code is None:
        lookoutvision_stubber.stub_describe_dataset(project_name, dataset_type,
                                                    status_complete, message)

    if error_code is None:
        Datasets.update_dataset_entries(lookoutvision_client, project_name,
                                        dataset_type, updates_file)
    else:
        with pytest.raises(ClientError) as exc_info:
            Datasets.update_dataset_entries(lookoutvision_client, project_name,
                                            dataset_type, updates_file)
        assert exc_info.value.response['Error']['Code'] == error_code
Example #4
def get_transform(nparray):
    data_file = "framingham.csv"
    numeric_var = [
        "age",
        "cigsPerDay",
        "totChol",
        "sysBP",
        "diaBP",
        "BMI",
        "heartRate",
        "glucose",
    ]
    level_var = ["education"]
    category_var = [
        "male",
        "currentSmoker",
        "BPMeds",
        "prevalentStroke",
        "prevalentHyp",
        "diabetes",
    ]
    target = ["TenYearCHD"]

    # Create Data object
    data = Datasets(
        data_file=data_file,
        cat_cols=category_var,
        num_cols=numeric_var,
        level_cols=level_var,
        label_col=target,
        train=True,
    )
    return data.preprocess_newdata(nparray)
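
A hypothetical call to get_transform; the column order expected in nparray is determined by the Datasets implementation, which is not shown, so the row below is purely illustrative.

import numpy as np

# Illustrative single observation; values and column ordering are assumptions.
new_obs = np.array([[1, 52, 2.0, 1, 10.0, 0, 0, 1, 0, 240.0, 130.0, 85.0,
                     27.5, 72.0, 80.0]])
features = get_transform(new_obs)
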
def test_create_manifest_file_s3(make_stubber, monkeypatch, error_code):
    s3_resource = boto3.resource('s3')
    s3_stubber = make_stubber(s3_resource.meta.client)
    image_bucket = 'image-bucket'
    image_prefix = 'image-prefix/'
    image_path = f'{image_bucket}/{image_prefix}'
    mani_bucket = 'mani-bucket'
    mani_prefix = 'mani-prefix/'
    manifest_path = f'{mani_bucket}/{mani_prefix}'

    monkeypatch.setattr(
        s3_resource.meta.client, 'upload_file',
        lambda Filename, Bucket, Key, ExtraArgs, Callback, Config: None)

    s3_stubber.stub_list_objects(image_bucket,
                                 [f'{image_prefix}anomaly/anomaly-test-key'],
                                 f"{image_prefix}anomaly/", '/')
    s3_stubber.stub_list_objects(image_bucket,
                                 [f'{image_prefix}normal/normal-test-key'],
                                 f"{image_prefix}normal/",
                                 '/',
                                 error_code=error_code)

    with open("temp.manifest", 'w') as mani:
        mani.write("Test manifest.")

    if error_code is None:
        Datasets.create_manifest_file_s3(s3_resource, image_path,
                                         manifest_path)
    else:
        with pytest.raises(ClientError) as exc_info:
            Datasets.create_manifest_file_s3(s3_resource, image_path,
                                             manifest_path)
        assert exc_info.value.response['Error']['Code'] == error_code
Example #6
def do(config):
    # Read data & preprocess
    print("Read data")
    ds = Datasets(config.data_path)
    data = ds.read_data()

    print("Data preprocessing..")
    preprocessing = Preprocessing(config)
    X = preprocessing.do(data)

    print('Train model')

    if config.sg == 'CBOW':
        model = Word2Vec(
                    sentences=X,
                    size=config.size,
                    window=config.window,
                    min_count=config.min_count,
                    workers=config.workers,
                    sg=0
        )
    else:
        model = Word2Vec(
            sentences=X,
            size=config.size,
            window=config.window,
            min_count=config.min_count,
            workers=config.workers,
            sg=1
        )

    print(model.wv.vectors.shape)

    model.save(os.path.join(config.save_directory, config.ckpt_name))
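
Once do(config) has run, the saved model can be reloaded with gensim's standard API; the path below assumes the same config object used for saving, and the query token is a placeholder.

import os
from gensim.models import Word2Vec

model = Word2Vec.load(os.path.join(config.save_directory, config.ckpt_name))
# 'placeholder_token' must be a token present in the training vocabulary.
print(model.wv.most_similar('placeholder_token', topn=5))
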
def train(nn_name='12-net', k=12):
    """
    Function for training 12-net with testing on part of the data
    using cross-validation.
    """
    suff = str(k)
    if nn_name.find('calib') > 0:
        X_data_name = 'train_data_icalib_' + suff + '.npy'
        y_data_name = 'labels_icalib_' + suff + '.npy'
    else:
        X_data_name = 'train_data_' + suff + '.npy'
        y_data_name = 'labels_' + suff + '.npy'

    rates12 = sp.hstack((0.05 * sp.ones(25, dtype=sp.float32),
                         0.005 * sp.ones(15, dtype=sp.float32),
                         0.0005 * sp.ones(10, dtype=sp.float32)))
    rates24 = sp.hstack((0.01 * sp.ones(25, dtype=sp.float32),
                         0.0001 * sp.ones(15, dtype=sp.float32)))
    rates48 = sp.hstack([
        0.05 * sp.ones(15, dtype=sp.float32),
        0.005 * sp.ones(10, dtype=sp.float32)
    ])
    if nn_name == '24-net':
        nn = Cnnl(
            nn_name=nn_name,
            l_rates=rates24,
            subnet=Cnnl(nn_name='12-net',
                        l_rates=rates12).load_model('12-net_lasagne_.pickle'))
    elif nn_name == '48-net':
        nn = Cnnl(nn_name=nn_name,
                  l_rates=rates48,
                  subnet=Cnnl(
                      nn_name='24-net',
                      l_rates=rates24,
                      subnet=Cnnl(
                          nn_name='12-net',
                          l_rates=rates12).load_model('12-net_lasagne_.pickle')
                  ).load_model('24-net_lasagne_.pickle'))
    else:
        nn = Cnnl(nn_name=nn_name, l_rates=rates12)
    if not os.path.exists(nn_name + '_lasagne_.pickle'):
        if nn_name.find('calib') > 0:
            ds.get_train_wider_calib_data(k=k)
        else:
            ds.get_train_data(k=k)
    X, y = sp.load(X_data_name), sp.load(y_data_name)

    X_train, y_train = X, y

    if not os.path.exists(nn_name + '_lasagne_.pickle'):
        if nn_name == '24-net':
            X_sub_train12 = sp.load('train_data_12.npy')
            nn.fit(X=X_train, y=y_train, X12=X_sub_train12)
        elif nn_name == '48-net':
            X_sub_train12 = sp.load('train_data_12.npy')
            X_sub_train24 = sp.load('train_data_24.npy')
            nn.fit(X=X_train, y=y_train, X12=X_sub_train12, X24=X_sub_train24)
        else:
            nn.fit(X=X_train, y=y_train)
        nn.save_model(nn_name + '_lasagne_.pickle')
Example #8
    def __init__(self):
        super(Ui, self).__init__()
        uic.loadUi('mainwindow.ui', self)
        self.show()
        self.load_dir_but.clicked.connect(
            lambda: load_dir_dialog(self, self.load_dir_lineEdit))
        self.load_drift_file_but.clicked.connect(
            lambda: load_file_dialog(self, self.load_drift_file_lineEdit))
        self.load_calibration_file_but.clicked.connect(
            lambda: load_file_dialog(self, self.load_calibration_file_lineEdit
                                     ))

        def select_model_path(self, line_edit):
            self.model = None
            return load_dir_dialog(self, line_edit)

        self.select_model_path_but.clicked.connect(
            lambda: select_model_path(self, self.model_path_lineEdit))

        self.load_data.clicked.connect(self.load_data_func)
        self.label_data_but.clicked.connect(self.label_data_func)
        self.load_prefix_tables()
        self.datasets = Datasets()
        self.model = None

        self.labeled_dataset = None
        self.original_dataset = None
Example #9
def finetune_q_model():

    X_train, X_valid, Y_train, Y_valid = Datasets.make_stacked_frame_data(
        data_folder_path)

    # Only need steering targets
    Y_train = Y_train[0]
    Y_valid = Y_valid[0]

    model = q_categorical(input_dimension=(120, 160, 4))

    # Load pretrained Q model for finetuning
    model.load_weights('saved_models/robin_track_v2_highres.h5')
    model.layers[-1].activation = softmax
    adam = Adam(lr=1e-4)  # Use a smaller learning rate for fine-tuning?
    model.compile(loss='categorical_crossentropy', optimizer=adam)
    print("weights Load Successfully!")

    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
    #ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

    model.fit(X_train,
              Y_train,
              epochs=20,
              batch_size=64,
              validation_data=(X_valid, Y_valid))

    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    model.save_weights('saved_models/finetune_q_' + timestamp + '.h5',
                       overwrite=True)
Example #10
def main():
    datasets = [
        {"city": "beijing", "age": 500, "temperature": 26},
        {"city": "shanghai", "age": 550, "temperature": 27},
        {"city": "shenzheng", "age": 300, "temperature": 30},
    ]

    dict_vectorizer = DictVectorizer()
    dv_datasets = dict_vectorizer.fit_transform(datasets)
    print(dv_datasets.toarray())
    print(dict_vectorizer.vocabulary_)
    print(dict_vectorizer.feature_names_)
    print("-" * 80)

    #fh_vectorizer = FeatureHasher(n_features=10, input_type="dict")
    #fh_datasets = fh_vectorizer.fit_transform([{"text": 10, "words": 7}, {"name": 1, "words": 5}, {"gender": 1}])
    fh_vectorizer = FeatureHasher(n_features=10, input_type="string")
    fh_datasets = fh_vectorizer.fit_transform(["Liming love football", "Zhansan likes baseball"])
    print(fh_datasets.toarray())

    raw_datasets, _ = Datasets.load_datasets()
    datasets = [v for v in raw_datasets.data[:10]]

    count_vectorizer = CountVectorizer(decode_error="ignore")
    cv_datasets = count_vectorizer.fit_transform(datasets)
    print(count_vectorizer.vocabulary_)

    tfidf_transformer = TfidfTransformer(smooth_idf=True)
    tfidft_datasets = tfidf_transformer.fit_transform(cv_datasets)
    print(tfidft_datasets.toarray())
    print(tfidf_transformer.idf_)

    hash_vectorizer = HashingVectorizer(n_features=100, decode_error="ignore")
    hv_datasets = hash_vectorizer.fit_transform(datasets)
    print(hv_datasets.toarray().shape)
Example #11
    def fit(self, **kwargs):
        logging.basicConfig(format='%(levelname)s : %(message)s',
                            level=logging.INFO)
        logging.root.level = logging.INFO
        datasets = Datasets()
        params = DOC2VEC_PARAMS
        self.model = Doc2Vec(datasets.tagged_docs, **params)
        self.model.save(self.__get_model_fpath())
Example #12
    def __init__(self, dataset_name, model_name, optimizer_name, trial_num):
        """
        :param dataset_name: name of the dataset
        :type dataset_name: str
        :param model_name: name of the model
        :type model_name: str
        :param optimizer_name: name of the optimizer
        :type optimizer_name: str
        :param trial_num: current number of repeated trials
        :type trial_num: int
        """
        # get optimized hyperparameters
        with open(
                f'../params/{dataset_name}_{model_name}_{optimizer_name}/result.json'
        ) as f:
            params = json.load(f)

        # get instances
        self.dataset = Datasets.get(dataset_name)
        self.model = Models.get(model_name, dataset=self.dataset)
        self.optimizer = Optimizers.get(optimizer_name, params=params)

        # get config
        with open('./config.json') as f:
            config = json.load(f)

        # get constants
        c = config['constants'][dataset_name][model_name]
        self.loss = c['loss']
        self.batch_size = c['batch_size']
        self.epochs = c['epochs']

        # configure and initialize directory
        d = self.main_dir = f'../data/{dataset_name}_{model_name}_{optimizer_name}/trial{trial_num}'
        if os.path.exists(d):
            shutil.rmtree(d)
        os.makedirs(d)

        # configure hyperdash experiment
        self.hd_exp = HyperdashExperiment(
            f'{dataset_name}',
            api_key_getter=lambda: config['hyperdash']['api_key'])
        self.hd_exp.param('dataset_name', dataset_name)
        self.hd_exp.param('model_name', model_name)
        self.hd_exp.param('optimizer_name', optimizer_name)
        self.hd_exp.param('trial_num', trial_num)

        for k, v in params.items():
            self.hd_exp.param(k, v)

        # set callbacks
        self.callbacks = [
            Hyperdash(['accuracy', 'loss', 'val_accuracy', 'val_loss'],
                      self.hd_exp),
            TensorBoard(log_dir=f'{self.main_dir}/tensorboard'),
            TimeLogger(filename=f'{self.main_dir}/time.csv'),
            CSVLogger(filename=f'{self.main_dir}/result.csv', append=True)
        ]
def test_delete_dataset(make_stubber, error_code):
    lookoutvision_client = boto3.client('lookoutvision')
    lookoutvision_stubber = make_stubber(lookoutvision_client)
    project_name = 'test-project_name'
    dataset_type = 'train'

    lookoutvision_stubber.stub_delete_dataset(project_name,
                                              dataset_type,
                                              error_code=error_code)

    if error_code is None:
        Datasets.delete_dataset(lookoutvision_client, project_name,
                                dataset_type)
    else:
        with pytest.raises(ClientError) as exc_info:
            Datasets.delete_dataset(lookoutvision_client, project_name,
                                    dataset_type)
        assert exc_info.value.response['Error']['Code'] == error_code
Example #15
    def objective(self, params):
        """
        objective function to optimize

        :param params: hyperparamters for optimizer
        :return: maximum validation accuracy
        :rtype: float
        """
        # get instances
        dataset = Datasets.get(self.dataset_name)
        model = Models.get(self.model_name, dataset=dataset)
        optimizer = Optimizers.get(self.optimizer_name, params=params)

        # configure hyperdash experiment
        hd_exp = HyperdashExperiment(
            f'{self.dataset_name}',
            api_key_getter=lambda: self.config['hyperdash']['api_key'])
        hd_exp.param('dataset_name', self.dataset_name)
        hd_exp.param('model_name', self.model_name)
        hd_exp.param('optimizer_name', self.optimizer_name)

        for k, v in params.items():
            hd_exp.param(k, v)

        # set callbacks
        callbacks = [
            Hyperdash(['accuracy', 'loss', 'val_accuracy', 'val_loss'],
                      hd_exp),
            EarlyStopping('val_accuracy',
                          patience=10,
                          min_delta=0.01,
                          verbose=1),
            TerminateOnNaN()
        ]

        # get data
        (x_train, y_train), *_ = dataset.get_batch()

        # start learning
        model.compile(loss=self.loss,
                      optimizer=optimizer,
                      metrics=['accuracy'])
        history = model.fit(x_train,
                            y_train,
                            batch_size=self.batch_size,
                            epochs=self.epochs,
                            callbacks=callbacks,
                            validation_split=0.2,
                            verbose=2)

        # stop hyperdash experiment
        hd_exp.end()

        # return maximum validation accuracy
        val_accuracy = np.array(history.history['val_accuracy'])
        return max(val_accuracy) * (-1)
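
Because objective returns the negated maximum validation accuracy, it is shaped for a minimizing hyperparameter search. A sketch assuming hyperopt as the outer optimizer (the snippet does not name the library), with a purely illustrative search space:

from hyperopt import fmin, tpe, hp

# 'tuner' stands in for an instance of the class that defines objective();
# the search space below is illustrative only.
space = {'lr': hp.loguniform('lr', -10, -2)}
best = fmin(fn=tuner.objective, space=space, algo=tpe.suggest, max_evals=50)
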
Example #17
def do(config):
    # Read data & preprocess
    print("Read data")
    ds = Datasets(config.data_path)
    data = ds.read_data()

    print("Data preprocessing..")
    preprocessing = Preprocessing(config)
    x_train, y_train = preprocessing.do(data)

    print("Model build..")
    model, callback = build(config, preprocessing.vocab_size)

    history = model.fit(x_train,
                        y_train,
                        epochs=config.epoch,
                        callbacks=callback,
                        batch_size=config.batch_size,
                        validation_split=0.2)
    model.save(os.path.join(config.save_directory, config.ckpt_name))
    def train_model(self):
        t0 = time.time()
        print(self.model.summary())
        dataset = Datasets(self.options)
        train_gen = dataset.make_split_generator('train')
        validation_gen = dataset.make_split_generator('validation')
        optimizer = Adam(lr=self.options.init_lr, decay=0)
        self.model.compile(
            optimizer, loss=self.loss_function, metrics=self.metrics)

        callbacks = self.make_callbacks()
        self.model.fit_generator(
            train_gen,
            initial_epoch=self.options.init_epoch,
            steps_per_epoch=self.options.steps_per_epoch,
            epochs=self.options.epochs,
            validation_data=validation_gen,
            validation_steps=self.options.validation_steps,
            callbacks=callbacks)

        print('Training time cost: %0.2f (min).' % ((time.time() - t0) / 60))
Example #19
def get_prediction(sentence):
    sentence = Datasets.normalize_string(sentence)
    sentence = tokenizer.tokenize(sentence)
    sentence = tokenizer.convert_tokens_to_ids(sentence)    
    sentence = [vocab['[CLS]']] + sentence + [vocab['[SEP]']]
    
    output = model(torch.tensor(sentence).unsqueeze(0))
    output_softmax = softmax(output)[0]
    max_out = label_list[output_softmax.argmax()]
    argidx = output_softmax.argsort(descending=True)
    result = {label_list[i]: round(output_softmax[i].item(), 3) for i in range(len(label_list))}
    sorted_result = {label_list[i]: round(output_softmax[i].item(), 3) for i in argidx}
    return max_out, result, sorted_result
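
A hypothetical call, assuming the model, tokenizer, vocab, and label_list globals are initialized as in the surrounding examples; the input sentence is a placeholder.

max_out, result, sorted_result = get_prediction('오늘 정말 행복해요')
print(max_out)        # most probable label
print(sorted_result)  # all labels, sorted by probability
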
def test_describe_dataset(make_stubber, error_code):
    lookoutvision_client = boto3.client('lookoutvision')
    lookoutvision_stubber = make_stubber(lookoutvision_client)
    project_name = 'test-project_name'
    dataset_type = 'train'
    status = 'CREATE_COMPLETE'
    message = 'Test message'
    image_stats = {'Total': 5, 'Labeled': 2, 'Normal': 2, 'Anomaly': 1}

    lookoutvision_stubber.stub_describe_dataset(project_name,
                                                dataset_type,
                                                status,
                                                message,
                                                image_stats,
                                                error_code=error_code)

    if error_code is None:
        Datasets.describe_dataset(lookoutvision_client, project_name,
                                  dataset_type)
    else:
        with pytest.raises(ClientError) as exc_info:
            Datasets.describe_dataset(lookoutvision_client, project_name,
                                      dataset_type)
        assert exc_info.value.response['Error']['Code'] == error_code
Example #21
def example():
    from datasets import Datasets
    datasets = Datasets()
    datasets.download()

    training_data = datasets.load()
    test_data = datasets.load(test=True)

    my_xgb_regressor = MyXGBRegressor(datasets)
    my_xgb_regressor.train(training_data)
    predictions = my_xgb_regressor.predict(test_data, save=True)
Example #22
def test(model, args):
    model.eval()

    # Load Datasets
    dataset = Datasets(file_path=args.test_data_path,
                       label_list=label_list,
                       pretrained_type=args.pretrained_type)
    # Use custom batch function
    collate_fn = ClassificationBatchFunction(args.max_len, dataset.pad_idx,
                                             dataset.cls_idx, dataset.sep_idx)
    loader = DataLoader(dataset=dataset,
                        batch_size=args.train_batch_size,
                        num_workers=8,
                        pin_memory=True,
                        collate_fn=collate_fn)

    loss, acc, f1, (total_y_hat, cm) = evaluate(args, loader, model, device)
    return loss, acc, f1, total_y_hat, cm
Example #23
def train_lstm_model():

    X_train, X_valid, Y_train, Y_valid = Datasets.make_lstm_data(
        data_folder_path)

    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
    #ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

    model = lstm_categorical(input_dimension=(7, 120, 160, 3))
    model.fit(X_train,
              Y_train,
              epochs=20,
              batch_size=64,
              validation_data=(X_valid, Y_valid))

    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    model.save_weights('saved_models/lstm_categorical_' + timestamp + '.h5',
                       overwrite=True)
Example #24
def run_aft(possible_widths, pool_size, max_layers, cand_epochs, final_epochs,
            iteration=''):
    data = Datasets.mnist()
    comp = 0.0

    model = AutoForwardThinking(possible_widths, pool_size, max_layers, data)
    stats = model.train(final_epochs=final_epochs, cand_epochs=cand_epochs,
                        batch_size=128, stopping_comp=comp)

    for key in stats.keys():
        print(key, len(stats[key]))

    # save training stats
    df = pd.DataFrame(stats)
    comments = list(df['comments'])[0]
    df['min_width'] = min(possible_widths)
    df['max_width'] = max(possible_widths)
    df['pool_size'] = pool_size
    df['max_layers'] = max_layers
    df['cand_epochs'] = cand_epochs
    df['final_epochs'] = final_epochs
    df['compensation'] = comp
    layers = comments.split(']')[0].replace('[', '')
    df['n_layers'] = layers.count(',') + 1
    df['layers'] = layers
    comments = comments.split(']')[1].strip(',').strip()
    df['hid_act'] = comments.split(',')[0].strip()
    df['out_act'] = comments.split(',')[1].strip()
    df['optimizer'] = comments.split(',')[-1].replace('prop', '').strip()
    df = df.drop(columns='comments')
    fname = 'aft_min{}_max{}_p{}_maxl{}_canep{}_finep{}_comp{}_{}.csv'.format(
        min(possible_widths), max(possible_widths), pool_size,
        max_layers, cand_epochs, final_epochs,
        str(comp).replace('.', '-'),
        iteration
    )
    while os.path.exists(fname):
        fname = fname.replace('.csv', '_.csv')
    df.to_csv(fname)

    K.clear_session()
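
A hypothetical invocation of run_aft; every argument value below is illustrative rather than taken from the original experiments.

run_aft(possible_widths=[32, 64, 128], pool_size=3, max_layers=5,
        cand_epochs=5, final_epochs=20, iteration='0')
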
def main():
    raw_datasets, _ = Datasets.load_datasets()
    X, Y = gen_datasets(raw_datasets)

    vectorizer = CountVectorizer(decode_error="ignore")
    cv_datasets = vectorizer.fit_transform(X).toarray()

    clf = ExtraTreesClassifier()
    clf = clf.fit(cv_datasets, Y)
    print(cv_datasets.shape)

    print(clf.feature_importances_)

    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(cv_datasets)
    print(X_new.shape)

    binarizer = Binarizer(threshold=1.0)
    b_datasets = binarizer.fit_transform(cv_datasets)
    variance_threshold = VarianceThreshold(.8 * (1 - .8))
    v_datasets = variance_threshold.fit_transform(b_datasets)
    print(v_datasets.shape)
Example #26
def train_single_frame_model():
    """
    Same as default categorical from donkey
    """

    X_train, X_valid, Y_train, Y_valid = Datasets.make_single_frame_data(
        data_folder_path)

    model = default_categorical()

    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
    #ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

    model.fit(X_train,
              Y_train,
              epochs=3,
              batch_size=64,
              validation_data=(X_valid, Y_valid))

    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    model.save_weights('saved_models/single_frame_categorical_' + timestamp +
                       '.h5',
                       overwrite=True)
Example #27
def main(args):
    """Execute a task based on the given command-line arguments.

    This function is the main entry-point of the program. It allows the
    user to extract features, train a model, compute predictions, and
    evaluate predictions using the command-line interface.
    """
    from datasets import Datasets

    datasets = Datasets(args.dataset_path)

    if args.command == 'extract':
        extract(datasets.get(args.dataset), args)
    elif args.command == 'train':
        train(datasets.get('training'), args)
    elif args.command == 'predict':
        predict(datasets.get(args.dataset), args)
    elif args.command == 'evaluate':
        if isinstance(args.training_id, list):
            evaluate_all(datasets.get('test'), args)
        else:
            evaluate(datasets.get('test'), args)
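
A sketch of an argparse front-end consistent with the attributes main() reads (command, dataset_path, dataset, training_id); the flag names are inferred, not confirmed by the snippet.

import argparse

# Hypothetical CLI definition; adjust flags to the real project's interface.
parser = argparse.ArgumentParser()
parser.add_argument('command', choices=['extract', 'train', 'predict', 'evaluate'])
parser.add_argument('--dataset-path', dest='dataset_path', required=True)
parser.add_argument('--dataset', default='training')
parser.add_argument('--training-id', dest='training_id', nargs='*')
main(parser.parse_args())
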
def train(args):
    set_seed(args)
    # Set device
    if args.device == 'cuda':
        device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        logger.info('use cuda')
    else:
        device = torch.device('cpu')
        logger.info('use cpu')

    # Load pretrained model and model configuration
    pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type)
    if args.pretrained_model_path is None:
        # Use pretrained BERT model (etri/skt)
        pretrained_model_path = os.path.join(pretrained_path,
                                             'pytorch_model.bin')
    else:
        # Use a further-pretrained BERT model
        pretrained_model_path = args.pretrained_model_path
    logger.info('Pretrained model: {}'.format(pretrained_model_path))
    pretrained = torch.load(pretrained_model_path)

    if args.pretrained_type == 'skt' and 'bert.' not in list(
            pretrained.keys())[0]:
        logger.info('modify parameter names')
        # Change parameter name for consistency
        new_keys_ = ['bert.' + k for k in pretrained.keys()]
        old_values_ = pretrained.values()
        pretrained = {k: v for k, v in zip(new_keys_, old_values_)}

    bert_config = BertConfig(
        os.path.join(pretrained_path + '/bert_config.json'))
    model = BertForMLM(bert_config).to(device)
    model.load_state_dict(pretrained, strict=False)

    # Load Datasets
    tr_set = Datasets(file_path=args.train_data_path,
                      pretrained_type=args.pretrained_type,
                      max_len=args.max_len)
    # Use custom batch function
    collate_fn = MLMBatchFunction(args.max_len, tr_set.vocab)
    tr_loader = DataLoader(dataset=tr_set,
                           batch_size=args.train_batch_size,
                           shuffle=True,
                           num_workers=8,
                           pin_memory=True,
                           drop_last=True,
                           collate_fn=collate_fn)

    if args.do_eval:
        dev_set = Datasets(file_path=args.dev_data_path,
                           pretrained_type=args.pretrained_type,
                           max_len=args.max_len)
        dev_loader = DataLoader(dataset=dev_set,
                                batch_size=args.eval_batch_size,
                                num_workers=8,
                                pin_memory=True,
                                drop_last=False,
                                collate_fn=collate_fn)

    # optimizer
    optimizer = layerwise_decay_optimizer(model=model,
                                          lr=args.learning_rate,
                                          layerwise_decay=args.layerwise_decay)

    # lr scheduler
    t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs
    warmup_steps = int(t_total * args.warmup_percent)
    logger.info('total training steps : {}, lr warmup steps : {}'.format(
        t_total, warmup_steps))
    # Use gradual warmup and cosine decay
    scheduler = optimization.WarmupCosineWithHardRestartsSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=t_total)

    # for low-precision training
    if args.fp16:
        try:
            from apex import amp
            logger.info('Use fp16')
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level,
                                          verbosity=0)

    # tensorboard setting
    save_path = "./model_saved_pretrain/lr{},batch{},total{},warmup{},len{},{}".format(
        args.learning_rate,
        args.train_batch_size * args.gradient_accumulation_steps, t_total,
        args.warmup_percent, args.max_len, args.pretrained_type)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    writer = SummaryWriter(save_path)

    # Save best model results with resultwriter
    result_writer = utils.ResultWriter("./model_saved_pretrain/results.csv")
    model.zero_grad()

    best_val_loss = 1e+9
    best_val_acc = 0
    global_step = 0

    train_loss, train_acc = 0, 0
    val_loss, val_acc = 0, 0
    logging_loss, logging_acc = 0, 0

    logger.info('***** Training starts *****')
    total_result = []
    for epoch in tqdm(range(args.epochs), desc='epochs'):
        for step, batch in tqdm(enumerate(tr_loader),
                                desc='steps',
                                total=len(tr_loader)):
            model.train()
            x_train, y_train, mask_train = map(lambda x: x.to(device), batch)

            inputs = {
                'input_ids': x_train,
                'attention_mask': mask_train,
                'masked_lm_labels': y_train,
            }

            output, loss = model(**inputs)
            y_max = output.max(dim=2)[1]

            # Get accuracy for masked tokens
            total_length = torch.ones_like(y_train).masked_fill(
                y_train == -1, 0).sum().item()
            total_sum = torch.zeros_like(y_max).masked_fill(
                y_max == y_train, 1).sum().item()
            batch_acc = total_sum / total_length

            # accumulate measures
            grad_accu = args.gradient_accumulation_steps
            if grad_accu > 1:
                loss /= grad_accu
                batch_acc /= grad_accu

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            train_loss += loss.item()
            train_acc += batch_acc
            if (step + 1) % grad_accu == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.grad_clip_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip_norm)

                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if global_step % args.logging_step == 0:
                    acc_ = (train_acc - logging_acc) / args.logging_step
                    loss_ = (train_loss - logging_loss) / args.logging_step
                    writer.add_scalars('loss', {'train': loss_}, global_step)
                    writer.add_scalars('acc', {'train': acc_}, global_step)
                    writer.add_scalars('lr', {'lr': scheduler.get_lr()[0]},
                                       global_step)

                    logger.info(
                        '[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}'.format(
                            global_step, t_total, loss_, acc_))

                    logging_acc, logging_loss = train_acc, train_loss

        if args.do_eval:
            # Validation
            val_loss, val_acc = evaluate(args, dev_loader, model, device)
            val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}'.format(
                global_step, t_total, val_loss, val_acc)
            logger.info(val_result)
            total_result.append(val_result)

        if val_loss <= best_val_loss:
            # Save model checkpoints
            torch.save(model.state_dict(),
                       os.path.join(save_path, 'best_model.bin'))
            torch.save(args, os.path.join(save_path, 'training_args.bin'))
            logger.info('Saving model checkpoint to %s', save_path)
            best_val_loss = val_loss
            best_val_acc = val_acc

        if (epoch + 1) % args.saving_step == 0:
            torch.save(
                model.state_dict(),
                os.path.join(save_path, 'epoch{}_model.bin'.format(epoch + 1)))
            # Save results in 'model_saved_pretrain/results.csv'
            results = {
                'train_loss': loss_,
                'train_acc': acc_,
                'val_loss': best_val_loss,
                'val_acc': best_val_acc,
                'save_dir': save_path,
                'global_step': global_step,
            }
            result_writer.update(args, **results)

    return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result
Example #29
def main(args):
    datasets = Datasets(data_path=args.data_path)

    # Prepare output files
    outname1 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_'\
            + str(args.num_inducing) + '.nll'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_'\
            + str(args.num_inducing) + '.time'
    outfile2 = open(outname2, 'w')

    running_loss = 0
    running_time = 0
    for i in range(args.splits):
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat()\
                .prefetch(X.shape[0]//2)\
                .shuffle(buffer_size=(X.shape[0]//2))\
                .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        for l in range(args.num_layers):
            kernels.append(SquaredExponential() + White(variance=1e-5))

        dgp_model = DGP(X.shape[1],
                        kernels,
                        Gaussian(variance=0.05),
                        Z,
                        num_outputs=Y.shape[1],
                        num_samples=args.num_samples,
                        num_data=X.shape[0])

        # initialise inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5,
                                     transform=triangular())

        optimiser = tf.optimizers.Adam(args.learning_rate)

        def optimisation_step(model, X, Y):
            with tf.GradientTape() as tape:
                tape.watch(model.trainable_variables)
                obj = -model.elbo(X, Y, full_cov=False)
                grad = tape.gradient(obj, model.trainable_variables)
            optimiser.apply_gradients(zip(grad, model.trainable_variables))

        def monitored_training_loop(model, train_dataset, logdir, iterations,
                                    logging_iter_freq):
            # TODO: use tensorboard to log trainables and performance
            tf_optimisation_step = tf.function(optimisation_step)
            batches = iter(train_dataset)

            for i in range(iterations):
                X, Y = next(batches)
                tf_optimisation_step(model, X, Y)

                iter_id = i + 1
                if iter_id % logging_iter_freq == 0:
                    tf.print(
                        f'Epoch {iter_id}: ELBO (batch) {model.elbo(X, Y)}')

        print('Training DGP model...')
        t0 = time.time()
        monitored_training_loop(dgp_model,
                                train_dataset,
                                logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()
        print('Time taken to train: {}'.format(t1 - t0))
        outfile2.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_time += t1 - t0

        m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, m * Y_std, v**0.5 * Y_std),
                      0,
                      b=1 / float(args.test_samples)))
        print('Average test log likelihood: {}'.format(test_nll))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Average: {}\n'.format(running_time / args.splits))
    outfile1.close()
    outfile2.close()
Example #30
def main(args):
    num_layers = len(args.hidden_dims)
    datasets = Datasets(data_path=args.data_path)

    # Prepare output files
    outname1 = '../tmp/aep_' + args.dataset + '_' + str(num_layers) + '_'\
            + str(args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../tmp/aep_' + args.dataset + '_' + str(num_layers) + '_'\
            + str(args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')
    outname3 = '../tmp/aep_' + args.dataset + '_' + str(num_layers) + '_'\
            + str(args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)
    for i in range(args.splits):
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]

        dgp_model = aep.SDGPR(X, Y, args.num_inducing, args.hidden_dims)
        print('Training DGP model...')
        t0 = time.time()
        dgp_model.optimise(method='Adam',
                           mb_size=args.batch_size,
                           adam_lr=args.learning_rate,
                           maxiter=args.iterations)
        t1 = time.time()
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # Minibatch test predictions
        means, vars = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            for mb in range(-(-len(Xs) // test_batch_size)):
                m, v = dgp_model.predict_y(Xs[mb * test_batch_size:(mb + 1) *
                                              test_batch_size, :])
                means.append(m)
                vars.append(v)
        else:
            m, v = dgp_model.predict_y(Xs)
            means.append(m)
            vars.append(v)

        mean_ND = np.concatenate(means, 0)
        var_ND = np.concatenate(vars, 0)

        test_err = np.mean(Y_std * np.mean((Ys - mean_ND)**2.0)**0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        test_nll = np.mean(
            norm.logpdf(Ys * Y_std, mean_ND * Y_std, var_ND**0.5 * Y_std))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()
Example #31
def run_algorithms(algorithms, datasets, metrics, output, conf):
    dts = Datasets()
    shall_plot = conf.get("plot_data")
    if shall_plot:
        plot_dir = conf.get("plot_dir", "../plots")

        tmp_plot_dir = "../plots_1"
        if os.path.exists(tmp_plot_dir):
            shutil.rmtree(tmp_plot_dir)

        os.mkdir(tmp_plot_dir)

        orig_data_dir = os.path.join(tmp_plot_dir, "original")
        os.mkdir(orig_data_dir)
        for dataset in datasets:
            plot_data(os.path.join(orig_data_dir, "%s-orig.png"  % dataset), "%s-orig" % dataset, dataset)

    if output == 'dump_text' and not os.path.exists("../dumps"):
        os.mkdir("../dumps")

    for algorithm in algorithms:

        if shall_plot:
            algo_dir = os.path.join(tmp_plot_dir, algorithm)
            os.mkdir(algo_dir)

        algo_conf = conf["algorithms"].get(algorithm, None)

        if not algo_conf:
            logging.error("Algorithm %s not found in conf file" % algorithm)
            sys.exit(0)

        algo_conf['name'] = algorithm
        learn_class = _get_algorithm_class(algorithm)
        learn = learn_class(**algo_conf)
        learn._set_cross_validation(conf.get("cv_method", None), conf.get("cv_metric", None), conf.get("cv_params", None))
        results = []
        for dataset in datasets:
            if dataset not in conf["datasets"]:
                logging.error("Dataset %s not found" % dataset)
                sys.exit(0)

            cv_dir = None
            if shall_plot:
                dataset_dir = os.path.join(algo_dir, dataset)
                os.mkdir(dataset_dir)

                if algo_conf.get("cross_validate", True):
                    cv_dir = os.path.join(dataset_dir, "cv")
                    os.mkdir(cv_dir)

            training_sizes = conf.get("training_size", [0.40])
            scores = []
            for training_size in training_sizes:
                data = dts.load_dataset(dataset, training_size)

                learn.set_dataset(dataset, training_size*100, cv_dir)
                if learn.check_type(data["type"]):
                    eval_metrics = []
                    if metrics:
                        eval_metrics.extend(metrics)
                    else:
                        eval_metrics.extend(algo_conf["allowed_metrics"])

                    learn.train(data["x_train"], data["y_train"])
                    result_tups = learn.evaluate(data["x_test"], data["y_test"], eval_metrics)

                    print_results(training_size, algorithm, dataset, result_tups)
                    results.append((algorithm, dataset, training_size, result_tups))

                    if shall_plot:
                        decision_plot_path = os.path.join(dataset_dir, "decision-%s_%s_size_%d.png" % (dataset, algorithm, training_size * 100))
                        learn.plot_results(decision_plot_path, dataset, training_size, data['x_train'], data['x_test'], data['y_train'], data['y_test'])

                        for metric, y_test, score in result_tups:
                            metric_plot_path = os.path.join(dataset_dir, "metric-%s-%s_%s_size_%d.png" % (metric, dataset, algorithm, training_size * 100))
                            plot_metric(metric_plot_path, data['type'], y_test, data['y_test'], dataset, algorithm, training_size * 100)
                    scores.append(result_tups[0][2])
            if shall_plot:
                train_plot_path = os.path.join(dataset_dir, "train_vs_acc-%s_%s.png" % (algorithm, dataset))
                plot_training_results(train_plot_path, [train_size * 100 for train_size in training_sizes], scores)

        if output == "pdf":
            generate_pdf(results)
        elif output == "dump_text":
            dump_results(algorithm, results)
    if conf.get("plot_data", False):
        shutil.rmtree(plot_dir)
        shutil.move(tmp_plot_dir, plot_dir)
class ClassifierLib:
    def __init__(self, analyze, conf, data_conf, output, algorithms, datasets, metrics):
        self.conf = self._load_conf(conf)
        self.shall_analyze = analyze
        self.data_conf = self._load_conf(data_conf)
        self.data_class = Datasets()
        self.output = output
        self.output_dir = os.path.abspath(self.conf.get("output_dir", "./output"))
        self.shall_plot = self.conf.get("plot_data")
        self.algorithms = algorithms
        self.datasets = datasets
        self.metrics = metrics


    def _get_algorithm_class(self, algorithm_name):
        module = importlib.import_module("%s" % algorithm_name)

        if not module:
            logging.error("Module %s not found" % algorithm_name)

        class_name = algorithm_name.replace("_"," ").title().replace(" ","")
        logging.info("Algorithm %s loaded from module %s" % (class_name, algorithm_name))
        return getattr(module, class_name)

    def _load_conf(self, conf_path):
        conf_file = open(os.path.abspath(conf_path))
        return yaml.safe_load(conf_file)


    def run_algorithm(self, algorithm, data, data_conf, training_size):
        algo_conf = self.conf['algorithms'][algorithm]
        learn_class = self._get_algorithm_class(algorithm)
        learn = learn_class(**algo_conf)

        if not learn.check_type(getattr(constants, data_conf["type"])):
            return

        dataset = data_conf['name']

        learn.set_dataset(dataset, training_size)

        if algo_conf.get("cross_validate", False):
            learn._set_cross_validation(self.conf.get("cv_method", None), self.conf.get("cv_metric", None), self.conf.get("cv_params", None))
            learn.cross_validation(data['x_train'], data['y_train'], self.conf.get('print_cv_score', False))

        learn.train(data["x_train"], data["y_train"])
        result = learn.predict(data['x_test'])

        if self.conf.get('evaluate', False):
            eval_metrics = []
            if self.metrics:
                eval_metrics.extend(self.metrics)
            else:
                eval_metrics.extend(algo_conf["allowed_metrics"])

            result = learn.evaluate(result, data["y_test"], eval_metrics)

        return result

    def run(self):
        if os.path.exists(self.output_dir):

            if os.path.exists("%s%s" % (self.output_dir, "_1")):
                shutil.rmtree("%s%s" % (self.output_dir, "_1"))

            shutil.move(self.output_dir, "%s%s" % (self.output_dir, "_1"))

        os.mkdir(self.output_dir)

        for dataset in self.datasets:
            if dataset not in self.data_conf:
                logging.error("Dataset %s not found" % dataset)
                sys.exit(0)

            dataset_dir = os.path.join(self.output_dir, dataset)
            os.mkdir(dataset_dir)

            if self.shall_analyze:
                self.analyze(dataset, dataset_dir)

            for algorithm in self.algorithms:

                algo_dir = os.path.join(dataset_dir, algorithm)
                os.mkdir(algo_dir)


                results = []
                for training_size in self.conf.get('training_sizes', [.4]):

                    data_conf = self.data_conf[dataset]

                    data = self.data_class.load_dataset(dataset, training_size)

                    result = self.run_algorithm(algorithm, data, data_conf, training_size)

                    if self.conf.get('evaluate', True):
                        if self.output == "print":
                            self.print_results(training_size, algorithm, dataset, result)

                        if self.shall_plot:
                            for metric, y_test, score in result:
                                metric_plot_path = os.path.join(algo_dir, "metric-%s-%s_%s_size_%d.png" % (metric, dataset, algorithm, training_size * 100))
                                plot_metric(data['type'], y_test, data['y_test'], dataset, algorithm, training_size * 100, metric_plot_path)
                    else:
                        result_file = open(os.path.join(algo_dir, "result.csv"), 'a+')
                        result_file.write(",".join(results))
                        result_file.close()


    def analyze(self, dataset, dataset_dir):
        data = self.data_class.load_dataset(dataset, train_size=100)

        (X, Y) = (data['x_train'], data['y_train'])
        print_score.print_breakdown(X, Y)

        if self.shall_plot:
            plot_scatter(X, Y, "%s-orig" % dataset, filename=os.path.join(dataset_dir, "%s-orig.png"  % dataset))
            plot_histogram(X, Y, "%s-hist" % dataset, filename=os.path.join(dataset_dir, "%s-hist.png" % dataset))

            pca = PCA()
            pca.fit(X)
            plot_PCA_variance(pca.explained_variance_ratio_ * 100, "%s-pca-#feature-vs-variance" % dataset, filename=os.path.join(dataset_dir, "%s-pca-variance-ratio" % dataset))


    def print_results(self, training_size, algorithm, dataset, metric_tuples):
        # print("\nFor Algorithm::\t%s" % algorithm)
        # print("For Dataset::\t%s\n" % dataset)
        for met_tup in metric_tuples:
            func = getattr(print_score, "print_%s" % met_tup[0])
            func(training_size, algorithm, dataset, met_tup[2])
Example #33
def train(args):
    set_seed(args)
    # Set device
    if args.device == 'cuda':
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        logger.info('use cuda')
    else:
        device = torch.device('cpu')
        logger.info('use cpu')

    # Set label list for classification
    if args.num_label == 'multi':
        label_list = ['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오']
    elif args.num_label == 'binary':
        label_list = ['긍정', '부정']
    logger.info('use {} labels for training'.format(len(label_list)))

    # Load pretrained model and model configuration
    pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type)
    if args.pretrained_model_path is None:
        # Use pretrained BERT model (etri/skt)
        pretrained_model_path = os.path.join(pretrained_path, 'pytorch_model.bin')
    else:
        # Use a further-pretrained BERT model
        pretrained_model_path = args.pretrained_model_path
    logger.info('Pretrained model: {}'.format(pretrained_model_path))
    pretrained = torch.load(pretrained_model_path)
    
    if args.pretrained_type == 'skt' and 'bert.' not in list(pretrained.keys())[0]:
        logger.info('modify parameter names')
        # Prefix every parameter name with 'bert.' for consistency with the
        # classification model's state dict.
        pretrained = {'bert.' + k: v for k, v in pretrained.items()}

    bert_config = BertConfig(os.path.join(pretrained_path, 'bert_config.json'))
    bert_config.num_labels = len(label_list)
    model = BertForEmotionClassification(bert_config).to(device)
    model.load_state_dict(pretrained, strict=False)

    # Load Datasets
    tr_set = Datasets(file_path=args.train_data_path,
                      label_list=label_list,
                      pretrained_type=args.pretrained_type,
                      max_len=args.max_len)
    # Use custom batch function
    collate_fn = ClassificationBatchFunction(args.max_len, tr_set.pad_idx, tr_set.cls_idx, tr_set.sep_idx)
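    # Presumably (judging from its arguments) the batch function pads each
    # batch to max_len with pad_idx and frames sequences with cls_idx/sep_idx,
    # so examples can be stored unpadded in the dataset.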
    tr_loader = DataLoader(dataset=tr_set,
                           batch_size=args.train_batch_size,
                           shuffle=True,
                           num_workers=8,
                           pin_memory=True,
                           collate_fn=collate_fn)

    dev_set = Datasets(file_path=args.dev_data_path,
                       label_list=label_list,
                       pretrained_type=args.pretrained_type,
                       max_len=args.max_len)
    dev_loader = DataLoader(dataset=dev_set,
                            batch_size=args.eval_batch_size,
                            num_workers=8,
                            pin_memory=True,
                            drop_last=False,
                            collate_fn=collate_fn)

    # optimizer
    optimizer = layerwise_decay_optimizer(model=model, lr=args.learning_rate, layerwise_decay=args.layerwise_decay)

    # lr scheduler
    t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs
    warmup_steps = int(t_total * args.warmup_percent)
    logger.info('total training steps : {}, lr warmup steps : {}'.format(t_total, warmup_steps))
    # Use gradual warmup and linear decay
    scheduler = optimization.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
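    # WarmupLinearSchedule scales the base lr by step / warmup_steps during
    # warmup, then by (t_total - step) / (t_total - warmup_steps): a linear
    # ramp-up followed by a linear decay to zero at t_total.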

    # for low-precision training
    if args.fp16:
        try:
            from apex import amp
            logger.info('Use fp16')
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, verbosity=0)
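        # Note: NVIDIA apex's amp API has since been superseded by the native
        # torch.cuda.amp module (PyTorch >= 1.6); apex is kept here to match
        # the original example.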

    # tensorboard setting
    save_path = "./model_saved_finetuning/lr{},batch{},total{},warmup{},len{},{}".format(
        args.learning_rate, args.train_batch_size * args.gradient_accumulation_steps, t_total,
        args.warmup_percent, args.max_len, args.pretrained_type)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    writer = SummaryWriter(save_path)

    # Save best model results with resultwriter
    result_writer = utils.ResultWriter("./model_saved_finetuning/results.csv")
    model.zero_grad()

    best_val_loss = 1e+9
    # Initialize best-metric holders so the results dict below is always valid
    best_val_acc, best_val_macro_f1 = 0, 0
    global_step = 0

    train_loss, train_acc, train_f1 = 0, 0, 0
    logging_loss, logging_acc, logging_f1 = 0, 0, 0
    # Pre-set the windowed logging metrics so the return statement cannot hit
    # a NameError if training ends before the first logging step.
    loss_, acc_, f1_ = 0, 0, 0

    logger.info('***** Training starts *****')
    total_result = []
    for epoch in tqdm(range(args.epochs), desc='epochs'):
        for step, batch in tqdm(enumerate(tr_loader), desc='steps', total=len(tr_loader)):
            model.train()
            x_train, mask_train, y_train = map(lambda x: x.to(device), batch)

            inputs = {
                'input_ids': x_train,
                'attention_mask': mask_train,
                'classification_label': y_train,
            }

            output, loss = model(**inputs)
            y_max = output.max(dim=1)[1]

            cr = classification_report(y_train.tolist(),
                                       y_max.tolist(),
                                       labels=list(range(len(label_list))),
                                       target_names=label_list,
                                       output_dict=True)
            # Overall accuracy (micro f1): recent sklearn reports it under the
            # 'accuracy' key; when at least one label is absent from the
            # mini-batch the report exposes 'micro avg' (the same quantity).
            if 'micro avg' in cr:
                batch_acc = cr['micro avg']['f1-score']
            else:
                batch_acc = cr['accuracy']
            # macro f1
            batch_macro_f1 = cr['macro avg']['f1-score']

            # accumulate measures
            grad_accu = args.gradient_accumulation_steps
            if grad_accu > 1:
                loss /= grad_accu
                batch_acc /= grad_accu
                batch_macro_f1 /= grad_accu
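            # Dividing the per-batch metrics by the accumulation factor keeps
            # the running sums consistent: metrics are accumulated once per
            # mini-batch but averaged per optimizer step when logging.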

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            train_loss += loss.item()
            train_acc += batch_acc
            train_f1 += batch_macro_f1

            # Gate on the mini-batch index: the original tested global_step,
            # which never advances when gradient_accumulation_steps > 1, so
            # the optimizer would never step.
            if (step + 1) % grad_accu == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.grad_clip_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_norm)

                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                
                if global_step % args.logging_step == 0:
                    acc_ = (train_acc - logging_acc) / args.logging_step
                    f1_ = (train_f1 - logging_f1) / args.logging_step
                    loss_ = (train_loss - logging_loss) / args.logging_step
                    writer.add_scalars('loss', {'train': loss_}, global_step)
                    writer.add_scalars('acc', {'train': acc_}, global_step)
                    writer.add_scalars('macro_f1', {'train': f1_}, global_step)

                    logger.info('[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}, macro f1 : {:.3f}'.format(
                        global_step, t_total, loss_, acc_, f1_
                    ))
                    logging_acc, logging_f1, logging_loss = train_acc, train_f1, train_loss

                    # Get f1 score for each label
                    f1_results = [(l, r['f1-score']) for i, (l, r) in enumerate(cr.items()) if i < len(label_list)]
                    f1_log = "\n".join(["{} : {}".format(l, f) for l, f in f1_results])
                    logger.info("\n\n***f1-score***\n" + f1_log + "\n\n***confusion matrix***\n{}".format(
                        confusion_matrix(y_train.tolist(), y_max.tolist())))

        # Validation
        val_loss, val_acc, val_macro_f1, _ = evaluate(args, dev_loader, model, device)
        val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}, val macro f1 : {:.3f}'.format(
            global_step, t_total, val_loss, val_acc, val_macro_f1
        )

        writer.add_scalars('loss', {'val': val_loss}, global_step)
        writer.add_scalars('acc', {'val': val_acc}, global_step)
        writer.add_scalars('macro_f1', {'val': val_macro_f1}, global_step)
        logger.info(val_result)
        total_result.append(val_result)

        if val_loss < best_val_loss:
            # Save model checkpoints
            torch.save(model.state_dict(), os.path.join(save_path, 'best_model.bin'))
            torch.save(args, os.path.join(save_path, 'training_args.bin'))
            logger.info('Saving model checkpoint to %s', save_path)
            best_val_loss = val_loss
            best_val_acc = val_acc
            best_val_macro_f1 = val_macro_f1

    # Save results in 'model_saved_finetuning/results.csv'
    results = {
        'val_loss': best_val_loss,
        'val_acc': best_val_acc,
        'val_macro_f1': best_val_macro_f1,
        'save_dir': save_path,
        'pretrained_path': pretrained_path,
    }
    result_writer.update(args, **results)
    return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result
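
# A hypothetical driver for train(); the argument names are inferred from the
# attribute accesses inside the function, and the values below are only
# illustrative defaults, not the original author's settings.
from argparse import Namespace

args = Namespace(
    seed=42, device='cuda', num_label='multi',
    pretrained_type='skt', pretrained_model_path=None,
    train_data_path='data/train.tsv', dev_data_path='data/dev.tsv',
    max_len=128, train_batch_size=32, eval_batch_size=64,
    learning_rate=5e-5, layerwise_decay=0.95,
    gradient_accumulation_steps=1, epochs=3, warmup_percent=0.1,
    fp16=False, fp16_opt_level='O1', grad_clip_norm=1.0, logging_step=100)
train(args)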
Example #34
        # One column group per enabled tool, in a fixed order.
        for tool in ('cosine', 'giga', 'knode', 'aminsga2'):
            if tool in data['tools']:
                initial_head += (", th_{0}, size_result_{0}, f1_score_{0},"
                                 " recall_{0}, runtime_{0}".format(tool))

        initial_head += ", th_baseline, f1_score_baseline, recall_baseline, runtime_baseline\n"

        outfile.write(initial_head)

    for ctr in range(arg.number_of_runs):
        if arg.graph_generation == "guyondata":
            G = Datasets.get_guyon_graph(ctr + 1)
        else:
            G = Datasets.get_scale_free_graph_edge(arg.network_size,
                                                   initial_module, nb_modules,
                                                   arg.module_size, arg.prob_p,
                                                   arg.prob_q,
                                                   arg.removed_edges, rng)

        rate_conection = len(G.edges) / len(G.nodes)

        average_shortest_paths = []
        for _, cluster in Datasets.get_groups(G).items():
            nodes = list(cluster)
            average_shortest_paths.append(
                Scores.average_shortest_path(G, nodes))
        result = str(ctr) + "," + str(arg.network_size) + "," + str(