Esempio n. 1
0
class Tester:
    """Evaluation helper built around a MaskRCNN model.

    Wires together a ``DatasetManager`` (constructed with ``train='val'``),
    a ``DataLoader`` over it and a MaskRCNN restored from the checkpoint at
    ``cfg.load_model_path``; accumulated predictions can be dumped as JSON.
    All settings are read from the module-level ``cfg`` object.
    """

    def __init__(self):
        # Dataset, loader and model are created later by the build/load methods
        self.dataset = self.dataloader = self.model = None
        self.lr = cfg.lr
        # Flat list of every result passed to save_results()
        self.predictions = []

    def build_dataloader(self):
        """Create the dataset and a DataLoader that batches it with the
        dataset's own ``collate_fn``."""
        self.dataset = DatasetManager(cfg.train_datasets, train='val')

        loader_options = dict(
            batch_size=cfg.batch_size,
            num_workers=cfg.num_worker,
            shuffle=cfg.shuffle,
            collate_fn=self.dataset.collate_fn,
        )
        self.dataloader = DataLoader(self.dataset, **loader_options)

    def load_model(self):
        """Instantiate MaskRCNN on the GPU in eval mode and load the weights
        stored under ``'model_state_dict'`` in the checkpoint file."""
        self.model = network = MaskRCNN()
        network.cuda()
        network.eval()

        checkpoint_path = cfg.load_model_path
        print(checkpoint_path)
        state = torch.load(checkpoint_path)
        network.load_state_dict(state['model_state_dict'])
        print(network)

    def save_results(self, result):
        """Append every item of ``result`` (an iterable) to ``self.predictions``."""
        self.predictions.extend(result)

    def save_jsons(self):
        """Serialise the accumulated predictions to ``cfg.save_result_path``."""
        with open(cfg.save_result_path, 'w') as json_file:
            json.dump(self.predictions, json_file)

    def evaluate(self):
        """Delegate evaluation to the dataset object."""
        self.dataset.evaluate()
Esempio n. 2
0
def process():
    '''
    process

    Wrapper to execute pre-processing pipeline
    Load the configuration file (``config.json`` in the current working
    directory), declare utility objects and run
    ``DatasetManager.process_images()``.

    Raises:
        TypeError: the top-level JSON value of the configuration is not an
            object (dict).
        ValueError: the configuration is an empty dict.
        KeyError: a required section ("verbose", "pathManager",
            "transformManager", "datasetManager") is missing.
    '''
    # Context manager so the handle is released even when parsing fails
    with open("config.json") as config_file:
        config = json.load(config_file)

    if not isinstance(config, dict):
        # Report the type of the object that was actually loaded
        raise TypeError("Expected dict; got %s" % type(config).__name__)
    if not config:
        raise ValueError("Expected %s dict; got empty dict" %
                         os.path.basename(__file__))

    verbose = config["verbose"]
    # Verbose mode: verbosePrint echoes its arguments, otherwise it is a no-op.
    # NOTE(review): verbosePrint is not used further down in this function.
    if verbose:

        def verbosePrint(*args):
            for arg in args:
                print(arg, )
    else:

        def verbosePrint(*args):
            return None

    path_manager = PathManager(config["pathManager"])
    export_path = path_manager.visuals_data_dir()

    transforms_manager = TransformManager(config["transformManager"])
    transforms = transforms_manager.transforms()

    # Pre-process dataset
    dataset_manager = DatasetManager(config["datasetManager"], path_manager,
                                     transforms)
    dataset_manager.process_images()
Esempio n. 3
0
    def build_dataloader(self):
        """Create the dataset (``train='val'``) and a DataLoader over it.

        Batch size, worker count and shuffling come from ``cfg``; batches are
        assembled with the dataset's own ``collate_fn``.
        """
        self.dataset = DatasetManager(cfg.train_datasets, train='val')

        loader_options = dict(
            batch_size=cfg.batch_size,
            num_workers=cfg.num_worker,
            shuffle=cfg.shuffle,
            collate_fn=self.dataset.collate_fn,
        )
        self.dataloader = DataLoader(self.dataset, **loader_options)
Esempio n. 4
0
def preprocessManual():
    """
    Preprocess one audio feature manually for the evaluation label file.

    Loads the data listed in the evaluation CSV without labels, prepares the
    test data and loads feature id 21 using
    "processed_data/mfcc_LRD_eval.npy".
    """
    dataset_root = '../Dataset'
    evaluation_labels = '../Dataset/evaluate/evaluate_labels.csv'
    feature_id, feature_file = 21, "processed_data/mfcc_LRD_eval.npy"

    print("Preprocessing Starts...")

    # Load all the dataset; an empty string is passed in place of a training
    # label file because only the evaluation CSV is used here.
    manager = DatasetManager("", evaluation_labels, dataset_root)
    manager.load_all_data(with_labels=False)

    print("Preparing Data...")
    manager.prepare_test_data()

    print("Loading features...")
    manager.load_feature(feature_id, feature_file)

    # Only the progress message is printed; no normalisation is performed here
    print("normalizing")
Esempio n. 5
0
def preprocessAll():
    """
    Preprocess some of the audio features before running the codes.

    For every (feature name, feature id) pair: load the train/test data,
    prepare the CSV files, load the feature through
    "processed_data/<name>.npy" and call ``bm.computeNormalized`` with
    "processed_data/<name>_norm_std.npy" and
    "processed_data/<name>_norm_mean.npy".
    """
    # (feature name, feature id) pairs -- refer to the readme for the ids.
    # NOTE(review): "mfcc_LRD_spec" is listed for both id 15 and id 16, so the
    # second pass reuses the same three file names -- confirm the intended
    # name for one of them.
    feature_table = [
        ("mono_spec", 0),
        ("left_spec", 1),
        ("right_spec", 2),
        ("LR_spec", 3),
        ("diff_spec", 4),
        ("LRD_spec", 6),
        ("hpss_spec", 7),
        ("hpssmono_spec", 8),
        ("mfcc_mono_spec", 11),
        ("mfcc_left_spec", 12),
        ("mfcc_right_spec", 13),
        ("mfcc_diff_spec", 14),
        ("mfcc_LRD_spec", 15),
        ("mfcc_LRD_spec", 16),
    ]

    train_labels_dir = '../Dataset/train/train_labels.csv'
    test_labels_dir = '../Dataset/test/test_labels.csv'
    root_dir = '../Dataset'

    for name, fid in feature_table:
        feature = "processed_data/{}.npy".format(name)
        norm_mean = "processed_data/{}_norm_mean.npy".format(name)
        norm_std = "processed_data/{}_norm_std.npy".format(name)

        print("Preprocessing Starts...")

        # Load all the dataset
        data_manager = DatasetManager(train_labels_dir, test_labels_dir,
                                      root_dir)
        data_manager.load_all_data(include_test=True)

        print("Preparing Data...")
        train_csv, test_csv = data_manager.prepare_data()

        print("Loading features...")
        # Pass the feature file path (previously computed but unused); the
        # other callers of load_feature also hand it a path, not a bare name.
        data_manager.load_feature(fid, feature)

        print("normalizing")
        bm.computeNormalized(norm_std, norm_mean, train_csv, root_dir,
                             data_manager)
def build_stack_model():
    """
    Stacking (Meta Ensembling) - Ensemble Technique to combine multiple models to generate a new model

    Pipeline:
        0. load the train/test data
        1. partition the training data into K_FOLD folds
        2. allocate train_meta / test_meta with one column per base model
        3. fill train_meta fold by fold with each model's predictions for the
           held-out (validate) indices
        4. build every base model on the full training data and fill test_meta
           with its predictions
        5./6. fit the classifier on train_meta, predict on test_meta, log metrics
        7. save the stacked model

    Paths, model lists and fold settings are read from module-level variables
    (train_labels_dir, test_labels_dir, root_dir, processed_root_dir, ...).

    Referenced from http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/
    Referenced from https://towardsdatascience.com/how-to-train-an-image-classifier-in-pytorch-and-use-it-to-perform-basic-inference-on-single-images-99465a1e9bf5
    """

    def _log(text):
        # Every message is prefixed with the module name and passed along
        # with otherlogs=["test_acc"].
        loghub.logMsg(msg="{}: {}".format(__name__, text),
                      otherlogs=["test_acc"])

    # 0. Split training & test data (should be the same as the one used to train the models)
    data_manager = DatasetManager(train_labels_dir, test_labels_dir, root_dir)
    data_manager.load_all_data(include_test=True)

    # 1. Partition Training Data into K folds
    kfolds = data_manager.apply_k_fold(K_FOLD)

    # 2. Create 2 datasets (train_meta & test_meta) with n empty columns
    #    (M1, M2, ... Mn) where n = number of models; rows = audio data
    train_meta = np.empty((data_manager.get_train_data_size(),
                           len(save_models)))
    test_meta = np.empty((data_manager.get_test_data_size(),
                          len(save_models)))

    # 3. Apply K-fold cross validation to fill up the columns of train_meta
    _log("Getting Prediction Results to fill in train_meta")
    for fold, (train, validate) in enumerate(kfolds):
        # train / validate are lists of indices
        fold_no = fold + 1
        _log("Cross Validation Fold #{}...".format(fold_no))

        for m, model_file in enumerate(save_models):
            _log("Fold #{} for model ({})...".format(fold_no, model_file))

            # Load/Preprocess Feature for model
            feature_path = os.path.join(processed_root_dir,
                                        preprocessed_features[m])
            data_manager.load_feature(feat_indices[m], feature_path)

            # Prepare data restricted to this fold
            train_csv, test_csv = data_manager.prepare_data(
                train_indices=train,
                test_indices=validate,
                train_csv=temp_train_csv_file,
                test_csv=temp_test_csv_file,
                train_only=True)

            # Per-fold normalisation files
            norm_std = os.path.join(processed_root_dir,
                                    fold_norm_stds[m][fold])
            norm_mean = os.path.join(processed_root_dir,
                                     fold_norm_means[m][fold])

            # Build Model & get prediction results
            _, predictions = bm.buildCNNModel(
                train_csv=train_csv,
                test_csv=test_csv,
                norm_std=norm_std,
                norm_mean=norm_mean,
                data_manager=data_manager,
                num_of_channel=num_of_channels[m],
                save_model=False)

            # Write each prediction back to the row of the sample it belongs to
            for j, row in enumerate(validate):
                train_meta[row][m] = predictions[j]  # data x model

        _log("End of Fold #{}".format(fold_no))

    _log("Train_meta generated successfully.")

    # 4. Fit each model to the full training dataset & make predictions on
    #    the test dataset, store into test_meta
    _log("Getting Prediction Results to fill in test_meta...")
    for m, model_file in enumerate(save_models):
        # Load/Preprocess Feature for model
        feature_path = os.path.join(processed_root_dir,
                                    preprocessed_features[m])
        data_manager.load_feature(feat_indices[m], feature_path)

        # Prepare data
        train_csv, test_csv = data_manager.prepare_data(
            train_csv=temp_train_csv_file, test_csv=temp_test_csv_file)

        # Normalisation files for the full training set
        norm_std = os.path.join(processed_root_dir, norm_stds[m])
        norm_mean = os.path.join(processed_root_dir, norm_means[m])

        # Name handed to buildCNNModel together with save_model=True
        model_name = os.path.join(processed_root_dir, model_file)

        # Build Model & get prediction results
        _, predictions = bm.buildCNNModel(
            train_csv=train_csv,
            test_csv=test_csv,
            norm_std=norm_std,
            norm_mean=norm_mean,
            data_manager=data_manager,
            num_of_channel=num_of_channels[m],
            saved_model_name=model_name,
            save_model=True)

        # Fill up test_meta with the prediction results of test.csv
        for j in range(data_manager.get_test_data_size()):
            test_meta[j][m] = predictions[j]  # data x model

    _log("Test_meta generated successfully.")

    # 5. Fit (stacking model S) to train_meta, using (M1, M2, ... Mn) as features.
    # 6. Use the stacked model S to make final predictions on test_meta
    train_meta_labels = np.asarray(data_manager.train_label_indices)
    test_meta_labels = np.asarray(data_manager.test_label_indices)

    classifier = ClassifierModel(train_meta, train_meta_labels, test_meta,
                                 test_meta_labels)
    predicts = classifier.run_decision_tree_classification()

    # Evaluate
    precision, recall, f1_measure = classifier.evaluate_prediction(predicts)
    correct, total = classifier.get_accuracy(predicts)
    percentage = 100 * correct / total

    _log("Stacked Model Prediction:\nAccuracy: {}/{} ({:.0f}%)\n\tPrecision: {}\n\tRecall: {}\n\tF1 Measure:{}"
         .format(correct, total, percentage, precision, recall, f1_measure))

    # 7. Save the ensemble model
    stacked_model_filepath = os.path.join(processed_root_dir,
                                          stacked_model_name)
    classifier.save_model(stacked_model_filepath)
def predict_with_stack_model(with_labels=True):
    """
    Load the previously saved models and predict labels with the stacked model.

    Every base model listed in ``save_models`` is run through
    ``bm.testCNNModel``; its predictions form one column of the input vector
    that is passed to the pickled stacked model.

    Args:
        with_labels (bool): Indicator to tell us if there are labels in the data.
            - True: test data (has labels); the accuracy is logged.
            - False: evaluation data (no labels); the predictions are written
              to ``predict_results_csv``.

    Paths and model lists are read from module-level variables
    (train_labels_dir, test_labels_dir, eval_labels_dir, root_dir,
    processed_root_dir, ...).
    """

    # 1. Load the Testing Data ##############################################

    if with_labels:
        # Test Dataset (with labels)
        data_manager = DatasetManager(train_labels_dir, test_labels_dir,
                                      root_dir)
        data_manager.load_all_data(with_labels=True)
        feature_files = preprocessed_features
    else:
        # Evaluation Dataset (with no labels)
        data_manager = DatasetManager("", eval_labels_dir, root_dir)
        data_manager.load_all_data(with_labels=False)
        feature_files = preprocessed_features_test

    # Initialize the input_vector for stacked model
    input_vect = np.empty(
        (data_manager.get_test_data_size(),
         len(save_models)))  # (n x m) where n = audio data, m = model

    # 2. Get Prediction Results from each Model #############################

    for i in range(len(save_models)):
        # Get feature index
        fid = feat_indices[i]

        # Preprocess Feature for model
        preprocessed_features_filepath = os.path.join(processed_root_dir,
                                                      feature_files[i])
        # TODO: this has to be removed -- when predicting, no preprocessed
        # audio file will exist because the audio is unknown (leave it blank).
        data_manager.load_feature(fid, preprocessed_features_filepath)

        # Prepare data
        if with_labels:
            _, test_csv = data_manager.prepare_data(
                train_csv=temp_train_csv_file, test_csv=temp_test_csv_file)
        else:
            test_csv = data_manager.prepare_test_data(
                test_csv=temp_test_csv_file)

        # Get Normalized preprocessed data file
        norm_std = os.path.join(processed_root_dir, norm_stds[i])
        norm_mean = os.path.join(processed_root_dir, norm_means[i])

        # Get saved model path
        saved_model_path = os.path.join(processed_root_dir, save_models[i])

        # Test the saved model & get prediction results. The call is the same
        # for labelled and unlabelled data; only the flag differs.
        predictions = bm.testCNNModel(saved_model_path=saved_model_path,
                                      test_csv=test_csv,
                                      norm_std=norm_std,
                                      norm_mean=norm_mean,
                                      data_manager=data_manager,
                                      num_of_channel=num_of_channels[i],
                                      with_labels=with_labels)

        # Fill up the input_vector with predictions results from model
        for j in range(data_manager.get_test_data_size()):
            input_vect[j][i] = predictions[j]

    # 3. Get Prediction Results from Stack Model based on input_vector ######

    # Load the stacked model; the context manager closes the file handle.
    # NOTE(security): pickle.load executes arbitrary code from the file --
    # only load model files produced by this project.
    stacked_model_filepath = os.path.join(processed_root_dir,
                                          stacked_model_name)
    with open(stacked_model_filepath, 'rb') as model_file:
        stacked_em = pickle.load(model_file)

    # Get Prediction Results
    predicts = stacked_em.predict(input_vect)

    if with_labels:
        # Test Dataset (with labels): log the prediction accuracy
        correct, total = util.compare_list_elements(
            predicts, data_manager.test_label_indices)
        # Guard against an empty test set instead of raising ZeroDivisionError
        percentage = 100 * correct / total if total else 0
        loghub.logMsg(
            msg="{}: Stacked Model Prediction Accuracy: {}/{} ({:.0f}%)".
            format(__name__, correct, total, percentage),
            otherlogs=["test_acc"])
    else:
        # Evaluation Dataset (with no labels): store the prediction results
        dcase_eval_data = DCASEDataset(eval_labels_dir, root_dir, data_manager)

        results = []
        headers = ["filename", "label", "label_index"]
        for i in range(len(dcase_eval_data) - 1):
            pred_idx = int(predicts[i])
            results.append([
                # first line of datalist is the header, so add 1 to skip it
                dcase_eval_data.datalist[i + 1],
                dcase_eval_data.default_labels[pred_idx],
                pred_idx,
            ])
        # Write to csv file
        util.write_to_csv_file(results, predict_results_csv, headers)
Esempio n. 8
0
def main():
    """
    Train a Conv1D + LSTM classifier that predicts the language of a source
    file, save it, and print its loss/accuracy on the held-out split.

    The tokenizer word index is written to 'tmp/wordindex.json', the model to
    'tmp/code_model.h5' and its weights to 'tmp/code_model_weights.h5', all
    below ``FileManager.getRootUrl()``.
    """
    # load the dataset
    manager = DatasetManager()
    manager.initialize('CNN').load()

    # Collect every original source file with its lower-cased language name
    sources = []
    labels = []
    training_url = FileManager.datasets['training']['url']
    for language_dir in FileManager.getLanguagesFolders(training_url):
        for example_dir in FileManager.getExamplesFolders(language_dir.path):
            source_url = FileManager.getOriginalFileUrl(example_dir.path)
            sources.append(FileManager.readFile(source_url))
            labels.append(str(language_dir.name).lower())

    # Hyper-parameters
    vocabulary_size = 100000
    embedding_size = 128
    lstm_units = 64
    batch_size = 32
    epochs = 30
    holdout_fraction = 0.001
    sequence_length = 100

    # Fit the tokenizer and persist its word index
    tokenizer = Tokenizer(num_words=vocabulary_size)
    tokenizer.fit_on_texts(sources)
    word_index = tokenizer.word_index
    FileManager.createFile(
        os.path.join(FileManager.getRootUrl(), 'tmp/wordindex.json'),
        json.dumps(word_index))

    # Encode the sources as fixed-length index sequences, labels as one-hot
    features = pad_sequences(tokenizer.texts_to_sequences(sources),
                             sequence_length)
    targets = pd.get_dummies(labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, targets, test_size=holdout_fraction)

    # Embedding -> (Conv1D + MaxPooling1D) x 2 -> LSTM -> Dropout -> Dense x 2
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_size,
                        input_length=sequence_length))
    model.add(Conv1D(filters=128, kernel_size=3, padding='same',
                     dilation_rate=1, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Conv1D(filters=64, kernel_size=3, padding='same',
                     dilation_rate=1, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(lstm_units))
    model.add(Dropout(0.5))
    model.add(Dense(64))
    model.add(Dense(len(targets.columns), activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)

    # Persist the full model and, separately, its weights
    root_url = FileManager.getRootUrl()
    model.save(os.path.join(root_url, 'tmp/code_model.h5'))
    model.save_weights(
        os.path.join(FileManager.getRootUrl(), 'tmp/code_model_weights.h5'))

    # Report the metrics on the held-out split
    score, acc = model.evaluate(X_test, Y_test, verbose=2,
                                batch_size=batch_size)
    print(model.metrics_names)
    print("Validation loss: %f" % score)
    print("Validation acc: %f" % acc)
Esempio n. 9
0
def predict_with_stack_model(filename, label='', labelidx=0):
    """
    Predict the class of a single audio file with the stacked model.

    A one-row 'static/test_labels.csv' is written for the file, every base
    model in ``save_models`` is run through ``bm.testSingleFile`` and the
    resulting vector is passed to the pickled stacked model.

    Args:
        filename (str): path of the audio file; only the part after the last
            "/" is used.
        label (str): currently ignored -- overwritten with "park" below.
        labelidx (int): currently ignored -- overwritten with 4 below.

    Returns:
        The first element of the stacked model's ``predict`` output.
    """
    test_labels_dir = 'test_labels.csv'
    root_dir = 'static'
    processed_root_dir = 'processed_data'

    # Keep only the file name component
    filename = filename.split("/")[-1]

    # Prepare csv file path
    test_filepath = os.path.join(root_dir, test_labels_dir)
    # NOTE(review): the caller-supplied label/labelidx are discarded and these
    # fixed values are used instead -- confirm this is intentional.
    label = "park"
    labelidx = 4

    # Write the single row into the test csv file. newline='' is what the csv
    # module requires to avoid extra blank lines on some platforms; the
    # context manager closes the file.
    with open(test_filepath, 'w', newline='') as csvFile:
        csv.writer(csvFile).writerows([[filename, label, labelidx]])

    # Load all the dataset
    data_manager = DatasetManager("", 'static/test_labels.csv', root_dir)
    data_manager.load_all_data()

    # Initialize the input_vector for stacked model
    input_vect = np.empty((1, len(save_models)))

    # 2. Get Prediction Results from each Model #############################

    for i in range(len(save_models)):
        # Get feature index
        fid = feat_indices[i]

        # Preprocess Feature for model
        preprocessed_features_filepath = os.path.join(processed_root_dir,
                                                      preprocessed_features[i])
        # TODO: this has to be removed -- when predicting, no preprocessed
        # audio file will exist because the audio is unknown (leave it blank).
        data_manager.load_feature(fid, preprocessed_features_filepath)

        # Prepare data
        test_csv = data_manager.prepare_single_data(filename, label, labelidx)

        # Get Normalized preprocessed data file
        norm_std = os.path.join(processed_root_dir, norm_stds[i])
        norm_mean = os.path.join(processed_root_dir, norm_means[i])

        # Get saved model path
        saved_model_path = os.path.join(processed_root_dir, save_models[i])

        # Test the saved model & get prediction results
        predictions = bm.testSingleFile(saved_model_path=saved_model_path,
                                        test_csv=test_csv,
                                        norm_std=norm_std,
                                        norm_mean=norm_mean,
                                        data_manager=data_manager,
                                        num_of_channel=num_of_channels[i])
        print(predictions)

        input_vect[0][i] = predictions

    # 3. Get Prediction Results from Stack Model based on input_vector ######

    # Load the stacked model; the context manager closes the file handle.
    # NOTE(security): pickle.load executes arbitrary code from the file --
    # only load model files produced by this project.
    stacked_model_filepath = os.path.join(processed_root_dir,
                                          stacked_model_name)
    with open(stacked_model_filepath, 'rb') as model_file:
        stacked_em = pickle.load(model_file)

    # Get Prediction Results
    predicts = stacked_em.predict(input_vect)

    # Remove the temporary feature file if one is present
    if os.path.isfile("processed_data/test_spec.npy"):
        os.remove("processed_data/test_spec.npy")
    return predicts[0]


# filename = 'test/audio/4286.wav'
# label = 'park'
# labelidx = 4
# predict_with_stack_model(filename,label,labelidx)
Esempio n. 10
0
                                                     cache_dir=cache_dir)
    tokenizer = transformers.BertTokenizer.from_pretrained(model_name_or_path,
                                                           do_lower_case=do_lower_case,
                                                           cache_dir=cache_dir)
    model = models.BertForWikiQA.from_pretrained(model_name_or_path,
                                                 from_tf=bool('.ckpt' in model_name_or_path),
                                                 config=config,
                                                 cache_dir=cache_dir)

    ####### LOAD DATA
    dataset = DatasetManager(
        train_file, valid_file, test_file,
        max_seq_length,
        tokenizer,
        pad_position='right',
        device=device,
        logger=logger,
        mask_padding_with_zero=True,
        pad_token_segment_id=0,
        pad_token=0
    )

    # move model to CUDA
    model = model.cuda(device=device)

    # define optimizer
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
Esempio n. 11
0
 def __init__(self):
     """Set the default state and create the DatasetManager."""
     self.type: str = 'MISSING'
     self.Dataset: DatasetInstance = None
     # Real assignment: the former bare annotation (`self.model: None`) never
     # created the attribute, so reading self.model raised AttributeError.
     self.model = None
     self.config: dict = {}
     self.DatasetManger = DatasetManager()
Esempio n. 12
0
class _BaseAlgorithm:
    """Shared plumbing for the algorithms: dataset loading, vocabulary and
    trained-model import/export, and source extraction.

    ``self.type`` selects the dataset and the vocabulary/model/report files
    through ``DatasetManager`` and ``FileManager``.
    """

    def __init__(self):
        self.type: str = 'MISSING'
        self.Dataset: DatasetInstance = None
        # Real assignment: the former bare annotation (`self.model: None`)
        # never created the attribute, so reading self.model before an
        # import*TrainedModel call raised AttributeError.
        self.model = None
        self.config: dict = {}
        self.DatasetManger = DatasetManager()

    def initialize(self):
        """Load the dataset for ``self.type`` and keep its instance.

        Returns:
            self, to allow call chaining.
        """
        # load the dataset
        self.DatasetManger.initialize(self.type).load()
        # save the dataset instance
        self.Dataset = self.DatasetManger.Dataset

        return self

    #

    def importVocabulary(self):
        """Return the vocabulary parsed from the JSON vocabulary file."""
        return json.loads(FileManager.readFile(FileManager.getVocabularyFileUrl(self.type)))

    def exportVocabulary(self, indexes):
        """Write ``indexes`` as JSON to the vocabulary file; returns self."""
        FileManager.writeFile(FileManager.getVocabularyFileUrl(self.type), json.dumps(indexes))
        return self

    def importKerasTrainedModel(self):
        """Load the trained model file with ``load_model``; returns self."""
        self.model = load_model(FileManager.getTrainedModelFileUrl(self.type))
        return self

    def importScikitTrainedModel(self):
        """Load the trained model file with ``joblib.load``; returns self."""
        self.model = joblib.load(FileManager.getTrainedModelFileUrl(self.type))
        return self

    def exportKerasTrainedModel(self):
        """Save the model with its own ``save`` method; returns self."""
        self.model.save(FileManager.getTrainedModelFileUrl(self.type))
        return self

    def exportScikitTrainedModel(self):
        """Save the model with ``joblib.dump``; returns self."""
        joblib.dump(self.model, FileManager.getTrainedModelFileUrl(self.type))
        return self

    def exportClassificationReport(self, report: str):
        """Write ``report`` to the report file for ``self.type``."""
        FileManager.writeFile(FileManager.getReportFileUrl(self.type), report)

    #

    def generateWordsIndexesForUnknownExample(self, wordsIndexes, source: str):
        """Convert ``source`` into a list of single-element index lists.

        Args:
            wordsIndexes: container mapping a word to its index.
            source: text to convert.

        Returns:
            One ``[index]`` entry per word of ``source``. Words missing from
            ``wordsIndexes``, or whose index is greater than
            ``self.config['max_features']``, are encoded as ``[0]``.
        """
        wordvec = []
        max_features: int = self.config['max_features']

        # The output has one entry per word of `source`; no padding or
        # truncation to a common length happens here.
        for word in kpt.text_to_word_sequence(source, filters=TOKENIZER_CONFIG['filter']):
            if word in wordsIndexes and wordsIndexes[word] <= max_features:
                wordvec.append([wordsIndexes[word]])
            else:
                wordvec.append([0])

        return wordvec

    #

    def extractSources(self, dataset: str, sourceType: str = 'parsed'):
        """Collect the cleaned sources of ``dataset`` and their languages.

        Args:
            dataset: name handed to ``self.Dataset.getSources``.
            sourceType: key of each example dict that holds the source text.

        Returns:
            ``(X_raw, Y_raw)``: the cleaned source strings and, at the same
            positions, the language each one belongs to.
        """
        X_raw = []
        Y_raw = []
        sources: dict = self.Dataset.getSources(dataset)

        for language in sources:
            for exampleDict in sources[language]:
                source = str(exampleDict[sourceType])
                # Drop the escaped placeholder tokens and flatten newlines
                source = source.replace(ESCAPED_TOKENS['ALPHA'], '')
                source = source.replace(ESCAPED_TOKENS['NUMBER'], '')
                source = source.replace(ESCAPED_TOKENS['NOT_RELEVANT'], '')
                source = source.replace('\n', ' ')

                # Collapse runs of spaces by discarding blank fragments
                source = ' '.join([w for w in source.split(' ') if len(w.strip()) > 0])

                X_raw.append(source)
                Y_raw.append(language)

        return X_raw, Y_raw
Esempio n. 13
0
def main():
    """Train and evaluate the baseline ASC model from the command line.

    Steps, in order:
      0. Parse the command-line options and seed torch.
      1. Load the dataset through ``DatasetManager``, load (or compute and
         cache) the normalization mean/std, and build the train/test
         ``DataLoader`` objects.
      2. Build ``BaselineASC`` and an Adam optimizer.
      3. Train for ``--epochs`` epochs, evaluating on both the training and
         the test loader after every epoch.
      4. Save the model ``state_dict`` when ``--save-model`` is given.

    Relies on module-level names defined elsewhere in the file:
    ``feature_index``, ``preprocessed_features``, ``temp_train_csv_file``,
    ``temp_test_csv_file``, ``preprocessed_norm_mean_file``,
    ``preprocessed_norm_std_file``, ``num_of_channel``, ``saved_model`` and
    ``loghub``.
    """

    # Initialize Timer
    timer = StopWatch()
    timer.startTimer()

    # Step 0: Setting up Training Settings ##################################################

    # Training settings
    parser = argparse.ArgumentParser(
        description='PyTorch Baseline code for ASC Group Project (CS4347)')
    parser.add_argument('--batch-size',
                        type=int,
                        default=16,
                        metavar='N',
                        help='input batch size for training (default: 16)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=16,
                        metavar='N',
                        help='input batch size for testing (default: 16)')
    parser.add_argument('--epochs',
                        type=int,
                        default=200,
                        metavar='N',
                        help='number of epochs to train (default: 200)')
    # The help text must state the real default (0.01); it previously
    # advertised 0.001, which misled anyone reading `--help`.
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    # Step 1a: Preparing Data - Extract data ###########################################################

    # init the train and test directories
    train_labels_dir = '../Dataset/train/train_labels.csv'
    test_labels_dir = '../Dataset/test/test_labels.csv'
    root_dir = '../Dataset'

    # Load all the dataset
    data_manager = DatasetManager(train_labels_dir, test_labels_dir, root_dir)
    data_manager.load_all_data(include_test=False)

    # Load/Preprocess Feature for model
    data_manager.load_feature(feature_index, preprocessed_features)

    # Prepare data; from here on the label paths are the ones returned by
    # the data manager, not the original CSV locations.
    train_labels_dir, test_labels_dir = data_manager.prepare_data(
        train_csv=temp_train_csv_file, test_csv=temp_test_csv_file)

    # Step 1b: Preparing Data - Transform Data #########################################################

    # Compute Normalization score
    if os.path.isfile(preprocessed_norm_mean_file) and os.path.isfile(
            preprocessed_norm_std_file):
        # Both cached statistics exist: reuse them instead of recomputing.
        mean = np.load(preprocessed_norm_mean_file)
        std = np.load(preprocessed_norm_std_file)
    else:
        # If not, run the normalization and save the mean/std
        loghub.logMsg(
            msg="{}: DATA NORMALIZATION : ACCUMULATING THE DATA".format(
                __name__),
            otherlogs=["test_acc"])
        # load the dataset (without any transform) to accumulate statistics
        dcase_dataset = DCASEDataset(train_labels_dir, root_dir, data_manager,
                                     True)
        mean, std = NormalizeData(train_labels_dir, root_dir, dcase_dataset)
        np.save(preprocessed_norm_mean_file, mean)
        np.save(preprocessed_norm_std_file, std)
        loghub.logMsg(msg="{}: DATA NORMALIZATION COMPLETED".format(__name__),
                      otherlogs=["test_acc"])

    # Convert to Torch Tensors
    mean = torch.from_numpy(mean)
    std = torch.from_numpy(std)

    # Reshape to (C x H x W) with a trailing axis of size 1 so the statistics
    # broadcast along the last (time) dimension of each sample.
    mean = torch.reshape(mean, [num_of_channel, 40, 1])
    std = torch.reshape(std, [num_of_channel, 40, 1])

    # init the data_transform
    data_transform = transforms.Compose(
        [cnn.ToTensor(), cnn.Normalize(mean, std)])

    # init the datasets
    dcase_dataset = DCASEDataset(csv_file=train_labels_dir,
                                 root_dir=root_dir,
                                 data_manager=data_manager,
                                 is_train_data=True,
                                 transform=data_transform)
    dcase_dataset_test = DCASEDataset(csv_file=test_labels_dir,
                                      root_dir=root_dir,
                                      data_manager=data_manager,
                                      is_train_data=False,
                                      transform=data_transform)

    # Step 1c: Preparing Data - Load Data ###############################################################

    # Worker processes and pinned memory are only requested when CUDA is used
    kwargs = {'num_workers': 16, 'pin_memory': True} if use_cuda else {}

    # get the training and testing data loader
    train_loader = torch.utils.data.DataLoader(dcase_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)

    test_loader = torch.utils.data.DataLoader(dcase_dataset_test,
                                              batch_size=args.test_batch_size,
                                              shuffle=False,
                                              **kwargs)

    # Step 2: Build Model ###############################################################

    # init the model
    model = BaselineASC(num_of_channel).to(device)

    # init the optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Step 3: Train Model ###############################################################

    loghub.logMsg(msg="{}: MODEL TRAINING START.".format(__name__),
                  otherlogs=["test_acc"])
    # Train one epoch, then report accuracy on both the training and the
    # test loader so the two can be compared epoch by epoch.
    for epoch in range(1, args.epochs + 1):
        cnn.train(args, model, device, train_loader, optimizer, epoch)
        cnn.test(args, model, device, train_loader, 'Training Data')
        cnn.test(args, model, device, test_loader, 'Test Data')

    loghub.logMsg(msg="{}: MODEL TRAINING END.".format(__name__),
                  otherlogs=["test_acc"])

    # Step 4: Save Model ################################################################

    # save the model
    if args.save_model:
        torch.save(model.state_dict(), saved_model)

    # stop timer
    timer.stopTimer()
    timer.printElapsedTime()