Example #1
def __init__(self, project_name, model_name='new'):
    # prepare the dataset
    prepare_dataset.prepare_dataset(project_name)
    # create a ValidateClass object and validate the paths and config file
    validObj = validation.ValidateClass(project_name)
    validObj.get_validated_paths()
    # keep the validated paths and config
    self.paths = validObj.paths
    self.config = validObj.config
    # set the id of the current run
    self.run_id = self.get_run_id()
    # model name from the function argument
    self.model_name = model_name
    # ModelClass object used to operate model.py
    self.modelObj = model.ModelClass(project_name)
Example #2
def label():
    file = request.files["file"]
    label = request.form['label']

    label_dir = os.path.join("dataset", label)
    if os.path.isdir(label_dir):
        print("Word already exists")
    else:
        print("Word does not exist")
        os.mkdir(label_dir)
        file.save(os.path.join(label_dir, file.filename))
        print("=-=-=-=-=-=-= \n DONE TRAINING")

    prepare_dataset.prepare_dataset(DATASET_PATH, JSON_PATH)
    train.main()

    return jsonify({})
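The snippet above omits the surrounding Flask wiring. A minimal sketch of how the handler could be registered is shown below; the route path, the imported module names, and the DATASET_PATH / JSON_PATH values are assumptions, not taken from the source project.

import os

from flask import Flask, request, jsonify

import prepare_dataset
import train

# Assumed constants; the real values come from the source project.
DATASET_PATH = "dataset"
JSON_PATH = "data.json"

app = Flask(__name__)

# Hypothetical registration of the label() view defined above.
app.add_url_rule("/label", view_func=label, methods=["POST"])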
Example #3
def run_pconsc(fold):
    k = fold - 1
    print('Preparing data...')
    data, target, folds = prepare_dataset.prepare_dataset()
    print('Fitting random forest...')
    forest = fit_data.fit_data(k, data, target, folds)
    data_io.save_random_forest(forest, constants.intermediate_path, 'pconsc_random_forest_' + str(k) + '.pkl.tar.gz')
    print('Predicting test data...')
    predict_data.predict_data(k, data, folds, forest, 'pconsc/')
Example #4
def run_pconsc(fold):
    k = fold - 1
    print('Preparing data...')
    data, target, folds = prepare_dataset.prepare_dataset()
    print('Fitting random forest...')
    forest = fit_data.fit_data(k, data, target, folds)
    data_io.save_random_forest(
        forest, constants.intermediate_path,
        'pconsc_random_forest_' + str(k) + '.pkl.tar.gz')
    print('Predicting test data...')
    predict_data.predict_data(k, data, folds, forest, 'pconsc/')
Example #5
def run_pconsc2(fold):
    k = fold - 1
    print('Preparing data...')
    base_data, target, folds = prepare_dataset.prepare_dataset()
    for i in range(constants.number_of_layers + 1):
        if i == 0:
            data = base_data
        else:
            print('Getting layer ' + str(i) + ' data...')
            data = next_layer_dataset.next_layer_dataset(base_data, data, forest)

        print('Fitting random forest...')
        forest = fit_data.fit_data(k, data, target, folds)
        data_io.save_random_forest(forest, constants.intermediate_path, 'pconsc2_random_forest_' + str(k) + '_layer_' + str(i) + '.pkl.tar.gz')
        print('Predicting test data...')
        predict_data.predict_data(k, data, folds, forest, 'pconsc2_layer_' + str(i) + '/')
Example #6
def run_pconsc2(fold):
    k = fold - 1
    print('Preparing data...')
    base_data, target, folds = prepare_dataset.prepare_dataset()
    for i in range(constants.number_of_layers + 1):
        if i == 0:
            data = base_data
        else:
            print('Getting layer ' + str(i) + ' data...')
            data = next_layer_dataset.next_layer_dataset(
                base_data, data, forest)

        print('Fitting random forest...')
        forest = fit_data.fit_data(k, data, target, folds)
        data_io.save_random_forest(
            forest, constants.intermediate_path, 'pconsc2_random_forest_' +
            str(k) + '_layer_' + str(i) + '.pkl.tar.gz')
        print('Predicting test data...')
        predict_data.predict_data(k, data, folds, forest,
                                  'pconsc2_layer_' + str(i) + '/')
Example #7
def node(test_node, test_leaf):
    node_dataset, leaf_dataset, test_dataset = prepare_dataset.prepare_dataset(
        [[1], [7]])
    PATHS = ['./node_net.pth', './left_net.pth', './right_net.pth']

    if test_node:
        for h in range(1):
            training.train(leaf_dataset, PATHS[1])
            training.train(leaf_dataset, PATHS[2])
            for i in range(1, 3, 1):
                print(f'Number of epochs: {i}')
                node = Node.Node(node_dataset, PATHS[1], PATHS[2], PATHS[0])
                node.train(i)
                testing_node.test(test_dataset, PATHS)
            #print('NEW TRY')

    if test_leaf:
        training.train(leaf_dataset, PATHS[1])
        training.train(leaf_dataset, PATHS[2])
        testing_leaf.test(test_dataset, PATHS[1])
        testing_leaf.test(test_dataset, PATHS[2])
Example #8
def main():
    """
    Generates the weights file based on the given text
    :return:
    """
    # Check for expected number of Arguments
    if len(argv) != number_of_args:
        exit("Invalid number of arguments")

    # Get train, test files path and output folder full path
    script, txt_file_path = argv

    # read txt and lowercase it
    txt = open(txt_file_path).read()
    txt = txt.lower()

    conversion_dic, n_chars, n_vocab = parse(txt)
    char_to_int = conversion_dic["char_to_int"]

    # prepare the dataset of input to output pairs encoded as integers
    seq_length = 100
    X, y, dataX = prepare_dataset(seq_length, txt, n_chars, char_to_int, n_vocab)
    # define the LSTM model
    model = Sequential()
    model.add(LSTM(512, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(512))
    model.add(Dropout(0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # define the checkpoint
    filepath = "weights-{epoch:02d}-{loss:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    # fit the model
    model.fit(X, y, epochs=2, batch_size=128, callbacks=callbacks_list)
    print "done!"
Example #9
import pandas as pd
import matplotlib.pyplot as plt

from prepare_dataset import prepare_dataset
from gains_losses_data import compute_gains_losses


def graph_builder_bar(graph):
    axes = graph.plot(kind='bar')
    plt.axhline(0, color='b')
    axes.legend(bbox_to_anchor=(1.5, 1.05))
    plt.show()


# Load dataset
df_hh = prepare_dataset()
df_hh = compute_gains_losses(df_hh)

# Compute total revenue from policy
tax_revenue_total = (df_hh['total_tax_increase'] *
                     df_hh['hh_weight']).sum() / 1000000
tax_revenue_per_uc = (df_hh['total_tax_increase'] *
                      df_hh['hh_weight']).sum() / (df_hh['consumption_units'] *
                                                   df_hh['hh_weight']).sum()
avg_loss_per_uc = (df_hh['total_expenditures_increase'] *
                   df_hh['hh_weight']).sum() / (df_hh['consumption_units'] *
                                                df_hh['hh_weight']).sum()


def incidence_decile(
    data, targeted, amount
Example #10
def main():
    """
    Receives 2 parameters - the text to use as a starting point, and the weights file
    :return: prints out 1000 characters based on the training and the chosen seed
    """
    # Check for expected number of Arguments
    if len(argv) != number_of_args:
        exit("Invalid number of arguments")

    # load ascii text and covert to lowercase
    filename = argv[1]  #text
    raw_text = open(filename).read()
    raw_text = raw_text.lower()

    # create mapping of unique chars to integers, and a reverse mapping
    chars = sorted(list(set(raw_text)))
    chars.insert(0, '\r')
    print(chars)
    char_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_char = dict((i, c) for i, c in enumerate(chars))
    # summarize the loaded data
    n_chars = len(raw_text)
    n_vocab = len(chars)
    print "Total Characters: ", n_chars
    print "Total Vocab: ", n_vocab
    # prepare the dataset of input to output pairs encoded as integers
    seq_length = 100
    X, y, dataX = prepare_dataset(seq_length, raw_text, n_chars, char_to_int,
                                  n_vocab)

    # define the LSTM model
    print(X.shape[1], X.shape[2])
    model = Sequential()
    model.add(
        LSTM(512, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(512))
    model.add(Dropout(0.2))
    model.add(Dense(y.shape[1], activation='softmax'))

    # load the network weights
    weightFile = argv[2]
    model.load_weights(weightFile)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # pick a random seed
    start = numpy.random.randint(0, len(dataX) - 1)
    pattern = dataX[start]
    print "Seed:"
    print "\"", ''.join([int_to_char[value] for value in pattern]), "\""
    # generate characters
    for i in range(1000):
        x = numpy.reshape(pattern, (1, len(pattern), 1))
        x = x / float(n_vocab)
        prediction = model.predict(x, verbose=0)
        index = numpy.argmax(prediction)
        result = int_to_char[index]
        seq_in = [int_to_char[value] for value in pattern]
        sys.stdout.write(result)
        pattern.append(index)
        pattern = pattern[1:len(pattern)]
    print "\nDone."
Example #11
def excute(config):
    torch.cuda.empty_cache()
    torch.manual_seed(1)

    model = HDIClassifier(config['waves'], config['windows'],
                          config['channel_n'], 2)
    device = torch.device(
        config['device']) if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=0.0005)
    loss_f = torch.nn.CrossEntropyLoss()  #FocalLoss(logits=True)
    epochs = 15
    batch_size = 270 * (int(math.ceil(3 / len(config['waves']))))

    tr_dataloader, vd_dataloader, te_dataloader = prepare_dataset(
        config['time_point'], batch_size, config['sqi'])

    loss_list = []
    result_df_list = []

    best_auroc = 0
    best_model_path = None

    save_path = make_save_folder(config)
    print(save_path)
    shutil.copytree('./', os.path.join(save_path, 'pyfile'))
    # /home/jjong/jjong/workplace/datathon_2019/pyfile
    print('Settings: {}M_{}'.format(config['time_point'],
                                    '_'.join(config['waves'])))
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        tr_pred_digit = []
        tr_pred_prob = []
        tr_target_digit = []

        for idx, (X, y) in enumerate(tr_dataloader):
            X, y = X.to(device), y.to(device).long()
            optim.zero_grad()

            output = model(X)

            #loss_f(output[:,1],tmp[1].squeeze(1).to(device))
            loss = loss_f(output, y.squeeze(1).to(device))
            loss.backward()
            optim.step()

            total_loss += loss.item()
            if idx % 50 == 0:
                print('{}[{}epoch][{}/{}iter][loss:{}]'.format(
                    config['device'], epoch, idx, len(tr_dataloader),
                    loss.item()))

            tr_target_digit.extend(y.cpu().numpy().ravel().tolist())
            tr_pred_digit.extend(output.max(dim=1)[1].cpu().numpy().tolist())
            tr_pred_prob.extend(output[:, 1].detach().cpu().numpy().tolist())

        else:
            loss_list.append(total_loss / len(tr_dataloader))
            print(
                '-----------<[{} epoch] Train Result>----------------'.format(
                    epoch))
            print('Settings: {}M_{}'.format(config['time_point'],
                                            '_'.join(config['waves'])))
            print('Train total: [loss:{}]'.format(total_loss /
                                                  len(tr_dataloader)))
            auroc = roc_auc_score(tr_target_digit, tr_pred_prob)
            auprc = average_precision_score(tr_target_digit, tr_pred_prob)
            print('AUROC : {}'.format(auroc))
            print('AUPRC : {}'.format(auprc))
            print(
                classification_report(tr_target_digit,
                                      tr_pred_digit,
                                      labels=[0, 1],
                                      target_names=['normal', 'Event']))

            report_dict = classification_report(
                tr_target_digit,
                tr_pred_digit,
                labels=[0, 1],
                target_names=['normal', 'Event'],
                output_dict=True)
            report_df = pd.DataFrame(report_dict)
            report_df['epoch'] = epoch
            report_df['state'] = 'train'
            report_df['auroc'] = auroc
            report_df['auprc'] = auprc

            result_df_list.append(report_df)
        '''----Validation----'''
        with torch.no_grad():
            vd_target_digit = []
            vd_pred_digit = []
            vd_pred_prob = []

            model.eval()
            for idx, (X, y) in enumerate(vd_dataloader):
                X, y = X.to(device), y.to(device)

                output = model(X)
                vd_target_digit.extend(y.cpu().numpy().ravel().tolist())
                vd_pred_digit.extend(
                    output.max(dim=1)[1].cpu().numpy().tolist())
                vd_pred_prob.extend(output[:,
                                           1].detach().cpu().numpy().tolist())

            else:
                print('-----------<[{} epoch] Valid Result>----------------'.
                      format(epoch))
                print('Settings: {}M_{}'.format(config['time_point'],
                                                '_'.join(config['waves'])))
                auroc = roc_auc_score(vd_target_digit, vd_pred_prob)
                auprc = average_precision_score(vd_target_digit, vd_pred_prob)
                print('AUROC : {}'.format(auroc))
                print('AUPRC : {}'.format(auprc))
                print(
                    classification_report(vd_target_digit,
                                          vd_pred_digit,
                                          labels=[0, 1],
                                          target_names=['normal', 'Event']))
                report_dict = classification_report(
                    vd_target_digit,
                    vd_pred_digit,
                    labels=[0, 1],
                    target_names=['normal', 'Event'],
                    output_dict=True)
                report_df = pd.DataFrame(report_dict)
                report_df['epoch'] = epoch
                report_df['state'] = 'valid'
                report_df['auroc'] = auroc
                report_df['auprc'] = auprc
                result_df_list.append(report_df)
                pd.concat(result_df_list, sort=True).to_csv(
                    os.path.join(save_path, 'result_df.csv'))

                if auroc > best_auroc:
                    print('SAVED')
                    best_auroc = auroc
                    best_model_path = os.path.join(
                        save_path, 'models',
                        '{}_{:.3f}.pth'.format(epoch, auroc))
                    torch.save(
                        {
                            'epoch': epoch,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optim.state_dict(),
                            'auroc': auroc,
                        }, best_model_path)
    '''----Test----'''
    with torch.no_grad():
        te_target_digit = []
        te_pred_digit = []
        te_pred_prob = []

        checkpoint = torch.load(best_model_path)
        model.load_state_dict(checkpoint['model_state_dict'])

        model.eval()
        for idx, (X, y) in enumerate(te_dataloader):
            X, y = X.to(device), y.to(device)

            output = model(X)
            te_target_digit.extend(y.cpu().numpy().ravel().tolist())
            te_pred_digit.extend(output.max(dim=1)[1].cpu().numpy().tolist())
            te_pred_prob.extend(output[:, 1].detach().cpu().numpy().tolist())
        else:
            print('-----------< Test Result >----------------')
            print('[setting]: {}M_{}'.format(config['time_point'],
                                             '_'.join(config['waves'])))
            print('[Best Model]: ', best_model_path)
            auroc = roc_auc_score(te_target_digit, te_pred_prob)
            auprc = average_precision_score(te_target_digit, te_pred_prob)
            print('AUROC : {}'.format(auroc))
            print('AUPRC : {}'.format(auprc))
            print(
                classification_report(te_target_digit,
                                      te_pred_digit,
                                      labels=[0, 1],
                                      target_names=['normal', 'Event']))

            report_dict = classification_report(
                te_target_digit,
                te_pred_digit,
                labels=[0, 1],
                target_names=['normal', 'Event'],
                output_dict=True)
            report_df = pd.DataFrame(report_dict)
            report_df['epoch'] = epoch
            report_df['state'] = 'test'
            report_df['auroc'] = auroc
            report_df['auprc'] = auprc
            result_df_list.append(report_df)

            f1_score = report_dict['weighted avg']['f1-score']
            recall = report_dict['weighted avg']['recall']
            precision = report_dict['weighted avg']['precision']
            pd.concat(result_df_list, sort=True).to_csv(
                os.path.join(save_path, 'result_df.csv'))
            with open(
                    os.path.join(
                        save_path,
                        '[{} {:.4f}]_[{}{:.4f}]_[{}{:.4f}]_[{}{:.4f}]_[{}{:.4f}].txt'
                        .format('auroc', auroc, 'auprc', auprc, 'f1-score',
                                f1_score, 'recall', recall, 'precision',
                                precision)), 'w') as f:
                f.write('  ')
    del (model)
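The structure of config is only implied by the lookups in the example above. A hypothetical invocation might look like this; the key names come from the code, while every value below is a placeholder rather than a value from the source project.

config = {
    'waves': ['ECG', 'PPG'],   # placeholder wave names
    'windows': 30,             # placeholder window setting
    'channel_n': 64,           # placeholder channel count
    'device': 'cuda:0',
    'time_point': 5,
    'sqi': True,
}
excute(config)  # spelling follows the function name in the example above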
Example #12
def mnist_cnn():
    start = time.time()

    groups = find_pairings.find_pairings()

    groups_a_l = {}
    acc = []
    losses = []

    for g in groups:
        print(f'Groups: {g}')
        digits = []
        for i in g:
            for k in i:
                digits.append(k)
        (tr_d, te_d) = prepare_dataset.prepare_dataset(digits)
        (train_d_set,
         test_d_set) = divide_dataset.divide_dataset(g, tr_d, te_d)
        loss = training.train(train_d_set)
        accuracy = testing.test(test_d_set)
        groups_a_l[str(g)] = [accuracy]
        groups_a_l[str(g)].append(loss)
        acc.append(accuracy)
        losses.append(loss)

    acc.sort()
    acc.reverse()

    losses.sort()

    for i in groups_a_l:
        if groups_a_l[i][0] == acc[0]:
            print(f'Highest accuracy: {acc[0]}, groups: {i}\n')
        if groups_a_l[i][0] == acc[1]:
            print(f'Second highest accuracy: {acc[1]}, groups: {i}\n')
        if groups_a_l[i][0] == acc[len(acc) - 1]:
            print(f'Lowest accuracy: {acc[len(acc) - 1]}, groups: {i}\n')
        if groups_a_l[i][0] == acc[len(acc) - 2]:
            print(
                f'Second lowest accuracy: {acc[len(acc) - 2]}, groups: {i}\n')

    for i in groups_a_l:
        if groups_a_l[i][1] == losses[0]:
            print(f'Lowest loss: {losses[0]}, groups: {i}\n')
        if groups_a_l[i][1] == losses[1]:
            print(f'Second lowest loss: {losses[1]}, groups: {i}\n')
        if groups_a_l[i][1] == losses[len(losses) - 1]:
            print(f'Highest loss: {losses[len(losses) - 1]}, groups: {i}\n')
        if groups_a_l[i][1] == losses[len(losses) - 2]:
            print(
                f'Second highest loss: {losses[len(losses) - 2]}, groups: {i}\n'
            )

    finish = time.time()

    print(groups_a_l)

    print('Total seconds passed: %.3f' % (finish - start))

    x = losses
    y = []

    for i in x:
        for j in groups_a_l:
            if groups_a_l[j][1] == i:
                y.append(groups_a_l[j][0])

    plt.plot(x, y)
    plt.show()
Example #13
def make_tree(num_epochs_l, num_epochs_n, leaf_groups, node_groups):

    start = time.time()

    leaf_PATHS = []

    for i in range(len(leaf_groups)):
        leaf_PATHS.append('./PATHS/leaf' + str(i + 1) + '_net.pth')

    leaves = []

    for i, leaf_group in enumerate(leaf_groups):
        (leaf_train_set,
         leaf_test_set) = prepare_leaf_dataset.prepare_leaf_dataset(leaf_group)
        leaves.append(
            Node.Node(leaf_train_set, leaf_test_set, None, None, leaf_PATHS[i],
                      True))

    acc_leaves = []

    for leaf in leaves:
        leaf.train(num_epochs_l)
        acc_leaves.append(leaf.test())

    train_sets = []
    test_sets = []
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, ), (0.5, ))])
    train_sets.append(
        torchvision.datasets.MNIST(root='./data',
                                   train=True,
                                   download=True,
                                   transform=transform))
    test_sets.append(
        torchvision.datasets.MNIST(root='./data',
                                   train=False,
                                   download=True,
                                   transform=transform))

    for group in node_groups:
        (train_set, test_set) = prepare_dataset.prepare_dataset(group)
        train_sets.append(train_set)
        test_sets.append(test_set)

    node_PATHS = []

    for i in range(len(train_sets)):
        node_PATHS.append('./PATHS/node' + str(i + 1) + '_net.pth')

    train_sets.reverse()
    test_sets.reverse()
    node_PATHS.reverse()
    leaves.reverse()

    nodes = []

    i = 0

    for j in range(0, len(leaves), 2):
        nodes.append(
            Node.Node(train_sets[i], test_sets[i], leaves[j + 1], leaves[j],
                      node_PATHS[i], False))
        i += 1

    l = 0

    for k in range(i, len(train_sets), 1):
        nodes.append(
            Node.Node(train_sets[k], test_sets[k], nodes[l + 1], nodes[l],
                      node_PATHS[k], False))
        l += 2

    acc_nodes = []

    for node in nodes:
        node.train(num_epochs_n)
        acc_nodes.append(node.test())

    end = time.time()

    print(f'Seconds passed: {end - start}')
    print(f'Minutes passed: {(end - start) / 60}')
    print(f'Hours passed: {(end - start) / 3600}\n')

    return acc_leaves, acc_nodes
Example #14
import os

from sklearn import cross_validation  # deprecated in newer scikit-learn; see sklearn.model_selection
from sklearn.datasets import load_files
from sklearn.metrics import classification_report

from classification import Classification
'''
Place the Classic dataset (data files) in a folder named "classic".
Create another folder "Classic_Dataset" with 4 sub-folders named "med", "cran", "cisi", "cacm".

A hardcoded number (a, b, c, d) of files will be copied from "classic" into the corresponding
folders ("cacm", "med", "cisi", "cran"); a sketch of such a helper appears after this example.
'''
# Move dataset files into the required format
print "Preparing dataset..."
from prepare_dataset import prepare_dataset

prepare_dataset()

#Load The files/dataset
cwd = os.getcwd()
load_path = cwd + "/Classic_Dataset"
dataset = load_files(load_path,
                     description=None,
                     categories=None,
                     load_content=True,
                     shuffle=False,
                     encoding=None,
                     decode_error='strict',
                     random_state=0)

#Class names and assigned numbers
class_names = list(dataset.target_names)
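The prepare_dataset used in this last example is a file-copying helper rather than a feature-encoding one, and it is not listed on this page. Given the directory layout described in the example's comment, a minimal sketch might look like the following; the per-category file counts and the file-name convention (files prefixed with their category, e.g. "med.1") are assumptions.

import os
import shutil


def prepare_dataset(src='classic', dst='Classic_Dataset',
                    counts=(('med', 100), ('cran', 100), ('cisi', 100), ('cacm', 100))):
    # Copy a fixed number of files per category from the flat "classic"
    # folder into the per-category sub-folders of "Classic_Dataset".
    for category, n in counts:
        target_dir = os.path.join(dst, category)
        os.makedirs(target_dir, exist_ok=True)
        copied = 0
        for name in sorted(os.listdir(src)):
            if name.startswith(category) and copied < n:
                shutil.copy(os.path.join(src, name), target_dir)
                copied += 1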