Example #1
def generate_idf_string(unsafe_user_inputs):
    cleaned_inputs = validation.Validation().cleaned_inputs(unsafe_user_inputs)

    unicode_idf_file = unicode(
        IDF(idf_list=building.Building(
            _geometry_configurations(cleaned_inputs),
            cleaned_inputs).output_EP_list()))
    return unicode_idf_file
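Note that unicode() exists only in Python 2; under Python 3 the built-in was removed and str() serves the same purpose. The same function, otherwise unchanged, would read:

def generate_idf_string(unsafe_user_inputs):
    cleaned_inputs = validation.Validation().cleaned_inputs(unsafe_user_inputs)

    # Python 3: str() replaces the removed unicode() built-in
    return str(
        IDF(idf_list=building.Building(
            _geometry_configurations(cleaned_inputs),
            cleaned_inputs).output_EP_list()))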
Example #2
def test_data(pathname, dbName, table, columns, filename, threshold):
    validate = validation.Validation(threshold)

    connect = connectBD.Connection()

    # dict[table][column] = DataFrame
    #dict_tab_col_df = validate.filterData(connect.getColumnsDB(pathname, table, columns), verbose=False)
    dict_tab_col_df = validate.filterData(
        connect.getColumnsDB2(dbName, table, columns), verbose=False)

    dict_tab_col_df_pred = dict()
    for tab in dict_tab_col_df:
        dict_tab_col_df_pred[tab] = dict()

    # Test automatic detection
    flag_SVM = False
    for tab, col_df in dict_tab_col_df.items():
        for c in col_df:
            pred = validate.checkPattern(col_df[c], c)
            if pred is None:
                flag_SVM = True
            else:
                dict_tab_col_df_pred[tab][c] = pred

    # SVM prediction: run only if at least one column still needs a prediction
    if flag_SVM:
        try:
            classifier, vocabulary_ = load(filename)
        except Exception:
            print("Saved model could not be loaded (filename: '{}')".format(filename))
            print("Please train on the full data first: python3.6 {} --train".format(
                sys.argv[0]))
            sys.exit(2)

        vectorizer = getVectorizer(dict_tab_col_df, vocabulary_)

        for tab, col_df in dict_tab_col_df.items():
            for c in col_df:
                if c not in dict_tab_col_df_pred[tab]:
                    df = pre_processing(col_df[c])
                    doc = vectorizer.transform(list(set(df)))
                    # Majority vote over the predicted labels for this column
                    values, counts = np.unique(classifier.predict(doc),
                                               return_counts=True)
                    dict_tab_col_df_pred[tab][c] = values[np.argmax(counts)]

    return dict_tab_col_df_pred
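The load(filename) call above restores the objects that Example #4 persists with dump(...). Assuming load and dump are joblib's (suggested by the ".joblib" filenames in Example #4), a minimal save/restore pair looks like this; the filename is hypothetical:

from joblib import dump, load

# Persist the classifier and vocabulary together, as Example #4 does
dump([classifier, vectorizer.vocabulary_], "classifier.joblib")  # hypothetical filename

# Restore them later, as Example #2 does
classifier, vocabulary_ = load("classifier.joblib")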
Example #3
 def __init__(self, model_conf: ModelConfig):
     """
     :param model_conf: the project configuration file to read
     """
     self.model_conf = model_conf
     self.validation = validation.Validation(self.model_conf)
Example #4
def train_data(pathname, threshold, path_out):
    validate = validation.Validation(threshold)

    bd = connectBD.Connection()

    # Phase 1: filtering the data
    print("Phase 1: Filtering the data...")
    # dict[database][column] = DataFrame
    dict_bd_column_df = validate.filterData(bd.readFilesDB(pathname))

    # Phase 2: detecting pattern
    print("Phase 2: Detecting patterns...")
    y_true = list()  # ground-truth classes
    y_pred = list()  # predicted classes
    dict_bd_column_true = dict()  # manual labeled
    dict_bd_column_pred = dict()  # predicted by pattern

    # TODO: to remove - only for checking the results
    #dict_bd_column_examples = dict() # examples

    # dict[column] = class
    column_class = select_attributes.getAnnotatedColumns()
    noClass_columns = set()

    for bd, column_df in dict_bd_column_df.items():
        dict_bd_column_true[bd] = dict()
        dict_bd_column_pred[bd] = dict()
        for column, df in column_df.items():
            if column not in column_class:
                noClass_columns.add(column)

            pred = validate.checkPattern(df, column)
            if pred is None:
                pred = "nenhuma"
            y_pred.append(pred)
            dict_bd_column_pred[bd][column] = pred

            if column in column_class:
                y_true.append(column_class[column])
                dict_bd_column_true[bd][column] = column_class[column]
            else:
                y_true.append("nenhuma")
                dict_bd_column_true[bd][column] = "nenhuma"

            # TODO: to remove - only for checking the results
            """instances = ""
			n_instances = 1
			c = column
			if len(df.unique()) > 0:
				for value in df.unique():
					instances = instances + str(value) + "#"
					if n_instances == 5:
						break
					n_instances = n_instances + 1
			if bd not in dict_bd_column_examples:
				dict_bd_column_examples[bd] = dict()
			dict_bd_column_examples[bd][column] = instances"""

    # Results of the automatic pattern checking
    #print(classification_report(y_true, y_pred))

    # Phase 3: predicting classes
    print("Phase 3: Trainning the classifier...")
    # Selecting columns that need to be trained
    columns_SVM = set()
    for cl, d in classification_report(y_true,
                                       y_pred,
                                       output_dict=True,
                                       zero_division=0).items():
        if isinstance(d, dict):
            if d['f1-score'] < threshold and d['support'] != 0:
                columns_SVM.update(select_attributes.getAnnotatedColumn(cl))
        else:
            break

    columns_SVM.update(noClass_columns)

    # Assigning the class of the subclasses
    for c, cl in column_class.items():
        column_class[c] = cl if "_" not in cl else cl[:cl.index("_")]

    X, y, vectorizer = get_X_and_y(dict_bd_column_df, columns_SVM,
                                   column_class)

    classifier = build_model(X, y)

    y_pred, y_true = y_predict(vectorizer, classifier, dict_bd_column_df,
                               columns_SVM, column_class)

    print("Saving Model and Vocabulary...")
    if not path_out.endswith("/"):
        path_out += "/"
    outputName = path_out + "classifier_" + datetime.now().strftime(
        "%d%m%Y_%H%M%S") + ".joblib"
    dump([classifier, vectorizer.vocabulary_], outputName)
    print("Objects saved as '{}'.".format(outputName))

    #print(classification_report(y_true, y_pred))

    index = 0
    for bd, column_df in dict_bd_column_df.items():
        for c in column_df:
            if c in columns_SVM:
                # If the pattern stage found nothing, take the SVM prediction
                if dict_bd_column_pred[bd][c] == "nenhuma":
                    dict_bd_column_pred[bd][c] = y_pred[index]
                index += 1

    y_true = list()  # ground-truth classes
    y_pred = list()  # predicted classes
    for bd, column_df in dict_bd_column_pred.items():
        for c in column_df:
            cl = dict_bd_column_pred[bd][c]
            y_pred.append(cl if "_" not in cl else cl[:cl.index("_")])
            cl = dict_bd_column_true[bd][c]
            y_true.append(cl if "_" not in cl else cl[:cl.index("_")])
    """
	# TODO: to remove - only for checking the results
	f_write = open("output_checkingData.tsv", "w")
	f_write.write("column\tmanual\tpredita\tnum_DB\tinstances\n")
	for bd,column_df in dict_bd_column_pred.items():
		for c in column_df:
			f_write.write("{}\t{}\t{}\t{}\t{}\n".format(c,"nenhuma" if c not in column_class else column_class[c],dict_bd_column_pred[bd][c],bd,dict_bd_column_examples[bd][c]))
	f_write.close()
	"""

    print("Classification report:")
    print(classification_report(y_true, y_pred))
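Both the training phase and the final report above collapse subclass labels to their parent class at the first underscore. A small illustration of that expression, using a hypothetical label:

cl = "endereco_rua"  # hypothetical subclass label
parent = cl if "_" not in cl else cl[:cl.index("_")]
print(parent)  # -> endereco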
Example #5
import time
import sys
import pandas as pd
import validation as vd
import constants as ct

# Create an instance of the Validation class
validation = vd.Validation()


def get_filters():
    """Take user input for a city, month, and/or day to filter data.

    Return:
        (str) city - name of the city to filter data
        (str) timeframe - user's choice of timeframe to filter data
        (str) month - name of the month to filter data
        (str) day - name of the day of week to filter data
    """
    month = ''
    day = ''

    city = validation.validate_city(
        input(
            'Would you like to see data for Chicago, New York or Washington:\n'
        ))
    timeframe = validation.validate_timeframe(
        input(
            '\nWould you like to filter the data by month, day, both or not at all? Type "none" for no filter\n'
        ))
    if timeframe == 'month':
        ...  # remainder of this example is truncated in the source
Example #6
def transfer_learning(num_epochs=3,
                      resize=320,
                      batch_size=16,
                      posw=1,
                      data_rate=1,
                      normalize=False,
                      feature_extract=False,
                      pre_trained=False,
                      pre_trained_PATH="",
                      from_checkpoint="",
                      root_PATH=root_PATH,
                      learning_rate=0.0001,
                      num_workers=5,
                      root_PATH_dataset=root_PATH_dataset,
                      saved_model_PATH=saved_model_PATH):

    # Batch image transformations
    if normalize:
        transform = transforms.Compose([
            transforms.Resize((resize, resize)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, ), (0.5, )),
            transforms.Lambda(lambda x: torch.cat([x, x, x], 0))
        ])
    else:
        transform = transforms.Compose([
            transforms.Resize((resize, resize)),
            transforms.ToTensor(),
            transforms.Lambda(lambda x: torch.cat([x, x, x], 0))
        ])  #,transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    model = models.densenet121(pretrained=pre_trained)
    model.classifier = torch.nn.Linear(1024, 5)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if posw:
        criterion = nn.BCEWithLogitsLoss(
            pos_weight=chexpert_load.load_posw()).to(device=device)
    else:
        criterion = nn.BCEWithLogitsLoss().to(device=device)

    plot_loss = []

    kwarg_Common = {
        "num_epochs": num_epochs,
        "learning_rate": learning_rate,
        "batch_size": batch_size
    }
    kwargs = {"Common": kwarg_Common}

    if pre_trained_PATH or from_checkpoint:
        loader = load_model.Load_Model(method="TL",
                                       pre_trained=pre_trained_PATH,
                                       from_checkpoint=from_checkpoint,
                                       kwargs=kwargs,
                                       model=model,
                                       plot_loss=plot_loss,
                                       use_cuda=use_cuda)
        file_name, optimizer, plot_loss = loader()

    else:
        if pre_trained:
            print("Pretrained on ImageNet")
        else:
            print("Training from scratch")

        file_name = "from_scratch_epoch" + str(num_epochs) + "_batch" + str(
            batch_size) + "_learning_rate" + str(learning_rate) + ".tar"

    file_name = "data_rate" + str(
        data_rate) + "_" + file_name if data_rate != 1 else file_name
    file_name = "pre_trainedIMAGENET_" + file_name if pre_trained else file_name
    file_name = "no_posw_" + file_name if not posw else file_name
    file_name = "normalized" + file_name if normalize else file_name

    if feature_extract:
        for param in model.features.parameters():
            param.requires_grad = False
        optimizer = torch.optim.Adam(model.classifier.parameters(),
                                     lr=learning_rate)

        file_name = "FE_" + file_name

    saved_model_PATH = (saved_model_PATH + "saved_models/transfer_learning/" +
                        file_name[:-4])
    if not os.path.exists(saved_model_PATH):
        os.mkdir(saved_model_PATH)

    labels_path = root_PATH + "SummerThesis/code/custom_lib/chexpert_load/labels.pt"
    cheXpert_train_dataset, dataloader = chexpert_load.chexpert_load(
        root_PATH + "SummerThesis/code/custom_lib/chexpert_load/train.csv",
        transform,
        kwarg_Common["batch_size"],
        num_workers=num_workers,
        data_rate=data_rate,
        labels_path=labels_path,
        root_dir=root_PATH_dataset)

    currentDT = datetime.datetime.now()
    model = model.to(device=device)
    print("started training")
    print('START--', file_name)
    model.train()
    for epoch in range(num_epochs):
        for i, (images, labels, _) in enumerate(
                dataloader
        ):  # Load a batch of images with its (index, data, class)

            images = images.to(device=device, dtype=torch.float)
            labels = labels.to(device=device, dtype=torch.float)

            # Forward pass: compute the output logits for a batch of images
            outputs = model(images)

            # Loss: difference between the outputs and the ground-truth labels
            loss = criterion(outputs, labels)

            optimizer.zero_grad()  # Clear gradients accumulated in the previous step
            loss.backward()  # Backward pass: compute gradients of the loss w.r.t. the weights
            optimizer.step()  # Update the weights using the computed gradients

            if (i + 1) % 100 == 0:  # Logging
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %
                      (epoch + 1, num_epochs, i + 1,
                       len(cheXpert_train_dataset) // batch_size, loss))
                aftertDT = datetime.datetime.now()
                c = aftertDT - currentDT
                mins, sec = divmod(c.days * 86400 + c.seconds, 60)
                print(mins, "mins ", sec, "secs")

            if i % 200 == 0:
                # Store a plain float rather than the graph-attached tensor
                plot_loss.append(loss.item())

    aftertDT = datetime.datetime.now()
    c = aftertDT - currentDT
    mins, sec = divmod(c.days * 86400 + c.seconds, 60)
    print(mins, "mins ", sec, "secs")
    print('END--', file_name)

    # Compute validation error, plot AUC and precision-recall curves, plot the loss, save figures, and print AUC differences
    PATH = saved_model_PATH + "/" + file_name

    val = validation.Validation(chexpert_load=chexpert_load,
                                model=model,
                                plot_loss=plot_loss,
                                bs=16,
                                transform=transform,
                                root_PATH=root_PATH,
                                root_PATH_dataset=root_PATH_dataset,
                                saved_model_PATH=saved_model_PATH,
                                file_name=file_name,
                                gpu=use_cuda)
    val()

    torch.save(
        {
            'epoch': num_epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': plot_loss
        }, PATH)
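The torch.save call above writes a checkpoint dictionary; restoring it later only requires rebuilding the same model and optimizer and loading the saved state. A minimal sketch, assuming the same densenet121 head used in the example:

import torch
from torchvision import models

model = models.densenet121(pretrained=False)
model.classifier = torch.nn.Linear(1024, 5)  # same 5-class head as above
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

checkpoint = torch.load(PATH, map_location="cpu")  # PATH as defined above
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch']
plot_loss = checkpoint['loss']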
Example #7
 def test_validatePassword_uppercase(self):
     validation = v.Validation()
     self.assertTrue(validation.validatePassword('Testcase123?'))
     self.assertFalse(validation.validatePassword('testcase123?'))
Example #8
 def test_validateLogin(self):
     validation = v.Validation()
     self.assertTrue(validation.validateLogin('Testcase123'))
     self.assertFalse(validation.validateLogin('Testca'))
Example #9
 def test_validatePasswordMatch(self):
     validation = v.Validation()
     self.assertTrue(
         validation.validatePasswordMatch('Testcase123?', 'Testcase123?'))
     self.assertFalse(
         validation.validatePasswordMatch('Testcase123?', 'testcase123?'))
Example #10
 def test_validatePassword_special(self):
     validation = v.Validation()
     self.assertTrue(validation.validatePassword('Testcase123?'))
     self.assertFalse(validation.validatePassword('Testcase123'))
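Taken together, Examples #7-#10 pin down the behaviour of the Validation class exercised by these tests. The implementation itself is not shown; the following is a hypothetical sketch that merely satisfies the assertions above (the minimum lengths and exact character rules are assumptions):

import re

class Validation:
    # Hypothetical sketch inferred from the test assertions; not the real implementation.

    def validateLogin(self, login):
        # 'Testcase123' (11 chars) passes while 'Testca' (6 chars) fails:
        # assume at least 8 word characters.
        return bool(re.fullmatch(r'\w{8,}', login))

    def validatePassword(self, password):
        # Must contain an uppercase letter, a digit, and a special character.
        return (re.search(r'[A-Z]', password) is not None
                and re.search(r'\d', password) is not None
                and re.search(r'[^A-Za-z0-9]', password) is not None)

    def validatePasswordMatch(self, first, second):
        # Both entries must be identical.
        return first == second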