Example #1
def validate_test():
    from preprocess import PreProcess
    p = PreProcess()

    global model, train_indices, val_indices, train_loader, test_loader, optimizer

    model = MonocularVelocityNN(initial_depth=config["depth"])

    dataset = VideoDataLoader(directory=str(Path.cwd() / "data/testprocdir"),
                              delta=config["delta"],
                              Y=p.labels,
                              depth=config["depth"])

    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    val_indices = list(
        range(1, dataset_size - config["delta"] - config["depth"]))

    # valid_sampler = Sampler(val_indices)

    test_loader = DataLoader(dataset,
                             batch_size=config["test_batch_size"],
                             sampler=val_indices)

    load_weights(Path.cwd() / "data/weights.pt")

    if not device:
        raise RuntimeError("Only use model with Cuda")
    model.to(device)

    validate(is_labels=False)
Example #2
    def main(self):
        preprocess = PreProcess()
        self.wordsList, self.wordVectors = preprocess.load_word2vec()
        data, list_id, labels, types, max_length = preprocess.load_mutations()
        self.maxSeqLength = max_length + 1

        # create a dictionary mapping each type to its integer index
        count = 0
        for i in types:
            dic = {i: count}
            self.types.update(dic)
            count = count + 1

        self.numClasses = len(self.types)

        train_df, test_df, labels_train, labels_test = self.split_data(
            data, labels)
        ids_train, ids_test = self.create_matrix(train_df, test_df)

        # Spit out details about data
        print("\n=================================\nData details:")
        print("- Training-set:\t{}".format(len(train_df)))
        print("- Test-set:\t\t{}".format(len(test_df)))
        print("- Classes:\t\t{}".format(self.types))
        print("=================================\n\n")

        self.train_model(ids_train, labels_train, ids_test, labels_test)
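The count/update loop above can be collapsed into a single comprehension; a small behavior-preserving sketch (assuming `types` yields each type once, in the desired order):

        # Equivalent to the loop above: map each type to its position index.
        self.types.update({t: i for i, t in enumerate(types)})
        self.numClasses = len(self.types)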
Example #3
def submit():
    if len(request.form['text_input']) < 10:
        return "Please provide a longer input so the classifier can understand it :)"

    # todo: change column name to be dynamically taken from training file
    test_data = pd.DataFrame([request.form['text_input']],
                             columns=['Document'])
    session_id = request.cookies['session']
    path = os.path.join(app.config['UPLOAD_FOLDER'], session_id)
    trained_classifier = [i for i in os.listdir(path) if '.pkl' in i]
    vectorizer = os.path.join(path, 'tfidf_vectorizer.pk')
    tfidf_transformer = joblib.load(vectorizer)
    pre_processor = PreProcess(test_data, column_name='Document')
    test_data = pre_processor.clean_html()
    test_data = pre_processor.remove_non_ascii()
    test_data = pre_processor.remove_spaces()
    test_data = pre_processor.remove_punctuation()
    test_data = pre_processor.stemming()
    test_data = pre_processor.lemmatization()
    test_data = pre_processor.stop_words()
    test_data1 = tfidf_transformer.transform(test_data.Document)
    result = {}
    for clf in trained_classifier:
        model = joblib.load(os.path.join(path, clf))
        # predict once and reuse the value instead of calling predict twice
        prediction = model.predict(test_data1)[0]
        classifier_name = clf.split('.')[0]
        print(classifier_name, prediction)
        result[classifier_name] = prediction
    print(result)
    return render_template('results.html', result=result)
Example #4
def interence_main():
    CHECKPOINT_PATH = './model/seq2seq_ckpt-7800'
    preprocess = PreProcess()

    test_sentence = 'this is a test.'
    test_sentence = preprocess.english2id(test_sentence)
    print(preprocess.id2english(test_sentence))

    # Build the model once, inside the variable scope expected by the checkpoint.
    with tf.variable_scope('nmt_model', reuse=None):
        model = NMTModel()
        output_op = model.inference(test_sentence)

    sess = tf.Session()
    saver = tf.train.Saver()
    saver.restore(sess, CHECKPOINT_PATH)

    output = sess.run(output_op)

    print(output)
    print(preprocess.id2chinese(output))
    sess.close()
Example #5
 def __init__(self, bus_stop_distance, traffic_light_distance, array_stops,
              array_trfl):
     super().__init__()
     self.bst_dist = bus_stop_distance
     self.trfl_dist = traffic_light_distance
     self.array_stops = array_stops
     self.array_trfl = array_trfl
     self.prepro = PreProcess()
Example #6
def execute_ALPR(event):
    """
    Runs the full license plate recognition process.
    The function is called when the user clicks the execute button on the GUI.
    """

    #time the function execution
    start_time = time.time()

    root_folder = os.path.dirname(os.path.realpath(__file__))
    models_folder = os.path.join(root_folder, 'ml_models')
    pre_process = PreProcess(imagepath)
    
    plate_like_objects = pre_process.get_plate_like_objects()
    # plotting.plot_cca(pre_process.full_car_image,
    #     pre_process.plate_objects_cordinates)

    license_plate = license_plate_extract(plate_like_objects, pre_process)

    if len(license_plate) == 0:
        return False

    ocr_instance = OCROnObjects(license_plate)

    if ocr_instance.candidates == {}:
        wx.MessageBox("No character was segmented",
            "Character Segmentation" ,wx.OK|wx.ICON_ERROR)
        return False

    # plotting.plot_cca(license_plate, ocr_instance.candidates['coordinates'])

    deep_learn = DeepMachineLearning()
    text_result = deep_learn.learn(ocr_instance.candidates['fullscale'],
        os.path.join(models_folder, 'SVC_model', 'SVC_model.pkl'),
        (20, 20))

    text_phase = TextClassification()
    scattered_plate_text = text_phase.get_text(text_result)
    plate_text = text_phase.text_reconstruction(scattered_plate_text,
        ocr_instance.candidates['columnsVal'])
    
    print('ALPR process took ' + str(time.time() - start_time) + ' seconds')
    
    listResult.InsertStringItem(listRow, plate_text)
    plate_num = Mvrd(plate_text)
    details = plate_num.get_data()
    if not details:
        wx.MessageBox("Vehicle Information could not be retrieved",
            "Information Retrieval", wx.OK|wx.ICON_ERROR)
        return False
    listResult.SetStringItem(listRow, 1, details['Owner Name'])
    listResult.SetStringItem(listRow, 2, details['Isssue Date'])
    listResult.SetStringItem(listRow, 3, details['Expiry Date'])
    listResult.SetStringItem(listRow, 4, details['Chasis Number'])
    listResult.SetStringItem(listRow, 5, details['Model'])
    #db_aspect.save_alpr(plate_text, str(datetime.today()))
Example #7
def main():
    data = get_data(
        '/Users/aditya1/Documents/Document_Classification/bbc-dataset')

    ###############################################################################
    # Data Pre-processing steps
    ###############################################################################
    column_name = data.columns[0]
    # print(column_name)
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    pre_processor_operations = ['clean_html']
    data = pre_processor.clean_html()
    data = pre_processor.remove_non_ascii()
    data = pre_processor.remove_spaces()
    data = pre_processor.remove_punctuation()
    data = pre_processor.stemming()
    data = pre_processor.lemmatization()
    data = pre_processor.stop_words()

    ###############################################################################
    # Feature extraction
    ###############################################################################

    train_x, test_x, train_y, test_y = train_test_split(data.Document,
                                                        data.Category,
                                                        test_size=0.20)
    # print(train_x.shape, train_y.shape)
    # print(test_x.shape, test_y.shape)
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    joblib.dump(tfidf_transformer, 'vectorizer.pkl')
    test_vectors = tfidf_transformer.transform(test_x)
    print(data.head())

    ###############################################################################
    # Perform classification with SVM, kernel=linear
    model1 = svm.SVC(kernel='linear')
    model1.fit(train_vectors, train_y)
    joblib.dump(model1, 'SVM.pkl')
    y_pred_class = model1.predict(test_vectors)
    print(metrics.accuracy_score(test_y, y_pred_class))
    print("Prediction score for classifier %s:\n%s\n" %
          (model1, metrics.accuracy_score(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n" %
          (model1, metrics.classification_report(test_y, y_pred_class)))

    model2 = MultinomialNB()
    model2.fit(train_vectors, train_y)
    joblib.dump(model2, 'MultinomialNB.pkl')
    y_pred_class = model2.predict(test_vectors)
    print("Accuracy score:", metrics.accuracy_score(test_y, y_pred_class))
    print("Confusion Matrix for classifier %s:\n%s\n" %
          (model2, metrics.confusion_matrix(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n" %
          (model2, metrics.classification_report(test_y, y_pred_class)))
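The vectorizer and models persisted above can later be reloaded for prediction, much as the Flask handler in Example #3 does; a condensed sketch of that round trip (preprocessing omitted for brevity, and the sample text is a placeholder):

import joblib
import pandas as pd

# Hypothetical reuse of the artifacts dumped above.
vectorizer = joblib.load('vectorizer.pkl')
svm_clf = joblib.load('SVM.pkl')
sample = pd.DataFrame(['Some raw document text to classify'], columns=['Document'])
print(svm_clf.predict(vectorizer.transform(sample.Document))[0])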
Example #8
def train():
    from preprocess import PreProcess
    p = PreProcess()

    global model, train_indices, val_indices, train_loader, test_loader, optimizer

    model = MonocularVelocityNN(initial_depth=config["depth"])

    dataset = VideoDataLoader(directory=str(Path.cwd() / "data/processed"),
                              delta=config["delta"],
                              Y=p.labels,
                              depth=config["depth"])

    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(1, dataset_size - config["delta"] - config["depth"]))
    split = int(np.floor(config["split"] * dataset_size))

    if config["randomize"]:
        np.random.seed(0)
        np.random.shuffle(indices)

    train_indices, val_indices = indices[split:], indices[:split]

    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = DataLoader(dataset,
                              batch_size=config["batch_size"],
                              sampler=train_sampler)
    test_loader = DataLoader(dataset,
                             batch_size=config["test_batch_size"],
                             sampler=valid_sampler)

    if config["TRAIN"]:
        if not device:
            raise RuntimeError("Only use model with Cuda")
        model.to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])

        for epoch in range(1, config["epochs"] + 1):
            train(epoch)
            validate()

    else:
        load_weights(Path.cwd() / "data/weights.pt")

        if not device:
            raise RuntimeError("Only use model with Cuda")
        model.to(device)

        validate()
Example #9
def train(config, device, RS='Supervised'):
    # Init tokenizer.
    tokenizer = Tokenizer(config.temp_dir, config.jieba_dict_file,
                          config.remove_stopwords, config.stopwords_file,
                          config.ivr)
    # Init feature index.
    feature_index = FeatureIndex(config, tokenizer=tokenizer)
    file_list = [config.labeled_file]
    if config.extra_train_file is not None:
        file_list.append(config.extra_train_file)
    if config.valid_file is not None:
        file_list.append(config.valid_file)
    feature_index.build_index(file_list)
    # Preprocess data.
    pre_process = PreProcess(config)
    train_data_dir, valid_data_dir, final_train_file, final_valid_file = \
        pre_process.train_preprocess()
    # Get PyTorch dataset.
    train_dataset = MixnetDataset(config, train_data_dir, feature_index,
                                  tokenizer)
    valid_dataset = MixnetDataset(config, valid_data_dir, feature_index,
                                  tokenizer, True)
    # Get NER model if necessary and compatible.
    need_ner = False
    for (feature, feature_config) in config.feature_config_dict.items():
        need_ner = need_ner or (
            "text" in feature_config.get("type", "")
            and feature_config.get("seg_type", "word") == "char"
            and feature_config.get("ner", False))
    if need_ner:
        logger.info("Enable NER, loading NER model...")
        # Use predict mode since we cannot train it without tag information.
        ner_model = NERModel(device, "predict")
    else:
        logger.info("Disable NER.")
        ner_model = None
    # Get PyTorch data loader.
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=1,
                                   shuffle=False,
                                   num_workers=config.read_workers)
    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=1,
                                   shuffle=False,
                                   num_workers=config.read_workers)
    # Init model.
    model = MixNet(config.model_config_dict,
                   config.output_config_dict,
                   feature_index.feature_info_dict,
                   feature_index.label_info_dict,
                   ner_model=ner_model)
    # Train model.
    solver = Solver(config, train_data_loader, valid_data_loader,
                    feature_index, model, device, RS)
    solver.build()
    solver.train()
Example #10
def train(start=1, end=31):
    # preprocess training data
    preprocess = PreProcess()
    preprocess.set_date_range(start, end)
    preprocess.process()
    la = LineAnalysis()
    la.set_date_range(start, end)
    la.process()
    # train model
    train = Train()
    train.fit()
    return
Example #11
def execute_ALPR():
    """
    Runs the full license plate recognition process.
    The function is called when the user clicks the execute button on the GUI.
    """

    # time the function execution
    start_time = time.time()

    root_folder = os.path.dirname(os.path.realpath(__file__))
    models_folder = os.path.join(root_folder, 'ml_models')

    # import requests

    # r = requests.get(url, auth=('admin', 'admin'))
    # file = open(imagepath, "w")
    # file.write(r.content)
    # file.close()
    imagepath = url
    pre_process = PreProcess(imagepath)


    plate_like_objects = pre_process.get_plate_like_objects()
    # plotting.plot_cca(pre_process.full_car_image,
    #     pre_process.plate_objects_cordinates)

    license_plate = license_plate_extract(plate_like_objects, pre_process)

    if len(license_plate) == 0:
        return False

    ocr_instance = OCROnObjects(license_plate)

    if ocr_instance.candidates == {}:
        # print("No Characters Was Segmented")
        # wx.MessageBox("No character was segmented",
        #     "Character Segmentation" ,wx.OK|wx.ICON_ERROR)
        return False

    # plotting.plot_cca(license_plate, ocr_instance.candidates['coordinates'])

    deep_learn = DeepMachineLearning()
    text_result = deep_learn \
        .learn(ocr_instance.candidates['fullscale'], os.path.join(models_folder, 'SVC_model', 'SVC_model.pkl'),
               (20, 20))

    text_phase = TextClassification()
    scattered_plate_text = text_phase.get_text(text_result)
    plate_text = text_phase.text_reconstruction(scattered_plate_text, ocr_instance.candidates['columnsVal'])

    # print('ALPR process took ' + str(time.time() - start_time) + ' seconds')

    print(plate_text)
Example #12
    def __init__(self):
        self.preprocess = PreProcess()
        self.CDBG_map = None
        self.precincts_wards_map = None

        self.directory_path = 'result'
        if not os.path.exists(self.directory_path):
            os.mkdir(self.directory_path)

        self.refuse_routes_directory_path = os.path.join(self.directory_path, 'refuse_routes')
        if not os.path.exists(self.refuse_routes_directory_path):
            os.mkdir(self.refuse_routes_directory_path)
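The two exists/mkdir pairs above can be collapsed with os.makedirs, which creates parent directories and tolerates existing ones; an equivalent sketch:

        # Equivalent setup in one call (Python 3.2+).
        self.refuse_routes_directory_path = os.path.join(self.directory_path, 'refuse_routes')
        os.makedirs(self.refuse_routes_directory_path, exist_ok=True)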
Example #13
    def run(self, hand, rgb=False, box=None, hand_crop=None, depth=None):
        if rgb and box and hand_crop.any() and depth.any():
            phand = self._rgb_hand_seg(hand, box, hand_crop, depth)
        else:
            phand = PreProcess().median_smooth(hand, self.MEDIAN_DIM)

        tmp  = phand.copy()
        cont = self._get_largest_contour(phand)
        box  = cv2.boundingRect(cont)
        crop = self._crop_box(tmp, box)
        self._contour = cont
        self._box = box
        return (cont, box, crop)
Example #14
    def main(self):
        preprocess = PreProcess()
        self.wordsList, self.wordVectors = preprocess.load_word2vec()
        data, list_id, labels, types, max_length = preprocess.load_mutations()
        self.maxSeqLength = max_length + 1

        # create a dictionary mapping each type to its integer index
        count = 0
        for i in types:
            dic = {i: count}
            self.types.update(dic)
            count = count + 1

        self.numClasses = len(self.types)

        train_df, test_df, labels_train, labels_test = self.split_data(
            data, labels)

        # drop the last element so the training set length divides evenly into k folds
        train_df = train_df[:-1]
        labels_train = labels_train[:-1]
        div = int(len(train_df) / self.k)

        # get K = 5 batches
        train_1, train_2, train_3, train_4, train_5 = [
            train_df[i:i + div] for i in range(0, len(train_df), div)
        ]
        labels_1, labels_2, labels_3, labels_4, labels_5 = [
            labels_train[i:i + div] for i in range(0, len(labels_train), div)
        ]

        ids_test = self.create_matrix_teste(test_df)
        ids_train1 = self.create_matrix_train(train_1)
        ids_train2 = self.create_matrix_train(train_2)
        ids_train3 = self.create_matrix_train(train_3)
        ids_train4 = self.create_matrix_train(train_4)
        ids_train5 = self.create_matrix_train(train_5)

        train_bins = [train_1, train_2, train_3, train_4, train_5]
        label_bins = [labels_1, labels_2, labels_3, labels_4, labels_5]
        ids_train = [
            ids_train1, ids_train2, ids_train3, ids_train4, ids_train5
        ]

        self.train_model(train_bins, label_bins, ids_train)
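Unpacking into exactly five names above assumes the training length is a multiple of k; a hedged sketch of a chunking that tolerates any length (illustrative only, not from the source):

# Build k nearly equal slices without requiring len(train_df) % k == 0.
k = 5  # i.e. self.k in the class above
size, rem = divmod(len(train_df), k)
bounds = [0]
for part in range(k):
    bounds.append(bounds[-1] + size + (1 if part < rem else 0))
train_bins = [train_df[bounds[j]:bounds[j + 1]] for j in range(k)]
label_bins = [labels_train[bounds[j]:bounds[j + 1]] for j in range(k)]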
Example #15
def execute_ALPR(event):
    """
    Runs the full license plate recognition process.
    The function is called when the user clicks the execute button on the GUI.
    """

    #time the function execution
    start_time = time.time()

    root_folder = os.path.dirname(os.path.realpath(__file__))
    models_folder = os.path.join(root_folder, 'ml_models')
    pre_process = PreProcess(imagepath)
    
    plate_like_objects = pre_process.get_plate_like_objects()
    # plotting.plot_cca(pre_process.full_car_image,
    #     pre_process.plate_objects_cordinates)

    license_plate = license_plate_extract(plate_like_objects, pre_process)

    if len(license_plate) == 0:
        return False
            
    ocr_instance = OCROnObjects(license_plate)

    #plotting.plot_cca(license_plate, ocr_instance.candidates['coordinates'])

    deep_learn = DeepMachineLearning()
    text_result = deep_learn.learn(ocr_instance.candidates['fullscale'],
        os.path.join(models_folder, 'SVC_model', 'SVC_model.pkl'),
        (20, 20))

    text_phase = TextClassification()
    scattered_plate_text = text_phase.get_text(text_result)
    plate_text = text_phase.text_reconstruction(scattered_plate_text,
        ocr_instance.candidates['columnsVal'])
    
    print('ALPR process took ' + str(time.time() - start_time) + ' seconds')

    listResult.InsertStringItem(listRow, plate_text)
    listResult.SetStringItem(listRow, 1, str(datetime.today()))

    db_aspect.save_alpr(plate_text, str(datetime.today()))
Example #16
def read_process_data(path, files_path):
    data = pd.read_csv(path)
    column_name = data.columns[0]
    # print(column_name)
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    data = pre_processor.clean_html()
    data = pre_processor.remove_non_ascii()
    data = pre_processor.remove_spaces()
    data = pre_processor.remove_punctuation()
    data = pre_processor.stemming()
    data = pre_processor.lemmatization()
    data = pre_processor.stop_words()
    train_x, test_x, train_y, test_y = train_test_split(data.Document,
                                                        data.Category,
                                                        test_size=0.20)
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    vectorizer_path = os.path.join(files_path, 'tfidf_vectorizer.pk')
    joblib.dump(tfidf_transformer, vectorizer_path)
    return train_vectors, train_y
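A brief usage sketch for the helper above, reusing the classifier pattern from the other examples (the CSV path and output folder are placeholders, not from the source):

import os
import joblib
from sklearn.naive_bayes import MultinomialNB

# Hypothetical driver: vectorize a CSV and fit a Naive Bayes model on the result.
train_vectors, train_y = read_process_data('data/train.csv', 'models')
clf = MultinomialNB()
clf.fit(train_vectors, train_y)
joblib.dump(clf, os.path.join('models', 'MultinomialNB.pkl'))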
Example #17
def main():
    dowload_dataset()
    preprocess = PreProcess()
    df = preprocess.load_tsv()
    features = preprocess.clean_data(df)
    features = preprocess.balance_data(features, 15000)
    x_train, y_train, x_test, y_test = preprocess.split_data(features)

    y_train_round, y_test_round = preprocess.round_labels(y_train, y_test)
    y_train_one, y_test_one = preprocess.labels_to_one_hot(
        y_train_round, y_test_round)
    lr = Logistic_Regression()
    lr.fit_and_evaluate(x_train, y_train_round, x_test, y_test_round)
    nn1 = NeuralNetwork1()
    nn1.fit_and_evaluate(x_train, y_train_one, x_test, y_test_one)
    linr = Linear_Regression()
    linr.fit_and_evaluate(x_train, y_train, x_test, y_test)
    nn2 = NeuralNetwork2()
    nn2.fit_and_evaluate(x_train, y_train, x_test, y_test)

    print("\n___________________End of the output___________________")
Example #18
    def __init__(self):
        super().__init__()
        self.folderPath = "textData"

        # without transforming string
        # List structure: [[data1][data1_label][data2][data2_label][data3][data3_label]]
        #self.lemmatizedList = self.preProcess.load_data1(self.folderPath)
        #Extract data
        #self.dataList = self.getDataList()
        #print(self.dataList.__len__())
        #self.allWords = self.extractSentence()
        #print(self.allWords.__len__())
        #Extract label
        #self.Labels = self.extractLabels()
        #print(self.Labels.__len__())
        # transform lemmatized_list(3 generators) into a long string separate by space
        #self.sparseWords =  self.preProcess.cleanByFrequency(self.allWords)
        # filter by sequence
        #self.cleanedSentences = self.preProcess.getCleanedSent(self.sparseWords,self.dataList)
        '''Pre Process'''
        self.preProcess = PreProcess(self.folderPath)
        '''i = 0
        for x in self.preProcess.cleanedSentences:
            i+=1
            print(str(i) + ": ")
            print(x)'''
        

        # vectorise remnant of sentences
        #self.X_train = self.preProcess.vector_Data(self.cleanedSentences,self.Labels)
        '''Process Sentence Level'''
        self.sentenceLevel = SentenceLevel(self.preProcess.cleanedSentences)

        '''Process Document Level'''
        # Start TensorFlow Session
        self.documentLevel = DocumentLevel(self.sentenceLevel.docInput,self.preProcess.Labels)

        # display results
        #self.displayData()
        '''Summary All Process'''
Example #19
	def post(self):
		popu.drop()
		abc = PreProcess()
		return {'inserted':'everything'},301
Example #20
import os
import time

from preprocess import PreProcess

start_time = time.time()
thisTime = start_time
files = []
dataFolder = os.path.dirname(os.path.abspath(__file__)) + "/data"
resultFolder = os.path.dirname(os.path.abspath(__file__)) + "/result"
count = 0
commonWordList = {}
for i in os.listdir(dataFolder):
    if i.endswith('.txt'):
        thisFile = os.path.join(dataFolder, i)
        reflection = open(thisFile, "r", encoding="utf8")
        # read the file once and reuse the text (a second read() would return an empty string)
        text = reflection.read()

        processData = PreProcess(text)
        wordList = processData.getWordList(text, True)
        wordFrequency = processData.wordFrequency(wordList)

        for wordTuple in wordFrequency:
            commonWordList[wordTuple[0]] = commonWordList[
                wordTuple[0]] + wordTuple[1] if wordTuple[
                    0] in commonWordList else wordTuple[1]

        print("--- %s seconds ---" % (time.time() - thisTime))
        thisTime = time.time()

        reflection.close()

result = open(resultFolder + "/wordfrequency.csv", "a+")
result.write("Word,WordCount\n")
Example #21
 def setup_class(self):
     image_path = path.join(path.dirname(path.realpath(__file__)))
     image_path = path.join(
         path.split(image_path)[0], 'test_images', 'car6.jpg')
     self.image_array = imread(image_path, as_grey=True)
     self.pre_process = PreProcess(image_path)
Example #22
# -*- coding: utf-8 -*-
"""

@author: Samip
"""

from preprocess import PreProcess
from classifier import Classifier
from modelEvaluation import ModelEvaluation

#Create the objects
pp = PreProcess()
classifier = Classifier()
me = ModelEvaluation()

#Preprocess the data
x_train, x_test, y_train, y_test = pp.scale_data()

choice = int(
    input(
        "Enter 1 for Logistic Regression, 2 for Decision Tree Classifier, 3 for KNN, 4 for Naive Bayes, 5 for Random Forest, 6 for SVM, 7 for XG Boost, 8 for Adaptive Boosting, 9 for LDA: "
    ))
clf = {
    1: classifier.logistic_regression,
    2: classifier.decision_tree_classifer,
    3: classifier.knn,
    4: classifier.naive_bayes,
    5: classifier.random_forest,
    6: classifier.svm,
    7: classifier.xg_boost,
    8: classifier.ada_boost,
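The excerpt is cut off inside the dispatch dictionary; a hedged completion, assuming Classifier also exposes an lda method and that each entry accepts the train/test splits (the method name, call signature, and evaluation call are assumptions, not from the source):

    9: classifier.lda,  # assumed method name
}

# Assumed dispatch: call the chosen classifier and evaluate it.
y_pred = clf[choice](x_train, x_test, y_train, y_test)
me.evaluate(y_test, y_pred)  # assumed ModelEvaluation API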
Example #23
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(X)
    fig = pyplot.figure()
    pyplot.plot(X[y==False,0],X[y==False,1],'ro')
    pyplot.plot(X[y==True,0],X[y==True,1],'bo')
    pyplot.title('2D Visualization, Crossmatch LivDet 2013 Testing, ConvNet 5 Layers+PCA')
    pyplot.show()
    """

    testing = Testing()
    testing.divide_by = 5
    testing.n_processes_pproc = 3
    lstFilesX, y = testing.load_dataset('Training', 'LivDet2011', 'digital')

    #PCA only
    pproc = PreProcess('',1,False,False,False,False,1.0,None,\
        None,None,None,None,None,None,None,None,None)
    X = pproc.transform(lstFilesX)
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(X)
    fig = pyplot.figure()
    pyplot.plot(X[y == False, 0], X[y == False, 1], 'ro')
    pyplot.plot(X[y == True, 0], X[y == True, 1], 'bo')
    pyplot.title('2D Visualization, Digital LivDet 2011 Training, PCA only')
    pyplot.show()

    #LBP+PCA
    pproc = PreProcess('LBP',1,False,False,False,False,1.0,None,\
        None,None,None,None,None,None,'uniform',[7,7],False)
    X = pproc.transform(lstFilesX)
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(X)
Example #24
class NB():

    classification = {}
    number_of_docs = None
    inputData = None
    vocab_unique = None
    classes = None

    words = {}
    process = PreProcess()
    testing = {}

    train_file = None
    test_file = None


    def __init__(self, train=None, test=None, train_dir=None, test_dir=None):
        """

        :aim: Classify for either a file or given directory
        :param train: Train file if a file is given
        :param test: Test file is a file is given
        :param train_dir: Train directory if directory is given
        :param test_dir: Test directory if directory is given
        """

        if train is not None or test is not None:
            self.train_file = train
            self.test_file = test

            self.number_of_docs = self.process.getNumberOfDocs(train)
            self.inputData = self.process.processFile(train)
            self.vocab_unique = self.process.createVocab(self.inputData)
            self.classes = self.process.getClasses(self.inputData)

        else:

            for i in os.listdir(train_dir + "neg"):

                self.number_of_docs += self.process.getNumberOfDocs(train)
                self.inputData += self.process.processFile(train)
                self.vocab_unique += self.process.createVocab(self.inputData)
                self.classes += self.process.getClasses(self.inputData)

            for i in os.listdir(train_dir + "pos"):

                self.number_of_docs += self.process.getNumberOfDocs(train)
                self.inputData += self.process.processFile(train)
                self.vocab_unique += self.process.createVocab(self.inputData)
                self.classes += self.process.getClasses(self.inputData)

            """
            for i in os.listdir(test_dir + "neg"):
                
            for i in os.listdir(test_dir + "pos"):
                
            """

    def classify(self):
        """

        :aim: Classify training
        :return: None
        """
        for c in self.classes:
            temp_dict = {}
            for word in self.vocab_unique:
                temp_dict[word] = 0
            self.classification[c] = temp_dict

        # keys = classes, values = count of words in document given class
        for key in self.classification:
            for line in self.inputData:
                if line[len(line) - 1] != key:
                    continue
                else:
                    for word in line:
                        if word not in self.classification[key]:
                            continue
                        (self.classification[key])[word] += 1

    def count_words(self):
        """

        :aim: Count words per class
        :return: None
        """

        for cl in self.classes:
            count = 0
            for k in self.classification[cl]:
                count += self.classification[cl][k]
            self.words[cl] = count

    def classify_smoothed(self):
        """

        :aim: Add one smoothing to probability
        :return: None
        """
        self.dict_likelihood = self.classification.copy()

        for cl in self.classes:
            for k in self.dict_likelihood[cl]:
                self.dict_likelihood[cl][k] = (self.dict_likelihood[cl][k] + 1) / float(self.words[cl] +
                                                                                        len(self.vocab_unique))


    def test_probability(self):
        """

        :aim: Test data on training
        :return: None
        """

        # Number of documents, given a class
        for cl in self.classes:
            d = self.process.numberOfDocsGivenClass(cl, self.inputData)
            self.process.docs[cl] = d

        # prior probabilities
        self.priors = {}
        for cl in self.classes:
            if cl not in self.priors:
                self.priors[cl] = self.process.docs[cl] / float(self.number_of_docs)

        with open(self.test_file) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        test_data = content

        # add classes as keys to testing dict, values = prior probabilities
        for cl in self.classes:
            if cl not in self.testing:
                self.testing[cl] = self.priors[cl]

        for cl in self.classes:
            for w in test_data:
                for key in self.dict_likelihood[cl]:
                    if key != w:
                        continue
                    elif key == w:
                        self.testing[cl] *= self.dict_likelihood[cl][w]

        # Compute the most likely class
        self.result = max(self.testing, key=self.testing.get)

    def output(self):
        """

        :aim: Write final classification output to file
        :return: None
        """
        string = \
            "Total number of documents/lines in input: " + str(self.number_of_docs) + "\n" + \
            "Vocabulary of unique words: " + str(self.vocab_unique) + "\n" + \
            "Classes: " + str(self.classes) + "\n" + \
            "Count of words, given class: " + str(self.words) + "\n" + \
            "Word likelihoods with add-1 smoothing with respect to class: " + str(self.dict_likelihood) + "\n" + \
            "Prior probabilities: " + str(self.priors) + "\n" + \
            "Probabilities of test data: " + str(self.testing) + "\n" + \
            "The most likely class for the test document: " + self.result

        if self.train_file is not None:
            f = open("movies-small.txt", "w")
            f.write(string)
            f.close()
        else:
            #for multiple directory implementation
            pass
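A short usage sketch for the NB class above in single-file mode (the file names are placeholders):

# Hypothetical driver exercising the methods in order.
nb = NB(train="movies-train.txt", test="movies-test.txt")
nb.classify()
nb.count_words()
nb.classify_smoothed()
nb.test_probability()
nb.output()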
Example #25
    def run_pipeline(self,
                     feat_extract_name,
                     X_train,
                     y_train,
                     X_test=None,
                     y_test=None):

        if self.DEBUG >= 1:
            print('training...')
            antes1 = datetime.now()

        pproc= PreProcess(feat_extract_name = feat_extract_name, n_processes = self.n_processes_pproc, size_percentage = self.size_percentage, \
                          roi = self.roi, high_pass = self.high_pass,low_pass = self.low_pass, gauss_noise = self.gauss_noise, \
                          feature_extractor__method = self.lbp__method, feature_extractor__n_tiles = self.lbp__n_tiles, \
                          feature_extractor__n_filters = self.n_filters, feature_extractor__shape_norm = self.shape_norm, \
                          feature_extractor__shape_conv = self.shape_conv, feature_extractor__shape_pool = self.shape_pool, \
                          feature_extractor__stride_pool = self.stride_pool, feature_extractor__stoc_pool = self.stoc_pool, \
                          feature_extractor__div_norm = self.div_norm, \
                          feature_extractor__region_shape = self.region_shape, feature_extractor__region_stride = self.region_stride, \
                          feature_extractor__top_regions = self.top_regions, feature_extractor__stride_pool_recurrent = self.stride_pool_recurrent, \
                          feature_extractor__analysis_shape = self.analysis_shape, multi_column = self.multi_column, \
                          augmentation = self.augmentation, aug_rotate = self.aug_rotate
                          )
        norm = preprocessing.StandardScaler(copy=True)

        piplist = []
        if self.cross_validation:
            piplist.append(('pproc', pproc))
        piplist.append(('norm', norm))

        if self.use_pca:
            pca = RandomizedPCA2(
                whiten=True,
                random_state=0,
                n_components=self.pca__n_components,
                copy=True
            )  #Must use fit_transform instead of fit() and then transform() when copy=false
            #from sklearn.decomposition import PCA
            #pca = PCA(whiten=True, n_components=self.pca__n_components, copy=True)
            #pca = FastICA(whiten=True,random_state=0, n_components=self.pca__n_components, max_iter=400)
            #pca = SparsePCA(random_state=0, n_components=self.pca__n_components) #Must use fit_transform instead of fit() and then transform() when copy=false
            piplist.append(('pca', pca))

        if self.use_lda:
            lda = LDA(n_components=self.lda__n_components)
            piplist.append(('lda', lda))

        if self.predict.lower() == 'svm':
            if self.svm__kernel.lower() == 'rbf':
                pred = SVC2(kernel='rbf',
                            class_weight='auto',
                            random_state=0,
                            C=self.svm__C,
                            gamma=self.svm__gamma,
                            multi_column=self.multi_column,
                            augmentation=self.augmentation,
                            aug_rotate=self.aug_rotate)
            else:
                pred = LinearSVC2(random_state=0,
                                  fit_intercept=False,
                                  class_weight='auto',
                                  C=self.svm__C,
                                  augmentation=self.augmentation)
        elif self.predict.lower() == 'sgd':
            pred = SGD2(loss="hinge",
                        penalty="l2",
                        l1_ratio=0.05,
                        random_state=0,
                        n_iter=5,
                        shuffle=True,
                        augmentation=self.augmentation,
                        alpha=self.sgd__alpha)
        elif self.predict.lower() == 'knn':
            pred = KNeighborsClassifier(n_neighbors=self.knn__n_neighbors,
                                        weights=self.knn__weights)
        piplist.append(('pred', pred))

        pipeline = Pipeline(piplist)

        if self.cross_validation:
            params_grid = self.params_auto.copy()
            params_grid.update(self.dicPredict[self.predict])
            params_grid.update(self.params_pproc)
            if feat_extract_name.lower() != 'none':
                params_grid.update(self.dicfeat_extract[feat_extract_name])

            pipelineGridSearch = GridSearchCV2(
                pipeline,
                params_grid,
                cv=self.n_folds,
                verbose=0,
                n_jobs=self.n_processes_cv,
                n_jobs_last_estimator=self.n_processes_cv_last_estimator,
                augmentation=self.augmentation,
                auto_adjust_params=None,
                testing=self)
            pipelineGridSearch.fit(X_train, y_train)

            #gridsearchRef = GridSearchCV(pipeline, params_grid, cv=self.n_folds, iid=True, scoring = 'roc_auc', verbose=0, n_jobs=1)
            #gridsearchRef.fit(X_train,y_train)
            #print 'ReF=== score=', gridsearchRef.best_score_,'params=', gridsearchRef.best_params_

            return pipelineGridSearch.best_score_, pipelineGridSearch.best_params_
        else:
            antes = datetime.now()

            X_train = pproc.transform(X_train)
            antes2 = datetime.now()
            X_test = pproc.transform(X_test)
            time_pproc = datetime.now() - antes2
            if self.multi_column:

                y_pred_train = []
                y_pred_test = []
                if self.aug_rotate:
                    multiply = 30
                else:
                    multiply = 10

                for i in range(multiply):
                    pipeline = pipeline.fit(X_train[i::multiply, :], y_train)
                    y_pred_train.append(
                        pipeline.predict(X_train[i::multiply, :]))
                    y_pred_test.append(pipeline.predict(
                        X_test[i::multiply, :]))

                y_pred_train = np.mean(np.asarray(y_pred_train), axis=0)
                y_pred_train[y_pred_train >= 0] = 1
                y_pred_train[y_pred_train < 0] = 0

                y_pred_test = np.mean(np.asarray(y_pred_test), axis=0)
                y_pred_test[y_pred_test >= 0] = 1
                y_pred_test[y_pred_test < 0] = 0
            else:
                pipeline = pipeline.fit(X_train, y_train)

                #save the classifier
                with open(
                        self.temp_dir + 'clf_' + self.datasettrain.lower() +
                        '_' + self.sensortrain.lower() + '_' +
                        self.feat_extract_name.lower() + '.pkl',
                        'wb') as output:
                    pickle.dump(pipeline, output, pickle.HIGHEST_PROTOCOL)

                y_pred_train = pipeline.predict(X_train)

                antes2 = datetime.now()
                y_pred_test = []
                for i in range(0, len(X_test), self.mini_batch_size_test):
                    y_pred_test.extend(
                        list(
                            pipeline.predict(
                                X_test[i:i + self.mini_batch_size_test])))

                test_time = (datetime.now() - antes2) + time_pproc

                print('Predict time =', test_time)  # DEBUG
                print('Number of samples:', len(X_test))

            score_training = 100. - (100. *
                                     roc_auc_score(y_train, y_pred_train))
            print('score_training =', score_training)

            score = 100. - (100. *
                            roc_auc_score(y_test, np.asarray(y_pred_test)))
            total_time = datetime.now() - antes  # DEBUG

            pca = pipeline.steps[-2][1]
            pca_total_variance = None
            if hasattr(pca, 'explained_variance_ratio_'):
                pca_total_variance = np.sum(pca.explained_variance_ratio_)
            pred = pipeline.steps[-1][1]
            n_support_ = None
            if hasattr(pred, 'n_support_'):
                n_support_ = pred.n_support_

            self.append_results(params=None,
                                score_mean=score,
                                score_std=None,
                                total_time=total_time,
                                test_time=test_time,
                                n_test_samples=str(len(X_test)),
                                score_training=score_training,
                                n_svm_vectors=n_support_,
                                pca_total_variance=pca_total_variance)
            return score, None

        # Note: both branches above return, so this timing block is unreachable as written.
        if self.DEBUG >= 1:
            print('Pipeline fit time =', datetime.now() - antes1)  # DEBUG
Example #26
from corpus_reader import CorpusReader
from preprocess import PreProcess
from tf_idf import TfIdf
from knn import KNN
from metrics import MetricsGenerator
from pprint import pprint as pp

if __name__ == '__main__':
  print('reading...')
  reader = CorpusReader()
  reader.run()
  
  parser = PreProcess()
  parsed_trainning_documents = {}
  print('processing...')
  for k, v in reader.train.items():
    parsed_trainning_documents[k] = parser.process(v)
  
  # Input to tf-idf: we must annotate the documents with their classes.
  # It takes as input an array of tuples: ([tokens], class)
  parsed_trainning_documents_with_classes = []
  for k in parsed_trainning_documents.keys():
    parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]
  
  # Run tf-idf
  print('generating tf.idf...')
  tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
  tf_idf_calculator.run()
  
  # test the knn parameters: distance metric and value of K
  for metric in ['cosine', 'euclid']:
Example #27
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils

# spark, elephas
from keras.optimizers import SGD, Adam

# classes
from preprocess import PreProcess
from file_io import FileIO
import datetime

## MODEL ##

p = PreProcess('./datasets/ag_dataset.txt')
x_train, x_test, y_train, y_test, num_classes = p.run()

# Convert class vectors to binary class matrices
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)
# Reshape
dimension = x_train.shape[1]
x_train = x_train.reshape(x_train.shape[0], 1, dimension, 1)
x_test = x_test.reshape(x_test.shape[0], 1, dimension, 1)
print('# Training Data', x_train.shape, y_train.shape)
print('# Testing Data', x_test.shape, y_test.shape)

# model config
epoch_step = 10
pool_size = (1, 2)
Example #28
 def __init__(self, list_of_features):
     super().__init__()
     self.features = list_of_features
     self.prepro = PreProcess()
Example #29
"""
Created on Wed May 27 07:11:18 2020

@author: rusha
"""

import models
import shap
import pathlib
from preprocess import PreProcess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

preproc = PreProcess()
preproc.splitStandardize()

X_train, _, y_train, y_test = models.Model('tree').getData()
estimator = models.Model('tree').getXGBModel()
estimator.fit(X_train, y_train)
importances = estimator.feature_importances_
cwd = pathlib.Path().absolute()
indices = np.argsort(importances)[::-1]
cols = X_train.columns.tolist()
std = np.std([tree.feature_importances_ for tree in estimator.estimators_],
             axis=0)
tempcols = []
for f in range(X_train.shape[1]):
    print("%d. feature %s (%f)" %
          (f + 1, cols[indices[f]], importances[indices[f]]))
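Since matplotlib is already imported, a small illustrative sketch of plotting the sorted importances (not part of the original script):

plt.figure(figsize=(10, 6))
plt.bar(range(X_train.shape[1]), importances[indices], align='center')
plt.xticks(range(X_train.shape[1]), [cols[i] for i in indices], rotation=90)
plt.title('Feature importances')
plt.tight_layout()
plt.show()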
Example #30
    def undistort_img(self, img):
        # Remove lens distortion using the calibration loaded earlier in the class.
        return cv2.undistort(img, self.mtx, self.dist, None, self.mtx)

    def warpperspectiv_img(self, img):
        # Perspective Transform
        undistort = self.undistort_img(img)
        return cv2.warpPerspective(undistort, self.M, undistort.shape[1::-1])

    def unwarpperspectiv_img(self, warped_img):
        return cv2.warpPerspective(warped_img, self.Minv, warped_img.shape[1::-1])


pipline = PipleLine(camera_wide_dist_file)
img_thresh = ImageThresh(r_thresh=(170, 255), s_thresh=(200, 255))
fun_names = ['s_r_threshhold','r_threshshold', 's_threshhold', 'l_b_threshhold', 'x_b_threshhold']
# fun_names = ['l_b_threshhold']
pre = PreProcess(img_thresh, pipline.Minv, fun_names)

def process_image(image):
    undis_image = pipline.undistort_img(image)
    warped_image = pipline.warpperspectiv_img(image)
    try:
        result = pre.draw_data(undis_image, warped_image)
        return result
    except Exception as e:
        cv2.imwrite('not_found_line.jpg', image)
        cv2.imwrite('not_found_warp.jpg', warped_image)
        print(traceback.format_exc())


project_output = 'harder_challenge_video_output.mp4'
clip1 = VideoFileClip("harder_challenge_video.mp4")
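The listing ends right after the clip is opened; a hedged sketch of the usual moviepy step that would follow (assumed, not shown in the source):

# Run every frame through process_image and write the annotated video out.
project_clip = clip1.fl_image(process_image)
project_clip.write_videofile(project_output, audio=False)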