def main(self):
    preprocess = PreProcess()
    self.wordsList, self.wordVectors = preprocess.load_word2vec()
    data, list_id, labels, types, max_length = preprocess.load_mutations()
    self.maxSeqLength = max_length + 1

    # create dictionary mapping each type to its respective int value
    count = 0
    for i in types:
        dic = {i: count}
        self.types.update(dic)
        count = count + 1
    self.numClasses = len(self.types)

    train_df, test_df, labels_train, labels_test = self.split_data(data, labels)
    ids_train, ids_test = self.create_matrix(train_df, test_df)

    # Print out details about the data
    print("\n=================================\nData details:")
    print("- Training-set:\t{}".format(len(train_df)))
    print("- Test-set:\t\t{}".format(len(test_df)))
    print("- Classes:\t\t{}".format(self.types))
    print("=================================\n\n")

    self.train_model(ids_train, labels_train, ids_test, labels_test)
def __init__(self, bus_stop_distance, traffic_light_distance, array_stops, array_trfl):
    super().__init__()
    self.bst_dist = bus_stop_distance
    self.trfl_dist = traffic_light_distance
    self.array_stops = array_stops
    self.array_trfl = array_trfl
    self.prepro = PreProcess()
def execute_ALPR(event):
    """
    runs the full license plate recognition process.
    function is called when user clicks on the execute button on the gui
    """
    # time the function execution
    start_time = time.time()

    root_folder = os.path.dirname(os.path.realpath(__file__))
    models_folder = os.path.join(root_folder, 'ml_models')

    pre_process = PreProcess(imagepath)
    plate_like_objects = pre_process.get_plate_like_objects()
    # plotting.plot_cca(pre_process.full_car_image,
    #                   pre_process.plate_objects_cordinates)
    license_plate = license_plate_extract(plate_like_objects, pre_process)
    if len(license_plate) == 0:
        return False

    ocr_instance = OCROnObjects(license_plate)
    if ocr_instance.candidates == {}:
        wx.MessageBox("No character was segmented",
                      "Character Segmentation", wx.OK | wx.ICON_ERROR)
        return False
    # plotting.plot_cca(license_plate, ocr_instance.candidates['coordinates'])

    deep_learn = DeepMachineLearning()
    text_result = deep_learn.learn(ocr_instance.candidates['fullscale'],
                                   os.path.join(models_folder, 'SVC_model', 'SVC_model.pkl'),
                                   (20, 20))

    text_phase = TextClassification()
    scattered_plate_text = text_phase.get_text(text_result)
    plate_text = text_phase.text_reconstruction(scattered_plate_text,
                                                ocr_instance.candidates['columnsVal'])
    print 'ALPR process took ' + str(time.time() - start_time) + ' seconds'

    listResult.InsertStringItem(listRow, plate_text)

    plate_num = Mvrd(plate_text)
    details = plate_num.get_data()
    if details == False or details == {}:
        wx.MessageBox("Vehicle Information could not be retrieved",
                      "Information Retrieval", wx.OK | wx.ICON_ERROR)
        return False
    listResult.SetStringItem(listRow, 1, details['Owner Name'])
    listResult.SetStringItem(listRow, 2, details['Isssue Date'])
    listResult.SetStringItem(listRow, 3, details['Expiry Date'])
    listResult.SetStringItem(listRow, 4, details['Chasis Number'])
    listResult.SetStringItem(listRow, 5, details['Model'])
    # db_aspect.save_alpr(plate_text, str(datetime.today()))
def train(config, device, RS='Supervised'):
    # Init tokenizer.
    tokenizer = Tokenizer(config.temp_dir, config.jieba_dict_file,
                          config.remove_stopwords, config.stopwords_file,
                          config.ivr)
    # Init feature index.
    feature_index = FeatureIndex(config, tokenizer=tokenizer)
    file_list = [config.labeled_file]
    if config.extra_train_file is not None:
        file_list.append(config.extra_train_file)
    if config.valid_file is not None:
        file_list.append(config.valid_file)
    feature_index.build_index(file_list)

    # Preprocess data.
    pre_process = PreProcess(config)
    train_data_dir, valid_data_dir, final_train_file, final_valid_file = pre_process.train_preprocess()

    # Get PyTorch dataset.
    train_dataset = MixnetDataset(config, train_data_dir, feature_index, tokenizer)
    valid_dataset = MixnetDataset(config, valid_data_dir, feature_index, tokenizer, True)

    # Get NER model if necessary and compatible.
    need_ner = False
    for (feature, feature_config) in config.feature_config_dict.items():
        need_ner = need_ner or ("text" in feature_config.get("type", "")
                                and feature_config.get("seg_type", "word") == "char"
                                and feature_config.get("ner", False))
    if need_ner:
        logger.info("Enable NER, loading NER model...")
        # Use predict mode since we cannot train it without tag information.
        ner_model = NERModel(device, "predict")
    else:
        logger.info("Disable NER.")
        ner_model = None

    # Get PyTorch data loader.
    train_data_loader = DataLoader(train_dataset, batch_size=1, shuffle=False,
                                   num_workers=config.read_workers)
    valid_data_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False,
                                   num_workers=config.read_workers)

    # Init model.
    model = MixNet(config.model_config_dict, config.output_config_dict,
                   feature_index.feature_info_dict, feature_index.label_info_dict,
                   ner_model=ner_model)

    # Train model.
    solver = Solver(config, train_data_loader, valid_data_loader,
                    feature_index, model, device, RS)
    solver.build()
    solver.train()
def submit():
    if request.form['text_input'] == "" or len(request.form['text_input']) < 10:
        return "Please provide input large enough, Classifier can understand :)"

    # todo: change column name to be dynamically taken from training file
    test_data = pd.DataFrame([request.form['text_input']], columns=['Document'])
    session_id = request.cookies['session']
    path = os.path.join(app.config['UPLOAD_FOLDER'], session_id)
    trained_classifier = [i for i in os.listdir(path) if '.pkl' in i]
    vectorizer = os.path.join(path, 'tfidf_vectorizer.pk')
    tfidf_transformer = joblib.load(vectorizer)

    pre_processor = PreProcess(test_data, column_name='Document')
    test_data = pre_processor.clean_html()
    test_data = pre_processor.remove_non_ascii()
    test_data = pre_processor.remove_spaces()
    test_data = pre_processor.remove_punctuation()
    test_data = pre_processor.stemming()
    test_data = pre_processor.lemmatization()
    test_data = pre_processor.stop_words()

    test_data1 = tfidf_transformer.transform(test_data.Document)
    result = {}
    for clf in trained_classifier:
        model = joblib.load(os.path.join(path, clf))
        print(clf, model.predict(test_data1)[0])
        classifier_name = clf.split('/')[-1].split('.')[0]
        result[classifier_name] = model.predict(test_data1)[0]
    print(result)
    return render_template('results.html', result=result)
def execute_ALPR():
    """
    runs the full license plate recognition process.
    function is called when user clicks on the execute button on the gui
    """
    # time the function execution
    start_time = time.time()

    root_folder = os.path.dirname(os.path.realpath(__file__))
    models_folder = os.path.join(root_folder, 'ml_models')

    # import requests
    # r = requests.get(url, auth=('admin', 'admin'))
    # file = open(imagepath, "w")
    # file.write(r.content)
    # file.close()

    imagepath = url
    pre_process = PreProcess(imagepath)
    plate_like_objects = pre_process.get_plate_like_objects()
    # plotting.plot_cca(pre_process.full_car_image,
    #                   pre_process.plate_objects_cordinates)
    license_plate = license_plate_extract(plate_like_objects, pre_process)
    if len(license_plate) == 0:
        return False

    ocr_instance = OCROnObjects(license_plate)
    if ocr_instance.candidates == {}:
        # print("No Characters Was Segmented")
        # wx.MessageBox("No character was segmented",
        #               "Character Segmentation", wx.OK | wx.ICON_ERROR)
        return False
    # plotting.plot_cca(license_plate, ocr_instance.candidates['coordinates'])

    deep_learn = DeepMachineLearning()
    text_result = deep_learn.learn(ocr_instance.candidates['fullscale'],
                                   os.path.join(models_folder, 'SVC_model', 'SVC_model.pkl'),
                                   (20, 20))

    text_phase = TextClassification()
    scattered_plate_text = text_phase.get_text(text_result)
    plate_text = text_phase.text_reconstruction(scattered_plate_text,
                                                ocr_instance.candidates['columnsVal'])
    # print('ALPR process took ' + str(time.time() - start_time) + ' seconds')
    print(plate_text)
def __init__(self):
    self.preprocess = PreProcess()
    self.CDBG_map = None
    self.precincts_wards_map = None
    self.directory_path = 'result'
    if not os.path.exists(self.directory_path):
        os.mkdir(self.directory_path)
    self.refuse_routes_directory_path = os.path.join(self.directory_path, 'refuse_routes')
    if not os.path.exists(self.refuse_routes_directory_path):
        os.mkdir(self.refuse_routes_directory_path)
def run(self, hand, rgb=False, box=None, hand_crop=None, depth=None):
    if rgb and box and hand_crop.any() and depth.any():
        phand = self._rgb_hand_seg(hand, box, hand_crop, depth)
    else:
        phand = PreProcess().median_smooth(hand, self.MEDIAN_DIM)
    tmp = phand.copy()
    cont = self._get_largest_contour(phand)
    box = cv2.boundingRect(cont)
    crop = self._crop_box(tmp, box)
    self._contour = cont
    self._box = box
    return (cont, box, crop)
def main():
    data = get_data('/Users/aditya1/Documents/Document_Classification/bbc-dataset')

    ###############################################################################
    # Data Pre-processing steps
    ###############################################################################
    column_name = data.columns[0]
    # print(column_name)
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    pre_processor_operations = ['clean_html']
    data = pre_processor.clean_html()
    data = pre_processor.remove_non_ascii()
    data = pre_processor.remove_spaces()
    data = pre_processor.remove_punctuation()
    data = pre_processor.stemming()
    data = pre_processor.lemmatization()
    data = pre_processor.stop_words()

    ###############################################################################
    # Feature extraction
    ###############################################################################
    train_x, test_x, train_y, test_y = train_test_split(data.Document,
                                                        data.Category,
                                                        test_size=0.20)
    # print(train_x.shape, train_y.shape)
    # print(test_x.shape, test_y.shape)
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    joblib.dump(tfidf_transformer, 'vectorizer.pkl')
    test_vectors = tfidf_transformer.transform(test_x)
    print(data.head())

    ###############################################################################
    # Perform classification with SVM, kernel=linear
    model1 = svm.SVC(kernel='linear')
    model1.fit(train_vectors, train_y)
    joblib.dump(model1, 'SVM.pkl')
    y_pred_class = model1.predict(test_vectors)
    print(metrics.accuracy_score(test_y, y_pred_class))
    print("Prediction score for classifier %s:\n%s\n" % (model1, metrics.accuracy_score(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n" % (model1, metrics.classification_report(test_y, y_pred_class)))

    model2 = MultinomialNB()
    model2.fit(train_vectors, train_y)
    joblib.dump(model2, 'MultinomialNB.pkl')
    y_pred_class = model2.predict(test_vectors)
    print("Accuracy score:", metrics.accuracy_score(test_y, y_pred_class))
    print("Confusion Matrix for classifier %s:\n%s\n" % (model2, metrics.confusion_matrix(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n" % (model2, metrics.classification_report(test_y, y_pred_class)))
def validate_test():
    from preprocess import PreProcess
    p = PreProcess()
    global model, train_indices, val_indices, train_loader, test_loader, optimizer
    model = MonocularVelocityNN(initial_depth=config["depth"])
    dataset = VideoDataLoader(directory=str(Path.cwd() / "data/testprocdir"),
                              delta=config["delta"],
                              Y=p.labels,
                              depth=config["depth"])
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    val_indices = list(range(1, dataset_size - config["delta"] - config["depth"]))
    # valid_sampler = Sampler(val_indices)
    test_loader = DataLoader(dataset,
                             batch_size=config["test_batch_size"],
                             sampler=val_indices)
    load_weights(Path.cwd() / "data/weights.pt")
    if not device:
        raise RuntimeError("Only use model with Cuda")
    model.to(device)
    validate(is_labels=False)
def main(self):
    preprocess = PreProcess()
    self.wordsList, self.wordVectors = preprocess.load_word2vec()
    data, list_id, labels, types, max_length = preprocess.load_mutations()
    self.maxSeqLength = max_length + 1

    # create dictionary mapping each type to its respective int value
    count = 0
    for i in types:
        dic = {i: count}
        self.types.update(dic)
        count = count + 1
    self.numClasses = len(self.types)

    train_df, test_df, labels_train, labels_test = self.split_data(data, labels)

    # remove the last element to make it an even number
    train_df = train_df[:-1]
    labels_train = labels_train[:-1]
    div = int(len(train_df) / self.k)
    # get K = 5 batches
    train_1, train_2, train_3, train_4, train_5 = [
        train_df[i:i + div] for i in range(0, len(train_df), div)
    ]
    labels_1, labels_2, labels_3, labels_4, labels_5 = [
        labels_train[i:i + div] for i in range(0, len(labels_train), div)
    ]

    ids_test = self.create_matrix_teste(test_df)
    ids_train1 = self.create_matrix_train(train_1)
    ids_train2 = self.create_matrix_train(train_2)
    ids_train3 = self.create_matrix_train(train_3)
    ids_train4 = self.create_matrix_train(train_4)
    ids_train5 = self.create_matrix_train(train_5)

    train_bins = [train_1, train_2, train_3, train_4, train_5]
    label_bins = [labels_1, labels_2, labels_3, labels_4, labels_5]
    ids_train = [ids_train1, ids_train2, ids_train3, ids_train4, ids_train5]

    self.train_model(train_bins, label_bins, ids_train)
def __init__(self, pic_name, flags=0):
    PreProcess.__init__(self, pic_name, flags)
    cv2.namedWindow(winname=PreProcessBoard.winname, flags=1)
    h, w = self.img.shape
    print(self.img.shape)
    # self.resize_board(h/2, w/2)
    cv2.createTrackbar('kernel size', PreProcessBoard.winname, 0, 5, nothing)
    cv2.createTrackbar('bin thresh', PreProcessBoard.winname, 127, 255, nothing)
    cv2.createTrackbar('lambda', PreProcessBoard.winname, 0, 100, nothing)
    while 1:
        s = cv2.getTrackbarPos('kernel size', PreProcessBoard.winname)
        thresh = cv2.getTrackbarPos('bin thresh', PreProcessBoard.winname)
        kernel = np.ones((s, s), np.uint8)
        lamda = cv2.getTrackbarPos('lambda', PreProcessBoard.winname)
        # img_mor = cv2.morphologyEx(self.img, op=cv2.MORPH_CLOSE, kernel=kernel)
        # ret, img_bin = cv2.threshold(self.img, thresh, 255, cv2.THRESH_BINARY)
        self.restore()
        self.img = 255 - self.img
        self.lambda_binary(30 / 100.0)
        self.binarize(thresh, 255, type=cv2.THRESH_BINARY)
        self.img = 255 - self.img
        img1 = self.img.copy()

        self.restore()
        self.img = 255 - self.img
        self.lambda_binary(lamda / 100.0)
        self.binarize(thresh, 255, type=cv2.THRESH_BINARY)
        # self.gauss_blur(2)
        # self.morph(True)
        self.img = 255 - self.img

        self.img = (self.img + img1) / 2
        self.binarize(thresh, 255, type=cv2.THRESH_BINARY)
        # self.binarize(thresh, 255, cv2.THRESH_BINARY_INV)
        # cv2.imshow(PreProcessBoard.winname, img_mor)
        key = cv2.waitKey(1)
        if key == 27:
            break
        else:
            cv2.imshow(PreProcessBoard.winname, self.img)
    cv2.imwrite('./data/r1.jpg', self.img)
def train():
    from preprocess import PreProcess
    p = PreProcess()
    global model, train_indices, val_indices, train_loader, test_loader, optimizer
    model = MonocularVelocityNN(initial_depth=config["depth"])
    dataset = VideoDataLoader(directory=str(Path.cwd() / "data/processed"),
                              delta=config["delta"],
                              Y=p.labels,
                              depth=config["depth"])
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(1, dataset_size - config["delta"] - config["depth"]))
    split = int(np.floor(config["split"] * dataset_size))
    if config["randomize"]:
        np.random.seed(0)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = DataLoader(dataset,
                              batch_size=config["batch_size"],
                              sampler=train_sampler)
    test_loader = DataLoader(dataset,
                             batch_size=config["test_batch_size"],
                             sampler=valid_sampler)
    if config["TRAIN"]:
        if not device:
            raise RuntimeError("Only use model with Cuda")
        model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
        for epoch in range(1, config["epochs"] + 1):
            try:
                train(epoch)
                validate()
            except:
                raise
    else:
        load_weights(Path.cwd() / "data/weights.pt")
        if not device:
            raise RuntimeError("Only use model with Cuda")
        model.to(device)
        validate()
def execute_ALPR(event):
    """
    runs the full license plate recognition process.
    function is called when user clicks on the execute button on the gui
    """
    # time the function execution
    start_time = time.time()

    root_folder = os.path.dirname(os.path.realpath(__file__))
    models_folder = os.path.join(root_folder, 'ml_models')

    pre_process = PreProcess(imagepath)
    plate_like_objects = pre_process.get_plate_like_objects()
    # plotting.plot_cca(pre_process.full_car_image,
    #                   pre_process.plate_objects_cordinates)
    license_plate = license_plate_extract(plate_like_objects, pre_process)
    if len(license_plate) == 0:
        return False

    ocr_instance = OCROnObjects(license_plate)
    # plotting.plot_cca(license_plate, ocr_instance.candidates['coordinates'])
    deep_learn = DeepMachineLearning()
    text_result = deep_learn.learn(ocr_instance.candidates['fullscale'],
                                   os.path.join(models_folder, 'SVC_model', 'SVC_model.pkl'),
                                   (20, 20))

    text_phase = TextClassification()
    scattered_plate_text = text_phase.get_text(text_result)
    plate_text = text_phase.text_reconstruction(scattered_plate_text,
                                                ocr_instance.candidates['columnsVal'])
    print 'ALPR process took ' + str(time.time() - start_time) + ' seconds'

    listResult.InsertStringItem(listRow, plate_text)
    listResult.SetStringItem(listRow, 1, str(datetime.today()))
    db_aspect.save_alpr(plate_text, str(datetime.today()))
class TestPreProcess():

    @classmethod
    def setup_class(self):
        image_path = path.join(path.dirname(path.realpath(__file__)))
        image_path = path.join(path.split(image_path)[0], 'test_images', 'car6.jpg')
        self.pre_process = PreProcess(image_path)

    def test_threshold(self):
        print 'Testing the threshold function'
        bin_image = self.pre_process.threshold(self.pre_process.full_car_image)
        assert bin_image.shape == (548, 700)
def interence_main():
    CHECKPOINT_PATH = './model/seq2seq_ckpt-7800'
    preprocess = PreProcess()
    # with tf.variable_scope('nmt_model', reuse=None):
    model = NMTModel()
    test_sentence = 'this is a test.'
    test_sentence = preprocess.english2id(test_sentence)
    print(preprocess.id2english(test_sentence))
    output_op = model.inference(test_sentence)
    sess = tf.Session()
    saver = tf.train.Saver()
    saver.restore(sess, CHECKPOINT_PATH)

    with tf.variable_scope('nmt_model', reuse=None):
        model = NMTModel()
    test_sentence = 'this is a test.'
    test_sentence = preprocess.english2id(test_sentence)
    output_op = model.inference(test_sentence)
    sess = tf.Session()
    saver = tf.train.Saver()
    saver.restore(sess, CHECKPOINT_PATH)

    output = sess.run(output_op)
    print(output)
    print(preprocess.id2chinese(output))
    sess.close()
class TestPreProcess():

    @classmethod
    def setup_class(self):
        image_path = path.join(path.dirname(path.realpath(__file__)))
        image_path = path.join(path.split(image_path)[0], 'test_images', 'car6.jpg')
        self.image_array = imread(image_path, as_grey=True)
        self.pre_process = PreProcess(image_path)

    def test_resize_if_necessary(self):
        print 'Testing the resize function'
        resized_image = self.pre_process.resize_if_necessary(self.image_array)
        assert resized_image.shape == (470, 600)
def read_process_data(path, files_path):
    data = pd.read_csv(path)
    column_name = data.columns[0]
    # print(column_name)
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    data = pre_processor.clean_html()
    data = pre_processor.remove_non_ascii()
    data = pre_processor.remove_spaces()
    data = pre_processor.remove_punctuation()
    data = pre_processor.stemming()
    data = pre_processor.lemmatization()
    data = pre_processor.stop_words()

    train_x, test_x, train_y, test_y = train_test_split(data.Document,
                                                        data.Category,
                                                        test_size=0.20)
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    vectorizer_path = os.path.join(files_path, 'tfidf_vectorizer.pk')
    joblib.dump(tfidf_transformer, vectorizer_path)
    return train_vectors, train_y
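A minimal usage sketch for read_process_data above, not from the original project: the CSV path, output directory, and the linear-SVM step are assumptions that mirror the main() example earlier in this collection.

# Hypothetical usage of read_process_data; paths and the SVM step are assumptions.
import os
import joblib
from sklearn import svm

files_path = 'models'  # assumed output directory for the vectorizer and model
os.makedirs(files_path, exist_ok=True)
train_vectors, train_y = read_process_data('bbc-dataset.csv', files_path)  # assumed CSV path

# Train and persist a linear SVM, mirroring the main() example above.
clf = svm.SVC(kernel='linear')
clf.fit(train_vectors, train_y)
joblib.dump(clf, os.path.join(files_path, 'SVM.pkl'))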
def train(start=1, end=31):
    # preprocess training data
    preprocess = PreProcess()
    preprocess.set_date_range(start, end)
    preprocess.process()

    la = LineAnalysis()
    la.set_date_range(start, end)
    la.process()

    # train model
    train = Train()
    train.fit()
    return
def main():
    dowload_dataset()
    preprocess = PreProcess()
    df = preprocess.load_tsv()
    features = preprocess.clean_data(df)
    features = preprocess.balance_data(features, 15000)
    x_train, y_train, x_test, y_test = preprocess.split_data(features)
    y_train_round, y_test_round = preprocess.round_labels(y_train, y_test)
    y_train_one, y_test_one = preprocess.labels_to_one_hot(y_train_round, y_test_round)

    lr = Logistic_Regression()
    lr.fit_and_evaluate(x_train, y_train_round, x_test, y_test_round)

    nn1 = NeuralNetwork1()
    nn1.fit_and_evaluate(x_train, y_train_one, x_test, y_test_one)

    linr = Linear_Regression()
    linr.fit_and_evaluate(x_train, y_train, x_test, y_test)

    nn2 = NeuralNetwork2()
    nn2.fit_and_evaluate(x_train, y_train, x_test, y_test)

    print("\n___________________End of the output___________________")
def __init__(self):
    super().__init__()
    self.folderPath = "textData"

    # without transforming string
    # List structure: [[data1][data1_label][data2][data2_label][data3][data3_label]]
    #self.lemmatizedList = self.preProcess.load_data1(self.folderPath)

    # Extract data
    #self.dataList = self.getDataList()
    #print(self.dataList.__len__())
    #self.allWords = self.extractSentence()
    #print(self.allWords.__len__())

    # Extract label
    #self.Labels = self.extractLabels()
    #print(self.Labels.__len__())

    # transform lemmatized_list (3 generators) into a long string separated by spaces
    #self.sparseWords = self.preProcess.cleanByFrequency(self.allWords)

    # filter by sequence
    #self.cleanedSentences = self.preProcess.getCleanedSent(self.sparseWords, self.dataList)

    '''Pre Process'''
    self.preProcess = PreProcess(self.folderPath)
    '''i = 0
    for x in self.preProcess.cleanedSentences:
        i += 1
        print(str(i) + ": ")
        print(x)'''

    # vectorise remnant of sentences
    #self.X_train = self.preProcess.vector_Data(self.cleanedSentences, self.Labels)

    '''Process Sentence Level'''
    self.sentenceLevel = SentenceLevel(self.preProcess.cleanedSentences)

    '''Process Document Level'''
    # Start TensorFlow Session
    self.documentLevel = DocumentLevel(self.sentenceLevel.docInput, self.preProcess.Labels)

    # display results
    #self.displayData()

    '''Summary All Process'''
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
# spark, elephas
from keras.optimizers import SGD, Adam
# classes
from preprocess import PreProcess
from file_io import FileIO
import datetime

## MODEL ##
p = PreProcess('./datasets/ag_dataset.txt')
x_train, x_test, y_train, y_test, num_classes = p.run()

# Convert class vectors to binary class matrices
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)

# Reshape
dimension = x_train.shape[1]
x_train = x_train.reshape(x_train.shape[0], 1, dimension, 1)
x_test = x_test.reshape(x_test.shape[0], 1, dimension, 1)

print('# Training Data', x_train.shape, y_train.shape)
print('# Testing Data', x_test.shape, y_test.shape)

# model config
epoch_step = 10
pool_size = (1, 2)
from corpus_reader import CorpusReader
from preprocess import PreProcess
from tf_idf import TfIdf
from knn import KNN
from metrics import MetricsGenerator
from pprint import pprint as pp

if __name__ == '__main__':
    print('reading...')
    reader = CorpusReader()
    reader.run()

    parser = PreProcess()
    parsed_trainning_documents = {}
    print('processing...')
    for k, v in reader.train.items():
        parsed_trainning_documents[k] = parser.process(v)

    # Input for tf-idf: we must annotate the documents with their classes.
    # It receives as input an array of tuples: ([tokens], class)
    parsed_trainning_documents_with_classes = []
    for k in parsed_trainning_documents.keys():
        parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]

    # Run tf-idf
    print('generating tf.idf...')
    tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
    tf_idf_calculator.run()

    # test the knn parameters: distance metric and value of K
    for metric in ['cosine', 'euclid']:
from preprocess import PreProcess
import numpy as np
import logging
import csv
from sklearn.svm import SVC

if __name__ == "__main__":
    logging.basicConfig(
        format="--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s",
        datefmt="%Y/%m/%d %H:%M:%S",
        level=logging.INFO
    )
    logging.info("train_test.py Start")

    SAMPLE = 50000
    p = PreProcess()
    train_filepath = "data/train_1M.csv.out"
    # train_filepath = 'data/train_1000.csv.out'
    # test_filepath = 'data/test.csv.out'
    test_filepattern = "data/test_%d_M.out"

    # Load train data
    logging.info("Loading train set...")
    X_train, y_train = p.load_train_data(train_filepath)

    # Sampling
    if y_train.shape[0] > SAMPLE:
        X_train = X_train[:SAMPLE]
        y_train = y_train[:SAMPLE]
    else:
        SAMPLE = y_train.shape[0]
    logging.info("Sampling %d" % SAMPLE)
    logging.info("Shape X_train = %r, y_train =%r" % (X_train.shape, y_train.shape))
from preprocess import PreProcess
import numpy as np
import logging
import csv
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

if __name__ == "__main__":
    logging.basicConfig(format='--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s',
                        datefmt='%Y/%m/%d %H:%M:%S',
                        level=logging.INFO)
    logging.info("train_knn_cv.py Start")

    p = PreProcess()
    #out_filepath = 'data/train_1M.csv.out'
    #out_filepath = 'data/train_s404_10K.out'
    out_filepath = 'data/train_1000.csv.out'

    #Load data
    #X, y = p.load_train_data(out_filepath)
    #Load data with category
    X, y, enc, map_dict = p.load_train_data(out_filepath, category=True)
    logging.info("Shape X = %r, y =%r" % (X.shape, y.shape))
    logging.info("example X = %s\ny =%r" % (X[0], y[0]))
    logging.info("classes: %r" % list(np.unique(y)))

    #Sampling
    #At least 3
    POWER = 6
    CONST = 1
    #At least 2
    n_subsamples = CONST * 10 ** POWER
    n_size = y.shape[0]
from preprocess import PreProcess
import numpy as np
import logging
import csv
from sklearn import linear_model
from sklearn.grid_search import GridSearchCV

if __name__ == "__main__":
    logging.basicConfig(format='--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s',
                        datefmt='%Y/%m/%d %H:%M:%S',
                        level=logging.INFO)
    logging.info("train_sgdc_cv.py Start")

    p = PreProcess()
    out_filepath = 'data/train_1M.csv.out'
    #out_filepath = 'data/train_s404_100K.out.1vs1'
    #out_filepath = 'data/train_1000.csv.out'

    #Load data
    X, y = p.load_train_data(out_filepath)
    logging.info("Shape X = %r, y =%r" % (X.shape, y.shape))
    logging.info("example X = %s\ny =%r" % (X[0], y[0]))
    logging.info("classes: %r" % list(np.unique(y)))

    #Sampling
    #At least 3
    POWER = 6
    CONST = 1
    #At least 2
    #CV = 5
    n_subsamples = CONST * 10 ** POWER
    n_size = y.shape[0]
    if n_subsamples < n_size:
def run_pipeline(self, feat_extract_name, X_train, y_train, X_test=None, y_test=None):
    if self.DEBUG >= 1:
        print "training..."
    antes1 = datetime.now()

    pproc = PreProcess(
        feat_extract_name=feat_extract_name,
        n_processes=self.n_processes_pproc,
        size_percentage=self.size_percentage,
        roi=self.roi,
        high_pass=self.high_pass,
        low_pass=self.low_pass,
        gauss_noise=self.gauss_noise,
        feature_extractor__method=self.lbp__method,
        feature_extractor__n_tiles=self.lbp__n_tiles,
        feature_extractor__n_filters=self.n_filters,
        feature_extractor__shape_norm=self.shape_norm,
        feature_extractor__shape_conv=self.shape_conv,
        feature_extractor__shape_pool=self.shape_pool,
        feature_extractor__stride_pool=self.stride_pool,
        feature_extractor__stoc_pool=self.stoc_pool,
        feature_extractor__div_norm=self.div_norm,
        feature_extractor__region_shape=self.region_shape,
        feature_extractor__region_stride=self.region_stride,
        feature_extractor__top_regions=self.top_regions,
        feature_extractor__stride_pool_recurrent=self.stride_pool_recurrent,
        feature_extractor__analysis_shape=self.analysis_shape,
        multi_column=self.multi_column,
        augmentation=self.augmentation,
        aug_rotate=self.aug_rotate,
    )

    norm = preprocessing.StandardScaler(copy=True)

    piplist = []
    if self.cross_validation:
        piplist.append(("pproc", pproc))
    piplist.append(("norm", norm))

    if self.use_pca:
        pca = RandomizedPCA2(
            whiten=True, random_state=0, n_components=self.pca__n_components, copy=True
        )  # Must use fit_transform instead of fit() and then transform() when copy=false
        # from sklearn.decomposition import PCA
        # pca = PCA(whiten=True, n_components=self.pca__n_components, copy=True)
        # pca = FastICA(whiten=True, random_state=0, n_components=self.pca__n_components, max_iter=400)
        # pca = SparsePCA(random_state=0, n_components=self.pca__n_components)  # Must use fit_transform instead of fit() and then transform() when copy=false
        piplist.append(("pca", pca))

    if self.use_lda:
        lda = LDA(n_components=self.lda__n_components)
        piplist.append(("lda", lda))

    if self.predict.lower() == "svm":
        if self.svm__kernel.lower() == "rbf":
            pred = SVC2(
                kernel="rbf",
                class_weight="auto",
                random_state=0,
                C=self.svm__C,
                gamma=self.svm__gamma,
                multi_column=self.multi_column,
                augmentation=self.augmentation,
                aug_rotate=self.aug_rotate,
            )
        else:
            pred = LinearSVC2(
                random_state=0,
                fit_intercept=False,
                class_weight="auto",
                C=self.svm__C,
                augmentation=self.augmentation,
            )
    elif self.predict.lower() == "sgd":
        pred = SGD2(
            loss="hinge",
            penalty="l2",
            l1_ratio=0.05,
            random_state=0,
            n_iter=5,
            shuffle=True,
            augmentation=self.augmentation,
            alpha=self.sgd__alpha,
        )
    elif self.predict.lower() == "knn":
        pred = KNeighborsClassifier(n_neighbors=self.knn__n_neighbors, weights=self.knn__weights)

    piplist.append(("pred", pred))
    pipeline = Pipeline(piplist)

    if self.cross_validation:
        params_grid = self.params_auto.copy()
        params_grid.update(self.dicPredict[self.predict])
        params_grid.update(self.params_pproc)
        if feat_extract_name.lower() != "none":
            params_grid.update(self.dicfeat_extract[feat_extract_name])

        pipelineGridSearch = GridSearchCV2(
            pipeline,
            params_grid,
            cv=self.n_folds,
            verbose=0,
            n_jobs=self.n_processes_cv,
            n_jobs_last_estimator=self.n_processes_cv_last_estimator,
            augmentation=self.augmentation,
            auto_adjust_params=None,
            testing=self,
        )
        pipelineGridSearch.fit(X_train, y_train)

        # gridsearchRef = GridSearchCV(pipeline, params_grid, cv=self.n_folds, iid=True, scoring='roc_auc', verbose=0, n_jobs=1)
        # gridsearchRef.fit(X_train, y_train)
        # print 'ReF=== score=', gridsearchRef.best_score_, 'params=', gridsearchRef.best_params_

        return pipelineGridSearch.best_score_, pipelineGridSearch.best_params_
    else:
        antes = datetime.now()
        X_train = pproc.transform(X_train)
        antes2 = datetime.now()
        X_test = pproc.transform(X_test)
        time_pproc = datetime.now() - antes2

        if self.multi_column:
            y_pred_train = []
            y_pred_test = []
            if self.aug_rotate:
                multiply = 30
            else:
                multiply = 10
            for i in range(multiply):
                pipeline = pipeline.fit(X_train[i::multiply, :], y_train)
                y_pred_train.append(pipeline.predict(X_train[i::multiply, :]))
                y_pred_test.append(pipeline.predict(X_test[i::multiply, :]))
            y_pred_train = np.mean(np.asarray(y_pred_train), axis=0)
            y_pred_train[y_pred_train >= 0] = 1
            y_pred_train[y_pred_train < 0] = 0
            y_pred_test = np.mean(np.asarray(y_pred_test), axis=0)
            y_pred_test[y_pred_test >= 0] = 1
            y_pred_test[y_pred_test < 0] = 0
        else:
            pipeline = pipeline.fit(X_train, y_train)
            # save the classifier
            with open(
                self.temp_dir + "clf_" + self.datasettrain.lower() + "_"
                + self.sensortrain.lower() + "_" + self.feat_extract_name.lower() + ".pkl",
                "wb",
            ) as output:
                pickle.dump(pipeline, output, pickle.HIGHEST_PROTOCOL)
            y_pred_train = pipeline.predict(X_train)
            antes2 = datetime.now()
            y_pred_test = []
            for i in range(0, len(X_test), self.mini_batch_size_test):
                y_pred_test.extend(list(pipeline.predict(X_test[i:i + self.mini_batch_size_test])))
            test_time = (datetime.now() - antes2) + time_pproc
            print "Predict time = ", test_time  # DEBUG
            print "Number of samples", str(len(X_test))

        score_training = 100.0 - (100.0 * roc_auc_score(y_train, y_pred_train))
        print "score_training=", score_training
        score = 100.0 - (100.0 * roc_auc_score(y_test, np.asarray(y_pred_test)))
        total_time = datetime.now() - antes  # DEBUG

        pca = pipeline.steps[-2][1]
        pca_total_variance = None
        if hasattr(pca, "explained_variance_ratio_"):
            pca_total_variance = np.sum(pca.explained_variance_ratio_)
        pred = pipeline.steps[-1][1]
        n_support_ = None
        if hasattr(pred, "n_support_"):
            n_support_ = pred.n_support_

        self.append_results(
            params=None,
            score_mean=score,
            score_std=None,
            total_time=total_time,
            test_time=test_time,
            n_test_samples=str(len(X_test)),
            score_training=score_training,
            n_svm_vectors=n_support_,
            pca_total_variance=pca_total_variance,
        )
        return score, None

    if self.DEBUG >= 1:
        print "Fit Pipeline time = ", (datetime.now() - antes1)  # DEBUG
def post(self):
    popu.drop()  # drop the existing collection before re-inserting
    abc = PreProcess()
    return {'inserted': 'everything'}, 301
class Labels(PreprocessData):
    def __init__(self, bus_stop_distance, traffic_light_distance, array_stops, array_trfl):
        super().__init__()
        self.bst_dist = bus_stop_distance
        self.trfl_dist = traffic_light_distance
        self.array_stops = array_stops
        self.array_trfl = array_trfl
        self.prepro = PreProcess()

    def add_bus_stop_label(self, data):
        '''This method is used with multiprocessing; item[4] is the velocity.'''
        chunck = []
        for items in tqdm(data):
            final_item = []
            for item in items:
                for stop in self.array_stops:
                    # dist = self.prepro.distance_in_meters([item[0], item[1]], [stop[4], stop[5]])
                    dist = self.prepro.distance_in_meters([item[0], item[1]],
                                                          [stop[1], stop[2]])
                    if item[4] < 5 and dist < self.bst_dist:
                        print('bustop')
                        item.append('bus_stop')
                        break
                final_item.append(item)
            chunck.append(final_item)
        return chunck

    def add_traffic_light_label(self, data):
        chunck = []
        for items in tqdm(data):
            final_item = []
            for item in items:
                for stop in self.array_trfl:
                    # dist = self.prepro.distance_in_meters([item[0], item[1]], [stop[7], stop[8]])
                    dist = self.prepro.distance_in_meters([item[0], item[1]],
                                                          [stop[1], stop[2]])
                    if item[4] < 5 and dist < self.trfl_dist and item[10] != 'bus_stop':
                        item[10] = 'traffic_light'
                        break
                final_item.append(item)
            chunck.append(final_item)
        return chunck

    def add_other_stop_label(self, data):
        for items in tqdm(data):
            for item in items:
                if item[4] < 5 and item[10] == 'in_route':
                    item[10] = 'other_stop'

    def get_false_labels(self, data, label, min_dist):
        '''Remove other_stop labels that fall between a bus_stop and a traffic_light.'''
        count_b, count_a = [], []
        for items in tqdm(data):
            for idx in range(len(items) - 1):
                if idx > 0 and idx < (len(items) - 1):
                    lat_lng_b = [items[idx - 1][0], items[idx - 1][1]]
                    lat_lng_a = [items[idx + 1][0], items[idx + 1][1]]
                    lat_lng_c = [items[idx][0], items[idx][1]]
                    if (items[idx][16] == label
                            and ((items[idx - 1][16] == 0.0 or items[idx - 1][16] == 3.0)
                                 and (items[idx + 1][16] == 0.0 or items[idx + 1][16] == 3.0))
                            and (self.prepro.distance_in_meters(lat_lng_c, lat_lng_b) < min_dist
                                 or self.prepro.distance_in_meters(lat_lng_c, lat_lng_a) < min_dist)):
                        print(f'before:{items[idx-1][16]}----current:{items[idx][16]}----after:{items[idx+1][16]}')
                        print(f'before:{self.prepro.distance_in_meters(lat_lng_c, lat_lng_b)}----after:{self.prepro.distance_in_meters(lat_lng_c, lat_lng_a)}')
                        count_b.append(self.prepro.distance_in_meters(lat_lng_c, lat_lng_b))
                        count_a.append(self.prepro.distance_in_meters(lat_lng_c, lat_lng_a))
                        items[idx][16] = -1
def setup_class(self):
    image_path = path.join(path.dirname(path.realpath(__file__)))
    image_path = path.join(path.split(image_path)[0], 'test_images', 'car6.jpg')
    self.image_array = imread(image_path, as_grey=True)
    self.pre_process = PreProcess(image_path)
# -*- coding: utf-8 -*-
"""
@author: Samip
"""

from preprocess import PreProcess
from classifier import Classifier
from modelEvaluation import ModelEvaluation

# Create the objects
pp = PreProcess()
classifier = Classifier()
me = ModelEvaluation()

# Preprocess the data
x_train, x_test, y_train, y_test = pp.scale_data()

choice = int(input("Enter 1 for Logistic Regression, 2 for Decision Tree Classifier, 3 for KNN, 4 for Naive Bayes, 5 for Random Forest, 6 for SVM, 7 for XG Boost, 8 for Adaptive Boosting, 9 for LDA: "))
clf = {
    1: classifier.logistic_regression,
    2: classifier.decision_tree_classifer,
    3: classifier.knn,
    4: classifier.naive_bayes,
    5: classifier.random_forest,
    6: classifier.svm,
    7: classifier.xg_boost,
    8: classifier.ada_boost,
import os
import time

from preprocess import PreProcess

start_time = time.time()
thisTime = start_time
files = []
dataFolder = os.path.dirname(os.path.abspath(__file__)) + "/data"
resultFolder = os.path.dirname(os.path.abspath(__file__)) + "/result"
count = 0
commonWordList = {}

for i in os.listdir(dataFolder):
    if i.endswith('.txt'):
        thisFile = os.path.join(dataFolder, i)
        reflection = open(thisFile, "r", encoding="utf8")
        # read the reflection text once and reuse it (a second read() would return an empty string)
        reflectionText = reflection.read()
        processData = PreProcess(reflectionText)
        wordList = processData.getWordList(reflectionText, True)
        wordFrequency = processData.wordFrequency(wordList)
        for wordTuple in wordFrequency:
            commonWordList[wordTuple[0]] = commonWordList[wordTuple[0]] + wordTuple[1] if wordTuple[0] in commonWordList else wordTuple[1]
        print("--- %s seconds ---" % (time.time() - thisTime))
        thisTime = time.time()
        reflection.close()

result = open(resultFolder + "/wordfrequency.csv", "a+")
result.write("Word,WordCount\n")
pca = RandomizedPCA(n_components=2)
X = pca.fit_transform(X)
fig = pyplot.figure()
pyplot.plot(X[y==False, 0], X[y==False, 1], 'ro')
pyplot.plot(X[y==True, 0], X[y==True, 1], 'bo')
pyplot.title('2D Visualization, Crossmatch LivDet 2013 Testing, ConvNet 5 Layers+PCA')
pyplot.show()
"""

testing = Testing()
testing.divide_by = 5
testing.n_processes_pproc = 3
lstFilesX, y = testing.load_dataset('Training', 'LivDet2011', 'digital')

# PCA only
pproc = PreProcess('', 1, False, False, False, False, 1.0, None,
                   None, None, None, None, None, None, None, None, None)
X = pproc.transform(lstFilesX)
pca = RandomizedPCA(n_components=2)
X = pca.fit_transform(X)
fig = pyplot.figure()
pyplot.plot(X[y==False, 0], X[y==False, 1], 'ro')
pyplot.plot(X[y==True, 0], X[y==True, 1], 'bo')
pyplot.title('2D Visualization, Digital LivDet 2011 Training, PCA only')
pyplot.show()

# LBP+PCA
pproc = PreProcess('LBP', 1, False, False, False, False, 1.0, None,
                   None, None, None, None, None, None, 'uniform', [7, 7], False)
X = pproc.transform(lstFilesX)
pca = RandomizedPCA(n_components=2)
X = pca.fit_transform(X)
""" Created on Wed May 27 07:11:18 2020 @author: rusha """ import models import shap import pathlib from preprocess import PreProcess import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score preproc = PreProcess() preproc.splitStandardize() X_train, _, y_train, y_test = models.Model('tree').getData() estimator = models.Model('tree').getXGBModel() estimator.fit(X_train, y_train) importances = estimator.feature_importances_ cwd = pathlib.Path().absolute() indices = np.argsort(importances)[::-1] cols = X_train.columns.tolist() std = np.std([tree.feature_importances_ for tree in estimator.estimators_], axis=0) tempcols = [] for f in range(X_train.shape[1]): print("%d. feature %s (%f)" % (f + 1, cols[indices[f]], importances[indices[f]]))
class NB():
    classification = {}
    number_of_docs = None
    vocab_unique = None
    classes = None
    inputData = None
    words = {}
    process = PreProcess()
    testing = {}
    train_file = None
    test_file = None

    def __init__(self, train=None, test=None, train_dir=None, test_dir=None):
        """
        :aim: Classify for either a file or a given directory
        :param train: Train file if a file is given
        :param test: Test file if a file is given
        :param train_dir: Train directory if a directory is given
        :param test_dir: Test directory if a directory is given
        """
        if train is not None or test is not None:
            self.train_file = train
            self.test_file = test
            self.number_of_docs = self.process.getNumberOfDocs(train)
            self.inputData = self.process.processFile(train)
            self.vocab_unique = self.process.createVocab(self.inputData)
            self.classes = self.process.getClasses(self.inputData)
        else:
            for i in os.listdir(train_dir + "neg"):
                self.number_of_docs += self.process.getNumberOfDocs(train)
                self.inputData += self.process.processFile(train)
                self.vocab_unique += self.process.createVocab(self.inputData)
                self.classes += self.process.getClasses(self.inputData)
            for i in os.listdir(train_dir + "pos"):
                self.number_of_docs += self.process.getNumberOfDocs(train)
                self.inputData += self.process.processFile(train)
                self.vocab_unique += self.process.createVocab(self.inputData)
                self.classes += self.process.getClasses(self.inputData)
            """
            for i in os.listdir(test_dir + "neg"):
            for i in os.listdir(test_dir + "pos"):
            """

    def classify(self):
        """
        :aim: Classify training
        :return: None
        """
        for c in self.classes:
            temp_dict = {}
            for word in self.vocab_unique:
                temp_dict[word] = 0
            self.classification[c] = temp_dict

        # keys = classes, values = count of words in document given class
        for key in self.classification:
            for line in self.inputData:
                if line[len(line) - 1] != key:
                    continue
                else:
                    for word in line:
                        if word not in self.classification[key]:
                            continue
                        (self.classification[key])[word] += 1

    def count_words(self):
        """
        :aim: Count words per class
        :return: None
        """
        for cl in self.classes:
            count = 0
            for k in self.classification[cl]:
                count += self.classification[cl][k]
            self.words[cl] = count

    def classify_smoothed(self):
        """
        :aim: Add-one smoothing for the word likelihoods
        :return: None
        """
        self.dict_likelihood = self.classification.copy()
        for cl in self.classes:
            for k in self.dict_likelihood[cl]:
                self.dict_likelihood[cl][k] = (self.dict_likelihood[cl][k] + 1) / float(self.words[cl] + len(self.vocab_unique))

    def test_probability(self):
        """
        :aim: Test data on training
        :return: None
        """
        # Number of documents, given a class
        for cl in self.classes:
            d = self.process.numberOfDocsGivenClass(cl, self.inputData)
            self.process.docs[cl] = d

        # prior probabilities
        self.priors = {}
        for cl in self.classes:
            if cl not in self.priors:
                self.priors[cl] = self.process.docs[cl] / float(self.number_of_docs)

        with open(self.test_file) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        test_data = content

        # add classes as keys to testing dict, values = prior probabilities
        for cl in self.classes:
            if cl not in self.testing:
                self.testing[cl] = self.priors[cl]

        for cl in self.classes:
            for w in test_data:
                for key in self.dict_likelihood[cl]:
                    if key != w:
                        continue
                    elif key == w:
                        self.testing[cl] *= self.dict_likelihood[cl][w]

        # Compute the most likely class
        self.result = max(self.testing, key=self.testing.get)

    def output(self):
        """
        :aim: Write final classification output to file
        :return: None
        """
        string = \
            "Total number of documents/lines in input: " + str(self.number_of_docs) + "\n" + \
            "Vocabulary of unique words: " + str(self.vocab_unique) + "\n" + \
            "Classes: " + str(self.classes) + "\n" + \
            "Count of words, given class: " + str(self.words) + "\n" + \
            "Word likelihoods with add-1 smoothing with respect to class: " + str(self.dict_likelihood) + "\n" + \
            "Prior probabilities: " + str(self.priors) + "\n" + \
            "Probabilities of test data: " + str(self.testing) + "\n" + \
            "The most likely class for the test document: " + self.result

        if self.train_file is not None:
            f = open("movies-small.txt", "w")
            f.write(string)
            f.close()
        else:
            # for multiple directory implementation
            pass
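A minimal usage sketch for the NB class above, not part of the original code: the file names are hypothetical placeholders, and the call order simply follows the methods defined in the class.

# Hypothetical usage; 'movies-train.txt' and 'movies-test.txt' are placeholder file names.
nb = NB(train="movies-train.txt", test="movies-test.txt")
nb.classify()           # per-class word counts
nb.count_words()        # total word count per class
nb.classify_smoothed()  # add-one smoothed likelihoods
nb.test_probability()   # priors, posteriors, and the most likely class
nb.output()             # write the report to movies-small.txt
print(nb.result)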