Example no. 1
def convert(src, tgt, txt, nativize, preoptions, postoptions):
    txt = PreProcess.PreProcess(txt, src, tgt)

    if 'siddhamUnicode' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamUnicode'
    if 'LaoNative' in postoptions and tgt == 'Lao':
        tgt = 'Lao2'
    if 'siddhamUnicode' in preoptions and src == 'Siddham':
        src = 'SiddhamUnicode'
    if 'egrantamil' in preoptions and src == 'Grantha':
        src = 'GranthaGrantamil'
    if 'egrantamil' in postoptions and tgt == 'Grantha':
        tgt = 'GranthaGrantamil'

    for options in preoptions:
        txt = getattr(PreProcess, options)(txt)

    transliteration = Convert.convertScript(txt, src, tgt)

    if nativize:
        transliteration = PostOptions.ApplyScriptDefaults(
            transliteration, src, tgt)
        if tgt != 'Tamil':
            transliteration = PostProcess.RemoveDiacritics(transliteration)
        else:
            transliteration = PostProcess.RemoveDiacriticsTamil(
                transliteration)

    for options in postoptions:
        transliteration = getattr(PostProcess, options)(transliteration)

    return transliteration
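As a quick illustration, a hypothetical call to this function could look like the sketch below; the script names, the empty option lists, and the placeholder text are assumptions for the example, not values taken from the original project.

# Hypothetical usage sketch for the convert() helper defined above.
# input_text is assumed to be a string already written in the source script.
input_text = '...'
result = convert('Devanagari', 'Tamil', input_text, True, [], [])
print(result)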
Example no. 2
    def preProc(self):
        dataCleaner = PreProcess(self.df)
        self.df = dataCleaner.df
        # alert the user that preprocessing has finished
        tkMessageBox.showinfo("K Means Clustering",
                              "Preprocessing completed successfully!")
Example no. 3
def convert(src, tgt, txt, nativize, preoptions, postoptions):
    txt = PreProcess.PreProcess(txt, src, tgt)

    if 'siddhammukta' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamDevanagari'
    if 'siddhamap' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamDevanagari'
    if 'siddhammukta' in preoptions and src == 'Siddham':
        src = 'SiddhamDevanagari'
    if 'LaoNative' in postoptions and tgt == 'Lao':
        tgt = 'Lao2'
    if 'egrantamil' in preoptions and src == 'Grantha':
        src = 'GranthaGrantamil'
    if 'egrantamil' in postoptions and tgt == 'Grantha':
        tgt = 'GranthaGrantamil'
    if 'nepaldevafont' in postoptions and tgt == 'Newa':
        tgt = 'Devanagari'
    if 'ranjanalantsa' in postoptions and tgt == 'Ranjana':
        tgt = 'Tibetan'
        nativize = False
    if 'ranjanawartu' in postoptions and tgt == 'Ranjana':
        tgt = 'Tibetan'
        nativize = False

    for options in preoptions:
        txt = getattr(PreProcess, options)(txt)

    transliteration = Convert.convertScript(txt, src, tgt)

    if nativize:
        transliteration = PostOptions.ApplyScriptDefaults(
            transliteration, src, tgt)
        if tgt != 'Tamil':
            transliteration = PostProcess.RemoveDiacritics(transliteration)
        else:
            transliteration = PostProcess.RemoveDiacriticsTamil(
                transliteration)

    for options in postoptions:
        transliteration = getattr(PostProcess, options)(transliteration)

    if src == "Tamil" and tgt == "IPA":
        r = requests.get("http://anunaadam.appspot.com/api?text=" + txt +
                         "&method=2")
        r.encoding = r.apparent_encoding
        transliteration = r.text

    return transliteration
Example no. 4
    def chooseFile(self, item):
        for index in range(self.listWidget.count()):
            if self.listWidget.item(index).text() == item.text():
                self.itemIndex = index
        preProcess = PreProcess.PreProcess()
        content = preProcess.getArticleContent(repertory + "/" + item.text())
        if self.method == 1:
            self.sents = preProcess.getSents(content)
            size = len(self.sents)
        else:
            size, self.sents = preProcess.getXMLsents(content)

        self.labelRest.setText('0/' + str(size))
        self.file = item.text()
        self.newSent = []
        self.pushButton_save.setDisabled(True)
Example no. 5
def RunPreprocess():

    print("---PreProcess")
    PreProcess.PreProcess()
    print("---PreProcess1")
    PreProcess1.PreProcess1()
    print("---PreProcess2")
    PreProcess2.PreProcess2()
    print("---PreProcess3")
    PreProcess3.PreProcess3()
    print("---PreProcess4,40")
    PreProcess4.PreProcess4(40)
    print("---PreProcess4,30")
    PreProcess4.PreProcess4(30)
    print("---PreProcess4Base,40")
    PreProcess4Base.PreProcess4Base(40)
    print("---PreProcess4Base,30")
    PreProcess4Base.PreProcess4Base(30)
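A hypothetical entry point for running this routine as a standalone script; the guard below is an assumption for illustration, not part of the original listing.

if __name__ == '__main__':
    RunPreprocess()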
Example no. 6
    float(line.rstrip('\n')) for line in open('csvData/train_label.data')
]

rawData = open('csvData/test.data', 'rb')
temp = np.loadtxt(rawData, delimiter=',')
testset = np.c_[np.ones(len(temp)), temp]

test_labels = [
    float(line.rstrip('\n')) for line in open('csvData/test_label.data')
]

###

# pre-process

PP = PreProcess.PreProcess(data, n_buckets=10,
                           func='boolean')  #,swap_labels=True)
data = PP.fit(data)
testset = PP.fit(testset)

data_labels = PP.processLabels(data_labels)
test_labels = PP.processLabels(test_labels)

# cross-validation

best_C = 2
best_ro = 0.01
best_accuracy = 0
best_epoch = 10
best_g0 = 1.001
''' 
for C in [4,2,0.5,0.25,0.125]:#,0.0625,0.03125]:
Example no. 7
]

f = open('csvData/test.data')
temp = []
for line in f:
    temp.append(line.rstrip().split(','))
testset = temp

test_labels = [
    float(line.rstrip('\n')) for line in open('csvData/test_label.data')
]
###

# pre-process

PP = PreProcess.PreProcess(data, n_buckets=5)
data = PP.fit(data)
testset = PP.fit(testset)

# cross-validation

best_depth = 3
best_f1 = 0
for d in [10, 20, 30, 40, 50, float('inf')]:
    tmp = []

    dt = DecisionTree.DecisionTree(max_depth=d)
    kfold = KFold.KFold(n_splits=5)

    for kf in kfold.split(data):
        train = [data[i] for i in kf[0]]
Example no. 8
 def processing(self, event, source_object=None):
     self.pushButton_choose.setDisabled(True)
     self.pushButton_keep.setVisible(True)
     self.pushButton_remove.setVisible(True)
     self.listWidget.setDisabled(True)
     global index
     preProcess = PreProcess.PreProcess()
     if source_object.objectName() == "pushButton_choose":
         index = 0
         self.textEdit.clear()
         if self.method == 1:
             sent = self.sents[index]
         else:
             sent = preProcess.getXMLtext(self.sents[index][0])
             if self.sents[index][1] == 0:
                 self.pushButton_skip.setVisible(True)
                 self.pushButton_keep.setVisible(False)
                 self.pushButton_remove.setVisible(False)
             else:
                 self.pushButton_skip.setVisible(False)
                 self.pushButton_keep.setVisible(True)
                 self.pushButton_remove.setVisible(True)
         self.textEdit.setText(sent)
     else:
         sent = ""
         if self.method == 1:
             sent = self.sents[index]
         else:
             sent = preProcess.getXMLtext(self.sents[index][0])
             if self.sents[index][1] == 0:
                 self.pushButton_skip.setVisible(True)
                 self.pushButton_keep.setVisible(False)
                 self.pushButton_remove.setVisible(False)
             else:
                 self.pushButton_skip.setVisible(False)
                 self.pushButton_keep.setVisible(True)
                 self.pushButton_remove.setVisible(True)
         self.textEdit.setText(sent)
         self.labelRest.setText(str(index + 1) + '/' + str(len(self.sents)))
         if source_object.objectName() == "pushButton_keep":
             if self.method == 1:
                 self.newSent.append("<source id='" + str(index + 1) +
                                     "' operation='S'>\n" + sent +
                                     "\n</source>")
             else:
                 self.newSent.append("<source id='" + str(index + 1) +
                                     "' operation='S'>" + sent +
                                     "</source>")
         elif source_object.objectName() == "pushButton_remove":
             if self.method == 1:
                 self.newSent.append("<source id='" + str(index + 1) +
                                     "' operation='R'>\n" + sent +
                                     "\n</source>")
             else:
                 self.newSent.append("<source id='" + str(index + 1) +
                                     "' operation='R'>" + sent +
                                     "</source>")
         elif source_object.objectName() == "pushButton_skip":
             self.newSent.append(str(self.sents[index][0]))
         index += 1
         if index < len(self.sents):
             if self.method == 1:
                 sent = self.sents[index]
             else:
                 sent = preProcess.getXMLtext(self.sents[index][0])
                 if self.sents[index][1] == 0:
                     self.pushButton_skip.setVisible(True)
                     self.pushButton_keep.setVisible(False)
                     self.pushButton_remove.setVisible(False)
                 else:
                     self.pushButton_skip.setVisible(False)
                     self.pushButton_keep.setVisible(True)
                     self.pushButton_remove.setVisible(True)
             self.textEdit.setText(sent)
         if index >= len(self.sents):
             self.pushButton_save.setDisabled(False)
             self.pushButton_keep.setVisible(False)
             self.pushButton_remove.setVisible(False)
             self.pushButton_skip.setVisible(False)
Example no. 9
            predict_tmp = win.predict(test)
            tmp.append(Stat.F1_Score(predict_tmp,test_label))
                      
        if np.mean(tmp) > best_f1:
            best_f1 = np.mean(tmp)
            best_bucket = b
            best_param = p
                                              
            print("Best result so far >>",best_f1,best_bucket,best_param)
          
print("best bucket:", best_bucket)
print("best param:" , best_param)
'''
###

PP = PreProcess.PreProcess(data, n_buckets=best_bucket, func='boolean')

data = PP.fit(data)
testset = PP.fit(testset)

#data_labels = PP.processLabels(data_labels)
#test_labels = PP.processLabels(test_labels)

unbalanced = Winnow.Winnow(param=best_param)
unbalanced.fit(data, data_labels)

predictTrain = unbalanced.predict(data)
predictTest = unbalanced.predict(testset)

print("unbalanced:")
Example no. 10
# coding: utf-8

# In[6]:

import numpy as np
import pandas as pd
import pickle
from PreProcess import *

SavedModelFile = "model.pkl"
filePath = "test_potus_by_county.csv"
X = pd.read_csv(filePath)
X = PreProcess(X)

try:
    with open(SavedModelFile, 'rb') as f:
        model = pickle.load(f)
except FileNotFoundError:
    # catch only the missing-model case instead of a bare except
    print("Did not find a saved model, please run build_model.py")
    raise SystemExit(1)

predict = [pred for pred in model.predict(X)]

with open('predictions.csv', 'w+') as f:
    f.write("Winner\n")
    for pred in predict:
        f.write(pred + "\n")
Example no. 11
parser = argparse.ArgumentParser()
parser.add_argument('--morning', help='Scheduled capture time for the morning, as H:M:S',
                    default='17:47:0')
parser.add_argument('--afternoon', help='Scheduled capture time for the afternoon, as H:M:S',
                    default='11:38:0')
parser.add_argument('--evening', help='Scheduled capture time for the evening, as H:M:S',
                    default='11:39:0')

args = parser.parse_args()
set_time = [args.morning, args.afternoon, args.evening]
name = ''

while True:
    files = os.listdir('input')
    preProcess = PreProcess()
    index_new = preProcess.nameImage()
    d = datetime.now()
    cur_time = str(d.hour) + ':' + str(d.minute) + ':' + str(d.second)
    #capture
    for i in range(len(set_time)):
        if cur_time == set_time[i]:
            preProcess.captureImage()
            
    #resize & name
    if len(files)>0:
        for file in files:
            sleep(3)
            test = PreProcess(file=file, name=index_new)
            test.preImages()
            index_new = index_new +1
Example no. 12
]

f = open('csvData/test.data')
temp = []
for line in f:
    temp.append(line.rstrip().split(','))
testset = temp

test_labels = [
    float(line.rstrip('\n')) for line in open('csvData/test_label.data')
]
###

# pre-process

PP = PreProcess.PreProcess(data, func='boolean', n_buckets=5)
data = PP.fit(data)
testset = PP.fit(testset)
'''
data = [[0,0,1,1],[0,0,1,0],[0,0,0,0],[1,0,1,0],[1,0,1,1],[1,1,1,1]]
data_labels = [0,1,2,3,4,5]
testset = [[1,0,1,1]]
'''
# cross-validation

best_norm = 2
best_K = 3
best_f1 = 0
'''
for k in [1,3,5,7]:
    for p in [1,2,3]:
Example no. 13
        (x_train, x_test) = SplitSet(x, fold_idx)

        if validation == True:
            y_predictions = np.zeros((y_test.shape[0], 1), dtype=np.int32)
            for sample_proportion_idx in range(int(SAMPLE_PROPORTION_SIZE)):
                sample_proportion = SAMPLE_PROPORTION_LOWER + sample_proportion_idx * SAMPLE_PROPORTION_INTERVAL
                for attribute_size_idx in range(VALIDATION_ATTRIBUTE_SIZE):
                    attribute_size = VALIDATION_ATTRIBUTE_SIZE_LOWER + attribute_size_idx * VALIDATION_ATTRIBUTE_SIZE_INTERVAL
                    forestList = []
                    for class_idx in range(NUM_CLASSES):
                        class_num = class_idx + 1
                        forestList.append([])
                        for tree_idx in range(NUM_TREES_IN_FOREST):
                            x_sample, y_sample = sample(
                                x_train, y_train, sample_proportion)
                            train_targets = PreProcess(y_sample, class_num)
                            attribute_set = sample_attributes(
                                range(x.shape[1]), attribute_size)

                            tree = Decision_Tree_Learning(
                                x_sample, attribute_set, train_targets)

                            forestList[class_idx].append(tree)

                    vote_block = np.zeros((x_test.shape[0], NUM_CLASSES))
                    for forest_idx in range(NUM_CLASSES):
                        choices = decision_forest_vote(forestList[forest_idx],
                                                       x_test)
                        vote_block[:, forest_idx] = choices

                    y_test_predictions = np.zeros((x_test.shape[0], 1))
Example no. 14
    with io.open(filename, 'w', encoding="utf-8") as csvfile:
        # creating a csv writer object
        csvwriter = csv.writer(csvfile)

        # writing the fields
        csvwriter.writerow(fields)

        # writing the data rows
        csvwriter.writerows(rows)
    return None


if __name__ == '__main__':
    # arr = {1:2,3:4,5:1}
    # print(max(arr.values()))
    preProcess = PreProcess.PreProcess()
    eval = Evaluation.Evaluation()
    #
    for n in range(40, 60, 2):
        kmeans = Kmaens.Kmeans(n, preProcess.vectorize_tf_idf())
        print(eval.purity(n, kmeans.y, preProcess.labels))

    # data_vectors_tf_idf = preProcess.vectorize_tf_idf()
    # data_vectors_wv = preProcess.word2wec()
    # optimal_n = len(set(preProcess.labels))

    # # Gaussian Mixture Model
    # print("Gaussian Mixture Model(tf-idf):")
    # gmm = GMMCluster.GMMCluster(data_vectors_tf_idf[:100], 5)
    # cluster = gmm.cluster("tf-idf")
    # print("ARI= ", eval.adjusted_rand_index(preProcess.labels[:100], cluster))
Example no. 15
stacked_sample_error_occ = 0
stacked_sample_error_depth = 0
stacked_sample_error_depth_min = 0
stacked_sample_error_occ = 0

for fold_idx in range(0, NUM_FOLDS):  # fold
    fold_num = fold_idx + 1
    (y_train_validate, y_test) = SplitSet(y, fold_idx)
    y_predictions = np.zeros((y_test.shape[0], 1), dtype=np.int32)
    (x_train_validate, x_test) = SplitSet(x, fold_idx)
    treeList = []
    occurrences = CountOccurrence(y_train_validate, NUM_CLASSES)

    for tree_class_idx in range(0, NUM_CLASSES):
        class_num = tree_class_idx + 1
        train_targets = PreProcess(y_train_validate, class_num)
        tree = Decision_Tree_Learning(x_train_validate,
                                      range(x_train_validate.shape[1]),
                                      train_targets, None)
        # if  class_num != 1:
        #     tree = Decision_Tree_Learning(x_train_validate,range(x_train_validate.shape[1]),train_targets,max_depth)
        # else:
        #     tree = Decision_Tree_Learning(x_train_validate,range(x_train_validate.shape[1]),train_targets,10)
        treeList.append(tree)

    predictions_by_depth = TestTreesByDepth(treeList, x_test,
                                            ambiguityHandlingStyle)
    predictions_by_min_depth = TestTreesByMinDepth(treeList, x_test,
                                                   ambiguityHandlingStyle)
    predictions_by_occ = TestTreesByOccurrence(treeList, occurrences, x_test,
                                               ambiguityHandlingStyle)
Example no. 16
    def analyzing(self):

        CWD_PATH = os.getcwd()
        output_path = os.path.join(CWD_PATH, 'analyze')
        preProcess = PreProcess()
        index = preProcess.nameImage('analyze')

        # If both an image AND a folder are specified, throw an error
        if (self.IM_NAME and self.IM_DIR):
            print('you can only use IM_NAME OR IM_DIR')
            sys.exit()

        # If neither an image or a folder are specified, default to using 'test1.jpg' for image name
        if (not self.IM_NAME and not self.IM_DIR):
            self.IM_DIR = 'new'

        # Import TensorFlow libraries
        # If tensorflow is not installed, import interpreter from tflite_runtime, else import from regular tensorflow
        # If using Coral Edge TPU, import the load_delegate library
        pkg = importlib.util.find_spec('tensorflow')
        if pkg is None:
            from tflite_runtime.interpreter import Interpreter
        else:
            from tensorflow.lite.python.interpreter import Interpreter

        # Get path to current working directory

        # Define path to images and grab all image filenames
        if self.IM_DIR:
            PATH_TO_IMAGES = os.path.join(CWD_PATH, self.IM_DIR)
            images = glob.glob(PATH_TO_IMAGES + '/*')

        elif self.IM_NAME:
            PATH_TO_IMAGES = os.path.join(CWD_PATH, self.IM_NAME)
            images = glob.glob(PATH_TO_IMAGES)

        # Path to .tflite file, which contains the model that is used for object detection
        PATH_TO_CKPT = os.path.join(CWD_PATH, self.MODEL_NAME, self.GRAPH_NAME)

        # Path to label map file
        PATH_TO_LABELS = os.path.join(CWD_PATH, self.MODEL_NAME,
                                      self.LABELMAP_NAME)

        # Load the label map
        with open(PATH_TO_LABELS, 'r') as f:
            labels = [line.strip() for line in f.readlines()]

        # Have to do a weird fix for label map if using the COCO "starter model" from
        # https://www.tensorflow.org/lite/models/object_detection/overview
        # First label is '???', which has to be removed.
        if labels[0] == '???':
            del labels[0]

        # Load the Tensorflow Lite model.
        interpreter = Interpreter(model_path=PATH_TO_CKPT)

        interpreter.allocate_tensors()

        # Get model details
        input_details = interpreter.get_input_details()
        output_details = interpreter.get_output_details()
        height = input_details[0]['shape'][1]
        width = input_details[0]['shape'][2]
        print(width, height)

        floating_model = (input_details[0]['dtype'] == np.float32)

        input_mean = 127.5
        input_std = 127.5

        # Loop over every image and perform detection
        for image_path in images:
            leaf = flower = melon = 0
            # Load image and resize to expected shape [1xHxWx3]
            image = cv2.imread(image_path)
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            imH, imW, _ = image.shape
            image_resized = cv2.resize(image_rgb, (width, height),
                                       interpolation=cv2.INTER_AREA)
            input_data = np.expand_dims(image_resized, axis=0)

            # Normalize pixel values if using a floating model (i.e. if model is non-quantized)
            if floating_model:
                input_data = (np.float32(input_data) - input_mean) / input_std

            # Perform the actual detection by running the model with the image as input
            interpreter.set_tensor(input_details[0]['index'], input_data)
            interpreter.invoke()

            # Retrieve detection results
            boxes = interpreter.get_tensor(output_details[0]['index'])[
                0]  # Bounding box coordinates of detected objects
            classes = interpreter.get_tensor(output_details[1]['index'])[
                0]  # Class index of detected objects
            scores = interpreter.get_tensor(output_details[2]['index'])[
                0]  # Confidence of detected objects
            #num = interpreter.get_tensor(output_details[3]['index'])[0]  # Total number of detected objects (inaccurate and not needed)

            # Loop over all detections and draw detection box if confidence is above minimum threshold
            for i in range(len(scores)):
                if ((scores[i] > self.min_conf_threshold)
                        and (scores[i] <= 1.0)):
                    # Get bounding box coordinates and draw box
                    # Interpreter can return coordinates that are outside of image dimensions, need to force them to be within image using max() and min()
                    ymin = int(max(1, (boxes[i][0] * imH)))
                    xmin = int(max(1, (boxes[i][1] * imW)))
                    ymax = int(min(imH, (boxes[i][2] * imH)))
                    xmax = int(min(imW, (boxes[i][3] * imW)))

                    cv2.rectangle(image, (xmin, ymin), (xmax, ymax),
                                  (10, 255, 0), 2)

                    # Draw label
                    object_name = labels[int(
                        classes[i]
                    )]  # Look up object name from "labels" array using class index
                    label = '%s: %d%%' % (object_name, int(scores[i] * 100)
                                          )  # Example: 'person: 72%'
                    labelSize, baseLine = cv2.getTextSize(
                        label, cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                        2)  # Get font size
                    label_ymin = max(
                        ymin, labelSize[1] + 10
                    )  # Make sure not to draw label too close to top of window
                    cv2.rectangle(
                        image, (xmin, label_ymin - labelSize[1] - 10),
                        (xmin + labelSize[0], label_ymin + baseLine - 10),
                        (255, 255, 255),
                        cv2.FILLED)  # Draw white box to put label text in
                    cv2.putText(image, label, (xmin, label_ymin - 7),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0),
                                2)  # Draw label text
                    if (object_name == 'leaf'):
                        leaf = leaf + 1
                    elif (object_name == 'flower'):
                        flower = flower + 1
                    else:
                        melon = melon + 1
            # All the results have been drawn on the image, now display the image
            print('image', index, ':')
            print('leaf:', leaf)
            print('flower:', flower)
            print('melon:', melon)
            uploadToFirebase = DbFirebase(leaves=leaf,
                                          flowers=flower,
                                          melons=melon)
            uploadToFirebase.add()
            cv2.imshow('Object detector', image)
            out = os.path.join(output_path, str(index) + ".jpg")
            cv2.imwrite(out, image)

            index = index + 1
            # brief pause so the image window can refresh before the next image
            cv2.waitKey(1)
        preProcess.moveImage()
        # Clean up
        cv2.destroyAllWindows()