import gzip
import os
import pickle

import numpy as np
import sklearn.neighbors as skl
from sklearn.ensemble import RandomForestClassifier

# project-local modules (assumed importable from the repository)
import feature_generation
import inkml_parser
import preprocessing
from inkml_parser import parse_ink_dir  # assumed location of parse_ink_dir


def parse(dir, out_fname):
    '''
    Parse all the .inkml files in "dir" and create a CSV file (out_fname)
    containing one symbol per line.
    :param dir: directory containing .inkml files
    :param out_fname: the .csv file to which the program writes all samples' features
    :return:
    '''

    # get parsed samples dictionary: {"label_id": {"strokes": strokes, "label": label} }
    samples = parse_ink_dir(dir)

    with open(out_fname, 'w') as records:

        for label_id in samples:

            sample = samples[label_id]

            # preprocessing:
            # remove duplicates, smoothing, size normalization, resampling
            preprocessing.process_sample(sample)

            # generate features for every point in the sample
            # cosine of vicinity, normalized y-coord, sin of curvature
            features = feature_generation.gen_feature_vector(sample)

            for x in features:
                records.write(str(x) + ',')

            records.write(label_id + ',')
            records.write(sample['label'])
            records.write('\n')
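

# Usage sketch for parse(); the directory and output filename below are
# hypothetical examples, assuming a folder of .inkml files:
#
#   parse('trainingSymbols/', 'train_features.csv')
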
def get_symbol_label(clf, strokes):
    '''
    Gets the label as predicted by the symbol classifier
    :param clf: classifier model
    :param strokes: a list of strokes representing the symbol
    :return: predicted label
    '''
    strokes_lst = list()

    for s in strokes:
        strokes_lst.append({"x": s[:, 0], "y": s[:, 1]})

    # get symbol features
    preprocessing.resample(strokes_lst)
    feature_vec = feature_generation.gen_feature_vector(strokes_lst)

    # possible class values
    # class_lst = clf.classes_
    # print(class_lst)

    cls = clf.predict([feature_vec])

    # handle a predicted comma: map ',' to 'COMMA' so it does not break CSV output
    if cls[0] == ",":
        cls[0] = "COMMA"

    return cls[0]
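
# Usage sketch for get_symbol_label(); the model path follows
# train_classifier's "<model>_<ds_type>.pklz" naming and the stroke
# coordinates are made up. Each stroke is an Nx2 numpy array of (x, y) points:
#
#   with gzip.open('rf_v.pklz', 'rb') as f:
#       clf = pickle.load(f)
#   stroke = np.array([[0.0, 0.0], [0.5, 1.0], [1.0, 0.0]])
#   print(get_symbol_label(clf, [stroke]))
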
def rf_test(model_pickle, abs_dir_path, symbols_to_test):
    '''
    Test the Random Forest model using the .inkml symbols in file "symbols_to_test"
    :param model_pickle: The pickled Random Forest model
    :param abs_dir_path: absolute path to a parent directory where all the .inkml files (valid, junk, other) can be found
    :param symbols_to_test: a .csv file containing the filenames (.inkml) of the symbols to test
    :return:
    '''

    # models saved by train_classifier are gzip-compressed pickles (.pklz)
    with gzip.open(model_pickle, 'rb') as rf_pickle:
        rf = pickle.load(rf_pickle)

    # read all the files in the directory and its subdirectories recursively
    # and build a {filename: absolute path} dictionary
    file_dictionary = dict()

    for (dirpath, dirnames, filenames) in os.walk(abs_dir_path):

        for file in filenames:
            file_dictionary[file] = os.path.join(dirpath, file)

    with open(symbols_to_test, 'r') as symbols_list:

        features = []
        label_ids = []

        # read each symbol file, parse, process, generate features
        for line in symbols_list:
            fname = line.strip()

            abs_fpath = file_dictionary[fname]

            data = inkml_parser.parse_file(abs_fpath)

            if data is not None:
                strokes = data["stroke_list"]
                ui = data["label_id"]

                preprocessing.process_sample(strokes)
                feature_vec = feature_generation.gen_feature_vector(strokes)

                features.append(feature_vec)
                label_ids.append(ui)

        test_features = np.array(features)

        class_lst = rf.classes_

        # predict the probability of this sample being in class 1, class 2 ...
        cls_probabilities = rf.predict_proba(test_features)

        # sort probabilities in descending order and get the class indices in cls_list
        indices = np.argsort(-cls_probabilities)

        # retain the top 30 predictions for all samples
        indices = indices[:, :30]  # index in the class list

        n_r, n_c = indices.shape

        inp_fname = os.path.split(symbols_to_test)[-1]
        with open('rf_output_' + inp_fname, 'w') as results_file:

            for r in range(n_r):
                # write the label id for this sample
                results_file.write(label_ids[r] + ',')

                unique_labels = []

                # collect the top ten unique predicted labels
                for c in range(n_c):
                    idx = indices[r, c]
                    # class_lst[idx] is the predicted class label
                    label = class_lst[idx]

                    # only keep unique labels for the results file
                    if label not in unique_labels:
                        unique_labels.append(label)

                    if len(unique_labels) == 10:
                        break

                results_file.write(','.join(unique_labels))

                results_file.write('\n')
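
# Usage sketch for rf_test(); the pickle name follows train_classifier's
# "<model>_<ds_type>.pklz" convention, and the directory/CSV paths are
# hypothetical. Results land in 'rf_output_<csv name>':
#
#   rf_test('rf_v.pklz', '/data/CROHME/', 'test_symbols.csv')
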
def train_classifier(model, ds_type, gt_fname, abs_dir_path, symbols_to_train):
    '''
    Trains the specified model (kdtree or random forest) using the symbols listed in the "symbols_to_train" file
    :param model: string - 'kdtree' or 'rf' (random forest)
    :param ds_type: string - 'v' for valid symbols or 'v_j' for valid+junk symbols
    :param gt_fname: name of the file containing "label_id,label" records (the ground truth for the symbols in "symbols_to_train")
    :param abs_dir_path: absolute path to the directory where the .inkml files are stored (it can be a root directory)
    :param symbols_to_train: .csv file containing the .inkml filenames of the symbols on which the model is trained
    :return:
    '''

    # read the ground truth file
    with open(gt_fname, 'r') as gt_file:
        label_map = {}
        for line in gt_file:

            # a label that is itself ',' splits into three fields
            fields = line.strip().split(",")
            if len(fields) == 3:
                label_id, label = fields[0], ","
            else:
                label_id, label = fields
            label_map[label_id] = label

    # read all the files in the directory and its subdirectories recursively
    # and build a {filename: absolute path} dictionary
    file_dictionary = dict()

    for (dirpath, dirnames, filenames) in os.walk(abs_dir_path,
                                                  followlinks=True):

        for file in filenames:
            file_dictionary[file] = os.path.join(dirpath, file)

    with open(symbols_to_train, 'r') as symbols_list:

        features = []
        label_ids = []
        labels = []

        for line in symbols_list:
            fname = line.strip()

            if not fname.endswith('.inkml'):
                continue

            # find the file's absolute path from the dictionary of all files
            abs_fpath = file_dictionary[fname]

            data = inkml_parser.parse_file(abs_fpath)

            strokes = data["stroke_list"]
            ui = data["label_id"]

            preprocessing.process_sample(strokes)
            feature_vec = feature_generation.gen_feature_vector(strokes)

            features.append(feature_vec)
            label_ids.append(ui)
            labels.append(label_map[ui])

        print('Data cleaning and prep complete')
        X = np.array(features)
        classifier_model = None
        if model == 'kdtree':
            # leaf_size is the number of points at which the KDTree
            # switches to brute-force search
            tree = skl.KDTree(X, leaf_size=60)  # build the tree
            classifier_model = tree

        elif model == 'rf':
            print('Training RandomForest classifier')
            rf = RandomForestClassifier(n_estimators=100,
                                        max_depth=20,
                                        min_samples_split=10)
            rf.fit(X, labels)
            classifier_model = rf
            print('Training complete')

        else:
            raise ValueError("model must be 'kdtree' or 'rf'")

        # store the trained model as a gzipped pickle (e.g. rf_v.pklz)
        output_pickle_fname = model + '_' + ds_type + '.pklz'
        with gzip.open(output_pickle_fname, 'wb') as model_file:
            pickle.dump(classifier_model, model_file)

        print('Trained', model, 'classifier model stored in file:',
              output_pickle_fname)
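
# Usage sketch for train_classifier(); the ground-truth and split filenames
# are hypothetical. This trains a Random Forest on valid symbols and writes
# 'rf_v.pklz':
#
#   train_classifier('rf', 'v', 'iso_GT.txt', '/data/CROHME/', 'train_symbols.csv')
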
def kdtree_test(model_pickle, tr_gt, abs_dir_path, symbols_to_test):
    '''
    Test the KDTree model using the .inkml symbols in file "symbols_to_test"
    :param model_pickle: The pickled KDTree model
    :param tr_gt: ground truth file of the training symbols which were used to train this model
    :param abs_dir_path: absolute path to a parent directory where all the .inkml files (valid, junk, other) can be found
    :param symbols_to_test: a .csv file containing the filenames (.inkml) of the symbols to test
    :return:
    '''

    # models saved by train_classifier are gzip-compressed pickles (.pklz)
    with gzip.open(model_pickle, 'rb') as kdtree_model:
        tree = pickle.load(kdtree_model)

    # NOTE: assumes tr_gt lists symbols in the order used to build the tree
    training_labels = []
    with open(tr_gt, 'r') as training_GT:
        for line in training_GT:
            fields = line.strip().split(',')
            # a label that is itself ',' splits into three fields
            training_labels.append(',' if len(fields) == 3 else fields[1])

    num_neighbours = 30

    # read all the files in the directory and its subdirectories recursively
    # and build a {filename: absolute path} dictionary
    file_dictionary = dict()

    for (dirpath, dirnames, filenames) in os.walk(abs_dir_path):

        for file in filenames:
            file_dictionary[file] = os.path.join(dirpath, file)

    with open(symbols_to_test, 'r') as symbols_list:

        features = []
        label_ids = []

        for line in symbols_list:
            fname = line.strip()

            abs_fpath = file_dictionary[fname]

            data = inkml_parser.parse_file(abs_fpath)

            if data is not None:
                strokes = data["stroke_list"]
                ui = data["label_id"]

                preprocessing.process_sample(strokes)
                feature_vec = feature_generation.gen_feature_vector(strokes)

                features.append(feature_vec)
                label_ids.append(ui)

        test_features = np.array(features)

        # query the tree for k nearest neighbors
        dist, indices = tree.query(test_features, k=num_neighbours)

        n_r, n_c = indices.shape

        inp_fname = os.path.split(symbols_to_test)[-1]
        with open('kdtree_output_' + inp_fname, 'w') as results_file:

            for r in range(n_r):

                # write the label id for this sample
                results_file.write(label_ids[r] + ',')

                unique_labels = []

                # collect the top ten unique labels among the neighbours
                for c in range(n_c):
                    idx = indices[r, c]
                    # training_labels[idx] is the label of the idx-th training sample
                    label = training_labels[idx]

                    if label not in unique_labels:
                        unique_labels.append(label)

                    if len(unique_labels) == 10:
                        break

                results_file.write(','.join(unique_labels))

                results_file.write('\n')
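
# Usage sketch for kdtree_test(); tr_gt must be the ground truth used to
# train the pickled tree (same symbol order). Paths are hypothetical; results
# land in 'kdtree_output_<csv name>':
#
#   kdtree_test('kdtree_v.pklz', 'iso_GT.txt', '/data/CROHME/', 'test_symbols.csv')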