Example 1
import os

import bs4 as bs
import pandas as pd

# Preprocessing and extract_features are project-local helpers assumed
# to be importable from this package.


def read_files(directory):
    """
    Read every InkML file in a directory, extract features for each
    symbol, and save the resulting feature matrix to a CSV file.

    :param directory: path to a directory of .inkml files
    :return: None; writes <directory name>.csv to the working directory
    """

    files = os.listdir(directory)
    print(len(files))
    pre = Preprocessing()
    feature_matrix = []
    total = len(files)
    completed = 0
    gt_c = 0
    for file in files:
        print("Processing file : ", file, " Remaining files : ", total - completed, " Completed files : ", completed)
        with open(os.path.join(directory, file)) as f:
            # html.parser lowercases tag names, so <traceGroup> is
            # matched as 'tracegroup'
            soup = bs.BeautifulSoup(f, 'html.parser')
        trace_groups = soup.find_all('tracegroup')

        # skip the first tracegroup, which wraps the whole expression
        for tracegroup in trace_groups[1:]:
            traceview = tracegroup.find_all('traceview')
            trace_id = []
            for t in traceview:
                trace_id.append(t['tracedataref'])

            gt = tracegroup.annotation.text
            gt_c += 1
            X = []
            Y = []

            for id in trace_id:
                traces = soup.find_all("trace", {'id': id})
                for trace in traces:
                    coords = trace.text.strip().split(",")
                    x = []
                    y = []
                    for coord in coords:
                        trace_parts = coord.strip().split(' ')
                        x.append(float(trace_parts[0]))
                        y.append(float(trace_parts[1]))

                    X.extend(x)
                    Y.extend(y)

            X, Y = pre.dopreprocess(x=X, y=Y)
            ar = pre.get_aspect(X, Y)
            pen = len(trace_id)  # number of pen strokes in the symbol
            feature_matrix.append(extract_features(X, Y, pen, ar, key=gt))
        completed += 1

    df = pd.DataFrame(feature_matrix)
    print("Shape of Matrix ", df.shape, " Total Ground truths in file", gt_c)
    # name the CSV after the first component of the directory path
    name = directory.strip().split("/")[0]
    df.to_csv(name + ".csv", index=False)
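The coordinate extraction above assumes each <trace> element holds comma-separated points, with each point a space-separated x/y pair. A minimal, self-contained sketch of just that step (the inline InkML fragment is invented for illustration):

import bs4 as bs

# Made-up InkML fragment with a single trace, for illustration only.
sample = """
<ink>
  <trace id="0">123 456, 130 460, 142 471</trace>
</ink>
"""

soup = bs.BeautifulSoup(sample, 'html.parser')
trace = soup.find("trace", {'id': "0"})

x, y = [], []
for coord in trace.text.strip().split(","):
    parts = coord.strip().split(' ')
    x.append(float(parts[0]))
    y.append(float(parts[1]))

print(x)  # [123.0, 130.0, 142.0]
print(y)  # [456.0, 460.0, 471.0]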
Example 2
import os
import pickle
import time

import bs4 as bs
import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Preprocessing, Symbol, line_of_sight, edmonds, and write_to_lg are
# project-local helpers assumed to be importable from this package.


def perfectly_segmented_parser(ink_dir, bonus=False):
    """
    Parse perfectly segmented symbols from InkML files, classify the
    spatial relation between each pair of symbols, and write the
    result to label graph (.lg) files.

    :param ink_dir: directory of .inkml files
    :param bonus: if True, load the bonus relation classifier
    :return: None; writes one .lg file per input file
    """
    start = time.time()

    lg_dir = ink_dir.strip().split("/")[0] + "_output_lg"

    if not os.path.exists(lg_dir):
        os.mkdir(lg_dir)

    ink_files = os.listdir(ink_dir)

    if bonus:
        print("Loaded Bonus classifier")
        clf = joblib.load("relation_classifier_bonus.pkl")
    else:
        print("Loaded relationship classifier")
        clf = joblib.load('relation_classifier4.pkl')
    pre = Preprocessing()
    total = len(ink_files)
    c = 0
    gt_c = 0

    for file in ink_files:
        print("Processing file : ", file, " Files remaining : ", total - c,
              " Files completed : ", c)

        with open(os.path.join(ink_dir, file)) as f:
            soup = bs.BeautifulSoup(f, 'html.parser')
        trace_groups = soup.find_all('tracegroup')
        symbol_list = []

        # loop to isolate symbols; the first tracegroup wraps the
        # whole expression, so it is skipped
        for tracegroup in trace_groups[1:]:
            traceview = tracegroup.find_all('traceview')
            trace_id = []

            # loop to get the stroke ids in a single symbol
            for t in traceview:
                trace_id.append(t['tracedataref'])

            gt = tracegroup.annotation.text
            gt_c += 1
            X = []
            Y = []

            # extract stroke coordinates
            for id in trace_id:
                traces = soup.find_all("trace", {'id': id})
                for trace in traces:
                    coords = trace.text.strip().split(",")
                    x = []
                    y = []
                    for coord in coords:
                        trace_parts = coord.strip().split(' ')
                        x.append(float(trace_parts[0]))
                        y.append(float(trace_parts[1]))

                    X.append(x)
                    Y.append(y)

            X, Y = pre.dopreprocess(x=X, y=Y, parser=True)
            if gt == ",":
                gt = "COMMA"
            sym_obj = Symbol(x=X, y=Y, label=gt, stroke_id=trace_id)
            symbol_list.append(sym_obj)

        symbol_count = {}

        # count each symbol label and tag the symbol with its running count
        for sym in symbol_list:
            symbol_count[sym.symbol] = symbol_count.get(sym.symbol, 0) + 1
            sym.sym_ct = symbol_count[sym.symbol]

        # build the line-of-sight graph and classify its edges
        graph, labels = line_of_sight(symbol_list, clf)
        # run Edmonds' algorithm on the line-of-sight graph
        relations = edmonds(graph)

        # write the result to a .lg file
        write_to_lg(file=file,
                    symbol_list=symbol_list,
                    relations=relations,
                    labels=labels,
                    lg_dir=lg_dir)

        c += 1
    print("System executed in ", (time.time() - start) / 60, " minutes.")


def train(ink_dir, lg_dir):
    """
    Train the relation classifier from InkML files and their label
    graph (.lg) ground truth.

    :param ink_dir: directory of .inkml files
    :param lg_dir: directory of the corresponding .lg files
    :return: None; saves the trained model to relation_classifier_bonus.pkl
    """
    lg_files = os.listdir(lg_dir)
    pre = Preprocessing()

    feature_matrix = []
    targets = []
    c = 0
    total = len(lg_files)
    for file in lg_files:
        print(file, total - c, c)
        symbols = {}

        # "O" lines in a .lg file define objects:
        #   O, object id, label, weight, stroke ids...
        with open(os.path.join(lg_dir, file)) as f:
            for line in f:
                if line.startswith("O"):
                    filt_line = line.strip().split(",")
                    symbols[filt_line[1].strip()] = [
                        filt_line[2], filt_line[4:]
                    ]

        inkml_file = file.replace(".lg", ".inkml")

        with open(os.path.join(ink_dir, inkml_file)) as f:
            soup = bs.BeautifulSoup(f, 'html.parser')
            for key in symbols:
                label = symbols[key][0]
                strokes = symbols[key][1]
                id_list = []
                X = []
                Y = []
                for id in strokes:
                    st_id = id.strip()
                    trace = soup.find_all("trace", {'id': st_id})

                    coords = trace[0].text.strip().split(",")
                    x = []
                    y = []
                    for coord in coords:
                        trace_parts = coord.strip().split(' ')
                        x.append(float(trace_parts[0]))
                        y.append(float(trace_parts[1]))

                    X.append(x)
                    Y.append(y)
                    id_list.append(st_id)
                X, Y = pre.dopreprocess(x=X, y=Y, parser=True)
                symbols[key] = Symbol(label=label, x=X, y=Y, stroke_id=id_list)

        # "EO" lines in a .lg file define labelled edges between objects:
        #   EO, object id 1, object id 2, relation label, weight
        with open(os.path.join(lg_dir, file)) as f:
            for line in f:
                if line.startswith("EO"):
                    filt_line = line.strip().split(",")
                    sym1 = symbols[filt_line[1].strip()]
                    sym2 = symbols[filt_line[2].strip()]
                    relation = filt_line[3].strip()

                    writing_slope = sym1.writing_slope(sym2)
                    writing_curve = sym1.writing_curvature(sym2)
                    bb_dist = sym1.distance_between_box(sym2)
                    distance, horizontal_offset, vertical_distance = sym1.distance_between_average_centres(
                        sym2)
                    max_point_pair = sym1.maximal_point_distance(sym2)
                    feature_matrix.append([
                        writing_slope, writing_curve, bb_dist, distance,
                        horizontal_offset, vertical_distance, max_point_pair
                    ])
                    targets.append(relation)

        c += 1

    print("Shape of Training matrix")
    print(len(feature_matrix), "x", len(feature_matrix[0]))
    print("Unique labels : ", np.unique(targets))

    rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    rf.fit(X=feature_matrix, y=targets)
    joblib.dump(rf,
                "relation_classifier_bonus.pkl",
                protocol=pickle.HIGHEST_PROTOCOL)

    # reload the dumped model as a sanity check before scoring
    rf = joblib.load("relation_classifier_bonus.pkl")

    score = accuracy_score(y_true=targets,
                           y_pred=rf.predict(feature_matrix),
                           normalize=True)

    print("Training accuracy of the model is :", (score * 100))