import codecs
import copy
import itertools
import subprocess

import numpy as np
from bs4 import BeautifulSoup

#extractor, getstrokes, emitoutput, euclidean, and parseandsave are
#project-local helpers used by the examples below


def getfeatures(infilename):
    """
    Returns a list of [feature vector, class label, trace ids],
    one entry per character in the file.
    """
    infile = open(infilename, 'r')
    #Identify all of the symbols in the document
    try:
        soup = BeautifulSoup(infile, 'html.parser')
    except UnicodeDecodeError:  #File Corruption
        # print("Bad File: {}".format(infilename))
        #Attempt to load file by ignoring corrupted characters
        with codecs.open(infilename, "r", encoding='utf-8',
                         errors='ignore') as fdata:
            soup = BeautifulSoup(fdata, 'html.parser')

    #Determine all tracegroups (removing the first since it is a group of groups)
    tracegroups = soup.find_all("tracegroup")
    #Abort if tracegroup data not available (segmentation test file)
    if len(tracegroups) == 0:
        soup.decompose()
        infile.close()
        return []
    tracegroups = tracegroups[1:]

    featpairs = []

    #Identify all traces within the group
    for group in tracegroups:
        traceviews = group.find_all("traceview")
        tracedata = []
        traceids = []
        for trace in traceviews:
            data = soup.find("trace", id=trace['tracedataref'])
            data = data.contents
            data = ''.join(data)
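            #Trace content is a comma-separated list of "x y" points,
            #e.g. "985 3317, 987 3320, 990 3325"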
            xypairs = [d.strip() for d in data.split(",")]
            data = np.zeros((len(xypairs), 2))
            for i, pair in enumerate(xypairs):
                x, y = pair.split(" ")[:2]
                data[i][0] = float(x)
                data[i][1] = float(y)
            tracedata.append(data)
            traceids.append(trace['tracedataref'])

        #Compute the features based on the traces
        features = extractor.computefeatures(tracedata)

        #Determine the true symbol
        symbol = '\\unknown'
        if group.find("annotation") is not None:
            symbol = ''.join((group.find("annotation")).contents)

        featpairs.append([features, symbol, traceids])

    soup.decompose()  #Free memory
    infile.close()
    return featpairs
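
A minimal usage sketch for getfeatures, assuming a CROHME-style .inkml file
on disk (the path is hypothetical) and scikit-learn for the downstream
symbol classifier:

from sklearn.ensemble import RandomForestClassifier

samples = getfeatures("TrainINKML/example.inkml")  #hypothetical path
if samples:
    X = [s[0] for s in samples]  #feature vectors
    y = [s[1] for s in samples]  #LaTeX class labels
    classifier = RandomForestClassifier().fit(X, y)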
Example 2
def segment(strokelist, classifier):
    """
    Greedily segment a temporally ordered stroke list into symbols.
    strokelist is [[pointlist, name], ...]. Each stroke either forms a
    symbol on its own or merges with the following stroke, whichever
    the classifier scores higher. Returns [[symbol, strokenames], ...].
    """
    #Median stroke width, used as an x-gap threshold between strokes
    widths = [max(p[0] for p in s[0]) - min(p[0] for p in s[0])
              for s in strokelist]
    medianwidth = np.median(widths)

    symbolstrokelist = []
    i = 0
    while i < len(strokelist):
        #Score the stroke as a symbol on its own
        features = extractor.computefeatures(copy.deepcopy([strokelist[i][0]]))
        scores = classifier.predict_proba([features])[0]
        bestscore = max(scores)
        bestsymbol = classifier.classes_[np.argmax(scores)]
        bestlength = 1

        #Also score it merged with its successor, unless the two strokes
        #are separated by an x-gap wider than the threshold
        if i + 1 < len(strokelist):
            gap = (min(p[0] for p in strokelist[i + 1][0])
                   - max(p[0] for p in strokelist[i][0]))
            if gap < medianwidth / 8.0:
                features = extractor.computefeatures(
                    copy.deepcopy([s[0] for s in strokelist[i:i + 2]]))
                scores = classifier.predict_proba([features])[0]
                if max(scores) > bestscore:
                    bestscore = max(scores)
                    bestsymbol = classifier.classes_[np.argmax(scores)]
                    bestlength = 2

        symbolstrokelist.append(
            [bestsymbol, [s[1] for s in strokelist[i:i + bestlength]]])
        i += bestlength
    return symbolstrokelist
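
A short usage sketch for segment, assuming getstrokes (used by the later
examples) returns [[pointlist, name], ...] pairs and classifier is a fitted
probabilistic model such as the one above; the path is hypothetical:

strokes = getstrokes("TestINKML/example.inkml")  #hypothetical path
for symbol, strokenames in segment(strokes, classifier):
    print(symbol, strokenames)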
Example 3
def segandsave(filename, classifier, classifierlock):
    """
	A function to segment and classify symbols
	"""

    #Determine the output filename
    name = filename.split('/')[-1].split('.')[0]

    #Attempt to create ground truth file for comparison
    featset = getfeatures(filename)
    if len(featset) != 0:
        #Create truth .lg values to compare against
        emitoutput(name, [c[1] for c in featset], [c[2] for c in featset],
                   cheat=True)

    #Perform segmentation based on strokes in the file
    strokedata = getstrokes(filename)  #[[pointlist, name], ...]

    ### NN method ###

    #Force single point strokes to merge with others if they are nearby
    for i, stroke in extractor.reverseenumerate(strokedata):
        #If the stroke has only a single point
        if len(stroke[0]) == 1:
            #See if it's identical to the point in the stroke immediately before
            if i > 0:
                if stroke[0][0][0] == strokedata[i - 1][0][-1][0] and stroke[
                        0][0][1] == strokedata[i - 1][0][-1][1]:
                    #stroke i should be merged with i-1
                    strokedata[i - 1][1] += ', ' + stroke[1]
                    del strokedata[i]
                    continue
            #See if it's identical to the point in the stroke immediately after
            if i < len(strokedata) - 1:
                if stroke[0][0][0] == strokedata[i + 1][0][0][0] and stroke[0][
                        0][1] == strokedata[i + 1][0][0][1]:
                    #stroke i should be merged with i+1
                    strokedata[i + 1][1] += ', ' + stroke[1]
                    del strokedata[i]
                    continue

    #Compute the median width of all strokes
    widths = np.zeros(len(strokedata))
    for i, stroke in enumerate(strokedata):
        points = stroke[0]
        widths[i] = max([point[0] for point in points]) - min(
            [point[0] for point in points])
    medianwidth = np.median(widths)

    #Compute the mean x and y coordinates for each stroke. [x, y, stroke]
    for i, stroke in enumerate(strokedata):
        strokedata[i] = [
            np.mean([x[0] for x in stroke[0]]),
            np.mean([y[1] for y in stroke[0]]), stroke
        ]

    #Sort stroke list by x, then by y so that the y sort takes precedence
    strokedata.sort(key=lambda x: x[0])
    strokedata.sort(key=lambda y: y[1])

    #Compute the nearest neighbors (might need to ignore some neighbors once consumed)
    #neighbors[i] = [[neighbor index, distance from i], ...]
    neighbors = [[[
        a,
        euclidean(strokedata[a][0], strokedata[b][0], strokedata[a][1],
                  strokedata[b][1])
    ] for a in range(len(strokedata))] for b in range(len(strokedata))]
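    #euclidean is a project-local helper; from the call above its assumed
    #form is euclidean(x1, x2, y1, y2) = sqrt((x1 - x2)**2 + (y1 - y2)**2)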

    #Sort each point to get its nearest neighbors first in the list
    for point in neighbors:
        point.sort(key=lambda x: x[1])

    #Record strokes which have already been claimed for a symbol
    alreadyclaimed = [False for x in range(len(strokedata))]

    #Record [[symbol, strokenames], ...] pairs
    symbolstrokelist = []

    #Traverse from top left to bottom right (y axis major to deal with stacking)
    for i, stroke in enumerate(strokedata):
        #Don't go any further if stroke has already been assigned to a symbol
        if alreadyclaimed[i]:
            continue
        alreadyclaimed[i] = True

        nnlist = neighbors[i]
        #Go through all neighbors to find remaining closest 3 (excluding itself)
        nnstrokeids = []
        for j in range(len(nnlist)):
            if not alreadyclaimed[nnlist[j][0]]:
                nnstrokeids.append(nnlist[j][0])
                #Only consider up to 3 nearest neighbors
                if len(nnstrokeids) == 3:
                    break

        #Delete neighbors which have an x gap
        xmaxi = max([x[0]
                     for x in stroke[2][0]])  #Max x-coordinate of stroke i
        xmini = min([x[0]
                     for x in stroke[2][0]])  #Min x-coordinate of stroke i
        # print("Xmaxi: {}, i: {}, id: {}".format(xmaxi, i, stroke[2][1]))
        for j, elem in extractor.reverseenumerate(nnstrokeids):
            xmaxj = max([x[0] for x in strokedata[nnstrokeids[j]][2][0]])
            xminj = min([x[0] for x in strokedata[nnstrokeids[j]][2][0]])
            # print("Xminj: {}, j: {}, id: {}".format(xminj, nnstrokeids[j], strokedata[nnstrokeids[j]][2][1]))
            if xmini <= xminj:
                if xmaxi + medianwidth / 8.0 < xminj - medianwidth / 8.0:
                    del nnstrokeids[j]
            else:
                if xmaxj + medianwidth / 8.0 < xmini - medianwidth / 8.0:
                    del nnstrokeids[j]

        #Build the desired test combinations
        testset = [[i]]
        for l in range(1, len(nnstrokeids) + 1):
            for subset in itertools.combinations(nnstrokeids, l):
                #stroke i must be in every combination
                subsetlist = list(subset)
                subsetlist.extend([i])
                testset.append(subsetlist)
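        #e.g. with nnstrokeids = [a, b] this yields
        #testset = [[i], [a, i], [b, i], [a, b, i]]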

        bestscore = -float('inf')
        bestsymbol = 'UNKNOWN'
        bestsetid = 0

        for t, test in enumerate(testset):
            #Generate features based on the strokes corresponding to the proposed test list
            features = extractor.computefeatures(
                copy.deepcopy([x[0]
                               for x in [strokedata[s][2] for s in test]]))
            scores = 0
            # with classifierlock:
            scores = classifier.predict_proba([features])[0]

            #TODO Multiply score by prior probability, composition probability
            # scores = np.multiply(scores,classpriors)
            # scores = np.multiply(scores,lengthpriors)
            # print(test)
            # print([x[1] for x in [strokedata[s][2] for s in test]])
            # print(scores)
            #Give a slight advantage to more complex characters
            scores += (len(test) / 4.0) * (1.0 / 6.0)
            # print(scores)
            score = max(scores)
            if score > bestscore:
                bestscore = score
                bestsymbol = classifier.classes_[np.argmax(scores)]
                bestsetid = t

        #Mark all desired strokes as claimed
        for strokeid in testset[bestsetid]:
            alreadyclaimed[strokeid] = True

        #Record the classification and segmentation decision
        symbolstrokelist.append([
            bestsymbol,
            [x[1] for x in [strokedata[s][2] for s in testset[bestsetid]]]
        ])

    # ### In-order method

    # symbolstrokelist = []
    # start = 0
    # while start < len(strokedata):

    # 	bestscore = -float('inf')
    # 	bestend = start
    # 	bestsymbol = 'UNKNOWN'

    # 	#Limit the max length of a symbol to 10 strokes
    # 	for end in range(start+1, min(start+12, len(strokedata)+1)):

    # 		features = extractor.computefeatures([x[0] for x in strokedata[start:end]])

    # 		scores = classifier.predict_proba([features])[0]
    # 		#TODO Multiply score by prior probability, composition probability
    # 		# scores = np.multiply(scores,classpriors)
    # 		# scores = np.multiply(scores,lengthpriors)
    # 		score = max(scores)
    # 		if score > bestscore:
    # 			bestscore = score
    # 			bestsymbol = classifier.classes_[np.argmax(scores)]
    # 			# bestsymbol = classifier.predict([features])[0]
    # 			bestend = end

    # 	symbolstrokelist.append([bestsymbol, [x[1] for x in strokedata[start:bestend]]])
    # 	start = bestend

    #Save an output file
    emitoutput(name, [s[0] for s in symbolstrokelist],
               [s[1] for s in symbolstrokelist])
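
A minimal driver sketch for the function above, assuming a fitted classifier
(see the sketch after getfeatures) and .inkml files in a hypothetical
directory; classifierlock guards the classifier during multi-threaded use:

import glob
import threading

classifierlock = threading.Lock()
for filename in glob.glob("TestINKML/*.inkml"):  #hypothetical directory
    segandsave(filename, classifier, classifierlock)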
Example 4
def segandsave(filename, classifier, parser):
	"""
	A function to segment and classify symbols
	"""

	#Determine the output filename
	name = filename.split('/')[-1].split('.')[0]

	#Attempt to create ground truth file for comparison
	featset = getfeatures(filename)
	if len(featset) != 0:
		#Create truth .lg values to compare against
		subprocess.run(["crohme2lg", filename, "GroundTruth/"+name+".lg"])


	#Perform segmentation based on strokes in the file
	strokedata = getstrokes(filename) #[[pointlist, name], ...]

	### Pre-Processing ###

	#Force single point strokes to merge with others if they are nearby
	for i, stroke in extractor.reverseenumerate(strokedata):
		#If the stroke has only a single point
		if len(stroke[0]) == 1:
			#See if it's identical to the point in the stroke immediately before
			if i > 0:
				if stroke[0][0][0] == strokedata[i-1][0][-1][0] and stroke[0][0][1] == strokedata[i-1][0][-1][1]:
					#stroke i should be merged with i-1
					strokedata[i-1][1] += ', '+stroke[1]
					del strokedata[i]
					continue
			#See if it's identical to the point in the stroke immediately after
			if i < len(strokedata)-1:
				if stroke[0][0][0] == strokedata[i+1][0][0][0] and stroke[0][0][1] == strokedata[i+1][0][0][1]:
					#stroke i should be merged with i+1
					strokedata[i+1][1] += ', '+stroke[1]
					del strokedata[i]
					continue

	#Compute the median width of all strokes
	widths = np.zeros(len(strokedata))
	for i, stroke in enumerate(strokedata):
		points = stroke[0]
		widths[i] = max([point[0] for point in points]) - min([point[0] for point in points])
	medianwidth = np.median(widths)

	#Assume strokes are written in temporal order, with a maximum symbol length of 4 strokes
	checklist = ['\\sin', '\\cos', '\\tan', '\\lim', '\\log', '\\ldots']
	maxlength = 4
	bestscores = [[1, -1, 'unknown'] for x in range(-maxlength, len(strokedata))] #[score, backwards pointer, classification]
	for i in range(maxlength, len(bestscores)):
		bestscore = -float('inf')
		bestsymbol = 'unknown'
		bestbackpointer = -1
		for c in range(0, maxlength): #Consider candidate symbols of 1 to maxlength strokes ending here
			candidateindices = [j for j in range(i-maxlength-c, i-maxlength+1)]
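			#e.g. maxlength = 4, i = 5, c = 1 gives candidateindices = [0, 1]:
			#a two-stroke candidate ending at stroke index 1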
			#Ignore candidates that would reach before the first stroke
			if candidateindices[0] < 0:
				break

			#Reject candidates with an x-gap greater than the threshold between the first and last strokes,
			#unless the candidate is a known multi-stroke symbol
			xmaxi = max([x[0] for x in strokedata[candidateindices[0]][0]])
			xmini = min([x[0] for x in strokedata[candidateindices[0]][0]])
			xmaxj = max([x[0] for x in strokedata[candidateindices[-1]][0]])
			xminj = min([x[0] for x in strokedata[candidateindices[-1]][0]])
			rejectflag = False
			if xmini <= xminj:
				if xmaxi + medianwidth/8.0 < xminj - medianwidth/8.0:
					rejectflag = True
			else:
				if xmaxj + medianwidth/8.0 < xmini - medianwidth/8.0:
					rejectflag = True

			features = extractor.computefeatures(copy.deepcopy([x[0] for x in [strokedata[s] for s in candidateindices]]))
			scores = classifier.predict_proba([features])[0]
			if not rejectflag:
				score = max(scores)*bestscores[i-len(candidateindices)][0]
			else:
				#Candidate must be inside pre-approved multi-stroke list
				proposedclass = classifier.classes_[np.argmax(scores)]
				if proposedclass not in checklist:
					continue
				#Must be at least 70% certain for consideration
				#to prevent biasing towards large symbols
				score = max(scores)
				if score < 0.7:
					continue
				score = max(scores)*bestscores[i-len(candidateindices)][0]

			if score > bestscore:
				bestscore = score
				bestsymbol = classifier.classes_[np.argmax(scores)]
				bestbackpointer = i-len(candidateindices)

		bestscores[i] = [bestscore, bestbackpointer, bestsymbol]
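
	#The loop above is a Viterbi-style recurrence: the best score ending at
	#position i is the max over candidate groups of 1 to maxlength strokes of
	#max(predict_proba(group)) * bestscores[g][0], where g indexes the end of
	#the previous symbol; the backpointer records g for the decode below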

	#Recover the optimal segmentation
	classlabels = {}
	strokegroups = {}
	symbolID = 0
	decodeindex = len(bestscores)-1
	while decodeindex > maxlength-1:
		classlabels[symbolID] = bestscores[decodeindex][2]
		indicesused = [i-maxlength for i in range(bestscores[decodeindex][1]+1, decodeindex+1)]
		strokegroups[symbolID] = [x[1] for x in [strokedata[s] for s in indicesused]]
		symbolID += 1
		decodeindex = bestscores[decodeindex][1]

	parseandsave(name, strokedata, classlabels, strokegroups, parser)