# NOTE(review): the line breaks of this section appear to have been lost. The
# statements on the next line are the tail of a per-file loading loop whose
# `for` header and `curXmlDir` assignment lie above this point; as written,
# everything after the first '#' on the line is parsed as a comment.
# In order, the statements:
#   - build the transcription path
#     <asciiPrefix>/<l up to the first '-'>/<l[0:7]>/<l>.txt;
#   - read it with getTargetString(), extend targetStrings and
#     wordTargetStrings with the results and union the characters into charSet;
#   - list the stroke files with getXmlNames(curXmlDir, l) and assert there is
#     exactly one per target string;
#   - for each stroke file: record its name in seqTags, load it with
#     getLineStroke(), append its points to inputs and its length to
#     seqLengths / seqDims;
#   - convert inputs to a numpy array, standardise every column except the
#     last with the per-column mean and std (axis 0), and convert back to a
#     list. Presumably the last column is a pen-state flag that must stay
#     unscaled -- TODO confirm. Presumably this normalisation runs once after
#     the loop rather than per file -- TODO confirm against the original layout.
curAsciiDir = os.path.join(asciiPrefix, l.split('-')[0], l[0:7]) curAsciiFilePath = os.path.join(curAsciiDir, l + ".txt") [curTargetString, curWordTargetString, curCharSet] = getTargetString(curAsciiFilePath) targetStrings.extend(curTargetString) wordTargetStrings.extend(curWordTargetString) # print len(curTargetString) # print curCharSet charSet = charSet.union(curCharSet) # for i in range(len(curTargetString)): # print curWordTargetString[i] # print curTargetString[i] xmlNames = getXmlNames(curXmlDir, l) assert len(curTargetString) == len(xmlNames) for xmlName in xmlNames: seqTags.append(xmlName) xmlFilePath = os.path.join(curXmlDir, xmlName) curLineStroke = getLineStroke(xmlFilePath) # print len(curLine) inputs.extend(curLineStroke) seqLengths.append(len(curLineStroke)) seqDims.append([len(curLineStroke)]) inputsArr = np.array(inputs) inputMeans = np.mean(inputsArr, 0) inputStds = np.std(inputsArr, 0) inputsArr[:, :-1] = (inputsArr[:, :-1] - inputMeans[:-1]) / inputStds[:-1] inputs = inputsArr.tolist()
for l in trainFileList: l = l.strip() print "loading file ", l #print l[0:7] curXmlDir = os.path.join(xmlPrefix, l.split('-')[0], l[0:7]) curAsciiDir = os.path.join(asciiPrefix, l.split('-')[0], l[0:7]) curAsciiFilePath = os.path.join(curAsciiDir, l + ".txt") [curTargetString, curWordTargetString, curCharSet] = getTargetStringCompress(curAsciiFilePath) targetStrings.extend(curTargetString) wordTargetStrings.extend(curWordTargetString) #print len(curTargetString) #print curCharSet charSet = charSet.union(curCharSet) xmlNames = getXmlNames(curXmlDir, l) for xmlName in xmlNames: seqTags.append(xmlName) xmlFilePath = os.path.join(curXmlDir, xmlName) curLineStroke = getLineStrokeOffset(xmlFilePath) #print len(curLine) inputs.append([0.0] * 3) inputs.extend(curLineStroke) inputs = inputs[:-1] seqLengths.append(len(curLineStroke)) seqDims.append([len(curLineStroke)]) for coord in curLineStroke: targetPatterns.append(coord[:-1]); targetClasses.append([coord[-1]]); inputsArr = np.array(inputs)