Esempio n. 1
0
    curAsciiDir = os.path.join(asciiPrefix, l.split('-')[0], l[0:7])

    curAsciiFilePath = os.path.join(curAsciiDir, l + ".txt")
    [curTargetString, curWordTargetString,
     curCharSet] = getTargetString(curAsciiFilePath)
    targetStrings.extend(curTargetString)
    wordTargetStrings.extend(curWordTargetString)
    # print len(curTargetString)
    # print curCharSet
    charSet = charSet.union(curCharSet)

    #     for i in range(len(curTargetString)):
    #         print curWordTargetString[i]
    #         print curTargetString[i]

    xmlNames = getXmlNames(curXmlDir, l)
    assert len(curTargetString) == len(xmlNames)
    for xmlName in xmlNames:
        seqTags.append(xmlName)
        xmlFilePath = os.path.join(curXmlDir, xmlName)
        curLineStroke = getLineStroke(xmlFilePath)
        # print len(curLine)
        inputs.extend(curLineStroke)
        seqLengths.append(len(curLineStroke))
        seqDims.append([len(curLineStroke)])

# Standardize the collected samples column by column. The last column is
# left untouched; every other column has its mean subtracted and is then
# divided by its standard deviation.
inputsArr = np.array(inputs)
inputMeans = inputsArr.mean(axis=0)
inputStds = inputsArr.std(axis=0)
inputsArr[:, :-1] = np.divide(
    np.subtract(inputsArr[:, :-1], inputMeans[:-1]),
    inputStds[:-1])
# Hand the normalized samples back as plain nested lists.
inputs = inputsArr.tolist()
for l in trainFileList:
    l = l.strip()
    print "loading file ", l
    #print l[0:7]
    curXmlDir = os.path.join(xmlPrefix, l.split('-')[0], l[0:7])
    curAsciiDir = os.path.join(asciiPrefix, l.split('-')[0], l[0:7])

    curAsciiFilePath = os.path.join(curAsciiDir, l + ".txt")
    [curTargetString, curWordTargetString, curCharSet] = getTargetStringCompress(curAsciiFilePath)
    targetStrings.extend(curTargetString)
    wordTargetStrings.extend(curWordTargetString)
    #print len(curTargetString)
    #print curCharSet
    charSet = charSet.union(curCharSet)

    xmlNames = getXmlNames(curXmlDir, l)
    for xmlName in xmlNames:
        seqTags.append(xmlName)
        xmlFilePath = os.path.join(curXmlDir, xmlName)
        curLineStroke = getLineStrokeOffset(xmlFilePath)
        #print len(curLine)
        inputs.append([0.0] * 3)
        inputs.extend(curLineStroke)
        inputs = inputs[:-1]
        seqLengths.append(len(curLineStroke))
        seqDims.append([len(curLineStroke)])
        for coord in curLineStroke:
            targetPatterns.append(coord[:-1]);
            targetClasses.append([coord[-1]]);

inputsArr = np.array(inputs)