Beispiel #1
0
def handleOneIndex(indexpath, subdir, indexname, fast):
    """Populate the n-gram tables for one index, one pass per order (1..N).

    With fast=True each n-gram database file is staged on the in-memory
    file system while handleOnePass/pruneNgramTable work on it, then the
    result is copied back and the staged copy removed; otherwise the
    files are processed in place in the work directory.

    Requires the 'Prepare' epoch; no-op when 'Populate' is already signed.
    """
    print(indexpath, subdir, indexname)

    statuspath = indexpath + config.getStatusPostfix()
    status = utils.load_status(statuspath)
    if not utils.check_epoch(status, 'Prepare'):
        raise utils.EpochError('Please prepare first.\n')
    if utils.check_epoch(status, 'Populate'):
        return

    workdir = os.sep.join(
        [config.getWordRecognizerDir(), subdir, indexname])
    print(workdir)

    ramdir = config.getInMemoryFileSystem()

    for order in range(1, N + 1):
        if not fast:
            handleOnePass(indexpath, workdir, order)
            pruneNgramTable(indexpath, workdir, order)
            continue

        # stage the database on the ram disk, work there, copy back
        ngramname = config.getNgramFileName(order)
        diskpath = workdir + os.sep + ngramname
        rampath = ramdir + os.sep + ngramname
        utils.copyfile(diskpath, rampath)
        handleOnePass(indexpath, ramdir, order)
        pruneNgramTable(indexpath, ramdir, order)
        utils.copyfile(rampath, diskpath)
        os.unlink(rampath)

    #sign epoch
    utils.sign_epoch(status, 'Populate')
    utils.store_status(statuspath, status)
Beispiel #2
0
def generateOneText(infile, modelfile, reportfile):
    """Train the k-mixture model on one merged text file.

    Runs gen_k_mixture_model on infile's merged text, appending any
    stderr output to reportfile. Exits the process if the tool fails.

    Returns True when training ran, False when this file already
    carries the 'Generate' epoch. Raises utils.EpochError when
    'MergeSequence' has not been done yet.
    """
    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if not utils.check_epoch(infilestatus, 'MergeSequence'):
        raise utils.EpochError('Please mergeseq first.\n')
    if utils.check_epoch(infilestatus, 'Generate'):
        return False

    #begin processing
    cmdline = ['../utils/training/gen_k_mixture_model',
               '--maximum-occurs-allowed',
               str(config.getMaximumOccursAllowed()),
               '--maximum-increase-rates-allowed',
               str(config.getMaximumIncreaseRatesAllowed()),
               '--k-mixture-model-file',
               modelfile,
               infile + config.getMergedPostfix()]
    # 'child' instead of 'subprocess' — the original shadowed the module name
    child = Popen(cmdline, shell=False, stderr=PIPE, close_fds=True)

    lines = child.stderr.readlines()
    if lines:
        print('found error report')
        with open(reportfile, 'ab') as f:
            f.writelines(lines)

    # Popen.wait() yields the plain return code; the original compared
    # the *encoded* status word from os.waitpid() against 0 instead.
    if child.wait() != 0:
        sys.exit('gen_k_mixture_model encounters error.')
    #end processing

    utils.sign_epoch(infilestatus, 'Generate')
    utils.store_status(infilestatuspath, infilestatus)
    return True
Beispiel #3
0
def handleOneIndex(indexpath):
    """Merge word sequences for every text listed in one index file.

    Each index line has the form 'title#textpath'; for every entry the
    merged output and report paths are derived and mergeOneText() runs.

    Requires the 'Segment' epoch; no-op when 'MergeSequence' is signed.
    """
    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(indexstatus, 'MergeSequence'):
        return

    #begin processing
    # 'with' guarantees the index file is closed even if a merge fails
    # (the original leaked the handle on exception)
    with open(indexpath, 'r') as indexfile:
        for oneline in indexfile:
            #remove tailing '\n'
            oneline = oneline.rstrip(os.linesep)
            (title, textpath) = oneline.split('#')

            infile = config.getTextDir() + textpath
            outfile = config.getTextDir() + textpath + config.getMergedPostfix()
            reportfile = config.getTextDir() + textpath + \
                config.getMergedReportPostfix()

            print("Processing " + title + '#' + textpath)
            mergeOneText(infile, outfile, reportfile)
            print("Processed " + title + '#' + textpath)
    #end processing

    utils.sign_epoch(indexstatus, 'MergeSequence')
    utils.store_status(indexstatuspath, indexstatus)
Beispiel #4
0
def mergeOneText(infile, outfile, reportfile):
    """Run the mergeseq tool on one segmented text file.

    Any stderr output from the tool overwrites reportfile. Exits the
    process if the tool fails, so a broken merge is never signed.

    Requires the 'Segment' epoch; no-op when 'MergeSequence' is signed.
    """
    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if not utils.check_epoch(infilestatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(infilestatus, 'MergeSequence'):
        return

    infile = infile + config.getSegmentPostfix()

    #begin processing
    cmdline = ['../utils/segment/mergeseq',
               '-o', outfile, infile]

    # 'child' instead of 'subprocess' — the original shadowed the module name
    child = Popen(cmdline, shell=False, stderr=PIPE, close_fds=True)

    lines = child.stderr.readlines()
    if lines:
        print('found error report')
        with open(reportfile, 'wb') as f:
            f.writelines(lines)

    # bug fix: the original ignored the exit status entirely and signed
    # the epoch even when mergeseq failed
    if child.wait() != 0:
        sys.exit('mergeseq encounters error.')
    #end processing

    utils.sign_epoch(infilestatus, 'MergeSequence')
    utils.store_status(infilestatuspath, infilestatus)
Beispiel #5
0
def handleOneDocument(infile, cur, length):
    """Count the 'length'-grams of one segmented document into sqlite.

    Slides a window of 'length' words over the token stream; a token
    value of 0 resets the window (presumably a sentence boundary —
    TODO confirm against the segmenter). Every complete window is
    upserted via UPDATE_NGRAM_DML / INSERT_NGRAM_DML on cursor 'cur'.

    Returns False when the file already carries the 'Populate' epoch,
    True otherwise. Only the final pass (length == N) signs the epoch.
    Raises utils.EpochError when 'Segment' has not been done yet.
    """
    print(infile, length)

    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if not utils.check_epoch(infilestatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(infilestatus, 'Populate'):
        return False

    sep = config.getWordSep()

    #train
    words = []
    # 'with' closes the document even if parsing or sqlite raises
    # (the original leaked the handle on exception)
    with open(infile + config.getSegmentPostfix(), 'r') as docfile:
        for oneline in docfile:
            oneline = oneline.rstrip(os.linesep)

            if len(oneline) == 0:
                continue

            (token, word) = oneline.split(" ", 1)
            token = int(token)

            if 0 == token:
                words = []
            else:
                words.append(word)

            if len(words) < length:
                continue

            if len(words) > length:
                words.pop(0)

            assert len(words) == length

            #do sqlite training
            words_str = sep + sep.join(words) + sep
            #print(words_str)

            rowcount = cur.execute(UPDATE_NGRAM_DML, (words_str, )).rowcount
            #print(rowcount)
            assert rowcount <= 1

            if 0 == rowcount:
                cur.execute(INSERT_NGRAM_DML, (words_str, ))

    #sign epoch only after last pass
    if N == length:
        utils.sign_epoch(infilestatus, 'Populate')
        utils.store_status(infilestatuspath, infilestatus)

    return True
Beispiel #6
0
def handleOneIndex(indexpath, subdir, indexname):
    """Mark pinyin in one index's work directory.

    Delegates the actual work to markPinyins(workdir).
    Requires the 'NewWord' epoch; no-op when 'MarkPinyin' is signed.
    """
    print(indexpath, subdir, indexname)

    statuspath = indexpath + config.getStatusPostfix()
    status = utils.load_status(statuspath)
    if not utils.check_epoch(status, 'NewWord'):
        raise utils.EpochError('Please new word first.\n')
    if utils.check_epoch(status, 'MarkPinyin'):
        return

    workdir = os.sep.join(
        [config.getWordRecognizerDir(), subdir, indexname])
    print(workdir)

    markPinyins(workdir)

    #sign epoch
    utils.sign_epoch(status, 'MarkPinyin')
    utils.store_status(statuspath, status)
Beispiel #7
0
def handleOneModel(modelfile, reportfile):
    """Estimate the interpolation lambda for one k-mixture model.

    Runs estimate_k_mixture_model, copies its stdout to reportfile,
    parses the 'average lambda:' line, stores the value under
    'EstimateScore' in the model's status file and signs 'Estimate'.
    Exits the process if the tool fails.

    Requires the 'Generate' epoch; no-op when 'Estimate' is signed.
    """
    modelfilestatuspath = modelfile + config.getStatusPostfix()
    modelfilestatus = utils.load_status(modelfilestatuspath)
    if not utils.check_epoch(modelfilestatus, 'Generate'):
        raise utils.EpochError('Please generate first.\n')
    if utils.check_epoch(modelfilestatus, 'Estimate'):
        return

    result_line_prefix = "average lambda:"
    avg_lambda = 0.

    #begin processing
    cmdline = ['../utils/training/estimate_k_mixture_model',
               '--deleted-bigram-file',
               config.getEstimatesModel(),
               '--bigram-file',
               modelfile]

    # 'child' instead of 'subprocess' — avoids shadowing the module name
    child = Popen(cmdline, shell=False, stdout=PIPE, close_fds=True)

    # 'with' closes the report even if decoding or parsing raises
    # (the original leaked the handle on exception)
    with open(reportfile, 'wb') as reporthandle:
        for line in child.stdout.readlines():
            reporthandle.writelines([line])
            #remove trailing '\n'
            line = line.decode('utf-8')
            line = line.rstrip(os.linesep)
            if line.startswith(result_line_prefix):
                avg_lambda = float(line[len(result_line_prefix):])

    # Popen.wait() yields the plain return code; the original compared
    # the *encoded* status word from os.waitpid() against 0 instead.
    if child.wait() != 0:
        sys.exit('estimate k mixture model returns error.')
    #end processing

    print('average lambda:', avg_lambda)
    modelfilestatus['EstimateScore'] = avg_lambda
    utils.sign_epoch(modelfilestatus, 'Estimate')
    utils.store_status(modelfilestatuspath, modelfilestatus)
Beispiel #8
0
def handleOneIndex(indexpath, subdir, indexname):
    """Prepare the sqlite work directory for one index.

    Creates the word-recognizer directory for this index and the empty
    sqlite databases inside it.

    Requires the 'Segment' epoch; no-op when 'Prepare' is signed.
    """
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(indexstatus, 'Prepare'):
        return

    #create directory
    onedir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    # exist_ok avoids the check-then-create race of the original
    # 'os.path.exists(onedir) or os.makedirs(onedir)' idiom
    os.makedirs(onedir, exist_ok=True)

    #create sqlite databases
    createSqliteDatabases(onedir)

    #sign epoch
    utils.sign_epoch(indexstatus, 'Prepare')
    utils.store_status(indexstatuspath, indexstatus)
Beispiel #9
0
def handleOneIndex(indexpath, subdir, indexname):
    """Recognize new words for one index.

    Builds and populates the bigram sqlite database, computes the
    prefix/postfix thresholds, records them in the index status file,
    then filters partial words with those thresholds.

    Requires the 'PartialWord' epoch; no-op when 'NewWord' is signed.
    """
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'PartialWord'):
        raise utils.EpochError('Please partial word first.\n')
    if utils.check_epoch(indexstatus, 'NewWord'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    createBigramSqlite(workdir)
    populateBigramSqlite(workdir)

    filepath = workdir + os.sep + config.getBigramFileName()

    conn = sqlite3.connect(filepath)
    # close the connection even when threshold computation or filtering
    # raises — the original leaked it on error, and its trailing
    # 'if conn:' test was dead code (conn is always truthy here)
    try:
        prethres = computeThreshold(conn, "prefix")
        indexstatus['NewWordPrefixThreshold'] = prethres
        postthres = computeThreshold(conn, "postfix")
        indexstatus['NewWordPostfixThreshold'] = postthres

        utils.store_status(indexstatuspath, indexstatus)

        filterPartialWord(workdir, conn, prethres, postthres)

        conn.commit()
    finally:
        conn.close()

    #sign epoch
    utils.sign_epoch(indexstatus, 'NewWord')
    utils.store_status(indexstatuspath, indexstatus)
Beispiel #10
0
def handleOneIndex(indexpath, subdir, indexname):
    """Recognize partial words for one index.

    Computes the threshold for the work directory, records it under
    'PartialWordThreshold' in the status file, then runs the
    recognizer with that threshold.

    Requires the 'Populate' epoch; no-op when 'PartialWord' is signed.
    """
    print(indexpath, subdir, indexname)

    statuspath = indexpath + config.getStatusPostfix()
    status = utils.load_status(statuspath)
    if not utils.check_epoch(status, 'Populate'):
        raise utils.EpochError('Please populate first.\n')
    if utils.check_epoch(status, 'PartialWord'):
        return

    workdir = os.sep.join(
        [config.getWordRecognizerDir(), subdir, indexname])
    print(workdir)

    threshold = getThreshold(workdir)
    status['PartialWordThreshold'] = threshold
    utils.store_status(statuspath, status)

    recognizePartialWord(workdir, threshold)

    #sign epoch
    utils.sign_epoch(status, 'PartialWord')
    utils.store_status(statuspath, status)
Beispiel #11
0
def gatherModels(path, indexname):
    """Walk 'path' and write a model index file named 'indexname'.

    Each record has the form 'subdir#modelfile#lambda'. Every model
    file found must already carry the 'Estimate' epoch with an
    'EstimateScore'; otherwise utils.EpochError is raised. Status,
    index, and report files are silently skipped; anything else is
    reported as unexpected.
    """
    indexfilestatuspath = indexname + config.getStatusPostfix()
    indexfilestatus = utils.load_status(indexfilestatuspath)
    # bug fix: the original passed the status *path* string to
    # check_epoch instead of the loaded status dict, so this
    # already-done early-out could never trigger
    if utils.check_epoch(indexfilestatus, 'Estimate'):
        return

    #begin processing
    # 'with' guarantees the index file is closed even on error
    with open(indexname, "w") as indexfile:
        for root, dirs, files in os.walk(path, topdown=True,
                                         onerror=handleError):
            for onefile in files:
                filepath = os.path.join(root, onefile)
                if onefile.endswith(config.getModelPostfix()):
                    #append one record to index file
                    subdir = os.path.relpath(root, path)
                    statusfilepath = filepath + config.getStatusPostfix()
                    status = utils.load_status(statusfilepath)
                    if not (utils.check_epoch(status, 'Estimate') and
                            'EstimateScore' in status):
                        raise utils.EpochError('Unknown Error:\n' +
                                               'Try re-run estimate.\n')
                    avg_lambda = status['EstimateScore']
                    line = subdir + '#' + onefile + '#' + str(avg_lambda)
                    indexfile.writelines([line, os.linesep])
                    #record written
                elif onefile.endswith(config.getStatusPostfix()):
                    pass
                elif onefile.endswith(config.getIndexPostfix()):
                    pass
                elif onefile.endswith(config.getReportPostfix()):
                    pass
                else:
                    print('Unexpected file:' + filepath)
    #end processing

    utils.sign_epoch(indexfilestatus, 'Estimate')
    utils.store_status(indexfilestatuspath, indexfilestatus)
Beispiel #12
0
                            default=config.getFinalModelDir())
    parser.add_argument('tryname', action='store', \
                            help='the storage directory')

    args = parser.parse_args()
    print(args)
    tryname = 'try' + args.tryname

    trydir = os.path.join(args.finaldir, tryname)
    if not os.access(trydir, os.F_OK):
        sys.exit(tryname + "doesn't exist.")

    cwdstatuspath = os.path.join(trydir, config.getFinalStatusFileName())
    cwdstatus = utils.load_status(cwdstatuspath)
    if not utils.check_epoch(cwdstatus, 'Prune'):
        raise utils.EpochError('Please tryprune first.')

    if utils.check_epoch(cwdstatus, 'Evaluate'):
        sys.exit('already evaluated.')

    print('checking')
    checkData()

    modelfile = os.path.join(trydir, config.getFinalModelFileName())
    destfile = os.path.join(libpinyin_dir, 'data', \
                                config.getFinalModelFileName())

    utils.copyfile(modelfile, destfile)

    print('cleaning')
    cleanUpData()
Beispiel #13
0
def handleOneIndex(indexpath, subdir, indexname, fast):
    """Generate candidate k-mixture models from one index file.

    Iterates over the texts listed in indexpath (skipping ones below
    the configured minimum merged-file size), feeds each into
    generateOneText(), and finalizes a candidate model whenever the
    accumulated input size exceeds config.getCandidateModelSize().
    Text/model counters are checkpointed in the index status file so
    an interrupted run resumes where it stopped. With fast=True the
    working model lives on the in-memory file system and is copied
    out to disk when a candidate is complete.

    Requires the 'MergeSequence' epoch; no-op when 'Generate' is signed.
    """
    inMemoryFile = "model.db"

    modeldir = os.path.join(config.getModelDir(), subdir, indexname)
    # exist_ok avoids the check-then-create race of the original idiom
    os.makedirs(modeldir, exist_ok=True)

    def cleanupInMemoryFile():
        #remove the staged model and its report from the ram disk
        modelfile = os.path.join(config.getInMemoryFileSystem(), inMemoryFile)
        reportfile = modelfile + config.getReportPostfix()
        if os.access(modelfile, os.F_OK):
            os.unlink(modelfile)
        if os.access(reportfile, os.F_OK):
            os.unlink(reportfile)

    def copyoutInMemoryFile(modelfile):
        #copy the staged model and its report from the ram disk to disk
        inmemoryfile = os.path.join\
            (config.getInMemoryFileSystem(), inMemoryFile)
        inmemoryreportfile = inmemoryfile + config.getReportPostfix()
        reportfile = modelfile + config.getReportPostfix()

        if os.access(inmemoryfile, os.F_OK):
            utils.copyfile(inmemoryfile, modelfile)
        if os.access(inmemoryreportfile, os.F_OK):
            utils.copyfile(inmemoryreportfile, reportfile)

    def cleanupFiles(modelnum):
        #remove a stale candidate model and its report, if present
        modeldir = os.path.join(config.getModelDir(), subdir, indexname)
        modelfile = os.path.join( \
            modeldir, config.getCandidateModelName(modelnum))
        reportfile = modelfile + config.getReportPostfix()
        if os.access(modelfile, os.F_OK):
            os.unlink(modelfile)
        if os.access(reportfile, os.F_OK):
            os.unlink(reportfile)

    def storeModelStatus(modelfile, textnum, nexttextnum):
        #store model info in status file
        modelstatus = {}
        modelstatus['GenerateStart'] = textnum
        modelstatus['GenerateEnd'] = nexttextnum
        utils.sign_epoch(modelstatus, 'Generate')
        utils.store_status(modelfile + config.getStatusPostfix(), modelstatus)

    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'MergeSequence'):
        raise utils.EpochError('Please mergeseq first.\n')
    if utils.check_epoch(indexstatus, 'Generate'):
        return

    #continue generating from the last checkpoint
    textnum, modelnum, aggmodelsize = 0, 0, 0
    if 'GenerateTextEnd' in indexstatus:
        textnum = indexstatus['GenerateTextEnd']
    if 'GenerateModelEnd' in indexstatus:
        modelnum = indexstatus['GenerateModelEnd']

    #clean up previous file
    if fast:
        cleanupInMemoryFile()

    cleanupFiles(modelnum)

    # bug fix: give 'i' and 'modelfile' defined values so the tail
    # below works even when the index file is empty or every text is
    # skipped — the original raised NameError in that case
    i = textnum - 1
    modelfile = os.path.join(modeldir, config.getCandidateModelName(modelnum))

    #begin processing
    indexfile = open(indexpath, 'r')
    for i, oneline in enumerate(indexfile.readlines()):
        #continue last generating
        if i < textnum:
            continue

        #remove trailing '\n'
        oneline = oneline.rstrip(os.linesep)
        (title, textpath) = oneline.split('#')
        infile = config.getTextDir() + textpath
        infilesize = utils.get_file_length(infile + config.getMergedPostfix())
        if infilesize < config.getMinimumFileSize():
            print("Skipping " + title + '#' + textpath)
            continue

        if fast:
            modelfile = os.path.join(config.getInMemoryFileSystem(), \
                                         inMemoryFile)
        else:
            modelfile = os.path.join(modeldir, \
                                         config.getCandidateModelName(modelnum))

        reportfile = modelfile + config.getReportPostfix()
        #typo fix: the original printed "Proccessing"
        print("Processing " + title + '#' + textpath)
        if generateOneText(infile, modelfile, reportfile):
            aggmodelsize += infilesize
        print("Processed " + title + '#' + textpath)
        if aggmodelsize > config.getCandidateModelSize():
            #candidate is large enough: finalize it, start the next one
            #copy out in memory file
            if fast:
                modelfile = os.path.join\
                    (modeldir, config.getCandidateModelName(modelnum))
                copyoutInMemoryFile(modelfile)
                cleanupInMemoryFile()

            #the model file is in disk now
            nexttextnum = i + 1
            storeModelStatus(modelfile, textnum, nexttextnum)

            #new model candidate
            aggmodelsize = 0
            textnum = nexttextnum
            modelnum += 1

            #clean up next file
            cleanupFiles(modelnum)

            #save current progress in status file
            indexstatus['GenerateTextEnd'] = nexttextnum
            indexstatus['GenerateModelEnd'] = modelnum
            utils.store_status(indexstatuspath, indexstatus)

    #finalize the last (possibly undersized) candidate
    #copy out in memory file
    if fast:
        modelfile = os.path.join\
            (modeldir, config.getCandidateModelName(modelnum))
        copyoutInMemoryFile(modelfile)
        cleanupInMemoryFile()

    #the model file is in disk now
    nexttextnum = i + 1
    storeModelStatus(modelfile, textnum, nexttextnum)

    indexfile.close()
    #end processing

    #save current progress in status file
    modelnum += 1
    indexstatus['GenerateTextEnd'] = nexttextnum
    indexstatus['GenerateModelEnd'] = modelnum

    utils.sign_epoch(indexstatus, 'Generate')
    utils.store_status(indexstatuspath, indexstatus)