Example 1
def handleOneIndex(indexpath, subdir, indexname, fast):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Prepare'):
        raise utils.EpochError('Please prepare first.\n')
    if utils.check_epoch(indexstatus, 'Populate'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    shmdir = config.getInMemoryFileSystem()

    for i in range(1, N + 1):
        if fast:
            #copy the n-gram file into the in-memory filesystem
            filename = config.getNgramFileName(i)
            filepath = workdir + os.sep + filename
            shmfilepath = shmdir + os.sep + filename
            utils.copyfile(filepath, shmfilepath)
            handleOnePass(indexpath, shmdir, i)
            pruneNgramTable(indexpath, shmdir, i)
            utils.copyfile(shmfilepath, filepath)
            os.unlink(shmfilepath)
        else:
            handleOnePass(indexpath, workdir, i)
            pruneNgramTable(indexpath, workdir, i)

    #sign epoch
    utils.sign_epoch(indexstatus, 'Populate')
    utils.store_status(indexstatuspath, indexstatus)
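
All the examples on this page share one status-file convention: every tracked file has a sibling status file (its path plus config.getStatusPostfix()), each pipeline stage refuses to run until the previous epoch has been signed, and it signs its own epoch when it finishes, so every stage is idempotent and restartable. The utils helpers themselves are not shown on this page; the following is only a minimal sketch of how they could look. The dict layout, the 'Epoch' key suffix, and the pickle serialization are assumptions, not the project's actual code.

import os
import pickle

class EpochError(Exception):
    pass

def load_status(statuspath):
    #a missing status file simply means no epoch has been signed yet
    if not os.access(statuspath, os.F_OK):
        return {}
    with open(statuspath, 'rb') as f:
        return pickle.load(f)

def check_epoch(status, epoch):
    #true once the named stage has signed this status
    return bool(status.get(epoch + 'Epoch'))

def sign_epoch(status, epoch):
    status[epoch + 'Epoch'] = True

def store_status(statuspath, status):
    with open(statuspath, 'wb') as f:
        pickle.dump(status, f)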
Example 2
def segmentOneText(infile, outfile, reportfile, fast):
    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if utils.check_epoch(infilestatus, 'Segment'):
        return

    #begin processing
    if fast:
        cmdline = ['../utils/segment/spseg', \
                       '-o', outfile, infile]
    else:
        cmdline = ['../utils/segment/ngseg', \
                       '-o', outfile, infile]

    subprocess = Popen(cmdline, shell=False, stderr=PIPE, \
                           close_fds=True)

    lines = subprocess.stderr.readlines()
    if lines:
        print('found error report')
        with open(reportfile, 'wb') as f:
            f.writelines(lines)

    os.waitpid(subprocess.pid, 0)
    #end processing

    utils.sign_epoch(infilestatus, 'Segment')
    utils.store_status(infilestatuspath, infilestatus)
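
segmentOneText pipes only the child's stderr, saves any error report, and reaps the process with os.waitpid, but it never checks the exit status (compare generateOneText in Example 6, which does). Below is a hypothetical variant of the same invocation that fails loudly, using Popen.communicate() to read stderr and wait for the child in one call; runSegmenter is an illustrative name, not a function from this project.

import sys
from subprocess import Popen, PIPE

def runSegmenter(cmdline, reportfile):
    child = Popen(cmdline, shell=False, stderr=PIPE, close_fds=True)
    (_, errors) = child.communicate()
    if errors:
        print('found error report')
        with open(reportfile, 'wb') as f:
            f.write(errors)
    if child.returncode != 0:
        sys.exit(cmdline[0] + ' returned ' + str(child.returncode))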
Example 3
def handleOneIndex(indexpath):
    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(indexstatus, 'MergeSequence'):
        return

    #begin processing
    indexfile = open(indexpath, 'r')
    for oneline in indexfile.readlines():
        #remove trailing '\n'
        oneline = oneline.rstrip(os.linesep)
        (title, textpath) = oneline.split('#')

        infile = config.getTextDir() + textpath
        outfile = config.getTextDir() + textpath + config.getMergedPostfix()
        reportfile = config.getTextDir() + textpath + \
            config.getMergedReportPostfix()

        print("Processing " + title + '#' + textpath)
        mergeOneText(infile, outfile, reportfile)
        print("Processed " + title + '#' + textpath)

    indexfile.close()
    #end processing

    utils.sign_epoch(indexstatus, 'MergeSequence')
    utils.store_status(indexstatuspath, indexstatus)
Example 4
def sortModels(indexname, sortedindexname):
    sortedindexfilestatuspath = sortedindexname + config.getStatusPostfix()
    sortedindexfilestatus = utils.load_status(sortedindexfilestatuspath)
    if utils.check_epoch(sortedindexfilestatus, 'Estimate'):
        return

    #begin processing
    records = []
    indexfile = open(indexname, 'r')
    for line in indexfile.readlines():
        #remove the trailing '\n'
        line = line.rstrip(os.linesep)
        (subdir, modelname, score) = line.split('#', 2)
        score = float(score)
        records.append((subdir, modelname, score))
    indexfile.close()

    records.sort(key=itemgetter(2), reverse=True)

    sortedindexfile = open(sortedindexname, 'w')
    for record in records:
        (subdir, modelname, score) = record
        line = subdir + '#' + modelname + '#' + str(score)
        sortedindexfile.writelines([line, os.linesep])
    sortedindexfile.close()
    #end processing

    utils.sign_epoch(sortedindexfilestatus, 'Estimate')
    utils.store_status(sortedindexfilestatuspath, sortedindexfilestatus)
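
sortModels consumes the '#'-separated index that gatherModels (Example 14) writes, one subdir#modelname#score record per line, and rewrites it ordered by score, highest first. split('#', 2) caps the split at three fields, so any later '#' characters stay inside the third field. A self-contained walk-through with made-up records:

from operator import itemgetter

#hypothetical records; the real index is written by gatherModels
lines = ['sub0#model.db.0#0.35', 'sub1#model.db.1#0.72']

records = []
for line in lines:
    (subdir, modelname, score) = line.split('#', 2)
    records.append((subdir, modelname, float(score)))

records.sort(key=itemgetter(2), reverse=True)
for (subdir, modelname, score) in records:
    print(subdir + '#' + modelname + '#' + str(score))
#prints sub1#model.db.1#0.72, then sub0#model.db.0#0.35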
Example 5
def mergeOneText(infile, outfile, reportfile):
    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if not utils.check_epoch(infilestatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(infilestatus, 'MergeSequence'):
        return

    infile = infile + config.getSegmentPostfix()

    #begin processing
    cmdline = ['../utils/segment/mergeseq', \
                   '-o', outfile, infile]

    subprocess = Popen(cmdline, shell=False, stderr=PIPE, \
                           close_fds=True)

    lines = subprocess.stderr.readlines()
    if lines:
        print('found error report')
        with open(reportfile, 'wb') as f:
            f.writelines(lines)

    os.waitpid(subprocess.pid, 0)
    #end processing

    utils.sign_epoch(infilestatus, 'MergeSequence')
    utils.store_status(infilestatuspath, infilestatus)
Example 6
def generateOneText(infile, modelfile, reportfile):
    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if not utils.check_epoch(infilestatus, 'MergeSequence'):
        raise utils.EpochError('Please mergeseq first.\n')
    if utils.check_epoch(infilestatus, 'Generate'):
        return False

    #begin processing
    cmdline = ['../utils/training/gen_k_mixture_model', \
                   '--maximum-occurs-allowed', \
                   str(config.getMaximumOccursAllowed()), \
                   '--maximum-increase-rates-allowed', \
                   str(config.getMaximumIncreaseRatesAllowed()), \
                   '--k-mixture-model-file', \
                   modelfile, infile + \
                   config.getMergedPostfix()]
    subprocess = Popen(cmdline, shell=False, stderr=PIPE, \
                           close_fds=True)

    lines = subprocess.stderr.readlines()
    if lines:
        print('found error report')
        with open(reportfile, 'ab') as f:
            f.writelines(lines)

    (pid, status) = os.waitpid(subprocess.pid, 0)
    if status != 0:
        sys.exit('gen_k_mixture_model encountered an error.')
    #end processing

    utils.sign_epoch(infilestatus, 'Generate')
    utils.store_status(infilestatuspath, infilestatus)
    return True
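
Both this example and handleOneModel (Example 10) test the value that os.waitpid returns against zero. Strictly, that second tuple element is the raw wait status word, not the exit code itself; a non-zero test does detect failure, but decoding the actual exit code takes os.WEXITSTATUS. A POSIX-only sketch, with waitAndCheck as an illustrative name:

import os

def waitAndCheck(child):
    #os.waitpid returns (pid, status); the status word encodes both
    #the exit code and any terminating signal
    (pid, status) = os.waitpid(child.pid, 0)
    if os.WIFSIGNALED(status):
        raise RuntimeError('child killed by signal %d' % os.WTERMSIG(status))
    if os.WEXITSTATUS(status) != 0:
        raise RuntimeError('child exited with %d' % os.WEXITSTATUS(status))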
Example 7
def handleOneDocument(infile, cur, length):
    print(infile, length)

    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if not utils.check_epoch(infilestatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(infilestatus, 'Populate'):
        return False

    sep = config.getWordSep()

    #train
    docfile = open(infile + config.getSegmentPostfix(), 'r')
    words = []

    for oneline in docfile.readlines():
        oneline = oneline.rstrip(os.linesep)

        if len(oneline) == 0:
            continue

        (token, word) = oneline.split(" ", 1)
        token = int(token)

        if 0 == token:
            words = []
        else:
            words.append(word)

        if len(words) < length:
            continue

        if len(words) > length:
            words.pop(0)

        assert len(words) == length

        #do sqlite training
        words_str = sep + sep.join(words) + sep
        #print(words_str)

        rowcount = cur.execute(UPDATE_NGRAM_DML, (words_str,)).rowcount
        #print(rowcount)
        assert rowcount <= 1

        if 0 == rowcount:
            cur.execute(INSERT_NGRAM_DML, (words_str,))

    docfile.close()

    #sign epoch only after last pass
    if N == length:
        utils.sign_epoch(infilestatus, 'Populate')
        utils.store_status(infilestatuspath, infilestatus)

    return True
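
handleOneDocument slides a window of the last `length` words over each segmented document (a zero token resets the window) and upserts every full window into an n-gram table: try the UPDATE first, and INSERT a fresh row only when no row matched. UPDATE_NGRAM_DML and INSERT_NGRAM_DML are defined elsewhere in the module; the definitions below are only a plausible reconstruction over an assumed (words, freq) schema, with '|' standing in for config.getWordSep().

import sqlite3

#assumed schema and statements; the project's real table may differ
CREATE_NGRAM_DDL = \
    'CREATE TABLE ngram (words TEXT NOT NULL UNIQUE, freq INTEGER NOT NULL);'
UPDATE_NGRAM_DML = 'UPDATE ngram SET freq = freq + 1 WHERE words = ?;'
INSERT_NGRAM_DML = 'INSERT INTO ngram (words, freq) VALUES (?, 1);'

conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute(CREATE_NGRAM_DDL)

for words_str in ('|a|b|', '|a|b|', '|b|c|'):
    rowcount = cur.execute(UPDATE_NGRAM_DML, (words_str, )).rowcount
    if 0 == rowcount:
        cur.execute(INSERT_NGRAM_DML, (words_str, ))

print(cur.execute('SELECT words, freq FROM ngram ORDER BY words;').fetchall())
#[('|a|b|', 2), ('|b|c|', 1)]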
Example 8
def storeModelStatus(modelfile, textnum, nexttextnum):
    #store model info in status file
    modelstatuspath = modelfile + config.getStatusPostfix()
    #start from a fresh status
    modelstatus = {}
    modelstatus['GenerateStart'] = textnum
    modelstatus['GenerateEnd'] = nexttextnum
    utils.sign_epoch(modelstatus, 'Generate')
    utils.store_status(modelstatuspath, modelstatus)
Example 9
def handleOneIndex(indexpath, subdir, indexname):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'NewWord'):
        raise utils.EpochError('Please new word first.\n')
    if utils.check_epoch(indexstatus, 'MarkPinyin'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    markPinyins(workdir)

    #sign epoch
    utils.sign_epoch(indexstatus, 'MarkPinyin')
    utils.store_status(indexstatuspath, indexstatus)
Example 10
def handleOneModel(modelfile, reportfile):
    modelfilestatuspath = modelfile + config.getStatusPostfix()
    modelfilestatus = utils.load_status(modelfilestatuspath)
    if not utils.check_epoch(modelfilestatus, 'Generate'):
        raise utils.EpochError('Please generate first.\n')
    if utils.check_epoch(modelfilestatus, 'Estimate'):
        return

    reporthandle = open(reportfile, 'wb')

    result_line_prefix = "average lambda:"
    avg_lambda = 0.

    #begin processing
    cmdline = ['../utils/training/estimate_k_mixture_model', \
                   '--deleted-bigram-file', \
                   config.getEstimatesModel(), \
                   '--bigram-file', \
                   modelfile]

    subprocess = Popen(cmdline, shell=False, stdout=PIPE, \
                           close_fds=True)

    for line in subprocess.stdout.readlines():
        reporthandle.writelines([line])
        #remove trailing '\n'
        line = line.decode('utf-8')
        line = line.rstrip(os.linesep)
        if line.startswith(result_line_prefix):
            avg_lambda = float(line[len(result_line_prefix):])

    reporthandle.close()

    (pid, status) = os.waitpid(subprocess.pid, 0)
    if status != 0:
        sys.exit('estimate_k_mixture_model returned an error.')
    #end processing

    print('average lambda:', avg_lambda)
    modelfilestatus['EstimateScore'] = avg_lambda
    utils.sign_epoch(modelfilestatus, 'Estimate')
    utils.store_status(modelfilestatuspath, modelfilestatus)
Example 11
def handleOneIndex(indexpath, subdir, indexname):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(indexstatus, 'Prepare'):
        return

    #create directory
    onedir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    os.path.exists(onedir) or os.makedirs(onedir)

    #create sqlite databases
    createSqliteDatabases(onedir)

    #sign epoch
    utils.sign_epoch(indexstatus, 'Prepare')
    utils.store_status(indexstatuspath, indexstatus)
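
The "os.path.exists(onedir) or os.makedirs(onedir)" line creates the working directory only when it is missing, but it leaves a window between the check and the creation. On Python 3.2 and later the same effect is usually written with exist_ok, as in this small equivalent (a sketch, not the project's code):

import os

def ensureDir(onedir):
    #same effect as "os.path.exists(onedir) or os.makedirs(onedir)",
    #without the check-then-create race
    os.makedirs(onedir, exist_ok=True)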
Example 12
def handleOneIndex(indexpath, subdir, indexname):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'PartialWord'):
        raise utils.EpochError('Please partial word first.\n')
    if utils.check_epoch(indexstatus, 'NewWord'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    createBigramSqlite(workdir)
    populateBigramSqlite(workdir)

    filename = config.getBigramFileName()
    filepath = workdir + os.sep + filename

    conn = sqlite3.connect(filepath)

    prethres = computeThreshold(conn, "prefix")
    indexstatus['NewWordPrefixThreshold'] = prethres
    postthres = computeThreshold(conn, "postfix")
    indexstatus['NewWordPostfixThreshold'] = postthres

    utils.store_status(indexstatuspath, indexstatus)

    filterPartialWord(workdir, conn, prethres, postthres)

    conn.commit()
    conn.close()

    #sign epoch
    utils.sign_epoch(indexstatus, 'NewWord')
    utils.store_status(indexstatuspath, indexstatus)
Example 13
def handleOneIndex(indexpath, subdir, indexname):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Populate'):
        raise utils.EpochError('Please populate first.\n')
    if utils.check_epoch(indexstatus, 'PartialWord'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    threshold = getThreshold(workdir)
    indexstatus['PartialWordThreshold'] = threshold
    utils.store_status(indexstatuspath, indexstatus)

    recognizePartialWord(workdir, threshold)

    #sign epoch
    utils.sign_epoch(indexstatus, 'PartialWord')
    utils.store_status(indexstatuspath, indexstatus)
Example 14
def gatherModels(path, indexname):
    indexfilestatuspath = indexname + config.getStatusPostfix()
    indexfilestatus = utils.load_status(indexfilestatuspath)
    if utils.check_epoch(indexfilestatus, 'Estimate'):
        return

    #begin processing
    indexfile = open(indexname, "w")
    for root, dirs, files in os.walk(path, topdown=True, onerror=handleError):
        for onefile in files:
            filepath = os.path.join(root, onefile)
            if onefile.endswith(config.getModelPostfix()):
                #append one record to index file
                subdir = os.path.relpath(root, path)
                statusfilepath = filepath + config.getStatusPostfix()
                status = utils.load_status(statusfilepath)
                if not (utils.check_epoch(status, 'Estimate') and \
                        'EstimateScore' in status):
                    raise utils.EpochError('Unknown Error:\n' + \
                                               'Try re-run estimate.\n')
                avg_lambda = status['EstimateScore']
                line = subdir + '#' + onefile + '#' + str(avg_lambda)
                indexfile.writelines([line, os.linesep])
                #record written
            elif onefile.endswith(config.getStatusPostfix()):
                pass
            elif onefile.endswith(config.getIndexPostfix()):
                pass
            elif onefile.endswith(config.getReportPostfix()):
                pass
            else:
                print('Unexpected file: ' + filepath)
    indexfile.close()
    #end processing

    utils.sign_epoch(indexfilestatus, 'Estimate')
    utils.store_status(indexfilestatuspath, indexfilestatus)
Example 15
    else:
        #backup merged model
        utils.copyfile(mergedmodel, prunedmodel)
        pruneModel(prunedmodel, args.k, args.CDF)

    #validate pruned model
    print('validating')
    validateModel(prunedmodel)

    #export textual format
    print('exporting')
    exportfile = os.path.join(trydir, 'kmm_pruned.text')
    exportModel(prunedmodel, exportfile)

    #convert to interpolation
    print('converting')
    kmm_model = exportfile
    inter_model = os.path.join(trydir, config.getFinalModelFileName())
    convertModel(kmm_model, inter_model)

    modelsize = utils.get_file_length(inter_model)
    cwdstatus['PruneModelSize'] = modelsize
    utils.store_status(cwdstatuspath, cwdstatus)

    print('final model size:', modelsize)

    #sign status epoch
    utils.sign_epoch(cwdstatus, 'Prune')
    utils.store_status(cwdstatuspath, cwdstatus)
    print('done')
Example 16
    utils.copyfile(modelfile, destfile)

    print('cleaning')
    cleanUpData()

    print('building')
    buildData()

    print('estimating')
    reportfile = os.path.join(trydir, 'estimate' + config.getReportPostfix())
    avg_lambda = estimateModel(reportfile)
    print('average lambda:', avg_lambda)

    cwdstatus['EvaluateAverageLambda'] = avg_lambda
    utils.store_status(cwdstatuspath, cwdstatus)

    print('modifying lambda')
    modifyLambda(avg_lambda)

    print('evaluating')
    reportfile = os.path.join(trydir, 'evaluate' + config.getReportPostfix())
    rate = evaluateModel(reportfile)
    print(tryname + "'s correction rate:", rate)

    cwdstatus['EvaluateCorrectionRate'] = rate
    utils.store_status(cwdstatuspath, cwdstatus)

    utils.sign_epoch(cwdstatus, 'Evaluate')
    utils.store_status(cwdstatuspath, cwdstatus)
    print('done')
Example 17
def handleOneIndex(indexpath, subdir, indexname, fast):
    inMemoryFile = "model.db"

    modeldir = os.path.join(config.getModelDir(), subdir, indexname)
    os.path.exists(modeldir) or os.makedirs(modeldir)


    def cleanupInMemoryFile():
        modelfile = os.path.join(config.getInMemoryFileSystem(), inMemoryFile)
        reportfile = modelfile + config.getReportPostfix()
        if os.access(modelfile, os.F_OK):
            os.unlink(modelfile)
        if os.access(reportfile, os.F_OK):
            os.unlink(reportfile)

    def copyoutInMemoryFile(modelfile):
        inmemoryfile = os.path.join\
            (config.getInMemoryFileSystem(), inMemoryFile)
        inmemoryreportfile = inmemoryfile + config.getReportPostfix()
        reportfile = modelfile + config.getReportPostfix()

        if os.access(inmemoryfile, os.F_OK):
            utils.copyfile(inmemoryfile, modelfile)
        if os.access(inmemoryreportfile, os.F_OK):
            utils.copyfile(inmemoryreportfile, reportfile)

    def cleanupFiles(modelnum):
        modeldir = os.path.join(config.getModelDir(), subdir, indexname)
        modelfile = os.path.join( \
            modeldir, config.getCandidateModelName(modelnum))
        reportfile = modelfile + config.getReportPostfix()
        if os.access(modelfile, os.F_OK):
            os.unlink(modelfile)
        if os.access(reportfile, os.F_OK):
            os.unlink(reportfile)

    def storeModelStatus(modelfile, textnum, nexttextnum):
        #store model info in status file
        modelstatuspath = modelfile + config.getStatusPostfix()
        #start from a fresh status
        modelstatus = {}
        modelstatus['GenerateStart'] = textnum
        modelstatus['GenerateEnd'] = nexttextnum
        utils.sign_epoch(modelstatus, 'Generate')
        utils.store_status(modelstatuspath, modelstatus)

    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'MergeSequence'):
        raise utils.EpochError('Please mergeseq first.\n')
    if utils.check_epoch(indexstatus, 'Generate'):
        return

    #continue generating
    textnum, modelnum, aggmodelsize = 0, 0, 0
    if 'GenerateTextEnd' in indexstatus:
        textnum = indexstatus['GenerateTextEnd']
    if 'GenerateModelEnd' in indexstatus:
        modelnum = indexstatus['GenerateModelEnd']

    #clean up previous file
    if fast:
        cleanupInMemoryFile()

    cleanupFiles(modelnum)

    #begin processing
    indexfile = open(indexpath, 'r')
    for i, oneline in enumerate(indexfile.readlines()):
        #resume: skip texts consumed by a previous run
        if i < textnum:
            continue

        #remove trailing '\n'
        oneline = oneline.rstrip(os.linesep)
        (title, textpath) = oneline.split('#')
        infile = config.getTextDir() + textpath
        infilesize = utils.get_file_length(infile + config.getMergedPostfix())
        if infilesize < config.getMinimumFileSize():
            print("Skipping " + title + '#' + textpath)
            continue

        if fast:
            modelfile = os.path.join(config.getInMemoryFileSystem(), \
                                         inMemoryFile)
        else:
            modelfile = os.path.join(modeldir, \
                                         config.getCandidateModelName(modelnum))

        reportfile = modelfile + config.getReportPostfix()
        print("Proccessing " + title + '#' + textpath)
        if generateOneText(infile, modelfile, reportfile):
            aggmodelsize += infilesize
        print("Processed " + title + '#' + textpath)
        if aggmodelsize > config.getCandidateModelSize():
            #copy out in memory file
            if fast:
                modelfile = os.path.join\
                    (modeldir, config.getCandidateModelName(modelnum))
                copyoutInMemoryFile(modelfile)
                cleanupInMemoryFile()

            #the model file is on disk now
            nexttextnum = i + 1
            storeModelStatus(modelfile, textnum, nexttextnum)

            #new model candidate
            aggmodelsize = 0
            textnum = nexttextnum
            modelnum += 1

            #clean up next file
            cleanupFiles(modelnum)

            #save current progress in status file
            indexstatus['GenerateTextEnd'] = nexttextnum
            indexstatus['GenerateModelEnd'] = modelnum
            utils.store_status(indexstatuspath, indexstatus)


    #copy out in memory file
    if fast:
        modelfile = os.path.join\
            (modeldir, config.getCandidateModelName(modelnum))
        copyoutInMemoryFile(modelfile)
        cleanupInMemoryFile()

    #the model file is on disk now
    nexttextnum = i + 1
    storeModelStatus(modelfile, textnum, nexttextnum)

    indexfile.close()
    #end processing

    #save current progress in status file
    modelnum += 1
    indexstatus['GenerateTextEnd'] = nexttextnum
    indexstatus['GenerateModelEnd'] = modelnum

    utils.sign_epoch(indexstatus, 'Generate')
    utils.store_status(indexstatuspath, indexstatus)
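
This final example batches texts into candidate models: generateOneText output accumulates until aggmodelsize crosses config.getCandidateModelSize(), then the current model is sealed with storeModelStatus and progress (GenerateTextEnd, GenerateModelEnd) is checkpointed into the index status, so an interrupted run resumes at the last sealed model. A stripped-down sketch of just that batch-and-checkpoint loop, with hypothetical callbacks standing in for the real pipeline:

def batchTexts(sizes, threshold, process, checkpoint, start=0):
    #sizes: per-text input sizes; process(i) handles text i;
    #checkpoint(first, next) seals a model covering texts [first, next)
    aggsize, first = 0, start
    for i, size in enumerate(sizes):
        if i < start:
            continue    #resume: texts before 'start' are already consumed
        process(i)
        aggsize += size
        if aggsize > threshold:
            checkpoint(first, i + 1)
            aggsize, first = 0, i + 1
    checkpoint(first, len(sizes))    #seal the trailing, partial model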