Example #1
0
def getRealCosts():
  """Return the cached language-model objects, building them on first use.

  On the first call (while ``_realUnigramCost`` is still ``None``) this
  trains the unigram and bigram cost functions with
  ``wordsegUtil.makeLanguageModels(CORPUS)`` and builds the fills table with
  ``wordsegUtil.makeInverseRemovalDictionary(CORPUS, 'aeiou')``, storing all
  three in module globals. Later calls return the stored objects unchanged.

  Returns:
    A ``(unigramCost, bigramCost, possibleFills)`` tuple.
  """
  global _realUnigramCost, _realBigramCost, _possibleFills

  if _realUnigramCost is None:
    # The message has no trailing newline, so flush explicitly; otherwise a
    # buffered stdout may hold it back until training has already finished.
    print(f'Training language cost functions [corpus: {CORPUS}]... ',
          end='', flush=True)

    _realUnigramCost, _realBigramCost = wordsegUtil.makeLanguageModels(CORPUS)
    _possibleFills = wordsegUtil.makeInverseRemovalDictionary(CORPUS, 'aeiou')

    print('Done!')
    print('')

  return _realUnigramCost, _realBigramCost, _possibleFills
Example #2
0
def main():
    """Train the language models on 'training.txt' and pass them to calculate().

    Writes a progress message to stdout before training and 'Done!' followed
    by a blank line afterwards.
    """
    corpus_path = 'training.txt'

    # No trailing newline on the progress message, so flush it right away.
    out = sys.stdout
    out.write('Training language cost functions [corpus: %s]... ' % corpus_path)
    out.flush()

    unigramCost, bigramCost = wordsegUtil.makeLanguageModels(corpus_path)

    # 'Done!' plus one empty line, exactly as two separate print calls would.
    print('Done!\n')

    # An alternative entry point, repl(unigramCost, bigramCost), is left
    # disabled here in favour of calculate().
    calculate(unigramCost, bigramCost)
Example #3
0
def getRealCosts():
    """Return the cached language-model objects, building them on first use.

    On the first call (while ``_realUnigramCost`` is still ``None``) this
    trains the unigram and bigram cost functions with
    ``wordsegUtil.makeLanguageModels(CORPUS)`` and builds the fills table with
    ``wordsegUtil.makeInverseRemovalDictionary(CORPUS, 'aeiou')``, storing
    all three in module globals. Later calls return the stored objects.

    Returns:
        A ``(unigramCost, bigramCost, possibleFills)`` tuple.
    """
    global _realUnigramCost, _realBigramCost, _possibleFills

    if _realUnigramCost is None:
        # No trailing newline on the progress message, so flush it right away.
        sys.stdout.write('Training language cost functions [corpus: %s]... ' % CORPUS)
        sys.stdout.flush()

        _realUnigramCost, _realBigramCost = wordsegUtil.makeLanguageModels(CORPUS)
        _possibleFills = wordsegUtil.makeInverseRemovalDictionary(CORPUS, 'aeiou')

        # Call form of print: a syntax error-free spelling on Python 3 that
        # emits the same text as the old print statement on Python 2.
        print('Done!')
        print('')

    return _realUnigramCost, _realBigramCost, _possibleFills
Example #4
0
def getRealCosts():
    """Return ``(unigramCost, bigramCost, possibleFills)``, training lazily.

    The three objects live in module globals. If ``_realUnigramCost`` is
    already set they are returned as-is; otherwise they are built from
    ``CORPUS`` through ``wordsegUtil`` first, with a progress message written
    to stdout.
    """
    global _realUnigramCost, _realBigramCost, _possibleFills

    # Fast path: everything was built by an earlier call.
    if _realUnigramCost is not None:
        return _realUnigramCost, _realBigramCost, _possibleFills

    # No trailing newline on the progress message, so flush it right away.
    progress = 'Training language cost functions [corpus: %s]... ' % CORPUS
    sys.stdout.write(progress)
    sys.stdout.flush()

    _realUnigramCost, _realBigramCost = wordsegUtil.makeLanguageModels(CORPUS)
    _possibleFills = wordsegUtil.makeInverseRemovalDictionary(CORPUS, 'aeiou')

    # 'Done!' plus one empty line, exactly as two separate print calls would.
    print('Done!\n')

    return _realUnigramCost, _realBigramCost, _possibleFills
Example #5
0
def main():
    """Parse arguments, train the language models, and start the REPL.

    Exits with status 1 when ``args.model`` is set to something other than
    'seg', 'ins' or 'both'. The corpus is ``args.text_corpus`` when given,
    otherwise 'leo-will.txt'.
    """
    args = parseArgs()
    if args.model and args.model not in ['seg', 'ins', 'both']:
        # Format the message explicitly; printing a (label, value) tuple
        # would show the tuple's repr instead of a readable message.
        print('Unrecognized model: %s' % args.model)
        sys.exit(1)

    corpus = args.text_corpus or 'leo-will.txt'

    # No trailing newline on the progress message, so flush it right away.
    sys.stdout.write('Training language cost functions [corpus: %s]... ' % corpus)
    sys.stdout.flush()

    unigramCost, bigramCost = wordsegUtil.makeLanguageModels(corpus)
    possibleFills = wordsegUtil.makeInverseRemovalDictionary(corpus, 'aeiou')

    print('Done!')
    print('')

    repl(unigramCost, bigramCost, possibleFills, command=args.model)
Example #6
0
def main():
    """Parse arguments, train the language models, and start the REPL.

    Exits with status 1 when ``args.model`` is set to something other than
    'seg', 'ins' or 'both'. The corpus is ``args.text_corpus`` when given,
    otherwise 'leo-will.txt'.
    """
    args = parseArgs()
    if args.model and args.model not in ['seg', 'ins', 'both']:
        # Single pre-formatted string: prints the same text as the old
        # two-argument print statement, on both Python 2 and Python 3.
        print('Unrecognized model: %s' % args.model)
        sys.exit(1)

    corpus = args.text_corpus or 'leo-will.txt'

    # No trailing newline on the progress message, so flush it right away.
    sys.stdout.write('Training language cost functions [corpus: %s]... ' % corpus)
    sys.stdout.flush()

    unigramCost, bigramCost = wordsegUtil.makeLanguageModels(corpus)
    possibleFills = wordsegUtil.makeInverseRemovalDictionary(corpus, 'aeiou')

    # Call form of print: valid on Python 3 and output-compatible on Python 2.
    print('Done!')
    print('')

    repl(unigramCost, bigramCost, possibleFills, command=args.model)
# step: the number of characters that have been segmented so far

    def succ_and_cost(self, state):
        """Yield every action that can be taken from ``state``.

        ``state`` is used as an index into ``self.query``. Each candidate
        word is the slice of the query that starts at ``state`` and ends at
        some later index, up to and including the end of the query.

        Yields:
            ``(action, next_state, cost)`` tuples: the candidate word, the
            index just past that word, and ``self.unigramCost(word)``.
            Candidates are produced from shortest to longest.
        """
        query_length = len(self.query)
        for end in range(state + 1, query_length + 1):
            candidate = self.query[state:end]
            # Choosing a word as the action is what places the next split.
            yield candidate, end, self.unigramCost(candidate)

if __name__ == '__main__':
    # Script entry point: build the cost functions, set up a segmentation
    # problem for a fixed query, solve it, and print the result.
    unigramCost, bigramCost = wordsegUtil.makeLanguageModels(
        'leo-will.txt'
    )  # Builds the cost functions; 'leo-will.txt' is the corpus the language model is made from.
    # Original author's note: if a word that is not in the corpus comes in,
    # the cost becomes infinite (it does not work well).
    problem = SegmentationProblem('thisisnotmybeautifulhouse', unigramCost)

    # Alternative solver, currently disabled:
    # import dynamic_programming_search
    # dps = dynamic_programming_search.DynamicProgrammingSearch(verbose=1)
    # # dps = dynamic_programming_search.DynamicProgrammingSearch(memory_use=False, verbose=1)
    # print(dps.solve(problem))

    # Solve with uniform cost search and print whatever solve() returns.
    import uniform_cost_search
    ucs = uniform_cost_search.UniformCostSearch(verbose=0)
    print(ucs.solve(problem))

# === Other Examples ===
#
Example #8
0
        self.query = query
        self.bigramCost = bigramCost
        self.possibleFills = possibleFills

    def start_state(self):
        """Return the initial state: position 0 with the sentence-begin marker.

        A state is a pair of (position before which the text has been
        reconstructed, previous word).
        """
        position, previous_word = 0, wordsegUtil.SENTENCE_BEGIN
        return position, previous_word

    def is_end(self, state):
        """Return True when the state's position equals the query length."""
        position = state[0]
        query_length = len(self.query)
        return position == query_length

    def succ_and_cost(self, state):
        """Placeholder for successor generation; not implemented here.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()


# Module-level driver: everything below runs at import time.
# Build the unigram and bigram cost functions from the 'leo-will.txt' corpus.
unigramCost, bigramCost = wordsegUtil.makeLanguageModels('leo-will.txt')
# Combine the two cost functions via smoothUnigramAndBigram with parameter 0.2.
smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
# Build the fills table from the same corpus for the characters 'aeiou'.
possibleFills = wordsegUtil.makeInverseRemovalDictionary(
    'leo-will.txt', 'aeiou')
# The smoothed cost is passed in the problem's bigram-cost position.
problem = JointSegmentationInsertionProblem('mgnllthppl', smoothCost,
                                            possibleFills)

# The dynamic-programming solver is constructed, but its solve() call is
# commented out, so `dps` is not used below.
import dynamic_programming_search
dps = dynamic_programming_search.DynamicProgrammingSearch(verbose=1)
# dps = dynamic_programming_search.DynamicProgrammingSearch(memory_use=False, verbose=1)
# print(dps.solve(problem))

# Solve with uniform cost search and print whatever solve() returns.
# NOTE(review): if the class above is JointSegmentationInsertionProblem, its
# succ_and_cost raises NotImplementedError, so solve() presumably fails until
# that method is implemented — confirm.
import uniform_cost_search
ucs = uniform_cost_search.UniformCostSearch(verbose=0)
print(ucs.solve(problem))