Example #1
0
def smart_tag(tokens, tagger):
    """Tag a token stream using a pre-built tagger.

    Tokens for which ``tag`` already returns a tag are passed through with
    that tag.  For every other token, the candidate tags stored in
    ``tagger`` for three surrounding contexts vote on the result, and the
    highest-scoring candidate is yielded.

    Args:
        tokens: Token stream; it is handed to ``iter_ngrams(tokens, 3)``.
        tagger: Mapping with the keys ``'L'``, ``'M'`` and ``'R'``, each of
            which maps a pair of tags to a list of candidate tags (the
            shape returned by ``build_tagger``).

    Yields:
        ``(token, tag)`` pairs.  The tag is ``None`` when none of the three
        contexts offers a candidate.
    """
    tr1, tr2, tr3 = tee(iter_ngrams(tokens, 3), 3)

    # Stagger the copies: tr3 runs two trigrams ahead of tr1 and one ahead of
    # tr2.  Presumably iter_ngrams yields a sliding window, so that tr1 ends
    # on the current token and tr2 is centred on it -- confirm against
    # iter_ngrams.  A default is passed to next() so that a very short stream
    # simply ends the generator instead of leaking StopIteration, which
    # PEP 479 turns into a RuntimeError inside generators.
    next(tr2, None)
    next(tr3, None)
    next(tr3, None)

    for token, m3, r3 in tr3:
        # tr1 and tr2 lag behind tr3, so they cannot run out here.
        l1, m1, _ = next(tr1)
        l2, __, r2 = next(tr2)

        known = tag(token)
        if known is not None:
            yield token, known
            continue

        # (tagger slot, context key, total weight of that context's vote).
        # The weights are floats so that every division below is a true
        # division on Python 2 as well as Python 3.
        contexts = (
            ('R', (tag(l1), tag(m1)), 200000.0),
            ('M', (tag(l2), tag(r2)), 600000.0),
            ('L', (tag(m3), tag(r3)), 200000.0),
        )

        scores = {}
        for slot, context, weight in contexts:
            candidates = tagger[slot][context]
            count = len(candidates)
            if not count:
                continue
            # Split the context's weight evenly over its candidate list;
            # a tag listed several times collects several shares.
            share = weight / count
            for candidate in candidates:
                scores[candidate] = scores.get(candidate, 0) + share

        if not scores:
            yield token, None
            continue

        if 'Z' in scores:
            # damp the pronouns: rescale the 'Z' score so that its share of
            # the total becomes one tenth of what it was.
            rest = sum(scores.values()) - scores['Z']
            prob = scores['Z'] / ((rest + scores['Z']) * 10)  # 0.1 damping
            scores['Z'] = prob * rest / (1 - prob)

        # key1 is the sort key the original code used (defined outside this
        # block; presumably it selects the score of an item).  max() returns
        # the first best item, which is the same item a stable descending
        # sort would put first, and it works on Python 3 dict views.
        yield token, max(scores.items(), key=key1)[0]
Example #2
0
def smart_tag(tokens, tagger):
    """Tag a token stream using a pre-built tagger.

    Tokens for which ``tag`` already returns a tag are passed through with
    that tag.  For every other token, the candidate tags stored in
    ``tagger`` for three surrounding contexts vote on the result, and the
    highest-scoring candidate is yielded.

    Args:
        tokens: Token stream; it is handed to ``iter_ngrams(tokens, 3)``.
        tagger: Mapping with the keys ``'L'``, ``'M'`` and ``'R'``, each of
            which maps a pair of tags to a list of candidate tags (the
            shape returned by ``build_tagger``).

    Yields:
        ``(token, tag)`` pairs.  The tag is ``None`` when none of the three
        contexts offers a candidate.
    """
    tr1, tr2, tr3 = tee(iter_ngrams(tokens, 3), 3)

    # Stagger the copies: tr3 runs two trigrams ahead of tr1 and one ahead of
    # tr2.  Presumably iter_ngrams yields a sliding window, so that tr1 ends
    # on the current token and tr2 is centred on it -- confirm against
    # iter_ngrams.  A default is passed to next() so that a very short stream
    # simply ends the generator instead of leaking StopIteration, which
    # PEP 479 turns into a RuntimeError inside generators.
    next(tr2, None)
    next(tr3, None)
    next(tr3, None)

    for token, m3, r3 in tr3:
        # tr1 and tr2 lag behind tr3, so they cannot run out here.
        l1, m1, _ = next(tr1)
        l2, __, r2 = next(tr2)

        known = tag(token)
        if known is not None:
            yield token, known
            continue

        # (tagger slot, context key, total weight of that context's vote).
        # The weights are floats so that every division below is a true
        # division on Python 2 as well as Python 3.
        contexts = (
            ('R', (tag(l1), tag(m1)), 200000.0),
            ('M', (tag(l2), tag(r2)), 600000.0),
            ('L', (tag(m3), tag(r3)), 200000.0),
        )

        scores = {}
        for slot, context, weight in contexts:
            candidates = tagger[slot][context]
            count = len(candidates)
            if not count:
                continue
            # Split the context's weight evenly over its candidate list;
            # a tag listed several times collects several shares.
            share = weight / count
            for candidate in candidates:
                scores[candidate] = scores.get(candidate, 0) + share

        if not scores:
            yield token, None
            continue

        if 'Z' in scores:
            # damp the pronouns: rescale the 'Z' score so that its share of
            # the total becomes one tenth of what it was.
            rest = sum(scores.values()) - scores['Z']
            prob = scores['Z'] / ((rest + scores['Z']) * 10)  # 0.1 damping
            scores['Z'] = prob * rest / (1 - prob)

        # key1 is the sort key the original code used (defined outside this
        # block; presumably it selects the score of an item).  max() returns
        # the first best item, which is the same item a stable descending
        # sort would put first, and it works on Python 3 dict views.
        yield token, max(scores.items(), key=key1)[0]
Example #3
0
def build_tagger(tokens):
    """Build tagger from basic analysis of a token stream.

    Every trigram from ``iter_ngrams(tokens, 3)`` whose three tokens all
    receive a tag from ``tag`` contributes up to three observations: the tag
    at each position is appended to the list keyed by the tags of the other
    two positions.

    Args:
        tokens: Token stream; it is handed to ``iter_ngrams(tokens, 3)``.

    Returns:
        A dict with the keys ``'L'``, ``'M'`` and ``'R'``.  Each value is a
        ``defaultdict(list)`` mapping a pair of tags to the list of tags
        observed at the left, middle or right position respectively
        (duplicates are kept, one entry per observation).
    """
    # Tags that are never recorded as candidates (presumably numerals and
    # punctuation -- confirm against the tag set).  They still take part in
    # the context keys of the other positions.
    builder_skip_tags = {'NU', 'PU'}

    tagger = {
        'L': defaultdict(list),
        'M': defaultdict(list),
        'R': defaultdict(list),
    }

    for l, m, r in iter_ngrams(tokens, 3):
        lt, mt, rt = map(tag, (l, m, r))

        # A trigram with any untagged token gives no usable context.
        if any(tag_ is None for tag_ in (lt, mt, rt)):
            continue

        if lt not in builder_skip_tags:
            tagger['L'][(mt, rt)].append(lt)
        if mt not in builder_skip_tags:
            tagger['M'][(lt, rt)].append(mt)
        if rt not in builder_skip_tags:
            tagger['R'][(lt, mt)].append(rt)

    return tagger
Example #4
0
def build_tagger(tokens):
    """Build tagger from basic analysis of a token stream.

    Every trigram from ``iter_ngrams(tokens, 3)`` whose three tokens all
    receive a tag from ``tag`` contributes up to three observations: the tag
    at each position is appended to the list keyed by the tags of the other
    two positions.

    Args:
        tokens: Token stream; it is handed to ``iter_ngrams(tokens, 3)``.

    Returns:
        A dict with the keys ``'L'``, ``'M'`` and ``'R'``.  Each value is a
        ``defaultdict(list)`` mapping a pair of tags to the list of tags
        observed at the left, middle or right position respectively
        (duplicates are kept, one entry per observation).
    """
    # Tags that are never recorded as candidates (presumably numerals and
    # punctuation -- confirm against the tag set).  They still take part in
    # the context keys of the other positions.
    builder_skip_tags = {'NU', 'PU'}

    tagger = {
        'L': defaultdict(list),
        'M': defaultdict(list),
        'R': defaultdict(list),
    }

    for l, m, r in iter_ngrams(tokens, 3):
        lt, mt, rt = map(tag, (l, m, r))

        # A trigram with any untagged token gives no usable context.
        if any(tag_ is None for tag_ in (lt, mt, rt)):
            continue

        if lt not in builder_skip_tags:
            tagger['L'][(mt, rt)].append(lt)
        if mt not in builder_skip_tags:
            tagger['M'][(lt, rt)].append(mt)
        if rt not in builder_skip_tags:
            tagger['R'][(lt, mt)].append(rt)

    return tagger