# Feature configuration for an English (WSJ/Penn Treebank) POS tagger.
# Reconstructed line structure: in the original, newlines had been collapsed,
# so each inline comment swallowed the statements after it.
config = Configuration('wsj', args)

# Read tagset and tag lexicon from corpus
wsj_tags, wsj_norm_tags = read_dict('data/wsj-train.tab', 0, 1)

# Create a Tagset object from the tags we have read
WSJ = Tagset(wsj_tags, config)

# Load a file with word classes
WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config)

# Column indices in the input .tab file: word form, then POS tag.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = WSJ.tag(tag_field, 0)
last_tag = WSJ.tag(tag_field, -1)
last_last_tag = WSJ.tag(tag_field, -2)

# Define words (relative to the current position during a search)
this_word = TextField(text_field, 0)
last_word = TextField(text_field, -1)
next_word = TextField(text_field, 1)
next_next_word = TextField(text_field, 2)

# Use case-sensitive word clusters
this_wc = WC.lookup(this_word)
last_wc = WC.lookup(last_word)
next_wc = WC.lookup(next_word)
next_next_wc = WC.lookup(next_next_word)
3) # Create a Tagset object from the tags we have read SUC_NE = Tagset(suc_ne_tags, config) text_field = 0 lemma_field = 1 suc_full_field = 2 tag_field = 3 Names = WCLexicon.from_file('names', 'suc-data/names.txt', config) WC = WCLexicon.from_file('brown', 'suc-data/swe-brown100.txt', config) # Define tags (relative to the current position during a search) this_tag = SUC_NE.tag(tag_field, 0) last_tag = SUC_NE.tag(tag_field, -1) last_last_tag = SUC_NE.tag(tag_field, -2) # POS tags (+ morphology) this_pos = TextField(suc_full_field, 0) last_pos = TextField(suc_full_field, -1) next_pos = TextField(suc_full_field, 2) # Define lemmas (relative to the current position during a search) this_lemma = TextField(lemma_field, 0) last_lemma = TextField(lemma_field, -1) next_lemma = TextField(lemma_field, 1) next_next_lemma = TextField(lemma_field, 2) # Define words (relative to the current position during a search)
udt_tags, udt_norm_tags = read_dict(train_filename, 0, 1) # UDv1 #udt_tags = set(('ADJ ADP PUNCT ADV AUX SYM INTJ CONJ X NOUN DET PROPN NUM ' + # 'VERB PART PRON SCONJ').split()) # UDv2 udt_tags = set(('ADJ ADP ADV AUX CCONJ DET INTJ NOUN NUM PART PRON PROPN ' 'PUNCT SCONJ SYM VERB X').split()) # Create a Tagset object from the tags we have read UDT = Tagset(udt_tags, config) text_field = 0 tag_field = 1 # Define tags (relative to the current position during a search) this_tag = UDT.tag(tag_field, 0) last_tag = UDT.tag(tag_field, -1) last_last_tag = UDT.tag(tag_field, -2) # Define words (relative to the current position during a search) this_word = TextField(text_field, 0) last_word = TextField(text_field, -1) next_word = TextField(text_field, 1) next_next_word = TextField(text_field, 2) # Each tuple below represents a single feature template. fs = FeatureSet([ # Tag bigram and trigram features (this_tag, last_tag), (this_tag, last_tag, last_last_tag),
# Feature configuration for a Swedish (SUC) POS tagger.
# Reconstructed line structure from a newline-collapsed fragment.
#
# NOTE(review): this loop reads an already-open handle `f` and accumulates
# into `suc_tags` / `suc_norm_tags`, all defined before this chunk — it most
# likely sat inside a `with open(...) as f:` block at one extra indent level;
# confirm against the full source.
for line in f:
    # Each row: token, (ignored), tag, (ignored), tab-separated.
    token, _, tag, _ = line.rstrip('\n').split('\t')
    suc_norm_tags[token.lower()].add(tag)
    suc_tags.add(tag)

# Create a Tagset object from the tags we have read
SUC = Tagset(suc_tags, config)

# Load a file with word classes
WC = WCLexicon.from_file('brown', 'suc-data/swe-brown100.txt', config)

# Column indices in the input .tab file: word form, then POS tag.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = SUC.tag(tag_field, 0)
last_tag = SUC.tag(tag_field, -1)
last_last_tag = SUC.tag(tag_field, -2)

# Define words (relative to the current position during a search)
this_word = TextField(text_field, 0)
last_word = TextField(text_field, -1)
next_word = TextField(text_field, 1)
next_next_word = TextField(text_field, 2)

# Use case-sensitive word clusters
this_wc = WC.lookup(this_word)
last_wc = WC.lookup(last_word)
next_wc = WC.lookup(next_word)
next_next_wc = WC.lookup(next_next_word)
# Feature configuration for an English Universal Dependencies POS tagger.
# Reconstructed line structure from a newline-collapsed fragment.
udt_en_tags, udt_en_norm_tags = read_dict('data/udt-en-train.tab', 0, 1)

# Create a Tagset object from the tags we have read
UDT_EN = Tagset(udt_en_tags, config)

# Load a file with word clusters
# This is taken from Turian et al.:
#   http://metaoptimize.com/projects/wordreprs/
# and has been converted using the brown2wcl.py script.
WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config)

# Column indices in the input .tab file: word form, then POS tag.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = UDT_EN.tag(tag_field, 0)
last_tag = UDT_EN.tag(tag_field, -1)
last_last_tag = UDT_EN.tag(tag_field, -2)

# Define words (relative to the current position during a search)
this_word = TextField(text_field, 0)
last_word = TextField(text_field, -1)
next_word = TextField(text_field, 1)
next_next_word = TextField(text_field, 2)

# Use case-sensitive word clusters
this_wc = WC.lookup(this_word)
last_wc = WC.lookup(last_word)
next_wc = WC.lookup(next_word)
next_next_wc = WC.lookup(next_next_word)
# Feature configuration mapping coarse Swedish SUC tags to UD tags.
# Reconstructed line structure from a newline-collapsed fragment.
config = Configuration('udt_suc_sv', args)

# Read tagset and tag lexicon (coarse SUC -> UD) from corpus
udt_sv_tags, udt_sv_suc_tags = read_dict('data/sv-ud-train.tab', 1, 3)
# 'X' may be absent from the training data, so add it explicitly.
udt_sv_tags.add('X')

# Create a Tagset object from the tags we have read
UDT_SV = Tagset(udt_sv_tags, config)

# Column indices in the input .tab file.
lemma_field = 0
suc_field = 1
suc_full_field = 2
tag_field = 3

# UD tag (this is not really a sequence model, so we don't depend on history)
this_tag = UDT_SV.tag(tag_field, 0)

# Word form features (lemmas)
this_word = TextField(lemma_field, 0)

# Coarse SUC tags (given as input)
#
# The reason we apply normalization (= lower-casing) to these tags is because
# they're also used in the tag dictionary, which requires normalized inputs.
this_suc = normalize(TextField(suc_field, 0))
last_suc = normalize(TextField(suc_field, -1))
next_suc = normalize(TextField(suc_field, 1))
last_last_suc = normalize(TextField(suc_field, -2))
next_next_suc = normalize(TextField(suc_field, 2))

# Full SUC tags
# Feature configuration for a Swedish (SUC blogs) named-entity tagger.
# Reconstructed line structure from a newline-collapsed fragment.
suc_ne_tags, suc_norm_ne_tags = read_dict("suc-data/suc-blogs-ne-train.tab",
                                          1, 3)

# Create a Tagset object from the tags we have read
SUC_NE = Tagset(suc_ne_tags, config)

# Column indices in the input .tab file.
text_field = 0
lemma_field = 1
suc_full_field = 2
tag_field = 3

Names = WCLexicon.from_file("names", "suc-data/names.txt", config)
WC = WCLexicon.from_file("brown", "suc-data/swe-brown100.txt", config)

# Define tags (relative to the current position during a search)
this_tag = SUC_NE.tag(tag_field, 0)
last_tag = SUC_NE.tag(tag_field, -1)
last_last_tag = SUC_NE.tag(tag_field, -2)

# POS tags (+ morphology)
this_pos = TextField(suc_full_field, 0)
last_pos = TextField(suc_full_field, -1)
# NOTE(review): offset 2 here names this "next_pos" but skips one position
# (next_lemma below uses offset 1) — confirm whether 1 was intended.
next_pos = TextField(suc_full_field, 2)

# Define lemmas (relative to the current position during a search)
this_lemma = TextField(lemma_field, 0)
last_lemma = TextField(lemma_field, -1)
next_lemma = TextField(lemma_field, 1)
next_next_lemma = TextField(lemma_field, 2)

# Define words (relative to the current position during a search)