# Feature configuration for an English (WSJ/Penn Treebank) POS tagger.
# Reconstructed line structure: in the original, newlines had been collapsed,
# so each inline comment swallowed the statements after it.
config = Configuration('wsj', args)

# Read tagset and tag lexicon from corpus
wsj_tags, wsj_norm_tags = read_dict('data/wsj-train.tab', 0, 1)

# Create a Tagset object from the tags we have read
WSJ = Tagset(wsj_tags, config)

# Load a file with word classes
WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config)

# Column indices in the input .tab file: word form, then POS tag.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = WSJ.tag(tag_field, 0)
last_tag = WSJ.tag(tag_field, -1)
last_last_tag = WSJ.tag(tag_field, -2)

# Define words (relative to the current position during a search)
this_word = TextField(text_field, 0)
last_word = TextField(text_field, -1)
next_word = TextField(text_field, 1)
next_next_word = TextField(text_field, 2)

# Use case-sensitive word clusters
this_wc = WC.lookup(this_word)
last_wc = WC.lookup(last_word)
next_wc = WC.lookup(next_word)
next_next_wc = WC.lookup(next_next_word)
3) # Create a Tagset object from the tags we have read SUC_NE = Tagset(suc_ne_tags, config) text_field = 0 lemma_field = 1 suc_full_field = 2 tag_field = 3 Names = WCLexicon.from_file('names', 'suc-data/names.txt', config) WC = WCLexicon.from_file('brown', 'suc-data/swe-brown100.txt', config) # Define tags (relative to the current position during a search) this_tag = SUC_NE.tag(tag_field, 0) last_tag = SUC_NE.tag(tag_field, -1) last_last_tag = SUC_NE.tag(tag_field, -2) # POS tags (+ morphology) this_pos = TextField(suc_full_field, 0) last_pos = TextField(suc_full_field, -1) next_pos = TextField(suc_full_field, 2) # Define lemmas (relative to the current position during a search) this_lemma = TextField(lemma_field, 0) last_lemma = TextField(lemma_field, -1) next_lemma = TextField(lemma_field, 1) next_next_lemma = TextField(lemma_field, 2) # Define words (relative to the current position during a search)
udt_tags, udt_norm_tags = read_dict(train_filename, 0, 1) # UDv1 #udt_tags = set(('ADJ ADP PUNCT ADV AUX SYM INTJ CONJ X NOUN DET PROPN NUM ' + # 'VERB PART PRON SCONJ').split()) # UDv2 udt_tags = set(('ADJ ADP ADV AUX CCONJ DET INTJ NOUN NUM PART PRON PROPN ' 'PUNCT SCONJ SYM VERB X').split()) # Create a Tagset object from the tags we have read UDT = Tagset(udt_tags, config) text_field = 0 tag_field = 1 # Define tags (relative to the current position during a search) this_tag = UDT.tag(tag_field, 0) last_tag = UDT.tag(tag_field, -1) last_last_tag = UDT.tag(tag_field, -2) # Define words (relative to the current position during a search) this_word = TextField(text_field, 0) last_word = TextField(text_field, -1) next_word = TextField(text_field, 1) next_next_word = TextField(text_field, 2) # Each tuple below represents a single feature template. fs = FeatureSet([ # Tag bigram and trigram features (this_tag, last_tag), (this_tag, last_tag, last_last_tag),
# Feature configuration for a Swedish (SUC) POS tagger.
# Reconstructed line structure from a newline-collapsed fragment.
#
# NOTE(review): this loop reads an already-open handle `f` and accumulates
# into `suc_tags` / `suc_norm_tags`, all defined before this chunk — it most
# likely sat inside a `with open(...) as f:` block at one extra indent level;
# confirm against the full source.
for line in f:
    # Each row: token, (ignored), tag, (ignored), tab-separated.
    token, _, tag, _ = line.rstrip('\n').split('\t')
    suc_norm_tags[token.lower()].add(tag)
    suc_tags.add(tag)

# Create a Tagset object from the tags we have read
SUC = Tagset(suc_tags, config)

# Load a file with word classes
WC = WCLexicon.from_file('brown', 'suc-data/swe-brown100.txt', config)

# Column indices in the input .tab file: word form, then POS tag.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = SUC.tag(tag_field, 0)
last_tag = SUC.tag(tag_field, -1)
last_last_tag = SUC.tag(tag_field, -2)

# Define words (relative to the current position during a search)
this_word = TextField(text_field, 0)
last_word = TextField(text_field, -1)
next_word = TextField(text_field, 1)
next_next_word = TextField(text_field, 2)

# Use case-sensitive word clusters
this_wc = WC.lookup(this_word)
last_wc = WC.lookup(last_word)
next_wc = WC.lookup(next_word)
next_next_wc = WC.lookup(next_next_word)
# Feature configuration for an English Universal Dependencies POS tagger.
# Reconstructed line structure from a newline-collapsed fragment.
udt_en_tags, udt_en_norm_tags = read_dict('data/udt-en-train.tab', 0, 1)

# Create a Tagset object from the tags we have read
UDT_EN = Tagset(udt_en_tags, config)

# Load a file with word clusters
# This is taken from Turian et al.:
#   http://metaoptimize.com/projects/wordreprs/
# and has been converted using the brown2wcl.py script.
WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config)

# Column indices in the input .tab file: word form, then POS tag.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = UDT_EN.tag(tag_field, 0)
last_tag = UDT_EN.tag(tag_field, -1)
last_last_tag = UDT_EN.tag(tag_field, -2)

# Define words (relative to the current position during a search)
this_word = TextField(text_field, 0)
last_word = TextField(text_field, -1)
next_word = TextField(text_field, 1)
next_next_word = TextField(text_field, 2)

# Use case-sensitive word clusters
this_wc = WC.lookup(this_word)
last_wc = WC.lookup(last_word)
next_wc = WC.lookup(next_word)
next_next_wc = WC.lookup(next_next_word)
# Feature configuration mapping coarse Swedish SUC tags to UD tags.
# Reconstructed line structure from a newline-collapsed fragment.
config = Configuration('udt_suc_sv', args)

# Read tagset and tag lexicon (coarse SUC -> UD) from corpus
udt_sv_tags, udt_sv_suc_tags = read_dict('data/sv-ud-train.tab', 1, 3)
# 'X' may be absent from the training data, so add it explicitly.
udt_sv_tags.add('X')

# Create a Tagset object from the tags we have read
UDT_SV = Tagset(udt_sv_tags, config)

# Column indices in the input .tab file.
lemma_field = 0
suc_field = 1
suc_full_field = 2
tag_field = 3

# UD tag (this is not really a sequence model, so we don't depend on history)
this_tag = UDT_SV.tag(tag_field, 0)

# Word form features (lemmas)
this_word = TextField(lemma_field, 0)

# Coarse SUC tags (given as input)
#
# The reason we apply normalization (= lower-casing) to these tags is because
# they're also used in the tag dictionary, which requires normalized inputs.
this_suc = normalize(TextField(suc_field, 0))
last_suc = normalize(TextField(suc_field, -1))
next_suc = normalize(TextField(suc_field, 1))
last_last_suc = normalize(TextField(suc_field, -2))
next_next_suc = normalize(TextField(suc_field, 2))

# Full SUC tags
# Feature configuration for a Swedish (SUC blogs) named-entity tagger.
# Reconstructed line structure from a newline-collapsed fragment.
suc_ne_tags, suc_norm_ne_tags = read_dict("suc-data/suc-blogs-ne-train.tab",
                                          1, 3)

# Create a Tagset object from the tags we have read
SUC_NE = Tagset(suc_ne_tags, config)

# Column indices in the input .tab file.
text_field = 0
lemma_field = 1
suc_full_field = 2
tag_field = 3

Names = WCLexicon.from_file("names", "suc-data/names.txt", config)
WC = WCLexicon.from_file("brown", "suc-data/swe-brown100.txt", config)

# Define tags (relative to the current position during a search)
this_tag = SUC_NE.tag(tag_field, 0)
last_tag = SUC_NE.tag(tag_field, -1)
last_last_tag = SUC_NE.tag(tag_field, -2)

# POS tags (+ morphology)
this_pos = TextField(suc_full_field, 0)
last_pos = TextField(suc_full_field, -1)
# NOTE(review): offset 2 here names this "next_pos" but skips one position
# (next_lemma below uses offset 1) — confirm whether 1 was intended.
next_pos = TextField(suc_full_field, 2)

# Define lemmas (relative to the current position during a search)
this_lemma = TextField(lemma_field, 0)
last_lemma = TextField(lemma_field, -1)
next_lemma = TextField(lemma_field, 1)
next_next_lemma = TextField(lemma_field, 2)

# Define words (relative to the current position during a search)