from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict
import sys

# Tagger build configuration for the 'wsj' model.
# NOTE(review): Configuration, Tagset, TextField and args are not bound
# anywhere in this chunk; presumably they are imported/defined above it --
# confirm before running this fragment on its own.
config = Configuration('wsj', args)

# Read tagset and tag lexicon from corpus
wsj_tags, wsj_norm_tags = read_dict('data/wsj-train.tab', 0, 1)

# Create a Tagset object from the tags we have read
WSJ = Tagset(wsj_tags, config)

# Load a file with word classes
WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config)

# Field indices handed to the tag/word accessors below.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = WSJ.tag(tag_field, 0)
last_tag = WSJ.tag(tag_field, -1)
last_last_tag = WSJ.tag(tag_field, -2)

# Define words (relative to the current position during a search)
this_word = TextField(text_field, 0)
last_word = TextField(text_field, -1)
next_word = TextField(text_field, 1)
next_next_word = TextField(text_field, 2)
# NOTE(review): the two calls below open this chunk without any visible
# enclosing statement; they presumably form the body of a guard that starts
# above this chunk -- confirm and re-indent under it if so. As written at
# top level they end the script before the definitions that follow.
config.build()
sys.exit(0)

# Read tagset and tag lexicon (coarse SUC -> UD) from corpus
# NOTE(review): the arguments 1 and 3 equal lemma_field and tag_field
# defined below; presumably they are column indices -- confirm against
# tools.read_dict.
suc_ne_tags, suc_norm_ne_tags = read_dict('suc-data/suc-blogs-ne-train.tab', 1, 3)

# Create a Tagset object from the tags we have read
SUC_NE = Tagset(suc_ne_tags, config)

# Field indices handed to the tag/text accessors below.
text_field = 0
lemma_field = 1
suc_full_field = 2
tag_field = 3

# Word-class lexicons registered under the names 'names' and 'brown'.
Names = WCLexicon.from_file('names', 'suc-data/names.txt', config)
WC = WCLexicon.from_file('brown', 'suc-data/swe-brown100.txt', config)

# Define tags (relative to the current position during a search)
this_tag = SUC_NE.tag(tag_field, 0)
last_tag = SUC_NE.tag(tag_field, -1)
last_last_tag = SUC_NE.tag(tag_field, -2)

# POS tags (+ morphology)
this_pos = TextField(suc_full_field, 0)
last_pos = TextField(suc_full_field, -1)
# NOTE(review): offset is 2 although the name says "next" and last_pos
# uses -1 -- confirm whether offset 1 was intended.
next_pos = TextField(suc_full_field, 2)

# Define lemmas (relative to the current position during a search)
this_lemma = TextField(lemma_field, 0)
# NOTE(review): `f`, `suc_norm_tags`, `suc_tags` and `config` are bound above
# this chunk; this first loop presumably sits inside a `with open(...)` block
# that is not visible here -- confirm and re-indent under it if so.
for line in f:
    # Each line must hold exactly four tab-separated fields (otherwise the
    # unpacking raises ValueError); only the 1st (token) and 3rd (tag) are used.
    token, _, tag, _ = line.rstrip('\n').split('\t')
    # The lexicon is keyed by the lower-cased token.
    suc_norm_tags[token.lower()].add(tag)
    suc_tags.add(tag)

# Merge the entries of dalin.txt into the same lexicon and tag set.
with open('suc-data/dalin.txt', 'r', encoding='utf-8') as f:
    for line in f:
        token, _, tag, _ = line.rstrip('\n').split('\t')
        suc_norm_tags[token.lower()].add(tag)
        suc_tags.add(tag)

# Create a Tagset object from the tags we have read
SUC = Tagset(suc_tags, config)

# Load a file with word classes
WC = WCLexicon.from_file('brown', 'suc-data/swe-brown100.txt', config)

# Field indices handed to the tag/word accessors below.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = SUC.tag(tag_field, 0)
last_tag = SUC.tag(tag_field, -1)
last_last_tag = SUC.tag(tag_field, -2)

# Define words (relative to the current position during a search)
this_word = TextField(text_field, 0)
last_word = TextField(text_field, -1)
next_word = TextField(text_field, 1)
next_next_word = TextField(text_field, 2)
# NOTE(review): `f`, `suc_norm_tags`, `suc_tags` and `config` are bound above
# this chunk; this first loop presumably sits inside a `with open(...)` block
# that is not visible here -- confirm and re-indent under it if so.
for line in f:
    # Each line must hold exactly two tab-separated fields: token and tag
    # (otherwise the unpacking raises ValueError).
    token, tag = line.rstrip('\n').split('\t')
    # The lexicon is keyed by the lower-cased token.
    suc_norm_tags[token.lower()].add(tag)
    suc_tags.add(tag)

# Merge the entries of saldo.txt into the same lexicon and tag set. Unlike
# the loop above, these lines carry four fields; only the 1st (token) and
# 3rd (tag) are used.
with open('suc-data/saldo.txt', 'r', encoding='utf-8') as f:
    for line in f:
        token, _, tag, _ = line.rstrip('\n').split('\t')
        suc_norm_tags[token.lower()].add(tag)
        suc_tags.add(tag)

# Create a Tagset object from the tags we have read
SUC = Tagset(suc_tags, config)

# Load a file with word classes
WC = WCLexicon.from_file('brown', 'suc-data/swe-brown100.txt', config)

# Field indices handed to the tag/word accessors below.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = SUC.tag(tag_field, 0)
last_tag = SUC.tag(tag_field, -1)
last_last_tag = SUC.tag(tag_field, -2)

# Define words (relative to the current position during a search)
this_word = TextField(text_field, 0)
last_word = TextField(text_field, -1)
next_word = TextField(text_field, 1)
next_next_word = TextField(text_field, 2)
# When config.skip_generate is set, build immediately and exit with status 0
# without executing any of the definitions below.
# NOTE(review): `config`, read_dict, Tagset, WCLexicon, TextField and sys are
# bound above this chunk -- confirm.
if config.skip_generate:
    config.build()
    sys.exit(0)

# Read tagset and tag lexicon (coarse SUC -> UD) from corpus
# NOTE(review): the arguments 1 and 3 equal lemma_field and tag_field
# defined below; presumably they are column indices -- confirm against
# tools.read_dict.
suc_ne_tags, suc_norm_ne_tags = read_dict("suc-data/suc-blogs-ne-train.tab", 1, 3)

# Create a Tagset object from the tags we have read
SUC_NE = Tagset(suc_ne_tags, config)

# Field indices handed to the tag/text accessors below.
text_field = 0
lemma_field = 1
suc_full_field = 2
tag_field = 3

# Word-class lexicons registered under the names "names" and "brown".
Names = WCLexicon.from_file("names", "suc-data/names.txt", config)
WC = WCLexicon.from_file("brown", "suc-data/swe-brown100.txt", config)

# Define tags (relative to the current position during a search)
this_tag = SUC_NE.tag(tag_field, 0)
last_tag = SUC_NE.tag(tag_field, -1)
last_last_tag = SUC_NE.tag(tag_field, -2)

# POS tags (+ morphology)
this_pos = TextField(suc_full_field, 0)
last_pos = TextField(suc_full_field, -1)
# NOTE(review): offset is 2 although the name says "next" and last_pos
# uses -1 -- confirm whether offset 1 was intended.
next_pos = TextField(suc_full_field, 2)

# Define lemmas (relative to the current position during a search)
this_lemma = TextField(lemma_field, 0)