def get_batches(scts):
    """Generate fixed-size (batch_xs, batch_ys) batches from PTB sections.

    For every file whose id starts with "WSJ/" + section, each tagged token
    is turned into an example via get_example(sent, i) and written into the
    current batch. Tokens tagged "-NONE-" are skipped. When OOV_ONLY is
    set, tokens seen in TRAIN_SCTS are skipped as well.

    Args:
        scts: iterable of section name strings (each is concatenated onto
            the "WSJ/" file-id prefix).

    Yields:
        (batch_xs, batch_ys) tuples, each holding BATCH_SIZE examples. A
        trailing partial batch is padded with empty_example() entries
        before being yielded; nothing extra is yielded if the example count
        is an exact multiple of BATCH_SIZE.
    """
    batch_xs, batch_ys = empty_batch()
    ex_cnt = 0

    # The in-vocabulary set is only consulted when OOV_ONLY is set, so it is
    # only populated in that case.
    iv_set = set()
    if OOV_ONLY:
        def add_iv(f):
            for sent in ptb.sents(f):
                iv_set.update(sent)

        common.for_all_in_ptb_scts(TRAIN_SCTS, add_iv)

    for sct in scts:
        print("Section " + sct)
        prefix = "WSJ/" + sct
        fs = [f for f in ptb.fileids() if f.startswith(prefix)]
        for f in fs:
            # Trailing comma: under the Python 2 print statement this
            # suppresses the newline, exactly as the original did.
            print(" File " + f + "..."),

            # For each word in the sentences of the file,
            # create an example and add it to the batch.
            for sent in ptb.tagged_sents(f):
                for i, (word, tag) in enumerate(sent):
                    # Ignore "-NONE-" tags (not overt linguistic elements).
                    if tag == "-NONE-":
                        continue

                    # If we're in OOV mode, skip known tokens.
                    if OOV_ONLY and word in iv_set:
                        continue

                    x, y = get_example(sent, i)
                    batch_xs[ex_cnt] = x
                    batch_ys[ex_cnt] = y
                    ex_cnt += 1

                    # Once the batch is full, hand it out and start a
                    # fresh one so the yielded arrays are never reused.
                    if ex_cnt == BATCH_SIZE:
                        yield (batch_xs, batch_ys)
                        batch_xs, batch_ys = empty_batch()
                        ex_cnt = 0

    # If we have an incomplete batch at the end, pad it with empty examples
    # and yield it.
    if ex_cnt != 0:
        while ex_cnt < BATCH_SIZE:
            x, y = empty_example()
            batch_xs[ex_cnt] = x
            batch_ys[ex_cnt] = y
            ex_cnt += 1
        yield (batch_xs, batch_ys)

    # No explicit `raise StopIteration`: falling off the end finishes the
    # generator, and raising it by hand is a RuntimeError under PEP 479.
def make_word_model(scts):
    """Build the per-tag word model from the given PTB sections.

    The result maps every tag in common.OPEN_CLASSES | common.CLOSED_CLASSES
    to its own dict. Each (word, tag) pair read from the sections is fed to
    add_counts (open-class tags) or observe_closed (closed-class tags); pairs
    with any other tag are ignored. Afterwards every open-class entry is
    passed through open_as_probs and every other entry through
    counts_to_probs.

    NOTE(review): a second definition with the same name appears further
    down in this source; if both live in one module the later one wins.

    Args:
        scts: sections handed to common.for_all_in_ptb_scts.

    Returns:
        The dict of per-tag models described above.
    """
    all_tags = common.OPEN_CLASSES | common.CLOSED_CLASSES
    super_model = dict((tag, {}) for tag in all_tags)

    def tally_file(fileid):
        # Route each tagged word to the updater matching its tag class.
        for word, tag in ptb.tagged_words(fileid):
            if tag in common.OPEN_CLASSES:
                add_counts(word, super_model[tag])
                continue
            if tag in common.CLOSED_CLASSES:
                observe_closed(word, super_model[tag])

    common.for_all_in_ptb_scts(scts, tally_file)

    for tag, tag_model in super_model.iteritems():
        if tag not in common.OPEN_CLASSES:
            counts_to_probs(tag, super_model)
        else:
            # A smooth(model, 1) step used to sit here; it is disabled.
            open_as_probs(tag_model)

    return super_model
def make_word_model(scts):
    """Return a dict of per-tag word models built from PTB sections `scts`.

    One empty dict is created per tag in the union of common.OPEN_CLASSES
    and common.CLOSED_CLASSES. Tagged words from the sections then update
    the dict for their tag: add_counts for open-class tags, observe_closed
    for closed-class tags, nothing for tags in neither set. Finally each
    open-class dict goes through open_as_probs and each remaining tag
    through counts_to_probs(tag, super_model).

    NOTE(review): this appears to repeat an earlier definition of the same
    name in this source; confirm which copy is meant to be kept.
    """
    super_model = {}
    for tag in common.OPEN_CLASSES | common.CLOSED_CLASSES:
        super_model[tag] = {}

    def collect(path):
        for word, tag in ptb.tagged_words(path):
            is_open = tag in common.OPEN_CLASSES
            if not (is_open or tag in common.CLOSED_CLASSES):
                # Tag belongs to neither class set; nothing to record.
                continue
            update = add_counts if is_open else observe_closed
            update(word, super_model[tag])

    common.for_all_in_ptb_scts(scts, collect)

    for tag, model in super_model.iteritems():
        if tag in common.OPEN_CLASSES:
            # The smooth(model, 1) call is currently switched off.
            open_as_probs(model)
            continue
        counts_to_probs(tag, super_model)

    return super_model