def run_geo(self):
    """Write the train/tune/test data files for the Geo corpus.

    Reads ``corpus-true.xml`` and ``nps-true.xml`` from
    ``self.config.data_dir`` and writes the split files into
    ``self.config.experiment_dir``.  Each corpus example is routed to a
    split by looking up its ``id`` attribute in the folds returned by
    ``self.get_folds()``.

    Training examples can be withheld as unlabeled data: a counter cycles
    through 0..9 over all corpus examples, and a training example seen
    while the counter is ``>= 10 * self.config.lfrac`` only has its NL
    side written, to ``unlabeled.nl``.

    Noun-phrase entries from ``nps-true.xml`` are filtered according to
    ``self.config.np_type`` and appended ``self.NP_WEIGHT`` times to the
    ``train.np.*`` and ``train.*.lm`` files.

    Every output file is closed before returning, including when an
    exception interrupts processing.
    """
    train_ids, tune_ids, test_ids = self.get_folds()

    out_dir = self.config.experiment_dir
    opened = []

    def open_out(name):
        # Register the handle immediately so the ``finally`` clause below
        # closes it no matter where a later failure happens.
        handle = open('%s/%s' % (out_dir, name), 'w')
        opened.append(handle)
        return handle

    try:
        train_nl = open_out('train.nl')
        train_nl_lm = open_out('train.nl.lm')
        train_nl_np = open_out('train.np.nl')
        train_mrl = open_out('train.mrl')
        train_mrl_lm = open_out('train.mrl.lm')
        train_mrl_np = open_out('train.np.mrl')
        train_fun = open_out('train.fun')
        unlabeled_nl = open_out('unlabeled.nl')
        tune_nl = open_out('tune.nl')
        tune_mrl = open_out('tune.mrl')
        test_nl = open_out('test.nl')
        test_mrl = open_out('test.mrl')
        test_fun = open_out('test.fun')

        nl_query = "nl[@lang='%s']" % self.config.lang
        fun_query = "mrl[@lang='geo-funql']"

        corpus = ET.parse('%s/corpus-true.xml' % self.config.data_dir)
        counter = 0
        for node in corpus.getroot().findall('example'):
            nl = self.preprocess_nl(node.find(nl_query).text)
            fun = self.preprocess_fun(node.find(fun_query).text)
            mrl = util.fun_to_mrl(fun)
            eid = int(node.attrib['id'])

            # The counter advances for every example, whichever split the
            # example ends up in.
            unlabel_this = counter >= 10 * self.config.lfrac
            counter = (counter + 1) % 10

            if eid in tune_ids:
                print >>tune_nl, nl
                print >>tune_mrl, mrl
            elif eid in train_ids and not unlabel_this:
                print >>train_nl, nl
                print >>train_mrl, mrl
                print >>train_fun, fun
                print >>train_nl_np, nl
                print >>train_mrl_np, mrl
                print >>train_nl_lm, '<s>', nl, '</s>'
                print >>train_mrl_lm, '<s>', mrl, '</s>'
            elif eid in train_ids and unlabel_this:
                print >>unlabeled_nl, nl
            elif eid in test_ids:
                print >>test_nl, nl
                print >>test_mrl, mrl
                print >>test_fun, fun

        nplist = ET.parse('%s/nps-true.xml' % self.config.data_dir)
        np_type = self.config.np_type
        for node in nplist.getroot().findall('example'):
            fun = self.preprocess_fun(node.find(fun_query).text)
            mrl = util.fun_to_mrl(fun)

            # An NP counts as "big" when its MRL has more than one token.
            big_np = len(mrl.split()) > 1
            if (np_type == 'big' and not big_np) or \
                    (np_type == 'small' and big_np):
                continue

            for nl_node in node.findall(nl_query):
                nl = self.preprocess_nl(nl_node.text)
                for _ in range(self.NP_WEIGHT):
                    print >>train_nl_np, nl
                    print >>train_mrl_np, mrl
                    # NP entries go into the LM files without the
                    # <s> ... </s> markers used for full sentences.
                    print >>train_nl_lm, nl
                    print >>train_mrl_lm, mrl
    finally:
        for handle in opened:
            handle.close()
def run_atis(self):
    """Write the train/tune/test data files for the ATIS corpus.

    Input comes from the ``atis-*.sem`` files in ``self.config.data_dir``
    and output goes to ``self.config.experiment_dir``.  The value of
    ``self.config.run`` selects the split:

    * ``'debug'``: only ``atis-train.sem`` is read; out of every four
      consecutive lines the first two go to train, the third to tune and
      the fourth to test.
    * ``'dev'``: train and tune both come from ``atis-train.sem``, test
      comes from ``atis-dev.sem``.
    * ``'test'``: train from ``atis-train.sem``, tune from
      ``atis-dev.sem``, test from ``atis-test.sem``.

    Afterwards every double-quoted string found in the files under
    ``<data_dir>/db`` is appended to the ``train.np.*`` and
    ``train.*.lm`` files as a name entry.

    Every output file is closed before returning, including when an
    exception interrupts processing.

    Raises:
        ValueError: if ``self.config.run`` is not one of the modes above.
            Raised before any output file is created or truncated.
    """
    run_mode = self.config.run
    if run_mode not in ('debug', 'dev', 'test'):
        # Without this check the tune/test paths would be left undefined
        # and the failure would only surface after the training data had
        # already been written.
        raise ValueError('unsupported run mode for ATIS: %r' % (run_mode,))

    data_dir = self.config.data_dir
    out_dir = self.config.experiment_dir
    opened = []

    def open_out(name):
        # Register the handle immediately so the ``finally`` clause below
        # closes it no matter where a later failure happens.
        handle = open('%s/%s' % (out_dir, name), 'w')
        opened.append(handle)
        return handle

    def parse(line):
        # The text before the first '<=>' is the NL side, the remainder is
        # the slot annotation it is paired with.
        nl, slot = line.split('<=>', 1)
        nl = self.preprocess_nl(nl)
        slot = self.replace_specials(slot)
        fun = self.slot_to_fun(slot)
        mrl = util.fun_to_mrl(fun, True)
        return nl, mrl, fun

    try:
        train_nl = open_out('train.nl')
        train_nl_lm = open_out('train.nl.lm')
        train_nl_np = open_out('train.np.nl')
        train_mrl = open_out('train.mrl')
        train_mrl_lm = open_out('train.mrl.lm')
        train_mrl_np = open_out('train.np.mrl')
        train_fun = open_out('train.fun')
        tune_nl = open_out('tune.nl')
        tune_mrl = open_out('tune.mrl')
        test_nl = open_out('test.nl')
        test_mrl = open_out('test.mrl')
        test_fun = open_out('test.fun')

        def write_train(nl, mrl, fun):
            print >>train_nl, nl
            print >>train_mrl, mrl
            print >>train_fun, fun
            print >>train_nl_np, nl
            print >>train_mrl_np, mrl
            print >>train_nl_lm, '<s>', nl, '</s>'
            print >>train_mrl_lm, '<s>', mrl, '</s>'

        def write_tune(nl, mrl):
            print >>tune_nl, nl
            print >>tune_mrl, mrl

        def write_test(nl, mrl, fun):
            print >>test_nl, nl
            print >>test_mrl, mrl
            print >>test_fun, fun

        train_path = '%s/atis-train.sem' % data_dir

        if run_mode == 'debug':
            with open(train_path) as data_file:
                for index, line in enumerate(data_file):
                    nl, mrl, fun = parse(line)
                    bucket = index % 4
                    if bucket in (0, 1):
                        write_train(nl, mrl, fun)
                    elif bucket == 2:
                        write_tune(nl, mrl)
                    else:
                        write_test(nl, mrl, fun)
        else:
            if run_mode == 'dev':
                # In dev mode tuning reuses the training file.
                tune_path = train_path
                test_path = '%s/atis-dev.sem' % data_dir
            else:
                tune_path = '%s/atis-dev.sem' % data_dir
                test_path = '%s/atis-test.sem' % data_dir

            with open(train_path) as train_file:
                for line in train_file:
                    nl, mrl, fun = parse(line)
                    write_train(nl, mrl, fun)

            with open(tune_path) as tune_file:
                for line in tune_file:
                    nl, mrl, fun = parse(line)
                    write_tune(nl, mrl)

            with open(test_path) as test_file:
                for line in test_file:
                    nl, mrl, fun = parse(line)
                    write_test(nl, mrl, fun)

        for np_name in os.listdir('%s/db' % data_dir):
            np_path = '%s/db/%s' % (data_dir, np_name)
            with open(np_path) as np_file:
                for line in np_file:
                    for name in re.findall(r'"([^"]+)"', line):
                        # The quoted name is used verbatim as the NL side;
                        # its MRL symbol is the name with specials
                        # replaced, spaces turned into underscores and
                        # '@s' appended.
                        nl = name
                        mrl = '%s' % self.replace_specials(name)
                        mrl = mrl.replace(' ', '_') + '@s'
                        print >>train_nl_np, nl
                        print >>train_mrl_np, mrl
                        print >>train_nl_lm, nl
                        print >>train_mrl_lm, mrl
    finally:
        for handle in opened:
            handle.close()
def run_robo(self):
    """Write the train/tune/test data files for the RoboCup corpus.

    Reads ``corpus.xml`` and ``names`` from ``self.config.data_dir`` and
    writes the split files into ``self.config.experiment_dir``.  Each
    corpus example is routed by looking up its ``id`` attribute in the
    folds returned by ``self.get_folds()``.  The tune fold returned there
    is ignored: the test ids are used for tuning as well, so every test
    example is written to both the tune and the test files.

    The ``names`` file is read as a sequence of records: one line that is
    skipped, one line whose text after the first three characters is the
    NL phrase, two more skipped lines, then rule lines up to the next
    blank line (or end of file).  Every rule contributes its symbol with
    ``'@0'`` appended, written ``self.NP_WEIGHT`` times to the
    ``train.np.*`` and ``train.*.lm`` files.

    All files are closed before returning, including when an exception
    interrupts processing.

    Raises:
        ValueError: if a rule line in ``names`` does not have the
            expected ``*n:(Num|Unum|Ident) -> ({ SYMBOL })`` form.
    """
    train_ids, _, test_ids = self.get_folds()
    # Tuning is done on the test fold for this corpus.
    tune_ids = test_ids

    data_dir = self.config.data_dir
    out_dir = self.config.experiment_dir
    opened = []

    def open_out(name):
        # Register the handle immediately so the ``finally`` clause below
        # closes it no matter where a later failure happens.
        handle = open('%s/%s' % (out_dir, name), 'w')
        opened.append(handle)
        return handle

    # Raw string: same pattern as before, without relying on Python
    # passing unknown escapes such as '\*' and '\(' through unchanged.
    np_rule = re.compile(r'^\*n:(Num|Unum|Ident) -> \(\{ (\S+) \}\)$')

    try:
        train_nl = open_out('train.nl')
        train_nl_lm = open_out('train.nl.lm')
        train_nl_np = open_out('train.np.nl')
        train_mrl = open_out('train.mrl')
        train_mrl_lm = open_out('train.mrl.lm')
        train_mrl_np = open_out('train.np.mrl')
        train_fun = open_out('train.fun')
        tune_nl = open_out('tune.nl')
        tune_mrl = open_out('tune.mrl')
        test_nl = open_out('test.nl')
        test_mrl = open_out('test.mrl')
        test_fun = open_out('test.fun')

        nl_query = "nl[@lang='%s']" % self.config.lang
        corpus = ET.parse('%s/corpus.xml' % data_dir)
        for node in corpus.getroot().findall('example'):
            nl = self.preprocess_nl(node.find(nl_query).text)
            clang = node.find("mrl[@lang='robocup-clang']").text
            clang = self.replace_specials(clang)
            fun = self.clang_to_fun(clang)
            mrl = util.fun_to_mrl(fun)
            eid = int(node.attrib['id'])

            if eid in tune_ids:
                print >>tune_nl, nl
                print >>tune_mrl, mrl
            elif eid in train_ids:
                print >>train_nl, nl
                print >>train_mrl, mrl
                print >>train_fun, fun
                print >>train_nl_np, nl
                print >>train_mrl_np, mrl
                print >>train_nl_lm, '<s>', nl, '</s>'
                print >>train_mrl_lm, '<s>', mrl, '</s>'

            # Deliberately a separate ``if``: test examples were already
            # written to the tune files above and are written here too.
            if eid in test_ids:
                print >>test_nl, nl
                print >>test_mrl, mrl
                print >>test_fun, fun

        with open('%s/names' % data_dir) as nps_file:
            while True:
                # First line of a record; an empty read means end of file.
                if not nps_file.readline():
                    break
                nl = nps_file.readline().strip()[3:]
                nl = self.preprocess_nl(nl)
                # Two lines of the record are not used.
                nps_file.readline()
                nps_file.readline()

                while True:
                    line = nps_file.readline().strip()
                    if line == '':
                        # Blank separator line or end of file.
                        break
                    m = np_rule.match(line)
                    if m is None:
                        raise ValueError(
                            'unexpected rule line in names file: %r'
                            % (line,))
                    mrl = m.group(2) + '@0'
                    for _ in range(self.NP_WEIGHT):
                        print >>train_nl_np, nl
                        print >>train_mrl_np, mrl
                        print >>train_nl_lm, nl
                        print >>train_mrl_lm, mrl
    finally:
        for handle in opened:
            handle.close()