Beispiel #1
0
  def run_geo(self):
    """Write the Geoquery train/tune/test/unlabeled files for the current fold.

    Reads ``corpus-true.xml`` and ``nps-true.xml`` from
    ``self.config.data_dir`` and writes the following files into
    ``self.config.experiment_dir``:

    * ``train.nl`` / ``train.mrl`` / ``train.fun``: labeled training examples
    * ``train.np.nl`` / ``train.np.mrl``: training examples plus NP entries
    * ``train.nl.lm`` / ``train.mrl.lm``: language-model text; training
      sentences are wrapped in ``<s> ... </s>``, NP entries are written bare
    * ``unlabeled.nl``: training sentences withheld from the labeled set
    * ``tune.nl`` / ``tune.mrl`` and ``test.nl`` / ``test.mrl`` / ``test.fun``

    The fold membership comes from ``self.get_folds()``. Every output file is
    closed on exit, including when an exception is raised part-way through.
    """
    train_ids, tune_ids, test_ids = self.get_folds()

    data_dir = self.config.data_dir
    out_dir = self.config.experiment_dir
    lang = self.config.lang
    opened = []

    def open_out(name):
      # Remember every handle so the finally clause can close all of them.
      handle = open('%s/%s' % (out_dir, name), 'w')
      opened.append(handle)
      return handle

    try:
      train_nl = open_out('train.nl')
      train_nl_lm = open_out('train.nl.lm')
      train_nl_np = open_out('train.np.nl')
      train_mrl = open_out('train.mrl')
      train_mrl_lm = open_out('train.mrl.lm')
      train_mrl_np = open_out('train.np.mrl')
      train_fun = open_out('train.fun')
      unlabeled_nl = open_out('unlabeled.nl')
      tune_nl = open_out('tune.nl')
      tune_mrl = open_out('tune.mrl')
      test_nl = open_out('test.nl')
      test_mrl = open_out('test.mrl')
      test_fun = open_out('test.fun')

      corpus_root = ET.parse('%s/corpus-true.xml' % data_dir).getroot()

      # `counter` cycles 0..9 over *all* corpus examples (not only training
      # ones); positions at or beyond 10 * lfrac in each cycle are treated as
      # unlabeled when the example belongs to the training fold.
      counter = 0
      for node in corpus_root.findall('example'):
        nl = self.preprocess_nl(node.find("nl[@lang='%s']" % lang).text)
        fun = self.preprocess_fun(node.find("mrl[@lang='geo-funql']").text)
        mrl = util.fun_to_mrl(fun)
        eid = int(node.attrib['id'])

        unlabel_this = (counter >= 10 * self.config.lfrac)
        counter = (counter + 1) % 10

        if eid in tune_ids:
          print >>tune_nl, nl
          print >>tune_mrl, mrl
        elif eid in train_ids:
          if unlabel_this:
            # Only the sentence is kept; its meaning representation is dropped.
            print >>unlabeled_nl, nl
          else:
            print >>train_nl, nl
            print >>train_mrl, mrl
            print >>train_fun, fun
            print >>train_nl_np, nl
            print >>train_mrl_np, mrl
            print >>train_nl_lm, '<s>', nl, '</s>'
            print >>train_mrl_lm, '<s>', mrl, '</s>'
        elif eid in test_ids:
          print >>test_nl, nl
          print >>test_mrl, mrl
          print >>test_fun, fun

      nplist_root = ET.parse('%s/nps-true.xml' % data_dir).getroot()
      np_type = self.config.np_type
      for node in nplist_root.findall('example'):
        fun = self.preprocess_fun(node.find("mrl[@lang='geo-funql']").text)
        mrl = util.fun_to_mrl(fun)
        # A "big" NP is one whose MRL has more than one token. np_type 'big'
        # keeps only those, 'small' keeps only single-token ones, and any
        # other value keeps everything.
        big_np = len(mrl.split()) > 1
        if (np_type == 'big' and not big_np) or (np_type == 'small' and big_np):
          continue
        for nl_node in node.findall("nl[@lang='%s']" % lang):
          nl = self.preprocess_nl(nl_node.text)
          # Each NP pair is repeated NP_WEIGHT times in the NP and LM files.
          for _ in range(self.NP_WEIGHT):
            print >>train_nl_np, nl
            print >>train_mrl_np, mrl
            print >>train_nl_lm, nl
            print >>train_mrl_lm, mrl
    finally:
      for handle in opened:
        handle.close()
Beispiel #2
0
  def run_atis(self):
    """Write the ATIS train/tune/test files for the configured run mode.

    Input records are lines of the form ``<sentence> <=> <slots>`` read from
    ``atis-*.sem`` files in ``self.config.data_dir``; output goes to
    ``self.config.experiment_dir``. The split depends on ``self.config.run``:

    * ``'debug'``: ``atis-train.sem`` is dealt round-robin in groups of four
      lines; the first two go to train, the third to tune, the fourth to test.
    * ``'dev'``: train and tune both come from ``atis-train.sem``, test from
      ``atis-dev.sem``.
    * ``'test'``: train from ``atis-train.sem``, tune from ``atis-dev.sem``,
      test from ``atis-test.sem``.

    Afterwards every double-quoted name found in the files under
    ``<data_dir>/db`` is appended once to the NP and LM training files.

    Raises:
      ValueError: if ``self.config.run`` is not one of the modes above. This
        is checked before any output file is created or truncated.
    """
    run_mode = self.config.run
    if run_mode not in ('debug', 'dev', 'test'):
      raise ValueError('unsupported run mode for ATIS: %r' % (run_mode,))

    data_dir = self.config.data_dir
    out_dir = self.config.experiment_dir
    opened = []

    def open_out(name):
      # Remember every handle so the finally clause can close all of them.
      handle = open('%s/%s' % (out_dir, name), 'w')
      opened.append(handle)
      return handle

    def parse(line):
      # Turn one "<sentence> <=> <slots>" record into (nl, fun, mrl).
      nl, slot = line.split('<=>', 1)
      nl = self.preprocess_nl(nl)
      slot = self.replace_specials(slot)
      fun = self.slot_to_fun(slot)
      mrl = util.fun_to_mrl(fun, True)
      return nl, fun, mrl

    try:
      train_nl = open_out('train.nl')
      train_nl_lm = open_out('train.nl.lm')
      train_nl_np = open_out('train.np.nl')
      train_mrl = open_out('train.mrl')
      train_mrl_lm = open_out('train.mrl.lm')
      train_mrl_np = open_out('train.np.mrl')
      train_fun = open_out('train.fun')
      tune_nl = open_out('tune.nl')
      tune_mrl = open_out('tune.mrl')
      test_nl = open_out('test.nl')
      test_mrl = open_out('test.mrl')
      test_fun = open_out('test.fun')

      def write_train(nl, fun, mrl):
        print >>train_nl, nl
        print >>train_mrl, mrl
        print >>train_fun, fun
        print >>train_nl_np, nl
        print >>train_mrl_np, mrl
        print >>train_nl_lm, '<s>', nl, '</s>'
        print >>train_mrl_lm, '<s>', mrl, '</s>'

      def write_tune(nl, fun, mrl):
        # The tune split has no .fun file, so `fun` is intentionally unused.
        print >>tune_nl, nl
        print >>tune_mrl, mrl

      def write_test(nl, fun, mrl):
        print >>test_nl, nl
        print >>test_mrl, mrl
        print >>test_fun, fun

      if run_mode == 'debug':
        with open('%s/atis-train.sem' % data_dir) as data_file:
          for counter, line in enumerate(data_file):
            nl, fun, mrl = parse(line)
            bucket = counter % 4
            if bucket in (0, 1):
              write_train(nl, fun, mrl)
            elif bucket == 2:
              write_tune(nl, fun, mrl)
            else:
              write_test(nl, fun, mrl)
      else:
        train_path = '%s/atis-train.sem' % data_dir
        if run_mode == 'dev':
          # Dev runs tune on the training data itself.
          tune_path = train_path
          test_path = '%s/atis-dev.sem' % data_dir
        else:
          tune_path = '%s/atis-dev.sem' % data_dir
          test_path = '%s/atis-test.sem' % data_dir

        with open(train_path) as train_file:
          for line in train_file:
            nl, fun, mrl = parse(line)
            write_train(nl, fun, mrl)

        with open(tune_path) as tune_file:
          for line in tune_file:
            nl, fun, mrl = parse(line)
            write_tune(nl, fun, mrl)

        with open(test_path) as test_file:
          for line in test_file:
            nl, fun, mrl = parse(line)
            write_test(nl, fun, mrl)

      # Database names: each quoted string becomes an NL phrase paired with a
      # single-token MRL (spaces -> underscores, suffixed with '@s').
      for np_name in os.listdir('%s/db' % data_dir):
        np_path = '%s/db/%s' % (data_dir, np_name)
        with open(np_path) as np_file:
          for line in np_file:
            for name in re.findall(r'"([^"]+)"', line):
              nl = name
              mrl = "%s" % self.replace_specials(name)
              mrl = mrl.replace(' ', '_') + '@s'
              print >>train_nl_np, nl
              print >>train_mrl_np, mrl
              print >>train_nl_lm, nl
              print >>train_mrl_lm, mrl
    finally:
      for handle in opened:
        handle.close()
Beispiel #3
0
  def run_robo(self):
    """Write the RoboCup (CLang) train/tune/test files for the current fold.

    Reads ``corpus.xml`` and ``names`` from ``self.config.data_dir`` and
    writes the split files into ``self.config.experiment_dir``. The tune set
    is deliberately replaced by the test set, so every test example is written
    to both the tune and the test files.

    The ``names`` file is consumed as a sequence of records: one line that is
    only used to detect end of file, one line whose text after its first three
    characters is the natural-language phrase, two skipped lines, then rule
    lines up to the next blank line. Each rule line contributes one NP pair,
    repeated ``self.NP_WEIGHT`` times in the NP and LM training files.

    Raises:
      ValueError: if a rule line in ``names`` does not have the form
        ``*n:(Num|Unum|Ident) -> ({ <token> })``.
    """
    train_ids, tune_ids, test_ids = self.get_folds()
    # Tuning is done on the test fold; the tune fold from get_folds() is unused.
    tune_ids = test_ids

    data_dir = self.config.data_dir
    out_dir = self.config.experiment_dir
    lang = self.config.lang
    opened = []

    def open_out(name):
      # Remember every handle so the finally clause can close all of them.
      handle = open('%s/%s' % (out_dir, name), 'w')
      opened.append(handle)
      return handle

    # Same pattern text as before, now a raw string compiled once.
    rule_re = re.compile(r'^\*n:(Num|Unum|Ident) -> \(\{ (\S+) \}\)$')

    try:
      train_nl = open_out('train.nl')
      train_nl_lm = open_out('train.nl.lm')
      train_nl_np = open_out('train.np.nl')
      train_mrl = open_out('train.mrl')
      train_mrl_lm = open_out('train.mrl.lm')
      train_mrl_np = open_out('train.np.mrl')
      train_fun = open_out('train.fun')
      tune_nl = open_out('tune.nl')
      tune_mrl = open_out('tune.mrl')
      test_nl = open_out('test.nl')
      test_mrl = open_out('test.mrl')
      test_fun = open_out('test.fun')

      corpus_root = ET.parse('%s/corpus.xml' % data_dir).getroot()

      for node in corpus_root.findall('example'):
        nl = self.preprocess_nl(node.find("nl[@lang='%s']" % lang).text)
        clang = self.replace_specials(
            node.find("mrl[@lang='robocup-clang']").text)
        fun = self.clang_to_fun(clang)
        mrl = util.fun_to_mrl(fun)
        eid = int(node.attrib['id'])

        if eid in tune_ids:
          print >>tune_nl, nl
          print >>tune_mrl, mrl
        elif eid in train_ids:
          print >>train_nl, nl
          print >>train_mrl, mrl
          print >>train_fun, fun
          print >>train_nl_np, nl
          print >>train_mrl_np, mrl
          print >>train_nl_lm, '<s>', nl, '</s>'
          print >>train_mrl_lm, '<s>', mrl, '</s>'
        # Separate `if` (not `elif`): tune_ids is test_ids, so test examples
        # must be written to the test files as well as the tune files.
        if eid in test_ids:
          print >>test_nl, nl
          print >>test_mrl, mrl
          print >>test_fun, fun

      with open('%s/names' % data_dir) as nps_file:
        while True:
          # First line of a record; an empty read means end of file.
          if not nps_file.readline():
            break
          # Drop the three-character prefix in front of the phrase.
          nl = self.preprocess_nl(nps_file.readline().strip()[3:])
          # Two lines of the record carry nothing we need.
          nps_file.readline()
          nps_file.readline()
          while True:
            line = nps_file.readline().strip()
            if line == '':
              break
            m = rule_re.match(line)
            if m is None:
              raise ValueError('unrecognized rule line in names file: %r'
                               % (line,))
            mrl = m.group(2) + '@0'
            for _ in range(self.NP_WEIGHT):
              print >>train_nl_np, nl
              print >>train_mrl_np, mrl
              print >>train_nl_lm, nl
              print >>train_mrl_lm, mrl
    finally:
      for handle in opened:
        handle.close()