def translate_fun(data_point, sess, model, vocabs, FLAGS,
                  slot_filling_classifier=None):
    """Run the model on a single example and decode its output.

    :param data_point: either a raw query string, or a sequence whose first
        element exposes ``sc_txt`` and ``sc_ids`` (and ``csc_ids`` when the
        copynet copy mechanism is enabled).
    :param sess: session object handed to ``model.step``.
    :param model: model exposing ``buckets``, ``format_batch`` and ``step``.
    :param vocabs: vocabularies forwarded to the feature extractor and to
        ``decode``.
    :param FLAGS: configuration object; this function reads ``use_copy``,
        ``copy_fun`` and ``normalized``.
    :param slot_filling_classifier: optional classifier forwarded to
        ``decode``.
    :return: tuple ``(decoded_outputs, sequence_logits)``.
    """
    tg_ids = [data_utils.ROOT_ID]
    decoder_features = [[tg_ids]]
    use_copynet = FLAGS.use_copy and FLAGS.copy_fun == 'copynet'

    # isinstance (rather than an exact type check) so that str subclasses are
    # also treated as raw queries.
    if isinstance(data_point, str):
        source_str = data_point
        encoder_features = query_to_encoder_features(data_point, vocabs, FLAGS)
    else:
        example = data_point[0]
        source_str = example.sc_txt
        encoder_features = [[example.sc_ids]]
        if use_copynet:
            encoder_features.append([example.csc_ids])

    if use_copynet:
        # append dummy copynet target features
        # (used only for computing training objectives)
        ctg_ids = [data_utils.ROOT_ID]
        decoder_features.append([ctg_ids])
        # tokenize the source string with minimal changes on the token form
        copy_tokens = [query_to_copy_tokens(source_str, FLAGS)]
    else:
        copy_tokens = None

    if FLAGS.normalized:
        _, entities = tokenizer.ner_tokenizer(source_str)
        sc_fillers = [entities[0]]
    else:
        sc_fillers = None

    # Which bucket does it belong to?  Pick the first bucket whose source
    # capacity exceeds the input length; fall back to the last bucket when
    # the input is longer than every bucket.
    source_length = len(encoder_features[0][0])
    bucket_id = next(
        (b for b, bucket in enumerate(model.buckets)
         if bucket[0] > source_length),
        len(model.buckets) - 1)

    # Get a 1-element batch to feed the sentence to the model.
    formatted_example = model.format_batch(
        encoder_features, decoder_features, bucket_id=bucket_id)

    # Compute neural network decoding output
    model_outputs = model.step(
        sess, formatted_example, bucket_id, forward_only=True)
    sequence_logits = model_outputs.sequence_logits

    decoded_outputs = decode(
        model_outputs, FLAGS, vocabs,
        sc_fillers=sc_fillers,
        slot_filling_classifier=slot_filling_classifier,
        copy_tokens=copy_tokens)

    return decoded_outputs, sequence_logits
def extract_rewrites(data):
    """Extract all pairs of rewrites from a parallel corpus.

    :param data: pair ``(nls, cms)`` of equally ordered iterables holding
        natural language descriptions and their commands.

    Command templates that share a natural language template are grouped,
    groups sharing at least two command templates are merged, and for the ten
    largest surviving groups every ordered pair of distinct templates that is
    not yet in the database is stored and printed.
    """
    nls, cms = data

    # Step 1: group pairs with the same natural language description.
    group_pairs_by_nl = collections.defaultdict(set)
    for nl, cm in zip(nls, cms):
        nl = nl.strip()
        cm = cm.strip()
        if nl.lower() == "na" or not nl or not cm:
            continue
        nl_tokens, _ = tokenizer.ner_tokenizer(nl)
        nl_temp = ' '.join(nl_tokens)
        cm_temp = data_tools.cmd2template(cm)
        group_pairs_by_nl[nl_temp].add(cm_temp)

    # Step 2: cluster the commands with the same natural language explanations.
    merged = set()
    # Materialise the keys: dict views cannot be indexed on Python 3.
    nls = list(group_pairs_by_nl)
    for i, nl in enumerate(nls):
        cm_temp_set = group_pairs_by_nl[nl]
        for j in range(i + 1, len(nls)):
            cm_temp_set2 = group_pairs_by_nl[nls[j]]
            if len(cm_temp_set & cm_temp_set2) >= 2:
                # Fold group i into group j and remember that i is redundant.
                cm_temp_set2.update(cm_temp_set)
                merged.add(i)

    # Step 3: remove redundant clusters after merge.
    rewrites = {
        nl: group_pairs_by_nl[nl]
        for i, nl in enumerate(nls)
        if i not in merged
    }

    # Step 4: print extracted rewrites and store in database.
    with DBConnection() as db:
        db.create_schema()
        largest_groups = sorted(
            rewrites.items(), key=lambda x: len(x[1]), reverse=True)[:10]
        for nl, cm_temps in largest_groups:
            if len(cm_temps) < 2:
                continue
            for cm_temp1 in cm_temps:
                for cm_temp2 in cm_temps:
                    if cm_temp1 == cm_temp2:
                        continue
                    if not db.exist_rewrite((cm_temp1, cm_temp2)):
                        db.add_rewrite((cm_temp1, cm_temp2))
                        print("* {} --> {}".format(cm_temp1, cm_temp2))
            print()
def group_parallel_data(dataset, attribute='source', use_temp=False,
                        tokenizer_selector='nl'):
    """
    Group parallel dataset by a certain attribute.

    :param dataset: object whose ``data_points`` attribute is a non-empty
        list of data points, or a list of lists of data points (the nested
        form is flattened first). Each data point exposes ``sc_txt`` and
        ``tg_txt``.
    :param attribute: attribute by which the data is grouped; ``'source'``
        groups by ``sc_txt``, any other value groups by ``tg_txt``.
    :param use_temp: set to true if the dataset is to be grouped by the
        template of the attribute; false if the dataset is to be grouped by
        the (optionally tokenized) string itself.
    :param tokenizer_selector: specify which tokenizer to use for making
        the grouping key; ``'nl'`` selects the natural language tokenizers,
        any other value selects the bash tokenizer (templates) or the raw
        string (no templates).
    :return: list of (key, data group) tuples sorted by the key value.
    :raises ValueError: if ``dataset.data_points`` is empty or not a list.
    """
    raw_points = dataset.data_points
    if not raw_points or not isinstance(raw_points, list):
        raise ValueError(
            'dataset.data_points must be a non-empty list of data points')

    if isinstance(raw_points[0], list):
        # Flatten in a single pass instead of repeated list concatenation.
        data_points = [dp for bucket in raw_points for dp in bucket]
    else:
        data_points = raw_points

    grouped_dataset = {}
    for data_point in data_points:
        attr = data_point.sc_txt if attribute == 'source' \
            else data_point.tg_txt
        if use_temp:
            if tokenizer_selector == 'nl':
                words, _ = tokenizer.ner_tokenizer(attr)
            else:
                words = data_tools.bash_tokenizer(attr, arg_type_only=True)
            temp = ' '.join(words)
        elif tokenizer_selector == 'nl':
            words, _ = tokenizer.basic_tokenizer(attr)
            temp = ' '.join(words)
        else:
            temp = attr
        grouped_dataset.setdefault(temp, []).append(data_point)

    return sorted(grouped_dataset.items(), key=lambda x: x[0])
def slot_filler_alignment_induction(nl, cm, verbose=False):
    """Align the slot fillers found in ``nl`` with the slots of ``cm``.

    Given an oracle translation pair (nl, cm), the fillers extracted from the
    natural language are scored against the argument slots of the command and
    matched with ``stable_marriage_alignment``.

    :param nl: natural language description.
    :param cm: command string.
    :param verbose: when true, print the inputs, every aligned pair and every
        filler left unaligned.
    :return: the mappings produced by ``stable_marriage_alignment``.
    """
    # Step 1: extract the token ids of the constants in the English sentence
    # and the slots in the command
    _, entities = tokenizer.ner_tokenizer(nl)
    nl_fillers, _, _ = entities

    cm_tokens = data_tools.bash_tokenizer(cm)
    cm_tokens_with_types = data_tools.bash_tokenizer(cm, arg_type_only=True)
    assert (len(cm_tokens) == len(cm_tokens_with_types))

    cm_slots = {}
    for pos, typed_token in enumerate(cm_tokens_with_types):
        if typed_token not in bash.argument_types:
            continue
        follows_min_flag = pos > 0 and format_args.is_min_flag(
            cm_tokens_with_types[pos - 1])
        # An argument directly after a "min" flag is treated as a timespan.
        slot_kind = 'Timespan' if follows_min_flag else typed_token
        cm_slots[pos] = (cm_tokens[pos], slot_kind)

    # Step 2: construct one-to-one mappings for the token ids from both sides
    M = collections.defaultdict(dict)   # alignment score matrix
    for i in nl_fillers:
        surface, filler_type = nl_fillers[i]
        filler_value = format_args.extract_value(
            filler_type, filler_type, surface)
        for j in cm_slots:
            slot_value, slot_type = cm_slots[j]
            compatible = (
                (filler_value and format_args.is_parameter(filler_value))
                or slot_filler_type_match(slot_type, filler_type))
            if compatible:
                score = slot_filler_value_match(
                    slot_value, filler_value, slot_type)
            else:
                score = -np.inf
            M[i][j] = score

    mappings, remained_fillers = stable_marriage_alignment(M)

    if verbose:
        print('nl: {}'.format(nl))
        print('cm: {}'.format(cm))
        for (i, j) in mappings:
            print('[{}] {} <-> [{}] {}'.format(
                i, nl_fillers[i][0], j, cm_slots[j][0]))
        for i in remained_fillers:
            print('filler {} is not matched to any slot\n'.format(
                nl_fillers[i][0].encode('utf-8')))

    return mappings
def translate_fun(data_point, sess, model, vocabs, FLAGS,
                  slot_filling_classifier=None):
    """Run the model on a single example and decode its output.

    :param data_point: either a raw query string, or a sequence whose first
        element exposes ``sc_ids``, ``sc_full_ids``, ``sc_copy_ids``,
        ``tg_ids``, ``tg_full_ids``, ``pointer_targets`` and ``sc_txt``.
    :param sess: session object handed to ``model.step``.
    :param model: model exposing ``buckets``, ``format_example`` and ``step``.
    :param vocabs: vocabularies forwarded to ``vectorize_query`` and
        ``decode``.
    :param FLAGS: configuration object; this function reads
        ``max_tg_length`` and ``max_sc_length`` for raw string queries.
    :param slot_filling_classifier: optional classifier forwarded to
        ``decode``.
    :return: tuple ``(decoded_outputs, output_logits)``.
    """
    # isinstance (rather than an exact type check) so that str subclasses are
    # also treated as raw queries.
    if isinstance(data_point, str):
        sc_ids, sc_full_ids, sc_copy_ids, sc_fillers = \
            vectorize_query(data_point, vocabs, FLAGS)
        tg_ids = [data_utils.ROOT_ID]
        tg_full_ids = [data_utils.ROOT_ID]
        pointer_targets = np.zeros(
            [1, FLAGS.max_tg_length, FLAGS.max_sc_length])
    else:
        example = data_point[0]
        sc_ids = example.sc_ids
        sc_full_ids = example.sc_full_ids
        sc_copy_ids = example.sc_copy_ids
        tg_ids = example.tg_ids
        tg_full_ids = example.tg_full_ids
        pointer_targets = example.pointer_targets
        _, entities = tokenizer.ner_tokenizer(example.sc_txt)
        sc_fillers = entities[0]

    # Which bucket does it belong to?  Pick the first bucket whose source
    # capacity exceeds the input length.  When the input is longer than every
    # bucket, fall back to the last bucket instead of failing on min([]).
    source_length = len(sc_ids)
    bucket_id = next(
        (b for b, bucket in enumerate(model.buckets)
         if bucket[0] > source_length),
        len(model.buckets) - 1)

    # Get a 1-element batch to feed the sentence to the model.
    formatted_example = model.format_example(
        [[sc_ids], [sc_full_ids], [sc_copy_ids]],
        [[tg_ids], [tg_full_ids]],
        pointer_targets=pointer_targets,
        bucket_id=bucket_id)

    # Compute neural network decoding output
    model_outputs = model.step(
        sess, formatted_example, bucket_id, forward_only=True)
    output_logits = model_outputs.output_logits

    decoded_outputs = decode(
        formatted_example.encoder_full_inputs, model_outputs, FLAGS, vocabs,
        [sc_fillers], slot_filling_classifier)

    return decoded_outputs, output_logits
def eval_slot_filling(dataset):
    """
    Evaluate global slot filling algorithm F1 using ground truth templates.

    :param dataset: sequence indexed by bucket id; each entry is a sequence
        of data points exposing ``mappings``, ``tg_ids``, ``tg_full_ids``,
        ``sc_txt``, ``sc_ids`` and the copy/full id fields read below.

    Prints alignment precision, recall and F1 and the argument filling
    accuracy. A metric whose denominator is zero is reported as 0.0 instead
    of raising ``ZeroDivisionError``.

    Reads the module-level ``FLAGS``, ``_buckets`` and ``Seq2SeqModel`` and
    overrides ``FLAGS.beam_size``, ``FLAGS.token_decoding_algorithm`` and
    ``FLAGS.force_reading_input``.
    """
    vocabs = data_utils.load_vocab(FLAGS)
    rev_tg_vocab = vocabs.rev_tg_vocab
    rev_tg_full_vocab = vocabs.rev_tg_full_vocab
    arg_prefix = '__ARG__'

    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)) as sess:
        # Create model.
        FLAGS.beam_size = 1
        FLAGS.token_decoding_algorithm = 'beam_search'
        FLAGS.force_reading_input = True
        model = graph_utils.create_model(
            sess, FLAGS, Seq2SeqModel, buckets=_buckets, forward_only=True)

        model_param_dir = os.path.join(
            FLAGS.model_dir, 'train.mappings.X.Y.npz')
        train_X, train_Y = data_utils.load_slot_filling_data(model_param_dir)
        slot_filling_classifier = classifiers.KNearestNeighborModel(
            FLAGS.num_nn_slot_filling, train_X, train_Y)
        print('Slot filling classifier parameters loaded.')

        num_correct_argument = 0.0
        num_argument = 0.0
        num_correct_align = 0.0
        num_predict_align = 0.0
        num_gt_align = 0.0

        for bucket_id in range(len(_buckets)):
            for dp in dataset[bucket_id]:
                gt_mappings = [tuple(m) for m in dp.mappings]
                outputs = dp.tg_ids[1:-1]
                full_outputs = dp.tg_full_ids[1:-1]

                if gt_mappings:
                    _, entities = tokenizer.ner_tokenizer(dp.sc_txt)
                    nl_fillers = entities[0]

                    encoder_inputs = [dp.sc_ids]
                    encoder_full_inputs = [dp.sc_copy_ids] \
                        if FLAGS.use_copy else [dp.sc_full_ids]
                    decoder_inputs = [dp.tg_ids]
                    # NOTE(review): the encoder side picks the *copy* ids
                    # when use_copy is set, this line picks the *full* ids;
                    # confirm the asymmetry is intended.
                    decoder_full_inputs = [dp.tg_full_ids] \
                        if FLAGS.use_copy else [dp.tg_copy_ids]
                    pointer_targets = [dp.pointer_targets] \
                        if FLAGS.use_copy else None

                    formatted_example = model.format_example(
                        [encoder_inputs, encoder_full_inputs],
                        [decoder_inputs, decoder_full_inputs],
                        pointer_targets=pointer_targets,
                        bucket_id=bucket_id)
                    model_outputs = model.step(
                        sess, formatted_example, bucket_id, forward_only=True)
                    encoder_outputs = model_outputs.encoder_hidden_states
                    decoder_outputs = model_outputs.decoder_hidden_states
                    print(decoder_outputs[:, 0, :])

                    # Collect the output tokens and the argument slots of the
                    # ground truth template.
                    cm_slots = {}
                    output_tokens = []
                    for ii, output in enumerate(outputs):
                        if output < len(rev_tg_vocab):
                            token = rev_tg_vocab[output]
                            if "@@" in token:
                                token = token.split("@@")[-1]
                            output_tokens.append(token)
                            if token.startswith(arg_prefix):
                                token = token[len(arg_prefix):]
                            if nl_fillers is not None and \
                                    token in constants._ENTITIES:
                                if ii > 0 and slot_filling.is_min_flag(
                                        rev_tg_vocab[outputs[ii - 1]]):
                                    token_type = 'Timespan'
                                else:
                                    token_type = token
                                cm_slots[ii] = (token, token_type)
                        else:
                            output_tokens.append(data_utils._UNK)

                    if FLAGS.use_copy:
                        P = pointer_targets[0][0] > 0
                        pointers = model_outputs.pointers[0]
                        pointers = np.multiply(
                            np.sum(
                                P.astype(float)[:pointers.shape[0],
                                                -pointers.shape[1]:],
                                1, keepdims=True),
                            pointers)
                    else:
                        pointers = None

                    tree, _, mappings = slot_filling.stable_slot_filling(
                        output_tokens, nl_fillers, cm_slots, pointers,
                        encoder_outputs[0], decoder_outputs[0],
                        slot_filling_classifier, verbose=True)

                    if mappings is not None:
                        for mapping in mappings:
                            if mapping in gt_mappings:
                                num_correct_align += 1
                        num_predict_align += len(mappings)
                    num_gt_align += len(gt_mappings)

                    tokens = data_tools.ast2tokens(tree)
                    if not tokens:
                        continue
                    for ii, output in enumerate(outputs):
                        token = rev_tg_vocab[output]
                        if token.startswith(arg_prefix):
                            token = token[len(arg_prefix):]
                        if token in constants._ENTITIES:
                            argument = rev_tg_full_vocab[full_outputs[ii]]
                            if argument.startswith(arg_prefix):
                                argument = argument[len(arg_prefix):]
                            pred = tokens[ii]
                            if constants.remove_quotation(argument) == \
                                    constants.remove_quotation(pred):
                                num_correct_argument += 1
                            num_argument += 1

                # NOTE(review): this stops the bucket after the first example
                # that has ground truth mappings; it looks like a debugging
                # leftover -- confirm before relying on the reported numbers.
                if gt_mappings:
                    break

        # Guard every ratio: with no predictions / no ground truth the
        # denominators are zero.
        precision = num_correct_align / num_predict_align \
            if num_predict_align else 0.0
        recall = num_correct_align / num_gt_align if num_gt_align else 0.0
        if precision + recall:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        accuracy = num_correct_argument / num_argument \
            if num_argument else 0.0

        print("Argument Alignment Precision: {}".format(precision))
        print("Argument Alignment Recall: {}".format(recall))
        print("Argument Alignment F1: {}".format(f1))
        print("Argument filling accuracy: {}".format(accuracy))
def decode_set(model, dataset, rev_sc_vocab, rev_tg_vocab, verbose=True):
    """Decode every distinct source sentence and store the predictions.

    :param model: retrieval model; ``model.test(nl, 100)`` must return a
        sequence of ``(nn, output_tokens, score)`` triples.
    :param dataset: data set handed to ``data_utils.group_data_by_nl``.
    :param rev_sc_vocab: maps source token ids to tokens.
    :param rev_tg_vocab: maps target token ids to tokens.
    :param verbose: when true, print each example, its ground truth commands
        and every stored prediction.

    For each example at most ten grammatical predictions are written with
    ``db.add_prediction``. Relies on the module-level names ``model_name``
    and ``FLAGS``, which are not defined in this function.
    """
    grouped_dataset = data_utils.group_data_by_nl(
        dataset, use_bucket=False, use_temp=False)

    with DBConnection() as db:
        db.remove_model(model_name)

        num_eval = 0
        for sc_temp in grouped_dataset:
            batch_sc_strs, batch_tg_strs, batch_scs, batch_cmds = \
                grouped_dataset[sc_temp]
            _, entities = tokenizer.ner_tokenizer(sc_temp)
            # NOTE(review): other call sites read the fillers from
            # entities[0]; confirm that the last element is intended here.
            nl_fillers = entities[-1]

            sc_str = batch_sc_strs[0]
            nl = batch_scs[0]
            if verbose:
                print("Example {}".format(num_eval+1))
                print("Original English: " + sc_str.strip())
                print("English: " + sc_temp)
                for j, tg_str in enumerate(batch_tg_strs):
                    print("GT Command {}: {}".format(j+1, tg_str.strip()))

            # retrieve top-ranked command template
            top_k_results = model.test(nl, 100)
            count = 0
            for i, (nn, output_tokens, score) in enumerate(top_k_results):
                nn_str = ' '.join([rev_sc_vocab[j] for j in nn])

                # Slots are collected per candidate; sharing one dict across
                # candidates would let slots of earlier templates satisfy
                # the "enough slots" check of later ones.
                cm_slots = {}
                tokens = []
                # Skip the first and the last output token.
                for j in range(1, len(output_tokens)-1):
                    pred_token = rev_tg_vocab[output_tokens[j]]
                    if "@@" in pred_token:
                        pred_token = pred_token.split("@@")[-1]
                    if nl_fillers is not None and \
                            pred_token in constants._ENTITIES:
                        if slot_filling.is_min_flag(
                                rev_tg_vocab[output_tokens[j-1]]):
                            pred_token_type = 'Timespan'
                        else:
                            pred_token_type = pred_token
                        cm_slots[j] = (pred_token, pred_token_type)
                    tokens.append(pred_token)
                pred_cmd = ' '.join(tokens)

                if FLAGS.dataset.startswith("bash"):
                    pred_cmd = re.sub(r'( ;\s+)|( ;$)', ' \\; ', pred_cmd)
                    tree = data_tools.bash_parser(pred_cmd)
                else:
                    tree = data_tools.paren_parser(pred_cmd)

                # check if the predicted command templates have enough slots
                # to hold the fillers (to rule out templates that are
                # trivially unqualified)
                if nl_fillers is not None and len(cm_slots) < len(nl_fillers):
                    continue
                # Step 2: check if the predicted command template is
                # grammatical; filter out non-grammatical output
                if tree is None:
                    continue

                slot_filling.heuristic_slot_filling(tree, nl_fillers)
                slot_filling.fill_default_value(tree)
                pred_cmd = data_tools.ast2command(tree)
                if verbose:
                    print("NN: {}".format(nn_str))
                    print("Prediction {}: {} ({})".format(i, pred_cmd, score))
                db.add_prediction(model_name, sc_str, pred_cmd, float(score),
                                  update_mode=False)
                count += 1
                if count == 10:
                    break
            print("")
            num_eval += 1