def query_to_encoder_features(sentence, vocabs, FLAGS):
    """
    Convert a natural language query into feature vectors used by the encoder.
    """
    # Tokenize the query according to the input channel.
    if FLAGS.channel == 'char':
        tokens = data_utils.nl_to_characters(sentence)
        init_vocab = data_utils.CHAR_INIT_VOCAB
    elif FLAGS.channel == 'partial.token':
        tokens = data_utils.nl_to_partial_tokens(sentence, tokenizer.basic_tokenizer)
        init_vocab = data_utils.TOKEN_INIT_VOCAB
    else:
        if FLAGS.normalized:
            tokens = data_utils.nl_to_tokens(sentence, tokenizer.ner_tokenizer)
        else:
            tokens = data_utils.nl_to_tokens(sentence, tokenizer.basic_tokenizer)
        init_vocab = data_utils.TOKEN_INIT_VOCAB

    sc_ids = data_utils.tokens_to_ids(tokens, vocabs.sc_vocab)
    encoder_features = [[sc_ids]]

    if FLAGS.use_copy and FLAGS.copy_fun == 'copynet':
        # CopyNet source-copy ids: a token known to the target vocabulary keeps
        # its target id; anything else gets an out-of-vocabulary slot indexed by
        # its position in the query.
        csc_ids = []
        for i, t in enumerate(tokens):
            if t not in init_vocab and t in vocabs.tg_vocab:
                csc_ids.append(vocabs.tg_vocab[t])
            else:
                csc_ids.append(len(vocabs.tg_vocab) + i)
        encoder_features.append([csc_ids])

    return encoder_features
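# Illustrative sketch (not part of the pipeline above): the CopyNet branch in
# query_to_encoder_features maps each source token either to its id in the
# *target* vocabulary or, when it is absent or an init-vocab symbol, to
# len(tg_vocab) + its position in the query. The toy vocabulary and tokens
# below are made up purely to demonstrate that index scheme.
def _copynet_source_ids_demo():
    tg_vocab = {'find': 0, 'rm': 1, '-name': 2}   # hypothetical target vocab
    init_vocab = {'_PAD', '_EOS', '_UNK'}          # hypothetical init symbols
    tokens = ['find', 'foo.txt', '-name']
    csc_ids = []
    for i, t in enumerate(tokens):
        if t not in init_vocab and t in tg_vocab:
            csc_ids.append(tg_vocab[t])
        else:
            csc_ids.append(len(tg_vocab) + i)      # out-of-vocabulary copy slot
    return csc_ids  # -> [0, 4, 2]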
def query_to_copy_tokens(sentence, FLAGS):
    """
    Tokenize a natural language query into the tokens available for copying;
    case is preserved and lemmatization is disabled so that copied tokens keep
    their original surface forms.
    """
    if FLAGS.channel == 'char':
        tokens = data_utils.nl_to_characters(sentence)
    elif FLAGS.channel == 'partial.token':
        tokens = data_utils.nl_to_partial_tokens(
            sentence, tokenizer.basic_tokenizer,
            to_lower_case=False, lemmatization=False)
    else:
        tokens = data_utils.nl_to_tokens(
            sentence, tokenizer.basic_tokenizer,
            to_lower_case=False, lemmatization=False)
    return tokens
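# Example call shape (hypothetical query string; FLAGS is the project's flag
# namespace). Both helpers branch on FLAGS.channel, and
# query_to_encoder_features additionally reads FLAGS.normalized,
# FLAGS.use_copy and FLAGS.copy_fun, plus vocabs.sc_vocab / vocabs.tg_vocab:
#
#   copy_tokens = query_to_copy_tokens('remove all tmp files', FLAGS)
#   features = query_to_encoder_features('remove all tmp files', vocabs, FLAGS)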
def main():
    # Merge the nl2bash and AInix datasets into a single de-duplicated list of
    # (natural language invocation, bash command) pairs.
    with open(os.path.join(os.getcwd(), 'data/nl2bash-data.json')) as fr:
        d_nl2bash = json.load(fr)
    with open(os.path.join(os.getcwd(), 'data/ainix_data.json')) as fr:
        d_ainix = json.load(fr)
    tuple_nl2bash = [(d_nl2bash[key]['invocation'], d_nl2bash[key]['cmd'])
                     for key in d_nl2bash]
    tuple_ainix = [(d_ainix[key]['invocation'], d_ainix[key]['cmd'])
                   for key in d_ainix]
    tuple_all = list(set(tuple_nl2bash + tuple_ainix))

    # Re-index the merged pairs into a dict keyed by 1-based string ids.
    d = {}
    for idx, t in enumerate(tuple_all):
        temp = {}
        temp['invocation'] = t[0]
        temp['cmd'] = [t[1]]
        d[str(idx + 1)] = temp

    def normalizer(text):
        # Collapse any token containing a known argument keyword into the
        # placeholder 'ARG'.
        for keyword in KEYWORD_LIST:
            if keyword in text:
                return 'ARG'
        return text

    input_template_predictor = []
    input_argument_predictor = []
    for t in tuple_all:
        # Lower-case only the first character of the invocation.
        nl = t[0][0].lower() + t[0][1:]
        norm_nl = ' '.join(
            tokenizer.basic_tokenizer(nl, to_lower_case=False,
                                      lemmatization=False,
                                      remove_stop_words=True,
                                      correct_spell=False)[0])
        norm_nl_arg_replace = ' '.join([
            normalizer(item) for item in data_utils.nl_to_tokens(
                nl, tokenizer=tokenizer.ner_tokenizer)
        ])
        cm = t[1]
        # Template form (arg_type_only=True) replaces arguments with their
        # types; the reference form keeps them. Tokens that differ between the
        # two are the command's arguments.
        norm_cm = [
            normalizer(item.split('<FLAG_SUFFIX>')[0])
            for item in data_utils.cm_to_tokens(
                cm, data_tools.bash_tokenizer, arg_type_only=True)
        ]
        norm_cm_ref = [
            normalizer(item.split('<FLAG_SUFFIX>')[0])
            for item in data_utils.cm_to_tokens(
                cm, data_tools.bash_tokenizer, arg_type_only=False)
        ]
        args = []
        for c, r in zip(norm_cm, norm_cm_ref):
            if c != r:
                args.append(r)
        norm_cm = ' '.join(norm_cm)
        if len(args) > 0:
            source = norm_nl + ' SEP ' + norm_cm
            target = ' SEP '.join(args)
            input_argument_predictor.append((source, target))
        input_template_predictor.append((norm_nl_arg_replace, norm_cm))

    random.seed(18015651)
    random.shuffle(input_template_predictor)
    random.shuffle(input_argument_predictor)

    # NOTE: the training files contain every example; the validation and test
    # files both re-use the tail of the shuffled list.
    os.makedirs(os.path.join(os.getcwd(), 'corpus/template_predictor'),
                exist_ok=True)
    with open(os.path.join(os.getcwd(), 'corpus/template_predictor/train.nl'), 'w') as fwn, \
            open(os.path.join(os.getcwd(), 'corpus/template_predictor/train.cm'), 'w') as fwc:
        for example in input_template_predictor:
            fwn.write(example[0] + '\n')
            fwc.write(example[1] + '\n')
    with open(os.path.join(os.getcwd(), 'corpus/template_predictor/valid.nl'), 'w') as fwn, \
            open(os.path.join(os.getcwd(), 'corpus/template_predictor/valid.cm'), 'w') as fwc:
        for example in input_template_predictor[10247:]:
            fwn.write(example[0] + '\n')
            fwc.write(example[1] + '\n')
    with open(os.path.join(os.getcwd(), 'corpus/template_predictor/test.nl'), 'w') as fwn, \
            open(os.path.join(os.getcwd(), 'corpus/template_predictor/test.cm'), 'w') as fwc:
        for example in input_template_predictor[10247:]:
            fwn.write(example[0] + '\n')
            fwc.write(example[1] + '\n')

    os.makedirs(os.path.join(os.getcwd(), 'corpus/argument_predictor'),
                exist_ok=True)
    with open(os.path.join(os.getcwd(), 'corpus/argument_predictor/train.ctx'), 'w') as fwc, \
            open(os.path.join(os.getcwd(), 'corpus/argument_predictor/train.arg'), 'w') as fwa:
        for example in input_argument_predictor:
            fwc.write(example[0] + '\n')
            fwa.write(example[1] + '\n')
    with open(os.path.join(os.getcwd(), 'corpus/argument_predictor/valid.ctx'), 'w') as fwc, \
            open(os.path.join(os.getcwd(), 'corpus/argument_predictor/valid.arg'), 'w') as fwa:
        for example in input_argument_predictor[9830:]:
            fwc.write(example[0] + '\n')
            fwa.write(example[1] + '\n')
    with open(os.path.join(os.getcwd(), 'corpus/argument_predictor/test.ctx'), 'w') as fwc, \
            open(os.path.join(os.getcwd(), 'corpus/argument_predictor/test.arg'), 'w') as fwa:
        for example in input_argument_predictor[9830:]:
            fwc.write(example[0] + '\n')
            fwa.write(example[1] + '\n')
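# Standard script entry point (assumed; not shown in the original excerpt).
if __name__ == '__main__':
    main()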