import collections
import functools
import json
import os
import random
import sys

import numpy as np

# Project-local helpers (basic_tokenizer, is_stopword, tokenizer, data_tools,
# data_utils, KEYWORD_LIST, cmd2template) are assumed to be importable from the
# surrounding repository.


def compute_nl_stats():
    input_file = sys.argv[1]
    unique_sentences = set()
    unique_words = set()
    words_per_sent = []
    sents_per_word = collections.defaultdict(int)
    with open(input_file) as f:
        for line in f:
            nl = line.strip()
            unique_sentences.add(nl)
            words, _ = basic_tokenizer(
                nl, to_lower_case=False, lemmatization=False)
            unique_words |= set(words)
            words_per_sent.append(len(words))
            # Count each word at most once per sentence so that the
            # "sentences per word" statistic below is not inflated by
            # repeated occurrences within a single sentence.
            for word in set(words):
                sents_per_word[word] += 1
    print('# unique sentences: {}'.format(len(unique_sentences)))
    print('# unique words: {}'.format(len(unique_words)))
    print('# words per sentence: average {}, median {}'.format(
        np.mean(words_per_sent), np.median(words_per_sent)))
    sent_counts = list(sents_per_word.values())
    print('# sentences per word: average {}, median {}'.format(
        np.mean(sent_counts), np.median(sent_counts)))
    # Show the five words that appear in the largest number of sentences.
    for w, freq in sorted(
            sents_per_word.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(w, freq)
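# Usage sketch (assumed; compute_nl_stats reads its input path from
# sys.argv[1]): running the enclosing script as, e.g.,
#
#   python stats.py data/train.nl
#
# prints the number of unique sentences/words, the words-per-sentence and
# sentences-per-word averages/medians, and the five most widespread words.
# "stats.py" is a hypothetical file name for this module.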
def count_unique_nls():
    nl_file = sys.argv[1]
    unique_nls = set()
    with open(nl_file) as f:
        for line in f:
            nl = line.strip()
            nl_temp = ' '.join(basic_tokenizer(nl)[0])
            unique_nls.add(nl_temp)
    print('number of unique natural language forms: {}'.format(
        len(unique_nls)))
def group_parallel_data(dataset, attribute='source', use_temp=False,
                        tokenizer_selector='nl'):
    """
    Group a parallel dataset by a certain attribute.

    :param dataset: a list of training quadruples (nl_str, cm_str, nl, cm)
    :param attribute: attribute by which the data is grouped
    :param use_temp: set to True to group the dataset by the natural language
        template; False to group by the natural language strings
    :param tokenizer_selector: specify which tokenizer to use for making
        templates
    :return: list of (key, data group) tuples sorted by the key value.
    """
    if dataset.data_points and isinstance(dataset.data_points, list):
        if isinstance(dataset.data_points[0], list):
            # Flatten bucketed data points into a single list.
            data_points = functools.reduce(
                lambda x, y: x + y, dataset.data_points)
        else:
            data_points = dataset.data_points
    else:
        raise ValueError('dataset.data_points must be a non-empty list')

    grouped_dataset = {}
    for i in range(len(data_points)):
        data_point = data_points[i]
        attr = data_point.sc_txt \
            if attribute == 'source' else data_point.tg_txt
        if use_temp:
            if tokenizer_selector == 'nl':
                words, _ = tokenizer.ner_tokenizer(attr)
            else:
                words = data_tools.bash_tokenizer(attr, arg_type_only=True)
            temp = ' '.join(words)
        else:
            if tokenizer_selector == 'nl':
                words, _ = tokenizer.basic_tokenizer(attr)
                temp = ' '.join(words)
            else:
                temp = attr
        if temp in grouped_dataset:
            grouped_dataset[temp].append(data_point)
        else:
            grouped_dataset[temp] = [data_point]

    return sorted(grouped_dataset.items(), key=lambda x: x[0])
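# Illustrative driver for group_parallel_data (a sketch, not part of the
# original module). _ToyExample and _ToyDataset are hypothetical stand-ins that
# only provide the attributes the function actually reads (.data_points,
# .sc_txt, .tg_txt); the project's tokenizer module is assumed importable as
# elsewhere in this file.
class _ToyExample(object):
    def __init__(self, sc_txt, tg_txt):
        self.sc_txt = sc_txt
        self.tg_txt = tg_txt


class _ToyDataset(object):
    def __init__(self, data_points):
        self.data_points = data_points


def _demo_group_parallel_data():
    dataset = _ToyDataset([
        _ToyExample('list all text files', 'find . -name "*.txt"'),
        _ToyExample('list every text file', 'find . -name "*.txt" -print'),
    ])
    # Group by the natural language side, without templatizing.
    for key, group in group_parallel_data(dataset, attribute='source',
                                          use_temp=False):
        print('{} -> {} example(s)'.format(key, len(group)))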
def main():
    with open(os.path.join(os.getcwd(), 'data/nl2bash-data.json')) as fr:
        d_nl2bash = json.load(fr)
    with open(os.path.join(os.getcwd(), 'data/ainix_data.json')) as fr:
        d_ainix = json.load(fr)

    tuple_nl2bash = [(d_nl2bash[key]['invocation'], d_nl2bash[key]['cmd'])
                     for key in d_nl2bash]
    tuple_ainix = [(d_ainix[key]['invocation'], d_ainix[key]['cmd'])
                   for key in d_ainix]
    tuple_all = list(set(tuple_nl2bash + tuple_ainix))

    d = {}
    for idx, t in enumerate(tuple_all):
        temp = {}
        temp['invocation'] = t[0]
        temp['cmd'] = [t[1]]
        d[str(idx + 1)] = temp

    def normalizer(text):
        # Collapse any token containing a keyword into the generic ARG symbol.
        for keyword in KEYWORD_LIST:
            if keyword in text:
                return 'ARG'
        return text

    input_template_predictor = []
    input_argument_predictor = []
    for t in tuple_all:
        nl = t[0][0].lower() + t[0][1:]
        norm_nl = ' '.join(
            tokenizer.basic_tokenizer(nl,
                                      to_lower_case=False,
                                      lemmatization=False,
                                      remove_stop_words=True,
                                      correct_spell=False)[0])
        norm_nl_arg_replace = ' '.join([
            normalizer(item) for item in data_utils.nl_to_tokens(
                nl, tokenizer=tokenizer.ner_tokenizer)
        ])
        cm = t[1]
        norm_cm = [
            normalizer(item.split('<FLAG_SUFFIX>')[0])
            for item in data_utils.cm_to_tokens(
                cm, data_tools.bash_tokenizer, arg_type_only=True)
        ]
        norm_cm_ref = [
            normalizer(item.split('<FLAG_SUFFIX>')[0])
            for item in data_utils.cm_to_tokens(
                cm, data_tools.bash_tokenizer, arg_type_only=False)
        ]
        # Tokens that differ between the templatized and the literal command
        # are treated as the arguments to be predicted.
        args = []
        for c, r in zip(norm_cm, norm_cm_ref):
            if c != r:
                args.append(r)
        norm_cm = ' '.join(norm_cm)
        if len(args) > 0:
            source = norm_nl + ' SEP ' + norm_cm
            target = ' SEP '.join(args)
            input_argument_predictor.append((source, target))
        input_template_predictor.append((norm_nl_arg_replace, norm_cm))

    random.seed(18015651)
    random.shuffle(input_template_predictor)
    random.shuffle(input_argument_predictor)

    # NOTE: the train files contain the full shuffled lists, while the valid
    # and test files both hold the tail slices ([10247:] for templates,
    # [9830:] for arguments).
    os.makedirs(os.path.join(os.getcwd(), 'corpus/template_predictor'),
                exist_ok=True)
    with open(os.path.join(os.getcwd(), 'corpus/template_predictor/train.nl'), 'w') as fwn, \
            open(os.path.join(os.getcwd(), 'corpus/template_predictor/train.cm'), 'w') as fwc:
        for example in input_template_predictor:
            fwn.write(example[0] + '\n')
            fwc.write(example[1] + '\n')
    with open(os.path.join(os.getcwd(), 'corpus/template_predictor/valid.nl'), 'w') as fwn, \
            open(os.path.join(os.getcwd(), 'corpus/template_predictor/valid.cm'), 'w') as fwc:
        for example in input_template_predictor[10247:]:
            fwn.write(example[0] + '\n')
            fwc.write(example[1] + '\n')
    with open(os.path.join(os.getcwd(), 'corpus/template_predictor/test.nl'), 'w') as fwn, \
            open(os.path.join(os.getcwd(), 'corpus/template_predictor/test.cm'), 'w') as fwc:
        for example in input_template_predictor[10247:]:
            fwn.write(example[0] + '\n')
            fwc.write(example[1] + '\n')

    os.makedirs(os.path.join(os.getcwd(), 'corpus/argument_predictor'),
                exist_ok=True)
    with open(os.path.join(os.getcwd(), 'corpus/argument_predictor/train.ctx'), 'w') as fwc, \
            open(os.path.join(os.getcwd(), 'corpus/argument_predictor/train.arg'), 'w') as fwa:
        for example in input_argument_predictor:
            fwc.write(example[0] + '\n')
            fwa.write(example[1] + '\n')
    with open(os.path.join(os.getcwd(), 'corpus/argument_predictor/valid.ctx'), 'w') as fwc, \
            open(os.path.join(os.getcwd(), 'corpus/argument_predictor/valid.arg'), 'w') as fwa:
        for example in input_argument_predictor[9830:]:
            fwc.write(example[0] + '\n')
            fwa.write(example[1] + '\n')
    with open(os.path.join(os.getcwd(), 'corpus/argument_predictor/test.ctx'), 'w') as fwc, \
            open(os.path.join(os.getcwd(), 'corpus/argument_predictor/test.arg'), 'w') as fwa:
        for example in input_argument_predictor[9830:]:
            fwc.write(example[0] + '\n')
            fwa.write(example[1] + '\n')
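# Entry-point sketch (assumed; the original excerpt does not show how main()
# is invoked). A typical driver would be:
#
#   if __name__ == '__main__':
#       main()
#
# after which the corpus/ directory holds the template and argument predictor
# splits written above.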
def get_example_nl_key(nl):
    """
    Get the natural language description in an example with nuances removed.
    """
    tokens, _ = tokenizer.basic_tokenizer(nl)
    return ' '.join(tokens)
def token_overlap(s1, s2):
    # basic_tokenizer returns (tokens, entities); keep only the tokens and
    # drop stopwords before comparing the two sentences.
    tokens1 = set([w for w in basic_tokenizer(s1)[0] if not is_stopword(w)])
    tokens2 = set([w for w in basic_tokenizer(s2)[0] if not is_stopword(w)])
    return (len(tokens1 & tokens2) + 0.0) / len(tokens1 | tokens2)
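# Worked example (a sketch): token_overlap is the Jaccard similarity of the
# stopword-filtered token sets. Assuming "all" is treated as a stopword,
#
#   token_overlap('delete all empty files', 'delete all empty directories')
#
# compares {'delete', 'empty', 'files'} with {'delete', 'empty', 'directories'}
# (up to whatever normalization basic_tokenizer applies), giving
# 2 shared tokens / 4 distinct tokens = 0.5.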
def dump_data(self, data_dir, num_folds=10):
    # First-pass: group pairs by URLs
    pairs = self.unique_pairs()

    # Second-pass: group url clusters by nls
    templates = {}
    urls = list(pairs.keys())
    print("%d urls in the database" % len(urls))
    merged_urls_by_nl = []
    for i in range(len(urls)):
        url = urls[i]
        merge = False
        for j in range(i + 1, len(urls)):
            url2 = urls[j]
            for nl in pairs[url]:
                if nl in templates:
                    nl_template1 = templates[nl]
                else:
                    nl_template1 = " ".join(basic_tokenizer(nl)[0])
                    templates[nl] = nl_template1
                for nl2 in pairs[url2]:
                    if nl2 in templates:
                        nl_template2 = templates[nl2]
                    else:
                        nl_template2 = " ".join(basic_tokenizer(nl2)[0])
                        templates[nl2] = nl_template2
                    if nl_template1 == nl_template2:
                        merge = True
                        break
                if merge:
                    break
            if merge:
                break
        if merge:
            # Fold this url's pairs into the cluster of the matching url2.
            for nl in pairs[url]:
                if nl in pairs[url2]:
                    pairs[url2][nl] = pairs[url][nl] | pairs[url2][nl]
                else:
                    pairs[url2][nl] = pairs[url][nl]
            merged_urls_by_nl.append(i)
    print("%d urls merged by nl" % len(merged_urls_by_nl))

    # Third-pass: group url clusters by commands (currently disabled)
    merged_urls_by_cmd = []
    """templates = {}
    for i in range(len(urls)):
        if i in merged_urls_by_nl:
            continue
        url = urls[i]
        merge = False
        for j in range(i + 1, len(urls)):
            if j in merged_urls_by_nl:
                continue
            url2 = urls[j]
            for _, cmds in pairs[url].items():
                for cmd in cmds:
                    if cmd in templates:
                        template = templates[cmd]
                    else:
                        template = cmd2template(cmd, arg_type_only=split_by_template)
                        templates[cmd] = template
                    for _, cmd2s in pairs[url2].items():
                        for cmd2 in cmd2s:
                            if cmd2 in templates:
                                template2 = templates[cmd2]
                            else:
                                template2 = cmd2template(cmd2, arg_type_only=split_by_template)
                                templates[cmd2] = template2
                            if template == template2:
                                merge = True
                                break
                        if merge:
                            break
                    if merge:
                        break
                if merge:
                    break
            if merge:
                break
        if merge:
            for nl in pairs[url]:
                if nl in pairs[url2]:
                    pairs[url2][nl] = pairs[url][nl] | pairs[url2][nl]
                else:
                    pairs[url2][nl] = pairs[url][nl]
            merged_urls_by_cmd.append(i)
    print("%d urls merged by cmd" % len(merged_urls_by_cmd))
    """

    remained_urls = []
    for i in range(len(urls)):
        if i in merged_urls_by_cmd:
            continue
        if i in merged_urls_by_nl:
            continue
        remained_urls.append(urls[i])
    # Sort the remaining url clusters by the number of pairs they contain,
    # largest first.
    sorted_urls = sorted(
        remained_urls,
        key=lambda x: sum(len(pairs[x][nl]) for nl in pairs[x]),
        reverse=True)

    data = collections.defaultdict(list)
    num_pairs = 0
    num_train = 0
    num_train_pairs = 0
    num_dev = 0
    num_dev_pairs = 0
    num_test = 0
    num_test_pairs = 0
    num_urls = 0

    top_k = 0
    for i in range(len(sorted_urls)):
        url = sorted_urls[i]
        url_size = sum(len(pairs[url][nl]) for nl in pairs[url])
        # print("url %d (%d)" % (i, url_size))
        if i < top_k:
            for nl in pairs[url]:
                print(nl)
            print("-------------")
            # The largest top_k clusters are always assigned to a training fold.
            ind = random.randrange(num_folds - 2)
            num_train += 1
            num_train_pairs += url_size
        else:
            ind = random.randrange(num_folds)
            if ind < num_folds - 2:
                num_train += 1
                num_train_pairs += url_size
            elif ind == num_folds - 2:
                num_dev += 1
                num_dev_pairs += url_size
            elif ind == num_folds - 1:
                num_test += 1
                num_test_pairs += url_size
        num_urls += 1

        fold = data[ind]
        for nl in pairs[url]:
            for cmd in pairs[url][nl]:
                num_pairs += 1
                cmd = cmd.strip().replace('\n', ' ').replace('\r', ' ')
                nl = nl.strip().replace('\n', ' ').replace('\r', ' ')
                if isinstance(nl, bytes):
                    nl = nl.decode('utf-8')
                if isinstance(cmd, bytes):
                    cmd = cmd.decode('utf-8')
                fold.append((nl, cmd))

    print("Total number of pairs: %d" % num_pairs)
    print("Total number of url clusters: %d" % num_urls)
    print("Total number of train clusters: %d (%d pairs)" %
          (num_train, num_train_pairs))
    print("Total number of dev clusters: %d (%d pairs)" %
          (num_dev, num_dev_pairs))
    print("Total number of test clusters: %d (%d pairs)" %
          (num_test, num_test_pairs))
    print("%.2f pairs per url cluster" % ((num_pairs + 0.0) / num_urls))

    # if split_by_template:
    #     split_by = "template"
    # else:
    #     split_by = "command"
    # with open(data_dir + "/data.by.%s.dat" % split_by, 'w') as o_f:
    #     pickle.dump(data, o_f)

    train_nl_list = []
    train_cm_list = []
    dev_nl_list = []
    dev_cm_list = []
    test_nl_list = []
    test_cm_list = []

    # The last two folds are reserved for dev and test respectively.
    numFolds = len(data)
    for i in range(numFolds):
        if i < numFolds - 2:
            for j in range(len(data[i])):
                train_nl_list.append(data[i][j][0])
                train_cm_list.append(data[i][j][1])
        elif i == numFolds - 2:
            for j in range(len(data[i])):
                dev_nl_list.append(data[i][j][0])
                dev_cm_list.append(data[i][j][1])
        elif i == numFolds - 1:
            for j in range(len(data[i])):
                test_nl_list.append(data[i][j][0])
                test_cm_list.append(data[i][j][1])

    def write_data(data_path, data):
        if not os.path.exists(data_path):
            with open(data_path, 'w') as o_f:
                for line in data:
                    o_f.write(line + '\n')
            print('{} saved'.format(data_path))

    train_path = os.path.join(data_dir, "train")
    dev_path = os.path.join(data_dir, "dev")
    test_path = os.path.join(data_dir, "test")
    write_data(train_path + ".nl", train_nl_list)
    write_data(train_path + ".cm", train_cm_list)
    write_data(dev_path + ".nl", dev_nl_list)
    write_data(dev_path + ".cm", dev_cm_list)
    write_data(test_path + ".nl", test_nl_list)
    write_data(test_path + ".cm", test_cm_list)
def get_nl_temp(nl):
    return ' '.join(basic_tokenizer(nl)[0])
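# Illustrative example (a sketch): get_nl_temp and get_example_nl_key both
# collapse surface variation by re-joining the basic_tokenizer output, so two
# phrasings such as "Removes all empty files" and "remove all empty file"
# would typically map to the same normalized key (assuming basic_tokenizer's
# default lowercasing and lemmatization are enabled).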