def replace_unk(hypo_str, src_str, alignment, unk):
    hypo_tokens = preprocess.word_tokenize(hypo_str)
    src_tokens = preprocess.word_tokenize(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            # Copy the aligned source token over the <unk> placeholder.
            hypo_tokens[i] = src_tokens[alignment[i]]
    return ' '.join(hypo_tokens)
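
# Usage sketch for replace_unk (hypothetical values; assumes
# preprocess.word_tokenize splits on whitespace and that alignment[i] is the
# source position attended to by hypothesis token i):
#
#   replace_unk('the <unk> sat', 'le chat assis', alignment=[0, 1, 2], unk='<unk>')
#   # -> 'the chat sat'
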
def readlines(filename, ignore_case=False, wrapped=False):
    lines = []
    with open(filename) as file:
        for line in file:
            line = line.rstrip()
            if ignore_case:
                line = line.lower()
            line = word_tokenize(line)
            lines.append([line] if wrapped else line)
    return lines
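
# Usage sketch for readlines (hypothetical file path; whether each entry is a
# token list or a string depends on this project's word_tokenize):
#
#   train_lines = readlines('data/train.txt', ignore_case=True)
#   wrapped_lines = readlines('data/train.txt', wrapped=True)
#   # wrapped=True wraps each tokenized line in a singleton list.
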
def __init__(self, input_path, word2idx):
    self.word2idx = word2idx
    self.seqs = []  # questions
    # self.idx_seqs = []
    self.g_ids = []
    self.g_ids_features = []
    self.g_adj = []
    self.sql_seqs = []  # source sql sequence
    # self.idx_sql_seqs = []
    with open(input_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            jo = json.loads(line, object_pairs_hook=OrderedDict)
            if len(jo['sql']) == 0:
                continue
            if 'text_tokens' in jo:
                # simple workaround for Spider dataset
                seq = jo['text_tokens'] + ['<eos>']
            else:
                seq = word_tokenize(jo['text']) + ['<eos>']
            # idx_seq = [word2idx[w] if w in word2idx else word2idx['<oov>'] for w in seq]
            self.seqs.append(seq)
            # self.idx_seqs.append(torch.tensor(idx_seq))
            sql_seq = jo['sql']
            # idx_sql_seq = [word2idx[w] if w in word2idx else word2idx['<oov>'] for w in sql_seq]
            self.sql_seqs.append(sql_seq)
            # self.idx_sql_seqs.append(torch.tensor(idx_sql_seq))
            self.g_ids.append(jo['g_ids'])
            g_ids_features = jo['g_ids_features']
            for k, v in g_ids_features.items():
                # Convert each node's feature string into vocabulary indices in place.
                feat_tokens = word_tokenize(v)
                g_ids_features[k] = [
                    word2idx[w] if w in word2idx else word2idx['<oov>']
                    for w in feat_tokens
                ]
            self.g_ids_features.append(jo['g_ids_features'])
            self.g_adj.append(jo['g_adj'])
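
# Usage sketch for the enclosing dataset class (hypothetical path, vocab, and
# class name; each line of the input file is a JSON object with 'text' or
# 'text_tokens', 'sql', 'g_ids', 'g_ids_features', and 'g_adj' fields):
#
#   word2idx = {'<oov>': 0, '<eos>': 1, 'show': 2}
#   data = GraphDataset('data/train.jsonl', word2idx)  # hypothetical class name
#   data.seqs[0]             # question tokens ending in '<eos>'
#   data.g_ids_features[0]   # node features mapped to vocabulary indices
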
def replace_spans(text: str, gen_d: Dict, d: Dict) -> str:
    """Replace words in text with spans from d."""
    # NOTE: word_tokenize is assumed to return a detokenized string here
    # (hence the .split()), consistent with its second use below.
    words = word_tokenize(text).split()
    # Compile the list of spans to replace via recursive search over the
    # parallel dicts: leaves of gen_d hold (sentence_idx, (L, R)) spans and
    # the matching leaves of d hold the replacement strings.
    replaces = []
    to_consider = [(gen_d, d)]
    while len(to_consider) > 0:
        cur_gen_d, cur_d = to_consider.pop()
        for k in cur_gen_d.keys():
            if type(cur_d[k]) == dict:
                to_consider.append((cur_gen_d[k], cur_d[k]))
            elif type(cur_d[k]) == str and cur_d[k].upper() != cur_d[k]:
                replaces.append((cur_gen_d[k], cur_d[k]))
    # Replace right-to-left (descending L) so earlier span indices stay valid.
    replaces.sort(key=lambda r: r[0][1][0], reverse=True)
    for (sentence_idx, (L, R)), s in replaces:
        assert sentence_idx == 0
        words = words[:L] + word_tokenize(s).split() + words[(R + 1):]
    return " ".join(words)
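
# Usage sketch for replace_spans (hypothetical dicts; gen_d mirrors d's nested
# structure, with leaves of the form (sentence_idx, (L, R))):
#
#   text = 'list all flights from boston'
#   gen_d = {'city': (0, (4, 4))}
#   d = {'city': 'denver'}
#   replace_spans(text, gen_d, d)
#   # -> 'list all flights from denver'
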
def get_word_count(x):
    """
    Return word count, unique word count, stop word count, unusual word
    count, and their total for each text in the Series.
    """
    wc = x.apply(lambda text: len(word_tokenize(text)))
    unique_wc = x.apply(lambda text: len(np.unique(tokenizer(text))))
    stop_wc = x.apply(lambda text: len(get_stopwords(tokenizer(text))))
    unusual_wc = x.apply(lambda text: len(get_unusual_words(tokenizer(text))))
    return pd.DataFrame(
        {
            '{}_word_count'.format(x.name): wc,
            '{}_unique_word_count'.format(x.name): unique_wc,
            '{}_stopword_count'.format(x.name): stop_wc,
            '{}_unusual_word_count'.format(x.name): unusual_wc,
            '{}_total_word_count'.format(x.name): wc + stop_wc + unusual_wc,
        }
    )
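
# Usage sketch for get_word_count (hypothetical Series; tokenizer,
# get_stopwords, and get_unusual_words are this project's text helpers):
#
#   import pandas as pd
#   s = pd.Series(['the cat sat on the mat'], name='text')
#   get_word_count(s)
#   # columns: text_word_count, text_unique_word_count, text_stopword_count,
#   #          text_unusual_word_count, text_total_word_count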