Example #1
def replace_unk(hypo_str, src_str, alignment, unk):
    """Replace each unk token in the hypothesis with its aligned source token."""
    hypo_tokens = preprocess.word_tokenize(hypo_str)
    # append <eos> so alignments pointing one past the last source word stay valid
    src_tokens = preprocess.word_tokenize(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            # alignment[i] gives the source position aligned to hypothesis position i
            hypo_tokens[i] = src_tokens[alignment[i]]
    return ' '.join(hypo_tokens)
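
A minimal usage sketch; the sentences, alignment, and '<unk>' marker below are illustrative, and preprocess.word_tokenize is assumed to split on whitespace:

hypo = 'the <unk> sat on the mat'
src = 'the cat sat on the mat'
alignment = [0, 1, 2, 3, 4, 5]  # hypothesis position -> source position
print(replace_unk(hypo, src, alignment, '<unk>'))  # -> 'the cat sat on the mat'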
Example #2
def readlines(filename, ignore_case=False, wrapped=False):
    """Tokenize each line of a file; optionally lowercase and/or wrap each token list."""
    lines = []
    with open(filename) as file:
        for line in file:  # iterate the file directly rather than materializing readlines()
            line = line.rstrip()
            if ignore_case:
                line = line.lower()
            line = word_tokenize(line)
            lines.append([line] if wrapped else line)
    return lines
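
A quick usage sketch, assuming word_tokenize is imported at module level (e.g. NLTK's, which returns a token list) and that notes.txt is a hypothetical input file:

token_lists = readlines('notes.txt', ignore_case=True)  # [['first', 'line'], ...]
wrapped = readlines('notes.txt', wrapped=True)          # [[['First', 'line']], ...]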
Example #3
    def __init__(self, input_path, word2idx):
        """Load question/SQL pairs and graph fields from a JSON-lines file."""
        self.word2idx = word2idx
        self.seqs = []  # questions
        # self.idx_seqs = []
        self.g_ids = []
        self.g_ids_features = []
        self.g_adj = []
        self.sql_seqs = []  # source sql sequence
        # self.idx_sql_seqs = []

        with open(input_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                line = line.strip()
                jo = json.loads(line, object_pairs_hook=OrderedDict)
                if len(jo['sql']) == 0:
                    continue
                if 'text_tokens' in jo:  # simple workaround for Spider dataset
                    seq = jo['text_tokens'] + ['<eos>']
                else:
                    seq = word_tokenize(jo['text']) + ['<eos>']
                # idx_seq = [word2idx[w] if w in word2idx else word2idx['<oov>'] for w in seq]
                self.seqs.append(seq)
                # self.idx_seqs.append(torch.tensor(idx_seq))
                sql_seq = jo['sql']
                # idx_sql_seq = [word2idx[w] if w in word2idx else word2idx['<oov>'] for w in sql_seq]
                self.sql_seqs.append(sql_seq)
                # self.idx_sql_seqs.append(torch.tensor(idx_sql_seq))

                self.g_ids.append(jo['g_ids'])
                # map each node's feature string to vocabulary indices, <oov> for unknown words
                g_ids_features = jo['g_ids_features']
                for k, v in g_ids_features.items():
                    feat_tokens = word_tokenize(v)
                    g_ids_features[k] = [
                        word2idx[w] if w in word2idx else word2idx['<oov>']
                        for w in feat_tokens
                    ]
                self.g_ids_features.append(g_ids_features)  # same dict, now index-valued
                self.g_adj.append(jo['g_adj'])
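
A hedged construction sketch; the class name GraphSQLDataset, the file name, and the JSON layout below are hypothetical, inferred only from the fields this loader reads:

# train.jsonl holds one JSON object per line, e.g.
# {"text": "list all users", "sql": ["SELECT", "*", "FROM", "users"],
#  "g_ids": {"0": 0}, "g_ids_features": {"0": "users table"}, "g_adj": {"0": []}}
word2idx = {'<oov>': 0, '<eos>': 1, 'users': 2, 'table': 3}
dataset = GraphSQLDataset('train.jsonl', word2idx)  # hypothetical class name
print(dataset.seqs[0])  # ['list', 'all', 'users', '<eos>']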
Example #4
def replace_spans(text: str, gen_d: Dict, d: Dict) -> str:
    """Replace the token spans given by gen_d with the corresponding strings from d."""

    # word_tokenize here returns a space-joined string, hence the .split()
    words = word_tokenize(text).split()

    # compile the list of spans to replace via an iterative depth-first search
    replaces = []
    to_consider = [(gen_d, d)]
    while len(to_consider) > 0:
        cur_gen_d, cur_d = to_consider.pop()
        for k in cur_gen_d.keys():
            if isinstance(cur_d[k], dict):
                to_consider.append((cur_gen_d[k], cur_d[k]))
            elif isinstance(cur_d[k], str) and cur_d[k].upper() != cur_d[k]:
                # only values that are not entirely uppercase count as replaceable text
                replaces.append((cur_gen_d[k], cur_d[k]))

    # splice right-to-left so earlier span indices stay valid after each replacement
    replaces.sort(key=lambda r: r[0][1][0], reverse=True)  # sort by span start L, descending
    for (sentence_idx, (L, R)), s in replaces:
        assert sentence_idx == 0
        words = words[:L] + word_tokenize(s).split() + words[(R + 1):]

    return " ".join(words)
Example #5
def get_word_count(x):
    """
    return normal word count, stop word count, unusual word count
    """
    wc = x.apply(lambda text: len(word_tokenize(text)))
    unique_wc = x.apply(lambda text: len(np.unique(tokenizer(text))))
    stop_wc = x.apply(lambda text: len(get_stopwords(tokenizer(text))))
    unusual_wc = x.apply(lambda text: len(get_unusual_words(tokenizer(text))))
    return pd.DataFrame(
        {
            '{}_word_count'.format(x.name): wc,
            '{}_unique_word_count'.format(x.name): unique_wc,
            '{}_stopword_count'.format(x.name): stop_wc,
            '{}_unusual_word_count'.format(x.name): unusual_wc,
            '{}_total_word_count'.format(x.name): wc + stop_wc + unusual_wc
        }
    )
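
A usage sketch over a pandas Series; tokenizer, get_stopwords, and get_unusual_words are this project's own helpers, so only the column layout is shown:

import pandas as pd

texts = pd.Series(['the cat sat on the mat', 'dogs bark loudly'], name='body')
features = get_word_count(texts)
print(list(features.columns))
# ['body_word_count', 'body_unique_word_count', 'body_stopword_count',
#  'body_unusual_word_count', 'body_total_word_count']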