def __init__(self):
    self.data = []
    self.dictionary = Dictionary()
    self.max_sent_len = 0
    # Read the positive reviews
    with open(POSITIVE_REVIEWS_FILE, encoding='utf-8') as f:
        positive_reviews = f.readlines()
    for review in positive_reviews:
        review = normalize_string(review)
        review_words = word_tokenize(review)
        self.dictionary.add_words(review_words)
        self.data.append((review, 1))
        self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))
    # Read the negative reviews
    with open(NEGAGIVE_REVIEWS_FILE, encoding='utf-8') as f:
        negative_reviews = f.readlines()
    for review in negative_reviews:
        review = normalize_string(review)
        review_words = word_tokenize(review)
        self.dictionary.add_words(review_words)
        self.data.append((review, 0))
        self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))
    # Split the original dataset into train/test
    random.shuffle(self.data)
    split_index = int(0.9 * len(self.data))
    self.train = AugmentedList(self.data[:split_index])
    self.test = AugmentedList(self.data[split_index:])
def score_it_s(s, d):
    sents_in_doc = docsentdic[d]
    doc_words_lower_lemmatized = []
    for sent in sents_in_doc:
        doc_words_lower_lemmatized += [
            lemmatizer.lemmatize(t)
            for t in word_tokenize(sentwordrawdic[sent], lower=True)
        ]
    doc_words_lower_lemmatized = [
        t for t in doc_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]
    sent_words_lower_lemmatized = [
        lemmatizer.lemmatize(t) for t in word_tokenize(s, lower=True)
    ]
    sent_words_lower_lemmatized = [
        t for t in sent_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]
    return len([
        t for t in sent_words_lower_lemmatized
        if t in index_term_lower_lemmatized
    ]) / len([
        t for t in doc_words_lower_lemmatized
        if t in index_term_lower_lemmatized
    ])
def convert_to_features(config, data, word2idx_dict, char2idx_dict):
    example = {}
    context, question = data
    context = context.replace("''", '" ').replace("``", '" ')
    question = question.replace("''", '" ').replace("``", '" ')
    example['context_tokens'] = word_tokenize(context)
    example['ques_tokens'] = word_tokenize(question)
    example['context_chars'] = [
        list(token) for token in example['context_tokens']
    ]
    example['ques_chars'] = [list(token) for token in example['ques_tokens']]

    para_limit = config.para_limit
    ques_limit = config.ques_limit
    # ans_limit = 100
    char_limit = config.char_limit

    def filter_func(_example):
        return len(_example["context_tokens"]) > para_limit or \
               len(_example["ques_tokens"]) > ques_limit

    if filter_func(example):
        raise ValueError("Context/Questions lengths are over the limit")

    context_idxs = np.zeros([para_limit], dtype=np.int32)
    context_char_idxs = np.zeros([para_limit, char_limit], dtype=np.int32)
    ques_idxs = np.zeros([ques_limit], dtype=np.int32)
    ques_char_idxs = np.zeros([ques_limit, char_limit], dtype=np.int32)

    def _get_word(word):
        for each in (word, word.lower(), word.capitalize(), word.upper()):
            if each in word2idx_dict:
                return word2idx_dict[each]
        return 1

    def _get_char(_char):
        if _char in char2idx_dict:
            return char2idx_dict[_char]
        return 1

    for i, token in enumerate(example["context_tokens"]):
        context_idxs[i] = _get_word(token)
    for i, token in enumerate(example["ques_tokens"]):
        ques_idxs[i] = _get_word(token)
    for i, token in enumerate(example["context_chars"]):
        for j, char in enumerate(token):
            if j == char_limit:
                break
            context_char_idxs[i, j] = _get_char(char)
    for i, token in enumerate(example["ques_chars"]):
        for j, char in enumerate(token):
            if j == char_limit:
                break
            ques_char_idxs[i, j] = _get_char(char)
    return context_idxs, context_char_idxs, ques_idxs, ques_char_idxs
def get_answer(content, question, session, model, word_dictionary,
               char_dictionary, config):
    try:
        content_tokenized = word_tokenize(
            content.replace("''", '" ').replace("``", '" '))
        content = ''.join(content_tokenized[:config.para_limit])
        candidate_keys = ['什么', '谁', '哪', '几', '何', '多', '是否', '怎么', '嘛', '怎样']
        question_tokenized = word_tokenize(
            question.replace("''", '" ').replace("``", '" '))
        if len(question_tokenized) > config.ques_limit:
            find, pos = False, 0
            for idx, token in enumerate(question_tokenized):
                for key in candidate_keys:
                    if key in token:
                        find, pos = True, idx
                        break
                if find:
                    break
            if find:
                question = ''.join(
                    question_tokenized[max(0, pos - int(config.ques_limit / 2 + 1)):
                                       min(pos + int(config.ques_limit / 2 - 1),
                                           len(question_tokenized) - 1)])
            else:
                question = ''.join(
                    question_tokenized[len(question_tokenized) - config.ques_limit + 1:])
        c, ch, q, qh = convert_to_features(config, (content, question),
                                           word_dictionary, char_dictionary)
        fd = {'context:0': [c],
              'question:0': [q],
              'context_char:0': [ch],
              'question_char:0': [qh]}
        yp1, yp2 = session.run([model.yp1, model.yp2], feed_dict=fd)
        yp2[0] += 1
        return "".join(content_tokenized[yp1[0]:yp2[0]])
    except ValueError:
        print("ValueError triggered!")
        return None
def split_data(self, filename):
    self.load_data(filename)
    sub_dir = filename.split('-')[0]
    # create a subdirectory for Train and Dev data
    if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
        os.makedirs(os.path.join(self.data_dir, sub_dir))
    with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.labels'), 'w', encoding="utf-8") as labels_file:
        # loop over the data
        for article_id in tqdm.tqdm(range(len(self.data['data']))):
            list_paragraphs = self.data['data'][article_id]['paragraphs']
            # loop over the paragraphs
            for paragraph in list_paragraphs:
                context = paragraph['context']
                context = clean_text(context)
                context_tokens = [w for w in word_tokenize(context) if w]
                spans = convert_idx(context, context_tokens)
                qas = paragraph['qas']
                # loop over Q/A
                for qa in qas:
                    question = qa['question']
                    question = clean_text(question)
                    question_tokens = [w for w in word_tokenize(question) if w]
                    if sub_dir == "train":
                        # select only one ground truth, the top answer, if any answer
                        answer_ids = 1 if qa['answers'] else 0
                    else:
                        answer_ids = len(qa['answers'])
                    labels = []
                    if answer_ids:
                        for answer_id in range(answer_ids):
                            answer = qa['answers'][answer_id]['text']
                            answer = clean_text(answer)
                            answer_tokens = [w for w in word_tokenize(answer) if w]
                            answer_start = qa['answers'][answer_id]['answer_start']
                            answer_stop = answer_start + len(answer)
                            answer_span = []
                            for idx, span in enumerate(spans):
                                if not (answer_stop <= span[0] or answer_start >= span[1]):
                                    answer_span.append(idx)
                            if not answer_span:
                                continue
                            labels.append(str(answer_span[0]) + ' ' + str(answer_span[-1]))
                        # write to file
                        context_file.write(' '.join([token for token in context_tokens]) + '\n')
                        question_file.write(' '.join([token for token in question_tokens]) + '\n')
                        answer_file.write(' '.join([token for token in answer_tokens]) + '\n')
                        labels_file.write("|".join(labels) + "\n")
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    categories = get_categories(text)
    if not list(set(categories).intersection(input_categories)):
        return None, None, None, None
    text = filter_wiki(text)
    sentences = sentence_tokenize(text)
    title = title.replace(' ', '_')
    paragraphs = {}
    # Split document into paragraphs
    # sentences = [s0, s1, t0, s2, t1, ...]
    paragraph_title = [title]
    level = 1
    this_sentences = []
    for sent in sentences:
        # Sent is a paragraph title
        if sent[:1] == '=':
            pt = '/'.join(paragraph_title)
            pt = pt.replace(',', '')
            paragraphs[pt] = this_sentences
            this_sentences = []
            # Level of paragraph
            level = max(len(s) for s in re.findall(r'=+', sent))
            this_title = sent[level:len(sent) - level].strip().replace(' ', '_')
            if level > len(paragraph_title):
                paragraph_title.append(this_title)
            elif level < len(paragraph_title):
                for i in range(len(paragraph_title) - level):
                    paragraph_title.pop()
                paragraph_title[level - 1] = this_title
            else:
                paragraph_title[level - 1] = this_title
        else:
            this_sentences.append(sent)
    pt = '/'.join(paragraph_title)
    pt = pt.replace(',', '')
    paragraphs[pt] = this_sentences
    if lemmatize:
        result = {k: [utils.lemmatize(s) for s in v if len(utils.lemmatize(s)) >= 2]
                  for k, v in paragraphs.items() if len(v) >= 0}
    else:
        result = {k: [word_tokenize(s) for s in v if len(word_tokenize(s)) >= 2]
                  for k, v in paragraphs.items() if len(v) >= 0}
    return categories, result, title, pageid
def get_terms_and_words(self, field):
    words = [
        self.analyzer.parse(word)[0].normal_form
        for word in word_tokenize(field) if word not in stop_words
    ]
    terms = set(words)
    return terms, words
def _tokenize(self, text):
    if not self._pristine_input:
        text = text.lower()
    if self.word_tokens:
        if self._pristine_input:
            return text.split()
        return word_tokenize(text)
    return text
def get_clean_text(text_list):
    """
    :param text_list: a list of strings
    :return: string - tokenized and with sent tags
    """
    text_list = [txt for txt in text_list if len(txt.strip()) > 0]
    text_list = [' '.join(utils.word_tokenize(txt, tokenizer)) for txt in text_list]
    text = utils.sent_list_to_tagged_str(text_list)
    return text
def build_word_vectors(infile_name, outfile_name):
    print('building word vectors...')
    pynlpir.open()
    jieba.initialize()
    df = pd.read_json(infile_name)
    with open(outfile_name, 'w') as f:
        for content in tqdm(df.article_content):
            f.write(' '.join(word_tokenize(content)))
def score_tfidf_s(s, d):
    sent_words_lower_lemmatized = [
        lemmatizer.lemmatize(t) for t in word_tokenize(s, lower=True)
    ]
    sent_words_lower_lemmatized = [
        t for t in sent_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]
    return mean([score_tfidf_w(w, d) for w in sent_words_lower_lemmatized])
def next_batch(self, batch_size, mode=TRAIN_MODE):
    review_lengths, reviews, targets = [], [], []
    data = self.train if mode == TRAIN_MODE else self.test
    batch = data.next_items(batch_size)
    for (review, target) in batch:
        review_length = len(word_tokenize(normalize_string(review)))
        review = indexes_from_sentence(review, self.dictionary, self.max_sent_len)
        target = one_hot_encoding(2, target)
        reviews.append(review)
        targets.append(target)
        review_lengths.append(review_length)
    return review_lengths, reviews, targets
def tf(w, d):
    sents_in_doc = docsentdic[d]
    doc_words_lower_lemmatized = []
    for sent in sents_in_doc:
        doc_words_lower_lemmatized += [
            lemmatizer.lemmatize(t)
            for t in word_tokenize(sentwordrawdic[sent], lower=True)
        ]
    doc_words_lower_lemmatized = [
        t for t in doc_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]
    return math.log(1 + (doc_words_lower_lemmatized.count(w) /
                         len(doc_words_lower_lemmatized)))
def score_dt_s(s, d):
    doc_title_words = d.replace('_', ' ').replace('/', ' ').split()
    doc_title_words_lower = [t.lower() for t in doc_title_words]
    doc_title_words_lower_lemmatized = [
        lemmatizer.lemmatize(t) for t in doc_title_words_lower
    ]
    doc_title_words_lower_lemmatized = [
        t for t in doc_title_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]
    sent_words = word_tokenize(s, lower=True)
    sent_words_lemmatized = [lemmatizer.lemmatize(t) for t in sent_words]
    return len([
        x for x in doc_title_words_lower_lemmatized
        if x in sent_words_lemmatized
    ]) / len(doc_title_words_lower_lemmatized)
def normalize_corpus(corpus, lemmatize=False):
    # Takes a list of documents and returns a 2-D list in which each element
    # is the word list of one document.
    normalize_corpus = []
    text_list = [remove_special_characters(text) for text in corpus]
    # raw English texts with special characters removed -- a cleaner input
    for text in text_list:
        text = expand_contrations(text)
        if (lemmatize):
            pass
        else:
            text = text.lower()
        text = word_tokenize(text)
        normalize_corpus.append(text)
        #print(text)
    normalize_corpus = [
        remove_stopwords(text, 'stopwords.txt') for text in normalize_corpus
    ]
    return normalize_corpus
def prep_input(self, input):
    # clean input
    article_list = [clean_str(x.strip()) for x in input]
    # tokenize
    x = [word_tokenize(d) for d in article_list]
    # replace with dictionary or unk
    x = [[self.word_dict.get(w, self.word_dict["<unk>"]) for w in d] for d in x]
    # trim as necessary
    x = [d[:self.article_max_len] for d in x]
    x = [
        d + (self.article_max_len - len(d)) * [self.word_dict["<padding>"]]
        for d in x
    ]
    return x
def concatenate_data(squad_data_dir, newsqa_data_dir, out_dir, env="train",
                     full_context=False):
    ext = ".context" if full_context else ".sentence"
    sentence_files = [
        os.path.join(squad_data_dir, env, env + ext),
        os.path.join(newsqa_data_dir, env, env + ext)
    ]
    question_files = [
        os.path.join(squad_data_dir, env, env + ".question"),
        os.path.join(newsqa_data_dir, env, env + ".question")
    ]
    out_sentence_filename = os.path.join(out_dir, env + ext)
    out_question_filename = os.path.join(out_dir, env + ".question")
    for infiles, outfile in zip(
            [sentence_files, question_files],
            [out_sentence_filename, out_question_filename]):
        with open(outfile, "w") as o:
            for f in infiles:
                with open(f) as infile:
                    for line in infile:
                        o.write(line)
    with open(out_sentence_filename, "r") as f,\
         open(out_question_filename, "r") as g:
        sentence_lines = f.readlines()
        question_lines = g.readlines()
    sentence_lines, question_lines = zip(
        *[(s, q) for s, q in sorted(zip(sentence_lines, question_lines),
                                    key=lambda x: len(word_tokenize(x[0])))])
    with open(out_sentence_filename, "w") as f,\
         open(out_question_filename, "w") as g:
        for line in sentence_lines:
            f.write(line)
        for line in question_lines:
            g.write(line)
def parse_sent(self, sent, fix_length):
    sent = [
        self.word_dict[w] if w in self.word_dict else 0
        for w in utils.word_tokenize(sent)
    ]
    sent, _ = self.pad_to_fix_len(sent, fix_length, padding_front=False)
    return sent
def process_file(filename, data_type, word_counter, char_counter, ques_limit):
    """
    Read the contents of a text file and do the initial processing.
    If the dataset is the training set, the contents need to be filtered.
    :param filename:
    :param data_type:
    :param word_counter:
    :param char_counter:
    :return:
    """
    print("Processing {} examples...".format(data_type))
    examples = []
    eval_examples = {}
    total = 0
    with open(filename, "r") as fh:
        source = json.load(fh)
        # TODO: filtering happens here during preprocessing, but spans cannot be computed afterwards
        for article in tqdm(source):
            content = article['article_title'] + '。' + article['article_content']
            content_tokens = word_tokenize(content)
            content_chars = [list(token) for token in content_tokens]
            spans = convert_idx(content, content_tokens)
            for token in content_tokens:
                word_counter[token] += len(article['questions'])
                for char in token:
                    char_counter[char] += len(article["questions"])
            for q in article['questions']:
                question_text = q["question"]
                answer_text = q['answer']
                question_tokens = word_tokenize(question_text)
                question_tokens = shrink_question_tokens(question_tokens, ques_limit)
                question_chars = [list(token) for token in question_tokens]
                result = list(substring_indexes(answer_text, content))
                for token in question_tokens:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1
                if len(result) == 1:
                    # Convert the character offset in result to token positions;
                    # y1 and y2 are the start and end token indices respectively.
                    current_pos, start_token, end_token = 0, -1, -1
                    for token_cnt, token in enumerate(content_tokens):
                        if current_pos > result[0] and start_token == -1:
                            start_token = token_cnt - 1
                        if current_pos > result[0] + len(q["answer"]):
                            end_token = token_cnt - 2
                            break
                        current_pos += len(token)
                    total += 1
                    example = {
                        "context_tokens": content_tokens,
                        "context_chars": content_chars,
                        "ques_tokens": question_tokens,
                        "ques_chars": question_chars,
                        "y1s": [start_token],
                        "y2s": [end_token],
                        "id": total
                    }
                    eval_examples[str(total)] = {
                        "context": content,
                        "spans": spans,  # mapping between every token of the full text and its position
                        "answers": [answer_text],  # TODO: change this so it is not split per paragraph
                        "uuid": q["questions_id"]
                    }
                    # The example itself does not keep the original question text;
                    # it is stored here so it can be used when displaying results later.
                    examples.append(example)
    # Questions that span multiple paragraphs are not considered.
    random.shuffle(examples)
    print("{} questions in total".format(len(examples)))
    return examples, eval_examples
def split_data(self, filename):
    self.load_data(filename)
    envs = ["train", "dev"]
    for sub_dir in envs:
        # create a subdirectory for Train and Dev data
        if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
            os.makedirs(os.path.join(self.data_dir, sub_dir))
        with open(os.path.join(self.data_dir, sub_dir, sub_dir + ".context"), "w", encoding="utf-8") as context_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + ".sentence"), "w", encoding="utf-8") as sentence_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + ".question"), "w", encoding="utf-8") as question_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + ".answer"), "w", encoding="utf-8") as answer_file:
            # loop over the data
            for article in tqdm.tqdm(self.data["data"]):
                context = article["text"]
                context_tokens = word_tokenize(context)
                context_sentences = sent_tokenize(context)
                if config.paragraph and (
                        len(context_tokens) < config.min_len_context
                        or len(context_tokens) > config.max_len_context):
                    continue
                spans = convert_idx(context, context_tokens)
                num_tokens = 0
                first_token_sentence = []
                for sentence in context_sentences:
                    first_token_sentence.append(num_tokens)
                    num_tokens += len(sentence)
                if not article["type"] == sub_dir:
                    continue
                for question in article["questions"]:
                    if question.get("isQuestionBad") == 0 and question["consensus"].get("s"):
                        q = question["q"].strip()
                        if q[-1] != "?" or len(q.split()) < config.min_len_question \
                                or len(q.split()) > config.max_len_question:
                            continue
                        answer_start = question["consensus"]["s"]
                        answer = context[question["consensus"]["s"]:
                                         question["consensus"]["e"]].strip(".| ").strip("\n")
                        answer_stop = answer_start + len(answer)
                        # Getting spans of the answer in the context
                        answer_span = []
                        for idx, span in enumerate(spans):
                            if not (answer_stop <= span[0] or answer_start >= span[1]):
                                answer_span.append(idx)
                        if not answer_span:
                            continue
                        # Getting the sentence where we have the answer
                        sentence_tokens = []
                        for idx, start in enumerate(first_token_sentence):
                            if answer_span[0] >= start:
                                sentence_tokens = context_sentences[idx]
                                answer_sentence_span = [
                                    span - start for span in answer_span
                                ]
                            else:
                                break
                        # write to file
                        sent = []
                        for idx, token in enumerate(sentence_tokens):
                            if token.strip("\n").strip():
                                if idx in answer_sentence_span:
                                    sent.append(token + u"│" + "1")
                                else:
                                    sent.append(token + u"│" + "0")
                        sent = " ".join(sent)
                        sent = sent.strip()
                        index = sent.find("(│0 CNN│0 )│0 --│0 ")
                        if index > -1:
                            sent = sent[index + len("(│0 CNN│0 )│0 --│0 "):]
                        ctxt = []
                        for idx, token in enumerate(context_tokens):
                            if token.strip("\n").strip():
                                if idx in answer_span:
                                    ctxt.append(token + u"│" + "1")
                                else:
                                    ctxt.append(token + u"│" + "0")
                        ctxt = " ".join(ctxt)
                        ctxt = ctxt.strip()
                        index = ctxt.find("(│0 CNN│0 )│0 --│0 ")
                        if index > -1:
                            ctxt = ctxt[index + len("(│0 CNN│0 )│0 --│0 "):]
                        context_file.write(ctxt + "\n")
                        sentence_file.write(sent + "\n")
                        question_file.write(q + "\n")
                        answer_file.write(answer + "\n")
def tokenize_word():
    print('-- tokenize words')
    words = [word_tokenize(data) for data in raw_data + test_data]
    labels = [label for label in raw_labels + test_labels]
    return words, labels
parser.add_argument('--input', '-i', required=True,
                    help='input preprocessed text file')
parser.add_argument('--output', '-o', required=True,
                    help='output file name')
args = parser.parse_args()

title_text_pair = []
with open(args.input, 'r') as f:
    lines = f.read().splitlines()
title = []
text = []
this_title = ''
this_text = ''
for line in lines:
    if len(word_tokenize(line)) < 30 and line[-1] != '.':
        this_title = line
    else:
        this_text = line
        title_text_pair.append((this_title, this_text))
with open(args.output, 'w') as f:
    titles = []
    for title, text in title_text_pair:
        title = title.replace(' ', '_')
        title = title.replace(',', '')
        if title in titles:
            title = title + '_'
        titles.append(title)
        f.write(title + '\n')
        sents = [s for s in sent_tokenize(text) if len(word_tokenize(s)) >= 5]
def predict(self, raw_sentences):
    if type(raw_sentences) is not list:
        raw_sentences = [raw_sentences]
    if self.params.get('convert_slang', True):
        split_sentences = [
            sentence.split(' ') for sentence in raw_sentences
        ]
        convert_sentences = [[
            self.slang_dict[_word] if _word in self.slang_dict else _word
            for _word in word
        ] for word in split_sentences]
        raw_sentences = [
            clean_str(' '.join(sentence)) for sentence in convert_sentences
        ]
    with tf.device('/cpu:0'):
        sess, word2id, label2id = self.model.sess, \
            self.word_vocab['word2id'], self.label_vocab['label2id']
        # tokenize word and create faked labels
        sentences = [word_tokenize(sentence) for sentence in raw_sentences]
        labels = [self.label_vocab['id2label'][0]] * len(sentences)  # just faked label
        dataset = create_dataset(sentences, labels, word2id, label2id)
        #
        if self.params.get('sentiment_lexicon', False):
            lexicons = train.convert_sentiment_lexicon(
                self.lexicon_vocab['lexicon2id'], sentences)
            dataset = add_to_dataset(dataset, lexicons,
                                     self.lexicon_vocab['lexicon2id'],
                                     'lexicon_ids')
        #
        if self.params.get('pos', False):
            pos = train.convert_pos(self.pos_vocab['pos2id'], sentences)
            dataset = add_to_dataset(dataset, pos, self.pos_vocab['pos2id'],
                                     'pos_ids')
        #
        if self.params.get('vader_lexicon', False):
            vaderlexicons = train.convert_vader_lexicon(
                self.vader_lexicon_vocab['vaderlexicon2id'], sentences)
            dataset = add_to_dataset(
                dataset, vaderlexicons,
                self.vader_lexicon_vocab['vaderlexicon2id'],
                'vader_lexicon_ids')
        # prepare input feed dict
        ip_feed_dict = self.model.create_input(dataset)
        for k, v in self.model.create_additional_input(dataset).items():
            ip_feed_dict[k] = v
        if hasattr(self.model, 'dropout'):
            ip_feed_dict[self.model.dropout] = 1.0
        if hasattr(self.model, 'is_training'):
            ip_feed_dict[self.model.is_training] = False
        predict, predict_proba = sess.run(
            [self.model.logits, tf.nn.softmax(self.model.logits)],
            feed_dict=ip_feed_dict)
        predict_ids = np.argmax(predict, axis=1)
        return [
            (
                self.label_vocab['id2label'][predict_id],
                predict_proba[id][predict_id],
                {self.label_vocab['id2label'][i]: p
                 for i, p in enumerate(predict_proba[id])}
            )
            for id, predict_id in enumerate(predict_ids)
        ]
def make_dataset(self, train_path, test_path, is_convert_slang,
                 is_sentiment_lexicon, is_pos, is_vader_lexicon):
    # load file
    raw_data, raw_labels = load_train_file(train_path)
    test_data, test_labels = load_train_file(test_path)
    len_train = len(raw_data)
    print('-- tokenize words')
    words, labels = [word_tokenize(data) for data in raw_data + test_data], \
                    [label for label in raw_labels + test_labels]
    #
    # convert slang into its corresponding word (maybe not?)
    #
    if is_convert_slang:
        slang_path = os.path.dirname(os.path.realpath(__file__)) + \
            '/data/preprocess/slang/slang.pkl'
        slang_dict = load_dict(slang_path)
        words = train.convert_slang(slang_dict, words)
    #
    # split train and test.
    #
    print('-- split train and test data')
    train_words, train_labels, test_words, test_labels = \
        words[:len_train], labels[:len_train], words[len_train:], labels[len_train:]
    #
    # create dataset
    #
    train_dataset = create_dataset(train_words, train_labels,
                                   self.word_vocab['word2id'],
                                   self.label_vocab['label2id'])
    test_dataset = create_dataset(test_words, test_labels,
                                  self.word_vocab['word2id'],
                                  self.label_vocab['label2id'])
    #
    # add sentiment_lexicon as additional features (maybe not?)
    #
    if is_sentiment_lexicon:
        lexicon_path = os.path.dirname(os.path.realpath(__file__)) + \
            '/data/preprocess/lexicon/lexicon.pkl'
        lexicon_dct = load_dict(lexicon_path)
        lexicons = train.convert_sentiment_lexicon(lexicon_dct, words, 'neu')
        train_lexicons, test_lexicons = lexicons[:len_train], lexicons[len_train:]
        train_dataset = add_to_dataset(train_dataset, train_lexicons,
                                       self.lexicon_vocab['lexicon2id'],
                                       'lexicon_ids')
        test_dataset = add_to_dataset(test_dataset, test_lexicons,
                                      self.lexicon_vocab['lexicon2id'],
                                      'lexicon_ids')
    #
    # add pos (part-of-speech) as additional features (maybe not?)
    #
    if is_pos:
        pos = train.convert_pos(words)
        train_pos, test_pos = pos[:len_train], pos[len_train:]
        train_dataset = add_to_dataset(train_dataset, train_pos,
                                       self.pos_vocab['pos2id'], 'pos_ids')
        test_dataset = add_to_dataset(test_dataset, test_pos,
                                      self.pos_vocab['pos2id'], 'pos_ids')
    #
    # add vader_lexicon as additional features (maybe not?)
    #
    if is_vader_lexicon:
        vaderlexicons = train.convert_vader_lexicon(
            self.vader_lexicon_vocab['vaderlexicon2id'], words)
        train_vaderlexicons, test_vaderlexicons = \
            vaderlexicons[:len_train], vaderlexicons[len_train:]
        train_dataset = add_to_dataset(
            train_dataset, train_vaderlexicons,
            self.vader_lexicon_vocab['vaderlexicon2id'], 'vader_lexicon_ids')
        test_dataset = add_to_dataset(
            test_dataset, test_vaderlexicons,
            self.vader_lexicon_vocab['vaderlexicon2id'], 'vader_lexicon_ids')
    return train_dataset, test_dataset
def read_news(news_path, args, mode='train'):
    news = {}
    categories = []
    subcategories = []
    domains = []
    news_index = {}
    index = 1
    word_cnt = Counter()

    with tf.io.gfile.GFile(news_path, "r") as f:
        for line in tqdm(f):
            splited = line.strip('\n').split('\t')
            doc_id, category, subcategory, title, abstract, url, _, _ = splited
            body = ""
            news_index[doc_id] = index
            index += 1

            if 'title' in args.news_attributes:
                title = title.lower()
                title = word_tokenize(title)
            else:
                title = []

            if 'abstract' in args.news_attributes:
                abstract = abstract.lower()
                abstract = word_tokenize(abstract)
            else:
                abstract = []

            if 'body' in args.news_attributes:
                body = body.lower()[:2000]
                body = word_tokenize(body)
            else:
                body = []

            if 'category' in args.news_attributes:
                categories.append(category)
            else:
                category = None

            if 'subcategory' in args.news_attributes:
                subcategories.append(subcategory)
            else:
                subcategory = None

            if 'domain' in args.news_attributes:
                domain = get_domain(url)
                domains.append(domain)
            else:
                domain = None

            news[doc_id] = [title, abstract, body, category, domain, subcategory]
            if mode == 'train':
                word_cnt.update(title + abstract + body)

    if mode == 'train':
        word = [k for k, v in word_cnt.items() if v > args.filter_num]
        word_dict = {k: v for k, v in zip(word, range(1, len(word) + 1))}
        categories = list(set(categories))
        category_dict = {}
        index = 1
        for x in categories:
            category_dict[x] = index
            index += 1
        subcategories = list(set(subcategories))
        subcategory_dict = {}
        index = 1
        for x in subcategories:
            subcategory_dict[x] = index
            index += 1
        domains = list(set(domains))
        domain_dict = {}
        index = 1
        for x in domains:
            domain_dict[x] = index
            index += 1
        return news, news_index, category_dict, word_dict, domain_dict, subcategory_dict
    elif mode == 'test':
        return news, news_index
    else:
        assert False, 'Wrong mode!'
doc_title_words_lower = [t.lower() for t in doc_title_words]
doc_title_words_lower_lemmatized = [
    lemmatizer.lemmatize(t) for t in doc_title_words_lower
]
doc_title_words_lower_lemmatized = [
    t for t in doc_title_words_lower_lemmatized
    if t not in stopwords.words('english')
]
sents = docsentdic[doc]
for sent in sents:
    sent_words = sentworddic[sent]
    sent_words_lemmatized = [
        lemmatizer.lemmatize(t) for t in sent_words
    ]
    # Check sentence length
    words = word_tokenize(sentwordrawdic[sent])
    if len(words) < SENTENCE_MIN_LENGTH:
        continue
    if list(set(hand_stopwords).intersection(set(sent_words))):
        continue
    # Check whether index term or document title word is in the sentence
    if (not list(
            set(index_term_lower_lemmatized).intersection(
                set(sent_words_lemmatized)))) and (not list(
                    set(doc_title_words_lower_lemmatized).intersection(
                        set(sent_words_lemmatized)))):
        continue
    quiz_doc_sent_tuple.append((doc, sent))

logging.info("Performing gap search...")
def split_data(self, filename):
    self.load_data(filename)
    sub_dir = filename.split('-')[0]
    # create a subdirectory for Train and Dev data
    if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
        os.makedirs(os.path.join(self.data_dir, sub_dir))
    with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file:
        # loop over the data
        for article_id in tqdm.tqdm(range(len(self.data['data']))):
            list_paragraphs = self.data['data'][article_id]['paragraphs']
            # loop over the paragraphs
            for paragraph in list_paragraphs:
                context = paragraph['context']
                context = clean_text(context)
                context_tokens = word_tokenize(context)
                if config.paragraph and (
                        len(context_tokens) < config.min_len_context
                        or len(context_tokens) > config.max_len_context):
                    continue
                context_sentences = sent_tokenize(context)
                spans = convert_idx(context, context_tokens)
                num_tokens = 0
                first_token_sentence = []
                for sentence in context_sentences:
                    first_token_sentence.append(num_tokens)
                    num_tokens += len(sentence)
                qas = paragraph['qas']
                # loop over Q/A
                for qa in qas:
                    question = qa['question']
                    question = clean_text(question)
                    question_tokens = word_tokenize(question)
                    if question_tokens[-1] != "?" \
                            or len(question_tokens) < config.min_len_question \
                            or len(question_tokens) > config.max_len_question:
                        continue
                    if sub_dir == "train":
                        # select only one ground truth, the top answer, if any answer
                        answer_ids = 1 if qa['answers'] else 0
                    else:
                        answer_ids = len(qa['answers'])
                    if answer_ids:
                        for answer_id in range(answer_ids):
                            answer = qa['answers'][answer_id]['text']
                            answer = clean_text(answer)
                            answer_tokens = word_tokenize(answer)
                            answer_start = qa['answers'][answer_id]['answer_start']
                            answer_stop = answer_start + len(answer)
                            # Getting spans of the answer in the context
                            answer_span = []
                            for idx, span in enumerate(spans):
                                if not (answer_stop <= span[0] or answer_start >= span[1]):
                                    answer_span.append(idx)
                            if not answer_span:
                                continue
                            # Getting the sentence where we have the answer
                            sentence_tokens = []
                            for idx, start in enumerate(first_token_sentence):
                                if answer_span[0] >= start:
                                    sentence_tokens = context_sentences[idx]
                                    answer_sentence_span = [
                                        span - start for span in answer_span
                                    ]
                                else:
                                    break
                            if not sentence_tokens:
                                print("Sentence cannot be found")
                                raise Exception()
                            # write to file
                            context_file.write(" ".join([
                                token + u"│" + "1" if idx in answer_span
                                else token + u"│" + "0"
                                for idx, token in enumerate(context_tokens)
                            ]) + "\n")
                            sentence_file.write(" ".join([
                                token + u"│" + "1" if idx in answer_sentence_span
                                else token + u"│" + "0"
                                for idx, token in enumerate(sentence_tokens)
                            ]) + "\n")
                            question_file.write(
                                " ".join([token for token in question_tokens]) + "\n")
                            answer_file.write(
                                " ".join([token for token in answer_tokens]) + "\n")
]
index_term_lower_lemmatized = [
    t for t in index_term_lower_lemmatized
    if t not in stopwords.words('english')
]

logging.info("Constructing document words...")
docwords = {}
for doc in docsentdic:
    sents_in_doc = docsentdic[doc]
    doc_words_lower_lemmatized = []
    for sent in sents_in_doc:
        doc_words_lower_lemmatized += [
            lemmatizer.lemmatize(t)
            for t in word_tokenize(sentwordrawdic[sent], lower=True)
        ]
    doc_words_lower_lemmatized = [
        t for t in doc_words_lower_lemmatized
        if t not in stopwords.words('english')
    ]
    docwords[doc] = doc_words_lower_lemmatized


def score_dt_s(s, d):
    doc_title_words = d.replace('_', ' ').replace('/', ' ').split()
    doc_title_words_lower = [t.lower() for t in doc_title_words]
    doc_title_words_lower_lemmatized = [
        lemmatizer.lemmatize(t) for t in doc_title_words_lower
    ]
    doc_title_words_lower_lemmatized = [
def split_sentence_question(self, filename, data_type):
    data = self.load_data(filename)
    with open(os.path.join(self.save_dir + data_type + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
         open(os.path.join(self.save_dir + data_type + '.question'), 'w', encoding="utf-8") as question_file:
        articles = data
        for article in tqdm(articles):
            paragraphs = article['paragraphs']
            for paragraph in paragraphs:
                context = paragraph['context']
                context = clean_text(context)
                context_tokens = word_tokenize(context)
                context_sentences = sent_tokenize(context)
                spans = convert_idx(context, context_tokens)
                num_tokens = 0
                first_token_sentence = []
                for sentence in context_sentences:
                    first_token_sentence.append(num_tokens)
                    num_tokens += len(sentence)
                question_and_answer_list = paragraph['qas']
                for question_and_answer in question_and_answer_list:
                    question = question_and_answer['question']
                    question = clean_text(question)
                    question_tokens = word_tokenize(question)
                    if len(question_tokens) > MAX_QUESTION_LENGTH or \
                            len(question_tokens) < MIN_QUESTION_LENGHT:
                        continue
                    if not question_and_answer['answers']:
                        continue
                    answer = question_and_answer['answers'][0]
                    answer_text = answer['text']
                    answer_text = clean_text(answer_text)
                    answer_tokens = word_tokenize(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    answer_span = []
                    for idx, span in enumerate(spans):
                        if not (answer_stop <= span[0] or answer_start >= span[1]):
                            answer_span.append(idx)
                    if not answer_span:
                        continue
                    sentence_tokens = []
                    for idx, start in enumerate(first_token_sentence):
                        if answer_span[0] >= start:
                            sentence_tokens = context_sentences[idx]
                            answer_sentence_span = [
                                span - start for span in answer_span
                            ]
                        else:
                            break
                    if not sentence_tokens:
                        print("Sentence cannot be found")
                        raise Exception()
                    if len(sentence_tokens) > MAX_SENTENCE_LENGTH or \
                            len(sentence_tokens) < MIN_SENTENCE_LENGTH:
                        continue
                    sentence_file.write(" ".join([
                        token + u"│" + "1" if idx in answer_sentence_span
                        else token + u"│" + "0"
                        for idx, token in enumerate(sentence_tokens)
                    ]) + "\n")
                    question_file.write(
                        " ".join([token for token in question_tokens]) + "\n")
def extract_features(self, max_len_context=config.max_len_context,
                     max_len_question=config.max_len_question,
                     max_len_word=config.max_len_word, is_train=True):
    # choose the right directory
    directory = "train" if is_train else "dev"
    # load context
    with open(os.path.join(self.data_dir, directory, directory + ".context"),
              "r", encoding="utf-8") as c:
        context = c.readlines()
    # load questions
    with open(os.path.join(self.data_dir, directory, directory + ".question"),
              "r", encoding="utf-8") as q:
        question = q.readlines()
    # load answer
    with open(os.path.join(self.data_dir, directory, directory + ".labels"),
              "r", encoding="utf-8") as l:
        labels = l.readlines()
    # clean and tokenize context and question
    context = [[w for w in word_tokenize(clean_text(doc.strip('\n')))]
               for doc in context]
    question = [[w for w in word_tokenize(clean_text(doc.strip('\n')))]
                for doc in question]
    # download vocabulary if not done yet
    if directory == "train":
        labels = [
            np.array(l.strip("\n").split(), dtype=np.int32) for l in labels
        ]
        word_vocab, word2idx, char_vocab, char2idx = build_vocab(
            directory + ".context", directory + ".question",
            "word_vocab.pkl", "word2idx.pkl", "char_vocab.pkl", "char2idx.pkl",
            is_train=is_train, max_words=config.max_words)
        # create an embedding matrix from the vocabulary with pretrained vectors (GloVe) for words
        build_embeddings(word_vocab, embedding_path=config.glove,
                         output_path="word_embeddings.pkl",
                         vec_size=config.word_embedding_size)
        build_embeddings(char_vocab, embedding_path="",
                         output_path="char_embeddings.pkl",
                         vec_size=config.char_embedding_size)
    else:
        labels = np.array([l.strip("\n") for l in labels])
        with open(os.path.join(self.data_dir, "train", "word2idx.pkl"), "rb") as wi,\
             open(os.path.join(self.data_dir, "train", "char2idx.pkl"), "rb") as ci:
            word2idx = pickle.load(wi)
            char2idx = pickle.load(ci)

    print("Number of questions before filtering:", len(question))
    filter = [
        len(c) < max_len_context and max([len(w) for w in c]) < max_len_word
        and len(q) < max_len_question and max([len(w) for w in q]) < max_len_word
        and len(q) > 3
        for c, q in zip(context, question)
    ]
    context, question, labels = zip(
        *[(c, q, l)
          for c, q, l, f in zip(context, question, labels, filter) if f])
    print("Number of questions after filtering ", len(question))

    # replace the tokenized words with their associated ID in the vocabulary
    context_idxs = []
    context_char_idxs = []
    question_idxs = []
    question_char_idxs = []
    for i, (c, q) in tqdm.tqdm(enumerate(zip(context, question))):
        # create empty numpy arrays
        context_idx = np.zeros([max_len_context], dtype=np.int32)
        question_idx = np.zeros([max_len_question], dtype=np.int32)
        context_char_idx = np.zeros([max_len_context, max_len_word], dtype=np.int32)
        question_char_idx = np.zeros([max_len_question, max_len_word], dtype=np.int32)

        # replace 0 values with word and char IDs
        for j, word in enumerate(c):
            if word in word2idx:
                context_idx[j] = word2idx[word]
            else:
                context_idx[j] = 1
            for k, char in enumerate(word):
                if char in char2idx:
                    context_char_idx[j, k] = char2idx[char]
                else:
                    context_char_idx[j, k] = 1
        context_idxs.append(context_idx)
        context_char_idxs.append(context_char_idx)

        for j, word in enumerate(q):
            if word in word2idx:
                question_idx[j] = word2idx[word]
            else:
                question_idx[j] = 1
            for k, char in enumerate(word):
                if char in char2idx:
                    question_char_idx[j, k] = char2idx[char]
                else:
                    question_char_idx[j, k] = 1
        question_idxs.append(question_idx)
        question_char_idxs.append(question_char_idx)

    # save features as numpy arrays
    np.savez(os.path.join(self.data_dir, directory, directory + "_features"),
             context_idxs=np.array(context_idxs),
             context_char_idxs=np.array(context_char_idxs),
             question_idxs=np.array(question_idxs),
             question_char_idxs=np.array(question_char_idxs),
             label=np.array(labels))
def eval(context, question):
    with open(os.path.join(config.data_dir, "train", "word2idx.pkl"), "rb") as wi, \
         open(os.path.join(config.data_dir, "train", "char2idx.pkl"), "rb") as ci, \
         open(os.path.join(config.data_dir, "train", "word_embeddings.pkl"), "rb") as wb, \
         open(os.path.join(config.data_dir, "train", "char_embeddings.pkl"), "rb") as cb:
        word2idx = pickle.load(wi)
        char2idx = pickle.load(ci)
        word_embedding_matrix = pickle.load(wb)
        char_embedding_matrix = pickle.load(cb)

    # transform them into Tensors
    word_embedding_matrix = torch.from_numpy(
        np.array(word_embedding_matrix)).type(torch.float32)
    char_embedding_matrix = torch.from_numpy(
        np.array(char_embedding_matrix)).type(torch.float32)
    idx2word = dict([(y, x) for x, y in word2idx.items()])

    context = clean_text(context)
    context = [w for w in word_tokenize(context) if w]
    question = clean_text(question)
    question = [w for w in word_tokenize(question) if w]

    if len(context) > config.max_len_context:
        print("The context is too long. Maximum accepted length is",
              config.max_len_context, "words.")
    if max([len(w) for w in context]) > config.max_len_word:
        print("Some words in the context are longer than",
              config.max_len_word, "characters.")
    if len(question) > config.max_len_question:
        print("The question is too long. Maximum accepted length is",
              config.max_len_question, "words.")
    if max([len(w) for w in question]) > config.max_len_word:
        print("Some words in the question are longer than",
              config.max_len_word, "characters.")
    if len(question) < 3:
        print("The question is too short. It needs to be at least a three words question.")

    context_idx = np.zeros([config.max_len_context], dtype=np.int32)
    question_idx = np.zeros([config.max_len_question], dtype=np.int32)
    context_char_idx = np.zeros([config.max_len_context, config.max_len_word],
                                dtype=np.int32)
    question_char_idx = np.zeros([config.max_len_question, config.max_len_word],
                                 dtype=np.int32)

    # replace 0 values with word and char IDs
    for j, word in enumerate(context):
        if word in word2idx:
            context_idx[j] = word2idx[word]
        else:
            context_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                context_char_idx[j, k] = char2idx[char]
            else:
                context_char_idx[j, k] = 1

    for j, word in enumerate(question):
        if word in word2idx:
            question_idx[j] = word2idx[word]
        else:
            question_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                question_char_idx[j, k] = char2idx[char]
            else:
                question_char_idx[j, k] = 1

    model = BiDAF(word_vectors=word_embedding_matrix,
                  char_vectors=char_embedding_matrix,
                  hidden_size=config.hidden_size,
                  drop_prob=config.drop_prob)
    try:
        if config.cuda:
            model.load_state_dict(
                torch.load(os.path.join(config.squad_models,
                                        "model_final.pkl"))["state_dict"])
        else:
            model.load_state_dict(
                torch.load(os.path.join(config.squad_models, "model_final.pkl"),
                           map_location=lambda storage, loc: storage)["state_dict"])
        print("Model weights successfully loaded.")
    except:
        print("Model weights not found, initialized model with random weights.")

    model.to(device)
    model.eval()
    with torch.no_grad():
        context_idx, context_char_idx, question_idx, question_char_idx = \
            torch.tensor(context_idx, dtype=torch.int64).unsqueeze(0).to(device),\
            torch.tensor(context_char_idx, dtype=torch.int64).unsqueeze(0).to(device),\
            torch.tensor(question_idx, dtype=torch.int64).unsqueeze(0).to(device),\
            torch.tensor(question_char_idx, dtype=torch.int64).unsqueeze(0).to(device)

        pred1, pred2 = model(context_idx, context_char_idx,
                             question_idx, question_char_idx)
        starts, ends = discretize(pred1.exp(), pred2.exp(), 15, False)
        prediction = " ".join(context[starts.item():ends.item() + 1])
    return prediction