def toMiddleFormat(path):
    # Turn a JSON list of sentence tuples into seq2seq pairs: every element
    # except the last becomes the [SEP]-joined input, the last one the target.
    dataset = MiddleFormat(DATASETINFO)
    with open(path[0], encoding='utf8') as f:
        pairs = json.load(f)
        for pair in pairs:
            input_s = []
            for p in pair[:-1]:
                input_s.append(nlp2.join_words_to_sentence(nlp2.split_sentence_to_array(p)))
            dataset.add_data(" [SEP] ".join(input_s),
                             nlp2.join_words_to_sentence(nlp2.split_sentence_to_array(pair[-1])))
    return dataset
def toMiddleFormat(path):
    # Read a JSON-lines file and keep only pairs whose combined length fits in 512 tokens.
    dataset = MiddleFormat(DATASETINFO)
    with open(path, encoding='utf8') as f:
        for line in f:
            data = json.loads(line)
            input = nlp2.split_sentence_to_array(data['dream'], True)
            target = nlp2.split_sentence_to_array(data["decode"], True)
            if len(input) + len(target) < 512:
                input = " ".join(input)
                target = " ".join(target)
                dataset.add_data(input, target)
    return dataset
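# --- Hedged usage note (illustration only, not part of the original source) ---
# The converter above expects one JSON object per line with 'dream' (source)
# and 'decode' (target) fields, e.g. (values invented):
# {"dream": "text to be rewritten", "decode": "reference rewrite"}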
def toMiddleFormat(data, context_max_len=450, answer_max_len=50):
    dataset = MiddleFormat(DATASETINFO)
    for d in data:
        context = nlp2.split_sentence_to_array(d['context'])
        answer = nlp2.split_sentence_to_array(d['answers']['text'][0])
        input_data = " ".join(context[:context_max_len]) + " [SEP] " + " ".join(answer[:answer_max_len])
        target_data = d['question']
        dataset.add_data(input_data, target_data)
    return dataset
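# --- Hedged usage sketch (illustration only, not part of the original source) ---
# The question-generation converter above reads SQuAD-style records; only the
# fields it actually touches are shown, and the values are invented.
sample = [{
    "context": "Normandy is a region in France.",
    "question": "Where is Normandy located?",
    "answers": {"text": ["France"]},
}]
qg_dataset = toMiddleFormat(sample)  # builds "context [SEP] answer" -> question pairs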
def toMiddleFormat(path):
    from phraseg import Phraseg
    punctuations = r"[.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\"#$%&()*+,\-.\:;<=>?@\[\]\\\/^_`{|}~]+"
    MASKTOKEN = "[MASK]"
    dataset = MiddleFormat(DATASETINFO, [MASKTOKEN])
    phraseg = Phraseg(path)
    for line in tqdm(nlp2.read_files_yield_lines(path)):
        line = nlp2.clean_all(line).strip()
        if len(nlp2.split_sentence_to_array(line)) > 1:
            # Tokenize with corpus-specific phrases first, then fall back to
            # numbers / latin words / single characters / punctuation.
            phrases = list((phraseg.extract(sent=line, merge_overlap=False)).keys())
            reg = "[0-9]+|[a-zA-Z]+\'*[a-z]*|[\w]" + "|" + punctuations
            reg = "|".join(phrases) + "|" + reg
            input_sent = re.findall(reg, line, re.UNICODE)
            target_sent = re.findall(reg, line, re.UNICODE)
            # Randomly mask ~15% of the input tokens for MLM-style training.
            for ind, word in enumerate(input_sent):
                prob = random.random()
                if prob <= 0.15 and len(word) > 0:
                    input_sent[ind] = MASKTOKEN
            if len(input_sent) > 2 and len(target_sent) > 2 and len(
                    "".join(input_sent).strip()) > 2 and len(
                    "".join(target_sent).strip()) > 2:
                dataset.add_data(nlp2.join_words_to_sentence(input_sent),
                                 nlp2.join_words_to_sentence(target_sent))
    return dataset
def get_freqK_unk_token(tokenizer, file_paths: list, freqK: int):
    # Collect tokens the tokenizer cannot encode and return those seen at least freqK times.
    unk_count_dict = OrderedDict()
    for path in file_paths:
        for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
            for tok in nlp2.split_sentence_to_array(input_sent):
                if tokenizer._unk_token in tokenizer.tokenize(tok):
                    unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
    return [key for key, value in unk_count_dict.items() if value >= freqK]
def toMiddleFormat(paths):
    dataset = MiddleFormat(DATASETINFO)
    if not isinstance(paths, list):
        paths = [paths]
    for path in paths:
        with open(path, encoding="utf-8", errors='replace') as dataset_file:
            dataset_json = json.loads(dataset_file.read())
            dataset_json = dataset_json['data']
            for item in dataset_json:
                for paragraph in item['paragraphs']:
                    for qas in paragraph['qas']:
                        question = replace_s(qas['question'])
                        for answers in qas['answers'][:1]:
                            context = replace_s(paragraph['context'])
                            ans = replace_s(str(answers['text']))
                            ori_start = start = answers['answer_start']
                            ans = nlp2.split_sentence_to_array(ans)
                            context = nlp2.split_sentence_to_array(context)
                            question = nlp2.split_sentence_to_array(question)
                            # Re-map the character-based answer_start onto the
                            # token positions produced by split_sentence_to_array.
                            pos = -1
                            for tok in context:
                                pos += len(tok)
                                if len(tok) != 1:
                                    if pos <= ori_start:
                                        start -= len(tok) - 1
                            end = start + len(ans)
                            if 'YES' in ans or 'NO' in ans:
                                input_sent = " ".join(ans + context) + " [SEP] " + " ".join(question)
                                dataset.add_data(input_sent, [0, 1])
                            elif 'FAKE' in ans:
                                input_sent = " ".join(context) + " [SEP] " + " ".join(question)
                                dataset.add_data(input_sent, [0, 0])
                            elif context[start:end] == ans:
                                input_sent = " ".join(context) + " [SEP] " + " ".join(question)
                                dataset.add_data(input_sent, [start, end])
                            else:
                                # Skip examples whose recomputed span no longer matches the answer.
                                print("input_sent", context[start:end], "ans", ans)
    return dataset
def get_topP_unk_token(tokenizer, file_paths: list, topP: float):
    # Collect tokens the tokenizer maps to [UNK]; topP is interpreted as a
    # percentage of the collected vocabulary (topP / 100).
    unk_count_dict = OrderedDict()
    for path in file_paths:
        for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
            for tok in nlp2.split_sentence_to_array(input_sent):
                if tokenizer._unk_token in tokenizer.tokenize(tok):
                    unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
    # Sort by frequency so the slice below returns the most frequent unknown tokens.
    sorted_tokens = sorted(unk_count_dict, key=unk_count_dict.get, reverse=True)
    top_range = int(len(sorted_tokens) * (topP / 100))
    return sorted_tokens[:top_range]
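# --- Hedged usage sketch (illustration only, not part of the original source) ---
# A typical follow-up, assuming a Hugging Face tokenizer/model pair: add the
# collected unknown tokens to the vocabulary and resize the embeddings.
# The file path and the `tokenizer` / `model` handles here are invented.
new_tokens = get_freqK_unk_token(tokenizer, ["./corpus/train.txt"], freqK=10)
tokenizer.add_tokens(new_tokens)               # extend the vocabulary in place
model.resize_token_embeddings(len(tokenizer))  # grow the embedding table to match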
def _filter_second_frequently(self, result_dict):
    words = list(result_dict.keys())
    ngrams, _ = self._cal_ngrams_idf(words)
    for word in words:
        keep = False
        for i in split_sentence_to_array(word):
            if ngrams[i] < self.ngrams[word]:
                keep = True
        if not keep:
            result_dict.pop(word)
    return result_dict
def extract(self, sent=None, merge_overlap=True, result_word_minlen=1, line_max_length=50):
    result_dict = defaultdict(int)
    sents = split_lines_by_punc([sent], max_len=line_max_length) if sent is not None else self.sentences
    iter = sents if sent is not None else tqdm(sents, total=len(self.sentences))
    for sentence in iter:
        filter_dict = defaultdict(int)
        filter_arr = []
        ngram_part = split_sentence_to_ngram_in_part(sentence)
        if len(ngram_part) > 0:
            for part in ngram_part:
                filter_result = self._filter_condprob(part, self.ngrams)
                for key, value in filter_result.items():
                    filter_dict[key] = self.ngrams[key]
                    filter_arr.append(key)
        if len(filter_dict) > 0:
            if merge_overlap:
                filter_arr = self.maximum_match_same_value(filter_dict)
                rm_sup = self._remove_by_superlap(filter_arr, filter_dict, sentence)
                filter_arr = self._remove_by_overlap(rm_sup, sentence, self.ngrams)
                gaol = self._all_words_match_maximum_array(filter_arr)
                for i in gaol:
                    result_dict[i] = self.ngrams[i] / self.idf[i]
            else:
                for key in filter_arr:
                    result_dict[key] = self.ngrams[key] / self.idf[key]
    if merge_overlap:
        result_dict = self._filter_second_frequently(result_dict)
    result_dict = {
        k: v
        for k, v in result_dict.items()
        if len(split_sentence_to_array(k)) > result_word_minlen
    }
    return result_dict
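# --- Hedged usage sketch (illustration only, not part of the original source) ---
# extract() above can score phrases over the whole corpus given at construction
# time, or over a single sentence via `sent=`; the file name is invented.
from phraseg import Phraseg
seg = Phraseg("./corpus/train.txt")
corpus_phrases = seg.extract()  # {phrase: score} over the whole corpus
line_phrases = seg.extract(sent="one input sentence", merge_overlap=False)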
def predict(self, input='', neg="O", task=None, handle_exceed='slide',
            merge_strategy=['minentropy', 'maxprob', 'maxcount'],
            minlen=1, start_contain="B", end_contain="I"):
    handle_exceed = handle_exceed[0] if isinstance(handle_exceed, list) else handle_exceed
    merge_strategy = merge_strategy[0] if isinstance(merge_strategy, list) else merge_strategy
    self.eval()
    input = " ".join(nlp2.split_sentence_to_array(input))
    with torch.no_grad():
        ret_detail = []
        predicted_pos_prob = defaultdict(lambda: defaultdict(list))
        # Collect per-position label predictions from every (possibly sliding) window.
        for feature_dict in get_feature_from_data(
                tokenizer=self.tokenizer,
                labels=self.labels,
                input=input.strip(),
                maxlen=self.maxlen,
                handle_exceed=handle_exceed):
            for k, v in feature_dict.items():
                feature_dict[k] = [v]
            result = self.forward(feature_dict, eval=True)
            for token_pred, token_map in zip(result['label_prob_all'], result['token_word_mapping']):
                token_prob = list(token_pred.values())[0]
                max_label = max(token_prob, key=token_prob.get)
                max_prob = token_prob[max_label]
                predicted_pos_prob[token_map['pos']]['char'] = token_map['word']
                predicted_pos_prob[token_map['pos']]['labels'].append(max_label)
                predicted_pos_prob[token_map['pos']]['prob'].append(max_prob)
                predicted_pos_prob[token_map['pos']]['entropy'].append(1)
            ret_detail.append(result)

        # Merge overlapping window predictions with the selected strategy.
        ret_result = []
        for key, value in predicted_pos_prob.items():
            if merge_strategy == 'maxcount':
                label = max(value['labels'], key=value['labels'].count)
            if merge_strategy == 'minentropy':
                min_entropy_index = value['entropy'].index(min(value['entropy']))
                label = value['labels'][min_entropy_index]
            if merge_strategy == 'maxprob':
                max_prob_index = value['prob'].index(max(value['prob']))
                label = value['labels'][max_prob_index]
            ret_result.append({value['char']: label})

        # Stitch B/I labelled characters into entity spans.
        output = []
        target_str = ["", ""]
        after_start = False
        for mapping in ret_result:
            for k, y in mapping.items():
                if start_contain in y:
                    after_start = True
                    if len(target_str[0]) > 0:
                        if len(target_str[0]) > minlen:
                            output.append(target_str)
                        target_str = ["", ""]
                    target_str[0] += k
                    target_str[1] = y
                elif y != neg and after_start:
                    target_str[0] += k
                    target_str[1] = y
                else:
                    after_start = False
        if len(target_str[0]) > minlen and target_str not in output:
            output.append(target_str)
        output = [[ner, tag.replace(start_contain, "").replace(end_contain, "")]
                  for ner, tag in output]
        return output, ret_detail
def predict(self, input='', neg="O", task=None, handle_exceed='slide',
            merge_strategy=['minentropy', 'maxprob', 'maxcount'],
            minlen=1, start_contain="B_", end_contain="I_"):
    handle_exceed = handle_exceed[0] if isinstance(handle_exceed, list) else handle_exceed
    merge_strategy = merge_strategy[0] if isinstance(merge_strategy, list) else merge_strategy
    self.eval()
    input = " ".join(nlp2.split_sentence_to_array(input))
    with torch.no_grad():
        ret_detail = []
        predicted_pos_prob = defaultdict(lambda: defaultdict(list))
        # Collect per-position label, probability and entropy from every window.
        for feature_dict in get_feature_from_data(
                tokenizer=self.tokenizer,
                labels=self.labels,
                input=input.strip(),
                maxlen=self.maxlen,
                handle_exceed=handle_exceed):
            start, end = feature_dict['pos']
            for k, v in feature_dict.items():
                feature_dict[k] = [v]
            result = self.forward(feature_dict, eval=True)
            pos_to_char = feature_dict['mapping'][0]
            for ind, mapping in enumerate(result['label_prob_all']):
                for map_pos, map_dict in mapping.items():
                    max_label = max(map_dict, key=map_dict.get)
                    max_prob = map_dict[max_label]
                    max_entropy = Categorical(probs=torch.tensor(list(map_dict.values()))).entropy().data.tolist()
                    predicted_pos_prob[str(ind + start)]['char'] = pos_to_char[ind]['char']
                    predicted_pos_prob[str(ind + start)]['labels'].append(max_label)
                    predicted_pos_prob[str(ind + start)]['prob'].append(max_prob)
                    predicted_pos_prob[str(ind + start)]['entropy'].append(max_entropy)
            ret_detail.append(result)

        # Merge overlapping window predictions with the selected strategy.
        ret_result = []
        for key, value in predicted_pos_prob.items():
            if merge_strategy == 'maxcount':
                label = max(value['labels'], key=value['labels'].count)
            if merge_strategy == 'minentropy':
                min_entropy_index = value['entropy'].index(min(value['entropy']))
                label = value['labels'][min_entropy_index]
            if merge_strategy == 'maxprob':
                max_prob_index = value['prob'].index(max(value['prob']))
                label = value['labels'][max_prob_index]
            ret_result.append({value['char']: label})

        # Stitch B_/I_ labelled characters into entity spans.
        output = []
        target_str = ["", ""]
        for mapping in ret_result:
            for k, y in mapping.items():
                if (y != neg and len(target_str[0]) > 0) or start_contain in y:
                    target_str[0] += k
                    target_str[1] = y
                else:
                    if len(target_str[0]) > minlen and target_str not in output:
                        output.append(target_str)
                    target_str = ["", ""]
        if len(target_str[0]) > minlen and target_str not in output:
            output.append(target_str)
        output = [[ner, tag.replace(start_contain, "").replace(end_contain, "")]
                  for ner, tag in output]
        return output, ret_detail
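# --- Hedged usage sketch (illustration only, not part of the original source) ---
# Both predict() variants return (entities, raw_details); entities is a list of
# [text, tag] pairs with the B_/I_ prefixes stripped. The `tagger` handle and
# the input sentence below are invented.
entities, details = tagger.predict(input="Alan Turing was born in London",
                                   merge_strategy='maxprob')
for text, tag in entities:
    print(text, tag)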
inputFiles = [
    f for f in nlp2.get_files_from_dir('./processed_data') if 'csv' in f
]
for inputFile in inputFiles:
    article_length = []
    question_length = []
    answer_length = []
    distractor_length = []
    distractor_num = []
    with open(inputFile, encoding="utf-8", errors='replace') as dataset_file:
        rows = csv.reader(dataset_file)
        for r in rows:
            article, question, answer = r[0].split("[SEP]")
            distractors = r[1].split("[SEP]")
            article = nlp2.split_sentence_to_array(article, True)
            question = nlp2.split_sentence_to_array(question, True)
            answer = nlp2.split_sentence_to_array(answer, True)
            article_length.append(len(article))
            question_length.append(len(question))
            answer_length.append(len(answer))
            distractor_num.append(len(distractors))
            for dist in distractors:
                dist = nlp2.split_sentence_to_array(dist, True)
                distractor_length.append(len(dist))
    print(f"====={inputFile}======")
    print("number of data", len(question_length))
    print("average article_length", mean(article_length))
    print("average question_length", mean(question_length))