def toMiddleFormat(data):
    """Convert multiple-choice math-QA records into MiddleFormat.

    :param data: iterable of dicts with 'Problem', 'options' and 'correct' keys
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    for d in data:
        # `input_text` instead of `input` so the builtin is not shadowed.
        # Problem and options are joined with a BERT-style separator so a
        # single-sequence classifier can consume them.
        input_text = d['Problem'] + " [SEP] " + d['options']
        target = d['correct']
        dataset.add_data(input_text, target)
    return dataset
def toMiddleFormat(paths):
    """Parse CGED-style XML error-annotation files into char-level tag data.

    Each <doc> contributes its <text> string as input and a per-character
    tag list as target: 'O' everywhere except annotated <error> spans,
    which carry the error type.

    :param paths: iterable of XML file paths
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat()
    for path in paths:
        # Context manager closes the handle; the original leaked the open().
        with open(path, 'r', encoding='utf8') as xml_file:
            soup = BeautifulSoup(xml_file, features="lxml")
        for doc in soup.root.find_all('doc'):
            # Distinct loop names fix the original's reuse of `i` for both
            # the doc loop and the inner character loops.
            text = doc.find('text').string
            errors = doc.find_all('error')
            text = text.strip(' ')
            text = text.strip('\n')
            if len(text) >= 2:
                tags = ['O'] * len(text)
                try:
                    # Overwrite the 'O' tags inside each annotated span with
                    # the error type. NOTE(review): offsets are used as-is;
                    # presumably they are 0-based — confirm against corpus.
                    for e in errors:
                        for pos in range(int(e['start_off']), int(e['end_off'])):
                            tags[pos] = str(e['type'])
                except (KeyError, ValueError, IndexError):
                    # Best-effort, as the original bare `except: pass`:
                    # keep whatever tags were applied before the malformed
                    # annotation rather than dropping the whole file.
                    pass
                if len(text) == len(tags):
                    dataset.add_data(text, tags)
    return dataset
def toMiddleFormat(path):
    """Build a masked-LM dataset: the input is a sentence with ~15% of its
    tokens replaced by [MASK]; the target is the original sentence.

    :param path: text corpus file path
    :return: populated MiddleFormat dataset
    """
    from phraseg import Phraseg
    # Broad CJK/fullwidth + ASCII punctuation class, matched alongside
    # numbers, latin words and single word-chars when tokenizing.
    punctuations = r"[.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\"#$%&()*+,\-.\:;<=>?@\[\]\\\/^_`{|}~]+"
    MASKTOKEN = "[MASK]"
    dataset = MiddleFormat(DATASETINFO, [MASKTOKEN])
    phraseg = Phraseg(path)
    for line in tqdm(nlp2.read_files_yield_lines(path)):
        line = nlp2.clean_all(line).strip()
        if len(nlp2.split_sentence_to_array(line)) > 1:
            # Phrases extracted from the corpus are placed first in the
            # alternation so they tokenize (and get masked) as whole units.
            # NOTE(review): phrases are not re.escape()d — a phrase with a
            # regex metacharacter would corrupt the pattern; confirm phraseg
            # output is plain text.
            phrases = list((phraseg.extract(sent=line, merge_overlap=False)).keys())
            reg = "[0-9]+|[a-zA-Z]+\'*[a-z]*|[\w]" + "|" + punctuations
            reg = "|".join(phrases) + "|" + reg
            input_sent = re.findall(reg, line, re.UNICODE)
            target_sent = re.findall(reg, line, re.UNICODE)
            for ind, word in enumerate(input_sent):
                prob = random.random()
                # BERT-style masking: each token is masked independently
                # with probability 0.15.
                if prob <= 0.15 and len(word) > 0:
                    input_sent[ind] = MASKTOKEN
            # Drop degenerate examples (fewer than 3 tokens / 3 chars).
            if len(input_sent) > 2 and len(target_sent) > 2 and len(
                    "".join(input_sent).strip()) > 2 and len(
                    "".join(target_sent).strip()) > 2:
                dataset.add_data(nlp2.join_words_to_sentence(input_sent),
                                 nlp2.join_words_to_sentence(target_sent))
    return dataset
def toMiddleFormat(data):
    """Turn token-level NER records into MiddleFormat examples.

    :param data: iterable of dicts with 'tokens' and integer 'ner_tags'
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    for record in data:
        tokens = record['tokens']
        # Map each integer tag id to its string label via the module-level
        # `ner_tag` lookup table.
        labels = [ner_tag[tag_id] for tag_id in record['ner_tags']]
        dataset.add_data(tokens, labels)
    return dataset
def toMiddleFormat(data):
    """Convert NLI premise/hypothesis pairs into MiddleFormat examples.

    :param data: iterable of dicts with 'premise', 'hypothesis' and 'label'
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    for example in data:
        # Premise and hypothesis form one sequence separated by [SEP].
        sentence_pair = " [SEP] ".join([example['premise'], example['hypothesis']])
        dataset.add_data(sentence_pair, example['label'])
    return dataset
def toMiddleFormat(path):
    """Load (input, target) pairs from a two-column CSV file.

    Rows with fewer than two columns (e.g. blank lines) are skipped — the
    original raised IndexError on them — as are rows where either field is
    2 characters or shorter after stripping.

    :param path: CSV file path
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    with open(path, encoding='utf8') as csvfile:
        for row in csv.reader(csvfile):
            # Length guard first: short-circuits before indexing row[0]/row[1].
            if len(row) >= 2 and len(row[0].strip()) > 2 and len(row[1].strip()) > 2:
                dataset.add_data(row[0], row[1])
    return dataset
def toMiddleFormat(path):
    """Pair a source-article file with a parallel summary file, line by line.

    :param path: two-element sequence [article_path, summary_path]
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    # Single `with` replaces the nested pair; `summary` replaces the original
    # local `sum`, which shadowed the builtin.
    with open(path[0], 'r', encoding='utf8') as src, \
            open(path[1], 'r', encoding='utf8') as tgt:
        for article, summary in zip(src, tgt):
            dataset.add_data(clean_text(article), clean_text(summary))
    return dataset
def toMiddleFormat(data):
    """Flatten multiple-choice QA items into one [SEP]-joined input sequence.

    :param data: iterable of dicts with 'context', 'question',
        'answer0'..'answer3' and 'label'
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    # Field order matters: the classifier's label indexes the answer slots.
    fields = ('context', 'question', 'answer0', 'answer1', 'answer2', 'answer3')
    for item in data:
        joined = " [SEP] ".join(item[key] for key in fields)
        dataset.add_data(joined, item['label'])
    return dataset
def toMiddleFormat(paths):
    """Build post/response pairs from two parallel text files.

    Tabs inside a post mark dialogue turns and are rewritten as [SEP].

    :param paths: two-element sequence [posts_path, responses_path]
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    with open(paths[0], 'r', encoding='utf8', errors='ignore') as posts, \
            open(paths[1], 'r', encoding='utf8', errors='ignore') as resps:
        # Iterate the file objects directly instead of readlines(): identical
        # line pairing, without loading both files fully into memory.
        for post, resp in zip(posts, resps):
            post = post.replace('\t', " [SEP] ").replace('\n', "")
            resp = resp.replace('\n', "")
            dataset.add_data(post, resp)
    return dataset
def toMiddleFormat(paths):
    """Read (input, target) pairs from one or more two-column CSV files.

    Malformed rows with fewer than two columns (e.g. blank lines) are
    skipped — the original raised IndexError on them.

    :param paths: iterable of CSV file paths
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    for path in paths:
        with open(path, encoding='utf8') as csvfile:
            for row in csv.reader(csvfile):
                if len(row) < 2:
                    continue
                # Renamed from `input`/`target` assignment style to avoid
                # shadowing the builtin `input`.
                dataset.add_data(row[0].strip(), row[1].strip())
    return dataset
def toMiddleFormat(paths):
    """Load (input, target) pairs from CSVs, skipping each file's header row.

    :param paths: iterable of CSV file paths
    :return: populated MiddleFormat dataset
    """
    mf = MiddleFormat(DATASETINFO)
    for path in paths:
        with open(path, encoding='utf8') as csvfile:
            reader = csv.reader(csvfile)
            # Drop the header line; default None avoids StopIteration on
            # an empty file.
            next(reader, None)
            for record in reader:
                mf.add_data(record[0], record[1])
    return mf
def toMiddleFormat(paths):
    """Label every line of each file with a file-level sentiment.

    A file whose first line contains "失望" ("disappointed") is the negative
    corpus; otherwise it is treated as positive. All remaining lines get
    that label.

    :param paths: iterable of text file paths
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat()
    for path in paths:
        with open(path, encoding='utf8') as f:
            # The first line is only a marker; it is consumed and not added.
            sentiment = "negative" if "失望" in f.readline() else "positive"
            # Stream the rest of the handle; the original's
            # list(f.readlines()) built two redundant copies.
            for line in f:
                dataset.add_data(line.strip(), sentiment)
    return dataset
def toMiddleFormat(path):
    """Build dialogue examples: all turns but the last, [SEP]-joined, map to
    the final turn.

    :param path: sequence whose first element is a JSON file of turn lists
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)

    def _normalize(sentence):
        # Re-tokenize then re-join so word spacing is normalized the same
        # way for every turn.
        return nlp2.join_words_to_sentence(nlp2.split_sentence_to_array(sentence))

    with open(path[0], encoding='utf8') as f:
        for pair in json.load(f):
            context = " [SEP] ".join(_normalize(turn) for turn in pair[:-1])
            dataset.add_data(context, _normalize(pair[-1]))
    return dataset
def toMiddleFormat(data, context_max_len=450, answer_max_len=50):
    """Build question-generation examples: truncated context [SEP] answer
    as input, the question as target.

    :param data: iterable of SQuAD-style dicts ('context', 'answers',
        'question')
    :param context_max_len: max context tokens kept
    :param answer_max_len: max answer tokens kept
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    for item in data:
        # Only the first listed answer text is used.
        context_tokens = nlp2.split_sentence_to_array(item['context'])[:context_max_len]
        answer_tokens = nlp2.split_sentence_to_array(item['answers']['text'][0])[:answer_max_len]
        source = " ".join(context_tokens) + " [SEP] " + " ".join(answer_tokens)
        dataset.add_data(source, item['question'])
    return dataset
def toMiddleFormat(path):
    """Load JSON-lines dream/decode pairs that fit a 512-token budget.

    :param path: JSONL file path; each line has 'dream' and 'decode' keys
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    with open(path, encoding='utf8') as f:
        # Stream the file line by line; the original's list(f.readlines())
        # made two redundant in-memory copies. Also renames the locals so
        # the builtin `input` is not shadowed.
        for line in f:
            data = json.loads(line)
            source_tokens = nlp2.split_sentence_to_array(data['dream'], True)
            target_tokens = nlp2.split_sentence_to_array(data["decode"], True)
            # Keep only pairs whose combined length fits the encoder window.
            if len(source_tokens) + len(target_tokens) < 512:
                dataset.add_data(" ".join(source_tokens), " ".join(target_tokens))
    return dataset
def toMiddleFormat(paths):
    """Load CSV token pairs, keeping those under a 256 combined-token budget.

    :param paths: iterable of two-column CSV file paths
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat()
    for path in paths:
        with open(path, encoding='utf8') as csvfile:
            for row in csv.reader(csvfile):
                # Fixed typo: the original called nlp2.spilt_sentence_to_array,
                # while every other converter in this codebase uses
                # nlp2.split_sentence_to_array — the typo'd name only exists
                # in old nlp2 releases.
                source = nlp2.split_sentence_to_array(row[0], True)
                target = nlp2.split_sentence_to_array(row[1], True)
                if len(source) + len(target) < 256:
                    dataset.add_data(" ".join(source), " ".join(target))
    return dataset
def toMiddleFormat(path):
    """Parse a CoNLL-style file: one "token tag" pair per line, sentences
    separated by blank (or single-char) lines.

    :param path: annotated text file path
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    sent_input = []
    sent_target = []
    with open(path, encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if len(line) > 1:
                token, tag = line.split(' ')
                sent_input.append(token)
                sent_target.append(tag)
            else:
                # Separator line: flush the accumulated sentence.
                dataset.add_data(sent_input, sent_target)
                sent_input = []
                sent_target = []
    # Bug fix: the original silently dropped the final sentence when the
    # file did not end with a blank separator line.
    if sent_input:
        dataset.add_data(sent_input, sent_target)
    return dataset
def toMiddleFormat(path):
    """Convert word/POS lines into char-level BIO person-name tags.

    Words POS-tagged "nr" (person name) become B-PER followed by I-PER per
    character; every other character is tagged O.

    :param path: file path; each line holds whitespace-separated word/tag pairs
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    with open(path, encoding="utf8") as f:
        for sentence in f:
            sent_input = []
            sent_target = []
            for word_tag in sentence.split():
                # rpartition splits on the LAST '/': a literal slash inside
                # the word no longer raises "too many values to unpack" as
                # the original split("/") did. A token with no '/' yields an
                # empty word and is skipped instead of crashing.
                context, _, tag = word_tag.rpartition("/")
                if tag == "nr" and len(context) > 1:
                    sent_input.append(context[0])
                    sent_target.append("B-PER")
                    for char in context[1:]:
                        sent_input.append(char)
                        sent_target.append("I-PER")
                else:
                    for char in context:
                        sent_input.append(char)
                        sent_target.append("O")
            dataset.add_data(sent_input, sent_target)
    return dataset
def toMiddleFormat(path):
    """Convert a SQuAD-style JSON QA file into char-level answer-span tags.

    Each context gets one tag per character ('O' outside the answer, 'A'
    inside), the question is appended after a '[Question]' marker token,
    and long contexts are cut into <= max_len windows via split_text().

    :param path: JSON file path with SQuAD layout (data/paragraphs/qas)
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat()
    max_len = 507
    with open(path, encoding="utf-8", errors='replace') as dataset_file:
        dataset_json = json.loads(dataset_file.read())
        dataset_json = dataset_json['data']
        for item in dataset_json:
            for paragraph in item['paragraphs']:
                for qas in paragraph['qas']:
                    # NOTE(review): `filter` is a project helper shadowing the
                    # builtin — presumably text normalization; confirm its
                    # definition elsewhere in the module.
                    qas['question'] = filter(qas['question'])
                    question = list(qas['question'])
                    question = ['[Question]'] + question
                    # Only the first answer of each question is used.
                    for answers in qas['answers'][:1]:
                        paragraph['context'] = filter(paragraph['context'])
                        context = paragraph['context']
                        ans = filter(str(answers['text']))
                        ans_length = len(ans)
                        start = answers['answer_start']
                        end = start + ans_length
                        # Character-level tags: 'A' over the answer span.
                        tag = ["O"] * len(context)
                        tag[start:end] = ["A"] * ans_length
                        # split_text returns the windows plus each window's
                        # start offset into the original context string.
                        context, tstart = split_text(context, max_len)
                        for i, c in enumerate(context):
                            c = list(c)
                            # Slice the tag list to match this window, then
                            # append the question with all-'O' tags.
                            t = tag[tstart[i]:tstart[i] + len(c)]
                            c.extend(question)
                            t.extend(["O"] * len(question))
                            if "A" in t:
                                ind = t.index("A")
                                # NOTE(review): this consistency check is a
                                # no-op (`pass` either way) — apparently a
                                # leftover debugging hook.
                                if "".join(
                                        c[ind:ind + ans_length]) != ans or len(c) != len(t):
                                    pass
                            if len(c) < max_len:
                                dataset.add_data(c, t)
    return dataset
def toMiddleFormat(pairs):
    """Build translation pairs from WMT news-commentary / newstest file lists.

    `pairs` arrives as a flat list of file paths; the matching target path
    for each English source file is derived by filename substitution, then
    the two files are line-aligned.

    :param pairs: list of corpus file paths
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    if 'news-commentary-v12' in pairs[0]:  ## training data
        # Training corpus: pair each x-en.en file with x-en.<lang>, where
        # <lang> is parsed out of the "v12.<lang>-" path segment.
        pairs = [[
            p, p.replace('.en', "." + re.search("v12.(.+)+-", p).group(1))
        ] for p in pairs if '-en.en' in p]
    else:
        # Test sets: swap 'src' -> 'ref' and rewrite the extension to the
        # target language taken from the 4-letter pair code, e.g. "-ende-"
        # selects "de".
        pairs = [[
            p, p.replace('src', "ref").replace(
                re.search("\.\w+\.", p).group(0),
                "." + re.search("-\w{2}(\w{2})-", p).group(1) + ".")
        ] for p in pairs if 'src.en' in p and re.search("-\w{4}-", p)]
    for pair in pairs:
        # SGM files need markup-aware preprocessing downstream.
        is_sgm = 'sgm' in pair[0]
        # _merge_blanks aligns the two files line-for-line; presumably it
        # reconciles blank-line mismatches — confirm against its definition.
        src_lines, targ_lines = _merge_blanks(pair[0], pair[1], verbose=False)
        for src, targ in zip(src_lines, targ_lines):
            src = _preprocess(src, is_sgm)
            targ = _preprocess(targ, is_sgm)
            # Skip pairs where either side became empty after preprocessing.
            if len(src) > 0 and len(targ) > 0:
                dataset.add_data(src, targ)
    return dataset
def toMiddleFormat(paths):
    """Convert DRCD/SQuAD-style JSON into span-prediction examples.

    The input is "context [SEP] question" as space-joined tokens; the target
    is the [start, end] token index pair of the answer. 'YES'/'NO' and
    'FAKE' pseudo-answers get fixed spans ([0, 1] and [0, 0]).

    :param paths: one path or a list of JSON file paths
    :return: populated MiddleFormat dataset
    """
    dataset = MiddleFormat(DATASETINFO)
    if not isinstance(paths, list):
        paths = [paths]
    for path in paths:
        with open(path, encoding="utf-8", errors='replace') as dataset_file:
            dataset_json = json.loads(dataset_file.read())
            dataset_json = dataset_json['data']
            for item in dataset_json:
                for paragraph in item['paragraphs']:
                    for qas in paragraph['qas']:
                        question = replace_s(qas['question'])
                        # Only the first listed answer is used.
                        for answers in qas['answers'][:1]:
                            context = replace_s(paragraph['context'])
                            ans = replace_s(str(answers['text']))
                            ori_start = start = answers['answer_start']
                            ans = nlp2.split_sentence_to_array(ans)
                            context = nlp2.split_sentence_to_array(context)
                            question = nlp2.split_sentence_to_array(question)
                            # Convert the character-based answer_start into a
                            # token index: every multi-char token that ends at
                            # or before the answer shifts the index left by
                            # len(tok) - 1.
                            pos = -1
                            for tok in context:
                                pos += len(tok)
                                if len(tok) != 1:
                                    if pos <= ori_start:
                                        start -= len(tok) - 1
                            end = start + len(ans)
                            if 'YES' in ans or 'NO' in ans:
                                # Yes/no question: prepend the pseudo-answer
                                # tokens and point the span at them ([0, 1]).
                                input_sent = " ".join(
                                    ans + context) + " [SEP] " + " ".join(question)
                                dataset.add_data(input_sent, [0, 1])
                            elif 'FAKE' in ans:
                                # Unanswerable: degenerate [0, 0] span.
                                input_sent = " ".join(
                                    context) + " [SEP] " + " ".join(question)
                                dataset.add_data(input_sent, [0, 0])
                            elif context[start:end] == ans:
                                # Token span verified against the answer text.
                                input_sent = " ".join(
                                    context) + " [SEP] " + " ".join(question)
                                dataset.add_data(input_sent, [start, end])
                            else:
                                # Alignment failed; report and drop the example.
                                print("input_sent", context[start:end], "ans", ans)
    return dataset
def toMiddleFormat(path):
    """Placeholder converter that emits a single hard-coded example.

    :param path: unused file path (kept for interface parity)
    :return: MiddleFormat dataset with one ("input", "target") pair
    """
    dataset = MiddleFormat(DATASETINFO)
    # TODO: real file reading and processing goes here.
    dataset.add_data("input", "target")
    return dataset