Example #1
def toMiddleFormat(data):
    dataset = MiddleFormat(DATASETINFO)
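    # join the problem statement and its answer options with [SEP];
    # the key of the correct option is the target label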
    for d in data:
        input = d['Problem'] + " [SEP] " + d['options']
        target = d['correct']
        dataset.add_data(input, target)
    return dataset
Example #2
from bs4 import BeautifulSoup

def toMiddleFormat(paths):
    dataset = MiddleFormat()
    for path in paths:
        with open(path, 'r', encoding='utf8') as f:
            soup = BeautifulSoup(f, features="lxml")
        docs = soup.root.find_all('doc')

        for doc in docs:
            tag_s = doc.find('text').string.strip()
            errors = doc.find_all('error')

            if len(tag_s) < 2:
                continue

            # every character starts with the "no error" tag
            empty_tag = ['O'] * len(tag_s)

            try:
                # overwrite each annotated span with its error type
                for e in errors:
                    for i in range(int(e['start_off']), int(e['end_off'])):
                        empty_tag[i] = str(e['type'])
            except (KeyError, ValueError, IndexError):
                # malformed annotation: keep whatever tags were filled so far
                pass

            dataset.add_data(tag_s, empty_tag)

    return dataset
Example #3
def toMiddleFormat(path):
    from phraseg import Phraseg
    punctuations = r"[.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\"#$%&()*+,\-.\:;<=>?@\[\]\\\/^_`{|}~]+"
    MASKTOKEN = "[MASK]"
    dataset = MiddleFormat(DATASETINFO, [MASKTOKEN])
    phraseg = Phraseg(path)

    for line in tqdm(nlp2.read_files_yield_lines(path)):
        line = nlp2.clean_all(line).strip()

        if len(nlp2.split_sentence_to_array(line)) > 1:
            # extract frequent phrases so they can be masked as whole units
            phrases = list((phraseg.extract(sent=line,
                                            merge_overlap=False)).keys())
            reg = r"[0-9]+|[a-zA-Z]+\'*[a-z]*|[\w]" + "|" + punctuations
            if phrases:
                # escape the phrases so regex metacharacters inside them
                # cannot break the tokenization pattern; phrases match first
                reg = "|".join(map(re.escape, phrases)) + "|" + reg
            input_sent = re.findall(reg, line, re.UNICODE)
            target_sent = list(input_sent)  # the target keeps the unmasked tokens
            # BERT-style masking: replace each token with [MASK] at 15% probability
            for ind, word in enumerate(input_sent):
                if random.random() <= 0.15 and len(word) > 0:
                    input_sent[ind] = MASKTOKEN
            # input and target have the same token count, so one pair of checks suffices
            if len(input_sent) > 2 and len("".join(target_sent).strip()) > 2:
                dataset.add_data(nlp2.join_words_to_sentence(input_sent),
                                 nlp2.join_words_to_sentence(target_sent))

    return dataset
Example #4
def toMiddleFormat(data):
    dataset = MiddleFormat(DATASETINFO)
    for d in data:
        input = d['tokens']
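        # ner_tag (defined in the surrounding module) maps integer tag ids to label strings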
        target = [ner_tag[i] for i in d['ner_tags']]
        dataset.add_data(input, target)
    return dataset
Example #5
def toMiddleFormat(data):
    dataset = MiddleFormat(DATASETINFO)
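    # NLI-style pair: premise and hypothesis joined with [SEP]; the label is the target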
    for d in data:
        input = d['premise'] + " [SEP] " + d['hypothesis']
        target = d['label']
        dataset.add_data(input, target)
    return dataset
Example #6
def toMiddleFormat(path):
    dataset = MiddleFormat(DATASETINFO)
    with open(path, encoding='utf8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if len(row[0].strip()) > 2 and len(row[1].strip()) > 2:
                dataset.add_data(row[0], row[1])
    return dataset
Example #7
def toMiddleFormat(path):
    dataset = MiddleFormat(DATASETINFO)
    with open(path[0], 'r', encoding='utf8') as src, \
            open(path[1], 'r', encoding='utf8') as tgt:
        # pair each source line with its reference summary line
        for ori, summary in zip(src, tgt):
            ori = clean_text(ori)
            summary = clean_text(summary)
            dataset.add_data(ori, summary)
    return dataset
Example #8
def toMiddleFormat(data):
    dataset = MiddleFormat(DATASETINFO)
    for d in data:
        # multiple-choice QA: context, question and the four candidate
        # answers are joined into one [SEP]-separated input
        input = " [SEP] ".join([
            d['context'], d['question'], d['answer0'],
            d['answer1'], d['answer2'], d['answer3']
        ])
        target = d['label']
        dataset.add_data(input, target)
    return dataset
Example #9
def toMiddleFormat(paths):
    dataset = MiddleFormat(DATASETINFO)
    with open(paths[0], 'r', encoding='utf8', errors='ignore') as posts, \
            open(paths[1], 'r', encoding='utf8', errors='ignore') as resps:
        for p, r in zip(posts, resps):
            # tab-separated fields in a post become [SEP]-separated
            p = p.replace('\t', " [SEP] ").replace('\n', "")
            r = r.replace('\n', "")
            dataset.add_data(p, r)
    return dataset
Example #10
def toMiddleFormat(paths):
    dataset = MiddleFormat(DATASETINFO)
    for path in paths:
        with open(path, encoding='utf8') as csvfile:
            rows = csv.reader(csvfile)
            for row in rows:
                input = row[0]
                target = row[1]
                dataset.add_data(input.strip(), target.strip())
    return dataset
Example #11
def toMiddleFormat(paths):
    mf = MiddleFormat(DATASETINFO)
    for path in paths:
        with open(path, encoding='utf8') as csvfile:
            rows = csv.reader(csvfile)
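            # skip the header row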
            next(rows, None)
            for row in rows:
                input = row[0]
                target = row[1]
                mf.add_data(input, target)
    return mf
Example #12
def toMiddleFormat(paths):
    dataset = MiddleFormat()
    for path in paths:
        with open(path, encoding='utf8') as f:
            if "失望" in f.readline():
                sentiment = "negative"
            else:
                sentiment = "positive"
            for i in list(f.readlines()):
                dataset.add_data(i.strip(), sentiment)
    return dataset
Example #13
def toMiddleFormat(path):
    dataset = MiddleFormat(DATASETINFO)
    with open(path[0], encoding='utf8') as f:
        pairs = json.load(f)
        for pair in pairs:
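            # every element but the last builds the input; the final element is the target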
            input_s = []
            for p in pair[:-1]:
                input_s.append(nlp2.join_words_to_sentence(nlp2.split_sentence_to_array(p)))
            dataset.add_data(" [SEP] ".join(input_s),
                             nlp2.join_words_to_sentence(nlp2.split_sentence_to_array(pair[-1])))
    return dataset
Example #14
def toMiddleFormat(data, context_max_len=450, answer_max_len=50):
    dataset = MiddleFormat(DATASETINFO)
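    # question generation: the truncated context and answer form the input,
    # the original question is the generation target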
    for d in data:
        context = nlp2.split_sentence_to_array(d['context'])
        answer = nlp2.split_sentence_to_array(d['answers']['text'][0])
        input_data = " ".join(
            context[:context_max_len]) + " [SEP] " + " ".join(
                answer[:answer_max_len])
        target_data = d['question']
        dataset.add_data(input_data, target_data)

    return dataset
Example #15
def toMiddleFormat(path):
    dataset = MiddleFormat(DATASETINFO)
    with open(path, encoding='utf8') as f:
        for line in f:
            data = json.loads(line)
            input = nlp2.split_sentence_to_array(data['dream'], True)
            target = nlp2.split_sentence_to_array(data['decode'], True)
            # keep only pairs that fit a 512-token sequence budget
            if len(input) + len(target) < 512:
                input = " ".join(input)
                target = " ".join(target)
                dataset.add_data(input, target)
    return dataset
Example #16
def toMiddleFormat(paths):
    dataset = MiddleFormat()
    for path in paths:
        with open(path, encoding='utf8') as csvfile:
            rows = csv.reader(csvfile)
            for row in rows:
                input = nlp2.split_sentence_to_array(row[0], True)
                target = nlp2.split_sentence_to_array(row[1], True)
                if len(input) + len(target) < 256:
                    input = " ".join(input)
                    target = " ".join(target)
                    dataset.add_data(input, target)
    return dataset
Example #17
def toMiddleFormat(path):
    dataset = MiddleFormat(DATASETINFO)
    with open(path, encoding='utf8') as f:
        sent_input = []
        sent_target = []
        for line in f:
            line = line.strip()
            if len(line) > 1:
                # one "token tag" pair per line; blank lines end a sentence
                sent, tar = line.split()
                sent_input.append(sent)
                sent_target.append(tar)
            else:
                dataset.add_data(sent_input, sent_target)
                sent_input = []
                sent_target = []
        # flush the final sentence if the file lacks a trailing blank line
        if sent_input:
            dataset.add_data(sent_input, sent_target)
    return dataset
Example #18
def toMiddleFormat(path):
    dataset = MiddleFormat(DATASETINFO)
    with open(path, encoding="utf8") as f:
        for sentence in f:
            sent_input = []
            sent_target = []
            word_tags = sentence.split()
            for word_tag in word_tags:
                # split on the last "/" so tokens that contain "/" stay intact
                context, tag = word_tag.rsplit("/", 1)
                # "nr" is the person-name POS tag; expand it to B-PER/I-PER per character
                if tag == "nr" and len(context) > 1:
                    sent_input.append(context[0])
                    sent_target.append("B-PER")
                    for char in context[1:]:
                        sent_input.append(char)
                        sent_target.append("I-PER")
                else:
                    for char in context:
                        sent_input.append(char)
                        sent_target.append("O")
            dataset.add_data(sent_input, sent_target)
    return dataset
Example #19
def toMiddleFormat(path):
    dataset = MiddleFormat()

    max_len = 507
    with open(path, encoding="utf-8", errors='replace') as dataset_file:
        dataset_json = json.loads(dataset_file.read())
        dataset_json = dataset_json['data']

    # filter() and split_text() are helpers defined in the surrounding module,
    # not the Python builtins
    for item in dataset_json:
        for paragraph in item['paragraphs']:
            for qas in paragraph['qas']:
                qas['question'] = filter(qas['question'])
                question = ['[Question]'] + list(qas['question'])
                for answers in qas['answers'][:1]:
                    paragraph['context'] = filter(paragraph['context'])
                    context = paragraph['context']
                    ans = filter(str(answers['text']))
                    ans_length = len(ans)
                    start = answers['answer_start']
                    end = start + ans_length
                    # character-level tags: "A" over the answer span, "O" elsewhere
                    tag = ["O"] * len(context)
                    tag[start:end] = ["A"] * ans_length
                    # split long contexts into chunks; tstart holds each chunk's
                    # character offset into the original context
                    context, tstart = split_text(context, max_len)
                    for i, c in enumerate(context):
                        c = list(c)
                        t = tag[tstart[i]:tstart[i] + len(c)]
                        c.extend(question)
                        t.extend(["O"] * len(question))
                        if "A" in t:
                            ind = t.index("A")
                            # skip chunks whose tagged span no longer matches the answer
                            if "".join(c[ind:ind + ans_length]) != ans \
                                    or len(c) != len(t):
                                continue
                        if len(c) < max_len:
                            dataset.add_data(c, t)

    return dataset
Example #20
def toMiddleFormat(pairs):
    dataset = MiddleFormat(DATASETINFO)
    if 'news-commentary-v12' in pairs[0]:  # training data
        # pair each English file with its counterpart in the paired language
        pairs = [[
            p, p.replace('.en', "." + re.search(r"v12.(.+)+-", p).group(1))
        ] for p in pairs if '-en.en' in p]
    else:  # test data: map each src file to the matching ref file
        pairs = [[
            p,
            p.replace('src', "ref").replace(
                re.search(r"\.\w+\.", p).group(0),
                "." + re.search(r"-\w{2}(\w{2})-", p).group(1) + ".")
        ] for p in pairs if 'src.en' in p and re.search(r"-\w{4}-", p)]

    for pair in pairs:
        is_sgm = 'sgm' in pair[0]
        src_lines, targ_lines = _merge_blanks(pair[0], pair[1], verbose=False)
        for src, targ in zip(src_lines, targ_lines):
            src = _preprocess(src, is_sgm)
            targ = _preprocess(targ, is_sgm)
            if len(src) > 0 and len(targ) > 0:
                dataset.add_data(src, targ)
    return dataset
Example #21
def toMiddleFormat(paths):
    dataset = MiddleFormat(DATASETINFO)
    if not isinstance(paths, list):
        paths = [paths]

    for path in paths:
        with open(path, encoding="utf-8", errors='replace') as dataset_file:
            dataset_json = json.loads(dataset_file.read())
            dataset_json = dataset_json['data']
        for item in dataset_json:
            for paragraph in item['paragraphs']:
                for qas in paragraph['qas']:
                    question = replace_s(qas['question'])
                    for answers in qas['answers'][:1]:
                        context = replace_s(paragraph['context'])
                        ans = replace_s(str(answers['text']))
                        ori_start = start = answers['answer_start']

                        ans = nlp2.split_sentence_to_array(ans)
                        context = nlp2.split_sentence_to_array(context)
                        question = nlp2.split_sentence_to_array(question)

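                        # convert the character-level answer_start into a token-level
                        # offset: every multi-character token ending before the answer
                        # shrinks the index by len(tok) - 1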
                        pos = -1
                        for tok in context:
                            pos += len(tok)
                            if len(tok) != 1:
                                if pos <= ori_start:
                                    start -= len(tok) - 1
                        end = start + len(ans)

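                        # yes/no answers and unanswerable ("FAKE") items get sentinel
                        # targets instead of real token spans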
                        if 'YES' in ans or 'NO' in ans:
                            input_sent = " ".join(
                                ans + context) + " [SEP] " + " ".join(question)
                            dataset.add_data(input_sent, [0, 1])
                        elif 'FAKE' in ans:
                            input_sent = " ".join(
                                context) + " [SEP] " + " ".join(question)
                            dataset.add_data(input_sent, [0, 0])
                        elif context[start:end] == ans:
                            input_sent = " ".join(
                                context) + " [SEP] " + " ".join(question)
                            dataset.add_data(input_sent, [start, end])
                        else:
                            print("input_sent", context[start:end], "ans", ans)
    return dataset
Example #22
def toMiddleFormat(path):
    dataset = MiddleFormat(DATASETINFO)
    # some file reading and processing
    dataset.add_data("input", "target")
    return dataset