Example 1
def toMiddleFormat(path):
    # Each JSON record is a list of sentences: all but the last are joined as the
    # input (separated by [SEP]); the last sentence is the generation target.
    dataset = MiddleFormat(DATASETINFO)
    with open(path[0], encoding='utf8') as f:
        pairs = json.load(f)
        for pair in pairs:
            input_s = []
            for p in pair[:-1]:
                input_s.append(nlp2.join_words_to_sentence(nlp2.split_sentence_to_array(p)))
            dataset.add_data(" [SEP] ".join(input_s),
                             nlp2.join_words_to_sentence(nlp2.split_sentence_to_array(pair[-1])))
    return dataset
Example 2
def toMiddleFormat(path):
    dataset = MiddleFormat(DATASETINFO)
    with open(path, encoding='utf8') as f:
        for line in f:
            data = json.loads(line)
            source = nlp2.split_sentence_to_array(data['dream'], True)
            target = nlp2.split_sentence_to_array(data['decode'], True)
            # Keep only pairs that fit within a 512-token sequence.
            if len(source) + len(target) < 512:
                dataset.add_data(" ".join(source), " ".join(target))
    return dataset
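For reference, the converter in Example 2 expects JSON Lines input with 'dream' and 'decode' fields. A minimal sketch of a compatible call is shown below; the file name and record content are invented, and MiddleFormat/DATASETINFO must already be available as in the snippet above:

import json

# Hypothetical record with the two keys read by the converter above.
sample = {"dream": "夢見大海", "decode": "預示將有一段旅程"}
with open("dream_sample.jsonl", "w", encoding="utf8") as f:
    f.write(json.dumps(sample, ensure_ascii=False) + "\n")

dataset = toMiddleFormat("dream_sample.jsonl")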
Example 3
def toMiddleFormat(data, context_max_len=450, answer_max_len=50):
    # Question generation: truncated context plus the first answer form the input,
    # and the original question is the target.
    dataset = MiddleFormat(DATASETINFO)
    for d in data:
        context = nlp2.split_sentence_to_array(d['context'])
        answer = nlp2.split_sentence_to_array(d['answers']['text'][0])
        input_data = " ".join(
            context[:context_max_len]) + " [SEP] " + " ".join(
                answer[:answer_max_len])
        target_data = d['question']
        dataset.add_data(input_data, target_data)

    return dataset
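The `data` argument in Example 3 is assumed to be an iterable of SQuAD-style records; the field names below are taken from the code above, while the values are invented for illustration (MiddleFormat and DATASETINFO still need to be importable):

# Hypothetical record shaped like the entries the converter reads.
sample = {
    "context": "The quick brown fox jumps over the lazy dog.",
    "question": "What does the fox jump over?",
    "answers": {"text": ["the lazy dog"]},
}
dataset = toMiddleFormat([sample])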
Example 4
def toMiddleFormat(path):
    # Masked-language-model style data; requires re, random, tqdm and nlp2 at module level.
    from phraseg import Phraseg
    punctuations = r"[.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\"#$%&()*+,\-.\:;<=>?@\[\]\\\/^_`{|}~]+"
    MASKTOKEN = "[MASK]"
    dataset = MiddleFormat(DATASETINFO, [MASKTOKEN])
    phraseg = Phraseg(path)

    for line in tqdm(nlp2.read_files_yield_lines(path)):
        line = nlp2.clean_all(line).strip()

        if len(nlp2.split_sentence_to_array(line)) > 1:
            phrases = list((phraseg.extract(sent=line,
                                            merge_overlap=False)).keys())
            reg = "[0-9]+|[a-zA-Z]+\'*[a-z]*|[\w]" + "|" + punctuations
            reg = "|".join(phrases) + "|" + reg
            input_sent = re.findall(reg, line, re.UNICODE)
            target_sent = list(input_sent)  # unmasked copy kept as the target
            for ind, word in enumerate(input_sent):
                # Mask roughly 15% of the input tokens, BERT-style.
                if random.random() <= 0.15 and len(word) > 0:
                    input_sent[ind] = MASKTOKEN
            if len(input_sent) > 2 and len(target_sent) > 2 and len(
                    "".join(input_sent).strip()) > 2 and len(
                        "".join(target_sent).strip()) > 2:
                dataset.add_data(nlp2.join_words_to_sentence(input_sent),
                                 nlp2.join_words_to_sentence(target_sent))

    return dataset
Example 5
def get_freqK_unk_token(tokenizer, file_paths: list, freqK: int):
    # Count tokens the tokenizer maps to its UNK token and return those
    # appearing at least freqK times across the given files.
    unk_count_dict = OrderedDict()
    for path in file_paths:
        for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
            for tok in nlp2.split_sentence_to_array(input_sent):
                if tokenizer._unk_token in tokenizer.tokenize(tok):
                    unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
    return [key for key, value in unk_count_dict.items() if value >= freqK]
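One possible way to use this helper with a Hugging Face tokenizer is sketched below; the model name, corpus path and threshold are placeholders, and extending the vocabulary afterwards is an assumption rather than part of the original snippet:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")  # placeholder model
# Tokens that fall back to [UNK] at least 10 times across the corpus.
new_tokens = get_freqK_unk_token(tokenizer, ["corpus.txt"], freqK=10)
tokenizer.add_tokens(new_tokens)  # the model's embeddings would need resizing afterwards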
Example 6
def toMiddleFormat(paths):
    dataset = MiddleFormat(DATASETINFO)
    if not isinstance(paths, list):
        paths = [paths]

    for path in paths:
        with open(path, encoding="utf-8", errors='replace') as dataset_file:
            dataset_json = json.loads(dataset_file.read())
            dataset_json = dataset_json['data']
        for item in dataset_json:
            for paragraph in item['paragraphs']:
                for qas in paragraph['qas']:
                    question = replace_s(qas['question'])
                    for answers in qas['answers'][:1]:
                        context = replace_s(paragraph['context'])
                        ans = replace_s(str(answers['text']))
                        ori_start = start = answers['answer_start']

                        ans = nlp2.split_sentence_to_array(ans)
                        context = nlp2.split_sentence_to_array(context)
                        question = nlp2.split_sentence_to_array(question)

                        # Convert the character-based answer_start into a token index:
                        # every multi-character token before the answer shrinks the offset.
                        pos = -1
                        for tok in context:
                            pos += len(tok)
                            if len(tok) != 1 and pos <= ori_start:
                                start -= len(tok) - 1
                        end = start + len(ans)

                        # Boolean (YES/NO) or FAKE answers get fixed sentinel labels
                        # instead of real start/end token positions.
                        if 'YES' in ans or 'NO' in ans:
                            input_sent = " ".join(
                                ans + context) + " [SEP] " + " ".join(question)
                            dataset.add_data(input_sent, [0, 1])
                        elif 'FAKE' in ans:
                            input_sent = " ".join(
                                context) + " [SEP] " + " ".join(question)
                            dataset.add_data(input_sent, [0, 0])
                        elif context[start:end] == ans:
                            input_sent = " ".join(
                                context) + " [SEP] " + " ".join(question)
                            dataset.add_data(input_sent, [start, end])
                        else:
                            print("input_sent", context[start:end], "ans", ans)
    return dataset
Example 7
def get_topP_unk_token(tokenizer, file_paths: list, topP: float):
    unk_count_dict = OrderedDict()
    for path in file_paths:
        for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
            for tok in nlp2.split_sentence_to_array(input_sent):
                if tokenizer._unk_token in tokenizer.tokenize(tok):
                    unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
    # topP is interpreted as a percentage: keep the first topP% of the collected UNK tokens.
    top_range = int(len(unk_count_dict) * (topP / 100))
    return list(unk_count_dict.keys())[:top_range]
Example 8
def _filter_second_frequently(self, result_dict):
    # Drop phrases for which every sub-token is at least as frequent as the phrase itself.
    words = list(result_dict.keys())
    ngrams, _ = self._cal_ngrams_idf(words)
    for word in words:
        keep = False
        for i in split_sentence_to_array(word):
            if ngrams[i] < self.ngrams[word]:
                keep = True
        if not keep:
            result_dict.pop(word)
    return result_dict
Example 9
def extract(self,
            sent=None,
            merge_overlap=True,
            result_word_minlen=1,
            line_max_length=50):
    # Extract candidate phrases from a single sentence (if given) or from the whole
    # corpus, scoring each candidate by its ngram frequency divided by its idf.
    result_dict = defaultdict(int)
    sents = split_lines_by_punc(
        [sent],
        max_len=line_max_length) if sent is not None else self.sentences
    sent_iter = sents if sent is not None else tqdm(sents,
                                                    total=len(self.sentences))
    for sentence in sent_iter:
        filter_dict = defaultdict(int)
        filter_arr = []
        ngram_part = split_sentence_to_ngram_in_part(sentence)
        if len(ngram_part) > 0:
            for part in ngram_part:
                filter_result = self._filter_condprob(part, self.ngrams)
                for key, value in filter_result.items():
                    filter_dict[key] = self.ngrams[key]
                    filter_arr.append(key)
        if len(filter_dict) > 0:
            if merge_overlap:
                filter_arr = self.maximum_match_same_value(filter_dict)
                rm_sup = self._remove_by_superlap(filter_arr, filter_dict,
                                                  sentence)
                filter_arr = self._remove_by_overlap(
                    rm_sup, sentence, self.ngrams)
                gaol = self._all_words_match_maximum_array(filter_arr)
                for i in gaol:
                    result_dict[i] = self.ngrams[i] / self.idf[i]
            else:
                for key in filter_arr:
                    result_dict[key] = self.ngrams[key] / self.idf[key]
    if merge_overlap:
        result_dict = self._filter_second_frequently(result_dict)
    # Keep only phrases with more than result_word_minlen tokens.
    result_dict = {
        k: v
        for k, v in result_dict.items()
        if len(split_sentence_to_array(k)) > result_word_minlen
    }
    return result_dict
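Read together with Example 4, a minimal sketch of calling this method directly might look like the following (the corpus path and sentence are placeholders):

from phraseg import Phraseg

phraseg = Phraseg("corpus.txt")  # builds ngram statistics from a plain-text file
phrases = phraseg.extract(sent="今天天氣真好今天天氣真好", merge_overlap=False)
print(phrases)  # {phrase: ngram_count / idf, ...}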
Example 10
    def predict(self,
                input='',
                neg="O",
                task=None,
                handle_exceed='slide',
                merge_strategy=['minentropy', 'maxprob', 'maxcount'],
                minlen=1,
                start_contain="B",
                end_contain="I"):
        handle_exceed = handle_exceed[0] if isinstance(handle_exceed,
                                                       list) else handle_exceed
        merge_strategy = merge_strategy[0] if isinstance(
            merge_strategy, list) else merge_strategy
        self.eval()
        input = " ".join(nlp2.split_sentence_to_array(input))
        with torch.no_grad():
            ret_detail = []
            predicted_pos_prob = defaultdict(lambda: defaultdict(list))
            for feature_dict in get_feature_from_data(
                    tokenizer=self.tokenizer,
                    labels=self.labels,
                    input=input.strip(),
                    maxlen=self.maxlen,
                    handle_exceed=handle_exceed):
                for k, v in feature_dict.items():
                    feature_dict[k] = [v]
                result = self.forward(feature_dict, eval=True)
                for token_pred, token_map in zip(result['label_prob_all'],
                                                 result['token_word_mapping']):
                    token_prob = list(token_pred.values())[0]
                    max_label = max(token_prob, key=token_prob.get)
                    max_prob = token_prob[max_label]
                    predicted_pos_prob[
                        token_map['pos']]['char'] = token_map['word']
                    predicted_pos_prob[token_map['pos']]['labels'].append(
                        max_label)
                    predicted_pos_prob[token_map['pos']]['prob'].append(
                        max_prob)
                    # Entropy is not computed in this variant; a constant keeps the
                    # structure compatible with the 'minentropy' merge strategy.
                    predicted_pos_prob[token_map['pos']]['entropy'].append(1)

                ret_detail.append(result)

            ret_result = []
            for key, value in predicted_pos_prob.items():
                if merge_strategy == 'maxcount':
                    label = max(value['labels'], key=value['labels'].count)
                if merge_strategy == 'minentropy':
                    min_entropy_index = value['entropy'].index(
                        min(value['entropy']))
                    label = value['labels'][min_entropy_index]
                if merge_strategy == 'maxprob':
                    max_prob_index = value['prob'].index(max(value['prob']))
                    label = value['labels'][max_prob_index]
                ret_result.append({value['char']: label})

            output = []
            target_str = ["", ""]
            after_start = False
            for mapping in ret_result:
                for k, y in mapping.items():
                    if start_contain in y:
                        after_start = True
                        if len(target_str[0]) > 0:
                            if len(target_str[0]) > minlen:
                                output.append(target_str)
                            target_str = ["", ""]
                        target_str[0] += k
                        target_str[1] = y
                    elif y != neg and after_start:
                        target_str[0] += k
                        target_str[1] = y
                    else:
                        after_start = False
            if len(target_str[0]) > minlen and target_str not in output:
                output.append(target_str)
            output = [[
                ner,
                tag.replace(start_contain, "").replace(end_contain, "")
            ] for ner, tag in output]
            return output, ret_detail
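A hedged usage sketch for the `predict` method above, assuming `model` is an already-trained instance of the tagging class this method belongs to:

# `model` is assumed to expose the predict() shown above after loading a checkpoint.
entities, detail = model.predict(input="Alan Turing was born in London",
                                 merge_strategy='maxprob')
for span, tag in entities:
    print(span, tag)  # recognised spans with the B/I prefixes stripped from their tags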
Example 11
    def predict(self,
                input='',
                neg="O",
                task=None,
                handle_exceed='slide',
                merge_strategy=['minentropy', 'maxprob', 'maxcount'],
                minlen=1,
                start_contain="B_",
                end_contain="I_"):
        handle_exceed = handle_exceed[0] if isinstance(handle_exceed,
                                                       list) else handle_exceed
        merge_strategy = merge_strategy[0] if isinstance(
            merge_strategy, list) else merge_strategy
        self.eval()
        input = " ".join(nlp2.split_sentence_to_array(input))
        with torch.no_grad():
            ret_detail = []
            predicted_pos_prob = defaultdict(lambda: defaultdict(list))
            for feature_dict in get_feature_from_data(
                    tokenizer=self.tokenizer,
                    labels=self.labels,
                    input=input.strip(),
                    maxlen=self.maxlen,
                    handle_exceed=handle_exceed):
                start, end = feature_dict['pos']
                for k, v in feature_dict.items():
                    feature_dict[k] = [v]
                result = self.forward(feature_dict, eval=True)
                pos_to_char = feature_dict['mapping'][0]
                for ind, mapping in enumerate(result['label_prob_all']):
                    for map_pos, map_dict in mapping.items():
                        max_label = max(map_dict, key=map_dict.get)
                        max_prob = map_dict[max_label]
                        max_entropy = Categorical(probs=torch.tensor(
                            list(map_dict.values()))).entropy().data.tolist()
                        predicted_pos_prob[str(
                            ind + start)]['char'] = pos_to_char[ind]['char']
                        predicted_pos_prob[str(ind + start)]['labels'].append(
                            max_label)
                        predicted_pos_prob[str(ind +
                                               start)]['prob'].append(max_prob)
                        predicted_pos_prob[str(ind + start)]['entropy'].append(
                            max_entropy)
                ret_detail.append(result)
            ret_result = []
            for key, value in predicted_pos_prob.items():
                if merge_strategy == 'maxcount':
                    label = max(value['labels'], key=value['labels'].count)
                if merge_strategy == 'minentropy':
                    min_entropy_index = value['entropy'].index(
                        min(value['entropy']))
                    label = value['labels'][min_entropy_index]
                if merge_strategy == 'maxprob':
                    max_prob_index = value['prob'].index(max(value['prob']))
                    label = value['labels'][max_prob_index]
                ret_result.append({value['char']: label})

            output = []
            target_str = ["", ""]
            for mapping in ret_result:
                for k, y in mapping.items():
                    if (y != neg
                            and len(target_str[0]) > 0) or start_contain in y:
                        target_str[0] += k
                        target_str[1] = y
                    else:
                        if len(target_str[0]
                               ) > minlen and target_str not in output:
                            output.append(target_str)
                        target_str = ["", ""]
            if len(target_str[0]) > minlen and target_str not in output:
                output.append(target_str)
            output = [[
                ner,
                tag.replace(start_contain, "").replace(end_contain, "")
            ] for ner, tag in output]
            return output, ret_detail
Example 12
import csv
import nlp2
from statistics import mean  # assumption: mean() may come from statistics or another helper

inputFiles = [
    f for f in nlp2.get_files_from_dir('./processed_data') if 'csv' in f
]

for inputFile in inputFiles:
    article_length = []
    question_length = []
    answer_length = []
    distractor_length = []
    distractor_num = []
    with open(inputFile, encoding="utf-8", errors='replace') as dataset_file:
        rows = csv.reader(dataset_file)
        for r in rows:
            article, question, answer = r[0].split("[SEP]")
            distractors = r[1].split("[SEP]")

            article = nlp2.split_sentence_to_array(article, True)
            question = nlp2.split_sentence_to_array(question, True)
            answer = nlp2.split_sentence_to_array(answer, True)

            article_length.append(len(article))
            question_length.append(len(question))
            answer_length.append(len(answer))
            distractor_num.append(len(distractors))
            for dist in distractors:
                dist = nlp2.split_sentence_to_array(dist, True)
                distractor_length.append(len(dist))

    print(f"====={inputFile}======")
    print("number of data", len(question_length))
    print("average article_length", mean(article_length))
    print("average question_length", mean(question_length))