Example #1
def read_squad_data(json_input, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    # input_data = json_input["data"]
    input_data = json_input

    def is_whitespace(c):
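        # 0x202F is the Unicode NARROW NO-BREAK SPACE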
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    # Read the data in here.
    examples = []
    # In the standard SQuAD layout, input_data is split into a title and paragraphs,
    # and each paragraph consists of qas (the QA pairs) and context (the passage text).
    # This variant expects each entry already flattened to a single context/question pair.
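    # A sketch of one such flattened entry (illustrative values, not from the source):
    # {"context": "BERT was released in 2018.",
    #  "question": "When was BERT released?"}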
    for entry in input_data:
        # Pull each entry from input_data one at a time
        # and process its context first.

        paragraph_text = entry["context"]
        doc_tokens = []  # the words, split on whitespace
        char_to_word_offset = []  # for each character, the index (0, 1, ..., n) of the word it belongs to
        prev_is_whitespace = True
        for c in paragraph_text:  # walk the context character by character
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:  # a non-whitespace character right after whitespace
                    doc_tokens.append(c)  # start a new token
                else:
                    doc_tokens[-1] += c  # append the character to the last token
                prev_is_whitespace = False  # current character is not whitespace, so clear the flag
            char_to_word_offset.append(len(doc_tokens) - 1)  # word indices start at 0, hence len - 1
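        # Worked example (illustrative): for the context "a bc", doc_tokens ends up
        # ["a", "bc"] and char_to_word_offset ends up [0, 0, 1, 1]; the space is
        # attributed to the preceding word.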

        # qas_id = qa["id"]  # the question's id
        question_text = entry["question"]  # the question text
        # No answer supervision in this variant: use the defaults the original
        # reader assigns for evaluation / unanswerable questions.
        start_position = -1
        end_position = -1
        orig_answer_text = ""
        is_impossible = False

        example = rs.SquadExample(qas_id=1,
                                  question_text=question_text,
                                  doc_tokens=doc_tokens,
                                  orig_answer_text=orig_answer_text,
                                  start_position=start_position,
                                  end_position=end_position,
                                  is_impossible=is_impossible)
        examples.append(example)

    return examples
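
A minimal usage sketch for this variant; the `rs` alias for the BERT run_squad module and the example entry are assumptions, not from the source:

import run_squad as rs  # assumed: the module this snippet aliases as `rs`

entries = [{"context": "BERT was released in 2018.",
            "question": "When was BERT released?"}]
examples = read_squad_data(entries, is_training=False)
print(examples[0].doc_tokens)  # ['BERT', 'was', 'released', 'in', '2018.']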
Example #2
    def my_create_examples(self, data_object):
        """
        Modified version of read_squad_examples from run_squad.
        Note that this returns feature objects, not example objects. The feature TENSORS themselves are made elsewhere.
        :param data_object: equivalent object to the 'data' section of the SQuAD JSON scheme
        :return: a list of `SquadExample`s
        """
        def is_whitespace(c):
            return c in " \t\r\n" or ord(c) == 0x202F

        examples = []
        for entry in data_object:
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                doc_tokens = []
                char_to_word_offset = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                    char_to_word_offset.append(len(doc_tokens) - 1)

                for qa in paragraph["qas"]:
                    examples.append(run_squad.SquadExample(
                        qas_id=qa["id"],
                        question_text=qa["question"],
                        doc_tokens=doc_tokens,
                        orig_answer_text=None,
                        start_position=None,
                        end_position=None,
                        is_impossible=False)
                    )

        feature_objects = []
        run_squad.convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.flags.max_seq_length,
            doc_stride=self.flags.doc_stride,
            max_query_length=self.flags.max_query_length,
            is_training=False,
            output_fn=feature_objects.append)
        return feature_objects
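
For reference, a minimal `data_object` in the nested SQuAD layout this method reads (field names come from the code above; the values are illustrative):

data_object = [{
    "paragraphs": [{
        "context": "BERT was released in 2018.",
        "qas": [{"id": "q1", "question": "When was BERT released?"}],
    }],
}]
features = handler.my_create_examples(data_object)  # `handler` is a hypothetical instance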
Example #3
def preprocess_data(raw_data):
    examples = []

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    paragraph_text = raw_data["context"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    qas_id = raw_data["qas_id"]
    question_text = raw_data["question"]
    start_position = None
    end_position = None
    orig_answer_text = None
    is_impossible = False

    data = mainfile.SquadExample(qas_id=qas_id,
                                 question_text=question_text,
                                 doc_tokens=doc_tokens,
                                 orig_answer_text=orig_answer_text,
                                 start_position=start_position,
                                 end_position=end_position,
                                 is_impossible=is_impossible)
    examples.append(data)
    return examples
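
A usage sketch, assuming `mainfile` is the module (e.g. a local copy of run_squad) that defines SquadExample; the input keys mirror the reads above:

raw_data = {"qas_id": "q1",
            "question": "When was BERT released?",
            "context": "BERT was released in 2018."}
examples = preprocess_data(raw_data)  # a one-element list of SquadExample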
Example #4
    def read_squad_examples(self, input_data, is_training):
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
                return True
            return False

        examples = []
        for entry in input_data:
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                doc_tokens = []
                char_to_word_offset = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                    char_to_word_offset.append(len(doc_tokens) - 1)

                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position = None
                    end_position = None
                    orig_answer_text = None
                    is_impossible = False
                    if is_training:

                        if FLAGS.version_2_with_negative:
                            is_impossible = qa["is_impossible"]
                        if (len(qa["answers"]) != 1) and (not is_impossible):
                            raise ValueError(
                                "For training, each question should have exactly 1 answer."
                            )
                        if not is_impossible:
                            answer = qa["answers"][0]
                            orig_answer_text = answer["text"]
                            answer_offset = answer["answer_start"]
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            end_position = char_to_word_offset[
                                answer_offset + answer_length - 1]
                            # Only add answers where the text can be exactly recovered from the
                            # document. If this CAN'T happen it's likely due to weird Unicode
                            # stuff so we will just skip the example.
                            #
                            # Note that this means for training mode, every example is NOT
                            # guaranteed to be preserved.
                            actual_text = " ".join(
                                doc_tokens[start_position:(end_position + 1)])
                            cleaned_answer_text = " ".join(
                                tokenization.whitespace_tokenize(
                                    orig_answer_text))
                            if actual_text.find(cleaned_answer_text) == -1:
                                tf.logging.warning(
                                    "Could not find answer: '%s' vs. '%s'",
                                    actual_text, cleaned_answer_text)
                                continue
                        else:
                            start_position = -1
                            end_position = -1
                            orig_answer_text = ""

                    example = run_squad.SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position,
                        is_impossible=is_impossible)
                    examples.append(example)

        return examples
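
To make the answer-to-word mapping above concrete, a worked example with illustrative values:

# context    = "BERT was released in 2018."
# doc_tokens = ['BERT', 'was', 'released', 'in', '2018.']
# for the answer {"text": "2018", "answer_start": 21}:
#   start_position = char_to_word_offset[21]          # -> 4
#   end_position   = char_to_word_offset[21 + 4 - 1]  # -> 4
# actual_text "2018." contains the cleaned answer "2018", so the example is kept.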
Example #5
doc_tokens = doc_tokens[:2000]

#eval_examples is a list of 10570 'SquadExample' objects
#just using it as a format placeholder
#predict_file='/Users/davidbressler/pythonstuff/squad_data/dev-v1.1.json'
#eval_examples_routes = run_squad.read_squad_examples(input_file=predict_file, is_training=False)
#eval_examples_routes=[eval_examples_routes[0]]
#eval_examples_routes[0].question_text=query
#eval_examples_routes[0].doc_tokens=tokenizer.tokenize(document)

eval_examples_routes = [
    run_squad.SquadExample(qas_id=0,
                           question_text=query,
                           doc_tokens=doc_tokens,
                           orig_answer_text=None,
                           start_position=None,
                           end_position=None)
]

#doc_stride: when splitting up a long document into chunks, how much stride to take between chunks.
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
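
# Illustration (assumed numbers): with a 10-token document, a window of up to
# 6 tokens, and doc_stride=3, the chunks cover tokens [0:6], [3:9], and [6:10].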

#eval_features is a list of 'run_squad.InputFeatures' objects
#each object has fields for tokens, input_mask, input_ids, segment_ids, etc.
#input_mask: every example is padded to the same length (max_seq_length), so input_mask is 1's for the real tokens and 0's for the right-padding
#input_ids: numericalized tokens, then 0's (right-padding)
#segment_ids: 0's for query positions, 1's for document positions, then 0's (right-padding)
# max_seq_length/doc_stride/max_query_length below are assumed values (the
# common BERT SQuAD defaults), not taken from the source.
eval_features_routes = []
run_squad.convert_examples_to_features(
    examples=eval_examples_routes,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    output_fn=eval_features_routes.append)