def _create_example(self, lines, set_type):
    """Creates examples from (label, text) rows; this variant uses a single `text` field."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[0])
        examples.append(InputExample(guid=guid, text=text, label=label))
    return examples
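All of these snippets assume two things that are not shown here: an `InputExample` container and `tokenization.convert_to_unicode` from Google's BERT repo. A minimal sketch of what they look like, for reference only (the real BERT version of `convert_to_unicode` also handles Python 2 byte strings, and the snippet above uses a single `text` field where the others use `text_a`/`text_b`):

# Minimal stand-ins for the helpers the snippets assume (not the real BERT code).
class InputExample(object):
    """A single training/test example for sequence tasks."""

    def __init__(self, guid=None, text_a=None, text_b=None, label=None,
                 unique_id=None):
        self.guid = guid            # e.g. 'train-0'
        self.text_a = text_a        # first (or only) sequence
        self.text_b = text_b        # optional second sequence for pair tasks
        self.label = label          # class label as a string, or None
        self.unique_id = unique_id  # used by the feature-extraction readers


def convert_to_unicode(text):
    """Decodes bytes to str; the real BERT version also covers Python 2."""
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    return text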
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid,
                         text_a=text_a,
                         text_b=None,
                         label=label))
    return examples
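Each `lines` element here is expected to be a `[label, text]` row, with the label in column 0 and the sentence in column 1. In BERT's run_classifier.py such rows usually come from a `_read_tsv` helper along these lines (a sketch; the original is a classmethod on `DataProcessor`):

import csv
import tensorflow as tf  # TF 1.x, for tf.gfile

def _read_tsv(input_file, quotechar=None):
    """Reads a tab-separated file into a list of [label, text] rows."""
    with tf.gfile.Open(input_file, "r") as f:
        return list(csv.reader(f, delimiter="\t", quotechar=quotechar))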
def _create_examples(self, lines, set_type):
    """Creates examples, shuffling rows in place; requires `import numpy as np`."""
    examples = []
    np.random.shuffle(lines)
    for i, line in enumerate(lines):
        guid = '%s-%s' % (set_type, i)
        # Some variants assign a dummy label '0' when set_type == 'test';
        # here the label is always read from column 0.
        text_a = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[0])
        self.labels.add(label)  # collect the label inventory on the processor
        examples.append(
            InputExample(guid=guid,
                         text_a=text_a,
                         label=label,
                         text_b=None))
    return examples
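Note that `np.random.shuffle` mutates the list in place and returns None, so the caller's `lines` are reordered as a side effect, and repeated calls produce a different example order. A quick illustration:

import numpy as np

rows = [["0", "bad"], ["1", "good"], ["1", "fine"]]
np.random.shuffle(rows)   # shuffles in place, returns None
print(rows)               # same rows, random order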
Example #4
def read_line_examples(lst_strs):
    """Yields an `InputExample` per string; ` ||| ` splits sentence pairs."""
    unique_id = 0
    for ss in lst_strs:
        line = tokenization.convert_to_unicode(ss)
        if not line:
            continue
        line = line.strip()
        text_a = None
        text_b = None
        # ` ||| ` separates text_a from text_b when the line holds a pair.
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
        unique_id += 1
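A quick usage sketch for the generator above (assuming the `InputExample` and `convert_to_unicode` stand-ins from earlier, plus `import re`):

for ex in read_line_examples(["single sentence",
                              "first half ||| second half"]):
    print(ex.unique_id, repr(ex.text_a), repr(ex.text_b))
# 0 'single sentence' None
# 1 'first half' 'second half'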
Example #5
def read_tokenized_examples(lst_strs):
    """
    :param lst_strs: list of lists; each sub-list is one sequence, and each
        element of a sub-list is one token of that sequence
    :return: generator of `InputExample`s
    """
    unique_id = 0
    # Convert every token in lst_strs to unicode first.
    lst_strs = [[tokenization.convert_to_unicode(w) for w in s]
                for s in lst_strs]
    for ss in lst_strs:
        text_a = ss
        text_b = None
        try:
            # A '|||' token splits the input into two sequences (text_a and
            # text_b); if it is absent, index() raises ValueError and only
            # text_a is kept.
            j = ss.index('|||')
            text_a = ss[:j]
            text_b = ss[(j + 1):]
        except ValueError:
            pass
        yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
        unique_id += 1
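Here `text_a` and `text_b` stay token lists rather than strings. A usage sketch, again assuming the stand-ins from earlier:

for ex in read_tokenized_examples([["hello", "world"],
                                   ["how", "are", "|||", "you"]]):
    print(ex.unique_id, ex.text_a, ex.text_b)
# 0 ['hello', 'world'] None
# 1 ['how', 'are'] ['you']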
Example #6
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break
            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a,
                             text_b=text_b))
            unique_id += 1
    return examples
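`tf.gfile` is the TensorFlow 1.x file API; on TF 2.x the equivalent is `tf.io.gfile.GFile`. The expected input is a plain-text file with one example per line, optionally split into a pair by ` ||| `. A hypothetical call, assuming a file `input.txt` with the contents shown in the comments:

# input.txt (hypothetical):
#   the cat sat on the mat
#   premise sentence ||| hypothesis sentence
examples = read_examples("input.txt")
print(len(examples))       # 2
print(examples[1].text_b)  # 'hypothesis sentence'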