Example #1
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text_a = tokenization.convert_to_unicode(line[3])
         label = tokenization.convert_to_unicode(line[1])
         examples.append(
             InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
     return examples
Example #2
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type,
                           tokenization.convert_to_unicode(line[0]))
         text_a = tokenization.convert_to_unicode(line[8])
         text_b = tokenization.convert_to_unicode(line[9])
         label = tokenization.convert_to_unicode(line[-1])
         examples.append(
             InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     return examples
Example #3
 def get_dev_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "dev-%d" % (i)
     language = tokenization.convert_to_unicode(line[0])
     if language != tokenization.convert_to_unicode(self.language):
       continue
     text_a = tokenization.convert_to_unicode(line[6])
     text_b = tokenization.convert_to_unicode(line[7])
     label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
Example #4
 def _create_examples(self, lines, set_type):
   """Creates examples for the training and dev sets."""
   examples = []
   for (i, line) in enumerate(lines):
     # Only the test set has a header
     if set_type == "test" and i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     if set_type == "test":
       text_a = tokenization.convert_to_unicode(line[1])
       label = "0"
     else:
       text_a = tokenization.convert_to_unicode(line[3])
       label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
   return examples
Example #5
 def get_train_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(
       os.path.join(data_dir, "multinli",
                    "multinli.train.%s.tsv" % self.language))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "train-%d" % (i)
     text_a = tokenization.convert_to_unicode(line[0])
     text_b = tokenization.convert_to_unicode(line[1])
     label = tokenization.convert_to_unicode(line[2])
     if label == tokenization.convert_to_unicode("contradictory"):
       label = tokenization.convert_to_unicode("contradiction")
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
Example #6
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        labels = []
        labels_test = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)

            # tokenization is based on vocab file
            text_a = tokenization.convert_to_unicode(line[0])
            label = tokenization.convert_to_unicode(line[1])
            labels.append(label)

            if set_type == "test":
                label = "0"
            labels_test.append(label)
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))

        return examples, labels, labels_test
Example #7
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
  """Create `TrainingInstance`s from raw text."""
  all_documents = [[]]

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  # entire paragraphs or arbitrary spans of text. (Because we use the
  # sentence boundaries for the "next sentence prediction" task).
  # (2) Blank lines between documents. Document boundaries are needed so
  # that the "next sentence prediction" task doesn't span between documents.
  for input_file in input_files:
    with tf.gfile.GFile(input_file, "r") as reader:  # similar to Python's built-in open()
      while True:
        line = tokenization.convert_to_unicode(reader.readline())
        if not line:
          break
        line = line.strip()

        # Empty lines are used as document delimiters
        if not line:
          all_documents.append([])
        tokens = tokenizer.tokenize(line)  # split the sentence into WordPiece tokens from the vocab
        if tokens:
          all_documents[-1].append(tokens)

  # Remove empty documents
  all_documents = [x for x in all_documents if x] 
  rng.shuffle(all_documents)

  # vocab_words is the full vocabulary list, e.g. ['[PAD]', '[unused1]', '[unused2]', ...];
  # it is used later to pick random replacement tokens during masking.
  vocab_words = list(tokenizer.vocab.keys())
  instances = []
  for _ in range(dupe_factor):  # dupe_factor: how many times to reuse the corpus with different masks
    for document_index in range(len(all_documents)):
      instances.extend(
          create_instances_from_document(
              all_documents, document_index, max_seq_length, short_seq_prob,
              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

  rng.shuffle(instances)
  return instances
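
The comment block above spells out the expected corpus layout: one sentence per line, with blank lines separating documents. A minimal driver sketch is shown below; the corpus text, vocab path, and hyperparameter values are illustrative, not taken from any particular configuration.

import random

import tokenization  # the BERT repo's tokenization module

# Hypothetical corpus file: one sentence per line, blank line between documents.
with open("sample_corpus.txt", "w") as f:
    f.write("The fountain of classic literature never runs dry.\n"
            "It has inspired readers for centuries.\n"
            "\n"
            "This text is a second, unrelated document.\n"
            "It begins after the blank line above.\n")

tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt",  # illustrative path
                                       do_lower_case=True)
instances = create_training_instances(
    input_files=["sample_corpus.txt"],
    tokenizer=tokenizer,
    max_seq_length=128,
    dupe_factor=2,
    short_seq_prob=0.1,
    masked_lm_prob=0.15,
    max_predictions_per_seq=20,
    rng=random.Random(12345))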
Example #8
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(all_documents, document_index,
                                               max_seq_length, short_seq_prob,
                                               masked_lm_prob,
                                               max_predictions_per_seq,
                                               vocab_words, rng))

    rng.shuffle(instances)
    return instances
Example #9
def write_to_tf_record(writer,
                       tokenizer,
                       query,
                       docs,
                       labels,
                       ids_file=None,
                       query_id=None,
                       doc_ids=None):
    #query = tokenization.convert_to_unicode(query)
    query_token_ids = tokenization.convert_to_bert_input(
        text=query,
        max_seq_length=FLAGS.max_query_length,
        tokenizer=tokenizer,
        add_cls=True)

    query_token_ids_tf = tf.train.Feature(int64_list=tf.train.Int64List(
        value=query_token_ids))

    for i, (doc_text, label) in enumerate(zip(docs, labels)):

        doc_token_id = tokenization.convert_to_bert_input(
            text=tokenization.convert_to_unicode(doc_text),
            max_seq_length=FLAGS.max_seq_length - len(query_token_ids),
            tokenizer=tokenizer,
            add_cls=False)

        doc_ids_tf = tf.train.Feature(int64_list=tf.train.Int64List(
            value=doc_token_id))

        labels_tf = tf.train.Feature(int64_list=tf.train.Int64List(
            value=[label]))

        features = tf.train.Features(
            feature={
                'query_ids': query_token_ids_tf,
                'doc_ids': doc_ids_tf,
                'label': labels_tf,
            })
        example = tf.train.Example(features=features)
        writer.write(example.SerializeToString())

        if ids_file:
            ids_file.write('\t'.join([query_id, doc_ids[i]]) + '\n')
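
A rough usage sketch for the function above, assuming TF 1.x, a tokenization module that provides the repo-specific convert_to_bert_input helper used inside the function, and FLAGS (max_query_length, max_seq_length) defined elsewhere; the paths, query, documents, and labels are illustrative.

import tensorflow as tf

import tokenization  # must provide convert_to_bert_input, as used above

tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt",  # illustrative path
                                       do_lower_case=True)
with tf.python_io.TFRecordWriter("query_docs.tfrecord") as writer:
    write_to_tf_record(
        writer=writer,
        tokenizer=tokenizer,
        query="what is the capital of france",
        docs=["Paris is the capital of France.",
              "Berlin is the capital of Germany."],
        labels=[1, 0])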
Example #10
    def get_train_examples(self, data_dir):
        examples = []
        train_files = ["mytrain.term_recall.json"] 

        for file_name in train_files:
            train_file = open(os.path.join(data_dir, file_name))
            for i, line in enumerate(train_file):
                q_json_dict = json.loads(line)
                qid = q_json_dict["qid"]
                q_text = tokenization.convert_to_unicode(q_json_dict["query"])
                term_recall_dict = q_json_dict["term_recall"][self.recall_field]

                guid = "train-%s" % qid
                examples.append(
                    InputExample(guid=guid, text=q_text, term_recall_dict=term_recall_dict)
                )
            train_file.close()
        random.shuffle(examples)
        return examples
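
The exact schema of mytrain.term_recall.json is not shown; the loop only reads qid, query, and one field of term_recall, so a line presumably looks roughly like the hypothetical sketch below (the field name and weights are made up).

import json

# Hypothetical input line, inferred only from the fields read above.
sample_line = json.dumps({
    "qid": "101",
    "query": "cost of interior painting",
    "term_recall": {"title": {"painting": 0.6, "cost": 0.3}},
})
record = json.loads(sample_line)
term_recall_dict = record["term_recall"]["title"]  # what InputExample receives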
Example #11
    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for prediction."""
        lines = self._read_tsv(os.path.join(data_dir, "weibo_senti_10k.csv"))

        examples = []
        lines = lines[10040:10230]
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "test-%d" % (i)
            strs = line[0].split(",")
            text_a = tokenization.convert_to_unicode(strs[1])
            label = "0"
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=None,
                             label=label))
        return examples
Example #12
    def get_test_examples(self, data_dir):
        file_path = os.path.join(data_dir, 'test.tsv')
        with open(file_path, 'r', encoding="utf-8") as f:
            reader = f.readlines()

        examples = []
        for index, line in enumerate(reader):
            guid = 'train-%d' % index
            split_line = line.strip().split("\t")
            text_a = tokenization.convert_to_unicode(split_line[1])
            text_b = None
            label = split_line[0]
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))

        return examples
Example #13
def read_array_examples(array_example):
    examples = []
    unique_id = 0

    for l in array_example:
        line = tokenization.convert_to_unicode(l)
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples
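
The regular expression treats " ||| " as an optional separator, so a bare string yields a single-sentence example and a delimited string yields a pair, assuming the usual InputExample that simply stores its constructor arguments:

# A plain string becomes a single-sentence example, while " ||| " splits a pair.
examples = read_array_examples([
    "Who was Jim Henson ?",
    "Jim Henson was a puppeteer ||| He created the Muppets .",
])
# examples[0]: text_a = "Who was Jim Henson ?", text_b = None
# examples[1]: text_a = "Jim Henson was a puppeteer",
#              text_b = "He created the Muppets ."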
Example #14
 def get_train_examples(self, data_dir):
     # path to the training data file
     print("hi, commentProcessor")
     file_path = os.path.join(data_dir, 'comments_train.csv')
     # load the data with pandas
     df = pd.read_csv(file_path)
     # split the training data into 80% train and 20% dev (the split below is disabled)
     # df_train, self.df_dev = train_test_split(df, test_size=0.2)
     df_train = df
     examples = []
     # build examples in the format BERT expects
     for index, row in df_train.iterrows():
         guid = 'train-%d' % row[0]  # index
         text_a = tokenization.convert_to_unicode(str(row[1]))  # text
         label = row[2]  # text label
         print(guid, "text_a: ", text_a, "label: ", label)
         examples.append(InputExample(guid=guid, text_a=text_a,
                                      label=label))
     return examples
Example #15
 def get_dev_examples(self, data_dir):
     file_path = os.path.join(data_dir, 'test.csv')
     with open(file_path, 'r', encoding='utf-8') as f:
         reader = f.readlines()
     examples = []
     labels_dev = []
     for index, line in enumerate(reader):
         guid = 'dev-%d' % index
         split_line = line.strip().split(',')
         text_a = tokenization.convert_to_unicode(split_line[1])
         # text_b = tokenization.convert_to_unicode(split_line[2])
         label = split_line[0]
         labels_dev.append(label)
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples, labels_dev
Example #16
def read_examples(text):
    """return example format from string"""
    examples = []
    unique_id = 0
    text = text.replace('\n', ' ')  #remove line breaks
    line = tokenization.convert_to_unicode(text)
    line = line.strip()
    text_a = None
    text_b = None
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
        text_a = line
    else:
        text_a = m.group(1)
        text_b = m.group(2)
    examples.append(
        InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
    unique_id += 1
    return examples
Example #17
    def get_train_examples(self, data_dir):
        examples = []
        train_files = [data_dir]

        for file_name in train_files:
            train_file = open(file_name)
            for i, line in enumerate(train_file):
                json_dict = json.loads(line)
                docid = json_dict["doc"]["id"]
                doc_text = tokenization.convert_to_unicode(json_dict["doc"]["title"])
                term_recall_dict = json_dict["term_recall"]

                guid = "train-%s" % docid
                examples.append(
                    InputExample(guid=guid, text=doc_text, term_recall_dict=term_recall_dict)
                )
            train_file.close()
        random.shuffle(examples)
        return examples
Example #18
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
  """Create `TrainingInstance`s from raw text."""
  all_documents = [[]]

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  # entire paragraphs or arbitrary spans of text. (Because we use the
  # sentence boundaries for the "next sentence prediction" task).
  # (2) Blank lines between documents. Document boundaries are needed so
  # that the "next sentence prediction" task doesn't span between documents.
  for input_file in input_files:
    with tf.gfile.GFile(input_file, "r") as reader:
      while True:
        line = tokenization.convert_to_unicode(reader.readline())
        if not line:
          break
        line = line.strip()

        # Empty lines are used as document delimiters
        if not line:
          all_documents.append([])
        tokens = tokenizer.tokenize(line)
        if tokens:
          all_documents[-1].append(tokens)

  # Remove empty documents
  all_documents = [x for x in all_documents if x]
  rng.shuffle(all_documents)

  vocab_words = list(tokenizer.vocab.keys())
  instances = []
  for _ in range(dupe_factor):
    for document_index in range(len(all_documents)):
      instances.extend(
          create_instances_from_document(
              all_documents, document_index, max_seq_length, short_seq_prob,
              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

  rng.shuffle(instances)
  return instances
Example #19
 def _create_examples(self, lines, set_type="test"):
     """Creates examples for the training and dev sets.
     :param lines: all input lines from input file
     :type lines: list
     :return: a list of InputExample element
     :rtype: list
     """
     examples = []
     for (i, line) in enumerate(lines):
         line = line.split("\t")
         guid = "%s-%s" % (set_type, i)
         label = eval(line[1])
         textA = tokenization.convert_to_unicode(line[0])
         examples.append(
             InputExample(guid=guid, textA=textA, textB=None, label=label))
     # examples is a list holding all the data; each element is an InputExample
     # shuffle the training data
     if set_type == "train":
         random.shuffle(examples)
     return examples
Example #20
def convert_single_example(query):
    global max_seq_length
    
    text = tokenization.convert_to_unicode(query)
    raw_tokens = tokenizer.tokenize(text)
    
    tokens = raw_tokens[0:(max_seq_length - 2)]
    tokens.insert(0, "[CLS]")  # prepend the [CLS] token at the start of the sentence
    tokens.append("[SEP]")  # append the [SEP] token at the end
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    
    segment_ids = [0] * max_seq_length
    
    input_mask = [1] * len(input_ids)
    
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
    
    return input_ids, input_mask, segment_ids, raw_tokens
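
convert_single_example reads two module-level globals, tokenizer and max_seq_length, so the sketch below assumes it runs in the same module as the function above; the vocab path and query are illustrative.

import tokenization  # the BERT repo's tokenization module

# Module-level globals that convert_single_example looks up.
max_seq_length = 128
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt",  # illustrative path
                                       do_lower_case=True)

input_ids, input_mask, segment_ids, raw_tokens = convert_single_example(
    "where can i buy cheap flights")
# input_ids and input_mask are zero-padded, and segment_ids pre-sized,
# to max_seq_length.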
Example #21
def read_examples(str_io):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    while True:
        line = tokenization.convert_to_unicode(str_io.readline())
        if not line:
            break
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples
Example #22
 def _create_examples(self, lines, set_type):
   """Creates examples for the training and dev sets."""
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     if set_type == "train":
       text_a = tokenization.convert_to_unicode(line[1])
       text_b = tokenization.convert_to_unicode(line[2])
       label = tokenization.convert_to_unicode(line[3])
     if set_type == "dev":
       text_a = tokenization.convert_to_unicode(line[0])
       text_b = tokenization.convert_to_unicode(line[1])
       label = tokenization.convert_to_unicode(line[3])
     if set_type == "test":
       label = "NOT ENOUGH INFO"
       text_a = tokenization.convert_to_unicode(line[0])
       text_b = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
Example #23
  def _create_test_examples(self, dataset_file, set_type):

    mag_dataset = dataset_file
    examples = []
    for _ in range(120000):
      next(mag_dataset)

    index = 0
    MAX_LINE_COUNT = 120000
    for line in mag_dataset:
      guid = "%s-%s" % (set_type, index)
      pos = line.find('\t')
      text_a = tokenization.convert_to_unicode(line[pos + 1:])
      examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=None))
      index += 1
      if index >= MAX_LINE_COUNT:
        break
    mag_dataset.close()
    return examples
Example #24
    def _create_samples(self, set_type):

        examples = []
        if set_type == "train":
            files = sea.DataFiles._train_file_names
        elif set_type == "dev":
            files = sea.DataFiles._validation_file_names
        else:
            files = sea.DataFiles._test_file_names

        for file in files:
            file = os.path.join("../", file)
            lines = pd.read_csv(file, delimiter=",")
            for i in range(len(lines)):
                offset = 0
                max_length = 512

                doc_stride = FLAGS.doc_stride
                sentence = lines.iloc[i, 1].strip("\n").strip("\"")
                sentence = tokenization.convert_to_unicode(sentence)

                for _ in range(6):
                    text_a = sentence[offset:offset + max_length]
                    text_a = " ".join(text_a)

                    guid = "{}-{}".format(i, set_type)

                    label = []
                    for j in range(2, 2 + self._labels_num, 1):
                        code = lines.iloc[i, j]
                        code = code + 2
                        label.append(code)
                    examples.append(
                        InputExample(guid=guid,
                                     text_a=text_a,
                                     text_b=None,
                                     label=label))
                    offset += max_length
                    offset = offset - doc_stride

        return examples
Example #25
    def get_test_examples(self, data_dir):
        test_files = [data_dir]
        examples = []

        for file_name in test_files:
            test_file = open(file_name)
            for i, line in enumerate(test_file):
                jdict = json.loads(line)
                docid = jdict["id"]
                doc_text = jdict["content"]
                doc_text = tokenization.convert_to_unicode(doc_text)
                term_recall_dict = {}
                if not doc_text.strip():
                     doc_text = '.'

                guid = "test-%s" % docid
                examples.append(
                    InputExample(guid=guid, text=doc_text, term_recall_dict=term_recall_dict)
                )
            test_file.close()
        return examples
Example #26
def read_tokenized_examples(lst_strs):
    """

    :param lst_strs: [[]] 每个子元素为一个序列,子元素的每一个元素为这个序列的一个index
    :return:
    """
    unique_id = 0
    # convert every token in lst_strs to unicode
    lst_strs = [[tokenization.convert_to_unicode(w) for w in s] for s in lst_strs]
    for ss in lst_strs:
        text_a = ss
        text_b = None
        try:
            # '|||' marks a sentence pair: tokens before it become text_a and tokens
            # after it text_b; otherwise index() raises ValueError and only text_a is used
            j = ss.index('|||')
            text_a = ss[:j]
            text_b = ss[(j + 1):]
        except ValueError:
            pass
        yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
        unique_id += 1
Example #27
    def get_train_examples(self, data_dir):
        examples = []
        train_files = ["train.fold0.docterm_recall", "train.fold1.docterm_recall", "train.fold2.docterm_recall", "train.fold3.docterm_recall"]

        for file_name in train_files:
            train_file = open(os.path.join(data_dir, file_name))
            for i, line in enumerate(train_file):
                json_dict = json.loads(line)
                docid = json_dict["doc"]["id"]
                doc_text = tokenization.convert_to_unicode(json_dict["doc"]["title"])
                term_recall_dict = json_dict["term_recall"]
                if not term_recall_dict or not doc_text.strip():
                    continue

                guid = "train-%s" % docid
                examples.append(
                    InputExample(guid=guid, text=doc_text, term_recall_dict=term_recall_dict)
                )
            train_file.close()
        random.shuffle(examples)
        return examples
Example #28
    def get_dev_examples_(self, data_dir):
        """See base class."""
        lines = self._read_tsv(os.path.join(data_dir, "test.csv"))
        examples = []

        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "dev-%s" % (i)
            line_ = line[0].split(',')
            #if len(line_) != 3: # bad sample num=150000
            #    continue
            text_a = tokenization.convert_to_unicode(line_[1])
            #label = tokenization.convert_to_unicode(line_[2])
            label = '0'
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=None,
                             label=label))
        return examples
Example #29
def convert_lines_to_examples(lines):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    for line in lines:
        line = tokenization.convert_to_unicode(line)
        if not line:
            continue
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples
Example #30
    def _create_examples(self, lines, set_type):
        """Creates examples for the training sets."""
        examples = []
        for (i, line) in enumerate(lines):

            #if i == 0:
            #  continue

            guid = "%s-%s" % (set_type, i)
            text_a = tokenization.convert_to_unicode(line[0])
            text_b = None
            if set_type == "test":
                label = "pos"
            else:
                label = line[1]
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
        return examples
Example #31
    def get_train_examples(self, data_dir):
        # load the training set
        file_path = os.path.join(data_dir, 'train.csv')
        df = pd.read_csv(file_path)
        # the test set is read from its own file
        file_path_test = os.path.join(data_dir, 'test.csv')
        df_test = pd.read_csv(file_path_test)
        df_train = df
        self.df_test = df_test
        # then carve a validation set out of the training set
        df_train, self.df_dev = train_test_split(df_train, test_size=0.2)

        examples = []
        for index, row in df_train.iterrows():
            guid = 'train-%d' % index  # a unique guid per example
            text_a = tokenization.convert_to_unicode(str(row[0]))  # title1_zh
            #text_b = tokenization.convert_to_unicode(str(row[1]))  # title2_zh
            label = row[1]  # label
            examples.append(InputExample(guid=guid, text_a=text_a,
                                         label=label))
        return examples
Example #32
 def _create_examples(self, lines):
     """See base class."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s" % (i) if 'id' not in line else line['id']
         text_a = tokenization.convert_to_unicode(line['text'])
         label = ['O'] * len(text_a)
         if 'label' in line:
             for l, words in line['label'].items():
                 for word, indices in words.items():
                     for index in indices:
                         if index[0] == index[1]:
                             label[index[0]] = 'S-' + l
                         else:
                             label[index[0]] = 'B-' + l
                             label[index[1]] = 'E-' + l
                             for i in range(index[0] + 1, index[1]):
                                 label[i] = 'M-' + l
         examples.append(InputExample(guid=guid, text_a=text_a,
                                      label=label))
     return examples
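
The loop builds character-level B/M/E/S tags from a label dict that maps entity type to surface form to a list of [start, end] index pairs (end inclusive). A hypothetical input line in that shape, with made-up text and spans:

# Hypothetical input line in the format the loop above expects.
line = {
    "id": "42",
    "text": "Rome is in Italy",
    "label": {"LOC": {"Rome": [[0, 3]], "Italy": [[11, 15]]}},
}
# "Rome" (characters 0..3) is tagged B-LOC, M-LOC, M-LOC, E-LOC; a span with
# index[0] == index[1] would instead receive a single S-LOC tag.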
Example #33
def read_examples(input_file):
  """Read a list of `InputExample`s from an input file."""
  examples = []
  unique_id = 0
  with tf.gfile.GFile(input_file, "r") as reader:
    while True:
      line = tokenization.convert_to_unicode(reader.readline())
      if not line:
        break
      line = line.strip()
      text_a = None
      text_b = None
      m = re.match(r"^(.*) \|\|\| (.*)$", line)
      if m is None:
        text_a = line
      else:
        text_a = m.group(1)
        text_b = m.group(2)
      examples.append(
          InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
      unique_id += 1
  return examples
Example #34
    def get_dev_examples(self, data_dir):
        file_path = os.path.join(data_dir, 'cnews.val.txt')
        with open(file_path, 'r', encoding="utf-8") as f:
            reader = f.readlines()
        random.shuffle(reader)  # note: shuffle before taking a sample
        reader = reader[0:200]

        examples = []
        labels = []
        for index, line in enumerate(reader):
            guid = 'train-%d' % index
            split_line = line.strip().split("\t")
            text_a = tokenization.convert_to_unicode(split_line[1])
            text_b = None
            label = split_line[0]
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
            labels.append(label)
        return examples, labels
Example #35
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         # Skip the header row
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, i)
         # title = line[1]
         text_a = tokenization.convert_to_unicode(line[2])
         # text_a = tokenization.convert_to_unicode(line[0])
         # sdp = line[3]
         if set_type == "test":
             label = "false"
         else:
             label = line[4]
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
Example #36
  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    def process_text(s):
      for token in retain:
        s = s.replace(token, sep_replace[token][FLAGS.entity_sep])
      return s

    examples = []
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, i)
     
      if type(line[0]) != str:
        text_a = ' '
      else: 
        text_a = tokenization.convert_to_unicode(process_text(line[0]))
      if set_type == "test":
        label = 0
      else:
        label = int(line[1])
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
Example #37
    def get_dev_examples(self, data_dir):
        dev_files = ["{}.json".format(self.dev_fold)]
        examples = []
        data_dirs = data_dir.split(',')

        for file_name in dev_files:
            for data_dir in data_dirs:
                dev_file = open(os.path.join(data_dir, file_name))
                for i, line in enumerate(dev_file):
                    q_json_dict = json.loads(line)
                    qid = q_json_dict["qid"]
                    q_text = tokenization.convert_to_unicode(q_json_dict["query"])
                    for field in self.recall_fields:
                        if field not in q_json_dict["term_recall"]: continue
                        term_recall_dict = q_json_dict["term_recall"][field]

                        guid = "dev-%s" % qid
                        examples.append(
                            InputExample(guid=guid, text=q_text, term_recall_dict=term_recall_dict)
                        )
                dev_file.close()
        return examples
Example #38
    def read_examples(self, sentence_list):
        """Read a list of `InputExample`s from an input file."""
        examples = []
        unique_id = 0
        for line in sentence_list:
            line = tokenization.convert_to_unicode(line)

            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            examples.append(
                self.InputExample(unique_id=unique_id,
                                  text_a=text_a,
                                  text_b=text_b))
            unique_id += 1
        return examples
Example #39
    def __call__(self, line):
        """Perform transformation for sequence pairs or single sequences.

        The transformation is processed in the following steps:
        - tokenize the input sequences
        - insert [CLS], [SEP] as necessary
        - generate type ids to indicate whether a token belongs to the first
          sequence or the second sequence.
        - generate valid length

        For sequence pairs, the input is a tuple of 3 strings:
        text_a, text_b and label.

        Inputs:
            text_a: 'is this jacksonville ?'
            text_b: 'no it is not'
            label: '0'
        Tokenization:
            text_a: 'is this jack ##son ##ville ?'
            text_b: 'no it is not .'
        Processed:
            tokens:  '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
            type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            valid_length: 14
            label: 0

        For single sequences, the input is a tuple of 2 strings: text_a and label.
        Inputs:
            text_a: 'the dog is hairy .'
            label: '1'
        Tokenization:
            text_a: 'the dog is hairy .'
        Processed:
            text_a:  '[CLS] the dog is hairy . [SEP]'
            type_ids: 0     0   0   0  0     0 0
            valid_length: 7
            label: 1

        Parameters
        ----------
        line: tuple of str
            Input strings. For sequence pairs, the input is a tuple of 3 strings:
            (text_a, text_b, label). For single sequences, the input is a tuple
            of 2 strings: (text_a, label).

        Returns
        -------
        np.array: input token ids in 'int32', shape (batch_size, seq_length)
        np.array: valid length in 'int32', shape (batch_size,)
        np.array: input token type ids in 'int32', shape (batch_size, seq_length)
        np.array: label id in 'int32', shape (batch_size, 1)
        """
        # convert to unicode
        text_a = line[0]
        label = line[-1]
        text_a = convert_to_unicode(text_a)
        label = convert_to_unicode(label)
        if self._pair:
            assert len(line) == 3
            text_b = line[1]
            text_b = convert_to_unicode(text_b)

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = None

        if self._pair:
            tokens_b = self._tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, self._max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self._max_seq_length - 2:
                tokens_a = tokens_a[0:(self._max_seq_length - 2)]

        # The embedding vectors for `type=0` and `type=1` were learned during
        # pre-training and are added to the wordpiece embedding vector
        # (and position vector). This is not *strictly* necessary since
        # the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.

        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append('[CLS]')
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append('[SEP]')
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append('[SEP]')
            segment_ids.append(1)

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
        label_id = self._label_map[label]

        # The valid length of the sequence. Only real tokens are attended to.
        valid_length = len(input_ids)

        if self._pad:
            # Zero-pad up to the sequence length.
            padding_length = self._max_seq_length - valid_length
            input_ids.extend([0] * padding_length)
            segment_ids.extend([0] * padding_length)

        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
               np.array(segment_ids, dtype='int32'), np.array([label_id], dtype='int32')
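
The enclosing class is not shown above, so below is a rough standalone sketch of the same assembly steps the docstring describes (tokenize, truncate, insert [CLS]/[SEP], build type ids, record the valid length, zero-pad), using the BERT repo's FullTokenizer. The helper name and vocab path are illustrative, label handling is omitted, and the truncation is simplified relative to _truncate_seq_pair.

import tokenization  # the BERT repo's tokenization module

def encode_pair_sketch(text_a, text_b, tokenizer, max_seq_length=128):
    tokens_a = tokenizer.tokenize(tokenization.convert_to_unicode(text_a))
    tokens_b = tokenizer.tokenize(tokenization.convert_to_unicode(text_b))
    # Reserve room for [CLS] and two [SEP] tokens; drop from the longer side.
    while len(tokens_a) + len(tokens_b) > max_seq_length - 3:
        (tokens_a if len(tokens_a) > len(tokens_b) else tokens_b).pop()

    tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    valid_length = len(input_ids)  # number of real (non-padding) tokens
    input_ids += [0] * (max_seq_length - valid_length)
    segment_ids += [0] * (max_seq_length - valid_length)
    return input_ids, valid_length, segment_ids

tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt",  # illustrative path
                                       do_lower_case=True)
ids, n, seg = encode_pair_sketch('is this jacksonville ?', 'no it is not', tokenizer)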
Example #40
    def raw_preprocess(iterator):
        tokenizer = tokenization.FullTokenizer(vocab_file=_vocab_file,
                do_lower_case=_do_lower_case)
        while True:
            try:
                line_arr = next(iterator).strip().split("\001")
                #_id, source_str = line_arr 
                _id = line_arr[0]
                source_str = line_arr[2]
                if not source_str:
                    continue 
                source = tokenization.convert_to_unicode(source_str)
                if not source:
                    continue 
                text_a = None 
                text_b = None 
                m = re.match(r"^(.*) \|\|\| (.*)$", source.strip())
                if m is None:
                    text_a = source.strip()
                else:
                    text_a = m.group(1)
                    text_b = m.group(2)
                tokens_a = tokenizer.tokenize(text_a)
                tokens_b = None
                if text_b:
                    tokens_b = tokenizer.tokenize(text_b)
                if tokens_b:
                    _truncate_seq_pair(tokens_a, tokens_b, _seq_length - 3)
                else:
                    if len(tokens_a) > _seq_length - 2:
                        tokens_a = tokens_a[0: (_seq_length-2)]
                # The convention in BERT is:
                # (a) For sequence pairs:
                #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
                #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
                # (b) For single sequences:
                #  tokens:   [CLS] the dog is hairy . [SEP]
                #  type_ids: 0     0   0   0  0     0 0
                #
                # Where "type_ids" are used to indicate whether this is the first
                # sequence or the second sequence. The embedding vectors for `type=0` and
                # `type=1` were learned during pre-training and are added to the wordpiece
                # embedding vector (and position vector). This is not *strictly* necessary
                # since the [SEP] token unambiguously separates the sequences, but it makes
                # it easier for the model to learn the concept of sequences.
                #
                # For classification tasks, the first vector (corresponding to [CLS]) is
                # used as the "sentence vector". Note that this only makes sense because
                # the entire model is fine-tuned.
                tokens, input_type_ids = _encode_tokens(tokens_a, tokens_b)
                input_ids = tokenizer.convert_tokens_to_ids(tokens)
                # The mask has 1 for real tokens and 0 for padding tokens. Only real
                # tokens are attended to.
                input_mask = [1] * len(input_ids)

                # Zero-pad up to the sequence length.
                while len(input_ids) < _seq_length:
                    input_ids.append(0)
                    input_mask.append(0)
                    input_type_ids.append(0)
                    tokens.append("##NULL##")
                assert len(input_ids) == _seq_length
                assert len(input_mask) == _seq_length
                assert len(input_type_ids) == _seq_length
                assert len(tokens) == _seq_length

                encode_dict = {}
                encode_dict["_id"] = _id 
                encode_dict["tokens"] = tokens
                encode_dict["input_ids"] = input_ids
                encode_dict["input_mask"] = input_mask
                encode_dict["input_type_ids"] = input_type_ids

                yield encode_dict
            except StopIteration:
                print("stop")
                break
            except Exception:
                err = traceback.format_exc()
                print(err, file=sys.stderr)