Example #1
    def __init__(self, model_folder, max_length=256, lowercase=True):

        # 1. Create tokenizer
        self.max_length = max_length
        vocab_file = os.path.join(model_folder, 'vocab.txt')
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case=lowercase)

        # 2. Read Config
        config_file = os.path.join(model_folder, 'bert_config.json')
        self.config = BertConfig.from_json_file(config_file)

        # 3. Create Model
        self.session = tf.Session()
        self.token_ids_op = tf.placeholder(tf.int32,
                                           shape=(None, max_length),
                                           name='token_ids')
        self.model = BertModel(config=self.config,
                               is_training=False,
                               input_ids=self.token_ids_op,
                               use_one_hot_embeddings=False)

        # 4. Restore Trained Model
        self.saver = tf.train.Saver()
        ckpt_file = os.path.join(model_folder, 'bert_model.ckpt')
        # RCS ckpt_file = os.path.join(model_folder, 'model.ckpt-1000000')
        self.saver.restore(self.session, ckpt_file)

        hidden_layers = self.config.num_hidden_layers
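        # The encoder's layer outputs are exposed as Reshape ops in this graph;
        # index num_hidden_layers + 1 is taken to be the final layer's output.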
        self.embeddings_op = tf.get_default_graph().get_tensor_by_name(
            "bert/encoder/Reshape_{}:0".format(hidden_layers + 1))
Example #2
def parse_text(text):
    sentences = text.split('\n\n')

    all_pos = Counter()
    all_dep = Counter()
    all_path = Counter()
    all_vocab = Counter()

    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE,
                              do_lower_case=DO_LOWER_CASE)

    for sentence in sentences:
        token_sequence = []

        for token in sentence.split('\n'):
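            # Keep only lines long enough to hold the tab-separated fields read
            # below (columns 0, 2, 3 and 4: form, POS, dependency label, path).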
            if len(token) >= 8:
                token = token.split('\t')
                token_sequence.append(token)

        subwords = sum(
            [tokenizer.tokenize(item[0]) for item in token_sequence], [])
        all_vocab.update(subwords)
        all_pos.update([item[2] for item in token_sequence])
        all_dep.update([item[3] for item in token_sequence])
        all_path.update([item[4] for item in token_sequence])

    return all_pos, all_dep, all_path, all_vocab
Example #3
def bosonner(params, mode):
    tokenizer = FullTokenizer(vocab_file=params.vocab_file)

    boson_data = read_bosonnlp_data(
        file_pattern='data/ner/BosonNLP_NER_6C/BosonNLP*', eval_size=0.2)

    inputs_list = []
    target_list = []
    for data in [boson_data]:
        if mode == 'train':
            inputs_list += data['train']['inputs']
            target_list += data['train']['target']

        else:
            inputs_list += data['eval']['inputs']
            target_list += data['eval']['target']

    # BIO tag set over the four Boson entity types: LOC, PER, ORG, PRD.
    flat_target_list = ['O',
                        'B-LOC',
                        'B-PER',
                        'B-ORG',
                        'B-PRD',
                        'I-LOC',
                        'I-PER',
                        'I-ORG',
                        'I-PRD']
    label_encoder = get_or_make_label_encoder(
        params, 'bosonner', mode, flat_target_list, zero_class='O')
    return create_single_problem_generator('bosonner',
                                           inputs_list,
                                           target_list,
                                           label_encoder,
                                           params,
                                           tokenizer)
Example #4
def prepare_training_data(input_data_dir, output_data_dir, input_filename,
                          output_filename, language, config, vocab_file,
                          sliding_window_size, demo=False):

    tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    writer = tf.python_io.TFRecordWriter(
        os.path.join(output_data_dir,
                     "{}.{}.tfrecord".format(output_filename, language)))

    data_file_path = os.path.join(input_data_dir, input_filename)
    with open(data_file_path, "r") as f:
        documents = [json.loads(jsonline) for jsonline in f.readlines()]
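    # The input file is in jsonlines format: one JSON document per line.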
    doc_map = {}
    for doc_idx, document in enumerate(documents):
        doc_key = document["doc_key"]
        tensorized = tensorize_example(document,
                                       config,
                                       tokenizer,
                                       is_training=True)
        if not isinstance(tensorized, tuple):
            tensorized = tuple(tensorized)
        write_instance_to_example_file(writer, tensorized, doc_key, config)
        doc_map[doc_idx] = doc_key
        if demo and doc_idx > 5:
            break
    with open(
            os.path.join(output_data_dir,
                         "{}.{}.map".format(output_filename, language)),
            'w') as fo:
        json.dump(doc_map, fo, indent=2)
Example #5
 def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
              test_corpus_fname=None, tokenized_test_corpus_fname=None,
              model_name="bert", model_save_path=None, vocab_fname=None, eval_every=1000,
              batch_size=32, num_epochs=10, dropout_keep_prob_rate=0.9, model_ckpt_path=None,
              sp_model_path=None):
     # configurations
     tf.logging.set_verbosity(tf.logging.INFO)
     self.model_name = model_name
     self.eval_every = eval_every
     self.model_ckpt_path = model_ckpt_path
     self.model_save_path = model_save_path
     self.batch_size = batch_size
     self.num_epochs = num_epochs
     self.dropout_keep_prob_rate = dropout_keep_prob_rate
     self.best_valid_score = 0.0
     if not os.path.exists(model_save_path):
         os.mkdir(model_save_path)
     # define tokenizer
     if self.model_name == "bert":
         self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
     elif self.model_name == "xlnet":
         sp = spm.SentencePieceProcessor()
         sp.Load(sp_model_path)
         self.tokenizer = sp
     else:
         self.tokenizer = get_tokenizer("mecab")
     # load or tokenize corpus
     self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
     self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)
Example #6
def main(unused_argv):
  tokenizer = FullTokenizer(FLAGS.tokenizer_vocabulary)

  print('Loading ' + str(FLAGS.dataset_name) + ' dataset from ' +
        FLAGS.input_filepath)

  # The debugging file saves all of the processed SQL queries.
  debugging_file = gfile.Open(
      os.path.join('/'.join(FLAGS.output_filepath.split('/')[:-1]),
                   FLAGS.dataset_name + '_'.join(FLAGS.splits) + '_gold.txt'),
      'w')

  # The output file will save a sequence of string-serialized JSON objects, one
  # line per object.
  output_file = gfile.Open(os.path.join(FLAGS.output_filepath), 'w')

  if FLAGS.dataset_name.lower() == 'spider':
    num_examples_created, num_examples_failed = process_spider(
        output_file, debugging_file, tokenizer)
  elif FLAGS.dataset_name.lower() == 'wikisql':
    num_examples_created, num_examples_failed = process_wikisql(
        output_file, debugging_file, tokenizer)
  else:
    num_examples_created, num_examples_failed = process_michigan_datasets(
        output_file, debugging_file, tokenizer)

  print('Wrote %s examples, could not annotate %s examples.' %
        (num_examples_created, num_examples_failed))
  debugging_file.write('Wrote %s examples, could not annotate %s examples.' %
                       (num_examples_created, num_examples_failed))
  debugging_file.close()
  output_file.close()
Example #7
def getDataset():
    data_path = "./"
    train = sp.get_train_examples(data_path)
    dev = sp.get_dev_examples(data_path)
    test = sp.get_test_examples(data_path)
    from bert.tokenization import FullTokenizer

    tokenizer = FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"))
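    # Convert each split's examples into BERT input features with the shared tokenizer.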
    train_feas = []
    for example in train:
        fea = convert_single_example(example.guid, example, sp.get_labels(),
                                     max_seq_len, tokenizer)
        train_feas.append(fea)

    dev_feas = []
    for example in dev:
        fea = convert_single_example(example.guid, example, sp.get_labels(),
                                     max_seq_len, tokenizer)
        dev_feas.append(fea)

    test_feas = []
    for example in test:
        fea = convert_single_example(example.guid, example, sp.get_labels(),
                                     max_seq_len, tokenizer)
        test_feas.append(fea)
    return train_feas, dev_feas, test_feas
Example #8
 def __init__(self, train_corpus_fname=None,
              tokenized_train_corpus_fname=None,
              test_corpus_fname=None, tokenized_test_corpus_fname=None,
              model_name='bert', model_save_path=None, vocab_fname=None,
              eval_every=1000, batch_size=32, num_epochs=10,
              dropout_keep_prob_rate=0.9, model_ckpt_path=None):

     self.model_name = model_name
     self.eval_every = eval_every
     self.model_ckpt_path = model_ckpt_path
     self.model_save_path = model_save_path
     self.batch_size = batch_size
     self.num_epochs = num_epochs
     self.dropout_keep_prob_rate = dropout_keep_prob_rate
     self.best_valid_score = 0.0
     
     # define tokenizer
     if self.model_name == 'bert':
         self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
     else:
         self.tokenizer = get_tokenizer('mecab')

     # load or tokenize corpus
     self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
     self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)
Example #9
def prepare_train_dataset(input_file,
                          output_data_dir,
                          output_filename,
                          sliding_window_size,
                          config,
                          tokenizer=None,
                          vocab_file=None,
                          language="english",
                          max_doc_length: int = None,
                          is_training=True,
                          demo=False,
                          lowercase=False):
    if vocab_file is None:
        if not lowercase:
            vocab_file = os.path.join(REPO_PATH, "data_utils",
                                      "uppercase_vocab.txt")
        else:
            vocab_file = os.path.join(REPO_PATH, "data_utils",
                                      "lowercase_vocab.txt")

    if tokenizer is None:
        tokenizer = FullTokenizer(vocab_file=vocab_file,
                                  do_lower_case=lowercase)

    writer = tf.python_io.TFRecordWriter(
        os.path.join(output_data_dir,
                     "{}.{}.tfrecord".format(output_filename, language)))
    doc_map = {}
    documents = read_conll_file(input_file)
    for doc_idx, document in enumerate(documents):
        doc_info = parse_document(document, language)
        tokenized_document = tokenize_document(config,
                                               doc_info,
                                               tokenizer,
                                               max_doc_length=max_doc_length)
        doc_key = tokenized_document['doc_key']
        token_windows, mask_windows, text_len = convert_to_sliding_window(
            tokenized_document, sliding_window_size)
        input_id_windows = [
            tokenizer.convert_tokens_to_ids(tokens) for tokens in token_windows
        ]
        span_start, span_end, mention_span, cluster_ids = flatten_clusters(
            tokenized_document['clusters'])

        # tokenized_document keys: 'sub_tokens', 'sentence_map', 'subtoken_map',
        # 'speakers', 'clusters', 'doc_key'
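        # The document's real speaker ids are not used here; a zero-filled
        # placeholder of shape [max_training_sentences, 130] stands in for them.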
        tmp_speaker_ids = [[0] * 130] * config["max_training_sentences"]
        instance = (input_id_windows, mask_windows, text_len, tmp_speaker_ids,
                    tokenized_document["genre"], is_training, span_start,
                    span_end, cluster_ids, tokenized_document['sentence_map'])
        write_instance_to_example_file(writer, instance, doc_key, config)
        doc_map[doc_idx] = doc_key
        if demo and doc_idx > 3:
            break
    with open(
            os.path.join(output_data_dir,
                         "{}.{}.map".format(output_filename, language)),
            'w') as fo:
        json.dump(doc_map, fo, indent=2)
Example #10
def WeiboPretrain(params, mode):

    sentence_split = r'[.!?。?!]'

    tokenizer = FullTokenizer(vocab_file=params.vocab_file)
    data = read_ner_data(file_pattern='data/ner/weiboNER*',
                         proc_fn=gold_horse_segment_process_fn)
    if mode == 'train':
        data = data['train']
    else:
        data = data['eval']
    inputs_list = data['inputs']

    segmented_list = []
    for document in inputs_list:
        segmented_list.append([])
        doc_string = ''.join(document)
        split_doc = re.split(sentence_split, doc_string)
        for sentence in split_doc:
            if sentence:
                segmented_list[-1].append(list(sentence))
    segmented_list = [doc for doc in segmented_list if doc]

    return create_pretraining_generator('WeiboPretrain', segmented_list, None,
                                        None, params, tokenizer)
Example #11
def main(unused_argv):
    tokenizer = FullTokenizer(FLAGS.tokenizer_vocabulary)

    print("Loading " + str(FLAGS.dataset_name) + " dataset from " +
          FLAGS.input_filepath)

    # The debugging file saves all of the processed SQL queries.
    debugging_file = open(
        os.path.join(
            "/".join(FLAGS.output_filepath.split("/")[:-1]),
            FLAGS.dataset_name + "_".join(FLAGS.splits) + "_gold.txt",
        ),
        "w",
    )

    # The output file will save a sequence of string-serialized JSON objects, one
    # line per object.
    output_file = open(os.path.join(FLAGS.output_filepath), "w")

    if FLAGS.dataset_name.lower() == "spider":
        num_examples_created, num_examples_failed = process_spider(
            output_file, debugging_file, tokenizer)
    elif FLAGS.dataset_name.lower() == "wikisql":
        num_examples_created, num_examples_failed = process_wikisql(
            output_file, debugging_file, tokenizer)
    else:
        num_examples_created, num_examples_failed = process_michigan_datasets(
            output_file, debugging_file, tokenizer)

    print("Wrote %s examples, could not annotate %s examples." %
          (num_examples_created, num_examples_failed))
    debugging_file.write("Wrote %s examples, could not annotate %s examples." %
                         (num_examples_created, num_examples_failed))
    debugging_file.close()
    output_file.close()
Example #12
def predict_input_fn_generator(input_file_or_list,
                               config: Params,
                               mode='predict'):
    # if is string, treat it as path to file
    if isinstance(input_file_or_list, str):
        inputs = open(input_file_or_list, 'r', encoding='utf8').readlines()
    else:
        inputs = input_file_or_list

    tokenizer = FullTokenizer(config.vocab_file)

    data_dict = {}
    # One feature dict is filled and yielded per document below: tokenize,
    # truncate, add special tokens, pad, and convert tokens to ids.

    for doc in inputs:

        inputs_a = list(doc)
        tokens, target = tokenize_text_with_seqs(tokenizer, inputs_a, None)

        tokens_a, tokens_b, target = truncate_seq_pair(tokens, None, target,
                                                       config.max_seq_len)

        tokens, segment_ids, target = add_special_tokens_with_seqs(
            tokens_a, tokens_b, target)

        input_mask, tokens, segment_ids, target = create_mask_and_padding(
            tokens, segment_ids, target, config.max_seq_len)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        data_dict['input_ids'] = input_ids
        data_dict['input_mask'] = input_mask
        data_dict['segment_ids'] = segment_ids
        yield data_dict
Example #13
    def __init__(self):
        bert_pretrained_dir = args.pretrain_models_path + args.bert_model_name
        self.do_lower_case = args.bert_model_name.startswith('uncased')
        self.vocab_file = os.path.join(bert_pretrained_dir, 'vocab.txt')
        self.config_file = os.path.join(bert_pretrained_dir,
                                        'bert_config.json')
        self.tokenizer = FullTokenizer(vocab_file=self.vocab_file,
                                       do_lower_case=self.do_lower_case)

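        # Placeholders for the standard BERT inputs; batch and sequence
        # dimensions are both left dynamic.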
        self.input_id = tf.placeholder(tf.int64, [None, None], 'input_ids')
        self.input_mask = tf.placeholder(tf.int64, [None, None], 'input_mask')
        self.segment_ids = tf.placeholder(tf.int64, [None, None],
                                          'segment_ids')

        bert_config = BertConfig.from_json_file(self.config_file)
        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=self.input_id,
                          input_mask=self.input_mask,
                          token_type_ids=self.segment_ids,
                          use_one_hot_embeddings=True,
                          scope='bert')
        self.output_layer = model.get_sequence_output()
        self.embedding_layer = model.get_embedding_output()

        saver = tf.train.Saver()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        saver.restore(self.session, bert_pretrained_dir + '/bert_model.ckpt')
Example #14
def get_sentiment(text):

    try:
        model = create_model(MAX_SEQ_LEN, adapter_size=None)
        model.load_weights("./model_trained.h5")
    except Exception:
        return "Cannot create model"

    pred_sentences = [str(text)]

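    # Tokenize, add [CLS]/[SEP], convert to ids and right-pad every sequence to MAX_SEQ_LEN.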
    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE)
    pred_tokens = map(tokenizer.tokenize, pred_sentences)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))

    pred_token_ids = map(lambda tids: tids + [0] * (MAX_SEQ_LEN - len(tids)),
                         pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))

    print('pred_token_ids', pred_token_ids.shape)

    res = model.predict(pred_token_ids).argmax(axis=-1)

    # Only one sentence is scored, so the first (and only) prediction is returned.
    for text, sentiment in zip(pred_sentences, res):
        return ["negative", "positive"][sentiment]
Example #15
def prepare_training_data(data_dir: str, language: str, vocab_file: str,
                          sliding_window_size: int):
    tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    for dataset in ['train', 'dev', 'test']:
        conll_file_path = os.path.join(data_dir,
                                       F"{dataset}.{language}.v4_gold_conll")
        writer = tf.python_io.TFRecordWriter(
            os.path.join(data_dir, F"{dataset}.{language}.tfrecord"))
        doc_map = {}
        documents = read_conll_file(conll_file_path)
        for doc_idx, document in enumerate(documents):
            doc_info = parse_document(document, language)
            checkout_clusters(doc_info)
            tokenized_document = tokenize_document(doc_info, tokenizer)
            doc_map[doc_idx] = tokenized_document['doc_key']
            token_windows, mask_windows = convert_to_sliding_window(
                tokenized_document, sliding_window_size)
            input_id_windows = [
                tokenizer.convert_tokens_to_ids(tokens)
                for tokens in token_windows
            ]
            span_starts, span_ends, cluster_ids = flatten_clusters(
                tokenized_document['clusters'])
            instance = (doc_idx, tokenized_document['sentence_map'],
                        tokenized_document['subtoken_map'], input_id_windows,
                        mask_windows, span_starts, span_ends, cluster_ids)
            write_instance_to_example_file(writer, instance)
        with open(os.path.join(data_dir, F"{dataset}.{language}.map"),
                  'w') as fo:
            json.dump(doc_map, fo, indent=2)
Example #16
    def __init__(self, **kwargs):
        self.tf = import_tf(kwargs['gpu_no'], kwargs['verbose'])
        self.logger = set_logger('BertNer', kwargs['log_dir'],
                                 kwargs['verbose'])
        self.model_dir = kwargs['ner_model']

        from bert.tokenization import FullTokenizer
        self.tokenizer = FullTokenizer(
            os.path.join(self.model_dir, 'vocab.txt'))

        self.ner_sq_len = 128
        self.input_ids = self.tf.placeholder(self.tf.int32,
                                             (None, self.ner_sq_len),
                                             'input_ids')
        self.input_mask = self.tf.placeholder(self.tf.int32,
                                              (None, self.ner_sq_len),
                                              'input_mask')

        # init graph
        self._init_graph()

        # init ner assist data
        self._init_predict_var()

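        # Heavenly-stem and earthly-branch characters, presumably used as
        # single-character person placeholders in downstream NER post-processing.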
        self.per_proun = [
            '甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸', '子', '丑', '寅',
            '卯', '辰', '巳', '午', '未', '申', '酉', '戌', '亥'
        ]
Example #17
    def __init__(self, tf_hub_url: str, max_sequence_length: int = _DEFAULT_MAX_SEQUENCE_LENGTH) -> None:
        self._graph = tf.Graph()
        self._session = None

        # Initialize the BERT model
        with tf.Session(graph=self._graph) as session:
            # Download module from tf-hub
            bert_module = hub.Module(tf_hub_url)

            # Get the tokenizer from the module
            tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
            self._vocab_file, self._do_lower_case = session.run([tokenization_info["vocab_file"], tokenization_info["do_lower_case"]])
            self._vocab_file = self._vocab_file.decode("UTF-8")
            self._do_lower_case = bool(self._do_lower_case)
            self._tokenizer = FullTokenizer(vocab_file=self._vocab_file, do_lower_case=self._do_lower_case)

            # Create symbolic input tensors as inputs to the model
            self._input_ids = tf.placeholder(name="input_ids", shape=(None, max_sequence_length), dtype=tf.int32)
            self._input_mask = tf.placeholder(name="input_mask", shape=(None, max_sequence_length), dtype=tf.int32)
            self._segment_ids = tf.placeholder(name="segment_ids", shape=(None, max_sequence_length), dtype=tf.int32)

            # Get the symbolic output tensors
            self._outputs = bert_module({
                "input_ids": self._input_ids,
                "input_mask": self._input_mask,
                "segment_ids": self._segment_ids
            }, signature="tokens", as_dict=True)
Example #18
def WeiboFakeCLS(params, mode):
    """Just a test problem to test multiproblem support

    Arguments:
        params {Params} -- params
        mode {mode} -- mode
    """
    tokenizer = FullTokenizer(vocab_file=params.vocab_file)
    data = read_ner_data(file_pattern='data/ner/weiboNER*',
                         proc_fn=gold_horse_ent_type_process_fn)
    if mode == 'train':
        data = data['train']
    else:
        data = data['eval']
    inputs_list = data['inputs']
    target_list = data['target']

    new_target_list = [1 if len(set(t)) > 1 else 0 for t in target_list]

    label_encoder = get_or_make_label_encoder('WeiboFakeCLS', mode,
                                              new_target_list, 'O')

    return create_single_problem_generator('WeiboFakeCLS', inputs_list,
                                           new_target_list, label_encoder,
                                           params, tokenizer)
Example #19
  def __init__(self, checkpoint, attr_values_file, vocab_file):
    self.checkpoint = checkpoint
    self.attr_values_file = attr_values_file
    self.vocab_file = vocab_file
    if not os.path.exists(self.checkpoint):
      raise Exception("local checkpoint %s does not exist" % self.checkpoint)
    if not os.path.exists(self.attr_values_file):
      raise Exception("local attr_values_file %s does not exist" % self.attr_values_file)
    if not os.path.exists(self.vocab_file):
      raise Exception("local vocab_file %s does not exist" % self.vocab_file)
    self.config = InferConfig()
    self.tokenizer = FullTokenizer(self.vocab_file)
    with open(self.attr_values_file, 'rb') as fr:
      attr_values, attr_values_r = pickle.load(fr)
    self.attr_values_r = attr_values_r
    self.config.output_dim = len(attr_values_r)

    self.graph = tf.Graph()
    with self.graph.as_default():
      self.input_ids_p = tf.placeholder(tf.int32, [None, self.config.max_seq_length])
      self.token_type_ids_p = tf.placeholder(tf.int32, [None, self.config.max_seq_length])
      self.input_mask_p = tf.placeholder(tf.int32, [None, self.config.max_seq_length])
      model = Model(self.config)
      self.inference = model.infer(self.input_ids_p, self.token_type_ids_p, self.input_mask_p)
      ckpt_state = tf.train.get_checkpoint_state(self.checkpoint)
      if not (ckpt_state and ckpt_state.model_checkpoint_path):
        raise Exception('No model to eval yet at: ' + self.checkpoint)
      self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
      saver = tf.train.Saver()
      saver.restore(self.sess, ckpt_state.model_checkpoint_path)
Example #20
def POS(params, mode):
    tokenizer = FullTokenizer(vocab_file=params.vocab_file)

    input_list, target_list = read_ctbpos()

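    # Deterministic 80/20 split; the fixed random_state keeps the train and
    # eval partitions consistent across calls.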
    if mode == 'train':
        input_list, _, target_list, _ = train_test_split(input_list,
                                                         target_list,
                                                         test_size=0.2,
                                                         random_state=3721)
    else:
        _, input_list, _, target_list = train_test_split(input_list,
                                                         target_list,
                                                         test_size=0.2,
                                                         random_state=3721)

    flat_target_list = [item for sublist in target_list for item in sublist]

    label_encoder = get_or_make_label_encoder(params,
                                              'POS',
                                              mode,
                                              flat_target_list,
                                              zero_class='[PAD]')
    return create_single_problem_generator('POS', input_list, target_list,
                                           label_encoder, params, tokenizer)
Example #21
def create_tokenizer_from_hub_module(bert_path):
    """Get the vocab file and casing info from the Hub module."""
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info",
                                    as_dict=True)
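    # `sess` is assumed to be a tf.Session created in the enclosing scope.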
    vocab_file, do_lower_case = sess.run(
        [tokenization_info["vocab_file"], tokenization_info["do_lower_case"]])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
Example #22
def create_tokenizer_from_hub_module(bert_path):
    """Get the vocab file and casing info from the Hub module."""
    bert_layer = hub.KerasLayer(bert_path, trainable=False)
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = FullTokenizer(vocab_file, do_lower_case)

    return tokenizer, bert_layer
Example #23
def bert_tokenize(vocab_fname, corpus_fname, output_fname):
    tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            tokens = tokenizer.tokenize(convert_to_unicode(sentence))
            tokenized_sent = ' '.join(tokens)
            f2.writelines(tokenized_sent + '\n')
Example #24
 def __init__(self, config, category_dir, vocab_file):
     self.config = config
     self.category_dir = category_dir
     self.tokenizer = FullTokenizer(vocab_file)
     if not os.path.exists(
             os.path.join(self.category_dir, 'train_data', 'raw.csv')):
         raise Exception("local raw train data does not exist")
     if not os.path.exists(vocab_file):
         raise Exception("local vocab_file does not exist")
Example #25
def create_tokenizer_from_hub_module():
    bert_module = hub.Module("https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run([
        tokenization_info["vocab_file"],
        tokenization_info["do_lower_case"],
    ])

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
Example #26
def create_tokenizer_from_hub_module():
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info",
                                    as_dict=True)
    vocab_file, do_lower_case = sess.run([
        tokenization_info["vocab_file"],
        tokenization_info["do_lower_case"],
    ])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
Example #27
def tokenize_bert():
    train_config = get_config()
    bert_config = get_bert_config(train_config)
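    # Infer casing from the pretrained model directory name (uncased BERT
    # releases are conventionally named 'uncased_...').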
    uncased = train_config.BERT_DIR.split('/')[-1].startswith('uncased')
    tokenizer = FullTokenizer(bert_config.vocab, do_lower_case=uncased)
    text, _ = load_data(os.path.join(train_config.DATA_DIR, 'train.csv'))
    tok_text = tokenize_examples(text, tokenizer, max_len=512)
    import pickle
    pickle.dump(tok_text, open('tok_text_uncased.pkl', 'wb'))
Example #28
def CTBCWS(params, mode):
    tokenizer = FullTokenizer(vocab_file=params.vocab_file)
    file_list = glob.glob('data/ctb8.0/data/segmented/*')

    input_list = []
    target_list = []

    # Precompute character-level segmentation tags for fast lookup: a word of
    # length n maps to 's' (single) or 'b' + 'm' * (n - 2) + 'e'.
    possible_tags = []
    for i in range(1, 300):
        if i == 1:
            possible_tags.append('s')
        else:
            possible_tags.append('b' + 'm' * (i - 2) + 'e')

    for file_path in file_list:
        with open(file_path, 'r', encoding='utf8') as f:
            raw_doc_list = f.readlines()
        text_row_ind = [
            i + 1 for i, text in enumerate(raw_doc_list) if '<S ID=' in text
        ]

        sentence_list = [
            text for i, text in enumerate(raw_doc_list) if i in text_row_ind
        ]

        for sentence in sentence_list:
            input_list.append([])
            target_list.append([])
            for word in sentence.split():
                if word and len(word) <= 299:
                    tag = possible_tags[len(word) - 1]
                    input_list[-1] += list(word)
                    target_list[-1] += list(tag)
                else:
                    continue

    if mode == 'train':
        input_list, _, target_list, _ = train_test_split(input_list,
                                                         target_list,
                                                         test_size=0.2,
                                                         random_state=3721)
    else:
        _, input_list, _, target_list = train_test_split(input_list,
                                                         target_list,
                                                         test_size=0.2,
                                                         random_state=3721)

    flat_target_list = [item for sublist in target_list for item in sublist]

    label_encoder = get_or_make_label_encoder('CTBCWS',
                                              mode,
                                              flat_target_list,
                                              zero_class='[PAD]')
    return create_single_problem_generator('CTBCWS', input_list, target_list,
                                           label_encoder, params, tokenizer)
Example #29
def create_tokenizer_from_hub_module(sess):
    """Get the vocab file and casing info from the Hub module."""
    # tf.compat.v1.disable_eager_execution()
    bert_module = hub.Module("https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [tokenization_info["vocab_file"], tokenization_info["do_lower_case"],]
    )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
Example #30
    def create_tokenizer_from_hub_module(self, is_bert):
        """Get the vocab file and casing info from the Hub module."""
        bert_module = hub.Module(self.bert_model_hub_path)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        vocab_file, do_lower_case = self.sess.run([
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ])

        if is_bert:
            from bert.tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            from vectorizers.albert_tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case,
                                           spm_model_file=vocab_file)