def __init__(self, model_folder, max_length=256, lowercase=True):
    # 1. Create tokenizer
    self.max_length = max_length
    vocab_file = os.path.join(model_folder, 'vocab.txt')
    self.tokenizer = FullTokenizer(vocab_file, do_lower_case=lowercase)
    # 2. Read config
    config_file = os.path.join(model_folder, 'bert_config.json')
    self.config = BertConfig.from_json_file(config_file)
    # 3. Create model
    self.session = tf.Session()
    self.token_ids_op = tf.placeholder(tf.int32, shape=(None, max_length),
                                       name='token_ids')
    self.model = BertModel(config=self.config,
                           is_training=False,
                           input_ids=self.token_ids_op,
                           use_one_hot_embeddings=False)
    # 4. Restore trained model
    self.saver = tf.train.Saver()
    ckpt_file = os.path.join(model_folder, 'bert_model.ckpt')
    # RCS ckpt_file = os.path.join(model_folder, 'model.ckpt-1000000')
    self.saver.restore(self.session, ckpt_file)
    hidden_layers = self.config.num_hidden_layers
    self.embeddings_op = tf.get_default_graph().get_tensor_by_name(
        "bert/encoder/Reshape_{}:0".format(hidden_layers + 1))
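# A minimal usage sketch for the constructor above. Assumptions: the __init__
# belongs to a class we call BertEmbedder here (the name is hypothetical), and
# the model folder path is illustrative. Sequences must fit within max_length.
import numpy as np

embedder = BertEmbedder('models/uncased_L-12_H-768_A-12', max_length=128)
tokens = ['[CLS]'] + embedder.tokenizer.tokenize('hello world') + ['[SEP]']
token_ids = embedder.tokenizer.convert_tokens_to_ids(tokens)
token_ids += [0] * (embedder.max_length - len(token_ids))  # pad to max_length
embeddings = embedder.session.run(
    embedder.embeddings_op,
    feed_dict={embedder.token_ids_op: np.array([token_ids])})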
def parse_text(text):
    sentences = text.split('\n\n')
    all_pos = Counter()
    all_dep = Counter()
    all_path = Counter()
    all_vocab = Counter()
    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE,
                              do_lower_case=DO_LOWER_CASE)
    for sentence in sentences:
        token_sequence = []
        for token in sentence.split('\n'):
            # skip blank or too-short lines before splitting into columns
            if len(token) >= 8:
                token = token.split('\t')
                token_sequence.append(token)
        subwords = sum(
            [tokenizer.tokenize(item[0]) for item in token_sequence], [])
        all_vocab.update(subwords)
        all_pos.update([item[2] for item in token_sequence])
        all_dep.update([item[3] for item in token_sequence])
        all_path.update([item[4] for item in token_sequence])
    return all_pos, all_dep, all_path, all_vocab
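# Sketch of the input parse_text expects: sentence blocks separated by blank
# lines, one tab-separated token per line. Assumptions: columns 0/2/3/4 hold
# token, POS, dependency relation, and path (the exact column layout is an
# inference from the indices above), and VOCAB_FILE / DO_LOWER_CASE are
# module-level constants pointing at a real vocabulary.
sample = (
    "The\t_\tDET\tdet\tdet|nsubj\n"
    "cat\t_\tNOUN\tnsubj\tnsubj\n"
    "sleeps\t_\tVERB\troot\troot\n"
    "\n"
)
pos_counts, dep_counts, path_counts, vocab_counts = parse_text(sample)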
def bosonner(params, mode):
    tokenizer = FullTokenizer(vocab_file=params.vocab_file)
    boson_data = read_bosonnlp_data(
        file_pattern='data/ner/BosonNLP_NER_6C/BosonNLP*', eval_size=0.2)
    inputs_list = []
    target_list = []
    for data in [boson_data]:
        if mode == 'train':
            inputs_list += data['train']['inputs']
            target_list += data['train']['target']
        else:
            inputs_list += data['eval']['inputs']
            target_list += data['eval']['target']

    flat_target_list = ['O',
                        'B-LOC', 'B-PER', 'B-ORG', 'B-PRD',
                        'I-LOC', 'I-PER', 'I-ORG', 'I-PRD']
    label_encoder = get_or_make_label_encoder(
        params, 'bosonner', mode, flat_target_list, zero_class='O')
    return create_single_problem_generator('bosonner',
                                           inputs_list,
                                           target_list,
                                           label_encoder,
                                           params,
                                           tokenizer)
def prepare_training_data(input_data_dir, output_data_dir, input_filename,
                          output_filename, language, config,
                          vocab_file, sliding_window_size, demo=False):
    tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    writer = tf.python_io.TFRecordWriter(
        os.path.join(output_data_dir,
                     "{}.{}.tfrecord".format(output_filename, language)))
    data_file_path = os.path.join(input_data_dir, input_filename)
    with open(data_file_path, "r") as f:
        documents = [json.loads(jsonline) for jsonline in f.readlines()]
    doc_map = {}
    for doc_idx, document in enumerate(documents):
        doc_key = document["doc_key"]
        tensorized = tensorize_example(document, config, tokenizer,
                                       is_training=True)
        if type(tensorized) is not tuple:
            tensorized = tuple(tensorized)
        write_instance_to_example_file(writer, tensorized, doc_key, config)
        doc_map[doc_idx] = doc_key
        if demo and doc_idx > 5:
            break
    with open(
            os.path.join(output_data_dir,
                         "{}.{}.map".format(output_filename, language)),
            'w') as fo:
        json.dump(doc_map, fo, indent=2)
def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
             test_corpus_fname=None, tokenized_test_corpus_fname=None,
             model_name="bert", model_save_path=None, vocab_fname=None,
             eval_every=1000, batch_size=32, num_epochs=10,
             dropout_keep_prob_rate=0.9, model_ckpt_path=None,
             sp_model_path=None):
    # configurations
    tf.logging.set_verbosity(tf.logging.INFO)
    self.model_name = model_name
    self.eval_every = eval_every
    self.model_ckpt_path = model_ckpt_path
    self.model_save_path = model_save_path
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.dropout_keep_prob_rate = dropout_keep_prob_rate
    self.best_valid_score = 0.0
    if not os.path.exists(model_save_path):
        os.mkdir(model_save_path)
    # define tokenizer
    if self.model_name == "bert":
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname,
                                       do_lower_case=False)
    elif self.model_name == "xlnet":
        sp = spm.SentencePieceProcessor()
        sp.Load(sp_model_path)
        self.tokenizer = sp
    else:
        self.tokenizer = get_tokenizer("mecab")
    # load or tokenize corpus
    self.train_data, self.train_data_size = self.load_or_tokenize_corpus(
        train_corpus_fname, tokenized_train_corpus_fname)
    self.test_data, self.test_data_size = self.load_or_tokenize_corpus(
        test_corpus_fname, tokenized_test_corpus_fname)
def main(unused_argv):
    tokenizer = FullTokenizer(FLAGS.tokenizer_vocabulary)
    print('Loading ' + str(FLAGS.dataset_name) + ' dataset from ' +
          FLAGS.input_filepath)
    # The debugging file saves all of the processed SQL queries.
    debugging_file = gfile.Open(
        os.path.join('/'.join(FLAGS.output_filepath.split('/')[:-1]),
                     FLAGS.dataset_name + '_'.join(FLAGS.splits) +
                     '_gold.txt'),
        'w')
    # The output file will save a sequence of string-serialized JSON objects,
    # one line per object.
    output_file = gfile.Open(os.path.join(FLAGS.output_filepath), 'w')
    if FLAGS.dataset_name.lower() == 'spider':
        num_examples_created, num_examples_failed = process_spider(
            output_file, debugging_file, tokenizer)
    elif FLAGS.dataset_name.lower() == 'wikisql':
        num_examples_created, num_examples_failed = process_wikisql(
            output_file, debugging_file, tokenizer)
    else:
        num_examples_created, num_examples_failed = process_michigan_datasets(
            output_file, debugging_file, tokenizer)
    print('Wrote %s examples, could not annotate %s examples.' %
          (num_examples_created, num_examples_failed))
    debugging_file.write(
        'Wrote %s examples, could not annotate %s examples.' %
        (num_examples_created, num_examples_failed))
    debugging_file.close()
    output_file.close()
def getDataset():
    data_path = "./"
    train = sp.get_train_examples(data_path)
    dev = sp.get_dev_examples(data_path)
    test = sp.get_test_examples(data_path)

    from bert.tokenization import FullTokenizer
    tokenizer = FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"))

    train_feas = []
    for example in train:
        fea = convert_single_example(example.guid, example, sp.get_labels(),
                                     max_seq_len, tokenizer)
        train_feas.append(fea)
    dev_feas = []
    for example in dev:
        fea = convert_single_example(example.guid, example, sp.get_labels(),
                                     max_seq_len, tokenizer)
        dev_feas.append(fea)
    test_feas = []
    for example in test:
        fea = convert_single_example(example.guid, example, sp.get_labels(),
                                     max_seq_len, tokenizer)
        test_feas.append(fea)
    return train_feas, dev_feas, test_feas
def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
             test_corpus_fname=None, tokenized_test_corpus_fname=None,
             model_name='bert', model_save_path=None, vocab_fname=None,
             eval_every=1000, batch_size=32, num_epochs=10,
             dropout_keep_prob_rate=0.9, model_ckpt_path=None):
    self.model_name = model_name
    self.eval_every = eval_every
    self.model_ckpt_path = model_ckpt_path
    self.model_save_path = model_save_path
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.dropout_keep_prob_rate = dropout_keep_prob_rate
    self.best_valid_score = 0.0
    # define tokenizer
    if self.model_name == 'bert':
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname,
                                       do_lower_case=False)
    else:
        self.tokenizer = get_tokenizer('mecab')
    # load or tokenize corpus
    self.train_data, self.train_data_size = self.load_or_tokenize_corpus(
        train_corpus_fname, tokenized_train_corpus_fname)
    self.test_data, self.test_data_size = self.load_or_tokenize_corpus(
        test_corpus_fname, tokenized_test_corpus_fname)
def prepare_train_dataset(input_file, output_data_dir, output_filename,
                          sliding_window_size, config, tokenizer=None,
                          vocab_file=None, language="english",
                          max_doc_length: int = None, is_training=True,
                          demo=False, lowercase=False):
    if vocab_file is None:
        if not lowercase:
            vocab_file = os.path.join(REPO_PATH, "data_utils",
                                      "uppercase_vocab.txt")
        else:
            vocab_file = os.path.join(REPO_PATH, "data_utils",
                                      "lowercase_vocab.txt")
    if tokenizer is None:
        tokenizer = FullTokenizer(vocab_file=vocab_file,
                                  do_lower_case=lowercase)
    writer = tf.python_io.TFRecordWriter(
        os.path.join(output_data_dir,
                     "{}.{}.tfrecord".format(output_filename, language)))
    doc_map = {}
    documents = read_conll_file(input_file)
    for doc_idx, document in enumerate(documents):
        doc_info = parse_document(document, language)
        tokenized_document = tokenize_document(config, doc_info, tokenizer,
                                               max_doc_length=max_doc_length)
        doc_key = tokenized_document['doc_key']
        token_windows, mask_windows, text_len = convert_to_sliding_window(
            tokenized_document, sliding_window_size)
        input_id_windows = [
            tokenizer.convert_tokens_to_ids(tokens)
            for tokens in token_windows
        ]
        span_start, span_end, mention_span, cluster_ids = flatten_clusters(
            tokenized_document['clusters'])
        # tokenized_document holds: sub_tokens, sentence_map, subtoken_map,
        # speakers, clusters, doc_key. The real speaker ids are not used;
        # they are replaced with all-zero placeholders.
        tmp_speaker_ids = [[0] * 130] * config["max_training_sentences"]
        instance = (input_id_windows, mask_windows, text_len, tmp_speaker_ids,
                    tokenized_document["genre"], is_training, span_start,
                    span_end, cluster_ids, tokenized_document['sentence_map'])
        write_instance_to_example_file(writer, instance, doc_key, config)
        doc_map[doc_idx] = doc_key
        if demo and doc_idx > 3:
            break
    with open(
            os.path.join(output_data_dir,
                         "{}.{}.map".format(output_filename, language)),
            'w') as fo:
        json.dump(doc_map, fo, indent=2)
def WeiboPretrain(params, mode):
    sentence_split = r'[.!?。?!]'
    tokenizer = FullTokenizer(vocab_file=params.vocab_file)
    data = read_ner_data(file_pattern='data/ner/weiboNER*',
                         proc_fn=gold_horse_segment_process_fn)
    if mode == 'train':
        data = data['train']
    else:
        data = data['eval']
    inputs_list = data['inputs']
    # split each document into sentences, and each sentence into characters
    segmented_list = []
    for document in inputs_list:
        segmented_list.append([])
        doc_string = ''.join(document)
        split_doc = re.split(sentence_split, doc_string)
        for sentence in split_doc:
            if sentence:
                segmented_list[-1].append(list(sentence))
    segmented_list = [doc for doc in segmented_list if doc]
    return create_pretraining_generator('WeiboPretrain', segmented_list,
                                        None, None, params, tokenizer)
def main(unused_argv):
    tokenizer = FullTokenizer(FLAGS.tokenizer_vocabulary)
    print("Loading " + str(FLAGS.dataset_name) + " dataset from " +
          FLAGS.input_filepath)
    # The debugging file saves all of the processed SQL queries.
    debugging_file = open(
        os.path.join(
            "/".join(FLAGS.output_filepath.split("/")[:-1]),
            FLAGS.dataset_name + "_".join(FLAGS.splits) + "_gold.txt",
        ),
        "w",
    )
    # The output file will save a sequence of string-serialized JSON objects,
    # one line per object.
    output_file = open(os.path.join(FLAGS.output_filepath), "w")
    if FLAGS.dataset_name.lower() == "spider":
        num_examples_created, num_examples_failed = process_spider(
            output_file, debugging_file, tokenizer)
    elif FLAGS.dataset_name.lower() == "wikisql":
        num_examples_created, num_examples_failed = process_wikisql(
            output_file, debugging_file, tokenizer)
    else:
        num_examples_created, num_examples_failed = process_michigan_datasets(
            output_file, debugging_file, tokenizer)
    print("Wrote %s examples, could not annotate %s examples." %
          (num_examples_created, num_examples_failed))
    debugging_file.write(
        "Wrote %s examples, could not annotate %s examples." %
        (num_examples_created, num_examples_failed))
    debugging_file.close()
    output_file.close()
def predict_input_fn_generator(input_file_or_list, config: Params,
                               mode='predict'):
    # if input is a string, treat it as a path to a file
    if isinstance(input_file_or_list, str):
        with open(input_file_or_list, 'r', encoding='utf8') as f:
            inputs = f.readlines()
    else:
        inputs = input_file_or_list

    tokenizer = FullTokenizer(config.vocab_file)

    for doc in inputs:
        inputs_a = list(doc)
        tokens, target = tokenize_text_with_seqs(tokenizer, inputs_a, None)
        tokens_a, tokens_b, target = truncate_seq_pair(
            tokens, None, target, config.max_seq_len)
        tokens, segment_ids, target = add_special_tokens_with_seqs(
            tokens_a, tokens_b, target)
        input_mask, tokens, segment_ids, target = create_mask_and_padding(
            tokens, segment_ids, target, config.max_seq_len)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # build a fresh dict per document so consumers do not share state
        yield {
            'input_ids': input_ids,
            'input_mask': input_mask,
            'segment_ids': segment_ids,
        }
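# A minimal sketch of consuming the generator with tf.data in a TF 1.x
# estimator input_fn. Assumptions: `config` is a Params instance defined
# elsewhere, and the example input string is illustrative.
def predict_input_fn():
    dataset = tf.data.Dataset.from_generator(
        lambda: predict_input_fn_generator(['今天天气不错'], config),
        output_types={'input_ids': tf.int32,
                      'input_mask': tf.int32,
                      'segment_ids': tf.int32})
    return dataset.batch(32)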
def __init__(self):
    bert_pretrained_dir = args.pretrain_models_path + args.bert_model_name
    self.do_lower_case = args.bert_model_name.startswith('uncased')
    self.vocab_file = os.path.join(bert_pretrained_dir, 'vocab.txt')
    self.config_file = os.path.join(bert_pretrained_dir, 'bert_config.json')
    self.tokenizer = FullTokenizer(vocab_file=self.vocab_file,
                                   do_lower_case=self.do_lower_case)
    self.input_id = tf.placeholder(tf.int64, [None, None], 'input_ids')
    self.input_mask = tf.placeholder(tf.int64, [None, None], 'input_mask')
    self.segment_ids = tf.placeholder(tf.int64, [None, None], 'segment_ids')
    bert_config = BertConfig.from_json_file(self.config_file)
    model = BertModel(config=bert_config,
                      is_training=False,
                      input_ids=self.input_id,
                      input_mask=self.input_mask,
                      token_type_ids=self.segment_ids,
                      use_one_hot_embeddings=True,
                      scope='bert')
    self.output_layer = model.get_sequence_output()
    self.embedding_layer = model.get_embedding_output()
    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.session = tf.Session(config=config)
    saver.restore(self.session, bert_pretrained_dir + '/bert_model.ckpt')
def get_sentiment(text):
    try:
        model = create_model(MAX_SEQ_LEN, adapter_size=None)
        model.load_weights("./model_trained.h5")
    except Exception:
        return "Cannot create model"

    pred_sentences = [str(text)]
    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE)
    pred_tokens = map(tokenizer.tokenize, pred_sentences)
    pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
    pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))
    # pad every id sequence to MAX_SEQ_LEN
    pred_token_ids = map(
        lambda tids: tids + [0] * (MAX_SEQ_LEN - len(tids)), pred_token_ids)
    pred_token_ids = np.array(list(pred_token_ids))
    print('pred_token_ids', pred_token_ids.shape)
    res = model.predict(pred_token_ids).argmax(axis=-1)
    for text, sentiment in zip(pred_sentences, res):
        return ["negative", "positive"][sentiment]
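# Example call (assumption: MAX_SEQ_LEN, VOCAB_FILE and create_model() are
# defined at module level, as the function above requires). Note that the
# function reloads the model and rebuilds the tokenizer on every call, so it
# is convenient for demos but slow for batch scoring.
print(get_sentiment("The movie was a complete waste of time."))  # "negative"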
def prepare_training_data(data_dir: str, language: str, vocab_file: str,
                          sliding_window_size: int):
    tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    for dataset in ['train', 'dev', 'test']:
        conll_file_path = os.path.join(
            data_dir, f"{dataset}.{language}.v4_gold_conll")
        writer = tf.python_io.TFRecordWriter(
            os.path.join(data_dir, f"{dataset}.{language}.tfrecord"))
        doc_map = {}
        documents = read_conll_file(conll_file_path)
        for doc_idx, document in enumerate(documents):
            doc_info = parse_document(document, language)
            checkout_clusters(doc_info)
            tokenized_document = tokenize_document(doc_info, tokenizer)
            doc_map[doc_idx] = tokenized_document['doc_key']
            token_windows, mask_windows = convert_to_sliding_window(
                tokenized_document, sliding_window_size)
            input_id_windows = [
                tokenizer.convert_tokens_to_ids(tokens)
                for tokens in token_windows
            ]
            span_starts, span_ends, cluster_ids = flatten_clusters(
                tokenized_document['clusters'])
            instance = (doc_idx, tokenized_document['sentence_map'],
                        tokenized_document['subtoken_map'], input_id_windows,
                        mask_windows, span_starts, span_ends, cluster_ids)
            write_instance_to_example_file(writer, instance)
        with open(os.path.join(data_dir, f"{dataset}.{language}.map"),
                  'w') as fo:
            json.dump(doc_map, fo, indent=2)
def __init__(self, **kwargs):
    self.tf = import_tf(kwargs['gpu_no'], kwargs['verbose'])
    self.logger = set_logger('BertNer', kwargs['log_dir'], kwargs['verbose'])
    self.model_dir = kwargs['ner_model']

    from bert.tokenization import FullTokenizer
    self.tokenizer = FullTokenizer(os.path.join(self.model_dir, 'vocab.txt'))

    self.ner_sq_len = 128
    self.input_ids = self.tf.placeholder(self.tf.int32,
                                         (None, self.ner_sq_len),
                                         'input_ids')
    self.input_mask = self.tf.placeholder(self.tf.int32,
                                          (None, self.ner_sq_len),
                                          'input_mask')
    # init graph
    self._init_graph()
    # init ner assist data
    self._init_predict_var()
    self.per_proun = [
        '甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸',
        '子', '丑', '寅', '卯', '辰', '巳', '午', '未', '申', '酉', '戌', '亥'
    ]
def __init__(self, tf_hub_url: str,
             max_sequence_length: int = _DEFAULT_MAX_SEQUENCE_LENGTH) -> None:
    self._graph = tf.Graph()
    self._session = None

    # Initialize the BERT model
    with tf.Session(graph=self._graph) as session:
        # Download module from tf-hub
        bert_module = hub.Module(tf_hub_url)

        # Get the tokenizer from the module
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        self._vocab_file, self._do_lower_case = session.run(
            [tokenization_info["vocab_file"],
             tokenization_info["do_lower_case"]])
        self._vocab_file = self._vocab_file.decode("UTF-8")
        self._do_lower_case = bool(self._do_lower_case)
        self._tokenizer = FullTokenizer(vocab_file=self._vocab_file,
                                        do_lower_case=self._do_lower_case)

        # Create symbolic input tensors as inputs to the model
        self._input_ids = tf.placeholder(
            name="input_ids", shape=(None, max_sequence_length),
            dtype=tf.int32)
        self._input_mask = tf.placeholder(
            name="input_mask", shape=(None, max_sequence_length),
            dtype=tf.int32)
        self._segment_ids = tf.placeholder(
            name="segment_ids", shape=(None, max_sequence_length),
            dtype=tf.int32)

        # Get the symbolic output tensors
        self._outputs = bert_module(
            {
                "input_ids": self._input_ids,
                "input_mask": self._input_mask,
                "segment_ids": self._segment_ids
            },
            signature="tokens",
            as_dict=True)
def WeiboFakeCLS(params, mode):
    """Just a test problem to test multiproblem support.

    Arguments:
        params {Params} -- params
        mode {mode} -- mode
    """
    tokenizer = FullTokenizer(vocab_file=params.vocab_file)
    data = read_ner_data(file_pattern='data/ner/weiboNER*',
                         proc_fn=gold_horse_ent_type_process_fn)
    if mode == 'train':
        data = data['train']
    else:
        data = data['eval']
    inputs_list = data['inputs']
    target_list = data['target']

    # label a sentence 1 if it contains more than one distinct tag, else 0
    new_target_list = [1 if len(set(t)) > 1 else 0 for t in target_list]

    label_encoder = get_or_make_label_encoder('WeiboFakeCLS', mode,
                                              new_target_list, 'O')
    return create_single_problem_generator('WeiboFakeCLS', inputs_list,
                                           new_target_list, label_encoder,
                                           params, tokenizer)
def __init__(self, checkpoint, attr_values_file, vocab_file):
    self.checkpoint = checkpoint
    self.attr_values_file = attr_values_file
    self.vocab_file = vocab_file
    if not os.path.exists(self.checkpoint):
        raise Exception("local checkpoint %s does not exist" %
                        self.checkpoint)
    if not os.path.exists(self.attr_values_file):
        raise Exception("local attr_values_file %s does not exist" %
                        self.attr_values_file)
    if not os.path.exists(self.vocab_file):
        raise Exception("local vocab_file %s does not exist" %
                        self.vocab_file)

    self.config = InferConfig()
    self.tokenizer = FullTokenizer(self.vocab_file)
    with open(self.attr_values_file, 'rb') as fr:
        attr_values, attr_values_r = pickle.load(fr)
    self.attr_values_r = attr_values_r
    self.config.output_dim = len(attr_values_r)

    self.graph = tf.Graph()
    with self.graph.as_default():
        self.input_ids_p = tf.placeholder(
            tf.int32, [None, self.config.max_seq_length])
        self.token_type_ids_p = tf.placeholder(
            tf.int32, [None, self.config.max_seq_length])
        self.input_mask_p = tf.placeholder(
            tf.int32, [None, self.config.max_seq_length])
        model = Model(self.config)
        self.inference = model.infer(self.input_ids_p, self.token_type_ids_p,
                                     self.input_mask_p)
        ckpt_state = tf.train.get_checkpoint_state(self.checkpoint)
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            raise Exception('No model to eval yet at: ' + self.checkpoint)
        self.sess = tf.Session(
            config=tf.ConfigProto(allow_soft_placement=True))
        saver = tf.train.Saver()
        saver.restore(self.sess, ckpt_state.model_checkpoint_path)
def POS(params, mode):
    tokenizer = FullTokenizer(vocab_file=params.vocab_file)
    input_list, target_list = read_ctbpos()
    if mode == 'train':
        input_list, _, target_list, _ = train_test_split(
            input_list, target_list, test_size=0.2, random_state=3721)
    else:
        _, input_list, _, target_list = train_test_split(
            input_list, target_list, test_size=0.2, random_state=3721)

    flat_target_list = [item for sublist in target_list for item in sublist]
    label_encoder = get_or_make_label_encoder(params, 'POS', mode,
                                              flat_target_list,
                                              zero_class='[PAD]')
    return create_single_problem_generator('POS', input_list, target_list,
                                           label_encoder, params, tokenizer)
def create_tokenizer_from_hub_module(bert_path):
    """Get the vocab file and casing info from the Hub module."""
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info",
                                    as_dict=True)
    # `sess` is assumed to be a tf.Session available in the enclosing scope
    vocab_file, do_lower_case = sess.run(
        [tokenization_info["vocab_file"],
         tokenization_info["do_lower_case"]])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
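# Minimal usage sketch (assumptions: TF 1.x with tensorflow_hub installed; the
# function above reads a module-level `sess`, so the session is created at
# module scope before calling it; the Hub handle is illustrative).
import tensorflow as tf

with tf.Session() as sess:
    tokenizer = create_tokenizer_from_hub_module(
        "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")
    print(tokenizer.tokenize("This here's an example of using the tokenizer"))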
def create_tokenizer_from_hub_module(bert_path):
    """Get the vocab file and casing info from the Hub module."""
    bert_layer = hub.KerasLayer(bert_path, trainable=False)
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = FullTokenizer(vocab_file, do_lower_case)
    return tokenizer, bert_layer
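# TF 2.x usage sketch for the KerasLayer variant above (assumption: a
# TF2-compatible BERT SavedModel on TF Hub; the exact handle is illustrative).
tokenizer, bert_layer = create_tokenizer_from_hub_module(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2")
tokens = tokenizer.tokenize("hello world")
ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])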
def bert_tokenize(vocab_fname, corpus_fname, output_fname):
    tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            tokens = tokenizer.tokenize(convert_to_unicode(sentence))
            tokenized_sent = ' '.join(tokens)
            f2.write(tokenized_sent + '\n')
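# Example invocation (file names are illustrative): the output file ends up
# with one whitespace-joined WordPiece sequence per input line.
bert_tokenize('vocab.txt', 'corpus.txt', 'corpus.tokenized.txt')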
def __init__(self, config, category_dir, vocab_file):
    self.config = config
    self.category_dir = category_dir
    self.tokenizer = FullTokenizer(vocab_file)
    if not os.path.exists(
            os.path.join(self.category_dir, 'train_data', 'raw.csv')):
        raise Exception("local raw train data does not exist!")
    if not os.path.exists(vocab_file):
        raise Exception("local vocab_file does not exist")
def create_tokenizer_from_hub_module():
    bert_module = hub.Module(
        "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")
    tokenization_info = bert_module(signature="tokenization_info",
                                    as_dict=True)
    # `sess` is assumed to be a tf.Session defined in the enclosing scope
    vocab_file, do_lower_case = sess.run([
        tokenization_info["vocab_file"],
        tokenization_info["do_lower_case"],
    ])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
def create_tokenizer_from_hub_module():
    # `bert_path` and `sess` are assumed to be defined in the enclosing scope
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info",
                                    as_dict=True)
    vocab_file, do_lower_case = sess.run([
        tokenization_info["vocab_file"],
        tokenization_info["do_lower_case"],
    ])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
def tokenize_bert():
    train_config = get_config()
    bert_config = get_bert_config(train_config)
    # casing is inferred from the model directory name, e.g. 'uncased_L-12...'
    uncased = train_config.BERT_DIR.split('/')[-1].startswith('uncased')
    tokenizer = FullTokenizer(bert_config.vocab, do_lower_case=uncased)
    text, _ = load_data(os.path.join(train_config.DATA_DIR, 'train.csv'))
    tok_text = tokenize_examples(text, tokenizer, max_len=512)

    import pickle
    with open('tok_text_uncased.pkl', 'wb') as f:
        pickle.dump(tok_text, f)
def CTBCWS(params, mode):
    tokenizer = FullTokenizer(vocab_file=params.vocab_file)
    file_list = glob.glob('data/ctb8.0/data/segmented/*')
    input_list = []
    target_list = []

    # Create possible tags for fast lookup: a word of length n is tagged
    # 's' (single) or 'b' + 'm' * (n - 2) + 'e' (begin/middle/end).
    possible_tags = []
    for i in range(1, 300):
        if i == 1:
            possible_tags.append('s')
        else:
            possible_tags.append('b' + 'm' * (i - 2) + 'e')

    for file_path in file_list:
        with open(file_path, 'r', encoding='utf8') as f:
            raw_doc_list = f.readlines()
        # the sentence text sits on the line after each '<S ID=' marker;
        # a set makes the membership test below O(1)
        text_row_ind = {
            i + 1 for i, text in enumerate(raw_doc_list) if '<S ID=' in text
        }
        sentence_list = [
            text for i, text in enumerate(raw_doc_list) if i in text_row_ind
        ]
        for sentence in sentence_list:
            input_list.append([])
            target_list.append([])
            for word in sentence.split():
                if word and len(word) <= 299:
                    tag = possible_tags[len(word) - 1]
                    input_list[-1] += list(word)
                    target_list[-1] += list(tag)

    if mode == 'train':
        input_list, _, target_list, _ = train_test_split(
            input_list, target_list, test_size=0.2, random_state=3721)
    else:
        _, input_list, _, target_list = train_test_split(
            input_list, target_list, test_size=0.2, random_state=3721)

    flat_target_list = [item for sublist in target_list for item in sublist]
    label_encoder = get_or_make_label_encoder('CTBCWS', mode,
                                              flat_target_list,
                                              zero_class='[PAD]')
    return create_single_problem_generator('CTBCWS', input_list, target_list,
                                           label_encoder, params, tokenizer)
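# Illustration of the b/m/e/s tagging scheme used above (the word list is
# hypothetical): a one-character word gets 's', a two-character word 'be',
# a four-character word 'bmme', so characters and tags always align 1:1.
words = ['今天', '天气', '很', '好']
chars = [c for w in words for c in w]
tags = [t for w in words
        for t in ('s' if len(w) == 1 else 'b' + 'm' * (len(w) - 2) + 'e')]
assert len(chars) == len(tags)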
def create_tokenizer_from_hub_module(sess):
    """Get the vocab file and casing info from the Hub module."""
    # tf.compat.v1.disable_eager_execution()
    bert_module = hub.Module(
        "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")
    tokenization_info = bert_module(signature="tokenization_info",
                                    as_dict=True)
    vocab_file, do_lower_case = sess.run([
        tokenization_info["vocab_file"],
        tokenization_info["do_lower_case"],
    ])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
def create_tokenizer_from_hub_module(self, is_bert):
    """Get the vocab file and casing info from the Hub module."""
    bert_module = hub.Module(self.bert_model_hub_path)
    tokenization_info = bert_module(signature="tokenization_info",
                                    as_dict=True)
    vocab_file, do_lower_case = self.sess.run([
        tokenization_info["vocab_file"],
        tokenization_info["do_lower_case"],
    ])
    if is_bert:
        from bert.tokenization import FullTokenizer
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
    else:
        from vectorizers.albert_tokenization import FullTokenizer
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case,
                                       spm_model_file=vocab_file)