Example #1
def process_data():
    """covert the string data to idx, and save."""
    with codecs.open(cg.DATA_PATH, 'r', 'utf-8') as file:
        data = file.read().split('\n')

    if len(data[-1]) == 0:
        _error('The last line which is empty has been removed.')
        data = data[:-1]

    questions = []
    answers = []
    for line in data:
        line_split = line.split('=')
        que, ans = line_split[0], line_split[1]
        questions.append(que)
        answers.append(ans)
    assert len(questions) == len(answers),\
     _error('The number of questions: {} does not equal the number of answers: {}'.format(len(questions), len(answers)))

    que_idx = [str_to_idx(que) for que in questions]
    ans_idx = [str_to_idx(ans) for ans in answers]

    Path('processed_data/').mkdir(exist_ok=True)
    with codecs.open('processed_data/questions.bin', 'wb') as file:
        pickle.dump(que_idx, file)
    with codecs.open('processed_data/answers.bin', 'wb') as file:
        pickle.dump(ans_idx, file)

    _info(
        'Converted questions and answers have been saved into the `processed_data` directory.'
    )
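A quick usage note: the two pickled files written by process_data can be read back with pickle, the same way Example #16 reloads its test data. A minimal sketch, assuming the files above exist in the working directory:

import codecs
import pickle

# Minimal sketch: reload the index lists written by process_data above.
with codecs.open('processed_data/questions.bin', 'rb') as file:
    que_idx = pickle.load(file)
with codecs.open('processed_data/answers.bin', 'rb') as file:
    ans_idx = pickle.load(file)
print(len(que_idx), len(ans_idx))  # the two lists should have equal length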
Example #2
def analyse(positive_result, negative_result):
    true_positive = 0
    false_positive = 0
    false_negative = 0
    true_negative = 0

    with codecs.open(positive_result, 'r', 'utf-8') as file:
        for line in file:
            tag = line.split('\t')[1].strip()
            if tag == '1':
                true_positive += 1
            else:
                false_negative += 1

    with codecs.open(negative_result, 'r', 'utf-8') as file:
        for line in file:
            tag = line.split('\t')[1].strip()
            if tag == '0':
                true_negative += 1
            else:
                false_positive += 1

    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    f1_score = (2 * precision * recall) / (precision + recall)

    _info('', head='The Result')
    print(
        '\t TP: {}\t|  FP: {}\n \t FN: {}\t|  TN: {}\n  Precision: {:.2}\tRecall: {:.2}\n\t  F1_Score: {:.2}'
        .format(true_positive, false_positive, false_negative, true_negative,
                precision, recall, f1_score))
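Note that the divisions above raise ZeroDivisionError when one of the result files yields no positive predictions; a guarded variant of the same formulas (my own sketch, not from the original repository) could look like:

def safe_precision_recall_f1(true_positive, false_positive, false_negative):
    # Same formulas as analyse(), but returning 0.0 instead of dividing by zero.
    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) else 0.0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) else 0.0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1_score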
def read_dictionary(path, split_tag='\t', need_score=False):
  """read the vocab from the dictionary.
  
  Args:
    path: str, the absolute path of the dictionary.
    split_tag: str, the splitting tag for each line.
    need_score: boolean, whether to also return the score that follows each vocab.
  
  Returns:
    A list (or dict) containing the whole vocabulary.
  """
  if not need_score:
    vocab_set = []
  else:
    vocab_set = {}

  with codecs.open(path, 'r', 'utf-8') as file:
    for line in file:
      line = line.strip()
      if len(line) > 0:
        line_split = line.split(split_tag)
        if not need_score:
          vocab = line_split[0]
          vocab_set.append(vocab)
        else:
          vocab, score = line_split[0], float(line_split[1])
          vocab_set[vocab] = score

  if not need_score:
    _info('{} contains {} vocabs.\n Some vocabs look like {}.\n'.format(
      path, len(vocab_set), vocab_set[:5] + vocab_set[-5:]))
  else:
    _info('{} contains {} vocabs.\n Some vocabs look like {}.\n'.format(
      path, len(vocab_set), list(vocab_set.keys())[:5] + list(vocab_set.keys())[-5:]))
  return vocab_set
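A hypothetical call, where 'vocab.txt' stands for any tab-separated dictionary file with one word-score pair per line (the file name is illustrative, not from the original project):

# Hypothetical usage of read_dictionary; 'vocab.txt' is an assumed example file.
words = read_dictionary('vocab.txt')                         # list of words only
word_scores = read_dictionary('vocab.txt', need_score=True)  # dict mapping word -> float score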
def check_overlap(data, dic_set, single_data_or_not=True):
  match_number = [0 for _ in data]
  match_per_length = 0
  total_length = 0

  for idx, line in enumerate(data):
    single_length = 0
    match_words = []
    if single_data_or_not:
      line = line[0]
      sentence = re.sub(ELIMINATE_PUNCTUATION, ' ', line[1])
    else:
      sentence = re.sub(ELIMINATE_PUNCTUATION, ' ', line[2])
    sentence_split = sentence.split(' ')
    for vocab in sentence_split:
      if vocab in dic_set:
        match_words.append(vocab)
        single_length += 1
        match_per_length += 1
        match_number[idx] = 1
    # _info(line[0])
    # print(single_length / len(sentence_split))
    # print(line)
    # print(match_words)
    # input()
    single_length = 0
    total_length += len(sentence_split)

  _info('TOTAL DATA: {} MATCH: {} TOTAL_MATCH: {}'.format(len(data), sum(match_number), match_per_length / total_length))
Ejemplo n.º 5
0
def parse_pclntable(module_data):
    pPcHeader = module_data.pPcHeader
    pc_header = parse_pc_header(pMem=pPcHeader)
    ptrSize = pc_header.ptrSize
    numberOfFuncs = pc_header.nFunc

    log._info("Number of Functions : %d" % numberOfFuncs)

    pclntable_start = module_data.pPclnTable
    cur_addr = pclntable_start
    for idx in range(numberOfFuncs):
        cur_addr = pclntable_start + (2 * ptrSize) * idx
        func_rva = common.mem_read_integer(addr=cur_addr, read_size=ptrSize)
        _func_structure_offset = common.mem_read_integer(addr=cur_addr +
                                                         ptrSize,
                                                         read_size=ptrSize)
        _func_addr = pclntable_start + _func_structure_offset

        if not idc.GetFunctionName(func_rva):
            log._info("Unk Func @0x%x" % func_rva)
            idc.MakeUnkn(func_rva, idc.DOUNK_EXPAND)
            idaapi.autoWait()
            idc.MakeCode(func_rva)
            idaapi.autoWait()
            if idc.MakeFunction(func_rva):
                idaapi.autoWait()
                log._info("Create Func @0x%x" % func_rva)

        _func = parse__func(pMem=_func_addr)
        #args=_func.args
        #func_id=_func.args

        func_name_addr = module_data.pFuncNameTable + _func.nameoff
        func_name = idc.GetString(func_name_addr)
        if func_name:
            clean_func_name = utils.clean_function_name(func_name)
            log._info("@0x%x Name : [%s]" % (func_rva, func_name))
            idc.MakeComm(func_rva, hex(func_rva) + " entry")
            idaapi.autoWait()

            if idc.MakeStr(func_name_addr,
                           func_name_addr + len(func_name) + 1):
                idaapi.autoWait()
            else:
                log._error("@0x%x Name : [%s] Failed..." %
                           (func_rva, func_name))

        _func_addr = idaapi.get_func(func_rva)
        if _func_addr is not None:
            if idc.MakeNameEx(_func_addr.startEA,
                              func_name,
                              flags=idaapi.SN_FORCE):
                idaapi.autoWait()
                log._info("@0x%x Name : [%s]" % (func_rva, func_name))
            else:
                log._error("@0x%x Name : [%s] Failed..." %
                           (func_rva, func_name))
Example #6
def extract(log_path, save_path):
    with codecs.open(log_path, 'r', 'utf-8') as file, \
         codecs.open(save_path, 'w', 'utf-8') as file_2:
        for line in file:
            match = re.search(PATTERN, line)
            if match:
                loss = match.group().split(' ')[2]
                file_2.write('sup_avg:' + loss + '\n')
                file_2.flush()
    _info('The loss records have been saved to {}.'.format(save_path))
Example #7
    def __init__(self,
                 config,
                 is_training,
                 input_text,
                 input_image,
                 scope=None):
        """"Constructor for EANN Model.
      
      Args:
        config: Config Object, the hyperparameter set.
        is_training: Boolean, whether train or not.
        input_text: tf.int32 Tensor, [batch_size, seq_length].
        input_image: tf.float32 Tensor, [batch_size, h, w, c].
      """
        # config
        config = copy.deepcopy(config)

        # textCNN config
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.window_size = config.window_size
        self.pool_size = config.pool_size
        self.filter_number_text = config.filter_number_text
        self.seq_length = config.max_length

        # VGG_19
        try:
            self.vgg = tf.keras.applications.VGG19(input_shape=(224, 224, 3),
                                                   include_top=False,
                                                   weights='imagenet')
            _info('Successfully load the pre-trained VGG-19 weights.')
        except Exception:
            _error(
                'Please download the VGG_19 weights from : \n{}\n, then put the file \
        into ~/.keras/models'.format(_cg.VGG_19_Weights_Download_URL))

        self.vgg.trainable = False  # do not train the vgg pretrained parameters

        # global config
        self.hidden_size = config.hidden_size
        self.num_classes = config.num_classes
        self.num_domains = config.num_domains

        # basic config
        self.initializer_range = config.initializer_range
        self.dropout = config.dropout
        if not is_training:
            self.dropout = 0.0

        # Build the Graph
        self.label_output, self.domain_output, self.batch_size = self.build(
            input_text, input_image)
Example #8
def find_moduledata():
    pModuleData = 0
    if common.check_is_stripped():
        log._info("binary is not stripped!")
        for addr, name in idautils.Names():
            if name == "runtime.firstmoduledata":
                pModuleData = addr
                break
    else:
        log._info("binary is stripped..")
        log._info("Now find the moduledata by using brute force searching")
        GO1_16_MAGIC = 0xFFFFFFFA  # <-- go 1.16 magic
        text_section = common.get_segment_addr_by_name(name=".text")
        rdata_section = common.get_segment_addr_by_name(name=".rdata")
        data_section = common.get_segment_addr_by_name(name=".data")

        sections = [(".text", text_section), (".rdata", rdata_section),
                    (".data", data_section)]

        for sec_name, section_addr in sections:
            cur_addr = section_addr
            next_section_addr = common.get_next_segment_addr(addr=cur_addr)
            pModuleData = find_module_data_bruteforce(
                start_addr=section_addr,
                break_addr=next_section_addr,
                magic=GO1_16_MAGIC)
            if pModuleData != 0:
                log._info("ModuleData Structure locate at [%s] - @0x%x" %
                          (sec_name, pModuleData))
                break

    if pModuleData == 0:
        log._error("Cannot find ModuleData Structure in current binary...")

    return pModuleData
Example #9
def load_dict(path):
    """load the dictionary.
  
  Returns:
    A list consists of tuples with vocab and value.
  """
    data = []
    with codecs.open(path, 'r', 'utf-8') as file:
        for line in file:
            line = line.strip()
            if len(line) > 0:
                vocab, value = line.split('\t')[0], line.split('\t')[1]
                data.append((vocab, value))
    _info('{} contains {} vocabs.'.format(path, len(data)))
    return data
Example #10
def readFile(path):
    _info('Start building graph...')

    graph = Graph()
    with codecs.open(path, 'r', 'utf-8') as file:
        data = file.read().split('\n')
    for line in data:
        # skip blank lines (e.g. the trailing empty string from split('\n'))
        if len(line) == 0:
            continue
        line_split = line.split(' ')
        u, other = line_split[0], line_split[1:]
        u_obj = Vertex(u)
        for v in other:
            v_obj = Vertex(v)
            graph.addEdge(u_obj, v_obj)

    _info('Finish building graph...')
    return graph
Example #11
def make_dict(path, save_dir):
    vocab_idx = {}
    idx_vocab = {}
    with codecs.open(path, 'r', 'utf-8') as file:
        for idx, vocab in enumerate(file):
            vocab = vocab.strip()
            if len(vocab) > 0:
                vocab_idx[vocab] = idx
                idx_vocab[idx] = vocab

    with codecs.open(save_dir + 'vocab_idx.bin', 'wb') as vocab_idx_save,\
         codecs.open(save_dir + 'idx_vocab.bin', 'wb') as idx_vocab_save:
        pickle.dump(vocab_idx, vocab_idx_save)
        pickle.dump(idx_vocab, idx_vocab_save)

    _info('Vocab length: {}, the dictionary has been saved to: {}'.format(
        len(vocab_idx), save_dir))
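The two pickled mappings can then be reloaded to convert between tokens and indices; a minimal sketch, assuming make_dict was called with save_dir='processed/' (an illustrative directory, and the sample tokens are assumed to be in the vocabulary):

import codecs
import pickle

# Minimal sketch: reload the mappings written by make_dict and round-trip a few tokens.
with codecs.open('processed/vocab_idx.bin', 'rb') as file:
    vocab_idx = pickle.load(file)
with codecs.open('processed/idx_vocab.bin', 'rb') as file:
    idx_vocab = pickle.load(file)

ids = [vocab_idx[token] for token in ['hello', 'world'] if token in vocab_idx]
print(ids, [idx_vocab[i] for i in ids])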
def read_twitter_data(path, split_tag='\t', polarity=None):
  """read the twitter data from the given path."""
  data = []
  with codecs.open(path, 'r', 'utf-8') as file:
    for line in file:
      line = line.strip()
      if len(line) > 0:
        if split_tag is not None:
          line_strip = line.split(split_tag)
          id_, tag, sentence = line_strip[0], line_strip[1], line_strip[2]
          data.append((id_, tag, sentence))
        else:
          data.append((polarity, line))
  _info('{} contains {} data.'.format(path, len(data)))
  if split_tag is not None:
    return data
  else:
    return data[0]
Example #13
def create_or_load(model, ckpt_path, session, force=False):
    """create a new model or load from the existing one"""
    dir_path = '/'.join(ckpt_path.split('/')[:-1])
    latest_ckpt = tf.train.latest_checkpoint(dir_path)

    if latest_ckpt and not force:
        try:
            model.saver.restore(session, latest_ckpt)
        except Exception as e:
            _error(e, head='ERROR')
            raise e
        _info('successfully load model from <{}>'.format(latest_ckpt),
              head='INFO')
    else:
        session.run(tf.global_variables_initializer())
        session.run(tf.local_variables_initializer())
        session.run(tf.tables_initializer())
        _info('successfully create a new model', head='INFO')
    global_step = model.global_step.eval(session=session)
    return model, global_step
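A hypothetical TF1-style call site, where build_model() stands for any constructor that returns an object exposing the saver and global_step attributes create_or_load relies on (both the helper and the checkpoint path are illustrative):

# Hypothetical usage of create_or_load in a TF1 session workflow.
graph = tf.Graph()
with graph.as_default():
    model = build_model()          # assumed constructor, not part of this snippet
    sess = tf.Session(graph=graph)
    model, global_step = create_or_load(model, 'ckpt/model', sess)
    print('training resumes from step', global_step)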
Example #14
def predict(face, rect, gray, _hists, _labels, _Names):
    ##    cv2.imwrite("a/aa.png",face)
    lbph = LBPH(face, 8, 2)
    _ = lbph.create_MB_LBPH_2()
    confidence, pos, _ = lbph.findClosest(_hists)
    label = _labels[pos]

    ##    label,confidence =reconizer.predict(face)
    (x, y, w, h) = rect
    if confidence < 75:
        print(label)
        print("Recognition: {}, confidence: {}".format(_Names[label],
                                                       confidence))
        text = "{},{}".format(_Names[label], confidence)
        ##    text = Names[label]
        draw_name(gray, text, x, y)
        _info(_Names[label])
    else:
        print("Unknow people!!!")
        text = "unknow"
        draw_name(gray, text, x, y)
def save_to_binary(data, save_path, replace=False):
  """convert the data to binary file and save.
  
  Args:
    data: object, the original file.
    save_path: str, the absolute path to save the data.
    replace: boolean, Whether to replace the file when the file exits.
  """
  # change the str path to PosixPath
  save_path = Path(save_path)
  
  # check whether the file already exists
  if save_path.is_file():
    if not replace:
      _error('{} already exists.'.format(save_path), head='ERROR')
      raise FileExistsError
    else:
      _info('{} already exists, it will be replaced.'.format(save_path))
  
  with codecs.open(save_path, 'wb') as file:
    pickle.dump(data, file)
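A hypothetical call, where the dictionary and the target path are purely illustrative:

# Hypothetical usage of save_to_binary; the object and path are examples only.
vocab = {'hello': 0, 'world': 1}
save_to_binary(vocab, '/tmp/vocab.bin', replace=True)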
Example #16
def predict_2(model, test_data_path, batch_size=32):
    with codecs.open(test_data_path, 'rb') as file:
        data = pickle.load(file)
    _info('The total test data length: {}.'.format(len(data)))

    predict_result_set = []
    for (start, end) in provide_batch_idx(len(data), batch_size):
        data_batch = data[start:end]
        sentences = [data[1] for data in data_batch]
        # labels = [data[0] for data in data_batch]

        sentiment_features = list(map(no_mask, sentences))
        input_idx = [item[0] for item in sentiment_features]
        input_idx_padded = np.array(padding_data_2(input_idx), dtype=np.int32)
        input_mask = list(map(make_mask_2, input_idx_padded))

        features = {'input_data': input_idx_padded, 'input_mask': input_mask}

        predictions = model(features)
        predict_results = predictions['predict']

        predict_result_set.extend(predict_results)

    return predict_result_set
Example #17
        features = {'input_data': input_idx_padded, 'input_mask': input_mask}

        predictions = model(features)
        predict_results = predictions['predict']

        predict_result_set.extend(predict_results)

    return predict_result_set


if __name__ == '__main__':
    model = restore_model(cg.pb_model_path)

    predict_pos = predict_2(
        model, MAIN_PATH / 'data/Stanford_Data_binary/test_train_pos.bin')

    # with codecs.open(MAIN_PATH / 'data/Stanford_Data_binary/test_train_pos.bin', 'rb') as file:
    #   data = pickle.load(file)
    # for i, v in enumerate(predict_pos):
    #   if v != 1:
    #     print(data[i])
    #     input()

    pos_accuracy = sum(predict_pos) / len(predict_pos)
    predict_neg = predict_2(
        model, MAIN_PATH / 'data/Stanford_Data_binary/test_train_neg.bin')
    neg_accuracy = 1 - sum(predict_neg) / len(predict_neg)

    _info('Predict positive accuracy: {}.'.format(pos_accuracy))
    _info('Predict negative accuracy: {}.'.format(neg_accuracy))
Example #18
from pathlib import Path
from model import BertModel
from model_UniLM import UniLM
from hparams_config import config as config_
from log import log_info as _info
from log import log_error as _error

PROJECT_PATH = str(Path(__file__).absolute().parent)

def train(config):
    # build the training graph
    train_graph = tf.Graph()
    with train_graph.as_default():
        # train_model = BertModel(config=config, is_training=True)
        train_model = UniLM(config=config, is_training=True)    # for UniLM
        _info('finish building graph', head='INFO')
    
    # create session
    # the relationship between graph and session,
    # like python language and python interpreter.
    sess_conf = tf.ConfigProto(intra_op_parallelism_threads=8, inter_op_parallelism_threads=8)
    sess_conf.gpu_options.allow_growth = True
    train_sess = tf.Session(config=sess_conf, graph=train_graph)

    # restore model from the latest checkpoint
    with train_graph.as_default():
        loaded_model, global_step = _mh.create_or_load(
            train_model, os.path.join(PROJECT_PATH, config.ckpt_path), train_sess)
    
    # initialize Tensorboard
    summary_writer = tf.summary.FileWriter(
Example #19
	def model_fn(features, labels, mode, params):
		# obtain the data
		_info('*** Features ***')
		for name in sorted(features.keys()):
			tf.logging.info(' name = %s, shape = %s' % (name, features[name].shape))
		
		input_text = features['input_text']
		input_image = features['input_image']

		# build the model
		is_training = (mode == tf.estimator.ModeKeys.TRAIN)
		model = EANNModel(config,
											is_training=is_training,
											input_text=input_text,
											input_image=input_image)
		label_output = model.get_label_output()
		domain_output = model.get_domain_output()
		batch_size = model.get_batch_size()

		# make predict
		label_output_prob = tf.nn.softmax(label_output, axis=-1)
		predict_label = tf.argmax(label_output_prob, axis=-1)
		domain_output_prob = tf.nn.softmax(domain_output, axis=-1)
		predict_event = tf.argmax(domain_output_prob, axis=-1)

		if mode == tf.estimator.ModeKeys.PREDICT:
			predictions = {'predict_label': predict_label,
										 'predict_event': predict_event,
										 'label_output_prob': label_output_prob,
										 'domain_output_prob': domain_output_prob}
			output_spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
		else:
			# get golden data
			classify_labels = labels['label']
			event_labels = labels['event']
			batch_size = tf.cast(batch_size, tf.float32)
			# # add l2 loss, no need
			# tv = tf.trainable_variables()
			# l2_loss = 1e-2 * (tf.reduce_sum([tf.nn.l2_loss(v) for v in tv]) / batch_size)
			
			# loss for classification
			classify_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
				labels=classify_labels,
				logits=label_output)) / batch_size
			
			# loss for event
			event_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
				labels=event_labels,
				logits=domain_output)) / batch_size
			
			# Adding 'event_loss' is correct: a gradient reversal layer sits after the encoder,
			# so during backpropagation the sign flips when the encoder parameters are updated.
			loss = classify_loss + event_loss
			
			if mode == tf.estimator.ModeKeys.TRAIN:
				# specify the learning rate
				if _cg.LEARNING_RATE_METHOD == 'polynomial':
					learning_rate = tf.train.polynomial_decay(_cg.LEARNING_RATE,
																										tf.train.get_or_create_global_step(),
																										_cg.TRAIN_STEPS,
																										end_learning_rate=_cg.LEARNING_LIMIT,
																										power=1.0,
																										cycle=False)
				elif _cg.LEARNING_RATE_METHOD == 'paper':
					learning_rate = learning_rate_decay(_cg.LEARNING_RATE, 
																							tf.train.get_or_create_global_step(),
																							_cg.TRAIN_STEPS)
				else:
					raise NotImplementedError('Not support the {}'.format(_cg.LEARNING_RATE_METHOD))
				lr = tf.maximum(tf.constant(_cg.LEARNING_LIMIT), learning_rate)

				# update the parameters
				optimizer = tf.train.GradientDescentOptimizer(lr, name='optimizer')
				tvars = tf.trainable_variables()
				gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=_cg.colocate_gradients_with_ops)
				clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
				train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars), global_step=tf.train.get_global_step())
				
				# check the accuracy during training
				predict_label = tf.cast(predict_label, tf.int32)
				predict_event = tf.cast(predict_event, tf.int32)
				classify_labels = tf.cast(classify_labels, tf.int32)
				event_labels = tf.cast(event_labels, tf.int32)

				accuracy_classify = tf.reduce_mean(tf.cast(tf.equal(predict_label, classify_labels), tf.float32))
				accuracy_event = tf.reduce_mean(tf.cast(tf.equal(predict_event, event_labels), tf.float32))

				# specify the information while training
				logging_hook = tf.train.LoggingTensorHook({'step': tf.train.get_global_step(),
																									 'class_loss': classify_loss,
																									 'class_acc': accuracy_classify,
																									 'event_loss': event_loss,
																									 'event_acc': accuracy_event,
																									 'lr': learning_rate}, every_n_iter=2)
				output_spec = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])
			elif mode == tf.estimator.ModeKeys.EVAL:
				# evaluate metrics
				metric_dict = {'accuracy': tf.metrics.accuracy(labels=classify_labels, predictions=predict_label),
											 'precision': tf_metrics.precision(labels=classify_labels, predictions=predict_label, num_classes=2),
											 'recall': tf_metrics.recall(labels=classify_labels, predictions=predict_label, num_classes=2),
											 'f1': tf_metrics.f1(classify_labels, predict_label, num_classes=2)}
				output_spec = tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metric_dict)
			else:
				raise NotImplementedError
			
		return output_spec
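The comment about the reversal layer refers to a gradient reversal layer (GRL), as used in adversarial domain adaptation: identity on the forward pass, gradient multiplied by a negative factor on the backward pass. A minimal TF1-style sketch of the idea (my own illustration, not code from this repository), using the stop_gradient identity trick:

import tensorflow as tf

def gradient_reversal(x, lambda_=1.0):
    """Identity on the forward pass; scales the gradient by -lambda_ on the backward pass."""
    # Forward value: (1 + lambda_) * x - lambda_ * x == x.
    # Only the -lambda_ * x term is differentiable, so d/dx = -lambda_.
    return tf.stop_gradient((1.0 + lambda_) * x) - lambda_ * x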
Example #20
    v_sorted = sorted(v_unsorted, reverse=True)

    # reverse the graph
    graph_rev = Graph()
    for v in v_unsorted:
        for u in graph.graph[v]:
            graph_rev.addEdge(u, v)

    return graph_rev, v_sorted


if __name__ == '__main__':
    graph = readFile('test_scc.txt')

    # sanity check
    _info('Check the graph:')
    cache = graph.DFSSearch(Vertex('b'))
    for v in cache:
        print(v, end=' ')
    _info('Finish checking!', head='\n INFO')

    # reverse the graph
    _info('Reverse the graph...')
    graph_rev, v_sorted = reverseGraph(graph)

    # sanity check
    _info('Check the graph:')
    cache = graph_rev.DFSSearch(Vertex('a'))
    for v in cache:
        print(v, end=' ')
    _info('Finish checking!', head='\n INFO')
Example #21
    def __init__(self, config, is_training, scope=None):
        config = copy.deepcopy(config_)
        self.is_training = is_training
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_prob_dropout_prob = 0.0

        # Initializer Section
        # set the global initializer, which would cover all the variable scopes
        self.initializer = _mh.select_initializer(
            itype=config.initializer,
            seed=config.seed,
            init_weight=config.init_weight)
        tf.get_variable_scope().set_initializer(self.initializer)
        self.global_step = tf.Variable(0, trainable=False)

        # Input Section
        self.input_ids = tf.placeholder(tf.int32, [None, None],
                                        name='input_ids')
        self.input_length = tf.placeholder(tf.int32, name='input_length')
        self.input_mask = tf.placeholder(tf.int32, [None, None],
                                         name='input_mask')

        # Output Mask
        self.output_ids = tf.placeholder(tf.int32, [None, None],
                                         name='output_ids')

        # Encoder Section
        # TODO access the variables from the specific scope
        with tf.variable_scope(scope, default_name='bert'):
            # create embedding and get embedded input
            with tf.variable_scope('embeddings'):
                self.embedding = tf.get_variable(
                    'embedding', [config.vocab_size, config.embedding_size],
                    dtype=tf.float32)
                embedded_input = tf.nn.embedding_lookup(
                    self.embedding, self.input_ids)
            # add positional embedding
            embedded_input_pos = self._embedding_positional(
                config.pos_type,
                embedded_input,
                config.embedding_size,
                dropout_prob=config.dropout_prob)

            # Encoder Blocks
            with tf.variable_scope('encoder'):
                # get attention mask
                attention_mask = self._create_attention_mask(config.batch_size)
                # Multi-head, multi-layer Transformer
                sequence_output = self.transformer_model(
                    embedded_input_pos, config.batch_size,
                    config.encoder_layer, config.num_attention_heads,
                    config.forward_size, config.forward_ac,
                    config.hidden_dropout_prob,
                    config.attention_prob_dropout_prob, attention_mask,
                    config.attention_ac)

            # Decoder
            # Inherit this class for extensibility. Nothing is returned here:
            # to leave room for expansion after pre-training, avoid adding
            # too many variables in the constructor.
            with tf.variable_scope('decoder'):
                self._projection(sequence_output, config.vocab_size)

            if is_training:
                self._compute_loss(config.batch_size)
                self._update(config.learning_rate, config.decay_step,
                             config.lr_limit)
                self.train_summary = tf.summary.merge([
                    tf.summary.scalar('lr', self.learning_rate),
                    tf.summary.scalar('loss', self.loss_bs)
                ])
            else:
                self._infer()

            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
            _info('Finish Building Graph', head='INFO')
Example #22
  def model_fn(features, labels, mode, params):
    # features name and shape
    _info('*** Features ****')
    for name in sorted(features.keys()):
      tf.logging.info(' name = {}, shape = {}'.format(name, features[name].shape))

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # get data
    input_x = features['input_x']
    input_mask = features['input_mask']
    if is_training:
      input_y = features['input_y']
      seq_length = features['seq_length']
    else:
      input_y = None
      seq_length = None

    # build encoder
    model = BertEncoder(
      config=cg.BertEncoderConfig,
      is_training=is_training,
      input_ids=input_x,
      input_mask=input_mask)
    embedding_table = model.get_embedding_table()
    encoder_output = tf.reduce_sum(model.get_sequence_output(), axis=1)

    # build decoder
    decoder_model = Decoder(
      config=cg.DecoderConfig,
      is_training=is_training,
      encoder_state=encoder_output,
      embedding_table=embedding_table,
      decoder_intput_data=input_y,
      seq_length_decoder_input_data=seq_length)
    logits, sample_id, ppl_seq, ppl = decoder_model.get_decoder_output()

    if mode == tf.estimator.ModeKeys.PREDICT:
      predictions = {'sample_id': sample_id, 'ppls': ppl_seq}
      output_spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
      if mode == tf.estimator.ModeKeys.TRAIN:
        max_time = ft.get_shape_list(labels, expected_rank=2)[1]
        target_weights = tf.sequence_mask(seq_length, max_time, dtype=logits.dtype)
        batch_size = tf.cast(ft.get_shape_list(labels, expected_rank=2)[0], tf.float32)

        loss = tf.reduce_sum(
          tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits) * target_weights) / batch_size

        learning_rate = tf.train.polynomial_decay(cg.learning_rate,
                                          tf.train.get_or_create_global_step(),
                                          cg.train_steps / 100,
                                          end_learning_rate=1e-4,
                                          power=1.0,
                                          cycle=False)

        lr = tf.maximum(tf.constant(cg.lr_limit), learning_rate)
        optimizer = tf.train.AdamOptimizer(lr, name='optimizer')
        tvars = tf.trainable_variables()
        gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=cg.colocate_gradients_with_ops)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars), global_step=tf.train.get_global_step())


        # The logging hook prints these tensors every print_info_interval steps
        # (each step processes one batch); output_spec alone would only report
        # results at each checkpoint save.
        logging_hook = tf.train.LoggingTensorHook({'loss' : loss, 'ppl': ppl, 'lr': lr}, every_n_iter=cg.print_info_interval)

        output_spec = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])
      elif mode == tf.estimator.ModeKeys.EVAL:
        # TODO
        raise NotImplementedError
    
    return output_spec
Example #23
  Path(cg.save_model_path).mkdir(exist_ok=True)

  model_fn = model_fn_builder()

  gpu_config = tf.ConfigProto()
  gpu_config.gpu_options.allow_growth = True

  run_config = tf.contrib.tpu.RunConfig(
    session_config=gpu_config,
    keep_checkpoint_max=cg.keep_checkpoint_max,
    save_checkpoints_steps=cg.save_checkpoints_steps,
    model_dir=cg.save_model_path)

  estimator = tf.estimator.Estimator(model_fn, config=run_config)
  estimator.train(train_input_fn)

def package_model(ckpt_path, pb_path):
  model_fn = model_fn_builder()
  estimator = tf.estimator.Estimator(model_fn, ckpt_path)
  estimator.export_saved_model(pb_path, server_input_fn)

if __name__ == '__main__':
  parser = argparse.ArgumentParser(description=_info('python train.py [train | package]', head='USAGE:'))
  parser.add_argument('mode')
  
  args = parser.parse_args()
  mode = args.mode
  if mode == 'train':
    main()
  elif mode == 'package':
    package_model(cg.save_model_path, cg.pb_model_path)