class DataAnalysis(object):
    def __init__(self):
        self.data_utils = DataUtils()

    def word_analysis(self, data_filename):
        """
        Count word frequency
        :param data_filename:
        :return:
        """
        words_count = 0
        words_count_map = dict()
        with open(data_filename, encoding='utf-8', mode='rt') as data_file:
            for line in data_file:
                words, _ = self.data_utils.split(line)
                words_count += len(words)
                for word in words:
                    if word in words_count_map:
                        words_count_map[word] += 1
                    else:
                        words_count_map[word] = 1
        words_type_count = len(words_count_map)
        return words_count, words_type_count

    def length_analysis(self, data_filename):
        """
        Count sentence length
        :param data_filename:
        :return:
        """
        sentences_count = 0
        max_length = 0
        length_count_map = dict()
        with open(data_filename, encoding='utf-8', mode='rt') as data_file:
            for line in data_file:
                words, _ = self.data_utils.split(line)
                length = len(''.join(words))
                # length = len(''.join(line.strip().split()))
                if length in length_count_map:
                    length_count_map[length] += 1
                else:
                    length_count_map[length] = 1
                if length > max_length:
                    max_length = length
                sentences_count += 1
        if sentences_count == 0:
            return dict()
        statistic_result = dict()
        accumulative_count = 0
        for i in range(max_length + 1):
            if i in length_count_map:
                accumulative_count += length_count_map[i]
            if i != 0 and (i % 50 == 0 or i == max_length):
                statistic_result[i] = '%.2f' % (accumulative_count / sentences_count * 100)
        return statistic_result
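# Usage sketch (not part of the original source): assumes DataUtils is importable
# and that 'corpus.txt' is a placeholder path to a labeled corpus that
# DataUtils.split() can parse.
if __name__ == '__main__':
    analysis = DataAnalysis()
    total_words, distinct_words = analysis.word_analysis('corpus.txt')
    # Cumulative percentage of sentences per 50-character length bucket
    length_distribution = analysis.length_analysis('corpus.txt')
    print(total_words, distinct_words, length_distribution)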
def __init__(self):
    self.batch_size = FLAGS.batch_size
    self.num_steps = FLAGS.num_steps
    self.min_after_dequeue = FLAGS.min_after_dequeue
    self.num_threads = FLAGS.num_threads
    self.embedding_size = FLAGS.embedding_size
    self.data_utils = DataUtils()
    self.default_word_padding_id = self.data_utils._START_VOCAB_ID[0]
    self.default_label_padding_id = self.data_utils.load_default_label_id()
def create(self, splits=Config.DS_SPLIT, data_dir=Config.DATA_DIR):
    # ds sizes:
    #   test   4445
    #   train 44199
    #   val    4444
    self.data_dir = data_dir
    # Saves the categories of the labels
    self.categories = dict()
    # Does the data directory exist?
    if not self.data_dir.exists():
        sys.exit('No dataset for training found at the given data directory')
    Config.STYLES_USE_COLS.append('split')
    df = DataUtils.load_data_frame('adjusted_styles.csv')
    set_names = ['train', 'val', 'test']
    ids_by_split = [list(df[df['split'] == x].index) for x in set_names]
    df = df.drop('split', axis=1)
    for col in list(df.columns):
        df[col] = pd.Categorical(df[col])
        self.categories[col] = df[col].cat.categories
        df[col] = df[col].cat.codes
    self.df = df
    for set_name, indices in list(zip(set_names, ids_by_split)):
        ds = tf.data.Dataset.from_tensor_slices(tf.constant(indices))
        ds = ds.map(self._process_id, num_parallel_calls=AUTOTUNE)
        ds = self._prepare_for_training(set_name, ds)
        setattr(self, set_name, ds)
    return self
def write_calculate_result_data_to_excel(excel_file_path: str, question_data_list: list):
    print('Start writing calculation results to Excel')
    workbook = openpyxl.load_workbook(excel_file_path)
    row_index = 0
    question_mapping = {}
    for question in question_data_list:
        question_mapping[question.question_key] = question
    for row in workbook.worksheets[0].rows:
        if row[0].value is not None and row_index > 0:
            if row[0].value.strip() != '':
                question_key = row[3].value
                if question_key in question_mapping:
                    xdata = question_mapping[question_key].get_answer_data_str()
                    xdata2 = []
                    if ',' not in xdata:
                        for x in xdata:
                            xdata2.append(x)
                        workbook.worksheets[0].cell(
                            row_index + 1, 10,
                            DataUtils.parse_arr_data_to_comma_str_data(xdata2))
                    else:
                        workbook.worksheets[0].cell(
                            row_index + 1, 10,
                            question_mapping[question_key].get_answer_data_str())
                    workbook.worksheets[0].cell(
                        row_index + 1, 11,
                        question_mapping[question_key].get_editable_original_data_str())
        row_index = row_index + 1
    workbook.save(excel_file_path)
    print('Finished writing calculation results to Excel')
def __init__(self):
    self.vocab_path = FLAGS.vocab_path
    self.checkpoint_path = FLAGS.checkpoint_path
    self.freeze_graph_path = FLAGS.freeze_graph_path
    self.saved_model_path = FLAGS.saved_model_path
    self.use_crf = FLAGS.use_crf
    self.num_steps = FLAGS.num_steps
    self.default_label = FLAGS.default_label
    self.default_score = FLAGS.default_predict_score
    self.data_utils = DataUtils()
    self.tensorflow_utils = TensorflowUtils()
    self.num_classes = self.data_utils.get_vocabulary_size(os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
    self.sequence_labeling_model = SequenceLabelingModel()
    self.init_predict_graph()
def __init__(self):
    self.tfrecords_path = FLAGS.tfrecords_path
    self.checkpoint_path = FLAGS.checkpoint_path
    self.tensorboard_path = FLAGS.tensorboard_path
    self.use_crf = FLAGS.use_crf
    self.learning_rate = FLAGS.learning_rate
    self.learning_rate_decay_factor = FLAGS.learning_rate_decay_factor
    self.decay_steps = FLAGS.decay_steps
    self.clip_norm = FLAGS.clip_norm
    self.max_training_step = FLAGS.max_training_step
    self.train_tfrecords_filename = os.path.join(self.tfrecords_path, 'train.tfrecords')
    self.test_tfrecords_filename = os.path.join(self.tfrecords_path, 'test.tfrecords')
    self.data_utils = DataUtils()
    self.num_classes = self.data_utils.get_vocabulary_size(
        os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
    self.tensorflow_utils = TensorflowUtils()
    self.sequence_labeling_model = SequenceLabelingModel()
def _process_id(self, id):
    label = tf.py_function(func=self._get_label, inp=[id], Tout=tf.float32)
    file_path = tf.strings.join([
        str(Config.DATA_DIR), '/images/',
        tf.strings.as_string(id), '.jpg'
    ])
    # Load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    img = DataUtils.decode_img(img)
    # Set shape manually bc tensor is returned by a py_func
    label.set_shape([39])
    return img, label
def __init__(self, args):
    self.args = args
    # Loading data
    self.data = pd.read_hdf(self.args.hdf_file + '.hdf')
    self.data.drop_duplicates(subset=['url'], inplace=True)
    self.max_features = 200
    self.batch_size = 32
    # Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`;
    # (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
    tqdm.pandas(desc="my bar")
    # cut texts after this number of words (among top max_features most common words)
    self.maxlen = max(self.data['url'].progress_map(lambda x: len(x))) + 1
    # Fix the seed
    self.seed = 21
    np.random.seed(self.seed)
    # Splitting data into train, test & validation sets
    self.x_train, self.x_val_test, self.y_train, self.y_val_test = train_test_split(
        self.data['url'].values, self.data['label'], test_size=.33,
        random_state=self.seed, stratify=self.data['label'])
    self.x_val, self.x_test, self.y_val, self.y_test = train_test_split(
        self.x_val_test, self.y_val_test, test_size=.5,
        random_state=self.seed, stratify=self.y_val_test)
    print('\n*************** Data statistics ****************')
    datautils = DataUtils(self.args)
    datautils.data_Stats(self.y_train, self.y_val, self.y_test)
def train(args):
    """
    Training using pipeline module
    :param args:
    :return:
    """
    # Preparing dataset
    data = DataUtils(args)
    # Generating & saving dataframe from raw event folder
    if args.hdf_file:
        print('Raw data files will not be processed.\n')
        print('{} file present.'.format(args.hdf_file))
        pass
    elif args.raw_data_dir:
        print('Processing Raw data files.\n')
        print('A hdf file will be generated for further exploration.')
        data.load_txt_files()
    data = data.prepare_data()
    # Creating pipelines
    model_pipe = Models(args)
    model_pipelined, model_name = model_pipe.model_pipeline()
    print('Pipeline created for model {}'.format(model_name))
    # Running a model pipeline
    model = ModelRunPipeline(args, model_pipelined, data)
    # Multiprocessing to spawn processes using an API similar to threading module
    proc = Process(target=model.run_pipeline, args=())
    proc.start()
    proc.join()
    print('\n\n****************** Classification done. Enjoy Life. :) *******************')
def load_data(year: str, stat_type: str):
    url = DATA_SOURCE_URL + str(year).split("-")[1] + stat_type + ".html"
    html = pd.read_html(url, header=0)
    data_frame = html[0]
    raw = data_frame.drop(data_frame[data_frame.Age == "Age"].index)  # Deletes repeating headers in content
    raw = raw.fillna(0)
    playerstats = raw.drop(["Rk"], axis=1)
    aggregation_functions = {}
    columns = DataUtils.get_columns(playerstats)
    DataUtils.convert_columns_to_numeric(playerstats, DataUtils.get_numeric_columns(columns))
    aggregation_functions = DataUtils.get_aggregate_functions(columns)
    playerstats = playerstats.groupby(playerstats["Player"]).aggregate(aggregation_functions)
    playerstats["FanPoints"] = DataUtils.calculate_fan_points(playerstats)
    playerstats.sort_values(["FanPoints"], ascending=False, inplace=True)
    playerstats.insert(0, "rank", np.arange(start=1, stop=len(playerstats) + 1))
    return playerstats
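# Usage sketch (not part of the original source): the "YYYY-YYYY" season string
# matches how load_data splits the year; the "_per_game" stat_type suffix is an
# assumption about the Basketball-reference URL layout, not taken from the code.
if __name__ == '__main__':
    stats = load_data('2022-2023', '_per_game')
    print(stats.head(10))  # top players ranked by the computed FanPoints column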
def run(self):
    self.data_loader_utils = DataLoaderUtils(self.load_config.server, self.index, self.type,
                                             self.load_config.server_username,
                                             self.load_config.server_password)
    self.data_utils = DataUtils()
    count = 0
    bulk_data = ''
    ids_to_fetch = self.data_loader_batch.keys()
    self.load_config.log(LOG_LEVEL_TRACE, 'Fetching docs', self.load_config.server, self.index, self.type)
    self.data_utils.batch_fetch_docs_for_ids(self.load_config.server, ids_to_fetch, self.index,
                                             self.type, self.docs_fetched,
                                             self.load_config.doc_fetch_batch_size,
                                             self.load_config.server_username,
                                             self.load_config.server_password)
    for _id in self.existing_docs:
        relations = self.data_loader_batch[_id]
        existing_doc = self.existing_docs[_id]
        doc = {}
        updates = []
        # Update relations
        for relation in relations:
            dest_index_id = relation['index_id']
            dest_ids = relation['ids']
            relationship_type = relation['type']
            ids_to_remove = []
            if 'ids_to_remove' in relation:
                ids_to_remove = relation['ids_to_remove']
            self.load_config.log(LOG_LEVEL_TRACE, self.index, relationship_type, dest_index_id, len(dest_ids))
            existing_doc = self.load_config.data_mapper.update_relations_for_doc(
                _id, existing_doc, dest_ids, self.source, dest_index_id,
                relation_type=relationship_type, append=self.append,
                ids_to_remove=ids_to_remove)
            doc[relationship_type] = existing_doc[relationship_type]
            updates.append({
                'index_id': dest_index_id,
                'source': self.source,
                'added_ids': dest_ids,
                'removed_ids': ids_to_remove,
                'relation_type': relationship_type
            })
        # Relations updates
        relations_updates = []
        if 'relations_updates' in existing_doc:
            relations_updates = existing_doc['relations_updates']
        update_item = {
            'update_source': self.data_source_name,
            'update_date': self.updated_date,
            'updates': updates
        }
        relations_updates.append(update_item)
        doc['relations_updates'] = relations_updates
        if self.load_config.test_mode and count % 2500 == 0:
            # print 'Existing doc id', _id
            self.load_config.log(LOG_LEVEL_INFO, 'Data', relations)
            self.load_config.log(LOG_LEVEL_INFO, 'Updated doc', doc)
        if len(doc) > 0:
            bulk_update_header = self.data_loader_utils.bulk_update_header(_id)
            self.load_config.log(LOG_LEVEL_TRACE, 'bulk update header:', bulk_update_header)
            self.load_config.log(LOG_LEVEL_TRACE, 'bulk data', doc)
            bulk_data += bulk_update_header
            bulk_data += '\n'
            doc = {
                'doc': doc
            }
            bulk_data += json.dumps(doc)
            bulk_data += '\n'
        count += 1
        if count % 50 == 0:
            self.load_config.log(LOG_LEVEL_DEBUG, 'Processed docs', count, os.getpid(), self.index, _id)
        if len(bulk_data) >= self.load_config.bulk_data_size:
            self.load_bulk_data(bulk_data)
            bulk_data = ''
    if len(bulk_data) > 0:
        self.load_bulk_data(bulk_data)
    # logger.log(1, 'Process completed, saving loaded ids.........................')
    if not self.load_config.test_mode:
        self.save_summary(ids_to_fetch)
def __init__(self, hdfs_client, flags):
    self.train_is_alive = False
    self.hdfs_client = hdfs_client
    self.flags = flags
    self.data_utils = DataUtils()
class Segmenter(object):
    def __init__(self, hdfs_client, flags):
        self.train_is_alive = False
        self.hdfs_client = hdfs_client
        self.flags = flags
        self.data_utils = DataUtils()

    def update_config(self):
        config_path = os.path.join(self.flags.raw_data_path, 'config.json')
        try:
            with open(config_path, encoding='utf-8', mode='r') as data_file:
                config_json = json.load(data_file)
                # Check each config key independently so that every option
                # present in config.json is applied, not just the first match.
                if 'use_lstm' in config_json:
                    self.flags.use_lstm = config_json['use_lstm']
                if 'use_dynamic_rnn' in config_json:
                    self.flags.use_dynamic_rnn = config_json['use_dynamic_rnn']
                if 'use_bidirectional_rnn' in config_json:
                    self.flags.use_bidirectional_rnn = config_json['use_bidirectional_rnn']
                if 'vocab_drop_limit' in config_json:
                    self.flags.vocab_drop_limit = config_json['vocab_drop_limit']
                if 'batch_size' in config_json:
                    self.flags.batch_size = config_json['batch_size']
                if 'num_steps' in config_json:
                    self.flags.num_steps = config_json['num_steps']
                if 'num_layer' in config_json:
                    self.flags.num_layer = config_json['num_layer']
                if 'embedding_size' in config_json:
                    self.flags.embedding_size = config_json['embedding_size']
                if 'learning_rate' in config_json:
                    self.flags.learning_rate = config_json['learning_rate']
                if 'learning_rate_decay_factor' in config_json:
                    self.flags.learning_rate_decay_factor = config_json['learning_rate_decay_factor']
                if 'keep_prob' in config_json:
                    self.flags.keep_prob = config_json['keep_prob']
                if 'clip_norm' in config_json:
                    self.flags.clip_norm = config_json['clip_norm']
        except:
            raise Exception('ERROR: config.json content invalid')

    def train(self):
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'train.txt'),
            os.path.join(self.flags.datasets_path, 'train.txt'))
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'test.txt'))
        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'train.txt'),
            os.path.join(self.flags.datasets_path, 'label_train.txt'))
        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'label_test.txt'))
        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_train.txt'),
            os.path.join(self.flags.datasets_path, 'split_train.txt'))
        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_test.txt'),
            os.path.join(self.flags.datasets_path, 'split_test.txt'))
        words_vocab, labels_vocab = self.data_utils.create_vocabulary(
            os.path.join(self.flags.datasets_path, 'split_train.txt'),
            self.flags.vocab_path, self.flags.vocab_drop_limit)
        train_word_ids_list, train_label_ids_list = self.data_utils.file_to_word_ids(
            os.path.join(self.flags.datasets_path, 'split_train.txt'),
            words_vocab, labels_vocab)
        test_word_ids_list, test_label_ids_list = self.data_utils.file_to_word_ids(
            os.path.join(self.flags.datasets_path, 'split_test.txt'),
            words_vocab, labels_vocab)
        tensorflow_utils = TensorflowUtils()
        tensorflow_utils.create_record(
            train_word_ids_list, train_label_ids_list,
            os.path.join(self.flags.tfrecords_path, 'train.tfrecords'))
        tensorflow_utils.create_record(
            test_word_ids_list, test_label_ids_list,
            os.path.join(self.flags.tfrecords_path, 'test.tfrecords'))
        self.hdfs_client.hdfs_upload(
            self.flags.vocab_path,
            os.path.join(self.flags.output_path, os.path.basename(self.flags.vocab_path)))
        train = Train()
        train.train()

    def upload_tensorboard(self):
        hdfs_tensorboard_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.tensorboard_path)))
        temp_hdfs_tensorboard_path = hdfs_tensorboard_path + '-temp'
        self.hdfs_client.hdfs_upload(self.flags.tensorboard_path, temp_hdfs_tensorboard_path)
        self.hdfs_client.hdfs_delete(hdfs_tensorboard_path)
        self.hdfs_client.hdfs_mv(temp_hdfs_tensorboard_path, hdfs_tensorboard_path)

    def log_monitor(self):
        while self.train_is_alive:
            time.sleep(120)
            self.upload_tensorboard()

    def upload_model(self):
        predict = Predict()
        predict.saved_model_pb()
        hdfs_checkpoint_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.checkpoint_path)))
        hdfs_saved_model_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.saved_model_path)))
        temp_hdfs_checkpoint_path = hdfs_checkpoint_path + '-temp'
        temp_hdfs_saved_model_path = hdfs_saved_model_path + '-temp'
        self.hdfs_client.hdfs_upload(self.flags.checkpoint_path, temp_hdfs_checkpoint_path)
        self.hdfs_client.hdfs_upload(self.flags.saved_model_path, temp_hdfs_saved_model_path)
        self.hdfs_client.hdfs_delete(hdfs_checkpoint_path)
        self.hdfs_client.hdfs_delete(hdfs_saved_model_path)
        self.hdfs_client.hdfs_mv(temp_hdfs_checkpoint_path, hdfs_checkpoint_path)
        self.hdfs_client.hdfs_mv(temp_hdfs_saved_model_path, hdfs_saved_model_path)

    def evaluate(self):
        shutil.rmtree(self.flags.vocab_path)
        shutil.rmtree(self.flags.checkpoint_path)
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, os.path.basename(self.flags.vocab_path)),
            self.flags.vocab_path)
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'test.txt'))
        hdfs_checkpoint_path = os.path.join(
            self.flags.input_path, os.path.basename(self.flags.checkpoint_path))
        self.hdfs_client.hdfs_download(hdfs_checkpoint_path, self.flags.checkpoint_path)
        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'label_test.txt'))
        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_test.txt'),
            os.path.join(self.flags.datasets_path, 'split_test.txt'))
        predict = Predict()
        predict.file_predict(
            os.path.join(self.flags.datasets_path, 'split_test.txt'),
            os.path.join(self.flags.datasets_path, 'test_predict.txt'))
        self.model_evaluate = Evaluate()
        self.model_evaluate.evaluate(
            os.path.join(self.flags.datasets_path, 'test_predict.txt'),
            os.path.join(self.flags.datasets_path, 'test_evaluate.txt'))
        self.hdfs_client.hdfs_delete(
            os.path.join(self.flags.output_path, 'test_evaluate.txt'))
        self.hdfs_client.hdfs_upload(
            os.path.join(self.flags.datasets_path, 'test_evaluate.txt'),
            os.path.join(self.flags.input_path, 'test_evaluate.txt'))
class DataHelpers(object):
    def __init__(self):
        # Class Object Initialization.
        self.conf = Configuration()
        self.genutil = GeneralUtils()
        self.du = DataUtils()

    def load_word_embeddings_compact(self, embedding_dim, vocab_set, masking=False, use_pickled=True):
        vocab_list = list(vocab_set)
        if masking:
            masking_value = "masked"  # For masked embedding weights leave it blank "", else for masked use "_non_masked"
            start_index = 1  # Leaves the 0-index free of any data.
        else:
            masking_value = "non_masked"
            start_index = 0  # Stores the embedding weights from the zero'th index itself.
        # Dataset sources file paths
        embedding_weights_file_path = self.conf.embedding_weights_file_tpl.format(masking_value)
        if not use_pickled:
            print("Loading Word Embeddings into memory ... ")
            word_vector_dict = {}
            j = 0
            with open(self.conf.word_vectors_file, "r") as fopen:
                for line in fopen:
                    j += 1
                    try:
                        components = line.strip().split()
                        if not len(components) < embedding_dim:
                            if j % 1000000 == 0:
                                print("Parsing word vector file ... {}".format(j))
                            word = components[0]
                            if word in vocab_set:
                                vec = np.asarray([float(x) for x in components[1:embedding_dim + 1]])
                                word_vector_dict[word] = vec
                    except Exception as e:
                        print("Exception Encountered: {}".format(e))
            print("Word Embeddings added to word_vector_dict.")
            # Adding the word vectors from the input datasets which are not in the word vector file.
            # Word Vectors are drawn at random from a uniform distribution(-0.25, 0.25)
            # adding 1 to account for 0th index (for masking) [Number of word:vector pairs is 7115783]
            n_symbols = len(vocab_list)
            embedding_weights = np.zeros((n_symbols + 1, embedding_dim))
            for i, word_k in enumerate(vocab_list, start=start_index):
                if word_k in word_vector_dict:
                    embedding_weights[i, :] = word_vector_dict[word_k]
                else:
                    embedding_weights[i, :] = np.random.uniform(-0.25, 0.25, embedding_dim)
            print("Added Random Vectors for the unseen words in the corpus. Current value of i: {}".format(i))
            if self.conf.create_data_dump:
                print("Dumping embedding weights and index_dict to disk as pickled files ....")
                joblib.dump(embedding_weights, embedding_weights_file_path)
                print('Finished: Dumping index_dict and embedding_weights to disk.')
            return embedding_weights
        else:
            print('Loading Word Embeddings: index_dict and embeddings weights from disk ... ')
            embedding_weights = joblib.load(embedding_weights_file_path)
            print("Word Embedding pickled files loaded into memory!")
            return embedding_weights

    def generate_vocabulary_set(self, masking=False):
        # Load data from files
        print("Generating Vocabulary set from Input Data file (s): {}".format(self.conf.input_file_list))
        vocab_index_dict = {}
        vocab_set = set()
        # Adding a padding word, unknown word and space
        vocab_set.add('<PAD/>')
        vocab_set.add('<UNK/>')
        vocab_set.add(' ')
        # Data-Set Line Format: {'q': query, 'doc_corr': correct_url_doc, 'doc_incorr': incorrect_doc_list}
        less_doc_cnt = 0
        for model_training_data_file in self.conf.input_file_list:
            with open(model_training_data_file) as fo:
                for line in fo:
                    data = json.loads(line)
                    if len(data['doc_incorr']) == self.conf.num_negative_examples:
                        s_list = []
                        s_list.append(data['q'])
                        s_list.append(data['doc_corr'])
                        s_list += data['doc_incorr']
                        x_vocab = self.du.build_vocab(
                            self.du.get_text_feature_splits(s_list, mode=self.conf.feature_level))
                        for i in x_vocab:
                            if i not in vocab_set:
                                # print "Vocab_Entity: {}".format(i.encode('utf-8'))
                                vocab_set.add(i)
                    else:
                        less_doc_cnt += 1
        if masking:
            i = 0
            masking_value = "masked"  # For masked embedding weights leave it blank "", else for masked use "_non_masked"
        else:
            i = -1
            masking_value = "non_masked"
        for word in vocab_set:
            i += 1
            vocab_index_dict[word] = i
        if self.conf.create_data_dump:
            print("Dumping Vocabulary Set and Index - dict to Disk!")
            joblib.dump(vocab_set, self.conf.vocab_set_file.format(masking_value))
            joblib.dump(vocab_index_dict, self.conf.vocab_index_file.format(masking_value))
        return vocab_set, vocab_index_dict

    def load_data_generator(self, vocab_index_dict, mode=None, batch_size=128, nb_epochs=1):
        """
        Loads MR polarity data from files, splits the data into words and generates labels.
        Returns split sentences and labels.
        """
        if mode is None:
            raise Exception("Please provide mode as either 'training' or 'validation'")
        input_dataset_file = ""
        if mode == "training":
            input_dataset_file = self.conf.model_training_data
        elif mode == "validation":
            input_dataset_file = self.conf.model_validation_data
        # print "\nLoading Model Training Data: {}\n".format(input_dataset_file)
        # Data-Set Line Format: {'q': query, 'doc_corr': correct_url_doc, 'doc_incorr': incorrect_doc_list}
        for epoch in range(0, nb_epochs + 2):
            less_doc_cnt = 0
            with open(input_dataset_file, 'r') as fin:
                while True:
                    batch_rows = list(islice(fin, batch_size))
                    if not batch_rows:
                        break
                    batch_query_data = np.empty(shape=(0, 0), dtype=np.int32)
                    batch_pos_query_data = np.empty(shape=(0, 0), dtype=np.int32)
                    batch_neg_query_data = [np.empty(shape=(0, 0), dtype=np.int32)
                                            for _ in range(0, self.conf.num_negative_examples)]
                    for line in batch_rows:
                        data = json.loads(line)
                        if len(data['doc_incorr']) == self.conf.num_negative_examples:
                            input_data_list = [[data['q']], [data['doc_corr']], data['doc_incorr']]
                            # Build Input Data
                            for n, x in enumerate(input_data_list):
                                if n == 0:
                                    for i in range(0, len(x)):
                                        x_array = self.du.build_input_data(
                                            self.du.pad_sentences(
                                                self.du.get_text_feature_splits(
                                                    x[i], cutoff=self.conf.query_length,
                                                    mode=self.conf.feature_level),
                                                self.conf.query_length),
                                            vocab_index_dict, return_array=True)
                                        if batch_query_data.shape[0] == 0:
                                            batch_query_data = x_array
                                        else:
                                            batch_query_data = np.vstack((batch_query_data, x_array))
                                elif n == 1:
                                    for i in range(0, len(x)):
                                        x_array = self.du.build_input_data(
                                            self.du.pad_sentences(
                                                self.du.get_text_feature_splits(
                                                    x[i], cutoff=self.conf.document_length,
                                                    mode=self.conf.feature_level),
                                                self.conf.document_length),
                                            vocab_index_dict, return_array=True)
                                        if batch_pos_query_data.shape[0] == 0:
                                            batch_pos_query_data = x_array
                                        else:
                                            batch_pos_query_data = np.vstack((batch_pos_query_data, x_array))
                                elif n == 2:
                                    for i in range(0, len(x)):
                                        x_array = self.du.build_input_data(
                                            self.du.pad_sentences(
                                                self.du.get_text_feature_splits(
                                                    x[i], cutoff=self.conf.document_length,
                                                    mode=self.conf.feature_level),
                                                self.conf.document_length),
                                            vocab_index_dict, return_array=True)
                                        if batch_neg_query_data[i].shape[0] == 0:
                                            batch_neg_query_data[i] = x_array
                                        else:
                                            batch_neg_query_data[i] = np.vstack((batch_neg_query_data[i], x_array))
                        else:
                            less_doc_cnt += 1
                    batch_y_data = np.ones(len(batch_query_data))
                    yield [batch_query_data, batch_pos_query_data] + batch_neg_query_data, batch_y_data
        # print "Number of skipped data points: Incorrect Documents in Training Data (< 3): {}".format(less_doc_cnt)

    def get_vocab_index_embedding_weights(self, embedding_dim, embedding_weights_masking,
                                          load_embeddings_pickled=False, load_vocab_pickled=False):
        embedding_weights = []
        if embedding_weights_masking == True:
            masking_value = "masked"  # For masked embedding weights leave it blank "", else for masked use "_non_masked"
        else:
            masking_value = "non_masked"
        # Load data from files
        if load_vocab_pickled:
            vocab_index_dict = joblib.load(self.conf.vocab_index_file.format(masking_value))
            vocab_set = joblib.load(self.conf.vocab_set_file.format(masking_value))
        else:
            vocab_set, vocab_index_dict = self.generate_vocabulary_set(masking=embedding_weights_masking)
        if self.conf.feature_level == "word":
            embedding_weights = self.load_word_embeddings_compact(
                embedding_dim, vocab_set,
                masking=embedding_weights_masking,
                use_pickled=load_embeddings_pickled)
        return embedding_weights, vocab_index_dict
def __init__(self):
    # Class Object Initialization.
    self.conf = Configuration()
    self.genutil = GeneralUtils()
    self.du = DataUtils()
    default=None)
args = parser.parse_args()

if args.train:
    if not Config.LOG_DIR.exists():
        Config.LOG_DIR.mkdir(parents=True)
    if not Config.CHECKPOINT_DIR.exists():
        Config.CHECKPOINT_DIR.mkdir(parents=True)
    model = Model(DataSet().create())
    model.fit()

if args.eval:
    model = Model(DataSet().create(), args.load_model)
    history = model.eval()
    print('hello')

if args.adjust_data:
    DataUtils.adjust_data()

if args.analyze_data:
    if not Config.VIZ_RESULTS_DIR.exists():
        Config.VIZ_RESULTS_DIR.mkdir()
    Evaluation()

if args.predict_images is not None:
    model = Model(DataSet(), args.load_model)
    images = DataUtils.load_all_images(args.predict_images)
    predictions = model.predict(images)
    for img, pred in zip(images, predictions):
        plt.figure(figsize=(20, 20))
        plt.imshow(img)
        plt.title(pred)
        plt.show()
st.title("Kantina Basketball Association") image = Image.open("kba_logo.jpg") st.image(image, use_column_width=True) st.markdown("""### NBA player statistics and yahoo fanpoints for KBA league. * **Data source:** [Basketball-reference.com](https://www.basketball-reference.com/) """) st.sidebar.header("Filtering") selected_year = st.sidebar.selectbox( "Year", list( reversed([ str(year) + "-" + str(year + 1) for year in range(1980, DataUtils.get_season_year()) ])), ) selected_category = st.sidebar.selectbox("Stats", ("Avg", "Total")) @st.cache def load_data(year: str, stat_type: str): url = DATA_SOURCE_URL + str(year).split("-")[1] + stat_type + ".html" html = pd.read_html(url, header=0) data_frame = html[0] raw = data_frame.drop(data_frame[ data_frame.Age == "Age"].index) # Deletes repeating headers in content raw = raw.fillna(0) playerstats = raw.drop(["Rk"], axis=1) aggregation_functions = {}
#!/usr/bin/python
# -*- coding: UTF-8 -*-

from utils.data_utils import DataUtils
from data_helpers import DataHelpers

du = DataUtils()
dh = DataHelpers()

train_data = "statue of liberty"
train_data_list = ["statue of liberty", "new york"]

# print du.get_text_feature_splits(train_data, mode='word')
# print du.get_text_feature_splits(train_data_list, mode='word')
# print du.get_text_feature_splits(train_data, mode='char')
# print du.get_text_feature_splits(train_data_list, mode='char')
#
# print du.get_text_feature_splits(train_data, mode='char', cutoff=5)
# print du.get_text_feature_splits(train_data_list, mode='char', cutoff=5)

# print "ngram ..."
# print du.get_text_feature_splits(train_data, mode='ngram')
# print du.get_text_feature_splits(train_data_list, mode='ngram')
# print du.get_text_feature_splits(train_data, mode='ngram', cutoff=5)
# print du.get_text_feature_splits(train_data_list, mode='ngram', cutoff=5)

dh.generate_vocabulary_set()
class Train(object):
    def __init__(self):
        self.tfrecords_path = FLAGS.tfrecords_path
        self.checkpoint_path = FLAGS.checkpoint_path
        self.tensorboard_path = FLAGS.tensorboard_path
        self.use_crf = FLAGS.use_crf
        self.learning_rate = FLAGS.learning_rate
        self.learning_rate_decay_factor = FLAGS.learning_rate_decay_factor
        self.decay_steps = FLAGS.decay_steps
        self.clip_norm = FLAGS.clip_norm
        self.max_training_step = FLAGS.max_training_step
        self.train_tfrecords_filename = os.path.join(self.tfrecords_path, 'train.tfrecords')
        self.test_tfrecords_filename = os.path.join(self.tfrecords_path, 'test.tfrecords')
        self.data_utils = DataUtils()
        self.num_classes = self.data_utils.get_vocabulary_size(
            os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
        self.tensorflow_utils = TensorflowUtils()
        self.sequence_labeling_model = SequenceLabelingModel()

    def train(self):
        """
        Train the BiLSTM + CRF model
        :return:
        """
        train_data = self.tensorflow_utils.read_and_decode(self.train_tfrecords_filename)
        train_batch_features, train_batch_labels, train_batch_features_lengths = train_data
        test_data = self.tensorflow_utils.read_and_decode(self.test_tfrecords_filename)
        test_batch_features, test_batch_labels, test_batch_features_lengths = test_data
        with tf.device('/cpu:0'):
            global_step = tf.Variable(0, name='global_step', trainable=False)
            # Decay the learning rate exponentially based on the number of steps.
            lr = tf.train.exponential_decay(self.learning_rate, global_step, self.decay_steps,
                                            self.learning_rate_decay_factor, staircase=True)
            optimizer = tf.train.RMSPropOptimizer(lr)
            with tf.variable_scope('model'):
                logits = self.sequence_labeling_model.inference(
                    train_batch_features, train_batch_features_lengths,
                    self.num_classes, is_training=True)
                train_batch_labels = tf.to_int64(train_batch_labels)
                if self.use_crf:
                    loss, transition_params = self.sequence_labeling_model.crf_loss(
                        logits, train_batch_labels, train_batch_features_lengths, self.num_classes)
                else:
                    slice_logits, slice_train_batch_labels = self.sequence_labeling_model.slice_seq(
                        logits, train_batch_labels, train_batch_features_lengths)
                    loss = self.sequence_labeling_model.loss(slice_logits, slice_train_batch_labels)
            with tf.variable_scope('model', reuse=True):
                accuracy_logits = self.sequence_labeling_model.inference(
                    test_batch_features, test_batch_features_lengths,
                    self.num_classes, is_training=False)
                test_batch_labels = tf.to_int64(test_batch_labels)
                if self.use_crf:
                    accuracy = self.sequence_labeling_model.crf_accuracy(
                        accuracy_logits, test_batch_labels, test_batch_features_lengths,
                        transition_params, self.num_classes)
                else:
                    slice_accuracy_logits, slice_test_batch_labels = self.sequence_labeling_model.slice_seq(
                        accuracy_logits, test_batch_labels, test_batch_features_lengths)
                    accuracy = self.sequence_labeling_model.accuracy(
                        slice_accuracy_logits, slice_test_batch_labels)
            # summary
            tf.summary.scalar('loss', loss)
            tf.summary.scalar('accuracy', accuracy)
            tf.summary.scalar('lr', lr)
            # compute and update gradient
            # train_op = optimizer.minimize(loss, global_step=global_step)
            # compute, clip and update gradient
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            clip_gradients, _ = tf.clip_by_global_norm(gradients, self.clip_norm)
            train_op = optimizer.apply_gradients(zip(clip_gradients, variables),
                                                 global_step=global_step)
            init_op = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=None)
            checkpoint_filename = os.path.join(self.checkpoint_path, 'model.ckpt')
            with tf.Session() as sess:
                summary_op = tf.summary.merge_all()
                writer = tf.summary.FileWriter(self.tensorboard_path, sess.graph)
                sess.run(init_op)
                ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
                if ckpt and ckpt.model_checkpoint_path:
                    print('Continue training from the model {}'.format(ckpt.model_checkpoint_path))
                    saver.restore(sess, ckpt.model_checkpoint_path)
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord, sess=sess)
                max_accuracy = 0.0
                min_loss = 100000000.0
                try:
                    while not coord.should_stop():
                        _, loss_value, step = sess.run([train_op, loss, global_step])
                        if step % 100 == 0:
                            accuracy_value, summary_value, lr_value = sess.run([accuracy, summary_op, lr])
                            china_tz = pytz.timezone('Asia/Shanghai')
                            current_time = datetime.datetime.now(china_tz)
                            print('[{}] Step: {}, loss: {}, accuracy: {}, lr: {}'.format(
                                current_time, step, loss_value, accuracy_value, lr_value))
                            if accuracy_value > max_accuracy and loss_value < min_loss:
                                writer.add_summary(summary_value, step)
                                data_clean.clean_checkpoint(self.checkpoint_path)
                                saver.save(sess, checkpoint_filename, global_step=step)
                                print('save model to %s-%d' % (checkpoint_filename, step))
                                max_accuracy = accuracy_value
                                min_loss = loss_value
                        if step >= self.max_training_step:
                            print('Done training after %d steps' % step)
                            break
                except tf.errors.OutOfRangeError:
                    print('Done training after reading all data')
                finally:
                    coord.request_stop()
                coord.join(threads)
class DataLoader(object):
    def __init__(self, load_config, data_loader_batch, _index, _type, data_source_batch_name=None):
        self.load_config = load_config
        self.data_loader_batch = data_loader_batch
        self.index = _index
        self.type = _type
        self.data_source_batch_directory = self.load_config.data_source_batch_directory(data_source_batch_name)
        self.failed_docs_directory = self.load_config.failed_docs_directory(data_source_batch_name)
        self.loaded_docs_directory = self.load_config.loaded_docs_directory(data_source_batch_name)
        self.bulk_update_response_directory = self.load_config.bulk_update_response_directory(data_source_batch_name)
        self.existing_docs = {}
        self.failed_docs = {}
        self.updated_ids = {}
        self.indexed_ids = {}
        self.allow_doc_creation = self.load_config.data_mapper.allow_doc_creation(self.load_config.data_source_name)
        self.create_only = self.load_config.data_mapper.create_only(self.load_config.data_source_name)
        self.data_loader_utils = None
        self.data_utils = None

    def get_es_id(self, doc_id):
        return self.load_config.data_mapper.get_es_id(doc_id)

    def get_doc_id(self, es_id):
        return self.load_config.data_mapper.get_doc_id(es_id)

    def docs_fetched(self, docs, index, type):
        self.load_config.log(LOG_LEVEL_TRACE, 'Docs fetched', len(docs))
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                self.existing_docs[_id] = existing_doc

    def run(self):
        self.data_loader_utils = DataLoaderUtils(self.load_config.server, self.index, self.type,
                                                 self.load_config.server_username,
                                                 self.load_config.server_password)
        self.data_utils = DataUtils()
        count = 0
        bulk_data = ''
        ids_to_load = self.data_loader_batch.keys()
        if not self.create_only:
            # Create ids to fetch
            ids_to_fetch = []
            for _id in ids_to_load:
                es_id = self.get_es_id(_id)
                ids_to_fetch.append(es_id)
            # Fetch ids
            self.load_config.log(LOG_LEVEL_TRACE, 'Fetching docs', self.load_config.server, self.index, self.type)
            self.data_utils.batch_fetch_docs_for_ids(self.load_config.server, ids_to_fetch, self.index,
                                                     self.type, self.docs_fetched,
                                                     self.load_config.doc_fetch_batch_size,
                                                     self.load_config.server_username,
                                                     self.load_config.server_password)
        for _id in ids_to_load:
            data_for_id = self.data_loader_batch[_id]
            es_id = self.get_es_id(_id)
            if es_id in self.existing_docs:
                # Update doc
                existing_doc = self.existing_docs[es_id]
                doc = self.load_config.data_mapper.update_doc(
                    existing_doc=existing_doc, _id=_id,
                    data_source_name=self.load_config.data_source_name,
                    data=data_for_id)
                if self.load_config.test_mode and count % 2500 == 0:
                    # print 'Existing doc', self.load_manager.data_mapper.extract_fields_from_existing_doc(existing_doc)
                    self.load_config.log(LOG_LEVEL_INFO, 'Data', data_for_id)
                    self.load_config.log(LOG_LEVEL_INFO, '--------------------------------------------------------')
                    self.load_config.log(LOG_LEVEL_INFO, 'Updated doc', doc)
                if len(doc) > 0:
                    bulk_data += self.data_loader_utils.bulk_update_header(es_id)
                    bulk_data += '\n'
                    doc = {'doc': doc}
                    bulk_data += json.dumps(doc)
                    bulk_data += '\n'
                else:
                    self.add_to_failed_docs(_id, data_for_id, 'Data mapper: update doc returned empty')
            elif self.allow_doc_creation:
                # Create new doc
                doc = self.load_config.data_mapper.create_doc(
                    _id=_id, data_source_name=self.load_config.data_source_name,
                    data=data_for_id)
                if self.load_config.test_mode and count % 2500 == 0:
                    self.load_config.log(LOG_LEVEL_INFO, 'Data', data_for_id)
                    self.load_config.log(LOG_LEVEL_INFO, '--------------------------------------------------------')
                    self.load_config.log(LOG_LEVEL_INFO, 'Updated doc', doc)
                if len(doc) > 0:
                    bulk_data += self.data_loader_utils.bulk_index_header(es_id)
                    bulk_data += '\n'
                    bulk_data += json.dumps(doc)
                    bulk_data += '\n'
                else:
                    self.add_to_failed_docs(_id, data_for_id, 'Data mapper: create doc returned empty')
            else:
                self.add_to_failed_docs(_id, data_for_id, 'Update failed: existing doc not found')
            count += 1
            if count % 500 == 0:
                self.load_config.log(LOG_LEVEL_DEBUG, 'Processed', count, 'docs')
            if len(bulk_data) >= self.load_config.bulk_data_size:
                self.load_bulk_data(bulk_data)
                bulk_data = ''
        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)
        if not self.load_config.test_mode:
            self.save_summary(ids_to_load)

    def load_bulk_data(self, bulk_data):
        self.load_config.log(LOG_LEVEL_DEBUG, 'Bulk data size', len(bulk_data), 'loading...')
        response = None
        if not self.load_config.test_mode:
            response = self.data_loader_utils.load_bulk_data(bulk_data)
        if response:
            self.load_config.log(LOG_LEVEL_DEBUG, 'Done loading bulk data, saving response')
            if not self.load_config.test_mode:
                # Extract and save the failed docs
                self.process_bulk_update_response(response)
        else:
            self.load_config.log(LOG_LEVEL_ERROR, 'Bulk data load failed')

    def process_response_item(self, item, op):
        item_op = item[op]
        es_id = item_op['_id']
        _id = self.get_doc_id(es_id)
        try:
            doc = self.data_loader_batch[es_id]
        except Exception as e:
            doc = self.data_loader_batch[_id]
        if 'status' in item_op:
            if item_op['status'] == 200 or item_op['status'] == 201:
                # doc success
                if op == OP_INDEX:
                    self.indexed_ids[_id] = 0
                elif op == OP_UPDATE:
                    self.updated_ids[_id] = 0
            else:
                self.add_to_failed_docs(_id, doc, item)
        else:
            self.add_to_failed_docs(_id, doc, item)

    def process_bulk_update_response(self, response):
        load_summary = json.loads(response)
        items = load_summary['items']
        # print load_summary
        for item in items:
            if OP_INDEX in item:
                self.process_response_item(item, OP_INDEX)
            elif OP_UPDATE in item:
                self.process_response_item(item, OP_UPDATE)
        # save response to file
        self.load_config.log(LOG_LEVEL_TRACE, 'Updated ids:', len(self.updated_ids),
                             'Indexed ids:', len(self.indexed_ids),
                             'Failed ids:', len(self.failed_docs))
        bulk_update_response_file_name = file_utils.batch_file_name_with_prefix('summary')
        file_utils.save_text_file(self.bulk_update_response_directory,
                                  bulk_update_response_file_name + '.json', response)

    def save_summary(self, ids_to_load):
        data_loader_batch_name = file_utils.batch_file_name_with_prefix(DATA_LOADER_BATCH_PREFIX)
        # Find skipped ids
        for _id in ids_to_load:
            if _id not in self.updated_ids and _id not in self.indexed_ids and _id not in self.failed_docs:
                doc = self.data_loader_batch[_id]
                self.add_to_failed_docs(_id, doc, 'Skipped')
        # Save failed docs
        if len(self.failed_docs) > 0:
            file_utils.save_file(self.failed_docs_directory, data_loader_batch_name + '.json', self.failed_docs)
        # Save batch summary
        summary = {
            'indexed_ids': self.indexed_ids.keys(),
            'updated_ids': self.updated_ids.keys(),
        }
        file_utils.save_file(self.loaded_docs_directory, data_loader_batch_name + '.json', summary)
        # Print summary
        self.load_config.log(LOG_LEVEL_INFO,
                             '---------------------------------------------------------------------------------------------')
        self.load_config.log(LOG_LEVEL_INFO, self.load_config.server, self.load_config.server_username,
                             self.index, self.type,
                             ' Updated docs:', len(self.updated_ids) + len(self.indexed_ids),
                             ', Failed docs:', len(self.failed_docs))
        self.load_config.log(LOG_LEVEL_INFO,
                             '---------------------------------------------------------------------------------------------')

    def add_to_failed_docs(self, _id, doc, reason):
        data_for_id = {'reason': reason, 'doc': doc}
        self.failed_docs[_id] = data_for_id
def __init__(self):
    self.data_utils = DataUtils()
def get_editable_original_data_str(self) -> str:
    return DataUtils.parse_arr_data_to_comma_str_data(self.editable_original_data)
from bayes.origin.mulyinomial_native_bayes import MultinomialNB
from bayes.origin.gaussian_native_bayes import GaussianNB
from utils.data_utils import DataUtils

x, y = DataUtils.get_data_set('data/mushroom.txt', split=',')
print(x)
print(y)

nb = GaussianNB()
nb.fit(x, y)
nb.evaluate(x, y)
def get_answer_data_str(self) -> str:
    return DataUtils.parse_arr_data_to_comma_str_data(self.answer_data)
class Predict(object):
    def __init__(self):
        self.vocab_path = FLAGS.vocab_path
        self.checkpoint_path = FLAGS.checkpoint_path
        self.freeze_graph_path = FLAGS.freeze_graph_path
        self.saved_model_path = FLAGS.saved_model_path
        self.use_crf = FLAGS.use_crf
        self.num_steps = FLAGS.num_steps
        self.default_label = FLAGS.default_label
        self.default_score = FLAGS.default_predict_score
        self.data_utils = DataUtils()
        self.tensorflow_utils = TensorflowUtils()
        self.num_classes = self.data_utils.get_vocabulary_size(
            os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
        self.sequence_labeling_model = SequenceLabelingModel()
        self.init_predict_graph()

    def init_predict_graph(self):
        """
        Initialize the predict model graph
        :return:
        """
        # split 1-D String dense Tensor to words SparseTensor
        self.input_sentences = tf.placeholder(dtype=tf.string, shape=[None], name='input_sentences')
        sparse_words = tf.string_split(self.input_sentences, delimiter=' ')
        # slice SparseTensor
        valid_indices = tf.less(sparse_words.indices, tf.constant([self.num_steps], dtype=tf.int64))
        valid_indices = tf.reshape(tf.split(valid_indices, [1, 1], axis=1)[1], [-1])
        valid_sparse_words = tf.sparse_retain(sparse_words, valid_indices)
        excess_indices = tf.greater_equal(sparse_words.indices, tf.constant([self.num_steps], dtype=tf.int64))
        excess_indices = tf.reshape(tf.split(excess_indices, [1, 1], axis=1)[1], [-1])
        excess_sparse_words = tf.sparse_retain(sparse_words, excess_indices)
        # compute sentences lengths
        int_values = tf.ones(shape=tf.shape(valid_sparse_words.values), dtype=tf.int64)
        int_valid_sparse_words = tf.SparseTensor(indices=valid_sparse_words.indices,
                                                 values=int_values,
                                                 dense_shape=valid_sparse_words.dense_shape)
        input_sentences_lengths = tf.sparse_reduce_sum(int_valid_sparse_words, axis=1)
        # sparse to dense
        default_padding_word = self.data_utils._START_VOCAB[0]
        words = tf.sparse_to_dense(sparse_indices=valid_sparse_words.indices,
                                   output_shape=[valid_sparse_words.dense_shape[0], self.num_steps],
                                   sparse_values=valid_sparse_words.values,
                                   default_value=default_padding_word)
        # dict words to ids
        with open(os.path.join(self.vocab_path, 'words_vocab.txt'), encoding='utf-8', mode='rt') as data_file:
            words_table_list = [line.strip() for line in data_file if line.strip()]
        words_table_tensor = tf.constant(words_table_list, dtype=tf.string)
        words_table = lookup.index_table_from_tensor(mapping=words_table_tensor,
                                                     default_value=self.data_utils._START_VOCAB_ID[3])
        # words_table = lookup.index_table_from_file(os.path.join(vocab_path, 'words_vocab.txt'), default_value=3)
        words_ids = words_table.lookup(words)
        # blstm model predict
        with tf.variable_scope('model', reuse=None):
            logits = self.sequence_labeling_model.inference(words_ids, input_sentences_lengths,
                                                            self.num_classes, is_training=False)
            if self.use_crf:
                logits = tf.reshape(logits, shape=[-1, self.num_steps, self.num_classes])
                transition_params = tf.get_variable("transitions", [self.num_classes, self.num_classes])
                input_sentences_lengths = tf.to_int32(input_sentences_lengths)
                predict_labels_ids, sequence_scores = crf.crf_decode(logits, transition_params,
                                                                     input_sentences_lengths)
                predict_labels_ids = tf.to_int64(predict_labels_ids)
                sequence_scores = tf.reshape(sequence_scores, shape=[-1, 1])
                normalized_sequence_scores = self.tensorflow_utils.score_normalize(sequence_scores)
                predict_scores = tf.matmul(normalized_sequence_scores,
                                           tf.ones(shape=[1, self.num_steps], dtype=tf.float32))
            else:
                props = tf.nn.softmax(logits)
                max_prop_values, max_prop_indices = tf.nn.top_k(props, k=1)
                predict_labels_ids = tf.reshape(max_prop_indices, shape=[-1, self.num_steps])
                predict_labels_ids = tf.to_int64(predict_labels_ids)
                predict_scores = tf.reshape(max_prop_values, shape=[-1, self.num_steps])
        predict_scores = tf.as_string(predict_scores, precision=3)
        # dict ids to labels
        with open(os.path.join(self.vocab_path, 'labels_vocab.txt'), encoding='utf-8', mode='rt') as data_file:
            labels_table_list = [line.strip() for line in data_file if line.strip()]
        labels_table_tensor = tf.constant(labels_table_list, dtype=tf.string)
        labels_table = lookup.index_to_string_table_from_tensor(mapping=labels_table_tensor,
                                                                default_value=self.default_label)
        # labels_table = lookup.index_to_string_table_from_file(os.path.join(vocab_path, 'labels_vocab.txt'), default_value='O')
        predict_labels = labels_table.lookup(predict_labels_ids)
        sparse_predict_labels = self.tensorflow_utils.sparse_concat(
            predict_labels, valid_sparse_words, excess_sparse_words, self.default_label)
        sparse_predict_scores = self.tensorflow_utils.sparse_concat(
            predict_scores, valid_sparse_words, excess_sparse_words, '0.0')
        self.format_predict_labels = self.tensorflow_utils.sparse_string_join(
            sparse_predict_labels, 'predict_labels')
        self.format_predict_scores = self.tensorflow_utils.sparse_string_join(
            sparse_predict_scores, 'predict_scores')
        saver = tf.train.Saver()
        tables_init_op = tf.tables_initializer()
        self.sess = tf.Session()
        self.sess.run(tables_init_op)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            print('read model from {}'.format(ckpt.model_checkpoint_path))
            saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found at %s' % self.checkpoint_path)
            return

    def predict(self, words_list):
        """
        Predict labels; words are converted to ids inside the TensorFlow graph.
        Input is a list of space-separated word strings.
        :param words_list:
        :return:
        """
        split_words_list = []
        map_split_indexes = []
        for index in range(len(words_list)):
            temp_words_list = self.data_utils.split_long_sentence(words_list[index], self.num_steps)
            map_split_indexes.append(list(range(len(split_words_list),
                                                len(split_words_list) + len(temp_words_list))))
            split_words_list.extend(temp_words_list)
        predict_labels, predict_scores = self.sess.run(
            [self.format_predict_labels, self.format_predict_scores],
            feed_dict={self.input_sentences: split_words_list})
        predict_labels_str = [predict_label.decode('utf-8') for predict_label in predict_labels]
        predict_scores_str = [predict_score.decode('utf-8') for predict_score in predict_scores]
        merge_predict_labels_str = []
        merge_predict_scores_str = []
        for indexes in map_split_indexes:
            merge_predict_label_str = ' '.join([predict_labels_str[index] for index in indexes])
            merge_predict_labels_str.append(merge_predict_label_str)
            merge_predict_score_str = ' '.join([predict_scores_str[index] for index in indexes])
            merge_predict_scores_str.append(merge_predict_score_str)
        return merge_predict_labels_str, merge_predict_scores_str

    def file_predict(self, data_filename, predict_filename):
        """
        Predict data_filename and save the predict result into predict_filename.
        Labels are attached per single word: -B -M -E -S
        :param data_filename:
        :param predict_filename:
        :return:
        """
        print('Predict file ' + data_filename)
        sentence_list = []
        words_list = []
        labels_list = []
        predict_labels_list = []
        with open(data_filename, encoding='utf-8', mode='rt') as data_file:
            for line in data_file:
                words, labels = self.data_utils.split(line)
                if words and labels:
                    sentence_list.append(''.join(words))
                    words_list.append(' '.join(words))
                    labels_list.append(' '.join(labels))
                    predict_labels, _ = self.predict([' '.join(words)])
                    predict_labels_list.append(predict_labels[0])
        word_predict_label_list = []
        word_category_list = []
        word_predict_category_list = []
        for (words, labels, predict_labels) in zip(words_list, labels_list, predict_labels_list):
            word_list = words.split()
            label_list = labels.split()
            predict_label_list = predict_labels.split()
            word_predict_label = ' '.join([word + '/' + predict_label for (word, predict_label) in
                                           zip(word_list, predict_label_list)])
            word_predict_label_list.append(word_predict_label)
            # merge label
            merge_word_list, merge_label_list = self.data_utils.merge_label(word_list, label_list)
            word_category = ' '.join([word + '/' + label for (word, label) in
                                      zip(merge_word_list, merge_label_list)
                                      if label != self.default_label])
            word_category_list.append(word_category)
            # merge predict label
            merge_predict_word_list, merge_predict_label_list = self.data_utils.merge_label(
                word_list, predict_label_list)
            word_predict_category = ' '.join([predict_word + '/' + predict_label
                                              for (predict_word, predict_label) in
                                              zip(merge_predict_word_list, merge_predict_label_list)
                                              if predict_label != 'O'])
            word_predict_category_list.append(word_predict_category)
        with open(predict_filename, encoding='utf-8', mode='wt') as predict_file:
            for (sentence, word_predict_label, word_category, word_predict_category) in \
                    zip(sentence_list, word_predict_label_list, word_category_list, word_predict_category_list):
                predict_file.write('Passage: ' + sentence + '\n')
                predict_file.write('SinglePredict: ' + word_predict_label + '\n')
                predict_file.write('Merge: ' + word_category + '\n')
                predict_file.write('MergePredict: ' + word_predict_category + '\n\n')

    def freeze_graph(self):
        """
        Save the graph into a .pb file
        :return:
        """
        graph = tf.graph_util.convert_variables_to_constants(
            self.sess, self.sess.graph_def,
            ['init_all_tables', 'predict_labels', 'predict_scores'])
        tf.train.write_graph(graph, self.freeze_graph_path, 'frozen_graph.pb', as_text=False)
        print('Successfully freeze model to %s' % self.freeze_graph_path)

    def saved_model_pb(self):
        """
        Save the model as a .pb file plus variables, so it can be loaded by TensorFlow Serving
        :return:
        """
        saved_model_path = os.path.join(self.saved_model_path, '1')
        if os.path.exists(saved_model_path):
            shutil.rmtree(saved_model_path)
        builder = tf.saved_model.builder.SavedModelBuilder(saved_model_path)
        input_tensor_info = tf.saved_model.utils.build_tensor_info(self.input_sentences)
        output_labels_tensor_info = tf.saved_model.utils.build_tensor_info(self.format_predict_labels)
        output_scores_tensor_info = tf.saved_model.utils.build_tensor_info(self.format_predict_scores)
        prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'input_sentences': input_tensor_info},
            outputs={
                'predict_labels': output_labels_tensor_info,
                'predict_scores': output_scores_tensor_info
            },
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            self.sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_segment': prediction_signature},
            legacy_init_op=legacy_init_op)
        builder.save()
        print('Successfully exported model to %s' % saved_model_path)
def __init__(self, writer: SummaryWriter, model, data_utils: DataUtils, device,
             run_folder_path, do_validation, lr, optimizer, mse, ssim,
             save_checkpoints=False, do_persistence=False, nr_of_input_steps=3,
             **model_params):
    self.data_utils = data_utils
    self.estimate_total_nr_train_data_points = self.data_utils.get_estimate_total_nr_train_data_points()
    self.estimate_total_nr_test_data_points = self.data_utils.get_estimate_total_nr_test_data_points()
    self.validation_train_ratio = data_utils.get_validation_train_ratio()
    self.net = self.net.to(device)
    logging.info('Model params - {}, number of parameters in net: {}, '.format(
        ['{}: {}'.format(x, y) for x, y in model_params.items() if 'state_dict' not in x],
        self.count_parameters()))
    logging.info('Run params - device: {}, run folder: {}, validation: {},'
                 ' learning rate: {}, save checkpoints: {}, do persistence: {}'.format(
                     device, run_folder_path, do_validation, lr, save_checkpoints, do_persistence))
    logging.info(self.net)
    self.nr_of_input_steps = nr_of_input_steps
    self.do_persistence = do_persistence
    self.save_checkpoints = save_checkpoints
    self.lr = lr
    self.do_validation = do_validation
    self.model_params = model_params
    self.run_folder_path = run_folder_path
    self.device = device
    self.model = model
    self.writer = writer
    self.optimizer = optimizer(self.net.parameters(), lr=self.lr)
    self.ssim = ssim()
    self.mse = mse()
    self.train_mse_loss_array = []
    self.train_ssim_loss_array = []
    self.test_mse_loss_array = []
    self.test_ssim_loss_array = []
    self.running_train_mse_loss = 0.0
    self.running_test_mse_loss = 0.0
    self.hidden_state = None
    # writer.add_graph(net, data_utils.get_next_train_data_point().to(device))
    # writer.close()
    if self.do_validation:
        self.running_validation_loss = 0.0
        self.validation_loss_array = []
        self.when_validate = data_utils.get_validation_train_ratio()
class TensorflowUtils(object):
    def __init__(self):
        self.batch_size = FLAGS.batch_size
        self.num_steps = FLAGS.num_steps
        self.min_after_dequeue = FLAGS.min_after_dequeue
        self.num_threads = FLAGS.num_threads
        self.embedding_size = FLAGS.embedding_size
        self.data_utils = DataUtils()
        self.default_word_padding_id = self.data_utils._START_VOCAB_ID[0]
        self.default_label_padding_id = self.data_utils.load_default_label_id()

    def create_record(self, words_list, labels_list, tfrecords_filename):
        """
        Store data into a tfrecords file
        :param words_list:
        :param labels_list:
        :param tfrecords_filename:
        :return:
        """
        print('Create record to ' + tfrecords_filename)
        writer = tf.python_io.TFRecordWriter(tfrecords_filename)
        assert len(words_list) == len(labels_list)
        for (word_ids, label_ids) in zip(words_list, labels_list):
            word_list = [int(word) for word in word_ids.strip().split()]
            label_list = [int(label) for label in label_ids.strip().split()]
            assert len(word_list) == len(label_list)
            example = tf.train.Example(features=tf.train.Features(feature={
                'words': tf.train.Feature(int64_list=tf.train.Int64List(value=word_list)),
                'labels': tf.train.Feature(int64_list=tf.train.Int64List(value=label_list)),
            }))
            writer.write(example.SerializeToString())
        writer.close()

    def read_and_decode(self, tfrecords_filename):
        """
        Shuffled read of batch data from a tfrecords file
        :param tfrecords_filename:
        :return:
        """
        print('Read record from ' + tfrecords_filename)
        filename_queue = tf.train.string_input_producer([tfrecords_filename], num_epochs=None)
        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue)
        feature_configs = {
            # 'words': tf.FixedLenFeature(shape=[num_steps], dtype=tf.int64, default_value=0),
            'words': tf.VarLenFeature(dtype=tf.int64),
            'labels': tf.VarLenFeature(dtype=tf.int64),
        }
        features = tf.parse_single_example(serialized_example, features=feature_configs)
        words = features['words']
        words_len = words.dense_shape[0]
        words_len = tf.minimum(words_len, tf.constant(self.num_steps, tf.int64))
        words = tf.sparse_to_dense(sparse_indices=words.indices[:self.num_steps],
                                   output_shape=[self.num_steps],
                                   sparse_values=words.values[:self.num_steps],
                                   default_value=self.default_word_padding_id)
        labels = features['labels']
        labels = tf.sparse_to_dense(sparse_indices=labels.indices[:self.num_steps],
                                    output_shape=[self.num_steps],
                                    sparse_values=labels.values[:self.num_steps],
                                    default_value=self.default_label_padding_id)
        capacity = self.min_after_dequeue + 3 * self.batch_size
        words_batch, labels_batch, words_len_batch = tf.train.shuffle_batch(
            [words, labels, words_len],
            batch_size=self.batch_size,
            capacity=capacity,
            min_after_dequeue=self.min_after_dequeue,
            num_threads=self.num_threads)
        return words_batch, labels_batch, words_len_batch

    def print_all(self, tfrecords_filename):
        """
        Print all data from a tfrecords file
        :param tfrecords_filename:
        :return:
        """
        number = 1
        for serialized_example in tf.python_io.tf_record_iterator(tfrecords_filename):
            example = tf.train.Example()
            example.ParseFromString(serialized_example)
            words = example.features.feature['words'].int64_list.value
            labels = example.features.feature['labels'].int64_list.value
            word_list = [word for word in words]
            label_list = [label for label in labels]
            print('Number:{}, labels: {}, features: {}'.format(number, label_list, word_list))
            number += 1

    def print_shuffle(self, tfrecords_filename):
        """
        Print shuffled data from a tfrecords file by calling the read_and_decode method
        :param tfrecords_filename:
        :return:
        """
        words_batch, labels_batch, words_len_batch = self.read_and_decode(tfrecords_filename)
        with tf.Session() as sess:
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                while not coord.should_stop():
                    batch_words_r, batch_labels_r, batch_words_len_r = sess.run(
                        [words_batch, labels_batch, words_len_batch])
                    print('batch_words_r : ', batch_words_r.shape)
                    print(batch_words_r)
                    print('batch_labels_r : ', batch_labels_r.shape)
                    print(batch_labels_r)
                    print('batch_words_len_r : ', batch_words_len_r.shape)
                    print(batch_words_len_r)
            except tf.errors.OutOfRangeError:
                print('Done reading')
            finally:
                coord.request_stop()
            coord.join(threads)

    def load_embedding(self, embedding_filename, vocab_filename):
        """
        Load word embeddings pretrained by Word2Vec
        :param embedding_filename:
        :param vocab_filename:
        :return:
        """
        embedding_dict = dict()
        with open(embedding_filename, encoding='utf-8', mode='rt') as data_file:
            for line in data_file:
                words = line.strip().split()
                if len(words) != self.embedding_size + 1:
                    raise Exception('Invalid embedding exist : %s' % (line.strip()))
                word = words[0]
                embedding = [float(num) for num in words[1:]]
                embedding_dict[word] = embedding
        words_vocab = self.data_utils.initialize_single_vocabulary(vocab_filename)
        embedding = [[0.0 for _ in range(self.embedding_size)] for _ in range(len(words_vocab))]
        for word, word_ids in words_vocab.items():
            if word in embedding_dict:
                embedding[word_ids] = embedding_dict[word]
        embedding_tensor = tf.constant(embedding, dtype=tf.float32, name='embedding')
        return embedding_tensor

    def sparse_concat(self, sparse_tensor_input, base_tensor, excess_tensor, default_value):
        """
        Extend sparse_tensor_input using base_tensor and excess_tensor
        :param sparse_tensor_input:
        :param base_tensor:
        :param excess_tensor:
        :param default_value:
        :return:
        """
        # extract real blstm predict in dense and save to sparse
        base_sparse_tensor = tf.SparseTensor(
            indices=base_tensor.indices,
            values=tf.gather_nd(sparse_tensor_input, base_tensor.indices),
            dense_shape=base_tensor.dense_shape)
        # create excess SparseTensor with default_value
        excess_sparse_tensor = tf.SparseTensor(
            indices=excess_tensor.indices,
            values=tf.fill(tf.shape(excess_tensor.values), default_value),
            dense_shape=excess_tensor.dense_shape)
        # concat SparseTensor
        concat_sparse_tensor = tf.SparseTensor(
            indices=tf.concat(axis=0, values=[base_sparse_tensor.indices, excess_sparse_tensor.indices]),
            values=tf.concat(axis=0, values=[base_sparse_tensor.values, excess_sparse_tensor.values]),
            dense_shape=excess_sparse_tensor.dense_shape)
        concat_sparse_tensor = tf.sparse_reorder(concat_sparse_tensor)
        return concat_sparse_tensor

    def sparse_string_join(self, sparse_tensor_input, name):
        """
        Join a SparseTensor into a 1-D String dense Tensor
        :param sparse_tensor_input:
        :param name:
        :return:
        """
        dense_tensor_input = tf.sparse_to_dense(
            sparse_indices=sparse_tensor_input.indices,
            output_shape=sparse_tensor_input.dense_shape,
            sparse_values=sparse_tensor_input.values,
            default_value='')
        dense_tensor_input_join = tf.reduce_join(dense_tensor_input, axis=1, separator=' ')
        format_predict_labels = tf.string_strip(dense_tensor_input_join, name=name)
        return format_predict_labels

    def score_normalize(self, scores):
        """
        Normalize crf score
        :param scores: shape [-1, 1]
        :return:
        """
        lambda_factor = tf.constant(0.05, dtype=tf.float32)
        normalized_scores = tf.reciprocal(
            tf.add(tf.constant(1.0, dtype=tf.float32),
                   tf.exp(tf.negative(tf.multiply(lambda_factor, scores)))))
        return normalized_scores
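# Reference sketch (not part of the original source): score_normalize above is a
# logistic squashing 1 / (1 + exp(-lambda * score)) with lambda = 0.05, mapping raw
# CRF sequence scores into (0, 1). A NumPy equivalent for quick sanity checks:
import numpy as np

def score_normalize_np(scores, lambda_factor=0.05):
    scores = np.asarray(scores, dtype=np.float32)
    return 1.0 / (1.0 + np.exp(-lambda_factor * scores))

# e.g. score_normalize_np([0.0, 20.0, 60.0]) is approximately [0.5, 0.73, 0.95]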