Example #1
class DataAnalysis(object):
    def __init__(self):
        self.data_utils = DataUtils()

    def word_analysis(self, data_filename):
        """
        Count word frequency
        :param data_filename:
        :return:
        """
        words_count = 0
        words_count_map = dict()
        with open(data_filename, encoding='utf-8', mode='rt') as data_file:
            for line in data_file:
                words, _ = self.data_utils.split(line)
                words_count += len(words)
                for word in words:
                    if word in words_count_map:
                        words_count_map[word] += 1
                    else:
                        words_count_map[word] = 1
        words_type_count = len(words_count_map)
        return words_count, words_type_count

    def length_analysis(self, data_filename):
        """
        Count sentence length
        :param data_filename:
        :return:
        """
        sentences_count = 0
        max_length = 0
        length_count_map = dict()
        with open(data_filename, encoding='utf-8', mode='rt') as data_file:
            for line in data_file:
                words, _ = self.data_utils.split(line)
                length = len(''.join(words))
                # length = len(''.join(line.strip().split()))
                if length in length_count_map:
                    length_count_map[length] += 1
                else:
                    length_count_map[length] = 1
                if length > max_length:
                    max_length = length
                sentences_count += 1

        if sentences_count == 0:
            return dict()

        statistic_result = dict()
        accumulative_count = 0
        for i in range(max_length + 1):
            if i in length_count_map:
                accumulative_count += length_count_map[i]
            if i != 0 and (i % 50 == 0 or i == max_length):
                statistic_result[i] = '%.2f' % (accumulative_count /
                                                sentences_count * 100)
        return statistic_result
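
A minimal usage sketch, assuming a DataUtils whose split(line) returns a (words, labels) pair as the methods above expect, and a UTF-8 corpus with one sentence per line (the file name is illustrative only):

analysis = DataAnalysis()

# Total tokens and distinct tokens in the corpus.
total_words, distinct_words = analysis.word_analysis('corpus.txt')
print('tokens: %d, vocabulary size: %d' % (total_words, distinct_words))

# Cumulative percentage of sentences at or below each 50-character length bucket.
for length, percentage in sorted(analysis.length_analysis('corpus.txt').items()):
    print('<= %d chars: %s%%' % (length, percentage))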
Example #2
    def __init__(self):
        self.batch_size = FLAGS.batch_size
        self.num_steps = FLAGS.num_steps
        self.min_after_dequeue = FLAGS.min_after_dequeue
        self.num_threads = FLAGS.num_threads
        self.embedding_size = FLAGS.embedding_size

        self.data_utils = DataUtils()
        self.default_word_padding_id = self.data_utils._START_VOCAB_ID[0]
        self.default_label_padding_id = self.data_utils.load_default_label_id()
Example #3
    def create(self, splits=Config.DS_SPLIT, data_dir=Config.DATA_DIR):
        # ds sizes:
        # test      4445
        # train    44199
        # val       4444
        self.data_dir = data_dir
        # Saves the categories of the labels
        self.categories = dict()
        # Does the data directory exist?
        if not self.data_dir.exists():
            sys.exit(
                'No dataset for training found at the given data directory')
        Config.STYLES_USE_COLS.append('split')
        df = DataUtils.load_data_frame('adjusted_styles.csv')
        set_names = ['train', 'val', 'test']
        ids_by_split = [list(df[df['split'] == x].index) for x in set_names]
        df = df.drop('split', axis=1)
        for col in list(df.columns):
            df[col] = pd.Categorical(df[col])
            self.categories[col] = df[col].cat.categories
            df[col] = df[col].cat.codes

        self.df = df

        for set_name, indices in list(zip(set_names, ids_by_split)):
            ds = tf.data.Dataset.from_tensor_slices(tf.constant(indices))
            ds = ds.map(self._process_id, num_parallel_calls=AUTOTUNE)
            ds = self._prepare_for_training(set_name, ds)
            setattr(self, set_name, ds)
        return self
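
For reference, a small standalone sketch of the categorical-encoding step used above, saving cat.categories so the integer codes can later be mapped back to label names; the column names here are hypothetical:

import pandas as pd

# Standalone illustration of the label-encoding loop above; column names are made up.
df = pd.DataFrame({'baseColour': ['Blue', 'Red', 'Blue'], 'gender': ['Men', 'Women', 'Men']})
categories = {}
for col in df.columns:
    df[col] = pd.Categorical(df[col])
    categories[col] = df[col].cat.categories   # keep the integer-code -> label mapping
    df[col] = df[col].cat.codes                # integer codes become the model targets
print(df)                                      # all columns are now small integers
print(categories['baseColour'][0])             # decode code 0 back to its label ('Blue')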
Example #4
 def write_calculate_result_data_to_excel(excel_file_path: str,
                                           question_data_list: list):
     print('Start writing calculation results to Excel')
     workbook = openpyxl.load_workbook(excel_file_path)
     row_index = 0
     question_mapping = {}
     for question in question_data_list:
         question_mapping[question.question_key] = question
     for row in workbook.worksheets[0].rows:
         if row[0].value is not None and row_index > 0:
             if row[0].value.strip() != '':
                 question_key = row[3].value
                 if question_key in question_mapping:
                     xdata = question_mapping[
                         question_key].get_answer_data_str()
                     xdata2 = []
                     if ',' not in xdata:
                         for x in xdata:
                             xdata2.append(x)
                         workbook.worksheets[0].cell(
                             row_index + 1, 10,
                             DataUtils.parse_arr_data_to_comma_str_data(
                                 xdata2))
                     else:
                         workbook.worksheets[0].cell(
                             row_index + 1, 10,
                             question_mapping[question_key].
                             get_answer_data_str())
                     workbook.worksheets[0].cell(
                         row_index + 1, 11, question_mapping[question_key].
                         get_editable_original_data_str())
         row_index = row_index + 1
     workbook.save(excel_file_path)
     print('Finished writing calculation results to Excel')
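
A hedged usage sketch: the question_data_list items only need the members the function actually reads, so a minimal stand-in object (purely hypothetical, not the project's real question class) is enough to exercise it against an existing workbook:

# Hypothetical stand-in exposing only the attributes/methods the writer function reads.
class FakeQuestion:
    def __init__(self, key, answer, original):
        self.question_key = key
        self._answer = answer
        self._original = original

    def get_answer_data_str(self):
        return self._answer

    def get_editable_original_data_str(self):
        return self._original


questions = [FakeQuestion('Q1', '1,2,3', 'raw answer text')]
write_calculate_result_data_to_excel('results.xlsx', questions)  # path is illustrative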
Example #5
    def __init__(self):
        self.vocab_path = FLAGS.vocab_path
        self.checkpoint_path = FLAGS.checkpoint_path
        self.freeze_graph_path = FLAGS.freeze_graph_path
        self.saved_model_path = FLAGS.saved_model_path

        self.use_crf = FLAGS.use_crf
        self.num_steps = FLAGS.num_steps

        self.default_label = FLAGS.default_label
        self.default_score = FLAGS.default_predict_score

        self.data_utils = DataUtils()
        self.tensorflow_utils = TensorflowUtils()
        self.num_classes = self.data_utils.get_vocabulary_size(os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
        self.sequence_labeling_model = SequenceLabelingModel()
        self.init_predict_graph()
Example #6
    def __init__(self):
        self.tfrecords_path = FLAGS.tfrecords_path
        self.checkpoint_path = FLAGS.checkpoint_path
        self.tensorboard_path = FLAGS.tensorboard_path

        self.use_crf = FLAGS.use_crf
        self.learning_rate = FLAGS.learning_rate
        self.learning_rate_decay_factor = FLAGS.learning_rate_decay_factor
        self.decay_steps = FLAGS.decay_steps
        self.clip_norm = FLAGS.clip_norm
        self.max_training_step = FLAGS.max_training_step

        self.train_tfrecords_filename = os.path.join(self.tfrecords_path,
                                                     'train.tfrecords')
        self.test_tfrecords_filename = os.path.join(self.tfrecords_path,
                                                    'test.tfrecords')

        self.data_utils = DataUtils()
        self.num_classes = self.data_utils.get_vocabulary_size(
            os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
        self.tensorflow_utils = TensorflowUtils()
        self.sequence_labeling_model = SequenceLabelingModel()
Example #7
    def _process_id(self, id):
        label = tf.py_function(func=self._get_label, inp=[id], Tout=tf.float32)
        file_path = tf.strings.join([
            str(Config.DATA_DIR), '/images/',
            tf.strings.as_string(id), '.jpg'
        ])
        # Load the raw data from the file as a string
        img = tf.io.read_file(file_path)
        img = DataUtils.decode_img(img)

        # Set the shape manually because the tensor is returned by a py_function
        label.set_shape([39])
        return img, label
Example #8
    def __init__(self, args):
        self.args = args

        # Loading data
        self.data = pd.read_hdf(self.args.hdf_file + '.hdf')
        self.data.drop_duplicates(subset=['url'], inplace=True)

        self.max_features = 200
        self.batch_size = 32

        # Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`;
        # (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
        tqdm.pandas(desc="my bar")
        # cut texts after this number of words (among top max_features most common words)
        self.maxlen = max(self.data['url'].progress_map(lambda x: len(x))) + 1

        # Fix the seed
        self.seed = 21
        np.random.seed(self.seed)

        # Splitting data into train, test & validation sets
        self.x_train, self.x_val_test, self.y_train, self.y_val_test = train_test_split(
            self.data['url'].values,
            self.data['label'],
            test_size=.33,
            random_state=self.seed,
            stratify=self.data['label'])

        self.x_val, self.x_test, self.y_val, self.y_test = train_test_split(
            self.x_val_test,
            self.y_val_test,
            test_size=.5,
            random_state=self.seed,
            stratify=self.y_val_test)

        print('\n*************** Data statistics ****************')
        datautils = DataUtils(self.args)
        datautils.data_Stats(self.y_train, self.y_val, self.y_test)
Example #9
def train(args):
    """
    Training using pipeline module
    :param args:
    :return:
    """

    # Preparing dataset
    data = DataUtils(args)
    # Generating & saving dataframe from raw event folder
    if args.hdf_file:
        print('Raw data files will not be processed.\n')
        print('{} file present.'.format(args.hdf_file))
    elif args.raw_data_dir:
        print('Processing raw data files.\n')
        print('An HDF file will be generated for further exploration.')
        data.load_txt_files()
    data = data.prepare_data()

    # Creating pipelines
    model_pipe = Models(args)
    model_pipelined, model_name = model_pipe.model_pipeline()
    print('Pipeline created for model {}'.format(model_name))

    # Running a model pipeline
    model = ModelRunPipeline(args, model_pipelined, data)

    # Multiprocessing to spawn processes using an API similar to threading module
    proc = Process(target=model.run_pipeline, args=())

    proc.start()
    proc.join()

    print(
        '\n\n****************** Classification done. Enjoy Life. :) *******************'
    )
Example #10
def load_data(year: str, stat_type: str):
    url = DATA_SOURCE_URL + str(year).split("-")[1] + stat_type + ".html"
    html = pd.read_html(url, header=0)
    data_frame = html[0]
    raw = data_frame.drop(data_frame[
        data_frame.Age == "Age"].index)  # Deletes repeating headers in content
    raw = raw.fillna(0)
    playerstats = raw.drop(["Rk"], axis=1)
    aggregation_functions = {}
    columns = DataUtils.get_columns(playerstats)
    DataUtils.convert_columns_to_numeric(
        playerstats, DataUtils.get_numeric_columns(columns))
    aggregation_functions = DataUtils.get_aggregate_functions(columns)
    playerstats = playerstats.groupby(
        playerstats["Player"]).aggregate(aggregation_functions)
    playerstats["FanPoints"] = DataUtils.calculate_fan_points(playerstats)
    playerstats.sort_values(["FanPoints"], ascending=False, inplace=True)
    playerstats.insert(0, "rank", np.arange(start=1,
                                            stop=len(playerstats) + 1))
    return playerstats
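
For context, groupby(...).aggregate(...) consumes a column-to-function mapping; the real mapping comes from DataUtils.get_aggregate_functions, but a hypothetical two-column version behaves like this:

import pandas as pd

stats = pd.DataFrame({
    'Player': ['A. Player', 'A. Player', 'B. Player'],  # traded players appear on two rows
    'Tm': ['BOS', 'LAL', 'MIA'],
    'PTS': [10.0, 12.0, 20.0],
})
# Hypothetical aggregation map: keep the last team seen, average the numeric stat.
aggregation_functions = {'Tm': 'last', 'PTS': 'mean'}
merged = stats.groupby(stats['Player']).aggregate(aggregation_functions)
print(merged)  # one row per player, duplicate rows collapsed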
Example #11
    def run(self):
        self.data_loader_utils = DataLoaderUtils(self.load_config.server,
                                                 self.index,
                                                 self.type,
                                                 self.load_config.server_username,
                                                 self.load_config.server_password)

        self.data_utils = DataUtils()

        count = 0
        bulk_data = ''

        ids_to_fetch = self.data_loader_batch.keys()
        self.load_config.log(LOG_LEVEL_TRACE, 'Fetching docs', self.load_config.server, self.index, self.type)

        self.data_utils.batch_fetch_docs_for_ids(self.load_config.server,
                                                 ids_to_fetch,
                                                 self.index,
                                                 self.type,
                                                 self.docs_fetched,
                                                 self.load_config.doc_fetch_batch_size,
                                                 self.load_config.server_username,
                                                 self.load_config.server_password)

        for _id in self.existing_docs:
            relations = self.data_loader_batch[_id]
            existing_doc = self.existing_docs[_id]

            doc = {}
            updates = []
            
            # Update relations
            for relation in relations:
                dest_index_id = relation['index_id']
                dest_ids = relation['ids']
                relationship_type = relation['type']
                ids_to_remove = []
                if 'ids_to_remove' in relation:
                    ids_to_remove = relation['ids_to_remove']

                self.load_config.log(LOG_LEVEL_TRACE, self.index, relationship_type,  dest_index_id, len(dest_ids))

                existing_doc = self.load_config.data_mapper.update_relations_for_doc(_id,
                                                                                    existing_doc,
                                                                                    dest_ids,
                                                                                    self.source,
                                                                                    dest_index_id,
                                                                                    relation_type=relationship_type,
                                                                                    append=self.append,
                                                                                    ids_to_remove=ids_to_remove)
                doc[relationship_type] = existing_doc[relationship_type]

                updates.append({
                    'index_id': dest_index_id,
                    'source': self.source,
                    'added_ids': dest_ids,
                    'removed_ids': ids_to_remove,
                    'relation_type': relationship_type
                })

            # Relations updates
            relations_updates = []
            if 'relations_updates' in existing_doc:
                relations_updates = existing_doc['relations_updates']

            update_item = {
                'update_source': self.data_source_name,
                'update_date': self.updated_date,
                'updates': updates
            }
            relations_updates.append(update_item)
            doc['relations_updates'] = relations_updates


            if self.load_config.test_mode and count % 2500 == 0:
                # print 'Existing doc id', _id
                self.load_config.log(LOG_LEVEL_INFO, 'Data', relations)
                self.load_config.log(LOG_LEVEL_INFO, 'Updated doc', doc)

            if len(doc) > 0:
                bulk_update_header = self.data_loader_utils.bulk_update_header(_id)
                self.load_config.log(LOG_LEVEL_TRACE, 'bulk update header:', bulk_update_header)
                self.load_config.log(LOG_LEVEL_TRACE, 'bulk data', doc)

                bulk_data += bulk_update_header
                bulk_data += '\n'
                doc = {
                    'doc': doc
                }
                bulk_data += json.dumps(doc)
                bulk_data += '\n'

            count += 1
            if count % 50 == 0:
                self.load_config.log(LOG_LEVEL_DEBUG, 'Processed docs', count, os.getpid(), self.index, _id)

            if len(bulk_data) >= self.load_config.bulk_data_size:
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

        # logger.log(1,  'Process completed, saving loaded ids.........................')

        if not self.load_config.test_mode:
            self.save_summary(ids_to_fetch)
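
The bulk_data string assembled above follows the Elasticsearch bulk NDJSON convention: an action header line, then a partial-document line, each terminated by a newline. A minimal sketch of one update entry (index, type, and id are placeholders, and the exact header produced by bulk_update_header may differ):

import json

header = json.dumps({'update': {'_index': 'my_index', '_type': 'my_type', '_id': 'DOC1'}})
body = json.dumps({'doc': {'relations_updates': []}})  # partial document, as built above
bulk_data = header + '\n' + body + '\n'
print(bulk_data)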
Example #12
 def __init__(self, hdfs_client, flags):
     self.train_is_alive = False
     self.hdfs_client = hdfs_client
     self.flags = flags
     self.data_utils = DataUtils()
Example #13
class Segmenter(object):
    def __init__(self, hdfs_client, flags):
        self.train_is_alive = False
        self.hdfs_client = hdfs_client
        self.flags = flags
        self.data_utils = DataUtils()

    def update_config(self):
        config_path = os.path.join(self.flags.raw_data_path, 'config.json')
        try:
            with open(config_path, encoding='utf-8', mode='r') as data_file:
                config_json = json.load(data_file)
                # Use independent checks so that every key present in
                # config.json is applied, not just the first match.
                if 'use_lstm' in config_json:
                    self.flags.use_lstm = config_json['use_lstm']
                if 'use_dynamic_rnn' in config_json:
                    self.flags.use_dynamic_rnn = config_json['use_dynamic_rnn']
                if 'use_bidirectional_rnn' in config_json:
                    self.flags.use_bidirectional_rnn = config_json[
                        'use_bidirectional_rnn']
                if 'vocab_drop_limit' in config_json:
                    self.flags.vocab_drop_limit = config_json[
                        'vocab_drop_limit']
                if 'batch_size' in config_json:
                    self.flags.batch_size = config_json['batch_size']
                if 'num_steps' in config_json:
                    self.flags.num_steps = config_json['num_steps']
                if 'num_layer' in config_json:
                    self.flags.num_layer = config_json['num_layer']
                if 'embedding_size' in config_json:
                    self.flags.embedding_size = config_json['embedding_size']
                if 'learning_rate' in config_json:
                    self.flags.learning_rate = config_json['learning_rate']
                if 'learning_rate_decay_factor' in config_json:
                    self.flags.learning_rate_decay_factor = config_json[
                        'learning_rate_decay_factor']
                if 'keep_prob' in config_json:
                    self.flags.keep_prob = config_json['keep_prob']
                if 'clip_norm' in config_json:
                    self.flags.clip_norm = config_json['clip_norm']
        except Exception:
            raise Exception('ERROR: config.json content invalid')

    def train(self):
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'train.txt'),
            os.path.join(self.flags.datasets_path, 'train.txt'))
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'test.txt'))

        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'train.txt'),
            os.path.join(self.flags.datasets_path, 'label_train.txt'))
        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'label_test.txt'))

        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_train.txt'),
            os.path.join(self.flags.datasets_path, 'split_train.txt'))
        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_test.txt'),
            os.path.join(self.flags.datasets_path, 'split_test.txt'))

        words_vocab, labels_vocab = self.data_utils.create_vocabulary(
            os.path.join(self.flags.datasets_path, 'split_train.txt'),
            self.flags.vocab_path, self.flags.vocab_drop_limit)

        train_word_ids_list, train_label_ids_list = self.data_utils.file_to_word_ids(
            os.path.join(self.flags.datasets_path, 'split_train.txt'),
            words_vocab, labels_vocab)
        test_word_ids_list, test_label_ids_list = self.data_utils.file_to_word_ids(
            os.path.join(self.flags.datasets_path, 'split_test.txt'),
            words_vocab, labels_vocab)

        tensorflow_utils = TensorflowUtils()
        tensorflow_utils.create_record(
            train_word_ids_list, train_label_ids_list,
            os.path.join(self.flags.tfrecords_path, 'train.tfrecords'))
        tensorflow_utils.create_record(
            test_word_ids_list, test_label_ids_list,
            os.path.join(self.flags.tfrecords_path, 'test.tfrecords'))

        self.hdfs_client.hdfs_upload(
            self.flags.vocab_path,
            os.path.join(self.flags.output_path,
                         os.path.basename(self.flags.vocab_path)))

        train = Train()
        train.train()

    def upload_tensorboard(self):
        hdfs_tensorboard_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.tensorboard_path)))
        temp_hdfs_tensorboard_path = hdfs_tensorboard_path + '-temp'
        self.hdfs_client.hdfs_upload(self.flags.tensorboard_path,
                                     temp_hdfs_tensorboard_path)
        self.hdfs_client.hdfs_delete(hdfs_tensorboard_path)
        self.hdfs_client.hdfs_mv(temp_hdfs_tensorboard_path,
                                 hdfs_tensorboard_path)

    def log_monitor(self):
        while (self.train_is_alive):
            time.sleep(120)
            self.upload_tensorboard()

    def upload_model(self):
        predict = Predict()
        predict.saved_model_pb()

        hdfs_checkpoint_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.checkpoint_path)))
        hdfs_saved_model_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.saved_model_path)))

        temp_hdfs_checkpoint_path = hdfs_checkpoint_path + '-temp'
        temp_hdfs_saved_model_path = hdfs_saved_model_path + '-temp'

        self.hdfs_client.hdfs_upload(self.flags.checkpoint_path,
                                     temp_hdfs_checkpoint_path)
        self.hdfs_client.hdfs_upload(self.flags.saved_model_path,
                                     temp_hdfs_saved_model_path)

        self.hdfs_client.hdfs_delete(hdfs_checkpoint_path)
        self.hdfs_client.hdfs_delete(hdfs_saved_model_path)

        self.hdfs_client.hdfs_mv(temp_hdfs_checkpoint_path,
                                 hdfs_checkpoint_path)
        self.hdfs_client.hdfs_mv(temp_hdfs_saved_model_path,
                                 hdfs_saved_model_path)

    def evaluate(self):
        shutil.rmtree(self.flags.vocab_path)
        shutil.rmtree(self.flags.checkpoint_path)

        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path,
                         os.path.basename(self.flags.vocab_path)),
            self.flags.vocab_path)
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'test.txt'))
        hdfs_checkpoint_path = os.path.join(
            self.flags.input_path,
            os.path.basename(self.flags.checkpoint_path))
        self.hdfs_client.hdfs_download(hdfs_checkpoint_path,
                                       self.flags.checkpoint_path)

        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'label_test.txt'))
        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_test.txt'),
            os.path.join(self.flags.datasets_path, 'split_test.txt'))

        predict = Predict()
        predict.file_predict(
            os.path.join(self.flags.datasets_path, 'split_test.txt'),
            os.path.join(self.flags.datasets_path, 'test_predict.txt'))

        self.model_evaluate = Evaluate()
        self.model_evaluate.evaluate(
            os.path.join(self.flags.datasets_path, 'test_predict.txt'),
            os.path.join(self.flags.datasets_path, 'test_evaluate.txt'))

        self.hdfs_client.hdfs_delete(
            os.path.join(self.flags.output_path, 'test_evaluate.txt'))
        self.hdfs_client.hdfs_upload(
            os.path.join(self.flags.datasets_path, 'test_evaluate.txt'),
            os.path.join(self.flags.input_path, 'test_evaluate.txt'))
Example #14
class DataHelpers(object):
    def __init__(self):
        # Class Object Initialization.
        self.conf = Configuration()
        self.genutil = GeneralUtils()
        self.du = DataUtils()

    def load_word_embeddings_compact(self,
                                     embedding_dim,
                                     vocab_set,
                                     masking=False,
                                     use_pickled=True):

        vocab_list = list(vocab_set)

        if masking:
            masking_value = "masked"  # Selects the "masked" embedding weights file.
            start_index = 1  # Leaves the 0-index free of any data (reserved for masking).
        else:
            masking_value = "non_masked"  # Selects the "non_masked" embedding weights file.
            start_index = 0  # Stores the embedding weights from the zeroth index itself.

        # Dataset sources file paths
        embedding_weights_file_path = self.conf.embedding_weights_file_tpl.format(
            masking_value)

        if not use_pickled:
            print("Loading Word Embeddings into memory ... ")
            word_vector_dict = {}
            j = 0
            with open(self.conf.word_vectors_file, "r") as fopen:
                for line in fopen:
                    j += 1
                    try:
                        components = line.strip().split()
                        if not len(components) < embedding_dim:
                            if j % 1000000 == 0:
                                print("Parsing word vector file ... {}".format(
                                    j))
                            word = components[0]
                            if word in vocab_set:
                                vec = np.asarray([
                                    float(x)
                                    for x in components[1:embedding_dim + 1]
                                ])
                                word_vector_dict[word] = vec

                    except Exception as e:
                        print("Exception Encountered: ".format(e))
            print("Word Embeddings added to word_vector_dict.")
            # Adding the word vectors from the input datasets which are not in the word vector file.
            # Word Vectors are drawn at random from a uniform distribution(-0.25, 0.25)
            # adding 1 to account for 0th index (for masking) [Number of word:vector pairs is 7115783]
            n_symbols = len(vocab_list)
            embedding_weights = np.zeros((n_symbols + 1, embedding_dim))

            for i, word_k in enumerate(vocab_list, start=start_index):
                if word_k in word_vector_dict:
                    embedding_weights[i, :] = word_vector_dict[word_k]
                else:
                    embedding_weights[i, :] = np.random.uniform(
                        -0.25, 0.25, embedding_dim)

            print(
                "Added Random Vectors for the unseen words in the corpus. Current value of i: {}"
                .format(i))
            if self.conf.create_data_dump:
                print(
                    "Dumping embedding weights and index_dict to disk as pickled files ...."
                )
                joblib.dump(embedding_weights, embedding_weights_file_path)
                print(
                    'Finished: Dumping index_dict and embedding_weights to disk.'
                )
            return embedding_weights
        else:
            print(
                'Loading Word Embeddings: index_dict and embeddings weights from disk ... '
            )
            embedding_weights = joblib.load(embedding_weights_file_path)
            print("Word Embedding pickled files loaded into memory!")
            return embedding_weights

    def generate_vocabulary_set(self, masking=False):
        # Load data from files
        print "Generating Vocabulary set from Input Data file (s): {}".format(
            self.conf.input_file_list)

        vocab_index_dict = {}
        vocab_set = set()

        # Adding a padding word, unknown word and space
        vocab_set.add('<PAD/>')
        vocab_set.add('<UNK/>')
        vocab_set.add(' ')

        # Data-Set Line Format: {'q': query, 'doc_corr': correct_url_doc, 'doc_incorr': incorrect_doc_list}
        less_doc_cnt = 0

        for model_training_data_file in self.conf.input_file_list:
            with open(model_training_data_file) as fo:
                for line in fo:
                    data = json.loads(line)
                    if len(data['doc_incorr']
                           ) == self.conf.num_negative_examples:
                        s_list = []
                        s_list.append(data['q'])
                        s_list.append(data['doc_corr'])
                        s_list += data['doc_incorr']
                        x_vocab = self.du.build_vocab(
                            self.du.get_text_feature_splits(
                                s_list, mode=self.conf.feature_level))
                        for i in x_vocab:
                            if i not in vocab_set:
                                # print "Vocab_Entity: {}".format(i.encode('utf-8'))
                                vocab_set.add(i)
                    else:
                        less_doc_cnt += 1

        if masking:
            i = 0  # Index 0 is reserved for masking, so real entries start at 1.
            masking_value = "masked"
        else:
            i = -1  # No reserved row; entries start at 0.
            masking_value = "non_masked"

        for word in vocab_set:
            i += 1
            vocab_index_dict[word] = i

        if self.conf.create_data_dump:
            print "Dumping Vocabulary Set and Index - dict to Disk!"
            joblib.dump(vocab_set,
                        self.conf.vocab_set_file.format(masking_value))
            joblib.dump(vocab_index_dict,
                        self.conf.vocab_index_file.format(masking_value))
        return vocab_set, vocab_index_dict

    def load_data_generator(self,
                            vocab_index_dict,
                            mode=None,
                            batch_size=128,
                            nb_epochs=1):
        """
        Loads MR polarity data from files, splits the data into words and generates labels.
        Returns split sentences and labels.
        """
        if mode is None:
            raise Exception(
                "Please provide mode as either 'training' or 'validation'")

        input_dataset_file = ""
        if mode == "training":
            input_dataset_file = self.conf.model_training_data

        elif mode == "validation":
            input_dataset_file = self.conf.model_validation_data

        # print "\nLoading Model Training Data: {}\n".format(input_dataset_file)
        # Data-Set Line Format: {'q': query, 'doc_corr': correct_url_doc, 'doc_incorr': incorrect_doc_list}

        for epoch in range(0, nb_epochs + 2):
            less_doc_cnt = 0
            with open(input_dataset_file, 'r') as fin:
                while True:
                    batch_rows = list(islice(fin, batch_size))

                    if not batch_rows:
                        break

                    batch_query_data = np.empty(shape=(0, 0), dtype=np.int32)
                    batch_pos_query_data = np.empty(shape=(0, 0),
                                                    dtype=np.int32)
                    batch_neg_query_data = [
                        np.empty(shape=(0, 0), dtype=np.int32)
                        for _ in range(0, self.conf.num_negative_examples)
                    ]

                    for line in batch_rows:
                        data = json.loads(line)
                        if len(data['doc_incorr']
                               ) == self.conf.num_negative_examples:
                            input_data_list = [[data['q']], [data['doc_corr']],
                                               data['doc_incorr']]
                            # Build Input Data
                            for n, x in enumerate(input_data_list):
                                if n == 0:
                                    for i in xrange(0, len(x)):
                                        x_array = self.du.build_input_data(
                                            self.du.pad_sentences(
                                                self.du.
                                                get_text_feature_splits(
                                                    x[i],
                                                    cutoff=self.conf.
                                                    query_length,
                                                    mode=self.conf.
                                                    feature_level),
                                                self.conf.query_length),
                                            vocab_index_dict,
                                            return_array=True)
                                        if batch_query_data.shape[0] == 0:
                                            batch_query_data = x_array
                                        else:
                                            batch_query_data = np.vstack(
                                                (batch_query_data, x_array))

                                elif n == 1:
                                    for i in xrange(0, len(x)):
                                        x_array = self.du.build_input_data(
                                            self.du.pad_sentences(
                                                self.du.
                                                get_text_feature_splits(
                                                    x[i],
                                                    cutoff=self.conf.
                                                    document_length,
                                                    mode=self.conf.
                                                    feature_level),
                                                self.conf.document_length),
                                            vocab_index_dict,
                                            return_array=True)
                                        if batch_pos_query_data.shape[0] == 0:
                                            batch_pos_query_data = x_array
                                        else:
                                            batch_pos_query_data = np.vstack(
                                                (batch_pos_query_data,
                                                 x_array))
                                elif n == 2:
                                    for i in xrange(0, len(x)):
                                        x_array = self.du.build_input_data(
                                            self.du.pad_sentences(
                                                self.du.
                                                get_text_feature_splits(
                                                    x[i],
                                                    cutoff=self.conf.
                                                    document_length,
                                                    mode=self.conf.
                                                    feature_level),
                                                self.conf.document_length),
                                            vocab_index_dict,
                                            return_array=True)
                                        if batch_neg_query_data[i].shape[
                                                0] == 0:
                                            batch_neg_query_data[i] = x_array
                                        else:
                                            batch_neg_query_data[
                                                i] = np.vstack(
                                                    (batch_neg_query_data[i],
                                                     x_array))
                        else:
                            less_doc_cnt += 1

                    batch_y_data = np.ones(len(batch_query_data))
                    yield [batch_query_data, batch_pos_query_data
                           ] + batch_neg_query_data, batch_y_data

            #print "Number of skipped data points: Incorrect Documents in Training Data (< 3): {}".format(less_doc_cnt)

    def get_vocab_index_embedding_weights(self,
                                          embedding_dim,
                                          embedding_weights_masking,
                                          load_embeddings_pickled=False,
                                          load_vocab_pickled=False):
        embedding_weights = []
        if embedding_weights_masking:
            masking_value = "masked"  # Selects the "masked" vocabulary/embedding files.
        else:
            masking_value = "non_masked"  # Selects the "non_masked" vocabulary/embedding files.

        # Load data from files
        if load_vocab_pickled:
            vocab_index_dict = joblib.load(
                self.conf.vocab_index_file.format(masking_value))
            vocab_set = joblib.load(
                self.conf.vocab_set_file.format(masking_value))

        else:
            vocab_set, vocab_index_dict = self.generate_vocabulary_set(
                masking=embedding_weights_masking)

        if self.conf.feature_level == "word":
            embedding_weights = self.load_word_embeddings_compact(
                embedding_dim,
                vocab_set,
                masking=embedding_weights_masking,
                use_pickled=load_embeddings_pickled)
        return embedding_weights, vocab_index_dict
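
A condensed, self-contained sketch of the embedding-matrix construction performed in load_word_embeddings_compact: one row per vocabulary entry plus a reserved zero row when masking, known words copied from the vector file, unknown words drawn from U(-0.25, 0.25):

import numpy as np

embedding_dim = 4
vocab_list = ['<PAD/>', '<UNK/>', 'statue', 'liberty']
word_vector_dict = {'statue': np.ones(embedding_dim)}  # pretend only one word was found on disk

start_index = 1  # row 0 reserved for masking
embedding_weights = np.zeros((len(vocab_list) + 1, embedding_dim))
for i, word in enumerate(vocab_list, start=start_index):
    if word in word_vector_dict:
        embedding_weights[i, :] = word_vector_dict[word]
    else:
        embedding_weights[i, :] = np.random.uniform(-0.25, 0.25, embedding_dim)
print(embedding_weights.shape)  # (5, 4)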
Example #15
 def __init__(self):
     # Class Object Initialization.
     self.conf = Configuration()
     self.genutil = GeneralUtils()
     self.du = DataUtils()
Example #16
                        default=None)

    args = parser.parse_args()
    if args.train:
        if not Config.LOG_DIR.exists():
            Config.LOG_DIR.mkdir(parents=True)
        if not Config.CHECKPOINT_DIR.exists():
            Config.CHECKPOINT_DIR.mkdir(parents=True)
        model = Model(DataSet().create())
        model.fit()
    if args.eval:
        model = Model(DataSet().create(), args.load_model)
        history = model.eval()
        print('hello')
    if args.adjust_data:
        DataUtils.adjust_data()
    if args.analyze_data:
        if not Config.VIZ_RESULTS_DIR.exists():
            Config.VIZ_RESULTS_DIR.mkdir()
        Evaluation()
    if args.predict_images is not None:
        model = Model(DataSet(), args.load_model)
        images = DataUtils.load_all_images(args.predict_images)
        predictions = model.predict(images)

        for img, pred in zip(images, predictions):
            plt.figure(figsize=(20, 20))
            plt.imshow(img)
            plt.title(pred)
            plt.show()
Example #17
st.title("Kantina Basketball Association")

image = Image.open("kba_logo.jpg")
st.image(image, use_column_width=True)

st.markdown("""### NBA player statistics and yahoo fanpoints for KBA league.
* **Data source:** [Basketball-reference.com](https://www.basketball-reference.com/)
""")

st.sidebar.header("Filtering")
selected_year = st.sidebar.selectbox(
    "Year",
    list(
        reversed([
            str(year) + "-" + str(year + 1)
            for year in range(1980, DataUtils.get_season_year())
        ])),
)
selected_category = st.sidebar.selectbox("Stats", ("Avg", "Total"))


@st.cache
def load_data(year: str, stat_type: str):
    url = DATA_SOURCE_URL + str(year).split("-")[1] + stat_type + ".html"
    html = pd.read_html(url, header=0)
    data_frame = html[0]
    raw = data_frame.drop(data_frame[
        data_frame.Age == "Age"].index)  # Deletes repeating headers in content
    raw = raw.fillna(0)
    playerstats = raw.drop(["Rk"], axis=1)
    aggregation_functions = {}
Example #18
#!/usr/bin/python
# -*- coding: UTF-8 -*-

from utils.data_utils import DataUtils
from data_helpers import DataHelpers
du = DataUtils()
dh = DataHelpers()
train_data = "statue of liberty"
train_data_list = ["statue of liberty", "new york"]

# print du.get_text_feature_splits(train_data, mode='word')
# print du.get_text_feature_splits(train_data_list, mode='word')
# print du.get_text_feature_splits(train_data, mode='char')
# print du.get_text_feature_splits(train_data_list, mode='char')
#
# print du.get_text_feature_splits(train_data, mode='char', cutoff=5)
# print du.get_text_feature_splits(train_data_list, mode='char', cutoff=5)
# print "ngram ..."
# print du.get_text_feature_splits(train_data, mode='ngram')
# print du.get_text_feature_splits(train_data_list, mode='ngram')
# print du.get_text_feature_splits(train_data, mode='ngram', cutoff=5)
# print du.get_text_feature_splits(train_data_list, mode='ngram', cutoff=5)

dh.generate_vocabulary_set()
Example #19
class Train(object):
    def __init__(self):
        self.tfrecords_path = FLAGS.tfrecords_path
        self.checkpoint_path = FLAGS.checkpoint_path
        self.tensorboard_path = FLAGS.tensorboard_path

        self.use_crf = FLAGS.use_crf
        self.learning_rate = FLAGS.learning_rate
        self.learning_rate_decay_factor = FLAGS.learning_rate_decay_factor
        self.decay_steps = FLAGS.decay_steps
        self.clip_norm = FLAGS.clip_norm
        self.max_training_step = FLAGS.max_training_step

        self.train_tfrecords_filename = os.path.join(self.tfrecords_path,
                                                     'train.tfrecords')
        self.test_tfrecords_filename = os.path.join(self.tfrecords_path,
                                                    'test.tfrecords')

        self.data_utils = DataUtils()
        self.num_classes = self.data_utils.get_vocabulary_size(
            os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
        self.tensorflow_utils = TensorflowUtils()
        self.sequence_labeling_model = SequenceLabelingModel()

    def train(self):
        """
        Train the BiLSTM sequence labeling model, optionally with a CRF layer.
        :return:
        """
        train_data = self.tensorflow_utils.read_and_decode(
            self.train_tfrecords_filename)
        train_batch_features, train_batch_labels, train_batch_features_lengths = train_data
        test_data = self.tensorflow_utils.read_and_decode(
            self.test_tfrecords_filename)
        test_batch_features, test_batch_labels, test_batch_features_lengths = test_data

        with tf.device('/cpu:0'):
            global_step = tf.Variable(0, name='global_step', trainable=False)
        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(self.learning_rate,
                                        global_step,
                                        self.decay_steps,
                                        self.learning_rate_decay_factor,
                                        staircase=True)
        optimizer = tf.train.RMSPropOptimizer(lr)

        with tf.variable_scope('model'):
            logits = self.sequence_labeling_model.inference(
                train_batch_features,
                train_batch_features_lengths,
                self.num_classes,
                is_training=True)
        train_batch_labels = tf.to_int64(train_batch_labels)

        if self.use_crf:
            loss, transition_params = self.sequence_labeling_model.crf_loss(
                logits, train_batch_labels, train_batch_features_lengths,
                self.num_classes)
        else:
            slice_logits, slice_train_batch_labels = self.sequence_labeling_model.slice_seq(
                logits, train_batch_labels, train_batch_features_lengths)
            loss = self.sequence_labeling_model.loss(slice_logits,
                                                     slice_train_batch_labels)

        with tf.variable_scope('model', reuse=True):
            accuracy_logits = self.sequence_labeling_model.inference(
                test_batch_features,
                test_batch_features_lengths,
                self.num_classes,
                is_training=False)
        test_batch_labels = tf.to_int64(test_batch_labels)
        if self.use_crf:
            accuracy = self.sequence_labeling_model.crf_accuracy(
                accuracy_logits, test_batch_labels,
                test_batch_features_lengths, transition_params,
                self.num_classes)
        else:
            slice_accuracy_logits, slice_test_batch_labels = self.sequence_labeling_model.slice_seq(
                accuracy_logits, test_batch_labels,
                test_batch_features_lengths)
            accuracy = self.sequence_labeling_model.accuracy(
                slice_accuracy_logits, slice_test_batch_labels)

        # summary
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('lr', lr)

        # compute and update gradient
        # train_op = optimizer.minimize(loss, global_step=global_step)

        # computer, clip and update gradient
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        clip_gradients, _ = tf.clip_by_global_norm(gradients, self.clip_norm)
        train_op = optimizer.apply_gradients(zip(clip_gradients, variables),
                                             global_step=global_step)

        init_op = tf.global_variables_initializer()
        saver = tf.train.Saver(max_to_keep=None)
        checkpoint_filename = os.path.join(self.checkpoint_path, 'model.ckpt')

        with tf.Session() as sess:
            summary_op = tf.summary.merge_all()
            writer = tf.summary.FileWriter(self.tensorboard_path, sess.graph)
            sess.run(init_op)

            ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
            if ckpt and ckpt.model_checkpoint_path:
                print('Continue training from the model {}'.format(
                    ckpt.model_checkpoint_path))
                saver.restore(sess, ckpt.model_checkpoint_path)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)

            max_accuracy = 0.0
            min_loss = 100000000.0
            try:
                while not coord.should_stop():
                    _, loss_value, step = sess.run(
                        [train_op, loss, global_step])
                    if step % 100 == 0:
                        accuracy_value, summary_value, lr_value = sess.run(
                            [accuracy, summary_op, lr])
                        china_tz = pytz.timezone('Asia/Shanghai')
                        current_time = datetime.datetime.now(china_tz)
                        print('[{}] Step: {}, loss: {}, accuracy: {}, lr: {}'.
                              format(current_time, step, loss_value,
                                     accuracy_value, lr_value))
                        if accuracy_value > max_accuracy and loss_value < min_loss:
                            writer.add_summary(summary_value, step)
                            data_clean.clean_checkpoint(self.checkpoint_path)
                            saver.save(sess,
                                       checkpoint_filename,
                                       global_step=step)
                            print('save model to %s-%d' %
                                  (checkpoint_filename, step))
                            max_accuracy = accuracy_value
                            min_loss = loss_value
                    if step >= self.max_training_step:
                        print('Done training after %d step' % step)
                        break
            except tf.errors.OutOfRangeError:
                print('Done training after reading all data')
            finally:
                coord.request_stop()
            coord.join(threads)
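
For reference, with staircase=True the learning-rate schedule above reduces to learning_rate * decay_factor ** (global_step // decay_steps); a plain-Python sketch:

def staircase_decay(learning_rate, global_step, decay_steps, decay_factor):
    # Mirrors tf.train.exponential_decay(..., staircase=True).
    return learning_rate * decay_factor ** (global_step // decay_steps)

print(staircase_decay(0.01, 2500, 1000, 0.95))  # 0.01 * 0.95 ** 2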
Example #20
    def run(self):
        self.data_loader_utils = DataLoaderUtils(
            self.load_config.server, self.index, self.type,
            self.load_config.server_username, self.load_config.server_password)

        self.data_utils = DataUtils()

        count = 0
        bulk_data = ''

        ids_to_load = self.data_loader_batch.keys()

        if not self.create_only:
            # Create ids to fetch
            ids_to_fetch = []
            for _id in ids_to_load:
                es_id = self.get_es_id(_id)
                ids_to_fetch.append(es_id)

            # Fetch ids
            self.load_config.log(LOG_LEVEL_TRACE, 'Fetching docs',
                                 self.load_config.server, self.index,
                                 self.type)

            self.data_utils.batch_fetch_docs_for_ids(
                self.load_config.server, ids_to_fetch, self.index, self.type,
                self.docs_fetched, self.load_config.doc_fetch_batch_size,
                self.load_config.server_username,
                self.load_config.server_password)

        for _id in ids_to_load:
            data_for_id = self.data_loader_batch[_id]
            es_id = self.get_es_id(_id)

            if es_id in self.existing_docs:
                # Update doc
                existing_doc = self.existing_docs[es_id]
                doc = self.load_config.data_mapper.update_doc(
                    existing_doc=existing_doc,
                    _id=_id,
                    data_source_name=self.load_config.data_source_name,
                    data=data_for_id)
                if self.load_config.test_mode and count % 2500 == 0:
                    # print 'Existing doc', self.load_manager.data_mapper.extract_fields_from_existing_doc(existing_doc)
                    self.load_config.log(LOG_LEVEL_INFO, 'Data', data_for_id)
                    self.load_config.log(
                        LOG_LEVEL_INFO,
                        '--------------------------------------------------------'
                    )
                    self.load_config.log(LOG_LEVEL_INFO, 'Updated doc', doc)

                if len(doc) > 0:
                    bulk_data += self.data_loader_utils.bulk_update_header(
                        es_id)
                    bulk_data += '\n'
                    doc = {'doc': doc}
                    bulk_data += json.dumps(doc)
                    bulk_data += '\n'
                else:
                    self.add_to_failed_docs(
                        _id, data_for_id,
                        'Data mapper: update doc returned empty')
            elif self.allow_doc_creation:
                # Create new doc
                doc = self.load_config.data_mapper.create_doc(
                    _id=_id,
                    data_source_name=self.load_config.data_source_name,
                    data=data_for_id)
                if self.load_config.test_mode and count % 2500 == 0:
                    self.load_config.log(LOG_LEVEL_INFO, 'Data', data_for_id)
                    self.load_config.log(
                        LOG_LEVEL_INFO,
                        '--------------------------------------------------------'
                    )
                    self.load_config.log(LOG_LEVEL_INFO, 'Updated doc', doc)

                if len(doc) > 0:
                    bulk_data += self.data_loader_utils.bulk_index_header(
                        es_id)
                    bulk_data += '\n'
                    bulk_data += json.dumps(doc)
                    bulk_data += '\n'
                else:
                    self.add_to_failed_docs(
                        _id, data_for_id,
                        'Data mapper: create doc returned empty')
            else:
                self.add_to_failed_docs(
                    _id, data_for_id, 'Update failed: existing doc not found')

            count += 1
            if count % 500 == 0:
                self.load_config.log(LOG_LEVEL_DEBUG, 'Processed', count,
                                     'docs')

            if len(bulk_data) >= self.load_config.bulk_data_size:
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

        if not self.load_config.test_mode:
            self.save_summary(ids_to_load)
Example #21
class DataLoader(object):
    def __init__(self,
                 load_config,
                 data_loader_batch,
                 _index,
                 _type,
                 data_source_batch_name=None):
        self.load_config = load_config

        self.data_loader_batch = data_loader_batch
        self.index = _index
        self.type = _type

        self.data_source_batch_directory = self.load_config.data_source_batch_directory(
            data_source_batch_name)
        self.failed_docs_directory = self.load_config.failed_docs_directory(
            data_source_batch_name)
        self.loaded_docs_directory = self.load_config.loaded_docs_directory(
            data_source_batch_name)
        self.bulk_update_response_directory = self.load_config.bulk_update_response_directory(
            data_source_batch_name)

        self.existing_docs = {}

        self.failed_docs = {}
        self.updated_ids = {}
        self.indexed_ids = {}

        self.allow_doc_creation = self.load_config.data_mapper.allow_doc_creation(
            self.load_config.data_source_name)
        self.create_only = self.load_config.data_mapper.create_only(
            self.load_config.data_source_name)

        self.data_loader_utils = None
        self.data_utils = None

    def get_es_id(self, doc_id):
        return self.load_config.data_mapper.get_es_id(doc_id)

    def get_doc_id(self, es_id):
        return self.load_config.data_mapper.get_doc_id(es_id)

    def docs_fetched(self, docs, index, type):
        self.load_config.log(LOG_LEVEL_TRACE, 'Docs fetched', len(docs))
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                self.existing_docs[_id] = existing_doc

    def run(self):
        self.data_loader_utils = DataLoaderUtils(
            self.load_config.server, self.index, self.type,
            self.load_config.server_username, self.load_config.server_password)

        self.data_utils = DataUtils()

        count = 0
        bulk_data = ''

        ids_to_load = self.data_loader_batch.keys()

        if not self.create_only:
            # Create ids to fetch
            ids_to_fetch = []
            for _id in ids_to_load:
                es_id = self.get_es_id(_id)
                ids_to_fetch.append(es_id)

            # Fetch ids
            self.load_config.log(LOG_LEVEL_TRACE, 'Fetching docs',
                                 self.load_config.server, self.index,
                                 self.type)

            self.data_utils.batch_fetch_docs_for_ids(
                self.load_config.server, ids_to_fetch, self.index, self.type,
                self.docs_fetched, self.load_config.doc_fetch_batch_size,
                self.load_config.server_username,
                self.load_config.server_password)

        for _id in ids_to_load:
            data_for_id = self.data_loader_batch[_id]
            es_id = self.get_es_id(_id)

            if es_id in self.existing_docs:
                # Update doc
                existing_doc = self.existing_docs[es_id]
                doc = self.load_config.data_mapper.update_doc(
                    existing_doc=existing_doc,
                    _id=_id,
                    data_source_name=self.load_config.data_source_name,
                    data=data_for_id)
                if self.load_config.test_mode and count % 2500 == 0:
                    # print 'Existing doc', self.load_manager.data_mapper.extract_fields_from_existing_doc(existing_doc)
                    self.load_config.log(LOG_LEVEL_INFO, 'Data', data_for_id)
                    self.load_config.log(
                        LOG_LEVEL_INFO,
                        '--------------------------------------------------------'
                    )
                    self.load_config.log(LOG_LEVEL_INFO, 'Updated doc', doc)

                if len(doc) > 0:
                    bulk_data += self.data_loader_utils.bulk_update_header(
                        es_id)
                    bulk_data += '\n'
                    doc = {'doc': doc}
                    bulk_data += json.dumps(doc)
                    bulk_data += '\n'
                else:
                    self.add_to_failed_docs(
                        _id, data_for_id,
                        'Data mapper: update doc returned empty')
            elif self.allow_doc_creation:
                # Create new doc
                doc = self.load_config.data_mapper.create_doc(
                    _id=_id,
                    data_source_name=self.load_config.data_source_name,
                    data=data_for_id)
                if self.load_config.test_mode and count % 2500 == 0:
                    self.load_config.log(LOG_LEVEL_INFO, 'Data', data_for_id)
                    self.load_config.log(
                        LOG_LEVEL_INFO,
                        '--------------------------------------------------------'
                    )
                    self.load_config.log(LOG_LEVEL_INFO, 'Created doc', doc)

                if len(doc) > 0:
                    bulk_data += self.data_loader_utils.bulk_index_header(
                        es_id)
                    bulk_data += '\n'
                    bulk_data += json.dumps(doc)
                    bulk_data += '\n'
                else:
                    self.add_to_failed_docs(
                        _id, data_for_id,
                        'Data mapper: create doc returned empty')
            else:
                self.add_to_failed_docs(
                    _id, data_for_id, 'Update failed: existing doc not found')

            count += 1
            if count % 500 == 0:
                self.load_config.log(LOG_LEVEL_DEBUG, 'Processed', count,
                                     'docs')

            if len(bulk_data) >= self.load_config.bulk_data_size:
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

        if not self.load_config.test_mode:
            self.save_summary(ids_to_load)

    def load_bulk_data(self, bulk_data):
        self.load_config.log(LOG_LEVEL_DEBUG, 'Bulk data size', len(bulk_data),
                             'loading...')
        response = None
        if not self.load_config.test_mode:
            response = self.data_loader_utils.load_bulk_data(bulk_data)

        if response:
            self.load_config.log(LOG_LEVEL_DEBUG,
                                 'Done loading bulk data, saving response')
            if not self.load_config.test_mode:
                # Extract and save the failed docs
                self.process_bulk_update_response(response)
        else:
            self.load_config.log(LOG_LEVEL_ERROR, 'Bulk data load failed')

    def process_response_item(self, item, op):
        item_op = item[op]
        es_id = item_op['_id']
        _id = self.get_doc_id(es_id)
        try:
            doc = self.data_loader_batch[es_id]
        except KeyError:
            doc = self.data_loader_batch[_id]

        if 'status' in item_op:
            if item_op['status'] == 200 or item_op['status'] == 201:
                # doc success
                if op == OP_INDEX:
                    self.indexed_ids[_id] = 0
                elif op == OP_UPDATE:
                    self.updated_ids[_id] = 0
            else:
                self.add_to_failed_docs(_id, doc, item)
        else:
            self.add_to_failed_docs(_id, doc, item)

    def process_bulk_update_response(self, response):
        load_summary = json.loads(response)
        items = load_summary['items']
        # print load_summary
        for item in items:
            if OP_INDEX in item:
                self.process_response_item(item, OP_INDEX)
            elif OP_UPDATE in item:
                self.process_response_item(item, OP_UPDATE)

        # save response to file
        self.load_config.log(LOG_LEVEL_TRACE, 'Updated ids:',
                             len(self.updated_ids), 'Indexed ids:',
                             len(self.indexed_ids), 'Failed ids:',
                             len(self.failed_docs))
        bulk_update_response_file_name = file_utils.batch_file_name_with_prefix(
            'summary')
        file_utils.save_text_file(self.bulk_update_response_directory,
                                  bulk_update_response_file_name + '.json',
                                  response)

    def save_summary(self, ids_to_load):
        data_loader_batch_name = file_utils.batch_file_name_with_prefix(
            DATA_LOADER_BATCH_PREFIX)

        # Find skipped ids
        for _id in ids_to_load:
            if _id not in self.updated_ids and _id not in self.indexed_ids and _id not in self.failed_docs:
                doc = self.data_loader_batch[_id]
                self.add_to_failed_docs(_id, doc, 'Skipped')

        # Save failed docs
        if len(self.failed_docs) > 0:
            file_utils.save_file(self.failed_docs_directory,
                                 data_loader_batch_name + '.json',
                                 self.failed_docs)

        # Save batch summary
        summary = {
            'indexed_ids': list(self.indexed_ids.keys()),
            'updated_ids': list(self.updated_ids.keys()),
        }

        file_utils.save_file(self.loaded_docs_directory,
                             data_loader_batch_name + '.json', summary)

        # Print summary
        self.load_config.log(
            LOG_LEVEL_INFO,
            '---------------------------------------------------------------------------------------------'
        )
        self.load_config.log(LOG_LEVEL_INFO, self.load_config.server,
                             self.load_config.server_username, self.index,
                             self.type, ' Updated docs:',
                             len(self.updated_ids) + len(self.indexed_ids),
                             ', Failed docs:', len(self.failed_docs))
        self.load_config.log(
            LOG_LEVEL_INFO,
            '---------------------------------------------------------------------------------------------'
        )

    def add_to_failed_docs(self, _id, doc, reason):
        data_for_id = {'reason': reason, 'doc': doc}

        self.failed_docs[_id] = data_for_id
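For reference, the bulk_data string assembled in run() follows the Elasticsearch bulk API format: newline-delimited JSON where each action header line is followed by its payload line (a {'doc': ...} partial for updates, the full document for index actions). A minimal sketch of what the header helpers presumably produce; the index/type names and the exact header fields here are assumptions for illustration, not the actual DataLoaderUtils implementation:

import json

# Hypothetical stand-ins for DataLoaderUtils.bulk_update_header / bulk_index_header
def bulk_update_header(es_id, index='my_index', doc_type='my_type'):
    return json.dumps({'update': {'_index': index, '_type': doc_type, '_id': es_id}})

def bulk_index_header(es_id, index='my_index', doc_type='my_type'):
    return json.dumps({'index': {'_index': index, '_type': doc_type, '_id': es_id}})

# One update action as assembled in run(): header line, then the partial doc wrapped in {'doc': ...}
bulk_data = bulk_update_header('42') + '\n' + json.dumps({'doc': {'title': 'example'}}) + '\n'
print(bulk_data)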
Example No. 22
    def __init__(self):
        self.data_utils = DataUtils()
Example No. 23
    def get_editable_original_data_str(self) -> str:
        return DataUtils.parse_arr_data_to_comma_str_data(self.editable_original_data)
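A minimal sketch of what DataUtils.parse_arr_data_to_comma_str_data presumably does, assuming it simply joins the array elements into a comma-separated string (the actual implementation is not shown in these examples):

# Hypothetical stand-in for DataUtils.parse_arr_data_to_comma_str_data
def parse_arr_data_to_comma_str_data(arr_data):
    # Join array elements into a single comma-separated string, e.g. [1, 2, 3] -> '1,2,3'
    return ','.join(str(item) for item in arr_data)

print(parse_arr_data_to_comma_str_data([1, 2, 3]))  # '1,2,3'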
Example No. 24
from bayes.origin.mulyinomial_native_bayes import MultinomialNB
from bayes.origin.gaussian_native_bayes import GaussianNB
from utils.data_utils import DataUtils

x, y = DataUtils.get_data_set('data/mushroom.txt', split=',')
print(x)
print(y)
nb = GaussianNB()
nb.fit(x, y)

nb.evaluate(x, y)
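Note that nb.evaluate(x, y) above scores the classifier on its own training data, which overstates accuracy. A minimal hold-out variant, assuming x and y support slicing and that fit/evaluate keep the signatures used above:

# Hold out the last 20% of samples for evaluation (shuffle first if the file is ordered by class)
split_point = int(len(x) * 0.8)
x_train, y_train = x[:split_point], y[:split_point]
x_test, y_test = x[split_point:], y[split_point:]

nb = GaussianNB()
nb.fit(x_train, y_train)
nb.evaluate(x_test, y_test)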
Example No. 25
    def get_answer_data_str(self) -> str:
        return DataUtils.parse_arr_data_to_comma_str_data(self.answer_data)
Example No. 26
class Predict(object):
    def __init__(self):
        self.vocab_path = FLAGS.vocab_path
        self.checkpoint_path = FLAGS.checkpoint_path
        self.freeze_graph_path = FLAGS.freeze_graph_path
        self.saved_model_path = FLAGS.saved_model_path

        self.use_crf = FLAGS.use_crf
        self.num_steps = FLAGS.num_steps

        self.default_label = FLAGS.default_label
        self.default_score = FLAGS.default_predict_score

        self.data_utils = DataUtils()
        self.tensorflow_utils = TensorflowUtils()
        self.num_classes = self.data_utils.get_vocabulary_size(
            os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
        self.sequence_labeling_model = SequenceLabelingModel()
        self.init_predict_graph()

    def init_predict_graph(self):
        """
        Initialize the prediction graph
        :return:
        """
        # split 1-D String dense Tensor to words SparseTensor
        self.input_sentences = tf.placeholder(dtype=tf.string,
                                              shape=[None],
                                              name='input_sentences')
        sparse_words = tf.string_split(self.input_sentences, delimiter=' ')

        # slice SparseTensor
        valid_indices = tf.less(sparse_words.indices,
                                tf.constant([self.num_steps], dtype=tf.int64))
        valid_indices = tf.reshape(
            tf.split(valid_indices, [1, 1], axis=1)[1], [-1])
        valid_sparse_words = tf.sparse_retain(sparse_words, valid_indices)

        excess_indices = tf.greater_equal(
            sparse_words.indices, tf.constant([self.num_steps],
                                              dtype=tf.int64))
        excess_indices = tf.reshape(
            tf.split(excess_indices, [1, 1], axis=1)[1], [-1])
        excess_sparse_words = tf.sparse_retain(sparse_words, excess_indices)

        # compute sentences lengths
        int_values = tf.ones(shape=tf.shape(valid_sparse_words.values),
                             dtype=tf.int64)
        int_valid_sparse_words = tf.SparseTensor(
            indices=valid_sparse_words.indices,
            values=int_values,
            dense_shape=valid_sparse_words.dense_shape)
        input_sentences_lengths = tf.sparse_reduce_sum(int_valid_sparse_words,
                                                       axis=1)

        # sparse to dense
        default_padding_word = self.data_utils._START_VOCAB[0]
        words = tf.sparse_to_dense(
            sparse_indices=valid_sparse_words.indices,
            output_shape=[valid_sparse_words.dense_shape[0], self.num_steps],
            sparse_values=valid_sparse_words.values,
            default_value=default_padding_word)

        # dict words to ids
        with open(os.path.join(self.vocab_path, 'words_vocab.txt'),
                  encoding='utf-8',
                  mode='rt') as data_file:
            words_table_list = [
                line.strip() for line in data_file if line.strip()
            ]
        words_table_tensor = tf.constant(words_table_list, dtype=tf.string)
        words_table = lookup.index_table_from_tensor(
            mapping=words_table_tensor,
            default_value=self.data_utils._START_VOCAB_ID[3])
        # words_table = lookup.index_table_from_file(os.path.join(vocab_path, 'words_vocab.txt'), default_value=3)
        words_ids = words_table.lookup(words)

        # blstm model predict
        with tf.variable_scope('model', reuse=None):
            logits = self.sequence_labeling_model.inference(
                words_ids,
                input_sentences_lengths,
                self.num_classes,
                is_training=False)

        if self.use_crf:
            logits = tf.reshape(logits,
                                shape=[-1, self.num_steps, self.num_classes])
            transition_params = tf.get_variable(
                "transitions", [self.num_classes, self.num_classes])
            input_sentences_lengths = tf.to_int32(input_sentences_lengths)
            predict_labels_ids, sequence_scores = crf.crf_decode(
                logits, transition_params, input_sentences_lengths)
            predict_labels_ids = tf.to_int64(predict_labels_ids)
            sequence_scores = tf.reshape(sequence_scores, shape=[-1, 1])
            normalized_sequence_scores = self.tensorflow_utils.score_normalize(
                sequence_scores)
            predict_scores = tf.matmul(
                normalized_sequence_scores,
                tf.ones(shape=[1, self.num_steps], dtype=tf.float32))
        else:
            props = tf.nn.softmax(logits)
            max_prop_values, max_prop_indices = tf.nn.top_k(props, k=1)
            predict_labels_ids = tf.reshape(max_prop_indices,
                                            shape=[-1, self.num_steps])
            predict_labels_ids = tf.to_int64(predict_labels_ids)
            predict_scores = tf.reshape(max_prop_values,
                                        shape=[-1, self.num_steps])
        predict_scores = tf.as_string(predict_scores, precision=3)

        # dict ids to labels
        with open(os.path.join(self.vocab_path, 'labels_vocab.txt'),
                  encoding='utf-8',
                  mode='rt') as data_file:
            labels_table_list = [
                line.strip() for line in data_file if line.strip()
            ]
        labels_table_tensor = tf.constant(labels_table_list, dtype=tf.string)
        labels_table = lookup.index_to_string_table_from_tensor(
            mapping=labels_table_tensor, default_value=self.default_label)
        # labels_table = lookup.index_to_string_table_from_file(os.path.join(vocab_path, 'labels_vocab.txt'), default_value='O')
        predict_labels = labels_table.lookup(predict_labels_ids)

        sparse_predict_labels = self.tensorflow_utils.sparse_concat(
            predict_labels, valid_sparse_words, excess_sparse_words,
            self.default_label)
        sparse_predict_scores = self.tensorflow_utils.sparse_concat(
            predict_scores, valid_sparse_words, excess_sparse_words, '0.0')

        self.format_predict_labels = self.tensorflow_utils.sparse_string_join(
            sparse_predict_labels, 'predict_labels')
        self.format_predict_scores = self.tensorflow_utils.sparse_string_join(
            sparse_predict_scores, 'predict_scores')

        saver = tf.train.Saver()
        tables_init_op = tf.tables_initializer()

        self.sess = tf.Session()
        self.sess.run(tables_init_op)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            print('read model from {}'.format(ckpt.model_checkpoint_path))
            saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found at %s' % self.checkpoint_path)
            return

    def predict(self, words_list):
        """
        Predict labels for a list of space-separated word strings.
        The conversion of words to ids is handled inside the TensorFlow graph.
        :param words_list:
        :return:
        """
        split_words_list = []
        map_split_indexes = []
        for index in range(len(words_list)):
            temp_words_list = self.data_utils.split_long_sentence(
                words_list[index], self.num_steps)
            map_split_indexes.append(
                list(
                    range(len(split_words_list),
                          len(split_words_list) + len(temp_words_list))))
            split_words_list.extend(temp_words_list)

        predict_labels, predict_scores = self.sess.run(
            [self.format_predict_labels, self.format_predict_scores],
            feed_dict={self.input_sentences: split_words_list})
        predict_labels_str = [
            predict_label.decode('utf-8') for predict_label in predict_labels
        ]
        predict_scores_str = [
            predict_score.decode('utf-8') for predict_score in predict_scores
        ]

        merge_predict_labels_str = []
        merge_predict_scores_str = []
        for indexes in map_split_indexes:
            merge_predict_label_str = ' '.join(
                [predict_labels_str[index] for index in indexes])
            merge_predict_labels_str.append(merge_predict_label_str)
            merge_predict_score_str = ' '.join(
                [predict_scores_str[index] for index in indexes])
            merge_predict_scores_str.append(merge_predict_score_str)

        return merge_predict_labels_str, merge_predict_scores_str

    def file_predict(self, data_filename, predict_filename):
        """
        Predict the sentences in data_filename and save the results into predict_filename.
        Labels are assigned per single word, using the -B / -M / -E / -S scheme.
        :param data_filename:
        :param predict_filename:
        :return:
        """
        print('Predict file ' + data_filename)
        sentence_list = []
        words_list = []
        labels_list = []
        predict_labels_list = []
        with open(data_filename, encoding='utf-8', mode='rt') as data_file:
            for line in data_file:
                words, labels = self.data_utils.split(line)
                if words and labels:
                    sentence_list.append(''.join(words))
                    words_list.append(' '.join(words))
                    labels_list.append(' '.join(labels))
                    predict_labels, _ = self.predict([' '.join(words)])
                    predict_labels_list.append(predict_labels[0])
        word_predict_label_list = []
        word_category_list = []
        word_predict_category_list = []
        for (words, labels, predict_labels) in zip(words_list, labels_list,
                                                   predict_labels_list):
            word_list = words.split()
            label_list = labels.split()
            predict_label_list = predict_labels.split()
            word_predict_label = ' '.join([
                word + '/' + predict_label
                for (word, predict_label) in zip(word_list, predict_label_list)
            ])
            word_predict_label_list.append(word_predict_label)
            # merge label
            merge_word_list, merge_label_list = self.data_utils.merge_label(
                word_list, label_list)
            word_category = ' '.join([
                word + '/' + label
                for (word, label) in zip(merge_word_list, merge_label_list)
                if label != self.default_label
            ])
            word_category_list.append(word_category)
            # merge predict label
            merge_predict_word_list, merge_predict_label_list = self.data_utils.merge_label(
                word_list, predict_label_list)
            word_predict_category = ' '.join([
                predict_word + '/' + predict_label
                for (predict_word, predict_label) in zip(
                    merge_predict_word_list, merge_predict_label_list)
                if predict_label != 'O'
            ])
            word_predict_category_list.append(word_predict_category)
        with open(predict_filename, encoding='utf-8',
                  mode='wt') as predict_file:
            for (sentence, word_predict_label, word_category, word_predict_category) in \
                    zip(sentence_list, word_predict_label_list, word_category_list, word_predict_category_list):
                predict_file.write('Passage: ' + sentence + '\n')
                predict_file.write('SinglePredict: ' + word_predict_label +
                                   '\n')
                predict_file.write('Merge: ' + word_category + '\n')
                predict_file.write('MergePredict: ' + word_predict_category +
                                   '\n\n')

    def freeze_graph(self):
        """
        Save graph into .pb file
        :return:
        """
        graph = tf.graph_util.convert_variables_to_constants(
            self.sess, self.sess.graph_def,
            ['init_all_tables', 'predict_labels', 'predict_scores'])
        tf.train.write_graph(graph,
                             self.freeze_graph_path,
                             'frozen_graph.pb',
                             as_text=False)
        print('Successfully freeze model to %s' % self.freeze_graph_path)

    def saved_model_pb(self):
        """
        Save the model as a saved_model.pb plus variables files, for loading with TensorFlow Serving
        :return:
        """
        saved_model_path = os.path.join(self.saved_model_path, '1')
        if os.path.exists(saved_model_path):
            shutil.rmtree(saved_model_path)
        builder = tf.saved_model.builder.SavedModelBuilder(saved_model_path)
        input_tensor_info = tf.saved_model.utils.build_tensor_info(
            self.input_sentences)
        output_labels_tensor_info = tf.saved_model.utils.build_tensor_info(
            self.format_predict_labels)
        output_scores_tensor_info = tf.saved_model.utils.build_tensor_info(
            self.format_predict_scores)
        prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'input_sentences': input_tensor_info},
            outputs={
                'predict_labels': output_labels_tensor_info,
                'predict_scores': output_scores_tensor_info
            },
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        legacy_init_op = tf.group(tf.tables_initializer(),
                                  name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            self.sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_segment': prediction_signature},
            legacy_init_op=legacy_init_op)
        builder.save()
        print('Successfully exported model to %s' % saved_model_path)
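A short usage sketch of the Predict class above, assuming the required FLAGS values (vocab_path, checkpoint_path, num_steps, ...) are configured elsewhere and that inputs are space-separated word strings, as predict() expects:

# Assumes FLAGS are already populated before constructing Predict
predictor = Predict()

# predict() takes a list of space-separated word strings and returns parallel
# lists of space-separated labels and scores
labels, scores = predictor.predict(['word1 word2 word3'])
print(labels[0])
print(scores[0])

# Optional exports for deployment
# predictor.freeze_graph()
# predictor.saved_model_pb()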
Example No. 27
    def __init__(self,
                 writer: SummaryWriter,
                 model,
                 data_utils: DataUtils,
                 device,
                 run_folder_path,
                 do_validation,
                 lr,
                 optimizer,
                 mse,
                 ssim,
                 save_checkpoints=False,
                 do_persistence=False,
                 nr_of_input_steps=3,
                 **model_params):

        self.data_utils = data_utils
        self.estimate_total_nr_train_data_points = self.data_utils.get_estimate_total_nr_train_data_points()
        self.estimate_total_nr_test_data_points = self.data_utils.get_estimate_total_nr_test_data_points()
        self.validation_train_ratio = data_utils.get_validation_train_ratio()
        self.net = self.net.to(device)

        logging.info(
            'Model params - {}, number of parameters in net: {}, '.format([
                '{}: {}'.format(x, y)
                for x, y in model_params.items() if 'state_dict' not in x
            ], self.count_parameters()))

        logging.info(
            'Run params - device: {}, run folder: {}, validation: {},'
            ' learning rate: {}, save checkpoints: {}, do persistence: {}'.
            format(
                device,
                run_folder_path,
                do_validation,
                lr,
                save_checkpoints,
                do_persistence,
            ))

        logging.info(self.net)
        self.nr_of_input_steps = nr_of_input_steps
        self.do_persistence = do_persistence
        self.save_checkpoints = save_checkpoints
        self.lr = lr
        self.do_validation = do_validation
        self.model_params = model_params
        self.run_folder_path = run_folder_path
        self.device = device
        self.model = model
        self.writer = writer
        self.optimizer = optimizer(self.net.parameters(), lr=self.lr)
        self.ssim = ssim()
        self.mse = mse()
        self.train_mse_loss_array = []
        self.train_ssim_loss_array = []
        self.test_mse_loss_array = []
        self.test_ssim_loss_array = []
        self.running_train_mse_loss = 0.0
        self.running_test_mse_loss = 0.0
        self.hidden_state = None

        # writer.add_graph(net, data_utils.get_next_train_data_point().to(device))
        # writer.close()

        if self.do_validation:
            self.running_validation_loss = 0.0
            self.validation_loss_array = []
            self.when_validate = data_utils.get_validation_train_ratio()
Example No. 28
class TensorflowUtils(object):
    def __init__(self):
        self.batch_size = FLAGS.batch_size
        self.num_steps = FLAGS.num_steps
        self.min_after_dequeue = FLAGS.min_after_dequeue
        self.num_threads = FLAGS.num_threads
        self.embedding_size = FLAGS.embedding_size

        self.data_utils = DataUtils()
        self.default_word_padding_id = self.data_utils._START_VOCAB_ID[0]
        self.default_label_padding_id = self.data_utils.load_default_label_id()

    def create_record(self, words_list, labels_list, tfrecords_filename):
        """
        Store data into a TFRecords file
        :param words_list:
        :param labels_list:
        :param tfrecords_filename:
        :return:
        """
        print('Create record to ' + tfrecords_filename)
        writer = tf.python_io.TFRecordWriter(tfrecords_filename)
        assert len(words_list) == len(labels_list)
        for (word_ids, label_ids) in zip(words_list, labels_list):
            word_list = [int(word) for word in word_ids.strip().split()]
            label_list = [int(label) for label in label_ids.strip().split()]
            assert len(word_list) == len(label_list)
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'words':
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=word_list)),
                    'labels':
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=label_list)),
                }))
            writer.write(example.SerializeToString())
        writer.close()

    def read_and_decode(self, tfrecords_filename):
        """
        Read shuffled batch data from a TFRecords file
        :param tfrecords_filename:
        :return:
        """
        print('Read record from ' + tfrecords_filename)
        filename_queue = tf.train.string_input_producer([tfrecords_filename],
                                                        num_epochs=None)
        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue)
        feature_configs = {
            # 'words': tf.FixedLenFeature(shape=[num_steps], dtype=tf.int64, default_value=0),
            'words': tf.VarLenFeature(dtype=tf.int64),
            'labels': tf.VarLenFeature(dtype=tf.int64),
        }
        features = tf.parse_single_example(serialized_example,
                                           features=feature_configs)
        words = features['words']
        words_len = words.dense_shape[0]
        words_len = tf.minimum(words_len, tf.constant(self.num_steps,
                                                      tf.int64))
        words = tf.sparse_to_dense(
            sparse_indices=words.indices[:self.num_steps],
            output_shape=[self.num_steps],
            sparse_values=words.values[:self.num_steps],
            default_value=self.default_word_padding_id)
        labels = features['labels']
        labels = tf.sparse_to_dense(
            sparse_indices=labels.indices[:self.num_steps],
            output_shape=[self.num_steps],
            sparse_values=labels.values[:self.num_steps],
            default_value=self.default_label_padding_id)
        capacity = self.min_after_dequeue + 3 * self.batch_size
        words_batch, labels_batch, words_len_batch = tf.train.shuffle_batch(
            [words, labels, words_len],
            batch_size=self.batch_size,
            capacity=capacity,
            min_after_dequeue=self.min_after_dequeue,
            num_threads=self.num_threads)
        return words_batch, labels_batch, words_len_batch

    def print_all(self, tfrecords_filename):
        """
        Print all data from tfrecords file
        :param tfrecords_filename:
        :return:
        """
        number = 1
        for serialized_example in tf.python_io.tf_record_iterator(
                tfrecords_filename):
            example = tf.train.Example()
            example.ParseFromString(serialized_example)
            words = example.features.feature['words'].int64_list.value
            labels = example.features.feature['labels'].int64_list.value
            word_list = [word for word in words]
            label_list = [label for label in labels]
            print('Number:{}, labels: {}, features: {}'.format(
                number, label_list, word_list))
            number += 1

    def print_shuffle(self, tfrecords_filename):
        """
        Print shuffled data from the TFRecords file by calling read_and_decode
        :param tfrecords_filename:
        :return:
        """
        words_batch, labels_batch, words_len_batch = self.read_and_decode(
            tfrecords_filename)
        with tf.Session() as sess:
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                while not coord.should_stop():
                    batch_words_r, batch_labels_r, batch_words_len_r = sess.run(
                        [words_batch, labels_batch, words_len_batch])
                    print('batch_words_r : ', batch_words_r.shape)
                    print(batch_words_r)
                    print('batch_labels_r : ', batch_labels_r.shape)
                    print(batch_labels_r)
                    print('batch_words_len_r : ', batch_words_len_r.shape)
                    print(batch_words_len_r)
            except tf.errors.OutOfRangeError:
                print('Done reading')
            finally:
                coord.request_stop()
            coord.join(threads)

    def load_embedding(self, embedding_filename, vocab_filename):
        """
        Load word embeddings pretrained with Word2Vec
        :param embedding_filename:
        :param vocab_filename:
        :return:
        """
        embedding_dict = dict()
        with open(embedding_filename, encoding='utf-8',
                  mode='rt') as data_file:
            for line in data_file:
                words = line.strip().split()
                if len(words) != self.embedding_size + 1:
                    raise Exception('Invalid embedding exist : %s' %
                                    (line.strip()))
                word = words[0]
                embedding = [float(num) for num in words[1:]]
                embedding_dict[word] = embedding

        words_vocab = self.data_utils.initialize_single_vocabulary(
            vocab_filename)

        embedding = [[0.0 for _ in range(self.embedding_size)]
                     for _ in range(len(words_vocab))]
        for word, word_ids in words_vocab.items():
            if word in embedding_dict:
                embedding[word_ids] = embedding_dict[word]
        embedding_tensor = tf.constant(embedding,
                                       dtype=tf.float32,
                                       name='embedding')
        return embedding_tensor

    def sparse_concat(self, sparse_tensor_input, base_tensor, excess_tensor,
                      default_value):
        """
        Extend sparse_tensor_input using the indices of base_tensor and excess_tensor
        :param sparse_tensor_input:
        :param base_tensor:
        :param excess_tensor:
        :param default_value:
        :return:
        """
        # extract real blstm predict in dense and save to sparse
        base_sparse_tensor = tf.SparseTensor(
            indices=base_tensor.indices,
            values=tf.gather_nd(sparse_tensor_input, base_tensor.indices),
            dense_shape=base_tensor.dense_shape)

        # create excess SparseTensor with default_value
        excess_sparse_tensor = tf.SparseTensor(
            indices=excess_tensor.indices,
            values=tf.fill(tf.shape(excess_tensor.values), default_value),
            dense_shape=excess_tensor.dense_shape)

        # concat SparseTensor
        concat_sparse_tensor = tf.SparseTensor(
            indices=tf.concat(axis=0,
                              values=[
                                  base_sparse_tensor.indices,
                                  excess_sparse_tensor.indices
                              ]),
            values=tf.concat(axis=0,
                             values=[
                                 base_sparse_tensor.values,
                                 excess_sparse_tensor.values
                             ]),
            dense_shape=excess_sparse_tensor.dense_shape)
        concat_sparse_tensor = tf.sparse_reorder(concat_sparse_tensor)
        return concat_sparse_tensor

    def sparse_string_join(self, sparse_tensor_input, name):
        """
        Join SparseTensor to 1-D String dense Tensor
        :param sparse_tensor_input:
        :param name:
        :return:
        """
        dense_tensor_input = tf.sparse_to_dense(
            sparse_indices=sparse_tensor_input.indices,
            output_shape=sparse_tensor_input.dense_shape,
            sparse_values=sparse_tensor_input.values,
            default_value='')
        dense_tensor_input_join = tf.reduce_join(dense_tensor_input,
                                                 axis=1,
                                                 separator=' ')
        format_predict_labels = tf.string_strip(dense_tensor_input_join,
                                                name=name)
        return format_predict_labels

    def score_normalize(self, scores):
        """
        Normalize crf score
        :param scores: shape [-1, 1]
        :return:
        """
        lambda_factor = tf.constant(0.05, dtype=tf.float32)
        normalized_scores = tf.reciprocal(
            tf.add(tf.constant(1.0, dtype=tf.float32),
                   tf.exp(tf.negative(tf.multiply(lambda_factor, scores)))))
        return normalized_scores
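For reference, score_normalize above is a logistic squashing of the raw CRF sequence score with lambda = 0.05, i.e. normalized = 1 / (1 + exp(-0.05 * score)). A quick plain-Python sanity check of the same formula:

import math

def score_normalize(score, lambda_factor=0.05):
    # Same logistic mapping as TensorflowUtils.score_normalize, applied to a plain float
    return 1.0 / (1.0 + math.exp(-lambda_factor * score))

print(score_normalize(0.0))    # 0.5
print(score_normalize(20.0))   # ~0.731
print(score_normalize(100.0))  # ~0.993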