def vectorization_core(vectorizer,
                       init_term_vocabulary=True,
                       merge_doc_vocabularies=False):
    """
    Main function of collection vectorization.

    vectorizer : callable
        Message vectorization function.
    init_term_vocabulary : bool
        When True, build the term vocabulary from the train and test tables;
        otherwise start from an empty TermVocabulary.
    merge_doc_vocabularies : bool
        When True, reuse the train document vocabulary for the test problem;
        otherwise the test problem gets a fresh DocVocabulary.

    returns : None
    """
    init_logger()

    if len(sys.argv) < 8:
        logging.error('Expected 7 arguments: task_type, database, '
                      'train_table, test_table, train_output, test_output, '
                      'pconf_output')
        exit(1)

    config = {
        'task_type': sys.argv[1],
        'database': sys.argv[2],  # used to save the output results
        'train_table': sys.argv[3],
        'test_table': sys.argv[4],
        'train_output': sys.argv[5],
        'test_output': sys.argv[6],
        'pconf_output': sys.argv[7]
    }

    with io.open(configs.TWITTER_MESSAGE_PARSER_CONFIG, 'r',
                 encoding='utf-8') as f:
        message_settings = json.load(f)

    with io.open(configs.FEATURES_CONFIG, 'r', encoding='utf-8') as f:
        features_settings = json.load(f)

    # Create vocabulary of terms
    if init_term_vocabulary is True:
        term_vocabulary = core.indexer.create_term_vocabulary(
            [config['train_table'], config['test_table']],
            message_settings)
    else:
        term_vocabulary = TermVocabulary()

    features = Features(
        TwitterMessageParser(message_settings, config['task_type']),
        features_settings)

    doc_vocabulary = DocVocabulary()

    # Train problem
    train_problem = create_problem(config['task_type'], 'train',
                                   config['train_table'], vectorizer,
                                   features, term_vocabulary, doc_vocabulary,
                                   message_settings)

    if not merge_doc_vocabularies:
        doc_vocabulary = DocVocabulary()

    # Test problem
    test_problem = create_problem(config['task_type'], 'test',
                                  config['test_table'], vectorizer, features,
                                  term_vocabulary, doc_vocabulary,
                                  message_settings)

    result_table = config['test_table'] + '.result.csv'
    logging.info(
        'Create a file for classifier results: {}'.format(result_table))
    result_df = pd.read_csv(config['test_table'], sep=',')
    result_df.to_csv(result_table, sep=',')

    # Save problems and the prediction config
    save_problem(train_problem, config['train_output'])
    save_problem(test_problem, config['test_output'])
    save_predict_config(columns=get_score_columns(config['task_type']),
                        prediction_table=result_table,
                        out_filepath=config['pconf_output'])
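
# --- Usage sketch (illustrative, not part of the original module) ---
# vectorization_core() reads its seven settings from sys.argv, so a driver
# script only has to supply the vectorizer callable. The script name in the
# invocation below and the vectorizer's signature and body are assumptions
# made for illustration; the real contract is defined by create_problem().
def _example_term_weight_vectorizer(terms, term_vocabulary, doc_vocabulary):
    # Hypothetical bag-of-words vectorizer: weight every term equally.
    # get_term_index() is an assumed TermVocabulary accessor.
    return {term_vocabulary.get_term_index(t): 1.0 for t in terms}


if __name__ == '__main__':
    # Assumed invocation:
    #   python vectorize.py <task_type> <database> <train_table> <test_table> \
    #       <train_output> <test_output> <pconf_output>
    vectorization_core(_example_term_weight_vectorizer)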
def DeepFM(feature_metas,
           linear_slots,
           fm_slots,
           dnn_slots,
           embedding_initializer='glorot_uniform',
           embedding_regularizer=tf.keras.regularizers.l2(1e-5),
           fm_fixed_embedding_dim=None,
           linear_use_bias=True,
           linear_kernel_initializer=tf.keras.initializers.RandomNormal(
               stddev=1e-4, seed=1024),
           linear_kernel_regularizer=tf.keras.regularizers.l2(1e-5),
           dnn_hidden_units=(128, 64, 1),
           dnn_activations=('relu', 'relu', None),
           dnn_use_bias=True,
           dnn_use_bn=False,
           dnn_dropout=0,
           dnn_kernel_initializers='glorot_uniform',
           dnn_bias_initializers='zeros',
           dnn_kernel_regularizers=tf.keras.regularizers.l2(1e-5),
           dnn_bias_regularizers=None,
           name='DeepFM'):

    assert isinstance(feature_metas, FeatureMetas)

    with tf.name_scope(name):

        features = Features(metas=feature_metas)

        # Linear Part
        with tf.name_scope('Linear'):
            linear_output = features.get_linear_logit(
                use_bias=linear_use_bias,
                kernel_initializer=linear_kernel_initializer,
                kernel_regularizer=linear_kernel_regularizer,
                embedding_group='dot_embedding',
                slots_filter=linear_slots)

        # FM Part
        with tf.name_scope('FM'):
            fm_embedded_dict = features.get_embedded_dict(
                group_name='embedding',
                fixed_embedding_dim=fm_fixed_embedding_dim,
                embedding_initializer=embedding_initializer,
                embedding_regularizer=embedding_regularizer,
                slots_filter=fm_slots)
            fm_dim_groups = group_embedded_by_dim(fm_embedded_dict)
            fms = [FM()(group)
                   for group in fm_dim_groups.values() if len(group) > 1]
            fm_output = tf.add_n(fms)

        # DNN Part
        with tf.name_scope('DNN'):
            dnn_inputs = features.gen_concated_feature(
                embedding_group='embedding',
                fixed_embedding_dim=fm_fixed_embedding_dim,
                embedding_initializer=embedding_initializer,
                embedding_regularizer=embedding_regularizer,
                slots_filter=dnn_slots)
            dnn_output = DNN(
                units=dnn_hidden_units,
                use_bias=dnn_use_bias,
                activations=dnn_activations,
                use_bn=dnn_use_bn,
                dropout=dnn_dropout,
                kernel_initializers=dnn_kernel_initializers,
                bias_initializers=dnn_bias_initializers,
                kernel_regularizers=dnn_kernel_regularizers,
                bias_regularizers=dnn_bias_regularizers)(dnn_inputs)

        # Output
        output = tf.add_n([linear_output, fm_output, dnn_output])
        output = tf.keras.activations.sigmoid(output)

        model = tf.keras.Model(inputs=features.get_inputs_list(),
                               outputs=output)

        return model
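
# --- Usage sketch (illustrative, not part of the original module) ---
# How FeatureMetas is populated is repo-specific; the add_sparse_feature /
# add_dense_feature calls below are assumed registration methods and the slot
# names are made up. The compile/fit part is standard tf.keras and applies to
# every model in this file.
def _deepfm_usage_demo(train_inputs, train_labels):
    metas = FeatureMetas()
    metas.add_sparse_feature(name='user_id', one_hot_dim=10000)  # assumed API
    metas.add_sparse_feature(name='item_id', one_hot_dim=50000)  # assumed API
    metas.add_dense_feature(name='price', dim=1)                 # assumed API

    slots = ['user_id', 'item_id', 'price']
    model = DeepFM(metas, linear_slots=slots, fm_slots=slots, dnn_slots=slots)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['AUC'])

    # train_inputs: dict mapping each slot name to a numpy array, matching
    # the Input layers returned by features.get_inputs_list().
    model.fit(train_inputs, train_labels, batch_size=256, epochs=3)
    return model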
def FGCNN(feature_metas,
          fg_filters=(14, 16, 18, 20),
          fg_widths=(7, 7, 7, 7),
          fg_pool_widths=(2, 2, 2, 2),
          fg_new_feat_filters=(3, 3, 3, 3),
          embedding_initializer='glorot_uniform',
          embedding_regularizer=tf.keras.regularizers.l2(1e-5),
          fixed_embedding_dim=8,
          dnn_hidden_units=(128, 64, 1),
          dnn_activations=('relu', 'relu', None),
          dnn_use_bias=True,
          dnn_use_bn=False,
          dnn_dropout=0,
          dnn_kernel_initializers='glorot_uniform',
          dnn_bias_initializers='zeros',
          dnn_kernel_regularizers=tf.keras.regularizers.l2(1e-5),
          dnn_bias_regularizers=None,
          name='FGCNN'):

    assert isinstance(feature_metas, FeatureMetas)

    with tf.name_scope(name):

        features = Features(metas=feature_metas)

        # Two independent embedding groups: 'raw' feeds the deep part
        # directly, 'fgcnn' feeds the feature-generation CNN.
        raw_feats = features.get_stacked_feature(
            embedding_group='raw',
            fixed_embedding_dim=fixed_embedding_dim,
            embedding_initializer=embedding_initializer,
            embedding_regularizer=embedding_regularizer,
            slots_filter=None)
        fg_inputs = features.get_stacked_feature(
            embedding_group='fgcnn',
            fixed_embedding_dim=fixed_embedding_dim,
            embedding_initializer=embedding_initializer,
            embedding_regularizer=embedding_regularizer,
            slots_filter=None)

        # Add a channel axis so the stacked embeddings form a 2-D "image".
        fg_inputs = tf.expand_dims(fg_inputs, axis=-1)

        # Feature generation: each FGCNN layer convolves and pools along the
        # field axis, then recombines the maps into new feature embeddings.
        new_feats_list = list()
        for filters, width, pool, new_filters in zip(
                fg_filters, fg_widths, fg_pool_widths, fg_new_feat_filters):
            fg_inputs, new_feats = FGCNNlayer(
                filters=filters,
                kernel_width=width,
                pool_width=pool,
                new_feat_filters=new_filters)(fg_inputs)
            new_feats_list.append(new_feats)

        # Combine generated and raw features, then append their pairwise
        # inner products before the DNN.
        inputs = tf.concat(new_feats_list + [raw_feats], axis=1)
        inputs = split_tensor(inputs, axis=1)
        inputs_fm = InnerProduct(require_logit=False)(inputs)
        dnn_inputs = tf.concat(inputs + [inputs_fm], axis=1)

        output = DNN(
            units=dnn_hidden_units,
            use_bias=dnn_use_bias,
            activations=dnn_activations,
            use_bn=dnn_use_bn,
            dropout=dnn_dropout,
            kernel_initializers=dnn_kernel_initializers,
            bias_initializers=dnn_bias_initializers,
            kernel_regularizers=dnn_kernel_regularizers,
            bias_regularizers=dnn_bias_regularizers)(dnn_inputs)
        output = tf.keras.activations.sigmoid(output)

        model = tf.keras.Model(inputs=features.get_inputs_list(),
                               outputs=output)

        return model
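
# --- Shape sketch for one feature-generation step (illustrative) ---
# FGCNNlayer's real implementation lives elsewhere in this repo. This
# standalone snippet only reproduces the shape bookkeeping of one
# conv -> pool -> recombine step from the FGCNN paper, so the loop above is
# easier to follow. All sizes are example values.
def _fgcnn_layer_shape_demo():
    import tensorflow as tf

    fields, emb_dim = 20, 8
    x = tf.random.normal([2, fields, emb_dim, 1])  # (batch, fields, dim, ch)

    conv = tf.keras.layers.Conv2D(filters=14, kernel_size=(7, 1),
                                  padding='same', activation='tanh')(x)
    pooled = tf.keras.layers.MaxPool2D(pool_size=(2, 1))(conv)  # halve fields

    # Recombination: map the pooled maps to 3 new features per remaining
    # field position, each with the original embedding dimension.
    flat = tf.keras.layers.Flatten()(pooled)
    new = tf.keras.layers.Dense((fields // 2) * 3 * emb_dim,
                                activation='tanh')(flat)
    new_feats = tf.reshape(new, [-1, (fields // 2) * 3, emb_dim])

    print(pooled.shape)     # (2, 10, 8, 14) -> input to the next FGCNN layer
    print(new_feats.shape)  # (2, 30, 8)     -> concatenated with raw_feats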
def AutoInt(feature_metas,
            seed=2333,
            interaction_layer_num=3,
            attention_embedding_size=8,
            attention_heads=2,
            interaction_use_res=True,
            embedding_initializer='glorot_uniform',
            embedding_regularizer=tf.keras.regularizers.l2(1e-5),
            fixed_embedding_dim=None,
            dnn_hidden_units=(128, 64, 1),
            dnn_activations=('relu', 'relu', None),
            dnn_use_bias=True,
            dnn_use_bn=False,
            dnn_dropout=0,
            dnn_kernel_initializers='glorot_uniform',
            dnn_bias_initializers='zeros',
            dnn_kernel_regularizers=tf.keras.regularizers.l2(1e-5),
            dnn_bias_regularizers=None,
            name='AutoInt'):

    assert isinstance(feature_metas, FeatureMetas)

    with tf.name_scope(name):

        features = Features(metas=feature_metas)

        embedded_dict = features.get_embedded_dict(
            slots_filter=None,
            fixed_embedding_dim=fixed_embedding_dim,
            embedding_initializer=embedding_initializer,
            embedding_regularizer=embedding_regularizer,
            group_name='embedding')

        # Self-attention only mixes embeddings of equal dimension, so stack
        # the embeddings into one (batch, fields, dim) tensor per dimension.
        grouped_embedded = group_embedded_by_dim(embedded_dict)
        grouped_inputs = [tf.stack(group, axis=1)
                          for group in grouped_embedded.values()]

        # Stacked interacting layers: each applies multi-head self-attention
        # across the field axis.
        for _ in range(interaction_layer_num):
            for i in range(len(grouped_inputs)):
                grouped_inputs[i] = AutoIntInteraction(
                    att_embedding_size=attention_embedding_size,
                    heads=attention_heads,
                    use_res=interaction_use_res,
                    seed=seed)(grouped_inputs[i])

        dnn_inputs = tf.keras.layers.Flatten()(
            tf.concat(grouped_inputs, axis=2))

        output = DNN(units=dnn_hidden_units,
                     use_bias=dnn_use_bias,
                     activations=dnn_activations,
                     use_bn=dnn_use_bn,
                     dropout=dnn_dropout,
                     kernel_initializers=dnn_kernel_initializers,
                     bias_initializers=dnn_bias_initializers,
                     kernel_regularizers=dnn_kernel_regularizers,
                     bias_regularizers=dnn_bias_regularizers)(dnn_inputs)
        output = tf.keras.activations.sigmoid(output)

        model = tf.keras.Model(inputs=features.get_inputs_list(),
                               outputs=output)

        return model
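
# --- Minimal sketch of one interaction step (illustrative) ---
# AutoIntInteraction is implemented elsewhere in this repo. This snippet
# shows the idea behind one single-head step from the AutoInt paper: project
# the field embeddings into query/key/value spaces, let every field attend
# to every other field, and add a projected residual (as with use_res=True).
def _autoint_attention_demo():
    import tensorflow as tf

    batch, fields, dim, att_dim = 2, 10, 16, 8
    x = tf.random.normal([batch, fields, dim])

    project = lambda: tf.keras.layers.Dense(att_dim, use_bias=False)
    q, k, v = project()(x), project()(x), project()(x)  # (batch, fields, att_dim)

    # Attention weights between every pair of fields, then a weighted sum.
    scores = tf.nn.softmax(tf.matmul(q, k, transpose_b=True), axis=-1)
    interacted = tf.matmul(scores, v)        # (batch, fields, att_dim)

    # Residual connection on the original embeddings, projected to att_dim.
    out = tf.nn.relu(interacted + project()(x))
    print(out.shape)  # (2, 10, 8); multiple heads would be concatenated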
def PNN(feature_metas,
        use_inner_product=True,
        use_outer_product=False,
        outer_kernel_initializer='glorot_uniform',
        outer_kernel_regularizer=tf.keras.regularizers.l2(1e-5),
        embedding_initializer='glorot_uniform',
        embedding_regularizer=tf.keras.regularizers.l2(1e-5),
        fixed_embedding_dim=None,
        dnn_hidden_units=(128, 64, 1),
        dnn_activations=('relu', 'relu', None),
        dnn_use_bias=True,
        dnn_use_bn=False,
        dnn_dropout=0,
        dnn_kernel_initializers='glorot_uniform',
        dnn_bias_initializers='zeros',
        dnn_kernel_regularizers=tf.keras.regularizers.l2(1e-5),
        dnn_bias_regularizers=None,
        name='PNN'):

    assert isinstance(feature_metas, FeatureMetas)

    with tf.name_scope(name):

        features = Features(metas=feature_metas)

        embedded_dict = features.get_embedded_dict(
            group_name='embedding',
            fixed_embedding_dim=fixed_embedding_dim,
            embedding_initializer=embedding_initializer,
            embedding_regularizer=embedding_regularizer,
            slots_filter=None)
        raw_embedded_inputs = features.gen_concated_feature(
            embedding_group='embedding',
            fixed_embedding_dim=fixed_embedding_dim,
            slots_filter=None)

        # Product layer: the DNN consumes the raw embeddings plus, optionally,
        # pairwise inner and/or outer products of the field embeddings.
        inputs = [raw_embedded_inputs]
        if use_inner_product:
            inner_product_inputs = InnerProduct()(list(embedded_dict.values()))
            inputs.append(inner_product_inputs)
        if use_outer_product:
            outer_product_inputs = OuterProduct(
                outer_kernel_regularizer=outer_kernel_regularizer,
                outer_kernel_initializer=outer_kernel_initializer)(
                    list(embedded_dict.values()))
            inputs.append(outer_product_inputs)
        inputs = tf.concat(inputs, axis=1)

        output = DNN(units=dnn_hidden_units,
                     use_bias=dnn_use_bias,
                     activations=dnn_activations,
                     use_bn=dnn_use_bn,
                     dropout=dnn_dropout,
                     kernel_initializers=dnn_kernel_initializers,
                     bias_initializers=dnn_bias_initializers,
                     kernel_regularizers=dnn_kernel_regularizers,
                     bias_regularizers=dnn_bias_regularizers)(inputs)
        output = tf.keras.activations.sigmoid(output)

        model = tf.keras.Model(inputs=features.get_inputs_list(),
                               outputs=output)

        return model
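
# --- Minimal sketch of the product signals (illustrative) ---
# InnerProduct/OuterProduct in this repo are Keras layers (OuterProduct with
# a trainable kernel). This snippet computes the raw pairwise signals with
# plain tf ops to show what the product layer contributes on top of the
# concatenated embeddings; sizes are example values.
def _pnn_product_demo():
    import itertools
    import tensorflow as tf

    batch, dim = 2, 8
    embedded = [tf.random.normal([batch, dim]) for _ in range(4)]  # 4 slots

    # Inner products: one scalar per unordered field pair -> (batch, 6).
    inner = tf.stack([tf.reduce_sum(a * b, axis=1)
                      for a, b in itertools.combinations(embedded, 2)],
                     axis=1)

    # Outer products: one dim x dim matrix per pair; OPNN compresses each
    # with a learned kernel, here they are simply flattened -> (batch, 384).
    outer = tf.concat([tf.reshape(tf.einsum('bi,bj->bij', a, b), [batch, -1])
                       for a, b in itertools.combinations(embedded, 2)],
                      axis=1)
    print(inner.shape, outer.shape)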