Example #1
def vectorization_core(vectorizer,
                       init_term_vocabulary=True,
                       merge_doc_vocabularies=False):
    """
    Main function of collection vectorization

    vectorizer : message vectorization function
    returns : None
    """
    init_logger()

    if len(sys.argv) < 8:
        exit(1)  # not enough command-line arguments

    config = {
        'task_type': sys.argv[1],
        'database': sys.argv[2],  # save the output results
        'train_table': sys.argv[3],
        'test_table': sys.argv[4],
        'train_output': sys.argv[5],
        'test_output': sys.argv[6],
        'pconf_output': sys.argv[7]
    }

    # Pass the encoding to io.open; json.load's 'encoding' keyword is ignored
    # in Python 3 and was removed in 3.9.
    with io.open(configs.TWITTER_MESSAGE_PARSER_CONFIG, "r",
                 encoding='utf-8') as f:
        message_settings = json.load(f)

    with io.open(configs.FEATURES_CONFIG, 'r', encoding='utf-8') as f:
        features_settings = json.load(f)

    # Create vocabulary of terms
    if init_term_vocabulary:
        term_vocabulary = core.indexer.create_term_vocabulary(
            [config['train_table'], config['test_table']], message_settings)
    else:
        term_vocabulary = TermVocabulary()

    features = Features(
        TwitterMessageParser(message_settings, config['task_type']),
        features_settings)

    doc_vocabulary = DocVocabulary()
    # Train problem
    train_problem = create_problem(config['task_type'], 'train',
                                   config['train_table'], vectorizer, features,
                                   term_vocabulary, doc_vocabulary,
                                   message_settings)

    if not merge_doc_vocabularies:
        doc_vocabulary = DocVocabulary()
    # Test problem
    test_problem = create_problem(config['task_type'], 'test',
                                  config['test_table'], vectorizer, features,
                                  term_vocabulary, doc_vocabulary,
                                  message_settings)

    result_table = config['test_table'] + '.result.csv'
    logging.info(
        'Create a file for classifier results: {}'.format(result_table))
    result_df = pd.read_csv(config['test_table'], sep=',')
    result_df.to_csv(result_table, sep=',', index=False)  # drop pandas index

    # Save
    save_problem(train_problem, config['train_output'])
    save_problem(test_problem, config['test_output'])
    save_predict_config(columns=get_score_columns(config['task_type']),
                        prediction_table=result_table,
                        out_filepath=config['pconf_output'])
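
The seven positional arguments map one-to-one onto the config keys above. A
sketch of the expected invocation, with a placeholder script name:

    python vectorize.py <task_type> <database> <train_table> <test_table> \
                        <train_output> <test_output> <pconf_output>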
Example #2
def DeepFM(feature_metas,
           linear_slots,
           fm_slots,
           dnn_slots,
           embedding_initializer='glorot_uniform',
           embedding_regularizer=tf.keras.regularizers.l2(1e-5),
           fm_fixed_embedding_dim=None,
           linear_use_bias=True,
           linear_kernel_initializer=tf.keras.initializers.RandomNormal(
               stddev=1e-4, seed=1024),
           linear_kernel_regularizer=tf.keras.regularizers.l2(1e-5),
           dnn_hidden_units=(128, 64, 1),
           dnn_activations=('relu', 'relu', None),
           dnn_use_bias=True,
           dnn_use_bn=False,
           dnn_dropout=0,
           dnn_kernel_initializers='glorot_uniform',
           dnn_bias_initializers='zeros',
           dnn_kernel_regularizers=tf.keras.regularizers.l2(1e-5),
           dnn_bias_regularizers=None,
           name='DeepFM'):

    assert isinstance(feature_metas, FeatureMetas)

    with tf.name_scope(name):

        features = Features(metas=feature_metas)

        # Linear Part
        with tf.name_scope('Linear'):
            linear_output = features.get_linear_logit(
                use_bias=linear_use_bias,
                kernel_initializer=linear_kernel_initializer,
                kernel_regularizer=linear_kernel_regularizer,
                embedding_group='dot_embedding',
                slots_filter=linear_slots)

        # FM Part
        with tf.name_scope('FM'):
            fm_embedded_dict = features.get_embedded_dict(
                group_name='embedding',
                fixed_embedding_dim=fm_fixed_embedding_dim,
                embedding_initializer=embedding_initializer,
                embedding_regularizer=embedding_regularizer,
                slots_filter=fm_slots)
            fm_dim_groups = group_embedded_by_dim(fm_embedded_dict)
            fms = [
                FM()(group) for group in fm_dim_groups.values()
                if len(group) > 1
            ]
            fm_output = tf.add_n(fms)

        # DNN Part
        with tf.name_scope('DNN'):
            dnn_inputs = features.gen_concated_feature(
                embedding_group='embedding',
                fixed_embedding_dim=fm_fixed_embedding_dim,
                embedding_initializer=embedding_initializer,
                embedding_regularizer=embedding_regularizer,
                slots_filter=dnn_slots)
            dnn_output = DNN(
                units=dnn_hidden_units,
                use_bias=dnn_use_bias,
                activations=dnn_activations,
                use_bn=dnn_use_bn,
                dropout=dnn_dropout,
                kernel_initializers=dnn_kernel_initializers,
                bias_initializers=dnn_bias_initializers,
                kernel_regularizers=dnn_kernel_regularizers,
                bias_regularizers=dnn_bias_regularizers)(dnn_inputs)

        # Output
        output = tf.add_n([linear_output, fm_output, dnn_output])
        output = tf.keras.activations.sigmoid(output)

        model = tf.keras.Model(inputs=features.get_inputs_list(),
                               outputs=output)

        return model
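
A minimal usage sketch. Here metas and the slot names are placeholders for
whatever FeatureMetas instance and feature fields a dataset defines; they are
not part of the snippet above:

    metas = ...  # a populated FeatureMetas for the dataset
    model = DeepFM(metas,
                   linear_slots=['user_id', 'item_id'],
                   fm_slots=['user_id', 'item_id'],
                   dnn_slots=['user_id', 'item_id'])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['AUC'])

The builder returns a plain tf.keras.Model, so the usual compile/fit/predict
workflow applies.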
Example #3
def FGCNN(
        feature_metas,
        fg_filters=(14, 16, 18, 20),
        fg_widths=(7, 7, 7, 7),
        fg_pool_widths=(2, 2, 2, 2),
        fg_new_feat_filters=(3, 3, 3, 3),
        embedding_initializer='glorot_uniform',
        embedding_regularizer=tf.keras.regularizers.l2(1e-5),
        fixed_embedding_dim=8,
        dnn_hidden_units=(128, 64, 1),
        dnn_activations=('relu', 'relu', None),
        dnn_use_bias=True,
        dnn_use_bn=False,
        dnn_dropout=0,
        dnn_kernel_initializers='glorot_uniform',
        dnn_bias_initializers='zeros',
        dnn_kernel_regularizers=tf.keras.regularizers.l2(1e-5),
        dnn_bias_regularizers=None,
        name='FGCNN'):

    assert isinstance(feature_metas, FeatureMetas)

    with tf.name_scope(name):

        features = Features(metas=feature_metas)

        raw_feats = features.get_stacked_feature(
            embedding_group='raw',
            fixed_embedding_dim=fixed_embedding_dim,
            embedding_initializer=embedding_initializer,
            embedding_regularizer=embedding_regularizer,
            slots_filter=None
        )

        fg_inputs = features.get_stacked_feature(
            embedding_group='fgcnn',
            fixed_embedding_dim=fixed_embedding_dim,
            embedding_initializer=embedding_initializer,
            embedding_regularizer=embedding_regularizer,
            slots_filter=None
        )
        fg_inputs = tf.expand_dims(fg_inputs, axis=-1)

        # Each FGCNN layer refines the feature map and emits new features,
        # which are collected for the final model input.
        new_feats_list = list()
        for filters, width, pool, new_filters in zip(
                fg_filters, fg_widths, fg_pool_widths, fg_new_feat_filters):
            fg_inputs, new_feats = FGCNNlayer(
                filters=filters,
                kernel_width=width,
                pool_width=pool,
                new_feat_filters=new_filters
            )(fg_inputs)
            new_feats_list.append(new_feats)

        # Combine the generated features with the raw embeddings, then feed
        # their pairwise inner products to the DNN alongside them.
        inputs = tf.concat(new_feats_list + [raw_feats], axis=1)
        inputs = split_tensor(inputs, axis=1)

        inputs_fm = InnerProduct(require_logit=False)(inputs)

        dnn_inputs = tf.concat(inputs + [inputs_fm], axis=1)
        output = DNN(
            units=dnn_hidden_units,
            use_bias=dnn_use_bias,
            activations=dnn_activations,
            use_bn=dnn_use_bn,
            dropout=dnn_dropout,
            kernel_initializers=dnn_kernel_initializers,
            bias_initializers=dnn_bias_initializers,
            kernel_regularizers=dnn_kernel_regularizers,
            bias_regularizers=dnn_bias_regularizers
        )(dnn_inputs)

        output = tf.keras.activations.sigmoid(output)

        model = tf.keras.Model(inputs=features.get_inputs_list(), outputs=output)

        return model
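
Note that fg_filters, fg_widths, fg_pool_widths and fg_new_feat_filters are
consumed by a single zip, so they must all have the same length: each position
configures one feature-generation layer, and zip silently drops any extras. A
sketch of a two-layer variant (values illustrative, metas as above):

    model = FGCNN(metas,
                  fg_filters=(14, 16),
                  fg_widths=(7, 7),
                  fg_pool_widths=(2, 2),
                  fg_new_feat_filters=(3, 3))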
Example #4
def AutoInt(feature_metas,
            seed=2333,
            interaction_layer_num=3,
            attention_embedding_size=8,
            attention_heads=2,
            interaction_use_res=True,
            embedding_initializer='glorot_uniform',
            embedding_regularizer=tf.keras.regularizers.l2(1e-5),
            fixed_embedding_dim=None,
            dnn_hidden_units=(128, 64, 1),
            dnn_activations=('relu', 'relu', None),
            dnn_use_bias=True,
            dnn_use_bn=False,
            dnn_dropout=0,
            dnn_kernel_initializers='glorot_uniform',
            dnn_bias_initializers='zeros',
            dnn_kernel_regularizers=tf.keras.regularizers.l2(1e-5),
            dnn_bias_regularizers=None,
            name='AutoInt'):

    assert isinstance(feature_metas, FeatureMetas)

    with tf.name_scope(name):

        features = Features(metas=feature_metas)

        embedded_dict = features.get_embedded_dict(
            slots_filter=None,
            fixed_embedding_dim=fixed_embedding_dim,
            embedding_initializer=embedding_initializer,
            embedding_regularizer=embedding_regularizer,
            group_name='embedding')
        # Group embeddings by dimension so each group can be stacked into a
        # single (batch, fields, dim) tensor.
        grouped_embedded = group_embedded_by_dim(embedded_dict)
        grouped_inputs = [
            tf.stack(group, axis=1) for group in grouped_embedded.values()
        ]
        # Run each group through the stacked self-attention interaction layers
        for _ in range(interaction_layer_num):
            for i in range(len(grouped_inputs)):
                grouped_inputs[i] = AutoIntInteraction(
                    att_embedding_size=attention_embedding_size,
                    heads=attention_heads,
                    use_res=interaction_use_res,
                    seed=seed)(grouped_inputs[i])

        dnn_inputs = tf.keras.layers.Flatten()(tf.concat(grouped_inputs,
                                                         axis=2))
        output = DNN(units=dnn_hidden_units,
                     use_bias=dnn_use_bias,
                     activations=dnn_activations,
                     use_bn=dnn_use_bn,
                     dropout=dnn_dropout,
                     kernel_initializers=dnn_kernel_initializers,
                     bias_initializers=dnn_bias_initializers,
                     kernel_regularizers=dnn_kernel_regularizers,
                     bias_regularizers=dnn_bias_regularizers)(dnn_inputs)

        output = tf.keras.activations.sigmoid(output)

        model = tf.keras.Model(inputs=features.get_inputs_list(),
                               outputs=output)

        return model
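
Depth and attention width are controlled entirely by the keyword arguments,
since the same AutoIntInteraction configuration is repeated
interaction_layer_num times. A sketch of a deeper, wider variant (values
illustrative, metas as above):

    model = AutoInt(metas,
                    interaction_layer_num=4,
                    attention_embedding_size=16,
                    attention_heads=4)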
Example #5
def PNN(feature_metas,
        use_inner_product=True,
        use_outer_product=False,
        outer_kernel_initializer='glorot_uniform',
        outer_kernel_regularizer=tf.keras.regularizers.l2(1e-5),
        embedding_initializer='glorot_uniform',
        embedding_regularizer=tf.keras.regularizers.l2(1e-5),
        fixed_embedding_dim=None,
        dnn_hidden_units=(128, 64, 1),
        dnn_activations=('relu', 'relu', None),
        dnn_use_bias=True,
        dnn_use_bn=False,
        dnn_dropout=0,
        dnn_kernel_initializers='glorot_uniform',
        dnn_bias_initializers='zeros',
        dnn_kernel_regularizers=tf.keras.regularizers.l2(1e-5),
        dnn_bias_regularizers=None,
        name='PNN'):

    assert isinstance(feature_metas, FeatureMetas)

    with tf.name_scope(name):

        features = Features(metas=feature_metas)

        embedded_dict = features.get_embedded_dict(
            group_name='embedding',
            fixed_embedding_dim=fixed_embedding_dim,
            embedding_initializer=embedding_initializer,
            embedding_regularizer=embedding_regularizer,
            slots_filter=None)
        raw_embedded_inputs = features.gen_concated_feature(
            embedding_group='embedding',
            fixed_embedding_dim=fixed_embedding_dim,
            slots_filter=None)
        inputs = [raw_embedded_inputs]

        if use_inner_product:
            inner_product_inputs = InnerProduct()(list(embedded_dict.values()))
            inputs.append(inner_product_inputs)

        if use_outer_product:
            outer_product_inputs = OuterProduct(
                outer_kernel_regularizer=outer_kernel_regularizer,
                outer_kernel_initializer=outer_kernel_initializer)(list(
                    embedded_dict.values()))
            inputs.append(outer_product_inputs)

        inputs = tf.concat(inputs, axis=1)
        output = DNN(units=dnn_hidden_units,
                     use_bias=dnn_use_bias,
                     activations=dnn_activations,
                     use_bn=dnn_use_bn,
                     dropout=dnn_dropout,
                     kernel_initializers=dnn_kernel_initializers,
                     bias_initializers=dnn_bias_initializers,
                     kernel_regularizers=dnn_kernel_regularizers,
                     bias_regularizers=dnn_bias_regularizers)(inputs)

        output = tf.keras.activations.sigmoid(output)

        model = tf.keras.Model(inputs=features.get_inputs_list(),
                               outputs=output)

        return model
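
With the defaults this builds the inner-product variant (IPNN). A sketch of
the outer-product variant, OPNN (metas as above):

    model = PNN(metas,
                use_inner_product=False,
                use_outer_product=True)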