Python get_dataの例

プログラミング言語: Python

名前空間/パッケージ名: run_text_processing

メソッド/関数: get_data

hotexamples.comのコード掲載数: 5

Python get_data - 5件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのrun_text_processing.get_dataの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: execute_bilstm_conditional_sentiment.py プロジェクト: malvika-hp/stance-detection-in-news

def run_lstm_conditional(config,
                         split=True,
                         outputpath='../../xp',
                         final=False):
    """Train ``LSTMCondModel`` on headline/body data plus lexicon inputs.

    Loads the data through ``get_data`` and
    ``get_lexicon_data_bilstm_sentiment``, optionally subsamples and
    truncates it, splits it with ``split_indices``, fits the model inside a
    fresh TensorFlow graph and writes the per-epoch results with
    ``convertOutputs``.

    Args:
        config: Configuration object. ``num_samples``, ``h_max_len`` and
            ``b_max_len`` are read (``None`` disables the respective limit)
            and afterwards overwritten with the shape of the training
            arrays. ``n_layers`` is only printed here.
        split: Must be true; the train/dev arrays only exist after a split.
        outputpath: Passed through unchanged to ``convertOutputs``.
        final: If true, train on train + dev and evaluate on the test
            split; otherwise train on train and evaluate on dev.

    Returns:
        The tuple ``(losses_ep, dev_predicted_classes_ep,
        dev_performances_ep)`` taken from the result of ``model.fit``.

    Raises:
        ValueError: If ``split`` is false.
    """
    if not split:
        # Without a split no train/dev arrays are ever built; previously this
        # surfaced much later as an UnboundLocalError. Fail early and clearly.
        raise ValueError('run_lstm_conditional requires split=True: '
                         'no train/dev sets are built otherwise')

    ## Get data
    config, data_dict = get_data(
        config,
        filename_embeddings='/../../glove/glove.twitter.27B.50d.txt',
        pickle_path='/../../glove/twitter50d_h_ids_b_ids_pickle.p',
        concat=False)

    # The config returned by the lexicon loader is not used in this function.
    _unused_config, lex_data_dict = get_lexicon_data_bilstm_sentiment(
        config,
        filename_embeddings='/../../glove/glove.twitter.27B.50d.txt',
        pickle_path='lexiconp.p',
        concat=False)

    ## pass data into local namespace:
    y = data_dict['y']
    h = data_dict['h_np']
    b = data_dict['b_np']
    h_len = data_dict['h_seqlen']
    b_len = data_dict['b_seqlen']

    # Do shortening of dataset ## affects number of samples and max_len.
    if config.num_samples is not None:
        # Seed and shuffle with the same generator so the subsample is
        # reproducible; shuffle a real list because a range object cannot be
        # shuffled in place on Python 3.
        np.random.seed(1)
        ind = list(range(np.shape(h)[0]))
        np.random.shuffle(ind)
        indices = ind[0:config.num_samples]
        h = h[indices, :]
        b = b[indices, :]
        h_len = h_len[indices]
        b_len = b_len[indices]
        y = y[indices]

    # Truncate headlines and bodies (slicing is a no-op when already shorter)
    if config.h_max_len is not None:
        h = h[:, 0:config.h_max_len]
        h_len = np.minimum(h_len, config.h_max_len)

    if config.b_max_len is not None:
        b = b[:, 0:config.b_max_len]
        b_len = np.minimum(b_len, config.b_max_len)

    ####################### LEX DATA #######################
    # NOTE(review): the lexicon arrays are not reordered by the subsampling
    # above, yet they are indexed with the same split indices below --
    # confirm the rows still line up when config.num_samples is set.
    lex_b = lex_data_dict['h_b_np']
    lex_b_len = lex_data_dict['seqlen']

    # Split data
    train_indices, dev_indices, test_indices = split_indices(np.shape(h)[0])
    if final:
        # Combine train and dev for fitting, evaluate on test.
        # NOTE(review): `+` concatenates only if split_indices returns
        # lists -- confirm it does not return numpy arrays.
        fit_indices = train_indices + dev_indices
        eval_indices = test_indices
    else:
        fit_indices = train_indices
        eval_indices = dev_indices

    # Divide data
    train_h = h[fit_indices, :]
    train_b = b[fit_indices, :]
    train_h_len = h_len[fit_indices]
    train_b_len = b_len[fit_indices]
    train_y = y[fit_indices]
    lex_train_b = lex_b[fit_indices, :]
    lex_train_b_len = lex_b_len[fit_indices]

    dev_h = h[eval_indices, :]
    dev_b = b[eval_indices, :]
    dev_h_len = h_len[eval_indices]
    dev_b_len = b_len[eval_indices]
    dev_y = y[eval_indices]
    lex_dev_b = lex_b[eval_indices, :]
    lex_dev_b_len = lex_b_len[eval_indices]

    ## Passing parameter_dict to config settings
    ## Changes to config  based on data shape
    assert (np.shape(train_h)[0] == np.shape(train_b)[0] ==
            np.shape(train_y)[0] == np.shape(train_h_len)[0] ==
            np.shape(train_b_len)[0] == np.shape(lex_train_b_len)[0])
    config.num_samples = np.shape(train_h)[0]
    config.h_max_len = np.shape(train_h)[1]
    config.b_max_len = np.shape(train_b)[1]

    ## Start Tensorflow!
    print('Starting TensorFlow operations')
    # Single pre-formatted string: prints the same text as the former
    # Python 2 print statement and is also valid Python 3.
    print('With hidden layers:  %s' % (config.n_layers,))
    with tf.Graph().as_default():
        tf.set_random_seed(1)
        model = LSTMCondModel(config)
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(
                session, train_h, train_b, train_h_len, train_b_len, train_y,
                dev_h, dev_b, dev_h_len, dev_b_len, dev_y, lex_train_b,
                lex_train_b_len, lex_dev_b, lex_dev_b_len)

    # Write results to csv
    convertOutputs(outputpath, config, losses_ep, dev_performances_ep)

    print('Losses ', losses_ep)
    print('Dev Performance ', dev_performances_ep)
    return losses_ep, dev_predicted_classes_ep, dev_performances_ep

コード例 #2

ファイルを表示

ファイル: execute_lstm_attention.py プロジェクト: meetnisha/stance_detection

def run_lstm_attention(config, outputpath=base_path + '/xp', final=False):
    """Train ``LSTMAttention`` on concatenated headline/body sequences.

    Loads the data with ``get_data(concat=True)``, optionally downsamples,
    truncates and caps it, splits it with ``split_data``, fits the model in
    a fresh TensorFlow graph and writes the per-epoch results with
    ``convertOutputs``.

    Args:
        config: Configuration object. ``max_length`` and ``num_samples`` are
            read (``None`` disables the respective limit) and afterwards
            overwritten with the sequence width and the number of training
            samples. If the instance has a ``downsample`` attribute equal
            to ``True``, ``downsample_label`` is applied first.
        outputpath: Passed through unchanged to ``convertOutputs``.
        final: If true, train on train + dev and evaluate on the test
            split; otherwise train on train and evaluate on dev.

    Returns:
        The tuple ``(losses_ep, dev_predicted_classes_ep,
        dev_performances_ep)`` taken from the result of ``model.fit``.
    """
    config, data_dict = get_data(
        config,
        filename_embeddings='/glove/glove.twitter.27B.50d.txt',
        pickle_path='/glove/twitter50d_h_ids_b_ids_pickle.p',
        concat=True)

    y = data_dict['y']
    h_b_np = data_dict['h_b_np']
    seqlen = data_dict['seqlen']

    # Perform downsampling (only when the attribute was set on the instance)
    if 'downsample' in config.__dict__ and config.downsample == True:
        downsample_indices = downsample_label(y,
                                              label_for_ds=3,
                                              downsample_factor=4)
        y = y[downsample_indices]
        h_b_np = h_b_np[downsample_indices, :]
        seqlen = seqlen[downsample_indices]

    # Truncate sequences (slicing is a no-op when already shorter)
    if config.max_length is not None:
        h_b_np = h_b_np[:, 0:config.max_length]
        seqlen = np.minimum(seqlen, config.max_length)

    # Set maximum dataset size for testing purposes
    data = pack_labels(h_b_np, y, seqlen)
    if config.num_samples is not None:
        # Keep exactly num_samples items; the former `num_samples - 1` upper
        # bound silently dropped one sample (and all of them for 1).
        data = data[0:config.num_samples]

    # Split data, result is still packed
    train_data, dev_data, test_data, train_indices, dev_indices, test_indices = split_data(
        data, prop_train=0.6, prop_dev=0.2, seed=56)

    ## Config determined at data loading:
    config.num_samples = len(train_indices)
    config.max_length = np.shape(h_b_np)[1]

    if final:
        # Final run: train on train + dev and evaluate on test. The test
        # subset takes the "dev" slot for compatibility with model.fit.
        # NOTE(review): `+` concatenates only if split_data returns lists --
        # confirm it does not return numpy arrays.
        train_dev_indices = train_indices + dev_indices
        train_data = [data[i] for i in train_dev_indices]
        eval_indices = test_indices
        config.num_samples = len(train_dev_indices)
    else:
        eval_indices = dev_indices

    dev_labels = y[eval_indices]
    dev_data_np = h_b_np[eval_indices, :]
    dev_seqlen = seqlen[eval_indices]

    with tf.Graph().as_default():

        tf.set_random_seed(59)

        logger.info("Building model...")
        start = time.time()
        model = LSTMAttention(config)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()

        with tf.Session() as session:
            session.run(init)
            losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(
                session, train_data, dev_data_np, dev_seqlen, dev_labels)

    convertOutputs(outputpath, config, losses_ep, dev_performances_ep)
    print('Dev Performance ', dev_performances_ep)
    return losses_ep, dev_predicted_classes_ep, dev_performances_ep

コード例 #3

ファイルを表示

ファイル: execute_lstm_conditional.py プロジェクト: nehaiyer/nlu-project

def test_lstm_conditional(config,
                          split=True,
                          outputpath='output/lstm_conditional',
                          final=False):
    """Evaluate a saved ``LSTMCondModel`` checkpoint on the full data set.

    Loads the data with ``get_data``, optionally subsamples and truncates
    it, restores the latest checkpoint found in a hard-coded directory and
    runs ``model.compute_pred`` on all samples (no train/dev split is made).
    Results are written with ``convertOutputs``.

    Args:
        config: Configuration object. ``num_samples``, ``h_max_len`` and
            ``b_max_len`` are read (``None`` disables the respective limit)
            and afterwards overwritten with the shape of the data arrays.
            ``n_layers`` is only printed here.
        split: Unused; kept so the signature matches the training function.
        outputpath: Passed through unchanged to ``convertOutputs``.
        final: Unused; kept so the signature matches the training function.

    Returns:
        The tuple ``(losses_ep, dev_predicted_classes_ep,
        dev_performances_ep)`` taken from the result of
        ``model.compute_pred``.
    """
    ## Get data
    config, data_dict = get_data(
        config,
        filename_embeddings='/../../glove.6B.50d.txt',
        pickle_path='/data/glove50d_h_ids_b_ids_comp_pickle.p',
        concat=False)

    ## pass data into local namespace:
    y = data_dict['y']
    h = data_dict['h_np']
    b = data_dict['b_np']
    h_len = data_dict['h_seqlen']
    b_len = data_dict['b_seqlen']

    # Do shortening of dataset ## affects number of samples and max_len.
    if config.num_samples is not None:
        # Seed and shuffle with the same generator so the subsample is
        # reproducible; shuffle a real list because a range object cannot be
        # shuffled in place on Python 3.
        np.random.seed(1)
        ind = list(range(np.shape(h)[0]))
        np.random.shuffle(ind)
        indices = ind[0:config.num_samples]
        h = h[indices, :]
        b = b[indices, :]
        h_len = h_len[indices]
        b_len = b_len[indices]
        y = y[indices]

    # Truncate headlines and bodies (slicing is a no-op when already shorter)
    if config.h_max_len is not None:
        h = h[:, 0:config.h_max_len]
        h_len = np.minimum(h_len, config.h_max_len)

    if config.b_max_len is not None:
        b = b[:, 0:config.b_max_len]
        b_len = np.minimum(b_len, config.b_max_len)

    ## Passing parameter_dict to config settings
    ## Changes to config  based on data shape
    assert (np.shape(h)[0] == np.shape(b)[0] == np.shape(y)[0] ==
            np.shape(h_len)[0] == np.shape(b_len)[0])
    config.num_samples = np.shape(h)[0]
    config.h_max_len = np.shape(h)[1]
    config.b_max_len = np.shape(b)[1]

    # Directory holding both the .meta graph file and the checkpoints.
    data_path = '/home/neha/sem2/nlu/project/stance_detection-master/code/models/lstm_conditional/'

    ## Start Tensorflow!
    print('Starting TensorFlow operations')
    # Single pre-formatted string: prints the same text as the former
    # Python 2 print statement and is also valid Python 3.
    print('With hidden layers:  %s' % (config.n_layers,))

    # NOTE(review): the seed is set before the default graph is reset, so it
    # presumably does not apply to the graph built below -- confirm whether
    # these two calls should be swapped.
    tf.set_random_seed(1)
    tf.reset_default_graph()
    model = LSTMCondModel(config)
    # NOTE(review): this Saver is replaced by the one returned from
    # import_meta_graph below; kept because constructing it may add ops to
    # the graph -- confirm before removing.
    saver = tf.train.Saver()
    with tf.Session() as session:
        saver = tf.train.import_meta_graph(
            data_path + 'lstm-conditional-300blen-2l-45e-40.meta')
        saver.restore(
            session,
            tf.train.latest_checkpoint(checkpoint_dir=data_path))
        losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.compute_pred(
            session, h, b, h_len, b_len, y)

    # Write results to csv
    convertOutputs(outputpath, config, losses_ep, dev_performances_ep)

    print('Losses ', losses_ep)
    print('Dev Performance ', dev_performances_ep)
    return losses_ep, dev_predicted_classes_ep, dev_performances_ep

コード例 #4

ファイルを表示

ファイル: execute_lstm_conditional.py プロジェクト: nehaiyer/nlu-project

def run_lstm_conditional(config,
                         split=True,
                         outputpath='output/lstm_conditional',
                         final=False):
    """Train ``LSTMCondModel`` on separate headline and body sequences.

    Loads the data with ``get_data``, optionally subsamples and truncates
    it, splits it with ``split_indices``, fits the model inside a fresh
    TensorFlow graph and writes the per-epoch results with
    ``convertOutputs``.

    Args:
        config: Configuration object. ``num_samples``, ``h_max_len`` and
            ``b_max_len`` are read (``None`` disables the respective limit)
            and afterwards overwritten with the shape of the training
            arrays. ``n_layers`` is only printed here.
        split: Must be true; the train/dev arrays only exist after a split.
        outputpath: Passed through unchanged to ``convertOutputs``.
        final: If true, train on train + dev and evaluate on the test
            split; otherwise train on train and evaluate on dev.

    Returns:
        The tuple ``(losses_ep, dev_predicted_classes_ep,
        dev_performances_ep)`` taken from the result of ``model.fit``.

    Raises:
        ValueError: If ``split`` is false.
    """
    if not split:
        # Without a split no train/dev arrays are ever built; previously this
        # surfaced much later as an UnboundLocalError. Fail early and clearly.
        raise ValueError('run_lstm_conditional requires split=True: '
                         'no train/dev sets are built otherwise')

    ## Get data
    config, data_dict = get_data(
        config,
        filename_embeddings='/../../glove.6B.50d.txt',
        pickle_path='/data/glove50d_h_ids_b_ids_pickle.p',
        concat=False)

    ## pass data into local namespace:
    y = data_dict['y']
    h = data_dict['h_np']
    b = data_dict['b_np']
    h_len = data_dict['h_seqlen']
    b_len = data_dict['b_seqlen']

    # Do shortening of dataset ## affects number of samples and max_len.
    if config.num_samples is not None:
        # Seed and shuffle with the same generator so the subsample is
        # reproducible; shuffle a real list because a range object cannot be
        # shuffled in place on Python 3.
        np.random.seed(1)
        ind = list(range(np.shape(h)[0]))
        np.random.shuffle(ind)
        indices = ind[0:config.num_samples]
        h = h[indices, :]
        b = b[indices, :]
        h_len = h_len[indices]
        b_len = b_len[indices]
        y = y[indices]

    # Truncate headlines and bodies (slicing is a no-op when already shorter)
    if config.h_max_len is not None:
        h = h[:, 0:config.h_max_len]
        h_len = np.minimum(h_len, config.h_max_len)

    if config.b_max_len is not None:
        b = b[:, 0:config.b_max_len]
        b_len = np.minimum(b_len, config.b_max_len)

    # Split data
    train_indices, dev_indices, test_indices = split_indices(np.shape(h)[0])
    if final:
        # Combine train and dev for fitting, evaluate on test.
        # NOTE(review): `+` concatenates only if split_indices returns
        # lists -- confirm it does not return numpy arrays.
        fit_indices = train_indices + dev_indices
        eval_indices = test_indices
    else:
        fit_indices = train_indices
        eval_indices = dev_indices

    # Divide data
    train_h = h[fit_indices, :]
    train_b = b[fit_indices, :]
    train_h_len = h_len[fit_indices]
    train_b_len = b_len[fit_indices]
    train_y = y[fit_indices]

    dev_h = h[eval_indices, :]
    dev_b = b[eval_indices, :]
    dev_h_len = h_len[eval_indices]
    dev_b_len = b_len[eval_indices]
    dev_y = y[eval_indices]

    print(np.shape(train_h))

    ## Passing parameter_dict to config settings
    ## Changes to config  based on data shape
    assert (np.shape(train_h)[0] == np.shape(train_b)[0] ==
            np.shape(train_y)[0] == np.shape(train_h_len)[0] ==
            np.shape(train_b_len)[0])
    config.num_samples = np.shape(train_h)[0]
    config.h_max_len = np.shape(train_h)[1]
    config.b_max_len = np.shape(train_b)[1]

    ## Start Tensorflow!
    print('Starting TensorFlow operations')
    # Single pre-formatted string: prints the same text as the former
    # Python 2 print statement and is also valid Python 3.
    print('With hidden layers:  %s' % (config.n_layers,))
    with tf.Graph().as_default():
        tf.set_random_seed(1)
        model = LSTMCondModel(config)
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(
                session, train_h, train_b, train_h_len, train_b_len, train_y,
                dev_h, dev_b, dev_h_len, dev_b_len, dev_y)

    # Write results to csv
    convertOutputs(outputpath, config, losses_ep, dev_performances_ep)

    print('Losses ', losses_ep)
    print('Dev Performance ', dev_performances_ep)
    return losses_ep, dev_predicted_classes_ep, dev_performances_ep

コード例 #5

ファイルを表示

ファイル: execute_bilstm_conditional.py プロジェクト: malvika-hp/stance-detection-in-news

def run_lstm_conditional(config,
                         split=True,
                         outputpath='../../xp',
                         final=False):
    """Train ``LSTMCondModel`` and save the trained session to a checkpoint.

    Loads the data with ``get_data``, optionally subsamples and truncates
    it, splits it with ``split_indices``, fits the model inside a fresh
    TensorFlow graph, saves the session to ``model/model.ckpt`` and writes
    the per-epoch results with ``convertOutputs``.

    Args:
        config: Configuration object. ``num_samples``, ``h_max_len`` and
            ``b_max_len`` are read (``None`` disables the respective limit)
            and afterwards overwritten with the shape of the training
            arrays. ``n_layers`` is only printed here.
        split: Must be true; the train/dev arrays only exist after a split.
        outputpath: Passed through unchanged to ``convertOutputs``.
        final: If true, train on train + dev and evaluate on the test
            split; otherwise train on train and evaluate on dev.

    Returns:
        The tuple ``(losses_ep, dev_predicted_classes_ep,
        dev_performances_ep)`` taken from the result of ``model.fit``.

    Raises:
        ValueError: If ``split`` is false.
    """
    if not split:
        # Without a split no train/dev arrays are ever built; previously this
        # surfaced much later as an UnboundLocalError. Fail early and clearly.
        raise ValueError('run_lstm_conditional requires split=True: '
                         'no train/dev sets are built otherwise')

    ## Get data
    config, data_dict = get_data(
        config,
        filename_embeddings='/../../glove/glove.twitter.27B.50d.txt',
        pickle_path='/../../glove/twitter50d_h_ids_b_ids_pickle.p',
        concat=False)

    ## pass data into local namespace:
    y = data_dict['y']
    h = data_dict['h_np']
    b = data_dict['b_np']
    print("===========Number of head=========", len(h))
    print("===========Number of body=========", len(b))
    h_len = data_dict['h_seqlen']
    b_len = data_dict['b_seqlen']

    # Do shortening of dataset ## affects number of samples and max_len.
    if config.num_samples is not None:
        # Seed and shuffle with the same generator so the subsample is
        # reproducible; shuffle a real list because a range object cannot be
        # shuffled in place on Python 3.
        np.random.seed(1)
        ind = list(range(np.shape(h)[0]))
        np.random.shuffle(ind)
        indices = ind[0:config.num_samples]
        h = h[indices, :]
        b = b[indices, :]
        h_len = h_len[indices]
        b_len = b_len[indices]
        y = y[indices]

    # Truncate headlines and bodies (slicing is a no-op when already shorter)
    if config.h_max_len is not None:
        h = h[:, 0:config.h_max_len]
        h_len = np.minimum(h_len, config.h_max_len)

    if config.b_max_len is not None:
        b = b[:, 0:config.b_max_len]
        b_len = np.minimum(b_len, config.b_max_len)

    # Split data
    train_indices, dev_indices, test_indices = split_indices(np.shape(h)[0])
    if final:
        # Combine train and dev for fitting, evaluate on test.
        # NOTE(review): `+` concatenates only if split_indices returns
        # lists -- confirm it does not return numpy arrays.
        fit_indices = train_indices + dev_indices
        eval_indices = test_indices
    else:
        fit_indices = train_indices
        eval_indices = dev_indices

    # Divide data
    train_h = h[fit_indices, :]
    train_b = b[fit_indices, :]
    train_h_len = h_len[fit_indices]
    train_b_len = b_len[fit_indices]
    train_y = y[fit_indices]

    dev_h = h[eval_indices, :]
    dev_b = b[eval_indices, :]
    dev_h_len = h_len[eval_indices]
    dev_b_len = b_len[eval_indices]
    dev_y = y[eval_indices]

    ## Passing parameter_dict to config settings
    ## Changes to config  based on data shape
    assert (np.shape(train_h)[0] == np.shape(train_b)[0] ==
            np.shape(train_y)[0] == np.shape(train_h_len)[0] ==
            np.shape(train_b_len)[0])
    config.num_samples = np.shape(train_h)[0]
    config.h_max_len = np.shape(train_h)[1]
    config.b_max_len = np.shape(train_b)[1]

    ## Start Tensorflow!
    print('Starting TensorFlow operations')
    # Single pre-formatted string: prints the same text as the former
    # Python 2 print statement and is also valid Python 3.
    print('With hidden layers:  %s' % (config.n_layers,))
    with tf.Graph().as_default():
        tf.set_random_seed(1)
        model = LSTMCondModel(config)
        # The saver is created after the model so that the model's
        # variables are among tf.global_variables().
        saver = tf.train.Saver(tf.global_variables())
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(
                session, train_h, train_b, train_h_len, train_b_len, train_y,
                dev_h, dev_b, dev_h_len, dev_b_len, dev_y)
            save_path = saver.save(session, "model/model.ckpt")
            print("Model saved in file: %s" % save_path)

    # Write results to csv
    convertOutputs(outputpath, config, losses_ep, dev_performances_ep)

    print('Losses ', losses_ep)
    print('Dev Performance ', dev_performances_ep)
    return losses_ep, dev_predicted_classes_ep, dev_performances_ep