Code Example #1
import pickle

import numpy as np
from tensorflow.contrib import learn

import data_helpers

# `cfg` (the parsed YAML config) and `FLAGS` (tf.flags) are assumed to be
# defined earlier in the script.
dataset_name = cfg["datasets"]["default"]
print('dataset_name: ', dataset_name)
if FLAGS.eval_train:
    if dataset_name == "mrpolarity":
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="test",
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "political_parties":
        print('Loading political parties')
        datasets = data_helpers.get_datasets_political_parties()
    x_raw, y_test = data_helpers.load_data_labels(datasets)
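    # Labels come back one-hot encoded; reduce them to class indices.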
    y_test = np.argmax(y_test, axis=1)
    print("Total number of test examples: {}".format(len(y_test)))
else:
    print("Flow shouldn't be here.")
    if dataset_name == "mrpolarity":
        datasets = {"target_names": ['positive_examples', 'negative_examples']}
        x_raw = [
            "a masterpiece four years in the making", "everything is off."
        ]
        y_test = [1, 0]
    else:
        datasets = {
            "target_names": [
                'alt.atheism', 'comp.graphics', 'sci.med',
                'soc.religion.christian'
            ]
        }
        x_raw = [
            "The number of reported cases of gonorrhea in Colorado increased",
            "I am in the market for a 24-bit graphics card for a PC"
        ]
        y_test = [2, 1]
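For reference, the cfg object used above is typically parsed from a YAML file. The following is a minimal sketch of the layout implied by the cfg[...] lookups; the file name config.yml and all concrete paths and values are assumptions:

import yaml

# Hypothetical config.yml content, reconstructed from the lookups above:
#
# datasets:
#   default: political_parties
#   mrpolarity:
#     positive_data_file:
#       path: data/rt-polaritydata/rt-polarity.pos
#     negative_data_file:
#       path: data/rt-polaritydata/rt-polarity.neg
#   20newsgroup:
#     categories: [alt.atheism, comp.graphics, sci.med, soc.religion.christian]
#     shuffle: true
#     random_state: 42

with open('config.yml', 'r') as ymlfile:  # file name is an assumption
    cfg = yaml.safe_load(ymlfile)

The preprocess() routine below then prepares the training data for the political_parties dataset: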
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    datasets = data_helpers.get_datasets_political_parties()
    x_text, y = data_helpers.load_data_labels(datasets)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
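    # The processor maps each token to an integer id and pads/truncates
    # every document to max_document_length.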

    # Load the pre-trained fastText vocabulary (a token -> index mapping)
    # and the matching embedding matrix.
    print("Loading pre-trained word vectors...")
    with open('fasttext_vocab_en.dat', 'rb') as fr:
        vocab = pickle.load(fr)
    embedding = np.load('fasttext_embedding_en.npy')

    # Fit the processor on the pre-trained vocabulary so token ids line up
    # with rows of the embedding matrix; fit() returns the processor
    # itself, so its return value is not needed.
    vocab_processor.fit(vocab.keys())
    x = np.array(list(vocab_processor.transform(x_text)))

    vocab_size = len(vocab)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev/test set
    # TODO: This is very crude, should use cross-validation

    train_frac = 0.7
    val_frac = 0.2
    test_frac = 0.1  # the remainder after the train and validation slices

    def train_val_test_split(arr):
        # Slice an array into train/validation/test chunks according to
        # the fractions above; works for both the inputs and the labels.
        n_train = int(len(arr) * train_frac)
        n_val = int(len(arr) * val_frac)
        return arr[:n_train], arr[n_train:n_train + n_val], arr[n_train + n_val:]

    x_train, x_dev, x_test = train_val_test_split(x_shuffled)
    y_train, y_dev, y_test = train_val_test_split(y_shuffled)
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print('x_train', x_train.shape)
    print('y_train', y_train.shape)
    return x_train, y_train, vocab_processor, vocab_size, embedding, x_dev, y_dev, x_test, y_test
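preprocess() assumes two files already exist: fasttext_vocab_en.dat, a pickled token-to-index dict, and fasttext_embedding_en.npy, the matching embedding matrix. Below is a minimal sketch of how such files could be produced from a pretrained fastText .vec file; build_fasttext_files is a hypothetical helper, and the standard .vec layout (a "<count> <dim>" header line, then one token and its vector per line) is assumed:

import pickle

import numpy as np


def build_fasttext_files(vec_path,
                         vocab_out='fasttext_vocab_en.dat',
                         emb_out='fasttext_embedding_en.npy'):
    # Hypothetical helper: convert a fastText .vec text file into the
    # vocab/embedding pair that preprocess() loads.
    vocab = {}
    vectors = []
    with open(vec_path, encoding='utf-8') as f:
        next(f)  # skip the "<num_words> <dim>" header line
        for i, line in enumerate(f):
            parts = line.rstrip().split(' ')
            vocab[parts[0]] = i
            vectors.append([float(v) for v in parts[1:]])
    with open(vocab_out, 'wb') as fw:
        pickle.dump(vocab, fw)
    np.save(emb_out, np.array(vectors, dtype=np.float32))


# Usage (the .vec file name is an example):
# build_fasttext_files('wiki.en.vec')
# x_train, y_train, vocab_processor, vocab_size, embedding, \
#     x_dev, y_dev, x_test, y_test = preprocess()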