Code Example #1
def error_nn_classifier(size_data, batch_size, n_data_sets):
    """
    Compute out of sample error of nearest neighbor classifier

    :param size_data: int
        Size of the training set

    :param batch_size: int
        Size of the test set

    :param n_data_sets: int
        Number of data sets to be evaluated

    :return: array-like of shape (n_data_sets,)
        Out of sample error for different training sets
    """
    test_set = create_data(batch_size)
    oses = np.empty(n_data_sets)
    for i in range(n_data_sets):
        data = create_data(size_data)
        prediction = nn_classifier(data, test_set)
        n_errors = np.sum(np.abs(np.subtract(prediction, test_set[:, 1])))
        ose = 100 * n_errors / batch_size
        oses[i] = ose
    return oses
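The snippet above only shows how create_data is consumed: it returns an array whose column 0 holds the input feature and column 1 the binary label (compare test_set[:, 1] here and test_set[:, 0] in Code Example #7). A minimal, hypothetical sketch of such a generator, for illustration only; the project's real create_data may differ:

import numpy as np

def create_data(n_samples, threshold=0.5):
    # Hypothetical stand-in: column 0 is a scalar feature drawn uniformly,
    # column 1 is a binary label, matching the test_set[:, 0] / test_set[:, 1]
    # access pattern used by the classifier snippets.
    x = np.random.uniform(0.0, 1.0, size=n_samples)
    y = (x > threshold).astype(float)
    return np.column_stack((x, y))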
Code Example #2
File: create_data.py Project: airKlizz/Wikification
def main():
    assert len(sys.argv) == 5, COMMAND
    num_titles = int(sys.argv[1])
    filename = sys.argv[2]
    top5000_filename = sys.argv[3] 
    titles_filename = sys.argv[4]
    
    create_data(num_titles, filename, top5000_filename, titles_filename)
Code Example #3
def plot_attAUC(GT, attributepattern, clf):
    AUC = []
    P = np.loadtxt(attributepattern)
    attributes = get_attributes()

    # Loading ground truth
    test_index = bzUnpickle('./CreatedData/test_features_index.txt')
    test_attributes = get_class_attributes('./Classes/', name='test')
    _, y_true = create_data('./CreatedData/test_featuresVGG19.pic.bz2',
                            test_index, test_attributes)

    for i in range(y_true.shape[1]):
        fp, tp, _ = roc_curve(y_true[:, i], P[:, i])
        roc_auc = auc(fp, tp)
        AUC.append(roc_auc)
    print("Mean attrAUC %g" % (np.nanmean(AUC)))

    xs = np.arange(y_true.shape[1])
    width = 0.5

    fig = plt.figure(figsize=(15, 5))
    ax = fig.add_subplot(1, 1, 1)
    rects = ax.bar(xs, AUC, width, align='center')
    ax.set_xticks(xs)
    ax.set_xticklabels(attributes, rotation=90)
    ax.set_ylabel("area under ROC curve")
    autolabel(rects, ax)
    plt.savefig('results/AwA-AttAUC-DAP-%s.pdf' % clf)
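The autolabel helper called above is not shown in this example. A hedged sketch of a typical matplotlib bar-annotation helper it could correspond to (an assumption, not the project's own implementation):

def autolabel(rects, ax):
    # Hypothetical helper: write each bar's height just above the bar.
    for rect in rects:
        height = rect.get_height()
        ax.annotate('%.2f' % height,
                    xy=(rect.get_x() + rect.get_width() / 2.0, height),
                    ha='center', va='bottom', fontsize=6, rotation=90)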
Code Example #4
def main():

    # Create Result Directory
    os.makedirs('./results/predict', exist_ok=True)

    # Get Arguments
    args = args_initialize()

    # Define Model
    net_G = ResNetGenerator(
        input_nc=args.input_nc,
        output_nc=args.output_nc,
        ngf=args.ngf,
        n_blocks=9
    )

    # Load Weights
    state_dict = torch.load('./latest_net_G.pth', map_location='cpu')
    net_G.load_state_dict(state_dict)

    # Create Tensor from Image file
    im_file = args.imfile
    tensor_img = utils.create_data(im_file)

    # Predict
    outputs = net_G.forward(tensor_img)[0]

    # Convert Output Tensor to Image file
    im = utils.tensor2im(outputs)
    file_name = os.path.basename(im_file)
    save_path = os.path.join('./results/predict', 'horse2zebra_' + str(file_name) + '.png')
    utils.save_image(im, save_path)
Code Example #5
File: test.py Project: AlveinTeng/CSC311FinalProject
def load_data(base_path="../data"):
    """ Load the data in PyTorch Tensor.

    :return: (zero_train_matrix, train_data, valid_data, test_data)
        WHERE:
        zero_train_matrix: 2D sparse matrix where missing entries are
        filled with 0.
        train_data: 2D sparse matrix
        valid_data: A dictionary {user_id: list,
        question_id: list, is_correct: list}
        test_data: A dictionary {user_id: list,
        question_id: list, is_correct: list}
    """
    new_train_matrix, new_sparse_train_matrix, valid_data, test_data = create_data(base_path)
    # print(new_train_matrix)
    # print(new_sparse_train_matrix)
    # train_matrix = load_train_sparse(base_path).toarray()
    # valid_data = load_valid_csv(base_path)
    # test_data = load_public_test_csv(base_path)

    # print(len(valid_data["user_id"]))
    # print(len(test_data["user_id"]))

    # zero_train_matrix = train_matrix.copy()
    zero_train_matrix = new_sparse_train_matrix.copy()
    # Fill in the missing entries to 0.
    # zero_train_matrix[np.isnan(train_matrix)] = 0
    zero_train_matrix[np.isnan(new_sparse_train_matrix)] = 0
    # Change to Float Tensor for PyTorch.
    zero_train_matrix = torch.FloatTensor(zero_train_matrix)
    # train_matrix = torch.FloatTensor(train_matrix)
    train_matrix = torch.FloatTensor(new_sparse_train_matrix)

    return zero_train_matrix, train_matrix, valid_data, test_data
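A short, hypothetical usage sketch of the loader above (the path is illustrative; the user_id and is_correct keys come from the docstring):

zero_train_matrix, train_matrix, valid_data, test_data = load_data("../data")
print(zero_train_matrix.shape, train_matrix.shape)
print(len(valid_data["user_id"]), len(valid_data["is_correct"]))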
Code Example #6
def Predict_Mode():
    print ("\n","Test Mode select ... ","\n")

    id2tag = dict((y,x) for x,y in r_dic.items()) #dictionary with index as key and relation label as value.

    with open("./Pickles/word2idx.pkl","rb") as fout: #load the same word2idx used during training.
        word2idx = pickle.load(fout)

    Model = torch.load("BestModel.pt") #best model reload.
    Pre_data = data_loading(opts.predict,p_dic,"+") #load the file to run prediction on
    sentence = []
    for line in open(opts.predict):
        line = line.strip()
        if len(line) == 0:
            continue
        if line[0] == ";":
            sentence.append(line)

    out = open("../output/result.txt","w") #file that stores the dependency-parse results.
    Pre = create_data(Pre_data,word2idx,r_dic)
    Model.train(False)
    for i in range(len(Pre_data)):
        out.write(sentence[i]+"\n")
        points,labels = predict(Model,Pre[i]) #run prediction with the best model for each sentence.
        points[-1] = -1
        for j in range(len(Pre_data[i][:-1])):
            input = Pre_data[i][j]
            out.write(str(input["current_idx"]+1)+"\t"+\
                      str(points[j]+1)+"\t"+\
                      id2tag[labels[j]]+"\t"+\
                      input["pure_morphemes"]+"\n")
        out.write("\n")
    print ("predict success ... ")
Code Example #7
def error_threshold_classifier(type, analytical=False, batch=None, n_data_sets=None, threshold=None):
    """
    Calculate out-of-sample error (generalization error) given number of data sets

    :param type: string
        Classifier type

    :param analytical: Bool
        If True, calculate analytical error of the classifier; otherwise empirical

    :param batch: int
        Size of the test set

    :param n_data_sets: int
        Number of data sets to be evaluated

    :param threshold: int
        Threshold of the classifier

    :return:
    If analytical is False, array-like of shape (n_data_sets,) -- Out of sample error for different training sets
    Otherwise, int -- Analytical error
    """
    if analytical is False:
        oses = np.empty(n_data_sets)
        for i in range(n_data_sets):
            test_set = create_data(batch)
            prediction = threshold_classifier(type=type, X=test_set[:, 0], threshold=threshold)
            n_errors = np.sum(np.abs(np.subtract(prediction, test_set[:, 1])))
            ose = n_errors * 100 / batch
            oses[i] = ose
        return oses
    else:
        a_error = threshold_classifier(type=type, threshold=threshold, error=True)
        return a_error
Code Example #8
def main():
  start = time.time()
  args = parser.parse_args()
  model = import_module(args.model_path)
  if (args.create_data):
    utils.create_data(model, args.number_of_core_samples, args.step_size, args.name, args.output_path)
  if (args.create_unstructured_data):
    model.create_unstructured_data(model, args.number_of_core_samples, args.name, args.output_path)
  if (args.rank_global):
    utils.rank_global(model, args.number_of_core_samples, args.step_size, args.name, args.plot, args.output_path)
  if (args.rank_local):
    print(utils.rank_local(model, args.number_of_core_samples, args.step_size, args.name, args.threshold, args.plot, args.output_path))
  if (args.measure_global_accuracy):
    print(utils.measure_global_accuracy(model, args.number_of_core_samples, args.step_size, args.name, args.output_path))
  if (args.measure_local_accuracy):
    print(utils.measure_local_accuracy(model, args.number_of_core_samples, args.step_size, args.name, args.output_path))
  print('this took {} seconds'.format(time.time() - start))
Code Example #9
File: test_schemes.py Project: akleeman/slocum
 def get_data(self):
     ds = utils.create_data()
     mids = np.linspace(-7. / 8. * np.pi, np.pi, 16).astype(np.float32)
     data = mids[np.random.randint(mids.size, size=ds['x_wind'].size)]
     ds['albatros_flight_direction'] = (('time', 'longitude', 'latitude'),
                                        data.reshape(ds['x_wind'].shape),
                                        {'units': 'radians'})
     return ds[['albatros_flight_direction']]
Code Example #10
File: test_tinylib.py Project: akleeman/slocum
    def test_small_time(self):
        ds = create_data()

        sm_time = tinylib.small_time(ds['time'])
        num_times, units = tinylib.expand_small_time(sm_time['packed_array'])
        actual = xray.Dataset({'time': ('time', num_times,
                                        {'units': units})})
        actual = xray.decode_cf(actual)
        self.assertTrue(np.all(actual['time'].values == ds['time'].values))
        self.assertTrue(units == ds['time'].encoding['units'])
Code Example #11
File: test_compress.py Project: akleeman/slocum
 def test_version_assert(self):
     # create a forecast that looks like its from a newer version
     # and make sure an assertion is raised.
     ds = create_data()
     original_version = compress._VERSION
     compress._VERSION = np.array(compress._VERSION + 1, dtype=np.uint8)
     beaufort = compress.compress_dataset(ds)
     compress._VERSION = original_version
     self.assertRaises(ValueError,
                       lambda: compress.decompress_dataset(beaufort))
Code Example #12
File: test_schemes.py Project: akleeman/slocum
    def get_data(self):
        ds = utils.create_data()
        bins = self.get_scheme().bins
        mids = 0.5 * (bins[1:] + bins[:-1])
        data = mids[np.random.randint(mids.size, size=ds['x_wind'].size)]
        ds['great_white_shark_length'] = (('time', 'longitude', 'latitude'),
                                          data.reshape(ds['x_wind'].shape),
                                          {'units': 'm'})

        return ds[['great_white_shark_length']]
Code Example #13
def indirectAttributePrediction(classifier='SVM'):
    # Get features index to recover samples
    train_index = bzUnpickle('./CreatedData/train_features_index.txt')
    test_index = bzUnpickle('./CreatedData/test_features_index.txt')

    # Get classes-attributes relationship
    train_attributes = get_class_attributes('./', name='train')
    test_attributes = get_class_attributes('./', name='test')

    # Create training Dataset
    print('Creating training dataset...')
    X_train, a_train = create_data('./CreatedData/train_featuresVGG19.pic.bz2',
                                   train_index, train_attributes)
    y_train = []
    for (animal, num) in train_index:
        y_train += num * [animal_dict[animal]]
    y_train = np.array(y_train)

    print('X_train to dense...')
    X_train = X_train.toarray()

    print('Creating test dataset...')
    X_test, a_test = create_data('./CreatedData/test_featuresVGG19.pic.bz2',
                                 test_index, test_attributes)

    print('X_test to dense...')
    X_test = X_test.toarray()

    clf = SVMClassifierIAP(n_components=100, C=1.0)

    print('Training model... (takes around 10 min)')
    t0 = time()
    clf.fit(X_train, y_train)
    print('Training finished in', time() - t0)
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)

    print('Saving files...')
    np.savetxt('./IAP/prediction_SVM', y_pred)
    np.savetxt('./IAP/probabilities_SVM', y_proba)
Code Example #14
    def test_subset_spot_dataset(self):

        fcst = utils.create_data()
        times, units, cal = xray.conventions.encode_cf_datetime(fcst['time'])
        assert 'hours' in units

        def test_one_query(lon_slice, lat_slice, hour_slice):
            lon = np.mean(fcst['longitude'].values[lon_slice])
            lat = np.mean(fcst['latitude'].values[lat_slice])
            hours = times[hour_slice]
            query = {'location': {'latitude': lat, 'longitude': lon},
                     'model': 'gefs',
                     'type': 'spot',
                     'hours': hours,
                     'variables': ['wind']}

            ss = subset.subset_spot_dataset(fcst, query)

            assert fcst['x_wind'].dims == ('time', 'longitude', 'latitude')
            expected = np.mean(np.mean(fcst['x_wind'].values[hour_slice,
                                                             lon_slice,
                                                             lat_slice],
                                       axis=2),
                               axis=1)
            np.testing.assert_array_almost_equal(ss['x_wind'].values.reshape(-1),
                                                 expected)

            assert fcst['y_wind'].dims == ('time', 'longitude', 'latitude')
            expected = np.mean(np.mean(fcst['y_wind'].values[hour_slice,
                                                             lon_slice,
                                                             lat_slice],
                                       axis=2),
                               axis=1)
            np.testing.assert_array_almost_equal(ss['y_wind'].values.reshape(-1),
                                                 expected)

            np.testing.assert_array_equal(ss['latitude'].values, lat)
            np.testing.assert_array_equal(ss['longitude'].values, lon)

        # test a query with lat/lon in the middle of a grid.
        test_one_query(slice(0, 2),
                       slice(0, 2),
                       slice(0, None, 3))
        # and with the lat/lon exactly on a grid
        test_one_query(slice(1, 2),
                       slice(1, 2),
                       slice(1, None, 3))
Code Example #15
    def test_subset(self):
        fcst = utils.create_data()

        query = {'hours': np.array([0., 2., 4., 6.]),
                 'domain': {'N': np.max(fcst['latitude'].values) - 1,
                            'S': np.min(fcst['latitude'].values) + 1,
                            'E': np.max(fcst['longitude'].values) - 1,
                            'W': np.min(fcst['longitude'].values) + 1},
                 'grid_delta': (1., 1.),
                 'variables': ['wind']}
        ss = subset.subset_dataset(fcst, query)

        np.testing.assert_array_equal(ss['longitude'].values,
                                      np.arange(query['domain']['W'],
                                                query['domain']['E'] + 1))
        np.testing.assert_array_equal(np.sort(ss['latitude'].values),
                                      np.arange(query['domain']['S'],
                                                query['domain']['N'] + 1))
Code Example #16
File: test_compress.py Project: akleeman/slocum
    def test_compress_dataset(self):
        ds = create_data()
        compressed = compress.compress_dataset(ds)
        actual = compress.decompress_dataset(compressed)

        np.testing.assert_allclose(actual['x_wind'].values, ds['x_wind'].values,
                                   atol=1e-4, rtol=1e-4)
        np.testing.assert_allclose(actual['y_wind'].values, ds['y_wind'].values,
                                   atol=1e-4, rtol=1e-4)
        np.testing.assert_allclose(actual['air_pressure_at_sea_level'].values,
                                   ds['air_pressure_at_sea_level'].values,
                                   atol=1e-4, rtol=1e-4)
        # pass it through the system again, it should be idempotent.
        compressed = compress.compress_dataset(ds)
        actual = compress.decompress_dataset(compressed)
        np.testing.assert_allclose(actual['x_wind'].values, ds['x_wind'].values,
                                   atol=1e-4, rtol=1e-4)
        np.testing.assert_allclose(actual['y_wind'].values, ds['y_wind'].values,
                                   atol=1e-4, rtol=1e-4)
        np.testing.assert_allclose(actual['air_pressure_at_sea_level'].values,
                                   ds['air_pressure_at_sea_level'].values,
                                   atol=1e-4, rtol=1e-4)
Code Example #17
def testAll():
    results_dir = "results/"
    filename = "static_arithmetic_test.txt"

    for _op, op_func in operations.items():
        x, y, x_test, y_test = create_data(50000, 100, 0, 1000, 1000, 10000,
                                           op_func)
        print("In operation {}".format(_op))
        print("NAC")
        model = NAC(100, 2, 1)
        nac_err = model.train(x, y, x_test, y_test)
        tf.reset_default_graph()

        counter = 0
        nalu_err = np.nan
        while (np.isnan(nalu_err) and counter < 10):
            # NALU can often become NaN
            counter += 1
            print("NALU")
            model = NALU(100, 2, 1)
            nalu_err = model.train(x, y, x_test, y_test)
            tf.reset_default_graph()
        print("MLP")
        model = MLP(100, 2, 1)
        random_err, _ = model.validate(x_test, y_test)
        mlp_err = model.train(x, y, x_test, y_test)
        tf.reset_default_graph()

        max_score = np.nanmax([nac_err, nalu_err, random_err, mlp_err])

        with open(results_dir + filename, "a") as f:
            f.write("\n{}\n".format(_op))
            f.write("NAC err: {} | {}\n".format(nac_err, nac_err / max_score))
            f.write("NALU err: {} | {}\n".format(nalu_err,
                                                 nalu_err / max_score))
            f.write("MLP err: {} | {}\n".format(mlp_err, mlp_err / max_score))
            f.write("Random err: {} | {}\n".format(random_err,
                                                   random_err / max_score))
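The operations mapping iterated over above is defined elsewhere in the project. A hedged sketch of what it plausibly contains for the standard static-arithmetic NAC/NALU benchmark (an assumption, not the project's definition):

operations = {
    'add': lambda a, b: a + b,
    'sub': lambda a, b: a - b,
    'mul': lambda a, b: a * b,
    'div': lambda a, b: a / b,
}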
Code Example #18
    def test_forecast_containing_point(self):
        fcst = utils.create_data()
        lat = np.random.uniform(np.min(fcst['latitude'].values),
                                np.max(fcst['latitude'].values))
        lon = np.random.uniform(np.min(fcst['longitude'].values),
                                np.max(fcst['longitude'].values))
        query = {'location': {'latitude': lat, 'longitude': lon},
                 'model': 'gfs',
                 'type': 'spot',
                 'hours': np.linspace(0, 9, 3).astype('int'),
                 'variables': ['wind'],
                 'warnings': []}

        modified_query = subset.query_containing_point(query)
        ss = subset.subset_gridded_dataset(fcst, modified_query)

        self.assertTrue(np.any(lon >= ss['longitude'].values))
        self.assertTrue(np.any(lon <= ss['longitude'].values))
        self.assertTrue(np.any(lat >= ss['latitude'].values))
        self.assertTrue(np.any(lat <= ss['latitude'].values))

        # we should be able to pass the results through again and get the same thing.
        subset2 = subset.subset_gridded_dataset(ss, modified_query)
        self.assertTrue(subset2.equals(ss))
Code Example #19
File: ner.py Project: votamvan/vie-ner-lstm
word_dir = args.word_dir
vector_dir = args.vector_dir
train_dir = args.train_dir
dev_dir = args.dev_dir
test_dir = args.test_dir
num_lstm_layer = int(args.num_lstm_layer)
num_hidden_node = int(args.num_hidden_node)
dropout = float(args.dropout)
batch_size = int(args.batch_size)
patience = int(args.patience)

startTime = datetime.now()

print 'Loading data...'
input_train, output_train, input_dev, output_dev, input_test, output_test, alphabet_tag, max_length = \
    utils.create_data(word_dir, vector_dir, train_dir, dev_dir, test_dir)
print 'Building model...'
time_step, input_length = np.shape(input_train)[1:]
output_length = np.shape(output_train)[2]
ner_model = network.building_ner(num_lstm_layer, num_hidden_node, dropout,
                                 time_step, input_length, output_length)
print 'Model summary...'
print ner_model.summary()
print 'Training model...'
early_stopping = EarlyStopping(patience=patience)
history = ner_model.fit(input_train,
                        output_train,
                        batch_size=batch_size,
                        epochs=1000,
                        validation_data=(input_dev, output_dev),
                        callbacks=[early_stopping])
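A hedged follow-up that is not part of ner.py: once training stops, the test split loaded above could be scored with Keras' standard evaluate call.

scores = ner_model.evaluate(input_test, output_test, batch_size=batch_size)
print('Test scores: {}'.format(scores))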
Code Example #20
# create_data.py - create and process input data into outputted JSON lists.

from utils import create_data

if __name__ == '__main__':
    create_data(
        train_folders=['../input_data/train2014', '../input_data/val2014'],
        test_folders=[
            '../input_data/BSD100/image_SRF_4',
            '../input_data/Set5/image_SRF_4', '../input_data/Set14/image_SRF_4'
        ],
        min_size=100,
        output_folder='../output_lists')

    # create_data(train_folders=['dataset/BSDS300/images/train'],
    # 			test_folders=['dataset/BSDS300/images/test'],
    # 			min_size=100,
    # 			output_folder='../output_lists')
Code Example #21
File: test_schemes.py Project: akleeman/slocum
 def get_data(self):
     scheme = self.get_scheme()
     ds = utils.create_data()
     for x in scheme.variables:
         ds = utils.add_tiny_variable(x, ds)
     return ds
Code Example #22
File: main.py Project: blind-anonymous/pti-candgen
else:
    qid2title_add = _pickle.load(open(qid2title_add_fi, "rb"))

# Merge EN and HR&LR worlds
qid2title.update(qid2title_add)

# DATA
if weight_hr == -1:  #Zero-Shot
    data_tr_sampled = "data/{}/mentions_tr_ZS_hr={}_size={}.txt".format(
        lang, hr_lang, num_data)
else:
    data_tr_sampled = "data/{}/mentions_tr_hr={}_size={}.txt".format(
        lang, hr_lang, num_data)

if not os.path.exists(data_tr_sampled):
    create_data(path_tr, path_tr_hr, data_tr_sampled, num_data, qid2title,
                weight_hr)

# TOKENIZER
if weight_hr == -1:  #Zero-Shot
    path = "data/{}/charagram_ZS_hr={}_vocabulary_size={}.pkl".format(
        lang, hr_lang, num_data)
else:
    path = "data/{}/charagram_hr={}_vocabulary_size={}.pkl".format(
        lang, hr_lang, num_data)

if os.path.exists(path):
    tokenizer = tok_ngram(data_tr_sampled, path)
    tokenizer.load()
else:
    tokenizer = tok_ngram(data_tr_sampled, path)
    tokenizer.train()
Code Example #23

lf_3d_T = lambda x: np.atleast_2d(lf_3d(x)).T
hf_3d_T = lambda x: np.atleast_2d(hf_3d(x)).T


def create_mfgp_obj(dim, lf, hf, X_hf):
    # model = models.GPDF(dim, 0.001, 2, hf, lf)
    model = models.NARGP(dim, hf, lf, add_noise=True)
    model.fit(X_hf)
    return model


if __name__ == '__main__':
    dim = 3
    X_lf, Y_lf, X_hf, Y_hf, X_test = utils.create_data(lf_3d, hf_3d, dim)
    Y_test = hf_3d_T(X_test)
    mfgp_obj = utils.create_mfgp_obj(dim,
                                     lf_3d_T,
                                     hf_3d_T,
                                     X_hf,
                                     method='GPDF',
                                     add_noise=True)
    actual_mean, actual_variance = utils.analytical_mean(
        a, constant=5), utils.analytical_var(a)
    distribution = cp.J(cp.Uniform(0, 1), cp.Uniform(0, 1), cp.Uniform(0, 1))
    temp_f = lambda x: mfgp_obj.predict(x)[0]
    cp_wrapper = cpw.ChaospyWrapper(temp_f,
                                    distribution,
                                    polynomial_order=10,
                                    quadrature_order=10)
Code Example #24
File: train.py Project: montymse/Chatbot
pattern_tag_list = []
words_to_ignore = ['?', '.', '!']

inputs = []
targets = []

# Loading data from json
intents = load_data('Data/intents.json')

# Preparing data by tokenizing and stemming
prepare_data = prepare_data(intents, tags, words, pattern_tag_list,
                            words_to_ignore)
words, tags = prepare_data

# Creating data for training
create_data = create_data(inputs, targets, pattern_tag_list, tags, words)
inputs, targets = create_data

# Training data
dataset = ChatbotDataset(inputs, targets)
train_loader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)

input_size = len(inputs[0])
hidden_size = len(tags)
output_size = len(tags)
model = NeuralNetwork(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training phase
Code Example #25
import numpy as np

import utils

if __name__ == '__main__':
    X_norm, _ = utils.create_data()
    A = utils.create_affinity_matrix(X_norm)

    D = np.diag(np.sum(A, axis=1))
    L = D - A
    eigvals, eigvecs = np.linalg.eig(L)
    # np.linalg.eig returns eigenpairs in no particular order; sort them so
    # that column 1 is the Fiedler vector (second-smallest eigenvalue).
    order = np.argsort(eigvals)
    eigvecs = eigvecs[:, order]

    n_dim = eigvecs.shape[0]
    p = np.zeros(n_dim)
    p[eigvecs[:, 1] > 0] = 1.0

    utils.show_result(X_norm, p)
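utils.create_affinity_matrix is not shown here. A minimal sketch of one common choice, a Gaussian (RBF) affinity, offered only as an illustration of what such a helper typically computes:

import numpy as np

def create_affinity_matrix(X, sigma=1.0):
    # Hypothetical RBF affinity: A[i, j] = exp(-||x_i - x_j||^2 / (2 * sigma^2)),
    # with zeros on the diagonal so no point counts as its own neighbour.
    sq_dists = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
    A = np.exp(-sq_dists / (2.0 * sigma ** 2))
    np.fill_diagonal(A, 0.0)
    return A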
Code Example #26
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==========================================================
print("Loading Data...")
process_data("data/processed/sst1.p")
x = _pickle.load(open("data/processed/sst1.p", "rb"))
revs, embedding, W2, word_idx_map, vocab, max_length = \
    x[0], x[1], x[2], x[3], x[4], x[5]
x_train, y_train, x_dev, y_dev = create_data(revs, word_idx_map, max_length,
                                             FLAGS.num_classes)

# Training
# ==========================================================
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Code that operates on the default graph and session comes here
        if FLAGS.random:
            embedding = W2
        cnn = TextCNN(seq_length=x_train.shape[1],
                      num_classes=y_train.shape[1],
                      vocab_size=len(vocab) + 1,
Code Example #27
def DirectAttributePrediction(predicate_type='binary'):
    # Get features index to recover samples
    train_index = bzUnpickle('./CreatedData/train_features_index.txt')
    test_index = bzUnpickle('./CreatedData/test_features_index.txt')
    val_index = bzUnpickle('./CreatedData/validation_features_index.txt')
    # Get classes-attributes relationship
    train_attributes = get_class_attributes('./Classes/',
                                            name='train',
                                            predicate_type=predicate_type)
    test_attributes = get_class_attributes('./Classes/',
                                           name='test',
                                           predicate_type=predicate_type)
    N_ATTRIBUTES = train_attributes.shape[1]

    # Create training Dataset
    print('Creating training dataset...')
    X_train, y_train = create_data('./CreatedData/train_featuresVGG19.pic.bz2',
                                   train_index, train_attributes)

    print('Creating seen test dataset...')
    X_test_seen, y_test_seen = create_data(
        './CreatedData/validation_featuresVGG19.pic.bz2', val_index,
        train_attributes)
    y_pred_ = np.zeros(y_test_seen.shape)
    y_proba_ = np.copy(y_pred_)

    print('X_train to dense...')
    X_train = X_train.toarray()

    print('X_test_seen to dense...')
    X_test_seen = X_test_seen.toarray()

    print('Creating test dataset...')
    X_test, y_test = create_data('./CreatedData/test_featuresVGG19.pic.bz2',
                                 test_index, test_attributes)
    y_pred = np.zeros(y_test.shape)
    y_proba = np.copy(y_pred)

    print('X_test to dense...')
    X_test = X_test.toarray()

    if predicate_type != 'binary':
        clf = NeuralNetworkRegressor(dim_features=X_train.shape[1],
                                     nb_attributes=N_ATTRIBUTES)
    else:
        clf = NeuralNetworkClassifier(dim_features=X_train.shape[1],
                                      nb_attributes=N_ATTRIBUTES)

    print('Fitting Neural Network...')
    # fix random seed for reproducibility
    # seed = 7
    # numpy.random.seed(seed)
    # X_train_, X_test_, y_train_, y_test_ = train_test_split(X_train, y_train, test_size=1, random_state=seed)
    his = clf.fit(X_train, y_train)

    print('Predicting attributes...')
    y_pred = np.array(clf.predict(X_test))
    y_pred = y_pred.reshape((y_pred.shape[0], y_pred.shape[1])).T
    y_proba = y_pred

    y_pred_ = np.array(clf.predict(X_test_seen))
    y_pred_ = y_pred_.reshape((y_pred_.shape[0], y_pred_.shape[1])).T
    y_proba_ = y_pred_

    print('Saving files...')
    np.savetxt('./DAP_' + predicate_type + '/prediction_NN', y_pred)
    np.savetxt('./DAP_' + predicate_type + '/xprediction_NN', y_pred_)
    if predicate_type == 'binary':
        np.savetxt('./DAP_' + predicate_type + '/probabilities_NN', y_proba)
        np.savetxt('./DAP_' + predicate_type + '/xprobabilities_NN', y_proba_)
Code Example #28
def Train_Mode():
 
    print ("\n","Train Mode select ... ","\n")

    assert os.path.isfile(opts.train)
    assert os.path.isfile(opts.dev)
    assert os.path.isfile(opts.test)

    try: #if a pre_emb file exists, load the pretrained embeddings.
        assert os.path.isfile(opts.pre_emb)
        with open(opts.pre_emb,"rb") as fout:
            embeddings = pickle.load(fout)
        print ("vocab loading success ... ")
    except: #if there is no pre_emb file, train with random embeddings.
        print ("embedding is random...")
        embeddings = {}
    
    Tr_data = []
    Te_data = []
    Dev_data = []

    Tr_data = data_loading(opts.train,p_dic,"+") #load training data.
    Dev_data = data_loading(opts.dev,p_dic,"+") #load development data.
    Te_data = data_loading(opts.test,p_dic,"+") #load test data.
    print ("data_loading success ...")


    word_dim = parameters["word_dim"] #word embedding dimension
    word2idx = create_word2idx(Tr_data,p_dic) #build a dictionary of the form key: word, value: index.
    print ("word2idx create success ...")

    #initial embedding-layer weights are random.
    matrix = np.random.uniform(-np.sqrt(1.0),np.sqrt(1.0),(len(word2idx),word_dim))

    for w in word2idx:
        if w in embeddings:
            matrix[word2idx[w]] = embeddings[w] #if a word in word2idx has a pretrained embedding, copy it into the matrix.
    print ("word matrix create success ...")

    with open("./Pickles/word2idx.pkl","wb") as fout: #save word2idx as a pickle.
        pickle.dump(word2idx,fout)
    print ("Pickle file save success ...")


    Tr = create_data(Tr_data,word2idx,r_dic) #convert the data into a trainable form (each morpheme and tag mapped to morpheme-level indices).
    Dev = create_data(Dev_data,word2idx,r_dic)
    Te = create_data(Te_data,word2idx,r_dic)

    #build the Pointer Networks model.
    Model = PointerNetworks(word_dim = parameters["word_dim"],\
                            lstm_width=parameters["lstm_dim"], #lstm dimensions
                            nword=len(word2idx), 
                            weights = matrix,
                            drop_rate = parameters["dropout"]) #dropout parameter.
    print ("Model Create...")

    optimizer = torch.optim.Adam(Model.parameters(), lr=0.0001) #define the optimizer with learning rate 0.0001.
    max_v = -100 #tracks the best score obtained on the dev data.

    Model.train(True)
    for e in range(opts.epoch):
        losses = []
        #random.shuffle(Tr)
        for i in range(len(Tr)):
            input = Tr[i]
            Model.zero_grad()
            train_dic = {"inputs":input["morphemes"],\
                         "prime_idx":input["prime_idx"],\
                         "istrain" : 1,\
                         "drop_rate" : 0.2,\
                         "point_idx":input["point_idx"],\
                         "pointings":input["pointings"]}

            _,predict_points,predict_labels,_1 = Model(train_dic)
            answer_point = input["point_idx"]
            answer_point = torch.LongTensor(answer_point)
            answer_label = input["label_idx"]
            answer_label = torch.LongTensor(answer_label)
            cost1 = torch.nn.functional.cross_entropy(predict_points,Variable(answer_point)) #loss for the dependency arcs.
            cost2 = torch.nn.functional.cross_entropy(predict_labels,Variable(answer_label)) #loss for the dependency labels.
            costs = 0.8*cost1 + 0.2*cost2 #weighted combination of cost1 and cost2.
            costs.backward()
            torch.nn.utils.clip_grad_norm(Model.parameters(), 5.0)
            optimizer.step()
            losses.append(float(costs)) #append the loss for this sentence.
            if len(losses) == 3000: #every 3000 sentences: print the loss, evaluate on dev data, and save the model whenever the score improves.
                print ("Epoch : "+str(e)+", sentence_num : "+str(i+1)+", average_loss : "+str(round(sum(losses)/len(losses),2))+", max_uas : "+str(max_v))
                losses = []
                Model.train(False)
                if e > 2:
                    uas,las = evaluation(Model,Dev) #evaluate on the dev data.
                    if uas > max_v: #if UAS exceeds the current best, update max_v.
                        max_v = uas
                        print ("Best Model changes ... ")
                        torch.save(Model,"BestModel.pt") #model save.
                Model.train(True)
Code Example #29
def DirectAttributePrediction(classifier='SVM',
                              predicate_type='binary',
                              C=10.0):
    # Get features index to recover samples
    train_index = bzUnpickle('./CreatedData/train_features_index.txt')
    test_index = bzUnpickle('./CreatedData/test_features_index.txt')

    # Get classes-attributes relationship
    train_attributes = get_class_attributes('./',
                                            name='train',
                                            predicate_type=predicate_type)
    test_attributes = get_class_attributes('./',
                                           name='test',
                                           predicate_type=predicate_type)
    N_ATTRIBUTES = train_attributes.shape[1]

    # Create training Dataset
    print('Creating training dataset...')
    X_train, y_train = create_data('./CreatedData/train_featuresVGG19.pic.bz2',
                                   train_index, train_attributes)

    print('X_train to dense...')
    X_train = X_train.toarray()

    Xplat_train, Xplat_val, yplat_train, yplat_val = train_test_split(
        X_train, y_train, test_size=0.10, random_state=42)

    print('Creating test dataset...')
    X_test, y_test = create_data('./CreatedData/test_featuresVGG19.pic.bz2',
                                 test_index, test_attributes)
    y_pred = np.zeros(y_test.shape)
    y_proba = np.copy(y_pred)

    print('X_test to dense...')
    X_test = X_test.toarray()

    # CHOOSING SVM
    if classifier == 'SVM':
        platt_params = []
        for i in range(N_ATTRIBUTES):
            print('--------- Attribute %d/%d ---------' %
                  (i + 1, N_ATTRIBUTES))
            t0 = time()

            # SVM classifier
            if predicate_type == 'binary':
                clf = SVMClassifier()
            else:
                clf = SVMRegressor()

            # Training
            clf.fit(X_train, y_train[:, i])
            print('Fitted classifier in: %fs' % (time() - t0))
            if predicate_type == 'binary':
                clf.set_platt_params(Xplat_val, yplat_val[:, i])

            # Predicting
            print('Predicting for attribute %d...' % (i + 1))
            y_pred[:, i] = clf.predict(X_test)
            if predicate_type == 'binary':
                y_proba[:, i] = clf.predict_proba(X_test)

            print('Saving files...')
            np.savetxt('./DAP_' + predicate_type + '/prediction_SVM', y_pred)
            if predicate_type == 'binary':
                np.savetxt('./DAP_' + predicate_type + '/platt_params_SVM',
                           platt_params)
                np.savetxt('./DAP_' + predicate_type + '/probabilities_SVM',
                           y_proba)

    # CHOOSING NEURAL NETWORK
    if classifier == 'NN':
        if predicate_type != 'binary':
            clf = NeuralNetworkRegressor(dim_features=X_train.shape[1],
                                         nb_attributes=N_ATTRIBUTES)
        else:
            clf = NeuralNetworkClassifier(dim_features=X_train.shape[1],
                                          nb_attributes=N_ATTRIBUTES)

        print('Fitting Neural Network...')
        clf.fit(X_train, y_train)

        print('Predicting attributes...')
        y_pred = np.array(clf.predict(X_test))
        y_pred = y_pred.reshape((y_pred.shape[0], y_pred.shape[1])).T
        y_proba = y_pred

        print('Saving files...')
        np.savetxt('./DAP_' + predicate_type + '/prediction_NN', y_pred)
        if predicate_type == 'binary':
            np.savetxt('./DAP_' + predicate_type + '/probabilities_NN',
                       y_proba)
Code Example #30
File: main.py Project: blind-anonymous/pti-candgen
# DATA PREPROCESSING
path_tr = "../mentions_dumps/{}/mentions_tr.txt".format(lang)
path_tr_hr = "../mentions_dumps/{}/mentions_tr.txt".format(hr_lang)

if not os.path.exists("data/{}".format(lang)):
    os.mkdir("data/{}".format(lang))

if weight_hr == -1:  # Zero-shot
    data_info = "data/{}/info_ngram_lookup_ZS_hr={}.pkl".format(lang, hr_lang)
else:
    data_info = "data/{}/info_ngram_lookup_hr={}.pkl".format(lang, hr_lang)

if os.path.exists(data_info):
    mnt2ent_hr, mnt2ent_lr, ent2ind = _pickle.load(open(data_info, "rb"))
else:
    mnt2ent_hr, mnt2ent_lr, ent2ind = create_data(path_tr, path_tr_hr,
                                                  data_info, weight_hr)

# N-GRAMS TOKENIZER
if weight_hr == -1:  # Zero-shot
    path_tkn = "data/{}/ngram_lookup_ZS_hr={}_vocabulary.pkl".format(
        lang, hr_lang)
    path_cnt_hr = "data/{}/ngram_counter_ZS_{}.pkl".format(lang, hr_lang)
    path_cnt_lr = "data/{}/ngram_counter_ZS_{}.pkl".format(lang, lang)
else:
    path_tkn = "data/{}/ngram_lookup_hr={}_vocabulary.pkl".format(
        lang, hr_lang)
    path_cnt_hr = "data/{}/ngram_counter_{}.pkl".format(lang, hr_lang)
    path_cnt_lr = "data/{}/ngram_counter_{}.pkl".format(lang, lang)

data_tr = [mnt2ent_hr, mnt2ent_lr]
if os.path.exists(path_tkn):
Code Example #31
File: test_schemes.py Project: akleeman/slocum
 def get_data(self):
     return utils.create_data()
Code Example #32
import os
from utils import create_data
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Preprocess corpus dataset')
    parser.add_argument('--folder_path',
                        type=str,
                        required=True,
                        help='required path to questions')
    parser.add_argument('--output',
                        type=str,
                        required=True,
                        help='data output filename')
    args = parser.parse_args()

    for data_type in ['training', 'test']:
        create_data(os.path.join(args.folder_path, 'questions/' + data_type),
                    data_type + '_' + args.output)
Code Example #33
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
if __name__ == '__main__':

    # --------------------------------------------------------------------------
    # initialize the logger
    FMT_LOG = '%(asctime)s - %(name)s:%(funcName)s:%(lineno)s - %(levelname)s - %(message)s'
    sh = logging.StreamHandler()
    sh.setFormatter(logging.Formatter(FMT_LOG))

    LOGGER.setLevel(logging.DEBUG)
    LOGGER.addHandler(sh)

    # --------------------------------------------------------------------------
    # create a fake 2D dataset
    x, _, _, _ = create_data()  # only the first return value is relevant
    LOGGER.info('Created {} data'.format(x.shape))

    # let's split this data into three parts
    ndata, dim = x.shape[0], x.shape[1]
    a, b = ndata // 2, ndata // 2 + ndata // 4
    x1, x2, x3 = x[:a], x[a:b], x[b:]

    LOGGER.info('Split the data: {}, {}, and {}'.format(
        x1.shape, x2.shape, x3.shape))

    # --------------------------------------------------------------------------
    # here, we will create a dynim hdspace with approx metric
    # we will train this hdspace using some data and save (as a faiss index)

    # nlist and nprobes are the key parameters that define
Code Example #34
import numpy as np
import scipy as sp
from sklearn.cluster import KMeans
import utils

if __name__ == '__main__':
    K = 4
    X_norm, z = utils.create_data()
    X_norm = np.concatenate((X_norm, X_norm + (0, 3.2)))
    z = np.concatenate((z, z + 2))

    A = utils.create_affinity_matrix(X_norm)
    Q = utils.create_constraint_matrix(z)
    
    D = np.diag(np.sum(A, axis=1))
    vol = np.sum(A)

    D_norm = np.linalg.inv(np.sqrt(D))
    L_norm = np.eye(*A.shape) - D_norm.dot(A.dot(D_norm))
    Q_norm = D_norm.dot(Q.dot(D_norm))

    # alpha < K-th eigenval of Q_norm
    alpha = 0.6 * sp.linalg.svdvals(Q_norm)[K]
    Q1 = Q_norm - alpha * np.eye(*Q_norm.shape)
    
    val, vec = sp.linalg.eig(L_norm, Q1)
    
    vec = vec[:,val >= 0]
    vec_norm = (vec / np.linalg.norm(vec, axis=0)) * np.sqrt(vol)

    costs = np.multiply(vec_norm.T.dot(L_norm), vec_norm.T).sum(axis=1)