Ejemplo n.º 1
0
    if shuffle:
        idxs = random.sample(range(len(X)), len(X))
        for i, idx in enumerate(idxs):
            tmpx, tmpy = X[idx], y[idx]
            X[idx], y[idx] = X[i], y[i]
            X[i], y[i] = tmpx, tmpy

    dev_X, dev_y = X[-1 * val_size:], y[-1 * val_size:]
    X, y = X[:-1 * val_size], y[:-1 * val_size]
    return X, y, dev_X, dev_y


# Load data
#
# The train, dev and test splits are read, in that order, from their `zh`
# release directories.  Every call passes ignore_types=["Explicit", "AltLex"];
# only the train and dev calls additionally pass partial_sampling=True.

trainset, devset, testset = (
    resources.read_relations(
        split_dir, ignore_types=["Explicit", "AltLex"], **split_options)
    for split_dir, split_options in (
        ("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-zh-01-08-2016-train/",
         {"partial_sampling": True}),
        ("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-zh-01-08-2016-dev/",
         {"partial_sampling": True}),
        ("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-zh-01-08-2016-test/",
         {}),
    )
)

# Disabled alternative: the same three loads pointed at the `en` release
# directories.  It sits in a bare string literal, so it is never executed.
"""
trainset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-train/", ignore_types=["Explicit", "AltLex"], partial_sampling=True)
devset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-dev/", ignore_types=["Explicit", "AltLex"], partial_sampling=True)
testset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-test/", ignore_types=["Explicit", "AltLex"])
"""

# Maximum input sequence length.
max_len = 256
Ejemplo n.º 2
0
        idxs = random.sample(range(len(X)), len(X))
        for i, idx in enumerate(idxs):
            tmpx, tmpy = X[idx], y[idx]
            X[idx], y[idx] = X[i], y[i]
            X[i], y[i] = tmpx, tmpy

    dev_X, dev_y = X[-1 * val_size:], y[-1 * val_size:]
    X, y = X[:-1 * val_size], y[:-1 * val_size]
    return X, y, dev_X, dev_y


## Load data

# Chinese (active): the train, dev, test and blind-test splits are read in
# that order.  Every call passes ignore_types=["Explicit", "AltLex"]; only the
# train and dev calls additionally pass partial_sampling=True.
trainset, devset, testset, blindset = (
    resources.read_relations(
        split_dir, ignore_types=["Explicit", "AltLex"], **split_options)
    for split_dir, split_options in (
        ("conll16st-zh-01-08-2016-train/", {"partial_sampling": True}),
        ("conll16st-zh-01-08-2016-dev/", {"partial_sampling": True}),
        ("conll16st-zh-01-08-2016-test/", {}),
        ("conll16st-zh-04-27-2016-blind-test/", {}),
    )
)

# English (disabled): uncomment these four lines, and disable the Chinese
# loads above, to read the English directories instead.
#trainset = resources.read_relations("conll16st-en-03-29-16-train/", ignore_types=["Explicit", "AltLex"], partial_sampling=True)
#devset = resources.read_relations("conll16st-en-03-29-16-dev/", ignore_types=["Explicit", "AltLex"], partial_sampling=True)
#testset = resources.read_relations("conll16st-en-03-29-16-test/", ignore_types=["Explicit", "AltLex"])
#blindset = resources.read_relations("conll15st-en-03-29-16-blind-test/", ignore_types=["Explicit", "AltLex"])

# Maximum input sequence length.
max_len = 256
Ejemplo n.º 3
0
	X, y = np.concatenate((dev_X,X)), np.concatenate((dev_y,y))
	if shuffle:
		idxs = random.sample(range(len(X)), len(X))
		for i, idx in enumerate(idxs):
			tmpx, tmpy = X[idx], y[idx]
			X[idx], y[idx] = X[i], y[i]
			X[i], y[i] = tmpx, tmpy

	dev_X, dev_y = X[-1*val_size:], y[-1*val_size:]
	X, y = X[:-1*val_size], y[:-1*val_size]
	return X, y, dev_X, dev_y


## Load data

# The train, dev and test splits are read in that order.  Every call passes
# ignore_types=["Explicit", "AltLex"]; only the train and dev calls
# additionally pass partial_sampling=True.
# NOTE(review): all three splits are read from the same "data/en.test/"
# directory -- confirm this is an intentional smoke-test configuration.
trainset, devset, testset = (
	resources.read_relations(
		"data/en.test/", ignore_types=["Explicit", "AltLex"], **split_options)
	for split_options in (
		{"partial_sampling": True},
		{"partial_sampling": True},
		{},
	)
)

# Disabled alternative: the same three loads pointed at the `en` release
# directories.  It sits in a bare string literal, so it is never executed.
"""
trainset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-train/", ignore_types=["Explicit", "AltLex"], partial_sampling=True)
devset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-dev/", ignore_types=["Explicit", "AltLex"], partial_sampling=True)
testset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-test/", ignore_types=["Explicit", "AltLex"])
"""


# Fixed maximum input sequence length.  The commented-out alternative below
# would instead set it as a percentile of the actual sample lengths.
#max_perc = 98.0
#max_len = int(np.percentile([len(smpl[0]) for smpl in trainset+devset+testset], max_perc))
max_len = 256
print("Maximum sequence length", max_len)