import os
import sys

import numpy as np
import pandas as pd

sys.path.append('../')
sys.path.append('../tools')

from tools import conf
from tools import load_data

hash_length = conf.chunk_hash_length

output_length = conf.chunk_NP_length

split_rate = conf.chunk_split_rate
batch_size = conf.batch_size
nb_epoch = conf.nb_epoch

model_name = os.path.basename(__file__)[:-3]

folder_path = 'model/%s' % model_name
if not os.path.isdir(folder_path):
    os.makedirs(folder_path)

# the data, shuffled and split into train and dev sets
train_data, dev_data = load_data.load_chunk(dataset='train.txt',
                                            split_rate=split_rate)

train_samples = len(train_data)
dev_samples = len(dev_data)
print('train shape:', train_samples)
print('dev shape:', dev_samples)
print()

# pretrained SENNA word embeddings, padded with an all-zero row (index 0, used
# for padding) and a random row (for out-of-vocabulary words)
word_embedding = pd.read_csv('../preprocessing/senna/embeddings.txt',
                             delimiter=' ',
                             header=None)
word_embedding = word_embedding.values
emb_length = word_embedding.shape[1]  # embedding dimensionality, taken from the file
word_embedding = np.concatenate([
    np.zeros((1, emb_length)), word_embedding,
    np.random.uniform(-1, 1, (1, emb_length))
])
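
# A minimal sketch (not part of the original snippet) of how a padded matrix
# like word_embedding is typically wired into a Keras Embedding layer; the
# variable name word_embedding_layer is an illustrative assumption.
from keras.layers import Embedding

word_embedding_layer = Embedding(input_dim=word_embedding.shape[0],
                                 output_dim=emb_length,
                                 weights=[word_embedding],
                                 mask_zero=True,  # row 0 is the all-zero padding vector
                                 input_length=conf.chunk_step_length)
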
Example #2
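
# This variant trains on all chunk types (chunk_type="ALL") instead of NP
# chunks only, and fixes the number of epochs at 70.
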
import os
import sys

import numpy as np
import pandas as pd

sys.path.append('../')
sys.path.append('../tools')

from tools import conf
from tools import load_data

hash_length = conf.chunk_hash_length

output_length = conf.chunk_ALL_length

split_rate = conf.chunk_split_rate
batch_size = conf.batch_size
nb_epoch = 70  # conf.nb_epoch

model_name = os.path.basename(__file__)[:-3]

folder_path = 'model/%s' % model_name
if not os.path.isdir(folder_path):
    os.makedirs(folder_path)

# the data, shuffled and split into train and dev sets
train_data, dev_data = load_data.load_chunk(dataset='train.txt',
                                            split_rate=split_rate,
                                            chunk_type="ALL")

train_samples = len(train_data)
dev_samples = len(dev_data)
print('train shape:', train_samples)
print('dev shape:', dev_samples)
print()

word_embedding = pd.read_csv('../preprocessing/senna/embeddings.txt',
                             delimiter=' ',
                             header=None)
word_embedding = word_embedding.values
emb_length = word_embedding.shape[1]  # embedding dimensionality, taken from the file
word_embedding = np.concatenate([
    np.zeros((1, emb_length)), word_embedding,
    np.random.uniform(-1, 1, (1, emb_length))
])

# randomly initialized hash-feature embeddings, padded with a zero row and a
# random row in the same way as the word embeddings above
random_embedding = pd.read_csv('../preprocessing/random/chunk_embeddings.txt',
                               delimiter=' ',
                               header=None)
random_embedding = random_embedding.values
random_embedding = np.concatenate([
    np.zeros((1, hash_length)), random_embedding,
    np.random.rand(1, hash_length)
])

import os
import sys

import numpy as np

from keras.models import load_model

sys.path.append('../')
sys.path.append('../tools')

from tools import conf
from tools import load_data

np.random.seed(0)

# input sentence dimensions
step_length = conf.chunk_step_length
pos_length = conf.chunk_pos_length
IOB = conf.chunk_NP_IOB_decode

split_rate = conf.chunk_split_rate

# command-line arguments: the split to decode ("dev" or "test") and the epoch
# of the saved model to load
data = sys.argv[1]

best_epoch = sys.argv[2]

if data == "dev":
    train_data, test_data = load_data.load_chunk(dataset='train.txt',
                                                 split_rate=split_rate)
elif data == "test":
    test_data = load_data.load_chunk(dataset='test.txt')
# total number of tokens in the selected split
tokens = [len(x[0]) for x in test_data]
print(sum(tokens))
print('%s shape:' % data, len(test_data))

model_name = os.path.basename(__file__)[9:-3]
folder_path = './model/%s' % model_name

model_path = '%s/model_epoch_%s.h5' % (folder_path, best_epoch)
result = open('%s/predict.txt' % folder_path, 'w')

print('loading model...')
model = load_model(model_path)
print('loading model finished.')
# add path
import os
import sys

from keras.models import load_model

sys.path.append('../')
sys.path.append('../tools')


from tools import conf
from tools import load_data
from tools import prepare

# input sentence dimensions
step_length = conf.chunk_step_length
pos_length = conf.chunk_pos_length

IOB = conf.chunk_NP_IOB_decode

test_data = load_data.load_chunk(dataset='test.txt')

best_epoch = sys.argv[1]

model_name = os.path.basename(__file__)[9:-3]
folder_path = './model/%s' % model_name

model_path = '%s/model_epoch_%s.h5' % (folder_path, best_epoch)
result = open('%s/predict.txt' % folder_path, 'w')


print('loading model...')
model = load_model(model_path)
print('loading model finished.')

for each in test_data:
    pass  # per-sentence prediction loop (body not included in this snippet)
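
# Example: export the hidden-layer activations of a trained word-hashing
# auto-encoder as 128-dimensional embeddings for every word in the
# CoNLL-2000 chunking data.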
import sys

import pandas as pd

from keras.models import load_model

sys.path.append('../')
sys.path.append('../tools')

from tools import load_data
from tools import prepare

model_path = './model/word-hash-2-auto-encoder-128/hidden_model_epoch_26.h5'

w = open('../preprocessing/chunk-auto-encoder-2/conll2000-word.lst', 'w')
embeddings = pd.DataFrame(columns=range(128))

print('loading model...')
encoder = load_model(model_path)
print('loading model finished.')

train_data, dev_data = load_data.load_chunk(dataset='train.txt',
                                            split_rate=0.9)
test_data = load_data.load_chunk(dataset='test.txt')

all_word = []

# collect every word that appears in the train, dev and test sets
for each in train_data:
    all_word.extend(list(each[0]))
for each in dev_data:
    all_word.extend(list(each[0]))
for each in test_data:
    all_word.extend(list(each[0]))

all_word = [each.strip().lower() for each in all_word]
all_word = list(set(all_word))

for i, word in enumerate(all_word):
    w.write(word + '\n')
    word_hashing = prepare.prepare_auto_encoder(batch=[word],