Example #1
0
# --- remaining CLI flags (the argparse parser `ap` is defined above this chunk) ---
ap.add_argument("--w-featdrop", type=float, default = 0.2)
ap.add_argument("--rnn", default = 'GRU')

opt = ap.parse_args()
o = opt  # short alias used throughout the script

# Load the serialized train and test corpora (`load` is a project helper;
# presumably returns an object with .docs/.labels/.chars/.words — verify).
data_trn = load(opt.input_prefix)
data_tst = load(opt.test_prefix)

# One-hot encode training labels for a categorical (softmax) output.
trn_labels = to_categorical(np.array(data_trn.labels))

# Default the char sequence length to the longest training document.
if not o.c_maxlen:
    o.c_maxlen = np.max(data_trn.len_char)

# Character vocabulary: keep only chars seen more than c_cutoff times.
c_vocab = Counter({k:v for k,v in data_trn.chars.items() if v > o.c_cutoff})
# Convert docs to padded char-index sequences. NOTE(review): doc_to_numseq
# is a project helper — assumed to return (sequence array, vocab); confirm.
c_trn, _ = doc_to_numseq(np.array(data_trn.docs), vocab=c_vocab,
        pad=o.c_maxlen)
c_tst, _ = doc_to_numseq(np.array(data_tst.docs), vocab=c_vocab,
        pad=o.c_maxlen)

# Same procedure at the word level.
if not o.w_maxlen:
    o.w_maxlen = np.max(data_trn.len_word)

w_vocab =  Counter({k:v for k,v in data_trn.words.items() if v > o.w_cutoff})
w_trn, _ = doc_to_numseq(np.array(data_trn.docs), vocab=w_vocab,
        tokenizer="word", pad=o.w_maxlen)
w_tst, _ = doc_to_numseq(np.array(data_tst.docs), vocab=w_vocab,
        tokenizer="word", pad=o.w_maxlen)

# Keras model inputs: one branch for char indices, one for word indices.
c_inp = Input(shape=(o.c_maxlen, ), name='char_input')
w_inp = Input(shape=(o.w_maxlen, ), name='word_input')
Example #2
0
# One-hot labels for the train/dev folds (indices computed above this chunk).
trn_labels = to_categorical(np.array(data.labels)[trn_idx])
dev_labels = to_categorical(np.array(data.labels)[dev_idx])

# Random hyperparameter search: sample up to `search_iter` configurations,
# skipping configurations already tried via the key `h`.
search_iter = 1000
search_done = set()
for _ in range(search_iter):
    o, h = Options.sample()  # o = sampled options; h = hashable config key
    if h in search_done: continue
    search_done.add(h)

    # Default char sequence length to the longest document, then build the
    # char vocabulary (chars occurring more than c_cutoff times).
    if not o.c_maxlen:
        o.c_maxlen = np.max(data.len_char)
    c_vocab = Counter({k: v for k, v in data.chars.items() if v > o.c_cutoff})
    # Padded char-index sequences per fold. NOTE(review): doc_to_numseq is a
    # project helper — assumed to return (sequence array, vocab); confirm.
    c_trn, _ = doc_to_numseq(np.array(data.docs)[trn_idx],
                             vocab=c_vocab,
                             pad=o.c_maxlen)
    c_dev, _ = doc_to_numseq(np.array(data.docs)[dev_idx],
                             vocab=c_vocab,
                             pad=o.c_maxlen)

    # Same procedure at the word level.
    if not o.w_maxlen:
        o.w_maxlen = np.max(data.len_word)
    w_vocab = Counter({k: v for k, v in data.words.items() if v > o.w_cutoff})
    w_trn, _ = doc_to_numseq(np.array(data.docs)[trn_idx],
                             vocab=w_vocab,
                             tokenizer="word",
                             pad=o.w_maxlen)
    # NOTE(review): this call is truncated by the snippet boundary — the
    # trailing arguments (pad=...) are cut off in the scraped source.
    w_dev, _ = doc_to_numseq(np.array(data.docs)[dev_idx],
                             vocab=w_vocab,
                             tokenizer="word",
Example #3
0
# Take the first fold of a shuffle split (`ssp` is defined above this chunk;
# presumably a sklearn StratifiedShuffleSplit — verify against the imports).
trn_idx, dev_idx = list(ssp.split(data.docs, data.labels))[0]

# One-hot labels for the two folds.
trn_labels = to_categorical(np.array(data.labels)[trn_idx])
dev_labels = to_categorical(np.array(data.labels)[dev_idx])

# Random hyperparameter search with duplicate-configuration skipping.
search_iter = 1000
search_done = set()
for _ in range(search_iter):
    o, h = Options.sample()  # o = sampled options; h = hashable config key
    if h in search_done: continue
    search_done.add(h)

    # Character-level features: default max length, frequency-cutoff vocab,
    # then padded index sequences per fold. NOTE(review): doc_to_numseq is a
    # project helper — assumed to return (sequence array, vocab); confirm.
    if not o.c_maxlen:
        o.c_maxlen = np.max(data.len_char)
    c_vocab = Counter({k:v for k,v in data.chars.items() if v > o.c_cutoff})
    c_trn, _ = doc_to_numseq(np.array(data.docs)[trn_idx], vocab=c_vocab,
            pad=o.c_maxlen)
    c_dev, _ = doc_to_numseq(np.array(data.docs)[dev_idx], vocab=c_vocab,
            pad=o.c_maxlen)

    # Word-level features, same procedure.
    if not o.w_maxlen:
        o.w_maxlen = np.max(data.len_word)
    w_vocab =  Counter({k:v for k,v in data.words.items() if v > o.w_cutoff})
    w_trn, _ = doc_to_numseq(np.array(data.docs)[trn_idx], vocab=w_vocab,
            tokenizer="word", pad=o.w_maxlen)
    w_dev, _ = doc_to_numseq(np.array(data.docs)[dev_idx], vocab=w_vocab,
            tokenizer="word", pad=o.w_maxlen)

    # Keras model inputs for the char and word branches.
    c_inp = Input(shape=(o.c_maxlen, ), name='char_input')
    w_inp = Input(shape=(o.w_maxlen, ), name='word_input')

    # Char embedding; the "+ 4" presumably reserves ids for special tokens
    # (pad/unk/etc.) — TODO confirm against doc_to_numseq. NOTE(review):
    # this statement is truncated by the snippet boundary.
    c_emb = Embedding(len(c_vocab) + 4, o.c_embdim, mask_zero=True,