Exemple #1
0
# --- Run configuration ------------------------------------------------------
# Module-level settings for this script. Only SENTLEN and EMBEDDINGS are used
# in the visible part of the file; the roles noted for the other names are
# inferred from their names -- TODO confirm against the code further down.

# Recurrent layer class. Swap in layers.GRU or layers.SimpleRNN to try a
# different cell type.
RNN = layers.LSTM
HIDDEN_SIZE = 128
LAYERS = 1

# SENTLEN is the size of the second axis of the input array built below;
# EMBEDDINGS is passed to SawarefData and added to the width of the last axis.
SENTLEN = 1
EMBEDDINGS = 100

# Training-loop settings (presumably epochs per fit call, mini-batch size and
# number of outer iterations -- verify where they are consumed).
EPOCHS = 1
BATCH_SIZE = 128
ITERATIONS = 1

# Disabled settings, kept for reference:
# DIGITS = 3
# REVERSE = True

# Load the data set; EMBEDDINGS is forwarded to the loader.
sawarefData = SawarefData(MYPATH, EMBEDDINGS)
questions, expected, embeddings = sawarefData.get1DJoinedFeatures()

# One symbol table per side (inputs and targets). Each vocabulary is the set
# of distinct items on that side plus the extra "-" symbol.
ctable_x = CharacterTable({"-"} | set(questions))
ctable_y = CharacterTable({"-"} | set(expected))

# Quick sanity output: the target vocabulary, then one encode call and one
# decode call on fixed sample arguments.
print(ctable_y.chars)
print(ctable_y.encode(["QAposV"], 1))
print(ctable_y.decode([0, 0], False))

# NOTE(review): the following comment describes arithmetic input of the form
# 'int + int' (e.g., '345+678'), where each int is at most DIGITS long.
# DIGITS is commented out in this file, so this note and the disabled MAXLEN
# line below look like leftovers from another example -- confirm and remove.
# MAXLEN = DIGITS + 1 + DIGITS

# Report how many input items were loaded.
print('Total ayat questions:', len(questions))

# Progress marker printed before the input array is allocated.
print('Vectorization...')
x = np.zeros((len(questions), SENTLEN, len(ctable_x.chars) + EMBEDDINGS),
Exemple #2
0
import math
from os.path import isfile

from hyperopt import Trials, tpe, STATUS_OK, fmin, hp


class colors:
    """ANSI escape sequences for colouring terminal output.

    Wrap a message as ``colors.ok + text + colors.close`` (or ``colors.fail``)
    so that the terminal colour is restored after the message.
    """

    # Bright green foreground (SGR code 92).
    ok = '\033[92m'
    # Bright red foreground (SGR code 91).
    fail = '\033[91m'
    # Reset all text attributes (SGR code 0).
    close = '\033[0m'


#__ENCODING__
# Alphabet handed to CharacterTable: the ten decimal digits, a space, and the
# operator sign(s) listed in SIGNS.
SIGNS = "+"
CHARS = "0123456789 " + SIGNS
CHAR_TABLE = CharacterTable(CHARS)
#__PARAMETERS__
# Experiment configuration. The first three entries are hyperopt `hp.choice`
# expressions (each currently offering a single candidate); the remaining
# entries are plain values.
# NOTE(review): the meaning of the individual numeric keys is not visible in
# this part of the file -- the inline notes below are inferred from the key
# names and should be confirmed where the dict is consumed.
PARAMETERS = {
    "RNN": hp.choice("RNN", [layers.LSTM]),
    "HIDDEN_SIZE": hp.choice("HIDDEN_SIZE", [64]),
    "LAYERS": hp.choice("LAYERS", [1]),
    "N_EPOCH": 3,  # presumably epochs per training call
    "N_BATCH": 1,  # presumably batch size (or batch count) -- TODO confirm
    "LIMIT": 100,  # presumably an upper bound on the generated operands
    "N_EXAMPLES": 10000,  # presumably number of generated samples
    "ITERATIONS": 10,  # presumably number of outer training iterations
    # The encoding objects defined above are shipped with the configuration.
    "CHAR_TABLE": CHAR_TABLE,
    "CHARS": CHARS,
    "SIGNS": SIGNS
}
PARAMETERS["LENGTH"] = math.ceil(

# In[31]:

# c. pad joined morphemes
# The longest entry found in any of the `strings` columns fixes the common
# length stored on `e`; it is computed before padding because
# padStringWithSpaces presumably pads to e.STRING_LENGTH (verify in its
# definition).
e.STRING_LENGTH = max(
    len(text) for column in strings for text in df_strings[column]
)
# Replace each column with its padded version, one column at a time.
for s in strings:
    df_strings[s] = df_strings[s].apply(padStringWithSpaces)


# In[32]:

# d. encode them in one hot encoding
# The alphabet is "+" plus every character that occurs in any column listed
# in `strings`. Appending "-" to each entry before joining also places "-" in
# the alphabet whenever a column has at least one row. set.union accepts any
# iterable, so the joined strings are passed directly without wrapping them
# in an intermediate set or list.
charset = set("+").union(
    *("".join(df_strings[x] + "-") for x in strings))
ctable = CharacterTable(charset, e.STRING_LENGTH)
### Now we have one shape for all strings: (STRING_LENGTH, len(charset))
# Store the encoded form of every column next to it as "<column>_onehot".
for x in strings:
    df_strings[x + "_onehot"] = df_strings[x].apply(ctable.encode)
# Add a running row number (0..n-1) and append it to the existing index as an
# extra level. pandas accepts a range as column values, so no list is built.
df_strings['num'] = range(len(df_strings))
df_strings.set_index('num', append=True, inplace=True)


# In[33]:

# e. remove diac
# For every column in `strings`, store the result of removeDiac applied to
# each entry in a sibling column named "<column>_undiac". The source column
# itself is left unchanged.
for x in strings:
    df_strings[x + "_undiac"] = df_strings[x].apply(removeDiac)


# In[54]: