# Training configuration and input preparation for the Sawaref sequence model.

EPOCHS = 1
# DIGITS = 3
# REVERSE = True

# Try replacing GRU, or SimpleRNN.
RNN = layers.LSTM
HIDDEN_SIZE = 128
BATCH_SIZE = 128
LAYERS = 1
SENTLEN = 1
EMBEDDINGS = 100
ITERATIONS = 1

# Load the joined features; `questions` and `expected` are used below to build
# the input and output symbol tables respectively.
sawarefData = SawarefData(MYPATH, EMBEDDINGS)
questions, expected, embeddings = sawarefData.get1DJoinedFeatures()

# "-" is always added to both symbol sets, on top of the symbols found in the
# data (presumably as a padding/unknown marker -- TODO confirm).
ctable_x = CharacterTable(set("-").union(set(questions)))
ctable_y = CharacterTable(set("-").union(set(expected)))

# Debug output: show the output symbol table and one encode/decode round.
print(ctable_y.chars)
print(ctable_y.encode(["QAposV"], 1))
print(ctable_y.decode([0, 0], False))

# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
# MAXLEN = DIGITS + 1 + DIGITS

print('Total ayat questions:', len(questions))

print('Vectorization...')
# NOTE(review): the statement below is cut off in the visible source (its
# remaining arguments and closing parenthesis are missing). It is kept as a
# comment so the script parses; restore the complete call before running.
# x = np.zeros((len(questions), SENTLEN, len(ctable_x.chars) + EMBEDDINGS),
import math
from os.path import isfile

from hyperopt import Trials, tpe, STATUS_OK, fmin, hp


class colors:
    """ANSI escape sequences used to colour terminal output."""

    ok = '\033[92m'     # bright green foreground
    fail = '\033[91m'   # bright red foreground
    close = '\033[0m'   # reset all attributes


#__ENCODING__
SIGNS = "+"
CHARS = "0123456789 " + SIGNS
CHAR_TABLE = CharacterTable(CHARS)

#__PARAMETERS__
# The first three entries are hyperopt search-space nodes; each currently
# offers a single option, so the search is effectively fixed. The remaining
# entries are plain constants.
PARAMETERS = {
    "RNN": hp.choice("RNN", [layers.LSTM]),
    "HIDDEN_SIZE": hp.choice("HIDDEN_SIZE", [64]),
    "LAYERS": hp.choice("LAYERS", [1]),
    "N_EPOCH": 3,
    "N_BATCH": 1,
    "LIMIT": 100,
    "N_EXAMPLES": 10000,
    "ITERATIONS": 10,
    "CHAR_TABLE": CHAR_TABLE,
    "CHARS": CHARS,
    "SIGNS": SIGNS,
}

# NOTE(review): the statement below is cut off in the visible source (the
# argument of math.ceil and its closing parenthesis are missing). It is kept
# as a comment so the module parses; restore the complete expression.
# PARAMETERS["LENGTH"] = math.ceil(
# In[31]:

# c. pad joined morphemes
# STRING_LENGTH is the length of the longest string across all columns named
# in `strings`; each column is then passed through padStringWithSpaces.
e.STRING_LENGTH = max(len(x) for k in strings for x in df_strings[k])
for s in strings:
    df_strings[s] = df_strings[s].apply(padStringWithSpaces)


# In[32]:

# d. encode them in one hot encoding
# The character set is every character occurring in any of the string columns,
# plus "+" and "-".
charset = set("+").union(
    *("".join(df_strings[x] + "-") for x in strings))
ctable = CharacterTable(charset, e.STRING_LENGTH)
### Now we have one shape for all strings: (STRING_LENGTH, len(charset))
for x in strings:
    df_strings[x + "_onehot"] = df_strings[x].apply(ctable.encode)

# Append a running row number as an extra index level.
# NOTE(review): these two statements are placed outside the loop above; the
# original indentation was lost, so confirm this matches the intended scope.
df_strings['num'] = list(range(len(df_strings)))
df_strings.set_index('num', append=True, inplace=True)


# In[33]:

# e. remove diac
for x in strings:
    df_strings[x + "_undiac"] = df_strings[x].apply(removeDiac)


# In[54]: