def get_input_text(filename, redditor, train_len, posts=200):
    """Return (full_text, pruned): the whole corpus plus a train_len slice aligned to sentence boundaries."""
    input_text = "#FAILED_READING_FILE<{}>#".format(filename)
    if filename:
        with open(filename) as f:
            input_text = f.read()
    else:
        input_text = redditor_text(redditor, posts, False, REDDIT_MODE)
    # Pick a random start offset, leaving room for train_len plus some slack.
    start = int(random() * (len(input_text) - train_len - 100))
    if (train_len + 100) >= len(input_text):
        start = 0
    try:
        # Advance to just past the next sentence boundary.
        start = input_text.index(".", start) + 1
        input_text = input_text.strip()
    except ValueError:
        start = 0
    pruned = input_text[start:start + train_len]
    # Trim the slice back so it also ends on a full sentence.
    pruned = ".".join(pruned.split(".")[:-1]) + "."
    pruned = pruned.strip()
    # print("Pruned:", pruned)
    return input_text, pruned
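
# Hedged usage sketch for get_input_text -- "corpus.txt" is a hypothetical
# file name, not something this script ships with. With a real file in place:
#
#   full, sample = get_input_text("corpus.txt", None, train_len=50000)
#   # "full" is the whole corpus; "sample" is a ~50k-char slice that starts
#   # just past a "." and is trimmed back so it also ends on one.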
def run():
    # ######################### MAKE TRAINING DATA #########################
    # input_text = "Ein Test-Text, hurra!"  # (German: "A test text, hooray!")
    # pruned = " ".join(input_text.split(" ")[:4])
    banner("--")
    banner("Run() starting, getting data..")
    input_text, pruned = get_input_text(TEXT_FILE, REDDITOR, TRAIN_LEN, posts=NUM_POSTS)
    if REPLACE_CAPS:
        input_text = replace_caps(input_text)
        pruned = replace_caps(pruned)

    # Force everything into a cp1252-representable form so later encodes can't fail.
    # codec = "ascii"
    codec = "cp1252"
    input_text = input_text.encode(codec, errors='xmlcharrefreplace').decode(codec)
    pruned = pruned.encode(codec, errors='xmlcharrefreplace').decode(codec)
    with open("run{}-input_text-cp1252.txt".format(RUN_ID), "wb") as f:
        f.write(input_text.encode("cp1252", errors="replace"))

    # input_text = pruned = "abbcccdddd eeeeeffffff abc def? " * 8
    print("---------------------\ninput -400:\n", input_text[:400])
    print("---------------------\npruned -400:\n", pruned[:400])
    print("Total text length: {}, training set {}".format(len(input_text), len(pruned)))

    # v = RandomVectorizer(". " + input_text)
    # v = OneCharacterVectorizer(". " + input_text)
    # v = TimeVectorizer2Lemma(input_text)  # <- for when using REDDIT_MODE="TEXT", not words
    LIMIT = 1000
    if REDDIT_MODE == "TEXT":
        LIMIT = 100
    print("LIMITING DICTIONARY TO ", LIMIT)
    banner("Vectorizing")
    v = TimeVectorizerNoUnknown(input_text, cutoff=LIMIT)
    print("Len vectorizer:", len(v.dictionary))
    if load_vectorizer_file:
        v.load(load_vectorizer_file)
    print("Saving vectorizer")
    v.save("run{}-vector.pkl".format(RUN_ID))
    input_mat = v.to_matrix(pruned)

    # (the print below throws an ascii/unicode error)
    # print("\n", type(v), " Dictionary: {}".format(v.dictionary.encode("ascii", errors="ignore")))
    # my = v.to_matrix('my')
    # print("v.to_matrix('my')", my)
    # print("my[0]", v.from_vector_rand_no_dummy(list(my[0]), 0.1, unknown_token_value="#?#"))
    # print("my", v.from_vector_rand_no_dummy(list(my), 0.1, unknown_token_value="#?#"))
    # for _ in range(500):
    #     x = v.vector(input_text[randint(0, len(input_text))])
    #     print(v.from_vector_rand(x, 0.5, unknown_token_value="#?#"), end="")
    print(v.dictionary)
    print("")
    # print("?? my == ", v.from_matrix(my))
    # from time import sleep
    # sleep(4)
    # print("Dictionary:", ["".join(str(x)) for x in v.dictionary])
    # lemma = choice(v.dictionary)
    # print("dictionary choice:", lemma)
    # print("vector", v.vector(lemma))
    # print("index", v.index(lemma))

    # # check if mapping works
    # for num, i in enumerate(input_mat[0:10]):
    #     debug_vec_print(v, i, "input[{}]".format(num))
    # print("input_mat[:2] : ", np.array(input_mat[:2]))
    # anneal_mat = anneal_matrix(input_mat)
    # # check if anneal-mapping works
    # for num, i in enumerate(anneal_mat[0:10]):
    #     debug_vec_print(v, i, "input[{}]".format(num))

    # ##### ###### ###### MAKE NETWORK ###### ###### ######
    banner("Compiling net")
    categories = v.len()
    if args.fromyamlfile:
        with open(args.fromyamlfile, "rt") as yamlfile:
            yaml_string = yamlfile.read()
        net = model_from_yaml(yaml_string)
    else:
        net = make_net(categories, categories, hidden_size=HIDDEN_NEURONS)
    # from keras.utils.dot_utils import Grapher
    # Grapher().plot(net, 'run{}-model.png'.format(RUN_ID))
    # ^ needs pydot; pydot doesn't seem to work on Python 3.4
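
    # make_net is defined elsewhere in this script. A rough sketch of the kind
    # of model it presumably builds (an assumption, not the actual definition;
    # the nb_epoch/show_accuracy calls below point at the old Keras 0.x API):
    #
    #   from keras.models import Sequential
    #   from keras.layers.core import Dense, Activation
    #   from keras.layers.recurrent import LSTM
    #
    #   def make_net(n_in, n_out, hidden_size=512):
    #       net = Sequential()
    #       net.add(LSTM(hidden_size, input_shape=(WINDOW_LEN, n_in)))
    #       net.add(Dense(n_out))
    #       net.add(Activation('softmax'))
    #       net.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    #       return net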
with open("run{}-model.yaml".format(RUN_ID),"wt") as jsonfile: jsonfile.write(net.to_yaml()) banner("Net compiled!") if load_weights_file: print("/// Loading weights from {} as per argument!".format(load_weights_file)) net = load_weights(net,load_weights_file) banner("Make dataset..") # X,y = make_dataset_n(input_mat,v,WINDOW_LEN) X, y = make_dataset_single_predict(input_mat, v, WINDOW_LEN,step=WINDOW_STEP) del input_mat #print("----------X-----------\n", X) if False: print("Shapes: X", X.shape, "y", y.shape) print("X - {} entries".format(len(X))) print("Shape X[0]", X[0].shape) if True: debug_vec_print(v, X[0][0], "X[0][0]") debug_vec_print(v, X[0][1], "X[0][1]") debug_vec_print(v, X[0][2], "X[0][2]") debug_vec_print(v, X[0][-1], "X[0][-1]") debug_vec_print(v, y[0], "y[0]") for item in range(4): print("\nLETTERS: X[",item,"][..]") for letter in X[item]: print(v.from_vector_sampled(letter),end="") print(" ---> y[]: <", end=">") print(v.from_vector_sampled(y[item])) stdout.flush() #print("X[0]") #v.print_matrix(X[0]) #print("y[0]") #v.print_matrix(y[0]) from time import sleep #sleep(2) #predict_100(net,v,X,y,custom_primer="This is awesome!") #save_weights(net,'run{}-weights'.format(RUN_ID)) #net.fit(X, y, nb_epoch=1, batch_size=min(512,len(y)), show_accuracy=True, validation_split=0.1, verbose=1) #save_weights(net,'run{}-weights'.format(RUN_ID)) zipped = list(zip(X, y)) train_epochs=1.0 trained_amount=1.0 for iteration in range(10000): i=iteration # Train in mini-batches in stead of fll set? Did I do this because of 32 bit memory limits? print("Saving network weights") save_weights(net,'run{}-weights'.format(RUN_ID)) if args.redditor: try: primer = redditor_text('w0nk0',10,justonerandom=True) except: primer = "Getting reddit post failed :(" primer.encode('cp1252',errors='replace').decode('cp1252',errors='replace') primer = primer[-WINDOW_LEN-6:] + ' #_E_#' else: primer_idx = randint(0,(len(input_text) - WINDOW_LEN)) primer = input_text[primer_idx : primer_idx+WINDOW_LEN] banner("Generating") predict_100(net, v, X, y, randomness=[0.2,0.3,0.45][i%3],custom_primer=primer) predict_100(net, v, X, y, randomness=[-0.2,-0.3,-0.5,-0.7,-0.9][i%5],custom_primer=primer) #fit for x seconds initial_time = time() SECONDS = 60 train_epochs = max(1,int(0.5*max(1,0.5*trained_amount + 0.5*train_epochs))) trained_amount=0 banner(" ITERATION {} ".format(iteration)) banner("Fitting 2x{} epochs at least {} seconds..".format(train_epochs, SECONDS)) while time() < initial_time + SECONDS: mX, my = sampleXy(X,y,int(min(len(X),2560*1.1))) trained_amount+=train_epochs fit_result = net.fit(mX, my, nb_epoch=train_epochs, batch_size=256, show_accuracy=True, validation_split=0.1, verbose=1) #batch_size=min(128,len(X[0])),
        u = u.replace('Ä', 'Ae')
        u = u.replace('Ü', 'Ue')
        s = u.encode(cp, errors='xmlcharrefreplace').decode(cp)
        return s
    except Exception:
        # Last resort: drop anything that won't encode at all.
        return str(str(u).encode(errors='ignore'))

print(recode("häßlich äöüß ♣◙"))

# text = str(str(FlatSubreddit(subreddit, subreddit_posts, True).text()).encode())
if subreddit:
    print('Reading {} posts from /r/{}'.format(subreddit_posts, subreddit))
    text = recode(FlatSubreddit(subreddit, subreddit_posts, True).text())
elif redditor:
    print('Reading {} posts from /u/{}'.format(redditor_posts, redditor))
    text = redditor_text(redditor, redditor_posts)
else:
    print('Using dummy text for training.')
    text = "#_B_# Holy diver. You've been down too long in the midnight sea. #_B_# Oh what's becoming of me. #_E_# " * 10
# text = open("rddt-de-300.cache").read().lower()
# text = text[:400]
# text = open("lyrics.txt").read()
# text = open("lstm_keras_mod_fix.py").read()
print('corpus length:', len(text))
print('corpus [-70:]', text[-70:])

chars = set(text)
print('total chars in vectorizer:', len(chars))
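
# A minimal sketch of the usual next step in the classic Keras char-RNN recipe
# this script appears to follow: lookup tables mapping each character to a
# one-hot position and back. The names char_indices/indices_char are
# assumptions, not taken from this script.
char_indices = dict((c, i) for i, c in enumerate(sorted(chars)))
indices_char = dict((i, c) for i, c in enumerate(sorted(chars)))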