def get_input_text(filename, redditor, train_len, posts=200):
    """Return the full corpus plus a random excerpt of roughly train_len chars.

    Reads the corpus from filename if one is given, otherwise pulls `posts`
    posts by `redditor` via redditor_text().
    """
    input_text = "#FAILED_READING_FILE<{}>#".format(filename)
    if filename:
        with open(filename) as f:
            input_text = f.read()
    else:
        input_text = redditor_text(redditor, posts, False, REDDIT_MODE)

    # Strip surrounding whitespace first so the sentence-boundary index found
    # below stays valid.
    input_text = input_text.strip()
    # Pick a random start, leaving room for train_len plus some slack.
    start = int(random() * (len(input_text) - train_len - 100))
    if (train_len + 100) >= len(input_text):
        start = 0
    try:
        # Begin the excerpt just after the next sentence boundary.
        start = input_text.index(".", start) + 1
    except ValueError:
        start = 0
    pruned = input_text[start:start + train_len]
    # Trim the trailing partial sentence so the excerpt ends on a full stop.
    if "." in pruned:
        pruned = ".".join(pruned.split(".")[:-1]) + "."
    pruned = pruned.strip()
    # print("Pruned:",pruned)
    return input_text, pruned
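# Rough usage sketch (hypothetical values):
#   full, sample = get_input_text("corpus.txt", None, 5000)
# `full` is the whole corpus; `sample` is a ~5000-character excerpt that starts
# just after a sentence boundary and is trimmed to end on a full stop.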
def run():
    # ######################### MAKE TRAINING DATA
    # ######################### MAKE TRAINING DATA
    # ######################### MAKE TRAINING DATA
    # input_text = "Ein Test-Text, hurra!"
    # pruned = " ".join(input_text.split(" ")[:4])
    banner("--")
    banner("Run() starting, getting data..")
    input_text, pruned = get_input_text(TEXT_FILE, REDDITOR, TRAIN_LEN, posts=NUM_POSTS)
    if REPLACE_CAPS:
        input_text = replace_caps(input_text)
        pruned = replace_caps(pruned)
    #codec="ascii"
    codec="cp1252"
    input_text = input_text.encode(codec,errors='xmlcharrefreplace').decode(codec)
    pruned = pruned.encode(codec,errors='xmlcharrefreplace').decode(codec)
    with open("run{}-input_text-cp1252.txt".format(RUN_ID),"wb") as f:
        f.write(input_text.encode("cp1252",errors="replace"))
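    # Note on the cp1252 round trip above: characters that cp1252 cannot
    # represent are rewritten as XML character references by
    # errors='xmlcharrefreplace', e.g.
    #   '♣'.encode('cp1252', errors='xmlcharrefreplace')  ->  b'&#9827;'
    # so the text that reaches the vectorizer stays within the cp1252 repertoire.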
 
    #input_text = pruned = "abbcccdddd eeeeeffffff abc def? " * 8

    print("---------------------\ninput -400:\n", input_text[:400])
    print("---------------------\npruned -400:\n", pruned[:400])
    
    print("Total text length: {}, training set {}".format(len(input_text),len(pruned)))
    # v=RandomVectorizer(". "+input_text)
    # v=OneCharacterVectorizer(". "+input_text)
    #v = TimeVectorizer2Lemma(input_text) <- for when using REDDIT_MODE="TEXT", not words

    LIMIT = 1000
    if REDDIT_MODE == "TEXT" : LIMIT = 100
    print("LIMITING DICTIONARY TO ", LIMIT)
    banner("Vectorizing")
    v = TimeVectorizerNoUnknown(input_text,cutoff=LIMIT)
    print("Len vectorizer:", len(v.dictionary))
    
    if load_vectorizer_file:
        v.load(load_vectorizer_file)
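        # Presumably v.load() replaces the dictionary just built from input_text,
        # so the freshly computed vocabulary is discarded when a saved
        # vectorizer file is supplied.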
    
    print("Saving vectorizer")
    v.save("run{}-vector.pkl".format(RUN_ID))
    input_mat = v.to_matrix(pruned)

    # The commented print below throws an ascii/unicode error
    #print("\n",type(v)," Dictionary: {}".format(v.dictionary.encode("ascii",errors="ignore")))
    #my=v.to_matrix('my')
    #print("v.to_matrix('my')", my)
    #print("my[0]",v.from_vector_rand_no_dummy(list(my[0]),0.1,unknown_token_value="#?#"))
    #print("my",v.from_vector_rand_no_dummy(list(my),0.1,unknown_token_value="#?#"))
    #for _ in range(500):
    #    x=v.vector(input_text[randint(0,len(input_text))])
    #    print(v.from_vector_rand(x,0.5,unknown_token_value="#?#"),end="")
    print(v.dictionary)
    print("")
    #print("?? my == ",v.from_matrix(my))
    #from time import sleep
    #sleep(4)
    # print("Dictionary:",["".join(str(x)) for x in v.dictionary])
    #lemma = choice(v.dictionary)
    # print("dictionary choice:",lemma)
    # print("vector", v.vector(lemma))
    # print("index", v.index(lemma))

    # # check if mapping works
    # for num,i in enumerate(input_mat[0:10]):
    #     debug_vec_print(v,i,"input[{}]".format(num))
    # print("input_mat[:2] : ",np.array(input_mat[:2]))

    #anneal_mat = anneal_matrix(input_mat)

    # # check if anneal-mapping works
    # for num,i in enumerate(anneal_mat[0:10]):
    #     debug_vec_print(v,i,"input[{}]".format(num))
    # # anneal_mat  # # !!!!!!!!!!!!!!!!!!!!!!


    # ##### ###### ###### MAKE NETWORK ###### ###### ######
    # ##### ###### ###### MAKE NETWORK ###### ###### ######
    # ##### ###### ###### MAKE NETWORK ###### ###### ######
    banner("Compiling net")
    categories = v.len()

    if args.fromyamlfile:
        with open(args.fromyamlfile, "rt") as yamlfile:
            yaml_str = yamlfile.read()
            net = model_from_yaml(yaml_str)
    else:
        net = make_net(categories, categories, hidden_size=HIDDEN_NEURONS)
    #from keras.utils.dot_utils import Grapher
    #Grapher().plot(net,'run{}-model.png'.format(RUN_ID))
    # ^ needs pydot; pydot doesn't seem to work on Python 3.4?

    with open("run{}-model.yaml".format(RUN_ID),"wt") as jsonfile:
        jsonfile.write(net.to_yaml())
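    # to_yaml()/model_from_yaml() only round-trip the architecture; weights are
    # not included, which is why they are loaded separately below when
    # load_weights_file is set.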

    banner("Net compiled!")

    if load_weights_file:
        print("/// Loading weights from {} as per argument!".format(load_weights_file))
        net = load_weights(net,load_weights_file)

    banner("Make dataset..")
    # X,y = make_dataset_n(input_mat,v,WINDOW_LEN)
    X, y = make_dataset_single_predict(input_mat, v, WINDOW_LEN,step=WINDOW_STEP)
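    # Sliding-window dataset (a sketch of the assumed layout, since
    # make_dataset_single_predict is defined elsewhere): each sample is
    # WINDOW_LEN consecutive token vectors and the target is the single
    # following token, i.e. roughly
    #   X.shape == (n_samples, WINDOW_LEN, len(v.dictionary))
    #   y.shape == (n_samples, len(v.dictionary))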
    del input_mat
    #print("----------X-----------\n", X)
    if False:
        print("Shapes: X", X.shape, "y", y.shape)
        print("X - {} entries".format(len(X)))
        print("Shape X[0]", X[0].shape)

    if True:
        debug_vec_print(v, X[0][0], "X[0][0]")
        debug_vec_print(v, X[0][1], "X[0][1]")
        debug_vec_print(v, X[0][2], "X[0][2]")
        debug_vec_print(v, X[0][-1], "X[0][-1]")
        debug_vec_print(v, y[0], "y[0]")

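        # Eyeball check: decode a few input windows and their target tokens to
        # confirm that the vectorizer and the windowing line up.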
        for item in range(4):
            print("\nLETTERS: X[",item,"][..]")
            for letter in X[item]:
                print(v.from_vector_sampled(letter),end="")
            print("  --->  y[]: <", end=">")
            print(v.from_vector_sampled(y[item]))
        stdout.flush()

    #print("X[0]")
    #v.print_matrix(X[0])
    #print("y[0]")
    #v.print_matrix(y[0])
    from time import sleep
    #sleep(2)

    #predict_100(net,v,X,y,custom_primer="This is awesome!")
    #save_weights(net,'run{}-weights'.format(RUN_ID))
    #net.fit(X, y, nb_epoch=1, batch_size=min(512,len(y)), show_accuracy=True, validation_split=0.1, verbose=1)
    #save_weights(net,'run{}-weights'.format(RUN_ID))

    zipped = list(zip(X, y))
    train_epochs=1.0
    trained_amount=1.0
    for iteration in range(10000):
        i=iteration
        # Train in mini-batches instead of the full set? Did I do this because of 32 bit memory limits?
        print("Saving network weights")
        save_weights(net,'run{}-weights'.format(RUN_ID))
        if args.redditor:
            try:
                primer = redditor_text('w0nk0',10,justonerandom=True)
            except Exception:
                primer = "Getting reddit post failed :("
            # str.encode()/decode() return new strings, so the result must be re-assigned.
            primer = primer.encode('cp1252', errors='replace').decode('cp1252', errors='replace')
            primer = primer[-WINDOW_LEN-6:] + ' #_E_#'
        else:
            primer_idx = randint(0,(len(input_text) - WINDOW_LEN))
            primer = input_text[primer_idx : primer_idx+WINDOW_LEN]
        banner("Generating")
        predict_100(net, v, X, y, randomness=[0.2,0.3,0.45][i%3],custom_primer=primer)
        predict_100(net, v, X, y, randomness=[-0.2,-0.3,-0.5,-0.7,-0.9][i%5],custom_primer=primer)
        #fit for x seconds
        initial_time = time()
        SECONDS = 60
        train_epochs = max(1,int(0.5*max(1,0.5*trained_amount + 0.5*train_epochs)))
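        # Worked example of the heuristic above: with trained_amount=10 and
        # train_epochs=4 from the previous iteration,
        #   0.5*10 + 0.5*4 = 7  ->  max(1, 7) = 7  ->  int(0.5*7) = 3
        # so this iteration fits 3 epochs per pass of the inner loop below.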
        trained_amount=0
        banner(" ITERATION {} ".format(iteration))
        banner("Fitting 2x{} epochs at least {} seconds..".format(train_epochs, SECONDS))
        while time() < initial_time + SECONDS:
            mX, my = sampleXy(X,y,int(min(len(X),2560*1.1)))
            trained_amount+=train_epochs
            fit_result = net.fit(mX, my, nb_epoch=train_epochs, batch_size=256, show_accuracy=True, validation_split=0.1, verbose=1) #batch_size=min(128,len(X[0])),
Example #3
# The start of this example is cut off in the source; the signature below is a
# reconstruction from the visible body (the default codepage is an assumption).
def recode(u, cp='cp1252'):
    try:
        # ... earlier character replacements are missing from the excerpt ...
        u = u.replace('Ä','Ae')
        u = u.replace('Ü','Ue')
        s = u.encode(cp,errors='xmlcharrefreplace').decode(cp)
        return s
    except Exception:
        return str(str(u).encode(errors='ignore'))

print(recode("häßlich äöüß ♣◙"))

#text = str(str(FlatSubreddit(subreddit,subreddit_posts,True).text()).encode())
if subreddit:
    print('Reading {} posts from /r/{}'.format(subreddit_posts,subreddit))
    text = recode(FlatSubreddit(subreddit,subreddit_posts,True).text())
elif redditor:
    print('Reading {} posts from /u/{}'.format(redditor_posts,redditor))
    text = redditor_text(redditor,redditor_posts)
else:
    print('Using dummy text for training.')
    text = "#_B_# Holy diver. You've been down to long in the midnight sea. #_B_# Oh what's becoming of me. #_E_# " * 10

#text = open("rddt-de-300.cache").read().lower()
#text = text[:400]
#text = open("lyrics.txt").read()
#text = open("lstm_keras_mod_fix.py").read()

print('corpus length:', len(text))
print('corpus [-70:]',text[-70:])

chars = set(text)
print('total chars in vectorizer:', len(chars))
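# A likely next step (not shown in this excerpt, so treat it as an assumption):
# build char <-> index lookups for one-hot encoding the corpus, e.g.
#   char_indices = dict((c, i) for i, c in enumerate(sorted(chars)))
#   indices_char = dict((i, c) for i, c in enumerate(sorted(chars)))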