Example #1
import os
import pickle
import sys


def train_encoder(name_of_data, sentences, max_epochs=5, save_frequency=1000):
    if not os.path.exists('data/'):
        os.makedirs('data')
    sys.path.insert(0, 'training/')
    import vocab
    worddict, wordcount = vocab.build_dictionary(sentences)
    vocab.save_dictionary(worddict, wordcount,
                          'data/' + name_of_data + '_dictionary.pkl')
    with open('data/' + name_of_data + '_sen.p', 'wb') as sentence_file:
        pickle.dump(sentences, sentence_file)
    with open('training/train.py', 'r') as f:
        text = f.read()
        text = text.replace('max_epochs=5', 'max_epochs=' + str(max_epochs))
        text = text.replace("saveto='/u/rkiros/research/semhash/models/toy.npz'",
                            "saveto='data/" + name_of_data + "_encoder.npz'")
        text = text.replace("dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl'",
                            "dictionary='data/" + name_of_data + "_dictionary.pkl'")
        text = text.replace('n_words=20000',
                            'n_words=' + str(len(wordcount.keys())))
        text = text.replace('saveFreq=1000', 'saveFreq=' + str(save_frequency))
        with open('training/train_temp.py', 'w') as g:
            g.write(text)

    import train_temp
    train_temp.trainer(sentences)
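Example #1 gets its hyperparameters into training/train.py by rewriting the source file. If the trainer accepts the same settings as keyword arguments (as the calls in Examples #2 and #9 suggest), a minimal sketch that skips the source patching could look like the following; the keyword names are assumed to match the strings replaced above.

import os
import sys


def train_encoder_direct(name_of_data, sentences, max_epochs=5, save_frequency=1000):
    # Build and save the vocabulary exactly as in Example #1.
    if not os.path.exists('data/'):
        os.makedirs('data')
    sys.path.insert(0, 'training/')
    import vocab
    worddict, wordcount = vocab.build_dictionary(sentences)
    vocab.save_dictionary(worddict, wordcount,
                          'data/' + name_of_data + '_dictionary.pkl')

    # Assumption: train.trainer exposes the same keyword arguments that the
    # string replacements above patch into train.py.
    import train
    train.trainer(sentences,
                  saveto='data/' + name_of_data + '_encoder.npz',
                  dictionary='data/' + name_of_data + '_dictionary.pkl',
                  n_words=len(wordcount),
                  max_epochs=max_epochs,
                  saveFreq=save_frequency)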
Example #2
def main(data_path, dict_path, save_path, batch_size, reload_, reload_path):
    os.environ["THEANO_FLAGS"] = "floatX=float32"

    file_names = get_file_list(data_path, ['txt'])
    train_sent = load_txt_sent(file_names)

    if not os.path.exists(dict_path):
        print "Dictionary not found, recreating"
        worddict, wordcount = vocab.build_dictionary(train_sent)
        print "Built. Saving to: {}".format(dict_path)
        vocab.save_dictionary(worddict, wordcount, dict_path)
    else:
        print "Found dictionary at {}... Loading...".format(dict_path)
        worddict = vocab.load_dictionary(dict_path)
   
    print "Beginning Training..." 
    train.trainer(train_sent, batch_size=batch_size,  reload_=reload_, dictionary=dict_path, saveto=save_path, reload_path=reload_path, saveFreq=10000)  
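get_file_list and load_txt_sent are helpers defined elsewhere in the same script (Example #6 shows the body of load_txt_sent). A minimal get_file_list sketch, assuming it simply collects files whose extension is in the given list:

import os


def get_file_list(data_path, extensions):
    # Hypothetical helper: walk data_path and collect files whose suffix
    # (e.g. 'txt') appears in the extensions list.
    file_names = []
    for root, _, files in os.walk(data_path):
        for name in files:
            if name.rsplit('.', 1)[-1] in extensions:
                file_names.append(os.path.join(root, name))
    return file_names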
Example #3
import os
import sys


def train_decoder(name_of_data,
                  sentences,
                  model,
                  p,
                  max_epochs=5,
                  save_frequency=1000,
                  n_words=20000,
                  maxlen_w=30,
                  reload_=False):
    if not os.path.exists('data/'):
        os.makedirs('data')
    sys.path.insert(1, 'decoding/')
    import vocab
    reload(vocab)
    worddict, wordcount = vocab.build_dictionary(sentences, n_words)
    vocab.save_dictionary(worddict, wordcount,
                          'data/' + name_of_data + '_dictionary.pkl')
    with open('decoding/train.py', 'r') as f:
        text = f.read()
        text = text.replace('max_epochs=5', 'max_epochs=' + str(max_epochs))
        text = text.replace("saveto='/u/rkiros/research/semhash/models/toy.npz'",
                            "saveto='data/" + name_of_data + "_decoder.npz'")
        text = text.replace("dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl'",
                            "dictionary='data/" + name_of_data + "_dictionary.pkl'")
        text = text.replace('n_words=40000',
                            'n_words=' + str(len(wordcount.keys())))
        text = text.replace('saveFreq=1000', 'saveFreq=' + str(save_frequency))
        with open('decoding/train_temp.py', 'w') as g:
            g.write(text)

    import train_temp
    reload(train_temp)
    return train_temp.trainer(sentences,
                              sentences,
                              model,
                              p,
                              maxlen_w=maxlen_w,
                              reload_=reload_)
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='Pass target style genre to train decoder')
    parser.add_argument('-s',
                        '--style_genre',
                        help='the name of style corpus',
                        required=True,
                        default='localhost')
    flag = parser.parse_args()

    style_corpus_path = "/media/VSlab3/kuanchen_arxiv/artistic_style_corpora/{}".format(
        flag.style_genre)
    style_genre = flag.style_genre.split(".")[0]

    X = []
    with open(style_corpus_path, 'r') as handle:
        for line in handle.readlines():
            X.append(line.strip())
    C = X
    if not os.path.isfile("./vocab_save/{}.pkl".format(style_genre)):
        print "Get vocabulary..."
        worddict, wordcount = vocab.build_dictionary(X)
        vocab.save_dictionary(worddict=worddict,
                              wordcount=wordcount,
                              loc="vocab_save/{}.pkl".format(style_genre))
    savepath = "./logs_{}".format(style_genre)
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    skmodel = skipthoughts.load_model()
    train.trainer(X,
                  C,
                  skmodel,
                  dictionary="vocab_save/{}.pkl".format(style_genre),
                  savepath=savepath,
                  saveto="model.npz")
Example #5
    # Get the target text file name without its extension,
    # e.g. "speeches" from "speeches.txt".
    target_name = args.targe_text.split("/")[-1].split(".")[0]

    download_model()
    print("Loading Skip-Vector Model...")
    skmodel = skipthoughts.load_model()
    print("Done!")
    """
    Step 1: Generating dictionary for the target text.
    """
    print("Generating dictionary for the target text...")
    X = load_text(args.targe_text)
    worddict, wordcount = vocab.build_dictionary(X)
    #vocab.save_dictionary(worddict, wordcount, './target_dict/%s_dict.pkl'%target_name)
    #print("Done! Saved dictionary under ./target_dict/ as %s_dict.pkl"%target_name)
    vocab.save_dictionary(worddict, wordcount,
                          './%s/%s_dict.pkl' % (target_name, target_name))
    print("Done! Saved dictionary under ./%s/ as %s_dict.pkl" %
          (target_name, target_name))
    """
    Step 2: Generating style vector for the target text.
    """
    print("Generating style vector for the target text...")
    # The punkt tokenizer is required by the skipthoughts encoder.
    nltk.download('punkt')
    print("The lenth of X is:")
    print len(X)
    skip_vector = skipthoughts.encode(skmodel, X)
    # axis=0: average the skip-thought vectors over all sentences
    # (see the numpy.mean documentation).
    style_vector = skip_vector.mean(0)
    #np.save('./target_style/%s_style.npy'%target_name, style_vector)
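The commented-out np.save call hints at the follow-up step; a one-line continuation that stores the style vector in the same per-target directory as the dictionary, with the file name assumed:

    np.save('./%s/%s_style.npy' % (target_name, target_name), style_vector)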
Example #6
    """
    all_sent = []
    for txt_file in flist_txt:
        print "Reading file: {}".format(txt_file)
        with open(txt_file, 'r') as f:
            data = f.read()
        sent = data.split('\n')
        all_sent += sent
    print "File loading complete. Cleaning..."
    #all_sent = map(clean_string, all_sent)
    return all_sent


if __name__ == "__main__":
    os.environ["THEANO_FLAGS"] = "floatX=float32"

    file_names = get_file_list(data_path, ['txt'])
    train_sent = load_txt_sent(file_names)

    if not os.path.exists(dict_path):
        print "Dictionary not found, recreating"
        worddict, wordcount = vocab.build_dictionary(train_sent)
        print "Built. Saving to: {}".format(dict_path)
        vocab.save_dictionary(worddict, wordcount, dict_path)
    else:
        print "Found dictionary at {}... Loading...".format(dict_path)
        worddict = vocab.load_dictionary(dict_path)
   
    print "Beginning Training..." 
    train.trainer(train_sent, n_words=20000, dim=2400, batch_size=128,  reload_=False, dictionary=dict_path, saveto=save_path)  
Example #7
# coding: utf-8

import vocab
import train
import tools
import numpy as np

with open("../../wikipedia_txt/result_wakati.txt") as f:
    fdata = [line.rstrip() for i, line in enumerate(f)]
print '# lines: ', len(fdata)

worddict, wordcount = vocab.build_dictionary(fdata)
vocab.save_dictionary(worddict, wordcount, "word_dict")
print '# vocab: ', len(worddict)

train.trainer(fdata, dictionary="word_dict", saveFreq=100, saveto="model", reload_=True, n_words=40000)

model = tools.load_model()
vectors = tools.encode(model, fdata, use_norm=False)
np.savez('vecs.npz', vectors)
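np.savez was given the array positionally, so it is stored under the default key 'arr_0'; loading the saved vectors back looks like:

import numpy as np

vectors = np.load('vecs.npz')['arr_0']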

Example #8
for f in FILES:
    file_counter += 1
    with open(f) as file_descriptor:
        file_content = file_descriptor.read().decode("utf-8", "ignore")
        file_content = sent_tokenize(file_content)
        for sentence in file_content:
            if sentence:
                X.append(sentence.strip())

sentence_embeddings = np.empty([file_counter, 4800])
loc = base_path_to_directory + "dictionary.pkl"
saveto = base_path_to_directory + "toy.npz"
maxlen_w = 70
worddict, wordcount = vocab.build_dictionary(X)
vocab.save_dictionary(worddict, wordcount,
                      loc)  # loc is the path where the dictionary is saved
# In train.py set: (1) the dictionary path, (2) saveto (where the model is
# saved), and (3) maxlen_w.
train.trainer(X, dictionary=loc, saveto=saveto, maxlen_w=maxlen_w)

# In tools.py set path_to_model to the saveto path used in train, path_to_dictionary
# to the dictionary used in train, and path_to_word2vec.
embed_map = tools.load_googlenews_vectors(path_to_word2vec)
model = tools.load_model(embed_map)
if not os.path.exists(SENTENCE_EMBEDDING_FOLDER):
    os.mkdir(SENTENCE_EMBEDDING_FOLDER)

for f in FILES:
    with open(f) as file_descriptor:
        file_content = sent_tokenize(file_descriptor.read())
        document_embedding = tools.encode(model, file_content, verbose=False)
        document_embedding = np.average(document_embedding, axis=0)
        file_name = f.split('/')[-1]
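        # (The excerpt ends here; presumably the averaged embedding is then
        #  written into SENTENCE_EMBEDDING_FOLDER, e.g. with
        #  np.save(os.path.join(SENTENCE_EMBEDDING_FOLDER, file_name), document_embedding).
        #  The exact file naming is an assumption.)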
Example #9
max_w = 10
saveF = 1000
batch = 128
clen = 6

if not reload_:
    # load the data and put it in a list
    with open(data_path, 'r') as f:
        X = f.read().splitlines()

    # preprocess
    X = preprocess.prepareentitylist(X, stop_path, clen)

    # store for future runs
    with open(proc_data_path, 'w') as f:
        for item in X:
            f.write('%s\n' % item)
else:
    with open(proc_data_path, 'r') as f:
        X = f.read().splitlines()

# subset
X = X[:N]

# build dictionary
worddict, wordcount = vocab.build_dictionary(X)
vocab.save_dictionary(worddict, wordcount, dict_path)

# train
train.trainer(X, saveto=out_path, dictionary=dict_path, saveFreq=saveF,
              max_epochs=max_e, dispFreq=dispF, maxlen_w=max_w, batch_size=batch)
Example #10
import vocab
import nltk

X = []
# Skip empty lines and lines that look like chapter/part headings.
with open("/home/jm7432/big/Romance/romance-final.txt", "r") as book_file:
    for line in book_file:
        if len(line.strip()) > 0 and "chapter" not in line and "part" not in line:
            X.append(line)

l = len(X)
Y = X[50:l - 50]
worddict, wordcount = vocab.build_dictionary(Y)
vocab.save_dictionary(
    worddict, wordcount,
    '/home/jm7432/tell-tall-tales/decoding/romance_dict_final.pkl')
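Example #10 stops after saving the dictionary; in the decoder pipeline this dictionary would then feed a train.trainer call such as the one in Example #4. A minimal sketch under that assumption (model loading and save location assumed):

import skipthoughts
import train

# Assumed continuation: train the decoder on the filtered sentences using the
# dictionary saved above; keyword arguments follow Example #4.
skmodel = skipthoughts.load_model()
train.trainer(Y, Y, skmodel,
              dictionary='/home/jm7432/tell-tall-tales/decoding/romance_dict_final.pkl',
              saveto='model.npz')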