def main(fname, oname, verbose=True, parallel=True):
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)

    # this could reasonably be some separate function
    # might make sense to merge cbow and namediff and have this be the main interface
    namediff = Namediff()
    cbow = CBOW()
    if verbose:
        print('Computing nearest names...')
    if parallel:
        nearest_names = namediff.nearest_par([c.name for c in cards], n=1)
    else:
        nearest_names = [namediff.nearest(c.name, n=1) for c in cards]
    if verbose:
        print('Computing nearest cards...')
    if parallel:
        nearest_cards = cbow.nearest_par(cards, n=1)
    else:
        nearest_cards = [cbow.nearest(c, n=1) for c in cards]
    for i in range(len(cards)):
        cards[i].nearest_names = nearest_names[i]
        cards[i].nearest_cards = nearest_cards[i]
    # # unfortunately this takes ~30 hours on 8 cores for a 10MB dump
    # if verbose:
    #     print('Computing nearest encodings by text edit distance...')
    # if parallel:
    #     nearest_cards_text = namediff.nearest_card_par(cards, n=1)
    # else:
    #     nearest_cards_text = [namediff.nearest_card(c, n=1) for c in cards]
    if verbose:
        print('...Done.')

    # write to a file to store the data, as this is a terribly long computation
    # we could also just store this same info in the cards themselves as more fields...
    sep = '|'
    # open in text mode with an explicit encoding and write str, not bytes
    with open(oname, 'w', encoding='utf-8') as ofile:
        for i, card in enumerate(cards):
            ostr = str(i) + sep + card.name + sep
            ndist, _ = card.nearest_names[0]
            ostr += str(ndist) + sep
            cdist, _ = card.nearest_cards[0]
            ostr += str(cdist) + '\n'
            # tdist, _ = nearest_cards_text[i][0]
            # ostr += str(tdist) + '\n'
            ofile.write(ostr)
def main(fname, oname, verbose=True, parallel=True):
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)

    # this could reasonably be some separate function
    # might make sense to merge cbow and namediff and have this be the main interface
    namediff = Namediff()
    cbow = CBOW()
    if verbose:
        print 'Computing nearest names...'
    if parallel:
        nearest_names = namediff.nearest_par(map(lambda c: c.name, cards), n=1)
    else:
        nearest_names = [namediff.nearest(c.name, n=1) for c in cards]
    if verbose:
        print 'Computing nearest cards...'
    if parallel:
        nearest_cards = cbow.nearest_par(cards, n=1)
    else:
        nearest_cards = [cbow.nearest(c, n=1) for c in cards]
    for i in range(0, len(cards)):
        cards[i].nearest_names = nearest_names[i]
        cards[i].nearest_cards = nearest_cards[i]
    # # unfortunately this takes ~30 hours on 8 cores for a 10MB dump
    # if verbose:
    #     print 'Computing nearest encodings by text edit distance...'
    # if parallel:
    #     nearest_cards_text = namediff.nearest_card_par(cards, n=1)
    # else:
    #     nearest_cards_text = [namediff.nearest_card(c, n=1) for c in cards]
    if verbose:
        print '...Done.'

    # write to a file to store the data, this is a terribly long computation
    # we could also just store this same info in the cards themselves as more fields...
    sep = '|'
    with open(oname, 'w') as ofile:
        for i in range(0, len(cards)):
            card = cards[i]
            ostr = str(i) + sep + card.name + sep
            ndist, _ = card.nearest_names[0]
            ostr += str(ndist) + sep
            cdist, _ = card.nearest_cards[0]
            ostr += str(cdist) + '\n'
            # tdist, _ = nearest_cards_text[i][0]
            # ostr += str(tdist) + '\n'
            ofile.write(ostr.encode('utf-8'))
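Both variants above emit one '|'-separated record per card. A minimal Python 3 sketch of reading that file back in; read_dists is a hypothetical helper, not part of the original scripts, and assumes only the index|name|ndist|cdist layout written by main:

def read_dists(oname):
    # parse the 'index|name|ndist|cdist' lines written by main above
    rows = []
    with open(oname, encoding='utf-8') as f:
        for line in f:
            i, name, ndist, cdist = line.rstrip('\n').split('|')
            rows.append((int(i), name, float(ndist), float(cdist)))
    return rows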
def main():
    # hyperparameter settings
    window_size = 5
    hidden_size = 100
    batch_size = 100
    max_epoch = 10

    # load the data
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)

    contexts, target = create_contexts_target(corpus, window_size)

    # build the model, optimizer, and trainer
    model = CBOW(vocab_size, hidden_size, window_size, corpus)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    # start training
    trainer.fit(contexts, target, max_epoch, batch_size)
    trainer.plot()

    # save the data needed for later use
    word_vecs = model.word_vecs
    params = {}
    params['word_vecs'] = word_vecs.astype(np.float16)
    params['word_to_id'] = word_to_id
    params['id_to_word'] = id_to_word
    pkl_file = 'cbow_params.pkl'
    with open(pkl_file, 'wb') as f:
        pickle.dump(params, f, -1)
def main() -> None:
    window_size = 5
    hidden_size = 100
    batch_size = 100
    max_epoch = 10

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    contexts, target = create_contexts_target(corpus, window_size)

    model = CBOW(vocab_size, hidden_size, window_size, corpus)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    trainer.fit(contexts, target, max_epoch, batch_size)
    # trainer.plot()

    word_vecs = model.word_vecs
    params = {
        'word_vecs': word_vecs.astype(np.float16),
        'word_to_id': word_to_id,
        'id_to_word': id_to_word
    }
    with open('cbow_params.pkl', 'wb') as f:
        pickle.dump(params, f, -1)
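Several of the training scripts in this section dump the same cbow_params.pkl layout. A minimal sketch of querying it afterwards, assuming only numpy and the dict keys written above; the most_similar helper here is written inline rather than imported from the training code:

import pickle

import numpy as np

with open('cbow_params.pkl', 'rb') as f:
    params = pickle.load(f)

word_vecs = params['word_vecs']
word_to_id = params['word_to_id']
id_to_word = params['id_to_word']

def most_similar(query, top=5):
    # rank the whole vocabulary by cosine similarity to the query's vector
    q = word_vecs[word_to_id[query]]
    sims = word_vecs @ q / (np.linalg.norm(word_vecs, axis=1) * np.linalg.norm(q) + 1e-8)
    for i in (-sims).argsort()[1:top + 1]:  # rank 0 is the query itself
        print(id_to_word[i], sims[i])

most_similar('you')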
class TestSimpleCBOW(unittest.TestCase):
    def setUp(self):
        text = 'You said good-bye and I said hello.'
        cbm = CountBasedMethod()
        word_list = cbm.text_to_word_list(text)
        word_to_id, _, self.corpus = cbm.preprocess(word_list)
        vocab_size = len(word_to_id)
        hidden_size = 2
        window_size = 1
        self.cbow = CBOW(vocab_size, hidden_size, window_size, self.corpus)
        self.simple_word2vec = SimpleWord2Vec()
        self.contexts, self.target = self.simple_word2vec.create_contexts_target(
            self.corpus)

    def test_forward(self):
        loss = self.cbow.forward(self.contexts, self.target)
        self.assertEqual(4.159, round(loss, 3))

    def test_grads_diff(self):
        in_layer, *_ = self.cbow.in_layers
        before_in_layer_grad, = in_layer.grads
        before_in_layer_grad = copy.copy(before_in_layer_grad)
        before_ns_loss_layer_grad, *_ = self.cbow.ns_loss_layer.grads
        before_ns_loss_layer_grad = copy.copy(before_ns_loss_layer_grad)
        self.cbow.forward(self.contexts, self.target)
        self.cbow.backward()
        in_layer, *_ = self.cbow.in_layers
        after_in_layer_grad, = in_layer.grads
        after_ns_loss_layer_grad, *_ = self.cbow.ns_loss_layer.grads
        in_layer_grad = before_in_layer_grad == after_in_layer_grad
        ns_loss_layer_grad = before_ns_loss_layer_grad == after_ns_loss_layer_grad
        assert_array_equal(
            np.array([[False, False],
                      [False, False],
                      [False, False],
                      [False, False],
                      [False, False],
                      [True, True],
                      [True, True]]),
            in_layer_grad)
        assert_array_equal(
            np.array([[True, True],
                      [False, False],
                      [False, False],
                      [False, False],
                      [False, False],
                      [False, False],
                      [True, True]]),
            ns_loss_layer_grad)
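A standard entry point makes the test class runnable as a script; this is an assumed addition (the module's imports of unittest, copy, numpy, and the tested classes are not shown above):

if __name__ == '__main__':
    unittest.main()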
parser.add_argument('--num_epochs', type=int, default=100, dest='num_epochs')
parser.add_argument('--models_folder', default='./word_vector_models', dest='folder')
parser.add_argument('--graph_folder', default='./trump_graph', dest='graphs')
args = parser.parse_args()

# Read the initial word vectors
word_vectors = dill.load(open(args.init_vec, 'rb'))
cbow = CBOW(len(word_vectors), word_vectors, args.lr)
init = tf.global_variables_initializer()

# Fit the model
if args.mode == 'train':
    # Read training samples
    inputs = dill.load(open(args.samples, 'rb'))
    with tf.Session() as sess:
        sess.run(init)
        cbow.fit(sess, inputs,
                 embed_data_path=args.embed_metadata,
                 minibatch_size=args.minibatch_size,
                 num_epochs=args.num_epochs,
                 folder=args.folder,
                 graph_folder=args.graphs)
def main(fname, oname, n=20, verbose=False):
    cbow = CBOW()
    realcards = jdecode.mtg_open_file(str(os.path.join(datadir, 'output.txt')), verbose=verbose)
    real_by_name = {c.name: c for c in realcards}
    lm = ngrams.build_ngram_model(realcards, 3, separate_lines=separate_lines, verbose=verbose)
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    stats = analysis.get_statistics(fname, lm=lm, sep=separate_lines, verbose=verbose)

    selected = []
    for i in range(len(cards)):
        if select_card(cards, stats, i):
            selected += [(i, cards[i])]

    limit = 3000
    random.shuffle(selected)
    #selected = selected[:limit]

    if verbose:
        print('computing nearest cards for ' + str(len(selected)) + ' candidates...')
    cbow_nearest = cbow.nearest_par([i_c[1] for i_c in selected])
    for i in range(len(selected)):
        (j, card) = selected[i]
        selected[i] = (j, card, cbow_nearest[i])
    if verbose:
        print('...done')

    final = []
    for (i, card, nearest) in selected:
        for dist, rname in nearest:
            realcard = real_by_name[rname]
            if compare_to_real(card, realcard):
                final += [(i, card, realcard, dist)]
                break

    for (i, card, realcard, dist) in final:
        print('-- real --')
        print(realcard.format())
        print('-- fake --')
        print(card.format())
        print('-- stats --')
        perp_per = stats['ngram']['perp_per'][i]
        perp_max = stats['ngram']['perp_max'][i]
        print(dist)
        print(perp_per)
        print(perp_max)
        print('----')

    if oname is not None:
        with open(oname, 'wt') as ofile:
            ofile.write(utils.mse_prepend)
            for (i, card, realcard, dist) in final:
                name = realcard.name
                writecard(realcard, name, ofile)
                writecard(card, name, ofile)
            ofile.write('version control:\n\ttype: none\napprentice code: ')
        # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
        if os.path.isfile('set'):
            print('ERROR: tried to overwrite existing file "set" - aborting.')
            return
        shutil.copyfile(oname, 'set')
        # Use the freaky mse extension instead of zip.
        with zipfile.ZipFile(oname + '.mse-set', mode='w') as zf:
            try:
                # Zip up the set file into oname.mse-set.
                zf.write('set')
            finally:
                if verbose:
                    print('Made an MSE set file called ' + oname + '.mse-set.')
                # The set file is useless outside the .mse-set, delete it.
                os.remove('set')
def main(fname, oname=None, verbose=True, encoding='std',
         gatherer=False, for_forum=False, for_mse=False,
         creativity=False, vdump=False, for_html=False):
    # there is a sane thing to do here (namely, produce both at the same time)
    # but we don't support it yet.
    if for_mse and for_html:
        print 'ERROR - decode.py - incompatible formats "mse" and "html"'
        return

    fmt_ordered = cardlib.fmt_ordered_default
    if encoding in ['std']:
        pass
    elif encoding in ['named']:
        fmt_ordered = cardlib.fmt_ordered_named
    elif encoding in ['noname']:
        fmt_ordered = cardlib.fmt_ordered_noname
    elif encoding in ['rfields']:
        pass
    elif encoding in ['old']:
        fmt_ordered = cardlib.fmt_ordered_old
    elif encoding in ['norarity']:
        fmt_ordered = cardlib.fmt_ordered_norarity
    elif encoding in ['vec']:
        pass
    elif encoding in ['custom']:
        ## put custom format decisions here ##########################
        ## end of custom format ######################################
        pass
    else:
        raise ValueError('encode.py: unknown encoding: ' + encoding)

    cards = jdecode.mtg_open_file(fname, verbose=verbose, fmt_ordered=fmt_ordered)

    if creativity:
        namediff = Namediff()
        cbow = CBOW()
        if verbose:
            print 'Computing nearest names...'
        nearest_names = namediff.nearest_par(map(lambda c: c.name, cards), n=3)
        if verbose:
            print 'Computing nearest cards...'
        nearest_cards = cbow.nearest_par(cards)
        for i in range(0, len(cards)):
            cards[i].nearest_names = nearest_names[i]
            cards[i].nearest_cards = nearest_cards[i]
        if verbose:
            print '...Done.'

    def hoverimg(cardname, dist, nd):
        truename = nd.names[cardname]
        code = nd.codes[cardname]
        namestr = ''
        if for_html:
            if code:
                namestr = ('<div class="hover_img"><a href="#">' + truename
                           + '<span><img style="background: url(http://magiccards.info/scans/en/'
                           + code + ');" alt=""/></span></a>' + ': ' + str(dist) + '\n</div>\n')
            else:
                namestr = '<div>' + truename + ': ' + str(dist) + '</div>'
        elif for_forum:
            namestr = '[card]' + truename + '[/card]' + ': ' + str(dist) + '\n'
        else:
            namestr = truename + ': ' + str(dist) + '\n'
        return namestr

    def writecards(writer):
        if for_mse:
            # have to prepend a massive chunk of formatting info
            writer.write(utils.mse_prepend)
        if for_html:
            # have to prepend html info
            writer.write(utils.html_prepend)
            # separate the write function to allow for writing smaller chunks of cards at a time
            segments = sort_colors(cards)
            for i in range(len(segments)):
                # sort color by CMC
                segments[i] = sort_type(segments[i])
                # this allows card boxes to be colored for each color
                # for coloring of each box separately cardlib.Card.format() must change non-minimally
                writer.write('<div id="' + utils.segment_ids[i] + '">')
                writehtml(writer, segments[i])
                writer.write("</div><hr>")
            # closing the html file
            writer.write(utils.html_append)
            return  # break out of the writecards function to avoid writing cards twice

        for card in cards:
            if for_mse:
                writer.write(card.to_mse().encode('utf-8'))
                fstring = ''
                if card.json:
                    fstring += 'JSON:\n' + card.json + '\n'
                if card.raw:
                    fstring += 'raw:\n' + card.raw + '\n'
                fstring += '\n'
                fstring += card.format(gatherer=gatherer, for_forum=for_forum, vdump=vdump) + '\n'
                fstring = fstring.replace('<', '(').replace('>', ')')
                writer.write(('\n' + fstring[:-1]).replace('\n', '\n\t\t'))
            else:
                fstring = card.format(gatherer=gatherer, for_forum=for_forum,
                                      vdump=vdump, for_html=for_html)
                writer.write((fstring + '\n').encode('utf-8'))

            if creativity:
                cstring = '~~ closest cards ~~\n'
                nearest = card.nearest_cards
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                cstring += '~~ closest names ~~\n'
                nearest = card.nearest_names
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                if for_mse:
                    cstring = ('\n\n' + cstring[:-1]).replace('\n', '\n\t\t')
                writer.write(cstring.encode('utf-8'))

            writer.write('\n'.encode('utf-8'))

        if for_mse:
            # more formatting info
            writer.write('version control:\n\ttype: none\napprentice code: ')

    def writehtml(writer, card_set):
        for card in card_set:
            fstring = card.format(gatherer=gatherer, for_forum=True,
                                  vdump=vdump, for_html=for_html)
            if creativity:
                fstring = fstring[:-6]  # chop off the closing </div> to stick stuff in
            writer.write((fstring + '\n').encode('utf-8'))
            if creativity:
                cstring = '~~ closest cards ~~\n<br>\n'
                nearest = card.nearest_cards
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                cstring += "<br>\n"
                cstring += '~~ closest names ~~\n<br>\n'
                nearest = card.nearest_names
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                cstring = '<hr><div>' + cstring + '</div>\n</div>'
                writer.write(cstring.encode('utf-8'))
            writer.write('\n'.encode('utf-8'))

    # Sorting by colors
    def sort_colors(card_set):
        # Initialize sections
        red_cards = []
        blue_cards = []
        green_cards = []
        black_cards = []
        white_cards = []
        multi_cards = []
        colorless_cards = []
        lands = []
        for card in card_set:
            if len(card.get_colors()) > 1:
                multi_cards += [card]
                continue
            if 'R' in card.get_colors():
                red_cards += [card]
                continue
            elif 'U' in card.get_colors():
                blue_cards += [card]
                continue
            elif 'B' in card.get_colors():
                black_cards += [card]
                continue
            elif 'G' in card.get_colors():
                green_cards += [card]
                continue
            elif 'W' in card.get_colors():
                white_cards += [card]
                continue
            else:
                if "land" in card.get_types():
                    lands += [card]
                    continue
                colorless_cards += [card]
        return [white_cards, blue_cards, black_cards, red_cards, green_cards,
                multi_cards, colorless_cards, lands]

    def sort_type(card_set):
        sorting = ["creature", "enchantment", "instant", "sorcery", "artifact", "planeswalker"]
        sorted_cards = [[], [], [], [], [], [], []]
        sorted_set = []
        for card in card_set:
            types = card.get_types()
            for i in range(len(sorting)):
                if sorting[i] in types:
                    sorted_cards[i] += [card]
                    break
            else:
                sorted_cards[6] += [card]
        for value in sorted_cards:
            for card in value:
                sorted_set += [card]
        return sorted_set

    def sort_cmc(card_set):
        sorted_cards = []
        sorted_set = []
        for card in card_set:
            # make sure there is an empty set for each CMC
            while len(sorted_cards) - 1 < card.get_cmc():
                sorted_cards += [[]]
            # add card to correct set of CMC values
            sorted_cards[card.get_cmc()] += [card]
        # combine each set of CMC valued cards together
        for value in sorted_cards:
            for card in value:
                sorted_set += [card]
        return sorted_set

    if oname:
        if for_html:
            print oname
            # if ('.html' != oname[-])
            #     oname += '.html'
        if verbose:
            print 'Writing output to: ' + oname
        with open(oname, 'w') as ofile:
            writecards(ofile)
        if for_mse:
            # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
            if os.path.isfile('set'):
                print 'ERROR: tried to overwrite existing file "set" - aborting.'
                return
            shutil.copyfile(oname, 'set')
            # Use the freaky mse extension instead of zip.
            with zipfile.ZipFile(oname + '.mse-set', mode='w') as zf:
                try:
                    # Zip up the set file into oname.mse-set.
                    zf.write('set')
                finally:
                    if verbose:
                        print 'Made an MSE set file called ' + oname + '.mse-set.'
                    # The set file is useless outside the .mse-set, delete it.
                    os.remove('set')
    else:
        writecards(sys.stdout)
        sys.stdout.flush()
And see thy blood warm when thou feel'st it cold."""

# Load data
# wcd = WordContextDataset(corpus=tiny_corpus,
#                          context_size=2,
#                          min_word=1)
wcd = WordContextDataset(corpus_path="./data/alice.txt",
                         context_size=2,
                         min_word=1)
data_loader = DataLoader(wcd, batch_size=128, shuffle=True)

# Model
cbow = CBOW(vocab_size=wcd.vocab_size, embed_dim=100)

# Training Parameters
n_epoch = 1000
learning_rate = 0.001
optimizer = optim.SGD(cbow.parameters(), lr=learning_rate)
loss_fn = nn.NLLLoss()
loss_list = []

# Use GPU, if available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cbow.to(device)

for epoch_i in range(n_epoch):
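    # The snippet is truncated at the top of the epoch loop; this per-batch
    # body is an inferred completion mirroring the train() function later in
    # this section, not the original code.
    for batch_i, (X, Y) in enumerate(data_loader):
        X, Y = X.to(device), Y.to(device)
        cbow.zero_grad()
        pred_log_prob = cbow(X)
        loss = loss_fn(pred_log_prob, Y)
        loss.backward()
        loss_list.append(float(loss.item()))
        optimizer.step()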
def main(fname, oname, n=20, verbose=False):
    cbow = CBOW()
    realcards = jdecode.mtg_open_file(str(os.path.join(datadir, 'output.txt')), verbose=verbose)
    real_by_name = {c.name: c for c in realcards}
    lm = ngrams.build_ngram_model(realcards, 3, separate_lines=separate_lines, verbose=verbose)
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    stats = analysis.get_statistics(fname, lm=lm, sep=separate_lines, verbose=verbose)

    selected = []
    for i in range(0, len(cards)):
        if select_card(cards, stats, i):
            selected += [(i, cards[i])]

    limit = 3000
    random.shuffle(selected)
    #selected = selected[:limit]

    if verbose:
        print('computing nearest cards for ' + str(len(selected)) + ' candidates...')
    cbow_nearest = cbow.nearest_par(map(lambda (i, c): c, selected))
    for i in range(0, len(selected)):
        (j, card) = selected[i]
        selected[i] = (j, card, cbow_nearest[i])
    if verbose:
        print('...done')

    final = []
    for (i, card, nearest) in selected:
        for dist, rname in nearest:
            realcard = real_by_name[rname]
            if compare_to_real(card, realcard):
                final += [(i, card, realcard, dist)]
                break

    for (i, card, realcard, dist) in final:
        print '-- real --'
        print realcard.format()
        print '-- fake --'
        print card.format()
        print '-- stats --'
        perp_per = stats['ngram']['perp_per'][i]
        perp_max = stats['ngram']['perp_max'][i]
        print dist
        print perp_per
        print perp_max
        print '----'

    if oname is not None:
        with open(oname, 'wt') as ofile:
            ofile.write(utils.mse_prepend)
            for (i, card, realcard, dist) in final:
                name = realcard.name
                writecard(realcard, name, ofile)
                writecard(card, name, ofile)
            ofile.write('version control:\n\ttype: none\napprentice code: ')
        # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
        if os.path.isfile('set'):
            print 'ERROR: tried to overwrite existing file "set" - aborting.'
            return
        shutil.copyfile(oname, 'set')
        # Use the freaky mse extension instead of zip.
        with zipfile.ZipFile(oname + '.mse-set', mode='w') as zf:
            try:
                # Zip up the set file into oname.mse-set.
                zf.write('set')
            finally:
                if verbose:
                    print 'Made an MSE set file called ' + oname + '.mse-set.'
                # The set file is useless outside the .mse-set, delete it.
                os.remove('set')
def train(_=None, corpus=None, corpus_path=None, context_size=2, min_word=1,
          embed_dim=100, n_epoch=10, batch_size=32, learning_rate=0.001,
          shuffle=True, verbose_interval=1):
    if _:
        raise Exception("Don't pass parameters positionally; set every parameter by keyword.")

    # Load data
    wcd = WordContextDataset(corpus=corpus,
                             corpus_path=corpus_path,
                             context_size=context_size,
                             min_word=min_word)
    data_loader = DataLoader(wcd, batch_size=batch_size, shuffle=shuffle)

    # Model
    cbow = CBOW(vocab_size=wcd.vocab_size, embed_dim=embed_dim)

    # Training Parameters
    optimizer = optim.SGD(cbow.parameters(), lr=learning_rate)
    loss_fn = nn.NLLLoss()
    loss_list = []

    # Use GPU, if available.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    cbow.to(device)

    for epoch_i in range(n_epoch):
        for batch_i, (X, Y) in enumerate(data_loader):
            X, Y = X.to(device), Y.to(device)
            cbow.zero_grad()
            pred_log_prob = cbow(X)
            loss = loss_fn(pred_log_prob, Y)
            loss.backward()
            loss_list.append(float(loss.to('cpu').data.numpy()))
            optimizer.step()
        if epoch_i % verbose_interval == 0:
            print("loss : {:.3f}".format(loss_list[-1]))

    return {'wcd': wcd,
            'cbow': cbow,
            'loss_list': loss_list,
            'data_loader': data_loader}
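A usage sketch for train(), assuming the same ./data/alice.txt corpus used earlier in this section and matplotlib for inspecting the returned loss curve:

import matplotlib.pyplot as plt

result = train(corpus_path='./data/alice.txt', context_size=2, min_word=1,
               embed_dim=100, n_epoch=10, batch_size=32, learning_rate=0.001)

plt.plot(result['loss_list'])
plt.xlabel('batch')
plt.ylabel('NLL loss')
plt.show()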
print('- max_vocab_size:', max_vocab_size)
print('- min_word_freq:', min_word_freq)
print('- corpus:', corpus_file)
print()

train, word_to_id, id_to_word = get_vocab(corpus_file, max_vocab_size, min_word_freq)
vocab_size = len(word_to_id)
unk_rate = (train.count(word_to_id['UNK']) / len(train) * 100.0
            if 'UNK' in word_to_id.keys() else 0.0)

print('\n\033[92m[ statistics ]\033[0m')
print('- token_size:', len(train))
print('- vocab_size:', vocab_size)
print('- unk_rate: {:.2f}%'.format(unk_rate))

train_iter = WindowIterator(train, window_size, batch_size, max_epoch)
model = CBOW(vocab_size, hidden_size, window_size, train)
optimizer = Adam()
trainer = Word2vecTrainer(model, optimizer)

print('\n\033[92m[ progress ]\033[0m')
trainer.fit(train_iter, eval_interval=eval_interval)

word_vecs = model.word_vecs
if GPU:
    word_vecs = to_device(device=-1, x=word_vecs)

params = {}
params['word_vecs'] = word_vecs.astype(np.float16)
params['word_to_id'] = word_to_id
params['id_to_word'] = id_to_word
pkl_file = 'cbow_params.pkl'
"plot": False, "save_embeddings": True, # preprocessing variables "lemmatize": False, "stem": False, "remove_stopwords": False, "library": "nltk" # "nltk" or "spacy" } opts.update(args_dict) # random seed for initializing weights if not opts.seed is None: torch.manual_seed(opts.seed) # getting data vocab, train_loader, valid_loader = load_data(preprocess=True, **opts) # Run this once #vocab, train_loader, valid_loader = load_data(preprocess=False, **opts) # then you should only run this one opts.vocab_size = len(vocab) # creating model model = CBOW(opts.context_length, opts.vocab_size, opts.embedding_size) #model = Bengio(opts.context_length, opts.vocab_size, opts.embedding_size, opts.hidden_size) # training model final_statistics = train(model, train_loader, valid_loader, opts) # extracting word embeddings if opts.save_embeddings: embeddings = model.embedding.weight.data.numpy().T np.savetxt(f"{model.name.lower()}_word_vectors.csv", embeddings) print(f"word vectors saved to {model.name.lower()}_word_vectors.csv")
def main(fname, oname=None, verbose=True, encoding='std',
         gatherer=False, for_forum=False, for_mse=False,
         creativity=False, vdump=False, for_html=False):
    # there is a sane thing to do here (namely, produce both at the same time)
    # but we don't support it yet.
    if for_mse and for_html:
        print 'ERROR - decode.py - incompatible formats "mse" and "html"'
        return

    fmt_ordered = cardlib.fmt_ordered_default
    if encoding in ['std']:
        pass
    elif encoding in ['named']:
        fmt_ordered = cardlib.fmt_ordered_named
    elif encoding in ['noname']:
        fmt_ordered = cardlib.fmt_ordered_noname
    elif encoding in ['rfields']:
        pass
    elif encoding in ['old']:
        fmt_ordered = cardlib.fmt_ordered_old
    elif encoding in ['norarity']:
        fmt_ordered = cardlib.fmt_ordered_norarity
    elif encoding in ['vec']:
        pass
    elif encoding in ['custom']:
        ## put custom format decisions here ##########################
        ## end of custom format ######################################
        pass
    else:
        raise ValueError('encode.py: unknown encoding: ' + encoding)

    cards = jdecode.mtg_open_file(fname, verbose=verbose, fmt_ordered=fmt_ordered)

    if creativity:
        namediff = Namediff()
        cbow = CBOW()
        if verbose:
            print 'Computing nearest names...'
        nearest_names = namediff.nearest_par(map(lambda c: c.name, cards), n=3)
        if verbose:
            print 'Computing nearest cards...'
        nearest_cards = cbow.nearest_par(cards)
        for i in range(0, len(cards)):
            cards[i].nearest_names = nearest_names[i]
            cards[i].nearest_cards = nearest_cards[i]
        if verbose:
            print '...Done.'

    def hoverimg(cardname, dist, nd):
        truename = nd.names[cardname]
        code = nd.codes[cardname]
        namestr = ''
        if for_html:
            if code:
                namestr = ('<div class="hover_img"><a href="#">' + truename
                           + '<span><img src="http://magiccards.info/scans/en/' + code
                           + '" alt="image"/></span></a>' + ': ' + str(dist) + '</div>')
            else:
                namestr = '<div>' + truename + ': ' + str(dist) + '</div>'
        elif for_forum:
            namestr = '[card]' + truename + '[/card]' + ': ' + str(dist) + '\n'
        else:
            namestr = truename + ': ' + str(dist) + '\n'
        return namestr

    def writecards(writer):
        if for_mse:
            # have to prepend a massive chunk of formatting info
            writer.write(utils.mse_prepend)
        if for_html:
            # have to prepend html info
            writer.write(utils.html_prepend)

        for card in cards:
            if for_mse:
                writer.write(card.to_mse().encode('utf-8'))
                fstring = ''
                if card.json:
                    fstring += 'JSON:\n' + card.json + '\n'
                if card.raw:
                    fstring += 'raw:\n' + card.raw + '\n'
                fstring += '\n'
                fstring += card.format(gatherer=gatherer, for_forum=for_forum, vdump=vdump) + '\n'
                fstring = fstring.replace('<', '(').replace('>', ')')
                writer.write(('\n' + fstring[:-1]).replace('\n', '\n\t\t'))
            else:
                fstring = card.format(gatherer=gatherer, for_forum=for_forum,
                                      vdump=vdump, for_html=for_html)
                if creativity and for_html:
                    fstring = fstring[:-6]  # chop off the closing </div> to stick stuff in
                writer.write((fstring + '\n').encode('utf-8'))

            if creativity:
                cstring = '~~ closest cards ~~\n'
                nearest = card.nearest_cards
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                cstring += '~~ closest names ~~\n'
                nearest = card.nearest_names
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                if for_html:
                    cstring = '<hr><div>' + cstring.replace('\n', '<br>\n') + '</div>\n</div>'
                elif for_mse:
                    cstring = ('\n\n' + cstring[:-1]).replace('\n', '\n\t\t')
                writer.write(cstring.encode('utf-8'))

            writer.write('\n'.encode('utf-8'))

        if for_mse:
            # more formatting info
            writer.write('version control:\n\ttype: none\napprentice code: ')
        if for_html:
            # closing the html file
            writer.write(utils.html_append)

    if oname:
        if for_html:
            print oname
            # if ('.html' != oname[-])
            #     oname += '.html'
        if verbose:
            print 'Writing output to: ' + oname
        with open(oname, 'w') as ofile:
            writecards(ofile)
        if for_mse:
            # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
            if os.path.isfile('set'):
                print 'ERROR: tried to overwrite existing file "set" - aborting.'
                return
            shutil.copyfile(oname, 'set')
            # Use the freaky mse extension instead of zip.
            with zipfile.ZipFile(oname + '.mse-set', mode='w') as zf:
                try:
                    # Zip up the set file into oname.mse-set.
                    zf.write('set')
                finally:
                    if verbose:
                        print 'Made an MSE set file called ' + oname + '.mse-set.'
                    # The set file is useless outside the .mse-set, delete it.
                    os.remove('set')
    else:
        writecards(sys.stdout)
        sys.stdout.flush()
# hyperparameter settings
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# load the data
corpus, wordtoid, idtoword = ptb.load_data('train')
vocab_size = len(wordtoid)

contexts, target = create_contexts_target(corpus, window_size)
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)

# build the model, optimizer, and trainer
model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# start training
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# save the data needed for later use
word_vecs = model.word_vecs
if config.GPU:
    word_vecs = to_cpu(word_vecs)
params = {}
params['word_vecs'] = word_vecs.astype(np.float16)
params['wordtoid'] = wordtoid
def run_cbow():
    vec = CBOW()
    vec.train()
parser.add_argument('--optimizer', type=str, default='SGD',
                    help='Model optimization algorithm')
parser.add_argument('--loss_function', type=str, default='ce',
                    help='Loss function that would be minimized')
parser.add_argument('--epochs', type=int, default=10,
                    help='Number of iterations')
args = parser.parse_args()

print("Begin Reading Corpus Data and Tokenizing")
data_reader = CBOWCorpusReader(args.corpus)
grams = data_reader.get_ngram_words()
words_freq = data_reader.get_words_frequency()
word2idx = data_reader.get_word2idx()
idx2word = data_reader.get_idx2word()
print("End Reading the Data")

args.vocab_size = len(words_freq)
cbow = CBOW(args)
cbow.init_session()
cbow.build()

print("Begin Training")
learning_curve = []
for epoch in range(0, args.epochs):
    error = 0.0
    print(epoch)
    for batch in grams:
        x_input, y_output, x_input_reshape = [], [], []
        for item in batch:
            def get_one_hot(idx):
                one_hot = ([0] * (args.vocab_size + 1))
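The snippet above cuts off inside get_one_hot. A self-contained sketch of how such a helper typically finishes; the extra slot in the (vocab_size + 1)-wide vector presumably reserves an index for out-of-vocabulary or padding tokens, which is an assumption about this codebase:

def get_one_hot(idx, vocab_size):
    # one extra slot, matching the (vocab_size + 1)-wide vector above
    one_hot = [0] * (vocab_size + 1)
    one_hot[idx] = 1
    return one_hot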
""" vocab_size = len(word_to_id) # 10000 contexts, target = create_contexts_target(corpus, window_size) #print(contexts.shape, target.shape);exit(1) """ 929589개의 단어가 window_size만큼 양옆으로 짤리고 나머지를 중심단어의 targets으로 보고, 중심단어를 기준으로 양옆 5단어를 contexts로 본다. contexts.shape : (929579, 10) # 중심단어를 기준으로 양옆 10개의 단어들의 나열 되어 있음. target.shape : (929579,) # 중심단어가 나열되어 있음. """ if config.GPU: contexts, target = to_gpu(contexts), to_gpu(target) # 모델 등 생성 model = CBOW(vocab_size, hidden_size, window_size, corpus) # 인스턴스 생성, 모델생성. ( 인접단어로 중심단어 추론 ) # model = SkipGram(vocab_size, hidden_size, window_size, corpus) optimizer = Adam() trainer = Trainer(model, optimizer) # 학습 시작 trainer.fit(contexts, target, max_epoch, batch_size) trainer.plot() # 나중에 사용할 수 있도록 필요한 데이터 저장 word_vecs = model.word_vecs if config.GPU: word_vecs = to_cpu(word_vecs) params = {} params['word_vecs'] = word_vecs.astype(np.float16) params['word_to_id'] = word_to_id
###############################################################################
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Visualize word vectors with tensorboard')
    parser.add_argument('--init_vecs', required=True, dest='init_vecs')
    parser.add_argument('--model_ckpt', required=True, dest='model_ckpt')
    parser.add_argument('--num_embed', required=True, type=int, dest='num_embed')
    parser.add_argument('--embed_metadata', required=True, dest='metadata')
    parser.add_argument('--graph_folder', required=True, dest='graph_folder')
    args = parser.parse_args()

    # Initialize model
    word_vectors = dill.load(open(args.init_vecs, 'rb'))
    model = CBOW(len(word_vectors), word_vectors, 5e-4)

    # Load model of last epoch
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, args.model_ckpt)

        # Visualizing embeddings
        final_embed = sess.run(model.init_vecs)
        embedding_var = tf.Variable(final_embed[:args.num_embed], name='embedding')
        sess.run(embedding_var.initializer)

        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter(args.graph_folder)

        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        embedding.metadata_path = os.path.join(args.graph_folder, args.metadata)

        projector.visualize_embeddings(summary_writer, config)
model_hard_file ="GRU_REG_HARD_1024_350_0.001_mse.pt" model_hard = GRU_REG(hard_dataset.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space) test_easy_and_hard(model_easy, model_hard, model_easy_file, model_hard_file, validate_gru_reg_model, easy_dataset, easy_testset, hard_dataset, hard_testset) # GRU best_top_params = [300, 56, 0.0001] embedding_space, hidden_layer_dim, learning_rate = best_top_params model_easy_file = "GRU_EASY_56_300_0.0001_mse.pt" model_easy = GRU(easy_dataset.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space) model_hard_file = "GRU_HARD_56_300_0.0001_mse.pt" model_hard = GRU(hard_dataset.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space) test_easy_and_hard(model_easy, model_hard, model_easy_file, model_hard_file, validate_gru_model, easy_dataset, easy_testset, hard_dataset, hard_testset) # CBOW NAIVE best_top_params = [150, 256, 0.0001] embedding_space, hidden_layer_dim, learning_rate = best_top_params model_easy_file = "CBOW_NAIVE_EASY_256_150_0.0001_smooth_l1.pt" model_easy = CBOW(easy_dataset_cbow.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space) model_hard_file = "CBOW_NAIVE_HARD_256_150_0.0001_smooth_l1.pt" model_hard = CBOW(hard_dataset_cbow.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space) test_easy_and_hard(model_easy, model_hard, model_easy_file, model_hard_file, validate_cbow_model, easy_dataset_cbow, easy_testset, hard_dataset_cbow, hard_testset) # CBOW REGRESSION best_top_params = [150, 256, 0.0001] embedding_space, hidden_layer_dim, learning_rate = best_top_params model_easy_file = "CBOW_REG_EASY_256_300_0.001_smooth_l1.pt" model_easy = CBOW_REG(easy_dataset_cbow.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space) model_hard_file = "CBOW_REG_HARD_256_300_0.001_smooth_l1.pt" model_hard = CBOW_REG(hard_dataset_cbow.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space) test_easy_and_hard(model_easy, model_hard, model_easy_file, model_hard_file, validate_cbow_model, easy_dataset_cbow, easy_testset, hard_dataset_cbow, hard_testset)