Example #1
0
 def setUp(self):
     """Prepare a toy corpus, a small CBOW model and its contexts/target pair."""
     sentence = 'You said good-bye and I said hello.'
     counter = CountBasedMethod()
     tokens = counter.text_to_word_list(sentence)
     vocab, _, self.corpus = counter.preprocess(tokens)

     # Model dimensions: vocabulary size from the corpus, 2 hidden units,
     # window size 1.
     n_words = len(vocab)
     n_hidden, window = 2, 1
     self.cbow = CBOW(n_words, n_hidden, window, self.corpus)

     self.simple_word2vec = SimpleWord2Vec()
     pair = self.simple_word2vec.create_contexts_target(self.corpus)
     self.contexts, self.target = pair
Example #2
0
def main(fname, oname, verbose=True, parallel=True):
    """Compute each card's nearest name and nearest card and dump the distances.

    Args:
        fname: input path handed to ``jdecode.mtg_open_file``.
        oname: path of the output file; it is (over)written as UTF-8 text.
        verbose: when true, print progress messages (also forwarded to
            ``jdecode.mtg_open_file``).
        parallel: when true, use the batch ``nearest_par`` methods; otherwise
            call ``nearest`` once per card.

    Each output line has the form ``index|name|name_distance|card_distance``.
    As a side effect every card gets ``nearest_names`` and ``nearest_cards``
    attributes.
    """
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)

    # this could reasonably be some separate function
    # might make sense to merge cbow and namediff and have this be the main interface
    namediff = Namediff()
    cbow = CBOW()

    if verbose:
        print('Computing nearest names...')
    if parallel:
        nearest_names = namediff.nearest_par([c.name for c in cards], n=1)
    else:
        nearest_names = [namediff.nearest(c.name, n=1) for c in cards]

    if verbose:
        print('Computing nearest cards...')
    if parallel:
        nearest_cards = cbow.nearest_par(cards, n=1)
    else:
        nearest_cards = [cbow.nearest(c, n=1) for c in cards]

    # Index explicitly (rather than zip) so a short result list still fails
    # loudly instead of silently dropping cards.
    for i, card in enumerate(cards):
        card.nearest_names = nearest_names[i]
        card.nearest_cards = nearest_cards[i]

    # Nearest encodings by text edit distance (namediff.nearest_card[_par])
    # are deliberately not computed: that took ~30 hours on 8 cores for a
    # 10MB dump.

    if verbose:
        print('...Done.')

    # write to a file to store the data, this is a terribly long computation
    # we could also just store this same info in the cards themselves as more fields...
    sep = '|'
    # Open in text mode with an explicit encoding and write str: writing
    # ``ostr.encode('utf-8')`` (bytes) to a text-mode file raises TypeError
    # on Python 3.
    with open(oname, 'w', encoding='utf-8') as ofile:
        for i, card in enumerate(cards):
            # Only the distance of the single nearest neighbour is stored.
            ndist, _ = card.nearest_names[0]
            cdist, _ = card.nearest_cards[0]
            fields = [str(i), card.name, str(ndist), str(cdist)]
            ofile.write(sep.join(fields) + '\n')
def main(fname, oname, verbose = True, parallel = True):
    """Compute each card's nearest name and nearest card and dump the distances.

    NOTE: this definition uses Python 2 syntax (``print`` statements).

    Args:
        fname: input path handed to ``jdecode.mtg_open_file``.
        oname: path of the output file, opened in ``'w'`` mode.
        verbose: when true, print progress messages (also forwarded to
            ``jdecode.mtg_open_file``).
        parallel: when true, use the batch ``nearest_par`` methods; otherwise
            call ``nearest`` once per card.

    Each output line has the form ``index|name|name_distance|card_distance``.
    As a side effect every card gets ``nearest_names`` and ``nearest_cards``
    attributes.
    """
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)

    # this could reasonably be some separate function
    # might make sense to merge cbow and namediff and have this be the main interface
    namediff = Namediff()
    cbow = CBOW()

    if verbose:
        print 'Computing nearest names...'
    if parallel:
        nearest_names = namediff.nearest_par(map(lambda c: c.name, cards), n=1)
    else:
        nearest_names = [namediff.nearest(c.name, n=1) for c in cards]

    if verbose:
        print 'Computing nearest cards...'
    if parallel:
        nearest_cards = cbow.nearest_par(cards, n=1)
    else:
        nearest_cards = [cbow.nearest(c, n=1) for c in cards]

    # Attach the results to the cards; both result lists are indexed in the
    # same order as ``cards``.
    for i in range(0, len(cards)):
        cards[i].nearest_names = nearest_names[i]
        cards[i].nearest_cards = nearest_cards[i]

    # # unfortunately this takes ~30 hours on 8 cores for a 10MB dump
    # if verbose:
    #     print 'Computing nearest encodings by text edit distance...'
    # if parallel:
    #     nearest_cards_text = namediff.nearest_card_par(cards, n=1)
    # else:
    #     nearest_cards_text = [namediff.nearest_card(c, n=1) for c in cards]

    if verbose:
        print '...Done.'

    # write to a file to store the data, this is a terribly long computation
    # we could also just store this same info in the cards themselves as more fields...
    sep = '|'
    with open(oname, 'w') as ofile:
        for i in range(0, len(cards)):
            card = cards[i]
            ostr = str(i) + sep + card.name + sep
            # Only the distance of the first (nearest) entry is written.
            ndist, _ = card.nearest_names[0]
            ostr += str(ndist) + sep
            cdist, _ = card.nearest_cards[0]
            ostr += str(cdist) + '\n'
            # tdist, _ = nearest_cards_text[i][0]
            # ostr += str(tdist) + '\n'
            # NOTE(review): writing the encoded value to a text-mode file
            # only works on Python 2; on Python 3 this would need a str.
            ofile.write(ostr.encode('utf-8'))
Example #4
0
def main():
    """Train a CBOW model on the ``ptb`` 'train' data and pickle the result.

    The learned word vectors (cast to float16) and both vocabulary mappings
    are written to ``cbow_params.pkl``.
    """
    # Hyperparameters
    window_size, hidden_size = 5, 100
    batch_size, max_epoch = 100, 10

    # Load the data
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    contexts, target = create_contexts_target(corpus, window_size)

    # Build the model, optimizer and trainer
    model = CBOW(vocab_size, hidden_size, window_size, corpus)
    trainer = Trainer(model, Adam())

    # Start training, then plot the trainer's results
    trainer.fit(contexts, target, max_epoch, batch_size)
    trainer.plot()

    # Save the data needed for later use
    params = {
        'word_vecs': model.word_vecs.astype(np.float16),
        'word_to_id': word_to_id,
        'id_to_word': id_to_word,
    }
    with open('cbow_params.pkl', 'wb') as out:
        pickle.dump(params, out, -1)
Example #5
0
def main() -> None:
    """Fit a CBOW model on the ``ptb`` 'train' data and save its parameters.

    Writes a pickle named ``cbow_params.pkl`` holding the float16 word
    vectors together with the ``word_to_id`` / ``id_to_word`` mappings.
    """
    # Hyperparameters.
    window, hidden = 5, 100
    batch, epochs = 100, 10

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    n_vocab = len(word_to_id)
    contexts, target = create_context_target(corpus, window)

    model = CBOW(n_vocab, hidden, window, corpus)
    trainer = Trainer(model, Adam())
    trainer.fit(contexts, target, epochs, batch)
    # Plotting (trainer.plot()) is left disabled here.

    # Assemble the payload key by key, in the order it is stored.
    params = {}
    params['word_vecs'] = model.word_vecs.astype(np.float16)
    params['word_to_id'] = word_to_id
    params['id_to_word'] = id_to_word

    with open('cbow_params.pkl', 'wb') as out:
        pickle.dump(params, out, -1)
Example #6
0
class TestSimpleCBOW(unittest.TestCase):
    """Forward-loss and gradient-update checks for CBOW on a toy sentence."""

    def setUp(self):
        """Build the toy corpus, the model under test and its training pairs."""
        sentence = 'You said good-bye and I said hello.'
        counter = CountBasedMethod()
        tokens = counter.text_to_word_list(sentence)
        vocab, _, self.corpus = counter.preprocess(tokens)

        # Vocabulary size from the corpus, 2 hidden units, window size 1.
        n_words = len(vocab)
        n_hidden, window = 2, 1
        self.cbow = CBOW(n_words, n_hidden, window, self.corpus)

        self.simple_word2vec = SimpleWord2Vec()
        pair = self.simple_word2vec.create_contexts_target(self.corpus)
        self.contexts, self.target = pair

    def test_forward(self):
        """The forward loss, rounded to 3 decimals, matches the pinned value."""
        result = self.cbow.forward(self.contexts, self.target)
        self.assertEqual(4.159, round(result, 3))

    def test_grads_diff(self):
        """A forward/backward pass changes exactly the expected gradient rows."""
        # Snapshot the gradients before running the model.
        first_layer, *_ = self.cbow.in_layers
        (in_grad,) = first_layer.grads
        in_grad_before = copy.copy(in_grad)
        ns_grad, *_ = self.cbow.ns_loss_layer.grads
        ns_grad_before = copy.copy(ns_grad)

        self.cbow.forward(self.contexts, self.target)
        self.cbow.backward()

        # Read the gradients again and compare element-wise with the snapshot;
        # True marks an entry that did not change.
        first_layer, *_ = self.cbow.in_layers
        (in_grad_after,) = first_layer.grads
        ns_grad_after, *_ = self.cbow.ns_loss_layer.grads
        in_unchanged = in_grad_before == in_grad_after
        ns_unchanged = ns_grad_before == ns_grad_after

        same, diff = [True, True], [False, False]
        # Input layer: rows 0-4 changed, rows 5-6 untouched.
        assert_array_equal(
            np.array([diff, diff, diff, diff, diff, same, same]),
            in_unchanged)
        # Negative-sampling loss layer: rows 0 and 6 untouched, rows 1-5 changed.
        assert_array_equal(
            np.array([same, diff, diff, diff, diff, diff, same]),
            ns_unchanged)
Example #7
0
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        dest='num_epochs')
    parser.add_argument('--models_folder',
                        default='./word_vector_models',
                        dest='folder')
    parser.add_argument('--graph_folder',
                        default='./trump_graph',
                        dest='graphs')
    args = parser.parse_args()

    # Read the initial word vectors. The context manager closes the file
    # handle deterministically instead of leaking it.
    # NOTE: dill.load can execute arbitrary code while unpickling; only point
    # --init_vec / --samples at files you trust.
    with open(args.init_vec, 'rb') as vec_file:
        word_vectors = dill.load(vec_file)

    cbow = CBOW(len(word_vectors), word_vectors, args.lr)
    init = tf.global_variables_initializer()

    # Fit the model
    if args.mode == 'train':
        # Read training samples
        with open(args.samples, 'rb') as samples_file:
            inputs = dill.load(samples_file)
        with tf.Session() as sess:
            sess.run(init)
            cbow.fit(sess,
                     inputs,
                     embed_data_path=args.embed_metadata,
                     minibatch_size=args.minibatch_size,
                     num_epochs=args.num_epochs,
                     folder=args.folder,
                     graph_folder=args.graphs)
Example #8
0
def main(fname, oname, n=20, verbose=False):
    """Pair selected cards from ``fname`` with a matching real card and report them.

    Real cards are read from ``output.txt`` under ``datadir``; cards from
    ``fname`` that pass ``select_card`` are shuffled, their nearest real cards
    are computed with ``CBOW.nearest_par``, and the first neighbour accepted by
    ``compare_to_real`` is kept. Every kept pair is printed together with its
    distance and n-gram perplexity statistics.

    Args:
        fname: path of the card file to analyse.
        oname: output path for the MSE set text, or ``None`` to skip writing.
            When given, ``oname + '.mse-set'`` is produced as well.
        n: accepted for interface compatibility; not used by this function.
        verbose: when true, print progress messages.
    """
    cbow = CBOW()
    realcards = jdecode.mtg_open_file(str(os.path.join(datadir, 'output.txt')), verbose=verbose)
    real_by_name = {c.name: c for c in realcards}
    lm = ngrams.build_ngram_model(realcards, 3, separate_lines=separate_lines, verbose=verbose)
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    stats = analysis.get_statistics(fname, lm=lm, sep=separate_lines, verbose=verbose)

    # Keep the original index with each card: the statistics are looked up
    # by that index after shuffling.
    selected = [(i, card) for i, card in enumerate(cards)
                if select_card(cards, stats, i)]

    # The selection is shuffled but not truncated (a cap of 3000 candidates
    # used to be applied here and is disabled).
    random.shuffle(selected)

    if verbose:
        print('computing nearest cards for ' + str(len(selected)) + ' candindates...')
    cbow_nearest = cbow.nearest_par([card for _, card in selected])
    selected = [(j, card, cbow_nearest[k])
                for k, (j, card) in enumerate(selected)]
    if verbose:
        print('...done')

    # For each candidate keep only the first neighbour that compares as a match.
    final = []
    for (i, card, nearest) in selected:
        for dist, rname in nearest:
            realcard = real_by_name[rname]
            if compare_to_real(card, realcard):
                final.append((i, card, realcard, dist))
                break

    for (i, card, realcard, dist) in final:
        print('-- real --')
        print(realcard.format())
        print('-- fake --')
        print(card.format())
        print('-- stats --')
        perp_per = stats['ngram']['perp_per'][i]
        perp_max = stats['ngram']['perp_max'][i]
        print(dist)
        print(perp_per)
        print(perp_max)
        print('----')

    if oname is None:
        return

    with open(oname, 'wt') as ofile:
        ofile.write(utils.mse_prepend)
        for (_, card, realcard, _) in final:
            name = realcard.name
            writecard(realcard, name, ofile)
            writecard(card, name, ofile)
        ofile.write('version control:\n\ttype: none\napprentice code: ')

    # The output file is closed (and therefore flushed) at this point. Copying
    # it while it was still open could pick up a truncated file because the
    # last writes may still sit in the write buffer.

    # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
    if os.path.isfile('set'):
        print('ERROR: tried to overwrite existing file "set" - aborting.')
        return
    shutil.copyfile(oname, 'set')
    try:
        # Use the freaky mse extension instead of zip.
        with zipfile.ZipFile(oname + '.mse-set', mode='w') as zf:
            # Zip up the set file into oname.mse-set.
            zf.write('set')
        if verbose:
            print('Made an MSE set file called ' + oname + '.mse-set.')
    finally:
        # The set file is useless outside the .mse-set, delete it.
        os.remove('set')
Example #9
0
def main(fname,
         oname=None,
         verbose=True,
         encoding='std',
         gatherer=False,
         for_forum=False,
         for_mse=False,
         creativity=False,
         vdump=False,
         for_html=False):
    """Read cards from ``fname`` and write them out in the requested format.

    NOTE: this definition uses Python 2 syntax (``print`` statements).

    Args:
        fname: input path handed to ``jdecode.mtg_open_file``.
        oname: output file path; when falsy the cards are written to
            ``sys.stdout`` instead.
        verbose: when true, print progress messages (also forwarded to
            ``jdecode.mtg_open_file``).
        encoding: name of the field ordering used to read the cards. 'named',
            'noname', 'old' and 'norarity' select the matching
            ``cardlib.fmt_ordered_*`` value; 'std', 'rfields', 'vec' and
            'custom' keep ``cardlib.fmt_ordered_default``.
        gatherer: forwarded to ``card.format``.
        for_forum: forwarded to ``card.format`` in the plain/MSE path; also
            selects the ``[card]...[/card]`` markup in ``hoverimg``.
        for_mse: write MSE set text and, when ``oname`` is given, also build
            ``oname + '.mse-set'``.
        creativity: compute the nearest names and nearest cards for every
            card and append them to the output.
        vdump: forwarded to ``card.format``.
        for_html: write HTML output, grouped by the segments returned by
            ``sort_colors``.

    Raises:
        ValueError: if ``encoding`` is not one of the recognised names.

    Returns:
        None. Returns early (after printing an error) when both ``for_mse``
        and ``for_html`` are set, or when a file named 'set' already exists
        in the MSE path.
    """

    # there is a sane thing to do here (namely, produce both at the same time)
    # but we don't support it yet.
    if for_mse and for_html:
        print 'ERROR - decode.py - incompatible formats "mse" and "html"'
        return

    fmt_ordered = cardlib.fmt_ordered_default

    if encoding in ['std']:
        pass
    elif encoding in ['named']:
        fmt_ordered = cardlib.fmt_ordered_named
    elif encoding in ['noname']:
        fmt_ordered = cardlib.fmt_ordered_noname
    elif encoding in ['rfields']:
        pass
    elif encoding in ['old']:
        fmt_ordered = cardlib.fmt_ordered_old
    elif encoding in ['norarity']:
        fmt_ordered = cardlib.fmt_ordered_norarity
    elif encoding in ['vec']:
        pass
    elif encoding in ['custom']:
        ## put custom format decisions here ##########################

        ## end of custom format ######################################
        pass
    else:
        # NOTE(review): this message names 'encode.py' while the error above
        # names 'decode.py' - confirm which one is intended.
        raise ValueError('encode.py: unknown encoding: ' + encoding)

    cards = jdecode.mtg_open_file(fname,
                                  verbose=verbose,
                                  fmt_ordered=fmt_ordered)

    if creativity:
        # namediff and cbow only exist when creativity is set; the nested
        # writers below only touch namediff under the same flag.
        namediff = Namediff()
        cbow = CBOW()
        if verbose:
            print 'Computing nearest names...'
        nearest_names = namediff.nearest_par(map(lambda c: c.name, cards), n=3)
        if verbose:
            print 'Computing nearest cards...'
        nearest_cards = cbow.nearest_par(cards)
        for i in range(0, len(cards)):
            cards[i].nearest_names = nearest_names[i]
            cards[i].nearest_cards = nearest_cards[i]
        if verbose:
            print '...Done.'

    def hoverimg(cardname, dist, nd):
        # Format one "name: distance" entry for a neighbour. ``nd`` supplies
        # the ``names`` and ``codes`` lookups keyed by ``cardname``; the
        # markup depends on the for_html / for_forum flags of main().
        truename = nd.names[cardname]
        code = nd.codes[cardname]
        namestr = ''
        if for_html:
            if code:
                namestr = (
                    '<div class="hover_img"><a href="#">' + truename +
                    '<span><img style="background: url(http://magiccards.info/scans/en/'
                    + code + ');" alt=""/></span></a>' + ': ' + str(dist) +
                    '\n</div>\n')
            else:
                namestr = '<div>' + truename + ': ' + str(dist) + '</div>'
        elif for_forum:
            namestr = '[card]' + truename + '[/card]' + ': ' + str(dist) + '\n'
        else:
            namestr = truename + ': ' + str(dist) + '\n'
        return namestr

    def writecards(writer):
        # Write every card to ``writer`` (an object with a ``write`` method)
        # in the format selected by the flags of main().
        if for_mse:
            # have to prepend a massive chunk of formatting info
            writer.write(utils.mse_prepend)

        if for_html:
            # have to prepend html info
            writer.write(utils.html_prepend)
            # separate the write function to allow for writing smaller chunks of cards at a time
            segments = sort_colors(cards)
            for i in range(len(segments)):
                # order the cards of this color segment by card type (sort_type)
                segments[i] = sort_type(segments[i])
                # this allows card boxes to be colored for each color
                # for coloring of each box separately cardlib.Card.format() must change non-minimally
                writer.write('<div id="' + utils.segment_ids[i] + '">')
                writehtml(writer, segments[i])
                writer.write("</div><hr>")
            # closing the html file
            writer.write(utils.html_append)
            return  # break out of the write cards function to avoid writing cards twice

        for card in cards:
            if for_mse:
                writer.write(card.to_mse().encode('utf-8'))
                fstring = ''
                if card.json:
                    fstring += 'JSON:\n' + card.json + '\n'
                if card.raw:
                    fstring += 'raw:\n' + card.raw + '\n'
                fstring += '\n'
                fstring += card.format(
                    gatherer=gatherer, for_forum=for_forum, vdump=vdump) + '\n'
                # Angle brackets are replaced with parentheses in the text
                # block written for MSE.
                fstring = fstring.replace('<', '(').replace('>', ')')
                # Drop the final newline and indent every line by two tabs.
                writer.write(('\n' + fstring[:-1]).replace('\n', '\n\t\t'))
            else:
                fstring = card.format(gatherer=gatherer,
                                      for_forum=for_forum,
                                      vdump=vdump,
                                      for_html=for_html)
                writer.write((fstring + '\n').encode('utf-8'))

            if creativity:
                cstring = '~~ closest cards ~~\n'
                nearest = card.nearest_cards
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                cstring += '~~ closest names ~~\n'
                nearest = card.nearest_names
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                if for_mse:
                    cstring = ('\n\n' + cstring[:-1]).replace('\n', '\n\t\t')
                writer.write(cstring.encode('utf-8'))

            writer.write('\n'.encode('utf-8'))

        if for_mse:
            # more formatting info
            writer.write('version control:\n\ttype: none\napprentice code: ')

    def writehtml(writer, card_set):
        # Write the cards of one HTML segment; always formats with
        # for_forum=True regardless of the flag passed to main().
        for card in card_set:
            fstring = card.format(gatherer=gatherer,
                                  for_forum=True,
                                  vdump=vdump,
                                  for_html=for_html)
            if creativity:
                fstring = fstring[:
                                  -6]  # chop off the closing </div> to stick stuff in
            writer.write((fstring + '\n').encode('utf-8'))

            if creativity:
                cstring = '~~ closest cards ~~\n<br>\n'
                nearest = card.nearest_cards
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                cstring += "<br>\n"
                cstring += '~~ closest names ~~\n<br>\n'
                nearest = card.nearest_names
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                # The trailing '</div>' re-closes the card div chopped above.
                cstring = '<hr><div>' + cstring + '</div>\n</div>'
                writer.write(cstring.encode('utf-8'))

            writer.write('\n'.encode('utf-8'))

    # Sorting by colors
    def sort_colors(card_set):
        # Split ``card_set`` into eight lists, returned in the order: white,
        # blue, black, red, green, multicolor, colorless, lands.
        # Initialize sections
        red_cards = []
        blue_cards = []
        green_cards = []
        black_cards = []
        white_cards = []
        multi_cards = []
        colorless_cards = []
        lands = []
        for card in card_set:
            # More than one color always wins over the single-color buckets.
            if len(card.get_colors()) > 1:
                multi_cards += [card]
                continue
            if 'R' in card.get_colors():
                red_cards += [card]
                continue
            elif 'U' in card.get_colors():
                blue_cards += [card]
                continue
            elif 'B' in card.get_colors():
                black_cards += [card]
                continue
            elif 'G' in card.get_colors():
                green_cards += [card]
                continue
            elif 'W' in card.get_colors():
                white_cards += [card]
                continue
            else:
                # No color at all: lands get their own bucket, everything
                # else is colorless.
                if "land" in card.get_types():
                    lands += [card]
                    continue
                colorless_cards += [card]
        return [
            white_cards, blue_cards, black_cards, red_cards, green_cards,
            multi_cards, colorless_cards, lands
        ]

    def sort_type(card_set):
        # Group cards by the first entry of ``sorting`` found in their types,
        # keeping the input order inside each group; cards matching none of
        # the entries go last.
        sorting = [
            "creature", "enchantment", "instant", "sorcery", "artifact",
            "planeswalker"
        ]
        # One bucket per entry of ``sorting`` plus a final catch-all bucket.
        sorted_cards = [[], [], [], [], [], [], []]
        sorted_set = []
        for card in card_set:
            types = card.get_types()
            for i in range(len(sorting)):
                if sorting[i] in types:
                    sorted_cards[i] += [card]
                    break
            else:
                # for/else: reached only when no type matched (no break).
                sorted_cards[6] += [card]
        for value in sorted_cards:
            for card in value:
                sorted_set += [card]
        return sorted_set

    def sort_cmc(card_set):
        # Order cards by ascending ``get_cmc()`` value, keeping the input
        # order among cards with the same value.
        # NOTE(review): nothing inside main() calls this helper.
        sorted_cards = []
        sorted_set = []
        for card in card_set:
            # make sure there is an empty set for each CMC
            while len(sorted_cards) - 1 < card.get_cmc():
                sorted_cards += [[]]
            # add card to correct set of CMC values
            sorted_cards[card.get_cmc()] += [card]
        # combine each set of CMC valued cards together
        for value in sorted_cards:
            for card in value:
                sorted_set += [card]
        return sorted_set

    if oname:
        if for_html:
            print oname
            # if ('.html' != oname[-])
            #     oname += '.html'
        if verbose:
            print 'Writing output to: ' + oname
        with open(oname, 'w') as ofile:
            writecards(ofile)
        # The output file is closed here, before it is copied below.
        if for_mse:
            # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
            if os.path.isfile('set'):
                print 'ERROR: tried to overwrite existing file "set" - aborting.'
                return
            shutil.copyfile(oname, 'set')
            # Use the freaky mse extension instead of zip.
            with zipfile.ZipFile(oname + '.mse-set', mode='w') as zf:
                try:
                    # Zip up the set file into oname.mse-set.
                    zf.write('set')
                finally:
                    # Runs whether or not zf.write raised, so the message
                    # below is printed in both cases.
                    if verbose:
                        print 'Made an MSE set file called ' + oname + '.mse-set.'
                    # The set file is useless outside the .mse-set, delete it.
                    os.remove('set')
    else:
        writecards(sys.stdout)
        sys.stdout.flush()
Example #10
0
And see thy blood warm when thou feel'st it cold."""

# Dataset. An alternative that builds it from the in-memory `tiny_corpus`
# string is kept for reference:
# wcd = WordContextDataset(corpus=tiny_corpus, context_size=2, min_word=1)
wcd = WordContextDataset(
    corpus_path="./data/alice.txt",
    context_size=2,
    min_word=1,
)
data_loader = DataLoader(wcd, batch_size=128, shuffle=True)

# Model, sized from the dataset's vocabulary.
cbow = CBOW(vocab_size=wcd.vocab_size, embed_dim=100)

# Training parameters.
n_epoch, learning_rate = 1000, 0.001

optimizer = optim.SGD(cbow.parameters(), lr=learning_rate)
loss_fn = nn.NLLLoss()
loss_list = []

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
cbow.to(device)

for epoch_i in range(n_epoch):
Example #11
0
def main(fname, oname, n=20, verbose=False):
    """Pair selected cards from ``fname`` with a matching real card and report them.

    Real cards are read from ``output.txt`` under ``datadir``; cards from
    ``fname`` that pass ``select_card`` are shuffled, their nearest real cards
    are computed with ``CBOW.nearest_par``, and the first neighbour accepted by
    ``compare_to_real`` is kept. Every kept pair is printed together with its
    distance and n-gram perplexity statistics.

    The body only uses constructs that behave the same on Python 2 and
    Python 3 (single-argument ``print(...)`` calls, no tuple-parameter lambda).

    Args:
        fname: path of the card file to analyse.
        oname: output path for the MSE set text, or ``None`` to skip writing.
            When given, ``oname + '.mse-set'`` is produced as well.
        n: accepted for interface compatibility; not used by this function.
        verbose: when true, print progress messages.
    """
    cbow = CBOW()
    realcards = jdecode.mtg_open_file(str(os.path.join(datadir, 'output.txt')), verbose=verbose)
    real_by_name = {c.name: c for c in realcards}
    lm = ngrams.build_ngram_model(realcards, 3, separate_lines=separate_lines, verbose=verbose)
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    stats = analysis.get_statistics(fname, lm=lm, sep=separate_lines, verbose=verbose)

    # Keep the original index with each card: the statistics are looked up
    # by that index after shuffling.
    selected = [(i, card) for i, card in enumerate(cards)
                if select_card(cards, stats, i)]

    # The selection is shuffled but not truncated (a cap of 3000 candidates
    # used to be applied here and is disabled).
    random.shuffle(selected)

    if verbose:
        print('computing nearest cards for ' + str(len(selected)) + ' candindates...')
    # A list comprehension replaces map(lambda (i, c): c, ...): tuple
    # parameters in lambdas are a syntax error on Python 3.
    cbow_nearest = cbow.nearest_par([card for (_, card) in selected])
    selected = [(j, card, cbow_nearest[k])
                for k, (j, card) in enumerate(selected)]
    if verbose:
        print('...done')

    # For each candidate keep only the first neighbour that compares as a match.
    final = []
    for (i, card, nearest) in selected:
        for dist, rname in nearest:
            realcard = real_by_name[rname]
            if compare_to_real(card, realcard):
                final.append((i, card, realcard, dist))
                break

    for (i, card, realcard, dist) in final:
        print('-- real --')
        print(realcard.format())
        print('-- fake --')
        print(card.format())
        print('-- stats --')
        perp_per = stats['ngram']['perp_per'][i]
        perp_max = stats['ngram']['perp_max'][i]
        print(dist)
        print(perp_per)
        print(perp_max)
        print('----')

    if oname is None:
        return

    with open(oname, 'wt') as ofile:
        ofile.write(utils.mse_prepend)
        for (_, card, realcard, _) in final:
            name = realcard.name
            writecard(realcard, name, ofile)
            writecard(card, name, ofile)
        ofile.write('version control:\n\ttype: none\napprentice code: ')

    # The output file is closed (and therefore flushed) at this point. Copying
    # it while it was still open could pick up a truncated file because the
    # last writes may still sit in the write buffer.

    # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
    if os.path.isfile('set'):
        print('ERROR: tried to overwrite existing file "set" - aborting.')
        return
    shutil.copyfile(oname, 'set')
    try:
        # Use the freaky mse extension instead of zip.
        with zipfile.ZipFile(oname + '.mse-set', mode='w') as zf:
            # Zip up the set file into oname.mse-set.
            zf.write('set')
        if verbose:
            print('Made an MSE set file called ' + oname + '.mse-set.')
    finally:
        # The set file is useless outside the .mse-set, delete it.
        os.remove('set')
Example #12
0
def train(_=None,
          corpus=None,
          corpus_path=None,
          context_size=2,
          min_word=1,

          embed_dim=100,

          n_epoch=10,
          batch_size=32,
          learning_rate=0.001,
          shuffle=True,
          verbose_iterval=1):
    """Train a CBOW model with SGD and NLL loss on a word/context dataset.

    All arguments are meant to be passed by keyword; the leading ``_``
    parameter only exists to catch positional use.

    Args:
        _: guard parameter; a truthy value raises.
        corpus: forwarded to ``WordContextDataset`` as ``corpus``.
        corpus_path: forwarded to ``WordContextDataset`` as ``corpus_path``.
        context_size: forwarded to ``WordContextDataset``.
        min_word: forwarded to ``WordContextDataset``.
        embed_dim: embedding dimension handed to ``CBOW``.
        n_epoch: number of passes over the data loader.
        batch_size: batch size of the ``DataLoader``.
        learning_rate: learning rate of the SGD optimizer.
        shuffle: whether the ``DataLoader`` shuffles the dataset.
        verbose_iterval: the loss of every batch is printed during epochs
            whose index is a multiple of this value. ``0`` or ``None``
            disables printing.

    Returns:
        dict with the keys ``'wcd'`` (dataset), ``'cbow'`` (trained model),
        ``'loss_list'`` (one float per processed batch) and ``'data_loader'``.

    Raises:
        Exception: if a truthy positional argument was supplied.
    """
    if _:
        raise Exception("Don't put parameters without keys. Set parameters with the key together.")

    # Load data
    wcd = WordContextDataset(corpus=corpus,
                             corpus_path=corpus_path,
                             context_size=context_size,
                             min_word=min_word)

    data_loader = DataLoader(wcd,
                             batch_size=batch_size,
                             shuffle=shuffle)

    # Model
    cbow = CBOW(vocab_size=wcd.vocab_size,
                embed_dim=embed_dim)

    # Training Parameters
    optimizer = optim.SGD(cbow.parameters(),
                          lr=learning_rate)
    loss_fn = nn.NLLLoss()
    loss_list = []

    # Use GPU, if available.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    cbow.to(device)

    for epoch_i in range(n_epoch):
        # Decide once per epoch whether to log. A falsy interval disables
        # logging instead of raising ZeroDivisionError on the modulo.
        log_this_epoch = bool(verbose_iterval) and epoch_i % verbose_iterval == 0

        for X, Y in data_loader:
            X, Y = X.to(device), Y.to(device)
            cbow.zero_grad()

            pred_log_prob = cbow(X)
            loss = loss_fn(pred_log_prob, Y)

            loss.backward()
            # .item() yields the scalar loss as a Python float, regardless of
            # the device the tensor lives on.
            loss_list.append(loss.item())

            optimizer.step()

            if log_this_epoch:
                print("loss : {:.3f}".format(loss_list[-1]))

    return {'wcd': wcd,
            'cbow': cbow,
            'loss_list': loss_list,
            'data_loader': data_loader}
Example #13
0
# Echo the remaining settings before any work starts.
print('- max_vocab_size:', max_vocab_size)
print('- min_word_freq:', min_word_freq)
print('- corpus:', corpus_file)
print()

# Build the token sequence and both vocabulary mappings from the corpus file.
train, word_to_id, id_to_word = get_vocab(corpus_file, max_vocab_size, min_word_freq)
vocab_size = len(word_to_id)

# Percentage of tokens equal to the 'UNK' id; 0.0 when the vocabulary has
# no 'UNK' entry.
if 'UNK' in word_to_id:
    unk_rate = train.count(word_to_id['UNK']) / len(train) * 100.0
else:
    unk_rate = 0.0

print('\n\033[92m[ statics ]\033[0m')
print('- token_size:', len(train))
print('- vocab_size:', vocab_size)
print('- unk_rate: {:.2f}%'.format(unk_rate))

# Iterator, model, optimizer and trainer, created in that order.
train_iter = WindowIterator(train, window_size, batch_size, max_epoch)
model = CBOW(vocab_size, hidden_size, window_size, train)
optimizer = Adam()
trainer = Word2vecTrainer(model, optimizer)

print('\n\033[92m[ progress ]\033[0m')
trainer.fit(train_iter, eval_interval=eval_interval)

# Fetch the learned vectors; with GPU set they are passed through to_device
# with device=-1 first.
word_vecs = model.word_vecs
if GPU:
    word_vecs = to_device(device=-1, x=word_vecs)

# Payload to be saved: float16 word vectors plus both vocabulary mappings.
params = {
    'word_vecs': word_vecs.astype(np.float16),
    'word_to_id': word_to_id,
    'id_to_word': id_to_word,
}
pkl_file = 'cbow_params.pkl'
Example #14
0
        "plot": False,
        "save_embeddings": True,
        # preprocessing variables
        "lemmatize": False,
        "stem": False,
        "remove_stopwords": False,
        "library": "nltk"               # "nltk" or "spacy"
    }
    opts.update(args_dict)

    # Seed torch's RNG (used for weight initialisation) when a seed was given.
    if opts.seed is not None:
        torch.manual_seed(opts.seed)

    # Load the data. Run with preprocess=True once; afterwards the
    # preprocess=False variant below is the one to use.
    vocab, train_loader, valid_loader = load_data(preprocess=True, **opts)
    #vocab, train_loader, valid_loader = load_data(preprocess=False, **opts)
    opts.vocab_size = len(vocab)

    # Build the model (the Bengio model is the commented-out alternative).
    model = CBOW(opts.context_length, opts.vocab_size, opts.embedding_size)
    #model = Bengio(opts.context_length, opts.vocab_size, opts.embedding_size, opts.hidden_size)

    # Train it.
    final_statistics = train(model, train_loader, valid_loader, opts)

    # Optionally dump the transposed embedding weights as text.
    if opts.save_embeddings:
        embeddings = model.embedding.weight.data.numpy().T
        out_name = f"{model.name.lower()}_word_vectors.csv"
        np.savetxt(out_name, embeddings)
        print(f"word vectors saved to {out_name}")
Example #15
0
def main(fname, oname = None, verbose = True, encoding = 'std',
         gatherer = False, for_forum = False, for_mse = False,
         creativity = False, vdump = False, for_html = False):
    """Decode the cards stored in `fname` and write them out as text.

    Args:
        fname: input file, handed to jdecode.mtg_open_file.
        oname: output path; when falsy the cards are written to stdout.
        verbose: print progress messages.
        encoding: name of the field ordering used to parse the input; one of
            'std', 'named', 'noname', 'rfields', 'old', 'norarity', 'vec'
            or 'custom'.
        gatherer, for_forum, vdump: forwarded to card.format().
        for_mse: write MSE-formatted output and, when `oname` is given,
            also zip it into `oname + '.mse-set'`.
        creativity: annotate every card with its nearest names (Namediff)
            and nearest cards (CBOW).
        for_html: write HTML output; cannot be combined with `for_mse`.

    Raises:
        ValueError: if `encoding` is not one of the recognized names.
    """

    # there is a sane thing to do here (namely, produce both at the same time)
    # but we don't support it yet.
    if for_mse and for_html:
        print 'ERROR - decode.py - incompatible formats "mse" and "html"'
        return

    fmt_ordered = cardlib.fmt_ordered_default

    if encoding in ['std']:
        pass
    elif encoding in ['named']:
        fmt_ordered = cardlib.fmt_ordered_named
    elif encoding in ['noname']:
        fmt_ordered = cardlib.fmt_ordered_noname
    elif encoding in ['rfields']:
        pass
    elif encoding in ['old']:
        fmt_ordered = cardlib.fmt_ordered_old
    elif encoding in ['norarity']:
        fmt_ordered = cardlib.fmt_ordered_norarity
    elif encoding in ['vec']:
        pass
    elif encoding in ['custom']:
        ## put custom format decisions here ##########################

        ## end of custom format ######################################
        pass
    else:
        # The message used to blame 'encode.py'; this is the decoder.
        raise ValueError('decode.py: unknown encoding: ' + encoding)

    cards = jdecode.mtg_open_file(fname, verbose=verbose, fmt_ordered=fmt_ordered)

    if creativity:
        # namediff is also read later by writecards() through the closure.
        namediff = Namediff()
        cbow = CBOW()
        if verbose:
            print 'Computing nearest names...'
        nearest_names = namediff.nearest_par(map(lambda c: c.name, cards), n=3)
        if verbose:
            print 'Computing nearest cards...'
        nearest_cards = cbow.nearest_par(cards)
        for i in range(0, len(cards)):
            cards[i].nearest_names = nearest_names[i]
            cards[i].nearest_cards = nearest_cards[i]
        if verbose:
            print '...Done.'

    def hoverimg(cardname, dist, nd):
        """Format one "name: distance" entry for the active output format.

        `nd` supplies the `names` and `codes` lookups keyed by `cardname`.
        """
        truename = nd.names[cardname]
        code = nd.codes[cardname]
        namestr = ''
        if for_html:
            if code:
                namestr = ('<div class="hover_img"><a href="#">' + truename
                           + '<span><img src="http://magiccards.info/scans/en/' + code
                           + '" alt="image"/></span></a>' + ': ' + str(dist) + '</div>')
            else:
                namestr = '<div>' + truename + ': ' + str(dist) + '</div>'
        elif for_forum:
            namestr = '[card]' + truename + '[/card]' + ': ' + str(dist) + '\n'
        else:
            namestr = truename + ': ' + str(dist) + '\n'
        return namestr

    def writecards(writer):
        """Write every card (plus format-specific header/footer) to `writer`."""
        if for_mse:
            # have to prepend a massive chunk of formatting info
            writer.write(utils.mse_prepend)

        if for_html:
            # have to prepend html info
            writer.write(utils.html_prepend)

        for card in cards:
            if for_mse:
                writer.write(card.to_mse().encode('utf-8'))
                fstring = ''
                if card.json:
                    fstring += 'JSON:\n' + card.json + '\n'
                if card.raw:
                    fstring += 'raw:\n' + card.raw + '\n'
                fstring += '\n'
                fstring += card.format(gatherer = gatherer, for_forum = for_forum,
                                       vdump = vdump) + '\n'
                # angle brackets are swapped for parentheses in the MSE notes
                fstring = fstring.replace('<', '(').replace('>', ')')
                writer.write(('\n' + fstring[:-1]).replace('\n', '\n\t\t'))
            else:
                fstring = card.format(gatherer = gatherer, for_forum = for_forum,
                                      vdump = vdump, for_html = for_html)
                if creativity and for_html:
                    fstring = fstring[:-6] # chop off the closing </div> to stick stuff in
                writer.write((fstring + '\n').encode('utf-8'))

            if creativity:
                cstring = '~~ closest cards ~~\n'
                nearest = card.nearest_cards
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                cstring += '~~ closest names ~~\n'
                nearest = card.nearest_names
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                if for_html:
                    # the trailing </div> re-closes the card div chopped above
                    cstring = '<hr><div>' + cstring.replace('\n', '<br>\n') + '</div>\n</div>'
                elif for_mse:
                    cstring = ('\n\n' + cstring[:-1]).replace('\n', '\n\t\t')

                writer.write(cstring.encode('utf-8'))

            writer.write('\n'.encode('utf-8'))

        if for_mse:
            # more formatting info
            writer.write('version control:\n\ttype: none\napprentice code: ')
        if for_html:
            # closing the html file
            writer.write(utils.html_append)

    if oname:
        if for_html:
            print oname
            # if ('.html' != oname[-])
            #     oname += '.html'
        if verbose:
            print 'Writing output to: ' + oname
        with open(oname, 'w') as ofile:
            writecards(ofile)
        if for_mse:
            # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
            if os.path.isfile('set'):
                print 'ERROR: tried to overwrite existing file "set" - aborting.'
                return
            shutil.copyfile(oname, 'set')
            # Use the freaky mse extension instead of zip.
            with zipfile.ZipFile(oname+'.mse-set', mode='w') as zf:
                try:
                    # Zip up the set file into oname.mse-set.
                    zf.write('set')
                finally:
                    # The set file is useless outside the .mse-set, delete it.
                    os.remove('set')
            # Report success only once the archive was really written; this
            # used to live in the `finally` and was printed on failure too.
            if verbose:
                print 'Made an MSE set file called ' + oname + '.mse-set.'
    else:
        writecards(sys.stdout)
        sys.stdout.flush()
Example #16
0
# Hyperparameter settings
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Load the data
corpus, wordtoid, idtoword = ptb.load_data('train')
vocab_size = len(wordtoid)

contexts, target = create_contexts_target(corpus, window_size)
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)

# Create the model and related objects
model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Start training
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# Save the data that is needed so it can be used later
word_vecs = model.word_vecs

if config.GPU:
    word_vecs = to_cpu(word_vecs)
params = {}
params['word_vecs'] = word_vecs.astype(np.float16)
params['wordtoid'] = wordtoid
Example #17
0
def run_cbow():
    """Build a CBOW object with its default arguments and call train() on it."""
    model = CBOW()
    model.train()
Example #18
0
def main(fname, oname = None, verbose = True, encoding = 'std',
         gatherer = False, for_forum = False, for_mse = False,
         creativity = False, vdump = False, for_html = False):
    """Decode the cards stored in `fname` and write them out as text.

    Args:
        fname: input file, handed to jdecode.mtg_open_file.
        oname: output path; when falsy the cards are written to stdout.
        verbose: print progress messages.
        encoding: name of the field ordering used to parse the input; one of
            'std', 'named', 'noname', 'rfields', 'old', 'norarity', 'vec'
            or 'custom'.
        gatherer, for_forum, vdump: forwarded to card.format().
        for_mse: write MSE-formatted output and, when `oname` is given,
            also zip it into `oname + '.mse-set'`.
        creativity: annotate every card with its nearest names (Namediff)
            and nearest cards (CBOW).
        for_html: write HTML output grouped by color and ordered by card
            type; cannot be combined with `for_mse`.

    Raises:
        ValueError: if `encoding` is not one of the recognized names.
    """

    # there is a sane thing to do here (namely, produce both at the same time)
    # but we don't support it yet.
    if for_mse and for_html:
        print 'ERROR - decode.py - incompatible formats "mse" and "html"'
        return

    fmt_ordered = cardlib.fmt_ordered_default

    if encoding in ['std']:
        pass
    elif encoding in ['named']:
        fmt_ordered = cardlib.fmt_ordered_named
    elif encoding in ['noname']:
        fmt_ordered = cardlib.fmt_ordered_noname
    elif encoding in ['rfields']:
        pass
    elif encoding in ['old']:
        fmt_ordered = cardlib.fmt_ordered_old
    elif encoding in ['norarity']:
        fmt_ordered = cardlib.fmt_ordered_norarity
    elif encoding in ['vec']:
        pass
    elif encoding in ['custom']:
        ## put custom format decisions here ##########################

        ## end of custom format ######################################
        pass
    else:
        # The message used to blame 'encode.py'; this is the decoder.
        raise ValueError('decode.py: unknown encoding: ' + encoding)

    cards = jdecode.mtg_open_file(fname, verbose=verbose, fmt_ordered=fmt_ordered)

    if creativity:
        # namediff is also read later by the writer helpers through the closure.
        namediff = Namediff()
        cbow = CBOW()
        if verbose:
            print 'Computing nearest names...'
        nearest_names = namediff.nearest_par(map(lambda c: c.name, cards), n=3)
        if verbose:
            print 'Computing nearest cards...'
        nearest_cards = cbow.nearest_par(cards)
        for i in range(0, len(cards)):
            cards[i].nearest_names = nearest_names[i]
            cards[i].nearest_cards = nearest_cards[i]
        if verbose:
            print '...Done.'

    def hoverimg(cardname, dist, nd):
        """Format one "name: distance" entry for the active output format.

        `nd` supplies the `names` and `codes` lookups keyed by `cardname`.
        """
        truename = nd.names[cardname]
        code = nd.codes[cardname]
        namestr = ''
        if for_html:
            if code:
                namestr = ('<div class="hover_img"><a href="#">' + truename
                           + '<span><img style="background: url(http://magiccards.info/scans/en/' + code
                           + ');" alt=""/></span></a>' + ': ' + str(dist) + '\n</div>\n')
            else:
                namestr = '<div>' + truename + ': ' + str(dist) + '</div>'
        elif for_forum:
            namestr = '[card]' + truename + '[/card]' + ': ' + str(dist) + '\n'
        else:
            namestr = truename + ': ' + str(dist) + '\n'
        return namestr

    def writecards(writer):
        """Write every card (plus format-specific header/footer) to `writer`."""
        if for_mse:
            # have to prepend a massive chunk of formatting info
            writer.write(utils.mse_prepend)

        if for_html:
            # have to prepend html info
            writer.write(utils.html_prepend)
            # separate the write function to allow for writing smaller chunks of cards at a time
            segments = sort_colors(cards)
            for i in range(len(segments)):
                # order the cards of this color segment by card type
                segments[i] = sort_type(segments[i])
                # this allows card boxes to be colored for each color
                # for coloring of each box separately cardlib.Card.format() must change non-minimally
                writer.write('<div id="' + utils.segment_ids[i] + '">')
                writehtml(writer, segments[i])
                writer.write("</div><hr>")
            # closing the html file
            writer.write(utils.html_append)
            return # break out of the write cards function to avoid writing cards twice


        for card in cards:
            if for_mse:
                writer.write(card.to_mse().encode('utf-8'))
                fstring = ''
                if card.json:
                    fstring += 'JSON:\n' + card.json + '\n'
                if card.raw:
                    fstring += 'raw:\n' + card.raw + '\n'
                fstring += '\n'
                fstring += card.format(gatherer = gatherer, for_forum = for_forum,
                                       vdump = vdump) + '\n'
                # angle brackets are swapped for parentheses in the MSE notes
                fstring = fstring.replace('<', '(').replace('>', ')')
                writer.write(('\n' + fstring[:-1]).replace('\n', '\n\t\t'))
            else:
                fstring = card.format(gatherer = gatherer, for_forum = for_forum,
                                      vdump = vdump, for_html = for_html)
                writer.write((fstring + '\n').encode('utf-8'))

            if creativity:
                cstring = '~~ closest cards ~~\n'
                nearest = card.nearest_cards
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                cstring += '~~ closest names ~~\n'
                nearest = card.nearest_names
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                if for_mse:
                    cstring = ('\n\n' + cstring[:-1]).replace('\n', '\n\t\t')
                writer.write(cstring.encode('utf-8'))

            writer.write('\n'.encode('utf-8'))

        if for_mse:
            # more formatting info
            writer.write('version control:\n\ttype: none\napprentice code: ')


    def writehtml(writer, card_set):
        """Write the cards of `card_set` to `writer` as HTML."""
        for card in card_set:
            fstring = card.format(gatherer = gatherer, for_forum = True,
                                      vdump = vdump, for_html = for_html)
            if creativity:
                fstring = fstring[:-6] # chop off the closing </div> to stick stuff in
            writer.write((fstring + '\n').encode('utf-8'))

            if creativity:
                cstring = '~~ closest cards ~~\n<br>\n'
                nearest = card.nearest_cards
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                cstring += "<br>\n"
                cstring += '~~ closest names ~~\n<br>\n'
                nearest = card.nearest_names
                for dist, cardname in nearest:
                    cstring += hoverimg(cardname, dist, namediff)
                # the trailing </div> re-closes the card div chopped above
                cstring = '<hr><div>' + cstring + '</div>\n</div>'
                writer.write(cstring.encode('utf-8'))

            writer.write('\n'.encode('utf-8'))

    # Sorting by colors
    def sort_colors(card_set):
        """Split `card_set` into eight lists.

        Returns [white, blue, black, red, green, multicolor, colorless, lands].
        """
        # Initialize sections
        red_cards = []
        blue_cards = []
        green_cards = []
        black_cards = []
        white_cards = []
        multi_cards = []
        colorless_cards = []
        lands = []
        for card in card_set:
            # more than one color always wins over the single-color buckets
            if len(card.get_colors())>1:
                multi_cards += [card]
                continue
            if 'R' in card.get_colors():
                red_cards += [card]
                continue
            elif 'U' in card.get_colors():
                blue_cards += [card]
                continue
            elif 'B' in card.get_colors():
                black_cards += [card]
                continue
            elif 'G' in card.get_colors():
                green_cards += [card]
                continue
            elif 'W' in card.get_colors():
                white_cards += [card]
                continue
            else:
                # none of the five colors matched: lands are kept apart from
                # the other colorless cards
                if "land" in card.get_types():
                    lands += [card]
                    continue
                colorless_cards += [card]
        return[white_cards, blue_cards, black_cards, red_cards, green_cards, multi_cards, colorless_cards, lands]

    def sort_type(card_set):
        """Return `card_set` reordered by card type.

        A card goes into the bucket of the first entry of `sorting` found in
        its types; cards matching none of them come last.
        """
        sorting = ["creature", "enchantment", "instant", "sorcery", "artifact", "planeswalker"]
        # one bucket per entry in `sorting`, plus a final catch-all bucket
        sorted_cards = [[],[],[],[],[],[],[]]
        sorted_set = []
        for card in card_set:
            types = card.get_types()
            for i in range(len(sorting)):
                if sorting[i] in types:
                    sorted_cards[i] += [card]
                    break
            else:
                # for/else: reached only when no type matched (no break)
                sorted_cards[6] += [card]
        for value in sorted_cards:
            for card in value:
                sorted_set += [card]
        return sorted_set



    def sort_cmc(card_set):
        """Return `card_set` ordered by ascending card.get_cmc().

        Not called anywhere inside main() at the moment.
        """
        sorted_cards = []
        sorted_set = []
        for card in card_set:
            # make sure there is an empty set for each CMC
            while len(sorted_cards)-1 < card.get_cmc():
                sorted_cards += [[]]
            # add card to correct set of CMC values
            sorted_cards[card.get_cmc()] += [card]
        # combine each set of CMC valued cards together
        for value in sorted_cards:
            for card in value:
                sorted_set += [card]
        return sorted_set


    if oname:
        if for_html:
            print oname
            # if ('.html' != oname[-])
            #     oname += '.html'
        if verbose:
            print 'Writing output to: ' + oname
        with open(oname, 'w') as ofile:
            writecards(ofile)
        if for_mse:
            # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
            if os.path.isfile('set'):
                print 'ERROR: tried to overwrite existing file "set" - aborting.'
                return
            shutil.copyfile(oname, 'set')
            # Use the freaky mse extension instead of zip.
            with zipfile.ZipFile(oname+'.mse-set', mode='w') as zf:
                try:
                    # Zip up the set file into oname.mse-set.
                    zf.write('set')
                finally:
                    # The set file is useless outside the .mse-set, delete it.
                    os.remove('set')
            # Report success only once the archive was really written; this
            # used to live in the `finally` and was printed on failure too.
            if verbose:
                print 'Made an MSE set file called ' + oname + '.mse-set.'
    else:
        writecards(sys.stdout)
        sys.stdout.flush()
Example #19
0
# Training-related command-line options.
parser.add_argument('--optimizer', type=str, default='SGD', help='Model optimization algorithm')
parser.add_argument('--loss_function', type=str, default='ce', help='Loss function that would be minimized')
parser.add_argument('--epochs', type=int, default=10, help='Number of iterations')

args = parser.parse_args()

# Load the corpus given by args.corpus and pull the n-gram batches and the
# vocabulary lookups out of the reader.
print("Begin Reading Corpus Data and Tokenizing")
data_reader = CBOWCorpusReader(args.corpus)
grams = data_reader.get_ngram_words()
words_freq = data_reader.get_words_frequency()
word2idx = data_reader.get_word2idx()
idx2word = data_reader.get_idx2word()
print("End Reading the Data")

# The vocabulary size is the number of entries in the word-frequency table;
# it reaches the model through `args`.
args.vocab_size = len(words_freq)
cbow = CBOW(args)
cbow.init_session()
cbow.build()

print("Begin Training")
learning_curve = []

for epoch in range(0, args.epochs):
    error = 0.0
    print(epoch)
    for batch in grams:
        x_input, y_output, x_input_reshape = [], [], []

        for item in batch:
            def get_one_hot(idx):
                one_hot = ([0] * (args.vocab_size + 1))
Example #20
0
"""
vocab_size = len(word_to_id)  # 10000

contexts, target = create_contexts_target(corpus, window_size)
#print(contexts.shape, target.shape);exit(1)
"""
929589개의 단어가 window_size만큼 양옆으로 짤리고 나머지를 중심단어의 targets으로 보고, 
중심단어를 기준으로 양옆 5단어를 contexts로 본다. 
contexts.shape : (929579, 10) # 중심단어를 기준으로 양옆 10개의 단어들의 나열 되어 있음.
target.shape : (929579,)  # 중심단어가 나열되어 있음. 
"""
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)

# Create the model and related objects
model = CBOW(vocab_size, hidden_size, window_size,
             corpus)  # instantiate the model (infers the center word from its neighboring words)
# model = SkipGram(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Start training
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# Save the data that is needed so it can be used later
word_vecs = model.word_vecs
if config.GPU:
    word_vecs = to_cpu(word_vecs)
params = {}
params['word_vecs'] = word_vecs.astype(np.float16)
params['word_to_id'] = word_to_id
Example #21
0
###############################################################################

if __name__ == '__main__':
    # Restores a trained CBOW checkpoint and registers its first `num_embed`
    # embedding rows with the TensorBoard projector.

    parser = argparse.ArgumentParser(description='Visualize word vectors with tensorboard')
    parser.add_argument('--init_vecs', required=True, dest='init_vecs')
    parser.add_argument('--model_ckpt', required=True, dest='model_ckpt')
    parser.add_argument('--num_embed', required=True, type=int, dest='num_embed')
    parser.add_argument('--embed_metadata', required=True, dest='metadata')
    parser.add_argument('--graph_folder', required=True, dest='graph_folder')
    args = parser.parse_args()

    # Initialize model
    # The file is opened in a `with` block so the handle is closed right after
    # loading (it used to be left open).
    # NOTE(review): dill.load unpickles arbitrary objects and can run code from
    # the file -- only point --init_vecs at trusted files.
    with open(args.init_vecs, 'rb') as vecs_file:
        word_vectors = dill.load(vecs_file)
    model = CBOW(len(word_vectors), word_vectors, 5e-4)

    # Load model of last epoch
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, args.model_ckpt)
        # Visualizing embeddings
        final_embed = sess.run(model.init_vecs)
        # Only the first `num_embed` rows are handed to the projector.
        embedding_var = tf.Variable(final_embed[:args.num_embed], name='embedding')
        sess.run(embedding_var.initializer)
        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter(args.graph_folder)
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        embedding.metadata_path = os.path.join(args.graph_folder, args.metadata)
        projector.visualize_embeddings(summary_writer, config)
Example #22
0
    model_hard_file ="GRU_REG_HARD_1024_350_0.001_mse.pt"
    model_hard = GRU_REG(hard_dataset.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space)
    test_easy_and_hard(model_easy, model_hard, model_easy_file, model_hard_file, validate_gru_reg_model, easy_dataset, easy_testset, hard_dataset, hard_testset)

    # GRU
    best_top_params = [300, 56, 0.0001]
    embedding_space, hidden_layer_dim, learning_rate = best_top_params
    model_easy_file = "GRU_EASY_56_300_0.0001_mse.pt"
    model_easy = GRU(easy_dataset.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space)
    model_hard_file = "GRU_HARD_56_300_0.0001_mse.pt"
    model_hard = GRU(hard_dataset.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space)
    test_easy_and_hard(model_easy, model_hard, model_easy_file, model_hard_file, validate_gru_model, easy_dataset, easy_testset, hard_dataset, hard_testset)

    # CBOW NAIVE
    best_top_params = [150, 256, 0.0001]
    embedding_space, hidden_layer_dim, learning_rate = best_top_params
    model_easy_file = "CBOW_NAIVE_EASY_256_150_0.0001_smooth_l1.pt"
    model_easy = CBOW(easy_dataset_cbow.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space)
    model_hard_file = "CBOW_NAIVE_HARD_256_150_0.0001_smooth_l1.pt"
    model_hard = CBOW(hard_dataset_cbow.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space)
    test_easy_and_hard(model_easy, model_hard, model_easy_file, model_hard_file, validate_cbow_model, easy_dataset_cbow, easy_testset, hard_dataset_cbow, hard_testset)

    # CBOW REGRESSION
    best_top_params = [150, 256, 0.0001]
    embedding_space, hidden_layer_dim, learning_rate = best_top_params
    model_easy_file = "CBOW_REG_EASY_256_300_0.001_smooth_l1.pt"
    model_easy = CBOW_REG(easy_dataset_cbow.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space)
    model_hard_file = "CBOW_REG_HARD_256_300_0.001_smooth_l1.pt"
    model_hard = CBOW_REG(hard_dataset_cbow.vocab_size, hidden_layer_dim=hidden_layer_dim, embedding_space=embedding_space)
    test_easy_and_hard(model_easy, model_hard, model_easy_file, model_hard_file, validate_cbow_model, easy_dataset_cbow, easy_testset, hard_dataset_cbow, hard_testset)