def check_vocab(fname):
    """Print word frequencies for a card file, then show cards using rare words.

    The cards are loaded with ``jdecode.mtg_open_file``.  The vectorized text
    of every card (plus its b-side, when it has one) is split on whitespace
    to build a word -> occurrence-count table.  The table is printed from the
    most to the least frequent word; afterwards the encoding of every card
    containing at least one word seen ``n`` (3) times or fewer is printed.

    Args:
        fname: path passed to ``jdecode.mtg_open_file``.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    def card_words(card):
        # Words of the card text, followed by the b-side text if present.
        words = card.text.vectorize().split()
        if card.bside:
            words += card.bside.text.vectorize().split()
        return words

    vocab = {}
    for card in cards:
        for word in card_words(card):
            vocab[word] = vocab.get(word, 0) + 1

    # Python 3 removed cmp() and the positional comparison argument of
    # sorted(); a key function yields the same descending-by-count order
    # (ties keep their insertion order in both forms).
    for word in sorted(vocab, key=vocab.get, reverse=True):
        print('{:8d} : {:s}'.format(vocab[word], word))

    n = 3

    for card in cards:
        for word in card_words(card):
            # alternative filter used during debugging: if 'name' in word:
            if vocab[word] <= n:
                print('\n{:8d} : {:s}'.format(vocab[word], word))
                print(card.encode())
                # one report per card is enough
                break
def main(args):
    """Endlessly stream shuffled, randomized card encodings to file descriptors.

    Args:
        args: object providing ``fds`` (descriptors handed to
            ``streaming_noreturn``), ``fname`` (card file to load) and
            ``seed`` (0 means "no fixed seed").

    Each stream is written by ``write_stream``, which never returns: it
    reshuffles its own copy of the card list forever and writes every card
    followed by ``utils.cardsep``.
    """
    fds = args.fds
    fname = args.fname
    main_seed = args.seed if args.seed != 0 else None

    # simple default encoding for now, will add more options with the curriculum
    # learning feature

    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    def write_stream(i, fd):
        # Every stream gets its own generator so streams do not share state.
        local_random = random.Random(main_seed)
        if hasattr(local_random, 'jumpahead'):
            # Python 2: advance the shared seed state per stream.
            local_random.jumpahead(i)
        elif main_seed is not None:
            # Random.jumpahead() does not exist on Python 3; derive a distinct
            # but reproducible seed for stream i instead.  With no fixed seed
            # each generator is already seeded independently.
            local_random.seed('{}:{}'.format(main_seed, i))
        local_cards = list(cards)
        # Re-open the descriptor through its /proc/self/fd path.
        with open('/proc/self/fd/' + str(fd), 'wt') as f:
            while True:
                local_random.shuffle(local_cards)
                for card in local_cards:
                    f.write(card.encode(randomize_mana=True, randomize_lines=True))
                    f.write(utils.cardsep)

    def mkargs(i, fd):
        # Argument tuple for write_stream.
        return i, fd

    streaming_noreturn(fds, write_stream, mkargs)
def main(fname, oname = None, verbose = False, dump = False):
    """Validate the cards in ``fname`` and print a property report.

    With the internal ``do_grams`` switch off (the current setting) the cards
    are run through ``process_props`` and the overall totals plus a
    per-property breakdown are printed.  With it on, a histogram of
    rare-ngram percentages per card is printed instead, together with the
    90/95/99 percentile buckets.

    Args:
        fname: card file handed to ``jdecode.mtg_open_file``.
        oname: accepted for interface compatibility; not used here.
        verbose: forwarded to ``jdecode.mtg_open_file``.
        dump: forwarded to ``process_props``.
    """
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)

    do_grams = False

    if not do_grams:
        totals, values = process_props(cards, dump=dump)
        (total_all, total_good, total_bad, total_uncovered) = totals

        # summary
        print('-- overall --')
        print(' total : ' + str(total_all))
        print(' good : ' + str(total_good) + ' ' + pct(total_good, total_all))
        print(' bad : ' + str(total_bad) + ' ' + pct(total_bad, total_all))
        print(' uncocoverd: ' + str(total_uncovered) + ' ' + pct(total_uncovered, total_all))
        print('----')

        # breakdown
        for prop in props:
            (total, good, bad) = values[prop]
            print(prop + ':')
            print(' total: ' + str(total) + ' ' + pct(total, total_all))
            print(' good : ' + str(good) + ' ' + pct(good, total_all))
            print(' bad : ' + str(bad) + ' ' + pct(bad, total_all))
        return

    # histogram: rare-gram score -> number of cards with that score
    histogram = {}
    for card in cards:
        score = rare_grams(card, thresh=2, grams=2)
        if len(card.text_words) > 0:
            # normalise to a percentage of the card's word count
            score = int(1.0 + (float(score) * 100.0 / float(len(card.text_words))))
        histogram[score] = histogram.get(score, 0) + 1
        if score >= 60:
            print(score)
            print(card.format())

    running = 0
    vmax = sum(histogram.values())
    cutoffs = [(0.90, '90% - '), (0.95, '95% - '), (0.99, '99% - ')]
    reached = {}
    for bucket in sorted(histogram):
        print(str(bucket) + ' rare ngrams: ' + str(histogram[bucket]))
        running += histogram[bucket]
        for fraction, _label in cutoffs:
            # remember the first bucket at which each cutoff is met
            if fraction not in reached and running >= vmax * fraction:
                reached[fraction] = bucket
    for fraction, label in cutoffs:
        print(label + str(reached.get(fraction)))
def main(fname, oname, verbose=True, parallel=True):
    """Compute nearest-name and nearest-card distances and save them.

    For every card in ``fname`` the closest name (via ``Namediff``) and the
    closest card (via ``CBOW``) are looked up with ``n=1``.  One line per card
    is written to ``oname`` in the form ``index|name|name_dist|card_dist``.

    Args:
        fname: card file handed to ``jdecode.mtg_open_file``.
        oname: path of the output file (written as UTF-8 text).
        verbose: print progress messages and forward to the loader.
        parallel: use the ``nearest_par`` variants instead of per-card calls.
    """
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)

    # this could reasonably be some separate function
    # might make sense to merge cbow and namediff and have this be the main interface
    namediff = Namediff()
    cbow = CBOW()

    if verbose:
        print('Computing nearest names...')
    if parallel:
        # Pass a real list: map() is a one-shot lazy iterator on Python 3.
        nearest_names = namediff.nearest_par([c.name for c in cards], n=1)
    else:
        nearest_names = [namediff.nearest(c.name, n=1) for c in cards]

    if verbose:
        print('Computing nearest cards...')
    if parallel:
        nearest_cards = cbow.nearest_par(cards, n=1)
    else:
        nearest_cards = [cbow.nearest(c, n=1) for c in cards]

    for i, card in enumerate(cards):
        card.nearest_names = nearest_names[i]
        card.nearest_cards = nearest_cards[i]

    # Nearest encodings by text edit distance (namediff.nearest_card /
    # nearest_card_par) are intentionally not computed:
    # unfortunately this takes ~30 hours on 8 cores for a 10MB dump

    if verbose:
        print('...Done.')

    # write to a file to store the data, this is a terribly long computation
    # we could also just store this same info in the cards themselves as more fields...
    sep = '|'
    # Open as UTF-8 text and write str: writing encoded bytes to a text-mode
    # file raises TypeError on Python 3.
    with open(oname, 'w', encoding='utf-8') as ofile:
        for i, card in enumerate(cards):
            ndist, _ = card.nearest_names[0]
            cdist, _ = card.nearest_cards[0]
            ostr = str(i) + sep + card.name + sep
            ostr += str(ndist) + sep
            ostr += str(cdist) + '\n'
            ofile.write(ostr)
def main(fname, oname, gmin = 2, gmax = 8, nltk = False, sep = False, verbose = False):
    """Build n-gram data from a card file.

    Two modes:

    * ``nltk`` true: build a language model of order ``gmin`` with
      ``build_ngram_model`` and pickle it to ``oname``.
    * otherwise: for every size in ``gmin..gmax`` count the n-grams of all
      cards with ``update_ngrams`` and write them, most frequent first, to
      ``oname + '.<size>g'`` as ``ngram: count`` lines.

    Args:
        fname: card file handed to ``jdecode.mtg_open_file``.
        oname: output path (pickle file, or prefix of the count files).
        gmin, gmax: smallest / largest gram size; converted with ``int``.
        nltk: select the language-model mode.
        sep: forwarded as ``separate_lines`` to ``build_ngram_model``.
        verbose: print progress information.

    Raises:
        SystemExit: with status 1 when the gram sizes are invalid
            (``gmin < 2`` or ``gmax < gmin``) in counting mode.
    """
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    gmin = int(gmin)
    gmax = int(gmax)

    if nltk:
        n = gmin
        lm = build_ngram_model(cards, n, separate_lines=sep, verbose=verbose)
        if verbose:
            teststr = 'when @ enters the battlefield'
            print('litmus test: perplexity of ' + repr(teststr))
            print(' ' + str(lm.perplexity(teststr.split())))
            print('pickling module to ' + oname)
        with open(oname, 'wb') as f:
            pickle.dump(lm, f)
        return

    bins = [1, 2, 3, 10, 30, 100, 300, 1000]
    if gmin < 2 or gmax < gmin:
        print('invalid gram sizes: ' + str(gmin) + '-' + str(gmax))
        # Same effect as exit(1) without depending on the site module.
        raise SystemExit(1)

    for grams in range(gmin, gmax + 1):
        if verbose:
            print('generating ' + str(grams) + '-grams...')
        gramdict = {}
        for card in cards:
            update_ngrams(card.text_lines_words, gramdict, grams)

        oname_full = oname + '.' + str(grams) + 'g'
        if verbose:
            print(' writing ' + str(len(gramdict)) + ' unique ' + str(grams)
                  + '-grams to ' + oname_full)
            describe_bins(gramdict, bins)

        # UTF-8 text file receiving str: encoded bytes cannot be written to a
        # text-mode file on Python 3.
        with open(oname_full, 'wt', encoding='utf-8') as f:
            # key= replaces the cmp-style comparison that Python 3 removed;
            # the descending-by-count order is the same.
            for ngram in sorted(gramdict, key=gramdict.get, reverse=True):
                f.write(ngram + ': ' + str(gramdict[ngram]) + '\n')
def check_characters(fname, vname):
    """Print the character vocabulary of a card file and optionally save it.

    The vocabulary is every character of ``utils.cardsep`` plus every
    character of each card's encoding.  Characters are numbered from 1 in
    sorted order.  When ``vname`` is truthy the two lookup tables are dumped
    to that path as JSON under the keys ``token_to_idx`` and ``idx_to_token``.

    Args:
        fname: card file handed to ``jdecode.mtg_open_file``.
        vname: output path for the JSON vocabulary, or a falsy value to skip.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    alphabet = set(utils.cardsep)
    for card in cards:
        alphabet.update(card.encode())

    ordered = sorted(alphabet)
    token_to_idx = {}
    idx_to_token = {}
    for position, symbol in enumerate(ordered, 1):
        token_to_idx[symbol] = position
        idx_to_token[position] = symbol

    print('Vocabulary: ({:d} symbols)'.format(len(token_to_idx)))
    for symbol in ordered:
        print('{:8s} : {:4d}'.format(repr(symbol), token_to_idx[symbol]))

    if not vname:
        return

    # compliant with torch-rnn
    json_data = {
        'token_to_idx': token_to_idx,
        'idx_to_token': idx_to_token
    }
    print('writing vocabulary to {:s}'.format(vname))
    with open(vname, 'w') as f:
        json.dump(json_data, f)
def get_statistics(fname, lm=None, sep=False, verbose=False):
    """Collect validation, distance and n-gram statistics for a card file.

    Args:
        fname: card file handed to ``jdecode.mtg_open_file``.  Its base name
            is also parsed, best effort, as a checkpoint name of the form
            ``<name>_epoch<epoch>_<vloss>...<temp>.<ext>``.
        lm: optional language model exposing ``perplexity(tokens)``; when
            given, per-card perplexities are computed.
        sep: score each text line separately and combine the results instead
            of scoring the whole card text at once.
        verbose: forwarded to ``jdecode.mtg_open_file``.

    Returns:
        An ``OrderedDict`` with ``cards`` and ``props``, plus ``cp`` when the
        file name could be parsed, ``dists`` when ``fname + '.dist'`` exists,
        and ``ngram`` when ``lm`` is given.
    """
    stats = OrderedDict()

    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    stats['cards'] = cards

    # unpack the name of the checkpoint - terrible and hacky
    try:
        final_name = os.path.basename(fname)
        halves = final_name.split('_epoch')
        cp_name = halves[0]
        cp_info = halves[1][:-4]
        info_halves = cp_info.split('_')
        cp_epoch = float(info_halves[0])
        fragments = info_halves[1].split('.')
        cp_vloss = float('.'.join(fragments[:2]))
        cp_temp = float('.'.join(fragments[-2:]))
        cp_ident = '.'.join(fragments[2:-2])
        stats['cp'] = OrderedDict([('name', cp_name),
                                   ('epoch', cp_epoch),
                                   ('vloss', cp_vloss),
                                   ('temp', cp_temp),
                                   ('ident', cp_ident)])
    except (IndexError, ValueError):
        # Not a checkpoint-style file name: simply omit the 'cp' entry.
        pass

    # validate
    ((total_all, total_good, total_bad, total_uncovered),
     values) = mtg_validate.process_props(cards)

    stats['props'] = annotate_values(values)
    stats['props']['overall'] = OrderedDict([('total', total_all),
                                             ('good', total_good),
                                             ('bad', total_bad),
                                             ('uncovered', total_uncovered)])

    # distances
    distfname = fname + '.dist'
    if os.path.isfile(distfname):
        with open(distfname, 'rt') as f:
            distlines = f.read().split('\n')
        dists = OrderedDict([('name', []), ('cbow', [])])
        for line in distlines:
            # expected layout: index|name|name_dist|cbow_dist
            fields = line.split('|')
            if len(fields) < 4:
                continue
            dists['name'].append(float(fields[2]))
            dists['cbow'].append(float(fields[3]))
        dists['name_mean'] = mean_nonan(dists['name'])
        dists['cbow_mean'] = mean_nonan(dists['cbow'])
        dists['name_geomean'] = gmean_nonzero(dists['name'])
        dists['cbow_geomean'] = gmean_nonzero(dists['cbow'])
        stats['dists'] = dists

    # n-grams
    if lm is not None:
        ngram = OrderedDict([('perp', []), ('perp_per', []),
                             ('perp_max', []), ('perp_per_max', [])])
        for card in cards:
            # Start every card from zero so no value can leak over from the
            # previous card (or be undefined for the first one).
            perp = perp_per = perp_max = perp_per_max = 0.0
            if len(card.text.text) == 0:
                pass
            elif sep:
                vtexts = [tokens for tokens in
                          (line.vectorize().split() for line in card.text_lines)
                          if tokens]
                # With no scorable line there is nothing to take max() of.
                if vtexts:
                    perps = [lm.perplexity(vtext) for vtext in vtexts]
                    perps_per = [p / float(len(vtext))
                                 for p, vtext in zip(perps, vtexts)]
                    perp = gmean_nonzero(perps)
                    perp_per = gmean_nonzero(perps_per)
                    perp_max = max(perps)
                    perp_per_max = max(perps_per)
            else:
                vtext = card.text.vectorize().split()
                # Guard the per-token division against an empty token list.
                if vtext:
                    perp = lm.perplexity(vtext)
                    perp_per = perp / float(len(vtext))
                    perp_max = perp
                    # single score for the whole card, so max == the score
                    perp_per_max = perp_per

            ngram['perp'].append(perp)
            ngram['perp_per'].append(perp_per)
            ngram['perp_max'].append(perp_max)
            ngram['perp_per_max'].append(perp_per_max)

        ngram['perp_mean'] = mean_nonan(ngram['perp'])
        ngram['perp_per_mean'] = mean_nonan(ngram['perp_per'])
        ngram['perp_geomean'] = gmean_nonzero(ngram['perp'])
        ngram['perp_per_geomean'] = gmean_nonzero(ngram['perp_per'])
        stats['ngram'] = ngram

    return stats
def main(infile, verbose = False):
    """Print statistics for ``infile`` scored against a trigram model.

    A 3-gram model (separate lines) is built from ``output.txt`` inside
    ``datadir`` and passed to ``get_statistics``; the result is handed to
    ``print_statistics``.

    Args:
        infile: card file to analyse.
        verbose: forwarded to ``get_statistics``.
    """
    corpus_path = str(os.path.join(datadir, 'output.txt'))
    corpus_cards = jdecode.mtg_open_file(corpus_path)
    lm = ngrams.build_ngram_model(corpus_cards, 3,
                                  separate_lines=True, verbose=True)
    print_statistics(get_statistics(infile, lm=lm, sep=True, verbose=verbose))
def main(fname, oname=None, verbose=True, encoding='std', gatherer=False,
         for_forum=False, for_mse=False, creativity=False, vdump=False,
         for_html=False):
    """Decode a card file and write it as plain text, forum, HTML or MSE output.

    Args:
        fname: card file handed to ``jdecode.mtg_open_file``.
        oname: output path; when falsy the cards go to ``sys.stdout``.
        verbose: print progress messages.
        encoding: name of the field ordering used to read the cards
            ('std', 'named', 'noname', 'rfields', 'old', 'norarity', 'vec'
            or 'custom').
        gatherer, for_forum, vdump: forwarded to ``card.format``.
        for_mse: write a Magic Set Editor set and zip it to
            ``oname + '.mse-set'``.
        creativity: append the nearest real cards / names to every card.
        for_html: write an HTML page grouped by color and card type.

    Raises:
        ValueError: for an unknown ``encoding``.

    ``for_mse`` together with ``for_html`` is rejected with a message.  All
    output is written as text (``str``); the output file is opened as UTF-8.
    """
    # there is a sane thing to do here (namely, produce both at the same time)
    # but we don't support it yet.
    if for_mse and for_html:
        print('ERROR - decode.py - incompatible formats "mse" and "html"')
        return

    fmt_ordered = cardlib.fmt_labeled_default

    if encoding in ('std', 'rfields', 'vec'):
        pass
    elif encoding == 'named':
        fmt_ordered = cardlib.fmt_ordered_named
    elif encoding == 'noname':
        fmt_ordered = cardlib.fmt_ordered_noname
    elif encoding == 'old':
        fmt_ordered = cardlib.fmt_ordered_old
    elif encoding == 'norarity':
        fmt_ordered = cardlib.fmt_ordered_norarity
    elif encoding == 'custom':
        ## put custom format decisions here ##########################

        ## end of custom format ######################################
        pass
    else:
        raise ValueError('encode.py: unknown encoding: ' + encoding)

    cards = jdecode.mtg_open_file(fname, verbose=verbose, fmt_ordered=fmt_ordered)

    if creativity:
        namediff = Namediff()
        cbow = CBOW()
        if verbose:
            print('Computing nearest names...')
        # A real list: map() is a one-shot lazy iterator on Python 3.
        nearest_names = namediff.nearest_par([c.name for c in cards], n=3)
        if verbose:
            print('Computing nearest cards...')
        nearest_cards = cbow.nearest_par(cards)
        for i, card in enumerate(cards):
            card.nearest_names = nearest_names[i]
            card.nearest_cards = nearest_cards[i]
        if verbose:
            print('...Done.')

    def hoverimg(cardname, dist, nd):
        # One "name: distance" entry, decorated for the selected output format.
        truename = nd.names[cardname]
        code = nd.codes[cardname]
        if for_html:
            if code:
                return ('<div class="hover_img"><a href="#">' + truename
                        + '<span><img style="background: url(http://magiccards.info/scans/en/'
                        + code + ');" alt=""/></span></a>' + ': ' + str(dist)
                        + '\n</div>\n')
            return '<div>' + truename + ': ' + str(dist) + '</div>'
        if for_forum:
            return '[card]' + truename + '[/card]' + ': ' + str(dist) + '\n'
        return truename + ': ' + str(dist) + '\n'

    def nearest_block(card, cards_title, separator, names_title):
        # Text listing the closest cards, then the closest names, of a card.
        cstring = cards_title
        for dist, cardname in card.nearest_cards:
            cstring += hoverimg(cardname, dist, namediff)
        cstring += separator + names_title
        for dist, cardname in card.nearest_names:
            cstring += hoverimg(cardname, dist, namediff)
        return cstring

    # Sorting by colors
    def sort_colors(card_set):
        # Buckets: white, blue, black, red, green, multicolor, colorless, lands.
        red_cards = []
        blue_cards = []
        green_cards = []
        black_cards = []
        white_cards = []
        multi_cards = []
        colorless_cards = []
        lands = []
        for card in card_set:
            colors = card.get_colors()
            if len(colors) > 1:
                multi_cards.append(card)
            elif 'R' in colors:
                red_cards.append(card)
            elif 'U' in colors:
                blue_cards.append(card)
            elif 'B' in colors:
                black_cards.append(card)
            elif 'G' in colors:
                green_cards.append(card)
            elif 'W' in colors:
                white_cards.append(card)
            elif "land" in card.get_types():
                lands.append(card)
            else:
                colorless_cards.append(card)
        return [
            white_cards, blue_cards, black_cards, red_cards, green_cards,
            multi_cards, colorless_cards, lands
        ]

    # TODO: have this return each sorted set.
    def sort_type(card_set):
        # Group by the first matching type below; everything else goes last.
        sorting = [
            "creature", "enchantment", "instant", "sorcery", "artifact",
            "planeswalker"
        ]
        sorted_cards = [[] for _ in range(len(sorting) + 1)]
        for card in card_set:
            types = card.get_types()
            for i, type_name in enumerate(sorting):
                if type_name in types:
                    sorted_cards[i].append(card)
                    break
            else:
                # no listed type matched
                sorted_cards[-1].append(card)
        return [card for bucket in sorted_cards for card in bucket]

    # TODO: have this return each sorted set.
    def sort_cmc(card_set):
        # Group cards by converted mana cost, lowest first (currently unused).
        sorted_cards = []
        for card in card_set:
            cmc = card.get_cmc()
            # make sure there is an empty set for each CMC
            while len(sorted_cards) - 1 < cmc:
                sorted_cards.append([])
            sorted_cards[cmc].append(card)
        return [card for bucket in sorted_cards for card in bucket]

    def writehtml(writer, card_set):
        # Write one HTML box per card, optionally followed by nearest matches.
        for card in card_set:
            fstring = card.format(gatherer=gatherer, for_forum=True,
                                  vdump=vdump, for_html=for_html)
            if creativity:
                # chop off the closing </div> to stick stuff in
                fstring = fstring[:-6]
            writer.write(fstring + '\n')

            if creativity:
                cstring = nearest_block(card, '~~ closest cards ~~\n<br>\n',
                                        "<br>\n", '~~ closest names ~~\n<br>\n')
                writer.write('<hr><div>' + cstring + '</div>\n</div>')

            writer.write('\n')

    def writecards(writer):
        # Write every card to ``writer`` (a text stream) in the chosen format.
        if for_mse:
            # have to prepend a massive chunk of formatting info
            writer.write(utils.mse_prepend)

        if for_html:
            # have to prepend html info
            writer.write(utils.html_prepend)
            # separate the write function to allow for writing smaller chunks
            # of cards at a time
            for i, segment in enumerate(sort_colors(cards)):
                # sort color by type
                segment = sort_type(segment)
                # add internal loop to handle each card type and/or CMC
                # for adding navigation anchors to each subsection
                # this allows card boxes to be colored for each color
                writer.write('<div id="' + utils.segment_ids[i] + '">')
                writehtml(writer, segment)
                writer.write("</div><hr>")
            # closing the html file
            writer.write(utils.html_append)
            # stop here to avoid writing the cards a second time
            return

        for card in cards:
            if for_mse:
                writer.write(card.to_mse())
                fstring = ''
                if card.json:
                    fstring += 'JSON:\n' + card.json + '\n'
                if card.raw:
                    fstring += 'raw:\n' + card.raw + '\n'
                fstring += '\n'
                fstring += card.format(gatherer=gatherer, for_forum=for_forum,
                                       vdump=vdump) + '\n'
                fstring = fstring.replace('<', '(').replace('>', ')')
                writer.write(('\n' + fstring[:-1]).replace('\n', '\n\t\t'))
            else:
                fstring = card.format(gatherer=gatherer, for_forum=for_forum,
                                      vdump=vdump, for_html=for_html)
                writer.write(fstring + '\n')

            if creativity:
                cstring = nearest_block(card, '~~ closest cards ~~\n', '',
                                        '~~ closest names ~~\n')
                if for_mse:
                    cstring = ('\n\n' + cstring[:-1]).replace('\n', '\n\t\t')
                writer.write(cstring)

            writer.write('\n')

        if for_mse:
            # more formatting info
            writer.write('version control:\n\ttype: none\napprentice code: ')

    if oname:
        if for_html:
            print(oname)
        if verbose:
            print('Writing output to: ' + oname)
        # Text mode with explicit UTF-8: every write above passes str, which
        # is what a text-mode file (and sys.stdout) requires on Python 3.
        with open(oname, 'w', encoding='utf-8') as ofile:
            writecards(ofile)
        if for_mse:
            # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
            if os.path.isfile('set'):
                print(
                    'ERROR: tried to overwrite existing file "set" - aborting.'
                )
                return
            shutil.copyfile(oname, 'set')
            # Use the freaky mse extension instead of zip.
            with zipfile.ZipFile(oname + '.mse-set', mode='w') as zf:
                try:
                    # Zip up the set file into oname.mse-set.
                    zf.write('set')
                finally:
                    if verbose:
                        print('Made an MSE set file called ' + oname + '.mse-set.')
                    # The set file is useless outside the .mse-set, delete it.
                    os.remove('set')
    else:
        writecards(sys.stdout)
        sys.stdout.flush()
def main(fname, oname, n=20, verbose=False):
    """Pair generated cards with similar real cards and export them.

    Real cards are loaded from ``output.txt`` in ``datadir``.  Generated
    cards from ``fname`` that pass ``select_card`` are matched against their
    nearest real cards (via ``CBOW``); the first neighbour accepted by
    ``compare_to_real`` forms a pair.  Every pair is printed with its
    distance and perplexity figures, and, when ``oname`` is given, written
    as a Magic Set Editor set that is zipped to ``oname + '.mse-set'``.

    Args:
        fname: generated card file.
        oname: output path for the MSE set, or ``None`` to only print.
        n: accepted for interface compatibility; not used in this function.
        verbose: print progress messages and forward to the loaders.
    """
    cbow = CBOW()
    realcards = jdecode.mtg_open_file(str(os.path.join(datadir, 'output.txt')),
                                      verbose=verbose)
    real_by_name = {c.name: c for c in realcards}
    lm = ngrams.build_ngram_model(realcards, 3, separate_lines=separate_lines,
                                  verbose=verbose)
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    stats = analysis.get_statistics(fname, lm=lm, sep=separate_lines,
                                    verbose=verbose)

    selected = [(i, card) for i, card in enumerate(cards)
                if select_card(cards, stats, i)]

    random.shuffle(selected)
    # optional cap, currently disabled: selected = selected[:3000]

    if verbose:
        print('computing nearest cards for ' + str(len(selected)) + ' candindates...')
    # Tuple parameters in lambdas are a SyntaxError on Python 3 (PEP 3113);
    # unpack in a comprehension, which also yields a real list.
    cbow_nearest = cbow.nearest_par([card for (_, card) in selected])
    selected = [(j, card, cbow_nearest[k])
                for k, (j, card) in enumerate(selected)]
    if verbose:
        print('...done')

    final = []
    for (i, card, nearest) in selected:
        for dist, rname in nearest:
            realcard = real_by_name[rname]
            if compare_to_real(card, realcard):
                final.append((i, card, realcard, dist))
                # only the first acceptable real card is paired
                break

    for (i, card, realcard, dist) in final:
        print('-- real --')
        print(realcard.format())
        print('-- fake --')
        print(card.format())
        print('-- stats --')
        perp_per = stats['ngram']['perp_per'][i]
        perp_max = stats['ngram']['perp_max'][i]
        print(dist)
        print(perp_per)
        print(perp_max)
        print('----')

    if oname is None:
        return

    with open(oname, 'wt') as ofile:
        ofile.write(utils.mse_prepend)
        for (i, card, realcard, dist) in final:
            # both cards of a pair are written under the real card's name
            name = realcard.name
            writecard(realcard, name, ofile)
            writecard(card, name, ofile)
        ofile.write('version control:\n\ttype: none\napprentice code: ')

    # The file is closed at this point, so the copy below sees the complete,
    # flushed output.
    # Copy whatever output file is produced, name the copy 'set' (yes, no extension).
    if os.path.isfile('set'):
        print('ERROR: tried to overwrite existing file "set" - aborting.')
        return
    shutil.copyfile(oname, 'set')
    # Use the freaky mse extension instead of zip.
    with zipfile.ZipFile(oname + '.mse-set', mode='w') as zf:
        try:
            # Zip up the set file into oname.mse-set.
            zf.write('set')
        finally:
            if verbose:
                print('Made an MSE set file called ' + oname + '.mse-set.')
            # The set file is useless outside the .mse-set, delete it.
            os.remove('set')
def check_lines(fname):
    """Print the distinct text lines of a card file, grouped by line class.

    Each card's encoded text (and b-side text, if any) is split by
    ``transforms.separate_lines`` into pre, key, main, cost and post lines.
    The distinct lines of every class are collected and printed sorted.
    A card whose text yields a blank line is reported as it is encountered.

    Args:
        fname: card file handed to ``jdecode.mtg_open_file``.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    prelines, keylines, mainlines = set(), set(), set()
    costlines, postlines = set(), set()

    known = [
        'enchant ', 'equip', 'countertype', 'multikicker', 'kicker', 'suspend',
        'echo', 'awaken', 'bestow', 'buyback', 'cumulative', 'dash', 'entwine',
        'evoke', 'fortify', 'flashback', 'madness', 'morph', 'megamorph',
        'miracle', 'ninjutsu', 'overload', 'prowl', 'recover', 'reinforce',
        'replicate', 'scavenge', 'splice', 'surge', 'unearth', 'transfigure',
        'transmute'
    ]
    # the prefix list above is overridden, so no line is collapsed to 'known'
    known = []
    known_prefixes = tuple(known)

    for card in cards:
        parts = list(transforms.separate_lines(card.text.encode(randomize=False)))
        prel, keyl, mainl, costl, postl = parts
        if card.bside:
            extra = transforms.separate_lines(
                card.bside.text.encode(randomize=False))
            prel2, keyl2, mainl2, costl2, postl2 = extra
            prel += prel2
            keyl += keyl2
            mainl += mainl2
            costl += costl2
            postl += postl2

        # (lines, destination set, whether known prefixes are collapsed)
        plan = (
            (prel, prelines, True),
            (postl, postlines, True),
            (keyl, keylines, True),
            # main lines: no prefix collapsing
            (mainl, mainlines, False),
            # cost lines: no prefix collapsing
            (costl, costlines, False),
        )
        for lines, bucket, collapse in plan:
            for line in lines:
                if line.strip() == '':
                    print(card.name, card.text.text)
                if collapse and line.startswith(known_prefixes):
                    line = 'known'
                bucket.add(line)

    print('prel: {:d}, keyl: {:d}, mainl: {:d}, postl {:d}'.format(
        len(prelines), len(keylines), len(mainlines), len(postlines)))

    report = (
        ('\nprelines', prelines),
        ('\npostlines', postlines),
        ('\ncostlines', costlines),
        ('\nkeylines', keylines),
        ('\nmainlines', mainlines),
    )
    for title, bucket in report:
        print(title)
        for line in sorted(bucket):
            print(line)