def main(): args = get_args("reclue puzzle with clues from other publications") outf = open_output() all_clues = load_clues() missing_tsv = COLUMN_SEPARATOR.join( ["grid_xdid", "clues_pubid", "num_missing"]) + EOL for fn, contents in find_files(*args.inputs, ext=".xd"): xd = xdfile(contents, fn) if not xd.grid: continue xd.set_header("Title", None) xd.set_header("Editor", "Timothy Parker Bot") xd.set_header( "Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last))) xd.set_header("Copyright", None) xd.set_header("Date", iso8601()) remixed = set() for pubid, pub_clues in list(all_clues.items()): try: if pubid == xd.publication_id(): continue # don't use same publisher's clues nmissing = reclue(xd, pub_clues) outfn = "%s-%s.xd" % (xd.xdid(), pubid) if nmissing == 0: nmutated = 0 while nmutated < 100: nmutated += mutate(xd, pub_clues) nmissing = reclue(xd, pub_clues) info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated)) remixed.add(pubid) outf.write_file(outfn, xd.to_unicode()) else: debug("%s missing %d clues" % (outfn, nmissing)) missing_tsv += COLUMN_SEPARATOR.join( [xd.xdid(), pubid, str(nmissing)]) + EOL except Exception as e: error("remix error %s" % str(e)) if remixed: info("%d remixed: %s" % (len(remixed), " ".join(remixed))) try: outf.write_file( parse_pathname(fn).base + ".xd", contents.encode("utf-8")) except Exception as e: error("couldn't write: " + str(e)) outf.write_file("remix.log", get_log().encode("utf-8")) outf.write_file("remix.tsv", missing_tsv)
def main():
    p = args_parser('catalog source files and create source.tsv')
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    info("importing from %s" % args.source)

    outf = open_output()

    sources = []

    for input_source in args.inputs:
        for fn, contents, dt in find_files_with_time(input_source):
            if len(contents) == 0:
                info("ignoring empty file")
                continue

            outf.write_file(strip_toplevel(fn), contents, dt)
            sources.append(xd_sources_row(fn, args.source or input_source, iso8601(dt)))

    info("%s files cataloged" % len(sources))

    outbase = parse_pathname(args.output).base

    outf.write_file("%s.tsv" % outbase, xd_sources_header + "".join(sources))
    outf.write_file("%s.log" % outbase, get_log())
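# The xd_sources_row()/xd_sources_header helpers used above are defined
# elsewhere in xdfile; their exact implementations aren't shown here. A
# minimal sketch of what they're assumed to look like, given that rows are
# later read back with columns SourceFilename/ExternalSource/DownloadTime
# (tab-separated, newline-terminated):

COLUMN_SEPARATOR = "\t"
EOL = "\n"

xd_sources_header = COLUMN_SEPARATOR.join(["SourceFilename", "ExternalSource", "DownloadTime"]) + EOL

def xd_sources_row(fn, source, dt_str):
    # one TSV row per cataloged file
    return COLUMN_SEPARATOR.join([fn, source, dt_str]) + EOL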
def main(): p = args_parser("process huge puzzles archive into separate .zip and create sources.tsv") p.add_argument("-s", "--source", default=None, help="ExternalSource") args = get_args(parser=p) outf = open_output() if args.source: source = args.source else: source = parse_pathname(args.inputs[0]).base subzips = {} for inputfn in args.inputs: for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn): if not contents: continue m = re.match(r"^([a-z]{2,4})[\-0-9]{1}\d.*", parse_pathname(fn).base, flags=re.IGNORECASE) prefix = m.group(1).lower() if m else "misc" if prefix not in subzips: zf = xdfile.utils.OutputZipFile(os.path.join(args.output, prefix + ".zip")) sources = [] subzips[prefix] = (zf, sources) else: zf, sources = subzips[prefix] progress("Processing %s -> %s" % (fn, prefix)) zf.write_file(fn, contents, dt) sources.append(xd_sources_row(fn, source, iso8601(dt))) for zf, sources in subzips.values(): zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
def main():
    global all_uses

    args = get_args('create word pages and index')
    outf = open_output()

    all_uses = {}

    for ca in clues():
        if ca.answer not in all_uses:
            all_uses[ca.answer] = []
        all_uses[ca.answer].append(ca)

    h = '<li>%d different words</li>' % len(all_uses)

    h += '<h2>Most used words</h2>'
    h += '<table class="clues most-used-words">'
    h += th("word", "# uses", "clues used with this answer")

    wordpages_to_make = set(args.inputs)

    for answer, uses in sorted(all_uses.items(), reverse=True, key=lambda x: len(x[1]))[:100]:
        wordpages_to_make.add(answer)
        h += td(mkhref(answer.upper(), answer.upper()), len(uses), html_select_options(uses, strmaker=lambda ca: ca.clue))

    h += '</table>'

    for word in wordpages_to_make:
        outf.write_html('word/%s/index.html' % word.upper(), mkwww_wordpage(word), title=word)

    outf.write_html('word/index.html', h, title="Words")
def main():
    args = get_args('aggregates all .log files into one .html')
    outwww = open_output()

    log_html = ''

    for fn, contents, dt in sorted(find_files_with_time(*args.inputs, ext=".log"), key=lambda x: x[2]):  # earliest first
        # NOTE: cgi.escape was deprecated and removed in Python 3.8;
        # html.escape is the stdlib replacement
        log_html += '\n\n<h2>%s</h2><pre>%s</pre>' % (fn, cgi.escape(contents.decode("utf-8")))

    datestr = iso8601()
    outwww.write_html("logs.html", log_html, title="logs for " + datestr)
def main():
    args = utils.get_args()
    outf = utils.open_output()

    for htmlfn, contents in utils.find_files(*args.inputs):
        basepagename = utils.parse_pathname(htmlfn).base

        wrappeddiv = '<div class="text">' + contents.decode('utf-8') + '</div>'

        outf.write_html('%s/index.html' % basepagename, wrappeddiv)
def main():
    global boiled_clues
    args = get_args('create clue index')
    outf = open_output()

    boiled_clues = load_clues()

    biggest_clues = "<li>%d total clues, which boil down to %d distinct clues" % (len(clues()), len(boiled_clues))

    bcs = [(len(v), bc, answers_from(v)) for bc, v in boiled_clues.items()]

    nreused = len([bc for n, bc, _ in bcs if n > 1])
    biggest_clues += "<li>%d (%d%%) of these clues are used in more than one puzzle" % (nreused, nreused * 100 / len(boiled_clues))

    cluepages_to_make = set()

    # add all boiled clues from all input .xd files
    for fn, contents in find_files(*args.inputs, ext='.xd'):
        progress(fn)
        xd = xdfile.xdfile(contents.decode('utf-8'), fn)
        for pos, mainclue, mainanswer in xd.iterclues():
            cluepages_to_make.add(boil(mainclue))

    # add top 100 most used boiled clues from corpus
    biggest_clues += '<h2>Most used clues</h2>'
    biggest_clues += '<table class="clues most-used-clues">'
    biggest_clues += th("clue", "# uses", "answers used with this clue")

    for n, bc, ans in sorted(bcs, reverse=True)[:100]:
        cluepages_to_make.add(bc)
        biggest_clues += td(mkhref(unboil(bc), bc), n, html_select_options(ans))

    biggest_clues += '</table>'

    most_ambig = "<h2>Most ambiguous clues</h2>"
    most_ambig += '(clues with the largest number of different answers)'
    most_ambig += '<table class="clues most-different-answers">'
    most_ambig += th("Clue", "answers")

    for n, bc, ans in sorted(bcs, reverse=True, key=lambda x: len(set(x[2])))[:100]:
        cluepages_to_make.add(bc)
        clue = mkhref(unboil(bc), bc)
        if 'quip' in bc or 'quote' in bc or 'theme' in bc or 'riddle' in bc:
            most_ambig += td(clue, html_select_options(ans), rowclass="theme")
        else:
            most_ambig += td(clue, html_select_options(ans))

    most_ambig += '</table>'

    for bc in cluepages_to_make:
        contents = mkwww_cluepage(bc)
        if contents:
            outf.write_html('pub/clue/%s/index.html' % bc, contents, title=bc)

    outf.write_html('pub/clue/index.html', biggest_clues + most_ambig, title="Clues")
def main(): args = get_args("save all clues in simple .tsv") outf = open_output("clues.tsv") outf.write_row("PublicationId Date Answer Clue".split()) for xd in corpus(*args.inputs): pubid = xd.publication_id() dt = xd.date() for pos, clue, answer in xd.clues: outf.write_row((pubid or "", dt or "", answer, clue))
def main():
    args = utils.get_args()

    outf = utils.open_output()  # should be .zip
    outf.log = False
    outf.toplevel = 'xd'

    outf.write_file('README', open('doc/zip-README').read())
    outf.write_file('puzzles.tsv', open('pub/puzzles.tsv').read())
    outf.write_file('stats.tsv', open('pub/stats.tsv').read())
    outf.write_file('similar.tsv', open('gxd/similar.tsv').read())
def main(): args = get_args("reclue puzzle with clues from other publications") outf = open_output() all_clues = load_clues() missing_tsv = COLUMN_SEPARATOR.join([ "grid_xdid", "clues_pubid", "num_missing" ]) + EOL for fn, contents in find_files(*args.inputs, ext=".xd"): xd = xdfile(contents, fn) if not xd.grid: continue xd.set_header("Title", None) xd.set_header("Editor", "Timothy Parker Bot") xd.set_header("Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last))) xd.set_header("Copyright", None) xd.set_header("Date", iso8601()) remixed = set() for pubid, pub_clues in list(all_clues.items()): try: if pubid == xd.publication_id(): continue # don't use same publisher's clues nmissing = reclue(xd, pub_clues) outfn = "%s-%s.xd" % (xd.xdid(), pubid) if nmissing == 0: nmutated = 0 while nmutated < 100: nmutated += mutate(xd, pub_clues) nmissing = reclue(xd, pub_clues) info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated)) remixed.add(pubid) outf.write_file(outfn, xd.to_unicode()) else: debug("%s missing %d clues" % (outfn, nmissing)) missing_tsv += COLUMN_SEPARATOR.join([ xd.xdid(), pubid, str(nmissing) ]) + EOL except Exception as e: error("remix error %s" % str(e)) if remixed: info("%d remixed: %s" % (len(remixed), " ".join(remixed))) try: outf.write_file(parse_pathname(fn).base + ".xd", contents.encode("utf-8")) except Exception as e: error("couldn't write: " + str(e)) outf.write_file("remix.log", get_log().encode("utf-8")) outf.write_file("remix.tsv", missing_tsv)
def main(): args = get_args(desc="find similar grids") g_corpus = [x for x in corpus()] outf = open_output() outf.write(xd_similar_header) for fn, contents in find_files(*args.inputs, strip_toplevel=False): needle = xdfile(contents.decode("utf-8"), fn) for pct, a, b in find_similar_to(needle, g_corpus): outf.write(xd_similar_row(a, b, pct))
def main():
    args = utils.get_args()

    outf = utils.open_output()  # should be .zip
    outf.log = False
    outf.toplevel = 'xd'

    outf.write_file('README', open('doc/zip-README').read())

    for fn, contents in sorted(utils.find_files(*args.inputs, ext='.xd')):
        xdid = utils.parse_xdid(fn)
        if metadb.is_public(xdid):
            outf.write_file(utils.strip_toplevel(fn), contents)
def main():
    args = utils.get_args('make clues.tsv files')

    outf = utils.open_output()  # should be .zip
    outf.log = False
    outf.toplevel = 'xd'

    outf.write_file('README', open('doc/zip-README').read())

    all_clues = [(ca.pubid, str(xdfile.year_from_date(ca.date)), ca.answer, ca.clue) for ca in xdfile.clues()]

    clues_tsv = ''
    clues_tsv += '\t'.join("pubid year answer clue".split()) + '\n'
    clues_tsv += '\n'.join('\t'.join(cluerow) for cluerow in sorted(all_clues))

    outf.write_file('clues.tsv', clues_tsv)
def main():
    args = get_args('aggregates all .log files')
    outf = open_output()

    s3 = boto3.resource('s3')
    s3path = "logs/"
    # bucket = conn.get_bucket(s3path)
    bucket = s3.Bucket(os.environ['DOMAIN'])

    for obj in sorted(bucket.objects.all(), key=lambda x: x.last_modified):
        if s3path in obj.key:
            print("Name: %s LastModified: %s" % (obj.key.encode('utf-8'), obj.last_modified))

    for fn, contents, dt in sorted(find_files_with_time(*args.inputs, ext=".log"), key=lambda x: x[2]):  # earliest first
        outf.write_file(fn, contents.decode("utf-8"))
def main():
    global boiled_clues
    args = get_args('create clue index')
    outf = open_output()

    boiled_clues = load_clues()

    biggest_clues = "<li>%d total clues, which boil down to %d distinct clues" % (len(clues()), len(boiled_clues))

    bcs = [(len(v), bc, answers_from(v)) for bc, v in boiled_clues.items()]

    nreused = len([bc for n, bc, _ in bcs if n > 1])
    biggest_clues += "<li>%d (%d%%) of these clues are used in more than one puzzle" % (nreused, nreused * 100 / len(boiled_clues))

    cluepages_to_make = set()

    biggest_clues += '<h2>Most used clues</h2>'
    biggest_clues += '<table class="clues most-used-clues">'
    biggest_clues += th("clue", "# uses", "answers used with this clue")

    for n, bc, ans in sorted(bcs, reverse=True)[:100]:
        cluepages_to_make.add(bc)
        biggest_clues += td(mkhref(unboil(bc), bc), n, html_select_options(ans))

    biggest_clues += '</table>'

    most_ambig = "<h2>Most ambiguous clues</h2>"
    most_ambig += '(clues with the largest number of different answers)'
    most_ambig += '<table class="clues most-different-answers">'
    most_ambig += th("Clue", "answers")

    for n, bc, ans in sorted(bcs, reverse=True, key=lambda x: len(set(x[2])))[:100]:
        cluepages_to_make.add(bc)
        clue = mkhref(unboil(bc), bc)
        if 'quip' in bc or 'quote' in bc or 'theme' in bc or 'riddle' in bc:
            most_ambig += td(clue, html_select_options(ans), rowclass="theme")
        else:
            most_ambig += td(clue, html_select_options(ans))

    most_ambig += '</table>'

    for bc in cluepages_to_make:
        outf.write_html('pub/clue/%s/index.html' % bc, mkwww_cluepage(bc), title=bc)

    outf.write_html('pub/clue/index.html', biggest_clues + most_ambig, title="Clues")
def main():
    args = get_args('parse downloaded emails')
    outf = open_output()

    sources_tsv = ''
    for emailfn, emailcontents in find_files(*args.inputs):
        msg = email.message_from_bytes(emailcontents)
        upload_src = msg["From"]

        if not upload_src:
            continue

        email_sources_tsv = []
        email_files = generate_email_files(msg)
        for puzfn, puzdata, puzdt in email_files:
            # a basic sanity check of filesize
            # accommodate small puzzles and .pdf
            info("%s: %s from %s" % (puzfn, iso8601(puzdt), upload_src))
            summary("%s puzzles from %s" % (len(email_files), upload_src))

            if len(puzdata) > 1000 and len(puzdata) < 100000:
                email_sources_tsv.append(xd_sources_row(puzfn, upload_src, iso8601(puzdt)))

                outf.write_file(puzfn, puzdata)

        # generate receipt row, send receipt email
        if email_sources_tsv:
            xd_send_email(upload_src,
                          fromaddr='*****@*****.**',
                          subject='Upload successful: %d files received' % len(email_sources_tsv),
                          body="These files were received:\n" + "\n".join(email_sources_tsv))
            sources_tsv += "".join(email_sources_tsv)
        else:
            xd_send_email(upload_src,
                          fromaddr='*****@*****.**',
                          subject='Upload error',
                          body='No puzzle files received')
def main():
    args = utils.get_args('generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')
    xdids_todo = [parse_pathname(fn).base for fn in args.inputs]
    if not xdids_todo:
        xdids_todo = [xdid for xdid, matches in metadb.get_similar_grids().items() if matches]

    for mainxdid in xdids_todo:
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            continue

        matches = metadb.get_similar_grids().get(mainxdid, [])

        xddates = {}
        xddates[mainxdid] = mainxd.date()  # dict to store XD dates for further sort
        html_grids = {}
        html_clues = {}

        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        poss_answers = []  # TODO:
        pub_uses = {}  # [pubid] -> set(ClueAnswer)

        dcl_html = ''
        deepcl_html = []  # keep deep clues to parse later - per row
        for pos, mainclue, mainanswer in mainxd.iterclues():
            deepcl_html = []  # temporary, to be replaced later

            mainca = ClueAnswer(mainxdid, mainxd.date(), mainanswer, mainclue)

            # 'grid position' column
            deepcl_html.append('<td class="pos">%s.</td>' % pos)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # add 'other uses' to clues_html
            stale = False
            deepcl_html.append('<td class="other-uses">')

            if len(pub_uses) > 0:
                sortable_uses = []
                for pubid, uses in pub_uses.items():
                    # show the earliest unboiled clue
                    for u in sorted(uses, key=lambda x: x.date or ""):
                        # only show those published earlier
                        if u.date and u.date <= mainxd.date():
                            if pubid == mainxdid and u.date == mainxd.date():
                                pass
                            else:
                                stale = True
                                sortable_uses.append((u.date, u, 1))

                deepcl_html.append(html_select([(clue, nuses) for dt, clue, nuses in sorted(sortable_uses, key=lambda x: x[0], reverse=True)], top_option=mainclue))
            else:
                deepcl_html.append('<div class="original">%s</div>' % esc(mainclue))

            deepcl_html.append('</td>')

            # add 'other answers' to clues_html
            deepcl_html.append('<td class="other-answers">')
            deepcl_html.append(html_select_options(poss_answers, strmaker=lambda ca: ca.answer, force_top=mainca))
            deepcl_html.append('</td>')

            # add 'other clues' to clues_html
            deepcl_html.append('<td class="other-clues">')

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < mainxd.date()]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

                if uses:
                    deepcl_html.append(html_select(uses))

            deepcl_html.append('</td>')  # end 'other-clues'

            if stale_answer:
                nstaleanswers += 1
            if stale:
                nstaleclues += 1
            ntotalclues += 1

            # quick and dirty - to be replaced
            dcl_html += '<tr>' + ' '.join(deepcl_html) + '</tr>'

        # store in list to make further formatting as html table easier
        mainxd = xdfile.get_xd(mainxdid)
        if mainxd:
            html_grids[mainxdid] = grid_diff_html(mainxd)

        # add for main XD
        diff_l = []
        for pos, mainclue, mainanswer in mainxd.iterclues():
            diff_h = mktag('div', 'fullgrid main') + '%s. ' % pos
            diff_h += mainclue
            diff_h += mktag('span', tagclass='main', inner=' ~ ' + mainanswer.upper())
            diff_l.append(diff_h)
        html_clues[mainxdid] = diff_l

        # process all matches
        for xdid in matches:
            xd = xdfile.get_xd(xdid)
            if not xd:
                continue
            xddates[xdid] = xd.date()

            # output each grid
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)
            diff_l = []

            # output comparison of each set of clues
            for pos, clue, answer in xd.iterclues():
                diff_h = mktag('div', 'fullgrid') + '%s. ' % pos

                # get_clue() can sometimes return clue == None
                sm = difflib.SequenceMatcher(lambda x: x == ' ', mainxd.get_clue(pos) or '', clue)
                if sm.ratio() < 0.50:
                    diff_h += clue
                else:
                    # compare based on opcodes
                    for opcode in sm.get_opcodes():
                        c, a1, a2, b1, b2 = opcode
                        if c == 'equal':
                            diff_h += '<span class="match">%s</span>' % clue[b1:b2]
                        else:
                            diff_h += '<span class="diff">%s</span>' % clue[b1:b2]

                diff_h += mktag('span', tagclass=(answer == mainxd.get_answer(pos)) and 'match' or 'diff', inner=' ~ ' + answer.upper())
                diff_h += mktag('/div')
                diff_l.append(diff_h)

            html_clues[xdid] = diff_l

        # wrap into table
        diff_h = mktag('table') + mktag('tr')

        # sort by date
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        for w, dt in sortedkeys:
            diff_h += mktag('td') + html_grids[w] + mktag('/td')
        diff_h += mktag('/tr')

        for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
            diff_h += mktag('tr')
            for w, dt in sortedkeys:
                if i < len(html_clues[w]):
                    diff_h += mktag('td') + html_clues[w][i] + mktag('/td')
            diff_h += mktag('/tr')

        # process deepclues
        diff_h += mktag('table') + dcl_html + mktag('/table')

        diff_h += mktag('/table')

        outf.write_html('pub/deep/%s/index.html' % mainxdid, diff_h, title='Deep clue comparison for ' + mainxdid)
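# mktag() isn't defined in this file; from its uses above ('/div' closes a
# tag, inner= wraps content inside an open/close pair) it's assumed to behave
# roughly like this sketch:

def mktag(tagname, tagclass='', inner=None):
    if tagname.startswith('/'):
        return '<%s>' % tagname  # e.g. mktag('/div') -> '</div>'
    h = '<%s class="%s">' % (tagname, tagclass) if tagclass else '<%s>' % tagname
    if inner is not None:
        h += inner + '</%s>' % tagname  # open + content + close in one call
    return h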
def main(): p = utils.args_parser(desc="generate pubyear svg and pubyear pages") p.add_argument('-p', '--pubonly', action="store_true", default=False, help='only output root map') args = utils.get_args(parser=p) outf = utils.open_output() pubyears = defaultdict(list) pubyears_idx = defaultdict(list) # years_idx = [] for r in metadb.read_rows('pub/stats'): y = r.year or '0000' pubyear = r.pubid + str(y) pubyears[pubyear].append(r) if y not in pubyears_idx[r.pubid]: pubyears_idx[r.pubid].append(y) # if r.year not in years_idx: # years_idx.append(r.year) # Making collapsed decades depends on args allyears = [] for i in range(DECADE_SKIP_START // 10, DECADE_SKIP_END // 10 + 1): allyears.append("%s0s" % i) allyears.extend( [str(y) for y in range(DECADE_SKIP_END + 10, date.today().year + 1)]) html_out = [] html_out.append( '<p>Grouped by publication-year and broken out by day-of-week (Monday at top, Sunday at bottom).</p>' ) html_out.append(legend) # See definition above html_out.append('<table id="pubyearmap" cellspacing="0" cellpadding="0">') # Table header with years \ decades year_header = gen_year_header(allyears) html_out.extend(year_header) pubs_total = {} for pubid in pubyears_idx: pubs_total[pubid] = len(metadb.xd_puzzles(pubid)) # sort rows by number of puzzles sorted_pubs = sorted(pubs_total.keys(), key=lambda pubid: pubs_total[pubid], reverse=True) for pub in args.inputs or sorted_pubs: if pubs_total[pub] < 20: continue # Process each pub in index pubobj = metadb.xd_publications().get(pub) if pubobj: pubname = pubobj.PublicationName or pubobj.PublisherName else: pubname = pub html_out.append('<tr><td class="header">{}</td>'.format( html.mkhref(pubname, 'pub/' + pub))) for year in sorted(allyears): html_out.append('<td class="year_widget">') py_td = td_for_pubyear(pubyears, pub, year) if py_td: html_out.append(py_td) if not args.pubonly: outf.write_html( 'pub/{pub}{year}/index.html'.format(**locals()), pubyear_html(pub, year), "{pubname}, {year}".format(**locals())) else: # otherwise width = svg_w if 's' not in year else svg_w * decade_scale html_out.append( pys.format(w=width, h=svg_h, title='', classes='notexists', body='')) html_out.append('</td>') # Add totals + publishers html_out.append('<td class="header">{}</td>'.format(pubs_total[pub])) html_out.append('<td class="header">{}</td>'.format( html.mkhref(pubname, 'pub/' + pub))) html_out.append('</tr>') html_out.extend(year_header) html_out.append('</table>') total_xd = len(metadb.xd_puzzles()) outf.write_html('index.html', "".join(html_out), "Comparison of %s published crossword grids" % total_xd)
def main():
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    pubyears = {}  # set()
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # might be empty date or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue
            xd = xdfile.get_xd(r.xdid)
            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # might be empty date or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd
            # todo
            nexisting = 0

            nxd = len(byweekday[weekday])
            public_xdids = []  # empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))

            def process_counter(count, comp_value):
                # pick the most common item, annotating it with its count
                # unless it accounts for every xd
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.match_xdid, r.xdid))
                    continue
                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                if dt2 < dt1:  # only capture the later one
                    # deduce_similarity_type
                    if diff_authors(aut1, aut2):  # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row("pub/stats", (pubid, year, weekday, mainformat, maineditor, maincopyright, nexisting, nxd, npublic, reprints, touchups, redones, copies, themecopies))
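# dow_from_date() is assumed to map an ISO-8601 date header to a short
# weekday name ('Mon'..'Sun'), returning '' for empty or year-only dates,
# which is how its callers above treat it; a minimal sketch:

from datetime import datetime

def dow_from_date(datestr):
    try:
        return datetime.strptime(datestr or '', '%Y-%m-%d').strftime('%a')
    except ValueError:
        return ''  # empty date, or only a year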
def main():
    args = utils.get_args('generates .html diffs for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')
    xdids_todo = {}

    for row in metadb.xd_similar_all():
        if row.xdid not in xdids_todo:
            xdids_todo[row.xdid] = []
        xdids_todo[row.xdid].append(row)

    for mainxdid in xdids_todo:
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            warn('%s not in corpus' % mainxdid)
            continue

        matches = xdids_todo[mainxdid]
        info('generating diffs for %s (%d matches)' % (mainxdid, len(matches)))

        xddates = {}
        xddates[mainxdid] = mainxd.date()  # dict to store XD dates for further sort
        html_grids = {}
        html_clues = {}

        # store in list to make further formatting as html table easier
        html_grids[mainxdid] = grid_diff_html(xdfile.get_xd(mainxdid))

        # add for main XD
        diff_l = []
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not mainclue:
                continue
            diff_h = mktag('div', 'fullgrid main') + '%s. ' % pos
            diff_h += mainclue
            diff_h += mktag('span', tagclass='main', inner=' ~ ' + mainanswer.upper())
            diff_l.append(diff_h)
        html_clues[mainxdid] = diff_l

        # process all matches
        for row in matches:
            xdid = row.match_xdid
            xd = xdfile.get_xd(xdid)
            if not xd:  # skip if the matched xdid can't be loaded
                continue
            xddates[xdid] = xd.date()

            # output each grid
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)
            diff_l = []

            # output comparison of each set of clues
            for pos, clue, answer in xd.iterclues():
                diff_h = mktag('div', 'fullgrid') + '%s. ' % pos
                if not clue:  # iterclues() can sometimes return clue == None
                    continue

                mainclue = mainxd.get_clue_for_answer(answer)
                sm = difflib.SequenceMatcher(lambda x: x == ' ', mainclue or '', clue)
                debug('MCLUE: %s [%s]' % (mainclue, sm.ratio()))
                if mainclue is None or sm.ratio() < 0.40:
                    diff_h += clue
                else:
                    # compare based on opcodes
                    for opcode in sm.get_opcodes():
                        c, a1, a2, b1, b2 = opcode
                        if c == 'equal':
                            diff_h += '<span class="match">%s</span>' % clue[b1:b2]
                        else:
                            diff_h += '<span class="diff">%s</span>' % clue[b1:b2]

                tagclass = 'match' if mainclue or answer == mainxd.get_answer(pos) else 'diff'
                diff_h += mktag('span', tagclass=tagclass, inner=' ~ ' + answer.upper())
                diff_h += mktag('/div')
                diff_l.append(diff_h)

            html_clues[xdid] = diff_l

        # wrap into table
        diff_h = mktag('table') + mktag('tr')

        # sort by date
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        for w, dt in sortedkeys:
            diff_h += mktag('td') + html_grids[w] + mktag('/td')
        diff_h += mktag('/tr')

        for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
            diff_h += mktag('tr')
            for w, dt in sortedkeys:
                if i < len(html_clues[w]):
                    diff_h += mktag('td') + html_clues[w][i] + mktag('/td')
            diff_h += mktag('/tr')

        diff_h += mktag('/table')

        outf.write_html('pub/%s/index.html' % mainxdid, diff_h, title='Comparison for ' + mainxdid)
def main(): p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus") p.add_argument("-a", "--all", default=False, help="analyze all puzzles, even those already in similar.tsv") p.add_argument("-l", "--limit", default=100, help="limit amount of puzzles to be analyzed [default=100]") args = get_args(parser=p) outf = open_output() num_processed = 0 prev_similar = metadb.read_rows("gxd/similar") for fn, contents in find_files(*args.inputs, ext=".xd"): progress(fn) mainxd = xdfile(contents.decode("utf-8"), fn) if mainxd.xdid() in prev_similar: continue # skip reprocessing .xd that are already in similar.tsv """ find similar grids (pct, xd) for the mainxd in the corpus. Takes about 1 second per xd. sorted by pct. """ similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20), key=lambda x: x[0], reverse=True) num_processed += 1 if num_processed > int(args.limit): break if similar_grids: info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids)) mainpubid = mainxd.publication_id() maindate = mainxd.date() # go over each clue/answer, find all other uses, other answers, other possibilities. # these are added directly to similar.tsv nstaleclues = 0 nstaleanswers = 0 ntotalclues = 0 for pos, mainclue, mainanswer in mainxd.iterclues(): progress(mainanswer) poss_answers = [] pub_uses = {} # [pubid] -> set(ClueAnswer) mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue) # find other uses of this clue, and other answers, in a single pass for clueans in find_clue_variants(mainclue): if clueans.answer != mainanswer: poss_answers.append(clueans) if clueans.answer == mainanswer: if clueans.pubid in pub_uses: otherpubs = pub_uses[clueans.pubid] else: otherpubs = set() # set of ClueAnswer pub_uses[clueans.pubid] = otherpubs otherpubs.add(clueans) # bclues is all boiled clues for this particular answer: { [bc] -> #uses } bclues = load_answers().get(mainanswer, []) stale_answer = False if bclues: uses = [] for bc, nuses in bclues.items(): # then find all clues besides this one clue_usages = [ ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < maindate ] if clue_usages: stale_answer = True if nuses > 1: # only use one (the most recent) ClueAnswer per boiled clue # but use the clue only (no xdid) ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue else: ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1] uses.append((ca, nuses)) # summary row to similar.tsv metadb.append_row( "gxd/similar", [ mainxd.xdid(), # xdid int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)), # similar_grid_pct nstaleclues, # reused_clues nstaleanswers, # reused_answers ntotalclues, # total_clues " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids), # matches ], )
def main(): p = utils.args_parser(desc="generate pubyear svg and pubyear pages") p.add_argument('-p', '--pubonly', action="store_true", default=False, help='only output root map') args = utils.get_args(parser=p) outf = utils.open_output() pubyears = defaultdict(list) pubyears_idx = defaultdict(list) # years_idx = [] for r in metadb.read_rows('pub/stats'): y = r.year or '0000' pubyear = r.pubid + str(y) pubyears[pubyear].append(r) if y not in pubyears_idx[r.pubid]: pubyears_idx[r.pubid].append(y) # if r.year not in years_idx: # years_idx.append(r.year) # Making collapsed decades depends on args allyears = [] for i in range(DECADE_SKIP_START//10, DECADE_SKIP_END//10 + 1): allyears.append("%s0s" % i) allyears.extend([ str(y) for y in range(DECADE_SKIP_END + 10, date.today().year + 1) ]) html_out = [] html_out.append('<p>Grouped by publication-year and broken out by day-of-week (Monday at top, Sunday at bottom).</p>') html_out.append(legend) # See definition above html_out.append('<table id="pubyearmap" cellspacing="0" cellpadding="0">') # Table header with years \ decades year_header = gen_year_header(allyears) html_out.extend(year_header) pubs_total = {} for pubid in pubyears_idx: pubs_total[pubid] = len(metadb.xd_puzzles(pubid)) # sort rows by number of puzzles sorted_pubs = sorted(pubs_total.keys(), key=lambda pubid: pubs_total[pubid], reverse=True) for pub in args.inputs or sorted_pubs: if pubs_total[pub] < 20: continue # Process each pub in index pubobj = metadb.xd_publications().get(pub) if pubobj: pubname = pubobj.PublicationName or pubobj.PublisherName else: pubname = pub html_out.append('<tr><td class="header">{}</td>'.format(html.mkhref(pubname, pub))) for year in sorted(allyears): html_out.append('<td class="year_widget">') py_td = td_for_pubyear(pubyears, pub, year) if py_td: html_out.append(py_td) if not args.pubonly: outf.write_html('pub/{pub}{year}/index.html'.format(**locals()), pubyear_html(pub, year), "{pubname}, {year}".format(**locals())) else: # otherwise width = svg_w if 's' not in year else svg_w*decade_scale html_out.append(pys.format(w=width, h=svg_h, title='', classes='notexists', body='')) html_out.append('</td>') # Add totals + publishers html_out.append('<td class="header">{}</td>'.format(pubs_total[pub])) html_out.append('<td class="header">{}</td>'.format(html.mkhref(pubname, pub))) html_out.append('</tr>') html_out.extend(year_header) html_out.append('</table>') total_xd = len(metadb.xd_puzzles()) outf.write_html('index.html', "".join(html_out), "Comparison of %s published crossword grids" % total_xd)
def main(): args = utils.get_args("generates .html diffs for all puzzles in similar.tsv") outf = utils.open_output() similars = utils.parse_tsv("gxd/similar.tsv", "Similar") xdids_todo = args.inputs or [xdid for xdid, matches in metadb.get_similar_grids().items() if matches] for mainxdid in xdids_todo: progress(mainxdid) mainxd = xdfile.get_xd(mainxdid) if not mainxd: continue matches = metadb.get_similar_grids().get(mainxdid, []) xddates = {} xddates[mainxdid] = mainxd.date() # Dict to store XD dates for further sort html_grids = {} html_clues = {} # Store in list to make further formatting as html table easier html_grids[mainxdid] = grid_diff_html(xdfile.get_xd(mainxdid)) # Add for main XD diff_l = [] for pos, mainclue, mainanswer in mainxd.iterclues(): diff_h = mktag("div", "fullgrid main") + "%s. " % pos diff_h += mainclue diff_h += mktag("span", tagclass="main", inner=" ~ " + mainanswer.upper()) diff_l.append(diff_h) html_clues[mainxdid] = diff_l # Process for all matches for xdid in matches: xd = xdfile.get_xd(xdid) # Continue if can't load xdid if not xd: continue xddates[xdid] = xd.date() # output each grid html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd) diff_l = [] # output comparison of each set of clues for pos, clue, answer in xd.iterclues(): diff_h = mktag("div", "fullgrid") + "%s. " % pos # Sometimes can return clue == None mainclue = mainxd.get_clue_for_answer(answer) sm = difflib.SequenceMatcher(lambda x: x == " ", mainclue or "", clue) debug("MCLUE: %s [%s]" % (mainclue, sm.ratio())) if mainclue is None or sm.ratio() < 0.40: diff_h += clue else: # Compare based on op codes for opcode in sm.get_opcodes(): c, a1, a2, b1, b2 = opcode if c == "equal": diff_h += '<span class="match">%s</span>' % clue[b1:b2] else: diff_h += '<span class="diff">%s</span>' % clue[b1:b2] tagclass = "match" if mainclue or answer == mainxd.get_answer(pos) else "diff" diff_h += mktag("span", tagclass=tagclass, inner=" ~ " + answer.upper()) diff_h += mktag("/div") diff_l.append(diff_h) html_clues[xdid] = diff_l # Wrap into table diff_h = mktag("table") + mktag("tr") # Sort by date sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1)) for w, dt in sortedkeys: # Wrap into table diff_h += mktag("td") + html_grids[w] + mktag("/td") diff_h += mktag("/tr") for i, clue in enumerate(html_clues[sortedkeys[0][0]]): diff_h += mktag("tr") for w, dt in sortedkeys: if i < len(html_clues[w]): diff_h += mktag("td") + html_clues[w][i] + mktag("/td") diff_h += mktag("/tr") diff_h += mktag("/table") outf.write_html("pub/%s/index.html" % mainxdid, diff_h, title="Comparison for " + mainxdid)
def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    todaystr = today.strftime("%Y-%m-%d")

    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        summary("*** %s: downloading %d puzzles from %s to %s" % (pubid, len(dates_to_get), from_date, to_date))

        for dt in sorted(dates_to_get):
            try:
                xdid = construct_xdid(pubid, dt)
                url = dt.strftime(puzsrc.urlfmt)
                fn = "%s.%s" % (xdid, puzsrc.ext)
                debug("downloading '%s' from '%s'" % (fn, url))

                response = urllib.request.urlopen(url)
                content = response.read()

                outf.write_file(fn, content)

                most_recent[pubid] = todaystr
            except (urllib.error.HTTPError, urllib.error.URLError) as err:
                error('%s [%s] %s: %s' % (xdid, err.code, err.reason, url))
            except Exception as e:
                error(str(e))

            sources_tsv += xd_sources_row(fn, url, todaystr)

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

    if sources_tsv:
        outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        open(metadb.RECENT_DOWNLOADS_TSV, "w").write(xd_recents_header + "".join(sorted(new_recents_tsv)))
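# get_dates_between() is assumed to enumerate candidate publication dates
# after from_date, up to and including to_date, stepping by the source's
# publication frequency in days; a minimal sketch under that assumption
# (using datetime.date for both endpoints):

import datetime

def get_dates_between(from_date, to_date, freq_days=1):
    ndays = (to_date - from_date).days
    return [from_date + datetime.timedelta(days=n)
            for n in range(1, ndays + 1)
            if n % freq_days == 0]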
def main():
    global args

    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so most recent edition gets main slot in case of shelving conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False), reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")

                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource, SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                # only add receipt if first time converting this source
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([CaptureTime, ReceivedTime, ExternalSource, InternalSource, SourceFilename, xdid])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()

    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        # dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        dates_to_get = get_ungotten_dates(pubid, from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        all_dates_to_get = sorted(dates_to_get)
        dates_to_get = dates_to_get[0:10] + dates_to_get[-10:]

        summary("*** %s: %d puzzles from %s to %s not yet gotten, getting %d of them" % (pubid, len(all_dates_to_get), all_dates_to_get[0], to_date, len(dates_to_get)))

        most_recent[pubid] = str(download_puzzles(outf, puzsrc, pubid, dates_to_get))

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

    # if sources_tsv:
    #     outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        open(metadb.RECENT_DOWNLOADS_TSV, "w").write(xd_recents_header + "".join(sorted(new_recents_tsv)))
def main(): p = utils.args_parser( desc="annotate puzzle clues with earliest date used in the corpus") p.add_argument( '-a', '--all', default=False, help='analyze all puzzles, even those already in similar.tsv') p.add_argument('-l', '--limit', default=100, help='limit amount of puzzles to be analyzed [default=100]') args = get_args(parser=p) outf = open_output() num_processed = 0 prev_similar = metadb.read_rows('gxd/similar') for fn, contents in find_files(*args.inputs, ext=".xd"): progress(fn) mainxd = xdfile(contents.decode('utf-8'), fn) if mainxd.xdid() in prev_similar: continue # skip reprocessing .xd that are already in similar.tsv """ find similar grids (pct, xd) for the mainxd in the corpus. Takes about 1 second per xd. sorted by pct. """ similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20), key=lambda x: x[0], reverse=True) num_processed += 1 if num_processed > int(args.limit): break if similar_grids: info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids)) mainpubid = mainxd.publication_id() maindate = mainxd.date() # go over each clue/answer, find all other uses, other answers, other possibilities. # these are added directly to similar.tsv nstaleclues = 0 nstaleanswers = 0 ntotalclues = 0 for pos, mainclue, mainanswer in mainxd.iterclues(): progress(mainanswer) poss_answers = [] pub_uses = {} # [pubid] -> set(ClueAnswer) mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue) # find other uses of this clue, and other answers, in a single pass for clueans in find_clue_variants(mainclue): if clueans.answer != mainanswer: poss_answers.append(clueans) if clueans.answer == mainanswer: if clueans.pubid in pub_uses: otherpubs = pub_uses[clueans.pubid] else: otherpubs = set() # set of ClueAnswer pub_uses[clueans.pubid] = otherpubs otherpubs.add(clueans) # bclues is all boiled clues for this particular answer: { [bc] -> #uses } bclues = load_answers().get(mainanswer, []) stale_answer = False if bclues: uses = [] for bc, nuses in bclues.items(): # then find all clues besides this one clue_usages = [ ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < maindate ] if clue_usages: stale_answer = True if nuses > 1: # only use one (the most recent) ClueAnswer per boiled clue # but use the clue only (no xdid) ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue else: ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1] uses.append((ca, nuses)) # summary row to similar.tsv metadb.append_row( 'gxd/similar', [ mainxd.xdid(), # xdid int(100 * sum( pct / 100.0 for pct, xd1, xd2 in similar_grids)), # similar_grid_pct nstaleclues, # reused_clues nstaleanswers, # reused_answers ntotalclues, # total_clues " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids) # matches ])
#!/usr/bin/env python3

# Usage:
#   $0 -o wwwroot/ gxd/redirects.tsv

from xdfile import html, utils

args = utils.get_args()
outf = utils.open_output()

for tsvfn, contents in utils.find_files(*args.inputs):
    for row in utils.parse_tsv_data(contents.decode('utf-8'), "Redirect"):
        outf.write_file(row.SourcePath, html.redirect_page(row.DestURL))
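# gxd/redirects.tsv is parsed with the "Redirect" schema above, so each row
# is assumed to carry SourcePath and DestURL columns; a row with hypothetical
# values:
#
#   SourcePath               DestURL
#   pub/oldname/index.html   /pub/newname/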
def main():
    args = utils.get_args('generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')

    xds_todo = []
    for fn, contents in find_files(*args.inputs, ext='.xd'):
        xd = xdfile.xdfile(contents.decode('utf-8'), fn)
        xds_todo.append(xd)

    for mainxd in xds_todo:
        mainxdid = mainxd.xdid()
        progress(mainxdid)

        matches = metadb.xd_similar(mainxdid)

        xddates = {}
        xddates[mainxdid] = mainxd.date()  # dict to store XD dates for further sort
        html_grids = {}

        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        dcl_html = '<tr>'
        dcl_html += '<th></th>'
        dcl_html += '<th>Clue</th>'
        dcl_html += '<th>ANSWERs</th>'
        dcl_html += '<th>Alt. clue possibilities</th>'
        dcl_html += '</tr>'

        deepcl_html = []  # keep deep clues to parse later - per row
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not pos:
                continue

            poss_answers = []  # TODO:
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            deepcl_html = []  # temporary, to be replaced later

            mainca = ClueAnswer(mainxdid, mainxd.date(), mainanswer, mainclue)

            # 'grid position' column
            deepcl_html.append('<td class="pos">%s.</td>' % pos)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # add 'other uses' to clues_html
            deepcl_html.append('<td class="other-uses">')

            prev = prev_uses(pub_uses, mainxd, mainclue)
            if prev:
                deepcl_html.append('<a href="/pub/clue/%s">%s [x%s]</a>' % (boil(mainclue), mainclue, len(prev)))
                nstaleclues += 1
            else:
                deepcl_html.append(mainclue)

            deepcl_html.append('</td>')

            # add 'other answers' to clues_html
            deepcl_html.append('<td class="other-answers">')
            deepcl_html.append(html_select_options(poss_answers, strmaker=lambda ca: ca.answer, force_top=mainca, add_total=False))
            deepcl_html.append('</td>')

            # add 'other clues' to clues_html
            deepcl_html.append('<td class="other-clues">')

            other_clues = html_other_clues(mainanswer, mainclue, mainxd)
            if other_clues:
                deepcl_html.append(other_clues)
                nstaleanswers += 1

            deepcl_html.append('</td>')  # end 'other-clues'

            ntotalclues += 1
            # quick and dirty - to be replaced
            dcl_html += '<tr>' + ' '.join(deepcl_html) + '</tr>'

        # process deepclues
        diff_h = '<div class="main-container">'
        diff_h += grid_to_html(mainxd)
        diff_h += mktag('table', 'deepclues') + dcl_html + mktag('/table')
        diff_h += '</div>'

        info('writing deepclues for %s' % mainxdid)
        outf.write_html('pub/deep/%s/index.html' % mainxdid, diff_h, title='Deep clue analysis for ' + mainxdid)