def main(): args = get_args("reclue puzzle with clues from other publications") outf = open_output() all_clues = load_clues() missing_tsv = COLUMN_SEPARATOR.join( ["grid_xdid", "clues_pubid", "num_missing"]) + EOL for fn, contents in find_files(*args.inputs, ext=".xd"): xd = xdfile(contents, fn) if not xd.grid: continue xd.set_header("Title", None) xd.set_header("Editor", "Timothy Parker Bot") xd.set_header( "Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last))) xd.set_header("Copyright", None) xd.set_header("Date", iso8601()) remixed = set() for pubid, pub_clues in list(all_clues.items()): try: if pubid == xd.publication_id(): continue # don't use same publisher's clues nmissing = reclue(xd, pub_clues) outfn = "%s-%s.xd" % (xd.xdid(), pubid) if nmissing == 0: nmutated = 0 while nmutated < 100: nmutated += mutate(xd, pub_clues) nmissing = reclue(xd, pub_clues) info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated)) remixed.add(pubid) outf.write_file(outfn, xd.to_unicode()) else: debug("%s missing %d clues" % (outfn, nmissing)) missing_tsv += COLUMN_SEPARATOR.join( [xd.xdid(), pubid, str(nmissing)]) + EOL except Exception as e: error("remix error %s" % str(e)) if remixed: info("%d remixed: %s" % (len(remixed), " ".join(remixed))) try: outf.write_file( parse_pathname(fn).base + ".xd", contents.encode("utf-8")) except Exception as e: error("couldn't write: " + str(e)) outf.write_file("remix.log", get_log().encode("utf-8")) outf.write_file("remix.tsv", missing_tsv)
def download_puzzles(outf, puzsrc, pubid, dates_to_get):
    actually_gotten = []
    for dt in sorted(dates_to_get):
        try:
            xdid = construct_xdid(pubid, dt)
            url = dt.strftime(puzsrc.urlfmt)
            fn = "%s.%s" % (xdid, puzsrc.ext)

            debug("downloading '%s' from '%s'" % (fn, url))

            response = urllib.request.urlopen(url, timeout=10)
            content = response.read()
            outf.write_file(fn, content)
            actually_gotten.append(dt)
        except (urllib.error.HTTPError, urllib.error.URLError) as err:
            error('%s %s: %s' % (xdid, err, url))
        except Exception as e:
            error(str(e))

        # sources_tsv += xd_sources_row(fn, url, todaystr)

        time.sleep(2)  # be polite to the server between requests

    # most recent successfully downloaded date, or None if every download failed
    return max(actually_gotten) if actually_gotten else None
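# construct_xdid() is assumed to join the publication id and the ISO date,
# matching the "%s.%s" filenames built above; a sketch under that assumption:

def construct_xdid(pubid, dt):
    # e.g. construct_xdid('nyt', datetime.date(1994, 5, 2)) -> 'nyt1994-05-02'
    return pubid + dt.strftime("%Y-%m-%d")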
def main(): args = get_args("reclue puzzle with clues from other publications") outf = open_output() all_clues = load_clues() missing_tsv = COLUMN_SEPARATOR.join([ "grid_xdid", "clues_pubid", "num_missing" ]) + EOL for fn, contents in find_files(*args.inputs, ext=".xd"): xd = xdfile(contents, fn) if not xd.grid: continue xd.set_header("Title", None) xd.set_header("Editor", "Timothy Parker Bot") xd.set_header("Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last))) xd.set_header("Copyright", None) xd.set_header("Date", iso8601()) remixed = set() for pubid, pub_clues in list(all_clues.items()): try: if pubid == xd.publication_id(): continue # don't use same publisher's clues nmissing = reclue(xd, pub_clues) outfn = "%s-%s.xd" % (xd.xdid(), pubid) if nmissing == 0: nmutated = 0 while nmutated < 100: nmutated += mutate(xd, pub_clues) nmissing = reclue(xd, pub_clues) info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated)) remixed.add(pubid) outf.write_file(outfn, xd.to_unicode()) else: debug("%s missing %d clues" % (outfn, nmissing)) missing_tsv += COLUMN_SEPARATOR.join([ xd.xdid(), pubid, str(nmissing) ]) + EOL except Exception as e: error("remix error %s" % str(e)) if remixed: info("%d remixed: %s" % (len(remixed), " ".join(remixed))) try: outf.write_file(parse_pathname(fn).base + ".xd", contents.encode("utf-8")) except Exception as e: error("couldn't write: " + str(e)) outf.write_file("remix.log", get_log().encode("utf-8")) outf.write_file("remix.tsv", missing_tsv)
def find_similar_to(needle, haystack, min_pct=0.3):
    if not needle.grid:
        return

    nsquares = len(needle.grid) * len(needle.grid[0])
    min_similarity = min_pct * nsquares

    for xd in haystack:
        if xd.filename == needle.filename:
            continue

        try:
            similarity = fast_grid_similarity(needle, xd)
        except Exception as e:
            debug(str(e))
            similarity = 0

        if similarity >= min_similarity:
            if needle.xdid() != xd.xdid():  # skip if same puzzle
                # recompute with slower metric
                similarity = grid_similarity(needle, xd)

                if similarity >= 25:
                    yield similarity, needle, xd
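# fast_grid_similarity() is assumed to be a cheap cell-by-cell count that the
# slower grid_similarity() then confirms; a sketch under that assumption
# (min_similarity above is a fraction of the total cell count):

def fast_grid_similarity(a, b):
    # Raw count of identical cells; differently shaped grids score 0.
    if not a.grid or not b.grid:
        return 0
    if len(a.grid) != len(b.grid) or len(a.grid[0]) != len(b.grid[0]):
        return 0
    return sum(c1 == c2 for row1, row2 in zip(a.grid, b.grid)
                        for c1, c2 in zip(row1, row2))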
def main():
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    pubyears = {}  # (pubid, year) -> list of xd
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # Might be empty date or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue
            xd = xdfile.get_xd(r.xdid)
            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # Might be empty date or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats for each weekday
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd
            # todo
            nexisting = 0

            nxd = len(byweekday[weekday])
            public_xdids = []  # Empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))

            def process_counter(count, comp_value):
                # Take the most common item, annotating it with its count
                # unless it accounts for every xd (comp_value).
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.match_xdid, r.xdid))
                    continue
                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                if dt2 < dt1:  # only capture the later one
                    # deduce_similarity_type
                    if diff_authors(aut1, aut2):  # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row("pub/stats",
                              (pubid, year, weekday,
                               mainformat, maineditor, maincopyright,
                               nexisting, nxd, npublic,
                               reprints, touchups, redones,
                               copies, themecopies))
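# dow_from_date() is assumed to map a 'YYYY-MM-DD' header to the 'Mon'..'Sun'
# keys used above, returning '' for empty, year-only, or malformed dates; a
# minimal sketch under those assumptions:

import datetime

def dow_from_date(datestr):
    try:
        # %a yields 'Mon'..'Sun' in the C/English locale
        return datetime.datetime.strptime(datestr or '', "%Y-%m-%d").strftime("%a")
    except ValueError:
        return ''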
def main():
    global args

    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so most recent edition gets main slot in case of shelving conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False),
                                           reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids: ' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                rejected = ""  # reset per file so a previous failure doesn't leak into this one
                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource, SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                if rejected:
                    error("could not convert: %s" % rejected)

                # only add receipt if first time converting this source
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([
                        CaptureTime, ReceivedTime, ExternalSource, InternalSource,
                        SourceFilename, xdid
                    ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    todaystr = today.strftime("%Y-%m-%d")

    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        summary("*** %s: downloading %d puzzles from %s to %s" % (pubid, len(dates_to_get), from_date, to_date))

        for dt in sorted(dates_to_get):
            try:
                xdid = construct_xdid(pubid, dt)
                url = dt.strftime(puzsrc.urlfmt)
                fn = "%s.%s" % (xdid, puzsrc.ext)
                debug("downloading '%s' from '%s'" % (fn, url))
                response = urllib.request.urlopen(url)
                content = response.read()
                outf.write_file(fn, content)
                most_recent[pubid] = todaystr
            except urllib.error.HTTPError as err:
                error('%s [%s] %s: %s' % (xdid, err.code, err.reason, url))
            except urllib.error.URLError as err:
                error('%s %s: %s' % (xdid, err.reason, url))  # URLError has no .code
            except Exception as e:
                error(str(e))

            sources_tsv += xd_sources_row(fn, url, todaystr)

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

    if sources_tsv:
        outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        with open(metadb.RECENT_DOWNLOADS_TSV, "w") as f:
            f.write(xd_recents_header + "".join(sorted(new_recents_tsv)))
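# get_dates_between() is assumed to return the dates newer than from_date,
# up to and including to_date, stepped by the publication frequency in days;
# a sketch, assuming both arguments are datetime.date:

import datetime

def get_dates_between(from_date, to_date, freq_days=1):
    ndays = (to_date - from_date).days
    return [from_date + datetime.timedelta(days=n)
            for n in range(freq_days, ndays + 1, freq_days)]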
def parse_xwordinfo(content, filename):
    content = content.decode('utf-8')

    REBUS_LONG_HANDS = {
        'NINE': '9', 'EIGHT': '8', 'SEVEN': '7', 'SIX': '6', 'FIVE': '5',
        'FOUR': '4', 'THREE': '3', 'TWO': '2', 'ONE': '1', 'ZERO': '0',
        'AUGHT': '0', 'AMPERSAND': '&', 'AND': '&', 'ASTERISK': '*',
        'PERCENT': '%', 'STAR': '*', 'AT': '@', 'DOLLAR': '$', 'PLUS': '+',
        'CENT': 'c',
        # 'DASH': '-',
        # 'DOT': '●'
    }

    rsh = 'zyxwvutsrqponmlkjihgfedcba♚♛♜♝♞♟⚅⚄⚃⚂⚁⚀♣♦♥♠Фθиλπφя+&%$@?*0987654321'
    REBUS_SHORT_HANDS = list(rsh)

    # convert inline markup to xd-style braces
    content = content.replace("<b>", "{*")
    content = content.replace("</b>", "*}")
    content = content.replace("<i>", "{/")
    content = content.replace("</i>", "/}")
    content = content.replace("<em>", "{/")
    content = content.replace("</em>", "/}")
    content = content.replace("<u>", "{_")
    content = content.replace("</u>", "_}")
    content = content.replace("<strike>", "{-")
    content = content.replace("</strike>", "-}")
    content = content.replace("’", "'")
    content = content.replace('“', '"')
    # content = content.replace('–', '-')

    if "CPHContent_" in content:
        xwiprefix = '#CPHContent_'
    else:
        xwiprefix = '#'

    root = html.fromstring(content)
    ## debug("ROOT: %s" % root)

    special_type = ''
    rebus = {}
    rebus_order = []

    xd = xdfile.xdfile('', filename)

    # get crossword info
    title = root.cssselect(xwiprefix + 'TitleLabel')[0].text.strip()
    try:
        subtitle = root.cssselect(xwiprefix + 'SubTitleLabel')[0].text.strip()
        subtitle = ' [%s]' % subtitle
    except Exception:
        subtitle = ""

    # author = root.cssselect(xwiprefix + 'AuthorLabel')[0].text.strip()
    # editor = root.cssselect(xwiprefix + 'EditorLabel')[0].text.strip()

    try:
        xd.notes = stringify_children(root.cssselect(xwiprefix + 'NotepadDiv')[0])
    except Exception as e:
        xd.notes = ""
        debug('Exception %s' % e)

    xd.set_header("Title", '%s%s' % (title, subtitle))
    xd.set_header("Author", root.cssselect(xwiprefix + 'AuthorLabel')[0].text.strip())
    xd.set_header("Editor", root.cssselect(xwiprefix + 'EditorLabel')[0].text.strip())

    xd.notes = xd.notes.replace("<br/>", "\n")
    xd.notes = xd.notes.replace("<b>Notepad:</b>", "\n")
    xd.notes = xd.notes.replace("\u00a0", "\n")  # non-breaking spaces become line breaks
    xd.notes = xd.notes.strip()

    puzzle_table = root.cssselect(xwiprefix + 'PuzTable tr') or root.cssselect('#PuzTable tr')

    for row in puzzle_table:
        row_data = ""
        for cell in row.cssselect('td'):
            # check if the cell is special - with a shade or a circle
            cell_class = cell.get('class')
            cell_type = ''
            if cell_class == 'bigshade':
                cell_type = 'shaded'
            elif cell_class == 'bigcircle':
                cell_type = 'circle'

            letter = cell.cssselect('div.letter')
            letter = (len(letter) and letter[0].text) or xdfile.BLOCK_CHAR

            # handle rebuses
            if letter == xdfile.BLOCK_CHAR:
                subst = cell.cssselect('div.subst2')
                subst = (len(subst) and subst[0].text) or ''
                if not subst:
                    subst = cell.cssselect('div.subst')
                    if subst:
                        if title in SPLIT_REBUS_TITLES:
                            subst = "/".join(list(subst[0].text))
                        else:
                            subst = subst[0].text
                    else:
                        subst = ''

                if subst:
                    if subst not in rebus:
                        if subst in REBUS_LONG_HANDS:
                            rebus_val = REBUS_LONG_HANDS[subst]
                            if rebus_val in REBUS_SHORT_HANDS:
                                REBUS_SHORT_HANDS.remove(rebus_val)
                        else:
                            rebus_val = REBUS_SHORT_HANDS.pop()
                        rebus[subst] = rebus_val
                        rebus_order.append(subst)
                    letter = rebus[subst]

            if cell_type:
                # the special cell's letter should be represented in lower case
                letter = letter.lower()
                if not special_type:
                    # hopefully there shouldn't be both shades and circles in
                    # the same puzzle - if that is the case, only the last value
                    # will be put up in the header
                    special_type = cell_type

            row_data += letter
        xd.grid.append(row_data)

    if len(rebus):
        rebus = ["%s=%s" % (rebus[x], x.upper()) for x in rebus_order]
        xd.set_header("Rebus", ','.join(rebus))
    if special_type:
        xd.set_header("Special", special_type)

    # add clues
    across_clues = _fetch_clues(xd, 'A', root, xwiprefix + 'AcrossClues', rebus)
    down_clues = _fetch_clues(xd, 'D', root, xwiprefix + 'DownClues', rebus)

    return xd
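# stringify_children() is assumed to serialize a node's inner content (text
# plus child markup and their tails) without the node's own tag; a common
# lxml recipe that would fit the NotepadDiv usage above:

from lxml import etree

def stringify_children(node):
    parts = [node.text or '']
    parts += [etree.tostring(child, encoding='unicode') for child in node]
    return ''.join(parts)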
def main(): args = utils.get_args("generates .html diffs for all puzzles in similar.tsv") outf = utils.open_output() similars = utils.parse_tsv("gxd/similar.tsv", "Similar") xdids_todo = args.inputs or [xdid for xdid, matches in metadb.get_similar_grids().items() if matches] for mainxdid in xdids_todo: progress(mainxdid) mainxd = xdfile.get_xd(mainxdid) if not mainxd: continue matches = metadb.get_similar_grids().get(mainxdid, []) xddates = {} xddates[mainxdid] = mainxd.date() # Dict to store XD dates for further sort html_grids = {} html_clues = {} # Store in list to make further formatting as html table easier html_grids[mainxdid] = grid_diff_html(xdfile.get_xd(mainxdid)) # Add for main XD diff_l = [] for pos, mainclue, mainanswer in mainxd.iterclues(): diff_h = mktag("div", "fullgrid main") + "%s. " % pos diff_h += mainclue diff_h += mktag("span", tagclass="main", inner=" ~ " + mainanswer.upper()) diff_l.append(diff_h) html_clues[mainxdid] = diff_l # Process for all matches for xdid in matches: xd = xdfile.get_xd(xdid) # Continue if can't load xdid if not xd: continue xddates[xdid] = xd.date() # output each grid html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd) diff_l = [] # output comparison of each set of clues for pos, clue, answer in xd.iterclues(): diff_h = mktag("div", "fullgrid") + "%s. " % pos # Sometimes can return clue == None mainclue = mainxd.get_clue_for_answer(answer) sm = difflib.SequenceMatcher(lambda x: x == " ", mainclue or "", clue) debug("MCLUE: %s [%s]" % (mainclue, sm.ratio())) if mainclue is None or sm.ratio() < 0.40: diff_h += clue else: # Compare based on op codes for opcode in sm.get_opcodes(): c, a1, a2, b1, b2 = opcode if c == "equal": diff_h += '<span class="match">%s</span>' % clue[b1:b2] else: diff_h += '<span class="diff">%s</span>' % clue[b1:b2] tagclass = "match" if mainclue or answer == mainxd.get_answer(pos) else "diff" diff_h += mktag("span", tagclass=tagclass, inner=" ~ " + answer.upper()) diff_h += mktag("/div") diff_l.append(diff_h) html_clues[xdid] = diff_l # Wrap into table diff_h = mktag("table") + mktag("tr") # Sort by date sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1)) for w, dt in sortedkeys: # Wrap into table diff_h += mktag("td") + html_grids[w] + mktag("/td") diff_h += mktag("/tr") for i, clue in enumerate(html_clues[sortedkeys[0][0]]): diff_h += mktag("tr") for w, dt in sortedkeys: if i < len(html_clues[w]): diff_h += mktag("td") + html_clues[w][i] + mktag("/td") diff_h += mktag("/tr") diff_h += mktag("/table") outf.write_html("pub/%s/index.html" % mainxdid, diff_h, title="Comparison for " + mainxdid)
def main():
    args = utils.get_args('generates .html diffs for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')

    xdids_todo = {}
    for row in metadb.xd_similar_all():
        if row.xdid not in xdids_todo:
            xdids_todo[row.xdid] = []
        xdids_todo[row.xdid].append(row)

    for mainxdid in xdids_todo:
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            warn('%s not in corpus' % mainxdid)
            continue

        matches = xdids_todo[mainxdid]
        info('generating diffs for %s (%d matches)' % (mainxdid, len(matches)))

        xddates = {}
        xddates[mainxdid] = mainxd.date()  # Dict to store XD dates for further sort
        html_grids = {}
        html_clues = {}

        # Store in list to make further formatting as html table easier
        html_grids[mainxdid] = grid_diff_html(xdfile.get_xd(mainxdid))

        # Add for main XD
        diff_l = []
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not mainclue:
                continue
            diff_h = mktag('div', 'fullgrid main') + '%s. ' % pos
            diff_h += mainclue
            diff_h += mktag('span', tagclass='main', inner=' ~ ' + mainanswer.upper())
            diff_l.append(diff_h)
        html_clues[mainxdid] = diff_l

        # Process for all matches
        for row in matches:
            xdid = row.match_xdid
            xd = xdfile.get_xd(xdid)
            # Continue if can't load xdid
            if not xd:
                continue

            xddates[xdid] = xd.date()
            # output each grid
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)

            diff_l = []
            # output comparison of each set of clues
            for pos, clue, answer in xd.iterclues():
                if not clue:
                    continue
                diff_h = mktag('div', 'fullgrid') + '%s. ' % pos

                # Sometimes can return clue == None
                mainclue = mainxd.get_clue_for_answer(answer)
                sm = difflib.SequenceMatcher(lambda x: x == ' ', mainclue or '', clue)
                debug('MCLUE: %s [%s]' % (mainclue, sm.ratio()))
                if mainclue is None or sm.ratio() < 0.40:
                    diff_h += clue
                else:
                    # Compare based on op codes
                    for opcode in sm.get_opcodes():
                        c, a1, a2, b1, b2 = opcode
                        if c == 'equal':
                            diff_h += '<span class="match">%s</span>' % clue[b1:b2]
                        else:
                            diff_h += '<span class="diff">%s</span>' % clue[b1:b2]

                tagclass = 'match' if mainclue or answer == mainxd.get_answer(pos) else 'diff'
                diff_h += mktag('span', tagclass=tagclass, inner=' ~ ' + answer.upper())
                diff_h += mktag('/div')
                diff_l.append(diff_h)
            html_clues[xdid] = diff_l

        # Wrap into table
        diff_h = mktag('table') + mktag('tr')

        # Sort by date
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        for w, dt in sortedkeys:
            diff_h += mktag('td') + html_grids[w] + mktag('/td')
        diff_h += mktag('/tr')

        for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
            diff_h += mktag('tr')
            for w, dt in sortedkeys:
                if i < len(html_clues[w]):
                    diff_h += mktag('td') + html_clues[w][i] + mktag('/td')
            diff_h += mktag('/tr')
        diff_h += mktag('/table')

        outf.write_html('pub/%s/index.html' % mainxdid, diff_h, title='Comparison for ' + mainxdid)
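# grid_diff_html() is not shown here; a simplified sketch of what it is
# assumed to do (render the grid as rows of spans, marking cells that differ
# from compare_with). The project's real renderer is richer than this:

def grid_diff_html(xd, compare_with=None):
    rows = []
    for y, row in enumerate(xd.grid):
        cells = []
        for x, c in enumerate(row):
            cls = 'match'
            if compare_with is not None:
                try:
                    if compare_with.grid[y][x] != c:
                        cls = 'diff'
                except IndexError:
                    cls = 'diff'  # no counterpart cell in the other grid
            cells.append('<span class="%s">%s</span>' % (cls, c))
        rows.append('<div class="gridrow">%s</div>' % ''.join(cells))
    return '<div class="grid">%s</div>' % ''.join(rows)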