def main(): p = args_parser("process huge puzzles archive into separate .zip and create sources.tsv") p.add_argument("-s", "--source", default=None, help="ExternalSource") args = get_args(parser=p) outf = open_output() if args.source: source = args.source else: source = parse_pathname(args.inputs[0]).base subzips = {} for inputfn in args.inputs: for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn): if not contents: continue m = re.match(r"^([a-z]{2,4})[\-0-9]{1}\d.*", parse_pathname(fn).base, flags=re.IGNORECASE) prefix = m.group(1).lower() if m else "misc" if prefix not in subzips: zf = xdfile.utils.OutputZipFile(os.path.join(args.output, prefix + ".zip")) sources = [] subzips[prefix] = (zf, sources) else: zf, sources = subzips[prefix] progress("Processing %s -> %s" % (fn, prefix)) zf.write_file(fn, contents, dt) sources.append(xd_sources_row(fn, source, iso8601(dt))) for zf, sources in subzips.values(): zf.write_file("sources.tsv", xd_sources_header + "".join(sources))

def main():
    p = args_parser('catalog source files and create source.tsv')
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    info("importing from %s" % args.source)

    outf = open_output()

    sources = []

    for input_source in args.inputs:
        for fn, contents, dt in find_files_with_time(input_source):
            if len(contents) == 0:
                info("ignoring empty file")
                continue

            outf.write_file(strip_toplevel(fn), contents, dt)
            sources.append(xd_sources_row(fn, args.source or input_source, iso8601(dt)))

    info("%s files cataloged" % len(sources))

    outbase = parse_pathname(args.output).base

    outf.write_file("%s.tsv" % outbase, xd_sources_header + "".join(sources))
    outf.write_file("%s.log" % outbase, get_log())
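
# Hypothetical illustration of the row shape appended by xd_sources_row above;
# the real field order lives in xdfile.metadatabase, this is only a sketch.
def sketch_sources_row(source_filename, external_source, download_time):
    return "\t".join((source_filename, external_source, download_time)) + "\n"

print(sketch_sources_row("crosswords/ab1.puz", "myarchive.zip", "2016-01-31T12:00:00"), end="")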
def main(): p = utils.args_parser(desc="generate pubyear svg and pubyear pages") p.add_argument('-p', '--pubonly', action="store_true", default=False, help='only output root map') args = utils.get_args(parser=p) outf = utils.open_output() pubyears = defaultdict(list) pubyears_idx = defaultdict(list) # years_idx = [] for r in metadb.read_rows('pub/stats'): y = r.year or '0000' pubyear = r.pubid + str(y) pubyears[pubyear].append(r) if y not in pubyears_idx[r.pubid]: pubyears_idx[r.pubid].append(y) # if r.year not in years_idx: # years_idx.append(r.year) # Making collapsed decades depends on args allyears = [] for i in range(DECADE_SKIP_START // 10, DECADE_SKIP_END // 10 + 1): allyears.append("%s0s" % i) allyears.extend( [str(y) for y in range(DECADE_SKIP_END + 10, date.today().year + 1)]) html_out = [] html_out.append( '<p>Grouped by publication-year and broken out by day-of-week (Monday at top, Sunday at bottom).</p>' ) html_out.append(legend) # See definition above html_out.append('<table id="pubyearmap" cellspacing="0" cellpadding="0">') # Table header with years \ decades year_header = gen_year_header(allyears) html_out.extend(year_header) pubs_total = {} for pubid in pubyears_idx: pubs_total[pubid] = len(metadb.xd_puzzles(pubid)) # sort rows by number of puzzles sorted_pubs = sorted(pubs_total.keys(), key=lambda pubid: pubs_total[pubid], reverse=True) for pub in args.inputs or sorted_pubs: if pubs_total[pub] < 20: continue # Process each pub in index pubobj = metadb.xd_publications().get(pub) if pubobj: pubname = pubobj.PublicationName or pubobj.PublisherName else: pubname = pub html_out.append('<tr><td class="header">{}</td>'.format( html.mkhref(pubname, 'pub/' + pub))) for year in sorted(allyears): html_out.append('<td class="year_widget">') py_td = td_for_pubyear(pubyears, pub, year) if py_td: html_out.append(py_td) if not args.pubonly: outf.write_html( 'pub/{pub}{year}/index.html'.format(**locals()), pubyear_html(pub, year), "{pubname}, {year}".format(**locals())) else: # otherwise width = svg_w if 's' not in year else svg_w * decade_scale html_out.append( pys.format(w=width, h=svg_h, title='', classes='notexists', body='')) html_out.append('</td>') # Add totals + publishers html_out.append('<td class="header">{}</td>'.format(pubs_total[pub])) html_out.append('<td class="header">{}</td>'.format( html.mkhref(pubname, 'pub/' + pub))) html_out.append('</tr>') html_out.extend(year_header) html_out.append('</table>') total_xd = len(metadb.xd_puzzles()) outf.write_html('index.html', "".join(html_out), "Comparison of %s published crossword grids" % total_xd)
def main(): p = utils.args_parser(desc="generate pubyear svg and pubyear pages") p.add_argument('-p', '--pubonly', action="store_true", default=False, help='only output root map') args = utils.get_args(parser=p) outf = utils.open_output() pubyears = defaultdict(list) pubyears_idx = defaultdict(list) # years_idx = [] for r in metadb.read_rows('pub/stats'): y = r.year or '0000' pubyear = r.pubid + str(y) pubyears[pubyear].append(r) if y not in pubyears_idx[r.pubid]: pubyears_idx[r.pubid].append(y) # if r.year not in years_idx: # years_idx.append(r.year) # Making collapsed decades depends on args allyears = [] for i in range(DECADE_SKIP_START//10, DECADE_SKIP_END//10 + 1): allyears.append("%s0s" % i) allyears.extend([ str(y) for y in range(DECADE_SKIP_END + 10, date.today().year + 1) ]) html_out = [] html_out.append('<p>Grouped by publication-year and broken out by day-of-week (Monday at top, Sunday at bottom).</p>') html_out.append(legend) # See definition above html_out.append('<table id="pubyearmap" cellspacing="0" cellpadding="0">') # Table header with years \ decades year_header = gen_year_header(allyears) html_out.extend(year_header) pubs_total = {} for pubid in pubyears_idx: pubs_total[pubid] = len(metadb.xd_puzzles(pubid)) # sort rows by number of puzzles sorted_pubs = sorted(pubs_total.keys(), key=lambda pubid: pubs_total[pubid], reverse=True) for pub in args.inputs or sorted_pubs: if pubs_total[pub] < 20: continue # Process each pub in index pubobj = metadb.xd_publications().get(pub) if pubobj: pubname = pubobj.PublicationName or pubobj.PublisherName else: pubname = pub html_out.append('<tr><td class="header">{}</td>'.format(html.mkhref(pubname, pub))) for year in sorted(allyears): html_out.append('<td class="year_widget">') py_td = td_for_pubyear(pubyears, pub, year) if py_td: html_out.append(py_td) if not args.pubonly: outf.write_html('pub/{pub}{year}/index.html'.format(**locals()), pubyear_html(pub, year), "{pubname}, {year}".format(**locals())) else: # otherwise width = svg_w if 's' not in year else svg_w*decade_scale html_out.append(pys.format(w=width, h=svg_h, title='', classes='notexists', body='')) html_out.append('</td>') # Add totals + publishers html_out.append('<td class="header">{}</td>'.format(pubs_total[pub])) html_out.append('<td class="header">{}</td>'.format(html.mkhref(pubname, pub))) html_out.append('</tr>') html_out.extend(year_header) html_out.append('</table>') total_xd = len(metadb.xd_puzzles()) outf.write_html('index.html', "".join(html_out), "Comparison of %s published crossword grids" % total_xd)

def main():
    global args

    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so most recent edition gets main slot in case of shelving conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False),
                                           reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids: ' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource, SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                # only add receipt if first time converting this source
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([
                        CaptureTime,
                        ReceivedTime,
                        ExternalSource,
                        InternalSource,
                        SourceFilename,
                        xdid
                    ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise

def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    todaystr = today.strftime("%Y-%m-%d")

    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        summary("*** %s: downloading %d puzzles from %s to %s" % (pubid, len(dates_to_get), from_date, to_date))

        for dt in sorted(dates_to_get):
            try:
                xdid = construct_xdid(pubid, dt)
                url = dt.strftime(puzsrc.urlfmt)
                fn = "%s.%s" % (xdid, puzsrc.ext)

                debug("downloading '%s' from '%s'" % (fn, url))

                response = urllib.request.urlopen(url)
                content = response.read()
                outf.write_file(fn, content)

                most_recent[pubid] = todaystr
            except (urllib.error.HTTPError, urllib.error.URLError) as err:
                # URLError has no .code attribute, so fall back gracefully
                error('%s [%s] %s: %s' % (xdid, getattr(err, 'code', ''), err.reason, url))
            except Exception as e:
                error(str(e))

            sources_tsv += xd_sources_row(fn, url, todaystr)

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

    if sources_tsv:
        outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        with open(metadb.RECENT_DOWNLOADS_TSV, "w") as f:
            f.write(xd_recents_header + "".join(sorted(new_recents_tsv)))
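
# Sketch of how a download URL is derived from puzsrc.urlfmt via strftime;
# the format string here is hypothetical.
import datetime

urlfmt = "https://example.com/daily/%Y/%m/%d.puz"  # hypothetical urlfmt
dt = datetime.date(2016, 1, 31)
print(dt.strftime(urlfmt))  # https://example.com/daily/2016/01/31.puz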

def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()

    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        # dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        dates_to_get = get_ungotten_dates(pubid, from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        all_dates_to_get = sorted(dates_to_get)
        # take the oldest ten and the newest ten of the sorted ungotten dates
        dates_to_get = all_dates_to_get[0:10] + all_dates_to_get[-10:]
        summary("*** %s: %d puzzles from %s to %s not yet gotten, getting %d of them" %
                (pubid, len(all_dates_to_get), all_dates_to_get[0], to_date, len(dates_to_get)))

        most_recent[pubid] = str(download_puzzles(outf, puzsrc, pubid, dates_to_get))

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

    # if sources_tsv:
    #     outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        with open(metadb.RECENT_DOWNLOADS_TSV, "w") as f:
            f.write(xd_recents_header + "".join(sorted(new_recents_tsv)))
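
# Sketch of the "oldest ten plus newest ten" batching above; the dates are
# plain strings for illustration.
dates = sorted("2016-01-%02d" % d for d in range(1, 31))
batch = dates[0:10] + dates[-10:]
print(len(dates), "ungotten ->", len(batch), "fetched this run")  # 30 ungotten -> 20 fetched this run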
def main(): p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus") p.add_argument("-a", "--all", default=False, help="analyze all puzzles, even those already in similar.tsv") p.add_argument("-l", "--limit", default=100, help="limit amount of puzzles to be analyzed [default=100]") args = get_args(parser=p) outf = open_output() num_processed = 0 prev_similar = metadb.read_rows("gxd/similar") for fn, contents in find_files(*args.inputs, ext=".xd"): progress(fn) mainxd = xdfile(contents.decode("utf-8"), fn) if mainxd.xdid() in prev_similar: continue # skip reprocessing .xd that are already in similar.tsv """ find similar grids (pct, xd) for the mainxd in the corpus. Takes about 1 second per xd. sorted by pct. """ similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20), key=lambda x: x[0], reverse=True) num_processed += 1 if num_processed > int(args.limit): break if similar_grids: info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids)) mainpubid = mainxd.publication_id() maindate = mainxd.date() # go over each clue/answer, find all other uses, other answers, other possibilities. # these are added directly to similar.tsv nstaleclues = 0 nstaleanswers = 0 ntotalclues = 0 for pos, mainclue, mainanswer in mainxd.iterclues(): progress(mainanswer) poss_answers = [] pub_uses = {} # [pubid] -> set(ClueAnswer) mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue) # find other uses of this clue, and other answers, in a single pass for clueans in find_clue_variants(mainclue): if clueans.answer != mainanswer: poss_answers.append(clueans) if clueans.answer == mainanswer: if clueans.pubid in pub_uses: otherpubs = pub_uses[clueans.pubid] else: otherpubs = set() # set of ClueAnswer pub_uses[clueans.pubid] = otherpubs otherpubs.add(clueans) # bclues is all boiled clues for this particular answer: { [bc] -> #uses } bclues = load_answers().get(mainanswer, []) stale_answer = False if bclues: uses = [] for bc, nuses in bclues.items(): # then find all clues besides this one clue_usages = [ ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < maindate ] if clue_usages: stale_answer = True if nuses > 1: # only use one (the most recent) ClueAnswer per boiled clue # but use the clue only (no xdid) ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue else: ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1] uses.append((ca, nuses)) # summary row to similar.tsv metadb.append_row( "gxd/similar", [ mainxd.xdid(), # xdid int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)), # similar_grid_pct nstaleclues, # reused_clues nstaleanswers, # reused_answers ntotalclues, # total_clues " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids), # matches ], )
def main(): p = utils.args_parser( desc="annotate puzzle clues with earliest date used in the corpus") p.add_argument( '-a', '--all', default=False, help='analyze all puzzles, even those already in similar.tsv') p.add_argument('-l', '--limit', default=100, help='limit amount of puzzles to be analyzed [default=100]') args = get_args(parser=p) outf = open_output() num_processed = 0 prev_similar = metadb.read_rows('gxd/similar') for fn, contents in find_files(*args.inputs, ext=".xd"): progress(fn) mainxd = xdfile(contents.decode('utf-8'), fn) if mainxd.xdid() in prev_similar: continue # skip reprocessing .xd that are already in similar.tsv """ find similar grids (pct, xd) for the mainxd in the corpus. Takes about 1 second per xd. sorted by pct. """ similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20), key=lambda x: x[0], reverse=True) num_processed += 1 if num_processed > int(args.limit): break if similar_grids: info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids)) mainpubid = mainxd.publication_id() maindate = mainxd.date() # go over each clue/answer, find all other uses, other answers, other possibilities. # these are added directly to similar.tsv nstaleclues = 0 nstaleanswers = 0 ntotalclues = 0 for pos, mainclue, mainanswer in mainxd.iterclues(): progress(mainanswer) poss_answers = [] pub_uses = {} # [pubid] -> set(ClueAnswer) mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue) # find other uses of this clue, and other answers, in a single pass for clueans in find_clue_variants(mainclue): if clueans.answer != mainanswer: poss_answers.append(clueans) if clueans.answer == mainanswer: if clueans.pubid in pub_uses: otherpubs = pub_uses[clueans.pubid] else: otherpubs = set() # set of ClueAnswer pub_uses[clueans.pubid] = otherpubs otherpubs.add(clueans) # bclues is all boiled clues for this particular answer: { [bc] -> #uses } bclues = load_answers().get(mainanswer, []) stale_answer = False if bclues: uses = [] for bc, nuses in bclues.items(): # then find all clues besides this one clue_usages = [ ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < maindate ] if clue_usages: stale_answer = True if nuses > 1: # only use one (the most recent) ClueAnswer per boiled clue # but use the clue only (no xdid) ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue else: ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1] uses.append((ca, nuses)) # summary row to similar.tsv metadb.append_row( 'gxd/similar', [ mainxd.xdid(), # xdid int(100 * sum( pct / 100.0 for pct, xd1, xd2 in similar_grids)), # similar_grid_pct nstaleclues, # reused_clues nstaleanswers, # reused_answers ntotalclues, # total_clues " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids) # matches ])