def mutate(xd, words, chance=1):
    nmutations = 0
    for hwd, vwd, i, j, r, c in each_word_cross(xd):
        # split the across and down words around their shared (pivot) square
        hwd_a, pivot_char, hwd_b = hwd[:i], hwd[i], hwd[i + 1:]
        vwd_a, pivot_char, vwd_b = vwd[:j], vwd[j], vwd[j + 1:]
        progress("%s[%s]%s/%s[%s]%s" % (hwd_a, pivot_char, hwd_b,
                                        vwd_a, pivot_char, vwd_b))

        # collect every alternate letter that still yields two valid words
        mutations_this_square = []
        for ch in string.ascii_uppercase:
            if ch == pivot_char:
                continue

            new_hwd = hwd_a + ch + hwd_b
            new_vwd = vwd_a + ch + vwd_b

            if new_hwd in words and new_vwd in words:
                mutations_this_square.append((new_hwd, new_vwd, ch))

        if mutations_this_square:
            # prefer the replacement whose two words have the most known clues
            most_common = sorted(mutations_this_square,
                                 key=lambda x: len(words[x[0]]) + len(words[x[1]]))[-1]
            new_hwd, new_vwd, best_replacement = most_common

            if random.random() < chance:
                nmutations += 1
                xd.grid[r] = splice(xd.grid[r], c, best_replacement)
                info("-> %s/%s (%s)" % (new_hwd, new_vwd,
                     "".join(br for h, v, br in mutations_this_square)))

    return nmutations
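# mutate() leans on helpers from the surrounding codebase. A minimal sketch of
# splice(), assuming it replaces the single character at column c while keeping
# the row length constant (illustrative only; the real helper may differ):
def splice(s, c, ch):
    # e.g. splice("CAT", 1, "O") -> "COT"
    return s[:c] + ch + s[c + 1:]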
def main():
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()

    all_clues = load_clues()

    missing_tsv = COLUMN_SEPARATOR.join(["grid_xdid", "clues_pubid", "num_missing"]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue

        # anonymize the remixed puzzle's metadata
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header("Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)
                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing == 0:
                    # mutate the grid until enough squares have changed,
                    # then reclue the final grid
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))
                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    debug("%s missing %d clues" % (outfn, nmissing))
                    missing_tsv += COLUMN_SEPARATOR.join([xd.xdid(), pubid, str(nmissing)]) + EOL
            except Exception as e:
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                outf.write_file(parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
def main():
    args = utils.get_args()

    all_receipts = metadb.xd_receipts_header
    receipts = metadb.xd_receipts_rows()
    rids = set()  # set of ReceiptId

    for r in receipts:
        oldpubid = utils.parse_pubid(r.xdid or '')
        newpubid = catalog.find_pubid("|".join(str(x) for x in r))

        d = r._asdict()
        if newpubid and newpubid != oldpubid:
            # reshelve under the new pubid, keeping the sequence number
            seqnum = utils.parse_seqnum(r.xdid or r.SourceFilename)
            if seqnum:
                newxdid = newpubid + seqnum
                utils.info("changing xdid from '%s' to '%s'" % (r.xdid, newxdid))
                d["xdid"] = newxdid
            else:
                utils.info("no date or number in xdid, not reshelving")

        all_receipts += metadb.xd_receipts_row(**d)

    with open(metadb.RECEIPTS_TSV, 'w') as fp:
        fp.write(all_receipts)
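# A rough sketch of the xdid convention the reshelving above relies on,
# assuming ids of the form <pubid><seqnum> such as "nyt1994-01-01". These
# helpers are hypothetical illustrations, not the real utils.parse_pubid /
# utils.parse_seqnum:
import re

def parse_pubid_sketch(xdid):
    # leading alphabetic prefix is the publication id
    m = re.match(r'([a-z]+)', xdid)
    return m.group(1) if m else ''

def parse_seqnum_sketch(xdid):
    # trailing date or number is the sequence number
    m = re.search(r'(\d[\d-]*)$', xdid)
    return m.group(1) if m else None

assert parse_pubid_sketch('nyt1994-01-01') == 'nyt'
assert parse_seqnum_sketch('nyt1994-01-01') == '1994-01-01'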
def create_merge_request():
    import urllib.request
    import urllib.parse

    parms = {
        'id': '',
        'source_branch': '',
        'target_branch': '',
        'title': '',
    }
    # NOTE: ':id' is a placeholder that must be substituted with the actual
    # project id; GitLab's REST API also serves this endpoint under /api/v4
    # and requires an auth token (see the sketch below).
    url = 'https://gitlab.com/projects/:id/merge_requests'

    # urlopen() requires bytes for POST data in Python 3
    data = urllib.parse.urlencode(parms).encode('utf-8')
    r = urllib.request.urlopen(url, data)
    info('create_merge_request POST: %s' % r.getcode())
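# A hedged sketch of what a working call might look like, assuming the GitLab
# v4 REST API and a PRIVATE-TOKEN header; parameter values are placeholders:
import urllib.request
import urllib.parse

def create_merge_request_sketch(project_id, token, source, target, title):
    url = 'https://gitlab.com/api/v4/projects/%s/merge_requests' % project_id
    data = urllib.parse.urlencode({
        'source_branch': source,
        'target_branch': target,
        'title': title,
    }).encode('utf-8')
    req = urllib.request.Request(url, data=data,
                                 headers={'PRIVATE-TOKEN': token})
    return urllib.request.urlopen(req)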
def xd_send_email(destaddr, fromaddr='*****@*****.**', subject='', body=''):
    client = boto3.client('ses', region_name=os.environ['REGION'])
    info("sending email to %s (subject '%s')" % (destaddr, subject))
    try:
        response = client.send_email(
            Source=fromaddr,
            Destination={'ToAddresses': [destaddr]},
            Message={
                'Subject': {'Data': subject},
                'Body': {'Text': {'Data': body}},
            })
        return response
    except Exception as e:
        error("xd_send_email(): %s" % str(e))
        return None
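# Example invocation (hedged: assumes AWS credentials and a verified SES
# sender are configured, and that REGION is set in the environment; the
# address is hypothetical):
# xd_send_email('user@example.com', subject='Upload received',
#               body='Your puzzles were queued for processing.')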
def main():
    p = args_parser('catalog source files and create source.tsv')
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    info("importing from %s" % args.source)

    outf = open_output()

    sources = []
    for input_source in args.inputs:
        for fn, contents, dt in find_files_with_time(input_source):
            if len(contents) == 0:
                info("ignoring empty file")
                continue

            outf.write_file(strip_toplevel(fn), contents, dt)
            sources.append(xd_sources_row(fn, args.source or input_source, iso8601(dt)))

    info("%s files cataloged" % len(sources))

    outbase = parse_pathname(args.output).base
    outf.write_file("%s.tsv" % outbase, xd_sources_header + "".join(sources))
    outf.write_file("%s.log" % outbase, get_log())
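# A minimal sketch of the row helper used above, assuming xd_sources_header
# defines tab-separated columns ending in a newline (illustrative only; the
# real xd_sources_row and its column order may differ):
def xd_sources_row_sketch(source_filename, external_source, download_time):
    return '\t'.join([source_filename, external_source, download_time]) + '\n'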
def main():
    args = get_args('parse downloaded emails')
    outf = open_output()

    sources_tsv = ''
    for emailfn, emailcontents in find_files(*args.inputs):
        msg = email.message_from_bytes(emailcontents)
        upload_src = msg["From"]
        if not upload_src:
            continue

        email_sources_tsv = []
        email_files = generate_email_files(msg)
        for puzfn, puzdata, puzdt in email_files:
            info("%s: %s from %s" % (puzfn, iso8601(puzdt), upload_src))
            summary("%s puzzles from %s" % (len(email_files), upload_src))

            # a basic sanity check of filesize;
            # accommodate small puzzles and .pdf
            if 1000 < len(puzdata) < 100000:
                email_sources_tsv.append(xd_sources_row(puzfn, upload_src, iso8601(puzdt)))
                outf.write_file(puzfn, puzdata)

        # generate receipt row, send receipt email
        if email_sources_tsv:
            xd_send_email(upload_src,
                          fromaddr='*****@*****.**',
                          subject='Upload successful: %d files received' % len(email_sources_tsv),
                          body="These files were received:\n" + "\n".join(email_sources_tsv))
            sources_tsv += "".join(email_sources_tsv)
        else:
            xd_send_email(upload_src,
                          fromaddr='*****@*****.**',
                          subject='Upload error',
                          body='No puzzle files received')
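# A hedged sketch of what generate_email_files might do: walk the MIME parts
# of the message and yield (filename, payload, date) tuples. Illustrative
# only; the real helper may decode dates and attachments differently:
import email.utils

def generate_email_files_sketch(msg):
    upload_date = email.utils.parsedate_to_datetime(msg['Date'])
    for part in msg.walk():
        fn = part.get_filename()
        if fn:  # only parts that carry an attached file
            yield fn, part.get_payload(decode=True), upload_date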
def main():
    args = utils.get_args('generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')

    xds_todo = []
    for fn, contents in find_files(*args.inputs, ext='.xd'):
        xd = xdfile.xdfile(contents.decode('utf-8'), fn)
        xds_todo.append(xd)

    for mainxd in xds_todo:
        mainxdid = mainxd.xdid()
        progress(mainxdid)

        matches = metadb.xd_similar(mainxdid)

        xddates = {}
        xddates[mainxdid] = mainxd.date()  # dict to store XD dates for later sorting
        html_grids = {}

        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        dcl_html = '<tr>'
        dcl_html += '<th></th>'
        dcl_html += '<th>Clue</th>'
        dcl_html += '<th>ANSWERs</th>'
        dcl_html += '<th>Alt. clue possibilities</th>'
        dcl_html += '</tr>'

        deepcl_html = []  # deep clues are kept per row and parsed later
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not pos:
                continue

            poss_answers = []  # TODO:
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            deepcl_html = []  # temporary, to be replaced later

            mainca = ClueAnswer(mainxdid, mainxd.date(), mainanswer, mainclue)

            # 'grid position' column
            deepcl_html.append('<td class="pos">%s.</td>' % pos)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)
                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # add 'other uses' to clues_html
            deepcl_html.append('<td class="other-uses">')
            prev = prev_uses(pub_uses, mainxd, mainclue)
            if prev:
                deepcl_html.append('<a href="/pub/clue/%s">%s [x%s]</a>' % (boil(mainclue), mainclue, len(prev)))
                nstaleclues += 1
            else:
                deepcl_html.append(mainclue)
            deepcl_html.append('</td>')

            # add 'other answers' to clues_html
            deepcl_html.append('<td class="other-answers">')
            deepcl_html.append(html_select_options(poss_answers,
                                                   strmaker=lambda ca: ca.answer,
                                                   force_top=mainca,
                                                   add_total=False))
            deepcl_html.append('</td>')

            # add 'other clues' to clues_html
            deepcl_html.append('<td class="other-clues">')
            other_clues = html_other_clues(mainanswer, mainclue, mainxd)
            if other_clues:
                deepcl_html.append(other_clues)
                nstaleanswers += 1
            deepcl_html.append('</td>')  # end 'other-clues'

            ntotalclues += 1
            # quick and dirty, to be replaced
            dcl_html += '<tr>' + ' '.join(deepcl_html) + '</tr>'

        # assemble the deep-clues page
        diff_h = '<div class="main-container">'
        diff_h += grid_to_html(mainxd)
        diff_h += mktag('table', 'deepclues') + dcl_html + mktag('/table')
        diff_h += '</div>'

        info('writing deepclues for %s' % mainxdid)
        outf.write_html('pub/deep/%s/index.html' % mainxdid, diff_h,
                        title='Deep clue analysis for ' + mainxdid)
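# boil() normalizes a clue so variants can be matched and linked. A plausible
# sketch, assuming boiling means lowercasing and stripping non-alphanumerics
# (an assumption; the real normalization may differ):
import re

def boil_sketch(clue):
    # e.g. boil_sketch('Capital of France?') -> 'capitaloffrance'
    return re.sub(r'[^a-z0-9]', '', clue.lower())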
def main():
    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument("-a", "--all", default=False,
                   help="analyze all puzzles, even those already in similar.tsv")
    p.add_argument("-l", "--limit", default=100,
                   help="limit amount of puzzles to be analyzed [default=100]")
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows("gxd/similar")
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode("utf-8"), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv

        # find similar grids (pct, xd) for the mainxd in the corpus,
        # sorted by pct. Takes about 1 second per xd.
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20),
                               key=lambda x: x[0], reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join("%s=%s" % (xd2.xdid(), pct)
                                        for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer; find all other uses, other answers, and
        # other possibilities. These are added directly to similar.tsv.
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)
                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [ca for ca in load_clues().get(bc, [])
                                   if ca.answer == mainanswer and ca.date < maindate]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue,
                            # and use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row("gxd/similar", [
            mainxd.xdid(),  # xdid
            int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
            nstaleclues,  # reused_clues
            nstaleanswers,  # reused_answers
            ntotalclues,  # total_clues
            " ".join("%s=%s" % (xd2.xdid(), pct)
                     for pct, xd1, xd2 in similar_grids),  # matches
        ])
def main():
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    # group the corpus by (pubid, year)
    pubyears = {}
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # might be an empty date or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue
            xd = xdfile.get_xd(r.xdid)
            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # might be an empty date or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats per weekday
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd
            # TODO
            nexisting = 0

            nxd = len(byweekday[weekday])
            public_xdids = []  # empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))

            def process_counter(count, comp_value):
                # report the most common item, tagging its count if not unanimous
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            # most common editor/copyright/format for this weekday
            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.match_xdid, r.xdid))
                    continue
                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                if dt2 < dt1:  # only capture the later one
                    # deduce similarity type
                    if diff_authors(aut1, aut2):  # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row("pub/stats",
                              (pubid, year, weekday,
                               mainformat, maineditor, maincopyright,
                               nexisting, nxd, npublic,
                               reprints, touchups, redones,
                               copies, themecopies))
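# Sketch of the date-to-weekday helper used above, assuming ISO-format dates
# (an assumption; the real dow_from_date may accept more formats):
import datetime

def dow_from_date_sketch(datestr):
    try:
        d = datetime.datetime.strptime(datestr, '%Y-%m-%d')
    except (TypeError, ValueError):
        return ''  # empty or partial dates yield no weekday
    return ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][d.weekday()]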
def main():
    args = utils.get_args('generates .html diffs for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')

    xdids_todo = {}
    for row in metadb.xd_similar_all():
        if row.xdid not in xdids_todo:
            xdids_todo[row.xdid] = []
        xdids_todo[row.xdid].append(row)

    for mainxdid in xdids_todo:
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            warn('%s not in corpus' % mainxdid)
            continue

        matches = xdids_todo[mainxdid]
        info('generating diffs for %s (%d matches)' % (mainxdid, len(matches)))

        xddates = {}
        xddates[mainxdid] = mainxd.date()  # dict to store XD dates for later sorting
        html_grids = {}
        html_clues = {}  # stored as lists to make later formatting as an html table easier

        # add the main XD
        html_grids[mainxdid] = grid_diff_html(xdfile.get_xd(mainxdid))

        diff_l = []
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not mainclue:
                continue
            diff_h = mktag('div', 'fullgrid main') + '%s. ' % pos
            diff_h += mainclue
            diff_h += mktag('span', tagclass='main', inner=' ~ ' + mainanswer.upper())
            diff_l.append(diff_h)
        html_clues[mainxdid] = diff_l

        # process all matches
        for row in matches:
            xdid = row.match_xdid
            xd = xdfile.get_xd(xdid)
            if not xd:  # skip matches that can't be loaded
                continue
            xddates[xdid] = xd.date()

            # output each grid
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)

            # output a comparison of each set of clues
            diff_l = []
            for pos, clue, answer in xd.iterclues():
                if not clue:  # iterclues can sometimes return clue == None
                    continue
                diff_h = mktag('div', 'fullgrid') + '%s. ' % pos

                mainclue = mainxd.get_clue_for_answer(answer)
                sm = difflib.SequenceMatcher(lambda x: x == ' ', mainclue or '', clue)
                debug('MCLUE: %s [%s]' % (mainclue, sm.ratio()))
                if mainclue is None or sm.ratio() < 0.40:
                    diff_h += clue
                else:
                    # highlight matching/differing spans based on the opcodes
                    for opcode in sm.get_opcodes():
                        c, a1, a2, b1, b2 = opcode
                        if c == 'equal':
                            diff_h += '<span class="match">%s</span>' % clue[b1:b2]
                        else:
                            diff_h += '<span class="diff">%s</span>' % clue[b1:b2]

                tagclass = 'match' if mainclue or answer == mainxd.get_answer(pos) else 'diff'
                diff_h += mktag('span', tagclass=tagclass, inner=' ~ ' + answer.upper())
                diff_h += mktag('/div')
                diff_l.append(diff_h)
            html_clues[xdid] = diff_l

        # wrap into a table, with columns sorted by date
        diff_h = mktag('table') + mktag('tr')
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        for w, dt in sortedkeys:
            diff_h += mktag('td') + html_grids[w] + mktag('/td')
        diff_h += mktag('/tr')

        for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
            diff_h += mktag('tr')
            for w, dt in sortedkeys:
                if i < len(html_clues[w]):
                    diff_h += mktag('td') + html_clues[w][i] + mktag('/td')
            diff_h += mktag('/tr')
        diff_h += mktag('/table')

        outf.write_html('pub/%s/index.html' % mainxdid, diff_h,
                        title='Comparison for ' + mainxdid)
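# Sketch of the mktag helper as used above, assuming the second positional
# argument is the CSS class and that passing inner closes the tag
# (illustrative only; the real helper may differ):
def mktag_sketch(tagname, tagclass='', inner=None):
    if tagname.startswith('/'):
        return '<%s>' % tagname  # e.g. mktag_sketch('/table') -> '</table>'
    out = '<%s' % tagname
    if tagclass:
        out += ' class="%s"' % tagclass
    out += '>'
    if inner is not None:
        out += inner + '</%s>' % tagname
    return out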