def main():
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    pubyears = {}  # (pubid, year) -> [xd, ...]
    for xd in xdfile.corpus():
        puby = (xd.publication_id(), xd.year())
        if puby not in pubyears:
            pubyears[puby] = []
        pubyears[puby].append(xd)

    if pubyears:
        metadb.delete_stats()

    for puby, xdlist in sorted(pubyears.items()):
        pubid, year = puby
        npublic = 0

        # TODO: SELECT FROM publications
        nexisting = 0

        # organize by day-of-week
        byweekday = {}
        byweekday_similar = {}
        for w in weekdays:
            byweekday[w] = []
            byweekday_similar[w] = []

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # might be an empty date or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + str(year)):
            if r.match_pct < 25:
                continue

            xd = xdfile.get_xd(r.xdid)
            if xd:
                dt = xd.get_header('Date')
                if dt:
                    dow = dow_from_date(dt)
                    if dow:  # might be an empty date or only a year
                        byweekday_similar[dow].append(r)
                else:
                    debug("Date not set for: %s" % xd)

        # tally stats for each day-of-week
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd
            nexisting = 0  # TODO

            nxd = len(byweekday[weekday])
            public_xdids = []  # empty for now
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright = xd.get_header('Copyright').strip()
                if copyright:
                    copyrights[copyright] += 1

            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))

            def process_counter(count, comp_value):
                # return the most common item, annotated with its count
                # unless it accounts for all entries (comp_value)
                if count:
                    item, num = count.most_common(1)[0]
                    if num != comp_value:
                        item += " (%s)" % num
                else:
                    item = ''
                return item

            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.match_xdid, r.xdid))
                    continue

                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)
                if dt2 < dt1:  # only capture the later one
                    # deduce_similarity_type
                    if diff_authors(aut1, aut2):  # suspicious
                        if pct >= 50:
                            copies += 1
                        elif pct >= 30:
                            themecopies += 1
                    else:
                        if pct == 100:
                            reprints += 1
                        elif pct >= 50:
                            touchups += 1
                        elif pct >= 30:
                            themecopies += 1

            metadb.append_row("pub/stats",
                              (pubid, year, weekday,
                               mainformat, maineditor, maincopyright,
                               nexisting, nxd, npublic,
                               reprints, touchups, redones,
                               copies, themecopies))
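# The day-of-week bucketing above relies on dow_from_date() returning one of
# the 'weekdays' abbreviations, or a falsy value for missing/partial dates.
# A minimal sketch of such a helper, assuming ISO 'YYYY-MM-DD' Date headers
# and the default C locale for %a (an illustration, not the project's actual
# implementation):

from datetime import datetime

def dow_from_date(datestr):
    # map '2016-03-01' -> 'Tue'; return '' for empty, year-only,
    # or otherwise unparseable dates
    try:
        return datetime.strptime(datestr.strip(), '%Y-%m-%d').strftime('%a')
    except (AttributeError, ValueError):
        return ''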
def main():
    global args
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so most recent edition gets main slot in case of shelving conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False),
                                           reverse=True,
                                           key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids: ' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                rejected = ""  # initialized here so the .xd branch leaves it empty
                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s %s" % (fn, e))
                                xd = e.xd

                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource, SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                if rejected:
                    error("could not convert: %s" % rejected)

                # only add receipt if first time converting this source
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([
                        CaptureTime,
                        ReceivedTime,
                        ExternalSource,
                        InternalSource,
                        SourceFilename,
                        xdid
                    ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
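# The conversion loop assumes find_files_with_time() yields
# (filename, contents, timestamp) tuples, so sorted(..., key=lambda x: x[2],
# reverse=True) visits the most recent edition first. A hypothetical
# directory-walking stand-in honoring that contract (the project's real
# utility may also handle archives; strip_toplevel handling is omitted here):

import os

def find_files_with_time(top, ext='', strip_toplevel=True):
    # yield (path, raw bytes, mtime) for each file under 'top' matching 'ext'
    for dirpath, _dirnames, filenames in os.walk(top):
        for name in filenames:
            if ext and not name.endswith(ext):
                continue
            path = os.path.join(dirpath, name)
            with open(path, 'rb') as f:
                yield path, f.read(), os.path.getmtime(path)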
def main():
    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument('-a', '--all', default=False,
                   help='analyze all puzzles, even those already in similar.tsv')
    p.add_argument('-l', '--limit', default=100,
                   help='limit amount of puzzles to be analyzed [default=100]')
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows('gxd/similar')
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode('utf-8'), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv

        # find similar grids (pct, xd) for the mainxd in the corpus,
        # sorted by pct descending; takes about 1 second per xd
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20),
                               key=lambda x: x[0],
                               reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct))
                                        for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers,
        # other possibilities; these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs

                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [ca for ca in load_clues().get(bc, [])
                                   if ca.answer == mainanswer and ca.date < maindate]

                    if clue_usages:
                        stale_answer = True

                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue,
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]

                        uses.append((ca, nuses))

            # tally clue staleness for the summary row
            if stale_answer:
                nstaleanswers += 1
            ntotalclues += 1

        # summary row to similar.tsv
        metadb.append_row('gxd/similar', [
            mainxd.xdid(),  # xdid
            int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
            nstaleclues,  # reused_clues
            nstaleanswers,  # reused_answers
            ntotalclues,  # total_clues
            " ".join(("%s=%s" % (xd2.xdid(), pct))
                     for pct, xd1, xd2 in similar_grids),  # matches
        ])
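# load_answers() and load_clues() above are keyed by "boiled" clues: clue text
# normalized so trivial rewordings compare equal. A hypothetical boil()
# showing the idea (the corpus's actual normalization may differ):

import re

def boil(clue):
    # lowercase and strip everything but letters, digits, and spaces, so
    # boil("Cat's cry?") == boil("cats cry") == 'cats cry'
    return re.sub(r'[^a-z0-9 ]', '', clue.lower()).strip()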