def main():
    """Split a huge puzzle archive into per-prefix .zip files, each with its own sources.tsv."""
    parser = args_parser("process huge puzzles archive into separate .zip and create sources.tsv")
    parser.add_argument("-s", "--source", default=None, help="ExternalSource")
    args = get_args(parser=parser)
    outf = open_output()

    # ExternalSource: explicit -s flag, else the first input's base filename.
    source = args.source if args.source else parse_pathname(args.inputs[0]).base

    subzips = {}  # prefix -> (OutputZipFile, list of sources.tsv rows)
    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:  # skip empty files
                continue

            # Bucket by a 2-4 letter leading prefix (case-insensitive); no match -> "misc".
            m = re.match(r"^([a-z]{2,4})[\-0-9]{1}\d.*", parse_pathname(fn).base, flags=re.IGNORECASE)
            prefix = m.group(1).lower() if m else "misc"

            try:
                zf, sources = subzips[prefix]
            except KeyError:
                # first file with this prefix: open its zip and row list
                zf = xdfile.utils.OutputZipFile(os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)

            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)
            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    # Finish each subzip with its own sources.tsv index.
    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
def main():
    """Catalog source files into the output and emit <outbase>.tsv + <outbase>.log."""
    p = args_parser('catalog source files and create source.tsv')
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)
    info("importing from %s" % args.source)

    outf = open_output()

    sources = []
    for input_source in args.inputs:
        for fn, contents, dt in find_files_with_time(input_source):
            # Truthiness check instead of len()==0: idiomatic, and also guards
            # against contents being None (matches the sibling archive tools).
            if not contents:
                info("ignoring empty file")
                continue

            outf.write_file(strip_toplevel(fn), contents, dt)
            # Explicit --source wins; otherwise record the input path.
            sources.append(xd_sources_row(fn, args.source or input_source, iso8601(dt)))

    info("%s files cataloged" % len(sources))

    outbase = parse_pathname(args.output).base

    outf.write_file("%s.tsv" % outbase, xd_sources_header + "".join(sources))
    outf.write_file("%s.log" % outbase, get_log())
def main():
    # Re-clue each input .xd puzzle with clues from every other publication,
    # writing one remixed .xd per (puzzle, publication) pair plus remix.tsv
    # and remix.log summaries.
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()

    all_clues = load_clues()

    # running TSV body; header row first
    missing_tsv = COLUMN_SEPARATOR.join(
        ["grid_xdid", "clues_pubid", "num_missing"]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            # skip puzzles without a grid
            continue

        # scrub identifying headers and stamp fresh remix metadata
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header(
            "Author",
            "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()  # pubids that produced a fully-reclued output
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)

                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing == 0:
                    # fully recluable: mutate the grid until ~100 mutations
                    # accumulate, then reclue once more before writing out
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))

                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    # not fully coverable: just record the shortfall
                    debug("%s missing %d clues" % (outfn, nmissing))
                    missing_tsv += COLUMN_SEPARATOR.join(
                        [xd.xdid(), pubid, str(nmissing)]) + EOL
            except Exception as e:
                # best-effort: log and continue with the next publication
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                # also save the original puzzle alongside its remixes
                outf.write_file(
                    parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
def main():
    """Wrap each input file's text in a div and emit it as <base>/index.html."""
    args = utils.get_args()
    outf = utils.open_output()
    for srcfn, raw in utils.find_files(*args.inputs):
        pagename = utils.parse_pathname(srcfn).base
        body = '<div class="text">%s</div>' % raw.decode('utf-8')
        outf.write_html('%s/index.html' % pagename, body)
def main():
    """Re-clue each input .xd grid with clues drawn from every other publication."""
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()
    clues_by_pub = load_clues()

    # accumulate the remix.tsv body, header row first
    missing_tsv = COLUMN_SEPARATOR.join(["grid_xdid", "clues_pubid", "num_missing"]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue

        # scrub identifying headers and stamp fresh remix metadata
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header("Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()  # pubids that produced a fully-reclued output
        for pubid, pub_clues in list(clues_by_pub.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)
                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing:
                    # not fully coverable: record the shortfall and move on
                    debug("%s missing %d clues" % (outfn, nmissing))
                    missing_tsv += COLUMN_SEPARATOR.join([xd.xdid(), pubid, str(nmissing)]) + EOL
                    continue

                # fully recluable: accumulate ~100 mutations, reclue once more,
                # then write the remixed puzzle out
                nmutated = 0
                while nmutated < 100:
                    nmutated += mutate(xd, pub_clues)
                nmissing = reclue(xd, pub_clues)
                info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))

                remixed.add(pubid)
                outf.write_file(outfn, xd.to_unicode())
            except Exception as e:
                # best-effort per publication: log and keep going
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                # save the original puzzle alongside its remixes
                outf.write_file(parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
def deduce_set_seqnum(xd):
    """Set a "Date" header deduced from the filename, else fall back to a "Number" header."""
    stem = utils.parse_pathname(xd.filename).base
    parsed_date = utils.parse_date_from_filename(stem)  # datetime object (or falsy)
    if parsed_date:
        xd.set_header("Date", parsed_date)
        return
    # No date found: take the first run of digits anywhere in the full path
    # (eltana dir had number).
    digits = re.search(r'(\d+)', xd.filename)
    if digits:
        xd.set_header("Number", int(digits.group(1)))
def main():
    # Split a huge puzzles archive into per-prefix .zip files, each with its
    # own sources.tsv index.
    p = args_parser(
        'process huge puzzles archive into separate .zip and create sources.tsv'
    )
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)
    outf = open_output()
    # ExternalSource: explicit -s flag, else the first input's base filename.
    if args.source:
        source = args.source
    else:
        source = parse_pathname(args.inputs[0]).base
    subzips = {}  # prefix -> (OutputZipFile, [sources.tsv rows])
    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                # skip empty files
                continue
            # Bucket by a 2-4 letter alphabetic prefix (case-insensitive),
            # e.g. "nyt" from "nyt-1999..."; anything unmatched goes to 'misc'.
            m = re.match(r'^([a-z]{2,4})[\-0-9]{1}\d.*',
                         parse_pathname(fn).base,
                         flags=re.IGNORECASE)
            prefix = m.group(1).lower() if m else 'misc'
            if prefix not in subzips:
                # first file with this prefix: open its zip and row list
                zf = xdfile.utils.OutputZipFile(
                    os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)
            else:
                zf, sources = subzips[prefix]
            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)
            sources.append(xd_sources_row(fn, source, iso8601(dt)))
    # Finish each subzip with its own sources.tsv index.
    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
def generate_email_files(msg):
    """Yield (filename, data, timestamp) for each non-container part of an email.

    .zip attachments are expanded recursively via generate_zip_files();
    all other parts are yielded as-is with the message's Date as timestamp.
    Unnamed parts get a generated 'part-NNN.ext' filename.
    """
    counter = 1
    # parsedate() returns None for a missing or malformed Date header;
    # fall back to the current time instead of crashing in mktime().
    parsed = email.utils.parsedate(msg["Date"])
    upload_date = time.mktime(parsed) if parsed else time.time()
    for part in msg.walk():
        # multipart/* are just containers
        if part.get_content_maintype() == 'multipart':
            continue
        # Applications should really sanitize the given filename so that an
        # email message can't be used to overwrite important files
        filename = part.get_filename()
        if not filename:
            ext = mimetypes.guess_extension(part.get_content_type())
            if not ext:
                # Use a generic bag-of-bits extension
                ext = '.bin'
            filename = 'part-%03d%s' % (counter, ext)
            counter += 1
        data = part.get_payload(decode=True)
        if parse_pathname(filename).ext == '.zip':
            # expand nested zip archives into their member files
            for zipfn, zipdata, zipdt in generate_zip_files(data):
                yield zipfn, zipdata, zipdt
        else:
            yield filename, data, upload_date
def main():
    # Generate one .html "deep clue" diff page per puzzle that has similar
    # grids, comparing its grid and clues against each matching puzzle.
    args = utils.get_args('generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')

    # xdids from the command line, else every xdid that has similar grids
    xdids_todo = [parse_pathname(fn).base for fn in args.inputs]
    if not xdids_todo:
        xdids_todo = [xdid for xdid, matches in metadb.get_similar_grids().items() if matches]

    for mainxdid in xdids_todo:
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            continue

        matches = metadb.get_similar_grids().get(mainxdid, [])

        xddates = {}
        xddates[mainxdid] = mainxd.date()  # Dict to store XD dates for further sort
        html_grids = {}
        html_clues = {}

        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        poss_answers = []  # TODO:
        pub_uses = {}  # [pubid] -> set(ClueAnswer)

        dcl_html = ''
        deepcl_html = []  # keep deep clues to parse later - per row
        for pos, mainclue, mainanswer in mainxd.iterclues():
            deepcl_html = []  # Temporary to be replaced late
            mainca = ClueAnswer(mainxdid, mainxd.date(), mainanswer, mainclue)

            # 'grid position' column
            deepcl_html.append('<td class="pos">%s.</td>' % pos)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)
                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # add 'other uses' to clues_html
            stale = False
            deepcl_html.append('<td class="other-uses">')
            if len(pub_uses) > 0:
                sortable_uses = []
                for pubid, uses in pub_uses.items():
                    # show the earliest unboiled clue
                    for u in sorted(uses, key=lambda x: x.date or ""):
                        # only show those published earlier
                        if u.date and u.date <= mainxd.date():
                            if pubid == mainxdid and u.date == mainxd.date():
                                # the main puzzle's own entry is not stale
                                pass
                            else:
                                stale = True
                                sortable_uses.append((u.date, u, 1))
                # NOTE(review): each tuple is (date, ClueAnswer, 1), so the
                # unpacked 'clue' below is actually a ClueAnswer object —
                # presumably html_select() accepts that; confirm.
                deepcl_html.append(html_select([(clue, nuses) for dt, clue, nuses in sorted(sortable_uses, key=lambda x: x[0], reverse=True)], top_option=mainclue))
            else:
                deepcl_html.append('<div class="original">%s</div>' % esc(mainclue))
            deepcl_html.append('</td>')

            # add 'other answers' to clues_html
            deepcl_html.append('<td class="other-answers">')
            deepcl_html.append(html_select_options(poss_answers, strmaker=lambda ca: ca.answer, force_top=mainca))
            deepcl_html.append('</td>')

            # add 'other clues' to clues_html
            deepcl_html.append('<td class="other-clues">')

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < mainxd.date()]
                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))
                if uses:
                    deepcl_html.append(html_select(uses))
            deepcl_html.append('</td>')  # end 'other-clues'

            if stale_answer:
                nstaleanswers += 1
            if stale:
                nstaleclues += 1
            ntotalclues += 1
            # Quick and dirty - to be replaced
            dcl_html += '<tr>' + ' '.join(deepcl_html) + '</tr>'

        # Store in list to make further formatting as html table easier
        mainxd = xdfile.get_xd(mainxdid)
        if mainxd:
            html_grids[mainxdid] = grid_diff_html(mainxd)

            # Add for main XD
            diff_l = []
            for pos, mainclue, mainanswer in mainxd.iterclues():
                diff_h = mktag('div', 'fullgrid main') + '%s. ' % pos
                diff_h += mainclue
                diff_h += mktag('span', tagclass='main', inner=' ~ ' + mainanswer.upper())
                diff_l.append(diff_h)
            html_clues[mainxdid] = diff_l

        # Process for all matches
        for xdid in matches:
            xd = xdfile.get_xd(xdid)
            if not xd:
                continue
            xddates[xdid] = xd.date()

            # output each grid
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)

            diff_l = []
            # output comparison of each set of clues
            for pos, clue, answer in xd.iterclues():
                diff_h = mktag('div', 'fullgrid') + '%s. ' % pos
                # Sometimes can return clue == None
                sm = difflib.SequenceMatcher(lambda x: x == ' ', mainxd.get_clue(pos) or '', clue)
                if sm.ratio() < 0.50:
                    # too different: show the whole clue unhighlighted
                    diff_h += clue
                else:
                    # Compare based on op codes
                    for opcode in sm.get_opcodes():
                        c, a1, a2, b1, b2 = opcode
                        if c == 'equal':
                            diff_h += '<span class="match">%s</span>' % clue[b1:b2]
                        else:
                            diff_h += '<span class="diff">%s</span>' % clue[b1:b2]
                diff_h += mktag('span', tagclass=(answer == mainxd.get_answer(pos)) and 'match' or 'diff', inner=' ~ ' + answer.upper())
                diff_h += mktag('/div')
                diff_l.append(diff_h)
            html_clues[xdid] = diff_l

        # Wrap into table
        diff_h = mktag('table') + mktag('tr')
        # Sort by date
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        for w, dt in sortedkeys:
            # Wrap into table
            diff_h += mktag('td') + html_grids[w] + mktag('/td')
        diff_h += mktag('/tr')
        # one row per clue position, one cell per puzzle (when it has that clue)
        for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
            diff_h += mktag('tr')
            for w, dt in sortedkeys:
                if i < len(html_clues[w]):
                    diff_h += mktag('td') + html_clues[w][i] + mktag('/td')
            diff_h += mktag('/tr')
        # Process deepclues
        diff_h += mktag('table') + dcl_html + mktag('/table')
        diff_h += mktag('/table')

        outf.write_html('pub/deep/%s/index.html' % mainxdid, diff_h, title='Deep clue comparison for ' + mainxdid)
def main():
    # Convert crossword files of various formats into .xd, shelve them, and
    # append conversion receipts to gxd/receipts.
    global args

    # extension -> ordered list of candidate parser functions
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            # index the accompanying sources.tsv rows by inner filename
            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so most recent edition gets main slot in case of shelving
            # conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False), reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    # metadata known from sources.tsv
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    # fall back to file mtime and input-source name
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        # reuse the xdid from the previous receipt
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    # already .xd: copy verbatim rather than re-emit
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                # keep the partial result and note the error
                                error("%s %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource, SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                # only add receipt if first time converting this source
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([
                        CaptureTime, ReceivedTime, ExternalSource,
                        InternalSource, SourceFilename, xdid
                    ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            # per-input-source guard: log and continue (re-raise in --debug)
            error(str(e))
            if args.debug:
                raise
def main():
    # Driver: convert crossword inputs of several formats to .xd, shelve the
    # results, and record conversion receipts in gxd/receipts.
    global args

    # map file extension -> candidate parser functions, tried in order
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            # build an index of sources.tsv rows keyed by inner filename
            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so most recent edition gets main slot in case of shelving
            # conflict)
            for fn, contents, dt in sorted(find_files_with_time(
                    input_source, strip_toplevel=False),
                    reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    # metadata comes from the matching sources.tsv row
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    # no sources.tsv entry: fall back to mtime / source name
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(
                        input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(
                    input_source).filename

                already_received = metadb.check_already_received(
                    ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn(
                            'previously received this same file under multiple xdids:'
                            + ' '.join(existing_xdids))
                    else:
                        # reuse the single previously-assigned xdid
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    # already .xd: pass it through unchanged
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                # keep whatever partial puzzle was parsed
                                error("%s %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource, SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(
                                xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                # only add receipt if first time converting this source
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([
                        CaptureTime, ReceivedTime, ExternalSource,
                        InternalSource, SourceFilename, xdid
                    ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            # per-source guard so one bad input doesn't abort the batch;
            # --debug re-raises for a full traceback
            error(str(e))
            if args.debug:
                raise