def main():
    """Split a huge puzzle archive into per-prefix .zip files and per-zip sources.tsv."""
    parser = args_parser("process huge puzzles archive into separate .zip and create sources.tsv")
    parser.add_argument("-s", "--source", default=None, help="ExternalSource")
    args = get_args(parser=parser)

    outf = open_output()

    # ExternalSource: the explicit flag if given, else the basename of the first input
    source = args.source if args.source else parse_pathname(args.inputs[0]).base

    # prefix -> (OutputZipFile, list of sources.tsv rows)
    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            # bucket by the leading 2-4 letters when followed by a dash/digit pair;
            # anything that doesn't match goes into "misc"
            m = re.match(r"^([a-z]{2,4})[\-0-9]{1}\d.*", parse_pathname(fn).base, flags=re.IGNORECASE)
            prefix = m.group(1).lower() if m else "misc"

            try:
                zf, sources = subzips[prefix]
            except KeyError:
                zf = xdfile.utils.OutputZipFile(os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)

            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)
            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    # each sub-zip gets its own sources.tsv manifest
    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
def main():
    """Catalog source files into the output archive and write <outbase>.tsv plus a .log.

    Copies every non-empty input file into the output (with its top-level
    directory stripped), recording one sources-row per file.
    """
    p = args_parser('catalog source files and create source.tsv')
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    info("importing from %s" % args.source)

    outf = open_output()

    sources = []
    for input_source in args.inputs:
        for fn, contents, dt in find_files_with_time(input_source):
            # truthiness check (was `len(contents) == 0`): also skips None safely
            if not contents:
                info("ignoring empty file")
                continue

            outf.write_file(strip_toplevel(fn), contents, dt)
            # ExternalSource falls back to the input path when --source not given
            sources.append(xd_sources_row(fn, args.source or input_source, iso8601(dt)))

    info("%s files cataloged" % len(sources))

    outbase = parse_pathname(args.output).base

    outf.write_file("%s.tsv" % outbase, xd_sources_header + "".join(sources))
    outf.write_file("%s.log" % outbase, get_log())
def main():
    """Aggregate all .log files from the inputs into a single logs.html page."""
    import html  # local import: html.escape replaces cgi.escape, removed in Python 3.8

    args = get_args('aggregates all .log files into one .html')
    outwww = open_output()

    # build the page from a list + join instead of quadratic string +=
    chunks = []
    # sorted by timestamp: earliest first
    for fn, contents, dt in sorted(find_files_with_time(*args.inputs, ext=".log"), key=lambda x: x[2]):
        # quote=False matches the old cgi.escape default (escapes only &, <, >)
        escaped = html.escape(contents.decode("utf-8"), quote=False)
        chunks.append('\n\n<h2>%s</h2><pre>%s</pre>' % (fn, escaped))

    log_html = ''.join(chunks)
    datestr = iso8601()
    outwww.write_html("logs.html", log_html, title="logs for " + datestr)
def main():
    """Parse downloaded emails, save attached puzzle files, and send receipt emails.

    For each input email: extract attachments, keep those passing a basic
    filesize sanity check, then mail the sender either a success receipt
    listing the received files or an error notice.
    """
    args = get_args('parse downloaded emails')
    outf = open_output()

    # NOTE(review): accumulated but never written anywhere in this function —
    # verify whether a trailing outf.write_file of sources_tsv went missing
    sources_tsv = ''
    for emailfn, emailcontents in find_files(*args.inputs):
        msg = email.message_from_bytes(emailcontents)
        upload_src = msg["From"]

        if not upload_src:
            continue

        email_sources_tsv = []
        email_files = generate_email_files(msg)
        for puzfn, puzdata, puzdt in email_files:
            info("%s: %s from %s" % (puzfn, iso8601(puzdt), upload_src))
            # a basic sanity check of filesize
            # accommodate small puzzles and .pdf
            if 1000 < len(puzdata) < 100000:
                email_sources_tsv.append(xd_sources_row(puzfn, upload_src, iso8601(puzdt)))
                outf.write_file(puzfn, puzdata)

        # summarize once per email (was inside the loop, repeating the same
        # line once per attachment)
        if email_files:
            summary("%s puzzles from %s" % (len(email_files), upload_src))

        # generate receipt row, send receipt email
        if email_sources_tsv:
            xd_send_email(upload_src,
                          fromaddr='*****@*****.**',
                          subject='Upload successful: %d files received' % len(email_sources_tsv),
                          body="These files were received:\n" + "\n".join(email_sources_tsv))
            sources_tsv += "".join(email_sources_tsv)
        else:
            xd_send_email(upload_src,
                          fromaddr='*****@*****.**',
                          subject='Upload error',
                          body='No puzzle files received')
def main():
    """Break a monolithic puzzle archive into prefix-keyed .zip files, each with a sources.tsv."""
    p = args_parser('process huge puzzles archive into separate .zip and create sources.tsv')
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    outf = open_output()

    # default the ExternalSource to the first input's basename
    if args.source:
        source = args.source
    else:
        source = parse_pathname(args.inputs[0]).base

    subzips = {}  # maps prefix -> (zip writer, accumulated sources rows)

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            # a short alphabetic prefix followed by a dash/digit selects the bucket
            base = parse_pathname(fn).base
            m = re.match(r'^([a-z]{2,4})[\-0-9]{1}\d.*', base, flags=re.IGNORECASE)
            prefix = m.group(1).lower() if m else 'misc'

            entry = subzips.get(prefix)
            if entry is None:
                entry = (xdfile.utils.OutputZipFile(os.path.join(args.output, prefix + ".zip")), [])
                subzips[prefix] = entry
            zf, sources = entry

            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)
            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    # finish each bucket with its manifest
    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
def main():
    """Convert crossword files of various formats into .xd, shelve them, and record receipts.

    For each input source: read its sources.tsv metadata, then walk every file
    (most recent first, so the newest edition wins shelving conflicts), pick a
    parser by extension, convert, write the shelved .xd, and append a receipt
    row for first-time conversions.
    """
    global args
    # extension -> ordered list of parsers to attempt
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so most recent edition gets main slot in case of shelving
            # conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False),
                                           reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    # metadata known from sources.tsv
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                # fix: initialize per file — previously the ".xd" branch left
                # `rejected` unbound (NameError on the first file, or a stale
                # value carried over from the previous iteration)
                rejected = ""

                if ext == ".xd":
                    # copy .xd input through unchanged
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                # partial parse still yields a usable xd
                                error("%s %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource, SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing

                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                if rejected:
                    error("could not convert: %s" % rejected)

                # only add receipt if first time converting this source
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([CaptureTime, ReceivedTime, ExternalSource,
                                     InternalSource, SourceFilename, xdid])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
def main():
    """Convert crosswords of assorted formats to .xd, shelve them, and log receipts.

    Walks each input source: first loads sources.tsv metadata, then processes
    every file newest-first (so the latest edition claims the main shelf slot
    on conflict), dispatching to a parser by file extension.
    """
    global args
    # map each supported extension to the parsers tried, in order
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so most recent edition gets main slot in case of shelving
            # conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False),
                                           reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    # capture metadata from the matching sources.tsv row
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                # fix: reset per file before the dispatch below — the ".xd"
                # branch previously never assigned `rejected`, causing a
                # NameError on the first file (or reusing a stale value later)
                rejected = ""

                if ext == ".xd":
                    # .xd passes straight through without re-emitting
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                # salvage the partially-parsed puzzle
                                error("%s %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource, SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing

                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                if rejected:
                    error("could not convert: %s" % rejected)

                # only add receipt if first time converting this source
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([CaptureTime, ReceivedTime, ExternalSource,
                                     InternalSource, SourceFilename, xdid])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise