def find_pubid(rowstr):
    '''rowstr is a concatenation of all metadata fields.

    Returns the single matching pubid, or None if the regex file does not
    exist, no regex matches, or more than one pubid matches.
    '''
    try:
        # use a context manager so the file handle is closed promptly
        # (the original leaked the handle from a bare open().read())
        with open(PUBREGEX_TSV, 'r') as fp:
            regexes = utils.parse_tsv_data(fp.read())
    except FileNotFoundError:
        utils.error("File not exists: %s" % PUBREGEX_TSV, severity='WARNING')
        return None

    # collect every pubid whose regex matches the metadata string
    matching = set()
    for r in regexes:
        if re.search(r['regex'], rowstr, flags=re.IGNORECASE):
            matching.add(r['pubid'])

    if not matching:
        utils.warn("%s: no regex matches" % rowstr)
    elif len(matching) > 1:
        # ambiguous: refuse to guess between multiple publications
        utils.warn("%s: too many regex matches (%s)" % (rowstr, " ".join(matching)))
        return None
    else:
        return matching.pop()

    return None
def main():
    """Re-clue each input .xd puzzle with clues from other publications.

    For every input grid, tries the clue set of every other publication;
    writes fully re-clued remixes, a log, and a remix.tsv of near-misses.
    """
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()

    all_clues = load_clues()

    # accumulate TSV rows in a list and join once at the end
    # (the original built the string with repeated +=, which is quadratic)
    missing_rows = [COLUMN_SEPARATOR.join(["grid_xdid", "clues_pubid", "num_missing"]) + EOL]

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue

        # scrub identifying headers and stamp remix metadata
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header("Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)

                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing == 0:
                    # NOTE(review): assumes mutate() eventually returns nonzero;
                    # if it can return 0 forever this loop would not terminate
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))

                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    debug("%s missing %d clues" % (outfn, nmissing))
                    missing_rows.append(COLUMN_SEPARATOR.join([xd.xdid(), pubid, str(nmissing)]) + EOL)
            except Exception as e:
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            # also copy the original puzzle alongside its remixes
            try:
                outf.write_file(parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", "".join(missing_rows))
def clean_headers(xd):
    """Normalize an xd puzzle's headers in place.

    Drops known-unwanted headers, warns on unknown ones, writes cleaned
    Author/Editor/Title values under CLEAN_SUFFIX headers when they differ,
    and fills in a Date header from the filename or Copyright if missing.

    NOTE(review): reads the module-global `args` (args.debug) — assumes the
    caller has initialized it; verify against module setup.
    """
    # remove known unwanted header fields, log unknown headers
    for hdr in list(xd.headers.keys()):
        if hdr in ("Source", "Identifier", "Acquired", "Issued", "Category"):
            xd.set_header(hdr, None)
        elif hdr.lower() not in xdfile.HEADER_ORDER:
            utils.warn("%s: '%s' header not known: '%s'" % (xd.filename, hdr, xd.headers[hdr]))

    # clean Author and Editor headers
    author = xd.get_header("Author") or ""
    if not author and xd.get_header("Creator"):
        # fall back to Creator when Author is absent, then drop Creator
        # (the original also carried a redundant `assert not author` here,
        # which could never fire under this guard)
        author = xd.get_header("Creator")
        xd.set_header("Creator", None)

    editor = xd.get_header("Editor") or ""

    newauthor, neweditor = clean_author(author, editor)

    # only record cleaned values when they actually changed
    if newauthor != author:
        xd.set_header("Author" + CLEAN_SUFFIX, newauthor)
    if neweditor != editor:
        xd.set_header("Editor" + CLEAN_SUFFIX, neweditor)

    # clean Title header
    title = xd.get_header("Title") or ""
    newtitle = clean_title(title)
    if newtitle != title:
        xd.set_header("Title" + CLEAN_SUFFIX, newtitle)

    # create Date header
    dt = xd.get_header("Date")

    ## try getting Date from filename
    if not dt:
        try:
            d = utils.parse_date_from_filename(xd.filename)
            if d:
                dt = d.strftime("%Y-%m-%d")
        except Exception as e:
            utils.error(str(e))
            if args.debug:
                raise

    ## try getting Date from copyright
    if not dt:
        rights = xd.get_header("Copyright") or ""
        dt = find_date(rights)

    if dt:
        xd.set_header("Date", dt)
def main():
    """Re-clue each input .xd puzzle with clues from other publications.

    Writes remixed puzzles, a remix.log, and a remix.tsv listing grids
    that could not be fully re-clued per publication.
    """
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()

    all_clues = load_clues()

    # header row for the missing-clues report
    missing_tsv = COLUMN_SEPARATOR.join([
        "grid_xdid", "clues_pubid", "num_missing"
    ]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue

        # scrub identifying headers and stamp remix metadata
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header("Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)

                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing == 0:
                    # fully re-clued: mutate up to 100 times, then re-clue again
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))

                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    # record the near-miss in the report
                    debug("%s missing %d clues" % (outfn, nmissing))
                    missing_tsv += COLUMN_SEPARATOR.join([
                        xd.xdid(), pubid, str(nmissing)
                    ]) + EOL
            except Exception as e:
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            # also copy the original puzzle alongside its remixes
            try:
                outf.write_file(parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
def xd_send_email(destaddr, fromaddr='*****@*****.**', subject='', body=''):
    """Send a plain-text email through AWS SES.

    Returns the SES response dict, or None if sending failed (the error
    is logged, not raised).
    """
    client = boto3.client('ses', region_name=os.environ['REGION'])
    info("sending email to %s (subject '%s')" % (destaddr, subject))

    message = {
        'Subject': {'Data': subject},
        'Body': {'Text': {'Data': body}},
    }
    try:
        return client.send_email(Source=fromaddr,
                                 Destination={'ToAddresses': [destaddr]},
                                 Message=message)
    except Exception as e:
        error("xd_send_email(): %s" % str(e))
        return None
def xd_send_email(destaddr, fromaddr='*****@*****.**', subject='', body=''):
    """Send a plain-text email via AWS SES (region from $REGION).

    Returns the SES response on success, or None on failure (error logged).
    """
    client = boto3.client('ses', region_name=os.environ['REGION'])
    info("sending email to %s (subject '%s')" % (destaddr, subject))
    try:
        response = client.send_email(Source=fromaddr,
                                     Destination={'ToAddresses': [destaddr]},
                                     Message={
                                         'Subject': {
                                             'Data': subject
                                         },
                                         'Body': {
                                             'Text': {
                                                 'Data': body
                                             }
                                         }
                                     })
        return response
    except Exception as e:
        # best-effort: log and signal failure with None rather than raising
        error("xd_send_email(): %s" % str(e))
        return None
def main():
    """Convert input crossword files to .xd format and shelve them.

    For each input source: reads sources.tsv metadata, walks the files
    newest-first, tries each parser registered for the file extension,
    writes successful conversions to the shelf, and appends a receipt
    row for first-time conversions.
    """
    global args

    # parser functions tried per file extension; empty list = copy/skip
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            # index rows from any .tsv metadata files by inner filename
            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so most recent edition gets main slot in case of shelving
            # conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False), reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                # prefer capture metadata from sources.tsv when available
                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                # reuse the previous xdid if this exact file was seen before
                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    # already in target format: copy through unchanged
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                # partial parse: log but keep what we got
                                error("%s %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource,InternalSource,SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                if rejected:
                    error("could not convert: %s" % rejected)

                # only add receipt if first time converting this source
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([CaptureTime, ReceivedTime, ExternalSource, InternalSource, SourceFilename, xdid])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            # per-source failures don't abort the whole run unless --debug
            error(str(e))
            if args.debug:
                raise
def parse_ccxml(data, filename):
    """Parse Crossword Compiler XML bytes into an xdfile.

    Returns the populated xdfile, or None if the XML cannot be parsed
    or contains no grid.
    """
    content = data.decode('utf-8', errors='replace')
    content = escape(content, xml_escape_table)
    content = consecutive(content)
    content = re.sub(r'(=["]{2}([^"]+?)["]{2})+', r'=""\2""', content)  # Replace double quotes
    content_xml = content.encode('utf-8')
    ns = {'puzzle': 'http://crossword.info/xml/rectangular-puzzle'}
    try:
        root = etree.fromstring(content_xml)
    except Exception as e:
        error('Exception %s' % e)
        error(content)
        # BUG FIX: the original had a bare `exit` (a no-op expression), so
        # execution fell through and raised NameError on `root` below.
        return None

    # init crossword
    grid = root.xpath('//puzzle:crossword/puzzle:grid', namespaces=ns)
    if not grid:
        return None
    grid = grid[0]
    rows = int(grid.attrib['height'])
    cols = int(grid.attrib['width'])

    xd = xdfile.xdfile('', filename)

    # add metadata (strip the namespace wrapper from each tag name)
    for metadata in root.xpath('//puzzle:metadata', namespaces=ns)[0]:
        text = metadata.text and metadata.text.strip()
        title = re.sub(r'\{[^\}]*\}', '', metadata.tag.title())  # raw string fixes invalid-escape warning
        title = escape(title, rev_xml_escape_table)
        if text:
            text = escape(text, rev_xml_escape_table)
            xd.set_header(HEADER_RENAMES.get(title, title), text)

    # add puzzle: fill a rows x cols grid of spaces, then place solutions
    puzzle = []
    for i in range(rows):
        puzzle.append([" "] * cols)

    for cell in grid.xpath('./puzzle:cell', namespaces=ns):
        x = int(cell.attrib['x']) - 1
        y = int(cell.attrib['y']) - 1
        # NOTE(review): `value` may carry over from the previous cell when a
        # cell has neither 'solution' nor block 'type' — confirm intended
        if 'solution' in cell.attrib:
            value = cell.attrib['solution']
        if 'type' in cell.attrib and cell.attrib['type'] == 'block':
            value = xdfile.BLOCK_CHAR
        puzzle[y][x] = value

    xd.grid = ["".join(row) for row in puzzle]

    # add clues: map word ids to their start coordinates first
    word_map = {}
    for word in root.xpath('//puzzle:crossword/puzzle:word', namespaces=ns):
        word_map[word.attrib['id']] = (word.attrib['x'], word.attrib['y'])

    for clues in root.xpath('//puzzle:crossword/puzzle:clues', namespaces=ns):
        title_el = clues.xpath('./puzzle:title', namespaces=ns)[0]
        # keep only ASCII uppercase from the title, take its first letter
        # (e.g. "Across" -> "A", "Down" -> "D"); renamed from `type`,
        # which shadowed the builtin
        clue_dir = "".join(chr(x) for x in etree.tostring(title_el, method='text').upper() if chr(x) in string.ascii_uppercase)
        clue_dir = clue_dir[0]

        for clue in clues.xpath('./puzzle:clue', namespaces=ns):
            word_id = clue.attrib['word']
            number = int(clue.attrib['number'])
            text = "|".join(clue.itertext()).strip()
            text = escape(text, rev_xml_escape_table)
            solution = get_solution(word_id, word_map, puzzle)
            xd.clues.append(((clue_dir, number), text, solution))

    return xd
def main():
    """Convert crosswords from each input source into .xd and shelve them.

    Walks every input source newest-file-first, matches files against
    sources.tsv metadata, dispatches to a parser by extension, writes
    converted puzzles to the shelf, and records receipts for files seen
    for the first time.
    """
    global args

    # parser functions tried per file extension; empty list = no conversion
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            # index .tsv metadata rows by stripped inner filename
            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so most recent edition gets main slot in case of shelving
            # conflict)
            for fn, contents, dt in sorted(find_files_with_time(
                    input_source, strip_toplevel=False), reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                # take capture metadata from sources.tsv when present
                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                # reuse a previously assigned xdid for this exact file
                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    # already in .xd format: pass through unchanged
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                # keep the partial result the parser salvaged
                                error("%s %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource, SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                if rejected:
                    error("could not convert: %s" % rejected)

                # only add receipt if first time converting this source
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([CaptureTime, ReceivedTime, ExternalSource, InternalSource, SourceFilename, xdid])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            # a bad source is logged and skipped unless --debug is set
            error(str(e))
            if args.debug:
                raise
def parse_ccxml(data, filename):
    """Parse a Crossword Compiler XML puzzle into an xdfile.

    Returns None when the XML fails to parse or has no grid element.
    """
    content = data.decode('utf-8', errors='replace')
    content = escape(content, xml_escape_table)
    content = consecutive(content)
    content = re.sub(r'(=["]{2}([^"]+?)["]{2})+', r'=""\2""', content)  # Replace double quotes
    content_xml = content.encode('utf-8')
    ns = {'puzzle': 'http://crossword.info/xml/rectangular-puzzle'}
    try:
        root = etree.fromstring(content_xml)
    except Exception as e:
        error('Exception %s' % e)
        error(content)
        # BUG FIX: was a bare `exit` (evaluates the builtin without calling
        # it), so control fell through to a NameError on `root`; bail out
        # explicitly instead.
        return None

    # init crossword
    grid = root.xpath('//puzzle:crossword/puzzle:grid', namespaces=ns)
    if not grid:
        return None
    grid = grid[0]
    rows = int(grid.attrib['height'])
    cols = int(grid.attrib['width'])

    xd = xdfile.xdfile('', filename)

    # add metadata headers, dropping the namespace braces from tag names
    for metadata in root.xpath('//puzzle:metadata', namespaces=ns)[0]:
        text = metadata.text and metadata.text.strip()
        title = re.sub(r'\{[^\}]*\}', '', metadata.tag.title())  # raw string avoids invalid-escape warning
        title = escape(title, rev_xml_escape_table)
        if text:
            text = escape(text, rev_xml_escape_table)
            xd.set_header(HEADER_RENAMES.get(title, title), text)

    # add puzzle: start from an all-space grid and fill per cell
    puzzle = []
    for i in range(rows):
        puzzle.append([" "] * cols)

    for cell in grid.xpath('./puzzle:cell', namespaces=ns):
        x = int(cell.attrib['x']) - 1
        y = int(cell.attrib['y']) - 1
        # NOTE(review): if a cell lacks both 'solution' and block 'type',
        # `value` retains the previous cell's value — confirm intended
        if 'solution' in cell.attrib:
            value = cell.attrib['solution']
        if 'type' in cell.attrib and cell.attrib['type'] == 'block':
            value = xdfile.BLOCK_CHAR
        puzzle[y][x] = value

    xd.grid = ["".join(row) for row in puzzle]

    # add clues; first map each word id to its starting coordinates
    word_map = {}
    for word in root.xpath('//puzzle:crossword/puzzle:word', namespaces=ns):
        word_map[word.attrib['id']] = (word.attrib['x'], word.attrib['y'])

    for clues in root.xpath('//puzzle:crossword/puzzle:clues', namespaces=ns):
        title_el = clues.xpath('./puzzle:title', namespaces=ns)[0]
        # reduce the section title to its first ASCII capital, e.g.
        # "Across" -> "A", "Down" -> "D" (renamed from `type`, which
        # shadowed the builtin)
        clue_dir = "".join(
            chr(x) for x in etree.tostring(title_el, method='text').upper()
            if chr(x) in string.ascii_uppercase)
        clue_dir = clue_dir[0]

        for clue in clues.xpath('./puzzle:clue', namespaces=ns):
            word_id = clue.attrib['word']
            number = int(clue.attrib['number'])
            text = "|".join(clue.itertext()).strip()
            text = escape(text, rev_xml_escape_table)
            solution = get_solution(word_id, word_map, puzzle)
            xd.clues.append(((clue_dir, number), text, solution))

    return xd