def find_pubid(rowstr):
    '''rowstr is a concatenation of all metadata fields.

    Returns the matching pubid, or None if the regex file is missing
    or there is no unique match.
    '''
    try:
        with open(PUBREGEX_TSV, 'r') as fp:
            regexes = utils.parse_tsv_data(fp.read())
    except FileNotFoundError:
        utils.error("File does not exist: %s" % PUBREGEX_TSV, severity='WARNING')
        return None

    matching = set()
    for r in regexes:
        m = re.search(r['regex'], rowstr, flags=re.IGNORECASE)
        if m:
            matching.add(r['pubid'])

    if not matching:
        utils.warn("%s: no regex matches" % rowstr)
    elif len(matching) > 1:
        utils.warn("%s: too many regex matches (%s)" % (rowstr, " ".join(matching)))
        return None
    else:
        return matching.pop()

    return None
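
# A minimal usage sketch (illustrative only: the row string and the
# pubid/regex pairing below are assumptions, not data from this repo):
#
#   # given a pubregex row like  pubid="nyt"  regex="New York Times"
#   pubid = find_pubid("2008-01-01|New York Times|Will Shortz")
#   # -> "nyt" if exactly one regex matched, else None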
def pubyear_html(pubyears=[], skip_decades=None):
    """skip_decades default: { 'start': 1910, 'end': 1970 }"""
    global g_all_pubyears
    if not g_all_pubyears:
        g_all_pubyears = utils.parse_tsv_data(open("pub/pubyears.tsv").read(), "pubyear")

    # Read similarity data, used to shade the background of the widgets
    similar_d = defaultdict(dict)
    for xdid, v in utils.parse_tsv('gxd/similar.tsv', "similar").items():
        xd_split = utils.split_xdid(xdid)
        if xd_split:
            pubid, year, mon, day = xd_split
            if year in similar_d[pubid]:
                similar_d[pubid][year].append(int(v.similar_grid_pct))
            else:
                similar_d[pubid][year] = [int(v.similar_grid_pct)]

    b = []  # html body

    # Years up to skip_decades['end'] are collapsed into decade columns
    skip_decades = skip_decades if skip_decades else {'start': 1910, 'end': 1970}
    allyears = []
    for i in range(skip_decades['start'] // 10, skip_decades['end'] // 10 + 1):
        allyears.append("%s0s" % i)
    allyears.extend([str(y) for y in range(skip_decades['end'] + 10, date.today().year + 1)])

    pubs = defaultdict(dict)
    # generate a widget for each (pubid, year)
    for dowl in g_all_pubyears:
        dow = {}
        pubid, year, total = dowl[:3]
        hint = ''
        for d, v in zip(utils.WEEKDAYS, dowl[3:]):
            dow[d] = {'count': int(v) // 2, 'class': 'red' if d == 'Sun' else 'ord'}
            hint += '%s - %s\n' % (d, v)
        hint += 'Total: %s\n' % total

        # Choose a fill class based on average similarity
        fill_class = None  # default fill class for the widget
        if year in similar_d[pubid]:
            s_avg = sum(similar_d[pubid][year]) / len(similar_d[pubid][year])
            hint += 'Avg similarity: %.2f%%' % s_avg
            # e.g. shade the widget when the average is at least 10%
            fill_class = 'similar10' if s_avg >= 10 else None

        # Fill pubs with the per-year blocks used to build the table below
        pubs[pubid][year] = {
            'dow_data': dow,
            'widget': year_widget(dow, total, fill_class),
            'hint': hint,
            'total': int(total),
        }

    # Aggregate totals for the collapsed decades
    for dec_year in [x for x in allyears if 's' in x]:
        for pubid in pubs:
            year_key = dec_year[:-2]  # drop the final digit and trailing "s" ("1910s" -> "191")
            total = 0
            for yf in [x for x in pubs[pubid] if year_key in x]:
                total += pubs[pubid][yf]['total']
            hint = 'Total: %s' % total
            if total > 0:
                pubs[pubid][dec_year] = {
                    'widget': decade_widget(total),
                    'hint': hint,
                    'total': int(total),
                }

    # main table
    b.append('<table class="pubyears">')
    yhdr = [' '] + [split_year(y) for y in allyears]
    yhdr.append("all")
    b.append(td_with_class(*yhdr,
                           classes=get_pubheader_classes(*yhdr),
                           rowclass="pubyearhead",
                           tag="th"))
    b.append(tr_empty())

    # One row per pubid, sorted by earliest year
    for pubid in sorted(pubs, key=lambda x: min(pubs[x])):
        pub = metadb.xd_publications().get(pubid)
        pubname = pub.PublicationName if pub else ''
        # pubid goes in the first column
        b.append(mktag('tr'))
        b.append(mktag('td', 'pub'))
        b.append(mkcell(space_with_nbsp(pubname or pubid), "/pub/" + pubid))
        b.append(mktag('/td'))

        # One cell per year or collapsed decade
        for yi in allyears:
            if yi in pubs[pubid] and pubs[pubid][yi]['total'] > 0:
                b.append(mktag('td', 'this'))
                # Link directly to the year, or to the decade anchor
                href = "/pub/%s%s" % (pubid, yi) if 's' not in yi \
                    else "/pub/%s/index.html#%s" % (pubid, yi[:-1])
                b.append(mkcell(pubs[pubid][yi]['widget'],
                                href=href,
                                title=pubs[pubid][yi]['hint']))
                b.append(mktag('/td'))
            else:
                b.append(mktag('td', 'block'))
                b.append(' ')
                b.append(mktag('/td'))

        b.append(mktag('td'))
        b.append(str(sum(pubs[pubid][x]['total'] for x in pubs[pubid])))
        b.append(mktag('/td'))
        b.append(mktag('/tr'))

    b.append(mktag('/table'))
    return " ".join(b)
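
# The two inputs above are assumed to look roughly like this (column layout
# inferred from the code; the concrete values are invented for illustration):
#
#   pub/pubyears.tsv  -> pubid, year, total, then one count per weekday:
#       nyt  1995  312  52 52 52 52 52 26 26
#
#   gxd/similar.tsv   -> keyed by xdid, with a similar_grid_pct field:
#       nyt1995-01-01  ...  similar_grid_pct: 12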
def main():
    global args
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case: just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            # (so the most recent edition gets the main slot in case of a
            # shelving conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False),
                                           reverse=True,
                                           key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids: ' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                rejected = ""  # initialized here so the ".xd" branch leaves it empty
                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource, SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after the first successful parse
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s " % (parsefunc.__name__, str(e))
                            # raise

                if rejected:
                    error("could not convert: %s" % rejected)

                # only add a receipt the first time this source is converted
                if already_received:
                    debug("already received %s:%s" % (ExternalSource, SourceFilename))
                else:
                    receipts.append([
                        CaptureTime, ReceivedTime, ExternalSource, InternalSource,
                        SourceFilename, xdid
                    ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
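
# Illustrative invocation (the flag values and input path are assumptions;
# "$0" and -o follow the usage convention used elsewhere in this repo):
#
#   $0 --pubid nyt --extsrc "New York Times" -o crosswords/ incoming.zip
#
# Each input is first scanned for .tsv source metadata, then every non-.tsv
# file is run through the parsers for its extension; successful conversions
# are shelved as .xd and a receipt row is queued for gxd/receipts.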
#!/usr/bin/env python3
#
# Usage:
#   $0 -o wwwroot/ gxd/redirects.tsv

from xdfile import html, utils

args = utils.get_args()
outf = utils.open_output()

for tsvfn, contents in utils.find_files(*args.inputs):
    for row in utils.parse_tsv_data(contents.decode('utf-8'), "Redirect"):
        outf.write_file(row.SourcePath, html.redirect_page(row.DestURL))
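
# Redirect rows are assumed to carry SourcePath and DestURL columns, per the
# attribute access in the loop above; a made-up example row:
#
#   SourcePath          DestURL
#   pub/nyt1994.html    /pub/nyt/1994/
#
# For each row, a small HTML redirect page is written at SourcePath pointing
# at DestURL.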