def get_shelf_path(xd, pubid, mdtext):
    """Return the shelf path (no extension) under which `xd` should be stored.

    Args:
        xd: puzzle object; `filename` and `get_header()` are used.
        pubid: publication id, or a falsy value to have it looked up.
        mdtext: metadata text handed to `find_pubid()` when `pubid` is
            not supplied.

    Returns:
        * ``"<shelf>/<pubid>-<NNN>"`` when the puzzle has a Number header,
        * ``"<shelf>/<year>/<pubid><date>"`` when it has a Date header,
        * ``"misc/<filename>"`` when it has neither,
        * ``None`` when no publication can be determined.

        ``<shelf>`` is the publisher abbreviation, falling back to the
        publication id when the publisher is empty.
    """
    if not pubid:
        pubid = find_pubid(mdtext)

    if pubid:
        # NOTE(review): an id missing from the publications table raises
        # here (presumably KeyError) -- confirm that is intended.
        publ = metadb.xd_publications()[pubid]
    else:
        publ = get_publication(xd)
        if not publ:
            return None
        pubid = publ.PublicationAbbr

    if not pubid:
        utils.warn("unknown pubid for '%s'" % xd.filename)
        return None

    # Use the same fallback in both branches below; a bare empty publisher
    # would otherwise produce a path that starts with "/".
    shelf = publ.PublisherAbbr or pubid

    num = xd.get_header('Number')
    if num:
        return "%s/%s-%03d" % (shelf, pubid, int(num))

    dt = xd.get_header("Date")
    if not dt:
        utils.warn("neither Number nor Date for '%s'" % xd.filename)
        return 'misc/' + xd.filename

    year = xdfile.year_from_date(dt)
    return "%s/%s/%s%s" % (shelf, year, pubid, dt)
def find_pubid(rowstr):
    """Identify the publication id whose regex matches `rowstr`.

    `rowstr` is a concatenation of all metadata fields.  Every row of
    PUBREGEX_TSV supplies a ``regex`` and a ``pubid``; the regexes are
    searched case-insensitively.

    Returns:
        The single matching pubid, or None when the regex file does not
        exist, when nothing matches, or when more than one distinct pubid
        matches (a warning is logged in each case).
    """
    try:
        # Context manager so the handle is closed as soon as it is read.
        with open(PUBREGEX_TSV, 'r') as f:
            tsv_data = f.read()
    except FileNotFoundError:
        utils.error("File not exists: %s" % PUBREGEX_TSV, severity='WARNING')
        return None

    regexes = utils.parse_tsv_data(tsv_data)

    matching = set()
    for r in regexes:
        if re.search(r['regex'], rowstr, flags=re.IGNORECASE):
            matching.add(r['pubid'])

    if not matching:
        utils.warn("%s: no regex matches" % rowstr)
        return None

    if len(matching) > 1:
        # Sorted so the warning text is stable between runs.
        utils.warn("%s: too many regex matches (%s)" % (rowstr, " ".join(sorted(matching))))
        return None

    return matching.pop()
def clean_headers(xd):
    """Tidy the headers of `xd` in place.

    * Clears a fixed list of unwanted headers and warns about any header
      whose lowercased name is not in ``xdfile.HEADER_ORDER``.
    * Uses Creator as the author when Author is empty (and clears Creator).
    * Stores cleaned Author/Editor/Title values under the header name plus
      ``CLEAN_SUFFIX`` whenever cleaning changed the value.
    * Fills in Date from the filename, then from the Copyright header, when
      the puzzle has no Date of its own.
    """
    unwanted = ("Source", "Identifier", "Acquired", "Issued", "Category")

    # Iterate over a snapshot of the keys, since set_header() is called
    # during the walk (presumably it mutates xd.headers).
    for name in list(xd.headers.keys()):
        if name in unwanted:
            xd.set_header(name, None)
        elif name.lower() not in xdfile.HEADER_ORDER:
            utils.warn("%s: '%s' header not known: '%s'" % (xd.filename, name, xd.headers[name]))

    # Author / Editor
    raw_author = xd.get_header("Author") or ""
    if not raw_author and xd.get_header("Creator"):
        raw_author = xd.get_header("Creator")
        xd.set_header("Creator", None)

    raw_editor = xd.get_header("Editor") or ""

    cleaned_author, cleaned_editor = clean_author(raw_author, raw_editor)

    if cleaned_author != raw_author:
        xd.set_header("Author" + CLEAN_SUFFIX, cleaned_author)

    if cleaned_editor != raw_editor:
        xd.set_header("Editor" + CLEAN_SUFFIX, cleaned_editor)

    # Title
    raw_title = xd.get_header("Title") or ""
    cleaned_title = clean_title(raw_title)
    if cleaned_title != raw_title:
        xd.set_header("Title" + CLEAN_SUFFIX, cleaned_title)

    # Date: existing header first, then the filename, then the copyright.
    date_str = xd.get_header("Date")

    if not date_str:
        try:
            parsed = utils.parse_date_from_filename(xd.filename)
            if parsed:
                date_str = parsed.strftime("%Y-%m-%d")
        except Exception as exc:
            utils.error(str(exc))
            # `args` is not defined in this function; it must exist at
            # module level for this check to work.
            if args.debug:
                raise

    if not date_str:
        date_str = find_date(xd.get_header("Copyright") or "")

    if date_str:
        xd.set_header("Date", date_str)
def main():
    """Download the puzzles published since each source's last download.

    For every row of ``metadb.xd_recent_downloads()`` the dates between the
    recorded date and today are fetched from the source's URL format and
    written to the output.  A ``sources.tsv`` listing the attempted files
    is written to the output, and ``metadb.RECENT_DOWNLOADS_TSV`` is
    rewritten with today's date for every publication that had at least
    one successful download (other publications keep their previous date).
    """
    p = args_parser('download recent puzzles')
    get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    todaystr = today.strftime("%Y-%m-%d")

    puzzle_sources = xd_puzzle_sources()

    source_rows = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        from_date = datestr_to_datetime(row.date)
        to_date = today

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        summary("*** %s: downloading %d puzzles from %s to %s" % (pubid, len(dates_to_get), from_date, to_date))

        for dt in sorted(dates_to_get):
            # Build the names first so the error handlers and the sources
            # row below never see undefined or stale values.
            try:
                xdid = construct_xdid(pubid, dt)
                url = dt.strftime(puzsrc.urlfmt)
                fn = "%s.%s" % (xdid, puzsrc.ext)
            except Exception as e:
                error(str(e))
                continue

            try:
                debug("downloading '%s' from '%s'" % (fn, url))

                # Context manager closes the connection after the read.
                with urllib.request.urlopen(url) as response:
                    content = response.read()

                outf.write_file(fn, content)

                most_recent[pubid] = todaystr
            except (urllib.error.HTTPError, urllib.error.URLError) as err:
                # Only HTTPError carries .code; a plain URLError does not.
                error('%s [%s] %s: %s' % (xdid, getattr(err, 'code', ''), err.reason, url))
            except Exception as e:
                error(str(e))

            # NOTE(review): as before, a row is recorded for every attempted
            # date, including failed downloads -- confirm that is intended.
            source_rows.append(xd_sources_row(fn, url, todaystr))

    sources_tsv = "".join(source_rows)
    new_recents_tsv = [xd_recent_download(k, v) for k, v in most_recent.items()]

    if sources_tsv:
        outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        with open(metadb.RECENT_DOWNLOADS_TSV, "w") as f:
            f.write(xd_recents_header + "".join(sorted(new_recents_tsv)))
def main():
    """Download a capped batch of not-yet-fetched puzzles for each source.

    For every row of ``metadb.xd_recent_downloads()`` the missing dates are
    obtained from ``get_ungotten_dates()``; at most the 10 earliest and the
    10 latest of them are passed to ``download_puzzles()``.  Afterwards
    ``metadb.RECENT_DOWNLOADS_TSV`` is rewritten with the value returned by
    ``download_puzzles()`` for each processed publication (skipped
    publications keep their previous date).
    """
    p = args_parser('download recent puzzles')
    get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()

    puzzle_sources = xd_puzzle_sources()

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        from_date = datestr_to_datetime(row.date)
        to_date = today

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        dates_to_get = get_ungotten_dates(pubid, from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        all_dates_to_get = sorted(dates_to_get)

        # First 10 plus the last 10 of what remains, taken from the sorted
        # list.  Slicing the tail from the remainder keeps the two halves
        # from overlapping when fewer than 20 dates are pending.
        dates_to_get = all_dates_to_get[:10] + all_dates_to_get[10:][-10:]

        summary(
            "*** %s: %d puzzles from %s to %s not yet gotten, getting %d of them"
            % (pubid, len(all_dates_to_get), all_dates_to_get[0], to_date,
               len(dates_to_get)))

        # NOTE(review): the return value is stored verbatim as the new
        # "most recent" date; confirm download_puzzles() never returns None.
        most_recent[pubid] = str(
            download_puzzles(outf, puzsrc, pubid, dates_to_get))

    new_recents_tsv = [xd_recent_download(k, v) for k, v in most_recent.items()]

    if new_recents_tsv:
        # on filesystem
        with open(metadb.RECENT_DOWNLOADS_TSV, "w") as f:
            f.write(xd_recents_header + "".join(sorted(new_recents_tsv)))
def main():
    """Write an HTML comparison page for every puzzle that has similar puzzles.

    Rows from ``metadb.xd_similar_all()`` are grouped by ``xdid``.  For each
    such puzzle a table is built with one column per puzzle (the main one
    and each loadable match, ordered by ``date()``): the first table row
    holds the grids, the following rows hold the clues.  Clues of a match
    are diffed against the main puzzle's clue for the same answer and the
    pieces are wrapped in ``match``/``diff`` spans.  The page is written to
    ``pub/<xdid>/index.html`` through the output object.
    """
    utils.get_args('generates .html diffs for all puzzles in similar.tsv')
    outf = utils.open_output()

    # Group the similarity rows by the puzzle they belong to.
    xdids_todo = {}
    for row in metadb.xd_similar_all():
        xdids_todo.setdefault(row.xdid, []).append(row)

    for mainxdid, matches in xdids_todo.items():
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            warn('%s not in corpus' % mainxdid)
            continue

        info('generating diffs for %s (%d matches)' % (mainxdid, len(matches)))

        xddates = {mainxdid: mainxd.date()}  # dates, used to order the columns
        html_grids = {mainxdid: grid_diff_html(mainxd)}
        html_clues = {}  # xdid -> list of per-clue html cells

        # NOTE(review): the '. ' and ' ~ ' separators below are written as
        # they appear in the source -- confirm whether a non-breaking space
        # was intended.

        # Clue cells for the main puzzle.
        main_cells = []
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not mainclue:
                continue
            main_cells.append(
                mktag('div', 'fullgrid main') + '%s. ' % pos
                + mainclue
                + mktag('span', tagclass='main', inner=' ~ ' + mainanswer.upper())
                # close the div, as the match cells below do
                + mktag('/div'))
        html_clues[mainxdid] = main_cells

        # Grid and clue cells for every match that can be loaded.
        for row in matches:
            xdid = row.match_xdid
            xd = xdfile.get_xd(xdid)
            if not xd:
                continue

            xddates[xdid] = xd.date()
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)

            cells = []
            for pos, clue, answer in xd.iterclues():
                if not clue:  # iterclues() sometimes yields clue == None
                    continue

                mainclue = mainxd.get_clue_for_answer(answer)
                sm = difflib.SequenceMatcher(lambda x: x == ' ', mainclue or '', clue)
                ratio = sm.ratio()
                debug('MCLUE: %s [%s]' % (mainclue, ratio))

                parts = [mktag('div', 'fullgrid') + '%s. ' % pos]
                if mainclue is None or ratio < 0.40:
                    # Too different (or nothing to compare with): plain text.
                    parts.append(clue)
                else:
                    # Mark each opcode's slice of this clue as match or diff.
                    for op, _a1, _a2, b1, b2 in sm.get_opcodes():
                        css = 'match' if op == 'equal' else 'diff'
                        parts.append('<span class="%s">%s</span>' % (css, clue[b1:b2]))

                tagclass = 'match' if mainclue or answer == mainxd.get_answer(pos) else 'diff'
                parts.append(mktag('span', tagclass=tagclass, inner=' ~ ' + answer.upper()))
                parts.append(mktag('/div'))
                cells.append(''.join(parts))
            html_clues[xdid] = cells

        # Columns ordered by date.
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        columns = [w for w, _dt in sortedkeys]

        page = [mktag('table'), mktag('tr')]
        for w in columns:
            page.append(mktag('td') + html_grids[w] + mktag('/td'))
        page.append(mktag('/tr'))

        # One row per clue index up to the longest column.  Shorter columns
        # get an empty cell so later columns stay aligned.
        nrows = max(len(html_clues[w]) for w in columns)
        for i in range(nrows):
            page.append(mktag('tr'))
            for w in columns:
                cell = html_clues[w][i] if i < len(html_clues[w]) else ''
                page.append(mktag('td') + cell + mktag('/td'))
            page.append(mktag('/tr'))
        page.append(mktag('/table'))

        outf.write_html('pub/%s/index.html' % mainxdid, ''.join(page),
                        title='Comparison for ' + mainxdid)