Beispiel #1
0
def main():
    p = args_parser("process huge puzzles archive into separate .zip and create sources.tsv")
    p.add_argument("-s", "--source", default=None, help="ExternalSource")
    args = get_args(parser=p)

    outf = open_output()

    if args.source:
        source = args.source
    else:
        source = parse_pathname(args.inputs[0]).base

    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            m = re.match(r"^([a-z]{2,4})[\-0-9]{1}\d.*", parse_pathname(fn).base, flags=re.IGNORECASE)
            prefix = m.group(1).lower() if m else "misc"
            if prefix not in subzips:
                zf = xdfile.utils.OutputZipFile(os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)
            else:
                zf, sources = subzips[prefix]
            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)

            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
Beispiel #2
0
def main():
    p = args_parser('catalog source files and create source.tsv')
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    info("importing from %s" % args.source)

    outf = open_output()

    sources = []

    for input_source in args.inputs:
        for fn, contents, dt in find_files_with_time(input_source):
            if len(contents) == 0:
                info("ignoring empty file")
                continue

            outf.write_file(strip_toplevel(fn), contents, dt)

            sources.append(xd_sources_row(fn, args.source or input_source, iso8601(dt)))

    info("%s files cataloged" % len(sources))

    outbase = parse_pathname(args.output).base

    outf.write_file("%s.tsv" % outbase, xd_sources_header + "".join(sources))
    outf.write_file("%s.log" % outbase, get_log())
Beispiel #3
0
def main():
    p = args_parser(
        'process huge puzzles archive into separate .zip and create sources.tsv'
    )
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    outf = open_output()

    if args.source:
        source = args.source
    else:
        source = parse_pathname(args.inputs[0]).base

    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            m = re.match(r'^([a-z]{2,4})[\-0-9]{1}\d.*',
                         parse_pathname(fn).base,
                         flags=re.IGNORECASE)
            prefix = m.group(1).lower() if m else 'misc'
            if prefix not in subzips:
                zf = xdfile.utils.OutputZipFile(
                    os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)
            else:
                zf, sources = subzips[prefix]
            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)

            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
Beispiel #4
0
def main():
    p = utils.args_parser(desc="generate pubyear svg and pubyear pages")
    p.add_argument('-p',
                   '--pubonly',
                   action="store_true",
                   default=False,
                   help='only output root map')
    args = utils.get_args(parser=p)
    outf = utils.open_output()

    pubyears = defaultdict(list)
    pubyears_idx = defaultdict(list)
    # years_idx = []
    for r in metadb.read_rows('pub/stats'):
        y = r.year or '0000'
        pubyear = r.pubid + str(y)
        pubyears[pubyear].append(r)
        if y not in pubyears_idx[r.pubid]:
            pubyears_idx[r.pubid].append(y)
        # if r.year not in years_idx:
        #    years_idx.append(r.year)

    # Making collapsed decades depends on args
    allyears = []
    for i in range(DECADE_SKIP_START // 10, DECADE_SKIP_END // 10 + 1):
        allyears.append("%s0s" % i)
    allyears.extend(
        [str(y) for y in range(DECADE_SKIP_END + 10,
                               date.today().year + 1)])

    html_out = []
    html_out.append(
        '<p>Grouped by publication-year and broken out by day-of-week (Monday at top, Sunday at bottom).</p>'
    )
    html_out.append(legend)  # See definition above
    html_out.append('<table id="pubyearmap" cellspacing="0" cellpadding="0">')

    # Table header with years \ decades
    year_header = gen_year_header(allyears)
    html_out.extend(year_header)

    pubs_total = {}
    for pubid in pubyears_idx:
        pubs_total[pubid] = len(metadb.xd_puzzles(pubid))

    # sort rows by number of puzzles
    sorted_pubs = sorted(pubs_total.keys(),
                         key=lambda pubid: pubs_total[pubid],
                         reverse=True)
    for pub in args.inputs or sorted_pubs:
        if pubs_total[pub] < 20:
            continue

        # Process each pub in index
        pubobj = metadb.xd_publications().get(pub)
        if pubobj:
            pubname = pubobj.PublicationName or pubobj.PublisherName
        else:
            pubname = pub
        html_out.append('<tr><td class="header">{}</td>'.format(
            html.mkhref(pubname, 'pub/' + pub)))

        for year in sorted(allyears):
            html_out.append('<td class="year_widget">')
            py_td = td_for_pubyear(pubyears, pub, year)
            if py_td:
                html_out.append(py_td)
                if not args.pubonly:
                    outf.write_html(
                        'pub/{pub}{year}/index.html'.format(**locals()),
                        pubyear_html(pub, year),
                        "{pubname}, {year}".format(**locals()))
            else:
                # otherwise
                width = svg_w if 's' not in year else svg_w * decade_scale
                html_out.append(
                    pys.format(w=width,
                               h=svg_h,
                               title='',
                               classes='notexists',
                               body=''))

            html_out.append('</td>')

        # Add totals + publishers
        html_out.append('<td class="header">{}</td>'.format(pubs_total[pub]))
        html_out.append('<td class="header">{}</td>'.format(
            html.mkhref(pubname, 'pub/' + pub)))
        html_out.append('</tr>')

    html_out.extend(year_header)
    html_out.append('</table>')
    total_xd = len(metadb.xd_puzzles())
    outf.write_html('index.html', "".join(html_out),
                    "Comparison of %s published crossword grids" % total_xd)
Beispiel #5
0
def main():
    p = utils.args_parser(desc="generate pubyear svg and pubyear pages")
    p.add_argument('-p', '--pubonly', action="store_true", default=False, help='only output root map')
    args = utils.get_args(parser=p)
    outf = utils.open_output()

    pubyears = defaultdict(list)
    pubyears_idx = defaultdict(list)
    # years_idx = []
    for r in metadb.read_rows('pub/stats'):
        y = r.year or '0000'
        pubyear = r.pubid + str(y)
        pubyears[pubyear].append(r)
        if y not in pubyears_idx[r.pubid]:
            pubyears_idx[r.pubid].append(y)
        # if r.year not in years_idx:
        #    years_idx.append(r.year)

    # Making collapsed decades depends on args
    allyears = []
    for i in range(DECADE_SKIP_START//10, DECADE_SKIP_END//10 + 1):
        allyears.append("%s0s" % i)
    allyears.extend([ str(y) for y in range(DECADE_SKIP_END + 10, date.today().year + 1) ])

    html_out = []
    html_out.append('<p>Grouped by publication-year and broken out by day-of-week (Monday at top, Sunday at bottom).</p>')
    html_out.append(legend) # See definition above
    html_out.append('<table id="pubyearmap" cellspacing="0" cellpadding="0">')

    # Table header with years \ decades
    year_header = gen_year_header(allyears)
    html_out.extend(year_header)

    pubs_total = {}
    for pubid in pubyears_idx:
        pubs_total[pubid] = len(metadb.xd_puzzles(pubid))

    # sort rows by number of puzzles
    sorted_pubs = sorted(pubs_total.keys(), key=lambda pubid: pubs_total[pubid], reverse=True)
    for pub in args.inputs or sorted_pubs:
        if pubs_total[pub] < 20:
            continue

        # Process each pub in index
        pubobj = metadb.xd_publications().get(pub)
        if pubobj:
            pubname = pubobj.PublicationName or pubobj.PublisherName
        else:
            pubname = pub
        html_out.append('<tr><td class="header">{}</td>'.format(html.mkhref(pubname, pub)))

        for year in sorted(allyears):
            html_out.append('<td class="year_widget">')
            py_td = td_for_pubyear(pubyears, pub, year)
            if py_td:
                html_out.append(py_td)
                if not args.pubonly:
                    outf.write_html('pub/{pub}{year}/index.html'.format(**locals()), pubyear_html(pub, year),
                                    "{pubname}, {year}".format(**locals()))
            else:
                # otherwise
                width = svg_w if 's' not in year else svg_w*decade_scale
                html_out.append(pys.format(w=width, h=svg_h, title='', classes='notexists', body=''))

            html_out.append('</td>')

        # Add totals + publishers
        html_out.append('<td class="header">{}</td>'.format(pubs_total[pub]))
        html_out.append('<td class="header">{}</td>'.format(html.mkhref(pubname, pub)))
        html_out.append('</tr>')


    html_out.extend(year_header)
    html_out.append('</table>')
    total_xd = len(metadb.xd_puzzles())
    outf.write_html('index.html', "".join(html_out), "Comparison of %s published crossword grids" % total_xd)
Beispiel #6
0
def main():
    global args
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            #  (so most recent edition gets main slot in case of shelving
            #  conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False), reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s  %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource,InternalSource,SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s  " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s  " % (parsefunc.__name__, str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                    # only add receipt if first time converting this source
                    if already_received:
                        debug("already received %s:%s" % (ExternalSource, SourceFilename))
                    else:
                        receipts.append([
                            CaptureTime,
                            ReceivedTime,
                            ExternalSource,
                            InternalSource,
                            SourceFilename,
                            xdid
                        ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    todaystr = today.strftime("%Y-%m-%d")

    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        summary("*** %s: downloading %d puzzles from %s to %s" % (pubid, len(dates_to_get), from_date, to_date))

        for dt in sorted(dates_to_get):
            try:
                xdid = construct_xdid(pubid, dt)
                url = dt.strftime(puzsrc.urlfmt)
                fn = "%s.%s" % (xdid, puzsrc.ext)

                debug("downloading '%s' from '%s'" % (fn, url))

                response = urllib.request.urlopen(url)
                content = response.read()

                outf.write_file(fn, content)

                most_recent[pubid] = todaystr
            except (urllib.error.HTTPError, urllib.error.URLError) as err:
                error('%s [%s] %s: %s' % (xdid, err.code, err.reason, url))
            except Exception as e:
                error(str(e))

            sources_tsv += xd_sources_row(fn, url, todaystr)

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

    if sources_tsv:
        outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        open(metadb.RECENT_DOWNLOADS_TSV, "w").write(xd_recents_header + "".join(sorted(new_recents_tsv)))
Beispiel #8
0
def main():
    global args
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd':
        [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright',
                   default=None,
                   help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc',
                   default=None,
                   help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc',
                   default=None,
                   help='Value for receipts.InternalSource')
    p.add_argument('--pubid',
                   default=None,
                   help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source,
                                                         ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            #  (so most recent edition gets main slot in case of shelving
            #  conflict)
            for fn, contents, dt in sorted(find_files_with_time(
                    input_source, strip_toplevel=False),
                                           reverse=True,
                                           key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(
                        input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(
                    input_source).filename

                already_received = metadb.check_already_received(
                    ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn(
                            'previously received this same file under multiple xdids:'
                            + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s  %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn),
                                                      ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource,
                                               SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(
                                xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s  " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" %
                                  (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s  " % (parsefunc.__name__,
                                                       str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                    # only add receipt if first time converting this source
                    if already_received:
                        debug("already received %s:%s" %
                              (ExternalSource, SourceFilename))
                    else:
                        receipts.append([
                            CaptureTime, ReceivedTime, ExternalSource,
                            InternalSource, SourceFilename, xdid
                        ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
Beispiel #9
0
def main():
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        #        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        dates_to_get = get_ungotten_dates(pubid, from_date, to_date,
                                          int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        all_dates_to_get = sorted(dates_to_get)
        dates_to_get = dates_to_get[0:10] + dates_to_get[-10:]

        summary(
            "*** %s: %d puzzles from %s to %s not yet gotten, getting %d of them"
            % (pubid, len(all_dates_to_get), all_dates_to_get[0], to_date,
               len(dates_to_get)))
        most_recent[pubid] = str(
            download_puzzles(outf, puzsrc, pubid, dates_to_get))

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

#    if sources_tsv:
#        outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem
        open(metadb.RECENT_DOWNLOADS_TSV,
             "w").write(xd_recents_header + "".join(sorted(new_recents_tsv)))
Beispiel #10
0
def main():
    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument("-a", "--all", default=False, help="analyze all puzzles, even those already in similar.tsv")
    p.add_argument("-l", "--limit", default=100, help="limit amount of puzzles to be analyzed [default=100]")
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows("gxd/similar")
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode("utf-8"), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv

        """ find similar grids (pct, xd) for the mainxd in the corpus.
        Takes about 1 second per xd.  sorted by pct.
        """
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20), key=lambda x: x[0], reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers, other possibilities.
        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs

                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [
                        ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < maindate
                    ]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row(
            "gxd/similar",
            [
                mainxd.xdid(),  # xdid
                int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
                nstaleclues,  # reused_clues
                nstaleanswers,  # reused_answers
                ntotalclues,  # total_clues
                " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids),  # matches
            ],
        )
Beispiel #11
0
def main():
    p = utils.args_parser(
        desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument(
        '-a',
        '--all',
        default=False,
        help='analyze all puzzles, even those already in similar.tsv')
    p.add_argument('-l',
                   '--limit',
                   default=100,
                   help='limit amount of puzzles to be analyzed [default=100]')
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows('gxd/similar')
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode('utf-8'), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv
        """ find similar grids (pct, xd) for the mainxd in the corpus.
        Takes about 1 second per xd.  sorted by pct.
        """
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20),
                               key=lambda x: x[0],
                               reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct))
                                        for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers, other possibilities.
        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs

                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [
                        ca for ca in load_clues().get(bc, [])
                        if ca.answer == mainanswer and ca.date < maindate
                    ]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages,
                                        key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages,
                                        key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row(
            'gxd/similar',
            [
                mainxd.xdid(),  # xdid
                int(100 * sum(
                    pct / 100.0
                    for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
                nstaleclues,  # reused_clues
                nstaleanswers,  # reused_answers
                ntotalclues,  # total_clues
                " ".join(("%s=%s" % (xd2.xdid(), pct))
                         for pct, xd1, xd2 in similar_grids)  # matches
            ])