Beispiel #1
0
def pubyear_html(pub, year):
    """Render the calendar-plus-similar-puzzles HTML for one publication-year.

    `pub` and `year` are concatenated to form the metadb query key.  `year`
    is a string: either a plain year, or a label ending in 's' whose first
    four characters give the first of ten consecutive years to render.

    Returns the module-level `legend` followed by a ``<div class="calendars">``
    wrapping a single table: one calendar cell per month, followed by one row
    per puzzle of that month that has similarity data.
    """
    pieces = ['<table class="puzzles">']
    header_cells = [year] + pubyear_header
    pieces.append(html.table_row(header_cells, header_cells, tag='th'))

    # Similarity info, keyed by the value parse_iso8601() extracts from the xdid.
    similar_by_date = {}
    for sim in sorted(metadb.xd_similar(pub + year)):
        date_key = utils.parse_iso8601(sim.xdid)
        match_date = utils.parse_iso8601(sim.match_xdid)

        # Both dates are required, and only the later puzzle of a pair is annotated.
        if not date_key or not match_date or date_key < match_date:
            continue

        entry = similar_by_date.setdefault(date_key, {'title': '', 'class': ''})
        if sim.match_pct == 0:
            # The date stays registered, but gets no link/title/class from this row.
            continue

        entry['link'] = '/pub/' + sim.xdid

        author = metadb.get_author(sim.xdid) or ''
        match_author = metadb.get_author(sim.match_xdid) or ''

        entry['title'] += '(%s%%) %s [%s]\n' % (sim.match_pct, match_author,
                                                sim.match_xdid)
        entry['class'] += ret_classes(author, match_author, sim.match_pct)

    # Puzzles without similarity data only get a public/private style.
    plain_by_date = {}
    for puz in metadb.xd_puzzles(pub + year):
        puz_date = puz.Date
        if not puz_date or puz_date in plain_by_date or puz_date in similar_by_date:
            continue
        plain_by_date[puz_date] = {
            'title': '',
            'class': 'pubxd' if int(puz_date[:4]) <= 1965 else 'privxd',
        }

    # The calendar widget sees both kinds of entries.
    all_by_date = dict(similar_by_date)
    all_by_date.update(plain_by_date)

    if year[-1] == 's':  # decade
        first_year, n_years = int(year[:4]), 10
    else:
        first_year, n_years = int(year), 1

    for cal_year in range(first_year, first_year + n_years):
        for month in range(1, 13):
            month_prefix = "%s-%02d" % (cal_year, month)

            month_rows = []
            for date_key, entry in similar_by_date.items():
                if not date_key.startswith(month_prefix):
                    continue

                xdid = pub + date_key
                puzmd = metadb.xd_puzzle(xdid)
                if not puzmd:
                    continue

                month_rows.append({
                    'class': entry['class'],
                    'tag_params': {
                        'onclick': "location.href='/pub/%s'" % xdid,
                        'class': entry['class'] + ' hrefrow puzrow',
                    },
                    'row': [
                        xdid, puzmd.Date, puzmd.Size, puzmd.Title,
                        puzmd.Author, puzmd.Editor, puzmd.Copyright,
                        puzmd.A1_D1,
                        entry['title'].replace("\n", "<br/>"),
                    ],
                })

            # The calendar cell spans its own row plus one row per listed puzzle.
            pieces.append(
                '<tr class="calendar"><td class="calendar" rowspan="%s">' %
                (len(month_rows) + 1))
            pieces.append(
                html.GridCalendar(all_by_date).formatmonth(cal_year, month) +
                '</td></tr>')

            # Order the month's rows by the puzzle's Date column.
            month_rows.sort(key=lambda item: item['row'][1])
            for item in month_rows:
                pieces.append(
                    html.table_row(item['row'],
                                   pubyear_header,
                                   tag_params=item['tag_params']))

    pieces.append('</table>')
    calendars_html = ''.join(pieces)

    return '''%s <div class="calendars">%s</div> <hr/>''' % (legend,
                                                            calendars_html)
Beispiel #2
0
def main():
    """Rebuild the "pub/stats" rows: one per publication, year and weekday.

    Puzzles from ``xdfile.corpus()`` are grouped by
    ``(publication_id(), year())``.  If at least one group exists the old
    stats are deleted; then, for every group and every weekday, one row is
    appended holding the most common size/editor/copyright and tallies of
    similar puzzles (reprints, touch-ups, copies, theme copies).
    """
    # Results are not used below; the calls are kept for whatever argument
    # parsing / output setup they perform.
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    def process_counter(count, comp_value):
        """Return the most common key of `count`, or '' if it is empty.

        " (N)" is appended when the top count N differs from `comp_value`,
        i.e. when not every puzzle of the day shares that value.
        """
        if not count:
            return ''
        item, num = count.most_common(1)[0]
        if num != comp_value:
            item += " (%s)" % num
        return item

    # (publication id, year) -> list of puzzles
    pubyears = {}
    for xd in xdfile.corpus():
        pubyears.setdefault((xd.publication_id(), xd.year()), []).append(xd)

    if pubyears:
        metadb.delete_stats()

    public_xdids = set()  # Empty for now

    for (pubid, year), xdlist in sorted(pubyears.items()):
        # Normalise once: the query key below already used str(year), while
        # the public-year test called year.isdigit() directly and would have
        # raised AttributeError for a non-string year.
        year_str = str(year)
        is_public_year = year_str.isdigit() and int(year_str) <= 1965

        # NOTE(review): npublic is initialised per publication-year, not per
        # weekday, so the value written for each weekday also includes the
        # preceding weekdays.  Kept as-is; confirm whether a per-weekday count
        # was intended.
        npublic = 0

        # organize by day-of-week
        byweekday = {w: [] for w in weekdays}
        byweekday_similar = {w: [] for w in weekdays}

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # Might be empty date or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + year_str):
            if r.match_pct < 25:
                continue

            xd = xdfile.get_xd(r.xdid)
            if not xd:
                continue

            dt = xd.get_header('Date')
            if not dt:
                debug("Date not set for: %s" % xd)
                continue

            dow = dow_from_date(dt)
            if dow:  # Might be empty date or only a year
                byweekday_similar[dow].append(r)

        # tally stats
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd

            # TODO: SELECT FROM publications
            nexisting = 0

            nxd = len(byweekday[weekday])
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if is_public_year or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright_text = xd.get_header('Copyright').strip()
                if copyright_text:
                    copyrights[copyright_text] += 1

            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0  # never incremented below; written out as 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.match_xdid, r.xdid))
                    continue

                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" % (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)

                if not dt2 < dt1:  # only capture the later one
                    continue

                ##deduce_similarity_type
                if diff_authors(aut1, aut2):  # suspicious
                    if pct >= 50:
                        copies += 1
                    elif pct >= 30:
                        themecopies += 1
                elif pct == 100:
                    reprints += 1
                elif pct >= 50:
                    touchups += 1
                elif pct >= 30:
                    themecopies += 1

            metadb.append_row("pub/stats",
                              (pubid, year, weekday,
                               mainformat, maineditor, maincopyright,
                               nexisting, nxd, npublic,
                               reprints, touchups, redones,
                               copies, themecopies))
Beispiel #3
0
def main():
    """Write a "deep clue" analysis page for every .xd file given as input.

    For each puzzle a table is built with one row per clue: its grid position,
    the clue itself (linked when prev_uses() returns anything), the
    alternative answers seen for that clue, and the output of
    html_other_clues().  The rendered grid and the table are written to
    ``pub/deep/<xdid>/index.html``.
    """
    args = utils.get_args(
        'generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()

    # Loaded but not consulted anywhere below.
    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')

    puzzles = [
        xdfile.xdfile(raw.decode('utf-8'), path)
        for path, raw in find_files(*args.inputs, ext='.xd')
    ]

    header_row = ''.join((
        '<tr>',
        '<th></th>',
        '<th>Clue</th>',
        '<th>ANSWERs</th>',
        '<th>Alt. clue possibilities</th>',
        '</tr>',
    ))

    for puzzle in puzzles:
        puzzle_id = puzzle.xdid()
        progress(puzzle_id)

        # Neither the similarity rows nor the date map are read afterwards.
        matches = metadb.xd_similar(puzzle_id)
        xddates = {puzzle_id: puzzle.date()}

        # Running totals; they are computed but not written anywhere here.
        stale_clues = 0
        stale_answers = 0
        total_clues = 0

        table_rows = [header_row]
        for pos, clue, answer in puzzle.iterclues():
            if not pos:
                continue

            main_ca = ClueAnswer(puzzle_id, puzzle.date(), answer, clue)

            # 'grid position' column
            cells = ['<td class="pos">%s.</td>' % pos]

            # One pass over the variants: same answer -> grouped by pubid,
            # different answer -> candidate for the 'other answers' list.
            alt_answers = []  # TODO:
            uses_by_pub = {}  # [pubid] -> set(ClueAnswer)
            for variant in find_clue_variants(clue):
                if variant.answer == answer:
                    uses_by_pub.setdefault(variant.pubid, set()).add(variant)
                else:
                    alt_answers.append(variant)

            # 'other uses' column
            cells.append('<td class="other-uses">')
            earlier = prev_uses(uses_by_pub, puzzle, clue)
            if earlier:
                cells.append('<a href="/pub/clue/%s">%s [x%s]</a>' %
                             (boil(clue), clue, len(earlier)))
                stale_clues += 1
            else:
                cells.append(clue)
            cells.append('</td>')

            # 'other answers' column
            cells.append('<td class="other-answers">')
            cells.append(
                html_select_options(alt_answers,
                                    strmaker=lambda ca: ca.answer,
                                    force_top=main_ca,
                                    add_total=False))
            cells.append('</td>')

            # 'other clues' column
            cells.append('<td class="other-clues">')
            alternatives = html_other_clues(answer, clue, puzzle)
            if alternatives:
                cells.append(alternatives)
                stale_answers += 1
            cells.append('</td>')  # end 'other-clues'

            total_clues += 1
            # Quick and dirty - to be replaced
            table_rows.append('<tr>' + ' '.join(cells) + '</tr>')

        clue_table = ''.join(table_rows)

        # Assemble the page: grid first, then the deep-clues table.
        page = '<div class="main-container">'
        page += grid_to_html(puzzle)
        page += mktag('table', 'deepclues') + clue_table + mktag('/table')
        page += '</div>'

        info('writing deepclues for %s' % puzzle_id)
        outf.write_html('pub/deep/%s/index.html' % puzzle_id,
                        page,
                        title='Deep clue analysis for ' + puzzle_id)
Beispiel #4
0
def pubyear_html(pub, year):
    """Build the HTML overview (calendars and similar-puzzle rows) for `pub` + `year`.

    `year` is a string.  When it ends in 's' its first four characters are
    read as the first of ten years to render; otherwise it is read as a
    single year.  The result is `legend` followed by a
    ``<div class="calendars">`` holding one table.
    """
    heading = [year] + pubyear_header
    chunks = [
        '<table class="puzzles">',
        html.table_row(heading, heading, tag='th'),
    ]

    # Dates that have similarity rows: date -> {'title', 'class'[, 'link']}
    flagged = {}
    for sim in sorted(metadb.xd_similar(pub + year)):
        this_date = utils.parse_iso8601(sim.xdid)
        other_date = utils.parse_iso8601(sim.match_xdid)

        if not (this_date and other_date):
            continue
        if this_date < other_date:
            # Only annotate the later puzzle of the pair.
            continue

        if this_date not in flagged:
            flagged[this_date] = {'title': '', 'class': ''}
        info_for_date = flagged[this_date]

        pct = sim.match_pct
        if pct == 0:
            continue

        info_for_date['link'] = '/pub/' + sim.xdid

        other_xdid = sim.match_xdid
        this_author = metadb.get_author(sim.xdid) or ''
        other_author = metadb.get_author(other_xdid) or ''

        info_for_date['title'] += '(%s%%) %s [%s]\n' % (pct, other_author, other_xdid)

        ##deduce_similarity_type
        info_for_date['class'] += ret_classes(this_author, other_author, pct)

    # Remaining puzzles are styled purely by the year prefix of their Date.
    unflagged = {}
    for puz in metadb.xd_puzzles(pub + year):
        if puz.Date and puz.Date not in unflagged and puz.Date not in flagged:
            if int(puz.Date[:4]) > 1965:
                css = 'privxd'
            else:
                css = 'pubxd'
            unflagged[puz.Date] = {'title': '', 'class': css}

    calendar_data = flagged.copy()
    calendar_data.update(unflagged)

    decade = year[-1] == 's'
    start = int(year[:4]) if decade else int(year)
    years = range(start, start + (10 if decade else 1))

    def build_row(dt, details):
        """Return the table-row description for one flagged date, or None
        when metadb has no puzzle for `pub + dt`."""
        xdid = pub + dt
        puzmd = metadb.xd_puzzle(xdid)
        if not puzmd:
            return None
        return {
            'class': details['class'],
            'tag_params': {
                'onclick': 'location.href=\'/pub/%s\'' % xdid,
                'class': details['class'] + ' hrefrow puzrow',
            },
            'row': [
                xdid,
                puzmd.Date,
                puzmd.Size,
                puzmd.Title,
                puzmd.Author,
                puzmd.Editor,
                puzmd.Copyright,
                puzmd.A1_D1,
                details["title"].replace("\n", "<br/>"),
            ],
        }

    for yr in years:
        for month in range(1, 13):
            prefix = "%s-%02d" % (yr, month)
            built = (build_row(dt, details)
                     for dt, details in flagged.items()
                     if dt.startswith(prefix))
            rows = sorted((r for r in built if r is not None),
                          key=lambda r: r['row'][1])

            # Calendar cell spans itself plus every puzzle row of the month.
            chunks.append(
                '<tr class="calendar"><td class="calendar" rowspan="%s">' % (len(rows) + 1))
            chunks.append(
                html.GridCalendar(calendar_data).formatmonth(yr, month) + '</td></tr>')
            chunks.extend(
                html.table_row(r["row"], pubyear_header, tag_params=r['tag_params'])
                for r in rows)

    chunks.append('</table>')

    return '''%s <div class="calendars">%s</div> <hr/>''' % (legend, ''.join(chunks))
Beispiel #5
0
def main():
    """Rebuild the "pub/stats" rows: one per publication, year and weekday.

    Puzzles from ``xdfile.corpus()`` are grouped by
    ``(publication_id(), year())``.  If at least one group exists the old
    stats are deleted; then, for every group and every weekday, one row is
    appended holding the most common size/editor/copyright and tallies of
    similar puzzles (reprints, touch-ups, copies, theme copies).
    """
    # Results are not used below; the calls are kept for whatever argument
    # parsing / output setup they perform.
    args = utils.get_args('generate pub-years data')
    outf = utils.open_output()

    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    def process_counter(count, comp_value):
        """Return the most common key of `count`, or '' if it is empty.

        " (N)" is appended when the top count N differs from `comp_value`,
        i.e. when not every puzzle of the day shares that value.
        """
        if not count:
            return ''
        item, num = count.most_common(1)[0]
        if num != comp_value:
            item += " (%s)" % num
        return item

    # (publication id, year) -> list of puzzles
    pubyears = {}
    for xd in xdfile.corpus():
        pubyears.setdefault((xd.publication_id(), xd.year()), []).append(xd)

    if pubyears:
        metadb.delete_stats()

    public_xdids = set()  # Empty for now

    for (pubid, year), xdlist in sorted(pubyears.items()):
        # Normalise once: the query key below already used str(year), while
        # the public-year test called year.isdigit() directly and would have
        # raised AttributeError for a non-string year.
        year_str = str(year)
        is_public_year = year_str.isdigit() and int(year_str) <= 1965

        # NOTE(review): npublic is initialised per publication-year, not per
        # weekday, so the value written for each weekday also includes the
        # preceding weekdays.  Kept as-is; confirm whether a per-weekday count
        # was intended.
        npublic = 0

        # organize by day-of-week
        byweekday = {w: [] for w in weekdays}
        byweekday_similar = {w: [] for w in weekdays}

        for xd in xdlist:
            dow = dow_from_date(xd.get_header('Date'))
            if dow:  # Might be empty date or only a year
                byweekday[dow].append(xd)

        for r in metadb.xd_similar(pubid + year_str):
            if r.match_pct < 25:
                continue

            xd = xdfile.get_xd(r.xdid)
            if not xd:
                continue

            dt = xd.get_header('Date')
            if not dt:
                debug("Date not set for: %s" % xd)
                continue

            dow = dow_from_date(dt)
            if dow:  # Might be empty date or only a year
                byweekday_similar[dow].append(r)

        # tally stats
        for weekday in weekdays:
            copyrights = Counter()  # [copyright_text] -> number of xd
            editors = Counter()  # [editor_name] -> number of xd
            formats = Counter()  # ["15x15 RS"] -> number of xd

            # TODO: SELECT FROM publications
            nexisting = 0

            nxd = len(byweekday[weekday])
            for xd in byweekday[weekday]:
                xdid = xd.xdid()
                if is_public_year or xdid in public_xdids:
                    npublic += 1

                editor = xd.get_header('Editor').strip()
                if editor:
                    editors[editor] += 1

                sizestr = xd.sizestr()
                if sizestr:
                    formats[sizestr] += 1

                copyright_text = xd.get_header('Copyright').strip()
                if copyright_text:
                    copyrights[copyright_text] += 1

            maineditor = process_counter(editors, nxd)
            maincopyright = process_counter(copyrights, nxd)
            mainformat = process_counter(formats, nxd)

            reprints = 0
            touchups = 0
            redones = 0  # never incremented below; written out as 0
            copies = 0
            themecopies = 0
            for r in byweekday_similar[weekday]:
                xd1 = xdfile.get_xd(r.xdid)
                xd2 = xdfile.get_xd(r.match_xdid)
                if xd1 is None:
                    info("%s: similar puzzle %s not in corpus" %
                         (r.match_xdid, r.xdid))
                    continue

                if xd2 is None:
                    info("%s: similar puzzle %s not in corpus" %
                         (r.xdid, r.match_xdid))
                    continue

                dt1 = xd1.get_header('Date')
                dt2 = xd2.get_header('Date')
                aut1 = xd1.get_header('Author').lower()
                aut2 = xd2.get_header('Author').lower()
                pct = int(r.match_pct)

                if not dt2 < dt1:  # only capture the later one
                    continue

                ##deduce_similarity_type
                if diff_authors(aut1, aut2):  # suspicious
                    if pct >= 50:
                        copies += 1
                    elif pct >= 30:
                        themecopies += 1
                elif pct == 100:
                    reprints += 1
                elif pct >= 50:
                    touchups += 1
                elif pct >= 30:
                    themecopies += 1

            metadb.append_row(
                "pub/stats", (pubid, year, weekday, mainformat, maineditor,
                              maincopyright, nexisting, nxd, npublic, reprints,
                              touchups, redones, copies, themecopies))