def main():
    """Re-clue ("remix") each input .xd puzzle using clues from other publications.

    For every input grid and every other publisher's clue set, attempts a full
    reclue; successful remixes are written as <xdid>-<pubid>.xd, and a TSV of
    missing-clue counts plus a log are written at the end.
    """
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()
    all_clues = load_clues()
    # TSV header; rows are appended for every (grid, pub) pair that fell short.
    missing_tsv = COLUMN_SEPARATOR.join(
        ["grid_xdid", "clues_pubid", "num_missing"]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue

        # Scrub identifying headers and stamp the remix with a fake byline.
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header(
            "Author",
            "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()  # pubids whose clues fully covered this grid
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)
                outfn = "%s-%s.xd" % (xd.xdid(), pubid)
                if nmissing == 0:
                    # Fully reclued: mutate the fill up to 100 times, then
                    # reclue once more to see how many clues survive.
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))
                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    debug("%s missing %d clues" % (outfn, nmissing))
                    missing_tsv += COLUMN_SEPARATOR.join(
                        [xd.xdid(), pubid, str(nmissing)]) + EOL
            except Exception as e:
                # best-effort: one bad pub must not abort the whole remix run
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                # also keep a copy of the original puzzle alongside the remixes
                outf.write_file(
                    parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
def main():
    """Normalize headers of every input .xd puzzle and update its metadata row."""
    args = utils.get_args(desc='outputs cleaned puzzle metadata rows')
    for source in args.inputs:
        for path, raw in utils.find_files(source, ext='.xd'):
            puzzle = xdfile.xdfile(raw.decode('utf-8'), path)
            clean_headers(puzzle)
            metadb.update_puzzles_row(puzzle)
def main():
    """Wrap each input file's text in a styled div and publish it as index.html."""
    args = utils.get_args()
    out = utils.open_output()
    for htmlfn, raw in utils.find_files(*args.inputs):
        pagename = utils.parse_pathname(htmlfn).base
        body = '<div class="text">' + raw.decode('utf-8') + '</div>'
        out.write_html('%s/index.html' % pagename, body)
def main():
    """Build the clue-index pages: per-clue pages plus a summary index.

    Emits pub/clue/<boiled-clue>/index.html for every clue seen, and a
    pub/clue/index.html listing the most-used and most-ambiguous clues.
    """
    global boiled_clues
    args = get_args('create clue index')
    outf = open_output()
    boiled_clues = load_clues()

    biggest_clues = "<li>%d total clues, which boil down to %d distinct clues" % (len(clues()), len(boiled_clues))
    # (uses, boiled clue, answers) triple per distinct boiled clue
    bcs = [ (len(v), bc, answers_from(v)) for bc, v in boiled_clues.items() ]
    nreused = len([bc for n, bc, _ in bcs if n > 1])
    # NOTE: true division here; '%d' truncates the resulting float percentage
    biggest_clues += "<li>%d (%d%%) of these clues are used in more than one puzzle" % (nreused, nreused*100/len(boiled_clues))

    cluepages_to_make = set()

    # add all boiled clues from all input .xd files
    for fn, contents in find_files(*args.inputs, ext='.xd'):
        progress(fn)
        xd = xdfile.xdfile(contents.decode('utf-8'), fn)
        for pos, mainclue, mainanswer in xd.iterclues():
            cluepages_to_make.add(boil(mainclue))

    # add top 100 most used boiled clues from corpus
    biggest_clues += '<h2>Most used clues</h2>'
    biggest_clues += '<table class="clues most-used-clues">'
    biggest_clues += th("clue", "# uses", "answers used with this clue")
    for n, bc, ans in sorted(bcs, reverse=True)[:100]:
        cluepages_to_make.add(bc)
        biggest_clues += td(mkhref(unboil(bc), bc), n, html_select_options(ans))
    biggest_clues += '</table>'

    most_ambig = "<h2>Most ambiguous clues</h2>"
    most_ambig += '(clues with the largest number of different answers)'
    most_ambig += '<table class="clues most-different-answers">'
    most_ambig += th("Clue", "answers")
    # rank by the number of *distinct* answers a clue has been used for
    for n, bc, ans in sorted(bcs, reverse=True, key=lambda x: len(set(x[2])))[:100]:
        cluepages_to_make.add(bc)
        clue = mkhref(unboil(bc), bc)
        # theme/quip clues naturally have many answers; mark the row specially
        if 'quip' in bc or 'quote' in bc or 'theme' in bc or 'riddle' in bc:
            most_ambig += td(clue, html_select_options(ans), rowclass="theme")
        else:
            most_ambig += td(clue, html_select_options(ans))
    most_ambig += '</table>'

    # one page per boiled clue that appeared anywhere above
    for bc in cluepages_to_make:
        contents = mkwww_cluepage(bc)
        if contents:
            outf.write_html('pub/clue/%s/index.html' % bc, contents, title=bc)

    outf.write_html('pub/clue/index.html', biggest_clues + most_ambig, title="Clues")
def main():
    """Re-clue ("remix") each input .xd puzzle using clues from other publications.

    NOTE(review): this appears to duplicate another reclue main() in this file
    (same logic, slightly different formatting) — candidate for deduplication.
    """
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()
    all_clues = load_clues()
    # TSV header; one row appended per (grid, pub) pair that is missing clues
    missing_tsv = COLUMN_SEPARATOR.join([
        "grid_xdid", "clues_pubid", "num_missing"
    ]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue

        # Scrub identifying headers; stamp remix with a generated byline.
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header("Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()  # pubids whose clue sets fully covered this grid
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)
                outfn = "%s-%s.xd" % (xd.xdid(), pubid)
                if nmissing == 0:
                    # fully covered: mutate the fill up to 100 times, then
                    # reclue again to measure surviving coverage
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))
                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    debug("%s missing %d clues" % (outfn, nmissing))
                    missing_tsv += COLUMN_SEPARATOR.join([
                        xd.xdid(), pubid, str(nmissing)
                    ]) + EOL
            except Exception as e:
                # best-effort: one failing pub must not abort the whole run
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                # keep a copy of the original puzzle alongside its remixes
                outf.write_file(parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
def main():
    """Bundle publicly-licensed .xd puzzles plus a README into the output archive.

    Only puzzles that metadb reports as public are included; paths are
    rewritten relative to the 'xd' toplevel.
    """
    args = utils.get_args()
    outf = utils.open_output()  # should be .zip
    outf.log = False
    outf.toplevel = 'xd'
    # Use a context manager so the README file handle is closed deterministically
    # (the original open(...).read() leaked the handle until GC).
    with open('doc/zip-README') as readme:
        outf.write_file('README', readme.read())
    for fn, contents in sorted(utils.find_files(*args.inputs, ext='.xd')):
        xdid = utils.parse_xdid(fn)
        if metadb.is_public(xdid):
            outf.write_file(utils.strip_toplevel(fn), contents)
def main():
    """Write a similarity row for each input grid compared against the corpus."""
    args = get_args(desc="find similar grids")
    haystack = list(corpus())  # materialize once; reused for every needle
    out = open_output()
    out.write(xd_similar_header)
    for path, raw in find_files(*args.inputs, strip_toplevel=False):
        needle = xdfile(raw.decode("utf-8"), path)
        for pct, a, b in find_similar_to(needle, haystack):
            out.write(xd_similar_row(a, b, pct))
def main():
    """Collect distinct block-pattern templates from the corpus, then try to
    refit each input puzzle into every template, saving successful fits."""
    args = utils.get_args(desc='find grid templates')
    # Reduce each corpus grid to its block pattern: blocks kept, fills blanked.
    templates = {
        tuple(''.join(ch if ch == BLOCK_CHAR else UNKNOWN_CHAR for ch in row)
              for row in grid_xd.grid)
        for grid_xd in xdfile.corpus()
    }
    print(len(templates), 'templates')
    for source in args.inputs:
        for path, raw in utils.find_files(source, ext='.xd'):
            puzzle = xdfile.xdfile(raw.decode('utf-8'), path)
            for idx, tmpl in enumerate(templates):
                refit = fit_template(tmpl, puzzle)
                if refit:
                    with open(args.output + ('-t%s.xd' % idx), 'w') as fp:
                        fp.write(refit.to_unicode())
def main():
    """Ingest crossword submissions from downloaded email messages.

    For each email: extract attached puzzle files, save the plausible ones,
    and send the uploader a receipt (or error) email.
    """
    args = get_args('parse downloaded emails')
    outf = open_output()
    sources_tsv = ''  # NOTE(review): accumulated but never written in this block — confirm it is persisted elsewhere
    for emailfn, emailcontents in find_files(*args.inputs):
        msg = email.message_from_bytes(emailcontents)
        upload_src = msg["From"]
        if not upload_src:
            continue  # no sender: nothing to receipt, skip entirely

        email_sources_tsv = []
        email_files = generate_email_files(msg)
        for puzfn, puzdata, puzdt in email_files:
            # a basic sanity check of filesize
            # accommodate small puzzles and .pdf
            info("%s: %s from %s" % (puzfn, iso8601(puzdt), upload_src))
            # NOTE(review): summary() fires once per attachment, not per email — confirm intended
            summary("%s puzzles from %s" % (len(email_files), upload_src))
            if len(puzdata) > 1000 and len(puzdata) < 100000:
                email_sources_tsv.append(xd_sources_row(puzfn, upload_src, iso8601(puzdt)))
                outf.write_file(puzfn, puzdata)

        # generate receipt row, send receipt email
        if email_sources_tsv:
            xd_send_email(upload_src, fromaddr='*****@*****.**', subject='Upload successful: %d files received' % len(email_sources_tsv), body="These files were received:\n" + "\n".join(email_sources_tsv))
            sources_tsv += "".join(email_sources_tsv)
        else:
            xd_send_email(upload_src, fromaddr='*****@*****.**', subject='Upload error', body='No puzzle files received')
#!/usr/bin/env python3
# Usage:
#   $0 -o wwwroot/ gxd/redirects.tsv
#
# Emits one HTML redirect page per row of each input redirects.tsv.

from xdfile import html, utils

args = utils.get_args()
outf = utils.open_output()
for tsvfn, raw in utils.find_files(*args.inputs):
    redirect_rows = utils.parse_tsv_data(raw.decode('utf-8'), "Redirect")
    for entry in redirect_rows:
        outf.write_file(entry.SourcePath, html.redirect_page(entry.DestURL))
def main():
    """Generate per-puzzle "deep clue" analysis pages (pub/deep/<xdid>/index.html).

    For every input puzzle, builds an HTML table with, per clue: its grid
    position, prior uses of the same clue, alternative answers seen for it,
    and alternative clues seen for its answer.
    """
    args = utils.get_args(
        'generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()
    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')

    # parse all inputs up front
    xds_todo = []
    for fn, contents in find_files(*args.inputs, ext='.xd'):
        xd = xdfile.xdfile(contents.decode('utf-8'), fn)
        xds_todo.append(xd)

    for mainxd in xds_todo:
        mainxdid = mainxd.xdid()
        progress(mainxdid)

        # NOTE(review): matches/xddates/html_grids are initialized but unused
        # in this visible block — confirm whether downstream code needs them
        matches = metadb.xd_similar(mainxdid)

        xddates = {}
        xddates[mainxdid] = mainxd.date()  # Dict to store XD dates for further sort
        html_grids = {}

        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        # table header row for the deep-clues table
        dcl_html = '<tr>'
        dcl_html += '<th></th>'
        dcl_html += '<th>Clue</th>'
        dcl_html += '<th>ANSWERs</th>'
        dcl_html += '<th>Alt. clue possibilities</th>'
        dcl_html += '</tr>'

        deepcl_html = []  # keep deep clues to parse later - per row
        for pos, mainclue, mainanswer in mainxd.iterclues():
            if not pos:
                continue

            poss_answers = []  # TODO:
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            deepcl_html = []  # Temporary to be replaced late
            mainca = ClueAnswer(mainxdid, mainxd.date(), mainanswer, mainclue)

            # 'grid position' column
            deepcl_html.append('<td class="pos">%s.</td>' % pos)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)
                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # add 'other uses' to clues_html
            deepcl_html.append('<td class="other-uses">')
            prev = prev_uses(pub_uses, mainxd, mainclue)
            if prev:
                # clue seen before: link to its clue page and count it stale
                deepcl_html.append('<a href="/pub/clue/%s">%s [x%s]</a>' % (boil(mainclue), mainclue, len(prev)))
                nstaleclues += 1
            else:
                deepcl_html.append(mainclue)
            deepcl_html.append('</td>')

            # add 'other answers' to clues_html
            deepcl_html.append('<td class="other-answers">')
            deepcl_html.append(
                html_select_options(poss_answers, strmaker=lambda ca: ca.answer, force_top=mainca, add_total=False))
            deepcl_html.append('</td>')

            # add 'other clues' to clues_html
            deepcl_html.append('<td class="other-clues">')
            other_clues = html_other_clues(mainanswer, mainclue, mainxd)
            if other_clues:
                deepcl_html.append(other_clues)
                nstaleanswers += 1
            deepcl_html.append('</td>')  # end 'other-clues'

            ntotalclues += 1
            # Quick and dirty - to be replaced
            dcl_html += '<tr>' + ' '.join(deepcl_html) + '</tr>'

        # Process deepclues: grid rendering + the assembled clue table
        diff_h = '<div class="main-container">'
        diff_h += grid_to_html(mainxd)
        diff_h += mktag('table', 'deepclues') + dcl_html + mktag('/table')
        diff_h += '</div>'

        info('writing deepclues for %s' % mainxdid)
        outf.write_html('pub/deep/%s/index.html' % mainxdid, diff_h, title='Deep clue analysis for ' + mainxdid)
def main():
    """Annotate puzzles with grid-similarity and clue/answer staleness stats.

    For each input .xd not already in gxd/similar, finds similar grids in the
    corpus and counts reused clues/answers, appending one summary row per
    puzzle to gxd/similar.
    """
    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument("-a", "--all", default=False, help="analyze all puzzles, even those already in similar.tsv")
    p.add_argument("-l", "--limit", default=100, help="limit amount of puzzles to be analyzed [default=100]")
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows("gxd/similar")
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode("utf-8"), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv

        """ find similar grids (pct, xd) for the mainxd in the corpus. Takes about 1 second per xd. sorted by pct. """
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20), key=lambda x: x[0], reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers, other possibilities.
        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        # NOTE(review): the stale counters are never incremented in this
        # visible block, so the appended row always reports zeros — confirm
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)
                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [ ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < maindate ]
                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row(
            "gxd/similar",
            [
                mainxd.xdid(),  # xdid
                int(100 * sum(pct / 100.0 for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
                nstaleclues,  # reused_clues
                nstaleanswers,  # reused_answers
                ntotalclues,  # total_clues
                " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids),  # matches
            ],
        )
def test_find_files():
    """find_files() must exclude hidden entries (names starting with '.').

    The previous assertion only inspected the first character of the *full*
    path, so a hidden file nested in a subdirectory (e.g. 'dir/.hidden')
    always passed; check the basename of each yielded path instead.
    """
    import os
    for fullfn, contents in utils.find_files(TEST_DIRECTORY):
        # It should throw out anything starting with '.'
        assert not os.path.basename(fullfn).startswith('.'), fullfn
def main():
    """Annotate puzzles with grid-similarity and clue/answer staleness stats.

    NOTE(review): this appears to duplicate another similarity-annotating
    main() in this file (same logic, different quoting) — candidate for
    deduplication.
    """
    p = utils.args_parser(
        desc="annotate puzzle clues with earliest date used in the corpus")
    p.add_argument(
        '-a',
        '--all',
        default=False,
        help='analyze all puzzles, even those already in similar.tsv')
    p.add_argument('-l', '--limit', default=100, help='limit amount of puzzles to be analyzed [default=100]')
    args = get_args(parser=p)
    outf = open_output()

    num_processed = 0
    prev_similar = metadb.read_rows('gxd/similar')
    for fn, contents in find_files(*args.inputs, ext=".xd"):
        progress(fn)
        mainxd = xdfile(contents.decode('utf-8'), fn)

        if mainxd.xdid() in prev_similar:
            continue  # skip reprocessing .xd that are already in similar.tsv

        """ find similar grids (pct, xd) for the mainxd in the corpus. Takes about 1 second per xd. sorted by pct. """
        similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20), key=lambda x: x[0], reverse=True)

        num_processed += 1
        if num_processed > int(args.limit):
            break

        if similar_grids:
            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids))

        mainpubid = mainxd.publication_id()
        maindate = mainxd.date()

        # go over each clue/answer, find all other uses, other answers, other possibilities.
        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0
        # NOTE(review): the stale counters are never incremented in this
        # visible block, so the appended row always reports zeros — confirm
        for pos, mainclue, mainanswer in mainxd.iterclues():
            progress(mainanswer)

            poss_answers = []
            pub_uses = {}  # [pubid] -> set(ClueAnswer)

            mainca = ClueAnswer(mainpubid, maindate, mainanswer, mainclue)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)
                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [ ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < maindate ]
                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                        uses.append((ca, nuses))

        # summary row to similar.tsv
        metadb.append_row(
            'gxd/similar',
            [
                mainxd.xdid(),  # xdid
                int(100 * sum(
                    pct / 100.0
                    for pct, xd1, xd2 in similar_grids)),  # similar_grid_pct
                nstaleclues,  # reused_clues
                nstaleanswers,  # reused_answers
                ntotalclues,  # total_clues
                " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids)  # matches
            ])