Ejemplo n.º 1
0
def main():
    """Split a huge puzzle archive into per-prefix .zip files, each with its own sources.tsv."""
    parser = args_parser("process huge puzzles archive into separate .zip and create sources.tsv")
    parser.add_argument("-s", "--source", default=None, help="ExternalSource")
    args = get_args(parser=parser)

    outf = open_output()

    # Explicit --source wins; otherwise attribute everything to the first input's basename.
    source = args.source if args.source else parse_pathname(args.inputs[0]).base

    # prefix -> (OutputZipFile, list of sources.tsv rows)
    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            # Bucket each file by a short leading alphabetic publication prefix.
            match = re.match(r"^([a-z]{2,4})[\-0-9]{1}\d.*", parse_pathname(fn).base, flags=re.IGNORECASE)
            prefix = match.group(1).lower() if match else "misc"

            try:
                zf, sources = subzips[prefix]
            except KeyError:
                # First file for this prefix: open its zip lazily.
                zf = xdfile.utils.OutputZipFile(os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)

            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)
            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    # Write a catalog of each zip's contents into the zip itself.
    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
Ejemplo n.º 2
0
def main():
    """Catalog input source files into the output, recording a .tsv manifest and a log."""
    parser = args_parser('catalog source files and create source.tsv')
    parser.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=parser)

    info("importing from %s" % args.source)

    outf = open_output()
    catalog_rows = []

    for input_source in args.inputs:
        for fn, contents, dt in find_files_with_time(input_source):
            # Zero-byte files carry no puzzle data; skip them.
            if len(contents) == 0:
                info("ignoring empty file")
                continue

            outf.write_file(strip_toplevel(fn), contents, dt)
            # Attribute to the explicit --source when given, else to the input path.
            catalog_rows.append(xd_sources_row(fn, args.source or input_source, iso8601(dt)))

    info("%s files cataloged" % len(catalog_rows))

    outbase = parse_pathname(args.output).base
    outf.write_file("%s.tsv" % outbase, xd_sources_header + "".join(catalog_rows))
    outf.write_file("%s.log" % outbase, get_log())
Ejemplo n.º 3
0
def main():
    """Parse downloaded emails, save attached puzzle files, and send receipt emails.

    For each input email: extract puzzle attachments, keep those passing a
    size sanity check, and email the sender a receipt (success or error).
    """
    args = get_args('parse downloaded emails')
    outf = open_output()

    sources_tsv = ''
    for emailfn, emailcontents in find_files(*args.inputs):
        msg = email.message_from_bytes(emailcontents)
        upload_src = msg["From"]

        # No sender means nobody to attribute the upload to (or to reply to).
        if not upload_src:
            continue

        email_sources_tsv = []
        email_files = generate_email_files(msg)
        for puzfn, puzdata, puzdt in email_files:
            info("%s: %s from %s" % (puzfn, iso8601(puzdt), upload_src))

            # a basic sanity check of filesize
            # accommodate small puzzles and .pdf
            if 1000 < len(puzdata) < 100000:
                email_sources_tsv.append(xd_sources_row(puzfn, upload_src, iso8601(puzdt)))
                outf.write_file(puzfn, puzdata)

        # BUG FIX: this summary line was dedented into the middle of the loop
        # above (before a more-deeply-indented statement), which is an
        # IndentationError; it belongs here, after all files are processed.
        summary("%s puzzles from %s" % (len(email_files), upload_src))

        # generate receipt row, send receipt email

        if email_sources_tsv:
            xd_send_email(upload_src,
                    fromaddr='*****@*****.**',
                    subject='Upload successful: %d files received' % len(email_sources_tsv),
                    body="These files were received:\n" + "\n".join(email_sources_tsv))
            sources_tsv += "".join(email_sources_tsv)
        else:
            xd_send_email(upload_src,
                    fromaddr='*****@*****.**',
                    subject='Upload error',
                    body='No puzzle files received')
Ejemplo n.º 4
0
def main():
    """Split a huge puzzle archive into per-prefix .zip files, each with a sources.tsv."""
    p = args_parser(
        'process huge puzzles archive into separate .zip and create sources.tsv'
    )
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    outf = open_output()

    # Explicit --source wins; otherwise use the first input's basename.
    source = args.source or parse_pathname(args.inputs[0]).base

    # Hoisted out of the loop: short alphabetic prefix at the start of a filename.
    prefix_pat = re.compile(r'^([a-z]{2,4})[\-0-9]{1}\d.*', flags=re.IGNORECASE)

    # prefix -> (OutputZipFile, list of sources.tsv rows)
    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            match = prefix_pat.match(parse_pathname(fn).base)
            prefix = match.group(1).lower() if match else 'misc'

            entry = subzips.get(prefix)
            if entry is None:
                # First file with this prefix: open its zip now.
                zf = xdfile.utils.OutputZipFile(
                    os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)
            else:
                zf, sources = entry

            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)
            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    # Each zip carries a catalog of its own contents.
    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
Ejemplo n.º 5
0
def main():
    """Download recent puzzles from every known source since its last successful download.

    Writes downloaded puzzles to the output, a sources.tsv manifest of
    successful downloads, and an updated recent-downloads .tsv on disk.
    """
    p = args_parser('download recent puzzles')
    args = get_args(parser=p)

    outf = open_output()

    today = datetime.date.today()
    todaystr = today.strftime("%Y-%m-%d")

    sources_tsv = ''

    puzzle_sources = xd_puzzle_sources()

    new_recents_tsv = []

    # some downloads may fail, track the last successful ones
    most_recent = {}

    # download new puzzles since most recent download
    for row in metadb.xd_recent_downloads().values():
        pubid = row.pubid
        latest_date = datestr_to_datetime(row.date)

        # by default, keep the previous one
        most_recent[pubid] = row.date

        if pubid not in puzzle_sources:
            warn("unknown puzzle source for '%s', skipping" % pubid)
            continue

        puzsrc = puzzle_sources[pubid]

        if not puzsrc.urlfmt or puzsrc.urlfmt.startswith("#"):
            warn("no source url for '%s', skipping" % pubid)
            continue

        from_date = latest_date
        to_date = today
        dates_to_get = get_dates_between(from_date, to_date, int(puzsrc.freq))
        if not dates_to_get:
            warn("*** %s: nothing to get since %s" % (pubid, from_date))
            continue

        summary("*** %s: downloading %d puzzles from %s to %s" % (pubid, len(dates_to_get), from_date, to_date))

        for dt in sorted(dates_to_get):
            xdid = construct_xdid(pubid, dt)
            url = dt.strftime(puzsrc.urlfmt)
            fn = "%s.%s" % (xdid, puzsrc.ext)

            debug("downloading '%s' from '%s'" % (fn, url))

            try:
                response = urllib.request.urlopen(url)
                content = response.read()
            except (urllib.error.HTTPError, urllib.error.URLError) as err:
                # BUG FIX: URLError has no .code attribute (only HTTPError does);
                # getattr avoids a secondary AttributeError while logging.
                error('%s [%s] %s: %s' % (xdid, getattr(err, 'code', ''), err.reason, url))
                continue
            except Exception as e:
                error(str(e))
                continue

            outf.write_file(fn, content)
            most_recent[pubid] = todaystr

            # BUG FIX: only record a sources row for successful downloads;
            # previously this ran even after a failed fetch.
            sources_tsv += xd_sources_row(fn, url, todaystr)

    for k, v in most_recent.items():
        new_recents_tsv.append(xd_recent_download(k, v))

    if sources_tsv:
        outf.write_file("sources.tsv", xd_sources_header + sources_tsv)

    if new_recents_tsv:
        # on filesystem; context manager ensures the file is flushed and closed
        with open(metadb.RECENT_DOWNLOADS_TSV, "w") as recents_file:
            recents_file.write(xd_recents_header + "".join(sorted(new_recents_tsv)))