def main():
    p = args_parser("process huge puzzles archive into separate .zip and create sources.tsv")
    p.add_argument("-s", "--source", default=None, help="ExternalSource")
    args = get_args(parser=p)

    outf = open_output()

    if args.source:
        source = args.source
    else:
        source = parse_pathname(args.inputs[0]).base

    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            m = re.match(r"^([a-z]{2,4})[\-0-9]{1}\d.*", parse_pathname(fn).base, flags=re.IGNORECASE)
            prefix = m.group(1).lower() if m else "misc"
            if prefix not in subzips:
                zf = xdfile.utils.OutputZipFile(os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)
            else:
                zf, sources = subzips[prefix]
            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)

            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
Exemple #2
0
def main():
    p = args_parser('catalog source files and create source.tsv')
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    info("importing from %s" % args.source)

    outf = open_output()

    sources = []

    for input_source in args.inputs:
        for fn, contents, dt in find_files_with_time(input_source):
            if len(contents) == 0:
                info("ignoring empty file")
                continue

            outf.write_file(strip_toplevel(fn), contents, dt)

            sources.append(xd_sources_row(fn, args.source or input_source, iso8601(dt)))

    info("%s files cataloged" % len(sources))

    outbase = parse_pathname(args.output).base

    outf.write_file("%s.tsv" % outbase, xd_sources_header + "".join(sources))
    outf.write_file("%s.log" % outbase, get_log())
Exemple #3
0
def main():
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()

    all_clues = load_clues()

    missing_tsv = COLUMN_SEPARATOR.join(
        ["grid_xdid", "clues_pubid", "num_missing"]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header(
            "Author",
            "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)

                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing == 0:
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" %
                         (outfn, nmissing, nmutated))

                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    debug("%s missing %d clues" % (outfn, nmissing))

                    missing_tsv += COLUMN_SEPARATOR.join(
                        [xd.xdid(), pubid, str(nmissing)]) + EOL

            except Exception as e:
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                outf.write_file(
                    parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
Exemple #4
0
def main():
    args = utils.get_args()
    outf = utils.open_output()

    for htmlfn, contents in utils.find_files(*args.inputs):
        basepagename = utils.parse_pathname(htmlfn).base

        wrappeddiv = '<div class="text">' + contents.decode('utf-8') + '</div>'
        outf.write_html('%s/index.html' % basepagename, wrappeddiv)
Exemple #5
0
def main():
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()

    all_clues = load_clues()

    missing_tsv = COLUMN_SEPARATOR.join([ "grid_xdid", "clues_pubid", "num_missing" ]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header("Author", "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)

                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing == 0:
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))

                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    debug("%s missing %d clues" % (outfn, nmissing))

                    missing_tsv += COLUMN_SEPARATOR.join([ xd.xdid(), pubid, str(nmissing) ]) + EOL

            except Exception as e:
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                outf.write_file(parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
Exemple #6
0
def deduce_set_seqnum(xd):
    # look to filename
    base = utils.parse_pathname(xd.filename).base

    # check for date
    dt = utils.parse_date_from_filename(base)  # datetime object
    if dt:
        xd.set_header("Date", dt)
    else:
        # check for number in full path (eltana dir had number)
        m = re.search(r'(\d+)', xd.filename)
        if m:
            xd.set_header("Number", int(m.group(1)))
Exemple #7
0
def deduce_set_seqnum(xd):
    # look to filename
    base = utils.parse_pathname(xd.filename).base

    # check for date
    dt = utils.parse_date_from_filename(base)  # datetime object
    if dt:
        xd.set_header("Date", dt)
    else:
        # check for number in full path (eltana dir had number)
        m = re.search(r'(\d+)', xd.filename)
        if m:
            xd.set_header("Number", int(m.group(1)))
Exemple #8
0
def main():
    p = args_parser(
        'process huge puzzles archive into separate .zip and create sources.tsv'
    )
    p.add_argument('-s', '--source', default=None, help='ExternalSource')
    args = get_args(parser=p)

    outf = open_output()

    if args.source:
        source = args.source
    else:
        source = parse_pathname(args.inputs[0]).base

    subzips = {}

    for inputfn in args.inputs:
        for fn, contents, dt in xdfile.utils.find_files_with_time(inputfn):
            if not contents:
                continue

            m = re.match(r'^([a-z]{2,4})[\-0-9]{1}\d.*',
                         parse_pathname(fn).base,
                         flags=re.IGNORECASE)
            prefix = m.group(1).lower() if m else 'misc'
            if prefix not in subzips:
                zf = xdfile.utils.OutputZipFile(
                    os.path.join(args.output, prefix + ".zip"))
                sources = []
                subzips[prefix] = (zf, sources)
            else:
                zf, sources = subzips[prefix]
            progress("Processing %s -> %s" % (fn, prefix))
            zf.write_file(fn, contents, dt)

            sources.append(xd_sources_row(fn, source, iso8601(dt)))

    for zf, sources in subzips.values():
        zf.write_file("sources.tsv", xd_sources_header + "".join(sources))
Exemple #9
0
def generate_email_files(msg):
    counter = 1
    upload_date = time.mktime(email.utils.parsedate(msg["Date"]))
    for part in msg.walk():
        # multipart/* are just containers
        if part.get_content_maintype() == 'multipart':
            continue
        # Applications should really sanitize the given filename so that an
        # email message can't be used to overwrite important files
        filename = part.get_filename()
        if not filename:
            ext = mimetypes.guess_extension(part.get_content_type())
            if not ext:
                # Use a generic bag-of-bits extension
                ext = '.bin'
            filename = 'part-%03d%s' % (counter, ext)
        counter += 1

        data = part.get_payload(decode=True)
        if parse_pathname(filename).ext == '.zip':
            for zipfn, zipdata, zipdt in generate_zip_files(data):
                yield zipfn, zipdata, zipdt
        else:
            yield filename, data, upload_date
def main():
    args = utils.get_args('generates .html diffs with deep clues for all puzzles in similar.tsv')
    outf = utils.open_output()

    similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')
    xdids_todo = [ parse_pathname(fn).base for fn in args.inputs ]
    if not xdids_todo:
        xdids_todo = [ xdid for xdid, matches in metadb.get_similar_grids().items() if matches ]

    for mainxdid in xdids_todo:
        progress(mainxdid)

        mainxd = xdfile.get_xd(mainxdid)
        if not mainxd:
            continue

        matches = metadb.get_similar_grids().get(mainxdid, [])

        xddates = {}
        xddates[mainxdid] = mainxd.date() # Dict to store XD dates for further sort
        html_grids = {}
        html_clues = {}

        # these are added directly to similar.tsv
        nstaleclues = 0
        nstaleanswers = 0
        ntotalclues = 0

        poss_answers = [] # TODO:
        pub_uses = {}  # [pubid] -> set(ClueAnswer)

        dcl_html = ''
        deepcl_html = [] # keep deep clues to parse later - per row
        for pos, mainclue, mainanswer in mainxd.iterclues():
            deepcl_html = [] # Temporary to be replaced late
            mainca = ClueAnswer(mainxdid, mainxd.date(), mainanswer, mainclue)

            # 'grid position' column
            deepcl_html.append('<td class="pos">%s.</td>' % pos)

            # find other uses of this clue, and other answers, in a single pass
            for clueans in find_clue_variants(mainclue):
                if clueans.answer != mainanswer:
                    poss_answers.append(clueans)

                if clueans.answer == mainanswer:
                    if clueans.pubid in pub_uses:
                        otherpubs = pub_uses[clueans.pubid]
                    else:
                        otherpubs = set()  # set of ClueAnswer
                        pub_uses[clueans.pubid] = otherpubs
                    otherpubs.add(clueans)

            # add 'other uses' to clues_html
            stale = False
            deepcl_html.append('<td class="other-uses">')

            if len(pub_uses) > 0:
                sortable_uses = []
                for pubid, uses in pub_uses.items():
                    # show the earliest unboiled clue
                    for u in sorted(uses, key=lambda x: x.date or ""):
                        # only show those published earlier
                        if u.date and u.date <= mainxd.date():
                            if pubid == mainxdid and u.date == mainxd.date():
                                pass
                            else:
                                stale = True
                                sortable_uses.append((u.date, u, 1))

                deepcl_html.append(html_select([ (clue, nuses) for dt, clue, nuses in sorted(sortable_uses, key=lambda x: x[0], reverse=True) ], top_option=mainclue))

            else:
                deepcl_html.append('<div class="original">%s</div>' % esc(mainclue))

            deepcl_html.append('</td>')

            # add 'other answers' to clues_html

            deepcl_html.append('<td class="other-answers">')
            deepcl_html.append(html_select_options(poss_answers, strmaker=lambda ca: ca.answer, force_top=mainca))
            deepcl_html.append('</td>')

            # add 'other clues' to clues_html
            deepcl_html.append('<td class="other-clues">')

            # bclues is all boiled clues for this particular answer: { [bc] -> #uses }
            bclues = load_answers().get(mainanswer, [])
            stale_answer = False

            if bclues:
                uses = []
                for bc, nuses in bclues.items():
                    # then find all clues besides this one
                    clue_usages = [ ca for ca in load_clues().get(bc, []) if ca.answer == mainanswer and ca.date < mainxd.date() ]

                    if clue_usages:
                        stale_answer = True
                        if nuses > 1:
                            # only use one (the most recent) ClueAnswer per boiled clue
                            # but use the clue only (no xdid)
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1].clue
                        else:
                            ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
                            uses.append((ca, nuses))

                if uses:
                    deepcl_html.append(html_select(uses))

            deepcl_html.append('</td>')  # end 'other-clues'

            if stale_answer:
                nstaleanswers += 1
            if stale:
                nstaleclues += 1
            ntotalclues += 1
            # Quick and dirty - to be replaced
            dcl_html += '<tr>' + ' '.join(deepcl_html) + '</tr>'

        # Store in list to make further formatting as html table easier
        mainxd = xdfile.get_xd(mainxdid)
        if mainxd:
            html_grids[mainxdid] = grid_diff_html(mainxd)

        # Add for main XD
        diff_l = []
        for pos, mainclue, mainanswer in mainxd.iterclues():
            diff_h = mktag('div','fullgrid main') + '%s.&nbsp;' %pos
            diff_h += mainclue
            diff_h += mktag('span', tagclass='main', inner='&nbsp;~&nbsp;' + mainanswer.upper())
            diff_l.append(diff_h)
        html_clues[mainxdid] = diff_l
        # Process for all matches
        for xdid in matches:
            xd = xdfile.get_xd(xdid)
            if not xd:
                continue
            xddates[xdid] = xd.date()
            # output each grid
            html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)
            diff_l = []
            # output comparison of each set of clues
            for pos, clue, answer in xd.iterclues():
                diff_h = mktag('div','fullgrid') + '%s.&nbsp;' %pos
                # Sometimes can return clue == None
                sm = difflib.SequenceMatcher(lambda x: x == ' ', mainxd.get_clue(pos) or '', clue)
                if sm.ratio() < 0.50:
                    diff_h += clue
                else:
                    # Compare based on op codes
                    for opcode in sm.get_opcodes():
                        c, a1, a2, b1, b2 = opcode
                        if c == 'equal':
                            diff_h += '<span class="match">%s</span>' % clue[b1:b2]
                        else:
                            diff_h += '<span class="diff">%s</span>' % clue[b1:b2]
                diff_h += mktag('span', tagclass=(answer == mainxd.get_answer(pos)) and 'match' or 'diff', inner='&nbsp;~&nbsp;' + answer.upper())
                diff_h += mktag('/div')
                diff_l.append(diff_h)
            html_clues[xdid] = diff_l

        # Wrap into table
        diff_h = mktag('table') + mktag('tr')
        # Sort by date
        sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
        for w, dt in sortedkeys:
            # Wrap into table
            diff_h += mktag('td') + html_grids[w] + mktag('/td')
        diff_h += mktag('/tr')
        for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
            diff_h += mktag('tr')
            for w, dt in sortedkeys:
                if i < len(html_clues[w]):
                    diff_h += mktag('td') + html_clues[w][i] + mktag('/td')
            diff_h += mktag('/tr')
        # Process deepclues
        diff_h += mktag('table') + dcl_html + mktag('/table')

        diff_h += mktag('/table')
        outf.write_html('pub/deep/%s/index.html' % mainxdid, diff_h,
                    title='Deep clue comparison for ' + mainxdid)
Exemple #11
0
def main():
    global args
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            #  (so most recent edition gets main slot in case of shelving
            #  conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False), reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s  %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource,InternalSource,SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s  " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s  " % (parsefunc.__name__, str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                    # only add receipt if first time converting this source
                    if already_received:
                        debug("already received %s:%s" % (ExternalSource, SourceFilename))
                    else:
                        receipts.append([
                            CaptureTime,
                            ReceivedTime,
                            ExternalSource,
                            InternalSource,
                            SourceFilename,
                            xdid
                        ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
Exemple #12
0
def main():
    global args
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd':
        [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright',
                   default=None,
                   help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc',
                   default=None,
                   help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc',
                   default=None,
                   help='Value for receipts.InternalSource')
    p.add_argument('--pubid',
                   default=None,
                   help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source,
                                                         ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            #  (so most recent edition gets main slot in case of shelving
            #  conflict)
            for fn, contents, dt in sorted(find_files_with_time(
                    input_source, strip_toplevel=False),
                                           reverse=True,
                                           key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(
                        input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(
                    input_source).filename

                already_received = metadb.check_already_received(
                    ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn(
                            'previously received this same file under multiple xdids:'
                            + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s  %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn),
                                                      ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource, InternalSource,
                                               SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(
                                xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s  " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" %
                                  (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s  " % (parsefunc.__name__,
                                                       str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                    # only add receipt if first time converting this source
                    if already_received:
                        debug("already received %s:%s" %
                              (ExternalSource, SourceFilename))
                    else:
                        receipts.append([
                            CaptureTime, ReceivedTime, ExternalSource,
                            InternalSource, SourceFilename, xdid
                        ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise