def main():
    """Command-line entry point: load the RDF index into a SQLite database.

    Recognised options:
        --dbname    value handed to GutenbergDbCreator
                    (default "gutenberg.db").
        --rdfindex  value handed to parse_rdf_bz2
                    (default "catalog.rdf.bz2").

    After the records have been added, the number of records the filter
    counted as having no title is reported (only when non-zero), and the
    database post-processing steps are run.
    """
    cli = OptionParser(
        description="Parse Gutenberg RDF index file and produce SQLite database.")
    cli.add_option(
        "--dbname",
        action="store",
        dest="db_filename",
        default="gutenberg.db",
        help="The gutenberg.db SQLite database")
    cli.add_option(
        "--rdfindex",
        action="store",
        dest="bz2_rdf_filename",
        default="catalog.rdf.bz2",
        help="The filename for the bzip2 compressed XML RDF index for Project Gutenberg")
    # Positional arguments are parsed but never used.
    options, _ = cli.parse_args()

    db_creator = GutenbergDbCreator(options.db_filename)
    record_filter = GutenbergIndexFilter()
    records = parse_rdf_bz2(options.bz2_rdf_filename, record_filter.filter)
    db_creator.add_many_records(records)

    omitted = record_filter.notitle_count
    if omitted > 0:
        # Single parenthesised argument: prints the same text as the
        # statement form.
        print("Omitted %d records without titles" % omitted)

    # Post-processing steps, run in this fixed order.
    db_creator.create_custom_title_order_index()
    db_creator.compute_author_downloads()
    db_creator.create_additional_indices()
def main():
    """Parse the command line and build the SQLite database from the RDF index.

    Two options are accepted, both storing a filename:
    ``--dbname`` (default "gutenberg.db") and ``--rdfindex``
    (default "catalog.rdf.bz2").
    """
    # (flag, destination attribute, default value, help text)
    option_table = (
        ("--dbname", "db_filename", "gutenberg.db",
         "The gutenberg.db SQLite database"),
        ("--rdfindex", "bz2_rdf_filename", "catalog.rdf.bz2",
         "The filename for the bzip2 compressed XML RDF index for Project Gutenberg"),
    )

    option_parser = OptionParser(
        description="Parse Gutenberg RDF index file and produce SQLite database.")
    for flag, dest, default, help_text in option_table:
        option_parser.add_option(flag, dest=dest, action="store",
                                 default=default, help=help_text)
    # Only the option values are needed; positional arguments are ignored.
    options = option_parser.parse_args()[0]

    database = GutenbergDbCreator(options.db_filename)
    title_filter = GutenbergIndexFilter()
    database.add_many_records(
        parse_rdf_bz2(options.bz2_rdf_filename, title_filter.filter))

    skipped = title_filter.notitle_count
    if skipped > 0:
        # Single parenthesised argument: same output as the statement form.
        print("Omitted %d records without titles" % skipped)

    # Finish the database: title ordering, author download totals, indices.
    database.create_custom_title_order_index()
    database.compute_author_downloads()
    database.create_additional_indices()
def create_gutenberg_index_rdf(bz2_rdf_filename, indexdir):
    """Build whoosh index from parsed RDF.
    DB contents are no longer identical to RDF output. Plus index now stores selected db row ids.
    DEPRECATED"""
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)   # don't buffer stdout

    print "WARNING: direct use of rdf content may not accurately reflect database contents"

    schema = get_schema()
    whoosh_index = create_in(indexdir, schema)
    writer = whoosh_index.writer()
    for count, record in enumerate(gutenberg_rdf_parser.parse_rdf_bz2(bz2_rdf_filename, GutenbergIndexFilter().filter)):
        # Only index fields from description records. File records can be ignored.
        if record['record_type'] == 'DESCRIPTION':
            if count % 5000 == 0:
                print count,
            subset = {k : record[k] for k in schema.names() if k in record}
            writer.add_document(**subset)
    print "committing...",
    writer.commit()
    print "DONE"
# Example #4
# 0
def create_gutenberg_index_rdf(bz2_rdf_filename, indexdir):
    """Build whoosh index from parsed RDF.
    DB contents are no longer identical to RDF output. Plus index now stores selected db row ids.
    DEPRECATED"""
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)  # don't buffer stdout

    print "WARNING: direct use of rdf content may not accurately reflect database contents"

    schema = get_schema()
    whoosh_index = create_in(indexdir, schema)
    writer = whoosh_index.writer()
    for count, record in enumerate(
            gutenberg_rdf_parser.parse_rdf_bz2(bz2_rdf_filename,
                                               GutenbergIndexFilter().filter)):
        # Only index fields from description records. File records can be ignored.
        if record['record_type'] == 'DESCRIPTION':
            if count % 5000 == 0:
                print count,
            subset = {k: record[k] for k in schema.names() if k in record}
            writer.add_document(**subset)
    print "committing...",
    writer.commit()
    print "DONE"