Beispiel #1
0
                "url": {
                    "rank": DOCUMENTS[res["url"]]["static_rank"]
                }
            }
            yield res


# Command-line dispatch: "--warc" writes the corpus to a WARC file,
# "--index" feeds the corpus to an Indexer, anything else prints the
# usage line and exits with status 1.
if "--warc" in sys.argv:

    # Output goes to <PATH_LOCALDATA>/devindex/crawl.warc
    devindex_dir = os.path.join(config["PATH_LOCALDATA"], "devindex")
    warc_file = os.path.join(devindex_dir, "crawl.warc")

    # Create the target directory on first use
    if not os.path.isdir(devindex_dir):
        os.makedirs(devindex_dir)

    create_warc_from_corpus(generate_corpus(), filename=warc_file)

    print("%s %s" % ("Created WARC file:", warc_file))

elif "--index" in sys.argv:

    indexer = Indexer()

    # With "--empty", indexer.empty() is called before indexing the corpus
    if "--empty" in sys.argv:
        indexer.empty()

    docs = indexer.index_corpus(generate_corpus(), flush=True, refresh=True)
    print("Indexed %s documents." % len(docs))

else:
    print("Usage: python build_devindex.py [--warc | --index]")
    sys.exit(1)
Beispiel #2
0
    parser.add_argument("--save_linkgraph_domains",
                        default=False,
                        type=str,
                        help="Save a linkgraph domain file to this path")

    parser.add_argument("--profile",
                        action='store_true',
                        help="Profile Python usage")

    return parser.parse_args()


# Shared module-level variables used while indexing:
#   args      - parsed command-line options, as returned by get_args()
#   indexer   - the Indexer instance used by this module
#   urlclient - shortcut to the indexer's `urlclient` attribute
args = get_args()
indexer = Indexer()
urlclient = indexer.urlclient


def list_warc_filenames():
    """ Return a list of all indexable WARC files """

    if args.warc_files:
        if args.warc_files.endswith(".txt"):
            with open(args.warc_files, "rb") as f:
                warc_files = [x.strip() for x in f.readlines()]
        else:
            warc_files = [x.strip() for x in args.warc_files.split(",")]

    else:
        warc_files = list_commoncrawl_warc_filenames(limit=args.warc_limit,
Beispiel #3
0
from cosrlib.document import load_document_type
from cosrlib.config import config
from cosrlib.searcher import Searcher
from cosrlib.indexer import Indexer


# Absolute directory of this file (symlinks resolved); the static and
# template folders below are located relative to it.
CURRENT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))

# Flask application named "explainer", with its static files and templates
# taken from the "static" and "templates" folders next to this file.
app = Flask(
    "explainer",
    static_folder=os.path.join(CURRENT_DIRECTORY, "static"),
    template_folder=os.path.join(CURRENT_DIRECTORY, "templates")
)

# Module-level Indexer; connect() is called as soon as the module is loaded.
indexer = Indexer()
indexer.connect()

# Module-level Searcher; connect() is called as soon as the module is loaded.
searcher = Searcher()
searcher.connect()


@app.route('/')
def route_search():
    """ Serve the homepage, which is used for debugging searches """
    # The search template is rendered with an empty `config` mapping.
    template_vars = {"config": {}}
    return render_template("search.html", **template_vars)


@app.route('/url')
def route_url():
    """ URL page, for debugging parsing """
Beispiel #4
0
 def make_client(self):
     """ Create a new Indexer instance and return it """
     client = Indexer()
     return client
Beispiel #5
0
#!/usr/bin/env python

#
# This script empties and recreates the Elasticsearch indexes
#


import sys
import os
sys.path.insert(-1, os.getcwd())

from cosrlib.indexer import Indexer

indexer = Indexer()

# "--delete" on the command line skips the interactive confirmation;
# otherwise only the exact answer "y" goes ahead.
confirmed = "--delete" in sys.argv
if not confirmed:
    confirmed = raw_input("Do you want to delete the current indices and all data? [y/N]") == "y"

if confirmed:
    indexer.empty()
    print("Reset done.")
#!/usr/bin/env python

#
# This script empties and recreates the Elasticsearch indexes
#

import sys
import os

sys.path.insert(-1, os.getcwd())

from cosrlib.indexer import Indexer

indexer = Indexer()

# Ask for confirmation unless "--delete" was passed explicitly; only a
# literal "y" answer counts as consent.
if "--delete" in sys.argv:
    proceed = True
else:
    answer = raw_input(
        "Do you want to delete the current indices and all data? [y/N]")
    proceed = answer == "y"

if proceed:
    indexer.empty()
    print("Reset done.")
Beispiel #7
0
import requests

sys.path.insert(0, os.getcwd())

from cosrlib.document import load_document_type
from cosrlib.config import config
from cosrlib.searcher import Searcher
from cosrlib.indexer import Indexer

# Absolute directory of this file (symlinks resolved); the static and
# template folders below are located relative to it.
CURRENT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))

# Flask application named "explainer", with its static files and templates
# taken from the "static" and "templates" folders next to this file.
app = Flask("explainer",
            static_folder=os.path.join(CURRENT_DIRECTORY, "static"),
            template_folder=os.path.join(CURRENT_DIRECTORY, "templates"))

# Module-level Indexer; connect() is called as soon as the module is loaded.
indexer = Indexer()
indexer.connect()

# Module-level Searcher; connect() is called as soon as the module is loaded.
searcher = Searcher()
searcher.connect()


@app.route('/')
def route_search():
    """ Render the homepage used for debugging searches """
    # An empty dict is handed to the template as its `config` variable.
    page = render_template("search.html", **{"config": {}})
    return page


@app.route('/url')
def route_url():
    """ URL page, for debugging parsing """