Example No. 1
def _get_data(args):
    # build the key range for the requested dates; the trailing ';' sorts just
    # after ':', so every key under the last date's 'news:YYYYMMDD:' prefix is
    # still covered by the range
    key_from = 'news:{}:'.format(args.date_from.replace('-', ''))
    key_to = 'news:{};'.format(args.date_to.replace('-', ''))
    db = datastore.corpus_db()
    for key, doc in db.range(key_from, key_to):
        # skip documents whose URL matches any of the discard patterns
        if not any(uri in doc['url'] for uri in DISCARD_URLS):
            yield doc
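The trailing ':' and ';' in key_from and key_to are a lexicographic range trick: ';' is the ASCII character immediately after ':', so the upper bound still covers every key under the last date's 'news:YYYYMMDD:' prefix. A minimal standalone sketch of the comparison (ordinary string ordering, no datastore needed; the sample keys are made up):

key_from = 'news:20240101:'
key_to = 'news:20240131;'

sample_keys = [
    'news:20231231:a1',  # before the range
    'news:20240101:b2',  # first day, included
    'news:20240131:c3',  # last day, still included thanks to the ';' bound
    'news:20240201:d4',  # after the range
]
# db.range(key_from, key_to) is assumed to behave like this half-open scan
print [k for k in sample_keys if key_from <= k < key_to]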
Example No. 2
def main(args):
    # search corpus index
    db = datastore.corpus_db()
    sconn = xappy.SearchConnection(settings.XAPIAN_DB)

    query = ' '.join(args.query)

    print "Search {} documents for '{}'".format(
        sconn.get_doccount(), args.query
    )

    q = sconn.query_parse(query, default_op=sconn.OP_AND)

    if args.category:
        qc = q.compose(q.OP_OR, [
            sconn.query_field('category', c) for c in args.category
        ])
        q = q & qc

    if args.date:
        qd = q.compose(q.OP_OR, [
            sconn.query_field('date', d) for d in args.date
        ])
        q = q & qd

    if args.date_start and args.date_end:
        qr = sconn.query_range('date', args.date_start, args.date_end)
        q = q.filter(qr)

    if args.sort:
        sortby = [tuple(args.sort.split(','))]
    else:
        sortby = None

    print 'Query: {!r}'.format(q)
    results = execute_query(sconn, q, args.offset, args.limit,
                            getfacets=args.facet,
                            allowfacets=('category',),
                            sortby=sortby)

    if results.estimate_is_exact:
        print "Found {} results".format(results.matches_estimated)
    else:
        print "Found approximately {} results".format(results.matches_estimated)

    for i, result in enumerate(results, 1):
        doc = db[result.id]
        try:
            cat = result.get_terms('category').next()
        except StopIteration:
            cat = 'none'
        try:
            date = result.get_terms('date').next()
        except StopIteration:
            date = 'none'

        print "{:2}. {} -- {} -- {}\n\t{}\n\t{}\n".format(
            i, cat, doc['headline'], date, doc['url'], result.id)

    # drop into an interactive shell to inspect the results
    from IPython import embed; embed()
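main() only reads attributes from args, so the command line it expects can be inferred but not confirmed. A hypothetical argparse setup, reconstructed from those attributes (flag names and defaults are assumptions; the real yatiri CLI may differ):

import argparse

def build_parser():
    # hypothetical parser matching the attributes main() reads above
    parser = argparse.ArgumentParser(description='Search the corpus index')
    parser.add_argument('query', nargs='+', help='free-text query terms')
    parser.add_argument('--category', action='append', default=[])
    parser.add_argument('--date', action='append', default=[])
    parser.add_argument('--date-start')
    parser.add_argument('--date-end')
    parser.add_argument('--sort', help='comma-separated sort spec, split into a tuple')
    parser.add_argument('--facet', action='store_true')
    parser.add_argument('--offset', type=int, default=0)
    parser.add_argument('--limit', type=int, default=20)
    return parser

# e.g. main(build_parser().parse_args(['budget', '--category', 'economy']))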
Example No. 3
File: load.py Project: rmax/yatiri
def load_jsonlines(stream):
    # write all documents in a single batch
    db = datastore.corpus_db()
    n = 0
    with db.write_batch() as wb:
        for n, line in enumerate(stream, 1):
            doc = preprocess(json.loads(line))
            key = get_key(doc)
            wb[key] = doc
    return n
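A hypothetical usage sketch, assuming the corpus is shipped as one JSON document per line (the file name is made up):

# load_jsonlines returns the number of documents written in the batch
with open('corpus.jl') as fp:
    total = load_jsonlines(fp)
print 'loaded {} documents'.format(total)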
Example No. 4
def main(args):
    # index corpus
    db = datastore.corpus_db()
    if args.prefix:
        items = db.range(args.prefix, next_key(args.prefix))
    elif args.from_classified:
        items = get_classified_items(args.from_classified, db)
    else:
        items = db.range()
    count = search.index(items, 'corpus', create=args.create)
    print "Indexed {} documents".format(count)
Example No. 5
def _get_data(category):
    # load documents for a single category from the xapian index
    db = datastore.corpus_db()
    sconn = xappy.SearchConnection(settings.XAPIAN_DB)
    q = sconn.query_field('category', category)

    # page through the matches in blocks of `limit` results
    offset = 0
    limit = 1000
    while True:
        results = sconn.search(q, offset, offset + limit)
        count = 0
        for result in results:
            count += 1
            doc = db[result.id]
            if not any(uri in doc['url'] for uri in DISCARD_URLS):
                yield doc
        if count < limit:
            # last (possibly empty) page reached
            break
        offset += limit
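A hypothetical usage sketch (the category label is made up; DISCARD_URLS, datastore and settings are assumed to be the same module-level names used in the examples above):

# count how many documents in one category survive the URL filter
count = sum(1 for doc in _get_data('economy'))
print '{} documents kept'.format(count)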
Example No. 6
File: load.py Project: rmax/yatiri
def load_csv(stream):
    # read the first row as field names
    fields = csv.reader(stream).next()
    if any(f not in fields for f in REQUIRED_FIELDS):
        raise ValueError(
            "Required fields: {}".format(','.join(REQUIRED_FIELDS))
        )
    reader = csv.DictReader(stream, fields)
    # write all documents in a single batch
    db = datastore.corpus_db()
    n = 0
    with db.write_batch() as wb:
        for n, doc in enumerate(reader, 1):
            doc = preprocess(dict(
                (k, v.decode('utf-8')) for k, v in doc.iteritems()
            ))
            key = get_key(doc)
            wb[key] = doc
    return n
Example No. 7
def load_keys(fromkey, offset, limit):
    # scan keys from `fromkey` up to the next possible key and return one page of documents
    tokey = next_key(fromkey)
    db = datastore.corpus_db()
    it = db.range(fromkey, tokey)
    return [v for k, v in itertools.islice(it, offset, offset + limit)]
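A hypothetical usage sketch, assuming keys follow the same 'news:YYYYMMDD:' layout as the earlier examples (the prefix and page size are made up):

# fetch the second page of 50 documents for a single day
docs = load_keys('news:20240101:', 50, 50)
for doc in docs:
    print doc['headline']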