Example #1
    def get(self, request, *args, **kwargs):

        if not ('q' in request.GET and 'dictionary' in request.GET):
            return Response()

        q = u"{0}".format(request.GET['q'])
        dictionary = u"{0}".format(
            request.GET['dictionary'])  # The suggester to use
        manuscript = u"{0}".format(
            request.GET['manuscript']
        )  # Can be '*' when searching through all manuscripts

        connection = solr.Solr(settings.SOLR_SERVER)
        search_handler = solr.SearchHandler(connection, "/suggest")

        # TODO fix solr so that the suggesters work with a context field (cfq)
        # search_results = search_handler(q=q, suggest_dictionary=dictionary, suggest_cfq=manuscript)
        search_results = search_handler(q=q, suggest_dictionary=dictionary)

        results = search_results.suggest[dictionary][q]

        # Remove duplicates from the suggestions and limit the number returned to 10
        results['suggestions'] = self._get_filtered_results(
            results['suggestions'])

        response = Response(results)
        return response
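
For comparison, a minimal standalone sketch of the same /suggest call outside the Django view; the Solr URL and suggester name are placeholders, and the response access mirrors the suggest structure used above.

import solr

# Placeholder core URL and suggester name; adjust for your deployment.
conn = solr.Solr('http://localhost:8983/solr/my_core')
suggest = solr.SearchHandler(conn, '/suggest')

# solrpy sends underscore-separated keyword arguments as dotted Solr parameters,
# so suggest_dictionary goes out on the wire as suggest.dictionary.
resp = suggest(q='cant', suggest_dictionary='my_suggester')
print(resp.suggest['my_suggester']['cant']['suggestions'])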
Example #2
import json
import solr


def first_core_of_connection(solr_connection):
    response = solr.SearchHandler(solr_connection,
                                  "/admin/cores").raw(**{
                                      'action': 'STATUS',
                                      'wt': 'json'
                                  })
    results = json.loads(response)
    core = results['status'].keys().pop()
    return core
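
A hedged usage sketch for the helper above. The base URL is a placeholder, and /admin/cores is assumed to be addressed relative to the Solr root rather than a single core; note also that results['status'].keys().pop() only works on Python 2 (a Python 3 port would need something like list(results['status'])[0]).

import solr

# Placeholder Solr root URL (the CoreAdmin handler lives at the server level).
connection = solr.Solr('http://localhost:8983/solr')
print(first_core_of_connection(connection))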
Example #3
def query_math_docs(query_string, query_format):
    select = solr.SearchHandler(conn, "/select")
    request_string = ("{!mathqueryparser} FORMAT(" + query_format.lower() +
                      ")" + query_string + "&rows=20")
    print request_string
    response = select(request_string)
    if not response:
        print "No results"
    else:
        for hit in response:
            print hit['filename'] + ":" + hit['math_notational_field']
    return response


#add_math_file(1, "arx1312.6708.eq", "/share/math/arxiv_cds/arx1312.6708.eq")
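
Example #3 relies on a module-level conn and a custom {!mathqueryparser} query parser. A hedged sketch of how it might be wired up; the core URL, query string, and format name below are illustrative only:

import solr

# Placeholder URL for a core with the math query parser plugin installed.
conn = solr.Solr('http://localhost:8983/solr/math_core')

# Hypothetical call: the format name and expression are made up for illustration.
query_math_docs('a^2 + b^2 = c^2', 'LATEX')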
Example #4
        return row.value


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Make csv report of indexed collections')
    parser.add_argument('auth_token', help='Authentication token')
    parser.add_argument('--solr_url', help='Solr index url')
    parser.add_argument('--couchdb_url', help='CouchDB url')
    args = parser.parse_args()
    solr_url = args.solr_url if args.solr_url else SOLR_URL
    print "SOLR_URL:{}".format(solr_url)
    SOLR = solr.SearchHandler(
        solr.Solr(
            solr_url,
            post_headers={
                'X-Authentication-Token': args.auth_token,
            },
        ), "/query")
    if args.couchdb_url:
        cdb = get_couchdb(url_couchdb=args.couchdb_url, dbname='ucldc')
    else:
        cdb = get_couchdb(dbname='ucldc')
    collections = get_indexed_collection_list(SOLR)
    date_to_minute = datetime.datetime.now().strftime('%Y%m%d-%H%M')
    fname = 'indexed_collections-{}.csv'.format(date_to_minute)
    with open(fname, 'wb') as csvfile:
        csvwriter = UnicodeWriter(csvfile)
        csvwriter.writerow(
            ('Collection Name', 'Collection URL', 'Number in index',
             'Number in couchdb', 'Number in OAC', 'Couch missing in solr',
Example #5
from __future__ import print_function
import solr
import sys
import re

con = solr.Solr('http://localhost:8983/solr/wikipedia_core')
mlt = solr.SearchHandler(con, "/mlt")

THRESHOLD = 0.9


def process(line):
    # replace some known utf-8 chars with ascii
    line = re.sub("\xe2\x80\x99", "x",
                  line)  # U+2019 (right single quotation mark)
    line = re.sub("\xe2\x80\x93", "-", line)  # U+2013 (EN-DASH)
    # remove the rest of the non-ascii chars
    line = re.sub(r'[^\x00-\x7F]+', ' ', line)

    r = mlt(mlt_fl='page_text', mlt_mindf=1, mlt_mintf=1, stream_body=line)
    return ",".join([
        d['page_name'] for d in r.results
        if d['score'] / r.maxScore >= THRESHOLD
    ]).encode('utf-8')


def main():
    try:
        for line in sys.stdin:
            print(process(line.strip()))
    except:
Example #6
"""
Solr utilities.
"""

import itertools

from invenio.config import CFG_SOLR_URL
from intbitset import intbitset
from invenio.ext.logging import register_exception

if CFG_SOLR_URL:
    import solr
    conn = solr.Solr(CFG_SOLR_URL)
    SOLR_CONNECTION = solr.SolrConnection(CFG_SOLR_URL)  # pylint: disable=E1101
    SOLR_MLT_CONNECTION = solr.SearchHandler(conn, "/mlt")

BOOLEAN_EQUIVALENTS = {"+": "AND", "|": "OR", "-": "NOT"}


def get_collection_filter(hitset, cutoff_amount):
    # The last n hitset records are considered to be the newest and therefore the most relevant
    start_index = len(hitset) - cutoff_amount
    if start_index < 0:
        start_index = 0
    it = itertools.islice(hitset, start_index, None)
    ids = ' '.join([str(recid) for recid in it])

    if ids:
        return 'id:(%s)' % ids
    else:
Example #7
    for word in counts.keys():
        c[word] = int(counts[word]['tf'])

    return c


s = solr.Solr('http://localhost:8983/solr')

#response = s.select('*:*')
#print response

#pdb.set_trace()

field_name = 'includes'

tvrh = solr.SearchHandler(s, '/tvrh')
response = tvrh(q='*:*',
                tv_tf='true',
                tv_fl=field_name,
                fl='none',
                rows=1000)

for x in xrange(0, 1000):
    print x
    tv = response.termVectors
    #del tv['uniqueKeyFieldName']

    response = response.next_batch()

    #print sum( [ get_tf_counter (  field_name, value) for value in  response.termVectors.values() ], Counter() )
Example #8
import solr
import random
import json
import datetime

SOLR_NEW = solr.Solr('http://127.0.0.1:8983/solr/dc-collection')
SOLR = solr.SearchHandler(
    solr.Solr('https://registry.cdlib.org/solr',
              post_headers={'X-Authentication-Token': 'xxxyyyzzz'}),
    "/query")

def get_collection_urls():
    q_collections = SOLR(q="*:*", rows=0, facet_field="collection_url",
                         facet="true", facet_limit=20000)
    facets = q_collections.facet_counts
    f_fields = facets['facet_fields']
    return f_fields['collection_url']

def get_random_docs(collection_urls):
    docs = []
    for u in collection_urls:
        recs_in_coll = SOLR(q="collection_url:{}".format(u))
        num = recs_in_coll.numFound
        sample_size = num / 100 if num / 100 else 1
        print "NUMBER:{} SAMPLE:{}".format(num, sample_size)
        for i in range(sample_size):
            rand_index = random.randrange(num)
            q_rec = SOLR(q="collection_url:{}".format(u), rows=1, start=rand_index)
            #save locally
            doc_new = {}
            for key, val in q_rec.results[0].items():
                if '_ss' in key:
                    continue
Example #9
import sys

py_version = sys.version_info
if py_version.major == 2 and py_version.minor == 7 and py_version.micro > 8:
    #disable ssl verification
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context

import os
from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_by_collection
import solr

SOLR_URL = os.environ.get('URL_SOLR_API', None)
SOLR_API_KEY = os.environ.get('SOLR_API_KEY', None)

SOLR = solr.SearchHandler(
    solr.Solr(
        SOLR_URL,
        post_headers={
            'X-Authentication-Token': SOLR_API_KEY,
        },
    ), "/query")


def fill_object_values_from_solr(doc):
    '''If no object field, try to get from current solr
    '''
    if 'object' not in doc:
        query = 'harvest_id_s:"{}"'.format(doc['_id'])
        msg = "NO OBJECT FOR {}".format(doc['_id'])
        resp = SOLR(
            q=query,
            fields='harvest_id_s, reference_image_md5, id, collection_url, reference_image_dimensions',
Example #10
from django.http import HttpResponse
from django.shortcuts import render_to_response
import datetime
import pprint
import solr
import logging

logging.basicConfig(level=logging.INFO)


solr_url = "http://localhost:8080/discovery"
#solr_url = "http://192.168.45.10:8983/solr/aadda-discovery"

s = solr.Solr(solr_url, debug=True)
s.select = solr.SearchHandler(s, "/select", "__")

# Create your views here.

def search(request):
  q = request.GET.get('q', "*")

  r = s.select(
    q, sort='sentiment_score', sort_order='asc',
    group='true', group__field='domain', group__main='true',
    facet='true', facet__sort='count',
    facet__field=['crawl_year', 'content_type', 'content_ffb'],
    facet__mincount=1, f__crawl_year__facet__mincount=0)

  print dir(r)

  return render_to_response('search_result_list.html', {'numFound': r, 'r': r, 'facets': r.facet_counts['facet_fields']})
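
As Example #10 suggests, the third SearchHandler argument is the separator used when mapping keyword arguments to Solr parameters: with "__", group__field becomes group.field and f__crawl_year__facet__mincount becomes f.crawl_year.facet.mincount, while field names that themselves contain single underscores (crawl_year, content_type) pass through untouched. A minimal sketch of the same idea against a placeholder core:

import solr

# Placeholder core URL; "__" maps double underscores in keyword arguments to
# dots in the request, leaving single underscores inside field names intact.
s = solr.Solr('http://localhost:8983/solr/my_core')
select = solr.SearchHandler(s, '/select', '__')

r = select('*:*', facet='true', facet__field=['crawl_year'], facet__mincount=1)
print(r.numFound)
print(r.facet_counts['facet_fields']['crawl_year'])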