Example #1
0
 def test_search_persistent(self):
     """Smoke-test a persistent (keep-alive), GET-based Solr connection by
     repeating the same match-all query ten times."""
     solr = Solr(os.getenv('SOLR_URL'), persistent=True, use_get=True)
     for _ in xrange(10):
         response = solr.search(q='*:*')
         self.assertEqual(response.status, 200)
         # Expected totals are tied to this specific test index's contents.
         self.assertEqual(response.total_results, 4563722)
         self.assertEqual(len(response.documents), 10)
Example #2
0
 def sciencedata(self):
     """Collect science-data collection metadata from the local Solr cores.

     Queries the `collections` core for every collection, counts the
     datasets belonging to each (via self.countDatasets), and returns a
     list of summary dicts sorted by collection name.
     """
     sciencedata_prefix = "https://labcas-dev.jpl.nasa.gov/collections/collections/"
     results = []
     solr_collection = Solr(base_url='http://localhost:8983/solr/collections', version=4)
     solr_dataset = Solr(base_url='http://localhost:8983/solr/datasets', version=4)
     collection_query = {'q': '*:*'}
     collection_response = solr_collection.search(**collection_query)
     for obj in collection_response.documents:
         if obj.get("CollectionName") and obj.get("id"):
           dataset_query = {'q': '*:*', 'fq': "CollectionId='{}'".format(obj.get("id"))}
           dataset_response = solr_dataset.search(**dataset_query)
           datasetcount = self.countDatasets(dataset_response.documents)
           results.append(dict(
               collectionname=obj["CollectionName"],
               description=obj.get("CollectionDescription","None"),
               url=sciencedata_prefix+obj["id"],
               leadpi=obj.get("LeadPI",["None"]),
               organ=obj.get("OrganSite",["No Organ info"]),
               discipline=obj.get("Discipline",["None"]),
               protocol=obj.get("ProtocolId",["None"]),
               qastate=obj.get("QAState", ["None"]),
               species=obj.get("Species", ["None"]),
               datasetcount=datasetcount
           ))
     # Fix: the old `sort(lambda a, b: cmp(...))` comparator form is
     # Python-2-only; a key function sorts strings identically and is
     # portable to Python 3.
     results.sort(key=lambda entry: entry['collectionname'])
     return results
def generate_sitemap(args):
	"""Generate an XML sitemap of all discoverable Solr objects.

	Writes to ``args.output`` when given, otherwise to the default
	production sitemap path.
	"""
	# Bug fix: `stime` was used in the final print but never defined,
	# raising NameError after the sitemap had been built.
	stime = time.time()

	# init sitemap
	sm = Sitemap(changefreq='weekly')

	solr_handle = Solr('http://localhost:8080/solr4/fedobjs')
	query = {'q' : 'rels_isDiscoverable:True', 'fl' : 'id', 'start' : 0}

	# get solr cursor
	cursor = solr_handle.search_cursor(**query)

	# loop through and write to sitemap
	for chunk in cursor.fetch(100):
		for object_id in chunk.documents:
			urladd = "https://digital.library.wayne.edu/item/{object_id}".format(object_id=object_id)
			sm.add(
				urladd,
				lastmod="today"
			)

	# save to disk
	if args.output:
		filename = args.output
	else:
		filename = "/var/www/wsuls/digitalcollections/public/sitemaps/sitemap_https.xml"
	# with-block guarantees the file handle is closed even on error
	with open(filename, "w") as fhand:
		sm.write(fhand)
	print("sitemap created at %s, total time elapsed %s" % (filename, (time.time()-stime) ))
Example #4
0
class LogIndexer(object):
    '''
    classdocs
    '''

    def __init__(self,solrAddr):
        '''
        Constructor
        '''
        self.solr = Solr(solrAddr)

        
    
    def index(self,data):
        for key, value in data.items():
           if isinstance(value,datetime.datetime):
               try:
                   value = solr.core.utc_to_string(value)
               except:
                   pst = tz.gettz('Europe/Paris')
                   value = value.replace(tzinfo=pst)
                   value = solr.core.utc_to_string(value)
                   data[key] = value
                   
        try:
            self.solr.update([data])
        except:
            print "Erreur Index request: "        
        self.solr.commit()
        print "data indexed"
Example #5
0
def query_solr():
    """Parse the JSON body of the current (Flask) request into a Solr query
    string and return the matching documents serialized as JSON."""
    query_string = parse_json(json.loads(request.data))

    # NOTE(review): hard-coded Solr host/core -- consider moving to config.
    solr = Solr("http://52.76.188.127:8983/solr/clickstream_event_shard1_replica1/")
    solr_response = solr.search(q=query_string)

    return json.dumps(solr_response.documents)
 def post_to_solr( self, solr_dict ):
     """ Posts solr_dict to solr. """
     SOLR_ROOT_URL = ( os.environ.get('BELL_I_SOLR_ROOT') )
     solr = Solr( SOLR_ROOT_URL )
     response = solr.update( [solr_dict], 'xml', commit=True )  # 'xml' param converts default json to xml for post; required for our old version of solr
     response_status = response.status
     self.logger.info( 'in tasks.indexer.post_to_solr() [for custom-solr]; accession_number, %s; response_status, %s' % (solr_dict['accession_number_original'], response_status) )
     # non-200 is escalated so the caller/runner records the failure
     if not response_status == 200:
         raise Exception( 'custom-solr post problem logged' )
     return response_status
Example #7
0
 def post_to_solr( self, solr_dict ):
     """ Posts solr_dict to solr.
         Called by update_custom_index_entry() """
     solr = Solr( self.CUSTOM_INDEX_SOLR_URL_ROOT )
     response = solr.update( [solr_dict], 'xml', commit=True )  # 'xml' param converts default json to xml for post; required for our old version of solr
     response_status = response.status
     self.logger.info( 'in tasks.indexer.CustomIndexUpdater.post_to_solr() [for custom-solr]; accession_number, %s; response_status, %s' % (solr_dict['accession_number_original'], response_status) )
     # non-200 is escalated so the caller can track failed posts
     if not response_status == 200:
         raise Exception( 'custom-solr post problem logged' )
     return response_status
Example #8
0
class SolrUtils:
    """Thin convenience wrapper around a mysolr.Solr connection."""

    def __init__(self, url):
        # Keep the raw URL for callers; open the connection once.
        self.url = url
        self.conn = Solr(url)

    def addJSONDoc(self, doc):
        """Queue *doc* for indexing as JSON without committing."""
        self.conn.update(doc, 'json', commit=False)

    def commit(self):
        """Flush any pending updates to the index."""
        self.conn.commit()
Example #9
0
class SolrUtils:
	"""Thin wrapper around a mysolr.Solr connection (tab-indented twin of
	the SolrUtils defined elsewhere in this file)."""

	def __init__(self, url):
		# Keep the raw URL for callers; open the connection once.
		self.url = url
		self.conn = Solr(url)

	def addJSONDoc(self, doc):
		"""Queue *doc* for indexing as JSON without committing."""
		self.conn.update(doc, 'json', commit=False)

	def commit(self):
		"""Flush any pending updates to the index."""
		self.conn.commit()
 def delete_item( self, pid ):
     """ Deletes item from custom bell index.
         Called by one_offs.rebuild_custom_index(). """
     SOLR_ROOT_URL = ( os.environ.get('BELL_I_SOLR_ROOT') )
     self.logger.info( 'in tasks.indexer.delete_item() [for custom-solr]; SOLR_ROOT_URL, %s' % SOLR_ROOT_URL )
     solr = Solr( SOLR_ROOT_URL )
     # delete by exact pid; commit immediately so the removal is visible
     response = solr.delete_by_query( 'pid:"%s"' % pid, commit=True )
     response_status = response.status
     self.logger.info( 'in tasks.indexer.delete_item() [for custom-solr]; pid, %s; response_status, %s' % (pid, response_status) )
     # non-200 is escalated so callers can track failed deletions
     if not response_status == 200:
         raise Exception( 'custom-solr delete problem logged' )
     return response_status
Example #11
0
File: plugin.py Project: RAPD/RAPD
 def solr_search(self, query):
     """Do the solr search and pass back results"""
     output_dict = {}
     # Setup connections
     solr = Solr(self.server,version=4)
     #UNLIMITED_ROWS = 10000000 # necessary because default in mysolr is mere 10
     # Run the search
     search_results = solr.search(**query)
     # Format results
     # NOTE(review): assumes every document carries 'pdb_id' and a
     # non-empty 'molecule_name' list -- a missing field would raise here.
     for pdb in search_results.documents:
         output_dict[pdb.get('pdb_id').upper()] = {'description': pdb.get('molecule_name')[0]}
     return output_dict
Example #12
0
def getSingleObjects(id_list, start):	
	smCount = 1
	tcount = 0	
	solr = Solr('http://localhost:8080/solr4/fedobjs')
	query = {'q' : 'rels_isDiscoverable:True', 'fl' : 'id', 'rows' : 50000, 'start' : 0}
	response = solr.search(**query)
	print "Num Results:",response.total_results
	for each in response.documents:
		# print "adding:",each['id']
		id_list.append(each['id'])		
		tcount+=1
	print "Writing",tcount,"results..."
	writeSitemapXML(id_list, smCount)
Example #13
0
 def delete_target_custom_solr_pids( self ):
     """ Deletes each pid listed in the deletion-source JSON file from the
         custom Solr index, recording every response in the tracker. """
     ## load pids to be deleted
     with open( self.PIDS_TO_DELETE_SOURCE_DATA_JSON_PATH ) as f:
         deletion_pid_lst = json.loads( f.read() )
     ## one connection for the whole loop (was rebuilt on every iteration)
     solr = Solr( self.CUSTOM_INDEX_SOLR_URL_ROOT )
     ## run deletion loop, tracking along way
     for pid in deletion_pid_lst:
         response = solr.delete_by_query( 'pid:"%s"' % pid, commit=True )
         response_status = response.status
         self.update_tracker( pid, response_status )
         if not response_status == 200:
             ## bug fix: the kwarg was named `resp` while the template used
             ## `{response}`, so logging raised KeyError instead of logging
             logger.error( 'custom-solr delete problem-response for pid `{pid}`: ```{response}```'.format(pid=pid, response=response_status) )
     return
Example #14
0
 def __init__(self, context, request):
     """Store the push context/request, reset counters, and open the Solr
     connection configured via the ``push.solr_uri`` registry setting."""
     self.context = context
     self.request = request
     self.create_count = 0
     self.update_count = 0
     self.messages = []
     self.to_index = []
     solr_uri = request.registry.settings.get('push.solr_uri', None)
     if solr_uri is None:
         raise AttributeError(u'A push.solr_uri is required')
     # XXX: We are importing solr here to be able to mock it in the tests
     from mysolr import Solr
     self.solr = Solr(solr_uri)
     self.shared = context.shared
Example #15
0
 def solr_search(self, query):
     """Do the solr search and pass back results"""
     output_dict = {}
     # Setup connections
     solr = Solr(self.server, version=4)
     #UNLIMITED_ROWS = 10000000 # necessary because default in mysolr is mere 10
     # Run the search
     search_results = solr.search(**query)
     # Format results
     # NOTE(review): assumes 'pdb_id' is always present and
     # 'molecule_name' is a non-empty list; a miss would raise here.
     for pdb in search_results.documents:
         output_dict[pdb.get('pdb_id').upper()] = {
             'description': pdb.get('molecule_name')[0]
         }
     return output_dict
Example #16
0
    def _readLabcasSolr(self, labcasurl, labcas_sourceurl_prefix):
        u'''Fetch every document from the LabCAS Solr core at ``labcasurl``
        and return a dict keyed by document id. Each document is annotated
        with a ``sourceurl`` built from ``labcas_sourceurl_prefix`` plus
        its id before being stored.'''
        connection = Solr(base_url=labcasurl, version=4)
        # Match-all query: pull back the entire core.
        matches = connection.search(q='*:*')
        results = {}
        for document in matches.documents:
            document['sourceurl'] = labcas_sourceurl_prefix + document.get("id")
            results[document.get("id")] = document
        return results
Example #17
0
class VIVOService(object):
    """Autocomplete-style lookup against a VIVO Solr index."""

    def __init__(self):
        from mysolr import Solr
        surl = get_env('SOLR_URL')
        self.solr = Solr(surl)

    def get(self, query, class_type):
        """Search `class_type` entities matching `query` and return dicts
        shaped for a select2-style widget (uri/id/text)."""
        out = []
        #Will use acNameStemmed for now.  Can construct a more intelligent query
        #later if necessary.
        # NOTE: the `query` parameter is shadowed by this params dict.
        query = {
            'q': u'acNameStemmed:{0} type:{1}'.format(query, class_type),
            'fl': 'URI,nameRaw,PREFERRED_TITLE',
            'rows': 20
        }
        response = self.solr.search(**query)
        #Massage the Solr response.
        for doc in response.documents:
            d = {}
            d['uri'] = doc['URI']
            d['id'] = doc['URI']
            d['text'] = "{} - {}".format(
                doc['nameRaw'][0],
                doc['PREFERRED_TITLE'][0]
            )
            out.append(d)
        return out
Example #18
0
 def __init__(self,
              exit_on_error=True,
              solr_host=settings.SOLR_HOST,
              solr_port=settings.SOLR_PORT,
              solr_collection=settings.SOLR_COLLECTION):
     """Build the Solr connection string from host/port/collection settings
     and connect, optionally exiting the process on connection failure."""
     if 'http://' not in solr_host and 'https://' not in solr_host:
         # forgiving of configurations
         solr_host = 'http://' + solr_host
     self.session = requests.Session()
     if len(solr_collection) > 1:
         solr_collection = '/' + solr_collection
     if solr_port == 80:
         # default HTTP port: omit the explicit port in the URL
         solr_connection_string = solr_host \
             + '/solr' + solr_collection
     else:
         solr_connection_string = solr_host + ':' + str(solr_port) \
             + '/solr' + solr_collection
     try:
         # print(solr_connection_string)
         self.connection = Solr(solr_connection_string,
                                make_request=self.session,
                                version=4)
     except requests.ConnectionError:
         print('\nError: Could not connect to Solr at: ' + solr_connection_string +\
               '\nPlease verify your Solr instance and configuration.\n')
         if exit_on_error:
             sys.exit(1)
         else:
             # callers must check for a falsy `connection` in this mode
             self.connection = False
class call_number_app(object):
    """Client helper for the Aristotle Library Apps call-number app."""

    def __init__(self,**kwargs):
        """
        The `call_number_app` takes a number of optional parameters
        including an URL where the Aristotle Library Apps instance
        is currently running.

        :param url: URL of Aristotle Library Apps path to the call 
                    number app, defaults to 
                    http://0.0.0.0/apps/call_number/json/.
        """
        # NOTE(review): has_key() is Python-2-only.
        if kwargs.has_key("url"):
            self.call_number_url = kwargs.get("url")
        else:
            self.call_number_url = "http://0.0.0.0/apps/call_number/json/"
        self.solr = Solr(base_url=settings.SOLR_URL)

    def json_search(self,request):
        """
        Performs a call number search using JSON interface to the call 
        number app. Results are returned as JSON.

        :param request: Django request
        """
        call_number = request.REQUEST.get('q')
        if request.REQUEST.has_key("number_type"):
            number_type = request.REQUEST.get('number_type')
        else:
            number_type = 'lccn'
        context = {'docs':None}
        json_search_url = os.path.join(self.call_number_url,
                                       'term_search')
        json_search_url = "{0}?call_number={1}&slice-size={2}&type={3}".format(json_search_url,
                                                                               call_number.strip(),
                                                                               int(settings.ITEMS_PER_PAGE) - 3,
                                                                               number_type)

        # NOTE(review): the URL is fetched twice and `json_results` is
        # never used -- the second urlopen does the real work.
        json_results = urllib2.urlopen(json_search_url).read()
        results = json.load(urllib2.urlopen(json_search_url))
        if len(results.get("bib_numbers")) > 0:
            context['docs'] = []
            for bib_num in results.get("bib_numbers"):
                query = {"q":bib_num,
                         "qt":"dismax",
                         "fl":"*"}
                response = self.solr.search(**query)
                for doc in response.documents:
                    context["docs"].append(doc)
            # Iterate through and create record_urls
            for doc in context['docs']:
                doc['record_url'] = settings.CATALOG_RECORD_URL.format(doc['id'])
        context['current_sort'] = None
        context['sorts'] = [x[0] for x in settings.SORTS]
        context['start_number'] = 1
        # NOTE(review): min() of a dict (`results`) and an int looks wrong;
        # probably meant a result count -- confirm intent.
        context['end_number'] = min(results,
                                    settings.ITEMS_PER_PAGE)
        return context
Example #20
0
    def run(self):
        """Index the tab-separated input file into Solr in batches of 10,
        using each row's URL as the document id."""
        df = pd.read_csv(self.input().open('r'), sep='\t')
        df['id'] = df['url']

        # NOTE(review): 'SOLR_HOST' is a literal string, not a variable --
        # presumably a placeholder; confirm the real host is substituted.
        solr = Solr('SOLR_HOST')

        # Index 10 docs at a time
        start = 0
        increment = 10
        while len(df[start:start + increment]) > 0:
            sliced = df[start:start + increment]
            docs = []
            for index, row in sliced.iterrows():
                # Round-trip through JSON to get plain dicts for mysolr.
                doc = json.loads(row.to_json())
                docs.append(doc)

            solr.update(docs, 'json')
            if start % 1000 == 0:
                # Just to see that is working
                print start
            start += increment
Example #21
0
def delete_items(context, request):
    """Delete the given items from the index
    """
    # If the request isn't an RSS feed, bail out
    if request.content_type not in ALLOWED_CONTENT:
        body_msg = (
            "The content-type of the request must be one of the "
            "following: %s"
        ) % ", ".join(ALLOWED_CONTENT)
        return HTTPBadRequest(body=body_msg)
    solr_uri = request.registry.settings.get('push.solr_uri', None)
    if solr_uri is None:
        raise AttributeError(u'A push.solr_uri is required')
    # XXX: We are importing solr here to be able to mock it in the tests
    from mysolr import Solr
    solr = Solr(solr_uri)
    shared_content = feedparser.parse(request.body)
    missing = []
    removed = 0
    for item in shared_content.entries:
        uid = item['id']
        uid = normalize_uid(uid)
        logger.debug('Deleting %s' % uid)
        if uid not in context.shared:
            # Unknown locally, but still purge it from Solr.
            missing.append(uid)
            solr.delete_by_key(uid)
            continue
        del context.shared[uid]
        solr.delete_by_key(uid)
        removed += 1
    body_msg = "Removed %s items." % removed
    if missing:
        msg_str = " %s items could not be found for deletion: %s"
        args = (len(missing), ', '.join(missing))
        msg = msg_str % args
        logger.warn(msg)
        body_msg += msg
    return HTTPOk(body=body_msg)
Example #22
0
 def __init__(self, context, request):
     """Store the push context/request, reset counters, and open the Solr
     connection configured via the ``push.solr_uri`` registry setting."""
     self.context = context
     self.request = request
     self.create_count = 0
     self.update_count = 0
     self.messages = []
     self.to_index = []
     solr_uri = request.registry.settings.get('push.solr_uri', None)
     if solr_uri is None:
         raise AttributeError(u'A push.solr_uri is required')
     # XXX: We are importing solr here to be able to mock it in the tests
     from mysolr import Solr
     self.solr = Solr(solr_uri)
     self.shared = context.shared
Example #23
0
def atomicUpdate(chunkFile, solrURL):
    """Read document ids (one per line) from *chunkFile* and send a Solr
    atomic update setting dataSource_s_md='ice' on each, in one commit."""

    session = requests.Session()
    solr = Solr(solrURL, make_request=session, version=4)

    bufferDocs = []

    with open(chunkFile, 'r') as inF:
        for docID in inF:
            docID = docID.strip()

            # Atomic-update syntax: {"set": ...} replaces only this field.
            delta_update = { "id": docID,
                              "dataSource_s_md": {"set": "ice"} } ## Caution change this value

            bufferDocs.append(delta_update)


    x = solr.update(bufferDocs, commit=True)

    # Solr reports success as status 0 in the response header.
    if x.raw_content['responseHeader']['status'] != 0:
        print "Solr Commit Failed !!!! Error Status code: ", x.raw_content['responseHeader']['status']
    else:
        print "Awesome!! Solr Commit was a Success"
Example #24
0
def update_deletions(context, request):
    """Receive a UID from the request vars and remove the associated
    object from the deleted feed.
    """
    uid = request.POST.get('uid')
    if not uid:
        # Nothing to do; implicitly returns None (no HTTP response body).
        return
    solr_uri = request.registry.settings.get('push.solr_uri', None)
    if solr_uri is None:
        raise AttributeError(u'A push.solr_uri is required')
    # Imported here so tests can mock mysolr.
    from mysolr import Solr
    solr = Solr(solr_uri)
    logger.debug('Remove deleted status')
    remove_deleted_status(uid, context.shared, solr)
    return HTTPOk(body="Item no longer marked as deleted")
    def __init__(self,**kwargs):
        """
        The `title_search_app` takes a number of optional parameters
        including an URL where the Aristotle Library Apps instance
        is currently running.

        :param url: URL of Aristotle Library Apps path to the title
                    search app, defaults to
                    http://0.0.0.0/apps/title_search/search.
        """
        # dict.get() with a default replaces the Python-2-only has_key()
        # check; behavior is identical. (The docstring previously described
        # the call_number default by mistake.)
        self.url = kwargs.get("url", "http://0.0.0.0/apps/title_search/search")
        self.solr = Solr(base_url=settings.SOLR_URL)
Example #26
0
def delete_items(context, request):
    """Delete the given items from the index
    """
    # If the request isn't an RSS feed, bail out
    if request.content_type not in ALLOWED_CONTENT:
        body_msg = (
            "The content-type of the request must be one of the "
            "following: %s"
        ) % ", ".join(ALLOWED_CONTENT)
        return HTTPBadRequest(body=body_msg)
    solr_uri = request.registry.settings.get('push.solr_uri', None)
    if solr_uri is None:
        raise AttributeError(u'A push.solr_uri is required')
    # XXX: We are importing solr here to be able to mock it in the tests
    from mysolr import Solr
    solr = Solr(solr_uri)
    shared_content = feedparser.parse(request.body)
    missing = []
    removed = 0
    for item in shared_content.entries:
        uid = item['id']
        uid = normalize_uid(uid)
        logger.debug('Deleting %s' % uid)
        if uid not in context.shared:
            # Unknown locally, but still purge it from Solr.
            missing.append(uid)
            solr.delete_by_key(uid)
            continue
        del context.shared[uid]
        solr.delete_by_key(uid)
        removed += 1
    body_msg = "Removed %s items." % removed
    if missing:
        msg_str = " %s items could not be found for deletion: %s"
        args = (len(missing), ', '.join(missing))
        msg = msg_str % args
        logger.warn(msg)
        body_msg += msg
    return HTTPOk(body=body_msg)
from mysolr import Solr

# Connection to a local Solr core on port 8983
solr = Solr("http://localhost:8983/solr/barcore")

# All solr params are supported!
query = {'q' : '*:*', 'facet' : 'true', 'facet.field' : 'zip'}
response = solr.search(**query)

# do stuff with documents
for document in response.documents:
    # modify field 'rating'
    document['rating'] = 2.0

# update index with modified documents
solr.update(response.documents, commit=True)
Example #28
0
 def setUp(self):
     # Fresh connection to the local test Solr before every test.
     self.solr = Solr('http://localhost:8983/solr')
Example #29
0
class QueryResultTestCase(unittest.TestCase):
    """Integration tests exercising mysolr against a local Solr index
    seeded with exactly four documents."""

    def setUp(self):
        self.solr = Solr('http://localhost:8983/solr')

    def test_search(self):
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        # Page size 1 over 4 docs -> four fetches.
        cursor = self.solr.search_cursor(q='*:*')
        i = 0
        for response in cursor.fetch(1):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 4)

        # Page size 4 over 4 docs -> a single fetch.
        cursor = self.solr.search_cursor(q='*:*')
        i = 0
        for response in cursor.fetch(4):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 1)

    def test_commit(self):
        response = self.solr.commit()
        self.assertEqual(response.status, 200)

    def test_optimize(self):
        response = self.solr.optimize()
        self.assertEqual(response.status, 200)

    def test_ping(self):
        response = self.solr.ping()
        self.assertEqual(response.status, 200)

    def test_is_up(self):
        response = self.solr.is_up()
        self.assertEqual(response, True)

    def test_update_delete(self):
        # Get total results
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        total_results = response.total_results
        # Post one document using json
        documents = [{'id': 1}]
        response = self.solr.update(documents, input_type='json')
        self.assertEqual(response.status, 200)
        # Post another document using xml
        documents = [{'id': 2}]
        response = self.solr.update(documents, input_type='xml')
        self.assertEqual(response.status, 200)
        # Compare total results
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results + 2)

        # Now delete the two document posted above
        query = 'id:1'
        key = 2
        response = self.solr.delete_by_query(query)
        self.assertEqual(response.status, 200)
        response = self.solr.delete_by_key(key)
        self.assertEqual(response.status, 200)
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results)

    def tearDown(self):
        # No per-test cleanup needed.
        pass

    def test_query(self):
        # Placeholder; intentionally empty.
        pass
Example #30
0
class UpdateItems(object):
    """Create a new SharedItem or update it if it already exists.
    This will find all the entries, then create / update them. Then
    do a batch index to Solr.
    """

    def __init__(self, context, request):
        self.context = context
        self.request = request
        self.create_count = 0
        self.update_count = 0
        self.messages = []
        self.to_index = []
        solr_uri = request.registry.settings.get('push.solr_uri', None)
        if solr_uri is None:
            raise AttributeError(u'A push.solr_uri is required')
        # XXX: We are importing solr here to be able to mock it in the tests
        from mysolr import Solr
        self.solr = Solr(solr_uri)
        self.shared = context.shared

    def __call__(self):
        #  If the request isn't an RSS feed, bail out
        if self.request.content_type not in ALLOWED_CONTENT:
            body_msg = (
                "The content-type of the request must be one of the "
                "following: %s"
            ) % ", ".join(ALLOWED_CONTENT)
            return HTTPBadRequest(body=body_msg)
        # Create / update
        self._process_items()
        # Index in Solr
        self._update_index()
        # Return a 200 with details on what happened in the body
        self.messages.append("%s items created." % self.create_count)
        self.messages.append("%s items updated." % self.update_count)
        return HTTPOk(body=" ".join(self.messages))

    def _process_items(self):
        """Get a list of new items to create and existing items that
        need to be updated.
        """
        shared_content = feedparser.parse(self.request.body)
        for item in shared_content.entries:
            uid = item['id']
            # Get the uid, minus the urn:syndication bit
            item['uid'] = uid = normalize_uid(uid)
            logger.info('Processing item %s' % uid)
            item['link'] = item.link
            item['feed_link'] = shared_content.feed.link
            if uid in self.shared:
                self._update_item(item)
            else:
                self._create_item(item)

    def _create_item(self, entry):
        """Create new items in the feed
        """
        new_item = SharedItem()
        uid = entry['uid']
        logger.info('Creating item %s' % uid)
        new_item.update_from_entry(entry)
        # XXX: Should name and parent be necessary here? Shouldn't
        #      the `add` method do that for us?
        new_item.__name__ = uid
        new_item.__parent__ = self.shared
        self.shared.add(uid, new_item)
        self.to_index.append(self.shared[uid])
        self.create_count += 1

    def _update_item(self, entry):
        """Update existing items in the db using their UID
        """
        uid = entry['uid']
        logger.info('Updating item %s' % uid)
        obj = self.shared[uid]
        # XXX: these aren't coming from the object. Why is that? Is
        #      the `add` method on the folder not setting them?
        obj.__name__ = uid
        obj.__parent__ = self.shared
        selected_or_shared = (
            'selected' in entry['feed_link'] or
            'shared' in entry['feed_link']
        )
        if selected_or_shared and hasattr(obj, 'deletion_type'):
            # A re-shared item loses its "deleted" marker.
            remove_deleted_status(uid, self.shared, self.solr)
        obj.update_from_entry(entry)
        self.to_index.append(obj)
        self.update_count += 1

    def _update_index(self):
        """Clean up the item dictionaries to contain only items that
        are valid and send them over to Solr for indexing.

        NOTE: Solr may error out on index if it receives a field it is
              not aware of. We should change this code to look up the
              Solr schema, and remove attributes that it doesn't know,
              like __name__ and __parent__ below.
        """
        logger.debug('Updating index for %s objects' % len(self.to_index))
        cleaned = []
        ignored_attrs = [
            '__name__',
            '__parent__',
            'deletion_type',
        ]
        for item in self.to_index:
            item_dict = copy.deepcopy(item.__dict__)
            if 'Modified' in item_dict:
                if hasattr(item_dict['Modified'], 'isoformat'):
                    mod_date = item_dict['Modified'].isoformat()
                else:
                    mod_date = item_dict['Modified']
                # Make sure the date is acceptable to Solr, strip off
                # the +00:00 and replace it with a Z
                item_dict['Modified'] = "%sZ" % mod_date[:-6]
            item_dict['uid'] = item_dict['__name__']
            # XXX: Need to look up the schema, then modify the dict
            #      based on that.
            for attr in ignored_attrs:
                item_dict.pop(attr, '')
            cleaned.append(item_dict)
        # XXX: Need to handle Solr errors here
        response = self.solr.update(cleaned)
        return response
Example #31
0
import sys

database = 'fashion_ip'
collection = 'docs'

# Make a connection to Mongo.
try:
    db_conn = Connection("localhost")
    # db_conn = Connection("emo2.trinity.duke.edu", 27017)
except ConnectionFailure:
    print "couldn't connect: be sure that Mongo is running on localhost:27017"
    sys.exit(1)

db = db_conn[database]

solr = Solr('http://emo2.trinity.duke.edu:8080/solr/')

# Example term-vector query kept for reference: the raw Solr term-vector
# response interleaves keys and values in flat lists, as sketched below.
# query = {'q':'*:*','fl':'_id','tv.tf':'true','qt':'tvrh','rows':10,'start':0}
# response = solr.search(**query)
# tv = response.raw_response['termVectors']
# tv[0] == 'warnings'
# tv[1] == [...]

# tv[2] == 'doc-0'
# tv[3] == [...]
# tv[3][0] == 'uniqueKey'
# tv[3][1] == '4f406d8347b2301618000000'
# tv[3][2] == 'content'
# tv[3][3] == ['1', ['tf', 2], '151', ['tf', 1], '157', ['tf', 1], '182', ['tf', 1], '186', ['tf', 2], ...
# tv[4] == 'uniqueKeyFieldName'
# tv[5] == '_id'
Example #32
0
 def __init__(self, urls, config, version=4):
     # NOTE(review): `config` is accepted but unused here.
     self.cursor = Solr(urls, version=version)
Example #33
0
from pymongo.errors import ConnectionFailure
# import solr
from mysolr import Solr

# Make a connection to Mongo.
try:
    db_conn = Connection()
    # db_conn = Connection("emo2.trinity.duke.edu", 27017)
except ConnectionFailure:
    print "couldn't connect: be sure that Mongo is running on localhost:27017"
    sys.exit(1)

db = db_conn['fashion_ip']

# create a connection to a solr server
solr = Solr('http://localhost:8080/solr')

# DELETE ALL DOCS FIRST!!  (wipes the whole index before re-importing)
solr.delete_by_query(query='*:*', commit=True)

total_docs = db.docs.find().count()
count = 0
documents = []
for doc in db.docs.find({}, {
        '_id': True,
        'year': True,
        'court': True,
        'court_level': True,
        'url': True,
        'name': True,
Example #34
0
from mysolr import Solr
import requests

import localConfig

# set connection through requests
# Sharing a requests.Session enables keep-alive connection reuse.
session = requests.Session()
solr_handle = Solr(localConfig.solr_URL, make_request=session)

Example #35
0
from mysolr import Solr

# Connection to a local Solr core on port 8983
solr = Solr("http://localhost:8983/solr/barcore")

# All solr params are supported!
query = {'q': '*:*', 'facet': 'true', 'facet.field': 'zip'}
response = solr.search(**query)

# do stuff with documents
for document in response.documents:
    # modify field 'rating'
    document['rating'] = 2.0

# update index with modified documents
solr.update(response.documents, commit=True)
#!/usr/bin/env python

import sys
import os
import json
from mysolr import Solr

# Public PDBe search endpoint (Solr-backed).
PDBE_SOLR_URL = "http://www.ebi.ac.uk/pdbe/search/pdb"
solr = Solr(PDBE_SOLR_URL)

# True on Python 3; selects the right urllib import below.
PY3 = sys.version > '3'

if PY3:
    import urllib.request as urllib2
else:
    import urllib2

SERVER_URL = "https://www.ebi.ac.uk/pdbe/api"


def join_with_AND(query_params):
    '''Build a Solr query string by AND-joining each key:value pair.'''
    clauses = []
    for field, term in query_params.items():
        clauses.append("%s:%s" % (field, term))
    return " AND ".join(clauses)


def execute_solr_query(query, query_fields):
    '''convenience function'''
    query["q"] = join_with_AND(query_fields)  # add q
    response = solr.search(**query)
    documents = response.documents
    print("Found %d matching entities in %d entries." %
Example #37
0
from flask import Flask, request, session, g, redirect, url_for, abort, render_template, flash
import sqlite3
import pdb
from mysolr import Solr
import requests
from contextlib import closing
from flask.ext.sqlalchemy import SQLAlchemy

# configuration must have the full path
DATABASE = 'c:/Users/Alicia/PycharmProjects/WorldValues/worldvalues.db'
DEBUG = True
SECRET_KEY = 'development key'
USERNAME = '******'
PASSWORD = '******'
# FIX: the previous URL ('http://localhost:8983/solr/#/collection1') pointed at
# the browser admin-UI route; the '#...' fragment is never sent to the server,
# so every request silently targeted the bare /solr/ root. Use the core's
# actual REST base URL instead.
solr = Solr('http://localhost:8983/solr/collection1')

app = Flask(__name__)
app.debug = True
app.config.from_object(__name__)


def connect_db():
    """Open a new SQLite connection to the configured database file."""
    return sqlite3.connect(app.config['DATABASE'])


@app.before_request
def before_request():
    """Attach a fresh database connection to flask.g for each request."""
    g.db = connect_db()


@app.teardown_request
Example #38
0
#User Guide
#Connecting to Solr

#Use mysolr.Solr object to connect to a Solr instance.

from mysolr import Solr

# Default connection. Connecting to http://localhost:8080/solr/
solr = Solr()

# Custom connection
solr = Solr('http://foo.bar:9090/solr/')

# If the server is secured with HTTP basic authentication you can connect by using auth parameter.
from mysolr import Solr

solr = Solr(auth=('admin', 'admin'))

#Further information about auth parameter in requests docs
#Querying Solr

#Making a query to Solr is very easy, just call search method with your query.
from mysolr import Solr

solr = Solr()
# Search for all documents
response = solr.search(q='*:*')
# Get documents
documents = response.documents

#Besides, all available Solr query params are supported. So making a query using pagination would be as simple as
Example #39
0
def query_solr(query):
    """Search the default local Solr instance and return the matching documents."""
    return Solr().search(q=query).documents
Example #40
0
from mysolr import Solr
import requests

# Share a single requests.Session across Solr calls (HTTP keep-alive).
session = requests.Session()
solr_handle = Solr('http://localhost:8080/solr/search', make_request=session)

Example #41
0
 def setUp(self):
     """Create the Solr client under test from the SOLR_URL env variable."""
     self.solr = Solr(os.getenv('SOLR_URL'))
Example #42
0
class eBsolr:
    """Thin convenience wrapper around a mysolr ``Solr`` connection.

    NOTE(review): relies on the module-level constant ``_MAXROWS`` (defined
    elsewhere in this file) as the default row cap in :meth:`getResponse`.
    """

    # mysolr.Solr connection shared by all operations below.
    cursor = None

    def __init__(self, urls, config, version=4):
        # ``config`` is accepted for interface compatibility but is unused here.
        self.cursor = Solr(urls, version=version)

    def update(self, documents, input_type='json', commit=False):
        """Index ``documents`` (list of dicts) into Solr."""
        self.cursor.update(documents, input_type, commit)

    def deleteById(self, tid, commit=False):
        """Delete a single document by its unique key ``tid``."""
        return self.cursor.delete_by_key(tid, commit=commit)

    def deleteByQuery(self, query, commit=False):
        """Delete every document matching ``query``."""
        return self.cursor.delete_by_query(query=query, commit=commit)

    def deleteAll(self, commit=False):
        """Delete the entire index."""
        return self.cursor.delete_by_query("*:*", commit=commit)

    def getResponse(self, search, fields=None, start=0, rows=None, sort=None, fq=None):
        """Execute ``search`` and return the raw mysolr response.

        When ``rows`` is None, at most ``_MAXROWS`` rows are requested first;
        if the index holds more matches, the query is re-issued asking for
        the full result set.

        Raises:
            Exception: if Solr answers with an HTTP status >= 400.
        """
        # Python 2/3 compatible string check (``basestring`` exists only on 2).
        try:
            string_types = basestring  # noqa: F821
        except NameError:
            string_types = str
        query = {'q': search}
        if fields:
            if isinstance(fields, string_types):
                query['fl'] = fields
            else:
                query['fl'] = ",".join(fields)
        if sort:
            query['sort'] = sort

        if fq:
            query['fq'] = fq

        # Default to _MAXROWS rows when the caller did not set a limit.
        limit = rows
        if rows is None:
            limit = _MAXROWS
        query['start'] = start
        query['rows'] = limit

        response = self.cursor.search(**query)
        if int(response.status) >= 400:
            raise Exception('Error Solr {}: {}'.format(response.status, response.extract_errmessage()))
        if rows is None and response.total_results > limit:
            # More hits than the default cap: re-query asking for everything.
            query['rows'] = response.total_results
            response = self.cursor.search(**query)

        return response

    def get_language_query(self, language):
        """Build an OR-joined ``language:`` clause from a ';'-separated list.

        Returns None when ``language`` is None or empty.
        """
        q_temp = None
        if language is not None and language != "":
            langArray = language.split(';')
            if len(langArray) > 0:
                lang = langArray[0]
                q_temp = "language:%s" % lang
                for lang in langArray[1:]:
                    q_temp = "%s OR language:%s" % (q_temp, lang)
        return q_temp

    def getDocs(self, search, fields=None, start=0, rows=None, sort=None, fq=None):
        """search: query syntax, e.g. "field:keys,field2:keys2"
           fields: fields to fetch (list), e.g. ['field', 'field2']
           start: start row
           rows: max / limit rows
           sort: row ordering, e.g. "field asc, field2 desc"
        """
        # Fetch documents plus the total hit count in one call.
        response = self.getResponse(search, fields, start, rows, sort, fq)

        return {"docs": response.documents, "count": response.total_results}

    def getFacetList(self, facets, facetField):
        """Extract the counts for ``facetField`` from a facet result dict.

        ``facetField`` may be a list or a comma-separated string; empty
        names are skipped.
        """
        ff = {}
        if not isinstance(facetField, list):
            facetField = facetField.split(",")
        for facet in facetField:
            if facet:
                ff[facet] = facets['facet_fields'][facet]

        return ff

    def getFacetPivotGeneral(self, query, facetField, pivotField, limit=None, fq=None):
        """Fetch ``facetField,pivotField`` pivot facet counts over plain HTTP.

        Returns the pivot list, or None if the request or parsing fails.
        """
        try:
            url = '{}select'.format(self.cursor.base_url)
            params = {'q': query,
                      'rows': 0,
                      'wt': 'json',
                      'indent': 'true',
                      'facet': 'true',
                      'facet.pivot': '{},{}'.format(facetField, pivotField)}

            if limit:
                params['facet.limit'] = limit
            if fq:
                params['fq'] = fq
            http_response = requests.get(url, params=params)

            return http_response.json()['facet_counts']['facet_pivot']['{0},{1}'.format(facetField, pivotField)]
        except Exception as e:  # was Python-2-only "except Exception, e" syntax
            print("Error parsing facet pivot...")
            print(e)
        return None
Example #43
0
import time
import socket
import xml.parsers.expat

#import sunburnt
from mysolr import Solr

from Resource.ResourceHelper import ResourceHelper
from Resource.Resource import Resource
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

# Base Solr endpoint; updateUrl targets the update handler directly.
solrBase = "http://localhost:8983/solr/"
updateUrl = solrBase + 'update/'

solr = Solr(solrBase)

# Collect every known feed path for indexing.
_pt = PathTool.PathTool()
_rh = ResourceHelper()
feeds = _rh.getAllFeedPaths()
for feed in feeds:   
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict != None and feedDict != {}:
            feedDict['id'] = Resource(feed, 'feed').get_id()
            print(feedDict['id'])
            print("Indexing", feedDict)
            
            solr.update([feedDict], 'json', commit=True)
            print('Indexed.')
        return self._stemmer.stem(word).lower()


# Make a connection to Mongo.
try:
    # db_conn = Connection("localhost", 27017)
    db_conn = Connection("emo2.trinity.duke.edu", 27017)
except ConnectionFailure:
    print "couldn't connect: be sure that Mongo is running on localhost:27017"
    # sys.stdout.flush()
    sys.exit(1)

db = db_conn['fashion_ip']

# Connection to Solr for faster full text searching
solr = Solr('http://localhost:8080/solr')

# Search term is taken from the command line (Python 2 script).
qstring = sys.argv[1]

pir_re = re.compile(r'.* ' + qstring + '.*', re.IGNORECASE)
porter = nltk.PorterStemmer()

# Query Solr one year at a time, pulling up to 10000 scored hits per year.
for year in range(1900, 2013):
    print '\nYEAR: ', year

    response = solr.search(q=qstring + ' year:' + str(year),
                           fl='_id,score',
                           rows=10000,
                           start=0)
    documents = response.documents
Example #45
0
class QueryResultTestCase(unittest.TestCase):
    """Integration tests against a live local Solr index.

    NOTE(review): the assertions assume the index holds exactly 4 documents
    at the start of each test — confirm the fixture data before running.
    """

    def setUp(self):
        # Fresh client per test, pointed at the local Solr instance.
        self.solr = Solr('http://localhost:8983/solr')

    def test_search(self):
        """A match-all search returns every document in the index."""
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        """Cursor fetch yields one response per page of the given size."""
        cursor = self.solr.search_cursor(q='*:*')
        i = 0
        for response in cursor.fetch(1):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 4)

        # Page size equal to the result count yields a single response.
        cursor = self.solr.search_cursor(q='*:*')
        i = 0
        for response in cursor.fetch(4):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 1)

    def test_commit(self):
        response = self.solr.commit()
        self.assertEqual(response.status, 200)

    def test_optimize(self):
        response = self.solr.optimize()
        self.assertEqual(response.status, 200)

    def test_ping(self):
        response = self.solr.ping()
        self.assertEqual(response.status, 200)

    def test_is_up(self):
        response = self.solr.is_up()
        self.assertEqual(response, True)

    def test_update_delete(self):
        """Round-trip: add two docs (json + xml), verify counts, delete both."""
        # Get total results
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        total_results = response.total_results
        # Post one document using json
        documents = [{'id' : 1}]
        response = self.solr.update(documents, input_type='json')
        self.assertEqual(response.status, 200)
        # Post another document using xml
        documents = [{'id' : 2}]
        response = self.solr.update(documents, input_type='xml')
        self.assertEqual(response.status, 200)
        # Compare total results
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results + 2)

        # Now delete the two documents posted above (one by query, one by key)
        query = 'id:1'
        key = 2
        response = self.solr.delete_by_query(query)
        self.assertEqual(response.status, 200)
        response = self.solr.delete_by_key(key)
        self.assertEqual(response.status, 200)
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results)

    def tearDown(self):
        # No per-test cleanup needed; the index is restored by the tests.
        pass

    def test_query(self):
        # Placeholder — not yet implemented.
        pass
Example #46
0
	def __init__(self, url):
		"""Remember the Solr base URL and open a mysolr connection to it."""
		self.url = url
		self.conn = Solr(url)
Example #47
0
def get_solr_count(query):
    """Return how many documents match ``query`` on the configured Solr server."""
    return Solr(SOLR_SERVER).search(**query).total_results
Example #48
0
 def __init__(self,solrAddr):
     '''
     Constructor

     :param solrAddr: base URL of the Solr instance to connect to.
     '''
     self.solr = Solr(solrAddr)
Example #49
0
class UpdateItems(object):
    """Create a new SharedItem or update it if it already exists.
    This will find all the entries, then create / update them. Then
    do a batch index to Solr.
    """

    def __init__(self, context, request):
        """Store the request context and open the Solr connection.

        Raises AttributeError when the ``push.solr_uri`` registry setting
        is missing, since indexing is impossible without it.
        """
        self.context = context
        self.request = request
        self.create_count = 0
        self.update_count = 0
        self.messages = []
        # Items queued for the batch Solr index at the end of __call__.
        self.to_index = []
        solr_uri = request.registry.settings.get('push.solr_uri', None)
        if solr_uri is None:
            raise AttributeError(u'A push.solr_uri is required')
        # XXX: We are importing solr here to be able to mock it in the tests
        from mysolr import Solr
        self.solr = Solr(solr_uri)
        self.shared = context.shared

    def __call__(self):
        #  If the request isn't an RSS feed, bail out
        if self.request.content_type not in ALLOWED_CONTENT:
            body_msg = (
                "The content-type of the request must be one of the "
                "following: %s"
            ) % ", ".join(ALLOWED_CONTENT)
            return HTTPBadRequest(body=body_msg)
        # Create / update
        self._process_items()
        # Index in Solr
        self._update_index()
        # Return a 200 with details on what happened in the body
        self.messages.append("%s items created." % self.create_count)
        self.messages.append("%s items updated." % self.update_count)
        return HTTPOk(body=" ".join(self.messages))

    def _process_items(self):
        """Get a list of new items to create and existing items that
        need to be updated.
        """
        shared_content = feedparser.parse(self.request.body)
        for item in shared_content.entries:
            uid = item['id']
            # Get the uid, minus the urn:syndication bit
            item['uid'] = uid = normalize_uid(uid)
            logger.info('Processing item %s' % uid)
            item['link'] = item.link
            item['feed_link'] = shared_content.feed.link
            # Existing uid -> update in place; otherwise create fresh.
            if uid in self.shared:
                self._update_item(item)
            else:
                self._create_item(item)

    def _create_item(self, entry):
        """Create new items in the feed
        """
        new_item = SharedItem()
        uid = entry['uid']
        logger.info('Creating item %s' % uid)
        new_item.update_from_entry(entry)
        # XXX: Should name and parent be necessary here? Shouldn't
        #      the `add` method do that for us?
        new_item.__name__ = uid
        new_item.__parent__ = self.shared
        self.shared.add(uid, new_item)
        self.to_index.append(self.shared[uid])
        self.create_count += 1

    def _update_item(self, entry):
        """Update existing items in the db using their UID
        """
        uid = entry['uid']
        logger.info('Updating item %s' % uid)
        obj = self.shared[uid]
        # XXX: these aren't coming from the object. Why is that? Is
        #      the `add` method on the folder not setting them?
        obj.__name__ = uid
        obj.__parent__ = self.shared
        selected_or_shared = (
            'selected' in entry['feed_link'] or
            'shared' in entry['feed_link']
        )
        # Re-appearing in a selected/shared feed clears a prior deletion.
        if selected_or_shared and hasattr(obj, 'deletion_type'):
            remove_deleted_status(uid, self.shared, self.solr)
        obj.update_from_entry(entry)
        self.to_index.append(obj)
        self.update_count += 1

    def _update_index(self):
        """Clean up the item dictionaries to contain only items that
        are valid and send them over to Solr for indexing.

        NOTE: Solr may error out on index if it receives a field it is
              not aware of. We should change this code to look up the
              Solr schema, and remove attributes that it doesn't know,
              like __name__ and __parent__ below.
        """
        logger.debug('Updating index for %s objects' % len(self.to_index))
        cleaned = []
        ignored_attrs = [
            '__name__',
            '__parent__',
            'deletion_type',
        ]
        for item in self.to_index:
            # Deep-copy so Solr-specific munging never touches the stored object.
            item_dict = copy.deepcopy(item.__dict__)
            if 'Modified' in item_dict:
                if hasattr(item_dict['Modified'], 'isoformat'):
                    mod_date = item_dict['Modified'].isoformat()
                else:
                    mod_date = item_dict['Modified']
                # Make sure the date is acceptable to Solr, strip off
                # the +00:00 and replace it with a Z
                item_dict['Modified'] = "%sZ" % mod_date[:-6]
            if 'content' in item_dict:
                items = [item['value'] for item in item_dict['content']]
                if items:
                    # XXX: use first content item, discard the rest
                    item_dict['content'] = items[0]
            item_dict['uid'] = item_dict['__name__']
            # XXX: Need to look up the schema, then modify the dict
            #      based on that.
            for attr in ignored_attrs:
                item_dict.pop(attr, '')
            cleaned.append(item_dict)
        # XXX: Need to handle Solr errors here
        response = self.solr.update(cleaned)
        return response
Example #50
0
 def setUp(self):
     """Create the Solr client under test, pointed at the local server."""
     self.solr = Solr('http://localhost:8983/solr')
Example #51
0
from pymongo.errors import ConnectionFailure
# import solr
from mysolr import Solr

# Make a connection to Mongo.
try:
    db_conn = Connection()
    # db_conn = Connection("emo2.trinity.duke.edu", 27017)
except ConnectionFailure:
    print "couldn't connect: be sure that Mongo is running on localhost:27017"
    sys.exit(1)

db = db_conn['fashion_ip']

# create a connection to a solr server
solr = Solr('http://localhost:8080/solr')

# DELETE ALL DOCS FIRST!! (full reindex: wipe the index before repopulating)
solr.delete_by_query(query='*:*', commit=True)

# Progress counters for the reindex loop below (Python 2 script).
total_docs = db.docs.find().count()
count = 0
documents = []

for doc in db.docs.find({},{'_id':True,'year':True,'court':True,'court_level':True,'url':True,'name':True,'content':True,'tags':True,'subjects':True}):
	if count%100 == 0:
		print count
		
	# don't know how else to get solr to take IDs...
	doc['_id'] = str(doc['_id'])
	# include subject tag in list of strings if weigth greater than 0.01
Example #52
0
 def setUp(self):
     """Create the Solr client under test from the SOLR_URL env variable."""
     self.solr = Solr(os.getenv('SOLR_URL'))
Example #53
0
 def __init__(self, url):
     """Remember the Solr base URL and open a mysolr connection to it."""
     self.url = url
     self.conn = Solr(url)