Example no. 1
 def __init__(self, core_name=None, wt='json'):
     self.core_name = core_name
     if core_name:
         self.__solr = pysolr.Solr(self.__solr_url + core_name)
     if wt:
         self.wt = wt
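
The excerpt references a private `__solr_url` attribute that is defined elsewhere in the class. A minimal self-contained sketch of the missing context plus a usage line (the class name and base URL are illustrative assumptions, not from the source):

import pysolr

class SolrClient:
    # hypothetical: the original class defines this base URL elsewhere
    __solr_url = 'http://localhost:8983/solr/'

    def __init__(self, core_name=None, wt='json'):
        self.core_name = core_name
        if core_name:
            self.__solr = pysolr.Solr(self.__solr_url + core_name)
        if wt:
            self.wt = wt

client = SolrClient('mycore')  # connects to http://localhost:8983/solr/mycore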
Example no. 2
import re
import pysolr
from sqlalchemy.sql import func
import config
import models

if hasattr(config, "solr") and config.solr == "lib_prod":
    blake_object_solr = pysolr.Solr(
        'http://webapp.lib.unc.edu:8200/solr/blake/blake_object')
    blake_copy_solr = pysolr.Solr(
        'http://webapp.lib.unc.edu:8200/solr/blake/blake_copy')
    blake_work_solr = pysolr.Solr(
        'http://webapp.lib.unc.edu:8200/solr/blake/blake_work')
elif hasattr(config, "solr") and config.solr == "lib_dev":
    blake_object_solr = pysolr.Solr(
        'http://london.libint.unc.edu:8983/solr/blake_object')
    blake_copy_solr = pysolr.Solr(
        'http://london.libint.unc.edu:8983/solr/blake_copy')
    blake_work_solr = pysolr.Solr(
        'http://london.libint.unc.edu:8983/solr/blake_work')
elif hasattr(config, "solr") and config.solr == "local":
    blake_object_solr = pysolr.Solr('http://localhost:8983/solr/blake_object')
    blake_copy_solr = pysolr.Solr('http://localhost:8983/solr/blake_copy')
    blake_work_solr = pysolr.Solr('http://localhost:8983/solr/blake_work')
else:
    blake_object_solr = pysolr.Solr(
        'http://ctools-dev.its.unc.edu/solr/blake-object')
    blake_copy_solr = pysolr.Solr(
        'http://ctools-dev.its.unc.edu/solr/blake-copy')
    blake_work_solr = pysolr.Solr(
        'http://ctools-dev.its.unc.edu/solr/blake-work')
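
The branching above only assumes that the `config` module may define a string attribute named `solr`; a minimal sketch of such a module (the chosen value is illustrative):

# config.py (hypothetical)
solr = "local"  # one of "lib_prod", "lib_dev", "local"; any other or missing value falls back to the ctools-dev endpoints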
Example no. 3
    def query_network_fast(self, uuid, search_string, max_edges):
        niceCx = NiceCXNetwork()
        #uuid = '7246d8cf-c644-11e6-b48c-0660b7976219'
        search_terms_dict = {k:1 for k in search_string.split(',')}
        edge_keepers = set([])
        node_keepers = set([])

        solr = pysolr.Solr(solr_url + uuid + '/', timeout=10)

        try:
            results = solr.search(search_string, rows=10000)
            #search_terms_array = [int(n['id']) for n in results.docs]
            search_terms_array = {int(n['id']):1 for n in results.docs}
            search_terms_set = set([int(n['id']) for n in results.docs])
            if not search_terms_array:
                return {'message': 'No nodes found'}

            print('starting nodes 1')
            #===================
            # METADATA
            #===================
            available_aspects = []
            for ae in (o for o in self.stream_aspect(uuid, 'metaData')):
                available_aspects.append(ae.get(CX_CONSTANTS.METADATA_NAME))
                mde = MetaDataElement(json_obj=ae)
                niceCx.add_metadata(mde)

            opaque_aspects = set(available_aspects).difference(known_aspects_min)

            print(opaque_aspects)

            #===================
            # EDGES
            #===================
            edge_count = 0
            added_edges = 0
            edges = []
            edge_found = False
            edge_id = -1
            edge_s = None
            edge_t = None
            edge_i = None
            e_count = 0

            start_time = time.time()
            parser = self.stream_aspect_raw(uuid, 'edges')

            for prefix, event, value in parser:
                # process event, prefix and value

                if event == 'end_map':
                    e_count += 1
                    if e_count % 5000 == 0:
                        print(e_count)
                    if edge_found:
                        edge_keepers.add(edge_id)
                        node_keepers.update([edge_s, edge_t])
                        edges.append(EdgeElement(id=edge_id, edge_source=edge_s, edge_target=edge_t, edge_interaction=edge_i))

                    edge_id = -1
                    edge_s = None
                    edge_t = None
                    edge_i = None
                    edge_found = False

                if prefix == 'item.s':
                    if value in search_terms_set:
                        edge_found = True
                    edge_s = value

                elif prefix == 'item.t':
                    if value in search_terms_set:
                        edge_found = True
                    edge_t = value

                elif prefix == 'item.i':
                    if edge_found:
                        edge_i = value

                elif prefix == 'item.@id':
                    edge_id = value

            print('Response time (Edge search): ' + str(time.time() - start_time))
            print(node_keepers)
            print(edge_keepers)

            '''
            if 'edges' in available_aspects:
                for ae in (o for o in self.stream_aspect(uuid, 'edges')):
                    if ae.get(CX_CONSTANTS.EDGE_SOURCE_NODE_ID) in search_terms_set or ae.get(CX_CONSTANTS.EDGE_TARGET_NODE_ID) in search_terms_set:
                        edge_keepers.add(ae.get(CX_CONSTANTS.ID))
                        node_keepers.update([ae.get(CX_CONSTANTS.EDGE_SOURCE_NODE_ID), ae.get(CX_CONSTANTS.EDGE_TARGET_NODE_ID)])

                    if edge_count % 5000 == 0:
                        print(edge_count)

                    if added_edges > max_edges:
                        raise StopIteration('Max edges reached')

                    edge_count += 1
            else:
                raise Exception('Network does not contain any nodes.  Cannot query')
            '''

        except SolrError as se:
            if '404' in str(se):
                ndex2.get_logger('SOLR').warning('Network not found ' + self.uuid + ' on ' + solr_url + ' server.')
                raise Exception("Network not found (SOLR)")
            else:
                ndex2.get_logger('SOLR').warning('Network error ' + self.uuid + ' on ' + solr_url + ' server. ' + str(se))
                raise Exception(str(se))
        except StopIteration as si:
            ndex2.get_logger('QUERY').warning("Found more than max edges.  Raising exception")
            raise StopIteration(str(si))
Example no. 4
    accounts = [
        path.replace(target_base_dir, '')
        for path in glob(target_base_dir + '*')
    ]

    for account in accounts:
        logs = glob(target_base_dir + account + '/*')
        for log in logs:
            try:
                # MOD - 2018/03/20 - Fujinet - SF6.7.0_開発依頼内容_No.1 - START
                #fp = open(log)
                #fcntl.flock(fp.fileno(), fcntl.LOCK_SH | fcntl.LOCK_NB)
                if os.path.getsize(log) <= 0:
                    continue
                else:
                    fp = open(log)
                    fcntl.flock(fp.fileno(), fcntl.LOCK_SH | fcntl.LOCK_NB)
                # MOD - 2018/03/20 - Fujinet - SF6.7.0_開発依頼内容_No.1 - END
            except IOError as e:
                print e
                continue
            with fp:
                json_list = json.loads(fp.read())
                fcntl.flock(fp.fileno(), fcntl.LOCK_UN)
            try:
                s = solr.Solr(solr_base_URL + account, timeout=60)
                result = s.add([json_list])
            except solr.SolrError as e:
                print e
                continue
            os.remove(log)
Example no. 5
def delete_user(user_id):
    """Celery task to remove a user from search."""
    solr = pysolr.Solr(settings.SOLR_URL, always_commit=True)

    solr.delete(q="type:users AND atlas_id:%s" % user_id)
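
A sketch of how this task pairs with indexing, assuming the same settings.SOLR_URL core; the field names come from the delete query above, while the sample values are illustrative:

import pysolr
from django.conf import settings  # assumption: Django-style settings, as the task above implies

solr = pysolr.Solr(settings.SOLR_URL, always_commit=True)
# index a user document of the shape the delete query targets
solr.add([{'id': 'users.42', 'type': 'users', 'atlas_id': 42}])
delete_user(42)  # removes it again via type:users AND atlas_id:42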
Example no. 6
 def searcher(self):
     self.solr = pysolr.Solr(self.options.url)
Example no. 7
# -*- coding: utf-8 -*-
"""
Created on Fri May 14 10:23:00 2021

@author: cbadenes
"""
import worker.annotator as workers
import pysolr
import multiprocessing as mp
import time

if __name__ == '__main__':
    print("annotating documents..")

    solr = pysolr.Solr('http://librairy.linkeddata.es/data/mesinesp',
                       always_commit=True,
                       timeout=50)

    print("Number of processors: ", mp.cpu_count())
    pool = mp.Pool(mp.cpu_count())

    print("reading from solr..")
    counter = 0
    completed = False
    window_size = 50
    cursor = "*"

    t = time.time()
    while (not completed):
        old_counter = counter
        try:
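
The excerpt cuts off inside the loop; the `cursor = "*"` initialisation suggests Solr cursorMark deep paging. A minimal sketch of what the truncated loop presumably does, assuming the documents have an `id` uniqueKey (cursorMark paging requires a sort on the uniqueKey):

while not completed:
    results = solr.search('*:*', **{
        'rows': window_size,
        'sort': 'id asc',          # required for cursorMark paging
        'cursorMark': cursor,
    })
    for doc in results:
        counter += 1             # process each document here
    if results.nextCursorMark == cursor:  # cursor did not advance: every doc was read
        completed = True
    else:
        cursor = results.nextCursorMark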
Example no. 8
    def update_by_query(self,
                        query,
                        field=None,
                        value=None,
                        data=None,
                        queryparameters=None):

        import pysolr

        # avoid the mutable-default-argument pitfall
        if data is None:
            data = {}

        count = 0

        solr = pysolr.Solr(self.solr + self.core)

        #
        # extend query: do not return documents, that are tagged
        #

        query_marked_before = ''

        if field:
            query_marked_before = field + ':"' + solr_mask(value) + '"'

        # else extract field and value from data to build query of yet tagged docs to exclude

        for fieldname in data:

            if isinstance(data[fieldname], list):

                for value in data[fieldname]:

                    if query_marked_before:
                        query_marked_before += " AND "

                    query_marked_before += fieldname + ':"' + solr_mask(
                        value) + '"'
            else:

                value = data[fieldname]
                if query_marked_before:
                    query_marked_before += " AND "

                query_marked_before += fieldname + ':"' + solr_mask(
                    value) + '"'

        solrparameters = {
            'fl': 'id',
            'defType': 'edismax',
            'rows': 10000000,
        }

        # add custom Solr parameters (overwriting the above defaults if the same parameter is given)
        if queryparameters:
            solrparameters.update(queryparameters)

        if query_marked_before:
            # don't extend query but use filterquery for more performance (cache) on aliases
            solrparameters["fq"] = 'NOT (' + query_marked_before + ')'

        if self.verbose:
            print("Solr query:")
            print(query)
            print("Solr parameters:")
            print(solrparameters)

        results = solr.search(query, **solrparameters)

        for result in results:
            docid = result['id']

            if self.verbose:
                print("Tagging {}".format(docid))

            self.tag(docid=docid, field=field, value=value, data=data)

            count += 1

        return count
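
update_by_query searches for documents matching the query (excluding already-tagged ones via the filter query) and tags each hit through self.tag; solr_mask is a helper not shown in the excerpt. A hypothetical call, assuming an instance named tagger whose solr and core attributes point at a core (all names here are illustrative):

count = tagger.update_by_query('category:news', field='tag_ss', value='reviewed')
print('{} documents tagged'.format(count))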
Example no. 9
    def search(self, query, project_id, version_id, top_n=50, return_json=False,
               query_string=None, query_field=None):
        """
        This function takes a lucene query which can be created from
        the lucene query parser class and performs a search on the index
        It then returns the top n results according to the ranking score

        Inputs
        ------
        query : Lucene Query
            A query created by the QueryGenerator class present in
            query_generator.py
        top_n : Int
            The number of top results we want our search to return
        return_json : Bool
            If true, the search results are jsonified and returned,
            else the search results are returned in the Lucene Document format
        query_string : String
            The string entered by the user
        query_field : String
            The name of the field against which the reranker must be run
        """
        # Field names do not contain spaces
        query_field = query_field.replace(" ","_")

        proj_exists = self.ensure_collection_exists(project_id,version_id)
        if proj_exists:
            index_url = self.solr_server_link + "/solr/" + proj_exists

        if not proj_exists:
            return 400

        if self.use_rm3 and index_url:
            new_url = index_url + '/anserini'
            response = self.session.get(new_url,data={"q":query})
            data = response.json()
            docs = data['docs']['docs']
            
            for idx, x in enumerate(docs):
                for key in x:
                    x[key] = [x[key]]
                x['score']=x['score'][0]
                x['id']=str(idx)
            search_results_list = [x for x in docs]
            """
            the response contains these keys
            dict_keys(
                [
                    'question', 
                    'answer',
                    'answer_formatted', 
                    'question_variation_0', 
                    'question_variation_1', 
                    'question_variation_2', 
                    'score', 
                    'id'
                ]
            )
            """          
        else:
            client = pysolr.Solr(index_url, always_commit=True)
            search_results = client.search(query,fl='*,score',rows=top_n)
            search_results_list = [x for x in search_results]
            
            if search_results.raw_response['response']['numFound'] > 0:
                max_score = search_results.raw_response['response']['docs'][0]['score']
                
                if max_score < 3:
                    return "Not present"
                """
                the response contains these keys
                dict_keys(
                    [
                        'answer', 
                        'answer_formatted', 
                        'disease_1', 
                        'disease_2', 
                        'question_variation_0', 
                        'question_variation_1', 
                        'question_variation_2', 
                        'question', 
                        'subject_1_immunization', 
                        'vaccine_1', 
                        'who_is_writing_this', 
                        'id', 
                        'score',
                        '_version_'
                    ]
                )
                """
            else:
                search_results_list = []
        
        # print("reranking")
        # TODO : Add support for reranking multiple fields
        if self.rerank_endpoint is not None and query_string and query_field:
            ids = {}
            text = []

            # pdb.set_trace()
            for document in search_results_list:
                ids[document['question'][0]]=document['id']
                text.append([document['id'],document['question'][0]])

            scoreDocs = self.reranker.rerank(query_string, text)

            # case where gpu is rate limited
            if not scoreDocs:
                return_docs = [[x,x['score']] for x in search_results_list]
            else:
                return_docs = []
                for x in scoreDocs:
                    for y in search_results_list:
                        if x[1]==y['question'][0]:
                            return_docs.append([y,x[0]])
                            break
        else:
            #TODO:setup so that score is correct
            # return document as well as score
            return_docs = [[x,x['score']] for x in search_results_list]

        if self.debug:
            scoreDocs=[]
            if query_field.endswith("*"):
                # mapper from text to doc
                fields = [
                        "question_variation_1",
                        "question_variation_0",
                        "answer",
                        "answer_formatted"
                    ]
            else:
                # only show qa
                fields = [
                        "answer",
                        "answer_formatted"
                    ]
            for doc in return_docs:
                text = doc[0][query_field.replace('*',"")][0]
                for field in fields:
                    text += " ||| " + doc[0][field][0]

                scoreDocs.append([doc[1],text])

            return scoreDocs

        return return_docs
Example no. 10
urls = (
    '/',
    'SimpleIndexSearchPage',
    '/searchSimpleIndex',
    'SearchSimpleIndex',
)

CATEGORY = {
    'b': 'Business',
    'e': 'Entertainment',
    't': 'Science and Technology',
    'm': 'Health'
}
render = web.template.render('templates/', base='layout')
SOLR_SIMPLEINDEX = pysolr.Solr('http://localhost:8983/solr/simpleindex')


def get_web_input(web_input):
    draw = web_input['draw']
    query = web_input['search[value]']
    offset = web_input['start']
    count = web_input['length']
    return draw, query, offset, count


def search(query, offset, count, draw, solr_endpoint):
    """
    This function is responsible for hitting the solr endpoint
    and returning the results back.
    """
Example no. 11
def connect_to_solr(core, port=8983):
    solr_instance = pysolr.Solr('http://localhost:{}/solr/{}'.format(
        port, core),
                                always_commit=True)
    return solr_instance
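
Usage might look like this (assuming a local Solr with a core named `mycore`; the document fields are illustrative):

solr = connect_to_solr('mycore')
solr.add([{'id': '1', 'title': 'hello'}])  # committed immediately, since always_commit=True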
Example no. 12
def intent_request(session, request):

    intent = request['intent']['name']
    print "intent_request: {}".format(intent)

    if intent == "CreateItem":

        title = request['intent']['slots']['mytitle']['value']
        data = {'title': title, 'note': "This was created via Amazon Echo"}
        output_speech = "You want to add: {}.  What is the context?".format(
            title)
        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': False
            },
            'sessionAttributes': {
                'title': title
            }
        }
        return response

    elif intent == "SetContextOrFolder":

        attributes = session['attributes']
        if attributes.get('context'):
            folder = request['intent']['slots']['mycontextorfolder'].get(
                'value', "No Folder")
            d = {
                'new_task': True,
                'title': attributes.get('title', 'No title'),
                'context': attributes['context'],
                'folder': folder
            }
            output_speech = "The item will be placed in folder {}. Do you want it to be starred?".format(
                folder)
        else:
            context = request['intent']['slots']['mycontextorfolder'].get(
                'value', "No Context")
            d = {
                'title': attributes.get('title', 'No title'),
                'context': context
            }
            output_speech = "The item will be placed in context {}. What folder do you want to place it in?".format(
                context)

        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': False
            },
            'sessionAttributes': d
        }
        return response

    elif intent == "AMAZON.YesIntent":
        # ? should use dict.copy()
        attributes = session['attributes']
        if attributes.get('new_task'):
            title = attributes['title']
            context_title = attributes['context']
            folder_title = attributes['folder']
            #data={'title':title, 'context':context, 'folder':folder, 'star':True}
            #r = requests.post(c.ec_uri+":5000/incoming_from_echo", json=data)
            task = Task(priority=3, title=title, star=True)
            folder = remote_session.query(Folder).filter_by(
                title=folder_title.lower()).first()
            if folder:
                task.folder = folder
            else:
                folder_title = 'No Folder'
            context = remote_session.query(Context).filter_by(
                title=context_title.lower()).first()
            if context:
                task.context = context
            else:
                context_title = 'No Context'
            task.startdate = datetime.today().date()
            task.note = "Created from Echo on {}".format(task.startdate)
            remote_session.add(task)
            remote_session.commit()

            r = requests.get(c.ec_uri + ':5000/sync')
            print r.text

            output_speech = "The new item {} will be created for context {} and folder {} and it will be starred".format(
                title, context_title, folder_title)
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': True
                }
            }
            return response
        else:
            which_task = attributes.get('which_task')
            if which_task:
                task = remote_session.query(Task).get(which_task)
                output_speech = "I will read the note: " + task.note
                response = {
                    'response': {
                        'outputSpeech': {
                            'type': 'PlainText',
                            'text': output_speech
                        },
                        'shouldEndSession': True
                    }
                }
                return response

    elif intent == "AMAZON.NoIntent":
        attributes = session['attributes']
        if attributes.get('new_task'):
            title = attributes['title']
            context_title = attributes['context']
            folder_title = attributes['folder']
            #data={'title':title, 'context':context, 'folder':folder, 'star':True}
            #r = requests.post(c.ec_uri+":5000/incoming_from_echo", json=data)
            task = Task(priority=3, title=title, star=False)
            folder = remote_session.query(Folder).filter_by(
                title=folder_title.lower()).first()
            if folder:
                task.folder = folder
            else:
                folder_title = 'No Folder'
            context = remote_session.query(Context).filter_by(
                title=context_title.lower()).first()
            if context:
                task.context = context
            else:
                context_title = 'No Context'
            task.startdate = datetime.today().date()
            task.note = "Created from Echo on {}".format(task.startdate)
            remote_session.add(task)
            remote_session.commit()

            r = requests.get(c.ec_uri + ':5000/sync')
            print r.text

            output_speech = "The new item {} will be created for context {} and folder {} and it will not be starred".format(
                title, context_title, folder_title)
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': True
                }
            }
            return response

    elif intent == "RetrieveFolderItems":
        folder_title = request['intent']['slots']['myfolder'].get('value', '')
        folder_title = folder_title.lower()
        q = remote_session.query(Task).join(Folder).filter(
            and_(Folder.title == folder_title, Task.completed == None,
                 Task.deleted == False,
                 datetime.now() - Task.modified < timedelta(days=30)))
        count = q.count()
        tasks = q.limit(10).all()

        if count:
            output_speech = "The total number of tasks is {}. ".format(count)
            now = datetime.now()
            for n, task in enumerate(tasks, start=1):
                output_speech += "{}, {}. Created {} days ago.{}".format(
                    n, task.title, (now - task.created).days,
                    ' It is starred. ' if task.star else ' ')
        else:
            output_speech = "I did not find anything."

        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': True
            }
        }
        return response

    elif intent == "RetrieveContextItems":
        context_title = request['intent']['slots']['mycontext'].get(
            'value', '')
        context_title = context_title.lower()
        count = remote_session.query(Task).join(Context).filter(
            and_(Context.title == context_title, Task.completed == None,
                 Task.deleted == False,
                 datetime.now() - Task.modified < timedelta(days=30))).count()
        tasks = remote_session.query(Task).join(Context).filter(
            and_(Context.title == context_title, Task.completed == None,
                 Task.deleted == False,
                 datetime.now() - Task.modified <
                 timedelta(days=30))).limit(10).all()

        if count:
            output_speech = "The total number of tasks is {}. ".format(count)
            now = datetime.now()
            for n, task in enumerate(tasks, start=1):
                output_speech += "{}, {}. Created {} days ago. It is {} starred. ".format(
                    n, task.title, (now - task.created).days,
                    '' if task.star else 'not')

        else:
            output_speech = 'I did not find anything.'

        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': True
            }
        }
        return response

    elif intent == 'RetrieveSpokenItems':
        ####this should become retrieve text2speech items #########
        tasks = remote_session.query(Task).join(TaskKeyword, Keyword).filter(
            and_(Task.completed == None, Task.deleted == False,
                 Task.id == TaskKeyword.task_id,
                 TaskKeyword.keyword_id == Keyword.id,
                 Keyword.name == 'text2speech')).all()
        #tasks = remote_session.query(Task).filter(Task.star==True).all()
        #tasks = remote_session.query(Task).join(Context).filter(and_(Context.title=='work', Task.star==True, Task.completed==None, datetime.now()-Task.modified<timedelta(days=30))).all()

        if tasks:
            output_speech = ''
            for n, task in enumerate(tasks, start=1):
                output_speech += '{}, the title: {}, the note: {} '.format(
                    n, task.title, task.note)
        else:
            output_speech = 'I did not find anything.'

        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': True
            }
        }
        return response

    elif intent == "RetrieveSpecificItems":
        solr = pysolr.Solr(c.ec_uri + ':8983/solr/listmanager/', timeout=10)
        queryterm = request['intent']['slots']['queryterm']['value']
        queryterm = queryterm.replace(' ', '')
        #Initially thought the queryterm could be queryterms and hence the below
        #s = 'title:' + ' AND title:'.join(queryterm.split())
        #s = s + ' note:' + ' AND note:'.join(queryterm.split())
        s = 'title:{} note:{} tag:{}'.format(*(3 * (queryterm, )))
        print s
        #fq = ['star:true', 'completed:false']
        fq = ['completed:false']
        result = solr.search(s, fq=fq)  # rows=1 or **{'rows':1}
        if len(result):
            output_speech = ''
            task_ids = {}
            for n, task in enumerate(result.docs, start=1):
                output_speech += '{}, {}, {}. '.format(n, task['id'],
                                                       task['title'])
                task_ids[n] = task['id']
            output_speech += 'Which one do you want read?'
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': False
                },
                'sessionAttributes': {
                    'task_ids': task_ids
                }
            }
        else:
            output_speech = 'I did not find anything, sorry.'
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': True
                }
            }

        return response

    elif intent == 'GetTaskNumber':
        tasknumber = request['intent']['slots']['tasknumber']['value']
        print "tasknumber =", tasknumber
        print "type(tasknumber)=", type(tasknumber)  #"1"
        attributes = session['attributes']
        task_ids = attributes.get('task_ids')  #{"1": 2938}
        print "task_ids=", task_ids
        id_ = task_ids.get(tasknumber)
        if id_:
            print "id_=", id_
            task = remote_session.query(Task).get(id_)
            output_speech = "I will read the note: " + task.note
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': True
                }
            }
            return response
        else:
            output_speech = 'I did not find anything, sorry.'
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': True
                }
            }
            return response

    elif intent == 'RetrieveStarredItems':
        context = request['intent']['slots']['mycontext']['value']
        tasks = remote_session.query(Task).join(Context).filter(
            and_(Context.title == context, Task.star == True,
                 Task.completed == None,
                 datetime.now() - Task.modified < timedelta(days=30))).all()
        output_speech = ''
        for n, task in enumerate(tasks, start=1):
            output_speech += '{}, {}. '.format(n, task.title)

        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': True
            }
        }
        return response

    else:
        output_speech = "I couldn't tell which type of intent request that was.  Try again."
        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': False
            }
        }
        return response
Example no. 13
def str_rep(in_str):
    in_str = in_str.replace("\n", '')
    out_str = re.sub(r'\s+', ' ', in_str)
    out_str = re.sub(r'[.]+', '', out_str)
    return out_str


def extract_content(solr_data):

    soup = BeautifulSoup(solr_data, "html5lib")
    out = [str_rep(x) for x in soup.stripped_strings]
    return " ".join(out)

solr = pysolr.Solr(
    'http://localhost:8983/solr/gettingstarted',
    timeout=10
)

solr.delete(q='*:*')
for key, val in build_dict(DST_PATH).items():
    with open(val, 'rb') as fh:
        try:
            data = solr.extract(fh, extractOnly=True)
        except pysolr.SolrError as e:
            print("Damaged file %s" % val)
            continue
        metadata = data['metadata']
        content = extract_content(data['contents'])
        index = [{
            'id': key,
            '_text_': content
Example no. 14
def main():
    from sqlalchemy.orm import sessionmaker

    if hasattr(config, "solr") and config.solr == "lib_prod":
        blake_object_solr = pysolr.Solr(
            'http://webapp.lib.unc.edu:8200/solr/blake/blake_object')
        blake_copy_solr = pysolr.Solr(
            'http://webapp.lib.unc.edu:8200/solr/blake/blake_copy')
        blake_work_solr = pysolr.Solr(
            'http://webapp.lib.unc.edu:8200/solr/blake/blake_work')
    elif hasattr(config, "solr") and config.solr == "lib_dev":
        blake_object_solr = pysolr.Solr(
            'http://london.libint.unc.edu:8983/solr/blake_object')
        blake_copy_solr = pysolr.Solr(
            'http://london.libint.unc.edu:8983/solr/blake_copy')
        blake_work_solr = pysolr.Solr(
            'http://london.libint.unc.edu:8983/solr/blake_work')
    else:
        blake_object_solr = pysolr.Solr(
            'http://localhost:8983/solr/blake_object')
        blake_copy_solr = pysolr.Solr('http://localhost:8983/solr/blake_copy')
        blake_work_solr = pysolr.Solr('http://localhost:8983/solr/blake_work')

    engine = models.db.create_engine(config.db_connection_string)
    session = sessionmaker(bind=engine)()
    objects = session.query(models.BlakeObject).all()
    blake_object_solr.delete(q='*:*')
    for blake_object in objects:
        try:
            if blake_object.supplemental is None:
                obj = {
                    "id":
                    blake_object.object_id,
                    "title":
                    blake_object.title,
                    "bentley_id":
                    blake_object.bentley_id,
                    "dbi":
                    blake_object.dbi,
                    "desc_id":
                    blake_object.desc_id,
                    "copy_id":
                    blake_object.copy_bad_id,
                    "characteristics":
                    blake_object.characteristics,
                    "components":
                    json.dumps(blake_object.components),
                    "illustration_description":
                    json.dumps(blake_object.illustration_description),
                    "text":
                    json.dumps(blake_object.text),
                    "copy_title":
                    blake_object.copy.title,
                    "copy_institution":
                    blake_object.copy.institution,
                    # FIXME: properly convert unicode rather than stripping characters
                    "notes":
                    json.dumps([
                        unicodedata.normalize('NFKD', note["note"]).encode(
                            'ascii', 'ignore') for note in blake_object.notes
                    ])
                }
                print obj["id"]
                if blake_object.copy.work:
                    obj["work_title"] = blake_object.copy.work.title
                    obj["work_id"] = blake_object.copy.work.bad_id
                    obj["composition_date"] = blake_object.copy.composition_date
                    obj["print_date"] = blake_object.copy.print_date
                    obj["medium"] = blake_object.copy.work.medium
                blake_object_solr.add([obj])
        except pysolr.SolrError as err:
            print err
    blake_object_solr.optimize()

    copies = session.query(models.BlakeCopy).all()
    blake_copy_solr.delete(q='*:*')
    for blake_copy in copies:
        copy_ = {
            "id": blake_copy.copy_id,
            "bad_id": blake_copy.bad_id,
            "source": blake_copy.source,
            "title": blake_copy.title,
            "institution": blake_copy.institution,
            "header": blake_copy.header,
            "composition_date": blake_copy.composition_date,
            "print_date": blake_copy.print_date,
            "effective_copy_id": blake_copy.effective_copy_id
        }
        if blake_copy.work:
            copy_["medium"] = blake_copy.work.medium
            copy_["work_id"] = blake_copy.work.bad_id
        blake_copy_solr.add([copy_])
    blake_copy_solr.optimize()

    works = session.query(models.BlakeWork).all()
    blake_work_solr.delete(q='*:*')
    for blake_work in works:
        blake_work_solr.add([{
            "id":
            blake_work.work_id,
            "bad_id":
            blake_work.bad_id,
            "title":
            blake_work.title,
            "medium":
            blake_work.medium,
            "info":
            blake_work.info,
            "image":
            blake_work.image,
            "composition_date":
            blake_work.composition_date,
            "composition_date_string":
            blake_work.composition_date_string
        }])
    blake_work_solr.optimize()
Example no. 15
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.searcher = pysolr.Solr(settings.SOLR_URL)
Example no. 16
import pysolr
import json
import time
#sudo su - solr -c "/opt/solr/bin/solr create -c TKTDNhom10 -n data_driven_schema_configs"
#sudo su - solr -c "/opt/solr/bin/solr delete -c TKTDNhom10 -deleteConfig data_driven_schema_configs"
solr = pysolr.Solr("http://localhost:8983/solr/TestConfig/", timeout=1000)
# import spacy
# nlp = spacy.load("vi_spacy_model")

#/home/luuthanh/Desktop/test_config

# index documents
# with open("/home/luuthanh/Desktop/TKTD/backend/data.json","r") as file:
#     solr.add(json.load(file))

# a = nlp('"Quốc tế"')
results = solr.search(
    'title_decription_content: ("WikiLeaks") && date: [ * TO NOW ]',
    **{
        'start': 0,
        'rows': 5,
        'fl': '* score',
        'fq': ['topic: ( * )', 'author:  ( * )'],
        'hl': 'true',
        'hl.method': 'original',
        'hl.simple.pre': '<mark>',
        'hl.simple.post': '</mark>',
        'hl.highlightMultiTerm': 'true',
        # 'hl.usePhraseHighlighter':'false',
        'hl.fragsize': 100,
        'defType': 'edismax',
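
The excerpt ends inside the parameter dict. Once it is closed and the search runs, pysolr exposes the highlighter output on the result object; a minimal sketch of reading it back (assuming the query above succeeds):

for doc in results:
    print(doc['id'], doc['score'])

# results.highlighting maps each document id to its highlighted fragments
for doc_id, fragments in results.highlighting.items():
    print(doc_id, fragments)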
Example no. 17
 def indexer(self):
     self.solr_doclist = []
     self.conn = pysolr.Solr(self.options.url)
     self.conn.delete("*:*")
     self.conn.commit()
Example no. 18
def run():
    ping('metadata_database', verbose=True)
    ping('temporal_index_database', verbose=True)
    ping('spatial_index_database', verbose=True)
    ping('thematic_index_solr', verbose=True)

    engine1 = create_engine(conf_metadata_processor.db_connection)

    with engine1.connect() as conn:
        test = conn.execute("""SELECT table_name
                               FROM information_schema.tables
                               WHERE table_schema='public'
                                 AND table_type='BASE TABLE';""").fetchall()
        if len(test) < 2:
            raise Exception("As tabelas não foram criadas ainda.")
        test = conn.execute(
            "SELECT trigger_name FROM information_schema.triggers;").fetchall(
            )
        if len(test) < 2:
            raise Exception("Os gatilhos não foram criadas ainda.")

    del engine1

    engine2 = create_engine(conf_temporal_indexing.db_connection)

    with engine2.connect() as conn:
        test = conn.execute("""SELECT table_name
                                   FROM information_schema.tables
                                   WHERE table_schema='public'
                                     AND table_type='BASE TABLE';""").fetchall(
        )
        if len(test) < 1:
            raise Exception("As tabelas não foram criadas ainda.")
        test = conn.execute("""SELECT routine_name 
                               FROM information_schema.routines 
                               WHERE routine_type='FUNCTION' 
                                  AND specific_schema='public';""").fetchall()
        if len(test) < 4:
            raise Exception(
                "The stored procedures have not been created yet.")

    del engine2

    driver = GraphDatabase.driver(
        conf_spatial_indexing.db_connection["bolt_uri"],
        auth=basic_auth(conf_spatial_indexing.db_connection["user"],
                        conf_spatial_indexing.db_connection["password"]),
        encrypted=False)

    with driver.session() as session:
        test = session.run("match(p:Place) return count(p);").value()
        if test[0] < 5596:
            raise Exception("O grafo de lugares não foi totalmente criado.")

    del driver

    solr_resource = pysolr.Solr(conf_thematic_indexing.resource_solr_core_uri,
                                always_commit=True)
    solr_resource.ping()
    solr_dataset = pysolr.Solr(conf_thematic_indexing.dataset_solr_core_uri,
                               always_commit=True)
    solr_dataset.ping()

    del solr_dataset
    del solr_resource
Example no. 19
def add_to_search_index(data_dict_id, in_bulk=False):

    log = logging.getLogger('ckan')
    od_search_solr_url = config.get(SEARCH_INTEGRATION_URL_OPTION, "")
    od_search_enabled = config.get(SEARCH_INTEGRATION_ENABLED_OPTION, False)
    od_search_od_url_en = config.get(
        SEARCH_INTEGRATION_OD_URL_EN_OPTION,
        "https://open.canada.ca/data/en/dataset/")
    od_search_od_url_fr = config.get(
        SEARCH_INTEGRATION_OD_URL_FR_OPTION,
        "https://ouvert.canada.ca/data/fr/dataset/")

    # Retrieve the full record - it has additional information including organization title and metadata modified date
    # that are not available in the regular data dict

    portal = LocalCKAN()
    data_dict = portal.action.package_show(id=data_dict_id)

    if not od_search_enabled:
        return
    try:
        subject_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_subject')['choices'])
        type_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_resource_related_type')['choices'])
        collection_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_collection')['choices'])
        jurisdiction_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_jurisdiction')['choices'])
        resource_type_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_resource_type')['choices'])
        frequency_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_frequency')['choices'])

        org_title = data_dict['organization']['title'].split('|')
        owner_org_title_en = org_title[0].strip()
        owner_org_title_fr = org_title[1].strip()

        subjects_en = []
        subjects_fr = []
        subjects = json.loads(data_dict['subject']) if \
            isinstance(data_dict['subject'], str) else data_dict['subject']
        for s in subjects:
            subjects_en.append(subject_codes['en'][s].replace(",", ""))
            subjects_fr.append(subject_codes['fr'][s].replace(",", ""))

        resource_type_en = []
        resource_type_fr = []
        resource_fmt = []
        resource_title_en = []
        resource_title_fr = []
        for r in data_dict['resources']:
            resource_type_en.append(
                resource_type_codes['en'][r['resource_type']]
                if r['resource_type'] in resource_type_codes['en'] else '')
            resource_type_fr.append(
                resource_type_codes['fr'][r['resource_type']]
                if r['resource_type'] in resource_type_codes['fr'] else '')
            resource_fmt.append(r['format'])

            resource_name = json.loads(r['name_translated']) if \
                isinstance(r['name_translated'], str) else r['name_translated']
            if 'en' in resource_name:
                resource_title_en.append(resource_name['en'])
            elif 'fr-t-en' in resource_name:
                resource_title_en.append(resource_name['fr-t-en'])
            if 'fr' in resource_name:
                resource_title_fr.append(resource_name['fr'].strip())
            elif 'en-t-fr' in resource_name:
                resource_title_fr.append(resource_name['en-t-fr'].strip())

        notes_translated = json.loads(data_dict['notes_translated']) if \
            isinstance(data_dict['notes_translated'], str) else data_dict['notes_translated']
        title_translated = json.loads(data_dict['title_translated']) if \
            isinstance(data_dict['title_translated'], str) else data_dict['title_translated']
        od_obj = {
            'portal_type_en_s':
            type_codes['en'][data_dict['type']],
            'portal_type_fr_s':
            type_codes['fr'][data_dict['type']],
            'collection_type_en_s':
            collection_codes['en'][data_dict['collection']],
            'collection_type_fr_s':
            collection_codes['fr'][data_dict['collection']],
            'jurisdiction_en_s':
            jurisdiction_codes['en'][data_dict['jurisdiction']],
            'jurisdiction_fr_s':
            jurisdiction_codes['fr'][data_dict['jurisdiction']],
            'owner_org_title_en_s':
            owner_org_title_en,
            'owner_org_title_fr_s':
            owner_org_title_fr,
            'subject_en_s':
            subjects_en,
            'subject_fr_s':
            subjects_fr,
            'resource_type_en_s':
            list(set(resource_type_en)),
            'resource_type_fr_s':
            list(set(resource_type_fr)),
            'update_cycle_en_s':
            frequency_codes['en'][data_dict['frequency']],
            'update_cycle_fr_s':
            frequency_codes['fr'][data_dict['frequency']],
            'id_name_s':
            data_dict['name'],
            'id':
            data_dict['name'],
            'owner_org_s':
            data_dict['owner_org'],
            'author_txt':
            data_dict['author'] if 'author' in data_dict else '',
            'description_txt_en':
            notes_translated['en']
            if 'en' in notes_translated else '',
            'description_txt_fr':
            notes_translated['fr']
            if 'fr' in notes_translated else '',
            'description_xlt_txt_fr':
            notes_translated['fr-t-en']
            if 'fr-t-en' in notes_translated else '',
            'description_xlt_txt_en':
            notes_translated['en-t-fr']
            if 'en-t-fr' in notes_translated else '',
            'title_en_s':
            title_translated['en'] if 'en' in title_translated else '',
            'title_fr_s':
            title_translated['fr'] if 'fr' in title_translated else '',
            'title_xlt_fr_s':
            title_translated['fr-t-en']
            if 'fr-t-en' in title_translated else '',
            'title_xlt_en_s':
            title_translated['en-t-fr']
            if 'en-t-fr' in title_translated else '',
            'resource_format_s':
            list(set(resource_fmt)),
            'resource_title_en_s':
            resource_title_en,
            'resource_title_fr_s':
            resource_title_fr,
            'last_modified_tdt':
            parser.parse(data_dict['metadata_modified']).replace(
                microsecond=0).isoformat() + 'Z',
            'ogp_link_en_s':
            '{0}{1}'.format(od_search_od_url_en, data_dict['name']),
            'ogp_link_fr_s':
            '{0}{1}'.format(od_search_od_url_fr, data_dict['name']),
        }

        keywords = json.loads(data_dict['keywords']) if \
            isinstance(data_dict['keywords'], str) else data_dict['keywords']
        if 'en' in keywords:
            od_obj['keywords_en_s'] = keywords['en']
        elif 'fr-t-en' in keywords:
            od_obj['keywords_en_s'] = keywords['fr-t-en']
        if 'fr' in keywords:
            od_obj['keywords_fr_s'] = keywords['fr']
        elif 'en-t-fr' in keywords:
            od_obj['keywords_fr_s'] = keywords['en-t-fr']

        solr = pysolr.Solr(od_search_solr_url)
        if in_bulk:
            solr.add([od_obj])
        else:
            solr.delete(id=od_obj['id'])
            solr.add([od_obj])
            solr.commit()
    except Exception as x:
        log.error("Exception: {} {}".format(str(x), x.args))
Example no. 20
import pysolr
import nltk
import numpy as np

solr = pysolr.Solr('http://localhost:8983/solr/glove', timeout=10)


def getVectorFromSolr(word):
    result = solr.search("id:" + word)
    return result.raw_response['response']['docs'][0]['value'][0].split(" ")


def getVector(word):
    word = word.lower().strip()
    wList = nltk.word_tokenize(word)
    count = 0
    res = np.zeros(100, dtype=float)
    for w in wList:
        temp = np.array(getVectorFromSolr(w), dtype=float)
        res = np.add(res, temp)
        count += 1
    res = np.divide(res, count)
    return res


def getCosineSimBetVectors(vec1, vec2):
    sim = max(
        0,
        np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))
    return sim
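
For example, comparing two averaged word vectors (assuming the `glove` core is populated with 100-dimensional vectors keyed by word; the phrases are illustrative):

v1 = getVector("solar energy")
v2 = getVector("renewable power")
print(getCosineSimBetVectors(v1, v2))  # cosine similarity, clamped to be non-negative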
Example no. 21
# -*- coding: utf-8 -*-
"""
Created on Sun Dec  3 16:42:38 2017

@author: apurva
"""
import pysolr
import os
import sys
solr = pysolr.Solr('http://localhost:8983/solr/task4/', timeout=10000)
import nltk
path = sys.argv[1]
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from rake_nltk import Rake
lemmatizer = WordNetLemmatizer()
st = LancasterStemmer()
r = Rake()
documents = []
docId = 1
for filename in os.listdir(path):

    f = open(path+'/'+filename)
    raw = f.read()
    sentence = sent_tokenize(raw)
    sentId = 1
    for sen in sentence:
        words = word_tokenize(sen)
Example no. 22
import pysolr
import json
import sys

# Argument 1 the json line file to index
file_to_index = sys.argv[1]

# Argument 2 core to connect
core_name = sys.argv[2]

solr = pysolr.Solr('http://localhost:8983/solr/%s/' % core_name)

size_buffer = 1000
count = 0
with open(file_to_index) as fil:
    for line in fil:
        json_obj = json.loads(line)
        json_obj['id'] = json_obj['url']
        solr.add([json_obj], commit=(count % size_buffer == 0))
        print("\r%d" % count, end='')
        count += 1
    solr.commit()
    print("\r%d" % count)
Example no. 23
                'overview':
                tmdbMovie['overview'],
                'tagline':
                tmdbMovie['tagline'],
                'cast_nomv':
                " ".join(
                    [castMember['name'] for castMember in tmdbMovie['cast']]),
                'directors':
                [director['name'] for director in tmdbMovie['directors']],
                'cast':
                [castMember['name'] for castMember in tmdbMovie['cast']],
                'genres': [genre['name'] for genre in tmdbMovie['genres']],
                'release_date':
                releaseDate,
                'vote_average':
                tmdbMovie['vote_average']
                if 'vote_average' in tmdbMovie else None,
                'vote_count':
                int(tmdbMovie['vote_count'])
                if 'vote_count' in tmdbMovie else None,
            }
        except KeyError as k:  # Ignore any movies missing these attributes
            print(k)
            continue


if __name__ == "__main__":
    solr = pysolr.Solr('http://localhost:8983/solr/tmdb', timeout=100)
    solr.add(indexableMovies())
    solr.commit()
Example no. 24
        'NAME':
        'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME':
        'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME':
        'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]
# Solr URL connection and access
SOLAR_CONFIGURATION = {"URL": "https://52.152.191.13:8983/solr"}
solr_product = pysolr.Solr(
    'https://52.152.191.13:8983/solr/product_information/',
    timeout=10,
    verify=False)
solr_notification_status = pysolr.Solr(
    'https://52.152.191.13:8983/solr/sap_notification_status/',
    timeout=10,
    verify=False)
solr_unstructure_data = pysolr.Solr(
    'https://52.152.191.13:8983/solr/unstructure_processed_data/',
    timeout=10,
    verify=False)
solr_document_variant = pysolr.Solr(
    'https://52.152.191.13:8983/solr/sap_document_variant/',
    timeout=10,
    verify=False)
# Internationalization
# https://docs.djangoproject.com/en/2.1/topics/i18n/
Example no. 25
def solr_load_batch(batch):
    """Process batch."""
    solr = pysolr.Solr(settings.SOLR_URL, always_commit=True)

    solr.add(list(map(build_doc, batch)))
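
`build_doc` is not shown in the excerpt; a hypothetical sketch of such a mapper, assuming the batch items are model-like objects (the field names are illustrative, not from the source):

def build_doc(item):
    # hypothetical: the real mapping depends on the Solr schema
    return {'id': item.id, 'name': item.name}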
Example no. 26
 def make(self, conn_name, *args, **kwargs):
     url = settings.HAYSTACK_CONNECTIONS[conn_name]['URL']
     return pysolr.Solr(url, **kwargs)
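
A sketch of the Haystack-style settings this factory assumes, and a call (the connection name, core name, and the `backend` instance owning `make` are illustrative):

HAYSTACK_CONNECTIONS = {
    'default': {'URL': 'http://localhost:8983/solr/default_core'},
}

solr = backend.make('default', timeout=10)  # extra kwargs pass straight through to pysolr.Solr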
Example no. 27
    def query_network(self, uuid, search_string, max_edges):
        myConst = CX_CONSTANTS

        niceCx = NiceCXNetwork()
        #uuid = '7246d8cf-c644-11e6-b48c-0660b7976219'
        search_terms_dict = {k:1 for k in search_string.split(',')}

        solr = pysolr.Solr(solr_url + uuid + '/', timeout=10)

        try:
            results = solr.search(search_string, rows=10000)
            #search_terms_array = [int(n['id']) for n in results.docs]
            search_terms_array = {int(n['id']):1 for n in results.docs}
            if not search_terms_array:
                return {'message': 'No nodes found'}

            print('starting nodes 1')
            #===================
            # METADATA
            #===================
            available_aspects = []
            for ae in (o for o in self.stream_aspect(uuid, 'metaData')):
                available_aspects.append(ae.get(CX_CONSTANTS.METADATA_NAME))
                mde = MetaDataElement(json_obj=ae)
                niceCx.add_metadata(mde)

            #available_aspects = ['edges', 'nodes'] # TODO - remove this
            opaque_aspects = set(available_aspects).difference(known_aspects_min)

            print(opaque_aspects)

            #===================
            # NODES
            #===================
            if 'nodes' in available_aspects:
                for ae in (o for o in self.stream_aspect(uuid, 'nodes')):
                    if search_terms_array.get(ae.get(CX_CONSTANTS.ID)):
                        add_this_node = NodeElement(cx_fragment=ae)
                        niceCx.create_node(add_this_node)
            else:
                raise Exception('Network does not contain any nodes.  Cannot query')

            print('starting edges 1')
            #===================
            # EDGES
            #===================
            edge_count = 0
            added_edges = 0
            start_time = time.time()
            if 'edges' in available_aspects:
                for ae in (o for o in self.stream_aspect(uuid, 'edges')):
                    if niceCx.nodes.get(ae.get(CX_CONSTANTS.EDGE_SOURCE_NODE_ID_OR_SUBNETWORK)) is not None or niceCx.nodes.get(ae.get(CX_CONSTANTS.EDGE_TARGET_NODE_ID)) is not None:
                        add_this_edge = EdgeElement(cx_fragment=ae)
                        niceCx.create_edge(add_this_edge)
                        added_edges += 1
                    if edge_count % 5000 == 0:
                        print(edge_count)

                    #if edge_count > 30000:
                    #    break

                    if added_edges > max_edges:
                        raise StopIteration('Max edges reached')
                    edge_count += 1
            else:
                raise Exception('Network does not contain any nodes.  Cannot query')

            print('Response time (Edge search): ' + str(time.time() - start_time))
            print('starting nodes 2')
            #===================
            # NODES
            #===================
            for ae in (o for o in self.stream_aspect(uuid, 'nodes')):
                if niceCx.get_missing_nodes().get(ae.get(CX_CONSTANTS.ID)):
                    add_this_node = NodeElement(cx_fragment=ae)
                    niceCx.create_node(add_this_node)

            #====================
            # NETWORK ATTRIBUTES
            #====================
            if 'networkAttributes' in available_aspects:
                for ae in (o for o in self.stream_aspect(uuid, 'networkAttributes')):
                    add_this_network_attribute = NetworkAttributesElement(cx_fragment=ae)
                    niceCx.add_network_attribute(add_this_network_attribute)

            #===================
            # NODE ATTRIBUTES
            #===================
            if 'nodeAttributes' in available_aspects:
                for ae in (o for o in self.stream_aspect(uuid, 'nodeAttributes')):
                    if niceCx.nodes.get(ae.get(CX_CONSTANTS.PROPERTY_OF)):
                        add_this_node_att = NodeAttributesElement(json_obj=ae)
                        niceCx.add_node_attribute(add_this_node_att)

            #===================
            # EDGE ATTRIBUTES
            #===================
            if 'edgeAttributes' in available_aspects:
                for ae in (o for o in self.stream_aspect(uuid, 'edgeAttributes')):
                    if niceCx.edges.get(ae.get(CX_CONSTANTS.PROPERTY_OF)):
                        add_this_edge_att = EdgeAttributesElement(json_obj=ae)
                        niceCx.add_edge_attribute(add_this_edge_att)

            #===================
            # NODE CITATIONS
            #===================
            if 'nodeCitations' in available_aspects:
                for ae in (o for o in self.stream_aspect(uuid, 'nodeCitations')):
                    for e_po in ae.get(CX_CONSTANTS.PROPERTY_OF):
                        if niceCx.get_nodes().get(e_po) is not None:
                            niceCx.add_node_citations_from_cx(ae)

            #===================
            # EDGE CITATIONS
            #===================
            ec_count = 0
            if 'edgeCitations' in available_aspects:
                for ae in (o for o in self.stream_aspect(uuid, 'edgeCitations')):
                    for e_po in ae.get(CX_CONSTANTS.PROPERTY_OF):
                        if niceCx.get_edges().get(e_po) is not None:
                            niceCx.add_edge_citations_from_cx(ae)
                    ec_count += 1
                    if ec_count % 500 == 0:
                        print(ec_count)

            #===================
            # CITATIONS
            #===================
            if 'citations' in available_aspects:
                #======================================================
                # FILTER CITATIONS IF THERE ARE EDGE OR NODE CITATIONS
                # OTHERWISE ADD THEM ALL (NO-FILTER) -- TODO
                #======================================================
                for ae in (o for o in self.stream_aspect(uuid, 'citations')):
                    add_this_citation = CitationElement(cx_fragment=ae)
                    niceCx.add_citation(add_this_citation)

            #===================
            # OPAQUE ASPECTS
            #===================
            for oa in opaque_aspects:
                objects = self.stream_aspect(uuid, oa)
                obj_items = (o for o in objects)
                for oa_item in obj_items:
                    aspect_element = AspectElement(oa_item, oa)
                    niceCx.add_opaque_aspect(aspect_element)

        except SolrError as se:
            if '404' in str(se):
                ndex2.get_logger('SOLR').warning('Network not found ' + self.uuid + ' on ' + solr_url + ' server.')
                raise Exception("Network not found (SOLR)")
            else:
                ndex2.get_logger('SOLR').warning('Network error ' + self.uuid + ' on ' + solr_url + ' server. ' + str(se))
                raise Exception(str(se))
        except StopIteration as si:
            ndex2.get_logger('QUERY').warning("Found more than max edges.  Raising exception")
            raise StopIteration(str(si))


        #nice_cx_json = niceCx.to_cx()

        return niceCx
Example no. 28
import pysolr
import requests
import os
import sqlite3
import psycopg2
import psycopg2.extras
from gensim.parsing import preprocessing

# GLOBALS
conn = psycopg2.connect(
    "dbname=MAG19 user=mag password=1maG$ host=shetland.informatik.uni-freiburg.de"
)
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

solr = pysolr.Solr('http://localhost:8983/solr/arxiv_cs_metadata',
                   always_commit=True)


def search_solr_parse_json(query, collection, search_field):
    """ Searches the arxiv_cs_metadata collection on arxiv_identifier (search_field)
    using the resp. arxiv id as the query, 
    parses the json result and returns it as a list of dictionaries where
    each dictionary corresponds to a record. 
    ARGUMENTS: query, string: each arxiv id
               collection: the Solr collection name (=arxiv_cs_metadata)
               search_field: the Solr field which is queried (=arxiv_identifier)
    RETURNS: docs, list of dicts: the documents (records) returned by Solr 
             AFTER getting the JSON response and parsing it."""
    solr_url = 'http://localhost:8983/solr/' + collection + '/select'
    url_params = {'q': query, 'rows': 1, 'df': search_field}
    solr_response = requests.get(solr_url, params=url_params)
Example no. 29
    #print bibdir
    print '===================================='
    solrinstance.add([bibdir], commit=False)
    
    
    
    # Issue with loading into sh obsids.sh: won't we duplicate entries if we handle overlaps separately? Should we do it just once, or check what has already been loaded, to protect against this BUG?
if __name__=="__main__":
    if len(sys.argv)==2:
        execfile("./default.conf")
    elif len(sys.argv)==3:
        execfile(sys.argv[2])
    else:
        print "Usage: python rdf2solr3.py biblistfile [conffile]"
        sys.exit(-1)
    c=adsrdf.ADSConnection(SESAME, REPOSITORY)
    print "cccccccccccccccccccccccccccccccccccc",c
    #researchpapers=[unquote(e.split('#')[1]) for e in c.getDataByType('cito:ResearchPaper')]
    #h= HTMLParser.HTMLParser()
    researchpapers=[ele.strip() for ele in open(sys.argv[1]).readlines()]
    print researchpapers
    #researchpapers=['2000A&A...359..489C', '2000ApJ...534L..47G', '2000ApJ...536L..27W', '2000ApJ...540L..69S', '2000ApJ...541...49H']
    solr=pysolr.Solr(SOLR)
    #solr=None
    #researchpapers=['2000ApJ...534L..47G', '2009ApJ...692.1143K']
    for ele in researchpapers:
        print "Indexing: ",ele
        putIntoSolr(solr, ele)
        print "-------------"
    solr.commit()
Example no. 30
# If on Python 2.X
from __future__ import print_function

import pysolr
from django.conf import settings

from iati.models import Budget
from solr.budget.indexing import BudgetIndexing
from solr.tasks import BaseTaskIndexing

solr = pysolr.Solr('{url}/{core}'.format(
    url=settings.SOLR.get('url'),
    core=settings.SOLR.get('cores').get('budget')),
                   always_commit=True,
                   timeout=180)


class BudgetTaskIndexing(BaseTaskIndexing):
    indexing = BudgetIndexing
    model = Budget
    solr = solr

    def run_from_activity(self, activity):
        for budget in activity.budget_set.all():
            self.instance = budget
            self.run()