Ejemplo n.º 1
0
 def delete(self, id):
     '''
     Delete the document identified by id from the Solr endpoint built by
     appending 'article' to self.solr_server_url, then commit.

     @see: IArticleSearchProvider.delete()
     '''
     endpoint = 'http://%s%s' % (self.solr_server_url, 'article')
     solr = SolrInterface(endpoint)
     # The id is always sent to Solr in its string form.
     solr.delete(str(id))
     solr.commit()
Ejemplo n.º 2
0
    def update(self, metaInfo, metaData):
        '''
        Build one Solr document from metaInfo/metaData, add it to the Solr
        endpoint named after metaData.Type and commit.

        @see: ISearchProvider.update()
        '''
        solr = SolrInterface(
            'http://%s%s' % (self.solr_server_url, metaData.Type))

        document = {
            "MetaInfoId": metaInfo.Id,
            "MetaDataId": metaData.Id,
            "languageId": metaInfo.Language,
        }

        # custom processing on some fields: only the year of the creation
        # date is stored under this key
        if hasattr(metaInfo, 'CreationDate') and getattr(metaInfo, 'CreationDate'):
            document['CreationData_Year'] = metaInfo.CreationDate.year

        # For every schema field take the first truthy value, looking at
        # metaInfo before metaData; fields with no truthy value are left out.
        for name in solr.schema.fields:
            for source in (metaInfo, metaData):
                if hasattr(source, name) and getattr(source, name):
                    document[name] = getattr(source, name)
                    break

        solr.add(document)
        solr.commit()
Ejemplo n.º 3
0
 def delete(self, idMetaInfo, metaType):
     '''
     Delete the document identified by idMetaInfo from the Solr endpoint
     built by appending metaType to self.solr_server_url, then commit.

     @see: ISearchProvider.delete()
     '''
     endpoint = 'http://%s%s' % (self.solr_server_url, metaType)
     solr = SolrInterface(endpoint)
     # The id is always sent to Solr in its string form.
     solr.delete(str(idMetaInfo))
     solr.commit()
Ejemplo n.º 4
0
def make_connection():
    """Return a SolrInterface for the URL configured as 'adhocracy.solr.url'.

    The setting defaults to 'http://localhost:8983/solr/'. Surrounding
    whitespace is stripped and a trailing slash is appended when missing.
    A fresh Http() object is passed as the HTTP connection.
    """
    base_url = config.get('adhocracy.solr.url',
                          'http://localhost:8983/solr/').strip()
    if not base_url.endswith('/'):
        base_url += '/'
    return SolrInterface(base_url, http_connection=Http())
Ejemplo n.º 5
0
 def run(self, solr_id):
     """ Delete one record on Solr and commit the deletion.

     The Solr location is read from ``self.backend_record.location`` and
     encoded as UTF-8 before it is handed to SolrInterface.

     :param solr_id: identifier of the record to delete
     :return: the message 'Record %s deleted on Solr', passed through
              ``_()`` and formatted with ``solr_id``
     """
     location = self.backend_record.location.encode('utf-8')
     solr = SolrInterface(location)  # TODO auth
     solr.delete(solr_id)
     solr.commit()
     message = _('Record %s deleted on Solr')
     return message % solr_id
Ejemplo n.º 6
0
def get_sunburnt_connection():
    """Return a SolrInterface opened with mode='r' for the configured URL.

    The URL is read from the pylons setting 'adhocracy.solr.url' (default
    'http://localhost:8983/solr/'), stripped of surrounding whitespace and
    given a trailing slash when it has none.
    """
    from pylons import config

    configured = config.get('adhocracy.solr.url',
                            'http://localhost:8983/solr/')
    solr_url = configured.strip()
    solr_url = solr_url if solr_url.endswith('/') else solr_url + '/'
    return SolrInterface(solr_url, http_connection=Http(), mode='r')
Ejemplo n.º 7
0
Archivo: index.py Proyecto: alkadis/vcv
def make_connection():
    """Return a SolrInterface for the URL configured as 'adhocracy.solr.url'.

    The setting defaults to 'http://localhost:8983/solr/'; it is stripped
    and given a trailing slash when missing. When the boolean setting
    'adhocracy.force_no_http_proxy' is set, the Http object is created
    with proxy_info=None.
    """
    base_url = config.get_string('adhocracy.solr.url',
                                 'http://localhost:8983/solr/').strip()
    if not base_url.endswith('/'):
        base_url += '/'
    if config.get_bool('adhocracy.force_no_http_proxy'):
        http_options = {'proxy_info': None}
    else:
        http_options = {}
    return SolrInterface(base_url, http_connection=Http(**http_options))
Ejemplo n.º 8
0
 def add(self, url, doc):
     """ Buffer ``doc`` for the Solr server at ``url``, sending and committing in batches.

     Each URL has one pool entry ``[SolrInterface, counter, pending docs]``.
     Whenever the counter is a multiple of NUMBER_OF_DOCS_PER_ADD the pending
     documents are sent with ``add()``; once the counter exceeds
     NUMBER_OF_DOCS_PER_COMMIT a commit is issued and the counter restarts at 0.

     :param url: Solr URL, also used as the pool key
     :param doc: document to export; must contain an ``'id'`` key
     :return: the message 'Record exported with ID %s on SolR.', passed
              through ``_()`` and formatted with ``doc['id']``
     """
     pool = self.__solr_pool
     entry = pool.get(url)
     if not entry:
         entry = pool[url] = [SolrInterface(url), 0, []]
     entry[2].append(doc)
     entry[1] += 1
     interface, counter = entry[0], entry[1]
     # NOTE: Solr itself will also auto-commit after some time
     if counter % NUMBER_OF_DOCS_PER_ADD == 0:
         interface.add(entry[2])
         entry[2] = []
     if counter > NUMBER_OF_DOCS_PER_COMMIT:
         interface.commit()
         entry[1] = 0
     return _('Record exported with ID %s on SolR.') % doc['id']
Ejemplo n.º 9
0
def main():
	"""Attach speaker metadata to the senate and house speeches found in Solr.

	For each chamber the matching speech ids are fetched page by page; for
	every id get_speaker_metadata() is asked for a partial document, and the
	non-empty results of a page are sent with update_solr2() followed by
	commit_solr(). Progress is printed to stdout.
	"""
	solr_url = "http://politicalframing.com:8983/solr/collection1"
	h = httplib2.Http(cache="/var/tmp/solr_cache")
	si = SolrInterface(url = solr_url, http_connection = h)

	page_size = 10000
	# (chamber value used in the Solr query and the printed label,
	#  chamber value handed to get_speaker_metadata)
	chambers = (('senate', 'Senate'), ('house', 'House'))

	for solr_chamber, metadata_chamber in chambers:
		numFound = si.query(chamber=solr_chamber).paginate(rows=0, start=0).execute().result.numFound
		# Single-argument print(...) behaves the same under Python 2 and 3.
		print("-----------------------")
		print("Number of Speeches about Topic X in " + solr_chamber + " " + str(numFound))

		# Ceiling division on integers. The previous math.ceil(numFound/10000)
		# floored first under Python 2, which skipped the last partial page
		# (and every document when fewer than page_size matched).
		page_count = (numFound + page_size - 1) // page_size
		for i in range(page_count):
			current_speeches = si.query(chamber=solr_chamber).field_limit(["id"]).paginate(rows=page_size, start=page_size*i).execute().result.docs
			json_documents = []
			for speech in current_speeches:
				partial_document = get_speaker_metadata(id=speech['id'], chamber=metadata_chamber)

				if partial_document:
					print(speech['id'] + " queued to be ingested")
					json_documents.append(partial_document)

			# Send every non-empty batch; the previous "> 1" test dropped a
			# batch that contained exactly one document.
			if json_documents:
				json_doc_list_string, body = update_solr2(json_documents)
				print(len(json_documents))
				print(body)
				print(commit_solr())
Ejemplo n.º 10
0
    def processQuery(self, session, scheme, qa=None, qi=None, qd=None):
        '''
        Build the solr query for the received REST query objects.

        Each of qa, qi and qd that is given is folded into the query by
        buildSolrQuery(), which also receives the shared list of OR clauses.
        Any collected OR clauses are combined with | and added as a single
        query term. The finished query is passed through buildShards()
        together with the types to search: qa.type.values when qa carries a
        type criteria, otherwise all values of queryIndexer.typesByMetaData.
        '''
        solr = SolrInterface('http://%sother' % self.solr_server_url)

        typesByMetaData = self.queryIndexer.typesByMetaData
        types = [typesByMetaData[key] for key in typesByMetaData.keys()]

        solrQuery = None
        orClauses = []

        if qa is not None:
            assert isinstance(qa, QMetaDataInfo), 'Invalid query %s' % qa
            solrQuery = buildSolrQuery(solr, solrQuery, qa, orClauses)
            if QMetaDataInfo.type in qa:
                types = qa.type.values

        # qi and qd are handled identically, in this order.
        for criteria in (qi, qd):
            if criteria is not None:
                solrQuery = buildSolrQuery(solr, solrQuery, criteria, orClauses)

        if orClauses:
            combined = None
            for clause in orClauses:
                combined = (combined | clause) if combined else clause

            if solrQuery is None:
                solrQuery = solr.query(combined)
            else:
                solrQuery = solrQuery.query(combined)

        if solrQuery is None:
            solrQuery = solr.query()

        return buildShards(solrQuery, self.solr_server_url, types)
Ejemplo n.º 11
0
# getCollectionsFromSolr20140919 revised to cover several cases and to report progress
#  VT CS4984, Computational Linguistics, by Xuan Zhang, Tarek Kanan, Edward Fox
import os

from sunburnt import SolrInterface

# Sunburnt interface to the Solr server that is queried below.
si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr")

# This is where you put the event name. It is split on whitespace further
# down, and every word becomes its own `event` query term.

eventQuery = "Brazil_NightClub_Fire"
# Example of an event name that contains spaces:
# eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files (not used by the code visible here).
# Windows example:
#root = 'D:\Test\EventCollections\SmallCollections'
# Or, for a Mac, use something like
root = '/Users/mzamani/Documents/CS4984/Unit2/Brazil_NightClub_Fire'

# Create and execute a Solr query: start with the first word of the event
# name and chain one further .query(event=...) call per remaining word.
words = eventQuery.split()
query = si.query(event=words[0])
for w in words[1:]:
    query = query.query(event=w)
response = query.execute()
# Written out by hand, the chained form for a name with spaces looks like:
#  response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()
# Number of matching documents as reported by Solr.
tot = response.result.numFound

# Report how many documents matched the event name.
print tot, "documents found in collection [", eventQuery, "]\n"
Ejemplo n.º 12
0
from sunburnt import SolrInterface
import sys

si = SolrInterface("http://nick.dlib.vt.edu:8080/solr")

# Event name to export, taken from the first command-line argument.
eventQuery = sys.argv[1]

# The first query is only used to learn how many documents match, so that
# the second query can request all of them in a single page.
response = si.query(event=eventQuery).execute()
tot = response.result.numFound
response = si.query(event=eventQuery).field_limit(["content"]).paginate(0, tot).execute()
# Single-argument print(...) behaves the same under Python 2 and 3.
print(response.result.numFound)

# Write the content of each document to 1.txt, 2.txt, ... in the current
# directory. The with-block closes the file even if encoding or writing fails.
for i, res in enumerate(response, 1):
    with open(str(i) + ".txt", "w") as f:
        f.write(res['content'].encode("utf-8"))

# NOTE(review): nothing is added or deleted above, so this commit looks
# unnecessary -- kept to preserve the original behavior; confirm and remove.
si.commit()
Ejemplo n.º 13
0
def get_solr(url=None):
    """ Return a `SolrInterface` for `url`.

    When `url` is None the value of the `kotti_solr.solr_url` setting
    (looked up through `get_settings()`) is used instead.
    """
    solr_url = get_settings()['kotti_solr.solr_url'] if url is None else url
    return SolrInterface(solr_url)
Ejemplo n.º 14
0
This repo has a YAML file we might be able to use
https://github.com/unitedstates/congress-legislators

It seems the original parser has been improved (or maybe just migrated)
https://github.com/unitedstates/congressional-record/blob/master/congressionalrecord/fdsys/cr_parser.py

"""

import httplib2
from sunburnt import SolrInterface
from dateutil import parser
from datetime import datetime

# Module-level Solr connection, created once at import time.
solr_url = "http://politicalframing.com:8983/solr"  # alternative, e.g. for a local instance: "http://localhost:8983/solr/"
# httplib2 client created with /var/tmp/solr_cache as its cache location.
h = httplib2.Http(cache="/var/tmp/solr_cache")
si = SolrInterface(url=solr_url, http_connection=h)


def get_speeches(rows, start, dabool, **kwargs):
    """Collect search filters for speeches from the keyword arguments.

    Keyword arguments read by the visible body (all optional):
      speech_id  -- copied to query['id']
      phrase     -- copied to query['speaking']
      congress   -- copied to query['congress']
      start_date -- parsed with dateutil; defaults to datetime(1994, 1, 1)
      end_date   -- parsed with dateutil; defaults to datetime.now()

    NOTE(review): the body visible here stops after normalizing the dates.
    rows, start, dabool and neg_query are never used and nothing is
    returned, so the remainder of the function appears to be missing --
    confirm against the original source.
    """
    query = {}
    neg_query = {}

    # Only truthy values become query filters.
    if kwargs.get('speech_id'): query['id'] = kwargs['speech_id']
    if kwargs.get('phrase'): query['speaking'] = kwargs['phrase']
    if kwargs.get('congress'): query['congress'] = kwargs['congress']

    # Replace the date strings in kwargs with datetime objects, falling back
    # to 1994-01-01 and the current time when a bound is missing or empty.
    kwargs['start_date'] = parser.parse(
        kwargs['start_date']) if kwargs.get('start_date') else datetime(
            1994, 1, 1)
    kwargs['end_date'] = parser.parse(
        kwargs['end_date']) if kwargs.get('end_date') else datetime.now()
# speaker_raw values that every query below excludes.
_EXCLUDED_SPEAKERS = (
    "recorder",
    "the presiding officer",
    "the vice president",
    "the speaker pro tempore",
    "the acting president pro tempore",
)

# Number of speeches requested from Solr per page.
_PAGE_SIZE = 100000

# Shared middle part of the progress messages.
_FILTER_LABEL = (
    " without a Speaker Party that isn't recorder, the presiding officer,"
    " vice president, or the speaker pro tempore,"
    " or the acting president pro tempore")


def _unattributed(si, **criteria):
    """Return a query for `criteria` that excludes speaker_party="*" and every speaker_raw in _EXCLUDED_SPEAKERS."""
    query = si.query(**criteria).exclude(speaker_party="*")
    for speaker in _EXCLUDED_SPEAKERS:
        query = query.exclude(speaker_raw=speaker)
    return query


def _count_unattributed(si, **criteria):
    """Return numFound for the _unattributed() query without fetching any rows."""
    return _unattributed(si, **criteria).sort_by("speaker_raw").paginate(
        rows=0, start=0).execute().result.numFound


def _ingest_chamber(si, chamber):
    """Look up speaker metadata for one chamber's matching speeches and send it to Solr page by page."""
    numFound = _count_unattributed(si, chamber=chamber)
    print("-----------------------")
    print("Number of Speeches in " + chamber + _FILTER_LABEL + "   " +
          str(numFound))

    # Ceiling division on integers. The previous math.ceil(numFound / 100000)
    # floored first under Python 2, which skipped the last partial page (and
    # every document when fewer than _PAGE_SIZE matched).
    page_count = (numFound + _PAGE_SIZE - 1) // _PAGE_SIZE
    # NOTE(review): documents updated here presumably stop matching the
    # query once they carry a speaker_party, so offset-based paging may skip
    # documents on later pages -- confirm.
    for i in range(page_count):
        current_speeches = _unattributed(si, chamber=chamber).field_limit(
            ["id", "speaker_raw", "congress", "date"]).sort_by(
                "speaker_raw").paginate(
                    rows=_PAGE_SIZE,
                    start=_PAGE_SIZE * i).execute().result.docs
        json_documents = []
        for speech in current_speeches:
            partial_document = get_speaker_metadata(
                id=speech['id'],
                date=speech['date'],
                congress=speech['congress'],
                speaker=speech['speaker_raw'],
                chamber=chamber)

            print(speech['id'])
            if partial_document:
                print(speech['speaker_raw'] + " queued to be ingested")
                json_documents.append(partial_document)

        # Send every non-empty batch; the previous "> 1" test dropped a
        # batch that contained exactly one document.
        if json_documents:
            json_doc_list_string, body = update_solr2(json_documents)
            print(len(json_documents))
            print(body)
            print(commit_solr())


def main():
    """Report and back-fill speeches in Solr that match the exclusion filter.

    First prints how many speeches match overall and per chamber (Senate,
    House, Extensions) plus the sum of the three chamber counts. Then, for
    each chamber, fetches the matching speeches page by page, asks
    get_speaker_metadata() for a partial document per speech and sends the
    non-empty results with update_solr2() followed by commit_solr().
    """
    solr_url = "http://politicalframing.com:8983/solr"
    h = httplib2.Http(cache="/var/tmp/solr_cache")
    si = SolrInterface(url=solr_url, http_connection=h)

    # Single-argument print(...) behaves the same under Python 2 and 3.
    totalNumFound = _count_unattributed(si, **{"*": "*"})
    print("Number of Speeches in Solr" + _FILTER_LABEL + " " +
          str(totalNumFound))

    # The amount of trailing padding differs between these summary lines in
    # the original output and is reproduced unchanged.
    senateNumFound = _count_unattributed(si, chamber='Senate')
    print("Number of Speeches in Senate" + _FILTER_LABEL + "  " +
          str(senateNumFound))

    houseNumFound = _count_unattributed(si, chamber='House')
    print("Number of Speeches in House" + _FILTER_LABEL + "   " +
          str(houseNumFound))

    extensionsNumFound = _count_unattributed(si, chamber='Extensions')
    print("Number of Speeches in Extensions" + _FILTER_LABEL + "   " +
          str(extensionsNumFound))

    print("Sum: " + str(senateNumFound + houseNumFound + extensionsNumFound))

    print("-----------------------")
    print("-----------------------")

    for chamber in ('Senate', 'House', 'Extensions'):
        _ingest_chamber(si, chamber)