Example #1
    def processQuery(self, session, scheme, qa=None, qi=None, qd=None):
        '''
        Creates the solr query based on received REST queries
        '''

        si = SolrInterface('http://%sother' % self.solr_server_url)
        types = list(self.queryIndexer.typesByMetaData.values())

        solrQuery = None
        orClauses = []

        if qa is not None:
            assert isinstance(qa, QMetaDataInfo), 'Invalid query %s' % qa
            solrQuery = buildSolrQuery(si, solrQuery, qa, orClauses)
            if QMetaDataInfo.type in qa: types = qa.type.values

        if qi is not None:
            solrQuery = buildSolrQuery(si, solrQuery, qi, orClauses)

        if qd is not None:
            solrQuery = buildSolrQuery(si, solrQuery, qd, orClauses)

        if orClauses:
            extend = None
            for clause in orClauses:
                if extend: extend = extend | clause
                else: extend = clause

            if solrQuery is None: solrQuery = si.query(extend)
            else: solrQuery = solrQuery.query(extend)

        if solrQuery is None: solrQuery = si.query()
        solrQuery = buildShards(solrQuery, self.solr_server_url, types)

        return solrQuery
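For context, the orClauses folding above leans on sunburnt's Q objects, which combine with | for OR and & for AND. A minimal standalone sketch of the same fold, assuming a placeholder URL and field values that are not part of the original code:

from sunburnt import SolrInterface

si = SolrInterface("http://localhost:8983/solr")  # placeholder URL

# OR together an arbitrary list of Q clauses, as processQuery does
clauses = [si.Q(type="image"), si.Q(type="video"), si.Q(type="audio")]
combined = None
for clause in clauses:
    combined = combined | clause if combined is not None else clause

results = si.query(combined).execute()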
Example #2
    def _do_search(self, sort_by=None):
        si = SolrInterface(self.solr_endpoint)

        searchquery = si.Q(*[si.Q(s) for s in self.searchable])
        query = si.query(searchquery).field_limit(score=True)

        realm_query = self._build_realm_filter(si)
        if realm_query:
            query = query.filter(realm_query)

        author_query = self._build_author_filter(si)
        if author_query:
            query = query.filter(author_query)

        trac_query = self._build_trac_filter(si)
        if trac_query:
            query = query.filter(trac_query)

        for field in sort_by or []:
            query = query.sort_by(field)

        query = query.paginate(start=self.page_start, rows=self.page_size) \
                     .highlight('oneline',
                                **{'simple.pre': '<span class="highlight">',
                                   'simple.post': '</span>',
                                   'snippets': 3,
                                   'fragsize': 600})

        # boosting - super hacky, but sunburnt does not support bq
        options = query.options()
        options['bq'] = ['realm:ticket^999','status:new^100', 'status:assigned^100',
                         'status:reopened^999', 'status:reviewing^100',
                         'status:accepted^100','(*:* -xxx)^999']
        result = query.interface.search(**options)
        return query.transform_result(result, dict)
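A note on the boosting workaround above: query.options() returns the raw parameter dict sunburnt is about to send to Solr, so parameters the library has no builder method for (here bq, Solr's boost-query parameter) can be injected by hand before calling query.interface.search(**options) directly; transform_result then maps the raw response back onto plain dicts.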
Example #3
import math

import httplib2
from sunburnt import SolrInterface

# get_speaker_metadata, update_solr2 and commit_solr are helpers defined
# elsewhere in the surrounding project.
def main():
	solr_url = "http://politicalframing.com:8983/solr/collection1"
	h = httplib2.Http(cache="/var/tmp/solr_cache")
	si = SolrInterface(url = solr_url, http_connection = h)

	# chamber = 'Senate'
	# print commit_solr()

	numFound = si.query(chamber='senate').paginate(rows=0, start=0).execute().result.numFound
	print "-----------------------"
	print "Number of Speeches about Topic X in senate " + str(numFound)
	for i in range(0, int(math.ceil(numFound / 10000.0))):
		current_speeches = si.query(chamber='senate').field_limit(["id"]).paginate(rows=10000, start=10000*i).execute().result.docs
		json_documents = []
		for j, speech in enumerate(current_speeches):
			partial_document = get_speaker_metadata(id=speech['id'], chamber='Senate')

			if partial_document:
				print speech['id'] + " queued to be ingested"
				json_documents.append(partial_document)

		if len(json_documents) > 1:
			json_doc_list_string, body = update_solr2(json_documents)
			print len(json_documents)
			print body
			print commit_solr()

	numFound = si.query(chamber='house').paginate(rows=0, start=0).execute().result.numFound
	print "-----------------------"
	print "Number of Speeches about Topic X in house " + str(numFound)
	for i in range(0, int(math.ceil(numFound / 10000.0))):
		current_speeches = si.query(chamber='house').field_limit(["id"]).paginate(rows=10000, start=10000*i).execute().result.docs
		json_documents = []
		for j, speech in enumerate(current_speeches):
			partial_document = get_speaker_metadata(id=speech['id'], chamber='House')

			if partial_document:
				print speech['id'] + " queued to be ingested"
				json_documents.append(partial_document)

		if len(json_documents) > 1:
			json_doc_list_string, body = update_solr2(json_documents)
			print len(json_documents)
			print body
			print commit_solr()
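Both chamber loops above use the same paging pattern; a generic sketch of it, where iter_docs and page_size are illustrative names rather than part of the original script:

import math

def iter_docs(si, page_size=10000, **params):
    # one rows=0 query for the count, then walk the result set page by page
    total = si.query(**params).paginate(rows=0, start=0).execute().result.numFound
    for i in range(0, int(math.ceil(total / float(page_size)))):
        page = si.query(**params).field_limit(["id"]).paginate(
            rows=page_size, start=page_size * i).execute().result.docs
        for doc in page:
            yield doc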
Example #4
def _retrieve_address_from_history(original_source):
    server = endpoints.solr + '/dealschrome/geodata'
    solr = SolrInterface(server)
    res = solr.query(id=original_source).execute()
    if len(res):
        ll = str(res[0]['latlng'][0]) + ',' + str(res[0]['latlng'][1])
        determined_source = res[0]['determined_source']
    else:
        ll = None
        determined_source = None
    return ll, determined_source
Example #6
si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr")

# This is where you put the event name

eventQuery = "Brazil_NightClub_Fire"
# Commented out lines support the special handling when there are spaces in the event name.
# eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files
#root = 'D:\Test\EventCollections\SmallCollections'
# Or, for a Mac, use something like
root = '/Users/mzamani/Documents/CS4984/Unit2/Brazil_NightClub_Fire'

# Create and execute a Solr query
words = eventQuery.split()
query = si.query(event=words[0])
for w in words[1:]:
    query = query.query(event=w)
response = query.execute()
# Or, for the case of spaces in the name:
#  response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()
tot = response.result.numFound

#print response.result.numFound
print tot, "documents found in collection [", eventQuery, "]\n"
print "Retrieving documents...\n"

response = si.query(event=eventQuery).paginate(0, tot).execute()
# Or, for the case of spaces in the name:
#  response = si.query(event="Connecticut").query(event="School").query(event="shooting").paginate(0,tot).execute()
Example #7
def search_tiles(request, collection_id=None, version=None, variant=None):
    results = None
    solr_metric = None
    subset_query = None
    query = None
    ed_class = None
    ed_level = None

    if request.GET:
        #if 'level' in request.GET:
        ed_level = request.GET.get('level')
        #if 'class' in request.GET:
        ed_class = request.GET.get('class')
        if 'q' in request.GET:
            query = remove_quotation_marks(request.GET.get('q'))
        if query:
            subset = ''
            if collection_id:
                subset = str(collection_id) + '/' + str(version) + '/' + str(variant).split('-')[0]
            page = request.GET.get('p')
            start_time = time.clock()

            setting = solr_switcher()

            if subset and not (ed_level and ed_class):
                subset_query = "%s*" % subset

            query_metric = QueryMetric(request)
            register_user_query(query_metric)
            request_metric = RequestMetric(request, query_hash=query_metric.get_md5())
            register_client_metric(request_metric)
            solr_metric = SolrMetric(request_hash=request_metric.get_md5(), query_hash=query_metric.get_md5())

            try:
                solr_interface = SolrInterface(setting.SOLR_MAIN_URL)
                solr_interface.conn.request_handler_name('search')
                results = solr_interface.query(solr_interface.Q('"' + query + '"'))
                if subset_query:
                    results = results.filter(collectionid=subset_query)
                if ed_level:
                    results = results.filter(collection_school_type_code=ed_level)
                if ed_class:
                    results = results.filter(collection_ep_class=ed_class)

                results = results.filter(published=True)
            except SolrError as e:
                solr_metric.solr_error = e

        if results is not None:
            pages = Paginator(results, PAGINATION_ROWS)

            try:
                results = pages.page(page)
            except PageNotAnInteger:
                results = pages.page(1)
            except EmptyPage:
                results = pages.page(pages.num_pages)
            results.num_pages = pages.num_pages
            results.total_count = pages._count
            results.processing_time = time.clock() - start_time
            if solr_metric:
                if results.number < pages.num_pages:
                    solr_metric.next_page = results.number + 1
                solr_metric.num_pages = pages.num_pages
                if results.number > 1:
                    solr_metric.prev_page = results.number - 1
                solr_metric.page = results.number
                solr_metric.total_count = pages._count
                solr_metric.processing_time = results.processing_time
            # get post and pre pages
            max_count = min(pages.num_pages, results.number + PAGES_OFFSET)
            results.post_pages = range(results.number + 1, max_count + 1)
            results.pre_pages = [x for x in range(results.number - PAGES_OFFSET, results.number) if x > 0]

            if solr_metric:
                solr_metric.request_time = time.clock() - start_time
                register_solr_metric(solr_metric)
    else:
        results = None

    return render(request, 'search_tiles.html', {'results': results,
                                                 'solr_metric': solr_metric,
                                                 'query': query,
                                                 'collection_id': collection_id,
                                                 'variant': variant,
                                                 'version': version,
                                                 'level': ed_level,
                                                 'class': ed_class,
                                                 'chosen_education_level': ed_level,
                                                 'chosen_level': ed_class})
Example #8
from sunburnt import SolrInterface
import sys

si = SolrInterface("http://nick.dlib.vt.edu:8080/solr")

eventQuery = sys.argv[1]

response = si.query(event=eventQuery).execute()
tot = response.result.numFound
response = si.query(event=eventQuery).field_limit(["content"]).paginate(0, tot).execute()
docs = {}
print response.result.numFound
i = 1
for res in response:
    f = open(str(i) + ".txt", "w")
    f.write(res['content'].encode("utf-8"))
    f.close()
    i += 1
si.commit()
Example #9
def dataset_import_data(dataset_id):
    """
    Import a dataset into Solr.
    """
    from redd.models import Dataset

    log = logging.getLogger('redd.tasks.dataset_import_data')
    log.info('Beginning import, dataset_id: %i' % dataset_id)

    dataset = Dataset.objects.get(id=dataset_id)

    solr = SolrInterface(settings.SOLR_ENDPOINT)
    #solr_fields = []

    #for h, t in dataset.schema:
    #    if t == 'NoneType':
    #        solr_fields.append(None)
    #    else:
    #        solr_fields.append('%s_%s' % (h, t.__name__))
        
    reader = CSVKitReader(open(dataset.data_upload.get_path(), 'r'))
    reader.next()  # skip the header row

    add_buffer = []
    normal_type_exceptions = []

    for i, row in enumerate(reader, start=1):
        data = {}

        typing="""for t, header, field, value in izip(normal_types, headers, solr_fields, row):
         try:
                value = normalize_column_type([value], normal_type=t)[1][0]
            except InvalidValueForTypeException:
                # Convert exception to row-specific error
                normal_type_exceptions.append(InferredNormalFalsifiedException(i, header, value, t))
                continue

            # No reason to send null fields to Solr (also sunburnt doesn't like them) 
            if value == None:
                continue

            if t in [unicode, bool, int, float]:
                if value == None:
                    continue

                data[field] = value
            elif t == datetime:
                data[field] = value.isoformat()
            elif t == date:
                pass
            elif t == time:
                pass
            else:
                # Note: if NoneType should never fall through to here 
                raise TypeError('Unexpected normal type: %s' % t.__name__)"""

        # If we've had a normal type exception, don't bother doing the rest of this
        if not normal_type_exceptions:
            data = {
                'id': uuid4(),
                'dataset_id': dataset.id,
                'row': i,
                'full_text': '\n'.join(row),
                'csv_data': json.dumps(row)
            }

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(add_buffer)
                add_buffer = []

    if add_buffer:
        solr.add(add_buffer)
        add_buffer = []
    
    if not normal_type_exceptions:
        solr.commit()
    else:
        # Rollback pending changes
        solr.delete(queries=solr.query(dataset_id=dataset.id))
        
        for e in normal_type_exceptions:
            print e 

    log.info('Finished import, dataset_id: %i' % dataset_id)
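The buffered indexing in the loop above is the reusable core of this task; reduced to essentials it looks like the sketch below, where rows, make_doc and solr are placeholder names and SOLR_ADD_BUFFER_SIZE comes from the surrounding module:

add_buffer = []
for i, row in enumerate(rows, start=1):
    add_buffer.append(make_doc(row))
    if i % SOLR_ADD_BUFFER_SIZE == 0:
        # a full batch: flush it to Solr in one add
        solr.add(add_buffer)
        add_buffer = []
if add_buffer:
    # flush the final, partial batch
    solr.add(add_buffer)
solr.commit()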
Example #10
class SolrBackend(Component):
    implements(ISearchBackend)

    UNIQUE_ID = "unique_id"

    HIGHLIGHTABLE_FIELDS = {
        "unique_id" : True,
        "id" : True,
        "type" : True,
        "product" : True,
        "milestone" : True,
        "author" : True,
        "component" : True,
        "status" : True,
        "resolution" : True,
        "keywords" : True,
        "summary" : True,
        "content" : True,
        "changes" : True,
        "owner" : True,
        "repository" : True,
        "revision" : True,
        "message" : True,
        "name" : True
        }

    server_url = Option(
            BHSEARCH_CONFIG_SECTION,
            'solr_server_url',
            doc="""Url of the server running Solr instance.""",
            doc_domain='bhsearch')

    def __init__(self):
        self.solr_interface = SolrInterface(str(self.server_url))

    def add_doc(self, doc, operation_context=None):
        self._reformat_doc(doc)
        doc[self.UNIQUE_ID] = self._create_unique_id(doc.get("product", ''),
                                                doc["type"], doc["id"])
        self.solr_interface.add(doc)
        self.solr_interface.commit()

    def delete_doc(self, product, doc_type, doc_id, operation_context=None):
        unique_id = self._create_unique_id(product, doc_type, doc_id)
        self.solr_interface.delete(unique_id)

    def optimize(self):
        self.solr_interface.optimize()

    def query(
            self, query, query_string, sort=None, fields=None,
            filter=None, facets=None, pagenum=1, pagelen=20,
            highlight=False, highlight_fields=None, context=None):

        if not query_string:
            query_string = "*.*"

        final_query_chain = self._create_query_chain(query, query_string)
        solr_query = self.solr_interface.query(final_query_chain)
        faceted_solr_query = solr_query.facet_by(facets)
        highlighted_solr_query = faceted_solr_query.highlight(
                                    self.HIGHLIGHTABLE_FIELDS)

        start = pagelen * (pagenum - 1)
        paginated_solr_query = highlighted_solr_query.paginate(
                            start=start, rows=pagelen)
        results = paginated_solr_query.execute()

        mlt, hexdigests = self.query_more_like_this(paginated_solr_query,
                                                    fields="type", mindf=1,
                                                    mintf=1)

        query_result = self._create_query_result(highlighted_solr_query,
                                                 results, fields, pagenum,
                                                 pagelen)
        return query_result, mlt, hexdigests

    def query_more_like_this(self, query_chain, **kwargs):
        mlt_results = query_chain.mlt(**kwargs).execute().more_like_these
        mlt_dict = {}
        hexdigests = {}

        for doc, results in mlt_results.iteritems():
            hexdigest = hashlib.md5(doc).hexdigest()
            hexdigests[doc] = hexdigest

            for mlt_doc in results.docs:
                if doc not in mlt_dict:
                    mlt_dict[doc] = [self._process_doc(mlt_doc)]
                else:
                    mlt_dict[doc].append(self._process_doc(mlt_doc))

        return mlt_dict, hexdigests

    def _process_doc(self, doc):
        ui_doc = dict(doc)

        if doc.get('product'):
            env = ProductEnvironment(self.env, doc['product'])
            product_href = ProductEnvironment.resolve_href(env, self.env)
            ui_doc["href"] = product_href(doc['type'], doc['id'])
        else:
            ui_doc["href"] = self.env.href(doc['type'], doc['id'])

        ui_doc['title'] = str(doc['type'] + ": " + doc['_stored_name']).title()

        return ui_doc

    def _create_query_result(
                        self, query, results, fields, pagenum, pagelen):
        total_num, total_page_count, page_num, offset = \
                    self._prepare_query_result_attributes(query, results,
                                                          pagenum, pagelen)

        query_results = QueryResult()
        query_results.hits = total_num
        query_results.total_page_count = total_page_count
        query_results.page_number = page_num
        query_results.offset = offset

        docs = []
        highlighting = []

        for retrieved_record in results:
            result_doc = self._process_record(fields, retrieved_record)
            docs.append(result_doc)

            result_highlights = dict(retrieved_record['solr_highlights'])
            highlighting.append(result_highlights)

        query_results.docs = docs
        query_results.highlighting = highlighting

        return query_results

    def _create_query_chain(self, query, query_string):
        tokens = set(re.findall(r'[\w\*]+', query_string))

        final_query_chain = None
        for token in tokens:
            token_query_chain = self._search_fields_for_token(token)
            if final_query_chain is None:
                final_query_chain = token_query_chain
            else:
                final_query_chain |= token_query_chain

        return final_query_chain

    def _process_record(self, fields, retrieved_record):
        result_doc = dict()
        if fields:
            for field in fields:
                if field in retrieved_record:
                    result_doc[field] = retrieved_record[field]
        else:
            for key, value in retrieved_record.items():
                result_doc[key] = value

        for key, value in result_doc.iteritems():
            result_doc[key] = self._from_whoosh_format(value)

        return result_doc

    def _from_whoosh_format(self, value):
        if isinstance(value, datetime):
            value = utc.localize(value)
        return value

    def _prepare_query_result_attributes(
                                    self, query, results, pagenum, pagelen):
        results_total_num = query.execute().result.numFound
        total_page_count = int(ceil(results_total_num / float(pagelen)))
        pagenum = min(total_page_count, pagenum)

        offset = (pagenum-1) * pagelen
        if (offset+pagelen) > results_total_num:
            pagelen = results_total_num - offset

        return results_total_num, total_page_count, pagenum, offset

    def is_index_outdated(self):
        return False

    def recreate_index(self):
        return True

    @contextmanager
    def start_operation(self):
        yield

    def _search_fields_for_token(self, token):
        q_chain = None
        field_boosts = DefaultQueryParser(self.env).field_boosts

        for field, boost in field_boosts.iteritems():
            if field != 'query_suggestion_basket' and field != 'relations':
                field_token_dict = {field: token}
                if q_chain is None:
                    q_chain = self.solr_interface.Q(**field_token_dict)**boost
                else:
                    q_chain |= self.solr_interface.Q(**field_token_dict)**boost

        return q_chain

    def _reformat_doc(self, doc):
        for key, value in doc.items():
            if key is None:
                del doc[None]
            elif value is None:
                del doc[key]
            elif isinstance(value, basestring) and value == "":
                del doc[key]
            else:
                doc[key] = self._to_whoosh_format(value)

    def _to_whoosh_format(self, value):
        if isinstance(value, basestring):
            value = unicode(value)
        elif isinstance(value, datetime):
            value = self._convert_date_to_tz_naive_utc(value)
        return value

    def _convert_date_to_tz_naive_utc(self, value):
        if value.tzinfo:
            utc_time = value.astimezone(utc)
            value = utc_time.replace(tzinfo=None)
        return value

    def _create_unique_id(self, product, doc_type, doc_id):
        if product:
            return u"%s:%s:%s" % (product, doc_type, doc_id)
        else:
            return u"%s:%s" % (doc_type, doc_id)
Example #12
import os

from sunburnt import SolrInterface

si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr")

# This is where you put the event name

# eventQuery = "Typhoon Haiyan"
eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files
root = r'D:\Test\EventCollections\SmallCollections'

#response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()
response = si.query(text="west africa").execute()

#response = si.query(event=eventQuery).execute()

tot = response.result.numFound
print tot
#response = si.query(event=eventQuery).field_limit(["content"]).paginate(0,tot).execute()

response = si.query(event=eventQuery).paginate(0, tot).execute()

docs = {}

print response.result.numFound

i = 1
Example #13
import math

import httplib2
from sunburnt import SolrInterface

# get_speaker_metadata, update_solr2 and commit_solr are the same project
# helpers used in Example #3.
def main():
    solr_url = "http://politicalframing.com:8983/solr"
    h = httplib2.Http(cache="/var/tmp/solr_cache")
    si = SolrInterface(url=solr_url, http_connection=h)

    totalNumFound = si.query(**{
        "*": "*"
    }).exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(
        speaker_raw="the presiding officer").exclude(
            speaker_raw="the vice president").exclude(
                speaker_raw="the speaker pro tempore").exclude(
                    speaker_raw="the acting president pro tempore").sort_by(
                        "speaker_raw").paginate(
                            rows=0, start=0).execute().result.numFound
    print "Number of Speeches in Solr without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str(
        totalNumFound)

    senateNumFound = si.query(chamber='Senate').exclude(
        speaker_party="*").exclude(speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "Number of Speeches in Senate without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore  " + str(
        senateNumFound)

    houseNumFound = si.query(chamber='House').exclude(
        speaker_party="*").exclude(speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "Number of Speeches in House without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(
        houseNumFound)

    extensionsNumFound = si.query(chamber='Extensions').exclude(
        speaker_party="*").exclude(speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "Number of Speeches in Extensions without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(
        extensionsNumFound)

    print "Sum: " + str(senateNumFound + houseNumFound + extensionsNumFound)

    print "-----------------------"
    print "-----------------------"

    numFound = si.query(chamber='Senate').exclude(speaker_party="*").exclude(
        speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "-----------------------"
    print "Number of Speeches in Senate without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(
        numFound)
    for i in range(0, int(math.ceil(numFound / 100000.0))):
        current_speeches = si.query(chamber='Senate').exclude(
            speaker_party="*").exclude(speaker_raw="recorder").exclude(
                speaker_raw="the presiding officer").exclude(
                    speaker_raw="the vice president").exclude(
                        speaker_raw="the speaker pro tempore").exclude(
                            speaker_raw="the acting president pro tempore"
                        ).field_limit(
                            ["id", "speaker_raw", "congress",
                             "date"]).sort_by("speaker_raw").paginate(
                                 rows=100000,
                                 start=100000 * i).execute().result.docs
        json_documents = []
        for j, speech in enumerate(current_speeches):
            partial_document = get_speaker_metadata(
                id=speech['id'],
                date=speech['date'],
                congress=speech['congress'],
                speaker=speech['speaker_raw'],
                chamber='Senate')

            print speech['id']
            if partial_document:
                print speech['speaker_raw'] + " queued to be ingested"
                json_documents.append(partial_document)

        if len(json_documents) > 1:
            json_doc_list_string, body = update_solr2(json_documents)
            print len(json_documents)
            print body
            print commit_solr()

    numFound = si.query(chamber='House').exclude(speaker_party="*").exclude(
        speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "-----------------------"
    print "Number of Speeches in House without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(
        numFound)
    for i in range(0, int(math.ceil(numFound / 100000.0))):
        current_speeches = si.query(chamber='House').exclude(
            speaker_party="*").exclude(speaker_raw="recorder").exclude(
                speaker_raw="the presiding officer").exclude(
                    speaker_raw="the vice president").exclude(
                        speaker_raw="the speaker pro tempore").exclude(
                            speaker_raw="the acting president pro tempore"
                        ).field_limit(
                            ["id", "speaker_raw", "congress",
                             "date"]).sort_by("speaker_raw").paginate(
                                 rows=100000,
                                 start=100000 * i).execute().result.docs
        json_documents = []
        for j, speech in enumerate(current_speeches):
            partial_document = get_speaker_metadata(
                id=speech['id'],
                date=speech['date'],
                congress=speech['congress'],
                speaker=speech['speaker_raw'],
                chamber='House')

            print speech['id']
            if partial_document:
                print speech['speaker_raw'] + " queued to be ingested"
                json_documents.append(partial_document)

        if len(json_documents) > 1:
            json_doc_list_string, body = update_solr2(json_documents)
            print len(json_documents)
            print body
            print commit_solr()

    numFound = si.query(chamber='Extensions').exclude(
        speaker_party="*").exclude(speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "-----------------------"
    print "Number of Speeches in Extensions without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(
        numFound)
    for i in range(0, int(math.ceil(numFound / 100000.0))):
        current_speeches = si.query(chamber='Extensions').exclude(
            speaker_party="*").exclude(speaker_raw="recorder").exclude(
                speaker_raw="the presiding officer").exclude(
                    speaker_raw="the vice president").exclude(
                        speaker_raw="the speaker pro tempore").exclude(
                            speaker_raw="the acting president pro tempore"
                        ).field_limit(
                            ["id", "speaker_raw", "congress",
                             "date"]).sort_by("speaker_raw").paginate(
                                 rows=100000,
                                 start=100000 * i).execute().result.docs
        json_documents = []
        for j, speech in enumerate(current_speeches):
            partial_document = get_speaker_metadata(
                id=speech['id'],
                date=speech['date'],
                congress=speech['congress'],
                speaker=speech['speaker_raw'],
                chamber='Extensions')

            print speech['id']
            if partial_document:
                print speech['speaker_raw'] + " queued to be ingested"
                json_documents.append(partial_document)

        if len(json_documents) > 1:
            json_doc_list_string, body = update_solr2(json_documents)
            print len(json_documents)
            print body
            print commit_solr()
Example #15
import os

from sunburnt import SolrInterface

si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr")

# This is where you put the event name 

# eventQuery = "Typhoon Haiyan"
eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files
root = 'CollectionSmall'

response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()

#response = si.query(event=eventQuery).execute()

tot = response.result.numFound
print tot
#response = si.query(event=eventQuery).field_limit(["content"]).paginate(0,tot).execute()

response = si.query(event=eventQuery).paginate(0, tot).execute()

docs = {}

print response.result.numFound

i = 1

directory = root + "/"
eventQuery = "Texas_Fertilizer_Plant_Explosion"

#these are the query lists for Team A
eventQueryList = ["Texas_Fertilizer_Plant_Explosion", "Rain_at_Islip"]
# Commented out lines support the special handling when there are spaces in the event name.
# eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files
#root = 'D:\Test\EventCollections\SmallCollections'
# Or, for a Mac, use something like
#someone needs to change this part
root = '../Unit3/output'

# Create and execute a Solr query
words = eventQuery.split()
query = si.query(event=words[0])
for w in words[1:]:
    query = query.query(event=w)
response = query.execute()
# Or, for the case of spaces in the name:
#  response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()
tot = response.result.numFound

#print response.result.numFound
print tot, "documents found in collection [", eventQuery, "]\n"
print "Retrieving documents...\n"

response = si.query(event=eventQuery).paginate(0, tot).execute()
# Or, for the case of spaces in the name:
#  response = si.query(event="Connecticut").query(event="School").query(event="shooting").paginate(0,tot).execute()
Example #16
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from sunburnt import SolrInterface

# ScoreItem and format_date come from the surrounding Scrapy project.
class ScoreSpider(CrawlSpider):
    name = "score"
    allowed_domains = ["matchendirect.fr"]
    start_urls = ["http://www.matchendirect.fr/hier/"]
    rules = [
        Rule(
            SgmlLinkExtractor(allow=(r"/live-score/[a-z0-9\-]+\.html$", r"/foot-score/[a-z0-9\-]+\.html$")),
            "parse_score",
        )
    ]

    # init solr instance
    def __init__(self, *args, **kwargs):
        super(ScoreSpider, self).__init__(*args, **kwargs)
        self.si = SolrInterface("http://*****:*****")  # credentials masked in the source

    # called on the start url
    # collect the day's finished games, league table by league table
    # (the lines between the Solr URL and the row loop were lost when the
    # credentials above were masked; the league selector below is a placeholder)
    def parse(self, response):
        sel = Selector(response)
        docs = []
        for league in sel.xpath('...'):  # placeholder: league-heading selector
            table = league.xpath('following-sibling::table[@class="tableau"][1]')
            rows = table.xpath("tr")
            for row in rows:
                # if match has started & is finished
                scoring = row.xpath('td[@class="lm4"]/a[not(span)]/text()').extract()
                isPlaying = row.xpath('td[@class="lm2_1"]').extract()
                if scoring and not isPlaying:
                    score = ScoreItem()
                    score["id"] = "http://www.matchendirect.fr" + row.xpath('td[@class="lm4"]/a/@href').extract().pop()
                    score["host"] = row.xpath('td[@class="lm3"]/a/text()').extract().pop()
                    score["visitor"] = row.xpath('td[@class="lm5"]/a/text()').extract().pop()

                    scoringArr = scoring.pop().split(" - ")
                    score["scorehost"] = int(scoringArr[0])
                    score["scorevisitor"] = int(scoringArr[1])
                    if score["scorehost"] > score["scorevisitor"]:
                        score["winner"] = score["host"]
                    elif score["scorehost"] < score["scorevisitor"]:
                        score["winner"] = score["visitor"]

                    leagueArr = league.xpath("a[1]/text()").extract().pop().split(" : ")
                    score["country"] = leagueArr[0]
                    score["league"] = leagueArr[1]

                    docs.append(dict(score))

        # index crawled games
        self.si.add(docs)
        self.si.commit()

    # called on followed urls
    # get game details (goal scorer & time)
    def parse_score(self, response):
        sel = Selector(response)
        # if match has started & is finished
        scorehost = sel.xpath('//div[@id="match_score"]/div[@class="col2"]/text()').extract().pop().strip()
        scorevisitor = sel.xpath('//div[@id="match_score"]/div[@class="col3"]/text()').extract().pop().strip()
        isPlaying = sel.xpath('//div[@id="match_entete_2"]/img').extract()

        if scorehost and scorevisitor and not isPlaying:
            score = ScoreItem()

            # get already indexed data
            solr_doc = self.si.query(id=response.url).execute()
            if list(solr_doc):
                doc = solr_doc[0]
            else:
                doc = {}
                score["id"] = response.url

            # get goals
            table = sel.xpath('//table[@class="tableau match_evenement"]')
            rows = table.xpath("tr")
            score["goalscorershost"], score["goalscorersvisitor"], score["goaltimeshost"], score["goaltimesvisitor"] = (
                [],
                [],
                [],
                [],
            )
            score["penaltytimeshost"], score["penaltytimesvisitor"], score["ogtimeshost"], score["ogtimesvisitor"] = (
                [],
                [],
                [],
                [],
            )
            for row in rows:
                tdgoalhost = row.xpath('td[@class="c1" and span[@class="ico_evenement1"]]')
                tdpenaltyhost = row.xpath('td[@class="c1" and span[@class="ico_evenement2"]]')
                tdowngoalhost = row.xpath('td[@class="c1" and span[@class="ico_evenement7"]]')
                tdgoalvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement1"]]')
                tdpenaltyvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement2"]]')
                tdowngoalvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement7"]]')
                tdgoalhost = tdgoalhost or tdpenaltyhost or tdowngoalhost
                tdgoalvisitor = tdgoalvisitor or tdpenaltyvisitor or tdowngoalvisitor
                if tdgoalhost:
                    time = tdgoalhost.xpath('following-sibling::td[@class="c2"][1]/text()').extract().pop().rstrip("'")
                    if tdpenaltyhost:
                        score["penaltytimeshost"].append(time)
                    elif tdowngoalhost:
                        score["ogtimeshost"].append(time)
                    score["goaltimeshost"].append(time)
                    score["goalscorershost"].append(tdgoalhost.xpath("a/text()").extract().pop())
                elif tdgoalvisitor:
                    time = (
                        tdgoalvisitor.xpath('preceding-sibling::td[@class="c2"][1]/text()').extract().pop().rstrip("'")
                    )
                    if tdpenaltyvisitor:
                        score["penaltytimesvisitor"].append(time)
                    elif tdowngoalvisitor:
                        score["ogtimesvisitor"].append(time)
                    score["goaltimesvisitor"].append(time)
                    score["goalscorersvisitor"].append(tdgoalvisitor.xpath("a/text()").extract().pop())

            # get time, referee & stadium
            matchinfos = sel.xpath('//table[@id="match_entete_1"]/tr/td[@class="info"]/text()').extract()
            matchinfos.pop()
            matchinfos = [x.lstrip("\n\t\r") for x in matchinfos]
            if u"Arbitre : - " in matchinfos:
                matchinfos.remove(u"Arbitre : - ")
            date = format_date(matchinfos[0])
            time = matchinfos[1].split(" ")[-1].replace("h", ":") + ":00"
            score["date"] = "%sT%sZ" % (date, time)
            if len(matchinfos) >= 3:
                score["stadium"] = matchinfos[2]
                if len(matchinfos) == 4:
                    score["referee"] = matchinfos[3].split(" : ")[1]

            # index all the data
            doc = dict(doc.items() + dict(score).items())
            self.si.add(doc)
            self.si.commit()