Example #1
0
 def _get_count(self):
     "Returns the total number of objects, across all pages."
     if self._count is None:
         solr = SolrConnection(settings.SOLR) # TODO: maybe keep connection around?
         solr_response = solr.query(self._q, fields=['id'])
         self._count = int(solr_response.results.numFound)
     return self._count
Example #2
0
def word_matches_for_page(page_id, words):
    """
    Return the word forms Solr highlighted on one page for the given words.

    For example, passing 'manufacturer' can give back a list such as
    ['Manufacturer', 'manufacturers', 'MANUFACTURER'].
    An empty list is returned when the response carries no highlighting
    entry for ``page_id``.
    """
    solr = SolrConnection(settings.SOLR)

    # page_id is interpolated into the query string below; coerce it to str
    # first, since mixed types there have raised UnicodeDecodeError before
    # (see ticket #493).
    if not isinstance(page_id, str):
        page_id = str(page_id)

    # The generic "ocr" field plus one field per configured language.
    ocr_fields = ["ocr"] + ["ocr_%s" % lang for lang in settings.SOLR_LANGUAGES]
    field_clauses = [query_join(words, field) for field in ocr_fields]
    q = "id:%s AND (%s)" % (page_id, " OR ".join(field_clauses))

    highlight_params = {
        "hl.snippets": 100,
        "hl.requireFieldMatch": "true",
        "hl.maxAnalyzedChars": "102400",
    }
    response = solr.query(q, fields=["id"], highlight=ocr_fields,
                          **highlight_params)

    if page_id not in response.highlighting:
        return []

    page_highlights = response.highlighting[page_id]
    matched = set()
    for field in ocr_fields:
        if field not in page_highlights:
            continue
        for snippet in page_highlights[field]:
            matched.update(find_words(snippet))
    return list(matched)
Example #3
0
def word_matches_for_page(page_id, words):
    """
    Collect the highlighted word variants Solr found on a single page.

    Given 'manufacturer' the result may look like
    ['Manufacturer', 'manufacturers', 'MANUFACTURER'].
    Returns [] if the response has no highlighting entry for ``page_id``.
    """
    solr = SolrConnection(settings.SOLR)

    # The id ends up inside a formatted query string, so make sure it is a
    # str; other types have triggered UnicodeDecodeError (ticket #493).
    if not isinstance(page_id, str):
        page_id = str(page_id)

    # Search the generic OCR field and every language-specific one.
    ocr_fields = ['ocr']
    for language in settings.SOLR_LANGUAGES:
        ocr_fields.append('ocr_%s' % language)

    per_field = [query_join(words, name) for name in ocr_fields]
    q = 'id:%s AND (%s)' % (page_id, ' OR '.join(per_field))
    hl_options = {
        "hl.snippets": 100,
        "hl.requireFieldMatch": 'true',
        "hl.maxAnalyzedChars": '102400',
    }
    response = solr.query(q, fields=['id'], highlight=ocr_fields, **hl_options)

    if page_id not in response.highlighting:
        return []

    found = set()
    by_field = response.highlighting[page_id]
    for name in ocr_fields:
        if name in by_field:
            for fragment in by_field[name]:
                found.update(find_words(fragment))
    return list(found)
Example #4
0
 def _get_count(self):
     "Returns the total number of objects, across all pages."
     if self._count is None:
         solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
         solr_response = solr.query(self._q, fields=['id'])
         self._count = int(solr_response.results.numFound)
     return self._count
Example #5
0
def oralookup(pid=None, uuid=None, fields_to_return="f_name, f_subject, f_keyphrase, faculty, f_institution, thesis_type, content_type, collection", endpoint="http://ora.ouls.ox.ac.uk:8080/solr/select"):
  """Look up a single record in the Solr index at ``endpoint``.

  ``pid`` takes precedence over ``uuid``. Colons inside ``pid`` are escaped
  because a colon is the field separator in Solr query syntax. The query is
  attempted up to three times, sleeping half a second after each
  ``BadStatusLine``.

  Returns the matching document, or an empty dict when neither identifier is
  given, when every attempt fails, when the response cannot be parsed, or
  when the query does not match exactly one document.
  """
  if pid:
    query = "id:%s" % pid.replace(":", r"\:")
  elif uuid:
    query = r"id:uuid\:%s" % uuid
  else:
    # Nothing to look up; do not even build a connection.
    return {}

  s = SolrConnection(endpoint)

  # Running actual query (3 tries, failover)
  r = None
  for _attempt in range(3):
    try:
      r = s.query(q=query, fields=fields_to_return)
    except BadStatusLine:
      sleep(0.5)
      continue
    logger.debug("Solr response: %s", r.header)
    break

  if r is None:
    # All attempts failed; previously this fell through to an unbound name.
    logger.warning("No response from Solr endpoint %s for query: %s", endpoint, query)
    return {}

  try:
    hits = r.results
    # Explicit check instead of ``assert`` so it still runs under ``-O``.
    if len(hits) != 1:
      logger.warning("Couldn't assert that only a single result was fetched: %s", hits)
      return {}
    return hits[0]
  except ValueError:
    logger.warning("Couldn't parse json response from Solr endpoint: %s", r)
    return {}
Example #6
0
def get_page_text(page):
    """Return the 'ocr' value Solr holds for ``page``.

    The document is looked up by ``page.url``. When the response has no
    non-empty ``results`` list, or the first document has no 'ocr' field,
    ``["Text not available"]`` is returned instead.
    """
    fallback = ["Text not available"]
    response = SolrConnection(settings.SOLR).query('id:"%s"' % page.url)
    docs = getattr(response, 'results', None)
    if not isinstance(docs, list) or len(docs) == 0:
        return fallback
    return docs[0].get('ocr', fallback)
Example #7
0
def get_page_text(page):
    """Fetch the 'ocr' field of the Solr document whose id is ``page.url``.

    Returns ``["Text not available"]`` if the response lacks a non-empty
    ``results`` list or the first document carries no 'ocr' field.
    """
    placeholder = ["Text not available"]
    connection = SolrConnection(settings.SOLR)
    lookup = 'id:"%s"' % page.url
    found = getattr(connection.query(lookup), 'results', None)
    has_docs = isinstance(found, list) and len(found) > 0
    return found[0].get('ocr', placeholder) if has_docs else placeholder
Example #8
0
def similar_pages(page):
    """Return pages issued on the same date, in the same cities, by other titles.

    The Solr query matches documents of type ``page`` with the same issue
    date and any of the cities of this page's title, excluding the title's
    own lccn. Up to 25 rows are requested.

    Returns a list of whatever ``utils.get_page`` yields for each hit. A
    real list is built (rather than a lazy ``map``) so the result can be
    iterated more than once and measured with ``len`` on Python 3 as well.
    """
    solr = SolrConnection(settings.SOLR)
    issued = page.issue.date_issued
    # Zero-padded year, month and day concatenated without separators.
    date = '{0:02d}{1:02d}{2:02d}'.format(issued.year, issued.month, issued.day)

    title = page.issue.title
    cities = [place.city for place in title.places.all()]
    query = '+type:page AND date:%s AND %s AND NOT(lccn:%s)' % (
        date, query_join(cities, 'city'), title.lccn)
    response = solr.query(query, rows=25)

    # Each Solr id is resolved as a URL; its kwargs identify the page.
    return [utils.get_page(**urlresolvers.resolve(result['id']).kwargs)
            for result in response.results]
Example #9
0
def similar_pages(page):
    """Find pages from other titles published the same day in the same cities.

    Builds a Solr query for ``page``-type documents with this page's issue
    date and any city of its title, excluding the title's own lccn, and asks
    for up to 25 rows.

    Returns a list with the result of ``utils.get_page`` for each hit. The
    list is materialised so that on Python 3 callers do not receive a
    single-use ``map`` iterator.
    """
    solr = SolrConnection(settings.SOLR)
    d = page.issue.date_issued
    # Zero-padded year, month and day, joined with no separator.
    date = ''.join('{0:02d}'.format(part) for part in (d.year, d.month, d.day))

    title = page.issue.title
    city_names = [p.city for p in title.places.all()]
    query = '+type:page AND date:%s AND %s AND NOT(lccn:%s)' % (
        date, query_join(city_names, 'city'), title.lccn)
    results = solr.query(query, rows=25).results

    similar = []
    for r in results:
        # The Solr id is a URL path; resolving it yields get_page's kwargs.
        page_kwargs = urlresolvers.resolve(r['id']).kwargs
        similar.append(utils.get_page(**page_kwargs))
    return similar
Example #10
0
def execute_solr_query(query, fields, sort, sort_order, rows, start):
    """Run ``query`` against Solr and return the response object.

    ``sort``, ``sort_order``, ``rows`` and ``start`` are forwarded to the
    Solr query unchanged. ``fields`` is accepted but not used: the request
    always asks for the fixed field list below.
    """
    # TODO: maybe keep connection around?
    connection = SolrConnection(settings.SOLR)
    requested_fields = ['lccn', 'title', 'edition', 'place_of_publication',
                        'start_year', 'end_year', 'language']
    return connection.query(query, fields=requested_fields, rows=rows,
                            sort=sort, sort_order=sort_order, start=start)
Example #11
0
def execute_solr_query(query, fields, sort, sort_order, rows, start):
    """Execute ``query`` on the configured Solr server and return the response.

    Paging (``rows``, ``start``) and ordering (``sort``, ``sort_order``) are
    passed straight through. The ``fields`` argument is currently ignored;
    a hard-coded list of fields is requested instead.
    """
    # NOTE(review): an earlier comment here mentioned that the default
    # arg_separator (underscore) breaks facet fields containing underscores;
    # nothing in this function sets arg_separator - confirm it still applies.
    # TODO: maybe keep connection around?
    solr = SolrConnection(settings.SOLR)
    query_kwargs = {
        'fields': ['lccn', 'title', 'edition', 'place_of_publication',
                   'start_year', 'end_year', 'language'],
        'rows': rows,
        'sort': sort,
        'sort_order': sort_order,
        'start': start,
    }
    return solr.query(query, **query_kwargs)
Example #12
0
def execute_solr_query(query, fields, sort, sort_order, rows, start):
    """Send ``query`` to Solr and hand back the response as-is.

    ``rows``/``start`` select the window and ``sort``/``sort_order`` the
    ordering. ``fields`` is part of the signature but is not consulted; the
    field list requested from Solr is fixed.
    """
    # NOTE(review): a previous comment referred to the default arg_separator
    # (underscore) clashing with facet fields that contain underscores; this
    # function does not set arg_separator - confirm whether that is relevant.
    # TODO: maybe keep connection around?
    conn = SolrConnection(settings.SOLR)
    wanted = [
        'lccn',
        'title',
        'edition',
        'place_of_publication',
        'start_year',
        'end_year',
        'language',
    ]
    response = conn.query(query, fields=wanted, rows=rows, sort=sort,
                          sort_order=sort_order, start=start)
    return response
Example #13
0
    def page(self, number):
        """
        Build page ``number`` of the results directly from Solr.

        Overrides the Paginator's page method because Solr has already
        paginated the results. Each page object found via
        ``models.Page.lookup`` gets a sorted ``words`` list (taken from the
        highlighting snippets) and a ``highlight_url``; results with no
        matching page object are skipped.
        """
        number = self.validate_number(number)

        # figure out the solr query and execute it
        # TODO: maybe keep connection around?
        solr = SolrConnection(settings.SOLR)
        offset = self.per_page * (number - 1)
        highlight_params = {
            "hl.snippets": 100,  # TODO: make this unlimited
            "hl.requireFieldMatch": 'true',  # limits highlighting slop
            "hl.maxAnalyzedChars": '102400',  # increased from default 51200
        }
        sort_field, sort_order = _get_sort(self.query.get('sort'),
                                           in_pages=True)
        result_fields = ['id', 'title', 'date', 'sequence',
                         'edition_label', 'section_label']
        solr_response = solr.query(self._q,
                                   fields=result_fields,
                                   highlight=self._ocr_list,
                                   rows=self.per_page,
                                   sort=sort_field,
                                   sort_order=sort_order,
                                   start=offset,
                                   **highlight_params)

        pages = []
        for doc in solr_response.results:
            hit_page = models.Page.lookup(doc['id'])
            if not hit_page:
                continue

            matched = set()
            snippets_by_field = solr_response.highlighting[doc['id']]
            for field in self._ocr_list:
                # a field may be absent or empty in the highlighting
                for snippet in snippets_by_field.get(field) or []:
                    matched.update(find_words(snippet))
            hit_page.words = sorted(matched, key=lambda w: w.lower())

            # len(pages) is this page's index among the ones kept so far
            hit_page.highlight_url = self.highlight_url(
                hit_page.url, hit_page.words, number, len(pages))
            pages.append(hit_page)

        return Page(pages, number, self)
Example #14
0
def index_titles(since=None):
    """Index the titles (and holdings) modeled in the database into Solr.

    When ``since`` is given, only titles whose ``created`` value is at or
    after it are indexed. Titles are sent in chunks of 500 with a commit
    after each chunk; a title whose ``solr_doc`` raises is logged and
    skipped. Finally, every document returned by the ``+type:title`` query
    whose lccn is no longer in the database is deleted from the index.
    """
    solr = SolrConnection(settings.SOLR)

    queryset = models.Title.objects.all()
    if since:
        queryset = queryset.filter(created__gte=since)

    related = ("languages", "alt_titles", "subjects", "notes", "places",
               "urls", "essays", "country", "holdings")
    queryset = queryset.prefetch_related(*related)

    indexed = 0
    for batch in sliced(queryset, 500):
        batch_docs = []
        for title in batch:
            try:
                batch_docs.append(title.solr_doc)
            except Exception:
                LOGGER.exception("Unable to index title %s", title)

        solr.add_many(batch_docs)

        reset_queries()
        solr.commit()

        indexed += len(batch)
        LOGGER.info("indexed %d titles", indexed)

    known_lccns = set(models.Title.objects.values_list("lccn", flat=True))

    for doc in solr.query("+type:title", fields=["id", "lccn"]):
        doc_id, doc_lccn = doc["id"], doc["lccn"]
        if doc_lccn in known_lccns:
            continue
        LOGGER.warning("Removing stale title %s from the search index",
                       doc_id)
        delete_title(doc_id, solr=solr)

    solr.commit()
Example #15
0
class QuoteResource:
    """CherryPy resource that serves quotes from a local Solr as timeline JSON."""

    def __init__(self):
        self.solr = SolrConnection('http://localhost:8983/solr')

    @cherrypy.expose
    def index(self, callback=None, person=None, topic=None):
        """Return quotes as a UTF-8 encoded JSON timeline.

        ``person`` and ``topic`` optionally narrow the Solr query; 100 rows
        are requested. ``callback`` is accepted but not used. The response
        Content-Type header is set to JSON.
        """
        clauses = ['type:quote']
        if person:
            clauses.append('person_t:%s' % person)
        if topic:
            clauses.append('quote_t:%s' % topic)

        results = self.solr.query(q=' AND '.join(clauses), rows=100)

        timeline = {
            'timeline': {
                'headline': 'OnTheRecord',
                'type': 'default',
                'startDate': '2012,1,1',
                'text': 'We help you track quotations from politicians over time',
            }
        }

        # One timeline entry per Solr hit.
        entries = [
            {
                "startDate": hit['date'].strftime('%Y,%m,%d'),
                "headline": hit['person'],
                "text": '<a href="' + hit['url'] + '">' + hit['title'] + '</a>',
                "asset": {
                    "media": "<blockquote>\"" + hit['quote'] + "\"</blockquote>",
                    "credit": "",
                    "caption": "",
                },
            }
            for hit in results
        ]
        timeline['timeline']['date'] = entries

        cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
        payload = json.dumps(timeline, ensure_ascii=False, indent=4)
        return payload.encode('utf-8')
Example #16
0
    def page(self, number):
        """
        Return page ``number`` of the search results.

        Replaces the Paginator's page method since Solr has already done
        the pagination. Results are mapped to page objects through
        ``models.Page.lookup`` (misses are skipped), and each one receives
        its highlighted ``words`` and a ``highlight_url``.
        """
        number = self.validate_number(number)

        # figure out the solr query and execute it
        # TODO: maybe keep connection around?
        solr = SolrConnection(settings.SOLR)
        first_row = self.per_page * (number - 1)
        hl_options = {
            "hl.snippets": 100,  # TODO: make this unlimited
            "hl.requireFieldMatch": 'true',  # limits highlighting slop
            "hl.maxAnalyzedChars": '102400',  # increased from default 51200
        }
        sort_field, sort_order = _get_sort(self.query.get('sort'), in_pages=True)
        solr_response = solr.query(
            self._q,
            fields=['id', 'title', 'date', 'sequence',
                    'edition_label', 'section_label'],
            highlight=self._ocr_list,
            rows=self.per_page,
            sort=sort_field,
            sort_order=sort_order,
            start=first_row,
            **hl_options)

        pages = []
        for hit in solr_response.results:
            found = models.Page.lookup(hit['id'])
            if not found:
                continue

            unique_words = set()
            highlights = solr_response.highlighting[hit['id']]
            for ocr_field in self._ocr_list:
                # missing or empty highlight entries contribute nothing
                fragments = highlights.get(ocr_field) or []
                for fragment in fragments:
                    unique_words.update(find_words(fragment))
            found.words = sorted(unique_words, key=lambda v: v.lower())

            # len(pages) is the position of this page among those kept
            found.highlight_url = self.highlight_url(found.url, found.words,
                                                     number, len(pages))
            pages.append(found)

        return Page(pages, number, self)
Example #17
0
    def __init__(self, query):
        """Run the title search described by ``query`` against Solr.

        ``query`` is copied onto ``self.query``. Its 'page' and 'rows'
        entries select the result window (defaults 1 and 50 when missing or
        not numeric) and 'sort' selects the ordering. The lccns returned by
        Solr are then loaded as ``models.Title`` objects; lccns with no
        database row are skipped.

        NOTE(review): ``results`` is built but not stored or returned in the
        lines visible here - confirm against the full method.
        """
        self.query = query.copy()

        # figure out the solr query
        q = title_search(self.query)

        # int(None) raises TypeError (parameter absent) and int('abc')
        # raises ValueError; anything else should not be swallowed.
        try:
            page = int(self.query.get('page'))
        except (TypeError, ValueError):
            page = 1

        try:
            rows = int(self.query.get('rows'))
        except (TypeError, ValueError):
            rows = 50
        start = rows * (page - 1)

        # determine sort order
        sort_field, sort_order = _get_sort(self.query.get('sort'))

        # execute query
        # TODO: maybe keep connection around?
        solr = SolrConnection(settings.SOLR)
        solr_response = solr.query(q,
                                   fields=[
                                       'lccn', 'title', 'edition',
                                       'place_of_publication', 'start_year',
                                       'end_year', 'language'
                                   ],
                                   rows=rows,
                                   sort=sort_field,
                                   sort_order=sort_order,
                                   start=start)

        # convert the solr documents to Title models
        # could use solr doc instead of going to db, if performance requires it
        lccns = [d['lccn'] for d in solr_response.results]
        results = []
        for lccn in lccns:
            try:
                title = models.Title.objects.get(lccn=lccn)
                results.append(title)
            except models.Title.DoesNotExist:
                # Indexed in Solr but absent from the database: skip it.
                pass  # TODO: log exception
Example #18
0
    def __init__(self, query):
        """Execute the title search for ``query`` and load the matching titles.

        A copy of ``query`` is kept on ``self.query``. 'page' and 'rows'
        pick the result window, falling back to 1 and 50 when absent or not
        numeric; 'sort' picks the ordering. Each lccn returned by Solr is
        fetched as a ``models.Title``; lccns missing from the database are
        skipped.

        NOTE(review): within the visible lines ``results`` is collected but
        never stored or returned - confirm against the full method.
        """
        self.query = query.copy()

        # figure out the solr query
        q = title_search(self.query)

        # A missing parameter gives int(None) -> TypeError and a non-numeric
        # one gives ValueError; only those two mean "use the default".
        try:
            page = int(self.query.get('page'))
        except (TypeError, ValueError):
            page = 1

        try:
            rows = int(self.query.get('rows'))
        except (TypeError, ValueError):
            rows = 50
        start = rows * (page - 1)

        # determine sort order
        sort_field, sort_order = _get_sort(self.query.get('sort'))

        # execute query
        solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
        solr_response = solr.query(q,
                                   fields=['lccn', 'title',
                                           'edition',
                                           'place_of_publication',
                                           'start_year', 'end_year',
                                           'language'],
                                   rows=rows,
                                   sort=sort_field,
                                   sort_order=sort_order,
                                   start=start)

        # convert the solr documents to Title models
        # could use solr doc instead of going to db, if performance requires it
        lccns = [d['lccn'] for d in solr_response.results]
        results = []
        for lccn in lccns:
            try:
                title = models.Title.objects.get(lccn=lccn)
                results.append(title)
            except models.Title.DoesNotExist:
                # Present in the index but not in the database: skip it.
                pass  # TODO: log exception
Example #19
0
class PersonResource:
    """CherryPy resource that queries the 'person' facet of a local Solr."""

    def __init__(self):
        self.solr = SolrConnection('http://localhost:8983/solr')

    @cherrypy.expose
    def index(self):
        """Print the 'person' facet entries and return a fixed JSON payload.

        The facet values are only written to stdout; the UTF-8 encoded
        response body is a placeholder dict and does not contain them.
        """
        results = self.solr.query('*:*', facet='true', facet_field='person')

        # print(x) with one argument behaves the same on Python 2 and 3.
        for person in results.facet_counts[u'facet_fields'][u'person']:
            print(person)

        cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
        return json.dumps({
            'test': 'test',
            'data': 'data'
        }, ensure_ascii=False, indent=4).encode('utf-8')
Example #20
0
    def setUpClass(cls):
        """Load the MEI test fixture into Solr before the tests run.

        Steps: post one folio document to Solr with curl, convert the MEI
        fixture file into Solr documents, assert that no music-notation
        documents for the fixture siglum exist yet, then add the converted
        documents and commit.
        """
        # First, add a folio to Solr so that the image_uri can be retrieved during the MEI conversion
        # Using curl here because it turned out to be easier than solrconn.add and gives better error messages
        # NOTE: the command line is assembled with str.format and run through
        # the shell; the values come from settings and fixture constants.
        os.system("curl {0}/update/?commit=true -H 'Content-Type: text/xml' -d '<add><doc>\
        <field name=\"id\">testid</field>\
        <field name=\"type\">cantusdata_folio</field>\
        <field name=\"manuscript_id\">{1}</field>\
        <field name=\"number\">{2}</field>\
        <field name=\"image_uri\">{3}</field>\
        </doc></add>'".format(settings.SOLR_SERVER, MEI_FIXTURE_ID, MEI_FIXTURE_FOLIO, MEI_FIXTURE_URI))

        # Materialise the converter output so it can be passed to add_many below.
        docs = list(MEIConverter.process_file(MEI_FIXTURE, MEI_FIXTURE_SIGLUM, MEI_FIXTURE_ID))

        # Sanity check
        solrconn = SolrConnection(settings.SOLR_SERVER)
        prequery = solrconn.query('type:cantusdata_music_notation AND manuscript:' + MEI_FIXTURE_SIGLUM)
        assert prequery.numFound == 0, 'MEI was already in the database when loading the test fixture'

        # Index the converted documents and make them visible to queries.
        solrconn.add_many(docs)
        solrconn.commit()
Example #21
0
# Every query is scoped to one wiki host, so the option is mandatory.
if not options.wiki:
    raise Exception('A wiki is required, passed as host name')

conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr')

# Query clauses, AND-ed together below.
query = ["host:'%s'" % (options.wiki)]

query += ['ns:%d ' % (int(options.namespace))]

if options.start_date or options.end_date:
    # An omitted bound becomes '*', i.e. an open-ended range.
    start = options.start_date + 'T00:00:00.000Z' if options.start_date else '*'
    end = options.end_date + 'T00:00:00.000Z' if options.end_date else '*'
    query += ['created:[%s TO %s]' % (start, end)]

response = conn.query(' AND '.join(query), fields=['html_en', 'nolang_txt', 'html'])
paginator = SolrPaginator(response)

# Single pre-formatted argument: same output on Python 2 and Python 3.
print('%s results to chomp through...' % (paginator.count,))

polarities, subjectivities = [], []

# Report progress about every tenth of the pages. Clamp the step to 1 so
# that fewer than 10 pages no longer causes a modulo-by-zero.
progress_step = max(1, paginator.num_pages // 10)

for page in paginator.page_range:
    for doc in paginator.page(page).object_list:
        # Prefer the English HTML, then the language-neutral text, then raw HTML.
        sent = sentiment.sentiment(doc.get('html_en', doc.get('nolang_txt', doc.get('html'))))
        # Documents scored (0, 0) are left out of both lists.
        if sent == (0, 0):
            continue
        polarities.append(sent[0])
        subjectivities.append(sent[1])
    if page % progress_step == 0:
        print("======== On page %s of %s =======" % (page, paginator.num_pages))
Example #22
0
    "-w", "--wiki", dest="wiki", action="store", default=None, help="Specifies the wiki to perform calculations against"
)
# NOTE(review): no type= is given, so a user-supplied value presumably
# arrives as a string while the default is the int 5 - confirm that
# get_summarized accepts both.
parser.add_option(
    "-n", "--sents", dest="num_sents", action="store", default=5, help="Specifies the number of sentences to write"
)

(options, args) = parser.parse_args()

# An explicit document id wins over a whole-wiki query.
if options.id:
    query = "id:%s" % (options.id)
elif options.wiki:
    query = "host:'%s' AND ns:0" % (options.wiki)
else:
    raise Exception("A wiki  or ID is required, passed as host name")

conn = SolrConnection("http://search-s10.prod.wikia.net:8983/solr")

response = conn.query(query, fields=["html_en", "nolang_txt", "html", "title", "title_en", "id"])
paginator = SolrPaginator(response)

summarizer = SimpleSummarizer()

for page in paginator.page_range:
    for doc in paginator.page(page).object_list:
        # Prefer English fields, then language-neutral ones, then fallbacks.
        text = doc.get("html_en", doc.get("nolang_txt", doc.get("html")))
        title = doc.get("title_en", doc.get("title", doc["id"]))
        summed = summarizer.get_summarized(text, options.num_sents)
        # Each print takes one pre-formatted string so the output is the
        # same on Python 2 and Python 3.
        print("\t\t======= %s =======" % (title,))
        print("\t" + "\n\t".join([sent for sent in summed if not sent.startswith("Contents")]))
        print("\t\t=====================================")
Example #23
0
def search(field, data, path, hlength, mode):
	"""Search the Solr index for ``data`` and print the matching files.

	field   -- 'name' searches the name field, 'txt' the txt field; any
	           other value searches both.
	data    -- the phrase to look for (also used as a regular expression
	           when grepping a file directly).
	path    -- not used in this function.
	hlength -- when truthy (and field is not 'name'), highlighting is
	           requested with this fragment size and the highlighted
	           snippets are printed; it is also the number of context
	           characters shown around a direct file match.
	mode    -- 'slow' pauses for user input after each hit.

	Index entries whose id is not an existing file are deleted from Solr
	in the highlighting branch.
	"""
	from termcolor import colored
	from solr import SolrConnection
	#hlength = int(hlength)
	#search solr, get filePath, do a grep and show the line
	#print 'search'
	s = SolrConnection(SOLR_URL)
	if field == 'name':
		query = 'name:"' + data + '"'
		response = s.query(query)
	elif field == 'txt':
		query = 'txt:"' + data + '"'
		#response = s.query(query, hl=True, hl.q='txt:bandits', hl.fl='txt', hl.fragsize=50, hl.preserveMulti=True, hl.snippets=100)
		if hlength:
			response = s.query(query, fl='id,name', highlight=True, fields = 'txt', hl_q=query, hl_fragsize=hlength, hl_snippets=1000, hl_bs_type = 'SENTENCE')
		else:
			response = s.query(query, fl='id,name')
	else:
		# any other field value: match either the name or the text
		query = 'name:"' + data + '" OR txt:"' + data + '"'
		#response = s.query(query, hl=True, hl.q='txt:bandits', hl.fl='txt', hl.fragsize=50, hl.preserveMulti=True, hl.snippets=100)
		if hlength:
			response = s.query(query, fl='id,name', highlight=True, fields = 'txt', hl_q=query, hl_fragsize=hlength, hl_snippets=1000, hl_bs_type = 'SENTENCE')
		else:
			response = s.query(query, fl='id,name')

	#print query
	#print response.__dict__
	#print response.highlighting
	if hlength and field != 'name':
		hlength = int(hlength)
		# the ids in the highlighting are used as file paths below
		for id in response.highlighting:
			if os.path.isfile(id):
				if response.highlighting[id]:
					for txt in response.highlighting[id]['txt']:
						txt = txt.strip()
						# colour everything from the first <em> to the last </em>,
						# then drop any <em> tags left in between
						startpos = txt.index('<em>')
						endpos = txt.rindex('</em>')
						print (txt[:startpos] + colored(txt[startpos+4:endpos], 'red') + txt[endpos+5:]).replace('<em>', '').replace('</em>', '')
				else:
					# no snippets for this id: read the file and search it directly
					fdata = open(id, 'r').read().decode('raw_unicode_escape').replace('\n',' ').replace('\t',' ')
					fdata = filter(lambda x: x in string.printable, fdata)
					for m in re.finditer( data, fdata ):
						# show hlength characters of context on each side, clamped to the text
						start = m.start()-hlength
						if start < 0 :
							start = 0
						end = m.end() + hlength
						if end > len(fdata):
							end = len(fdata)

						print (fdata[start:m.start()] + colored(fdata[m.start():m.end()], 'red') + fdata[m.end():end]).replace('<em>', '').replace('</em>', '')
				if id.endswith(('.mp3')):
					# audio: in slow mode offer to play it with afplay
					if mode == 'slow':
						x = raw_input('press `y` to play, `n` to move forward \n')
						if x == 'y':
							subprocess.call(["afplay", id])
				else:
					print '\t To open the file press cmd + double click '
					print colored("file://"+id, 'blue')
					print '\n \n'
					if mode == 'slow':
						raw_input('press any key to continue \n')

			else:
				# the file no longer exists: remove its entry from the index
				s.delete_query('id:'+id)
	else:
		# no highlighting requested: just list the hits
		for hit in response.results:
			if hit['id']:
				if hit['id'].endswith(('.mp3')):
					if mode == 'slow':
						x = raw_input('press `y` to play, `n` to move forward \n')
						if x == 'y':
							subprocess.call(["afplay", hit['id']])
				else:
					print '\t To open the file press cmd + double click '
					print colored("file://"+hit['id'], 'blue')
					print '\n \n'
					if mode == 'slow':
						raw_input('press any key to continue \n')
			else:
				# NOTE(review): this branch runs only when hit['id'] is falsy,
				# so the delete query is built from an empty/None id - confirm intent.
				s.delete_query('id:'+hit['id'])
Example #24
0
def title_count():
    """Return Solr's ``numFound`` for the ``type:title`` query."""
    connection = SolrConnection(settings.SOLR)
    response = connection.query('type:title', fields=['id'])
    return response.numFound
Example #25
0
                  dest="limit",
                  action="store",
                  default=None,
                  help="Specifies the document size of the calculation set")

(options, args) = parser.parse_args()

# Every query is scoped to one wiki host, so the option is mandatory.
if not options.wiki:
    raise Exception('A wiki is required, passed as host name')

conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr')

ne = []

query = ["host:'%s'" % (options.wiki), 'ns:0']

# NOTE(review): `limit` is passed through as a query parameter and
# options.limit is not consulted in the visible lines - confirm whether
# rows=options.limit was intended.
response = conn.query(' AND '.join(query),
                      fields=['html_en', 'nolang_txt', 'html'],
                      sort='backlinks desc',
                      limit=100)
paginator = SolrPaginator(response)

for page in paginator.page_range:
    for doc in paginator.page(page).object_list:
        try:
            # Prefer the English HTML, then language-neutral text, then raw HTML.
            text = doc.get('html_en', doc.get('nolang_txt', doc.get('html')))
            corpus = NECorpus(text)
            # print(x) with one argument behaves the same on Python 2 and 3.
            print(corpus.nes())
        except UnicodeEncodeError:
            # Best effort: skip documents that cannot be encoded for output.
            pass
Example #26
0
    "-l",
    "--limit",
    dest="limit",
    action="store",
    default=None,
    help="Specifies the document size of the calculation set",
)

(options, args) = parser.parse_args()

# The wiki host name scopes every query, so it must be supplied.
if not options.wiki:
    raise Exception("A wiki is required, passed as host name")

conn = SolrConnection("http://search-s10.prod.wikia.net:8983/solr")

ne = []

query = ["host:'%s'" % (options.wiki), "ns:0"]

# NOTE(review): `limit` is forwarded as a query parameter while
# options.limit is not used in the visible lines - confirm whether
# rows=options.limit was intended.
response = conn.query(" AND ".join(query), fields=["html_en", "nolang_txt", "html"], sort="backlinks desc", limit=100)
paginator = SolrPaginator(response)

for page in paginator.page_range:
    for doc in paginator.page(page).object_list:
        try:
            # English HTML first, then language-neutral text, then raw HTML.
            text = doc.get("html_en", doc.get("nolang_txt", doc.get("html")))
            corpus = NECorpus(text)
            # Single-argument print(...) is identical on Python 2 and 3.
            print(corpus.nes())
        except UnicodeEncodeError:
            # Best effort: skip documents whose output cannot be encoded.
            pass
Example #27
0
# Every query is scoped to one wiki host, so the option is mandatory.
if not options.wiki:
    raise Exception('A wiki is required, passed as host name')

conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr')

# Query clauses, AND-ed together below.
query = ["host:'%s'" % (options.wiki)]

query += ['ns:%d ' % (int(options.namespace))]

if options.start_date or options.end_date:
    # An omitted bound becomes '*', i.e. an open-ended range.
    start = options.start_date + 'T00:00:00.000Z' if options.start_date else '*'
    end = options.end_date + 'T00:00:00.000Z' if options.end_date else '*'
    query += ['created:[%s TO %s]' % (start, end)]

response = conn.query(' AND '.join(query),
                      fields=['html_en', 'nolang_txt', 'html'])
paginator = SolrPaginator(response)

# Single pre-formatted argument: same output on Python 2 and Python 3.
print('%s results to chomp through...' % (paginator.count,))

polarities, subjectivities = [], []

for page in paginator.page_range:
    for doc in paginator.page(page).object_list:
        sent = sentiment.sentiment(
            doc.get('html_en', doc.get('nolang_txt', doc.get('html'))))
        if (sent == (0, 0)):
            continue
        polarities.append(sent[0])
        subjectivities.append(sent[1])
    if page % int(paginator.num_pages / 10) == 0:
Example #28
0
    def page(self, number):
        """
        Build page ``number`` of the results, with facets, from Solr.

        Overrides the Paginator's page method since Solr has already
        paginated the results. The returned page carries a ``facets`` dict
        (city, county, frequency, language, state and year), and each page
        object gets its highlighted ``words`` and a ``highlight_url``.
        Results without a matching ``models.Page`` are skipped.
        """
        number = self.validate_number(number)

        # figure out the solr query and execute it
        # TODO: maybe keep connection around?
        solr = SolrConnection(settings.SOLR)
        offset = self.per_page * (number - 1)
        params = {
            "hl.snippets": 100,  # TODO: make this unlimited
            "hl.requireFieldMatch": 'true',  # limits highlighting slop
            "hl.maxAnalyzedChars": '102400',  # increased from default 51200
        }
        params.update(self.facet_params)
        sort_field, sort_order = _get_sort(self.query.get('sort'),
                                           in_pages=True)
        result_fields = ['id', 'title', 'date', 'month', 'day',
                         'sequence', 'edition_label', 'section_label']
        solr_response = solr.query(self._q,
                                   fields=result_fields,
                                   highlight=self._ocr_list,
                                   rows=self.per_page,
                                   sort=sort_field,
                                   sort_order=sort_order,
                                   start=offset,
                                   **params)
        solr_facets = solr_response.facet_counts

        # field facets, each ordered by _sort_facets_asc
        facets = {}
        for facet_name in ('city', 'county', 'frequency', 'language', 'state'):
            facets[facet_name] = _sort_facets_asc(solr_facets, facet_name)

        # year facet: latest range first
        year_counts = solr_facets['facet_ranges']['year']['counts']
        facets['year'] = sorted(year_counts.items(),
                                key=lambda pair: pair[0],
                                reverse=True)
        facet_gap = self.facet_params['f_year_facet_range_gap']
        if facet_gap > 1:
            # label each bucket with its full range, e.g. "1900-1909"
            facets['year'] = [
                ('%s-%d' % (first_year, int(first_year) + facet_gap - 1), hits)
                for first_year, hits in facets['year']
            ]

        pages = []
        for doc in solr_response.results:
            hit_page = models.Page.lookup(doc['id'])
            if not hit_page:
                continue

            matched = set()
            snippets_by_field = solr_response.highlighting[doc['id']]
            for field in self._ocr_list:
                # a field may be absent or empty in the highlighting
                for snippet in snippets_by_field.get(field) or []:
                    matched.update(find_words(snippet))
            hit_page.words = sorted(matched, key=lambda w: w.lower())

            hit_page.highlight_url = self.highlight_url(hit_page.url,
                                                        hit_page.words)
            pages.append(hit_page)

        solr_page = Page(pages, number, self)
        solr_page.facets = facets
        return solr_page
Example #29
0
    def page(self, number):
        """
        Build page ``number`` of the results, with facets, from Solr.

        Overrides the Paginator's page method since Solr has already
        paginated the results. All stored fields are requested and
        highlighting is taken from the 'ocr_vector' field only. The returned
        page carries a ``facets`` dict: 'year' sorted ascending, and lccn,
        county, region, city and newspaper_type sorted by hit count,
        highest first. Results without a matching ``models.Page`` are
        skipped.
        """
        number = self.validate_number(number)

        # figure out the solr query and execute it
        # TODO: maybe keep connection around?
        solr = SolrConnection(settings.SOLR)
        start = self.per_page * (number - 1)
        params = {
            "hl.snippets": 100,  # TODO: make this unlimited
            "hl.requireFieldMatch": 'true',  # limits highlighting slop
            "hl.maxAnalyzedChars": '102400',  # increased from default 51200
        }
        params.update(self.facet_params)
        sort_field, sort_order = _get_sort(self.query.get('sort'), in_pages=True)
        solr_response = solr.query(
            self._q,
            fields="*",
            # highlight a single field instead of every entry in self._ocr_list
            highlight='ocr_vector',
            rows=self.per_page,
            sort=sort_field,
            sort_order=sort_order,
            start=start,
            **params)

        solr_facets = solr_response.facet_counts
        facet_fields = solr_facets.get('facet_fields')

        facets = dict()
        facets['year'] = sorted(facet_fields['year'].items())
        # Order by hit count, highest first. ``key``/``reverse`` are passed
        # by keyword: the positional (cmp, key, reverse) form is Python 2
        # only and raises TypeError on Python 3. The result is the same.
        for facet_name in ('lccn', 'county', 'region', 'city', 'newspaper_type'):
            facets[facet_name] = sorted(facet_fields[facet_name].items(),
                                        key=lambda k: k[1],
                                        reverse=True)

        pages = []

        for result in solr_response.results:
            page = models.Page.lookup(result['id'])
            if not page:
                continue
            words = set()
            coords = solr_response.highlighting[result['id']]
            # the field may be absent or empty in the highlighting
            for s in coords.get('ocr_vector') or []:
                words.update(find_words(s))
            page.words = sorted(words, key=lambda v: v.lower())

            # len(pages) is this page's index among the ones kept so far
            page.highlight_url = self.highlight_url(page.url,
                                                    page.words,
                                                    number, len(pages))
            pages.append(page)

        solr_page = Page(pages, number, self)
        solr_page.facets = facets
        return solr_page
Example #30
0
def title_count():
    """Return the ``numFound`` value Solr reports for the query ``type:title``."""
    connection = SolrConnection(settings.SOLR)
    # Only the id field is requested; the hit count is all that is used.
    title_hits = connection.query("type:title", fields=["id"])
    return title_hits.numFound
Example #31
0
    def page(self, number):
        """
        Override the page method in Paginator since Solr has already
        paginated stuff for us.

        ``number`` is the 1-based page number and is passed through
        ``validate_number`` first. The stored query is run against Solr for
        that page only, and a ``Page`` is returned whose objects are the
        ``models.Page`` records looked up for the results (results whose
        lookup returns nothing are left out). Each record gets two extra
        attributes: ``words`` (the words ``find_words`` extracts from the
        highlighting snippets, sorted case-insensitively) and
        ``highlight_url``. The returned page gets a ``facets`` dict with
        ``state``, ``county`` and ``year`` entries.
        """
        number = self.validate_number(number)

        # figure out the solr query and execute it
        solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
        start = self.per_page * (number - 1)
        params = {
            "hl.snippets": 100,  # TODO: make this unlimited
            "hl.requireFieldMatch": 'true',  # limits highlighting slop
            "hl.maxAnalyzedChars": '102400',  # increased from default 51200
        }
        params.update(self.facet_params)
        sort_field, sort_order = _get_sort(self.query.get('sort'), in_pages=True)
        solr_response = solr.query(self._q,
                                   fields=['id', 'title', 'date', 'month', 'day',
                                           'sequence', 'edition_label',
                                           'section_label'],
                                   highlight=self._ocr_list,
                                   rows=self.per_page,
                                   sort=sort_field,
                                   sort_order=sort_order,
                                   start=start,
                                   **params)
        solr_facets = solr_response.facet_counts

        # NOTE: sorted() is called with key=/reverse= keywords rather than
        # positional cmp/key/reverse arguments. The ordering is the same, but
        # the keyword form also works on Python 3, where cmp no longer exists.

        # sort states and counties by number of hits (desc)
        facets = {
            'state': sorted(solr_facets.get('facet_fields')['state'].items(),
                            key=lambda item: item[1], reverse=True),
            'county': sorted(solr_facets.get('facet_fields')['county'].items(),
                             key=lambda item: item[1], reverse=True),
        }
        # sort by year (desc)
        facets['year'] = sorted(solr_facets['facet_ranges']['year']['counts'].items(),
                                key=lambda item: int(item[0]), reverse=True)
        facet_gap = self.facet_params['f_year_facet_range_gap']
        if facet_gap > 1:
            # each bucket spans several years: label it "first-last"
            facets['year'] = [('%s-%d' % (y[0], int(y[0]) + facet_gap - 1), y[1])
                              for y in facets['year']]

        pages = []
        for result in solr_response.results:
            page = models.Page.lookup(result['id'])
            if not page:
                # Solr returned a hit that the lookup could not resolve
                continue
            words = set()
            coords = solr_response.highlighting[result['id']]
            for ocr in self._ocr_list:
                # a field may be absent (or empty) in the highlighting data
                for snippet in coords.get(ocr) or []:
                    words.update(find_words(snippet))
            page.words = sorted(words, key=lambda v: v.lower())

            # len(pages) is this page's index among the records kept so far
            page.highlight_url = self.highlight_url(page.url,
                                                    page.words,
                                                    number, len(pages))
            pages.append(page)

        solr_page = Page(pages, number, self)
        solr_page.facets = facets
        return solr_page
Example #32
0
# Build the Solr query from the parsed options: an optional host restriction
# (options.wiki) followed by optional extra query terms (options.query).
# At least one of the two must be given.
if not (options.wiki or options.query):
    raise Exception('A wiki is required, passed as host name')

# Collect the parts first so that a query given without a wiki works too
# (previously `query` was unbound in that case and `+=` raised NameError).
query_parts = []
if options.wiki:
    query_parts.append('host:%s' % (options.wiki))
if options.query:
    query_parts.append(options.query)
query = ' '.join(query_parts)

# Second path component of the output directories: the wiki host when one
# was given, otherwise this process's PID.
specifier = options.wiki if options.wiki else str(os.getpid())

conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr')

# Single-argument call form prints the same text on Python 2 and Python 3.
print(query)

response = conn.query(query, fields=['html_en','nolang_txt','html', 'title', 'title_en', 'id'], rows=100)
paginator = SolrPaginator(response)

def initialize_dir(page):
    """
    Make sure the output directory for ``page`` exists and return its path.

    The directory is ``<options.dest>/<specifier>/<page>/``; each level is
    created in turn if it is missing. The returned string ends with a
    trailing slash.

    Raises ``OSError`` if a level cannot be created and does not already
    exist as a directory.
    """
    fullpath = ''
    for path in (options.dest, specifier, str(page)):
        fullpath += path + '/'
        try:
            os.mkdir(fullpath)
        except OSError:
            # Creating first and checking afterwards avoids the gap between
            # an exists() check and mkdir(), during which another process
            # could create the same directory. An existing directory is fine;
            # anything else is a real error.
            if not os.path.isdir(fullpath):
                raise
    return fullpath

for page in paginator.page_range:
    pagedir = initialize_dir(page)
    lockfilepath = pagedir+'/LOCK'
    with open(lockfilepath, 'w') as lockfile:
Example #33
0
def title_count():
    """Return the ``numFound`` value Solr reports for the query ``type:title``."""
    # Only the id field is requested; the hit count is all that is used.
    return SolrConnection(settings.SOLR).query(
        'type:title',
        fields=['id'],
    ).numFound
Example #34
0
                  "--wiki",
                  dest="wiki",
                  action="store",
                  default=None,
                  help="Specifies the wiki to perform calculations against")

(options, args) = parser.parse_args()

# Select what to analyse: a single document by id, or the documents of one
# wiki host restricted to ns:0.
if options.id:
    query = 'id:%s' % (options.id)
elif options.wiki:
    query = "host:'%s' AND ns:0" % (options.wiki)
else:
    raise Exception('A wiki  or ID is required, passed as host name')

conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr')

response = conn.query(
    query, fields=['html_en', 'nolang_txt', 'html', 'title', 'title_en', 'id'])
paginator = SolrPaginator(response)

tool = ReadabilityTool(lang='eng')

# Run the readability report on every document of every result page.
for page in paginator.page_range:
    for doc in paginator.page(page).object_list:
        # Field preference order: html_en, then nolang_txt, then html.
        text = doc.get('html_en', doc.get('nolang_txt', doc.get('html')))
        if text is None:
            # None of the text fields came back for this document, so there
            # is nothing to report on (and .encode() below would fail).
            continue
        # Title preference order: title_en, then title, then the id.
        title = doc.get('title_en', doc.get('title', doc['id']))
        # Single-argument print calls give the same output on Python 2 and 3.
        print("======= %s =======" % (title,))
        tool.getReportAll(text.encode('utf8'))
        print("=====================================")