def solr_reindex(self):
    """
    Reindex all entries. Used when switching to/from "private" status.
    """
    solr_conn = SolrConnection(settings.SOLR_URL)
    # Start by deleting 'em all
    solr_conn.delete_query('user:%s' % self.user.id)
    entries = Entry.objects.filter(user=self.user)
    docs = []
    # Arbitrary assignment of a constant, here.
    SLICE_SIZE = 50
    slices = [x for x in range(entries.count()) if x % SLICE_SIZE == 0]
    for s in slices:
        entry_slice = entries[s:s + SLICE_SIZE]
        for entry in entry_slice:
            docs.append(entry.solr_doc)
            if len(docs) == SLICE_SIZE:
                try:
                    solr_conn.add_many(docs)
                except:
                    # should log appropriately, huh
                    pass
                del docs
                docs = []
    # Don't miss the leftovers
    solr_conn.add_many(docs)
    solr_conn.commit()
    solr_conn.optimize()
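# A minimal standalone sketch of the batching pattern the reindexing examples
# above and below rely on: collect documents into fixed-size slices, send each
# slice with a single add_many() call, and commit once at the end. It assumes
# solrpy's SolrConnection (which these examples appear to use) and an iterable
# of Solr-ready dicts; the names add_in_batches and BATCH_SIZE are
# illustrative, not taken from the code above.
from solr import SolrConnection

BATCH_SIZE = 50

def add_in_batches(solr_url, docs):
    """Send docs to Solr in fixed-size batches, committing once at the end."""
    conn = SolrConnection(solr_url)
    batch = []
    for doc in docs:
        batch.append(doc)
        if len(batch) >= BATCH_SIZE:
            conn.add_many(batch)
            batch = []
    if batch:
        # Don't miss the leftovers
        conn.add_many(batch)
    conn.commit()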
class Command(BaseCommand):
    user_option = optparse.make_option(
        '--user',
        action='store',
        dest='user',
        help='name of user whose entries to purge')
    option_list = BaseCommand.option_list + (user_option,)
    help = "index all or user-specific entries in solr"
    args = 'an optional username'

    def handle(self, *args, **options):
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print "indexing user"
            self.index_entries(user=options['user'])
        else:
            print 'indexing everything'
            self.index_entries()
        print 'committing'
        self.solr.commit()
        print 'optimizing'
        self.solr.optimize()

    def index_entries(self, user=''):
        counter = 0
        entries = m.Entry.objects.all()
        if user:
            entries = entries.filter(user__username=user)
        docs = []
        print 'entry count:', entries.count()
        SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY
        slices = [x for x in range(entries.count()) if x % SLICE_SIZE == 0]
        for s in slices:
            print 'indexing %s to %s...' % (s, s + SLICE_SIZE)
            entry_slice = entries[s:s + SLICE_SIZE]
            for entry in entry_slice:
                counter += 1
                docs.append(entry.solr_doc)
                if len(docs) == MAX_DOCS_PER_ADD:
                    try:
                        self.solr.add_many(docs)
                    except:
                        print 'BAD RECORD:', [d['id'] for d in docs]
                    del docs
                    docs = []
            reset_queries()
            if counter % (COMMIT_FREQUENCY * MAX_DOCS_PER_ADD) == 0:
                print 'committing at count:', counter
                self.solr.commit()
        # Don't miss the leftovers
        self.solr.add_many(docs)
def search(self, **kwargs):
    query = kwargs['q']
    api_key = "aac5b38a36513510000ef3286494fc6d"
    url = urllib2.urlopen("http://tinysong.com/s/%s/?format=json&key=%s"
                          % (urllib2.quote(query), api_key))
    response = json.loads(url.read())

    # TODO: Remove redundancy between results and tracks?
    results = []
    tracks = []
    for song in response:
        source_id = 'grooveshark'

        result = {
            'artist': song['ArtistName'],
            'album': song['AlbumName'],
            'title': song['SongName'],
            'sources': [
                {'sourceid': source_id, 'trackid': '%s' % song['SongID']}
            ]
        }
        results.append(result)

        track = {
            'id': 'track_%s_%s' % (source_id, song['SongID']),
            'type': 'track',
            'track_title': song['SongName'],
            'track_artist': song['ArtistName'],
            'track_album': song['AlbumName'],
            'request_source_id': source_id,
            'request_track_id': song['SongID'],
        }
        tracks.append(track)

    # Register the songs in the search engine
    solr = SolrConnection(settings.SOLR_URL)
    solr.add_many(tracks)
    solr.commit()
    solr.close()

    cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
    return json.dumps(results, ensure_ascii=False, indent=4).encode('utf-8')
def create(self, **kwargs):
    # Collect the channel details
    name = kwargs["name"]
    pos = kwargs["pos"]

    # Create the channel in the search engine
    doc = {
        "id": "channel_%s" % (name,),
        "type": "channel",
        "channel_id": name,
        "channel_location": pos,
    }
    solr = SolrConnection(settings.SOLR_URL)
    solr.add_many([doc])
    solr.commit()
    solr.close()

    # Create the channel in the URL hierarchy
    self.__dict__[name] = ChannelResource.Channel(name)
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database

    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """
    solr = SolrConnection(settings.SOLR)
    titles = models.Title.objects.all()
    if since:
        titles = titles.filter(created__gte=since)

    titles = titles.prefetch_related(
        "languages",
        "alt_titles",
        "subjects",
        "notes",
        "places",
        "urls",
        "essays",
        "country",
        "holdings",
    )

    count = 0
    for chunk in sliced(titles, 500):
        docs = []

        for title in chunk:
            try:
                docs.append(title.solr_doc)
            except Exception:
                LOGGER.exception("Unable to index title %s", title)

        solr.add_many(docs)
        reset_queries()
        solr.commit()

        count += len(chunk)
        LOGGER.info("indexed %d titles", count)

    lccns = set(models.Title.objects.values_list("lccn", flat=True))

    for result in solr.query("+type:title", fields=["id", "lccn"]):
        stale_id = result["id"]
        lccn = result["lccn"]
        if lccn not in lccns:
            LOGGER.warning("Removing stale title %s from the search index", stale_id)
            delete_title(stale_id, solr=solr)

    solr.commit()
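# The sliced() helper used above is assumed to behave like
# more_itertools.sliced: yield successive fixed-size slices of a sliceable
# sequence (a Django queryset qualifies). A rough illustrative equivalent,
# not taken from the project's own code:
def sliced(seq, size):
    """Yield seq[0:size], seq[size:2*size], ... until the sequence is exhausted."""
    offset = 0
    while True:
        chunk = seq[offset:offset + size]
        if not chunk:
            break
        yield chunk
        offset += size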
def setUpClass(cls):
    # First, add a folio to Solr so that the image_uri can be retrieved during the MEI conversion
    # Using curl here because it turned out to be easier than solrconn.add and gives better error messages
    os.system("curl {0}/update/?commit=true -H 'Content-Type: text/xml' -d '<add><doc>\
        <field name=\"id\">testid</field>\
        <field name=\"type\">cantusdata_folio</field>\
        <field name=\"manuscript_id\">{1}</field>\
        <field name=\"number\">{2}</field>\
        <field name=\"image_uri\">{3}</field>\
        </doc></add>'".format(settings.SOLR_SERVER, MEI_FIXTURE_ID, MEI_FIXTURE_FOLIO, MEI_FIXTURE_URI))

    docs = list(MEIConverter.process_file(MEI_FIXTURE, MEI_FIXTURE_SIGLUM, MEI_FIXTURE_ID))

    # Sanity check
    solrconn = SolrConnection(settings.SOLR_SERVER)
    prequery = solrconn.query('type:cantusdata_music_notation AND manuscript:' + MEI_FIXTURE_SIGLUM)
    assert prequery.numFound == 0, 'MEI was already in the database when loading the test fixture'

    solrconn.add_many(docs)
    solrconn.commit()
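# For comparison, roughly what the curl call in the test above might look like
# through solrpy itself, with document fields passed as keyword arguments to
# add() and followed by an explicit commit. This is only an illustrative
# sketch; the test deliberately shells out to curl for its clearer error
# messages.
solrconn = SolrConnection(settings.SOLR_SERVER)
solrconn.add(
    id="testid",
    type="cantusdata_folio",
    manuscript_id=MEI_FIXTURE_ID,
    number=MEI_FIXTURE_FOLIO,
    image_uri=MEI_FIXTURE_URI,
)
solrconn.commit()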
def submit(self, **kwargs):
    # Grab the user and request details
    json_data = json.loads(kwargs['json'])
    user_id = json_data['userid']
    channel_id = self.channel_id
    requests = json_data['requests']

    # Create a local representation of the requests
    tracks = []
    for request in requests:
        source_id = request['sourceid']
        track_id = request['trackid']

        # Build up a Solr query
        filters = []
        filters.append('type:track')
        filters.append('request_source_id:%s' % source_id)
        filters.append('request_track_id:%s' % track_id)

        # Make the request to Solr
        solr = SolrConnection(settings.SOLR_URL)
        response = solr.select(q=' AND '.join(filters),
                               fields='track_artist, track_album, track_title')
        if len(response.results) == 1:
            track = {
                'id': 'request_%s_%s_%s' % (source_id, track_id, user_id),
                'type': 'request',
                'channel_id': channel_id,
                'track_artist': response.results[0]['track_artist'],
                'track_album': response.results[0]['track_album'],
                'track_title': response.results[0]['track_title'],
                'request_user_id': user_id,
                'request_source_id': source_id,
                'request_track_id': track_id,
            }
            tracks.append(track)

    # Create the request in the search engine
    solr = SolrConnection(settings.SOLR_URL)
    solr.add_many(tracks)
    solr.commit()
    solr.close()

    # Log the request to the database
    db = psycopg2.connect(database='airjukebox')
    cr = db.cursor()
    for track in tracks:
        cr.execute('insert into tbrequests (userid, locationid, sourceid, trackid) values (%(request_user_id)s, %(channel_id)s, %(request_source_id)s, %(request_track_id)s)', track)
    db.commit()

    cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
    return json.dumps(tracks, ensure_ascii=False, indent=4).encode('utf-8')
def index_pages(only_missing=False):
    """index all the pages that are modeled in the database
    """
    solr = SolrConnection(settings.SOLR)

    page_qs = models.Page.objects.order_by("pk")

    if only_missing:
        page_qs = page_qs.filter(indexed=False)
    else:
        # FIXME: we should not churn the index when documents have not been deleted:
        solr.delete_query("type:page")

    # To avoid MySQL limitations, we'll run two queries: the first will only
    # lookup the primary keys to allow MySQL to satisfy the ORDER BY / LIMIT
    # using only the index and then we'll use the primary keys to lookup the
    # full Page objects for each chunk which will actually be indexed.

    full_page_qs = page_qs.prefetch_related(
        Prefetch(
            "issue",
            queryset=models.Issue.objects.prefetch_related(
                "batch",
                "title",
                "title__languages",
                "title__alt_titles",
                "title__subjects",
                "title__notes",
                "title__places",
                "title__urls",
                "title__essays",
                "title__country",
                "title__holdings",
            ),
        )
    )

    count = 0
    for pk_chunk in sliced(page_qs.values_list("pk", flat=True), 100):
        # We have to force the PKs into a list to work around limitations in
        # MySQL preventing the use of a subquery which uses LIMIT:
        chunk = full_page_qs.filter(pk__in=list(pk_chunk))

        docs = []
        pks = []

        for page in chunk:
            try:
                docs.append(page.solr_doc)
                pks.append(page.pk)
            except Exception:
                LOGGER.warning("Unable to index page %s", page.url, exc_info=True)
                continue

        if docs:
            solr.add_many(docs)
            solr.commit()
            models.Page.objects.filter(pk__in=pks).update(indexed=True)

        count += len(pk_chunk)
        reset_queries()
        LOGGER.info("indexed %d pages", count)

    solr.commit()
# Excerpt: this block appears to run inside a loop that pages through the
# article table in 10000-row chunks (hence the OFFSET computed from j and the
# break when the cursor returns no rows); the trailing else branch belongs to
# an enclosing if (not shown) that instead loads articles from a previously
# written CSV file.
articles = []
c.execute("SELECT * FROM article LIMIT 10000 OFFSET %d" % j)
for row in c.fetchall():
    if row[0] in seen:
        continue
    seen.add(row[0])
    if row[13]:
        author = [a.split(' ') for a in row[13].decode('utf-8').split(', ')]
        author = [' '.join(itertools.chain(*[(p if p.isupper() else [p]) for p in a if p]))
                  for a in author]
        author = ', '.join(author)
        row = row[:13] + (author,) + row[14:]
        # print row[13]
    articles.append(Article(*(cell.decode('utf-8') if isinstance(cell, str) else cell
                              for cell in row)))
    i += 1
solr.add_many(a._asdict() for a in articles)
j += 10000
if j % 1e5 == 0:
    print "Loaded: ", i
if c.rowcount == 0:
    break
# writer = csv.writer(open('articles.csv', 'wb'))
# writer.writerows(articles)
else:
    reader = csv.reader(open('articles.csv', 'rb'))
    for i, row in enumerate(reader):
        articles.append(Article(*(cell.decode('utf-8') for cell in row)))
        if i % 1e5 == 0:
            print "Loaded: ", i
del seen
# Compose document data to store in Solr.
documents = []
for path, fname in txts:
    log.msg(fname, "->", path)
    url = site + path
    with codecs.open(fname, 'rb', encoding) as fp:
        title, content = parse_document(fp)
    doc = {
        'title': title,
        'content': content,
        #'last_modified': datetime.fromtimestamp(os.path.getmtime(fname)),
        'last_modified': datetime.now().replace(tzinfo=utc),
        'site': site,
        'url': url,
        'id': hashlib.sha1(url).hexdigest(),
    }
    documents.append(doc)

u = options['username']
p = options['password']
if u and p:
    s = SolrConnection(server, http_user=u, http_pass=p)
else:
    s = SolrConnection(server)

s.add_many(documents)
s.commit()


if __name__ == '__main__':
    main()
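# An optional follow-up sketch (not part of the script above): one way to
# spot-check the documents just indexed, assuming the same solrpy connection
# and the fields stored in the doc dicts ('site', 'url', 'title').
s = SolrConnection(server)
check = s.query('site:"%s"' % site, fields='id,url,title', rows=10)
print 'found', check.numFound, 'documents for site', site
for hit in check.results:
    print hit['url'], '-', hit.get('title')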