Example #1
    def solr_reindex(self):
        """
        Reindex all entries.  Used when switching to/from "private" status.
        """
        solr_conn = SolrConnection(settings.SOLR_URL)
        # Start by deleting 'em all
        solr_conn.delete_query('user:%s' % self.user.id)
        entries = Entry.objects.filter(user=self.user)
        docs = []
        # Arbitrary assignment of a constant, here.
        SLICE_SIZE = 50
        slices = [x for x in range(entries.count())
                  if x % SLICE_SIZE == 0]
        for s in slices:
            entry_slice = entries[s:s+SLICE_SIZE]
            for entry in entry_slice:
                docs.append(entry.solr_doc)
                if len(docs) == SLICE_SIZE:
                    try:
                        solr_conn.add_many(docs)
                    except:
                        # should log appropriately, huh
                        pass
                    del(docs)
                    docs = []
        # Don't miss the leftovers
        solr_conn.add_many(docs)
        solr_conn.commit()
        solr_conn.optimize()
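A minimal sketch of the same batching pattern as a reusable helper, assuming the solrpy SolrConnection used above: accumulate documents, flush each full batch with add_many(), and flush the remainder at the end.

def add_in_batches(solr_conn, doc_iter, batch_size=50):
    # Collect documents and send them to Solr in fixed-size batches.
    batch = []
    for doc in doc_iter:
        batch.append(doc)
        if len(batch) == batch_size:
            solr_conn.add_many(batch)
            batch = []
    if batch:
        # Don't miss the leftovers.
        solr_conn.add_many(batch)
    solr_conn.commit()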
Example #2
class Command(BaseCommand):
    user_option = optparse.make_option(
        '--user',
        action='store',
        dest='user',
        help='name of user whose entries to purge')
    option_list = BaseCommand.option_list + (user_option, )
    help = "index all or user-specific entries in solr"
    args = 'an optional username'

    def handle(self, *args, **options):
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print "indexing user"
            self.index_entries(user=options['user'])
        else:
            print 'indexing everything'
            self.index_entries()
        print 'committing'
        self.solr.commit()
        print 'optimizing'
        self.solr.optimize()

    def index_entries(self, user=''):
        counter = 0
        entries = m.Entry.objects.all()
        if user:
            entries = entries.filter(user__username=user)
        docs = []
        print 'entry count:', entries.count()
        SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY
        slices = [x for x in range(entries.count()) \
            if x % SLICE_SIZE == 0]
        for s in slices:
            print 'indexing %s to %s...' % (s, s + SLICE_SIZE)
            entry_slice = entries[s:s + SLICE_SIZE]
            for entry in entry_slice:
                counter += 1
                docs.append(entry.solr_doc)
                if len(docs) == MAX_DOCS_PER_ADD:
                    try:
                        self.solr.add_many(docs)
                    except:
                        print 'BAD RECORD:', [d['id'] for d in docs]
                    del (docs)
                    docs = []
                    reset_queries()
                    if counter % (COMMIT_FREQUENCY * MAX_DOCS_PER_ADD) == 0:
                        print 'committing at count:', counter
                        self.solr.commit()
        # Don't miss the leftovers
        self.solr.add_many(docs)
Example #4
	def search(self, **kwargs):

		query = kwargs['q']
		api_key = "aac5b38a36513510000ef3286494fc6d"

		url = urllib2.urlopen("http://tinysong.com/s/%s/?format=json&key=%s" % (urllib2.quote(query), api_key))
		response = json.loads(url.read())

		# TODO: Remove redundancy between results and tracks?
		results = []
		tracks = []
		for song in response:

			source_id = 'grooveshark'

			result = {
				'artist': song['ArtistName'],
				'album': song['AlbumName'],
				'title': song['SongName'],
				'sources': [
					{
						'sourceid': source_id,
						'trackid': '%s' % song['SongID']
					}
				]
			}
			results.append(result)

			track = {
				'id': 'track_%s_%s' % (source_id, song['SongID']),
				'type': 'track',

				'track_title': song['SongName'],
				'track_artist': song['ArtistName'],
				'track_album': song['AlbumName'],

				'request_source_id': source_id,
				'request_track_id': song['SongID'],
			}
			tracks.append(track)

		# Register the songs in the search engine
		solr = SolrConnection(settings.SOLR_URL)
		solr.add_many(tracks)
		solr.commit()
		solr.close()

		cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
		return json.dumps(results, ensure_ascii=False, indent=4).encode('utf-8')
Example #5
    def create(self, **kwargs):

        # Collect the channel details
        name = kwargs["name"]
        pos = kwargs["pos"]

        # Create the channel in the search engine
        doc = {"id": "channel_%s" % (name,), "type": "channel", "channel_id": name, "channel_location": pos}

        solr = SolrConnection(settings.SOLR_URL)
        solr.add_many([doc])
        solr.commit()
        solr.close()

        # Create the channel in the URL hierarchy
        self.__dict__[name] = ChannelResource.Channel(name)
Example #6
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database
    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """

    solr = SolrConnection(settings.SOLR)

    titles = models.Title.objects.all()
    if since:
        titles = titles.filter(created__gte=since)

    titles = titles.prefetch_related("languages", "alt_titles", "subjects",
                                     "notes", "places", "urls", "essays",
                                     "country", "holdings")

    count = 0

    for chunk in sliced(titles, 500):
        docs = []

        for title in chunk:
            try:
                docs.append(title.solr_doc)
            except Exception:
                LOGGER.exception("Unable to index title %s", title)

        solr.add_many(docs)

        reset_queries()
        solr.commit()

        count += len(chunk)
        LOGGER.info("indexed %d titles", count)

    lccns = set(models.Title.objects.values_list("lccn", flat=True))

    for result in solr.query("+type:title", fields=["id", "lccn"]):
        stale_id = result["id"]
        lccn = result["lccn"]
        if lccn not in lccns:
            LOGGER.warning("Removing stale title %s from the search index",
                           stale_id)
            delete_title(stale_id, solr=solr)

    solr.commit()
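The sliced() helper used above is assumed to come from a utility library (more_itertools ships one with this name); a minimal, hypothetical stand-in that yields consecutive fixed-size slices of a queryset could look like this.

def sliced(seq, n):
    # Yield consecutive slices of length n from anything sliceable,
    # including a Django queryset, until a slice comes back empty.
    start = 0
    while True:
        chunk = seq[start:start + n]
        if not chunk:
            break
        yield chunk
        start += n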
Example #7
    def setUpClass(cls):
        # First, add a folio to Solr so that the image_uri can be retrieved during the MEI conversion
        # Using curl here because it turned out to be easier than solrconn.add and gives better error messages
        os.system("curl {0}/update/?commit=true -H 'Content-Type: text/xml' -d '<add><doc>\
        <field name=\"id\">testid</field>\
        <field name=\"type\">cantusdata_folio</field>\
        <field name=\"manuscript_id\">{1}</field>\
        <field name=\"number\">{2}</field>\
        <field name=\"image_uri\">{3}</field>\
        </doc></add>'".format(settings.SOLR_SERVER, MEI_FIXTURE_ID, MEI_FIXTURE_FOLIO, MEI_FIXTURE_URI))

        docs = list(MEIConverter.process_file(MEI_FIXTURE, MEI_FIXTURE_SIGLUM, MEI_FIXTURE_ID))

        # Sanity check
        solrconn = SolrConnection(settings.SOLR_SERVER)
        prequery = solrconn.query('type:cantusdata_music_notation AND manuscript:' + MEI_FIXTURE_SIGLUM)
        assert prequery.numFound == 0, 'MEI was already in the database when loading the test fixture'

        solrconn.add_many(docs)
        solrconn.commit()
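For comparison, a sketch of the same fixture insert done through the solrpy client instead of shelling out to curl, assuming the field names and settings shown above.

# Hypothetical alternative to the curl call in setUpClass, reusing the same
# client that the rest of the test already creates.
solrconn = SolrConnection(settings.SOLR_SERVER)
solrconn.add_many([{
    'id': 'testid',
    'type': 'cantusdata_folio',
    'manuscript_id': MEI_FIXTURE_ID,
    'number': MEI_FIXTURE_FOLIO,
    'image_uri': MEI_FIXTURE_URI,
}])
solrconn.commit()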
Example #9
	def submit(self, **kwargs):

		# Grab the user and request details
		json_data = json.loads(kwargs['json'])

		user_id = json_data['userid']
		channel_id = self.channel_id
		requests = json_data['requests']

		# Create a local representation of the requests
		tracks = []
		for request in requests:

			source_id = request['sourceid']
			track_id = request['trackid']

			# Build up a Solr query
			filters = []
			filters.append('type:track')
			filters.append('request_source_id:%s' % source_id)
			filters.append('request_track_id:%s' % track_id)

			# Make the request to Solr
			solr = SolrConnection(settings.SOLR_URL)
			response = solr.select(q = ' AND '.join(filters), fields = 'track_artist, track_album, track_title')

			if len(response.results) == 1:

				track = {
					'id': 'request_%s_%s_%s' % (source_id, track_id, user_id),
					'type': 'request',

					'channel_id': channel_id,

					'track_artist': response.results[0]['track_artist'],
					'track_album': response.results[0]['track_album'],
					'track_title': response.results[0]['track_title'],

					'request_user_id': user_id,
					'request_source_id': source_id,
					'request_track_id': track_id
				}
				tracks.append(track)

		# Create the request in the search engine
		solr = SolrConnection(settings.SOLR_URL)
		solr.add_many(tracks)
		solr.commit()
		solr.close()

		# Log the request to the database
		db = psycopg2.connect(database='airjukebox')
		cr = db.cursor()

		for track in tracks:

			cr.execute('insert into tbrequests (userid, locationid, sourceid, trackid) values (%(request_user_id)s, %(channel_id)s, %(request_source_id)s, %(request_track_id)s)', track)

		db.commit()

		cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
		return json.dumps(tracks, ensure_ascii=False, indent=4).encode('utf-8')
Example #10
def index_pages(only_missing=False):
    """index all the pages that are modeled in the database
    """
    solr = SolrConnection(settings.SOLR)

    page_qs = models.Page.objects.order_by("pk")

    if only_missing:
        page_qs = page_qs.filter(indexed=False)
    else:
        # FIXME: we should not churn the index when documents have not been deleted:
        solr.delete_query("type:page")

    # To avoid MySQL limitations, we'll run two queries: the first will only
    # lookup the primary keys to allow MySQL to satisfy the ORDER BY / LIMIT
    # using only the index and then we'll use the primary keys to lookup the
    # full Page objects for each chunk which will actually be indexed.

    full_page_qs = page_qs.prefetch_related(
        Prefetch(
            "issue",
            queryset=models.Issue.objects.prefetch_related(
                "batch",
                "title",
                "title__languages",
                "title__alt_titles",
                "title__subjects",
                "title__notes",
                "title__places",
                "title__urls",
                "title__essays",
                "title__country",
                "title__holdings",
            ),
        ))

    count = 0
    for pk_chunk in sliced(page_qs.values_list("pk", flat=True), 100):
        # We have to force the PKs into a list to work around limitations in
        # MySQL preventing the use of a subquery which uses LIMIT:
        chunk = full_page_qs.filter(pk__in=list(pk_chunk))

        docs = []
        pks = []

        for page in chunk:
            try:
                docs.append(page.solr_doc)
                pks.append(page.pk)
            except Exception:
                LOGGER.warning("Unable to index page %s",
                               page.url,
                               exc_info=True)
                continue

        if docs:
            solr.add_many(docs)
            solr.commit()
            models.Page.objects.filter(pk__in=pks).update(indexed=True)

        count += len(pk_chunk)
        reset_queries()
        LOGGER.info("indexed %d pages", count)

    solr.commit()
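A hypothetical sketch (not from the original project) of wiring index_pages() into a management command in the same optparse style as the Command classes above, so the only_missing behaviour can be toggled from the command line.

class Command(BaseCommand):
    help = "index all pages in solr"
    option_list = BaseCommand.option_list + (
        optparse.make_option(
            '--only-missing',
            action='store_true',
            dest='only_missing',
            default=False,
            help='only index pages that are not yet marked as indexed'),
    )

    def handle(self, *args, **options):
        # Delegate to the indexing function defined above.
        index_pages(only_missing=options['only_missing'])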
Example #11
            articles = []
            c.execute("SELECT * FROM article LIMIT 10000 OFFSET %d" % j)
            for row in c.fetchall():
                if row[0] in seen:
                    continue
                seen.add(row[0])
                if row[13]:
                    author = [a.split(' ') for a in row[13].decode('utf-8').split(', ')]
                    author = [' '.join(itertools.chain(*[(p if p.isupper() else [p]) for p in a if p])) for a in author]
                    author = ', '.join(author)
                    row = row[:13] + (author,) + row[14:]
#                print row[13]

                articles.append(Article(*(cell.decode('utf-8') if isinstance(cell, str) else cell for cell in row)))
                i += 1
            solr.add_many(a._asdict() for a in articles)
            j += 10000
            if j % 1e5 == 0:
                print "Loaded: ", i
            if c.rowcount == 0:
                break
    #    writer = csv.writer(open('articles.csv', 'wb'))
    #    writer.writerows(articles)
    else:
        reader = csv.reader(open('articles.csv', 'rb'))
        for i, row in enumerate(reader):
            articles.append(Article(*(cell.decode('utf-8') for cell in row)))
            if i % 1e5 == 0:
                print "Loaded: ", i

    del seen
Example #12
    # Compose document data to store in Solr.
    documents = []
    for path, fname in txts:
        log.msg(fname, "->", path)
        url = site + path
        with codecs.open(fname, 'rb', encoding) as fp:
            title, content = parse_document(fp)
        doc = {
            'title': title,
            'content': content,
            #'last_modified': datetime.fromtimestamp(os.path.getmtime(fname)),
            'last_modified': datetime.now().replace(tzinfo=utc),
            'site': site,
            'url': url,
            'id': hashlib.sha1(url).hexdigest()
        }
        documents.append(doc)
    u = options['username']
    p = options['password']
    if u and p:
        s = SolrConnection(server, http_user=u, http_pass=p)
    else:
        s = SolrConnection(server)
    s.add_many(documents)
    s.commit()

if __name__ == '__main__':
    main()