Code example #1
 def solr_delete(self):
     """
     Remove from solr index
     """
     solr_conn = SolrConnection(settings.SOLR_URL, persistent=False)
     solr_conn.delete_query('id:%s' % self.id)
     solr_conn.commit()
Code example #2
 def handle(self, **options):
     solr = SolrConnection(SOLR_URL)
     if options['user']:
         solr.delete_query('user:%s' % options['user'])
     else:
         solr.delete_query('id:[* TO *]')
     solr.commit()
Code example #3
File: zap_index.py Project: CDRH/nebnews
 def handle(self, **options):
     solr = SolrConnection(settings.SOLR)
     if options['batch']:
         solr.delete_query('batch: %s' % options['batch'])
     else:
         solr.delete_query('id:[* TO *]')
     solr.commit()
Code example #4
 def test_index_pages(self):
     solr = SolrConnection(settings.SOLR)
     solr.delete_query('type:page')
     solr.commit()
     self.assertEqual(si.page_count(), 0)
     si.index_pages()
     self.assertEqual(si.page_count(), 2)
Code example #5
File: zap_index.py Project: dchud/unalog2
 def handle(self, **options):
     solr = SolrConnection(SOLR_URL)
     if options["user"]:
         solr.delete_query("user:%s" % options["user"])
     else:
         solr.delete_query("id:[* TO *]")
     solr.commit()
Code example #6
 def solr_reindex (self):
     """
     Reindex all entries.  Used when switching to/from "private" status.
     """
     solr_conn = SolrConnection(settings.SOLR_URL)
     # Start by deleting 'em all
     solr_conn.delete_query('user:%s' % self.user.id)
     entries = Entry.objects.filter(user=self.user)
     docs = []
     # Arbitrary assignment of a constant, here.
     SLICE_SIZE = 50
     slices = [x for x in range(entries.count()) \
         if x % SLICE_SIZE == 0]
     for s in slices:
         entry_slice = entries[s:s+SLICE_SIZE]
         for entry in entry_slice:
             docs.append(entry.solr_doc)
             if len(docs) == SLICE_SIZE:
                 try:
                     solr_conn.add_many(docs)
                 except:
                     # should log appropriately, huh
                     pass
                 del(docs)
                 docs = []
     # Don't miss the leftovers
     solr_conn.add_many(docs)
     solr_conn.commit()
     solr_conn.optimize()
Code example #7
 def solr_index(self):
     """
     Write out to solr
     """
     solr_conn = SolrConnection(settings.SOLR_URL, persistent=False)
     solr_conn.add(**self.solr_doc)
     solr_conn.commit()
Code example #8
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database
    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """
    cursor = connection.cursor()
    solr = SolrConnection(settings.SOLR)
    if since:
        cursor.execute("SELECT lccn FROM core_title WHERE created >= '%s'" %
                       since)
    else:
        solr.delete_query('type:title')
        cursor.execute("SELECT lccn FROM core_title")

    count = 0
    while True:
        row = cursor.fetchone()
        if row == None:
            break
        title = models.Title.objects.get(lccn=row[0])
        index_title(title, solr)
        count += 1
        if count % 100 == 0:
            _log.info("indexed %s titles" % count)
            reset_queries()
            solr.commit()
    solr.commit()
Code example #9
File: index.py Project: LibraryOfCongress/chronam
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database
    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """
    cursor = connection.cursor()
    solr = SolrConnection(settings.SOLR)
    if since:
        cursor.execute("SELECT lccn FROM core_title WHERE created >= '%s'" % since)
    else:
        solr.delete_query('type:title')
        cursor.execute("SELECT lccn FROM core_title")

    count = 0
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        title = models.Title.objects.get(lccn=row[0])
        index_title(title, solr)
        count += 1
        if count % 100 == 0:
            LOGGER.info("indexed %s titles", count)
            reset_queries()
            solr.commit()
    solr.commit()
Code example #10
File: index.py Project: dchud/unalog2
class Command(BaseCommand):
    user_option = optparse.make_option(
        '--user',
        action='store',
        dest='user',
        help='name of user whose entries to purge')
    option_list = BaseCommand.option_list + (user_option, )
    help = "index all or user-specific entries in solr"
    args = 'an optional username'

    def handle(self, *args, **options):
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print "indexing user"
            self.index_entries(user=options['user'])
        else:
            print 'indexing everything'
            self.index_entries()
        print 'committing'
        self.solr.commit()
        print 'optimizing'
        self.solr.optimize()

    def index_entries(self, user=''):
        counter = 0
        entries = m.Entry.objects.all()
        if user:
            entries = entries.filter(user__username=user)
        docs = []
        print 'entry count:', entries.count()
        SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY
        slices = [x for x in range(entries.count()) \
            if x % SLICE_SIZE == 0]
        for s in slices:
            print 'indexing %s to %s...' % (s, s + SLICE_SIZE)
            entry_slice = entries[s:s + SLICE_SIZE]
            for entry in entry_slice:
                counter += 1
                docs.append(entry.solr_doc)
                if len(docs) == MAX_DOCS_PER_ADD:
                    try:
                        self.solr.add_many(docs)
                    except:
                        print 'BAD RECORD:', [d['id'] for d in docs]
                    del (docs)
                    docs = []
                    reset_queries()
                    if counter % (COMMIT_FREQUENCY * MAX_DOCS_PER_ADD) == 0:
                        print 'committing at count:', counter
                        self.solr.commit()
        # Don't miss the leftovers
        self.solr.add_many(docs)
Code example #11
File: findmyfile.py Project: ggaurav/find_my_file
def _refresh(field=None, data=None, path = None, isCron = None):
	from solr import SolrConnection
	from ID3 import *
	s = SolrConnection(SOLR_URL)
	if path and path != '*':
		#called by user		
		pathsArr = path.split(',')		
	else:
		#called from cron		
		pathsArr = folderpaths
	matches = []
	#handles modify, add
	#deletion will be handled in search when file in solr but not in path
	time.time()
	for path in pathsArr:
		for root, dirnames, filenames in os.walk(path):
			for extension in ['txt', 'log', 'py', 'pl', 'sql', 'mp3']:
				for filename in fnmatch.filter(filenames, '*.' + extension):				
					fullName = os.path.join(root, filename)
					if os.path.getsize(fullName) > 8800000:
						continue
					#print fullName
					if not isCron or (time.time() - os.path.getmtime(fullName) < 24*60*60):				
						try:
							#data = open(fullName, 'r').read().decode('raw_unicode_escape').replace('\n',' ').replace('\t',' ')
							if filename.endswith(('.txt', '.log', '.py', '.pl', '.sql')):								
								data = open(fullName, 'r').read()
								data = filterTxt(data)
							else:								
								audiofile = ID3(fullName)
								audiofilekeys = audiofile.keys()
								if 'TITLE' in audiofilekeys:
									data = audiofile['TITLE'] + " "
								if 'ARTIST' in audiofilekeys:
									data += audiofile['ARTIST'] + " "
								if 'ALBUM' in audiofilekeys:
									data += audiofile['ALBUM'] + " "
								if not data:
									data = ''
								data = data.strip()
							fullName = filterTxt(fullName)
							filename = filterTxt(filename)						
							s.add(id = fullName, name = filename, txt = data)
							s.commit()
						except:																	
							pass
							#print data
							#print traceback.format_exc()
							#print fullName	
							#sys.exit()					
						gc.collect()
Code example #12
	def search(self, **kwargs):

		query = kwargs['q']
		api_key = "aac5b38a36513510000ef3286494fc6d"

		url = urllib2.urlopen("http://tinysong.com/s/%s/?format=json&key=%s" % (urllib2.quote(query), api_key))
		response = json.loads(url.read())

		# TODO: Remove redundancy between results and tracks?
		results = []
		tracks = []
		for song in response:

			source_id = 'grooveshark'

			result = {
				'artist': song['ArtistName'],
				'album': song['AlbumName'],
				'title': song['SongName'],
				'sources': [
					{
						'sourceid': source_id,
						'trackid': '%s' % song['SongID']
					}
				]
			}
			results.append(result)

			track = {
				'id': 'track_%s_%s' % (source_id, song['SongID']),
				'type': 'track',

				'track_title': song['SongName'],
				'track_artist': song['ArtistName'],
				'track_album': song['AlbumName'],

				'request_source_id': source_id,
				'request_track_id': song['SongID'],
			}
			tracks.append(track)

		# Register the songs in the search engine
		solr = SolrConnection(settings.SOLR_URL)
		solr.add_many(tracks)
		solr.commit()
		solr.close()

		cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
		return json.dumps(results, ensure_ascii=False, indent=4).encode('utf-8')
Code example #13
File: index.py Project: LibraryOfCongress/chronam
def index_missing_pages():
    """
    index all pages that are missing from solr in the database
    """
    solr = SolrConnection(settings.SOLR)
    count = 0
    pages = models.Page.objects.filter(indexed=False).all()
    number_of_pages = len(pages)
    for page in pages:
        LOGGER.info("[%s of %s] indexing page: %s", count, number_of_pages, page.url)
        solr.add(**page.solr_doc)
        count += 1
        page.indexed = True
        page.save()
    solr.commit()
Code example #14
	def finished(self, **kwargs):

		source_id = kwargs['source_id']
		track_id = kwargs['track_id']

		# Build up a Solr query
		filters = []
		filters.append('type:request')
		filters.append('channel_id:%s' % self.channel_id)
		filters.append('request_source_id:%s' % source_id)
		filters.append('request_track_id:%s' % track_id)

		# Make the request to Solr
		solr = SolrConnection(settings.SOLR_URL)
		solr.delete_query(' AND '.join(filters))
		solr.commit()
Code example #15
File: index.py Project: bireme/medlist
def index_evidence(evidence):
    evidence_medicine_list = []

    evidence_medicine = MedicineEvidenceSummary.objects.filter(evidence=evidence.id)
    for evimed in evidence_medicine:
        if evimed.medicine.name not in evidence_medicine_list:
            evidence_medicine_list.append(evimed.medicine.name)

    # try to create a connection to a solr server and send medicine
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id = "evidence-%s-%s" % (evidence.language, evidence.id), 
            type = "evidence",
            title = evidence.title,            
            description = evidence.description,
            context = evidence.context,
            question = evidence.question,
            link = evidence.link,
            file = evidence.file,
            language = evidence.language,
            evidence_medicine = evidence_medicine_list,
        )
        response = solr.commit()
    except Exception as ex: 
        return False

    return True
Code example #16
    def create(self, **kwargs):

        # Collect the channel details
        name = kwargs["name"]
        pos = kwargs["pos"]

        # Create the channel in the search engine
        doc = {"id": "channel_%s" % (name,), "type": "channel", "channel_id": name, "channel_location": pos}

        solr = SolrConnection(settings.SOLR_URL)
        solr.add_many([doc])
        solr.commit()
        solr.close()

        # Create the channel in the URL hierarchy
        self.__dict__[name] = ChannelResource.Channel(name)
Code example #17
File: index.py Project: sshyran/chronam
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database
    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """

    solr = SolrConnection(settings.SOLR)

    titles = models.Title.objects.all()
    if since:
        titles = titles.filter(created__gte=since)

    titles = titles.prefetch_related("languages", "alt_titles", "subjects",
                                     "notes", "places", "urls", "essays",
                                     "country", "holdings")

    count = 0

    for chunk in sliced(titles, 500):
        docs = []

        for title in chunk:
            try:
                docs.append(title.solr_doc)
            except Exception:
                LOGGER.exception("Unable to index title %s", title)

        solr.add_many(docs)

        reset_queries()
        solr.commit()

        count += len(chunk)
        LOGGER.info("indexed %d titles", count)

    lccns = set(models.Title.objects.values_list("lccn", flat=True))

    for result in solr.query("+type:title", fields=["id", "lccn"]):
        stale_id = result["id"]
        lccn = result["lccn"]
        if lccn not in lccns:
            LOGGER.warning("Removing stale title %s from the search index",
                           stale_id)
            delete_title(stale_id, solr=solr)

    solr.commit()
Code example #18
File: index.py Project: LibraryOfCongress/chronam
def index_pages():
    """index all the pages that are modeled in the database
    """
    solr = SolrConnection(settings.SOLR)
    solr.delete_query('type:page')
    cursor = connection.cursor()
    cursor.execute("SELECT id FROM core_page")
    count = 0
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        page = models.Page.objects.get(id=row[0])
        LOGGER.info("[%s] indexing page: %s", count, page.url)
        solr.add(**page.solr_doc)
        count += 1
        if count % 100 == 0:
            reset_queries()
    solr.commit()
Code example #19
File: test_search_notation.py Project: DDMAL/cantus
    def setUpClass(cls):
        # First, add a folio to Solr so that the image_uri can be retrieved during the MEI conversion
        # Using curl here because it turned out to be easier than solrconn.add and gives better error messages
        os.system("curl {0}/update/?commit=true -H 'Content-Type: text/xml' -d '<add><doc>\
        <field name=\"id\">testid</field>\
        <field name=\"type\">cantusdata_folio</field>\
        <field name=\"manuscript_id\">{1}</field>\
        <field name=\"number\">{2}</field>\
        <field name=\"image_uri\">{3}</field>\
        </doc></add>'".format(settings.SOLR_SERVER, MEI_FIXTURE_ID, MEI_FIXTURE_FOLIO, MEI_FIXTURE_URI))

        docs = list(MEIConverter.process_file(MEI_FIXTURE, MEI_FIXTURE_SIGLUM, MEI_FIXTURE_ID))

        # Sanity check
        solrconn = SolrConnection(settings.SOLR_SERVER)
        prequery = solrconn.query('type:cantusdata_music_notation AND manuscript:' + MEI_FIXTURE_SIGLUM)
        assert prequery.numFound == 0, 'MEI was already in the database when loading the test fixture'

        solrconn.add_many(docs)
        solrconn.commit()
Code example #20
def index_pages():
    """index all the pages that are modeled in the database
    """
    _log = logging.getLogger(__name__)
    solr = SolrConnection(settings.SOLR)
    cursor = connection.cursor()
    cursor.execute(
        "SELECT id FROM core_page WHERE ocr_filename IS NOT NULL AND ocr_filename <> ''"
    )
    count = 0
    while True:
        row = cursor.fetchone()
        if row == None:
            break
        page = models.Page.objects.get(id=row[0])
        _log.info("[%s] indexing page: %s" % (count, page.url))
        solr.add(**page.solr_doc)
        count += 1
        if count % 100 == 0:
            reset_queries()
    solr.commit()
Code example #21
File: updater.py Project: mcg/couchdb-solr2
    def _send_update(self, *args, **kwargs):
        """Send an update request to Solr.

        Solr commits are made only on deletion.

        Takes a single argument: the AMQP message that was received.
        """
        try:
            log.info("Processing update request")
            msg = args[0]
            updates = json.loads(msg.body)
            solr = SolrConnection(self.solr_uri)
            if updates["type"] == "updated":
                add = ET.Element("add")
                for update in updates["data"]:
                    doc = ET.SubElement(add, "doc")
                    for fields in update:
                        # There should only be one pair
                        # FIXME: move to a dictionary structure
                        for k, v in fields.items():
                            SolrUpdater.xml_field(doc, solr.escapeKey(k), solr.escapeVal(v))
                log.debug("Sending update to Solr: " + ET.tostring(add))
                solr.doUpdateXML(ET.tostring(add))
            elif updates["type"] == "deleted":
                for id in updates["data"]:
                    log.debug("Deleting document with id '%s'" % id)
                    solr.delete(id)
                solr.commit()
            elif updates["type"] == "deleted_db":
                db_name = updates["data"]
                log.info("Deleting indexes for database '%s'" % db_name)
                solr.deleteByQuery("_db:%s" % db_name)
                solr.commit()
            else:
                log.warning("Unrecognized update type: '%s'" % updates["type"])
        except Exception:
            log.exception("Unexpected exception")
Code example #22
File: test_search_notation.py Project: DDMAL/cantus
 def tearDownClass(cls):
     solrconn = SolrConnection(settings.SOLR_SERVER)
     solrconn.delete_query('type:cantusdata_music_notation AND manuscript:' + MEI_FIXTURE_SIGLUM)
     solrconn.delete_query('type:cantusdata_folio AND manuscript_id:{0}'.format(MEI_FIXTURE_ID))
     solrconn.commit()
Code example #23
    # Compose document data to store in Solr.
    documents = []
    for path, fname in txts:
        log.msg(fname, "->", path)
        url = site + path
        with codecs.open(fname, 'rb', encoding) as fp:
            title, content = parse_document(fp)
        doc = {
            'title': title,
            'content': content,
            #'last_modified': datetime.fromtimestamp(os.path.getmtime(fname)),
            'last_modified': datetime.now().replace(tzinfo=utc),
            'site': site,
            'url': url,
            'id': hashlib.sha1(url).hexdigest()
        }
        documents.append(doc)
    u = options['username']
    p = options['password']
    if u and p:
        s = SolrConnection(server, http_user=u, http_pass=p)
    else:
        s = SolrConnection(server)
    s.add_many(documents)
    s.commit()

if __name__ == '__main__':
    main()

Code example #24
File: batch_loader.py Project: sshyran/chronam
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """
    def __init__(self, process_ocr=True, process_coordinates=True):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        self.PROCESS_OCR = process_ocr
        self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates

    def _find_batch_file(self, batch):
        """
        TODO: Who can we toss the requirement at to make this
        available in a canonical location?
        """
        # look for batch_1.xml, BATCH_1.xml, etc
        for alias in [
                "batch_1.xml",
                "BATCH_1.xml",
                "batchfile_1.xml",
                "batch_2.xml",
                "BATCH_2.xml",
                "batch.xml",
        ]:
            # TODO: might we want 'batch.xml' first? Leaving last for now to
            # minimize impact.
            url = urlparse.urljoin(batch.storage_url, alias)
            try:
                urllib2.urlopen(url)
                validated_batch_file = alias
                break
            except (urllib2.HTTPError, urllib2.URLError):
                continue
        else:
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?"
                % batch.path)
        return validated_batch_file

    def _sanity_check_batch(self, batch):
        # if not os.path.exists(batch.path):
        #    raise BatchLoaderException("batch does not exist at %s" % batch.path)
        # b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path, strict=True):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')

        """

        LOGGER.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if batch_path != link_name and not os.path.islink(link_name):
                LOGGER.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"

        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            try:
                batch = Batch.objects.get(name=batch_name)
                LOGGER.info("Batch already loaded: %s", batch_name)
                return batch
            except Batch.DoesNotExist as e:
                pass

        LOGGER.info("loading batch: %s", batch_name)

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None

        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath("ndnp:reel", namespaces=ns):

                reel_number = e.attrib["reelNumber"].strip()

                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist as e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath("ndnp:issue", namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)

                try:
                    issue, pages = self._load_issue(mets_url)
                except ValueError as e:
                    LOGGER.exception("Unable to load issue from %s", mets_url)
                    continue

                # commit new changes to the solr index, if we are indexing
                if self.PROCESS_OCR:
                    LOGGER.info("Adding pages to solr index from issue %s",
                                issue.title)
                    for page in pages:
                        LOGGER.debug("indexing ocr for: %s", page.url)
                        self.solr.add(**page.solr_doc)
                        page.indexed = True
                        page.save()

            if self.PROCESS_OCR:
                LOGGER.info("Committing solr index")
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            LOGGER.info(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            try:
                self.purge_batch(batch_name)
            except Exception:
                LOGGER.exception(
                    "Unable to purge batch %s after loading failed",
                    batch_name)
            raise BatchLoaderException(msg)

        if settings.IS_PRODUCTION:
            batch.released = datetime.now()
            batch.save()

        cache.delete("newspaper_info")

        return batch

    def _get_batch(self, batch_name, batch_source=None, create=False):
        if create:
            batch = self._create_batch(batch_name, batch_source)
        else:
            batch = Batch.objects.get(name=batch_name)
        return batch

    def _create_batch(self, batch_name, batch_source):
        if Batch.objects.filter(name=batch_name).count() != 0:
            raise BatchLoaderException("batch %s already loaded" % batch_name)
        batch = Batch()
        batch.name = batch_name
        batch.source = batch_source
        try:
            parts = batch_name.split("_", 3)
            if len(parts) == 4:
                parts = parts[1:]
            awardee_org_code, name_part, version = parts
            batch.awardee = Awardee.objects.get(org_code=awardee_org_code)
        except Awardee.DoesNotExist:
            msg = "no awardee for org code: %s" % awardee_org_code
            LOGGER.error(msg)
            raise BatchLoaderException(msg)
        batch.save()
        return batch

    def _load_issue(self, mets_file):
        LOGGER.debug("parsing issue mets file: %s", mets_file)
        doc = etree.parse(mets_file)

        # get the mods for the issue
        div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0]
        dmdid = div.attrib["DMDID"]
        mods = dmd_mods(doc, dmdid)

        # set up a new Issue
        issue = Issue()
        issue.volume = mods.xpath(
            'string(.//mods:detail[@type="volume"]/mods:number[1])',
            namespaces=ns).strip()
        issue.number = mods.xpath(
            'string(.//mods:detail[@type="issue"]/mods:number[1])',
            namespaces=ns).strip()
        issue.edition = int(
            mods.xpath(
                'string(.//mods:detail[@type="edition"]/mods:number[1])',
                namespaces=ns))
        issue.edition_label = mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:caption[1])',
            namespaces=ns).strip()

        # parse issue date
        date_issued = mods.xpath("string(.//mods:dateIssued)", namespaces=ns)
        issue.date_issued = datetime.strptime(date_issued, "%Y-%m-%d")

        # attach the Issue to the appropriate Title
        lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])',
                          namespaces=ns).strip()
        try:
            title = Title.objects.get(lccn=lccn)
        except title.DoesNotExist:
            url = "https://chroniclingamerica.loc.gov/lccn/%s/marc.xml" % lccn
            LOGGER.info("attempting to load MARC record from %s", url)
            management.call_command("load_titles", url)
            title = Title.objects.get(lccn=lccn)
        issue.title = title

        issue.batch = self.current_batch
        issue.save()
        LOGGER.debug("saved issue: %s", issue.url)

        notes = []
        for mods_note in mods.xpath(".//mods:note", namespaces=ns):
            type = mods_note.xpath("string(./@type)")
            label = mods_note.xpath("string(./@displayLabel)")
            text = mods_note.xpath("string(.)")
            note = models.IssueNote(type=type, label=label, text=text)
            notes.append(note)
        issue.notes = notes
        issue.save()

        # attach pages: lots of logging because it's expensive
        pages = []
        for page_div in div.xpath('.//mets:div[@TYPE="np:page"]',
                                  namespaces=ns):
            try:
                pages.append(self._load_page(doc, page_div, issue))
            except BatchLoaderException:
                LOGGER.exception(
                    "Failed to load page. doc: %s, page div: %s, issue: %s",
                    doc, page_div, issue)

        return issue, pages

    def _load_page(self, doc, div, issue):
        dmdid = div.attrib["DMDID"]
        mods = dmd_mods(doc, dmdid)
        page = Page()

        seq_string = mods.xpath("string(.//mods:extent/mods:start)",
                                namespaces=ns)
        try:
            page.sequence = int(seq_string)
        except ValueError:
            raise BatchLoaderException(
                "could not determine sequence number for page from '%s'" %
                seq_string)
        page.number = mods.xpath('string(.//mods:detail[@type="page number"])',
                                 namespaces=ns).strip()

        reel_number = mods.xpath(
            'string(.//mods:identifier[@type="reel number"])',
            namespaces=ns).strip()
        try:
            reel = models.Reel.objects.get(number=reel_number,
                                           batch=self.current_batch)
            page.reel = reel
        except models.Reel.DoesNotExist:
            if reel_number:
                reel = models.Reel(number=reel_number,
                                   batch=self.current_batch,
                                   implicit=True)
                reel.save()
                page.reel = reel
            else:
                LOGGER.warning("unable to find reel number in page metadata")

        LOGGER.info("Assigned page sequence: %s", page.sequence)

        _section_dmdid = div.xpath(
            'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
            namespaces=ns)
        if _section_dmdid:
            section_mods = dmd_mods(doc, _section_dmdid)
            section_label = section_mods.xpath(
                'string(.//mods:detail[@type="section label"]/mods:number[1])',
                namespaces=ns).strip()
            if section_label:
                page.section_label = section_label

        page.issue = issue

        LOGGER.info("Saving page. issue date: %s, page sequence: %s",
                    issue.date_issued, page.sequence)

        # TODO - consider the possibility of executing the file name
        #        assignments (below) before this page.save().
        page.save()

        notes = []
        for mods_note in mods.xpath(".//mods:note", namespaces=ns):
            type = mods_note.xpath("string(./@type)")
            label = mods_note.xpath("string(./@displayLabel)")
            text = mods_note.xpath("string(.)").strip()
            note = models.PageNote(type=type, label=label, text=text)
            notes.append(note)
        page.notes = notes

        # there's a level indirection between the METS structmap and the
        # details about specific files in this package ...
        # so we have to first get the FILEID from the issue div in the
        # structmap and then use it to look up the file details in the
        # larger document.

        for fptr in div.xpath("./mets:fptr", namespaces=ns):
            file_id = fptr.attrib["FILEID"]
            file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                                namespaces=ns)[0]
            file_type = file_el.attrib["USE"]

            # get the filename relative to the storage location
            file_name = file_el.xpath("string(./mets:FLocat/@xlink:href)",
                                      namespaces=ns)
            file_name = urlparse.urljoin(doc.docinfo.URL, file_name)
            file_name = self.storage_relative_path(file_name)

            if file_type == "master":
                page.tiff_filename = file_name
            elif file_type == "service":
                page.jp2_filename = file_name
                try:
                    # extract image dimensions from technical metadata for jp2
                    for admid in file_el.attrib["ADMID"].split(" "):
                        length, width = get_dimensions(doc, admid)
                        if length and width:
                            page.jp2_width = width
                            page.jp2_length = length
                            break
                except KeyError:
                    LOGGER.info(
                        "Could not determine dimensions of jp2 for issue: %s page: %s... trying harder...",
                        page.issue,
                        page,
                    )

                if not page.jp2_width:
                    raise BatchLoaderException(
                        "No jp2 width for issue: %s page: %s" %
                        (page.issue, page))
                if not page.jp2_length:
                    raise BatchLoaderException(
                        "No jp2 length for issue: %s page: %s" %
                        (page.issue, page))
            elif file_type == "derivative":
                page.pdf_filename = file_name
            elif file_type == "ocr":
                page.ocr_filename = file_name

        if page.ocr_filename:
            # don't incurr overhead of extracting ocr text, word coordinates
            # and indexing unless the batch loader has been set up to do it
            if self.PROCESS_OCR:
                page = self.process_ocr(page)
        else:
            LOGGER.info("No ocr filename for issue: %s page: %s", page.issue,
                        page)

        LOGGER.debug("saving page: %s", page.url)
        page.save()
        return page

    def process_ocr(self, page):
        LOGGER.debug("extracting ocr text and word coords for %s", page.url)

        url = urlparse.urljoin(self.current_batch.storage_url,
                               page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        ocr.save()
        lang_text_solr = {}
        for lang, text in lang_text.iteritems():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                LOGGER.warning(
                    "Language %s does not exist in the database. Defaulting to English.",
                    lang)
                # default to english as per requirement
                language = models.Language.objects.get(code="eng")
            ocr.language_texts.create(language=language)
            lang_text_solr[language.code] = text

        page.ocr = ocr
        page.lang_text = lang_text_solr
        page.save()
        return page

    def _process_coordinates(self, page, coords):
        LOGGER.debug("writing out word coords for %s", page.url)

        # We'll use a temporary file in case the coordinates dir is configured
        # to a network filesystem which has poor update performance
        # characteristics
        fd, path = tempfile.mkstemp(text="w",
                                    suffix=".coordinates",
                                    dir=settings.TEMP_STORAGE)
        f = open(path, "w")
        f.write(gzip_compress(json.dumps(coords)))
        f.close()
        os.close(fd)
        final_path = models.coordinates_path(page._url_parts())
        try:
            shutil.move(path, final_path)
        except Exception:
            LOGGER.warning(
                'Could not move coordinates to "%s". Waiting 5 seconds before trying again…',
                final_path)
            time.sleep(5)
            shutil.move(path, final_path)

    def process_coordinates(self, batch_path):
        LOGGER.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    if not page.ocr_filename:
                        LOGGER.warning(
                            "Batch [%s] page [%s] has no OCR; skipping coordinates processing",
                            batch_name,
                            page,
                        )
                    else:
                        url = urlparse.urljoin(self.current_batch.storage_url,
                                               page.ocr_filename)
                        LOGGER.debug("Extracting OCR from url %s", url)
                        lang_text, coords = ocr_extractor(url)
                        self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            LOGGER.exception(msg)
            raise BatchLoaderException(msg)

    def storage_relative_path(self, path):
        """returns a relative path for a given file path within a batch, so
        that storage can be re-homed without having to rewrite paths in the db
        """
        rel_path = path.replace(self.current_batch.storage_url, "")
        return rel_path

    @transaction.atomic
    def purge_batch(self, batch_name):
        batch_name = _normalize_batch_name(batch_name)

        try:
            batch = self._get_batch(batch_name)
        except Batch.DoesNotExist:
            LOGGER.info("Batch %s does not exist", batch_name)
            return

        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()

        try:
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up symlinks if exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                LOGGER.info("Removing symlink %s", link_name)
                os.remove(link_name)
        except Exception as e:
            msg = "purge failed: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)

    def _purge_batch(self, batch):
        batch_name = batch.name
        # just delete batch causes memory to bloat out
        # so we do it piece-meal
        for issue in batch.issues.prefetch_related("pages__issue",
                                                   "pages__issue__title"):
            for page in issue.pages.all():
                page.delete()
                # remove coordinates
                if os.path.exists(models.coordinates_path(page._url_parts())):
                    os.remove(models.coordinates_path(page._url_parts()))
            issue.delete()
        batch.delete()
        if self.PROCESS_OCR:
            self.solr.delete_query('batch:"%s"' % batch_name)
            self.solr.commit()
Code example #25
File: sample_redis.py Project: bodleian/datafinder
        "uuid": [u"78755d851f9a453b84a51b1c00c68553"],
        "depositor": "zool0982"
        #               'identifier': ['fri_day1'],
        #                'aggregatedResource': ['http://datafinder-d2v.bodleian.ox.ac.uk/DataFinder/datasets/fri_day1/df_manifest.rdf'],
        #                'mediator': ['admin'],
        #               'text': ['', 'http://vocab.ox.ac.uk/projectfunding#', '', 'seeking_approval', '', '', 'yes', '', ''],
        #               'depositor': ['zool0982'],
        #               'embargoedUntilDate': ['2083-06-21T14:08:45Z'],
        #               'alternative': ['fri_day1'],
        #               'subject': [''],
        #               'rights': ['http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c'],
        #               'publisher': ['Bodleian Libraries, University of Oxford'],
        #               'license': ['CC0 1.0 Universal (CC0 1.0). See http://creativecommons.org/publicdomain/zero/1.0/legalcode'],
        #               'language': [''],
        #               'title': ['fri_day1'],
        #               'embargoStatus': ['True'],
        #               'description': [''],
        #               'format': [''],
        #               'modified': ['2013-06-21 14:08:45.525602'],
        #               'currentVersion': ['2'],
        #               'created': ['2013-06-21 14:08:45.253033'],
        #               'issued': [''],
        #               'type': ['', 'http://vocab.ox.ac.uk/dataset/schema#DataSet']
    }

    #   solr_doc = {'identifier': ['fri_day1'], 'aggregatedResource': ['http://datafinder-d2v.bodleian.ox.ac.uk/DataFinder/datasets/fri_day1/df_manifest.rdf'], 'mediator': ['admin'], 'text': ['', '', '', 'http://vocab.ox.ac.uk/projectfunding#', '', 'yes', '', 'seeking_approval', ''], 'depositor': ['zool0982'], 'embargoedUntilDate': ['2083-06-21T14:08:45Z'], 'alternative': ['fri_day1'], 'subject': [''], 'rights': ['http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c'], 'publisher': ['', 'Bodleian Libraries, University of Oxford'], 'license': ['CC0 1.0 Universal (CC0 1.0). See http://creativecommons.org/publicdomain/zero/1.0/legalcode'], 'uuid': [u'4fb84512bfaf4927945ea3c241bf21c0'], 'language': [''], 'title': ['fri_day1'], 'embargoStatus': ['True'], 'description': [''], 'format': [''], 'modified': ['2013-06-21 14:08:45.525602'], 'id': ['fri_day1'], 'currentVersion': ['2'], 'created': ['2013-06-21 14:08:45.253033'], 'issued': [''], 'silo': ['DataFinder'], 'type': ['http://vocab.ox.ac.uk/dataset/schema#DataSet', '']}
    # solr_doc = {'identifier': ['mond_ay2'], 'aggregatedResource': ['http://datafinder-d2v.bodleian.ox.ac.uk/DataFinder/datasets/mond_ay2/df_manifest.rdf'], 'mediator': ['admin'], 'text': ['', 'http://vocab.ox.ac.uk/projectfunding#', '', 'seeking_approval', '', 'yes', '', '', ''], 'depositor': 'zool0982', 'alternative': ['mond_ay2'], 'embargoedUntilDate': ['2083-06-24T03:41:53Z'], 'subject': [''], 'rights': ['http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c'], 'publisher': ['Bodleian Libraries, University of Oxford'], 'license': ['CC0 1.0 Universal (CC0 1.0). See http://creativecommons.org/publicdomain/zero/1.0/legalcode'], 'uuid': [u'78755d851f9a453b84a51b1c00c68553'], 'language': [''], 'title': ['mond_ay2'], 'embargoStatus': ['True'], 'description': ['mond_ay2'], 'format': [''], 'modified': ['2013-06-24 03:41:53.988847'], 'id': ['mond_ay2'], 'currentVersion': ['2'], 'created': ['2013-06-24 03:41:53.618090'], 'issued': [''], 'silo': ['DataFinder'], 'type': ['', 'http://vocab.ox.ac.uk/dataset/schema#DataSet']}
    # print repr(solr_doc)
    solr.add(_commit=True, **solr_doc)
    solr.commit()
Code example #26
 def handle(self, **options):
     solr = SolrConnection(settings.SOLR)
     solr.commit()
Code example #27
File: index.py Project: LibraryOfCongress/chronam
def commit():
    solr = SolrConnection(settings.SOLR)
    solr.commit()
Code example #28
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """
    def __init__(self, process_ocr=True, process_coordinates=True):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        self.PROCESS_OCR = process_ocr
        if self.PROCESS_OCR:
            self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates

    def _find_batch_file(self, batch):
        """
        TODO: Who can we toss the requirement at to make this
        available in a canonical location?
        """
        # look for batch_1.xml, BATCH_1.xml, etc
        for alias in [
                "batch_1.xml", "BATCH_1.xml", "batchfile_1.xml", "batch_2.xml",
                "BATCH_2.xml", "batch.xml"
        ]:
            # TODO: might we want 'batch.xml' first? Leaving last for now to
            # minimize impact.
            url = urllib.parse.urljoin(batch.storage_url, alias)
            try:
                u = urllib.request.urlopen(url)
                validated_batch_file = alias
                break
            except urllib.error.HTTPError as e:
                continue
            except urllib.error.URLError as e:
                continue
        else:
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?"
                % batch.path)
        return validated_batch_file

    def _sanity_check_batch(self, batch):
        #if not os.path.exists(batch.path):
        #    raise BatchLoaderException("batch does not exist at %s" % batch.path)
        #b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')

        """
        self.pages_processed = 0

        # Trailing slash breaks comparison to link_name below, so strip off
        batch_path = batch_path.rstrip("/")

        _logger.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path)
        if dirname:
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)

            # Create symlink if paths don't match, symlink not already there,
            # and batch_path wasn't input with a BATCH_STORAGE symlink path
            if (batch_path != link_name and not os.path.islink(link_name)
                    and not (os.path.islink(settings.BATCH_STORAGE)
                             and batch_path.startswith(
                                 os.path.realpath(settings.BATCH_STORAGE)))):
                _logger.info("creating symlink %s -> %s", batch_path,
                             link_name)
                os.symlink(batch_path, link_name)
        else:
            batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE,
                                                batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"

        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = Batch.objects.get(name=batch_name)
            _logger.info("Batch already loaded: %s" % batch_name)
            return batch
        except Batch.DoesNotExist as e:
            pass

        _logger.info("loading batch: %s" % batch_name)
        t0 = time()
        times = []

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath('ndnp:reel', namespaces=ns):

                reel_number = e.attrib['reelNumber'].strip()

                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist as e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urllib.parse.urljoin(batch.storage_url, e.text)
                try:
                    issue = self._load_issue(mets_url)
                except ValueError as e:
                    _logger.exception(e)
                    continue
                reset_queries()
                times.append((time() - t0, self.pages_processed))

            # commit new changes to the solr index, if we are indexing
            if self.PROCESS_OCR:
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            _logger.info(msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            try:
                self.purge_batch(batch_name)
            except Exception as pbe:
                _logger.error("purge batch failed for failed load batch: %s" %
                              pbe)
                _logger.exception(pbe)
            raise BatchLoaderException(msg)

        # updates the min and max years of all titles
        set_fulltext_range()
        return batch

    def _get_batch(self, batch_name, batch_source=None, create=False):
        if create:
            batch = self._create_batch(batch_name, batch_source)
        else:
            batch = Batch.objects.get(name=batch_name)
        return batch

    def _create_batch(self, batch_name, batch_source):
        if Batch.objects.filter(name=batch_name).count() != 0:
            raise BatchLoaderException("batch %s already loaded" % batch_name)
        batch = Batch()
        batch.name = batch_name
        batch.source = batch_source
        try:
            _, org_code, name_part, version = batch_name.split("_", 3)
            awardee_org_code = org_code
            batch.awardee = Awardee.objects.get(org_code=awardee_org_code)
        except Awardee.DoesNotExist as e:
            msg = "no awardee for org code: %s" % awardee_org_code
            _logger.error(msg)
            raise BatchLoaderException(msg)
        batch.save()
        return batch

    def _load_issue(self, mets_file):
        _logger.debug("parsing issue mets file: %s" % mets_file)
        doc = etree.parse(mets_file)

        # get the mods for the issue
        div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0]
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)

        # set up a new Issue
        issue = Issue()
        issue.volume = mods.xpath(
            'string(.//mods:detail[@type="volume"]/mods:number[1])',
            namespaces=ns).strip()
        issue.number = mods.xpath(
            'string(.//mods:detail[@type="issue"]/mods:number[1])',
            namespaces=ns).strip()
        issue.edition = int(
            mods.xpath(
                'string(.//mods:detail[@type="edition"]/mods:number[1])',
                namespaces=ns))
        issue.edition_label = mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:caption[1])',
            namespaces=ns).strip()

        # parse issue date
        date_issued = mods.xpath('string(.//mods:dateIssued)', namespaces=ns)
        issue.date_issued = datetime.strptime(date_issued, '%Y-%m-%d')

        # attach the Issue to the appropriate Title
        lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])',
                          namespaces=ns).strip()
        try:
            title = Title.objects.get(lccn=lccn)
        except Exception as e:
            url = settings.MARC_RETRIEVAL_URLFORMAT % lccn
            _logger.info("attempting to load marc record from %s", url)
            management.call_command('load_titles', url)
            title = Title.objects.get(lccn=lccn)

        issue.title = title

        issue.batch = self.current_batch
        issue.save()
        _logger.debug("saved issue: %s" % issue.url)

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)')
            note = models.IssueNote(type=type, label=label, text=text)
            notes.append(note)
        issue.notes.set(notes, bulk=False)
        issue.save()

        # attach pages: lots of logging because it's expensive
        for page_div in div.xpath('.//mets:div[@TYPE="np:page"]',
                                  namespaces=ns):
            try:
                page = self._load_page(doc, page_div, issue)
                self.pages_processed += 1
            except BatchLoaderException as e:
                _logger.exception(e)

        return issue

    def _load_page(self, doc, div, issue):
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)
        page = Page()

        seq_string = mods.xpath('string(.//mods:extent/mods:start)',
                                namespaces=ns)
        try:
            page.sequence = int(seq_string)
        except ValueError as e:
            raise BatchLoaderException(
                "could not determine sequence number for page from '%s'" %
                seq_string)
        page.number = mods.xpath('string(.//mods:detail[@type="page number"])',
                                 namespaces=ns).strip()

        reel_number = mods.xpath(
            'string(.//mods:identifier[@type="reel number"])',
            namespaces=ns).strip()
        try:
            reel = models.Reel.objects.get(number=reel_number,
                                           batch=self.current_batch)
            page.reel = reel
        except models.Reel.DoesNotExist as e:
            if reel_number:
                reel = models.Reel(number=reel_number,
                                   batch=self.current_batch,
                                   implicit=True)
                reel.save()
                page.reel = reel
            else:
                _logger.warn("unable to find reel number in page metadata")

        _logger.info("Assigned page sequence: %s" % page.sequence)

        _section_dmdid = div.xpath(
            'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
            namespaces=ns)
        if _section_dmdid:
            section_mods = dmd_mods(doc, _section_dmdid)
            section_label = section_mods.xpath(
                'string(.//mods:detail[@type="section label"]/mods:number[1])',
                namespaces=ns).strip()
            if section_label:
                page.section_label = section_label

        page.issue = issue

        _logger.info("Saving page. issue date: %s, page sequence: %s" %
                     (issue.date_issued, page.sequence))

        # TODO - consider the possibility of executing the file name
        #        assignments (below) before this page.save().
        page.save()

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)').strip()
            note = models.PageNote(type=type, label=label, text=text)
            notes.append(note)
        page.notes.set(notes, bulk=False)

        # there's a level of indirection between the METS structmap and the
        # details about specific files in this package ...
        # so we have to first get the FILEID from the page div in the
        # structmap and then use it to look up the file details in the
        # larger document.

        for fptr in div.xpath('./mets:fptr', namespaces=ns):
            file_id = fptr.attrib['FILEID']
            file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                                namespaces=ns)[0]
            file_type = file_el.attrib['USE']

            # get the filename relative to the storage location
            file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                      namespaces=ns)
            file_name = urllib.parse.urljoin(doc.docinfo.URL, file_name)
            file_name = self.storage_relative_path(file_name)

            if file_type == 'master':
                page.tiff_filename = file_name
            elif file_type == 'service':
                page.jp2_filename = file_name
                try:
                    # extract image dimensions from technical metadata for jp2
                    for admid in file_el.attrib['ADMID'].split(' '):
                        length, width = get_dimensions(doc, admid)
                        if length and width:
                            page.jp2_width = width
                            page.jp2_length = length
                            break
                except KeyError as e:
                    _logger.info(
                        "Could not determine dimensions of jp2 for issue: %s page: %s... trying harder..."
                        % (page.issue, page))
                    im = Image.open(page.jp2_abs_filename)
                    page.jp2_width, page.jp2_length = im.size

                if not page.jp2_width:
                    raise BatchLoaderException(
                        "No jp2 width for issue: %s page: %s" %
                        (page.issue, page))
                if not page.jp2_length:
                    raise BatchLoaderException(
                        "No jp2 length for issue: %s page: %s" %
                        (page.issue, page))
            elif file_type == 'derivative':
                page.pdf_filename = file_name
            elif file_type == 'ocr':
                page.ocr_filename = file_name

        if page.ocr_filename:
            # don't incur the overhead of extracting ocr text, word coordinates
            # and indexing unless the batch loader has been set up to do it
            if self.PROCESS_OCR:
                self.process_ocr(page)
        else:
            _logger.info("No ocr filename for issue: %s page: %s" %
                         (page.issue, page))

        _logger.debug("saving page: %s" % page.url)
        page.save()
        return page

    def process_ocr(self, page, index=True):
        _logger.debug("extracting ocr text and word coords for %s" % page.url)

        url = urllib.parse.urljoin(self.current_batch.storage_url,
                                   page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        ocr.save()
        for lang, text in lang_text.items():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language, text=text)
        page.ocr = ocr
        if index:
            _logger.debug("indexing ocr for: %s" % page.url)
            self.solr.add(**page.solr_doc)
            page.indexed = True
        page.save()

    def _process_coordinates(self, page, coords):
        _logger.debug("writing out word coords for %s" % page.url)

        f = open(models.coordinates_path(page._url_parts()), "wb")
        f.write(gzip_compress(json.dumps(coords).encode('utf-8')))
        f.close()

    def process_coordinates(self, batch_path):
        _logger.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE,
                                                batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    url = urllib.parse.urljoin(self.current_batch.storage_url,
                                               page.ocr_filename)

                    lang_text, coords = ocr_extractor(url)
                    self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            raise BatchLoaderException(msg)

    def storage_relative_path(self, path):
        """returns a relative path for a given file path within a batch, so
        that storage can be re-homed without having to rewrite paths in the db
        """
        rel_path = path.replace(self.current_batch.storage_url, '')
        return rel_path

    def purge_batch(self, batch_name):
        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()

        try:
            batch = self._get_batch(batch_name)
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up the symlink if it exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                _logger.info("Removing symlink %s", link_name)
                os.remove(link_name)
            # updates the min and max years of all titles
            set_fulltext_range()
        except Exception as e:
            msg = "purge failed: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)

    def _purge_batch(self, batch):
        batch_name = batch.name
        # just deleting the batch causes memory to bloat out,
        # so we do it piece-meal
        for issue in batch.issues.all():
            for page in issue.pages.all():
                page.delete()
                # remove coordinates
                if os.path.exists(models.coordinates_path(page._url_parts())):
                    os.remove(models.coordinates_path(page._url_parts()))
                reset_queries()
            issue.delete()
        batch.delete()
        if self.PROCESS_OCR:
            self.solr.delete_query('batch:"%s"' % batch_name)
            self.solr.commit()
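
A minimal sketch of driving the loader above from a script; the module path and batch location are illustrative assumptions, and in practice these calls are wrapped in Django management commands:

# module path and batch path are assumptions for illustration only
from core.batch_loader import BatchLoader

loader = BatchLoader(process_ocr=True, process_coordinates=True)

# load_batch() parses the batch XML, loads each issue METS file, and
# indexes page OCR in solr when process_ocr is enabled
batch = loader.load_batch('/opt/batches/batch_curiv_ahwahnee_ver01')
print("loaded %s pages" % batch.page_count)

# purge_batch() deletes the issues and pages piecemeal and, when OCR was
# processed, removes the pages from the solr index as well
loader.purge_batch('batch_curiv_ahwahnee_ver01')
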
コード例 #34
0
	def submit(self, **kwargs):

		# Grab the user and request details
		json_data = json.loads(kwargs['json'])

		user_id = json_data['userid']
		channel_id = self.channel_id
		requests = json_data['requests']

		# Create a local representation of the requests
		tracks = []
		for request in requests:

			source_id = request['sourceid']
			track_id = request['trackid']

			# Build up a Solr query
			filters = []
			filters.append('type:track')
			filters.append('request_source_id:%s' % source_id)
			filters.append('request_track_id:%s' % track_id)

			# Make the request to Solr
			solr = SolrConnection(settings.SOLR_URL)
			response = solr.select(q = ' AND '.join(filters), fields = 'track_artist, track_album, track_title')

			if len(response.results) == 1:

				track = {
					'id': 'request_%s_%s_%s' % (source_id, track_id, user_id),
					'type': 'request',

					'channel_id': channel_id,

					'track_artist': response.results[0]['track_artist'],
					'track_album': response.results[0]['track_album'],
					'track_title': response.results[0]['track_title'],

					'request_user_id': user_id,
					'request_source_id': source_id,
					'request_track_id': track_id
				}
				tracks.append(track)

		# Create the request in the search engine
		solr = SolrConnection(settings.SOLR_URL)
		solr.add_many(tracks)
		solr.commit()
		solr.close()

		# Log the request to the database
		db = psycopg2.connect(database='airjukebox')
		cr = db.cursor()

		for track in tracks:

			cr.execute('insert into tbrequests (userid, locationid, sourceid, trackid) values (%(request_user_id)s, %(channel_id)s, %(request_source_id)s, %(request_track_id)s)', track)

		db.commit()

		cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
		return json.dumps(tracks, ensure_ascii=False, indent=4).encode('utf-8')
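
For reference, a sketch of the JSON document this submit() handler expects, based only on the keys it reads above; the ids are made up for illustration:

import json

# hypothetical payload; 'userid' and each request's 'sourceid'/'trackid'
# are the only fields submit() looks at
payload = {
    "userid": "42",
    "requests": [
        {"sourceid": "library", "trackid": "1001"},
        {"sourceid": "library", "trackid": "1002"},
    ],
}

# the handler would then be called with the serialized form, e.g.
# handler.submit(json=json.dumps(payload))
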
コード例 #35
0
ファイル: findmyfile.py プロジェクト: ggaurav/find_my_file
def uninstall(field, data, path, length, mode):	
	from solr import SolrConnection
	s = SolrConnection(SOLR_URL)
	s.delete_query('id:*')
	s.commit()
コード例 #36
0
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """

    def __init__(self, process_ocr=True, process_coordinates=True):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        self.PROCESS_OCR = process_ocr
        self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates

    def _find_batch_file(self, batch):
        """
        TODO: Who can we toss the requirement at to make this
        available in a canonical location?
        """
        # look for batch_1.xml, BATCH_1.xml, etc
        for alias in ["batch_1.xml", "BATCH_1.xml", "batchfile_1.xml", "batch_2.xml", "BATCH_2.xml", "batch.xml"]:
            # TODO: might we want 'batch.xml' first? Leaving last for now to
            # minimize impact.
            url = urlparse.urljoin(batch.storage_url, alias)
            try:
                urllib2.urlopen(url)
                validated_batch_file = alias
                break
            except (urllib2.HTTPError, urllib2.URLError):
                continue
        else:
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?" % batch.path)
        return validated_batch_file

    def _sanity_check_batch(self, batch):
        # if not os.path.exists(batch.path):
        #    raise BatchLoaderException("batch does not exist at %s" % batch.path)
        #b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path, strict=True):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')

        """

        LOGGER.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if batch_path != link_name and not os.path.islink(link_name):
                LOGGER.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"

        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            try:
                batch = Batch.objects.get(name=batch_name)
                LOGGER.info("Batch already loaded: %s", batch_name)
                return batch
            except Batch.DoesNotExist as e:
                pass

        LOGGER.info("loading batch: %s", batch_name)

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath('ndnp:reel', namespaces=ns):

                reel_number = e.attrib['reelNumber'].strip()

                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist as e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)

                try:
                    issue, pages = self._load_issue(mets_url)
                except ValueError as e:
                    LOGGER.exception(e)
                    continue

                # commit new changes to the solr index, if we are indexing
                if self.PROCESS_OCR:
                    LOGGER.info("Adding pages to solr index from issue %s", issue.title)
                    for page in pages:
                        LOGGER.debug("indexing ocr for: %s", page.url)
                        self.solr.add(**page.solr_doc)
                        page.indexed = True
                        page.save()

            if self.PROCESS_OCR:
                LOGGER.info("Committing solr index")
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            LOGGER.info(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            try:
                self.purge_batch(batch_name)
            except Exception as pbe:
                LOGGER.error("purge batch failed for failed load batch: %s", pbe)
                LOGGER.exception(pbe)
            raise BatchLoaderException(msg)

        if settings.IS_PRODUCTION:
            batch.released = datetime.now()
            batch.save()

        return batch

    def _get_batch(self, batch_name, batch_source=None, create=False):
        if create:
            batch = self._create_batch(batch_name, batch_source)
        else:
            batch = Batch.objects.get(name=batch_name)
        return batch

    def _create_batch(self, batch_name, batch_source):
        if Batch.objects.filter(name=batch_name).count() != 0:
            raise BatchLoaderException("batch %s already loaded" % batch_name)
        batch = Batch()
        batch.name = batch_name
        batch.source = batch_source
        try:
            parts = batch_name.split("_", 3)
            if len(parts) == 4:
                parts = parts[1:]
            awardee_org_code, name_part, version = parts
            batch.awardee = Awardee.objects.get(org_code=awardee_org_code)
        except Awardee.DoesNotExist:
            msg = "no awardee for org code: %s" % awardee_org_code
            LOGGER.error(msg)
            raise BatchLoaderException(msg)
        batch.save()
        return batch

    def _load_issue(self, mets_file):
        LOGGER.debug("parsing issue mets file: %s", mets_file)
        doc = etree.parse(mets_file)

        # get the mods for the issue
        div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0]
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)

        # set up a new Issue
        issue = Issue()
        issue.volume = mods.xpath(
            'string(.//mods:detail[@type="volume"]/mods:number[1])',
            namespaces=ns).strip()
        issue.number = mods.xpath(
            'string(.//mods:detail[@type="issue"]/mods:number[1])',
            namespaces=ns).strip()
        issue.edition = int(mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:number[1])',
            namespaces=ns))
        issue.edition_label = mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:caption[1])',
            namespaces=ns).strip()

        # parse issue date
        date_issued = mods.xpath('string(.//mods:dateIssued)', namespaces=ns)
        issue.date_issued = datetime.strptime(date_issued, '%Y-%m-%d')

        # attach the Issue to the appropriate Title
        lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])',
                          namespaces=ns).strip()
        try:
            title = Title.objects.get(lccn=lccn)
        except Exception as e:
            url = 'http://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn
            LOGGER.info("attempting to load marc record from %s", url)
            management.call_command('load_titles', url)
            title = Title.objects.get(lccn=lccn)
        issue.title = title

        issue.batch = self.current_batch
        issue.save()
        LOGGER.debug("saved issue: %s", issue.url)

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)')
            note = models.IssueNote(type=type, label=label, text=text)
            notes.append(note)
        issue.notes = notes
        issue.save()

        # attach pages: lots of logging because it's expensive
        pages = []
        for page_div in div.xpath('.//mets:div[@TYPE="np:page"]',
                                  namespaces=ns):
            try:
                pages.append(self._load_page(doc, page_div, issue))
            except BatchLoaderException as e:
                LOGGER.error("Failed to load page. doc: %s, page div: %s, issue: %s", doc, page_div, issue)
                LOGGER.exception(e)

        return issue, pages

    def _load_page(self, doc, div, issue):
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)
        page = Page()

        seq_string = mods.xpath(
            'string(.//mods:extent/mods:start)', namespaces=ns)
        try:
            page.sequence = int(seq_string)
        except ValueError:
            raise BatchLoaderException("could not determine sequence number for page from '%s'" % seq_string)
        page.number = mods.xpath(
            'string(.//mods:detail[@type="page number"])',
            namespaces=ns
        ).strip()

        reel_number = mods.xpath(
            'string(.//mods:identifier[@type="reel number"])',
            namespaces=ns
        ).strip()
        try:
            reel = models.Reel.objects.get(number=reel_number,
                                           batch=self.current_batch)
            page.reel = reel
        except models.Reel.DoesNotExist:
            if reel_number:
                reel = models.Reel(number=reel_number,
                                   batch=self.current_batch,
                                   implicit=True)
                reel.save()
                page.reel = reel
            else:
                LOGGER.warn("unable to find reel number in page metadata")

        LOGGER.info("Assigned page sequence: %s", page.sequence)

        _section_dmdid = div.xpath(
            'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
            namespaces=ns)
        if _section_dmdid:
            section_mods = dmd_mods(doc, _section_dmdid)
            section_label = section_mods.xpath(
                'string(.//mods:detail[@type="section label"]/mods:number[1])',
                namespaces=ns).strip()
            if section_label:
                page.section_label = section_label

        page.issue = issue

        LOGGER.info("Saving page. issue date: %s, page sequence: %s", issue.date_issued, page.sequence)

        # TODO - consider the possibility of executing the file name
        #        assignments (below) before this page.save().
        page.save()

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)').strip()
            note = models.PageNote(type=type, label=label, text=text)
            notes.append(note)
        page.notes = notes

        # there's a level of indirection between the METS structmap and the
        # details about specific files in this package ...
        # so we have to first get the FILEID from the page div in the
        # structmap and then use it to look up the file details in the
        # larger document.

        for fptr in div.xpath('./mets:fptr', namespaces=ns):
            file_id = fptr.attrib['FILEID']
            file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                                namespaces=ns)[0]
            file_type = file_el.attrib['USE']

            # get the filename relative to the storage location
            file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                      namespaces=ns)
            file_name = urlparse.urljoin(doc.docinfo.URL, file_name)
            file_name = self.storage_relative_path(file_name)

            if file_type == 'master':
                page.tiff_filename = file_name
            elif file_type == 'service':
                page.jp2_filename = file_name
                try:
                    # extract image dimensions from technical metadata for jp2
                    for admid in file_el.attrib['ADMID'].split(' '):
                        length, width = get_dimensions(doc, admid)
                        if length and width:
                            page.jp2_width = width
                            page.jp2_length = length
                            break
                except KeyError:
                    LOGGER.info("Could not determine dimensions of jp2 for issue: %s page: %s... trying harder...", page.issue, page)
                    # fall back to reading the dimensions from the jp2 itself
                    im = Image.open(page.jp2_abs_filename)
                    page.jp2_width, page.jp2_length = im.size

                if not page.jp2_width:
                    raise BatchLoaderException("No jp2 width for issue: %s page: %s" % (page.issue, page))
                if not page.jp2_length:
                    raise BatchLoaderException("No jp2 length for issue: %s page: %s" % (page.issue, page))
            elif file_type == 'derivative':
                page.pdf_filename = file_name
            elif file_type == 'ocr':
                page.ocr_filename = file_name

        if page.ocr_filename:
            # don't incur the overhead of extracting ocr text, word coordinates
            # and indexing unless the batch loader has been set up to do it
            if self.PROCESS_OCR:
                page = self.process_ocr(page)
        else:
            LOGGER.info("No ocr filename for issue: %s page: %s", page.issue, page)

        LOGGER.debug("saving page: %s", page.url)
        page.save()
        return page

    def process_ocr(self, page):
        LOGGER.debug("extracting ocr text and word coords for %s", page.url)

        url = urlparse.urljoin(self.current_batch.storage_url,
                               page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        ocr.save()
        lang_text_solr = {}
        for lang, text in lang_text.iteritems():
            try:
                language = models.Language.objects.get(Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                LOGGER.warn("Language %s does not exist in the database. Defaulting to English.", lang)
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language)
            lang_text_solr[language.code] = text

        page.ocr = ocr
        page.lang_text = lang_text_solr
        page.save()
        return page

    def _process_coordinates(self, page, coords):
        LOGGER.debug("writing out word coords for %s", page.url)

        # use a temp file in case the coordinates dir is an NFS or S3 mount,
        # which can have poor performance for multiple small writes
        fd, path = tempfile.mkstemp(text=True, suffix=".coordinates", dir=settings.TEMP_STORAGE)
        f = open(path, "w")
        f.write(gzip_compress(json.dumps(coords)))
        f.close()
        os.close(fd)
        final_path = models.coordinates_path(page._url_parts())
        try:
            shutil.move(path, final_path)
        except Exception:
            LOGGER.warn("Could not move coordinates to [%s]. Waiting 5 seconds and trying again in case of network mount", final_path)
            time.sleep(5)
            shutil.move(path, final_path)

    def process_coordinates(self, batch_path):
        LOGGER.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    if not page.ocr_filename:
                        LOGGER.warn("Batch [%s] has page [%s] that has no OCR. Skipping processing coordinates for page." % (batch_name, page))
                    else:
                        url = urlparse.urljoin(self.current_batch.storage_url,
                                               page.ocr_filename)
                        LOGGER.debug("Extracting OCR from url %s", url)
                        lang_text, coords = ocr_extractor(url)
                        self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            LOGGER.exception(msg)
            raise BatchLoaderException(msg)

    def storage_relative_path(self, path):
        """returns a relative path for a given file path within a batch, so
        that storage can be re-homed without having to rewrite paths in the db
        """
        rel_path = path.replace(self.current_batch.storage_url, '')
        return rel_path

    @transaction.atomic
    def purge_batch(self, batch_name):
        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()

        try:
            batch = self._get_batch(batch_name)
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up the symlink if it exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                LOGGER.info("Removing symlink %s", link_name)
                os.remove(link_name)
        except Exception as e:
            msg = "purge failed: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)

    def _purge_batch(self, batch):
        batch_name = batch.name
        # just deleting the batch causes memory to bloat out,
        # so we do it piece-meal
        for issue in batch.issues.all():
            for page in issue.pages.all():
                page.delete()
                # remove coordinates
                if os.path.exists(models.coordinates_path(page._url_parts())):
                    os.remove(models.coordinates_path(page._url_parts()))
            issue.delete()
        batch.delete()
        if self.PROCESS_OCR:
            self.solr.delete_query('batch:"%s"' % batch_name)
            self.solr.commit()
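
The coordinates written out by _process_coordinates() above are gzip-compressed JSON; a hedged sketch of reading one back, assuming gzip_compress produces a standard gzip stream and that the models module is importable as in the loader:

import gzip
import json

from core import models  # module path is an assumption

def read_coordinates(page):
    # the loader wrote gzip_compress(json.dumps(coords)) to this path
    path = models.coordinates_path(page._url_parts())
    with gzip.open(path, "rb") as f:
        return json.loads(f.read())
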
コード例 #37
0
ファイル: app_actions.py プロジェクト: bireme/medlist
def solr_index(med):
    lists = []
    countries = []
    sections = []
    subsections = []
    pharma_form_list = []
    pharma_form_type_list = []
    category_list = []
    observation_list = []

    # if the medicine status is not active, delete it from the solr index
    if not med.active:
        try:
            solr = SolrConnection(settings.SOLR_URL)
            solr.delete(id=str(med.id))
            response = solr.commit()
        except Exception as ex: 
            return False

        return True

    # index medicine on solr index
    medicine_translations = MedicineLocal.objects.filter(medicine=med.id)
    medicine_list = ['en^%s' % med.name.strip()]
    for translation in medicine_translations:
        medicine_list.append('%s^%s' % (translation.language, translation.name.strip()))
    
    medicine_list = "|".join(medicine_list) # ex.: en^codeine|pt-br^codeína|es^codeína

    # retrieve the active pharmaceutical forms of the current medicine
    pharm_forms = med.pharmaceuticalform_set.filter(active=True)
    for form in pharm_forms:

        # ex. en^Tablet|es^Tableta|pt-br^Comprimido
        pharma_form_type_translations = "|".join( form.pharmaceutical_form_type.get_translations() )
        pharma_form_type_list.append(pharma_form_type_translations)

        # ex. en^Tablet|es^Tableta|pt-br^Comprimido|comp^15 mg/ml
        pharma_form_list.append('%s|comp^%s' % (pharma_form_type_translations, form.composition))

        # create category_list (section and subsection where current pharmaceutical form is used on lists)
        section_pharm_form_list = SectionPharmForm.objects.filter(pharmaceutical_form=form)

        for section_pharm_form in section_pharm_form_list:
            #add observations of current section_pharm_form
            if section_pharm_form.only_for_children:
                observation_list.append('only_for_children')
            if section_pharm_form.specialist_care_for_children:
                observation_list.append('specialist_care_for_children')
            if section_pharm_form.restriction_age:
                observation_list.append('restriction_age')
            if section_pharm_form.best_evidence:
                observation_list.append('best_evidence')
            if section_pharm_form.observation:
                observation_list.append('observation')

            section = Section.objects.get(pk=section_pharm_form.section.id)
            section_translations = "|".join(section.get_translations())

            section_tree = section.get_ancestors()
            
            if section_tree:
                for sec in section_tree:                    
                    category_translations = "|".join(sec.get_translations())
                    if category_translations not in category_list:
                        category_list.append(category_translations)
            
            if section_translations not in category_list:
                category_list.append(section_translations)
   
            list_associated = "|".join( section.list.get_translations() )
            if section.list.type == 'c':
                if list_associated not in countries:                                
                    countries.append(list_associated)
            else:                
                if list_associated not in lists:
                    lists.append(list_associated)

    # check if the current medicine has evidence summaries
    has_evidence = None
    evidence_total = MedicineEvidenceSummary.objects.filter(medicine=med.id).count()
    if evidence_total > 0:
        has_evidence = "true"

    # try to create a connection to a solr server and send medicine
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id = str(med.id), 
            type = "medicine",
            name = medicine_list,
            pharmaceutical_form = pharma_form_list,        
            pharmaceutical_form_type = pharma_form_type_list,
            list=lists,
            country=countries,
            category=category_list,
            observation=observation_list,
            has_evidence=has_evidence,
        )
        response = solr.commit()
    except Exception as ex: 
        return False

    return True
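
A small helper, not part of the original code, showing how the pipe-delimited multilingual values built above (e.g. "en^codeine|pt-br^codeína|es^codeína") can be split back into a language-to-text mapping:

def split_translations(value):
    # "en^codeine|pt-br^codeína" -> {'en': 'codeine', 'pt-br': 'codeína'}
    translations = {}
    for pair in value.split("|"):
        lang, _, text = pair.partition("^")
        translations[lang] = text
    return translations
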
コード例 #38
0
ファイル: commit_index.py プロジェクト: CDRH/nebnews
 def handle(self, **options):
     solr = SolrConnection(settings.SOLR)
     solr.commit()
コード例 #39
0
def solr_index(med):
    lists = []
    countries = []
    sections = []
    subsections = []
    pharma_form_list = []
    pharma_form_type_list = []
    category_list = []
    observation_list = []

    # if the medicine status is not active, delete it from the solr index
    if not med.active:
        try:
            solr = SolrConnection(settings.SOLR_URL)
            solr.delete(id=str(med.id))
            response = solr.commit()
        except Exception as ex:
            return False

        return True

    # index medicine on solr index
    medicine_translations = MedicineLocal.objects.filter(medicine=med.id)
    medicine_list = ['en^%s' % med.name.strip()]
    for translation in medicine_translations:
        medicine_list.append('%s^%s' %
                             (translation.language, translation.name.strip()))

    medicine_list = "|".join(
        medicine_list)  # ex.: en^codeine|pt-br^codeína|es^codeína

    # retrieve the active pharmaceutical forms of the current medicine
    pharm_forms = med.pharmaceuticalform_set.filter(active=True)
    for form in pharm_forms:

        # ex. en^Tablet|es^Tableta|pt-br^Comprimido
        pharma_form_type_translations = "|".join(
            form.pharmaceutical_form_type.get_translations())
        pharma_form_type_list.append(pharma_form_type_translations)

        # ex. en^Tablet|es^Tableta|pt-br^Comprimido|comp^15 mg/ml
        pharma_form_list.append(
            '%s|comp^%s' % (pharma_form_type_translations, form.composition))

        # create category_list (section and subsection where current pharmaceutical form is used on lists)
        section_pharm_form_list = SectionPharmForm.objects.filter(
            pharmaceutical_form=form)

        for section_pharm_form in section_pharm_form_list:
            #add observations of current section_pharm_form
            if section_pharm_form.only_for_children:
                observation_list.append('only_for_children')
            if section_pharm_form.specialist_care_for_children:
                observation_list.append('specialist_care_for_children')
            if section_pharm_form.restriction_age:
                observation_list.append('restriction_age')
            if section_pharm_form.best_evidence:
                observation_list.append('best_evidence')
            if section_pharm_form.observation:
                observation_list.append('observation')

            section = Section.objects.get(pk=section_pharm_form.section.id)
            section_translations = "|".join(section.get_translations())

            section_tree = section.get_ancestors()

            if section_tree:
                for sec in section_tree:
                    category_translations = "|".join(sec.get_translations())
                    if category_translations not in category_list:
                        category_list.append(category_translations)

            if section_translations not in category_list:
                category_list.append(section_translations)

            list_associated = "|".join(section.list.get_translations())
            if section.list.type == 'c':
                if list_associated not in countries:
                    countries.append(list_associated)
            else:
                if list_associated not in lists:
                    lists.append(list_associated)

    # check if the current medicine has evidence summaries
    has_evidence = None
    evidence_total = MedicineEvidenceSummary.objects.filter(
        medicine=med.id).count()
    if evidence_total > 0:
        has_evidence = "true"

    # try to create a connection to a solr server and send medicine
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id=str(med.id),
            type="medicine",
            name=medicine_list,
            pharmaceutical_form=pharma_form_list,
            pharmaceutical_form_type=pharma_form_type_list,
            list=lists,
            country=countries,
            category=category_list,
            observation=observation_list,
            has_evidence=has_evidence,
        )
        response = solr.commit()
    except Exception as ex:
        return False

    return True
コード例 #40
0
ファイル: index.py プロジェクト: sshyran/chronam
def index_pages(only_missing=False):
    """index all the pages that are modeled in the database
    """
    solr = SolrConnection(settings.SOLR)

    page_qs = models.Page.objects.order_by("pk")

    if only_missing:
        page_qs = page_qs.filter(indexed=False)
    else:
        # FIXME: we should not churn the index when documents have not been deleted:
        solr.delete_query("type:page")

    # To avoid MySQL limitations, we'll run two queries: the first will only
    # look up the primary keys, letting MySQL satisfy the ORDER BY / LIMIT
    # using only the index, and then we'll use those primary keys to look up
    # the full Page objects for each chunk which will actually be indexed.

    full_page_qs = page_qs.prefetch_related(
        Prefetch(
            "issue",
            queryset=models.Issue.objects.prefetch_related(
                "batch",
                "title",
                "title__languages",
                "title__alt_titles",
                "title__subjects",
                "title__notes",
                "title__places",
                "title__urls",
                "title__essays",
                "title__country",
                "title__holdings",
            ),
        ))

    count = 0
    for pk_chunk in sliced(page_qs.values_list("pk", flat=True), 100):
        # We have to force the PKs into a list to work around limitations in
        # MySQL preventing the use of a subquery which uses LIMIT:
        chunk = full_page_qs.filter(pk__in=list(pk_chunk))

        docs = []
        pks = []

        for page in chunk:
            try:
                docs.append(page.solr_doc)
                pks.append(page.pk)
            except Exception:
                LOGGER.warning("Unable to index page %s",
                               page.url,
                               exc_info=True)
                continue

        if docs:
            solr.add_many(docs)
            solr.commit()
            models.Page.objects.filter(pk__in=pks).update(indexed=True)

        count += len(pk_chunk)
        reset_queries()
        LOGGER.info("indexed %d pages", count)

    solr.commit()
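
The chunking above relies on a sliced() helper (likely more_itertools.sliced); a minimal stand-in with the same observable behavior for anything that supports len() and slicing, such as the evaluated values_list() used here:

def sliced(seq, n):
    # yield consecutive slices of length n (the last one may be shorter)
    for start in range(0, len(seq), n):
        yield seq[start:start + n]

# list(sliced([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]
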