def update_zcatalog(self, records_reponse):
    """ Fetch data from an OAI Server and populate the repository's catalog

    Highly inefficient in terms of storage. It is not recommended to use
    ZCatalog storage.

    Every valid record is looked up in the parent's catalog by id; missing
    ones are created inside this harvester. Selected metadata fields are
    copied onto the record as ``dc_*`` attributes and the record is
    reindexed. Once the generator is exhausted, catalogued records of this
    harvester that were not seen during the run are deleted.

    Arguments:
    records_reponse -- <record> Generator; yields
                       ``((header, meta, about), token)`` tuples

    """
    # A set keeps the final "was it visited?" membership test O(1).
    visited_records = set()
    catalog = self.aq_parent.getCatalog()

    for (header, meta, about), token in records_reponse:
        if not header or not meta:
            # Invalid record, passing through
            continue

        record_id = processId(header.identifier())
        catalog_record = catalog.searchResults(
            meta_type=OAIRecord.meta_type, id=record_id)
        visited_records.add(record_id)

        if len(catalog_record):
            record = catalog_record[0].getObject()
        else:
            manage_addOAIRecord(self, id=record_id,
                                deleted=header.isDeleted(),
                                about=unicode(about))
            record = self._getOb(record_id)
            # The owning harvester is stamped when the record is created.
            record.harvester = self.id

        # Storing some fields for ZCatalog
        for key, val in meta.getMap().iteritems():
            if key == 'identifier' and val:
                # Storing only the url; fall back to the first identifier
                # when none of them matches url_pattern.
                for ident in val:
                    if url_pattern.match(ident):
                        record.dc_identifier = ident
                        break
                else:
                    record.dc_identifier = val[0]
            elif key == 'description' and val:
                record.dc_description = "\n".join(val)
            elif key == 'type' and val:
                record.dc_type = "\n".join(val)
            elif key == 'author' and val:
                record.dc_author = "\n".join(val)
            elif key == 'language' and val:
                # Build the list locally and assign it once, so the record
                # never exposes a half-filled attribute.
                languages = []
                for lang in val:
                    try:
                        languages.append(
                            pycountry.languages.get(
                                alpha2=str(lang).lower()
                            ).alpha2
                        )
                    except (KeyError, UnicodeError):
                        # Unknown code, or a value str() cannot encode:
                        # skip it instead of aborting the whole harvest.
                        continue
                record.dc_language = languages
            else:
                # Any other key (or a known key with an empty value) is
                # stored verbatim.
                setattr(record, 'dc_' + key, val)

        record.header = header
        record.metadata = meta
        record.reindex_object()

        if self.resume_token != token:
            self.resume_token = token
            try:
                transaction.commit()
            except Exception:
                # Committing here is best-effort; keep harvesting but leave
                # a trace instead of hiding the failure completely.
                logging.warning('Transaction commit failed while harvesting',
                                exc_info=True)

    # Deleting unaffected OAIRecords.
    # Collect the ids first so objects are not removed (and uncatalogued)
    # while the lazy catalog result is still being iterated.
    stale_ids = [
        brain.id
        for brain in self.getCatalog().searchResults(
            meta_type=OAIRecord.meta_type, harvester=self.id)
        if brain.id not in visited_records
    ]
    for stale_id in stale_ids:
        self.manage_delObjects(stale_id)
def update_sqlalchemy(self, records_reponse):
    """ The recommended way to store OAI Records from remote servers

    For every valid record the incoming metadata is flattened into
    ``{'record_id', 'lang', 'key', 'value'}`` rows and diffed against the
    rows already stored for that record. The resulting inserts/deletes are
    queued and executed in bulk whenever the resumption token changes, and
    once more when the generator is exhausted so that the rows queued after
    the last token change are not lost.

    An XMLSyntaxError raised while consuming the generator is logged and
    ends the update; operations still queued at that point are not run.

    Arguments:
    records_reponse -- <record> Generator; yields
                       ``((header, meta, about), token)`` tuples

    """
    session = self.aq_parent.get_session()

    record_insert_list = []
    record_map_insert_list = []
    record_full_insert_list = []
    record_map_delete_list = []
    record_full_delete_list = []

    def flush_pending():
        """Execute the queued bulk statements and empty the queues."""
        # Same order as before: records, map rows, full-text map rows.
        for table_name, rows in (
                ('records_table', record_insert_list),
                ('records_map_table', record_map_insert_list),
                ('records_map_full_table', record_full_insert_list)):
            if rows:
                session.bind.execute(
                    sqlalchemy_setup.tables[table_name].insert(), rows)
                # Cleared in place: the closure cannot rebind outer names.
                del rows[:]
        for table_name, ids in (
                ('records_map_table', record_map_delete_list),
                ('records_map_full_table', record_full_delete_list)):
            if ids:
                table = sqlalchemy_setup.tables[table_name]
                session.bind.execute(
                    table.delete().where(table.columns.id.in_(ids)))
                del ids[:]

    try:
        for (header, meta, about), token in records_reponse:
            if not header or not meta:
                # Invalid record, passing through
                continue

            record_id = processId(header.identifier())
            record = session.query(OAIRecordMapper).filter(
                OAIRecordMapper.id == record_id).all()
            record_map = []

            if record:
                record = record[0]
                if header.isDeleted():
                    session.delete(record)
                    session.commit()
                    continue

                # Get existing records
                record_maps = session.query(OAIRecordMapMapper).filter(
                    OAIRecordMapMapper.record_id == record_id).all()
                record_fullmaps = session.query(
                    OAIRecordMapFullMapper).filter(
                    OAIRecordMapFullMapper.record_id == record_id).all()
                # Plain map rows first, then full-text rows: the indices
                # from diff.removed_index() refer to this ordering.
                record_map = [
                    {
                        'id': map_item.id,
                        'lang': map_item.lang,
                        'record_id': map_item.record_id,
                        'key': map_item.key,
                        'value': map_item.value,
                    }
                    for map_item in record_maps + record_fullmaps
                ]
            else:
                record_insert_list.append(
                    {'id': record_id, 'harvester': self.id})

            new_record_map = []
            # Get the request metadata data dict and make it flat
            for key, values in meta.getMap().iteritems():
                if ':' in key:
                    # Split on the first colon only, so a key containing
                    # more colons no longer fails the tuple unpacking.
                    key, lang = key.split(':', 1)
                else:
                    lang = None
                map_dict = {
                    'record_id': record_id,
                    'lang': lang,
                    'key': unicode('dc_' + key),
                }
                for value in values:
                    if not value:
                        continue
                    if key == 'identifier':
                        # Storing only the url
                        if url_pattern.match(value):
                            map_dict.update({'value': value})
                            new_record_map.append(dict(map_dict))
                            break
                    elif key == 'language':
                        try:
                            country_code = pycountry.languages.get(
                                alpha2=str(value).lower()
                            ).alpha2
                        except Exception:
                            # Best-effort: unknown or unparsable language
                            # values are skipped.
                            continue
                        map_dict.update({'value': country_code})
                        new_record_map.append(dict(map_dict))
                    else:
                        map_dict.update({'value': value})
                        new_record_map.append(dict(map_dict))

            diff = ListDictDiffer(new_record_map, record_map, ('id', ))

            # Insert list
            for map_dict in diff.added():
                if map_dict['key'] in ('dc_title', 'dc_description'):
                    record_full_insert_list.append(map_dict)
                else:
                    record_map_insert_list.append(map_dict)

            # Delete list - TODO: write some tests for this
            for list_index in diff.removed_index():
                map_dict = record_map[list_index]
                if map_dict['key'] in ('dc_title', 'dc_description'):
                    record_full_delete_list.append(map_dict['id'])
                else:
                    record_map_delete_list.append(map_dict['id'])

            # Resumption token changed -> Make a bulk insert/delete
            if self.resume_token != token:
                self.resume_token = token
                try:
                    transaction.commit()
                except Exception:
                    # Committing here is best-effort; keep going but leave
                    # a trace instead of hiding the failure completely.
                    logging.warning(
                        'Transaction commit failed while harvesting',
                        exc_info=True)
                flush_pending()

        # Rows queued since the last token change would otherwise never be
        # written once the generator is exhausted.
        flush_pending()
    except XMLSyntaxError as e:
        logging.error('Update failed with: %s' % str(e))