def ensure_dataset_is_in_couple_table(cls, dataset_record):
    """Make sure the dataset record has a row in the coupled-resource table.

    When no HarvestCoupledResource row references the dataset record, a new
    row is built from the record's single current harvest object and
    committed. Every outcome is logged and noted in
    ``additional_couple_stats``.

    :param dataset_record: object providing ``id``, ``name`` and
        ``harvest_objects`` (presumably a CKAN Package - confirm with the
        caller).
    """
    from ckan import model
    from ckanext.harvest.model import HarvestCoupledResource

    existing_rows = model.Session.query(HarvestCoupledResource).filter_by(
        dataset_record_package_id=dataset_record.id)
    if existing_rows.count() != 0:
        additional_couple_stats.add('Already in couple table',
                                    dataset_record.name)
        log.info('Already in couple table: %s', dataset_record.name)
        return

    # Only harvest objects flagged as current are candidates.
    current_objects = []
    for candidate in dataset_record.harvest_objects:
        if candidate.current:
            current_objects.append(candidate)

    num_current = len(current_objects)
    if num_current != 1:
        # Without exactly one current object there is no unambiguous
        # harvest_source_reference to store, so give up on this record.
        log.warning('Wrong num of current harvest_objects (%i)', num_current)
        additional_couple_stats.add(
            'Wrong num of harvest_objects (%i)' % num_current,
            dataset_record.name)
        return

    new_row = HarvestCoupledResource(
        dataset_record_package_id=dataset_record.id,
        harvest_source_reference=current_objects[0].harvest_source_reference)
    model.Session.add(new_row)
    model.Session.commit()
    additional_couple_stats.add('Added to couple table', dataset_record.name)
    log.info('Added to couple table: %s', dataset_record.name)
def _create_coupled_resource(self, service_name, ref, dataset_name):
    """Add a HarvestCoupledResource row to the session (without committing).

    :param service_name: name of the service package, or a false value to
        leave the service side of the couple empty.
    :param ref: reference suffix; ``ref_prefix`` is prepended to form the
        stored harvest_source_reference.
    :param dataset_name: name of the dataset package, or a false value to
        leave the dataset side of the couple empty.
    :raises AssertionError: if a name was supplied but no package with that
        name was found.
    """
    service_pkg = model.Package.by_name(unicode(service_name or ''))
    dataset_pkg = model.Package.by_name(unicode(dataset_name or ''))

    # A supplied name must resolve to an existing package.
    assert not service_name or service_pkg
    assert not dataset_name or dataset_pkg

    coupled_resource = HarvestCoupledResource(
        service_record=service_pkg,
        harvest_source_reference=ref_prefix + ref,
        dataset_record=dataset_pkg)
    model.Session.add(coupled_resource)
def add_coupling(cls, service_record, dataset_record, dataset_harvest_object,
                 harvest_source_reference):
    """Record a (service, dataset, reference) couple if it is not stored yet.

    The dataset's harvest object is first brought in line with
    ``harvest_source_reference`` (and committed) when it differs. A
    HarvestCoupledResource row is then added and committed unless a row with
    the same service id, dataset id and reference already exists.

    :param service_record: object whose ``id`` identifies the service package.
    :param dataset_record: object whose ``id`` identifies the dataset package.
    :param dataset_harvest_object: harvest object whose
        ``harvest_source_reference`` attribute is updated when out of date.
    :param harvest_source_reference: the reference linking the two records.
    """
    from ckan import model
    from ckanext.harvest.model import HarvestCoupledResource

    session = model.Session

    stored_ref = dataset_harvest_object.harvest_source_reference
    if stored_ref != harvest_source_reference:
        dataset_harvest_object.harvest_source_reference = \
            harvest_source_reference
        session.commit()

    existing_couples = session.query(HarvestCoupledResource)
    existing_couples = existing_couples.filter_by(
        service_record_package_id=service_record.id)
    existing_couples = existing_couples.filter_by(
        dataset_record_package_id=dataset_record.id)
    existing_couples = existing_couples.filter_by(
        harvest_source_reference=harvest_source_reference)
    if existing_couples.count() != 0:
        # The couple is already in the table - nothing to add.
        return

    new_couple = HarvestCoupledResource(
        service_record_package_id=service_record.id,
        dataset_record_package_id=dataset_record.id,
        harvest_source_reference=harvest_source_reference)
    session.add(new_couple)
    session.commit()
def update_coupled_resources(package, harvest_source_reference):
    """Update the harvest_coupled_resource table with the details of this
    harvested package's couplings.

    A package whose ``resource-type`` extra is ``'service'`` is handled as a
    service record; any other value is handled as a dataset record.

    :param package: the Package object containing extra fields with couples
        to update in the table.
    :param harvest_source_reference: the ref of this package being
        harvested. This is not relevant if it is a service record, but
        essential if it is a dataset.
    :raises KeyError: if the ``resource-type`` extra (or, for a service
        record, the ``coupled-resource`` extra) is missing.
    :raises AssertionError: if a dataset record is given without a ref.
    """
    if package.extras['resource-type'] == 'service':
        _update_couples_for_service_record(package)
    else:
        _update_couples_for_dataset_record(package, harvest_source_reference)


def _update_couples_for_service_record(package):
    """Sync the table rows for a service record with its listed couples.

    Ensures the couples listed in the package's ``coupled-resource`` extra
    match the HarvestCoupledResource rows for this service, ignoring the
    rows' dataset values (they might be filled in or not).
    """
    pkg_couples = json.loads(package.extras['coupled-resource'])
    log.info('Service Record %s has %i coupled resources to update',
             package.name, len(pkg_couples))

    table_couples_matching_service = \
        HarvestCoupledResource.get_by_service_record(package)
    # Snapshot of the rows currently linked to this service. Rows are
    # crossed off as the package's couples are matched; whatever is left at
    # the end is no longer listed by the service.
    table_couples_not_matching_pkg = table_couples_matching_service.all()

    for pkg_couple in pkg_couples:
        try:
            ref = extract_harvest_source_reference_from_coupled_resource(
                pkg_couple)
        except CoupledResourceParseError as e:
            log.warning('Error parsing couple: %s Ignoring couple=%s',
                        e, pkg_couple)
            continue

        # Match both service and ref
        matching_table_couples = table_couples_matching_service.filter_by(
            harvest_source_reference=ref).all()
        if matching_table_couples:
            # Test: test_02_reharvest_existing_service
            # Note down the matches so we don't delete them later
            for matching_table_couple in matching_table_couples:
                log.info('Service couple is already there (%s, %s, %s)',
                         package.name, ref,
                         _package_name(matching_table_couple.dataset_record))
                # The row may already be crossed off (the package lists the
                # same ref twice) or may have been created earlier in this
                # run, so an unconditional remove() would raise ValueError.
                if matching_table_couple in table_couples_not_matching_pkg:
                    table_couples_not_matching_pkg.remove(
                        matching_table_couple)
            continue

        # Match just ref with blank service
        matching_table_couples = \
            HarvestCoupledResource.get_by_harvest_source_reference(ref) \
            .filter_by(service_record=None).all()
        if not matching_table_couples:
            # Test: test_06_harvest_service_not_matching_a_dataset
            # create the row
            obj = HarvestCoupledResource(service_record=package,
                                         harvest_source_reference=ref)
            model.Session.add(obj)
            log.info('Ref is new for this service - adding (%s, %s, None)',
                     package.name, ref)
            model.Session.commit()
        else:
            # Test: test_04_harvest_service_to_match_existing_dataset
            for matching_table_couple in matching_table_couples:
                # fill in the service value
                matching_table_couple.service_record = package
                log.info(
                    'Service filled into couple matching ref (%s, %s, %s)',
                    package.name, ref,
                    _package_name(matching_table_couple.dataset_record))
                model.Session.commit()

    # Delete service value for any table_couples not matching the package
    # Test: test_08_reharvest_existing_service_to_delete_and_add_couples
    for table_couple in table_couples_not_matching_pkg:
        # Log the row's own ref. The loop variable from the matching pass
        # above is unbound when the package lists no parseable couples and
        # otherwise names an unrelated couple.
        log.info(
            'Service couple not matched - deleted service (%s->None, %s, %s)',
            _package_name(table_couple.service_record),
            table_couple.harvest_source_reference,
            _package_name(table_couple.dataset_record))
        table_couple.service_record = None
        model.Session.commit()


def _update_couples_for_dataset_record(package, harvest_source_reference):
    """Sync the table rows for a dataset record harvested under a given ref.

    Rows linked to this dataset under a different ref lose their dataset
    link; rows carrying this ref are linked to this dataset.
    """
    ref = harvest_source_reference
    assert ref

    # Couples still pointing at this dataset under an old ref
    stale_couples = model.Session.query(HarvestCoupledResource) \
        .filter_by(dataset_record=package) \
        .filter(HarvestCoupledResource.harvest_source_reference != ref)
    for couple in stale_couples:
        log.info('Ref %s has been replaced for this dataset record with '
                 '%s. Removing link to the dataset record (%s, %s, %s->None)',
                 couple.harvest_source_reference, ref,
                 _package_name(couple.service_record),
                 couple.harvest_source_reference,
                 _package_name(couple.dataset_record))
        couple.dataset_record = None
        model.Session.commit()

    # Couples with this ref
    # NOTE(review): when no row carries this ref, nothing is added to the
    # table here - confirm that is intended.
    for couple in HarvestCoupledResource.get_by_harvest_source_reference(ref):
        if couple.dataset_record != package:
            # Test: test_03_harvest_dataset_to_match_existing_service
            log.info('Linking ref to this dataset record (%s, %s, %s->%s)',
                     _package_name(couple.service_record), ref,
                     _package_name(couple.dataset_record), package.name)
            couple.dataset_record = package
            model.Session.commit()
        else:
            # Test: test_01_reharvest_existing_dataset
            log.info(
                'Couple for this dataset and ref already exists (%s, %s, %s)',
                _package_name(couple.service_record), ref,
                _package_name(couple.dataset_record))
def update_coupled_resources(package, harvest_source_reference): '''Update the harvest_coupled_resource_table with the details of this harvested package\'s couplings. :param package: the Package object containing extra fields with couples to update in the table. :param harvest_source_reference: the ref of this package being harvested. This is not relevant if it is a service record, but essential if it is a dataset. ''' resource_type = package.extras['resource-type'] if resource_type == 'service': # When a service record is harvested, ensure the couples listed # in it match the couples in the HarvestCoupledResource objects, # ignoring their dataset values (they might be filled in or not). pkg_couples_str = package.extras['coupled-resource'] pkg_couples = json.loads(pkg_couples_str) log.info('Service Record %s has %i coupled resources to update', package.name, len(pkg_couples)) table_couples_matching_service = HarvestCoupledResource.get_by_service_record( package) table_couples_not_matching_pkg = table_couples_matching_service.all( ) # cross them off as we go for pkg_couple in pkg_couples: try: ref = extract_harvest_source_reference_from_coupled_resource( pkg_couple) except CoupledResourceParseError, e: log.warn('Error parsing couple: %s Ignoring couple=%s', e, pkg_couple) continue # Match both service and ref matching_table_couples = table_couples_matching_service.filter_by( harvest_source_reference=ref) if matching_table_couples.count() > 0: # Test: test_02_reharvest_existing_service # Note down the matches so we don't delete them later for matching_table_couple in matching_table_couples: log.info( 'Service couple is already there (%s, %s, %s)', package.name, ref, _package_name(matching_table_couple.dataset_record)) table_couples_not_matching_pkg.remove( matching_table_couple) continue # Match just ref with blank service matching_table_couples = HarvestCoupledResource.get_by_harvest_source_reference(ref)\ .filter_by(service_record=None) if 
matching_table_couples.count() == 0: # Test: test_06_harvest_service_not_matching_a_dataset # create the row obj = HarvestCoupledResource(service_record=package, harvest_source_reference=ref) model.Session.add(obj) log.info('Ref is new for this service - adding (%s, %s, None)', package.name, ref) model.Session.commit() else: # Test: test_04_harvest_service_to_match_existing_dataset for matching_table_couple in matching_table_couples: # fill in the service value matching_table_couple.service_record = package log.info( 'Service filled into couple matching ref (%s, %s, %s)', package.name, ref, _package_name(matching_table_couple.dataset_record)) model.Session.commit() # Delete service value for any table_couples not matching the package # Test: test_08_reharvest_existing_service_to_delete_and_add_couples for table_couple in table_couples_not_matching_pkg: log.info( 'Service couple not matched - deleted service (%s->None, %s, %s)', _package_name(table_couple.service_record), ref, _package_name(table_couple.dataset_record)) table_couple.service_record = None model.Session.commit() return
assert ref for couple in model.Session.query(HarvestCoupledResource) \ .filter_by(dataset_record=package) \ .filter(HarvestCoupledResource.harvest_source_reference!=ref): log.info( 'Ref %s has been replaced for this dataset record with ' '%s. Removing link to the dataset record (%s, %s, %s->None)', couple.harvest_source_reference, ref, _package_name(couple.service_record), couple.harvest_source_reference, _package_name(couple.dataset_record)) couple.dataset_record = None model.Session.commit() # Couples with this ref for couple in HarvestCoupledResource.get_by_harvest_source_reference( ref): if couple.dataset_record != package: # Test: test_03_harvest_dataset_to_match_existing_service log.info('Linking ref to this dataset record (%s, %s, %s->%s)', _package_name(couple.service_record), ref, _package_name(couple.dataset_record), package.name) couple.dataset_record = package model.Session.commit() else: # Test: test_01_reharvest_existing_dataset log.info( 'Couple for this dataset and ref already exists (%s, %s, %s)', _package_name(couple.service_record), ref, _package_name(couple.dataset_record)) # No couples for this ref