def clean_list_record(self, list_record):
    """
    Given a dict, prepare it for saving as a newsitem.

    Result will be a dictionary of anything from list_record that
    looks like a known field of the NewsItem model.
    Anything that looks like a known SchemaField of the item's Schema
    will be set as an item in an 'attributes' sub-dictionary.

    Unrecognized keys will be ignored (and logged at debug level).

    Note that ``list_record`` is modified in place: every key consumed
    here is popped from it, and 'lat'/'lon' may be added to it when a
    'location' value is split.

    Locations are found heuristically:

     - If there's a 'location' key, try to split the value into
       (lat, lon) points. The value is coerced with ``str()`` first so
       that a non-string value cannot make ``re.split`` raise TypeError.
     - If there's keys like 'latitude'/'lat' and
       'longitude'/'lon'/'long'/'lng', use those (via ``get_point``).
     - The resulting point and the 'location_name' (or, when that is
       empty, the text of all remaining values) are handed to
       ``self.geocode_if_needed()``, whose (point, location_name)
       result is stored under 'location' and 'location_name'.
    """
    from ebpub.db.models import NewsItem
    from ebdata.retrieval.utils import get_point

    if 'location' in list_record:
        # If there's a comma- or space-separated location in the
        # original, this gives us a way to use it by mapping it to
        # "location".
        try:
            lat, lon = re.split(r'[\s,]+', str(list_record.pop('location')))
        except ValueError:
            # Not exactly two parts (or not representable as a str):
            # best effort only, so just drop the value.
            pass
        else:
            # Explicit 'lat'/'lon' keys already present take precedence.
            list_record.setdefault('lat', lat)
            list_record.setdefault('lon', lon)

    # Now try all the field names recognized by get_point(), eg
    # lat, latitude, lon, long, lng, georss_point, etc.
    # This must happen before the model fields are popped below.
    point = get_point(list_record)

    core_fields = {}
    for field in NewsItem._meta.fields:
        if field.name in list_record:
            # TODO: coerce types? Or maybe Django's implicit conversion is OK.
            core_fields[field.name] = list_record.pop(field.name)

    # Try to ensure we have both point and location_name;
    # fall back to address extraction from *all* remaining fields.
    address_text = core_fields.get('location_name') or '\n'.join(
        [unicode(s) for s in list_record.values()])
    point, location_name = self.geocode_if_needed(point, address_text)
    core_fields['location'] = point
    core_fields['location_name'] = location_name

    # Attributes: anything left that matches one of the schema's fields.
    attributes = {}
    for sf in self.schema.schemafield_set.all():
        if sf.name in list_record:
            # TODO: coerce types? Or maybe Django's implicit conversion is OK.
            attributes[sf.name] = list_record.pop(sf.name)
    core_fields['attributes'] = attributes

    if list_record:
        # Pass the record as a logging argument so the message is only
        # formatted when debug logging is actually enabled.
        self.logger.debug("Unused stuff from list_record: %s", list_record)
    return core_fields
def clean_list_record(self, list_record):
    """
    Given a dict, prepare it for saving as a newsitem.

    Result will be a dictionary of anything from list_record that
    looks like a known field of the NewsItem model.
    Anything that looks like a known SchemaField of the item's Schema
    will be set as an item in an 'attributes' sub-dictionary.

    Unrecognized keys will be ignored (and logged at debug level).

    Note that ``list_record`` is modified in place: every key consumed
    here is popped from it, and 'lat'/'lon' may be added to it when a
    'location' value is split.

    Locations are found heuristically:

     - If there's a 'location' key, try to split the value into
       (lat, lon) points. The value is coerced with ``str()`` first so
       that a non-string value cannot make ``re.split`` raise TypeError.
     - If there's keys like 'latitude'/'lat' and
       'longitude'/'lon'/'long'/'lng', use those (via ``get_point``).
     - The resulting point and the 'location_name' (or, when that is
       empty, the text of all remaining values) are handed to
       ``self.geocode_if_needed()``, whose (point, location_name)
       result is stored under 'location' and 'location_name'.
    """
    from ebpub.db.models import NewsItem
    from ebdata.retrieval.utils import get_point

    if 'location' in list_record:
        # If there's a comma- or space-separated location in the
        # original, this gives us a way to use it by mapping it to
        # "location".
        raw_location = str(list_record.pop('location'))
        try:
            lat, lon = re.split(r'[\s,]+', raw_location)
        except ValueError:
            # Not exactly two parts: best effort only, drop the value.
            pass
        else:
            # Explicit 'lat'/'lon' keys already present take precedence.
            list_record.setdefault('lat', lat)
            list_record.setdefault('lon', lon)

    # Now try all the field names recognized by get_point(), eg
    # lat, latitude, lon, long, lng, georss_point, etc.
    # This must happen before the model fields are popped below.
    point = get_point(list_record)

    core_fields = {}
    for field in NewsItem._meta.fields:
        if field.name in list_record:
            # TODO: coerce types? Or maybe Django's implicit conversion is OK.
            core_fields[field.name] = list_record.pop(field.name)

    # Try to ensure we have both point and location_name;
    # fall back to address extraction from *all* remaining fields.
    address_text = core_fields.get('location_name') or '\n'.join(
        [unicode(s) for s in list_record.values()])
    point, location_name = self.geocode_if_needed(point, address_text)
    core_fields['location'] = point
    core_fields['location_name'] = location_name

    # Attributes: anything left that matches one of the schema's fields.
    attributes = {}
    for sf in self.schema.schemafield_set.all():
        if sf.name in list_record:
            # TODO: coerce types? Or maybe Django's implicit conversion is OK.
            attributes[sf.name] = list_record.pop(sf.name)
    core_fields['attributes'] = attributes

    if list_record:
        # Pass the record as a logging argument so the message is only
        # formatted when debug logging is actually enabled.
        self.logger.debug("Unused stuff from list_record: %s", list_record)
    return core_fields
def get_location(self, record):
    """Look up a point for ``record`` by delegating to
    ``ebdata.retrieval.utils.get_point``, which tries georss, geo,
    and some non-standard conventions.

    Returns a Point or None.

    Nothing calls this automatically. A scraper that wants it should
    do ``newsitem.location = self.get_location(record)`` at some point
    before ``self.save()``.
    """
    # Imported locally, as in the rest of this class's methods.
    from ebdata.retrieval.utils import get_point
    point = get_point(record)
    return point
def clean_list_record(self, list_record):
    """
    Given a dict, prepare it for saving as a newsitem.

    Result will be a dictionary of anything from list_record that
    looks like a known field of the NewsItem model.
    Anything that looks like a known SchemaField of the item's Schema
    will be set as an item in an 'attributes' sub-dictionary. If
    list_record already carries an 'attributes' dict, that dict is
    used as the starting point (and is updated in place).

    Schema field values are converted before being stored:

     - many-to-many lookup fields: a single string is wrapped in a
       list, a Lookup is fetched or created for each value, and the
       Lookup ids are stored as a comma-separated string;
     - single lookup fields: a Lookup is fetched or created for the
       unicode value and its id is stored;
     - anything else is stored as ``unicode(value)``.

    Unrecognized keys will be ignored (and logged at debug level).

    Note that ``list_record`` is modified in place: every key consumed
    here (including 'attributes') is popped from it, and 'lat'/'lon'
    may be added to it when a 'location' value is split.

    Locations are found heuristically:

     - If there's a 'location' key, try to split the value into
       (lat, lon) points
     - If there's keys like 'latitude'/'lat' and
       'longitude'/'lon'/'long'/'lng', use those (via ``get_point``)
     - The resulting point and the 'location_name' are handed to
       ``self.geocode_if_needed()``. When there is no 'location_name'
       and ``self.get_location_name_from_all_fields`` is true, the
       text of all remaining values is passed instead.
    """
    from ebpub.db.models import NewsItem
    from ebdata.retrieval.utils import get_point

    if 'location' in list_record:
        # If there's a comma- or space-separated location in the
        # original, this gives us a way to use it by mapping it to
        # "location".
        try:
            lat, lon = re.split(r'[\s,]+', str(list_record.pop('location')))
        except ValueError:
            # Not exactly two parts (or not representable as a str):
            # best effort only, so just drop the value.
            pass
        else:
            # Explicit 'lat'/'lon' keys already present take precedence.
            list_record.setdefault('lat', lat)
            list_record.setdefault('lon', lon)

    # Now try all the field names recognized by get_point(), eg
    # lat, latitude, lon, long, lng, georss_point, etc.
    # This must happen before the model fields are popped below.
    point = get_point(list_record)

    core_fields = {}
    for field in NewsItem._meta.fields:
        if field.name in list_record:
            # TODO: coerce types? Or maybe Django's implicit conversion is OK.
            core_fields[field.name] = list_record.pop(field.name)

    # Try to ensure we have both point and location_name;
    # optionally fall back to address extraction from *all* fields.
    address_text = core_fields.get('location_name')
    if self.get_location_name_from_all_fields and not address_text:
        address_text = '\n'.join([unicode(s) for s in list_record.values()])
    point, location_name = self.geocode_if_needed(point, address_text)
    core_fields['location'] = point
    core_fields['location_name'] = location_name

    # Attributes.
    # Pop (rather than get) so that a consumed 'attributes' entry is not
    # reported as unused below; tolerate an explicit None value.
    attributes = list_record.pop('attributes', None) or {}
    for sf in self.schema.schemafield_set.all():
        if sf.name not in list_record:
            continue
        # TODO: coerce types? Or maybe Django's implicit conversion is OK.
        value = list_record.pop(sf.name)
        if sf.is_many_to_many_lookup():
            # Passed value needs to be a list of strings.
            if isinstance(value, basestring):
                value = [value]
            lookups = [
                Lookup.objects.get_or_create_lookup(
                    sf, name=v, code=v, make_text_slug=False)
                for v in value]
            value = ','.join([str(lookup.id) for lookup in lookups])
        elif sf.is_lookup:
            # Need an int id.
            value = unicode(value)
            lookup = Lookup.objects.get_or_create_lookup(
                sf, name=value, code=value, make_text_slug=False)
            value = lookup.id
        else:
            # TODO: handle other types?
            value = unicode(value)
        attributes[sf.name] = value
    core_fields['attributes'] = attributes

    if list_record:
        # Pass the record as a logging argument so the message is only
        # formatted when debug logging is actually enabled.
        self.logger.debug("Unused stuff from list_record: %s", list_record)
    return core_fields