def augment_luckygoogle(source, propertyinfo, augmented, failed):
    '''
    Augment each record in source with a "lucky google" link built from the
    composite fields named in propertyinfo, recording failures in failed.
    '''
    #It is possible for us to get passed in a data profile which includes a
    #property of type luckygoogle which is not meant to be augmented.
    #In that case there will be no composite param
    if not u"composite" in propertyinfo:
        return
    composite = propertyinfo[u"composite"]
    pname = propertyinfo.get(u"property", u'luckygoogle')
    for obj in source:
        try:
            objid = obj[u'id']
            #Excel will sometimes give us dates as integers, which reflects in
            #the data set coming back. Hence the extra unicode conv.
            #FIXME: should fix in freemix.json endpoint and remove from here
            item = u', '.join([unicode(obj[k]) for k in composite
                               if unicode(obj.get(k, u'')).strip()])
            link = luckygoogle(item)
            if link:
                val = augmented.setdefault(objid,
                                           {u'id': objid, u'label': obj[u'label']})
                val[pname] = link
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            if logger:
                logger.info('Exception in augment_luckygoogle: ' + repr(e))
            failureinfo = failed.setdefault(objid,
                                            {u'id': objid, u'label': obj[u'label']})
            failureinfo[pname] = repr(e)
def webfeed(body):
    #Abstracted from Akara demo/modules/atomtools.py
    import datetime  # needed for the updated_parsed conversion below
    import feedparser
    from akara import logger

    feed = feedparser.parse(body)
    logger.info('%i entries' % len(feed.entries))

    def process_entry(e):
        data = {}
        if hasattr(e, 'link'):
            data[u'id'] = e.link
            data[u'link'] = e.link
        if hasattr(e, 'summary'):
            data[u'description'] = e.summary
        if hasattr(e, 'title'):
            data[u'title'] = e.title
            data[u'label'] = e.title
        if hasattr(e, 'author_detail'):
            data[u'author_name'] = e.author_detail.name
        if hasattr(e, 'updated_parsed'):
            data[u'updated'] = datetime.datetime(
                *e.updated_parsed[:7]).isoformat().split('.')[0]
        if hasattr(e, 'tags'):
            data[u'tags'] = [t['term'] for t in e.tags]
        return data

    return [process_entry(e) for e in feed.entries] if feed.entries else None
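# Hedged usage sketch (not part of the original module): drive webfeed() with a
# feed body fetched over HTTP. The feed URL and the httplib2 cache directory
# are illustrative assumptions.
def _demo_webfeed(feed_url='http://bitworking.org/news/feed/'):
    import httplib2
    resp, body = httplib2.Http('.cache').request(feed_url, 'GET')
    for entry in webfeed(body) or []:
        print entry.get(u'label'), entry.get(u'link')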
def geocode_spatial(self, spatial):
    if not self.api_key:
        logger.warn("No API key set for Bing (use the bing_api_key "
                    "configuration key)")
        return None

    address = Address(spatial)
    for candidate in address.get_candidates():
        # See if this address candidate exists in our cache; if not, retrieve
        # results from Bing
        if candidate not in DplaBingGeocoder.resultCache:
            results = self._fetch_results(candidate)
            DplaBingGeocoder.resultCache[candidate] = list(results)

        # Require that a single match, or closely grouped matches, be returned
        # to avoid bad geocoding results
        if (1 == len(DplaBingGeocoder.resultCache[candidate])
                or self._are_closely_grouped_results(
                    DplaBingGeocoder.resultCache[candidate])):
            result = DplaBingGeocoder.resultCache[candidate][0]
            coordinate = (result["geocodePoints"][0]["coordinates"][0],
                          result["geocodePoints"][0]["coordinates"][1])
            valid_result = True

            # If we have a specified country, perform a sanity check that the
            # returned coordinate is within the country's bounding box
            if address.country and "countryRegion" in result["address"]:
                bbox_result = self._is_in_country(coordinate, address.country)

                # If we can't get a country's bbox, assume that we have a good
                # result; otherwise the bbox check decides validity
                if bbox_result is not None:
                    valid_result = bbox_result

            if valid_result:
                if "name" in spatial:
                    logger.info("geocode: Result: %s => %s (%s)"
                                % (spatial["name"], result["name"],
                                   result["point"]["coordinates"]))
                else:
                    logger.info("geocode: Result: %s => %s (%s)"
                                % (spatial, result["name"],
                                   result["point"]["coordinates"]))
                return coordinate

    return None
def augment_wrapper(source, pname, failed, func, opname):
    '''
    Apply func to each record in source, collecting any failures under pname
    in the failed mapping.
    '''
    for obj in source:
        try:
            id = obj[u'id']
            func(obj, id)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            if logger:
                logger.info('Exception in %s: ' % opname + repr(e))
            failed.setdefault(pname, []).append({
                u'id': id,
                u'label': obj[u'label'],
                'input': '(masked by exception)',
                'reason': repr(e),
            })
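# Hedged usage sketch (not from the original source): augment_wrapper() runs a
# per-record augmentation and collects failures keyed by property name. The
# records, property name, and upper-casing augmentation are illustrative.
def _demo_augment_wrapper():
    failed = {}
    records = [{u'id': u'1', u'label': u'alpha'},
               {u'id': u'2', u'label': u'beta'}]

    def upcase_label(obj, objid):
        obj[u'label_upper'] = obj[u'label'].upper()

    augment_wrapper(records, u'label_upper', failed, upcase_label,
                    'upcase_label')
    return records, failed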
def add_handler(self, method, handler):
    if method in self.method_table:
        logger.warn("Replacing %r method handler for %r" % (method, self.path))
    else:
        logger.info("Created %r method handler for %r" % (method, self.path))
    # If an outer WSGI wrapper was specified, wrap it around the handler method
    if self.wsgi_wrapper:
        handler = self.wsgi_wrapper(handler)
    self.method_table[method] = handler
def mdlenrichlocation(body, ctype, action="mdl-enrich-location",
                      prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document by combining all spatial fields into one. Will also split
    out country and state on a best-effort basis.

    For primary use with MDL documents.

    Possible avenues of improvement:
    - For fields with semicolons, permute and create multiple spatial elements
    - Create an ordered list of "names" for the geocoder to attempt to look
      up, as opposed to our single concatenated list:
      - Everything concatenated together
      - Everything concatenated together up to "United States"
      - Remove left-most elements one by one
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        sp = {}
        v = getprop(data, prop)
        fields = len(v)
        if not fields:
            logger.error("Spatial is empty.")
            return json.dumps(data)
        else:
            # Concatenate all values together to form the name field
            sp["name"] = ", ".join(v)
            logger.info("mdl-enrich-location: %s => %s" % (fields, sp["name"]))

            if 1 == fields:
                # If there is only one element present, it is a country
                sp["country"] = clean(v[0])
            elif "United States" in v:
                country_index = v.index("United States")
                sp["country"] = clean(v[country_index])

                # The prior item is almost always a state
                if country_index > 1:
                    state = clean(v[country_index - 1])
                    if is_state(state):
                        sp["state"] = state

        if sp:
            sp = [sp]
            setprop(data, prop, sp)

    return json.dumps(data)
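# Hedged example (assumed input, not from the original source): shows the shape
# of the transformation mdlenrichlocation performs. Running it requires the
# module context (json, exists/getprop/setprop, clean, is_state, logger) and
# assumes is_state() recognizes "Minnesota".
def _demo_mdlenrichlocation():
    body = json.dumps({
        "sourceResource": {
            "spatial": ["Minneapolis", "Hennepin County", "Minnesota",
                        "United States"]
        }
    })
    enriched = json.loads(mdlenrichlocation(body, "application/json"))
    # Expected, roughly:
    # enriched["sourceResource"]["spatial"] == [{
    #     "name": "Minneapolis, Hennepin County, Minnesota, United States",
    #     "country": "United States",
    #     "state": "Minnesota"}]
    return enriched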
def harvard_enrich_location(body, ctype, action="harvard_enrich_location",
                            prop="sourceResource/spatial"):
    """
    Service that accepts a Harvard JSON document and enriches the "spatial"
    field by translating any MARC country codes contained within the
    originalDocument place element into their names, for better geocoding
    accuracy.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, "originalRecord/metadata/mods/originInfo/place"):
        places = getprop(data, "originalRecord/metadata/mods/originInfo/place")
        country = ""
        countryCode = ""
        name = ""

        # Add non-country terms
        for place in iterify(places):
            logger.info("place: %s" % place)
            placeTerm = getprop(place, "placeTerm", True)
            if isinstance(placeTerm, basestring):
                name += " " + placeTerm
            elif not exists(placeTerm, "authority"):
                name += " " + getprop(placeTerm, "#text", True)

        # Add country
        for place in iterify(places):
            placeTerm = getprop(place, "placeTerm", True)
            if (exists(placeTerm, "authority")
                    and "marccountry" == getprop(placeTerm, "authority", True)):
                countryCode = getprop(placeTerm, "#text", True)
                country = get_country_from_marccode(countryCode)
                if country:
                    name += ", " + country

        spatial = {"name": re.sub(r"[\[\]]", "", name.strip(", "))}
        if (country
                and (2 == len(countryCode) or countryCode.startswith("xx"))):
            spatial["country"] = country

        setprop(data, prop, [spatial])

    return json.dumps(data)
def atom_moin(body, ctype, maxcount=None, folder=None, feed=None):
    #Sample query:
    #curl --request POST "http://localhost:8880/atom.moin?feed=http://bitworking.org/news/feed/&maxcount=10&folder=foo091023"
    #You can set ...&maxcount=100 or whatever number, if you like
    maxcount = int(maxcount if maxcount else DEFAULT_MAX)

    H = httplib2.Http('.cache')
    if USER:
        H.add_credentials(USER, PASSWD)

    #Prepare the envelope for the output (POST response)
    w = structencoder()
    output = w.cofeed(ROOT(E_CURSOR(u'updates', {u'feed': feed})))
    logger.debug('Feed: ' + feed)

    entries = atomtools.ejsonize(feed)
    for entry in islice(entries, 0, maxcount):
        try:
            logger.debug('ENTRY: ' + repr(entry))
            aid = entry[u'label']
            slug = atomtools.slug_from_title(aid)
            dest = folder + '/' + slug

            chunks = [' title:: ' + entry[u'title']]
            chunks.append(' last changed:: ' + entry[u'updated'])
            chunks.append(' link:: ' + (first_item(entry[u'link']) or ''))

            if u'summary' in entry:
                chunks.append('= Summary =\n' + entry[u'summary'])
            if u'content_src' in entry:
                chunks.append('= Content =\n' + entry[u'content_src'])
            if u'content_text' in entry:
                chunks.append('= Content =\n' + entry[u'content_text'])

            if u'categories' in entry:
                chunks.append(u'= Categories =')
                for category in entry[u'categories']:
                    chunks.append(' * ' + category)

            chunks.append(' id:: ' + entry[u'id'])
            chunks.append('= akara:metadata =\n akara:type:: http://purl.org/com/zepheira/zen/resource/webfeed\n')

            url = absolutize(dest, MOINBASE)
            headers = {'Content-Type': 'text/plain'}
            resp, content = H.request(url, "PUT",
                                      body='\n'.join(chunks).encode('utf-8'),
                                      headers=headers)
            logger.debug("Result: " + repr((resp, content)))
            output.send(E(u'update', {u'entry-id': entry[u'id'], u'page': url}))
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            logger.info('Exception handling Entry page: ' + repr(e))
            output.send(E(u'failure', {u'entry-id': entry[u'id']}))
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "save_process/status") != "complete":
        print "Error, save process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "delete_process/status": "running",
        "delete_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.process_deleted_docs(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None

    msg = "Total documents deleted: %s" % total_deleted
    print msg
    logger.info(msg)

    # Update ingestion document
    kwargs = {
        "delete_process/status": status,
        "delete_process/error": error_msg,
        "delete_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "save_process/status") != "complete":
        print "Error, save process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "delete_process/status": "running",
        "delete_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.process_deleted_docs(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None

    msg = "Total documents deleted: %s" % total_deleted
    print msg
    logger.info(msg)

    # Update ingestion document
    kwargs = {
        "delete_process/status": status,
        "delete_process/error": error_msg,
        "delete_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
def geocode_spatial(self, spatial):
    if not self.api_key:
        logger.warn("No API key set for Bing (use the bing_api_key "
                    "configuration key)")
        return None

    address = Address(spatial)
    for candidate in address.get_candidates():
        # See if this address candidate exists in our cache
        if candidate not in DplaBingGeocoder.resultCache:
            results = self._fetch_results(candidate)
            DplaBingGeocoder.resultCache[candidate] = list(results)

        # Require that a single match, or closely grouped matches, be returned
        # to avoid bad geocoding results
        if (1 == len(DplaBingGeocoder.resultCache[candidate])
                or self._are_closely_grouped_results(
                    DplaBingGeocoder.resultCache[candidate])):
            result = DplaBingGeocoder.resultCache[candidate][0]
            coordinate = (result["geocodePoints"][0]["coordinates"][0],
                          result["geocodePoints"][0]["coordinates"][1])
            valid_result = True

            # If we have a specified country, perform a sanity check that the
            # returned coordinate is within the country's bounding box
            if address.country and "countryRegion" in result["address"]:
                bbox_result = self._is_in_country(coordinate, address.country)

                # If we can't get a country's bbox, assume that we have a good
                # result
                if bbox_result is not None:
                    valid_result = bbox_result
                    if not valid_result:
                        logger.debug("Geocode result [%s] not in the correct "
                                     "country [%s], ignoring"
                                     % (result["name"], address.country))

            if valid_result:
                if "name" in spatial:
                    logger.info("Geocode result: %s => %s (%s)"
                                % (spatial["name"], result["name"],
                                   result["point"]["coordinates"]))
                else:
                    logger.info("Geocode result: %s => %s (%s)"
                                % (spatial, result["name"],
                                   result["point"]["coordinates"]))
                return coordinate

    return None
def download_image(url, id, download):
    """
    Downloads the thumbnail from the given url and stores it on disk.

    Current implementation stores the file on disk.

    Arguments:
        url      String - the url of the file for downloading
        id       String - document id, used for the file name generation
        download Bool   - True to download the image, False to only check
                          the MIME type

    Returns:
        (Name, mime, status) - if everything was OK:
            - Name of the file where the image was stored
            - MIME type for the image
            - Status ("downloaded"|"error")
    """
    name = None
    mime = None
    status = "error"

    def res(name, mime, status):
        return (name, mime, status)

    # Open connection to the image using provided URL.
    try:
        conn = urllib.urlopen(url)
    except IOError as e:
        logger.error("Cannot open url [%s] for downloading thumbnail." % url)
        return res(name, mime, status)

    if not conn.getcode() / 100 == 2:
        logger.error("Got %s from url: [%s] for document: [%s]" %
                     (conn.getcode(), url, id))
        return res(name, mime, status)

    # Get the thumbnail extension from the URL, needed for storing the
    # file on disk with proper extension.
    file_extension = ""
    mime = None
    try:
        # The content type from HTTP headers.
        mime = conn.headers['content-type']
        file_extension = find_file_extension(mime)
    except FileExtensionException as e:
        logger.error("Couldn't find file extension.")
        return res(name, mime, status)

    # If we are not downloading, we only needed to check the MIME type.
    if not download:
        return res(None, mime, None)

    # Get the directory path and file path for storing the image.
    (path, fname, relative_fname) = generate_file_path(id, file_extension)

    # Let's create the directory for storing the file name.
    if not os.path.exists(path):
        logger.info("Creating directory: " + path)
        os.makedirs(path)
    else:
        logger.debug("Path [%s] exists." % path)

    # Download the image.
    try:
        logger.info("Downloading file to: " + fname)
        local_file = open(fname, 'wb')
        local_file.write(conn.read())
    except Exception as e:
        logger.error(e.message)
        return res(name, mime, status)
    else:
        conn.close()
        local_file.close()
        logger.debug("Downloaded file from [%s] to [%s]." % (url, fname))
        status = "downloaded"
        name = relative_fname

    return res(name, mime, status)
def download_image(url, id, download):
    """
    Downloads the thumbnail from the given url and stores it on disk.

    Current implementation stores the file on disk.

    Arguments:
        url      String - the url of the file for downloading
        id       String - document id, used for the file name generation
        download Bool   - True to download the image, False to only check
                          the MIME type

    Returns:
        (Name, mime, status) - if everything was OK:
            - Name of the file where the image was stored
            - MIME type for the image
            - Status ("downloaded"|"error")
    """
    name = None
    mime = None
    status = "error"

    def res(name, mime, status):
        return (name, mime, status)

    # Open connection to the image using provided URL.
    try:
        conn = urllib.urlopen(url)
    except IOError as e:
        logger.error("Cannot open url [%s] for downloading thumbnail." % url)
        return res(name, mime, status)

    if not conn.getcode() / 100 == 2:
        msg = "Got %s from url: [%s] for document: [%s]" % \
              (conn.getcode(), url, id)
        logger.error(msg)
        return res(name, mime, status)

    # Get the thumbnail extension from the URL, needed for storing the
    # file on disk with proper extension.
    file_extension = ""
    mime = None
    try:
        # The content type from HTTP headers.
        mime = conn.headers['content-type']
        file_extension = find_file_extension(mime)
    except FileExtensionException as e:
        logger.error("Couldn't find file extension.")
        return res(name, mime, status)

    # If we are not downloading, we only needed to check the MIME type.
    if not download:
        return res(None, mime, None)

    # Get the directory path and file path for storing the image.
    (path, fname, relative_fname) = generate_file_path(id, file_extension)

    # Let's create the directory for storing the file name.
    if not os.path.exists(path):
        logger.info("Creating directory: " + path)
        os.makedirs(path)
    else:
        logger.debug("Path [%s] exists." % path)

    # Download the image.
    try:
        logger.info("Downloading file to: " + fname)
        local_file = open(fname, 'wb')
        local_file.write(conn.read())
    except Exception as e:
        msg = e.message
        logger.error(msg)
        return res(name, mime, status)
    else:
        conn.close()
        local_file.close()
        logger.debug("Downloaded file from [%s] to [%s]." % (url, fname))
        status = "downloaded"
        name = relative_fname

    return res(name, mime, status)
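# Hedged usage sketch (not from the original source): the URL and document id
# below are placeholders, and the call assumes the module-level helpers
# (generate_file_path, find_file_extension, logger) are configured.
def _demo_download_image():
    # Probe the MIME type first, then download only if it looks like an image.
    _, mime, _ = download_image("http://example.org/thumb.jpg", "doc-0001",
                                False)
    if mime and mime.startswith("image/"):
        return download_image("http://example.org/thumb.jpg", "doc-0001", True)
    return None, mime, "error"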
    except Exception, error:
        raise Exception("Unable to write to PID file %r: %s" %
                        (pid_file, error))
    finally:
        f.close()

def remove_pid(pid_file):
    "Remove the given filename (which should be the PID file)"
    try:
        os.remove(pid_file)
    except Exception, error:
        if not os.path.exists(pid_file):
            logger.error("Unable to remove PID file %r: %s", pid_file, error)
    else:
        logger.info("Removed PID file %r", pid_file)

# There are two ways to run the Akara server: either in debug mode (running in
# the foreground, with the -X option) or in daemon mode (running in the
# background), which is the default. The latter is trickier to support.
# In that case the command-line program spawns off a new process, which is the
# master HTTP node ("the flup server"). It manages the subprocesses which
# actually handle the HTTP requests. The flup server starts up and either
# manages to set things up or fails because of some problem. The command-line
# program needs to exit with an error code if there was a problem, so there
# must be some sort of communications between the two.
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print "Cannot enrich, fetch process did not complete"
        return -1

    # Update ingestion document
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    kwargs = {
        "enrich_process/status": "running",
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Set the headers sent with the enrich request
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())
    headers = {
        "Source": ingestion_doc["provider"],
        "Collection": "",
        "Content-Type": "application/json",
        "Pipeline-Rec": ",".join(profile["enrichments_rec"]),
        "Pipeline-Coll": ",".join(profile["enrichments_coll"])
    }

    error_msg = None
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    total_enriched_records = 0
    for filename in os.listdir(fetch_dir):
        filepath = os.path.join(fetch_dir, filename)
        with open(filepath, "r") as f:
            try:
                data = json.loads(f.read())
            except:
                error_msg = "Error loading " + filepath
                break

        # Enrich
        print "Enriching file " + filepath
        enrich_path = ingestion_doc["uri_base"] + "/enrich"
        resp, content = H.request(enrich_path, "POST", body=json.dumps(data),
                                  headers=headers)
        if not resp["status"].startswith("2"):
            error_msg = "Error (status %s) enriching data from %s" % \
                        (resp["status"], filepath)
            print "Stopped enrichment process: " + error_msg
            break

        data = json.loads(content)
        enriched_records = data["enriched_records"]
        total_enriched_records += data["enriched_records_count"]

        # Write enriched data to file
        with open(os.path.join(enrich_dir, filename), "w") as f:
            f.write(json.dumps(enriched_records))

    logger.info("Total records enriched: %s" % total_enriched_records)

    # Update ingestion document
    if error_msg is not None:
        status = "error"
    else:
        status = "complete"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/error": error_msg,
        "enrich_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress fetch directory, then delete
    make_tarfile(fetch_dir)
    shutil.rmtree(fetch_dir)

    return 0 if status == "complete" else -1
def map_date_publisher_and_spatial(self):
    """
    Examine the many possible originInfo elements and pick out date,
    spatial, and publisher information.

    Dates may come in multiple originInfo elements, in which case we take
    the last one.
    """
    ret_dict = {"date": [], "spatial": [], "publisher": []}
    date_fields = ("dateIssued", "dateCreated", "dateCaptured", "dateValid",
                   "dateModified", "copyrightDate", "dateOther")
    date_origin_info = []

    def datestring(date_data):
        """
        Given a "date field" element from inside an originInfo, return a
        string representation of the date or dates represented.
        """
        if type(date_data) == dict:
            # E.g. single dateCaptured without any attributes; just take it
            return date_data.get("#text")
        elif type(date_data) == unicode:
            return date_data

        keyDate, startDate, endDate = None, None, None
        for _dict in date_data:
            if _dict.get("keyDate") == "yes":
                keyDate = _dict.get("#text")
            if _dict.get("point") == "start":
                startDate = _dict.get("#text")
            if _dict.get("point") == "end":
                endDate = _dict.get("#text")

        if startDate and endDate:
            return "%s - %s" % (startDate, endDate)
        elif keyDate:
            return keyDate
        else:
            return None

    origin_infos = filter(None, iterify(getprop(self.provider_data,
                                                "originInfo", True)))
    for origin_info in origin_infos:
        # Put aside date-related originInfo elements for later ...
        for field in date_fields:
            if field in origin_info:
                date_origin_info.append(origin_info)
                break

        # Map publisher
        if ("publisher" in origin_info
                and origin_info["publisher"] not in ret_dict["publisher"]):
            ret_dict["publisher"].append(self.txt(origin_info["publisher"]))

        # Map spatial
        if exists(origin_info, "place/placeTerm"):
            for place_term in iterify(getprop(origin_info,
                                              "place/placeTerm")):
                if isinstance(place_term, basestring):
                    pass
                elif isinstance(place_term, dict):
                    place_term = place_term.get("#text")
                if place_term and place_term not in ret_dict["spatial"]:
                    ret_dict["spatial"].append(place_term)

    # Map dates. Only use the last date-related originInfo element
    try:
        last_date_origin_info = date_origin_info[-1]
        for field in date_fields:
            if field in last_date_origin_info:
                s = datestring(last_date_origin_info[field])
                if s and s not in ret_dict["date"]:
                    ret_dict["date"].append(s)
    except Exception as e:
        logger.info("Can not get date from %s" % self.provider_data["_id"])

    for k in ret_dict.keys():
        if not ret_dict[k]:
            del ret_dict[k]

    self.update_source_resource(ret_dict)
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"])

    print "Fetching records for " + fetcher.provider
    total_fetched_records = 0
    for response in fetcher.fetch_all_data():
        if response["error"]:
            error_msg.extend(iterify(response["error"]))
            print response["error"]
        else:
            # Write records to file
            filename = os.path.join(fetch_dir, str(uuid.uuid4()))
            with open(filename, "w") as f:
                f.write(json.dumps(response["data"]))
            print "Records written to " + filename
            total_fetched_records += len(getprop(response, "data/records"))

    logger.info("Total records fetched: %s" % total_fetched_records)

    # Update ingestion document
    try:
        # os.rmdir succeeds only if fetch_dir is empty, i.e. nothing was
        # fetched
        os.rmdir(fetch_dir)
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"

    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
def map_date_publisher(self): """ Examine the many possible originInfo elements and pick out date, spatial, and publisher information. Dates may come in multiple originInfo elements, in which case we take the last one. """ ret_dict = {"date": [], "spatial": [], "publisher": []} date_fields = ("dateIssued", "dateCreated", "dateCaptured", "dateValid", "dateModified", "copyrightDate", "dateOther") date_origin_info = [] def datestring(date_data): """ Given a "date field" element from inside an originInfo, return a string representation of the date or dates represented. """ if type(date_data) == dict: # E.g. single dateCaptured without any attributes; just take # it return self.txt(date_data) elif type(date_data) == unicode: return date_data keyDate, startDate, endDate = None, None, None for _dict in date_data: if _dict.get("keyDate") == "yes": keyDate = self.txt(_dict) if _dict.get("point") == "start": startDate = self.txt(_dict) if _dict.get("point") == "end": endDate = self.txt(_dict) if startDate and endDate: return "%s - %s" % (startDate, endDate) elif keyDate: return keyDate else: return None origin_infos = filter(None, iterify(getprop(self.provider_data, "originInfo", True))) for origin_info in origin_infos: # Put aside date-related originInfo elements for later ... for field in date_fields: if field in origin_info: date_origin_info.append(origin_info) break # Map publisher if ("publisher" in origin_info and origin_info["publisher"] not in ret_dict["publisher"]): ret_dict["publisher"].append( self.txt(origin_info["publisher"])) # Map dates. Only use the last date-related originInfo element try: last_date_origin_info = date_origin_info[-1] for field in date_fields: if field in last_date_origin_info: s = datestring(last_date_origin_info[field]) if s and s not in ret_dict["date"]: ret_dict["date"].append(s) except Exception as e: logger.info("Can not get date from %s" % self.provider_data["_id"]) for k in ret_dict.keys(): if not ret_dict[k]: del ret_dict[k] self.update_source_resource(ret_dict)