Example #1
def augment_luckygoogle(source, propertyinfo, augmented, failed):
    '''
    Add a "lucky Google" link property to each item in source, recording
    successful lookups in augmented and errors in failed.
    '''
    #It is possible for us to get passed a data profile which includes a property of
    #type luckygoogle which is not meant to be augmented.
    #In that case there will be no composite param
    if u"composite" not in propertyinfo:
        return
    composite = propertyinfo[u"composite"]
    pname = propertyinfo.get(u"property", u'luckygoogle')
    for obj in source:
        try:
            objid = obj[u'id']
            #Excel will sometimes give us dates as integers, which reflects in the
            #data set coming back. Hence the extra unicode conv.
            #FIXME: should fix in freemix.json endpoint and remove from here
            item = u', '.join([ unicode(obj[k]) for k in composite if unicode(obj.get(k, u'')).strip() ])
            link = luckygoogle(item)
            if link:
                val = augmented.setdefault(objid, {u'id': objid, u'label': obj[u'label']})
                val[pname] = link
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            if logger: logger.info('Exception in augment_luckygoogle: ' + repr(e))
            failureinfo = failed.setdefault(objid, {u'id': objid, u'label': obj[u'label']})
            failureinfo[pname] = repr(e)
Example #2
File: feeds.py Project: dpla/zen
def webfeed(body):
    import datetime
    import feedparser
    from akara import logger
    #Abstracted from Akara demo/modules/atomtools.py
    feed = feedparser.parse(body)
    logger.info('%i entries' % len(feed.entries))
    
    def process_entry(e):
        #from pprint import pformat; from akara import logger; logger.info('webfeed entry: ' + repr(pformat(dict(e)))); logger.info('webfeed entry: ' + repr(pformat(e.__dict__)))
        data = {}
        if hasattr(e, 'link'):
            data[u'id'] = e.link
            data[u'link'] = e.link
        if hasattr(e, 'summary'):
            data[u'description'] = e.summary
        if hasattr(e, 'title'):
            data[u'title'] = e.title
            data[u'label'] = e.title
        if hasattr(e, 'author_detail'):
            data[u'author_name'] = e.author_detail.name
        if hasattr(e, 'updated_parsed'):
            # updated_parsed is a time.struct_time; only its first six fields map to datetime args
            data[u'updated'] = datetime.datetime(*e.updated_parsed[:6]).isoformat()
        if hasattr(e, 'tags'):
            data[u'tags'] = [ t['term'] for t in e.tags ]
        return data

    return [ process_entry(e) for e in feed.entries ] if feed.entries else None
Example #3
    def geocode_spatial(self, spatial):
        if (not self.api_key):
            logger.warn(
                "No API key set for Bing (use the bing_api_key configuration key)")
            return None

        address = Address(spatial)
        for candidate in address.get_candidates():
            # See if this address candidate exists in our cache
            if (candidate not in DplaBingGeocoder.resultCache):
                # logger.debug("geocode: No result for [%s] in cache, retrieving from Bing" % candidate)
                results = self._fetch_results(candidate)
                DplaBingGeocoder.resultCache[candidate] = list(results)
                # logger.info("geocode: Result:")
                # logger.info("geocode: spatial: %s" % spatial)
                # logger.info("geocode: address: %s" % candidate)
                # logger.info("geocode: count: %s" % len(DplaBingGeocoder.resultCache[candidate]))
                # logger.info("geocode: result: %s" % DplaBingGeocoder.resultCache[candidate])

            # Require that a single match, or closely grouped matches, be returned to avoid bad geocoding results
            if (1 == len(DplaBingGeocoder.resultCache[candidate]) \
                or self._are_closely_grouped_results(DplaBingGeocoder.resultCache[candidate])):
                result = DplaBingGeocoder.resultCache[candidate][0]
                coordinate = (result["geocodePoints"][0]["coordinates"][0],
                              result["geocodePoints"][0]["coordinates"][1])
                valid_result = True

                # If we have a specified country, perform a sanity check that the returned coordinate is within
                # the country's bounding box
                if (address.country and \
                    "countryRegion" in result["address"]):
                    bbox_result = self._is_in_country(coordinate,
                                                      address.country)

                    # If we can't get a country's bbox, assume that we have a good result
                    if (bbox_result is not None):
                        valid_result = bbox_result
                        if (not valid_result):
                            # logger.debug("geocode: Result [%s] not in the correct country [%s], ignoring" % (result["name"], address.country,))
                            pass

                if (valid_result):
                    if ("name" in spatial):
                        logger.info("geocode: Result: %s => %s (%s)" % (
                            spatial["name"],
                            result["name"],
                            result["point"]["coordinates"],
                        ))
                    else:
                        logger.info("geocode: Result: %s => %s (%s)" % (
                            spatial,
                            result["name"],
                            result["point"]["coordinates"],
                        ))
                    return coordinate

        return None
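The country sanity check above delegates to _is_in_country, which is not shown here. A minimal sketch of the kind of bounding-box containment test it implies, with an invented, approximate bounding box rather than the project's actual data source:

def point_in_bbox(coordinate, bbox):
    # coordinate is (lat, lng); bbox is (south, west, north, east).
    # Ignores Alaska/Hawaii and antimeridian wrap; illustration only.
    lat, lng = coordinate
    south, west, north, east = bbox
    return south <= lat <= north and west <= lng <= east

US_BBOX = (24.5, -125.0, 49.5, -66.9)        # rough continental-US box
print point_in_bbox((38.9, -77.0), US_BBOX)  # Washington, DC -> True
print point_in_bbox((51.5, -0.1), US_BBOX)   # London -> False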
Example #4
def augment_wrapper(source, pname, failed, func, opname):
    for obj in source:
        #Initialize objid so the error handler has a sane value even if the id lookup fails
        objid = None
        try:
            objid = obj[u'id']
            func(obj, objid)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            if logger: logger.info('Exception in %s: ' % opname + repr(e))
            failed.setdefault(pname, []).append({u'id': objid, u'label': obj[u'label'],
                                                 'input': '(masked by exception)',
                                                 'reason': repr(e)})
Example #5
    def add_handler(self, method, handler):
        if method in self.method_table:
            logger.warn("Replacing %r method handler for %r" %
                        (method, self.path))
        else:
            logger.info("Created %r method handler for %r" %
                        (method, self.path))
        # If an outer WSGI wrapper was specified, wrap it around the handler method
        if self.wsgi_wrapper:
            handler = self.wsgi_wrapper(handler)

        self.method_table[method] = handler
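For context, the wsgi_wrapper used above only needs to be a callable that accepts a handler and returns a replacement handler. A minimal sketch of such a wrapper (this logging wrapper is hypothetical, not part of Akara):

def logging_wsgi_wrapper(handler):
    # Return a handler that logs each dispatch before delegating.
    def wrapped(*args, **kwargs):
        logger.info("Dispatching to %r" % handler)
        return handler(*args, **kwargs)
    return wrapped

Passing this as wsgi_wrapper would make add_handler store the wrapped callable in method_table.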
Example #7
def mdlenrichlocation(body, ctype, action="mdl-enrich-location", prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of that document by
    combining all spatial fields into one. Will also split out country and state on a
    best-effort basis.

    For primary use with MDL documents.

    Possible avenues of improvement:
      - For fields with semi-colons, permute and create multiple spatial elements 
      - Create an ordered list of "names" for the geocoder to attempt to lookup 
        as opposed to our single concatenated list:
          - Everything concatenated together 
          - Everything concatenated together up to "United States" 
          - Remove left-most elements one by one
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        sp = {}
        v = getprop(data, prop)
        fields = len(v)
        if not fields:
            logger.error("Spatial is empty.")
            return json.dumps(data)
        else:
            # Concatenate all values together to form the name field 
            sp["name"] = ", ".join(v)
            logger.info("mdl-enrich-location: %s => %s" % (fields, sp["name"],))

            if (1 == fields): 
                # If there is only one element present, it is a country 
                sp["country"] = clean(v[0])
            elif "United States" in v: 
                country_index = v.index("United States")
                sp["country"] = clean(v[country_index])

                # The prior item is almost always a state 
                if (country_index > 1):
                    state = clean(v[country_index - 1])
                    if (is_state(state)): 
                        sp["state"] = state

        if sp:
            sp = [sp]
            setprop(data, prop, sp)

    return json.dumps(data)
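A rough usage sketch for the service above; the sample record is invented, and it assumes the module's clean() and is_state() helpers behave as their names suggest:

sample = {"sourceResource": {"spatial": ["Minneapolis", "Hennepin County",
                                         "Minnesota", "United States"]}}
enriched = json.loads(mdlenrichlocation(json.dumps(sample), "application/json"))
print enriched["sourceResource"]["spatial"]
# Expected, roughly:
# [{"name": "Minneapolis, Hennepin County, Minnesota, United States",
#   "country": "United States", "state": "Minnesota"}]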
Example #8
def harvard_enrich_location(body,
                            ctype,
                            action="harvard_enrich_location",
                            prop="sourceResource/spatial"):
    """
    Service that accepts a Harvard JSON document and enriches the "spatial" field by translating 
    any MARC country codes contained within the originalDocument place element into their names, 
    for better geocoding accuracy.     
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if (exists(data, "originalRecord/metadata/mods/originInfo/place")):
        places = getprop(data, "originalRecord/metadata/mods/originInfo/place")
        country = ""
        countryCode = ""
        name = ""

        # Add non-country terms
        for place in iterify(places):
            logger.info("place: %s" % place)
            placeTerm = getprop(place, "placeTerm", True)
            if (isinstance(placeTerm, basestring)):
                name += " " + placeTerm
            elif (not exists(placeTerm, "authority")):
                name += " " + getprop(placeTerm, "#text", True)

        # Add country
        for place in iterify(places):
            placeTerm = getprop(place, "placeTerm", True)
            if (exists(placeTerm, "authority") \
                and "marccountry" == getprop(placeTerm, "authority", True)):
                countryCode = getprop(placeTerm, "#text", True)
                country = get_country_from_marccode(countryCode)
                if (country):
                    name += ", " + country

        # logger.info("geocode: harvard: Converting name to %s" % name)
        spatial = {"name": re.sub("[\[\]]", "", name.strip(", "))}
        if (country \
            and (2 == len(countryCode) \
                 or countryCode.startswith("xx"))):
            spatial["country"] = country

        setprop(data, prop, [spatial])

    return json.dumps(data)
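A hedged illustration of the originalRecord place structure the service above walks, with invented values; it assumes get_country_from_marccode maps the MARC code "xxu" to "United States":

sample_places = [
    {"placeTerm": "Cambridge, Mass."},                            # plain text term
    {"placeTerm": {"authority": "marccountry", "#text": "xxu"}},  # MARC country code
]
# With originalRecord/metadata/mods/originInfo/place set to sample_places,
# sourceResource/spatial would be expected to become, roughly:
# [{"name": "Cambridge, Mass., United States", "country": "United States"}]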
Example #9
File: atom_zen.py Project: dpla/zen
def atom_moin(body, ctype, maxcount=None, folder=None, feed=None):
    #Sample query:
    #curl --request POST "http://localhost:8880/atom.moin?feed=http://bitworking.org/news/feed/&maxcount=10&folder=foo091023"
    #You can set ...&maxcount=100 or whatever number, if you like
    maxcount = int(maxcount if maxcount else DEFAULT_MAX)

    H = httplib2.Http('.cache')
    if USER:
        H.add_credentials(USER, PASSWD)

    #Prepare the envelope for the output (POST response)
    w = structencoder()
    output = w.cofeed(ROOT(E_CURSOR(u'updates', {u'feed': feed})))
    logger.debug('Feed: ' + feed)
    
    entries = atomtools.ejsonize(feed)
    for entry in islice(entries, 0, maxcount):
        try:
            logger.debug('ENTRY: ' + repr(entry))
            aid = entry[u'label']
            slug = atomtools.slug_from_title(aid)
            #logger.debug('GRIPPO' + repr((id,)))
            dest = folder + '/' + slug
            chunks = [ ' title:: ' + entry[u'title'] ]
            chunks.append(' last changed:: ' + entry[u'updated'])
            chunks.append(' link:: ' + (first_item(entry[u'link']) or ''))

            if u'summary' in entry: chunks.append('= Summary =\n' + entry[u'summary'])
            if u'content_src' in entry: chunks.append('= Content =\n' + entry[u'content_src'])
            if u'content_text' in entry: chunks.append('= Content =\n' + entry[u'content_text'])
            #logger.debug("Result IDs: " + ids)
            if u'categories' in entry:
                chunks.append(u'= Categories =')
                for categories in entry[u'categories']:
                    chunks.append(' * ' + categories)

            chunks.append(' id:: ' + entry[u'id'])
            chunks.append('= akara:metadata =\n akara:type:: http://purl.org/com/zepheira/zen/resource/webfeed\n')

            url = absolutize(dest, MOINBASE)
            headers = {'Content-Type' : 'text/plain'}
            resp, content = H.request(url, "PUT", body='\n'.join(chunks).encode('utf-8'), headers=headers)
            logger.debug("Result: " + repr((resp, content)))
            output.send(E(u'update', {u'entry-id': entry[u'id'], u'page': url}))
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            logger.info('Exception handling Entry page: ' + repr(e))
            output.send(E(u'failure', {u'entry-id': entry[u'id']}))
Example #10
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "save_process/status") != "complete":
        print "Error, save process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "delete_process/status": "running",
        "delete_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.process_deleted_docs(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None

    msg = "Total documents deleted: %s" % total_deleted
    print msg
    logger.info(msg)

    # Update ingestion document
    kwargs = {
        "delete_process/status": status,
        "delete_process/error": error_msg,
        "delete_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
Example #11
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "save_process/status") != "complete":
        print "Error, save process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "delete_process/status": "running",
        "delete_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.process_deleted_docs(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None

    msg = "Total documents deleted: %s" % total_deleted
    print msg
    logger.info(msg)

    # Update ingestion document
    kwargs = {
        "delete_process/status": status,
        "delete_process/error": error_msg,
        "delete_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
Example #12
    def geocode_spatial(self, spatial):
        if (not self.api_key):
            logger.warn("No API key set for Bing (use bing_api_key configuration key")
            return None

        address = Address(spatial)
        for candidate in address.get_candidates():
            # See if this address candidate exists in our cache 
            if (candidate not in DplaBingGeocoder.resultCache): 
                results = self._fetch_results(candidate)
                DplaBingGeocoder.resultCache[candidate] = list(results)

            # Require that a single match, or closely grouped matches, be returned to avoid bad geocoding results
            if (1 == len(DplaBingGeocoder.resultCache[candidate]) \
                or self._are_closely_grouped_results(DplaBingGeocoder.resultCache[candidate])):
                result = DplaBingGeocoder.resultCache[candidate][0]
                coordinate = (result["geocodePoints"][0]["coordinates"][0], result["geocodePoints"][0]["coordinates"][1])
                valid_result = True
                
                # If we have a specified country, perform a sanity check that the returned coordinate is within
                # the country's bounding box
                if (address.country and \
                    "countryRegion" in result["address"]):
                    bbox_result = self._is_in_country(coordinate, address.country)

                    # If we can't get a country's bbox, assume that we have a good result
                    if (bbox_result is not None):
                        valid_result = bbox_result
                        if (not valid_result):
                            logger.debug("Geocode result [%s] not in the correct country [%s], ignoring" % (result["name"], address.country,))

                if (valid_result): 
                    if ("name" in spatial): 
                        logger.info("Geocode result: %s => %s (%s)" % (spatial["name"], result["name"], result["point"]["coordinates"],))
                    else: 
                        logger.info("Geocode result: %s => %s (%s)" % (spatial, result["name"], result["point"]["coordinates"],))
                    return coordinate

        return None
Example #13
def download_image(url, id, download):
    """
    Downloads the thumbnail from the given url and stores it on disk.

    Current implementation stores the file on disk

    Arguments:
        url      String - the url of the file for downloading
        id       String - document id, used for the file name generation
        download Bool   - True if download image
                          False if only check the mime type

    Returns:
        (Name, mime, status) - if everything was OK:

                - Name of the file where the image was stored
                - MIME type for the image
                - Status ("download"|"error")

    """
    name = None
    mime = None
    status = "error"

    def res(name, mime, status):
        return (name, mime, status)

    # Open connection to the image using provided URL.
    try:
        conn = urllib.urlopen(url)
    except IOError as e:
        logger.error("Cannot open url [%s] for downloading thumbnail." % url)
        return res(name, mime, status)

    if not conn.getcode() / 100 == 2:
        logger.error("Got %s from url: [%s] for document: [%s]" %
                     (conn.getcode(), url, id))
        return res(name, mime, status)

    # Get the thumbnail extension from the URL, needed for storing the
    # file on disk with proper extension.
    file_extension = ""
    mime = None
    try:
        # The content type from HTTP headers.
        mime = conn.headers['content-type']
        file_extension = find_file_extension(mime)
    except FileExtensionException as e:
        logger.error("Couldn't find file extension.")
        return res(name, mime, status)

    # Download not requested; we only needed to check the mime type
    if not download:
        return res(None, mime, None)

    # Get the directory path and file path for storing the image.
    (path, fname, relative_fname) = generate_file_path(id, file_extension)

    # Let's create the directory for storing the file name.
    if not os.path.exists(path):
        logger.info("Creating directory: " + path)
        os.makedirs(path)
    else:
        logger.debug("Path [%s] exists." % path)

    # Download the image.
    try:
        logger.info("Downloading file to: " + fname)
        local_file = open(fname, 'wb')
        local_file.write(conn.read())
    except Exception as e:
        logger.error(e.message)
        return res(name, mime, status)
    else:
        conn.close()
        local_file.close()

    logger.debug("Downloaded file from [%s] to [%s]." % (url, fname, ))
    status = "downloaded"
    name = relative_fname
    return res(name, mime, status)
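A hedged usage sketch for download_image; the URL and document id below are made up, and the helpers (generate_file_path, find_file_extension, logger) are assumed to come from the surrounding module:

name, mime, status = download_image(
    "http://example.org/thumbs/abc.jpg",  # hypothetical thumbnail URL
    "1234567890abcdef",                   # hypothetical document id
    True)                                 # download, don't just probe the mime type
if status == "downloaded":
    print "Stored %s thumbnail at %s" % (mime, name)
else:
    print "Thumbnail fetch failed (mime=%s)" % mime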
Example #14
def download_image(url, id, download):
    """
    Downloads the thumbnail from the given url and stores it on disk.

    Current implementation stores the file on disk

    Arguments:
        url      String - the url of the file for downloading
        id       String - document id, used for the file name generation
        download Bool   - True if download image
                          False if only check the mime type

    Returns:
        (Name, mime, status) - if everything was OK:

                - Name of the file where the image was stored
                - MIME type for the image
                - Status ("download"|"error")

    """
    name = None
    mime = None
    status = "error"

    def res(name, mime, status):
        return (name, mime, status)

    # Open connection to the image using provided URL.
    try:
        conn = urllib.urlopen(url)
    except IOError as e:
        logger.error("Cannot open url [%s] for downloading thumbnail." % url)
        return res(name, mime, status)

    if not conn.getcode() / 100 == 2:
        msg = "Got %s from url: [%s] for document: [%s]" % \
            (conn.getcode(), url, id)
        logger.error(msg)
        return res(name, mime, status)

    # Get the thumbnail extension from the URL, needed for storing the
    # file on disk with proper extension.
    file_extension = ""
    mime = None
    try:
        # The content type from HTTP headers.
        mime = conn.headers['content-type']
        file_extension = find_file_extension(mime)
    except FileExtensionException as e:
        logger.error("Couldn't find file extension.")
        return res(name, mime, status)

    # Download not requested; we only needed to check the mime type
    if not download:
        return res(None, mime, None)

    # Get the directory path and file path for storing the image.
    (path, fname, relative_fname) = generate_file_path(id, file_extension)

    # Let's create the directory for storing the file name.
    if not os.path.exists(path):
        logger.info("Creating directory: " + path)
        os.makedirs(path)
    else:
        logger.debug("Path [%s] exists." % path)

    # Download the image.
    try:
        logger.info("Downloading file to: " + fname)
        local_file = open(fname, 'wb')
        local_file.write(conn.read())
    except Exception as e:
        msg = e.message
        logger.error(msg)
        return res(name, mime, status)
    else:
        conn.close()
        local_file.close()

    logger.debug("Downloaded file from [%s] to [%s]." % (
        url,
        fname,
    ))
    status = "downloaded"
    name = relative_fname
    return res(name, mime, status)
Example #15
        except Exception, error:
            raise Exception("Unable to write to PID file %r: %s" %
                            (pid_file, error))
    finally:
        f.close()

def remove_pid(pid_file):
    "Remove the given filename (which should be the PID file)"
    try:
        os.remove(pid_file)
    except Exception, error:
        # Only complain if the file is actually still there
        if os.path.exists(pid_file):
            logger.error("Unable to remove PID file %r: %s",
                         pid_file, error)
    else:
        logger.info("Removed PID file %r", pid_file)


# There are two ways to run the Akara server, either in debug mode
# (running in the foreground, with the -X option) or in daemon mode
# (running in the background) which is the default. The latter is
# trickier to support.

# In that case the command-line program spawns off a new process,
# which is the master HTTP node ("the flup server"). It manages the
# subprocesses which actually handle the HTTP requests. The flup
# server starts up and either manages to set things up or fails
# because of some problem. The command-line program needs to exit with
# an error code if there was a problem, so there must be some sort of
# communications between the two.
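One conventional way to get that exit-status handshake is a pipe between the command-line parent and the forked server process; the sketch below is illustrative only, with hypothetical setup/serve callables, and is not Akara's actual mechanism:

import os

def run_with_startup_check(setup, serve):
    # Fork a child server; the parent learns over a pipe whether setup worked.
    read_fd, write_fd = os.pipe()
    if os.fork():
        # Parent (command-line program): block until the child reports its status.
        os.close(write_fd)
        ok = os.read(read_fd, 1) == "1"
        os.close(read_fd)
        return ok
    # Child (the would-be flup server): run setup, report back, then serve.
    os.close(read_fd)
    try:
        setup()
    except Exception:
        os.write(write_fd, "0")
        os._exit(1)
    os.write(write_fd, "1")
    os.close(write_fd)
    serve()
    os._exit(0)

The parent can then exit non-zero when ok is False, which is the error-code behaviour the comment above calls for.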
Example #16
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print "Cannot enrich, fetch process did not complete"
        return -1

    # Update ingestion document
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    kwargs = {
        "enrich_process/status": "running",
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Set the headers sent with the enrich request
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())
    headers = {
        "Source": ingestion_doc["provider"],
        "Collection": "",
        "Content-Type": "application/json",
        "Pipeline-Rec": ",".join(profile["enrichments_rec"]),
        "Pipeline-Coll": ",".join(profile["enrichments_coll"])
    }

    error_msg = None
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    
    total_enriched_records = 0
    for filename in os.listdir(fetch_dir):
        filepath = os.path.join(fetch_dir, filename)
        with open(filepath, "r") as f:
            try:
                data = json.loads(f.read())
            except:
                error_msg = "Error loading " + filepath
                break

        # Enrich
        print "Enriching file " + filepath
        enrich_path = ingestion_doc["uri_base"] + "/enrich"
        resp, content = H.request(enrich_path, "POST", body=json.dumps(data),
                                  headers=headers)
        if not resp["status"].startswith("2"):
            error_msg = "Error (status %s) enriching data from %s" % \
                        (resp["status"], filepath)
            print "Stopped enrichment process: " + error_msg
            break

        data = json.loads(content)
        enriched_records = data["enriched_records"]
        total_enriched_records += data["enriched_records_count"]

        # Write enriched data to file
        with open(os.path.join(enrich_dir, filename), "w") as f:
            f.write(json.dumps(enriched_records))

    logger.info("Total records enriched: %s" % total_enriched_records)

    # Update ingestion document
    if error_msg is not None:
        status = "error"
    else:
        status = "complete"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/error": error_msg,
        "enrich_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress fetch directory, then delete
    make_tarfile(fetch_dir)
    shutil.rmtree(fetch_dir)

    return 0 if status == "complete" else -1
Example #17
    def map_date_publisher_and_spatial(self):
        """
        Examine the many possible originInfo elements and pick out date,
        spatial, and publisher information.

        Dates may come in multiple originInfo elements, in which case we take
        the last one.
        """
        ret_dict = {"date": [], "spatial": [], "publisher": []}
        date_fields = ("dateIssued", "dateCreated", "dateCaptured",
                       "dateValid", "dateModified", "copyrightDate",
                       "dateOther")
        date_origin_info = []

        def datestring(date_data):
            """
            Given a "date field" element from inside an originInfo, return a
            string representation of the date or dates represented.
            """
            if type(date_data) == dict:
                # E.g. single dateCaptured without any attributes; just take
                # it
                return date_data.get("#text")
            elif type(date_data) == unicode:
                return date_data
            keyDate, startDate, endDate = None, None, None
            for _dict in date_data:
                if _dict.get("keyDate") == "yes":
                    keyDate = _dict.get("#text")
                if _dict.get("point") == "start":
                    startDate = _dict.get("#text")
                if _dict.get("point") == "end":
                    endDate = _dict.get("#text")
            if startDate and endDate:
                return "%s - %s" % (startDate, endDate)
            elif keyDate:
                return keyDate
            else:
                return None

        origin_infos = filter(None, iterify(getprop(self.provider_data,
                                                    "originInfo", True)))
        for origin_info in origin_infos:
            # Put aside date-related originInfo elements for later ...
            for field in date_fields:
                if field in origin_info:
                    date_origin_info.append(origin_info)
                    break
            # Map publisher
            if ("publisher" in origin_info and origin_info["publisher"] not in
                ret_dict["publisher"]):
                ret_dict["publisher"].append(self.txt(origin_info["publisher"]))
            # Map spatial
            if exists(origin_info, "place/placeTerm"):
                for place_term in iterify(getprop(origin_info,
                                                   "place/placeTerm")):
                    if isinstance(place_term, basestring):
                        pass
                    elif isinstance(place_term, dict):
                        place_term = place_term.get("#text")

                    if (place_term and place_term not in ret_dict["spatial"]):
                        ret_dict["spatial"].append(place_term)

        # Map dates. Only use the last date-related originInfo element
        try:
            last_date_origin_info = date_origin_info[-1]
            for field in date_fields:
                if field in last_date_origin_info:
                    s = datestring(last_date_origin_info[field])
                    if s and s not in ret_dict["date"]:
                        ret_dict["date"].append(s)
        except Exception as e:
            logger.info("Can not get date from %s" %
                        self.provider_data["_id"])

        for k in ret_dict.keys():
            if not ret_dict[k]:
                del ret_dict[k]

        self.update_source_resource(ret_dict)
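To make the datestring logic above concrete, here is a small illustration of the date shapes it handles; the sample values are invented:

# A date field given as a list of dicts with start/end points collapses to a range:
date_issued = [{"point": "start", "#text": "1900"},
               {"point": "end", "#text": "1910"},
               {"keyDate": "yes", "#text": "1905"}]
# datestring(date_issued) -> "1900 - 1910"

# Without a start/end pair, the keyDate value wins:
date_created = [{"keyDate": "yes", "#text": "1923"}]
# datestring(date_created) -> "1923"

# A bare unicode string is returned unchanged, and a single dict returns its "#text" value.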
Example #18
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"])

    print "Fetching records for " + fetcher.provider
    total_fetched_records = 0
    for response in fetcher.fetch_all_data():
        if response["error"]:
            error_msg.extend(iterify(response["error"]))
            print response["error"]
        else:
            # Write records to file
            filename = os.path.join(fetch_dir, str(uuid.uuid4()))
            with open(filename, "w") as f:
                f.write(json.dumps(response["data"]))
            print "Records written to " + filename
            total_fetched_records += len(getprop(response, "data/records"))

    logger.info("Total records fetched: %s" % total_fetched_records)

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)
        # rmdir only succeeds if fetch_dir is empty, i.e. no records were fetched
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
Example #19
    def map_date_publisher(self):
        """
        Examine the many possible originInfo elements and pick out date and
        publisher information.

        Dates may come in multiple originInfo elements, in which case we take
        the last one.
        """
        ret_dict = {"date": [], "spatial": [], "publisher": []}
        date_fields = ("dateIssued", "dateCreated", "dateCaptured",
                       "dateValid", "dateModified", "copyrightDate",
                       "dateOther")
        date_origin_info = []

        def datestring(date_data):
            """
            Given a "date field" element from inside an originInfo, return a
            string representation of the date or dates represented.
            """
            if type(date_data) == dict:
                # E.g. single dateCaptured without any attributes; just take
                # it
                return self.txt(date_data)
            elif type(date_data) == unicode:
                return date_data
            keyDate, startDate, endDate = None, None, None
            for _dict in date_data:
                if _dict.get("keyDate") == "yes":
                    keyDate = self.txt(_dict)
                if _dict.get("point") == "start":
                    startDate = self.txt(_dict)
                if _dict.get("point") == "end":
                    endDate = self.txt(_dict)
            if startDate and endDate:
                return "%s - %s" % (startDate, endDate)
            elif keyDate:
                return keyDate
            else:
                return None

        origin_infos = filter(None, iterify(getprop(self.provider_data,
                                                    "originInfo", True)))
        for origin_info in origin_infos:
            # Put aside date-related originInfo elements for later ...
            for field in date_fields:
                if field in origin_info:
                    date_origin_info.append(origin_info)
                    break
            # Map publisher
            if ("publisher" in origin_info and origin_info["publisher"] not in
                    ret_dict["publisher"]):
                ret_dict["publisher"].append(
                    self.txt(origin_info["publisher"]))
        # Map dates. Only use the last date-related originInfo element
        try:
            last_date_origin_info = date_origin_info[-1]
            for field in date_fields:
                if field in last_date_origin_info:
                    s = datestring(last_date_origin_info[field])
                    if s and s not in ret_dict["date"]:
                        ret_dict["date"].append(s)
        except Exception as e:
            logger.info("Can not get date from %s" %
                        self.provider_data["_id"])

        for k in ret_dict.keys():
            if not ret_dict[k]:
                del ret_dict[k]

        self.update_source_resource(ret_dict)