Beispiel #1
0
def couch_rev_check_recs(docs):
    """
    Fill in the current CouchDB revision ("_rev") of every record in docs
    that already exists in the database.

    A single GET against _all_docs is issued, restricted to the key range
    spanned by the smallest and the largest document id in docs; docs is
    updated in place. Rows whose id is not a key of docs are ignored. When the
    response status is not 2xx two warnings are logged and docs is left
    untouched.

    Performance improved version of couch_rev_check_recs_old, but it uses
    another input format:
    Input:
     {doc["_id"]: doc, ...}

    Returns None.
    """
    if not docs:
        return
    docs_ids = sorted(docs)
    # startkey/endkey have to be JSON values. json.dumps adds the surrounding
    # quotes and escapes quotes, backslashes and non-ASCII characters, which a
    # hand-written '"%s"' wrapper would pass through unescaped.
    start = quote_plus(json.dumps(docs_ids[0]))
    end = quote_plus(json.dumps(docs_ids[-1]))
    uri = join(COUCH_DATABASE, '_all_docs')
    uri += '?startkey=%s&endkey=%s' % (start, end)
    response, content = H.request(uri, 'GET', headers=COUCH_AUTH_HEADER)
    if not str(response.status).startswith('2'):
        logger.warn('Unable to retrieve document revisions via bulk interface: ' + repr(response))
        logger.warn('Request: ' + uri)
        return
    for row in json.loads(content)["rows"]:
        # The key range can contain ids that are not part of this batch.
        if row["id"] in docs:
            docs[row["id"]]["_rev"] = row["value"]["rev"]
Beispiel #2
0
def couch_rev_check_recs_old(docs, src):
    """
    Copy the stored CouchDB revision into the "_rev" field of every document
    in docs (an iterable of dicts carrying "_id") that is already in the
    database.

    Deprecated: has performance issue. The request below fetches _all_docs
    without any key range, i.e. every document in the database, on each call;
    with big collections that is slow and memory hungry.
    couch_rev_check_recs is the replacement.
    """
    all_docs_uri = join(COUCH_DATABASE, '_all_docs')
    range_start = quote(COUCH_ID_BUILDER(src, ''))
    range_end = quote(COUCH_ID_BUILDER(src, 'Z' * 100))  # FIXME. Is this correct?
    # NOTE: the ranged uri is only ever used in the failure log message; the
    # request itself goes to the plain _all_docs URI.
    uri = all_docs_uri + '?startkey=%s&endkey=%s' % (range_start, range_end)

    resp, cont = H.request(all_docs_uri, 'GET', headers=COUCH_AUTH_HEADER)
    if not str(resp.status).startswith('2'):
        logger.warn('Unable to retrieve document revisions via bulk interface: ' + repr(resp))
        logger.warn('Request old: ' + uri)
        return

    # id -> revision for everything the database returned
    revs = dict((row["id"], row["value"]["rev"])
                for row in json.loads(cont)["rows"])
    for doc in docs:
        doc_id = doc['_id']
        if doc_id in revs:
            doc['_rev'] = revs[doc_id]
Beispiel #3
0
    def geocode_spatial(self, spatial):
        '''
        Accepts a dictionary describing a place and attempts to return a
        set of coordinates in format [latitude, longitude] that match it.

        Every address candidate derived from spatial is tried in order.
        Lookups are cached in DplaBingGeocoder.resultCache. A candidate is
        accepted only when it yields exactly one result or closely grouped
        results and, if a country is known, the point is not reported as
        lying outside that country. Returns None when nothing qualifies or
        no API key is configured.
        '''
        if not self.api_key:
            logger.warn("No API key set for Bing " +
                        "(use bing_api_key configuration key)")
            return None

        address = Address(spatial)
        for candidate in address.get_candidates():
            # Only ask the service for candidates we have not seen before
            if candidate not in DplaBingGeocoder.resultCache:
                fetched = list(self._fetch_results(candidate))
                DplaBingGeocoder.resultCache[candidate] = fetched
            matches = DplaBingGeocoder.resultCache[candidate]

            # A single match, or closely grouped matches, is required to
            # avoid bad geocoding results
            match_count = len(matches)
            grouped = self._are_closely_grouped_results(matches)
            if match_count != 1 and not grouped:
                continue

            best = matches[0]
            point = best["geocodePoints"][0]["coordinates"]
            coordinates = (point[0], point[1])

            # With a known country, sanity check the point against the
            # country's bounding box. A None answer means no bounding box
            # was available, in which case the result is trusted.
            if address.country and "countryRegion" in best["address"]:
                in_country = self._is_in_country(coordinates, address.country)
                if in_country is not None and not in_country:
                    msg = "".join((
                        "Geocode result [%s] " % best["name"],
                        "not in the correct country ",
                        "[%s], ignoring" % address.country,
                    ))
                    logger.debug(msg)
                    continue

            label = spatial["name"] if "name" in spatial else spatial
            logger.debug("Geocode result: %s => %s (%s)" %
                         (label, best["name"], best["point"]["coordinates"],))
            return coordinates

        return None
Beispiel #4
0
    def geocode_spatial(self, spatial):
        '''
        Return a coordinate pair for the place described by spatial, or None.

        The pair is built from the first two values of the first geocode
        point of the best result (presumably latitude, longitude - confirm
        against the geocoding service). Address candidates derived from
        spatial are tried in order and lookups are cached in
        DplaBingGeocoder.resultCache. A candidate is accepted only when it
        has exactly one result or closely grouped results and, if a country
        is known, the point is not reported as lying outside that country.
        None is returned when no candidate qualifies or no API key is set.
        '''
        if not self.api_key:
            logger.warn(
                "No API key set for Bing (use bing_api_key configuration key)")
            return None

        address = Address(spatial)
        for candidate in address.get_candidates():
            # See if this address candidate exists in our cache
            if candidate not in DplaBingGeocoder.resultCache:
                results = self._fetch_results(candidate)
                DplaBingGeocoder.resultCache[candidate] = list(results)
            matches = DplaBingGeocoder.resultCache[candidate]

            # Require that a single match, or closely grouped matches be
            # returned to avoid bad geocoding results
            if not (1 == len(matches)
                    or self._are_closely_grouped_results(matches)):
                continue

            result = matches[0]
            point = result["geocodePoints"][0]["coordinates"]
            coordinate = (point[0], point[1])

            # If we have a specified country, check that the coordinate is
            # within the country's bounding box. None means no bounding box
            # could be obtained, in which case the result is assumed good.
            if address.country and "countryRegion" in result["address"]:
                bbox_result = self._is_in_country(coordinate, address.country)
                if bbox_result is not None and not bbox_result:
                    continue

            place = spatial["name"] if "name" in spatial else spatial
            logger.info("geocode: Result: %s => %s (%s)" % (
                place,
                result["name"],
                result["point"]["coordinates"],
            ))
            return coordinate

        return None
Beispiel #5
0
 def register_service(self, ident, path, handler, doc=None, query_template=None):
     """
     Register handler as the service mounted at path.

     ident -- identifier of the service, passed on to Service
     path -- mount point; must not contain a '/'
     handler -- the callable implementing the service
     doc -- documentation text; defaults to the handler's docstring ("" if
            it has none)
     query_template -- passed on to Service unchanged

     Raises ValueError when path contains a '/'.
     """
     if "/" in path:
         raise ValueError("Registered path %r may not contain a '/'" % (path,))
     if doc is None:
         doc = inspect.getdoc(handler) or ""
     # The registry is keyed by path, so an existing registration has to be
     # looked up by path as well (looking up ident never matched).
     if path in self._registered_services:
         logger.warn("Replacing mount point %r (%r)" % (path, ident))
     else:
         logger.debug("Created new mount point %r (%r)" % (path, ident))
     serv = Service(handler, path, ident, doc, query_template)
     self._registered_services[path] = serv
Beispiel #6
0
def shred(body, ctype, action="shred", prop=None, delim=';', keepdup=None):
    """
    Service that accepts a JSON document and "shreds" or "unshreds" the value
    of the field(s) named by the "prop" parameter

    "prop" can include multiple property names, delimited by a comma (the delim
    property is used only for the fields to be shredded/unshredded). This
    requires that the fields share a common delimiter however.

    "shred" splits a value on delim into a list of stripped, non-empty
    strings; a piece whose parentheses are unbalanced is glued back together
    with the piece that follows it. A list value is joined on delim first.
    Values that do not contain delim are left untouched. Duplicates are
    removed (keeping first occurrences) unless keepdup is true.

    "unshred" joins a list value on delim.

    Returns the document serialized as JSON, or an error message (with the
    response code set to 500) when body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    def mismatch_parens(s):
        return s.count("(") != s.count(")")

    for p in prop.split(','):
        if not exists(data, p):
            continue
        v = getprop(data, p)
        if action == "shred":
            if isinstance(v, list):
                try:
                    v = delim.join(v)
                except Exception as e:
                    logger.warn("Can't join list %s on delim for %s, %s" %
                                (v, data["_id"], e))
                    # v is still a list here; it cannot be split, so leave
                    # the property as it is instead of failing in re.split.
                    continue
            if delim not in v:
                continue
            setprop(data, p, v)

            # The leading "" gives the first piece something to be
            # compared against; it is dropped again by the strip filter.
            shredded = [""]
            for s in re.split(re.escape(delim), v):
                if mismatch_parens(shredded[-1]):
                    # The delimiter was inside parentheses: undo the split
                    shredded[-1] += "%s%s" % (delim, s)
                else:
                    shredded.append(s)
            shredded = [i.strip() for i in shredded if i.strip()]
            if not keepdup:
                seen = set()
                unique = []
                for s in shredded:
                    if s not in seen:
                        seen.add(s)
                        unique.append(s)
                shredded = unique
            setprop(data, p, shredded)
        elif action == "unshred":
            if isinstance(v, list):
                setprop(data, p, delim.join(v))

    return json.dumps(data)
Beispiel #7
0
    def add_handler(self, method, handler):
        """
        Install handler in self.method_table under method.

        Logs a warning when an existing entry is replaced and an info
        message otherwise. If self.wsgi_wrapper is set, the wrapped handler
        is what gets stored.
        """
        details = (method, self.path)
        if method not in self.method_table:
            logger.info("Created %r method handler for %r" % details)
        else:
            logger.warn("Replacing %r method handler for %r" % details)

        # An outer WSGI wrapper, when configured, goes around the handler
        wrapper = self.wsgi_wrapper
        if wrapper:
            handler = wrapper(handler)

        self.method_table[method] = handler
Beispiel #8
0
    def add_handler(self, method, handler):
        """
        Register handler for method on this path.

        A warning is logged if method already had a handler, an info message
        if it did not. When self.wsgi_wrapper is set, handler is passed
        through it before being stored in self.method_table.
        """
        replacing = method in self.method_table
        if replacing:
            message = "Replacing %r method handler for %r" % (method, self.path)
            logger.warn(message)
        else:
            message = "Created %r method handler for %r" % (method, self.path)
            logger.info(message)

        # Store the handler, wrapped by the outer WSGI wrapper if one was given
        wrapped = self.wsgi_wrapper(handler) if self.wsgi_wrapper else handler
        self.method_table[method] = wrapped
Beispiel #9
0
    def map_rights(self):
        """
        Map the provider's "rights" value.

        A value starting with "http" is run through urlparse and stored as
        "rights" in self.mapped_data; any other value (or one that could not
        be processed, which is logged) is handed to
        self.update_source_resource unchanged.
        """
        prop = "rights"
        if not exists(self.provider_data, prop):
            return

        rights = self.provider_data.get(prop)
        rights_uri = ""
        try:
            if rights.startswith("http"):
                rights_uri = urlparse(rights).geturl()
        except Exception as e:
            logger.warn("Unable to parse rights URI: %s\n%s" % (rights, e))

        if not rights_uri:
            self.update_source_resource({"rights": rights})
        else:
            self.mapped_data.update({"rights": rights_uri})
Beispiel #10
0
    def map_rights(self):
        """
        Place the provider's "rights" value in the mapped record.

        If the value starts with "http", the URL returned by
        urlparse(...).geturl() becomes "rights" in self.mapped_data.
        Otherwise, and also when handling the value raised an exception
        (logged as a warning), the original value is passed to
        self.update_source_resource.
        """
        prop = "rights"
        if exists(self.provider_data, prop):
            rights = self.provider_data.get(prop)
            uri = ""
            try:
                looks_like_url = rights.startswith("http")
                if looks_like_url:
                    uri = urlparse(rights).geturl()
            except Exception as e:
                logger.warn("Unable to parse rights URI: %s\n%s" % (rights, e))

            if uri:
                self.mapped_data.update({"rights": uri})
                return
            self.update_source_resource({"rights": rights})
def cdl_identify_object(body, ctype):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail.

    The URL is the first absolute value found in "object"; an absolute value
    in "originalRecord/doc/isShownBy" takes precedence over it. For
    content.cdlib.org URLs the object id segment is replaced by the last
    segment of "isShownAt" when the two differ, and a "hi-res" URL is stored
    as "hasView" while "object" receives the matching "thumbnail" URL.
    When no absolute URL is found "object" is removed.

    Returns the document serialized as JSON, or an error message (with the
    response code set to 500) when body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    def first_absolute(handle):
        # A lone string is treated as a one-element list of candidates.
        candidates = [handle] if isinstance(handle, basestring) else handle
        for h in candidates:
            if is_absolute(h):
                return h
        return None

    url = None
    # Later fields override earlier ones, but only with an absolute URL.
    for field in ("object", "originalRecord/doc/isShownBy"):
        if exists(data, field):
            found = first_absolute(getprop(data, field))
            if found is not None:
                url = found

    if url:
        if 'content.cdlib.org' in url:
            base_url, obj_id, object_type = url.rsplit("/", 2)
            is_shown_at = getprop(data, "isShownAt")
            is_shown_at_base, is_shown_at_id = is_shown_at.rsplit("/", 1)
            if obj_id != is_shown_at_id:
                logger.warn(
                    "Object url for %s has ARK value (%s) that does not match isShownAt (%s)"
                    % (data["_id"], obj_id, is_shown_at_id))
                obj_id = is_shown_at_id
            url = "/".join([base_url, obj_id, object_type])
            if object_type == "hi-res":
                setprop(data, "hasView", {"@id": url})
                # Build the thumbnail URL from its parts; a plain
                # str.replace would also rewrite "hi-res" elsewhere in it.
                url = "/".join([base_url, obj_id, "thumbnail"])

        setprop(data, "object", url)
    else:
        logger.warn("No url found for object in id %s" % data["_id"])
        delprop(data, "object", True)
    return json.dumps(data)
Beispiel #12
0
 def register_service(self,
                      ident,
                      path,
                      handler,
                      doc=None,
                      query_template=None):
     """
     Mount handler as a service at path.

     ident -- identifier of the service, passed on to Service
     path -- mount point; must not contain a '/'
     handler -- the callable implementing the service
     doc -- documentation text; when None the handler's docstring is used
            ("" if it has none)
     query_template -- passed on to Service unchanged

     Raises ValueError when path contains a '/'.
     """
     if "/" in path:
         raise ValueError("Registered path %r may not contain a '/'" %
                          (path, ))
     if doc is None:
         doc = inspect.getdoc(handler) or ""
     # Registrations are stored under path, so that is the key to test when
     # deciding whether an existing mount point is being replaced.
     if path in self._registered_services:
         logger.warn("Replacing mount point %r (%r)" % (path, ident))
     else:
         logger.debug("Created new mount point %r (%r)" % (path, ident))
     serv = Service(handler, path, ident, doc, query_template)
     self._registered_services[path] = serv
Beispiel #13
0
def cdl_identify_object(body, ctype):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail.

    Looks for the first absolute URL in "object" and then in
    "originalRecord/doc/isShownBy" (the latter wins when both have one).
    URLs on content.cdlib.org get their object id segment aligned with the
    last segment of "isShownAt"; a "hi-res" URL is recorded as "hasView" and
    "object" is set to the corresponding "thumbnail" URL. Without any
    absolute URL the "object" property is deleted.

    Returns the document serialized as JSON, or an error message (with the
    response code set to 500) when body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    def pick_absolute(value):
        # Accept either a single string or an iterable of strings.
        if isinstance(value, basestring):
            value = [value]
        for item in value:
            if is_absolute(item):
                return item
        return None

    url = None
    if exists(data, "object"):
        picked = pick_absolute(getprop(data, "object"))
        if picked is not None:
            url = picked
    if exists(data, "originalRecord/doc/isShownBy"):
        picked = pick_absolute(getprop(data, "originalRecord/doc/isShownBy"))
        if picked is not None:
            url = picked

    if not url:
        logger.warn("No url found for object in id %s" % data["_id"])
        delprop(data, "object", True)
        return json.dumps(data)

    if 'content.cdlib.org' in url:
        base_url, obj_id, object_type = url.rsplit("/", 2)
        is_shown_at = getprop(data, "isShownAt")
        is_shown_at_base, is_shown_at_id = is_shown_at.rsplit("/", 1)
        if obj_id != is_shown_at_id:
            logger.warn("Object url for %s has ARK value (%s) that does not match isShownAt (%s)" % (data["_id"], obj_id, is_shown_at_id))
            obj_id = is_shown_at_id
        url = "/".join([base_url, obj_id, object_type])
        if object_type == "hi-res":
            setprop(data, "hasView", {"@id": url})
            # Assemble the thumbnail URL from its segments so that a
            # "hi-res" occurring elsewhere in the URL is not rewritten.
            url = "/".join([base_url, obj_id, "thumbnail"])

    setprop(data, "object", url)
    return json.dumps(data)
Beispiel #14
0
def pipe(content, ctype, enrichments, wsgi_header):
    """
    Send content through a chain of enrichment services.

    content is serialized to JSON and POSTed to each URI in enrichments in
    turn; a 2xx response body becomes the input of the next step. Empty
    entries are skipped, relative URIs are resolved against the current
    request's scheme and host, and a failing step is logged and skipped
    (the body is carried over unchanged). The request headers, minus
    wsgi_header, are forwarded with content-type set to ctype.

    Returns the body produced by the last successful step.
    """
    body = json.dumps(content)
    for uri in enrichments:
        if not uri:
            # in case there's no pipeline
            continue
        if not is_absolute(uri):
            env = request.environ
            scheme = env['wsgi.url_scheme']
            host = env.get('HTTP_HOST') or env['SERVER_NAME']
            uri = scheme + '://' + host + uri
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if str(resp.status).startswith('2'):
            body = cont
        else:
            logger.warn("Error in enrichment pipeline at %s: %s"%(uri,repr(resp)))
    return body
Beispiel #15
0
    def geocode_spatial(self, spatial):
        '''
        Return a coordinate pair for the place described by spatial, or None.

        The pair holds the first two values of the first geocode point of
        the best result (presumably latitude, longitude - confirm against
        the geocoding service). Address candidates derived from spatial are
        tried in order, with lookups cached in DplaBingGeocoder.resultCache.
        A candidate is accepted only when it has exactly one result or
        closely grouped results and, if a country is known, the point is not
        reported as lying outside that country. None is returned when no
        candidate qualifies or no API key is set.
        '''
        if not self.api_key:
            logger.warn("No API key set for Bing (use bing_api_key configuration key)")
            return None

        address = Address(spatial)
        for candidate in address.get_candidates():
            # See if this address candidate exists in our cache
            if candidate not in DplaBingGeocoder.resultCache:
                results = self._fetch_results(candidate)
                DplaBingGeocoder.resultCache[candidate] = list(results)
            matches = DplaBingGeocoder.resultCache[candidate]

            # Require that a single match, or closely grouped matches be
            # returned to avoid bad geocoding results
            if not (1 == len(matches)
                    or self._are_closely_grouped_results(matches)):
                continue

            result = matches[0]
            point = result["geocodePoints"][0]["coordinates"]
            coordinate = (point[0], point[1])

            # If we have a specified country, check that the coordinate is
            # within the country's bounding box. None means no bounding box
            # could be obtained, in which case the result is assumed good.
            if address.country and "countryRegion" in result["address"]:
                bbox_result = self._is_in_country(coordinate, address.country)
                if bbox_result is not None and not bbox_result:
                    logger.debug("Geocode result [%s] not in the correct country [%s], ignoring" % (result["name"], address.country,))
                    continue

            place = spatial["name"] if "name" in spatial else spatial
            logger.info("Geocode result: %s => %s (%s)" % (place, result["name"], result["point"]["coordinates"],))
            return coordinate

        return None
Beispiel #16
0
def pipe(content, ctype, enrichments, wsgi_header):
    """
    Run content through the enrichment pipeline and return the result.

    The JSON form of content is POSTed to every non-empty URI of
    enrichments, one after the other. Relative URIs are prefixed with the
    scheme and host (HTTP_HOST, else SERVER_NAME) of the current request.
    Headers of the current request, except wsgi_header, are passed along
    with content-type replaced by ctype. The body of a 2xx response feeds
    the next step; any other response is logged and that step is ignored.
    """
    payload = json.dumps(content)
    for step in enrichments:
        if not step:
            # in case there's no pipeline
            continue

        target = step
        if not is_absolute(target):
            environ = request.environ
            base = environ['wsgi.url_scheme'] + '://'
            base += environ.get('HTTP_HOST') or environ['SERVER_NAME']
            target = base + target

        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % target)
        resp, cont = H.request(target, 'POST', body=payload, headers=headers)
        succeeded = str(resp.status).startswith('2')
        if succeeded:
            payload = cont
        else:
            logger.warn("Error in enrichment pipeline at %s: %s" % 
                        (target, repr(resp)))

    return payload
Beispiel #17
0
def shred(body, ctype, action="shred", prop=None, delim=';', keepdup=None):
    """
    Service that accepts a JSON document and "shreds" or "unshreds" the value
    of the field(s) named by the "prop" parameter

    "prop" can include multiple property names, delimited by a comma (the delim
    property is used only for the fields to be shredded/unshredded). This
    requires that the fields share a common delimiter however.

    The 'shred' action splits values by delimiter. It handles some complex edge
    cases beyond what split() expects. For example:
      ["a,b,c", "d,e,f"] -> ["a","b","c","d","e","f"]
      'a,b(,c)' -> ['a', 'b(,c)']
    Duplicate values are removed unless keepdup evaluates true.

    The 'unshred' action joins a list of values with delim.

    Returns the document serialized as JSON, or an error message (with the
    response code set to 500) when body is not valid JSON.

    See: https://issues.dp.la/issues/2940
         https://issues.dp.la/issues/4251
         https://issues.dp.la/issues/4266
         https://issues.dp.la/issues/4578
         https://issues.dp.la/issues/4600
    """
    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    def index_for_first_open_paren(values):
        """
        Accepts a list of values. Returns the index of the first value
        holding more opening than closing parens, or None.
        """
        for i, v in enumerate(values):
            if v.count("(") > v.count(")"):
                return i
        return None

    def index_for_matching_close_paren(values, start):
        """
        Accepts a list of values and the index to start searching from.
        Returns the index of the last value holding more closing than
        opening parens that is seen before the next value holding more
        opening than closing parens, or None if there is no such value.
        """
        index = None
        for i in range(start, len(values)):
            v = values[i]
            if v.count(")") > v.count("("):
                # Use the position, not values.index(v): an equal value
                # earlier in the list must not be picked instead.
                index = i
            elif index is not None and v.count("(") > v.count(")"):
                return index
        return index

    def rejoin_partials(values, delim):
        """
        Accepts a list of values which have been split by delim. Rejoins
        values that were separated although the delimiter was enclosed in
        parens.

        For example, this value:
          'my (somewhat contrived; value) with a delimeter enclosed in parens'
        would be split into:
          ['my (somewhat contrived', 'value) with a delimeter enclosed in parens']

        This method rejoins it.
        """
        while True:
            first = index_for_first_open_paren(values)
            if first is None:
                return values
            # Only a closing piece AFTER the opening piece can match it.
            # Searching from the start of the list used to find stray
            # closing parens in front of it, which made the list grow on
            # every pass and never terminate.
            last = index_for_matching_close_paren(values, first + 1)
            if last is None:
                return values
            # Every pass merges at least two values, so the loop ends.
            values = (values[:first] +
                      [delim.join(values[first:last + 1])] +
                      values[last + 1:])

    for p in prop.split(','):
        if not exists(data, p):
            continue
        v = getprop(data, p)
        if action == "shred":
            if isinstance(v, list):
                v = [item for item in v if item]
                try:
                    v = delim.join(v)
                    v = v.replace("%s%s" % (delim, delim), delim)
                except Exception as e:
                    logger.warn("Can't join list %s on delim for %s, %s" %
                                (v, data["_id"], e))
                    # v is still a list here; it cannot be split, so leave
                    # the property as it is instead of failing in re.split.
                    continue
            if delim not in v:
                continue
            setprop(data, p, v)

            shredded = [""] + re.split(re.escape(delim), v)
            shredded = rejoin_partials(shredded, delim)
            shredded = [i.strip() for i in shredded if i.strip()]

            if not keepdup:
                seen = set()
                unique = []
                for s in shredded:
                    if s not in seen:
                        seen.add(s)
                        unique.append(s)
                shredded = unique
            setprop(data, p, shredded)
        elif action == "unshred":
            if isinstance(v, list):
                setprop(data, p, delim.join(v))

    return json.dumps(data)
Beispiel #18
0
def copyprop(body,
             ctype,
             prop=None,
             to_prop=None,
             create=False,
             key=None,
             remove=None,
             no_replace=None,
             no_overwrite=None):
    """Copies the value of one prop into another prop.

    Keyword arguments:
    body -- the content to load (a JSON document)
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    create -- creates to_prop (or key inside it) if True (default False)
    key -- the key to use if to_prop is a dict or list of dicts (default None)
    remove -- removes prop after copying if True (default None)
    no_replace -- if to_prop holds a string, build a flat list from that
                  string and the value of prop instead of replacing it
    no_overwrite -- leave the document untouched if to_prop already exists

    Returns the document serialized as JSON, or an error message (with the
    response code set to 500) when body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # An existing destination is never touched when no_overwrite is set
    if exists(data, to_prop) and no_overwrite:
        return json.dumps(data)

    if exists(data, prop) and create and not exists(data, to_prop):
        setprop(data, to_prop, {} if key else "")

    # Nothing to copy unless both source and destination are present
    if not (exists(data, prop) and exists(data, to_prop)):
        return json.dumps(data)

    val = getprop(data, prop)
    to_element = getprop(data, to_prop)

    if isinstance(to_element, basestring):
        if no_replace:
            pieces = [to_element] if to_element else []
            pieces.append(val)
            # Flatten: strings are kept whole, anything else is expanded
            val = []
            for piece in pieces:
                if isinstance(piece, basestring):
                    val.append(piece)
                else:
                    val.extend(piece)
        setprop(data, to_prop, val)
    elif key:
        # If key is set, assume to_element is dict or list of dicts
        if isinstance(to_element, list):
            targets = to_element
        else:
            targets = [to_element]
        for target in targets:
            if exists(target, key) or create:
                setprop(target, key, val)
            else:
                logger.debug("Key %s does not exist in %s" % (key, to_prop))
    elif isinstance(to_element, list):
        if isinstance(val, list):
            to_element = to_element + val
        else:
            to_element.append(val)
        setprop(data, to_prop, to_element)
    else:
        # to_prop is dictionary but no key was passed.
        logger.warn("%s is a dictionary but no key was passed" % to_prop)
        setprop(data, to_prop, val)

    if remove:
        delprop(data, prop)

    return json.dumps(data)
Beispiel #19
0
    def geocode_spatial(self, spatial):
        '''
        Accepts a dictionary and attempts to return a set
        of coordinates in format [latitude, longitude] that
        match the place.

        Address candidates built from the dictionary are tried one by one;
        results are cached per candidate in DplaBingGeocoder.resultCache.
        The first candidate with a single result (or closely grouped
        results) that is not found to lie outside the requested country
        wins. None is returned if there is no such candidate or if no API
        key is configured.
        '''
        if not self.api_key:
            logger.warn("No API key set for Bing " +
                        "(use bing_api_key configuration key)")
            return None

        address = Address(spatial)
        for candidate in address.get_candidates():
            # Fill the cache on first sight of this candidate
            if candidate not in DplaBingGeocoder.resultCache:
                looked_up = self._fetch_results(candidate)
                DplaBingGeocoder.resultCache[candidate] = list(looked_up)
            hits = DplaBingGeocoder.resultCache[candidate]

            # Ambiguous answers (several results that are not closely
            # grouped) are rejected to avoid bad geocoding results
            single_hit = len(hits) == 1
            clustered = self._are_closely_grouped_results(hits)
            if not (single_hit or clustered):
                continue

            top = hits[0]
            first_point = top["geocodePoints"][0]["coordinates"]
            coordinates = (first_point[0], first_point[1])

            # Sanity check against the country's bounding box when a
            # country was specified; a None verdict (no bounding box
            # available) counts as a good result
            if address.country and "countryRegion" in top["address"]:
                verdict = self._is_in_country(coordinates, address.country)
                if verdict is not None and not verdict:
                    msg = ("Geocode result [%s] " % top["name"]) + \
                          "not in the correct country " + \
                          ("[%s], ignoring" % address.country)
                    logger.debug(msg)
                    continue

            shown_as = spatial["name"] if "name" in spatial else spatial
            logger.debug("Geocode result: %s => %s (%s)" % (
                shown_as,
                top["name"],
                top["point"]["coordinates"],
            ))
            return coordinates

        return None
Beispiel #20
0
def shred(body, ctype, action="shred", prop=None, delim=";", keepdup=None):
    """
    Service that accepts a JSON document and "shreds" or "unshreds" the value
    of the field(s) named by the "prop" parameter.

    "prop" can include multiple property names, delimited by a comma (the delim
    property is used only for the fields to be shredded/unshredded). This
    requires that the fields share a common delimiter however.

    The 'shred' action splits values by delimiter. It handles some complex edge
    cases beyond what split() expects. For example:
      ["a,b,c", "d,e,f"] -> ["a","b","c","d","e","f"]
      'a,b(,c)' -> ['a', 'b(,c)']
    Duplicate values are removed unless keepdup evaluates true.

    The 'unshred' action joins a list of values with delim.

    Arguments:
    body -- the JSON document, as a string
    ctype -- the content type of the body (not used by this function)
    action -- "shred" or "unshred" (default "shred"); other values leave the
              document unchanged
    prop -- comma-separated property paths to process (default None, in which
            case the document is returned unchanged)
    delim -- the delimiter to split on / join with (default ";")
    keepdup -- keep duplicate values when shredding if true (default None)

    Returns the (possibly modified) document serialized as JSON, or a plain
    text error message (with response code 500) if body is not valid JSON.

    See: https://issues.dp.la/issues/2940
         https://issues.dp.la/issues/4251
         https://issues.dp.la/issues/4266
         https://issues.dp.la/issues/4578
         https://issues.dp.la/issues/4600
    """
    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON\n" + str(e)

    if not prop:
        # No property was named, so there is nothing to shred or unshred.
        return json.dumps(data)

    def index_for_first_open_paren(values):
        """
        Accepts a list of values. Returns the index of the first value that
        has more opening than closing parens, or None if there is none.
        """
        for i, value in enumerate(values):
            if value.count("(") > value.count(")"):
                return i
        return None

    def index_for_matching_close_paren(values, start=0):
        """
        Accepts a list of values and scans values[start:]. Returns the index
        (into values) of the last value with more closing than opening parens
        that is seen before the next value with an unmatched opening paren,
        or None if no such value exists.
        """
        tail = values[start:]
        index = None
        for value in tail:
            if index is not None and value.count("(") > value.count(")"):
                return index
            elif value.count(")") > value.count("("):
                # index() reports the first equal fragment in the tail, so
                # duplicated fragments resolve to the earliest one (this is
                # how duplicates were resolved before as well).
                index = start + tail.index(value)
        return index

    def rejoin_partials(values, delim):
        """
        Accepts a list of values which have been split by delim. Searches for
        values that were separated inside a pair of parens and glues them
        back together.

        For example, this value:
          'my (somewhat contrived; value) with a delimiter enclosed in parens'
        would be split into:
          ['my (somewhat contrived', 'value) with a delimiter enclosed in parens']

        This method rejoins it.
        """
        while True:
            first = index_for_first_open_paren(values)
            if first is None:
                return values
            # Only look for the closing paren *after* the opening one. A stray
            # ")" earlier in the list must not be paired with a later "(":
            # that pairing used to make the list grow on every pass and never
            # terminate.
            last = index_for_matching_close_paren(values, first + 1)
            if last is None:
                return values
            # last > first, so every pass merges at least two fragments and
            # the loop is guaranteed to finish.
            values = (values[:first] +
                      [delim.join(values[first:last + 1])] +
                      values[last + 1:])

    for p in prop.split(","):
        if not exists(data, p):
            continue

        v = getprop(data, p)
        if action == "shred":
            if isinstance(v, list):
                try:
                    v = delim.join(v)
                except Exception as e:
                    logger.warn("Can't join list %s on delim for %s, %s" % (v, data.get("_id"), e))
                    # The list holds something other than strings; it cannot
                    # be split below, so leave this property as it is.
                    continue
                # Collapse the doubled delimiter produced when a list item
                # already ended (or began) with delim.
                v = v.replace("%s%s" % (delim, delim), delim)

            if delim not in v:
                # Nothing to split; the stored value is left untouched.
                continue

            shredded = rejoin_partials(re.split(re.escape(delim), v), delim)
            shredded = [i.strip() for i in shredded if i.strip()]

            if not keepdup:
                # Order-preserving de-duplication.
                seen = set()
                unique = []
                for s in shredded:
                    if s not in seen:
                        seen.add(s)
                        unique.append(s)
                shredded = unique
            setprop(data, p, shredded)
        elif action == "unshred":
            if isinstance(v, list):
                setprop(data, p, delim.join(v))

    return json.dumps(data)
Beispiel #21
0
def copyprop(
    body, ctype, prop=None, to_prop=None, create=False, key=None, remove=None, no_replace=None, no_overwrite=None
):
    """Copies value in one prop to another prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    create -- creates to_prop if True (default False)
    key -- the key to use if to_prop is a dict (default None)
    remove  -- removes prop if True (default None)
    no_replace -- creates list of to_prop string and appends prop if True
    no_overwrite -- leaves the document untouched if to_prop already exists
                    and this is True (default None)

    Returns the (possibly modified) document serialized as JSON, or a plain
    text error message (with response code 500) if body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except (ValueError, TypeError):
        # Only parse failures are reported as a bad body; anything else
        # (KeyboardInterrupt, SystemExit, ...) is allowed to propagate.
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, to_prop) and no_overwrite:
        return json.dumps(data)

    if not exists(data, prop):
        # Nothing to copy from.
        return json.dumps(data)

    if create and not exists(data, to_prop):
        # Seed the destination: a dict when a key will be set inside it,
        # otherwise an empty string that the copy below replaces.
        setprop(data, to_prop, {} if key else "")

    if not exists(data, to_prop):
        return json.dumps(data)

    val = getprop(data, prop)
    to_element = getprop(data, to_prop)

    if isinstance(to_element, basestring):
        if no_replace:
            el = [to_element] if to_element else []
            el.append(val)
            # Flatten
            val = [e for s in el for e in (s if not isinstance(s, basestring) else [s])]
        setprop(data, to_prop, val)
    elif key:
        # If key is set, assume to_element is dict or list of dicts
        targets = to_element if isinstance(to_element, list) else [to_element]
        for target in targets:
            if exists(target, key) or create:
                setprop(target, key, val)
            else:
                logger.error("Key %s does not exist in %s" % (key, to_prop))
    elif isinstance(to_element, list):
        # Extend the destination list with the copied value(s)
        addition = val if isinstance(val, list) else [val]
        setprop(data, to_prop, to_element + addition)
    else:
        # to_prop is neither a string nor a list (e.g. a dictionary) and no
        # key was passed, so the destination is simply replaced.
        logger.warn("%s is a dict but no key was passed" % to_prop)
        setprop(data, to_prop, val)

    if remove:
        delprop(data, prop)

    return json.dumps(data)