Example 1
# TypeOfSpaceSchema is a project-defined base schema.
from formencode.validators import Bool, Int, UnicodeString, URL


class LockssomaticSpaceSchema(TypeOfSpaceSchema):
    collection_iri = UnicodeString(max=256)
    content_provider_id = UnicodeString(max=32)
    checksum_type = UnicodeString(max=64)
    keep_local = Bool()
    au_size = Int()
    sd_iri = URL(max=256)
    external_domain = URL()
Example 2
import re

from formencode import Invalid
from formencode.validators import URL


def isurl(s, require_tld=True):
    """Return True if ``s`` is a valid URL; accept localhost URLs too."""
    u = URL(add_http=False, require_tld=require_tld)
    try:
        u.to_python(s)
        return True
    except Invalid:
        # URL() rejects bare hostnames such as "localhost" when
        # require_tld is True, so allow //localhost explicitly.
        return re.search(r'//localhost(:|/)', s) is not None
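A quick check of the behavior (a minimal sketch; it assumes the module above is importable):

assert isurl('http://example.com/feed')
assert isurl('http://localhost:8080/feed')  # rescued by the localhost fallback
assert not isurl('not a url')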
Example 3
# TimeOptHoursConverter is a custom validator defined elsewhere in the project.
from formencode import Schema
from formencode.validators import (
    ByteString, DateConverter, Email, Number, OneOf, URL)


class ApplnValidator(Schema):
    name = ByteString(not_empty=True)
    email = Email()
    dob = DateConverter(month_style='iso')
    gender = OneOf(['M', 'F'])
    applntype = OneOf(['new', 'renewal'])
    race1_name = ByteString(not_empty=True)
    race1_location = ByteString(not_empty=True)
    race1_date = DateConverter(month_style='iso')
    race1_distance = Number(min=0, max=200)
    race1_units = OneOf(['miles', 'km'])
    race1_time = TimeOptHoursConverter()
    race1_resultslink = URL()
    race2_name = ByteString(not_empty=True)
    race2_location = ByteString(not_empty=True)
    race2_date = DateConverter(month_style='iso')
    race2_distance = Number(min=0, max=200)
    race2_units = OneOf(['miles', 'km'])
    race2_time = TimeOptHoursConverter()
    race2_resultslink = URL()
Example 4
from formencode import Schema
from formencode.validators import OneOf, String, URL


class UriData(Schema):
    """Validate URI data received from CSV or user input.

    All fields are required, may not be None, and no extra fields
    are allowed.
    """

    title = String(not_empty=True)
    URI = URL(not_empty=True)
    notify = OneOf(
        ["always", "valid", "invalid"],
        not_empty=True
    )
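Driving such a schema is one call per record; a minimal sketch (the row values below are made up for illustration):

from formencode import Invalid

row = {"title": "Home page", "URI": "http://example.com", "notify": "valid"}
clean = UriData().to_python(row)  # returns the validated, converted dict

try:
    UriData().to_python(dict(row, extra="field"))
except Invalid as e:
    print(e)  # extra fields are rejected because Schema forbids them by default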
Example 5
class GoogleFeedSearchService(object):
    """
    This class provides a simple feed searching service based on 
    google APIs.  You can query by keywords, urls of web pages linking
    to feeds or direct urls to feeds.
    
    It is a very fast path for common feeds, but *not comprehensive*.
    
    If you want to support small-fry feeds that have not been discovered by google, 
    consider chaining this together with the HandScrapedFeedSearchService as 
    a fallback.
    """

    def __init__(self, service_key=None, http_cache=None):
        self._service_key = service_key
        self._http_cache = http_cache
        self._as_url = URL(add_http=True)
        self._http_client = Http(cache=self._http_cache, **DEFAULT_HTTP_ARGS)

    def find_feeds(self, query, max_results=5):
        try:
            rs = self._find_feeds(query)
            rs = uniquify_results(rs)
            return rs[0:max_results]
        except Exception:
            log.error("Error searching for feeds. query=%s: %s" % (query, traceback.format_exc()))
            return []

    def _find_feeds(self, query):
        # firstly, could this be a url? if so, let's try that...
        try:
            url = self._as_url.to_python(query)
            rs = self._search_url_any(url)
            if len(rs) > 0:
                return rs
        except Invalid:
            # nope, let's move on...
            pass

        # try searching terms...
        try:
            return self._search_terms(query)
        except Exception:
            return []

    def is_feed(self, url):
        """
        test whether there is a feed at the url specified.
        (not referred to in the page specified, actually there)
        """
        return self._load_feed(url) is not None

    def _search_terms(self, query):
        rs = self._query(FIND_FEED, query)
        if rs is not None:
            return [Dibject(url=r["url"],
                            title=r.get('title', ''),
                            link=r.get('link', ''))
                    for r in rs.entries]
        else:
            return []

    def _load_feed(self, url):
        rs = self._query(LOAD_FEED, url)
        if rs is None:
            return None

        return Dibject(url=url,
                       title=rs.feed.get('title', ''),
                       link=rs.feed.get('link', ''))


    def _search_url_any(self, url):
        # okay, well this could be the url of a feed or 
        # it could be a web page that has a feed link in 
        # it...
        
        # try searching it as a web page first...
        rs = self._search_terms('site:%s' % url)
        if len(rs) > 0:
            return rs
        
        # hmm didn't seem to find anything, but it may be the 
        # actual url of a feed...
        rs = self._load_feed(url)
        if rs is not None:
            return [rs]
        else:
            return []
            

    def _query(self, service, q):
        query_url = service
        query_url += urllib.quote_plus(q)
        if self._service_key:
            query_url += "&key=%s" % self._service_key

        log.debug("Issuing query %s" % query_url)
        result = self._http_client.request(query_url, 'GET')
        if result is None:
            log.error("No response to query %s" % query_url)
            return None

        response, content = result
        log.debug("response was: %s, %s" % (response, content))
        
        if response.status != 200:
            log.error("Error response to %s: (%s, %s)" % (query_url, response, content))
            return None

        rr = json_wake(content)
        if not hasattr(rr, 'responseData'):
            return None
        return rr.responseData
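Given the interface above, a typical lookup is just construct-and-call. A sketch (the query is a placeholder; it needs network access and the module's helper constants, and it assumes Dibject fields are attribute-accessible, as the code above suggests):

svc = GoogleFeedSearchService(service_key=None)  # or a real Google API key
for feed in svc.find_feeds('python programming blog', max_results=3):
    print("%s %s" % (feed.url, feed.title))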
Example 6
class HandScrapedFeedSearchService(object):
    """
    This is a slow path hand scraper / checker feed search. It does not 
    handle keyword searches, only urls which it laboriously fetches and 
    inspects.  If there is a feed to be found, it will try pretty hard
    (at the expense of time) to prove it.
    """

    def __init__(self, http_cache=None):
        self._http_cache = http_cache
        self._as_url = URL(add_http=True)
        self._http_client = Http(cache=self._http_cache, **DEFAULT_HTTP_ARGS)
        
    def find_feeds(self, query, max_results=5):
        try:
            rs = self._find_feeds(query)
            rs = uniquify_results(rs)
            return rs[0:max_results]
        except Exception:
            log.error("Error finding feeds in %s, %s" % (query, traceback.format_exc()))
            return []
            
    def _find_feeds(self, query):
        try:
            url = self._as_url.to_python(query)
        except Invalid:
            return []
            
        rs = self._search_url(url)
        if len(rs) > 0:
            return rs
        else:
            return []

    def is_feed(self, url):
        try:
            ff, response, content = self._check_for_feed(url)
            return ff is not None
        except Exception:
            log.error("Error determining existence of feed %s: %s" % (url, traceback.format_exc()))
            return False

    def _search_url(self, url):
        feeds = []
        try:
            # check to see if the URL points to a feed
            ff, response, content = self._check_for_feed(url)
                
            if ff is not None:
                return [ff]
                
            if response is None or content is None:
                return []

            # dig feed links out...
            ct = get_content_type(response.get('content-type', '')).lower()
            if ct in DEFINITE_HTML_CONTENT_TYPES or ct in AMBIGUOUS_XML_CONTENT_TYPES:
                try:
                    feed_urls = self._find_feed_links(content)
                except Exception:
                    log.error("Error scraping feed links from %s: %s" % (url, traceback.format_exc()))
                else:
                    for furl in feed_urls:
                        # mm make sure it's not relative...
                        furl = urlparse.urljoin(url, furl)
                        ff, response, content = self._check_for_feed(furl)
                        if ff is not None:
                            feeds.append(ff)
            return feeds
        except ForbiddenHost as e:
            log.warn(e)
            return []
        except Exception:
            # any other failure: log it and report no feeds
            log.error("Error searching %s: %s" % (url, traceback.format_exc()))
            return []
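Both classes expose the same find_feeds interface, so the fallback chaining suggested in the GoogleFeedSearchService docstring can be a thin wrapper. A minimal sketch (ChainedFeedSearchService is illustrative, not part of the source):

class ChainedFeedSearchService(object):
    """Try each service in order; return the first non-empty result."""

    def __init__(self, services):
        self._services = services

    def find_feeds(self, query, max_results=5):
        for service in self._services:
            rs = service.find_feeds(query, max_results=max_results)
            if rs:
                return rs
        return []


# e.g. ChainedFeedSearchService([GoogleFeedSearchService(),
#                                HandScrapedFeedSearchService()])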
Example 7
# TypeOfSpaceSchema, models and _flatten are defined elsewhere in the project.
from formencode.validators import OneOf, UnicodeString, URL


class DSpaceSpaceSchema(TypeOfSpaceSchema):
    sd_iri = URL(max=256)
    user = UnicodeString(max=64)
    password = UnicodeString(max=64)
    metadata_policy = UnicodeString()  # JSONField ...
    archive_format = OneOf(_flatten(models.DSpace.ARCHIVE_FORMAT_CHOICES))
Example 8
from formencode import Schema
from formencode.compound import Any
from formencode.validators import Bool, IPAddress, UnicodeString, URL


class PipelineSchema(Schema):
    api_key = UnicodeString(max=256)
    api_username = UnicodeString(max=256)
    description = UnicodeString(max=256)
    enabled = Bool()
    remote_name = Any(validators=[IPAddress(), URL()])
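Any, the formencode compound validator used for remote_name above, accepts a value as soon as one of its sub-validators does. A quick sketch of that in isolation (the values are illustrative):

from formencode.compound import Any
from formencode.validators import IPAddress, URL

remote_name = Any(validators=[IPAddress(), URL()])
remote_name.to_python('192.168.0.10')        # passes the IPAddress validator
remote_name.to_python('http://example.com')  # passes the URL validator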