コード例 #1
0
 def __call__(self, data: str):
     """Validate a connection URL given as a string.

     Raises ``ValidationError`` when the string cannot be parsed, when a
     ``db`` query parameter is present but not all digits, when a ``unix``
     URL carries a host or port or lacks a path, or when a ``redis`` /
     ``redis+tls`` URL has no host.
     """
     try:
         parsed = URL(data)
     except ValueError:
         raise ValidationError(_("URL cannot be parsed"), code="parse_error")

     if parsed.has_query_param('db') and not parsed.query_param('db').isdigit():
         raise ValidationError(_("Invalid port specified"), code="invalid_port")

     scheme = parsed.scheme()
     is_unix_socket = scheme == "unix"

     # Unix domain sockets are addressed by path only.
     if is_unix_socket and parsed.host():
         raise ValidationError(
             _("Hostname not supported for unix domain sockets"),
             code="unix_domain_socket_hostname")
     if is_unix_socket and parsed.port():
         raise ValidationError(
             _("Port not supported for unix domain sockets"),
             code="unix_domain_socket_port")
     if is_unix_socket and not parsed.path():
         raise ValidationError(
             _("No path specified for unix domain socket"),
             code="unix_domain_socket_path")

     # Network schemes need a host to connect to.
     if scheme in ("redis", "redis+tls") and not parsed.host():
         raise ValidationError(_("No host specified"), code="host_missing")
コード例 #2
0
 def _collect_external_links(self, response):
     """Collect external links.

     For every absolute link in ``response.html`` whose host does not
     contain ``self.domain``, the host is added to ``self.external_links``.
     """
     hosts = (URL(href).host() for href in response.html.absolute_links)
     for host in hosts:
         if self.domain in host:
             continue
         self.external_links.add(host)
コード例 #3
0
ファイル: database.py プロジェクト: LinguList/clldclient
 def url(self, path='/', **query):
     """Return a URL object for *path*, completed with defaults.

     A missing host is filled in from ``self.host`` and a missing scheme
     defaults to ``'http'``. Each keyword argument is applied through
     ``query_param(name, value)``.
     """
     target = URL(path)
     target = target if target.host() else target.host(self.host)
     target = target if target.scheme() else target.scheme('http')
     for name in query:
         target = target.query_param(name, query[name])
     return target
コード例 #4
0
ファイル: main.py プロジェクト: basicworld/sitedownloader
    def __init__(self, url, save_dir='tmp'):
        """Set up logging, the timestamped output directory and the site host.

        @url: full url of a site
        @save_dir: dir to save site
        """
        # log
        self.logger = logger('file', 'sitelog.log', save_dir)
        self.logger.info('-' * 20)
        self.logger.info('start')
        self.logger.info('start func: __init__')
        self.logger.info('url: %s' % url)

        # Each run is saved under <save_dir>/<YYYYmmddHHMM>.
        save_time = datetime.now().strftime('%Y%m%d%H%M')
        self.save_time = save_time
        self.save_dir = os.path.abspath(os.path.join(save_dir, save_time))
        # create dir if not exist
        if not os.path.isdir(self.save_dir):
            os.makedirs(self.save_dir)

        self.url = url
        u = URL(url)
        # scheme + host of the site, e.g. http://m.sohu.com
        self.host = u.scheme() + '://' + u.host()
        # Single parenthesised argument: prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s: saving %s' % (save_time, self.url))
        self.logger.info('end func: __init__')
コード例 #5
0
ファイル: helpers.py プロジェクト: cevmartinez/clld
def cc_link(req, license_url, button="regular"):
    """Return an ``<a rel="license">`` badge link for a Creative Commons URL.

    The Wikipedia "Public domain" URL is treated as CC0. ``None`` is returned
    when the URL is not hosted on creativecommons.org, when its path has fewer
    than three ``/``-separated components, or when the third component is not
    a known license key. ``button="small"`` selects the small icon variant.
    """
    names = {
        "zero": "Public Domain",
        "by": "Creative Commons Attribution License",
        "by-nc": "Creative Commons Attribution-NonCommercial License",
        "by-nc-nd": "Creative Commons Attribution-NonCommercial-NoDerivatives License",
        "by-nc-sa": "Creative Commons Attribution-NonCommercial-ShareAlike License",
        "by-nd": "Creative Commons Attribution-NoDerivatives License",
        "by-sa": "Creative Commons Attribution-ShareAlike License",
    }

    if license_url == "http://en.wikipedia.org/wiki/Public_domain":
        license_url = "http://creativecommons.org/publicdomain/zero/1.0/"
    parsed = URL(license_url)
    if parsed.host() != "creativecommons.org":
        return None

    parts = parsed.path().split("/")
    if len(parts) < 3:
        return None  # pragma: no cover

    key = parts[2]
    if key not in names:
        return None

    small = button == "small"
    icon = "cc-" + key + ("-small" if small else "") + ".png"
    height, width = (15, 80) if small else (30, 86)
    image = HTML.img(
        alt=names[key],
        src=req.static_url("clld:web/static/images/" + icon),
        height=height,
        width=width,
    )
    return HTML.a(image, href=parsed, rel="license")
コード例 #6
0
ファイル: helpers.py プロジェクト: clld/clld
def cc_link(req, license_url, button='regular'):
    """Render a Creative Commons license badge as an HTML link.

    Returns ``None`` unless *license_url* points at creativecommons.org and
    the third ``/``-separated component of its path is a known license key.
    The Wikipedia "Public domain" page is mapped to the CC0 URL first.
    """
    labels = {
        'zero': 'Public Domain',
        'by': 'Creative Commons Attribution License',
        'by-nc': 'Creative Commons Attribution-NonCommercial License',
        'by-nc-nd': 'Creative Commons Attribution-NonCommercial-NoDerivatives License',
        'by-nc-sa': 'Creative Commons Attribution-NonCommercial-ShareAlike License',
        'by-nd': 'Creative Commons Attribution-NoDerivatives License',
        'by-sa': 'Creative Commons Attribution-ShareAlike License'}

    if license_url == 'https://en.wikipedia.org/wiki/Public_domain':
        license_url = 'https://creativecommons.org/publicdomain/zero/1.0/'
    target = URL(license_url)
    if target.host() != 'creativecommons.org':
        return None

    segments = target.path().split('/')
    if len(segments) < 3:
        return None  # pragma: no cover

    license_key = segments[2]
    if license_key not in labels:
        return None

    use_small = button == 'small'
    icon = 'cc-' + license_key + ('-small' if use_small else '') + '.png'
    height, width = (15, 80) if use_small else (30, 86)
    badge = HTML.img(
        alt=labels[license_key],
        src=req.static_url('clld:web/static/images/' + icon),
        height=height,
        width=width)
    return HTML.a(badge, href=target, rel='license')
コード例 #7
0
ファイル: util.py プロジェクト: clld/dogonlanguages
def get_bib(args):
    """Yield a ``Document`` for every record of ``repos/Dogon.bib``.

    File names listed in the ``texts``, ``docs``, ``data`` and ``edmond`` JSON
    files are mapped to the ``cdstar.json`` entry with the same hash. URLs of
    a record that point at dogonlanguages.org, github.com or have no host are
    turned into ``(fname, cdstar entry)`` pairs on ``doc.files``; all other
    URLs are kept and written back to ``doc.rec['url']`` joined by ``'; '``.
    """
    uploaded = load(args.data_file('repos', 'cdstar.json'))

    # Index: bare file name -> uploaded entry, over all listings in order.
    fname_to_cdstar = {}
    for stem in ('texts', 'docs', 'data', 'edmond'):
        listing = load(args.data_file('repos', stem + '.json'))
        for digest, paths in listing.items():
            if digest not in uploaded:
                continue
            for path in paths:
                fname_to_cdstar[path.rsplit('/', 1)[-1]] = uploaded[digest]

    local_hosts = ['dogonlanguages.org', 'github.com', '']
    for rec in Database.from_file(args.data_file('repos', 'Dogon.bib'), lowercase=True):
        doc = Document(rec)
        kept = []
        for raw in rec.get('url', '').split(';'):
            if not raw.strip():
                continue
            if raw.endswith('sequence=1'):
                # Passed through untouched (not stripped, not parsed).
                kept.append(raw)
                continue
            parsed = URL(raw.strip())
            if parsed.host() in local_hosts:
                fname = parsed.path().rsplit('/', 1)[-1]
                doc.files.append((fname, fname_to_cdstar[fname]))
            else:
                kept.append(parsed.as_string())
        doc.rec['url'] = '; '.join(kept)
        yield doc
コード例 #8
0
def cc_link(req, license_url, button='regular'):
    """Build the linked Creative Commons icon for *license_url*.

    ``None`` is returned when the URL host is not creativecommons.org, when
    the path has fewer than three ``/``-separated components, or when the
    third component is not one of the known license keys. *button* chooses
    between the regular and the ``'small'`` icon.
    """
    known = {
        'zero': 'Public Domain',
        'by': 'Creative Commons Attribution License',
        'by-nc': 'Creative Commons Attribution-NonCommercial License',
        'by-nc-nd': 'Creative Commons Attribution-NonCommercial-NoDerivatives License',
        'by-nc-sa': 'Creative Commons Attribution-NonCommercial-ShareAlike License',
        'by-nd': 'Creative Commons Attribution-NoDerivatives License',
        'by-sa': 'Creative Commons Attribution-ShareAlike License'}

    # The Wikipedia public-domain page stands in for CC0.
    if license_url == 'https://en.wikipedia.org/wiki/Public_domain':
        license_url = 'https://creativecommons.org/publicdomain/zero/1.0/'  # pragma: no cover
    cc_url = URL(license_url)
    if cc_url.host() != 'creativecommons.org':
        return None

    path_parts = cc_url.path().split('/')
    if len(path_parts) < 3:
        return None  # pragma: no cover

    code = path_parts[2]
    if code not in known:
        return None

    is_small = button == 'small'
    size_suffix = '-small' if is_small else ''
    height, width = (15, 80) if is_small else (30, 86)
    src = req.static_url('clld:web/static/images/' + 'cc-' + code + size_suffix + '.png')
    img = HTML.img(alt=known[code], src=src, height=height, width=width)
    return HTML.a(img, href=cc_url, rel='license')
コード例 #9
0
    def info(self, url):
        """Interface method to be called when processing new images.

        Chains the provider steps: ``id_from_url`` -> ``info_for_id`` ->
        ``postprocess``, and returns the post-processed result.
        """
        parsed = URL(url)
        identifier = self.id_from_url(parsed, parsed.host(), parsed.path_segments())
        raw_info = self.info_for_id(identifier)
        return self.postprocess(raw_info)
コード例 #10
0
ファイル: helpers.py プロジェクト: clld/clld
def maybe_license_link(req, license, **kw):
    """Return the best available rendering of *license*.

    Preference order: a ``cc_link`` badge if one can be built, an external
    link if *license* parses to a URL with a host, otherwise *license*
    unchanged. ``button`` is consumed from *kw* for the badge; remaining
    keyword arguments go to ``external_link``.
    """
    button = kw.pop('button', 'regular')
    badge = cc_link(req, license, button=button)
    if badge:
        return badge
    parsed = URL(license)
    return external_link(parsed, **kw) if parsed.host() else license
コード例 #11
0
def maybe_license_link(req, license, **kw):
    """Render *license* as a CC badge, an external link, or plain text.

    ``kw['button']`` (default ``'regular'``) is removed from *kw* and passed
    to ``cc_link``; whatever is left in *kw* is passed to ``external_link``.
    """
    style = kw.pop('button', 'regular')
    cc_badge = cc_link(req, license, button=style)
    if not cc_badge:
        as_url = URL(license)
        if not as_url.host():
            # Not a URL with a host: hand the input back untouched.
            return license
        return external_link(as_url, **kw)
    return cc_badge
コード例 #12
0
ファイル: cli.py プロジェクト: sesh/downyt
    def get_video_id_from_url(self, video_url):
        """Return the ``v`` query parameter of a YouTube URL.

        Raises ``DownytError`` when the URL host does not contain
        ``'youtube'``.
        """
        parsed = URL(video_url)
        if 'youtube' in parsed.host():
            return parsed.query_param('v')

        raise DownytError(
            'Provided URL is not from YouTube: {}'.format(parsed)
        )
コード例 #13
0
ファイル: views.py プロジェクト: moriwx/wals3
def blog_feed(request):
    """
    Proxy feeds from the blog, so they can be accessed via XHR requests.

    We also convert RSS to ATOM so that clld's javascript Feed component can read them.

    Raises ``HTTPNotFound`` when the ``path`` request parameter is missing or
    empty, when it names a host of its own, or when the feed cannot be
    fetched because of a ``ConnectionError``.
    """
    if not request.params.get('path'):
        raise HTTPNotFound()
    path = URL(request.params['path'])
    # The parameter is untrusted input: only host-less paths may be proxied.
    # This is an explicit check because ``assert`` is removed under ``-O``.
    if path.host():
        raise HTTPNotFound()
    try:
        return atom_feed(request, request.blog.url(path.as_string()))
    except ConnectionError:  # pragma: no cover
        raise HTTPNotFound()
コード例 #14
0
ファイル: views.py プロジェクト: clld/wals3
def blog_feed(request):
    """
    Proxy feeds from the blog, so they can be accessed via XHR requests.

    We also convert RSS to ATOM so that clld's javascript Feed component can read them.

    Raises ``HTTPNotFound`` when the ``path`` request parameter is missing or
    empty, when it names a host of its own, or when the feed cannot be
    fetched because of a ``ConnectionError``.
    """
    if not request.params.get('path'):
        raise HTTPNotFound()
    path = URL(request.params['path'])
    # The parameter is untrusted input: only host-less paths may be proxied.
    # This is an explicit check because ``assert`` is removed under ``-O``.
    if path.host():
        raise HTTPNotFound()
    try:
        return atom_feed(request, request.blog.url(path.as_string()))
    except ConnectionError:
        raise HTTPNotFound()
コード例 #15
0
ファイル: util.py プロジェクト: afehn/tsammalex
def license_name(license_url):
    """Return a display name for a license URL.

    The GFDL page on Wikimedia Commons maps to a fixed name and the Wikipedia
    "Public domain" page is treated as the CC0 URL. creativecommons.org URLs
    with at least three ``/``-separated path components yield
    ``'Public Domain'`` for ``zero`` and ``'(CC) <KEY>'`` otherwise. Anything
    else is returned as the (possibly CC0-substituted) URL.
    """
    if license_url == "http://commons.wikimedia.org/wiki/GNU_Free_Documentation_License":
        return 'GNU Free Documentation License'

    effective = (
        'http://creativecommons.org/publicdomain/zero/1.0/'
        if license_url == 'http://en.wikipedia.org/wiki/Public_domain'
        else license_url)

    parsed = URL(effective)
    segments = parsed.path().split('/') if parsed.host() == 'creativecommons.org' else []
    if len(segments) < 3:
        return effective

    key = segments[2]
    if key == 'zero':
        return 'Public Domain'
    return '(CC) %s' % key.upper()
コード例 #16
0
def license_name(license_url):
    """Map a license URL to a short human-readable name.

    Returns the input URL (after the public-domain -> CC0 substitution) when
    it is not a creativecommons.org URL or when its path has fewer than
    three ``/``-separated components.
    """
    gfdl = "http://commons.wikimedia.org/wiki/GNU_Free_Documentation_License"
    if license_url == gfdl:
        return 'GNU Free Documentation License'

    # The Wikipedia public-domain page is handled as CC0.
    if license_url == 'http://en.wikipedia.org/wiki/Public_domain':
        license_url = 'http://creativecommons.org/publicdomain/zero/1.0/'

    cc_url = URL(license_url)
    if cc_url.host() == 'creativecommons.org':
        parts = cc_url.path().split('/')
        if len(parts) >= 3:
            code = parts[2]
            return 'Public Domain' if code == 'zero' else '(CC) %s' % code.upper()
    return license_url
コード例 #17
0
ファイル: utils.py プロジェクト: dr4ke616/LazyTorrent
class Segments(object):
    """
    URL segment handler, not intended for direct use. The URL is constructed by
    joining base, path and segments.
    """

    def __init__(self, base, path, segments, defaults):
        """Store the base URL and the ordered segment -> value mapping.

        *segments* and *defaults* are paired positionally; ``zip`` stops at
        the shorter of the two.
        """
        # Preserve the base URL
        self.base = PURL(base, path=path)
        # Map the segments and defaults lists to an ordered dict
        self.segments = OrderedDict(zip(segments, defaults))

    def build(self):
        """Return the base URL with the current segment values appended."""
        # Join base segments and segments
        segments = self.base.path_segments() + tuple(self.segments.values())

        # Create a new URL with the segments replaced
        return self.base.path_segments(segments)

    def full_path(self):
        """Return the built URL as a string without scheme and host.

        Only the first occurrence of the host and of the scheme is removed,
        so path segments or query values that happen to contain either
        string are left intact.
        """
        full_path = self.build().as_string()
        full_path = full_path.replace(self.base.host(), '', 1)
        full_path = full_path.replace(self.base.scheme(), '', 1)
        # Drops the four characters left in front of the path -- presumably
        # the '://' separator plus the leading '/'; TODO confirm against
        # the format produced by as_string().
        return full_path[4:]

    def __str__(self):
        return self.build().as_string()

    def _get_segment(self, segment):
        """Return the current value of *segment* (``KeyError`` if unknown)."""
        return self.segments[segment]

    def _set_segment(self, segment, value):
        """Set the value of *segment*."""
        self.segments[segment] = value

    @classmethod
    def _segment(cls, segment):
        """
        Returns a property capable of setting and getting a segment.
        """
        return property(
            fget=lambda x: cls._get_segment(x, segment),
            fset=lambda x, v: cls._set_segment(x, segment, v),
        )
コード例 #18
0
def main(args):
    """Build the Tsammalex dataset objects from the data repository.

    Reads the files under ``<args.data_repos>/tsammalexdata/data``: registers
    the dataset and contribution, the bibliography from ``sources.bib``,
    ecoregions and countries, the CSV-backed models, second-language links
    between languoids, and file records for images hosted on
    ``edmond.mpdl.mpg.de``.
    """
    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    data.add(common.Contribution,
             'tsammalex',
             name="Tsammalex",
             id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec,
                 rec.id,
                 _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    # Filled by languoid_visitor: row[0] -> row[8] of each languoid row.
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(data,
                           lang,
                           lang.id.split('-')[0],
                           None,
                           glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    # taxa.json entries keyed by their 'id'.
    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        # Swap the trailing alphabetic extension for '.jpg' and the
        # '/original/' path component for '/<type_>/'. Raw string: '\.' is
        # not a valid escape sequence in an ordinary string literal.
        return re.sub(r'\.[a-zA-Z]+$', '.jpg',
                      source_url).replace('/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            # Only images hosted on edmond.mpdl.mpg.de are registered.
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(url=image.source_url,
                            thumbnail=image_url(image.source_url, 'thumbnail'),
                            web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(object=data['Taxon'][image.taxa__id],
                                       id=image.id,
                                       name=image.tags,
                                       jsondata=jsondata,
                                       mime_type=image.mime_type)
            # One ImageData row per non-empty metadata field.
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
コード例 #19
0
def get_image_info(img):
    """Return provider info for the first field/provider pair that matches.

    The ``source``, ``source_url`` and ``id`` entries of *img* are tried in
    that order against every entry of ``PROVIDERS``; the first provider whose
    ``id_from_url`` result is truthy supplies the result. Returns ``None``
    when nothing matches.
    """
    pairs = (
        (field, provider)
        for field in ['source', 'source_url', 'id']
        for provider in PROVIDERS)
    for field, provider in pairs:
        parsed = URL(img[field])
        if provider.id_from_url(parsed, parsed.host(), parsed.path_segments()):
            return provider.info(img[field])
コード例 #20
0
ファイル: helpers.py プロジェクト: clld/clld
def maybe_external_link(text, **kw):
    """Wrap *text* in an external link if it is an http(s) URL with a host.

    Otherwise *text* is returned unchanged.
    """
    parsed = URL(text)
    if not parsed.host():
        return text
    if parsed.scheme() not in ('http', 'https'):
        return text
    return external_link(text, **kw)
コード例 #21
0
        sys.exit(2)

    # Do some sanity checks on the config
    requiredAttribs = [
        'serviceName', 'package', 'components', 'configurations'
    ]
    for attrib in requiredAttribs:
        if not attrib in service_config:
            log.error("Invalid configuration. Missing required attribute '%s'",
                      attrib)
            sys.exit(3)

    log.info('Installing service: %s on ambari host: %s',
             service_config['serviceName'], args.ambari_host)
    ambari_host_uri = URL(args.ambari_host)
    ambari_client = Ambari(ambari_host_uri.host(),
                           port=ambari_host_uri.port(),
                           protocol=ambari_host_uri.scheme(),
                           username=args.username,
                           password=args.password,
                           identifier='hdiapps')
    # If this is being invoked from outside the cluster, we must fixup the href references contained within the responses
    ambari_client.client.request_params['hooks'] = dict(
        response=shared_lib.Fixup(ambari_host_uri).fixup)
    # Assume we only have 1 cluster managed by this Ambari installation
    cluster = ambari_client.clusters.next()
    log.debug('Cluster: %s, href: %s', cluster.cluster_name, cluster._href)

    # Pull in any extra dynamic configuration
    if args.extra_config:
        try:
コード例 #22
0
ファイル: initializedb.py プロジェクト: clld/tsammalex
def main(args):
    """Build the Tsammalex dataset objects from the data repository.

    On PostgreSQL a ``ducet`` index on the collation key of
    ``common.Value.name`` is created first. Then the files under
    ``<args.data_repos>/tsammalexdata/data`` are read: dataset and
    contribution, the bibliography from ``sources.bib``, ecoregions and
    countries, the CSV-backed models, second-language links between
    languoids, and file records for images hosted on ``edmond.mpdl.mpg.de``.
    """
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    # Filled by languoid_visitor: row[0] -> row[8] of each languoid row.
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    # taxa.json entries keyed by their 'id'.
    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        # Swap the trailing alphabetic extension for '.jpg' and the
        # '/original/' path component for '/<type_>/'. Raw string: '\.' is
        # not a valid escape sequence in an ordinary string literal.
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            # Only images hosted on edmond.mpdl.mpg.de are registered.
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            # One ImageData row per non-empty metadata field.
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
コード例 #23
0
        service_config = config_request.json()
        log.debug('Service config: %s', service_config)
    except:
        log.error("Invalid configuration URI", exc_info=True)
        sys.exit(2)

    # Do some sanity checks on the config
    requiredAttribs = ['serviceName', 'package', 'components', 'configurations']
    for attrib in requiredAttribs:
        if not attrib in service_config:
            log.error("Invalid configuration. Missing required attribute '%s'", attrib)
            sys.exit(3)

    log.info('Installing service: %s on ambari host: %s', service_config['serviceName'], args.ambari_host)
    ambari_host_uri = URL(args.ambari_host)
    ambari_client = Ambari(ambari_host_uri.host(), port=ambari_host_uri.port(), protocol=ambari_host_uri.scheme(), username=args.username, password=args.password, identifier='hdiapps')
    # If this is being invoked from outside the cluster, we must fixup the href references contained within the responses
    ambari_client.client.request_params['hooks'] = dict(response=shared_lib.Fixup(ambari_host_uri).fixup)
    # Assume we only have 1 cluster managed by this Ambari installation 
    cluster = ambari_client.clusters.next()
    log.debug('Cluster: %s, href: %s', cluster.cluster_name, cluster._href)

    # Pull in any extra dynamic configuration
    if args.extra_config:
        try:
            extra_config = json.loads(args.extra_config)
            log.debug('Applying dynamic service configuration specified on command-line: %s', extra_config)
        except:
            log.warning('Extra configuration specified by the -x argument could not be parsed as JSON. The value was \'%s\'. Details: ', args.extra_config, exc_info=True)
            extra_config = {}    
    else:
コード例 #24
0
ファイル: taxa.py プロジェクト: clld/tsammalex-data
def wikipedia_url(s):  # pragma: no cover
    """Return *s* if it is an http(s) URL whose host contains ``'wikipedia.'``.

    Returns ``None`` otherwise.
    """
    parsed = URL(s)
    if parsed.scheme() not in ('http', 'https'):
        return None
    return s if 'wikipedia.' in parsed.host() else None
コード例 #25
0
 def url_parts(self, url):
     """Parse *url* and return ``(URL object, host, path segments)``."""
     parsed = URL(url)
     host = parsed.host()
     segments = parsed.path_segments()
     return (parsed, host, segments)
コード例 #26
0
# Print a URL object and its string form (``str_url`` is defined above this
# excerpt).
print(str_url)
print(str_url.as_string())
# Build a URL by passing each component as a keyword argument.
argument_url = URL(scheme='https',
                   host='www.google.com',
                   path='/search',
                   query='q=google')
print(argument_url)
print(argument_url.as_string())
# Build a URL by chaining component calls on an empty URL.
inline_url = URL().scheme('https').domain('www.google.com').path(
    'search').query_param('q', 'google')
print(inline_url)
print(inline_url.as_string())

# Parse a URL string and print each accessor's result in turn.
u = URL('postgres://*****:*****@localhost:1234/test?ssl=true')
print(u.scheme())
print(u.host())
print(u.domain())
print(u.username())
print(u.password())
print(u.netloc())
print(u.port())
print(u.path())
print(u.query())
print(u.path_segments())
print(u.query_param('ssl'))
print(u.query_param('ssl', as_list=True))
print(u.query_params())
print(u.has_query_param('ssl'))
print(u.subdomains())

# Alternate constructor taking the URL as a string.
u = URL.from_string('https://github.com/minwook-shin')
コード例 #27
0
def maybe_external_link(text, **kw):
    """Return an external link for *text* when it looks like a web URL.

    "Looks like" means it parses to a URL with a host and an ``http`` or
    ``https`` scheme; any other input is handed back as-is.
    """
    candidate = URL(text)
    is_web_url = bool(candidate.host()) and candidate.scheme() in ('http', 'https')
    if is_web_url:
        return external_link(text, **kw)
    return text
コード例 #28
0
ファイル: taxa.py プロジェクト: xrotwang/tsammalex-data
def wikipedia_url(s):  # pragma: no cover
    """Pass *s* through only if it is an http(s) Wikipedia URL.

    The host must contain ``'wikipedia.'``; otherwise ``None`` is returned.
    """
    candidate = URL(s)
    if candidate.scheme() in ('http', 'https'):
        if 'wikipedia.' in candidate.host():
            return s
    return None