Example #1
    def login_url(cls, next_url=""):
        return_url = furl(settings.BASE_URL)
        return_url.path = "auth"

        ulb_url = furl("https://www.ulb.ac.be/commons/intranet")
        ulb_url.args["_prt"] = "ulb:gehol"
        ulb_url.args["_ssl"] = "on"
        ulb_url.args["_prtm"] = "redirect"
        ulb_url.args["_appl"] = return_url

        return ulb_url
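Example #1 above nests one furl-built URL inside another URL's query string. A minimal standalone sketch of the same pattern (the hostnames and parameter names here are placeholders, not the ones used by the project):

from furl import furl

return_url = furl("https://myapp.example")
return_url.path = "auth"

login_url = furl("https://sso.example/login")
login_url.args["redirect"] = return_url.url  # the embedded URL is percent-encoded when serialized
print(login_url.url)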
Example #2
def assert_urls_equal(url1, url2):
    furl1 = furl.furl(url1)
    furl2 = furl.furl(url2)
    for attr in ['scheme', 'host', 'port']:
        setattr(furl1, attr, None)
        setattr(furl2, attr, None)
    assert_equal(furl1, furl2)
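A quick usage sketch for the helper above; the two URLs are made up and differ only in scheme, host and port, so the assertion should pass:

assert_urls_equal('http://localhost:5000/v2/users/?page=2',
                  'https://api.example.org/v2/users/?page=2')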
Example #3
def title_from_id(identifier_key, identifier_value):
    if identifier_key is None or identifier_value is None:
        raise AttributeError("Neither identifier key nor value were supplied")
    try:
        if identifier_key == "imdbid":
            if identifier_value[0:2] != "tt":
                identifier_value = "tt%s" % identifier_value
            url = furl("http://www.omdbapi.com").add({"i": identifier_value, "plot": "short", "r": "json"}).tostr()
            omdb = webaccess.get(url)
            return omdb.json()["Title"]

        if identifier_key not in ("rid", "tvdbid"):
            raise AttributeError("Unknown identifier %s" % identifier_key)

        tvmaze_key = "tvrage" if identifier_key == "rid" else "thetvdb"
        tvmaze = webaccess.get(furl("http://api.tvmaze.com/lookup/shows").add({tvmaze_key: identifier_value}).url)
        if tvmaze.status_code == 404:
            #Unfortunately TVMaze returns a 404 for unknown/invalid IDs
            raise ExternalApiInfoException("Unable to find id %s and value %s at TVMaze" % (identifier_key, identifier_value))
        tvmaze.raise_for_status()
        
        return tvmaze.json()["name"]

    except (HTTPError, ConnectionError, ReadTimeout) as e:
        logger.exception("Unable to retrieve title by id %s and value %s" % (identifier_key, identifier_value))
        raise ExternalApiInfoException(str(e))
    except Exception as e:
        logger.exception("Unable to retrieve title by id %s and value %s" % (identifier_key, identifier_value))
        raise ExternalApiInfoException(str(e))
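Example #3 leans on furl's chainable .add() to build the lookup URLs. A minimal sketch of that call pattern (the ID value is only illustrative):

from furl import furl

url = furl("http://api.tvmaze.com/lookup/shows").add({"thetvdb": "81189"}).url
# -> "http://api.tvmaze.com/lookup/shows?thetvdb=81189"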
Example #4
    def fetch_records(self, url):
        resp = self.requests.get(url)
        resp_xml = etree.XML(resp.content)
        num_records = int(resp_xml.xpath('//search_results/@count')[0])

        if num_records > 0:
            # create a new URL to request all results
            url = furl(url).add(query_params={
                'count': num_records
            }).url

            all_records_resp = self.requests.get(url)
            all_records_doc = etree.XML(all_records_resp.content)

            # retrieve the URLs for each document to make requests for their full content
            record_urls = [
                furl(record.xpath('url/node()')[0]).set(query_params={
                    'displayxml': 'true'
                }).url
                for record in all_records_doc.xpath('//clinical_study')
            ]

            total = len(record_urls)
            for i, url in enumerate(record_urls):
                logger.debug('[%d / %d] Requesting %s', i, total, url)
                record_resp = self.requests.get(url)

                doc = etree.fromstring(record_resp.content, parser=etree.XMLParser(recover=True))
                yield doc.xpath('//nct_id/node()')[0], etree.tostring(doc, encoding=str)
Example #5
def addon_view_file(auth, node, file_node, version):
    # TODO: resolve circular import issue
    from website.addons.wiki import settings as wiki_settings

    if isinstance(version, tuple):
        version, error = version
        error = error.replace('\n', '').strip()
    else:
        error = None

    ret = serialize_node(node, auth, primary=True)

    if file_node._id + '-' + version._id not in node.file_guid_to_share_uuids:
        node.file_guid_to_share_uuids[file_node._id + '-' + version._id] = uuid.uuid4()
        node.save()

    if ret['user']['can_edit']:
        sharejs_uuid = str(node.file_guid_to_share_uuids[file_node._id + '-' + version._id])
    else:
        sharejs_uuid = None

    download_url = furl.furl(request.url.encode('utf-8')).set(args=dict(request.args, **{
        'direct': None,
        'mode': 'render',
        'action': 'download',
    }))

    render_url = furl.furl(settings.MFR_SERVER_URL).set(
        path=['render'],
        args={'url': download_url.url}
    )

    ret.update({
        'urls': {
            'render': render_url.url,
            'mfr': settings.MFR_SERVER_URL,
            'sharejs': wiki_settings.SHAREJS_URL,
            'gravatar': get_gravatar(auth.user, 25),
            'files': node.web_url_for('collect_file_trees'),
            'archived_from': get_archived_from_url(node, file_node) if node.is_registration else None,
        },
        'error': error,
        'file_name': file_node.name,
        'file_name_title': os.path.splitext(file_node.name)[0],
        'file_name_ext': os.path.splitext(file_node.name)[1],
        'file_path': file_node.path,
        'sharejs_uuid': sharejs_uuid,
        'provider': file_node.provider,
        'materialized_path': file_node.materialized_path,
        'extra': version.metadata.get('extra', {}),
        'size': version.size if version.size is not None else 9966699,
        'private': getattr(node.get_addon(file_node.provider), 'is_private', False),
        'file_tags': [tag._id for tag in file_node.tags],
        'file_guid': file_node.get_guid()._id,
        'file_id': file_node._id,
        'allow_comments': file_node.provider in settings.ADDONS_COMMENTABLE
    })

    ret.update(rubeus.collect_addon_assets(node))
    return ret
Example #6
    def switch_to_test_mode(self, instance_number=None):
        mongo_url   = furl(self.settings.get('mongo', 'url'))
        server_port = self.settings.getint('test', 'server_port')
        server_url  = furl(self.settings.get('test', 'server_url'))

        if instance_number is not None:
            mongo_url.path.segments[0] = "test_%d_%s" % (instance_number, mongo_url.path.segments[0])
            server_port += instance_number
            if not server_url.port or server_url.port != self.settings.getint('test', 'server_port'):
                raise Exception("Can't detect how to adjust server url for instance: %d" % instance_number)
            server_url.port = server_port
        else:
            mongo_url.path.segments[0] = "test_%s" % mongo_url.path.segments[0]

        self.settings.set('mongo', 'url', str(mongo_url))
        self.settings.set('server', 'port', str(server_port))
        self.settings.set('server', 'url', str(server_url))
        self.settings.set('ratelimit_authentication', 'allowed_failures', '10000')

        @self.route('/__test_drop_mongoengine_cache__')
        def endpoint():
            self.logger.debug("Received /__test_drop_mongoengine_cache__ request, dropping mongoengine cached collections/connections")
            self.drop_mongoengine_cached_handles()
            return ''

        self.in_test_mode = True
        self.init_application()
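Example #6 rewrites the database name and the server port by mutating furl objects in place. A stripped-down sketch of those two operations (the URLs are placeholders):

from furl import furl

mongo_url = furl("mongodb://localhost:27017/hydra")
mongo_url.path.segments[0] = "test_%s" % mongo_url.path.segments[0]  # database becomes test_hydra

server_url = furl("http://localhost:8080/")
server_url.port = 8081

print(str(mongo_url), str(server_url))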
Example #7
def get_url_args(doc, defaults=None):
    """Return url args recovered from django_full_path cookie in
    the bokeh request header.

    If defaults values are provided, overwrite the default values
    obtained from the API
    """

    args = get_data('defaults')

    # overwrite api default values
    if defaults:
        for key in defaults:
            args[key] = defaults[key]

    r = doc().session_context.request
    if r:
        if 'django_full_path' in r.cookies:
            django_full_path = r.cookies['django_full_path'].value
            tmp = furl(django_full_path).args
            for key in tmp:
                # overwrite default values with those passed
                # as url args, make sure the url arg (key) is valid
                if key in args:
                    args[key] = tmp[key]

            # the bokeh app name is the second segment of the url path
            args['bokeh_app'] = furl(django_full_path).path.segments[1]

    return args
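Example #7 relies on two furl accessors: .args for the query parameters and .path.segments for the path parts. A small sketch (the path is invented):

from furl import furl

full_path = "/dashboard/exposure/?night=20200101&camera=b0"
print(furl(full_path).args)           # query parameters: night, camera
print(furl(full_path).path.segments)  # e.g. ['dashboard', 'exposure', ''] -- segments[1] is the app name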
Example #8
def get_featureOfInterest(query_uri_base, aws_urn=None):
    # assemble SOS query string for one or all stations
    q = None
    if aws_urn is not None:
        q = furl(query_uri_base + '/service').add({
            'service': 'SOS',
            'version': '2.0.0',
            'request': 'GetFeatureOfInterest',
            'featureOfInterest': aws_urn
        }).url
    else:
        q = furl(query_uri_base + '/sos/kvp').add({
            'service': 'SOS',
            'version': '2.0.0',
            'request': 'GetFeatureOfInterest',
        }).url

    # run the query request
    creds = json.load(open('creds.json'))
    auth = HTTPProxyAuth(creds['username'], creds['password'])
    ga_proxy = {"http": creds['proxy']}
    headers = {'accept': 'application/json'}
    r = requests.get(q, headers=headers, proxies=ga_proxy, auth=auth)

    results = json.loads(r.text)

    # return one or all
    if aws_urn is not None:
        return results['featureOfInterest'][0]
    else:
        #return sorted(results['featureOfInterest'], key=lambda k: k['name'])
        return sorted(results['featureOfInterest'])
Example #9
    def fetch_records(self, url, end_day):
        page, detail = 0, None

        while True:
            page += 1
            resp = self.requests.get(furl(url).add(query_params={
                'page': page,
            }).url)

            if resp.status_code == 422:
                # We've asked for too much. Time to readjust date range
                # Thanks for leaking variables python
                page, url = 0, furl(url).add(query_params={
                    'modified_date': pendulum.parse(detail['modified_date']).date().isoformat()
                })
                continue

            for item in resp.json():
                resp = self.requests.get(item['url'])
                detail = resp.json()

                if pendulum.parse(detail['modified_date']).date() > end_day:
                    return

                yield item['url'], detail

            if len(resp.json()) < self.page_size:
                return  # We've hit the end of our results
Example #10
def make_query(url, page):
    if page != 1:
        return furl(url).remove(['page']).add({"page": page}).url.split('?')[1]
    try:
        return furl(url).remove(['page']).url.split('?')[1]
    except IndexError:
        return ""
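Usage sketch for make_query above (the URL is made up); page 1 drops the parameter entirely, any other page replaces it:

make_query("http://example.org/list?sort=name&page=3", 5)  # e.g. "sort=name&page=5"
make_query("http://example.org/list", 1)                   # "" -- no query string at all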
Example #11
    def fetch_records(self, url):
        resp = self.requests.get(url)
        resp_xml = etree.XML(resp.content)
        num_records = int(resp_xml.xpath('//search_results/@count')[0])

        if num_records > 0:
            # create a new URL to request all results
            url = furl(url).add(query_params={
                'count': num_records
            }).url

            all_records_resp = self.requests.get(url)
            all_records_doc = etree.XML(all_records_resp.content)

            # retrieve the URLs for each document to make requests for their full content
            record_urls = [
                furl(record.xpath('url/node()')[0]).set(query_params={
                    'displayxml': 'true'
                }).url
                for record in all_records_doc.xpath('//clinical_study')
            ]

            logger.info("There are {} record urls to harvest - this may take a while...".format(len(record_urls)))
            for url in record_urls:
                try:
                    record_resp = self.requests.get(url)
                except self.requests.exceptions.ConnectionError as e:
                    logger.warning('Connection error: {}, wait a bit...'.format(e))
                    time.sleep(30)
                    record_resp = self.requests.get(url)

                doc = etree.XML(record_resp.content)
                record = etree.tostring(doc)
                doc_id = doc.xpath('//nct_id/node()')[0]
                yield (doc_id, record)
Example #12
  def test_remove(self):
    url = 'http://host:69/a/big/path/?a=a&b=b&s=s+s#a frag?with=args&a=a'
    
    fu = furl.furl(url)
    assert fu == fu.remove(fragment=True, args=['a', 'b'], path='path',
                           port=True)
    assert fu.url == 'http://host/a/big/?s=s+s'

    # No errors are thrown when removing url components that don't exist.
    fu = furl.furl(url)
    assert fu == fu.remove(fragment_path=['asdf'], fragment_args=['asdf'],
                           args=['asdf'], path=['ppp', 'ump'])
    assert self._param(fu.url, 'a', 'a')
    assert self._param(fu.url, 'b', 'b')
    assert self._param(fu.url, 's', 's s')
    assert fu.pathstr == '/a/big/path/'
    assert fu.fragment.pathstr == 'a frag'
    assert fu.fragment.args == {'a':'a', 'with':'args'}

    # Path as a list of paths to join before removing.
    assert fu == fu.remove(fragment_path='a frag', fragment_args=['a'],
                           query_params=['a','b'], path=['big', 'path'],
                           port=True)
    assert fu.url == 'http://host/a/?s=s+s#with=args'

    assert fu == fu.remove(path=True, query=True, fragment=True)
    assert fu.url == 'http://host'
Example #13
def download(
    urls: List[Tuple[str, Union[str, None]]], verbose: bool = False, force: bool = False
) -> None:
    for address, filename in urls:
        if not address:
            continue
        try:
            host = ".".join(furl(address).host.split(".")[-2:])
            try:
                Story = AVAILABLE_SITES[host]
                story = Story(furl(address), verbose)
                story.force = force
                if filename:
                    story.filename = filename
                story.run()
            except KeyError:
                click.echo(
                    f"{__file__} is currently only able to download from {list2text(AVAILABLE_SITES.keys())}."
                )
        except AttributeError as e:
            print(e)
            error = "There were problems with parsing the URL."
            with open("pyffdl.log", "a") as fp:
                click.echo(error, file=fp)
            click.echo(error, err=True)
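Example #13 reduces the full hostname to its last two labels to pick a site handler. The same furl lookup in isolation (the address is only an illustration):

from furl import furl

host = ".".join(furl("https://www.fanfiction.net/s/1234/1/").host.split(".")[-2:])
# host == "fanfiction.net"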
Example #14
    def build_uri(self, uri, start, end, width='*', height='*', composite_to='*.*',
                  bg_url=None, bg_width='*', bg_height='*', **kwds):
        """Create a cropped URL in Akamai

        >>> crop = AkamaiCrop()
        >>> crop.build_uri(
        ...    'https://example.com/test.jpg',
        ...    Coord(10, 20),
        ...    Coord(30, 40),
        ... )
        'https://example.com/test.jpg?crop=30:40%3B10,20'
        """
        furl_obj = furl.furl(uri)
        furl_obj.args['crop'] = '{}:{};{},{}'.format(
            (end.x - start.x), (end.y - end.x), start.x, start.y)
        furl_obj.args['resize'] = '{}:{}'.format(width, height)
        akamai_url = furl_obj.url

        if bg_url:
            furl_obj.args['composite-to'] = composite_to

            bg_furl_obj = furl.furl(bg_url)
            bg_furl_obj.args['resize'] = '{}:{}'.format(bg_width, bg_height)
            akamai_url += ('|' + bg_furl_obj.url)

        return akamai_url
Example #15
def parse_i18n(
        url,
        language_codes,
        default_language_code=None,
):
    """
    Takes an url containing a "*" character and creates an index per language
    replacing the "*" with the language code, except for the default language.
    :param url:
    :return:
    """
    if '*' not in furl(url).path.segments[0]:
        index_name = furl(url).path.segments[0]
        raise Exception(
            (
                'The index name in the haystack url {} is not supported. Must '
                'have a * in its name for multilingual index support'
            ).format(index_name)
        )
    connections = {}
    for language_code in language_codes:
        if default_language_code and language_code == default_language_code:
            connections['default'] = parse(url, suffix='default')
        else:
            connections[language_code] = parse(url, suffix=language_code)
    return connections
Example #16
def get_root_url(): 
    f = furl()
    f.scheme = request.scheme
    f.host = furl(request.host_url).host
    f.port = config.settings.main.port
    if config.settings.main.urlBase:
        f.path = config.settings.main.urlBase
    return str(f) + "/"
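Example #16 assembles an absolute root URL from individual components on an empty furl. A stripped-down sketch with placeholder values:

from furl import furl

f = furl()
f.scheme = "http"
f.host = "127.0.0.1"
f.port = 5075
f.path = "nzbhydra"
print(str(f) + "/")  # e.g. "http://127.0.0.1:5075/nzbhydra/"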
Example #17
    def __init__(self, url, params=None):
        self._url = url
        self._furl = furl.furl(url)
        self._params = furl.furl(url).args
        self._furl.set(args={})

        params = params or {}
        for (k, v) in params.items():
            self._params.add(k, v)
Example #18
    def get_details_link(self, guid):
        if "nzbgeek" in self.settings.host:
            f = furl(self.settings.host)
        else:
            f = furl(self.settings.host.replace("api.", "www."))  # Quick and dirty fix so it doesn't link to the API

        f.path.add("details")
        f.path.add(guid)
        return f.url
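Example #18 appends path segments with path.add(). Minimal sketch (host and GUID are placeholders):

from furl import furl

f = furl("https://indexer.example")
f.path.add("details")
f.path.add("abc123")
print(f.url)  # e.g. "https://indexer.example/details/abc123"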
Example #19
 def __init__(self, total, page=1, per_page=10, list_count=10, base_uri=None):
     self.total = total
     self.list_count = list_count
     self.page = page
     self.per_page = per_page
     if base_uri:
         self.uri = furl(base_uri)
     else:
         self.uri = furl('')
     self.calculate()
Example #20
 def test_auth_download(self):
     url = self.build_url()
     res = self.test_app.get(url)
     assert_equal(res.json['auth'], views.make_auth(self.user))
     assert_equal(res.json['credentials'], self.node_addon.serialize_waterbutler_credentials())
     assert_equal(res.json['settings'], self.node_addon.serialize_waterbutler_settings())
     expected_url = furl.furl(self.node.api_url_for('create_waterbutler_log', _absolute=True))
     observed_url = furl.furl(res.json['callback_url'])
     observed_url.port = expected_url.port
     assert_equal(expected_url, observed_url)
Example #21
def create_file(request, uri, headers):
    folder_path = furl(uri).args['path']
    # folder_name = folder_path.split('/')[1]
    # provider_name = furl(uri).args['path']
    nid = furl(uri).args['nid']
    provider = session.query(File).filter(File.parent == None, File.node_id == nid).one()
    new_file = create_new_file(provider)
    resp = json.dumps({
        'data':new_file.as_dict()
    })
    return (200, headers, resp)
Example #22
def assert_urls_equal(url1, url2):
    furl1 = furl.furl(url1)
    furl2 = furl.furl(url2)
    for attr in ['scheme', 'host', 'port']:
        setattr(furl1, attr, None)
        setattr(furl2, attr, None)
    # Note: furl params are ordered and cause trouble
    assert_equal(dict(furl1.args), dict(furl2.args))
    furl1.args = {}
    furl2.args = {}
    assert_equal(furl1, furl2)
Example #23
 def test_auth_download(self):
     url = self.build_url()
     res = self.app.get(url, auth=self.user.auth)
     data = jwt.decode(jwe.decrypt(res.json['payload'].encode('utf-8'), self.JWE_KEY), settings.WATERBUTLER_JWT_SECRET, algorithm=settings.WATERBUTLER_JWT_ALGORITHM)['data']
     assert_equal(data['auth'], views.make_auth(self.user))
     assert_equal(data['credentials'], self.node_addon.serialize_waterbutler_credentials())
     assert_equal(data['settings'], self.node_addon.serialize_waterbutler_settings())
     expected_url = furl.furl(self.node.api_url_for('create_waterbutler_log', _absolute=True, _internal=True))
     observed_url = furl.furl(data['callback_url'])
     observed_url.port = expected_url.port
     assert_equal(expected_url, observed_url)
Example #24
def addon_view_file(auth, node, file_node, version):
    # TODO: resolve circular import issue
    from website.addons.wiki import settings as wiki_settings

    if isinstance(version, tuple):
        version, error = version
        error = error.replace("\n", "").strip()
    else:
        error = None

    ret = serialize_node(node, auth, primary=True)

    if file_node._id not in node.file_guid_to_share_uuids:
        node.file_guid_to_share_uuids[file_node._id] = uuid.uuid4()
        node.save()

    if ret["user"]["can_edit"]:
        sharejs_uuid = str(node.file_guid_to_share_uuids[file_node._id])
    else:
        sharejs_uuid = None

    download_url = furl.furl(request.url.encode("utf-8")).set(
        args=dict(request.args, **{"direct": None, "mode": "render", "action": "download"})
    )

    render_url = furl.furl(settings.MFR_SERVER_URL).set(path=["render"], args={"url": download_url.url})

    ret.update(
        {
            "urls": {
                "render": render_url.url,
                "mfr": settings.MFR_SERVER_URL,
                "sharejs": wiki_settings.SHAREJS_URL,
                "gravatar": get_gravatar(auth.user, 25),
                "files": node.web_url_for("collect_file_trees"),
            },
            "error": error,
            "file_name": file_node.name,
            "file_name_title": os.path.splitext(file_node.name)[0],
            "file_name_ext": os.path.splitext(file_node.name)[1],
            "file_path": file_node.path,
            "sharejs_uuid": sharejs_uuid,
            "provider": file_node.provider,
            "materialized_path": file_node.materialized_path,
            "extra": version.metadata.get("extra", {}),
            "size": version.size if version.size is not None else 9966699,
            "private": getattr(node.get_addon(file_node.provider), "is_private", False),
            "file_tags": [tag._id for tag in file_node.tags],
        }
    )

    ret.update(rubeus.collect_addon_assets(node))
    return ret
Example #25
def addon_view_or_download_file(auth, path, provider, **kwargs):
    extras = request.args.to_dict()
    action = extras.get('action', 'view')
    node = kwargs.get('node') or kwargs['project']

    node_addon = node.get_addon(provider)

    if not path:
        raise HTTPError(httplib.BAD_REQUEST)

    if not node_addon:
        raise HTTPError(httplib.BAD_REQUEST, {
            'message_short': 'Bad Request',
            'message_long': 'The add-on containing this file is no longer connected to the {}.'.format(node.project_or_component)
        })

    if not node_addon.has_auth:
        raise HTTPError(httplib.UNAUTHORIZED, {
            'message_short': 'Unauthorized',
            'message_long': 'The add-on containing this file is no longer authorized.'
        })

    if not node_addon.complete:
        raise HTTPError(httplib.BAD_REQUEST, {
            'message_short': 'Bad Request',
            'message_long': 'The add-on containing this file is no longer configured.'
        })

    if not path.startswith('/'):
        path = '/' + path

    guid_file, created = node_addon.find_or_create_file_guid(path)

    if guid_file.guid_url != request.path:
        guid_url = furl.furl(guid_file.guid_url)
        guid_url.args.update(extras)
        return redirect(guid_url)

    guid_file.maybe_set_version(**extras)

    if request.method == 'HEAD':
        download_url = furl.furl(guid_file.download_url)
        download_url.args.update(extras)
        download_url.args['accept_url'] = 'false'
        return make_response(('', 200, {'Location': download_url.url}))

    if action == 'download':
        download_url = furl.furl(guid_file.download_url)
        download_url.args.update(extras)

        return redirect(download_url.url)

    return addon_view_file(auth, node, node_addon, guid_file, extras)
Example #26
    def testUrlGeneration(self):
        w = NzbClub(config.indexerSettings.nzbclub)
        self.args = SearchRequest(query="a showtitle", season=1, episode=2)
        urls = w.get_showsearch_urls(self.args)
        self.assertEqual(1, len(urls))
        print(urls[0])
        self.assertEqual('a showtitle s01e02 or a showtitle 1x02', furl(urls[0]).args["q"])

        self.args = SearchRequest(query="a showtitle", season=1, episode=None)
        urls = w.get_showsearch_urls(self.args)
        self.assertEqual(1, len(urls))
        self.assertEqual('a showtitle s01 or a showtitle "season 1"', furl(urls[0]).args["q"])
Example #27
 def test_auth_bad_cookie(self):
     url = self.build_url(cookie=self.cookie)
     res = self.app.get(url, expect_errors=True)
     assert_equal(res.status_code, 200)
     data = jwt.decode(res.json, settings.WATERBUTLER_JWT_SECRET, algorithm=settings.WATERBUTLER_JWT_ALGORITHM)['data']
     assert_equal(data['auth'], views.make_auth(self.user))
     assert_equal(data['credentials'], self.node_addon.serialize_waterbutler_credentials())
     assert_equal(data['settings'], self.node_addon.serialize_waterbutler_settings())
     expected_url = furl.furl(self.node.api_url_for('create_waterbutler_log', _absolute=True))
     observed_url = furl.furl(data['callback_url'])
     observed_url.port = expected_url.port
     assert_equal(expected_url, observed_url)
Example #28
    def testUrlGeneration(self):
        w = Binsearch(getIndexerSettingByName("binsearch"))
        self.args = SearchRequest(query="a showtitle", season=1, episode=2)
        urls = w.get_showsearch_urls(self.args)
        self.assertEqual(2, len(urls))
        self.assertEqual('a showtitle s01e02', furl(urls[0]).args["q"])
        self.assertEqual('a showtitle 1x02', furl(urls[1]).args["q"])

        self.args = SearchRequest(query="a showtitle", season=1, episode=None)
        urls = w.get_showsearch_urls(self.args)
        self.assertEqual(2, len(urls))
        self.assertEqual('a showtitle s01', furl(urls[0]).args["q"])
        self.assertEqual('a showtitle "season 1"', furl(urls[1]).args["q"])
Example #29
    def process_formdata(self, valuelist):
        """
        Process data received over the wire from a form.

        This will be called during form construction with data supplied
        through the `formdata` argument.

        :param valuelist: A list of strings to process.
        """
        if valuelist:
            self.data = furl.furl(valuelist[0])
        else:
            self.data = furl.furl('')
Example #30
    def fetch_records(self, url, start_date, end_date):
        count, page = 0, 0
        resp = self.requests.get(furl(url).set(query_params={'page': page}))
        total = BeautifulSoup(resp.content, 'html.parser').find(id='page-title').text.split(' ')[0].strip().replace(',', '')

        try:
            total = int(total)
        except ValueError:
            # Handle the case of "No" results
            assert total == 'No'
            total = 0

        logging.info('Found %d results from biorxiv', total)

        while count < total:
            links = re.findall(b'href="(/content/early/[^"]+?/[^"]+)"', resp.content)

            logger.info('On document %d of %d (%d%%)', count, total, (count / total) * 100)

            for link in links:
                article = self.requests.get('http://biorxiv.org' + link.decode())
                if article.status_code // 100 != 2:
                    logger.warning('Got non-200 status %s from %s', article, link)
                    continue
                article.raise_for_status()
                soup = BeautifulSoup(article.content, 'lxml')

                data = {
                    'subject-areas': [
                        subject.a.text.strip()
                        for subject in
                        soup.find_all(**{'class': 'highwire-article-collection-term'})
                    ]
                }

                for meta in BeautifulSoup(article.content, 'lxml').find_all('meta'):
                    if 'name' not in meta.attrs:
                        continue
                    if meta.attrs['name'] in data:
                        if not isinstance(data[meta.attrs['name']], list):
                            data[meta.attrs['name']] = [data[meta.attrs['name']]]
                        data[meta.attrs['name']].append(meta.attrs['content'])
                    else:
                        data[meta.attrs['name']] = meta.attrs['content']

                count += 1
                yield link.decode(), data

            page += 1
            resp = self.requests.get(furl(url).set(query_params={'page': page}))
Example #31
 def get_logout_url(self, service_url):
     url = furl.furl(self.BASE_URL)
     url.path.segments.append('logout')
     url.args['service'] = service_url
     return url.url
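Example #31 appends a path segment and a service query argument to a CAS base URL. An equivalent standalone sketch (the base URL is hypothetical):

import furl

url = furl.furl("https://cas.example/cas")
url.path.segments.append("logout")
url.args["service"] = "https://app.example/goodbye"
print(url.url)  # .../cas/logout?service=... with the value percent-encoded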
Example #32
TRANSCRIPT = re.compile('Earnings Call Transcript')

next_page = True
page = 1
driver = webdriver.Firefox()
while next_page:
    print(f'Page: {page}')
    url = f'{SA_URL}/earnings/earnings-call-transcripts/{page}'
    driver.get(urljoin(SA_URL, url))
    response = driver.page_source
    page += 1
    soup = BeautifulSoup(response, 'lxml')
    links = soup.find_all(name='a', string=TRANSCRIPT)
    if len(links) == 0:
        next_page = False
    else:
        for link in links:
            transcript_url = link.attrs.get('href')
            article_url = furl(urljoin(SA_URL, transcript_url)).add({'part': 'single'})
            driver.get(article_url.url)
            html = driver.page_source
            result = parse_html(html)
            if result is not None:
                meta, participants, content = result
                meta['link'] = link
                store_result(meta, participants, content)
            sleep(5 + (random() - .5) * 2)

driver.close()
# pd.Series(articles).to_csv('articles.csv')
Example #33
def clone_repository_cached(session, execution, destination):
    # type: (Session, ExecutionInfo, Path) -> Tuple[VCS, RepoInfo]
    """
    Clone a remote repository.
    :param execution: execution info
    :param destination: directory to clone to (in which a directory for the repository will be created)
    :param session: program session
    :return: repository information
    :raises: CommandFailedError if git/hg is not installed
    """
    # mock lock
    repo_lock = Lock()
    repo_lock_timeout_sec = 300
    repo_url = execution.repository or ''  # type: str
    parsed_url = furl(repo_url)
    no_password_url = parsed_url.copy().remove(password=True).url

    clone_folder_name = Path(str(furl(repo_url).path)).name  # type: str
    clone_folder = Path(destination) / clone_folder_name

    standalone_mode = session.config.get("agent.standalone_mode", False)
    if standalone_mode:
        cached_repo_path = clone_folder
    else:
        vcs_cache_path = Path(session.config["agent.vcs_cache.path"]).expanduser()
        repo_hash = md5(ensure_binary(repo_url)).hexdigest()
        # create lock
        repo_lock = FileLock(filename=(vcs_cache_path / '{}.lock'.format(repo_hash)).as_posix())
        # noinspection PyBroadException
        try:
            repo_lock.acquire(timeout=repo_lock_timeout_sec)
        except BaseException:
            print('Could not lock cache folder "{}" (timeout {} sec), using temp vcs cache.'.format(
                clone_folder_name, repo_lock_timeout_sec))
            repo_hash = '{}_{}'.format(repo_hash, str(random()).replace('.', ''))
            # use mock lock for the context
            repo_lock = Lock()
        # select vcs cache folder
        cached_repo_path = vcs_cache_path / "{}.{}".format(clone_folder_name, repo_hash) / clone_folder_name

    with repo_lock:
        vcs = VcsFactory.create(
            session, execution_info=execution, location=cached_repo_path
        )
        if not find_executable(vcs.executable_name):
            raise CommandFailedError(vcs.executable_not_found_error_help())

        if not standalone_mode:
            if session.config["agent.vcs_cache.enabled"] and cached_repo_path.exists():
                print('Using cached repository in "{}"'.format(cached_repo_path))

            else:
                print("cloning: {}".format(no_password_url))
                rm_tree(cached_repo_path)
                # We clone the entire repository, not a specific branch
                vcs.clone()  # branch=execution.branch)

            vcs.pull()
            rm_tree(destination)
            shutil.copytree(Text(cached_repo_path), Text(clone_folder))
            if not clone_folder.is_dir():
                raise CommandFailedError(
                    "copying of repository failed: from {} to {}".format(
                        cached_repo_path, clone_folder
                    )
                )

    # checkout in the newly copy destination
    vcs.location = Text(clone_folder)
    vcs.checkout()

    repo_info = vcs.get_repository_copy_info(clone_folder)

    # make sure we have no user/pass in the returned repository structure
    repo_info = attr.evolve(repo_info, url=no_password_url)

    return vcs, repo_info
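Example #33 strips credentials from the repository URL before printing it. The relevant furl call chain on its own (the URL is fabricated):

from furl import furl

no_password_url = furl("https://user:secret@git.example/org/repo.git").copy().remove(password=True).url
# e.g. "https://user@git.example/org/repo.git" -- password dropped, username kept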
Example #34
 def _get_prefix_from_bucket_config(self, config):
     prefix = furl.furl(scheme="gs",
                        netloc=config.bucket,
                        path=config.subdir)
     return str(prefix)
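Example #34 builds a gs:// prefix purely from keyword arguments. Minimal sketch with invented bucket and subdirectory names:

import furl

prefix = furl.furl(scheme="gs", netloc="my-bucket", path="models/v1")
print(str(prefix))  # e.g. "gs://my-bucket/models/v1"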
Example #35
 def server_url(self) -> furl:
     return furl().set(scheme=self.scheme,
                       host=self.host,
                       port=self.server_port,
                       path=self.imposters_path)
Example #36
LOGOUT_REDIRECT_URL = "/"

# Database
# https://docs.djangoproject.com/en/2.0/ref/settings/#databases
DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.sqlite3",
        "NAME": path.join(BASE_DIR, "db.sqlite3"),
    }
}
# Change 'default' database configuration with $DATABASE_URL.
DATABASES["default"].update(
    dj_database_url.config(
        env="DATABASE_URL",
        conn_max_age=env.int("DATABASE_CONN_MAX_AGE", 500),
        ssl_require="sslmode" not in furl(env("DATABASE_URL", "")).args,
    ))

# work-around for dj-database-url: explicitly disable ssl for sqlite
if DATABASES["default"].get("ENGINE") == "django.db.backends.sqlite3":
    DATABASES["default"].get("OPTIONS", {}).pop("sslmode", None)

# work-around for dj-database-url: patch ssl for mysql
if DATABASES["default"].get("ENGINE") == "django.db.backends.mysql":
    DATABASES["default"].get("OPTIONS", {}).pop("sslmode", None)
    if env("MYSQL_SSL_CA", None):
        DATABASES["default"].setdefault("OPTIONS",
                                        {}).setdefault("ssl", {}).setdefault(
                                            "ca", env("MYSQL_SSL_CA", None))

# default to a sensible modern driver for Azure SQL
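Example #36 only requires SSL when the DATABASE_URL does not already carry an sslmode query argument. A small sketch of that membership check (the connection strings are illustrative):

from furl import furl

"sslmode" in furl("postgres://user:pw@db.example:5432/app?sslmode=disable").args  # True
"sslmode" in furl("postgres://user:pw@db.example:5432/app").args                  # False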
Example #37
 def get_url(self, url='url', **keys):
     parsed = furl.furl(self.credentials.get(url, ''))
     for key, value in keys.items():
         setattr(parsed, key, self.credentials.get(value))
     return parsed.url
Example #38
def addon_view_file(auth, node, file_node, version):
    # TODO: resolve circular import issue
    from website.addons.wiki import settings as wiki_settings

    if isinstance(version, tuple):
        version, error = version
        error = error.replace('\n', '').strip()
    else:
        error = None

    ret = serialize_node(node, auth, primary=True)

    if file_node._id not in node.file_guid_to_share_uuids:
        node.file_guid_to_share_uuids[file_node._id] = uuid.uuid4()
        node.save()

    if ret['user']['can_edit']:
        sharejs_uuid = str(node.file_guid_to_share_uuids[file_node._id])
    else:
        sharejs_uuid = None

    download_url = furl.furl(request.url.encode('utf-8')).set(
        args=dict(request.args, **{
            'direct': None,
            'mode': 'render',
            'action': 'download',
        }))

    render_url = furl.furl(settings.MFR_SERVER_URL).set(
        path=['render'], args={'url': download_url.url})

    ret.update({
        'urls': {
            'render': render_url.url,
            'mfr': settings.MFR_SERVER_URL,
            'sharejs': wiki_settings.SHAREJS_URL,
            'gravatar': get_gravatar(auth.user, 25),
            'files': node.web_url_for('collect_file_trees'),
        },
        'error': error,
        'file_name': file_node.name,
        'file_name_title': os.path.splitext(file_node.name)[0],
        'file_name_ext': os.path.splitext(file_node.name)[1],
        'file_path': file_node.path,
        'sharejs_uuid': sharejs_uuid,
        'provider': file_node.provider,
        'materialized_path': file_node.materialized_path,
        'extra': version.metadata.get('extra', {}),
        'size': version.size if version.size is not None else 9966699,
        'private': getattr(node.get_addon(file_node.provider), 'is_private', False),
        'file_tags': [tag._id for tag in file_node.tags],
        'file_guid': file_node.get_guid()._id,
        'file_id': file_node._id,
        'allow_comments': file_node.provider in settings.ADDONS_COMMENTABLE
    })

    ret.update(rubeus.collect_addon_assets(node))
    return ret
Example #39
    def update(self):
        """
        Downloads the latest source tarball from github and installs it over the existing version.
        """
        base_url = furl(self.repositoryBase)
        base_url.path.add(self.repository)
        base_url.path.add("tarball")
        base_url.path.add(self.branch)
        tar_download_url = base_url.url
        main_dir = os.path.dirname(os.path.dirname(__file__))

        try:
            self.backup()
            
            # prepare the update dir
            update_dir = os.path.join(main_dir, 'update')

            if os.path.isdir(update_dir):
                logger.info("Clearing out update folder " + update_dir + " before extracting")
                shutil.rmtree(update_dir)

            logger.info("Creating update folder " + update_dir + " before extracting")
            os.makedirs(update_dir)

            # retrieve file
            logger.info("Downloading update from " + repr(tar_download_url))
            tar_download_path = os.path.join(update_dir, 'sb-update.tar')
            response = webaccess.get(tar_download_url, stream=True)  # Apparently SSL causes problems on some systems (#138)
            with open(tar_download_path, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
            del response

            if not os.path.isfile(tar_download_path):
                logger.error("Unable to retrieve new version from " + tar_download_url + ", can't update")
                return False

            if not tarfile.is_tarfile(tar_download_path):
                logger.error("Retrieved version from " + tar_download_url + " is corrupt, can't update")
                return False

            # extract to sb-update dir
            logger.info("Extracting update file " + tar_download_path)
            tar = tarfile.open(tar_download_path)
            tar.extractall(update_dir)
            tar.close()

            # delete .tar.gz
            logger.info("Deleting update file " + tar_download_path)
            os.remove(tar_download_path)

            # find update dir name
            update_dir_contents = [x for x in os.listdir(update_dir) if os.path.isdir(os.path.join(update_dir, x))]
            if len(update_dir_contents) != 1:
                logger.error("Invalid update data, update failed: " + str(update_dir_contents))
                return False
            content_dir = os.path.join(update_dir, update_dir_contents[0])

            # walk temp folder and move files to main folder
            logger.info("Moving files from " + content_dir + " to " + main_dir)
            for dirname, dirnames, filenames in os.walk(content_dir):
                dirname = dirname[len(content_dir) + 1:]
                for curfile in filenames:
                    old_path = os.path.join(content_dir, dirname, curfile)
                    new_path = os.path.join(main_dir, dirname, curfile)
            
                    if os.path.isfile(new_path):
                        os.remove(new_path)
                    os.renames(old_path, new_path)


        except Exception as e:
            logger.error("Error while trying to update: " + str(e))
            return False
        logger.info("Update successful")
        return True
Example #40
def validate_referer(referer):
    if furl.furl(referer).host != furl.furl(request.url).host:
        raise ValidationError('Invalid referer.')
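Example #40 compares only the host components of the referer and the current request URL. The same check in isolation (both addresses are made up):

import furl

furl.furl("https://evil.example/cb").host != furl.furl("https://osf.example/login").host  # True -> rejected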
Example #41
 def __init__(self, url, client_info=None, timeout=15):
     client.Client.__init__(self, url, client_info, timeout)
     self.base_url = furl(url)
Example #42
def solr_request(path: str,
                 params: SolrParams = None,
                 content: Union[str, SolrParams] = None,
                 content_type: Optional[str] = None,
                 config: Optional[CommonConfig] = None) -> str:
    """
    Send a request to Solr.

    :param path: Solr path to call, e.g. 'select'.
    :param params: Query parameters to add to the path.
    :param content: String or dictionary content to send via POST request.
    :param content_type: Content-Type for the POST content.
    :param config: (testing) Configuration object
    :return: Raw response content on success, raise exception on error.
    """
    path = decode_object_from_bytes_if_needed(path)
    params = decode_object_from_bytes_if_needed(params)
    content = decode_object_from_bytes_if_needed(content)
    content_type = decode_object_from_bytes_if_needed(content_type)

    if not path:
        raise McSolrRequestInvalidParamsException("Path is unset.")

    if params:
        if not isinstance(params, dict):
            raise McSolrRequestInvalidParamsException(
                f"Params is not a dictionary: {params}")

    if content:
        if not (isinstance(content, str) or isinstance(content, dict)):
            raise McSolrRequestInvalidParamsException(
                f"Content is not a string nor a dictionary: {content}")

    if not config:
        config = CommonConfig()

    solr_url = config.solr_url()

    if not params:
        params = {}

    abs_uri = furl(f"{solr_url}/mediacloud/{path}")
    abs_uri = abs_uri.set(params)
    abs_url = str(abs_uri)

    ua = UserAgent()
    ua.set_timeout(__QUERY_HTTP_TIMEOUT)
    ua.set_max_size(None)

    # Remediate CVE-2017-12629
    q_param = str(params.get('q', ''))
    if 'xmlparser' in q_param.lower():
        raise McSolrRequestQueryErrorException(
            "XML queries are not supported.")

    # Solr might still be starting up so wait for it to expose the collections list
    __wait_for_solr_to_start(config=config)

    if content:

        if not content_type:
            fallback_content_type = 'text/plain; charset=utf-8'
            log.warning(
                f"Content-Type is not set; falling back to '{fallback_content_type}'"
            )
            content_type = fallback_content_type

        if isinstance(content, dict):
            content = urlencode(content, doseq=True)

        content_encoded = content.encode('utf-8', errors='replace')

        request = Request(method='POST', url=abs_url)
        request.set_header(name='Content-Type', value=content_type)
        request.set_header(name='Content-Length',
                           value=str(len(content_encoded)))
        request.set_content(content_encoded)

    else:

        request = Request(method='GET', url=abs_url)

    log.debug(f"Sending Solr request: {request}")

    response = ua.request(request)

    if not response.is_success():
        error_message = __solr_error_message_from_response(response=response)
        raise McSolrRequestQueryErrorException(
            f"Error fetching Solr response: {error_message}")

    return response.decoded_content()
Example #43
    def source_url(self):
        url = furl(self.source_node.absolute_url)
        url.path.segments = self.source_node.web_url_for(
            'collect_file_trees').split('/')

        return url.url
Example #44
WSGI_APPLICATION = 'django_wyh.wsgi.application'

# Database
# https://docs.djangoproject.com/en/3.0/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
    }
}
DATABASES['default'].update(
    dj_database_url.config(
        env='DATABASE_URL',
        conn_max_age=env.int('DATABASE_CONN_MAX_AGE', 500),
        ssl_require='sslmode' not in furl(env('DATABASE_URL', '')).args,
    ))

# Password validation
# https://docs.djangoproject.com/en/3.0/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME':
        'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME':
        'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
Example #45
    def update(self):
        """
        Downloads the latest source tarball from github and installs it over the existing version.
        """
        base_url = furl(self.repositoryBase)
        base_url.path.add(self.repository)
        base_url.path.add("tarball")
        base_url.path.add(self.branch)
        tar_download_url = base_url.url
        main_dir = os.path.dirname(os.path.dirname(__file__))

        try:
            self.backup()
            # prepare the update dir
            update_dir = os.path.join(main_dir, 'update')

            if os.path.isdir(update_dir):
                logger.info("Clearing out update folder " + update_dir + " before extracting")
                shutil.rmtree(update_dir)

            logger.info("Creating update folder " + update_dir + " before extracting")
            os.makedirs(update_dir)

            # retrieve file
            logger.info("Downloading update from " + repr(tar_download_url))
            tar_download_path = os.path.join(update_dir, 'sb-update.tar')
            urllib.urlretrieve(tar_download_url, tar_download_path)

            if not os.path.isfile(tar_download_path):
                logger.error("Unable to retrieve new version from " + tar_download_url + ", can't update")
                return False

            if not tarfile.is_tarfile(tar_download_path):
                logger.error("Retrieved version from " + tar_download_url + " is corrupt, can't update")
                return False

            # extract to sb-update dir
            logger.info("Extracting update file " + tar_download_path)
            tar = tarfile.open(tar_download_path)
            tar.extractall(update_dir)
            tar.close()

            # delete .tar.gz
            logger.info("Deleting update file " + tar_download_path)
            os.remove(tar_download_path)

            # find update dir name
            update_dir_contents = [x for x in os.listdir(update_dir) if os.path.isdir(os.path.join(update_dir, x))]
            if len(update_dir_contents) != 1:
                logger.error("Invalid update data, update failed: " + str(update_dir_contents))
                return False
            content_dir = os.path.join(update_dir, update_dir_contents[0])
            
            dontUpdateThese = ["nssm.exe"]  # ("msvcm90.dll", "msvcr90.dll", "msvcm90.dll")
            # rename exes, pyd and dll files so they can be overwritten
            filesToRename = []
            for filename in os.listdir(main_dir):         
                if (filename.endswith(".pyd") or filename.endswith(".dll") or filename.endswith(".exe")) and filename not in dontUpdateThese:
                    filesToRename.append((filename, filename + ".updated"))
            logger.info("Renaming %d files so they can be overwritten" % len(filesToRename))
            for toRename in filesToRename:
                logger.debug("Renaming %s to %s" % (toRename[0], toRename[1]))
                shutil.move(toRename[0], toRename[1])      

            # walk temp folder and move files to main folder
            logger.info("Moving files from " + content_dir + " to " + main_dir)
            for dirname, dirnames, filenames in os.walk(content_dir):
                dirname = dirname[len(content_dir) + 1:]
                for curfile in filenames:
                    if curfile not in dontUpdateThese:
                        old_path = os.path.join(content_dir, dirname, curfile)
                        new_path = os.path.join(main_dir, dirname, curfile)
                        logger.debug("Updating %s" % curfile)
                        if os.path.isfile(new_path):
                            os.remove(new_path)
                        os.renames(old_path, new_path)
                    else:
                        logger.debug("Skipping %s" % curfile)

        except Exception as e:
            logger.error("Error while trying to update: " + str(e))
            return False
        logger.info("Update successful")
        return True
Example #46
    def parse(self, response):
        """ Required first level page parser.

        :param response: The response instance from ``start_requests``
        :type response: scrapy.Request
        :returns: Yields torrent items
        :rtype: list[items.Torrent]
        """

        soup = self.get_soup(response.text)
        try:
            results = soup\
                .find('table', {'id': 'searchResult'})\
                .find_all('tr')[1:]
        except AttributeError:
            return

        for result in results:
            torrent = items.Torrent(spider=self.name)
            torrent['categories'] = [
                self._category_map.get(
                    furl.furl(category.attrs['href']).path.segments[-1],
                    items.TorrentCategory.Unknown
                ) for category in result.find(
                    'td', {'class': 'vertTh'}
                ).find_all('a')
            ]
            torrent['magnet'] = result.find(
                'a', {'href': re.compile(r'^magnet:.*')}
            )['href']
            torrent['hash'] = re.match(
                r'.*magnet:\?xt=urn:(?:btih)+:([a-zA-Z0-9]+).*',
                torrent['magnet']
            ).groups()[0].lower()
            (torrent['seeders'], torrent['leechers'],) = tuple([
                int(column.contents[0])
                for column in result.find_all('td', {'align': 'right'})
            ])

            result_links = result.find('a', {'class': 'detLink'})
            if 'href' in result_links.attrs:
                torrent['source'] = furl.furl(response.url).set(
                    path=result_links.attrs['href'], args={}
                ).url

            torrent['name'] = result_links.contents[0].strip()

            result_desc = result.find('font', {'class': 'detDesc'})
            (time_content, size_content,) = \
                result_desc.contents[0].split(',')[:2]
            torrent['uploaded'] = self.parse_datetime(
                time_content.split(' ')[-1],
                formats=[
                    '%m-%d %Y',
                    '%m-%d %H:%M',
                    '%H:%M',
                    'Y-day %H:%M'
                ]
            )
            torrent['size'] = self.parse_size(
                size_content.split(' ')[-1]
            )

            try:
                torrent['uploader'] = result_desc.find(
                    'a', {'href': re.compile('^/user/.*')}
                ).contents[0]
            except AttributeError:
                pass

            yield torrent
Example #47
 def get_profile_url(self):
     url = furl.furl(self.BASE_URL)
     url.path.segments.extend(('oauth2', 'profile',))
     return url.url
Example #48
    def resolve(self, context=None, request=None, resolved_object=None):
        if not context and not request:
            raise ImproperlyConfigured(
                'Must provide a context or a request in order to resolve the '
                'link.')

        AccessControlList = apps.get_model(app_label='acls',
                                           model_name='AccessControlList')

        if not context:
            context = RequestContext(request=request)

        if not request:
            # Try to get the request object the faster way and fallback to the
            # slower method.
            try:
                request = context.request
            except AttributeError:
                request = Variable('request').resolve(context)

        current_path = request.META['PATH_INFO']
        current_view_name = resolve(current_path).view_name

        # ACL is tested against the resolved_object or just {{ object }} if not
        if not resolved_object:
            try:
                resolved_object = Variable('object').resolve(context=context)
            except VariableDoesNotExist:
                pass

        # If this link has a required permission check that the user has it
        # too
        if self.permissions:
            if resolved_object:
                try:
                    AccessControlList.objects.check_access(
                        obj=resolved_object,
                        permissions=self.permissions,
                        user=request.user)
                except PermissionDenied:
                    return None
            else:
                try:
                    Permission.check_user_permissions(
                        permissions=self.permissions, user=request.user)
                except PermissionDenied:
                    return None

        # Check to see if link has conditional display function and only
        # display it if the result of the conditional display function is
        # True
        if self.condition:
            if not self.condition(context):
                return None

        resolved_link = ResolvedLink(current_view_name=current_view_name,
                                     link=self)

        if self.view:
            view_name = Variable('"{}"'.format(self.view))
            if isinstance(self.args, list) or isinstance(self.args, tuple):
                # TODO: Don't check for instance check for iterable in try/except
                # block. This update required changing all 'args' argument in
                # links.py files to be iterables and not just strings.
                args = [Variable(arg) for arg in self.args]
            else:
                args = [Variable(self.args)]

            # If we were passed an instance of the view context object we are
            # resolving, inject it into the context. This helps resolve links for
            # object lists.
            if resolved_object:
                context['resolved_object'] = resolved_object

            try:
                kwargs = self.kwargs(context)
            except TypeError:
                # Is not a callable
                kwargs = self.kwargs

            kwargs = {key: Variable(value) for key, value in kwargs.items()}

            # Use Django's exact {% url %} code to resolve the link
            node = URLNode(view_name=view_name,
                           args=args,
                           kwargs=kwargs,
                           asvar=None)
            try:
                resolved_link.url = node.render(context)
            except Exception as exception:
                logger.error('Error resolving link "%s" URL; %s', self.text,
                             exception)
        elif self.url:
            resolved_link.url = self.url

        # This is for links that should be displayed but that are not clickable
        if self.conditional_disable:
            resolved_link.disabled = self.conditional_disable(context)
        else:
            resolved_link.disabled = False

        # Lets a new link keep the same URL query string of the current URL
        if self.keep_query:
            # Sometimes we are required to remove a key from the URL QS
            parsed_url = furl(
                force_str(request.get_full_path() or request.META.get(
                    'HTTP_REFERER', reverse(setting_home_view.value))))

            for key in self.remove_from_query:
                try:
                    parsed_url.query.remove(key)
                except KeyError:
                    pass

            # Use the link's URL but with the previous URL querystring
            new_url = furl(resolved_link.url)
            new_url.args = parsed_url.querystr
            resolved_link.url = new_url.url

        resolved_link.context = context
        return resolved_link
Example #49
# URL path analysis
from furl import furl
from PIL import Image

import hashlib
import requests


GROUP_START = 1
GROUP_END = 5

# 1. Requesting this URL shows that the content is loaded via XHR (AJAX)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

f = furl('https://www.toutiao.com/search_content/?offset=90&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20')


def get_page(offset, keyword):
    '''
        Fetch a page of search results.
    '''
    # build the new url
    f.args['offset'] = offset
    f.args['keyword'] = keyword

    try:
        response = requests.get(f.url, headers=headers)
        # check the response status
        if response.status_code == requests.codes.ok:
            return response.json()
Example #50
def run(arguments):
    arguments.config = arguments.config if os.path.isabs(
        arguments.config) else os.path.join(nzbhydra.getBasePath(),
                                            arguments.config)
    arguments.database = arguments.database if os.path.isabs(
        arguments.database) else os.path.join(nzbhydra.getBasePath(),
                                              arguments.database)
    nzbhydra.configFile = settings_file = arguments.config
    nzbhydra.databaseFile = database_file = arguments.database

    logger.notice("Loading settings from {}".format(settings_file))
    try:
        config.load(settings_file)
        config.save(settings_file)  # Write any new settings back to the file
        log.setup_custom_logger(arguments.logfile, arguments.quiet)
    except Exception:
        print(
            "An error occurred while migrating the old config. Sorry about that...: "
        )
        traceback.print_exc(file=sys.stdout)
        print("Trying to log messages from migration...")
        config.logLogMessages()
        os._exit(-5)

    try:
        logger.info("Started")

        if arguments.daemon:
            logger.info("Daemonizing...")
            daemonize(arguments.pidfile)

        config.logLogMessages()

        if arguments.clearloganddb:
            logger.warning("Deleting log file and database now as requested")
            try:
                logger.warning("Deleting database file %s" % database_file)
                os.unlink(database_file)
            except Exception as e:
                logger.error("Unable to delete database file: %s" % e)

            try:
                handler = logger.handlers[1] if len(
                    logger.handlers) == 2 else logger.handlers[0]
                filename = handler.stream.name

                if filename and os.path.exists(filename):
                    logger.warn("Deleting file %s" % filename)
                handler.flush()
                handler.close()
                logger.removeHandler(handler)
                os.unlink(filename)
                logger.addHandler(handler)
            except Exception as e:
                print("Unable to close or delete log file: %s" % e)

        try:
            import _sqlite3
            logger.debug("SQLite3 version: %s" % _sqlite3.sqlite_version)
        except:
            logger.error("Unable to log SQLite version")

        logger.info("Loading database file %s" % database_file)
        if not os.path.exists(database_file):
            database.init_db(database_file)
        else:
            database.update_db(database_file)
        logger.info("Starting db")

        indexers.read_indexers_from_config()

        if config.settings.main.debug:
            logger.info("Debug mode enabled")

        # Clean up any "old" files from last update
        oldfiles = glob.glob("*.updated")
        if len(oldfiles) > 0:
            logger.info("Deleting %d old files remaining from update" %
                        len(oldfiles))
            for filename in oldfiles:
                try:
                    if "hydratray" not in filename:
                        logger.debug("Deleting %s" % filename)
                        os.remove(filename)
                    else:
                        logger.debug(
                            "Not deleting %s because it's still running. TrayHelper will restart itself"
                            % filename)
                except Exception:
                    logger.warn(
                        "Unable to delete old file %s. Please delete manually"
                        % filename)

        host = config.settings.main.host if arguments.host is None else arguments.host
        port = config.settings.main.port if arguments.port is None else arguments.port

        socksproxy = config.settings.main.socksProxy if arguments.socksproxy is None else arguments.socksproxy
        if socksproxy:
            webaccess.set_proxies(socksproxy)
        elif config.settings.main.httpProxy:
            webaccess.set_proxies(config.settings.main.httpProxy,
                                  config.settings.main.httpsProxy)

        logger.notice("Starting web app on %s:%d" % (host, port))
        if config.settings.main.externalUrl is not None and config.settings.main.externalUrl != "":
            f = furl(config.settings.main.externalUrl)
        else:
            f = furl()
            f.host = "127.0.0.1" if config.settings.main.host == "0.0.0.0" else config.settings.main.host
            f.port = port
            f.scheme = "https" if config.settings.main.ssl else "http"
        if not arguments.nobrowser and config.settings.main.startupBrowser:
            if arguments.restarted:
                logger.info("Not opening the browser after restart")
            else:
                logger.info("Opening browser to %s" % f.url)
                webbrowser.open_new(f.url)
        else:
            logger.notice("Go to %s for the frontend" % f.url)

        web.run(host, port, basepath)
    except Exception:
        logger.exception("Fatal error occurred")
Exemple #51
0
def make_response_from_ticket(ticket, service_url):
    """
    Given a CAS ticket and service URL, attempt to validate the user and return a proper redirect response.

    :param str ticket: CAS service ticket
    :param str service_url: Service URL from which the authentication request originates
    :return: redirect response
    """

    service_furl = furl.furl(service_url)
    # `service_url` should already have had its `ticket` parameter removed; it is stripped in
    # `framework.sessions.before_request()`.
    if 'ticket' in service_furl.args:
        service_furl.args.pop('ticket')
    client = get_client()
    cas_resp = client.service_validate(ticket, service_furl.url)
    if cas_resp.authenticated:
        user, external_credential, action = get_user_from_cas_resp(cas_resp)
        # user found and authenticated
        if user and action == 'authenticate':
            # if we successfully authenticate and a verification key is present, invalidate it
            if user.verification_key:
                user.verification_key = None
                user.save()

            # if user is authenticated by external IDP, ask CAS to authenticate user for a second time
            # this extra step will guarantee that 2FA are enforced
            # current CAS session created by external login must be cleared first before authentication
            if external_credential:
                user.verification_key = generate_verification_key()
                user.save()
                return redirect(
                    get_logout_url(
                        get_login_url(service_url,
                                      username=user.username,
                                      verification_key=user.verification_key)))

            # if user is authenticated by CAS
            # TODO [CAS-27]: Remove Access Token From Service Validation
            return authenticate(user,
                                cas_resp.attributes.get('accessToken', ''),
                                redirect(service_furl.url))
        # first time login from external identity provider
        if not user and external_credential and action == 'external_first_login':
            from website.util import web_url_for
            # orcid attributes can be marked private and not shared, default to orcid otherwise
            fullname = u'{} {}'.format(
                cas_resp.attributes.get('given-names', ''),
                cas_resp.attributes.get('family-name', '')).strip()
            # TODO [CAS-27]: Remove Access Token From Service Validation
            user = {
                'external_id_provider': external_credential['provider'],
                'external_id': external_credential['id'],
                'fullname': fullname,
                'access_token': cas_resp.attributes.get('accessToken', ''),
                'service_url': service_furl.url,
            }
            return external_first_login_authenticate(
                user, redirect(web_url_for('external_login_email_get')))
    # Unauthorized: ticket could not be validated, or user does not exist.
    return redirect(service_furl.url)
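The ticket stripping above in isolation; the URL below is made up, and `furl` is imported module-style to match the snippet.

import furl

service_furl = furl.furl('https://osf.example/login?service=dashboard&ticket=ST-12345')
if 'ticket' in service_furl.args:
    service_furl.args.pop('ticket')
print(service_furl.url)   # https://osf.example/login?service=dashboard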
Exemple #52
0
def addon_view_or_download_file(auth, path, provider, **kwargs):
    extras = request.args.to_dict()
    extras.pop('_', None)  # Clean up our url params a bit
    action = extras.get('action', 'view')
    node = kwargs.get('node') or kwargs['project']

    node_addon = node.get_addon(provider)

    if not path:
        raise HTTPError(httplib.BAD_REQUEST)

    if not isinstance(node_addon, StorageAddonBase):
        raise HTTPError(
            httplib.BAD_REQUEST, {
                'message_short':
                'Bad Request',
                'message_long':
                'The add-on containing this file is no longer connected to the {}.'
                .format(node.project_or_component)
            })

    if not node_addon.has_auth:
        raise HTTPError(
            httplib.UNAUTHORIZED, {
                'message_short':
                'Unauthorized',
                'message_long':
                'The add-on containing this file is no longer authorized.'
            })

    if not node_addon.complete:
        raise HTTPError(
            httplib.BAD_REQUEST, {
                'message_short':
                'Bad Request',
                'message_long':
                'The add-on containing this file is no longer configured.'
            })

    file_node = FileNode.resolve_class(provider, FileNode.FILE).get_or_create(
        node, path)

    # Note: the cookie is provided for authentication to waterbutler;
    # it is overridden to force authentication as the current user.
    # The auth header is also passed along to support basic auth.
    version = file_node.touch(
        request.headers.get('Authorization'),
        **dict(extras, cookie=request.cookies.get(settings.COOKIE_NAME)))

    if version is None:
        if file_node.get_guid():
            # If this file has been successfully viewed before but no longer exists

            # Move file to trashed file node
            if not TrashedFileNode.load(file_node._id):
                file_node.delete()

            # Show a nice error message
            return addon_deleted_file(file_node=file_node, **kwargs)

        raise HTTPError(
            httplib.NOT_FOUND, {
                'message_short': 'Not Found',
                'message_long': 'This file does not exist'
            })

    # TODO clean up these urls and unify what is used as a version identifier
    if request.method == 'HEAD':
        return make_response(('', 200, {
            'Location':
            file_node.generate_waterbutler_url(
                **dict(extras, direct=None, version=version.identifier))
        }))

    if action == 'download':
        return redirect(
            file_node.generate_waterbutler_url(
                **dict(extras, direct=None, version=version.identifier)))

    if len(request.path.strip('/').split('/')) > 1:
        guid = file_node.get_guid(create=True)
        return redirect(
            furl.furl('/{}/'.format(guid._id)).set(args=extras).url)

    return addon_view_file(auth, node, file_node, version)
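What the final redirect URL construction above produces, using a made-up guid and query parameters:

import furl

extras = {'action': 'view', 'mode': 'render'}
print(furl.furl('/{}/'.format('abc12')).set(args=extras).url)
# -> /abc12/?action=view&mode=render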
Exemple #53
0
 def fetch_by_id(self, provider_id):
     url = furl(self.url)
     url.args['verb'] = 'GetRecord'
     url.args['metadataPrefix'] = self.metadata_prefix
     url.args['identifier'] = provider_id
     return etree.tostring(self.fetch_page(url)[0][0])
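For reference, the OAI-PMH GetRecord URL that the method above builds, shown with a made-up endpoint, metadata prefix and identifier:

from furl import furl

url = furl('http://export.example.org/oai')
url.args['verb'] = 'GetRecord'
url.args['metadataPrefix'] = 'oai_dc'
url.args['identifier'] = '12345'
print(url.url)   # http://export.example.org/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=12345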
Exemple #54
0
 def get_application_revocation_url(self):
     url = furl.furl(self.BASE_URL)
     url.path.segments.extend(('oauth2', 'revoke'))
     return url.url
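The same path building with a made-up BASE_URL:

import furl

url = furl.furl('https://api.example.com')
url.path.segments.extend(('oauth2', 'revoke'))
print(url.url)   # https://api.example.com/oauth2/revoke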
Exemple #55
0
 def something_in_stock_mass(self):
     for i in range(len(self.asin_list)):
         params = {}
         for x in range(len(self.asin_list[i])):
             params[f"ASIN.{x + 1}"] = self.asin_list[i][x]
             params[f"Quantity.{x + 1}"] = 1
         f = furl(AMAZON_URLS["CART_URL"])
         f.set(params)
         self.driver.get(f.url)
         title = self.driver.title
         bad_list_flag = False
         if title in DOGGO_TITLES:
             good_asin_list = []
             for asin in self.asin_list[i]:
                 checkparams = {}
                 checkparams[f"ASIN.1"] = asin
                 checkparams[f"Quantity.1"] = 1
                 check = furl(AMAZON_URLS["CART_URL"])
                 check.set(checkparams)
                 self.driver.get(check.url)
                 sanity_check = self.driver.title
                 if sanity_check in DOGGO_TITLES:
                     log.error(f"{asin} blocked from bulk adding by Amazon")
                 else:
                     log.info(f"{asin} appears to allow adding")
                     good_asin_list.append(asin)
                 time.sleep(1)
             if len(good_asin_list) > 0:
                 log.info(
                     "Revising ASIN list to include only good ASINs listed above"
                 )
                 self.asin_list[i] = good_asin_list
             else:
                 log.error(f"No ASINs work in list {i + 1}.")
                 self.asin_list[i] = self.asin_list[i][
                     0]  # just assign one asin to list, can't remove during execution
                 bad_list_flag = True
         if bad_list_flag:
             continue
         self.check_if_captcha(self.wait_for_pages, ADD_TO_CART_TITLES)
         price_element = self.driver.find_elements_by_xpath(
             '//td[@class="price item-row"]')
         if price_element:
             price_flag = False
             price_warning_flag = False
             for price_e in price_element:
                 str_price = price_e.text
                 log.info(f"Item Cost: {str_price}")
                 price = parse_price(str_price)
                 priceFloat = price.amount
                 if priceFloat is None:
                     log.error("Error reading price information on page.")
                 elif priceFloat <= self.reserve[i]:
                     log.info("Item in stock and under reserve!")
                     price_flag = True
                 else:
                     log.info("Item greater than reserve price")
                     price_warning_flag = True
             if price_flag:
                 log.info("Attempting to purchase")
                 if price_warning_flag:
                     log.info(
                         "Cart included items below and above reserve price, cancel unwanted items ASAP!"
                     )
                     self.take_screenshot("attempting-to-purchase")
                 return i + 1
     return 0
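A reduced sketch of the bulk cart URL built at the top of the method; the cart URL and ASINs here are made up, and furl.set() takes the params dict as its first (args) argument, just as in the snippet.

from furl import furl

CART_URL = 'https://www.amazon.example/gp/aws/cart/add.html'
asins = ['B000000001', 'B000000002']

params = {}
for n, asin in enumerate(asins, start=1):
    params['ASIN.{}'.format(n)] = asin
    params['Quantity.{}'.format(n)] = 1

f = furl(CART_URL)
f.set(params)
print(f.url)   # .../cart/add.html?ASIN.1=B000000001&Quantity.1=1&ASIN.2=B000000002&Quantity.2=1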
Exemple #56
0
def normalize_url(url: str) -> str:
    """Normalize URL

    * Fix common mistakes, e.g. "http://http://..."
    * Run URL through normalization, i.e. standardize the URL's scheme and hostname case, remove the default port,
      uppercase all escape sequences, un-escape octets that can be represented as plain characters, and remove
      whitespace before/after the URL string
    * Remove #fragment
    * Remove various ad tracking query parameters, e.g. "utm_source", "utm_medium", "PHPSESSID", etc.

    Return normalized URL on success; raise on error"""
    url = decode_object_from_bytes_if_needed(url)
    if url is None:
        raise McNormalizeURLException("URL is None")
    if len(url) == 0:
        raise McNormalizeURLException("URL is empty")

    log.debug("normalize_url: " + url)

    url = fix_common_url_mistakes(url)

    try:
        url = canonical_url(url)
    except Exception as ex:
        raise McNormalizeURLException("Unable to get canonical URL: %s" %
                                      str(ex))

    if not is_http_url(url):
        raise McNormalizeURLException("URL is not HTTP(s): %s" % url)

    uri = furl(url)

    # Remove #fragment
    uri.fragment.set(path='')

    parameters_to_remove = []

    # Facebook parameters (https://developers.facebook.com/docs/games/canvas/referral-tracking)
    parameters_to_remove += [
        'fb_action_ids',
        'fb_action_types',
        'fb_source',
        'fb_ref',
        'action_object_map',
        'action_type_map',
        'action_ref_map',
        'fsrc_fb_noscript',
    ]

    # metrika.yandex.ru parameters
    parameters_to_remove += [
        'yclid',
        '_openstat',
    ]

    if 'facebook.com' in uri.host.lower():
        # Additional parameters specifically for the facebook.com host
        parameters_to_remove += [
            'ref',
            'fref',
            'hc_location',
        ]

    if 'nytimes.com' in uri.host.lower():
        # Additional parameters specifically for the nytimes.com host
        parameters_to_remove += [
            'emc',
            'partner',
            '_r',
            'hp',
            'inline',
            'smid',
            'WT.z_sma',
            'bicmp',
            'bicmlukp',
            'bicmst',
            'bicmet',
            'abt',
            'abg',
        ]

    if 'livejournal.com' in uri.host.lower():
        # Additional parameters specifically for the livejournal.com host
        parameters_to_remove += [
            'thread',
            'nojs',
        ]

    if 'google.' in uri.host.lower():
        # Additional parameters specifically for the google.[com,lt,...] host
        parameters_to_remove += [
            'gws_rd',
            'ei',
        ]

    # Some other parameters (common for tracking session IDs, advertising, etc.)
    parameters_to_remove += [
        'PHPSESSID',
        'PHPSESSIONID',
        'cid',
        's_cid',
        'sid',
        'ncid',
        'ir',
        'ref',
        'oref',
        'eref',
        'ns_mchannel',
        'ns_campaign',
        'ITO',
        'wprss',
        'custom_click',
        'source',
        'feedName',
        'feedType',
        'skipmobile',
        'skip_mobile',
        'altcast_code',
    ]

    # Make the sorting default (e.g. on Reddit)
    parameters_to_remove += ['sort']

    # Some Australian websites append the "nk" parameter with a tracking hash
    if 'nk' in uri.query.params:
        for nk_value in uri.query.params['nk']:
            if re.search(r'^[0-9a-fA-F]+$', nk_value, re.I):
                parameters_to_remove += ['nk']
                break

    # Delete the "empty" parameter (e.g. in http://www-nc.nytimes.com/2011/06/29/us/politics/29marriage.html?=_r%3D6)
    parameters_to_remove += ['']

    # Remove cruft parameters
    for parameter in parameters_to_remove:
        if ' ' in parameter:
            log.warning('Invalid cruft parameter "%s"' % parameter)
        uri.query.params.pop(parameter, None)

    for name in list(
            uri.query.params.keys()):  # copy of list to be able to delete

        # Remove parameters that start with '_' (e.g. '_cid') because they're
        # more likely to be the tracking codes
        if name.startswith('_'):
            uri.query.params.pop(name, None)

        # Remove GA parameters, current and future (e.g. "utm_source",
        # "utm_medium", "ga_source", "ga_medium")
        # (https://support.google.com/analytics/answer/1033867?hl=en)
        if name.startswith('ga_') or name.startswith('utm_'):
            uri.query.params.pop(name, None)

    url = uri.url

    # Remove empty values in query string, e.g. http://bash.org/?244321=
    url = url.replace('=&', '&')
    url = re.sub(r'=$', '', url)

    return url
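The core of the cruft-parameter removal above, isolated with a made-up URL:

from furl import furl

uri = furl('http://www.example.com/article?utm_source=rss&ga_medium=feed&sid=abc&id=42')

for name in list(uri.query.params.keys()):   # copy of the keys, so we can delete while iterating
    if name.startswith(('utm_', 'ga_')) or name in ('sid', 'PHPSESSID'):
        uri.query.params.pop(name, None)

print(uri.url)   # http://www.example.com/article?id=42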
Exemple #57
0
# Collect the image tags
img = soup.find_all("img")

# Collect the image URLs
imgs = []
for item in img:
    imgs.append(item.get('data-src'))
# Download the images
# Approach: save all images into one local folder, naming each file by the format suffix found in its URL
dir_name = soup.select('h2')[0].string  # use the post title as the folder name
dir_name = dir_name.replace("\n", "")
dir_name = dir_name.strip()


print('dir_name', dir_name)

if not os.path.exists(str(dir_name)):  # create the folder if it does not exist
    os.mkdir(dir_name)

# Save the images
picture_name = 0
for img_url in imgs:
    time.sleep(1)  # wait between requests so we do not hammer the site
    if img_url:
        url = furl(img_url)
        response = requests.get(img_url)
        # the file extension comes from the 'wx_fmt' query parameter of the image URL
        with open(dir_name + '/' + str(picture_name) + '.' + str(url.args['wx_fmt']), 'wb') as f:
            f.write(response.content)
        picture_name += 1
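How the file extension is pulled out of the image URL above; the URL below is made up, standing in for a WeChat image URL that carries the format in its 'wx_fmt' query parameter.

from furl import furl

img_url = 'https://mmbiz.example/mmbiz_jpg/abc123/640?wx_fmt=jpeg'
print(furl(img_url).args['wx_fmt'])   # jpeg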
Exemple #58
0
 def get_base_url(self, url):
     url = furl.furl(url)
     return '{}://{}'.format(url.scheme, url.host)
Exemple #59
0
async def test_conductor (conductor, simpleWebServer):
	serverSocketPath, c = conductor
	socketPath, runner = simpleWebServer

	user = getpass.getuser ()
	key = 'foobar'
	auth = '123'

	conn = aiohttp.UnixConnector(path=serverSocketPath)
	async with aiohttp.ClientSession(connector=conn) as session:
		async with session.get(f'http://{key}-{user}.conductor.local/_conductor/status') as resp:
			assert resp.status == 200
			o = await resp.json ()
			assert o['routesTotal'] == 0
			assert o['requestTotal'] == 1
			assert o['requestActive'] == 1
			assert o['noroute'] == 1

		# make sure that requests are properly counted and requestActive is decreased
		reader, writer = await asyncio.open_unix_connection (path=serverSocketPath)
		writer.write (b'invalid http request\n')
		writer.close ()
		await writer.wait_closed ()
		async with session.get(f'http://{key}-{user}.conductor.local/_conductor/status') as resp:
			assert resp.status == 200
			o = await resp.json ()
			assert o['requestTotal'] == 3
			assert o['requestActive'] == 1

		async with session.get(f'http://{key}-{user}.conductor.local/_conductor/nonexistent') as resp:
			assert resp.status == 404

		async with session.get(f'http://{key}-{user}.conductor.local/') as resp:
			assert resp.status == 404

		routeKey = RouteKey (key=key, user=user)
		route = Route (key=routeKey, auth=c.hashKey (auth), socket=socketPath)
		c.addRoute (route)

		for u in (f'http://nonexistent-{user}.conductor.local', 'http://invalidpattern.conductor.local', 'http://different.domain'):
			async with session.get(u) as resp:
				assert resp.status == 404

		async with session.get(f'http://{key}-{user}.conductor.local') as resp:
			assert resp.status == 403
		# add unrelated cookie
		session.cookie_jar.update_cookies ({'unrelated': 'value'})
		async with session.get(f'http://{key}-{user}.conductor.local') as resp:
			assert resp.status == 403

		async with session.get(f'http://{key}-{user}.conductor.local/_conductor/auth/{auth}') as resp:
			assert resp.status == 200
			assert await resp.text() == 'Hello, world'

		# make sure responses larger than any buffer work
		async with session.get(f'http://{key}-{user}.conductor.local/large') as resp:
			assert resp.status == 200
			assert await resp.text() == 'a'*(1024*1024)

		async with session.get(f'http://{key}-{user}.conductor.local/_conductor/auth/{auth}?next=/nonexistent') as resp:
			assert resp.status == 404
			assert furl (resp.url).path == '/nonexistent'

		# big request
		headers = dict ([(f'key-{i}', 'val') for i in range (101)])
		async with session.get(f'http://{key}-{user}.conductor.local/', headers=headers) as resp:
			assert resp.status == 400

		# destroy application
		await runner.cleanup()

		async with session.get(f'http://{key}-{user}.conductor.local/') as resp:
			assert resp.status == 502

		c.deleteRoute (routeKey)

		async with session.get(f'http://{key}-{user}.conductor.local/') as resp:
			assert resp.status == 404
Exemple #60
0
 def fetch_xml(self, file_name):
     file_url = furl(self.BASE_DATA_URL.format(file_name))
     # Not using self.requests when getting the file contents because the eLife rate limit (1, 60) does not apply
     resp = requests.get(file_url.url)
     xml = etree.XML(resp.content)
     return xml