Example #1
    def _send_request(self, method, path, data=None, headers=None, sleep_before_request=0):
        """

        :param method: HTTP method, one of 'GET', 'POST', 'DELETE' or 'PUT'
        :param path: request URL path
        :param data: data to send with the request
        :param headers: request headers
        :param sleep_before_request: seconds to sleep before sending the request, leaving enough of a gap
                            between requests to avoid ConnectionError "Max retries exceeded with url". Defaults to no delay.
        :return: response in JSON format
        """

        url = self.solrURL.replace(self.path, '')
        sleep(sleep_before_request)
        try:
            if self._auth:
                response = requests.request(method=method, url=urljoin(url, path),
                                            headers=headers,data=data,auth=self._auth)
            else:
                response = requests.request(method=method, url=urljoin(url, path),
                                            headers=headers,data=data)
        except requests.exceptions.ConnectionError:
            self._logger.warning("Connection refused when requesting [%s]", urljoin(url, path))
            raise SolrError("Connection refused.")

        if response.status_code not in (200, 304):
            self._logger.error("failed to send request to [%s]. Reason: [%s]", urljoin(url, path),response.reason)
            raise SolrError(self._extract_error(headers, response.reason))

        return response.json()
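A minimal, runnable sketch of the URL handling inside the wrapper above; the Solr base URL and core path are invented placeholders, not values taken from the original class.

from urllib.parse import urljoin

# Hypothetical values standing in for self.solrURL and self.path.
solr_url = "http://localhost:8983/solr/mycore"
core_path = "solr/mycore"

base = solr_url.replace(core_path, '')            # "http://localhost:8983/"
print(urljoin(base, "solr/mycore/select?q=*:*"))  # "http://localhost:8983/solr/mycore/select?q=*:*"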
Example #2
    def test__request(self):
        httpretty.register_uri(
            httpretty.POST,
            urljoin(SmsAero.URL_GATE, '/send/'),
            body='{}',
            status=500,
        )

        try:
            self.api.send('89111111111', 'message')
            self.assertTrue(False)
        except SmsAeroHTTPError:
            pass

        def exceptionCallback(request, uri, headers):
            raise requests.Timeout('Connection timed out.')

        httpretty.register_uri(
            httpretty.POST,
            urljoin(SmsAero.URL_GATE, '/send/'),
            body=exceptionCallback,
            status=200,
            content_type='text/json',
        )

        try:
            self.api.send('89111111111', 'message')
            self.assertTrue(False)
        except SmsAeroHTTPError:
            pass
Example #3
    def __init__(self, package, resource):
        self.package = package

        path = resource.get("path")
        if path:
            self.url = urljoin(package.url, path)
        elif "url" in resource:
            self.url = resource["url"]
        elif "data" in resource:
            raise NotImplementedError("Embedded datapackage resource data " "are not supported")
        else:
            raise MetadataError("No path or url specified in a package " "resource.")

        self.name = resource.get("name")
        self.title = resource.get("title")

        schema = resource.get("schema")
        if schema:
            fields = schema.get("fields")
            self.fields = schema_to_fields(fields)
        else:
            self.fields = None

        self.type = resource.get("type", os.path.splitext(self.url)[1][1:])

        if self.type not in OBJECT_TYPES:
            raise TypeError("Data object type '%s' is not supported in " "datapackage." % self.type)
Example #4
def post_to_hastebin(data, url="http://hastebin.com/"):
    if isinstance(data, str):
        data = data.encode()
    response = requests.post(urljoin(url, "documents"), data)
    response.raise_for_status()
    result = response.json()
    return urljoin(url, result['key'])
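How the two joins in post_to_hastebin resolve, assuming the default base URL; the document key below is made up.

from urllib.parse import urljoin

base = "http://hastebin.com/"
print(urljoin(base, "documents"))  # "http://hastebin.com/documents" (upload endpoint)
print(urljoin(base, "abc123"))     # "http://hastebin.com/abc123" (hypothetical returned key)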
Example #5
    def test_checksending(self):
        httpretty.register_uri(
            httpretty.POST,
            urljoin(SmsAero.URL_GATE, '/checksending/'),
            body='{"reason": {"33460579": "smsc reject", \
                "33460580": "delivery success"}, \
                "result": "accepted"}',
            status=200,
            content_type='text/json',
        )

        self.api.checksending(322)

        httpretty.register_uri(
            httpretty.POST,
            urljoin(SmsAero.URL_GATE, '/checksending/'),
            body='{"reason": "empty field", "result": "reject"}',
            status=200,
            content_type='text/json',
        )

        try:
            self.api.checksending('')
            self.assertTrue(False)
        except SmsAeroError:
            pass
Example #6
    def soupIt(self):
        http = urllib3.PoolManager()
        r = http.request("GET", self.url)
        soup = BeautifulSoup(r.data.decode('ISO-8859-1'), "lxml")
        self.title = soup.title.string

        # remove unused header parts
        # in comments because of firefox
        # for p in soup(["meta"]):
        #    p.extract()

        # remove comments
        for element in soup(text=lambda text: isinstance(text, Comment)):
            element.extract()

        # remove some images
        unused_images = soup.find_all('img', {'alt': 'bullet'}) \
                        + soup.find_all('img', {'src': '../../images/ilmulislam.gif'}) \
                        + soup.find_all('img', {'src': '../../images/enzykopf.gif'})
        for i in soup.find_all('img'):
            if i in unused_images:
                i.extract()

        # remove all links, but keep text
        # don't keep text for navigation links that don't lead to "begriffe" or "manuskripte"
        for l in soup.findAll('a'):
            if "begriffe" in urljoin(self.url, l['href']) or "manuskripte" in urljoin(self.url, l['href']):
                l.replaceWith(l.text)
            else:
                l.extract()

        # remove top blocks
        topBlocks = soup.findAll('td', {'width': '50%'})
        for block in topBlocks:
            if len(block.findChildren('img')):
                self.images += block.findChildren('img')
            block.extract()

        # remove trash tags and empty tags
        for tag in soup.findAll():

            if tag.name == "meta":
                continue
            if tag.name in ("td", "tr", "table", "center", "div", "font", "strong", "b"):
                tag.unwrap()
            if len(tag.text) == 0 or tag.text == '\n' or re.match(r'^\s*$',
                                                                  tag.text) or tag.is_empty_element or tag.isSelfClosing:
                tag.extract()

        for l in soup.find_all(text=re.compile('^\n')):
            l.extract()

        for l in soup.find_all(text=re.compile('\r\n')):
            l.replaceWith(" ")

        # append images
        for i in self.images:
            soup.body.insert(0, i)

        return soup.prettify()
Example #7
def report_from(result, year_range):
  link = result.select("a")[0]
  title = link.text
  landing_url = urljoin(REPORTS_URL, link.get('href'))
  report_id_node, published_node = result.select("div.release_info")
  report_id = report_id_node.text.strip().replace(",", "")
  published_on = datetime.datetime.strptime(published_node.text, '%b %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % landing_url)
    return

  logging.debug("Scraping landing url: %s", landing_url)
  landing_page = beautifulsoup_from_url(landing_url)
  summary = landing_page.select("div.left_col")[0].text.strip()

  pdf_link = landing_page.select("#link_bar > a")[0]
  report_url = urljoin(REPORTS_URL, pdf_link.get('href'))

  text_link = landing_page.select("#add_material a")[-1]
  text_report_url = urljoin(REPORTS_URL, text_link.get('href'))

  report = {
    'inspector': 'gao',
    'inspector_url': 'http://www.gao.gov/about/workforce/ig.html',
    'agency': 'gao',
    'agency_name': 'Government Accountability Office',
    'report_id': report_id,
    'url': report_url,
    'text_url': text_report_url,
    'landing_url': landing_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Example #8
    def setUpClass(cls):
        """Create an RPM repository with a valid feed and sync it.

        Do the following:

        1. Reset Pulp, including the Squid cache.
        2. Create a repository with the "on demand" download policy.
        3. Sync and publish the repository.
        4. Download an RPM from the published repository.
        5. Download the same RPM to ensure it is served by the cache.
        """
        super(OnDemandTestCase, cls).setUpClass()

        # Ensure `locally_stored_units` is 0 before we start.
        utils.reset_squid(cls.cfg)
        utils.reset_pulp(cls.cfg)

        # Create, sync and publish a repository.
        repo = _create_repo(cls.cfg, 'on_demand')
        cls.resources.add(repo['_href'])
        utils.sync_repo(cls.cfg, repo['_href'])

        # Read the repository.
        client = api.Client(cls.cfg)
        cls.repo = client.get(repo['_href'], params={'details': True}).json()

        # Download the same RPM twice.
        path = urljoin('/pulp/repos/', repo['id'] + '/')
        path = urljoin(path, RPM)
        cls.rpm = client.get(path)
        cls.same_rpm = client.get(path)
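The two-step join above relies on every intermediate piece ending in a slash. A standalone sketch of that behaviour, using made-up repository and RPM names:

from urllib.parse import urljoin

RPM = 'bear-4.1-1.noarch.rpm'                   # illustrative file name
path = urljoin('/pulp/repos/', 'zoo/')          # '/pulp/repos/zoo/'
print(urljoin(path, RPM))                       # '/pulp/repos/zoo/bear-4.1-1.noarch.rpm'

# Without the trailing slash, the last path segment is replaced instead of extended:
print(urljoin('/pulp/repos/zoo', RPM))          # '/pulp/repos/bear-4.1-1.noarch.rpm'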
Example #9
    def setUpClass(cls):
        """Create an RPM repository with a valid feed and sync it.

        Do the following:

        1. Reset Pulp, including the Squid cache.
        2. Create a repository with the "background" download policy.
        3. Sync and publish the repository.
        4. Download an RPM from the repository.
        """
        super(BackgroundTestCase, cls).setUpClass()
        if (selectors.bug_is_untestable(1905, cls.cfg.version) and
                _os_is_rhel6(cls.cfg)):
            raise unittest.SkipTest('https://pulp.plan.io/issues/1905')

        # Required to ensure content is actually downloaded.
        utils.reset_squid(cls.cfg)
        utils.reset_pulp(cls.cfg)

        # Create, sync and publish a repository.
        repo = _create_repo(cls.cfg, 'background')
        cls.resources.add(repo['_href'])
        report = utils.sync_repo(cls.cfg, repo['_href']).json()

        # Record the tasks spawned when syncing the repository, and the state
        # of the repository itself after the sync.
        client = api.Client(cls.cfg)
        cls.repo = client.get(repo['_href'], params={'details': True}).json()
        cls.tasks = tuple(api.poll_spawned_tasks(cls.cfg, report))

        # Download an RPM.
        path = urljoin('/pulp/repos/', repo['id'] + '/')
        path = urljoin(path, RPM)
        cls.rpm = client.get(path)
Example #10
  def urls_for(self):
    only = self.options.get('topics')
    if only: # if only...
      only = set(only.split(','))
      only = [(o, TOPIC_TO_REPORT_TYPE[o]) if o in TOPIC_TO_REPORT_TYPE else o
              for o in only]
      yield from self.urls_for_topics(only)
      # If there are topics selected, ONLY yield URLs for those.
      return

    # First yield the URLs for the topics that are tangential to the main
    # Calendar Year reports.
    yield from self.urls_for_topics(ADDITIONAL_TOPICS)

    # Not getting reports from specific topics, iterate over all Calendar Year
    # reports.
    page = BeautifulSoup(utils.download(BASE_URL))

    # Iterate over each "Calendar Year XXXX" link
    for li in page.select('.field-items li'):
      md = RE_CALENDAR_YEAR.search(li.text)
      if md:
        cur_year = int(md.group(1))
        if cur_year >= self.year_range[0] and cur_year <= self.year_range[-1]:
          href = li.select('a')[0]['href']
          next_url = urljoin(BASE_URL, href)
          # The first page of reports is yielded.
          yield next_url

          # Next, read all the pagination links for the page and yield those. So
          # far, I haven't seen a page that doesn't have all of the following
          # pages enumerated.
          next_page = BeautifulSoup(utils.download(next_url))
          for link in next_page.select('li.pager-item a'):
            yield urljoin(BASE_URL, link['href'])
Example #11
def root():
    fp = request.fullpath

    try:
        numpkgs = len(list(packages()))
    except:
        numpkgs = 0

    return """<html><head><title>Welcome to pypiserver!</title></head><body>
<h1>Welcome to pypiserver!</h1>
<p>This is a PyPI compatible package index serving %(NUMPKGS)s packages.</p>

<p> To use this server with pip, run the following command:
<blockquote><pre>
pip install -i %(URL)ssimple/ PACKAGE [PACKAGE2...]
</pre></blockquote></p>

<p> To use this server with easy_install, run the following command:
<blockquote><pre>
easy_install -i %(URL)ssimple/ PACKAGE
</pre></blockquote></p>

<p>The complete list of all packages can be found <a href="%(PACKAGES)s">here</a> or via the <a href="%(SIMPLE)s">simple</a> index.</p>

<p>This instance is running version %(VERSION)s of the <a href="http://pypi.python.org/pypi/pypiserver">pypiserver</a> software.</p>
</body></html>
""" % dict(URL=request.url, VERSION=__version__, NUMPKGS=numpkgs,
           PACKAGES=urljoin(fp, "packages/"),
           SIMPLE=urljoin(fp, "simple/"))
Example #12
    def _crawl(self):
        uri = urljoin(self.__uri, self.__next)
        self.__class__._log("debug", "%s crawls url: %s" % (self.__class__.__name__, uri))

        (page, base, _) = self.__class__._fetch_remote_html(uri)
        if not page:
            self.__class__._log("debug", "%s crawled EMPTY url: %s" % (self.__class__.__name__, uri))
            return

        # get more content ("scroll down")
        # to know what page to parse next
        # update new last URI when we're not on first run
        _next = None
        _more = page.find("div", {"id": "more_loading"})
        if _more:
            _more = _more.find("a", {"href": True})
            if _more:
                _next = urljoin(base, _more["href"])
        if _next:
            self.__next = _next
        else:
            self.__class__._log("debug", "%s found no `next` on url: %s" % (self.__class__.__name__, uri))

        # for every found imageContainer
        # add img-src to map if not blacklisted
        images_added = 0
        for con in page.find_all("div", {"class": "imagecontainer"}):
            image = con.find("img", {"src": True})
            if image:
                if self._add_image(urljoin(base, image["src"]), self.__site):
                    images_added += 1

        if not images_added:
            self.__class__._log("debug", "%s found no images on url: %s" % (self.__class__.__name__, uri))
Example #13
 def test_entry_feed_enclosure(self):
     entry = self.create_published_entry()
     feed = EntryFeed()
     self.assertEquals(
         feed.item_enclosure_url(entry), 'http://example.com/image.jpg')
     self.assertEquals(feed.item_enclosure_length(entry), '100000')
     self.assertEquals(feed.item_enclosure_mime_type(entry), 'image/jpeg')
     entry.content = 'My test content with image <img src="image.jpg" />'
     entry.save()
     self.assertEquals(
         feed.item_enclosure_url(entry), 'http://example.com/image.jpg')
     self.assertEquals(feed.item_enclosure_length(entry), '100000')
     self.assertEquals(feed.item_enclosure_mime_type(entry), 'image/jpeg')
     entry.content = 'My test content with image ' \
                     '<img src="http://test.com/image.jpg" />'
     entry.save()
     self.assertEquals(
         feed.item_enclosure_url(entry), 'http://test.com/image.jpg')
     self.assertEquals(feed.item_enclosure_length(entry), '100000')
     self.assertEquals(feed.item_enclosure_mime_type(entry), 'image/jpeg')
     path = default_storage.save('enclosure.png', ContentFile('Content'))
     entry.image = path
     entry.save()
     self.assertEquals(feed.item_enclosure_url(entry),
                       urljoin('http://example.com', entry.image.url))
     self.assertEquals(feed.item_enclosure_length(entry), '7')
     self.assertEquals(feed.item_enclosure_mime_type(entry), 'image/png')
     default_storage.delete(path)
     entry.image = 'invalid_image_without_extension'
     entry.save()
     self.assertEquals(feed.item_enclosure_url(entry),
                       urljoin('http://example.com', entry.image.url))
     self.assertEquals(feed.item_enclosure_length(entry), '100000')
     self.assertEquals(feed.item_enclosure_mime_type(entry), 'image/jpeg')
Example #14
def parse_repomd(repo, baseurl):
    url = urljoin(baseurl, 'repodata/repomd.xml')
    repomd = requests.get(url)
    if repomd.status_code != requests.codes.ok:
        return False

    ns = {'r': 'http://linux.duke.edu/metadata/repo'}
    root = ET.fromstring(repomd.content)
    primary_element = root.find('.//r:data[@type="primary"]', ns)
    location = primary_element.find('r:location', ns).get('href')
    sha256_expected = primary_element.find('r:checksum[@type="sha256"]', ns).text

    f = tempfile.TemporaryFile()
    f.write(repomd.content)
    f.flush()
    os.lseek(f.fileno(), 0, os.SEEK_SET)
    repo.add_repomdxml(solv.xfopen_fd(None, f.fileno()), 0)
    url = urljoin(baseurl, location)
    with requests.get(url, stream=True) as primary:
        if primary.status_code != requests.codes.ok:
            raise Exception(url + ' does not exist')
        sha256 = hashlib.sha256(primary.content).hexdigest()
        if sha256 != sha256_expected:
            raise Exception('checksums do not match {} != {}'.format(sha256, sha256_expected))

        content = gzip.GzipFile(fileobj=io.BytesIO(primary.content))
        os.lseek(f.fileno(), 0, os.SEEK_SET)
        f.write(content.read())
        f.flush()
        os.lseek(f.fileno(), 0, os.SEEK_SET)
        repo.add_rpmmd(solv.xfopen_fd(None, f.fileno()), None, 0)
        return True

    return False
Example #15
def fake(base_url, username, password, tourney_id):
    url_opener = _utils.login_and_enter_arcade(base_url, username, password)

    # calculate some more URLs
    tourneys_url = urljoin(base_url, "arcade.php?&do=viewtournaments")
    join_tourney_url = urljoin(base_url, "arcade.php?&do=registertourney&tid={0}".format(
        tourney_id
    ))
    #view_tourney_url = urljoin(base_url, "arcade.php?&do=viewtourney&tid={0}".format(
    #    tourney_id
    #))

    # go to tourneys
    print("entering tourneys page")
    tourneys_response = url_opener.open(tourneys_url)
    tourneys_response.read()

    # go to tourney creation form
    print("joining tourney")
    join_tourney_response = url_opener.open(join_tourney_url)
    join_tourney_response.read()

    # look at tourney to make sure it sticks
    #print("looking at tourney")
    #view_tourney_response = url_opener.open(view_tourney_url)
    #view_tourney_response.read()

    print("done")
Example #16
def get_episodes(html, url):
	html = html.replace("\n", "")
	
	js = """
		var output;
		function getCookie() {}
		function getcookie() {}
		var window = {
			open: function(result){
				output = result;
			}
		};
		var document = {
			location: {
				href: ""
			}
		};
	""" + grabhtml(urljoin(url, "/js/comicview.js"))
	
	s = []
	matches = re.finditer(
		r'<a [^>]*?onclick="(cview[^"]+?);[^>]*>(.+?)</a>',
		html, re.M
	)
	with VM(js) as vm:
		for match in matches:
			cview, title = match.groups()
			
			vm.run(cview)
			ep_url = vm.run("output")
			title = clean_tags(title)

			e = Episode(title, urljoin(url, ep_url))
			s.append(e)
	return s
Example #17
def parse_susetags(repo, baseurl):
    url = urljoin(baseurl, 'content')
    content = requests.get(url)
    if content.status_code != requests.codes.ok:
        return False

    f = tempfile.TemporaryFile()
    f.write(content.content)
    f.flush()
    os.lseek(f.fileno(), 0, os.SEEK_SET)
    repo.add_content(solv.xfopen_fd(None, f.fileno()), 0)

    defvendorid = repo.meta.lookup_id(solv.SUSETAGS_DEFAULTVENDOR)
    descrdir = repo.meta.lookup_str(solv.SUSETAGS_DESCRDIR)
    if not descrdir:
        descrdir = 'suse/setup/descr'

    url = urljoin(baseurl, descrdir + '/packages.gz')
    with requests.get(url, stream=True) as packages:
        if packages.status_code != requests.codes.ok:
            raise Exception(url + ' does not exist')

        content = gzip.GzipFile(fileobj=io.BytesIO(packages.content))
        os.lseek(f.fileno(), 0, os.SEEK_SET)
        f.write(content.read())
        f.flush()
        os.lseek(f.fileno(), 0, os.SEEK_SET)
        repo.add_susetags(f, defvendorid, None, solv.Repo.REPO_NO_INTERNALIZE|solv.Repo.SUSETAGS_RECORD_SHARES)
        return True
    return False
Example #18
    def _find_matches(self, match_trees):
        """
        Used by match finders to yield Match objects from the given trees.
        :param match_trees: list of html trees (usually table rows)
        """
        for match in match_trees:
            team1 = ''.join(match.xpath('.//span[contains(@class,"opp1")]//text()')).strip()
            team2 = ''.join(match.xpath('.//span[contains(@class,"opp2")]//text()')).strip()
            team1_bet = ''.join(match.xpath('.//span[contains(@class,"bet1")]//text()')).strip('() \n')
            team2_bet = ''.join(match.xpath('.//span[contains(@class,"bet2")]//text()')).strip('() \n')

            match_url = ''.join(match.xpath('.//a[contains(@class,"match")]/@href')).strip()
            match_id = match_url.rsplit('/', 1)[-1].split('-')[0] if match_url else 'not found'
            match_url = urljoin(self.domain, match_url)
            live_in = ''.join(match.xpath('.//span[contains(@class,"live-in")]/text()')).strip()

            score = match.xpath('.//span[contains(@class,"score-wrap")]//span[contains(@class, "score")]/text()')
            team1_score = score[0] if score else ''
            team2_score = score[1] if len(score) > 1 else ''

            tournament = ''.join(match.xpath('.//a[contains(@class,"tournament")]/@href')).strip()
            tournament = urljoin(self.domain, tournament)

            has_vods = bool(match.xpath('.//span[contains(@class,"vod")]/img'))
            yield Match(self.game, team1, team1_score, team1_bet, team2, team2_score, team2_bet, live_in, tournament,
                        has_vods, match_id, match_url)
Example #19
    def get_img_list(self):
        """ Gets list of images from the page_html. """
        tree = html.fromstring(self.page_html)
        img = tree.xpath('//img/@src')
        links = tree.xpath('//a/@href')
        img_list = self.process_links(img)
        img_links = self.process_links(links)
        img_list.extend(img_links)

        if self.filename_pattern:
            # Compile pattern for efficiency
            pattern = re.compile(self.filename_pattern)

            # Verifies filename in the image URL matches pattern
            def matches_pattern(img_url):
                """ Function to check if pattern is matched. """

                img_filename = urlparse(img_url).path.split('/')[-1]
                return pattern.search(img_filename)

            images = [urljoin(self.url, img_url) for img_url in img_list
                      if matches_pattern(img_url)]
        else:
            images = [urljoin(self.url, img_url) for img_url in img_list]

        images = list(set(images))
        self.images = images
        if self.scrape_reverse:
            self.images.reverse()
        return self.images
Example #20
    def search_film(self, search_query):
        logging.info('Searching film for query: {}'.format(search_query))

        search_url = urljoin(self.site_url, "/search/movies/")
        search_url = urljoin(search_url, quote_plus(search_query))

        search_page = self.fetch_page(search_url)
        pq = PyQuery(search_page)

        dom_search_list = pq(u".list_item")
        film_list = []
        for dom_item in dom_search_list:
            name = pq(dom_item).find('img[border="0"]').show().attr('alt')
            category = "Film"

            film = Media(name=name, category=category)

            # set description
            desc = pq(dom_item).find('.plot').text()
            film.description = re.sub(r'\s', ' ', str(desc))  # remove newlines from description

            film.rating = pq(dom_item).find('span.rank_value').text()

            # set page url
            href = pq(dom_item).find('a.panel').attr('href')
            film.url = urljoin(self.site_url, href)

            # set thumbnail url
            href_thumbnail = pq(dom_item).find('img[border="0"]').show().attr('src')
            film.thumbnail = urljoin(self.site_url, href_thumbnail)

            film_list.append(film)

        return film_list
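A hedged sketch of how the chained joins in search_film resolve; the site URL and query below are placeholders. Note that the leading slash in "/search/movies/" drops whatever path site_url already carries.

from urllib.parse import urljoin, quote_plus

site_url = "https://movies.example.test/en/home"         # hypothetical self.site_url
search_url = urljoin(site_url, "/search/movies/")         # "https://movies.example.test/search/movies/"
print(urljoin(search_url, quote_plus("blade runner")))    # ".../search/movies/blade+runner"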
Example #21
def request_odes_extract(extract, request, url_for, api_key):
    '''
    '''
    env = Environment(loader=PackageLoader(__name__, 'templates'))
    args = dict(
        name = extract.name or extract.wof.name or 'an unnamed place',
        link = urljoin(util.get_base_url(request), url_for('ODES.get_extract', extract_id=extract.id)),
        extracts_link = urljoin(util.get_base_url(request), url_for('ODES.get_extracts')),
        created = extract.created
        )

    email = dict(
        email_subject=env.get_template('email-subject.txt').render(**args),
        email_body_text=env.get_template('email-body.txt').render(**args),
        email_body_html=env.get_template('email-body.html').render(**args)
        )

    params = {key: extract.envelope.bbox[i] for (i, key) in enumerate(('bbox_w', 'bbox_s', 'bbox_e', 'bbox_n'))}
    params.update(email)

    post_url = uritemplate.expand(odes_extracts_url, dict(api_key=api_key))
    resp = requests.post(post_url, data=params)
    oj = resp.json()
    
    if 'error' in oj:
        raise util.KnownUnknown("Error: {}".format(oj['error']))
    elif resp.status_code != 200:
        raise Exception("Bad ODES status code: {}".format(resp.status_code))
    
    return data.ODES(str(oj['id']), status=oj['status'], bbox=oj['bbox'],
                     links=oj.get('download_links', {}),
                     processed_at=(parse_datetime(oj['processed_at']) if oj['processed_at'] else None),
                     created_at=(parse_datetime(oj['created_at']) if oj['created_at'] else None))
Example #22
def main():
    # Specify the seed page
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    # Create a Redis client
    client = Redis(host='1.2.3.4', port=6379, password='******')
    # Set a user agent (otherwise requests will be rejected)
    headers = {'user-agent': 'Baiduspider'}
    # Send a GET request with the requests module, specifying the user agent
    resp = requests.get(seed_url, headers=headers)
    # Create a BeautifulSoup object, using lxml as the parser
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    # Reduce URLs to SHA1 digests (fixed length, more compact)
    hasher_proto = sha1()
    # Find all <a> tags whose href attribute starts with /question
    for a_tag in soup.find_all('a', {'href': href_regex}):
        # Get the href attribute of the <a> tag and build the full URL
        href = a_tag.attrs['href']
        full_url = urljoin(base_url, href)
        # Generate the SHA1 digest of the URL
        hasher = hasher_proto.copy()
        hasher.update(full_url.encode('utf-8'))
        field_key = hasher.hexdigest()
        # If the 'zhihu' Redis hash does not contain this URL digest, fetch the page and cache it
        if not client.hexists('zhihu', field_key):
            html_page = requests.get(full_url, headers=headers).text
            # Serialize and compress the page
            zipped_page = zlib.compress(pickle.dumps(html_page))
            # Store the URL digest and the corresponding page content in the hash
            client.hset('zhihu', field_key, zipped_page)
    # Report how many pages were cached in total
    print('Total %d question pages found.' % client.hlen('zhihu'))
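The caching scheme above boils down to a fixed-length digest key plus a compressed payload; a minimal, Redis-free sketch of that round trip with an arbitrary URL:

import pickle
import zlib
from hashlib import sha1

full_url = "https://www.zhihu.com/question/123456789"     # illustrative
field_key = sha1(full_url.encode('utf-8')).hexdigest()    # fixed-length hash field key
zipped_page = zlib.compress(pickle.dumps("<html>...</html>"))
assert pickle.loads(zlib.decompress(zipped_page)) == "<html>...</html>"
print(field_key, len(zipped_page))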
Example #23
def run(options):
  year_range = inspector.year_range(options, archive)

  # Can limit search to any of the components listed at the top of this script
  component = options.get('component')
  if component and component in components:
    source_links = {}
    link = urljoin(base_url, "%s.htm" % component)
    source_links[link] = components[component]

  # Otherwise, get links to each component's landing page from main page.
  else:
    starting_point = "https://oig.justice.gov/reports/components.htm"
    content = get_content(starting_point)
    source_links = {}
    for c in content:
      links = c.find_all("a")
      for l in links:
        name = l.string
        link = urljoin(base_url, l.get("href"))
        source_links[link] = name

  # For each component's landing page, run the processor over it
  keys = list(source_links.keys())
  keys.sort()

  for link in keys:
    content = get_content(link)
    extract_info(content, source_links[link], year_range)


  logging.info("Found %i reports, for year %i to %i" % (len(list(report.keys())), year_range[0], year_range[-1]))

  for key in list(report.keys()):
    inspector.save_report(report[key])
Example #24
def compose_url(season, year=None, sport=None):
    if year and sport:
        return urljoin(URL, season + '/' + year + '/' + sport)
    elif year:
        return urljoin(URL, season + '/' + year)
    else:
        return urljoin(URL, season)
Example #25
 def getStreamURLs(self):
     time = self.time
     logging.debug("%s: Starting update of streamURL array", threading.current_thread().name)
     for i in range(0, self.length):
         if re.findall(r"(^.*Helios-HSS.*$)", self.playlist.getPlaylistUrl()):
             url = urljoin(
                 self.baseUrl,
                 "IRDETO-HSS-H/QualityLevels("
                 + str(self.qualityLevels)
                 + ")/Fragments(video="
                 + str(int(time))
                 + ")",
             )
             # print(self.baseUrl, "IS Helios VOD")
         elif re.findall(r"(^.*\.vod.*$)", self.baseUrl):
             url = urljoin(
                 self.baseUrl,
                 "IRDETO-HSS-O/QualityLevels("
                 + str(self.qualityLevels)
                 + ")/Fragments(video="
                 + str(int(time))
                 + ")",
             )
             # print(self.baseUrl, "IS Orion VOD")
         else:
             url = urljoin(
                 self.baseUrl,
                 "QualityLevels(" + str(self.qualityLevels) + ")/Fragments(video=" + str(int(time)) + ")",
             )
             # print(self.baseUrl, "IS LIVE")
         self.streamUrls.append(url)
         time = time + int(self.deltaArray[i])
         # print(self.streamUrls[i], 'index : ', i)
     logging.debug("%s: Completed updating streamURL array", threading.current_thread().name)
     return self
Example #26
def get_ENCODE(obj_id, connection, frame="object"):
    '''GET an ENCODE object as JSON and return as dict'''
    if frame is None:
        if '?' in obj_id:
            url = urljoin(connection.server, obj_id+'&limit=all')
        else:
            url = urljoin(connection.server, obj_id+'?limit=all')
    elif '?' in obj_id:
        url = urljoin(connection.server, obj_id+'&limit=all&frame='+frame)
    else:
        url = urljoin(connection.server, obj_id+'?limit=all&frame='+frame)
    logging.debug('GET %s' % (url))
    response = requests.get(url, auth=connection.auth, headers=connection.headers)
    logging.debug('GET RESPONSE code %s' % (response.status_code))
    try:
        if response.json():
            logging.debug('GET RESPONSE JSON: %s' % (json.dumps(response.json(), indent=4, separators=(',', ': '))))
    except:
        logging.debug('GET RESPONSE text %s' % (response.text))
    if not response.status_code == 200:
        if response.json().get("notification"):
            logging.warning('%s' % (response.json().get("notification")))
        else:
            logging.warning('GET failure.  Response code = %s' % (response.text))
    return response.json()
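urljoin keeps the path and query string of its second argument, which is why get_ENCODE can append '?limit=all' or '&limit=all' before joining. A hedged illustration with a made-up server and accession:

from urllib.parse import urljoin

server = "https://www.encodeproject.org"   # hypothetical connection.server
obj_id = "/biosamples/ENCBS000AAA/"        # hypothetical object id
print(urljoin(server, obj_id + '?limit=all&frame=object'))
# "https://www.encodeproject.org/biosamples/ENCBS000AAA/?limit=all&frame=object"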
Example #27
def adaptionset(element, url, baseurl=None, offset_sec=None, duration_sec=None):
    streams = {}

    dirname = os.path.dirname(url) + "/"
    if baseurl:
        dirname = urljoin(dirname, baseurl)

    template = element[0].find("{urn:mpeg:dash:schema:mpd:2011}SegmentTemplate")
    represtation = element[0].findall(".//{urn:mpeg:dash:schema:mpd:2011}Representation")

    for i in represtation:
        files = []
        segments = False
        filename = dirname
        bitrate = int(i.attrib["bandwidth"]) / 1000
        idnumber = i.attrib["id"]

        if i.find("{urn:mpeg:dash:schema:mpd:2011}BaseURL") is not None:
            filename = urljoin(filename, i.find("{urn:mpeg:dash:schema:mpd:2011}BaseURL").text)

        if i.find("{urn:mpeg:dash:schema:mpd:2011}SegmentBase") is not None:
            segments = True
            files.append(filename)
        if template is not None:
            segments = True
            files = templateelemt(template, filename, idnumber, offset_sec, duration_sec)
        elif i.find("{urn:mpeg:dash:schema:mpd:2011}SegmentTemplate") is not None:
            segments = True
            files = templateelemt(i.find("{urn:mpeg:dash:schema:mpd:2011}SegmentTemplate"), filename, idnumber, offset_sec, duration_sec)

        if files:
            streams[bitrate] = {"segments": segments, "files": files}

    return streams
Example #28
def MyParser(url,index):
    global links,A,num
    if (not IsInTheList(url, links)) and (len(links) <= num) and Is_ntut_web(url):
        try:
            soup = BeautifulSoup(urlopen(url), "lxml")
            result = soup.find("meta",attrs={"http-equiv":"refresh"})
            meta = str(soup.html.head.meta)
            if result:
                links.append(url)
                wait,text=result["content"].split(";")
                if text.lower().startswith("url="):
                    pice=text[4:]
                    tempUrl = urljoin('http://www.ntut.edu.tw',pice)
                    print(url)
                    MyParser(tempUrl,FindIndex(url,links))
                    if index != FindIndex(url,links):
                        A[FindIndex(url,links),index]=1
            elif meta.find('text/html;') >= 0:
                links.append(url)
                for link in soup.findAll('a'):
                    #print(A[:,0])
                    tempUrl = link.get('href')
                    tempUrl = urljoin("http://www.ntut.edu.tw",tempUrl)
                    MyParser(tempUrl,FindIndex(url,links))
                    if index != FindIndex(url,links):
                        A[FindIndex(url,links),index]=1
        except:
            pass
    elif IsInTheList(url, links) and (len(links) <= num+1):
        if index != FindIndex(url,links):
            A[FindIndex(url,links),index]=1
Example #29
def parse_homework(words):
    n, gist, id, time = words
    dirname = os.path.join(OUTPUT, 'homework', n)
    name = id
    url = 'http://nbviewer.ipython.org/%s' % gist
    text = infopen(url)
    if text is None:
        url = 'http://gist.github.com/%s' % gist
        text = infopen(url)
        assert text is not None
        soup = BS(text)
        a = soup.find('a', title='View Raw')
        assert a is not None
        content = infopen(urljoin(url, a['href']))
        assert content is not None
        good = False
    else:
        soup = BS(text)
        a = soup.find('a', text='Download Notebook')
        if a is None:
            content = text
            good = False
        else:
            content = infopen(urljoin(url, a['href']))
            assert content is not None
            good = True
    return Bunch(
        dirname=dirname,
        name=name,
        content=content,
        good=good,
        time=time,
        title='homework %s' % n,
        author=id
    )
Example #30
 def startElementNS(self, name, qname, attrs):
     stack = self.stack
     stack.append(ElementHandler())
     current = self.current
     parent = self.parent
     base = attrs.get(BASE, None)
     if base is not None:
         base, frag = urldefrag(base)
         if parent and parent.base:
             base = urljoin(parent.base, base)
         else:
             systemId = self.locator.getPublicId() \
                 or self.locator.getSystemId()
             if systemId:
                 base = urljoin(systemId, base)
     else:
         if parent:
             base = parent.base
         if base is None:
             systemId = self.locator.getPublicId() \
                 or self.locator.getSystemId()
             if systemId:
                 base, frag = urldefrag(systemId)
     current.base = base
     language = attrs.get(LANG, None)
     if language is None:
         if parent:
             language = parent.language
     current.language = language
     current.start(name, qname, attrs)
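The xml:base handling above composes urldefrag with urljoin; a small standalone sketch of that composition with invented URIs:

from urllib.parse import urldefrag, urljoin

base, frag = urldefrag("http://example.org/data.rdf#section")
print(base)                                                 # "http://example.org/data.rdf"
print(urljoin("http://example.org/dir/doc.rdf", "sub/other.rdf"))
# "http://example.org/dir/sub/other.rdf"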
Example #31
def add_lista_IPs (lista_add_ips): # Adds a dictionary of IPs
    result = connection.post(urljoin(baseurl_main, '/network-list/v2/network-lists/' + uniqID + '/append'), json=lista_add_ips)
    saida_json = json.loads(result.text)
    print(saida_json)
    print(result.content)
Example #32
class MainClass(NexusPHP):
    URL = 'https://u2.dmhy.org/'
    USERNAME_REGEX = '<bdo dir=\'ltr\'>{username}</bdo>'
    SUCCEED_REGEX = '.{0,500}奖励UCoin: <b>\\d+'
    USER_CLASSES = {
        'downloaded': [3298534883328],
        'share_ratio': [4.55],
        'days': [700]
    }

    DATA = {
        'regex_keys': ['<input type="submit" name="(captcha_.*?)" value="(.*?)" />'],
        'req': '<input type="hidden" name="req" value="(.*?)" />',
        'hash': '<input type="hidden" name="hash" value="(.*?)" />',
        'form': '<input type="hidden" name="form" value="(.*?)" />'
    }

    def __init__(self):
        super(NexusPHP, self).__init__()
        self.times = 0

    @classmethod
    def build_sign_in(cls, entry, config):
        site_config = entry['site_config']
        succeed_regex = [cls.USERNAME_REGEX.format(username=site_config.get('username')) + cls.SUCCEED_REGEX,
                         '<a href="showup.php">已[签簽]到</a>']
        entry['url'] = cls.URL
        entry['workflow'] = cls.build_workflow(succeed_regex)
        entry['user_classes'] = cls.USER_CLASSES
        site_config.setdefault('ocr_config', {})
        ocr_config = site_config.get('ocr_config')
        ocr_config.setdefault('retry', _RETRY)
        ocr_config.setdefault('char_count', _CHAR_COUNT)
        ocr_config.setdefault('score', _SCORE)

        entry['headers'] = {
            'cookie': site_config.get('cookie'),
            'user-agent': config.get('user-agent'),
            'referer': entry['url']
        }

    @classmethod
    def build_workflow(cls, succeed_regex):
        return [
            Work(
                url='/showup.php?action=show',
                method='get',
                succeed_regex=succeed_regex,
                check_state=('sign_in', SignState.NO_SIGN_IN),
                is_base_content=True
            ),
            Work(
                url='/showup.php?action=show',
                method='anime',
                data=cls.DATA,
                check_state=('network', NetworkState.SUCCEED),
                img_regex='image\\.php\\?action=adbc2&req=.+?(?=&imagehash)',
                reload_regex='image\\.php\\?action=reload_adbc2&div=showup&rand=\\d+'
            ),
            Work(
                url='/showup.php?action=show',
                method='get',
                succeed_regex=succeed_regex,
                fail_regex='这是一个杯具。<br />验证码已过期。',
                check_state=('final', SignState.SUCCEED)
            ),

        ]

    def sign_in_by_anime(self, entry, config, work, last_content):
        if not fuzz or not process:
            entry.fail_with_prefix('Dependency does not exist: [fuzzywuzzy]')
            return

        ocr_config = entry['site_config'].get('ocr_config')
        data = self.build_data(entry, config, work, last_content, ocr_config)
        if not data:
            entry.fail_with_prefix('Can not build_data')
            return
        logger.info(data)
        return self._request(entry, 'post', work.url, data=data)

    def build_data(self, entry, config, work, base_content, ocr_config):
        if entry.failed:
            return None
        img_url_match = re.search(work.img_regex, base_content)
        if not img_url_match:
            entry.fail_with_prefix('Can not found img_url')
            return None
        img_url = img_url_match.group()
        logger.info('attempts: {} / {}, url: {}', self.times, ocr_config.get('retry'), urljoin(entry['url'], img_url))
        data = {}
        found = False
        if images := self.get_image(entry, config, img_url, ocr_config.get('char_count')):
            image1, image2 = images
            self.save_iamge(image1, 'step3_a_diff.png')
            self.save_iamge(image2, 'step3_b_diff.png')
            ocr_text1 = BaiduOcr.get_jap_ocr(image1, entry, config)
            ocr_text2 = BaiduOcr.get_jap_ocr(image2, entry, config)
            oct_text = ocr_text1 if len(ocr_text1) > len(ocr_text2) else ocr_text2
            logger.debug('jap_ocr: {}', oct_text)
            if oct_text and len(oct_text) > ocr_config.get('char_count'):
                for key, regex in work.data.items():
                    if key == 'regex_keys':
                        for regex_key in regex:
                            regex_key_search = re.findall(regex_key, base_content, re.DOTALL)
                            select = {}
                            ratio_score = 0
                            if regex_key_search:
                                for captcha, value in regex_key_search:
                                    answer_list = list(filter(lambda x2: len(x2) > 0,
                                                              map(lambda x: re.sub('[^\\w]|[a-zA-Z\\d]', '', x),
                                                                  value.split('\n'))))
                                    if answer_list:
                                        split_value, partial_ratio = process.extractOne(oct_text, answer_list,
                                                                                        scorer=fuzz.partial_ratio)
                                    else:
                                        partial_ratio = 0
                                    if partial_ratio > ratio_score:
                                        select = (captcha, value)
                                        ratio_score = partial_ratio
                                    logger.debug('value: {}, ratio: {}', value.replace('\n', '\\'), partial_ratio)
                            else:
                                entry.fail_with_prefix(
                                    'Cannot find regex_key: {}, url: {}'.format(regex_key, work.url))
                                return None
                            if ratio_score and ratio_score > ocr_config.get('score'):
                                captcha, value = select
                                data[captcha] = value
                                found = True
                    else:
                        value_search = re.search(regex, base_content, re.DOTALL)
                        if value_search:
                            data[key] = value_search.group(1)
                        else:
                            entry.fail_with_prefix('Cannot find key: {}, url: {}'.format(key, work.url))
                            return None

        if not found:
            if self.times < ocr_config.get('retry'):
                self.times += 1
                reload_url = re.search(work.reload_regex, base_content).group()
                real_reload_url = urljoin(entry['url'], reload_url)
                reload_response = self._request(entry, 'get', real_reload_url)
                reload__net_state = self.check_network_state(entry, real_reload_url, reload_response)
                if reload__net_state != NetworkState.SUCCEED:
                    return None
                reload_content = NetUtils.decode(reload_response)
                return self.build_data(entry, config, work, reload_content, ocr_config)
            else:
                return None
        site_config = entry['site_config']
        data['message'] = site_config.get('comment')
        return data
Example #33
 def build_data(self, entry, config, work, base_content, ocr_config):
     if entry.failed:
         return None
     img_url_match = re.search(work.img_regex, base_content)
     if not img_url_match:
         entry.fail_with_prefix('Can not found img_url')
         return None
     img_url = img_url_match.group()
     logger.info('attempts: {} / {}, url: {}', self.times, ocr_config.get('retry'), urljoin(entry['url'], img_url))
     data = {}
     found = False
     if images := self.get_image(entry, config, img_url, ocr_config.get('char_count')):
         image1, image2 = images
         self.save_iamge(image1, 'step3_a_diff.png')
         self.save_iamge(image2, 'step3_b_diff.png')
         ocr_text1 = BaiduOcr.get_jap_ocr(image1, entry, config)
         ocr_text2 = BaiduOcr.get_jap_ocr(image2, entry, config)
         oct_text = ocr_text1 if len(ocr_text1) > len(ocr_text2) else ocr_text2
         logger.debug('jap_ocr: {}', oct_text)
         if oct_text and len(oct_text) > ocr_config.get('char_count'):
             for key, regex in work.data.items():
                 if key == 'regex_keys':
                     for regex_key in regex:
                         regex_key_search = re.findall(regex_key, base_content, re.DOTALL)
                         select = {}
                         ratio_score = 0
                         if regex_key_search:
                             for captcha, value in regex_key_search:
                                 answer_list = list(filter(lambda x2: len(x2) > 0,
                                                           map(lambda x: re.sub('[^\\w]|[a-zA-Z\\d]', '', x),
                                                               value.split('\n'))))
                                 if answer_list:
                                     split_value, partial_ratio = process.extractOne(oct_text, answer_list,
                                                                                     scorer=fuzz.partial_ratio)
                                 else:
                                     partial_ratio = 0
                                 if partial_ratio > ratio_score:
                                     select = (captcha, value)
                                     ratio_score = partial_ratio
                                 logger.debug('value: {}, ratio: {}', value.replace('\n', '\\'), partial_ratio)
                         else:
                             entry.fail_with_prefix(
                                 'Cannot find regex_key: {}, url: {}'.format(regex_key, work.url))
                             return None
                         if ratio_score and ratio_score > ocr_config.get('score'):
                             captcha, value = select
                             data[captcha] = value
                             found = True
                 else:
                     value_search = re.search(regex, base_content, re.DOTALL)
                     if value_search:
                         data[key] = value_search.group(1)
                     else:
                         entry.fail_with_prefix('Cannot find key: {}, url: {}'.format(key, work.url))
                         return None
Example #34
"""URL objects

Extends the `url` submodule from the ETLAssist package.
"""
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin

from etlassist.url import *  # pylint: disable=wildcard-import, unused-wildcard-import

EPERMITTING_ACCELA_FTP = "sftp://64.74.214.187:/"
EPERMITTING_ADDRESSES_FTP = "sftp://imd20.cbs.state.or.us:/home/lane_co/"

OEM_FILE_SHARING = "https://upload.oregonem.com/"

RLID = "https://www.rlid.org/"
RLID_MAPS = "https://open.maps.rlid.org/"

RLID_IMAGE_SHARE = urljoin(RLID, "/ImageShare")
RLID_PROPERTY_SEARCH = urljoin(
    RLID, "/property_search/standard.cfm?do=propsearch_standard.reprocess")
Example #35
def get_frame_src(session, baseurl, url, framename):
    result = session.get(urljoin(baseurl, url))
    return extract_frame_src(framename, result)
Example #36
def scrap_biblis_book_lents(account_config: dict):
    with requests.Session() as session:
        # all requests through session now have User-Agent header set
        session.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
        }

        starturl = 'https://biblis.de/FOR/lissy/lissy.ly?pg=bnrlogin'

        topframeurl = get_frame_src(session, starturl, starturl, 'topeframe')
        rightframeurl = get_frame_src(session, starturl, topframeurl,
                                      'toprighteframe')

        rightframe = session.get(urljoin(starturl, rightframeurl))
        tree = html.fromstring(rightframe.text)

        inputs = list(set(tree.xpath("//form//input[@value]/@name")))
        formparams = dict([
            (i,
             str(
                 list(
                     set(
                         tree.xpath("//form//input[@name='{}']/@value".format(
                             i))))[0])) for i in inputs
        ])
        formposturl = str(
            list(set(tree.xpath("//form[@name='form1']/@action")))[0])
        formparams["bnr"] = account_config["user"]
        formparams["gd"] = account_config["password"]

        loggedin = session.post(urljoin(starturl, formposturl),
                                data=formparams)
        topframeurl = extract_frame_src('topframe', loggedin)
        toplefturl = get_frame_src(session, starturl, topframeurl,
                                   'topleftframe')

        menu = session.get(urljoin(starturl, toplefturl))
        tree = html.fromstring(menu.text)
        itemlisturl = list(
            tree.xpath(
                "//td//a[img/@alt='Entliehene Medien anzeigen']/@href"))[0]

        tmp = session.get(urljoin(starturl, itemlisturl))
        tree = html.fromstring(tmp.text)
        script = list(tree.xpath("//head/script"))[0].text

        leftmarker = 'window.location.replace("'
        leftcuturl = script[script.find(leftmarker) + len(leftmarker):]
        listurl = leftcuturl[:leftcuturl.find('"')]
        tmp = session.get(urljoin(starturl, listurl))

        tree = html.fromstring(tmp.text)

        noresult = list(tree.xpath("//font/h3[@align='center']"))
        if len(noresult) == 1 and noresult[
                0].text == '\r\n*** Sie haben zur Zeit keine Medien entliehen! ***\r\n':
            return []

        # rows = list(tree.xpath('//table/tr[td]'))
        rows = list(tree.xpath('//table/tr[count(/td) > 5]'))
        itemslist = [{
            "account":
            account_config,
            "name":
            item.xpath('td[4]')[0].text.replace('\u200b', ''),
            "date":
            datetime.strptime(
                item.xpath('td[5]')[0].text.replace('\u200b', ''),
                '%d.%m.%Y').date(),
            "remarks":
            item.xpath('td[6]')[0].text.replace('\u200b',
                                                '').replace('---', ''),
        } for item in rows]

        return itemslist
Example #37
def del_unique_IP (lista_del_ips): ## Takes a single string, an IP or a block, as its parameter
    result = connection.delete(urljoin(baseurl_main, '/network-list/v2/network-lists/'+ uniqID + '/elements?element='+ lista_del_ips))
    saida_json = json.loads(result.text)
    print(saida_json)
    print(result)
Example #38
# coding=utf-8
from urllib.parse import urljoin

from pulp_smash.constants import PULP_FIXTURES_BASE_URL
from pulp_smash.pulp3.constants import (
    BASE_DISTRIBUTION_PATH,
    BASE_REMOTE_PATH,
    BASE_REPO_PATH,
    BASE_CONTENT_PATH,
)

CONTAINER_MANIFEST_PATH = urljoin(BASE_CONTENT_PATH, "container/manifests/")

CONTAINER_TAG_PATH = urljoin(BASE_CONTENT_PATH, "container/tags/")

CONTAINER_BLOB_PATH = urljoin(BASE_CONTENT_PATH, "container/blobs/")

CONTAINER_CONTENT_NAME = "container.blob"

CONTAINER_DISTRIBUTION_PATH = urljoin(BASE_DISTRIBUTION_PATH,
                                      "container/container/")

CONTAINER_REPO_PATH = urljoin(BASE_REPO_PATH, "container/container/")

CONTAINER_REMOTE_PATH = urljoin(BASE_REMOTE_PATH, "container/container/")

CONTAINER_IMAGE_URL = urljoin(PULP_FIXTURES_BASE_URL,
                              "container/busybox:latest.tar")
"""The URL to a Container image as created by ``docker save``."""

# hello-world is the smallest container image available on Docker Hub (1.84 kB)
Example #39
 def handle_starttag(self, tag, attrs):
     if tag == 'a':
         for(attribute,value) in attrs:
             if attribute == 'href':
                 url = parse.urljoin(self.base_url,value)
                 self.links.add(url)
Example #40
def _local_task(task_args: Dict[str, Any]) -> None:
    requests.post(
        urljoin("http://localhost:5000", task_args["relative_uri"]),
        data=json.dumps(task_args["body"]).encode(),
    )
Example #41
    def test_unpublished(self):
        """Test permissions on an unpublished layer
        """
        thefile = os.path.join(gisdata.VECTOR_DATA,
                               'san_andres_y_providencia_highway.shp')
        layer = file_upload(thefile, overwrite=True)
        layer.set_default_permissions()
        check_layer(layer)

        # we need some time to have the service up and running
        time.sleep(20)

        try:
            # request getCapabilities: layer must be there as it is published and
            # advertised: we need to check if in response there is
            # <Name>geonode:san_andres_y_providencia_water</Name>
            geoserver_base_url = settings.OGC_SERVER['default']['LOCATION']
            get_capabilities_url = 'ows?' \
                'service=wms&version=1.3.0&request=GetCapabilities'
            url = urljoin(geoserver_base_url, get_capabilities_url)
            str_to_check = '<Name>geonode:san_andres_y_providencia_highway</Name>'
            request = Request(url)
            response = urlopen(request)

            # by default the uploaded layer is published
            self.assertTrue(layer.is_published, True)
            self.assertTrue(
                any(str_to_check in ensure_string(s)
                    for s in response.readlines()))
        finally:
            # Clean up and completely delete the layer
            layer.delete()

        # with settings disabled
        with self.settings(RESOURCE_PUBLISHING=True):
            layer = file_upload(thefile,
                                overwrite=True,
                                is_approved=False,
                                is_published=False)
            layer.set_default_permissions()
            check_layer(layer)

            # we need some time to have the service up and running
            time.sleep(20)

            try:
                # by default the uploaded layer must be unpublished
                self.assertEqual(layer.is_published, False)

                # check the layer is not in GetCapabilities
                request = Request(url)
                response = urlopen(request)

                # now test with published layer
                layer = Layer.objects.get(pk=layer.pk)
                layer.is_published = True
                layer.save()

                # we need some time to have the service up and running
                time.sleep(20)

                request = Request(url)
                response = urlopen(request)
                self.assertTrue(
                    any(str_to_check in ensure_string(s)
                        for s in response.readlines()))
            finally:
                # Clean up and completely delete the layer
                layer.delete()
Example #42
 def job_url(self, test_job):
     result_type, tux_project, tux_uid = self.parse_job_id(test_job.job_id)
     tux_group, tux_user = tux_project.split('@')
     endpoint = f'groups/{tux_group}/projects/{tux_user}/{result_type.lower()}s/{tux_uid}'
     return urljoin(self.data.url, endpoint)
Example #43
 def files(self):
     anchors = [p.find('a') for p in self._soup('p', string=re.compile(r'^\s*Click here to download'))]
     files = [File(urljoin(self.url, a['href'])) for a in anchors]
     for file in files:
         file.filename = strictSplitext(self.name)[0] + os.path.splitext(file.filename)[1]
     return files
Example #44
    def receive(cls, xml, c_from, u_to):
        data = cls.as_dict(xml)
        shared = DiasporaPost.get_by_guid(data['root_guid'])
        if not shared:
            # Try to pull it from the Atom feed
            author = DiasporaContact.get_by_username(data['root_diaspora_id'],
                                                     True, True)
            if not author:
                raise TryLater()
            author.import_public_posts()
            shared = DiasporaPost.get_by_guid(data['root_guid'])

        if not shared:
            # Fall back to poking the origin server
            post_url = urljoin(author.server,
                               "/p/{0}.xml".format(data['root_guid']))
            resp = urlopen(post_url, timeout=10)
            current_app.logger.debug(
                'Injecting downloaded message into processing loop')
            process_incoming_message(resp.read(), author.contact, None)
            shared = DiasporaPost.get_by_guid(data['root_guid'])

        if not shared:
            # Failed
            current_app.logger.warning(
                'Could not find post being reshared (with GUID {0})'.format(
                    data['root_guid']))
            raise TryLater()
        shared = shared.post
        created = datetime.strptime(data['created_at'], '%Y-%m-%d %H:%M:%S %Z')
        post = Post(author=c_from, created_at=created)
        share_part = MimePart(type='application/x-pyaspora-share',
                              body=dumps({
                                  'post': {
                                      'id': shared.id
                                  },
                                  'author': {
                                      'id': shared.author_id,
                                      'name': shared.author.realname,
                                  }
                              }).encode('utf-8'),
                              text_preview=u"shared {0}'s post".format(
                                  shared.author.realname))
        post.add_part(share_part, order=0, inline=True)
        order = 0
        for part in shared.parts:
            if part.mime_part.type != 'application/x-pyaspora-share':
                order += 1
                post.add_part(part.mime_part, inline=part.inline, order=order)
        if not post.tags:
            post.tags = shared.tags
        if u_to:
            post.share_with([c_from])
            if u_to.contact.subscribed_to(c_from):
                post.share_with([u_to.contact])
        else:
            post.share_with([c_from], show_on_wall=True)
        post.thread_modified()

        post.diasp = DiasporaPost(guid=data['guid'],
                                  type='limited' if u_to else 'public')
        db.session.add(post)
        db.session.commit()
Example #45
0
 def songs(self):
     table = self._contentSoup.find('table')
     anchors = [tr.find('a') for tr in table('tr') if not tr.find('th')]
     urls = [a['href'] for a in anchors]
     songs = [Song(urljoin(self.url, url)) for url in urls]
     return songs
Example #46
0
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "Notes"
    ])

    for p in page_lines:

        raw_text = unspace(p.get_text())
        pdf = None
        title = None
        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            title = get_title(p)
            #print(pdf)
        else:
            print(p)
            continue

        presentation = None

        authors = has_italic(p)
        try:
            authors = remove_parenthesised(authors)
        except Exception:
            print(authors)
            continue
        authors = namify(authors)
Example #47
0
 def media_url(self):
     return urljoin(settings.MEDIA_URL, self.url)
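This pattern assumes settings.MEDIA_URL ends with '/' and that self.url is a relative path; if the second argument starts with '/', urljoin discards the media prefix. A quick sketch with made-up values:

    from urllib.parse import urljoin

    urljoin('/media/', 'uploads/avatar.png')   # -> '/media/uploads/avatar.png'
    urljoin('/media/', '/uploads/avatar.png')  # -> '/uploads/avatar.png' (prefix lost)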
Example #48
0
 def images(self):
     anchors = self._contentSoup('p')[1]('a')
     urls = [a['href'] for a in anchors]
     images = [File(urljoin(self.url, url)) for url in urls]
     return images
Example #49
0
 def as_uri(self, scheme='file'):
     return urlparse.urljoin(
         str(scheme) + ':', urllib.pathname2url(str(self)))
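The snippet above uses the Python 2 names urlparse.urljoin and urllib.pathname2url. A rough Python 3 equivalent (a sketch, assuming the object's str() form is a filesystem path) would use the relocated helpers:

    from urllib.parse import urljoin
    from urllib.request import pathname2url

    def as_uri(path, scheme='file'):
        # Prefix the scheme onto the percent-escaped path, e.g. 'file:/tmp/data.txt'.
        return urljoin(str(scheme) + ':', pathname2url(str(path)))

    # For plain file URIs, pathlib.Path('/tmp/data.txt').as_uri() gives
    # 'file:///tmp/data.txt' directly.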
Example #50
0
 def __init__(self, soundtrackId):
     self.id = soundtrackId
     self.url = urljoin(BASE_URL, 'game-soundtracks/album/' + self.id)
Example #51
0
def loadfeed(category="faculty", subtype=None):

    # Validate the category and subtype
    category = category.lower().strip()
    if category not in CATEGORIES:
        return None
    if category == "faculty" and subtype is not None:
        subtype = subtype.lower().strip()
        if subtype not in SUBTYPES:
            return None

    # Make the request and retrieve DOM with BeautifulSoup4
    req = requests.get(urljoin(CS_BASE_ENDPOINT, category))
    if not req.ok:
        return
    try:
        dom = bs4.BeautifulSoup(req.content, features="html5lib")
    except bs4.FeatureNotFound:
        dom = bs4.BeautifulSoup(req.content, features="html.parser")
    people_divs = dom.find_all(name="div", attrs={"class": "person"})

    people = []

    # Process each record individually and parse out the data
    for p in people_divs:
        record = {}

        # Extract full name
        fullname_tag = p.find("h2", attrs={"class": "person-name"})
        if fullname_tag.find("a"):
            record["full_name"] = fullname_tag.find("a").text.strip()
        else:
            record["full_name"] = fullname_tag.text.strip()

            if fullname_tag.find("small"):
                maintext = fullname_tag.text
                badtext = fullname_tag.find("small").text
                record["full_name"] = maintext.replace(badtext, "").strip()

        # Process other standard directory information
        record["title"] = p.find("div", attrs={
            "class": "person-title"
        }).text.strip()

        degree_tag = p.find("div", attrs={"class": "person-degree"})
        if degree_tag:
            record["degree"] = degree_tag.text.strip()

        # Process image
        photo_imgtag = p.find("div", attrs={
            "class": "person-photo"
        }).find("img")
        if photo_imgtag and photo_imgtag.has_attr("src"):
            photo_rel_url = photo_imgtag.get("src", None)
            if photo_rel_url:
                record["photo_link"] = urljoin(CS_BASE_ENDPOINT, photo_rel_url)

        # Process email / CS NetID
        raw_address_items = p.find_all("span",
                                       attrs={"class": "person-address-item"})

        for item in raw_address_items:
            if item is None or not item.find("span", "glyphicon"):
                continue

            item_type = list(
                filter(lambda s: "glyphicon-" in s,
                       item.find("span", "glyphicon").get("class", [])))

            text = item.text.strip()

            # Email
            if "glyphicon-envelope" in item_type:

                # Bad hack for bad HTML parsers
                if "&commat" in text:
                    text = text.replace("&commat", "@")

                username = re.sub(r'\W+', '', text.split("@")[0])
                domain = text.split("@")[1].strip().strip(')')
                record["email"] = "{}@{}".format(username, domain)
                record["net_id"] = username

            # Phone
            if "glyphicon-earphone" in item_type:
                record["phone"] = text

            # Address
            if "glyphicon-briefcase" in item_type:
                record["address"] = text

        # Add to processed records
        people.append(record)

    return people
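A short, hedged usage sketch for loadfeed(); CATEGORIES, SUBTYPES and CS_BASE_ENDPOINT are module-level constants not shown in the snippet, so this assumes they are defined as the code implies:

    # Fetch the faculty directory and print basic contact details.
    people = loadfeed("faculty")
    if people:
        for person in people:
            print(person["full_name"], person.get("email", "no email"))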
Example #52
0
output_folder = "output"


# Open and parse page
request = requests.get(url_padron)
page = BeautifulSoup(request.text, 'lxml')


# Find all PDF files links
main_content = page.find(class_='tab-content')
links = main_content.find_all('a', href=re.compile("pdf"))

full_links = []

for link in links:
    full_link = urljoin(url_padron, link['href'])
    full_links.append(full_link)

print("Found " + str(full_links.count) + " links")


# Download each file in the output directory
cwd = os.getcwd()
output_path = cwd + '/' + output_folder
Path(output_path).mkdir(exist_ok=True)

for full_link in full_links:
    print("Downloading " + full_link)
    local_filename = full_link.split('/')[-1]

    with requests.get(full_link, stream=True) as r:
        # the original snippet breaks off here; a plausible completion streams the file to disk
        with open(os.path.join(output_path, local_filename), 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
Example #53
0
 def save(self, *args, **kwargs):
     if not self.id:
         models.Model.save(self, *args, **kwargs)
     self.url = up.urljoin(UM.ARTICLE_INSIDE, '?id={id}'.format(id=self.id))
     models.Model.save(self, *args, **kwargs)
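Here urljoin attaches a query string: because the second argument starts with '?', the path from UM.ARTICLE_INSIDE is kept and only its query component is replaced. A quick illustration with a made-up base URL:

    from urllib.parse import urljoin

    urljoin('https://example.com/article/view', '?id=42')
    # -> 'https://example.com/article/view?id=42'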
Example #54
0
def is_safe_url(target):
    ref_url = urlparse(request.host_url)
    test_url = urlparse(urljoin(request.host_url, target))
    return test_url.scheme in ('http', 'https') and \
           ref_url.netloc == test_url.netloc
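is_safe_url() is the usual guard against open redirects: joining the target onto request.host_url and requiring the same netloc rejects absolute URLs that point to another host. A hedged sketch of how it is typically wired into a Flask view (the route and the 'index' endpoint are illustrative, not taken from the snippet):

    from flask import Flask, abort, redirect, request, url_for

    app = Flask(__name__)

    @app.route('/login', methods=['POST'])
    def login():
        # ... authenticate the user here ...
        target = request.args.get('next')
        if target and not is_safe_url(target):
            abort(400)
        return redirect(target or url_for('index'))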
Example #55
0
    def __init__(self, base_url, login_url=None, useragent=None, debug=False,
                 insecure=False, openid_insecure=False, username=None,
                 session_id=None, session_name='session',
                 openid_session_id=None, openid_session_name='FAS_OPENID',
                 cache_session=True, retries=None, timeout=None):
        """Client for interacting with web services relying on fas_openid auth.

        :arg base_url: Base of every URL used to contact the server
        :kwarg login_url: The url to the login endpoint of the application.
            If none is specified, the default `/login` is used.
        :kwarg useragent: Useragent string to use.  If not given, default to
            "Fedora OpenIdBaseClient/VERSION"
        :kwarg debug: If True, log debug information
        :kwarg insecure: If True, do not check server certificates against
            their CA's.  This means that man-in-the-middle attacks are
            possible against the `BaseClient`. You might turn this option on
            for testing against a local version of a server with a self-signed
            certificate but it should be off in production.
        :kwarg openid_insecure: If True, do not check the openid server
            certificates against their CA's.  This means that man-in-the-
            middle attacks are possible against the `BaseClient`. You might
            turn this option on for testing against a local version of a
            server with a self-signed certificate but it should be off in
            production.
        :kwarg username: Username for establishing authenticated connections
        :kwarg session_id: id of the user's session
        :kwarg session_name: name of the cookie to use with session handling
        :kwarg openid_session_id: id of the user's openid session
        :kwarg openid_session_name: name of the cookie to use with openid
            session handling
        :kwarg cache_session: If set to true, cache the user's session data on
            the filesystem between runs
        :kwarg retries: if we get an unknown or possibly transient error from
            the server, retry this many times.  Setting this to a negative
            number makes it try forever.  Defaults to zero, no retries.
        :kwarg timeout: A float describing the timeout of the connection. The
            timeout only affects the connection process itself, not the
            downloading of the response body. Defaults to 120 seconds.

        """

        # These are also needed by OpenIdProxyClient
        self.useragent = useragent or 'Fedora BaseClient/%(version)s' % {
            'version': __version__}
        self.base_url = base_url
        self.login_url = login_url or urljoin(self.base_url, '/login')
        self.debug = debug
        self.insecure = insecure
        self.openid_insecure = openid_insecure
        self.retries = retries
        self.timeout = timeout
        self.session_name = session_name
        self.openid_session_name = openid_session_name

        # These are specific to OpenIdBaseClient
        self.username = username
        self.cache_session = cache_session

        # Make sure the database for storing the session cookies exists
        if cache_session:
            self._db = self._initialize_session_cache()
            if not self._db:
                self.cache_session = False

        # Session cookie that identifies this user to the application
        self._session_id_map = defaultdict(str)
        if session_id:
            self.session_id = session_id
        if openid_session_id:
            self.openid_session_id = openid_session_id

        # python-requests session.  Holds onto cookies
        self._session = requests.session()
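A minimal instantiation sketch for the client above; the class name OpenIdBaseClient is inferred from the comments, and the base URL, username and retry values are illustrative only:

    client = OpenIdBaseClient(
        base_url='https://apps.example.org/myapp',
        username='alice',
        retries=3,
        timeout=120.0,
    )
    # When login_url is omitted it falls back to urljoin(base_url, '/login'),
    # i.e. 'https://apps.example.org/login' here: the leading '/' resets the path.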
Example #56
0
    def _get_content_hls_url(self, content_id):
        d = self.session.http.get(urljoin(self.url, self.content_api.format(id=content_id)))
        d = self.session.http.json(d, schema=self.content_api_schema)

        return urljoin((d["serviceUrl"] or d["defaultServiceUrl"]), d["securePath"])
Example #57
0
        continue

    cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) )
    cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url ) )
    conn.commit()

    # Retrieve all of the anchor tags
    tags = soup('a')
    count = 0
    for tag in tags:
        href = tag.get('href', None)
        if ( href is None ) : continue
        # Resolve relative references like href="/contact"
        up = urlparse(href)
        if ( len(up.scheme) < 1 ) :
            href = urljoin(url, href)
        ipos = href.find('#')
        if ( ipos > 1 ) : href = href[:ipos]
        if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue
        if ( href.endswith('/') ) : href = href[:-1]
        # print href
        if ( len(href) < 1 ) : continue

        # Check if the URL is in any of the webs
        found = False
        for web in webs:
            if ( href.startswith(web) ) :
                found = True
                break
        if not found : continue
Example #58
0
 def parse_page(self, response):
     for review in response.xpath('//div[@class="news-item__top"]/a/@href').extract():
         review_url = urljoin(response.url, review)
         yield Request(url=review_url, callback=self.parse_news)
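In a Scrapy spider the same join is often written with response.urljoin, which uses the response URL as the base. A hedged equivalent of the loop above:

    from scrapy import Request

    def parse_page(self, response):
        for review in response.xpath('//div[@class="news-item__top"]/a/@href').extract():
            # response.urljoin(review) is shorthand for urljoin(response.url, review)
            yield Request(url=response.urljoin(review), callback=self.parse_news)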
Example #59
0
def get_recipe_detail(recipe_url):
    """从url中获取菜谱详细信息

    :param recipe_url: str
        菜谱url,如:https://www.xinshipu.com/zuofa/598775;
        https://www.xinshipu.com//zuofa/749342
    :return:dict
    """
    response = requests.get(recipe_url, headers=get_header())
    html = BeautifulSoup(response.text, 'lxml')

    # get the dish name
    name = html.find("div", {"class": "re-up"}).h1.text

    # main image
    img = html.find("div", {"class": "gallery"}).a['href']
    img = urljoin(HOME_URL, img)

    all_info = html.find_all("div", {"class": "dd"})

    if len(all_info) == 4:
        # short introduction
        intro = re.sub('\n|\t|\r| ', '', all_info[0].text)
        material_i = 1
        method_i = 2
    else:
        intro = None
        material_i = 0
        method_i = 1

    # ingredients
    material = all_info[material_i].text.strip()
    material = re.sub('\r\n|\r\n \n|\n\n\n', '\n', material)

    # preparation steps
    try:
        method_steps = html.find("ol", {
            "class": "re-step-wpic"
        }).find_all('li')
        method = []
        for i, m in enumerate(method_steps, 1):
            step = dict(step_num=i)
            step['text'] = m.text.strip()
            if m.img:
                step['img_url'] = urljoin(HOME_URL, m.img['src'])
            method.append(step)
    except Exception:
        method = all_info[method_i].text.strip()
        method = re.sub('\r\n|\r\n \n|\n\n\n\n', '\n', method)

    # related dishes / classification
    classify = all_info[-1].text.strip()
    if '\xa0\xa0' in classify:
        classify = classify.replace('\xa0\xa0', ' | ')
    else:
        classify = ""

    return {
        "name": name,
        "url": recipe_url,
        "img": img,
        "intro": intro,
        "material": material,
        "method": method,
        "classify": classify
    }
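A usage sketch based on the example URL in the docstring; HOME_URL and get_header() are module-level helpers that are not shown in the snippet:

    detail = get_recipe_detail("https://www.xinshipu.com/zuofa/598775")
    print(detail["name"], detail["img"])
    if isinstance(detail["method"], list):
        for step in detail["method"]:
            print(step["step_num"], step["text"])
    else:
        print(detail["method"])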
Example #60
0
    def find_url(self, url):
        """find link tag in html
        
            Args:
                url: url take address of website finding the link tag
        """
        #arguments url null check
        if not url:
            log.info("find_url() Line="+str(inspect.currentframe().f_lineno)+" args: url does not exist")
            return

        #arguments url type check
        if type(url) is not str:
            log.info("find_url() Line="+str(inspect.currentframe().f_lineno)+" args: url tpye is not string")
            return

        try:
            url = urllib.parse.quote(url, safe=':/&?=')
            req = Request(url, headers=self.headers)
            site = urlopen(req, timeout=2)
            soup = BeautifulSoup(site.read(),'lxml')
        except Exception as e:
            log.error("find_url() line="+str(inspect.currentframe().f_lineno)+' Error: '+str(e))
            return None
        for link in soup.findAll('a'):
            temp_url = str(link.get('href'))

            allow = self.allowed_url_check(temp_url)
            if not allow:
                continue
            # If the href is a relative address, URLs matching not_allowed_url_check may still get through:
            # e.g. /policy/privacy.html becomes www.naver.com/policy/privacy.html once made absolute,
            # so it is not filtered out; this is not a problem if the start URL is chosen carefully.
            disallow = self.not_allowed_url_check(temp_url)
            if not disallow:
                continue

            # skip the URL if the request target is an executable, zip, rpm, deb or gz file
            
            if 'Content-Type' not in site.headers:
                continue
            
            if 'text/html' not in site.headers['Content-Type']:
                continue
                
            #if re.search('(exe)$|(zip)$|(rpm)$|(gz)$|(deb)$|(txt)$|(csv)$|(pdf)$|(ppt)$', temp_url):
            #    continue

            if 'https' in temp_url:
                # absolute address
                pass
            elif re.match(r'/.+|\.\..+', temp_url):
                # re.match checks for a match at the start of the string; re.search matches anywhere.
                # Convert the relative address into an absolute URL.
                temp_url = urljoin(url, temp_url)
            else:
                # otherwise discard the link
                temp_url = None

            if temp_url and temp_url not in self.visited:
                self.visited.setdefault(temp_url,True)
                self.links.append(temp_url)
            
        return soup  # hand off to a deque so links can be taken from the front with popleft()
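The closing comment suggests driving the crawl with a deque. A small, hedged sketch of that loop, assuming the class above is instantiated as crawler and exposes the visited dict and links list used in find_url():

    from collections import deque

    # Hypothetical breadth-first crawl built around find_url().
    queue = deque(["https://example.com/"])
    while queue:
        current = queue.popleft()
        crawler.find_url(current)
        # Newly discovered links accumulate in crawler.links; move them onto the queue.
        while crawler.links:
            queue.append(crawler.links.pop(0))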