def _send_request(self, method, path, data=None, headers=None, sleep_before_request=0):
    """
    Send an HTTP request to the Solr server and return the body decoded from JSON.

    :param method: HTTP method, one of 'GET', 'POST', 'DELETE', 'PUT'
    :param path: request url path
    :param data: body data to send with the request
    :param headers: headers to send with the request
    :param sleep_before_request: sleep(timeinsec) to allow enough time gap to
        send requests to server; this is to avoid ConnectionError
        "Max retries exceeded with url". Default with no delay.
    :return: response in json format
    :raises SolrError: on connection refusal or non-200/304 status
    """
    # Base server URL: strip the configured core/collection path off solrURL.
    url = self.solrURL.replace(self.path, '')
    sleep(sleep_before_request)
    try:
        if self._auth:
            response = requests.request(method=method, url=urljoin(url, path),
                                        headers=headers, data=data, auth=self._auth)
        else:
            response = requests.request(method=method, url=urljoin(url, path),
                                        headers=headers, data=data)
    except requests.exceptions.ConnectionError:
        self._logger.warning("Connection refused when requesting [%s]", urljoin(url, path))
        raise SolrError("Connection refused.")
    # 304 Not Modified is also treated as success.
    if response.status_code not in (200, 304):
        self._logger.error("failed to send request to [%s]. Reason: [%s]",
                           urljoin(url, path), response.reason)
        # NOTE(review): `headers` here are the *request* headers — looks like
        # the response (or response headers) may have been intended; confirm
        # against _extract_error's signature.
        raise SolrError(self._extract_error(headers, response.reason))
    return response.json()
def test__request(self):
    """HTTP-level failures in send() must surface as SmsAeroHTTPError.

    Covers two failure modes: a 500 status from the gateway and a
    transport-level timeout raised by the HTTP library.
    """
    # Gateway answers with a 500 status.
    httpretty.register_uri(
        httpretty.POST, urljoin(SmsAero.URL_GATE, '/send/'),
        body='{}', status=500,
    )
    # assertRaises replaces the try/except/assertTrue(False) anti-pattern:
    # it fails with a clear message when the exception is NOT raised.
    with self.assertRaises(SmsAeroHTTPError):
        self.api.send('89111111111', 'message')

    def exceptionCallback(request, uri, headers):
        # Simulate a network-level timeout from the transport.
        raise requests.Timeout('Connection timed out.')

    httpretty.register_uri(
        httpretty.POST, urljoin(SmsAero.URL_GATE, '/send/'),
        body=exceptionCallback, status=200, content_type='text/json',
    )
    with self.assertRaises(SmsAeroHTTPError):
        self.api.send('89111111111', 'message')
def __init__(self, package, resource):
    """Build a data object from a datapackage *resource* descriptor.

    :param package: owning package; provides the base ``url`` for relative paths
    :param resource: resource descriptor dict with ``path``/``url``/``data``,
        optional ``name``, ``title``, ``schema`` and ``type``
    :raises NotImplementedError: for embedded ``data`` resources
    :raises MetadataError: when neither ``path`` nor ``url`` is given
    :raises TypeError: when the resolved type is not in OBJECT_TYPES
    """
    self.package = package
    path = resource.get("path")
    if path:
        self.url = urljoin(package.url, path)
    elif "url" in resource:
        self.url = resource["url"]
    elif "data" in resource:
        raise NotImplementedError("Embedded datapackage resource data "
                                  "are not supported")
    else:
        raise MetadataError("No path or url specified in a package "
                            "resource.")
    self.name = resource.get("name")
    self.title = resource.get("title")
    # BUG FIX: a redundant `self.url = urljoin(package.url, path)` used to sit
    # here. For url-only resources path is None, so urljoin returned
    # package.url and silently clobbered the explicit resource "url".
    schema = resource.get("schema")
    if schema:
        fields = schema.get("fields")
        self.fields = schema_to_fields(fields)
    else:
        self.fields = None
    # Fall back to the URL's file extension (without the dot) when no explicit
    # "type" is given.
    self.type = resource.get("type", os.path.splitext(self.url)[1][1:])
    if self.type not in OBJECT_TYPES:
        raise TypeError("Data object type '%s' is not supported in "
                        "datapackage." % self.type)
def post_to_hastebin(data, url="http://hastebin.com/"):
    """Upload *data* to a hastebin instance and return the paste's URL.

    Accepts str (encoded to bytes) or bytes; raises on HTTP errors.
    """
    payload = data.encode() if isinstance(data, str) else data
    reply = requests.post(urljoin(url, "documents"), payload)
    reply.raise_for_status()
    return urljoin(url, reply.json()['key'])
def test_checksending(self):
    """checksending() accepts a valid id and rejects an empty one."""
    # Gateway reports per-message delivery states for a known id.
    httpretty.register_uri(
        httpretty.POST, urljoin(SmsAero.URL_GATE, '/checksending/'),
        body='{"reason": {"33460579": "smsc reject", "33460580": "delivery success"}, "result": "accepted"}',
        status=200, content_type='text/json',
    )
    self.api.checksending(322)
    # Gateway rejects an empty id.
    httpretty.register_uri(
        httpretty.POST, urljoin(SmsAero.URL_GATE, '/checksending/'),
        body='{"reason": "empty field", "result": "reject"}',
        status=200, content_type='text/json',
    )
    # assertRaises replaces the try/except/assertTrue(False) anti-pattern.
    with self.assertRaises(SmsAeroError):
        self.api.checksending('')
def soupIt(self):
    """Fetch self.url, strip navigation links, comments and decorative images,
    and return the cleaned document as prettified HTML.

    Side effects: sets self.title and appends harvested images to self.images.
    """
    http = urllib3.PoolManager()
    r = http.request("GET", self.url)
    # Source pages are Latin-1 encoded.
    soup = BeautifulSoup(r.data.decode('ISO-8859-1'), "lxml")
    self.title = soup.title.string
    # remove unused header parts
    # in comments because of firefox
    # for p in soup(["meta"]):
    #     p.extract()
    # remove comments
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    # remove some images
    unused_images = soup.find_all('img', {'alt': 'bullet'}) \
        + soup.find_all('img', {'src': '../../images/ilmulislam.gif'}) \
        + soup.find_all('img', {'src': '../../images/enzykopf.gif'})
    for i in soup.find_all('img'):
        if i in unused_images:
            i.extract()
    # remove all links, but keep text
    # don't keep text for navigation links that don't lead to "begriffe" or "manuskripte"
    for l in soup.findAll('a'):
        if "begriffe" in urljoin(self.url, l['href']) or "manuskripte" in urljoin(self.url, l['href']):
            l.replaceWith(l.text)
        else:
            l.extract()
    # remove top blocks, but keep any images found inside them
    topBlocks = soup.findAll('td', {'width': '50%'})
    for block in topBlocks:
        if len(block.findChildren('img')):
            self.images += block.findChildren('img')
        block.extract()
    # remove trash tags and empty tags
    for tag in soup.findAll():
        if tag.name == "meta":
            continue
        # layout tags are unwrapped (children kept), not removed
        if tag.name in ("td", "tr", "table", "center", "div", "font", "strong", "b"):
            tag.unwrap()
        if len(tag.text) == 0 or tag.text == '\n' or re.match(r'^\s*$', tag.text) or tag.is_empty_element or tag.isSelfClosing:
            tag.extract()
    # drop leading-newline text nodes; collapse CRLF runs to single spaces
    for l in soup.find_all(text=re.compile('^\n')):
        l.extract()
    for l in soup.find_all(text=re.compile('\r\n')):
        l.replaceWith(" ")
    # append immages (harvested above) to the top of the body
    for i in self.images:
        soup.body.insert(0, i)
    return soup.prettify()
def report_from(result, year_range):
    """Build a report dict for one GAO IG search-result row.

    Returns None when the report's publication year falls outside
    *year_range*; otherwise scrapes the landing page for the PDF/text URLs.
    """
    anchor = result.select("a")[0]
    landing_url = urljoin(REPORTS_URL, anchor.get('href'))
    report_id_node, published_node = result.select("div.release_info")
    published_on = datetime.datetime.strptime(published_node.text, '%b %d, %Y')

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % landing_url)
        return

    report_id = report_id_node.text.strip().replace(",", "")

    logging.debug("Scraping landing url: %s", landing_url)
    landing_page = beautifulsoup_from_url(landing_url)
    summary = landing_page.select("div.left_col")[0].text.strip()
    pdf_link = landing_page.select("#link_bar > a")[0]
    text_link = landing_page.select("#add_material a")[-1]

    return {
        'inspector': 'gao',
        'inspector_url': 'http://www.gao.gov/about/workforce/ig.html',
        'agency': 'gao',
        'agency_name': 'Government Accountability Office',
        'report_id': report_id,
        'url': urljoin(REPORTS_URL, pdf_link.get('href')),
        'text_url': urljoin(REPORTS_URL, text_link.get('href')),
        'landing_url': landing_url,
        'title': anchor.text,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
def setUpClass(cls):
    """Create an RPM repository with a valid feed and sync it.

    Do the following:

    1. Reset Pulp, including the Squid cache.
    2. Create a repository with the "on demand" download policy.
    3. Sync and publish the repository.
    4. Download an RPM from the published repository.
    5. Download the same RPM to ensure it is served by the cache.
    """
    super(OnDemandTestCase, cls).setUpClass()
    # Ensure `locally_stored_units` is 0 before we start.
    utils.reset_squid(cls.cfg)
    utils.reset_pulp(cls.cfg)
    # Create, sync and publish a repository.
    repo = _create_repo(cls.cfg, 'on_demand')
    cls.resources.add(repo['_href'])
    utils.sync_repo(cls.cfg, repo['_href'])
    # Read the repository.
    client = api.Client(cls.cfg)
    cls.repo = client.get(repo['_href'], params={'details': True}).json()
    # Download the same RPM twice; the second fetch should be served from the
    # Squid cache rather than re-downloaded on demand.
    path = urljoin('/pulp/repos/', repo['id'] + '/')
    path = urljoin(path, RPM)
    cls.rpm = client.get(path)
    cls.same_rpm = client.get(path)
def setUpClass(cls):
    """Create an RPM repository with a valid feed and sync it.

    Do the following:

    1. Reset Pulp, including the Squid cache.
    2. Create a repository with the "background" download policy.
    3. Sync and publish the repository.
    4. Download an RPM from the repository.
    """
    super(BackgroundTestCase, cls).setUpClass()
    # Skip on RHEL 6 when the known Pulp issue is still open.
    if (selectors.bug_is_untestable(1905, cls.cfg.version) and
            _os_is_rhel6(cls.cfg)):
        raise unittest.SkipTest('https://pulp.plan.io/issues/1905')

    # Required to ensure content is actually downloaded.
    utils.reset_squid(cls.cfg)
    utils.reset_pulp(cls.cfg)

    # Create, sync and publish a repository.
    repo = _create_repo(cls.cfg, 'background')
    cls.resources.add(repo['_href'])
    report = utils.sync_repo(cls.cfg, repo['_href']).json()

    # Record the tasks spawned when syncing the repository, and the state
    # of the repository itself after the sync.
    client = api.Client(cls.cfg)
    cls.repo = client.get(repo['_href'], params={'details': True}).json()
    cls.tasks = tuple(api.poll_spawned_tasks(cls.cfg, report))

    # Download an RPM.
    path = urljoin('/pulp/repos/', repo['id'] + '/')
    path = urljoin(path, RPM)
    cls.rpm = client.get(path)
def urls_for(self):
    """Yield report-listing page URLs, restricted to the `topics` option
    when it is set; otherwise walk every Calendar Year section in range."""
    only = self.options.get('topics')
    if only: # if only...
        only = set(only.split(','))
        # Map known topic codes to (code, report_type) pairs.
        only = [(o, TOPIC_TO_REPORT_TYPE[o]) if o in TOPIC_TO_REPORT_TYPE
                else o
                for o in only]
        yield from self.urls_for_topics(only)
        # If there are topics selected, ONLY yield URLs for those.
        return

    # First yield the URLs for the topics that are tangential to the main
    # Calendar Year reports.
    yield from self.urls_for_topics(ADDITIONAL_TOPICS)

    # Not getting reports from specific topics, iterate over all Calendar Year
    # reports.
    page = BeautifulSoup(utils.download(BASE_URL))

    # Iterate over each "Calendar Year XXXX" link
    for li in page.select('.field-items li'):
        md = RE_CALENDAR_YEAR.search(li.text)
        if md:
            cur_year = int(md.group(1))
            if cur_year >= self.year_range[0] and cur_year <= self.year_range[-1]:
                href = li.select('a')[0]['href']
                next_url = urljoin(BASE_URL, href)
                # The first page of reports is yielded.
                yield next_url

                # Next, read all the pagination links for the page and yield those. So
                # far, I haven't seen a page that doesn't have all of the following
                # pages enumerated.
                next_page = BeautifulSoup(utils.download(next_url))
                for link in next_page.select('li.pager-item a'):
                    yield urljoin(BASE_URL, link['href'])
def root():
    """Render the pypiserver landing page with the package count and links
    to the package list and the simple index."""
    fp = request.fullpath
    try:
        numpkgs = len(list(packages()))
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate;
    # any failure while listing packages still degrades to a count of 0.
    except Exception:
        numpkgs = 0
    return """<html><head><title>Welcome to pypiserver!</title></head><body>
<h1>Welcome to pypiserver!</h1>
<p>This is a PyPI compatible package index serving %(NUMPKGS)s packages.</p>

<p> To use this server with pip, run the the following command:
<blockquote><pre>
pip install -i %(URL)ssimple/ PACKAGE [PACKAGE2...]
</pre></blockquote></p>

<p> To use this server with easy_install, run the the following command:
<blockquote><pre>
easy_install -i %(URL)ssimple/ PACKAGE
</pre></blockquote></p>

<p>The complete list of all packages can be found <a href="%(PACKAGES)s">here</a> or via the <a href="%(SIMPLE)s">simple</a> index.</p>

<p>This instance is running version %(VERSION)s of the <a href="http://pypi.python.org/pypi/pypiserver">pypiserver</a> software.</p>
</body></html>
""" % dict(URL=request.url, VERSION=__version__, NUMPKGS=numpkgs,
           PACKAGES=urljoin(fp, "packages/"), SIMPLE=urljoin(fp, "simple/"))
def _crawl(self):
    """Crawl the current listing page, remember the next page URL, and add
    every image found to the crawler's map."""
    uri = urljoin(self.__uri, self.__next)
    self.__class__._log("debug", "%s crawls url: %s" % (self.__class__.__name__, uri))
    (page, base, _) = self.__class__._fetch_remote_html(uri)
    if not page:
        self.__class__._log("debug", "%s crawled EMPTY url: %s" % (self.__class__.__name__, uri))
        return
    # get more content ("scroll down")
    # to know what page to parse next
    # update new last URI when we're not on first run
    _next = None
    _more = page.find("div", {"id": "more_loading"})
    if _more:
        _more = _more.find("a", {"href": True})
        if _more:
            _next = urljoin(base, _more["href"])
    if _next:
        self.__next = _next
    else:
        self.__class__._log("debug", "%s found no `next` on url: %s" % (self.__class__.__name__, uri))
    # for every found imageContainer
    # add img-src to map if not blacklisted
    images_added = 0
    for con in page.find_all("div", {"class": "imagecontainer"}):
        image = con.find("img", {"src": True})
        if image:
            # _add_image returns truthy only when the image was accepted
            if self._add_image(urljoin(base, image["src"]), self.__site):
                images_added += 1
    if not images_added:
        self.__class__._log("debug", "%s found no images on url: %s" % (self.__class__.__name__, uri))
def test_entry_feed_enclosure(self):
    """Feed enclosure resolution: default image, inline <img> srcs
    (relative and absolute), stored image file, and invalid image."""
    entry = self.create_published_entry()
    feed = EntryFeed()
    self.assertEquals(
        feed.item_enclosure_url(entry), 'http://example.com/image.jpg')
    self.assertEquals(feed.item_enclosure_length(entry), '100000')
    self.assertEquals(feed.item_enclosure_mime_type(entry), 'image/jpeg')
    # A relative img src in the content is resolved against the site domain.
    entry.content = 'My test content with image <img src="image.jpg" />'
    entry.save()
    self.assertEquals(
        feed.item_enclosure_url(entry), 'http://example.com/image.jpg')
    self.assertEquals(feed.item_enclosure_length(entry), '100000')
    self.assertEquals(feed.item_enclosure_mime_type(entry), 'image/jpeg')
    # An absolute img src is used verbatim.
    entry.content = 'My test content with image ' \
                    '<img src="http://test.com/image.jpg" />'
    entry.save()
    self.assertEquals(
        feed.item_enclosure_url(entry), 'http://test.com/image.jpg')
    self.assertEquals(feed.item_enclosure_length(entry), '100000')
    self.assertEquals(feed.item_enclosure_mime_type(entry), 'image/jpeg')
    # A stored image file takes precedence; length is the real file size.
    path = default_storage.save('enclosure.png', ContentFile('Content'))
    entry.image = path
    entry.save()
    self.assertEquals(feed.item_enclosure_url(entry),
                      urljoin('http://example.com', entry.image.url))
    self.assertEquals(feed.item_enclosure_length(entry), '7')
    self.assertEquals(feed.item_enclosure_mime_type(entry), 'image/png')
    default_storage.delete(path)
    # An image without an extension falls back to the default length/mime.
    entry.image = 'invalid_image_without_extension'
    entry.save()
    self.assertEquals(feed.item_enclosure_url(entry),
                      urljoin('http://example.com', entry.image.url))
    self.assertEquals(feed.item_enclosure_length(entry), '100000')
    self.assertEquals(feed.item_enclosure_mime_type(entry), 'image/jpeg')
def parse_repomd(repo, baseurl):
    """Load an rpm-md repository's metadata into *repo*.

    Fetches repodata/repomd.xml from *baseurl*, feeds it to libsolv, then
    downloads the referenced primary metadata, verifies its sha256 checksum
    and loads it as well.

    :param repo: libsolv Repo object
    :param baseurl: repository base URL
    :return: True on success, False when repomd.xml is not available
    :raises Exception: when the primary file is missing or its checksum
        does not match
    """
    url = urljoin(baseurl, 'repodata/repomd.xml')
    repomd = requests.get(url)
    if repomd.status_code != requests.codes.ok:
        return False
    ns = {'r': 'http://linux.duke.edu/metadata/repo'}
    root = ET.fromstring(repomd.content)
    primary_element = root.find('.//r:data[@type="primary"]', ns)
    location = primary_element.find('r:location', ns).get('href')
    sha256_expected = primary_element.find('r:checksum[@type="sha256"]', ns).text
    # libsolv reads via file descriptors, so stage content in a temp file.
    f = tempfile.TemporaryFile()
    f.write(repomd.content)
    f.flush()
    os.lseek(f.fileno(), 0, os.SEEK_SET)
    repo.add_repomdxml(solv.xfopen_fd(None, f.fileno()), 0)
    url = urljoin(baseurl, location)
    with requests.get(url, stream=True) as primary:
        if primary.status_code != requests.codes.ok:
            raise Exception(url + ' does not exist')
        # Verify the payload against the checksum advertised in repomd.xml.
        sha256 = hashlib.sha256(primary.content).hexdigest()
        if sha256 != sha256_expected:
            raise Exception('checksums do not match {} != {}'.format(sha256, sha256_expected))
        content = gzip.GzipFile(fileobj=io.BytesIO(primary.content))
        # Reuse the temp file for the decompressed primary metadata.
        os.lseek(f.fileno(), 0, os.SEEK_SET)
        f.write(content.read())
        f.flush()
        os.lseek(f.fileno(), 0, os.SEEK_SET)
        repo.add_rpmmd(solv.xfopen_fd(None, f.fileno()), None, 0)
        return True
    # NOTE: an unreachable trailing `return False` (dead code after the
    # unconditional `return True`) was removed.
def fake(base_url, username, password, tourney_id): url_opener = _utils.login_and_enter_arcade(base_url, username, password) # calculate some more URLs tourneys_url = urljoin(base_url, "arcade.php?&do=viewtournaments") join_tourney_url = urljoin(base_url, "arcade.php?&do=registertourney&tid={0}".format( tourney_id )) #view_tourney_url = urljoin(base_url, "arcade.php?&do=viewtourney&tid={0}".format( # tourney_id #)) # go to tourneys print("entering tourneys page") tourneys_response = url_opener.open(tourneys_url) tourneys_response.read() # go to tourney creation form print("joining tourney") join_tourney_response = url_opener.open(join_tourney_url) join_tourney_response.read() # look at tourney to make sure it sticks #print("looking at tourney") #view_tourney_response = url_opener.open(view_tourney_url) #view_tourney_response.read() print("done")
def get_episodes(html, url):
    """Extract Episode objects from a comic page.

    Each episode link carries an onclick="cview(...)" handler; the site's own
    comicview.js is executed in a JS VM with a stubbed browser environment so
    `window.open` captures the real episode URL the handler builds.
    """
    html = html.replace("\n", "")
    # Minimal browser stub prepended to the site's comicview.js.
    js = """
var output;
function getCookie() {}
function getcookie() {}
var window = {
	open: function(result){
		output = result;
	}
};
var document = {
	location: {
		href: ""
	}
};
""" + grabhtml(urljoin(url, "/js/comicview.js"))
    s = []
    matches = re.finditer(
        r'<a [^>]*?onclick="(cview[^"]+?);[^>]*>(.+?)</a>',
        html,
        re.M
    )
    with VM(js) as vm:
        for match in matches:
            cview, title = match.groups()
            # Run the onclick handler; it calls window.open with the URL,
            # which the stub stores in `output`.
            vm.run(cview)
            ep_url = vm.run("output")
            title = clean_tags(title)
            e = Episode(title, urljoin(url, ep_url))
            s.append(e)
    return s
def parse_susetags(repo, baseurl):
    """Load a susetags-format repository's metadata into *repo*.

    Fetches the `content` file from *baseurl*, feeds it to libsolv, then
    downloads and decompresses the packages.gz description file and loads it.

    :param repo: libsolv Repo object
    :param baseurl: repository base URL
    :return: True on success, False when the `content` file is not available
    :raises Exception: when packages.gz is missing
    """
    url = urljoin(baseurl, 'content')
    content = requests.get(url)
    if content.status_code != requests.codes.ok:
        return False
    # libsolv reads via file descriptors, so stage content in a temp file.
    f = tempfile.TemporaryFile()
    f.write(content.content)
    f.flush()
    os.lseek(f.fileno(), 0, os.SEEK_SET)
    repo.add_content(solv.xfopen_fd(None, f.fileno()), 0)
    defvendorid = repo.meta.lookup_id(solv.SUSETAGS_DEFAULTVENDOR)
    descrdir = repo.meta.lookup_str(solv.SUSETAGS_DESCRDIR)
    if not descrdir:
        descrdir = 'suse/setup/descr'
    url = urljoin(baseurl, descrdir + '/packages.gz')
    with requests.get(url, stream=True) as packages:
        if packages.status_code != requests.codes.ok:
            raise Exception(url + ' does not exist')
        content = gzip.GzipFile(fileobj=io.BytesIO(packages.content))
        # Reuse the temp file for the decompressed package descriptions.
        os.lseek(f.fileno(), 0, os.SEEK_SET)
        f.write(content.read())
        f.flush()
        os.lseek(f.fileno(), 0, os.SEEK_SET)
        repo.add_susetags(f, defvendorid, None,
                          solv.Repo.REPO_NO_INTERNALIZE | solv.Repo.SUSETAGS_RECORD_SHARES)
        return True
    # NOTE: an unreachable trailing `return False` (dead code after the
    # unconditional `return True`) was removed.
def _find_matches(self, match_trees):
    """
    Static method used by match finders to find matches from
    :param match_trees: list of html trees (usually table rows)
    """
    for row in match_trees:
        # Opponent names and their betting percentages.
        first_team = ''.join(row.xpath('.//span[contains(@class,"opp1")]//text()')).strip()
        second_team = ''.join(row.xpath('.//span[contains(@class,"opp2")]//text()')).strip()
        first_bet = ''.join(row.xpath('.//span[contains(@class,"bet1")]//text()')).strip('() \n')
        second_bet = ''.join(row.xpath('.//span[contains(@class,"bet2")]//text()')).strip('() \n')
        # Match link: the numeric prefix of the URL's last segment is the id.
        match_url = ''.join(row.xpath('.//a[contains(@class,"match")]/@href')).strip()
        if match_url:
            match_id = match_url.rsplit('/', 1)[-1].split('-')[0]
        else:
            match_id = 'not found'
        match_url = urljoin(self.domain, match_url)
        # Countdown text and current score (when the match is live/finished).
        live_in = ''.join(row.xpath('.//span[contains(@class,"live-in")]/text()')).strip()
        score = row.xpath('.//span[contains(@class,"score-wrap")]//span[contains(@class, "score")]/text()')
        first_score = score[0] if score else ''
        second_score = score[1] if len(score) > 1 else ''
        # Tournament link and VOD availability flag.
        tournament = ''.join(row.xpath('.//a[contains(@class,"tournament")]/@href')).strip()
        tournament = urljoin(self.domain, tournament)
        has_vods = bool(row.xpath('.//span[contains(@class,"vod")]/img'))
        yield Match(self.game, first_team, first_score, first_bet,
                    second_team, second_score, second_bet,
                    live_in, tournament, has_vods, match_id, match_url)
def get_img_list(self):
    """ Gets list of images from the page_html. """
    tree = html.fromstring(self.page_html)
    # Collect candidate URLs from both <img src> and <a href> attributes.
    candidates = self.process_links(tree.xpath('//img/@src'))
    candidates.extend(self.process_links(tree.xpath('//a/@href')))

    if self.filename_pattern:
        # Compile pattern for efficiency
        pattern = re.compile(self.filename_pattern)

        def matches_pattern(img_url):
            """ Function to check if pattern is matched. """
            # Only the filename portion of the URL is tested.
            return pattern.search(urlparse(img_url).path.split('/')[-1])

        resolved = [urljoin(self.url, candidate)
                    for candidate in candidates if matches_pattern(candidate)]
    else:
        resolved = [urljoin(self.url, candidate) for candidate in candidates]

    # De-duplicate (order is not preserved).
    self.images = list(set(resolved))
    if self.scrape_reverse:
        self.images.reverse()
    return self.images
def search_film(self, search_query):
    """Search the site for films matching *search_query*.

    :param search_query: free-text query, URL-quoted before the request
    :return: list of Media objects with name, description, rating, url and
        thumbnail populated
    """
    logging.info('Searching film for query: {}'.format(search_query))
    search_url = urljoin(self.site_url, "/search/movies/")
    search_url = urljoin(search_url, quote_plus(search_query))
    search_page = self.fetch_page(search_url)
    pq = PyQuery(search_page)
    dom_search_list = pq(u".list_item")
    film_list = []
    for dom_item in dom_search_list:
        name = pq(dom_item).find('img[border="0"]').show().attr('alt')
        category = "Film"
        film = Media(name=name, category=category)
        # set description
        desc = pq(dom_item).find('.plot').text()
        # Raw string fixes the invalid '\s' escape (DeprecationWarning on
        # modern Python); collapses all whitespace, removing newlines.
        film.description = re.sub(r'\s', ' ', str(desc))
        film.rating = pq(dom_item).find('span.rank_value').text()
        # set page url
        href = pq(dom_item).find('a.panel').attr('href')
        film.url = urljoin(self.site_url, href)
        # set thumbnail url
        href_thumbnail = pq(dom_item).find('img[border="0"]').show().attr('src')
        film.thumbnail = urljoin(self.site_url, href_thumbnail)
        film_list.append(film)
    return film_list
def request_odes_extract(extract, request, url_for, api_key):
    '''Submit a new extract job to the ODES service.

    Renders notification email subject/bodies from templates, posts the
    bounding box and email fields to the ODES extracts endpoint, and returns
    a data.ODES built from the service's reply.

    Raises util.KnownUnknown when the service reports an error, or a generic
    Exception on any other non-200 status.
    '''
    env = Environment(loader=PackageLoader(__name__, 'templates'))
    args = dict(
        name = extract.name or extract.wof.name or 'an unnamed place',
        link = urljoin(util.get_base_url(request), url_for('ODES.get_extract', extract_id=extract.id)),
        extracts_link = urljoin(util.get_base_url(request), url_for('ODES.get_extracts')),
        created = extract.created
    )
    # Notification email parts rendered from package templates.
    email = dict(
        email_subject=env.get_template('email-subject.txt').render(**args),
        email_body_text=env.get_template('email-body.txt').render(**args),
        email_body_html=env.get_template('email-body.html').render(**args)
    )
    # Bounding-box corners become individual bbox_* POST parameters.
    params = {key: extract.envelope.bbox[i] for (i, key) in enumerate(('bbox_w', 'bbox_s', 'bbox_e', 'bbox_n'))}
    params.update(email)
    post_url = uritemplate.expand(odes_extracts_url, dict(api_key=api_key))
    resp = requests.post(post_url, data=params)
    oj = resp.json()
    if 'error' in oj:
        raise util.KnownUnknown("Error: {}".format(oj['error']))
    elif resp.status_code != 200:
        raise Exception("Bad ODES status code: {}".format(resp.status_code))
    # Timestamps are optional in the reply; parse only when present.
    return data.ODES(str(oj['id']), status=oj['status'], bbox=oj['bbox'],
                     links=oj.get('download_links', {}),
                     processed_at=(parse_datetime(oj['processed_at']) if oj['processed_at'] else None),
                     created_at=(parse_datetime(oj['created_at']) if oj['created_at'] else None))
def main():
    """Crawl Zhihu question pages linked from the explore page and cache
    them (compressed) in a Redis hash keyed by URL SHA1 digest."""
    # Seed page to start the crawl from.
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    # Create the Redis client.
    client = Redis(host='1.2.3.4', port=6379, password='******')
    # Set a user agent (requests are rejected otherwise).
    headers = {'user-agent': 'Baiduspider'}
    # Send the GET request via the requests module with that user agent.
    resp = requests.get(seed_url, headers=headers)
    # Build a BeautifulSoup object using lxml as the parser.
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    # URLs are reduced to SHA1 digests (fixed length, more compact).
    hasher_proto = sha1()
    # Find all <a> tags whose href attribute starts with /question.
    for a_tag in soup.find_all('a', {'href': href_regex}):
        # Get the href attribute and assemble the full URL.
        href = a_tag.attrs['href']
        full_url = urljoin(base_url, href)
        # Generate the SHA1 digest for the URL.
        hasher = hasher_proto.copy()
        hasher.update(full_url.encode('utf-8'))
        field_key = hasher.hexdigest()
        # If the Redis hash 'zhihu' has no entry for this digest,
        # fetch the page and cache it.
        if not client.hexists('zhihu', field_key):
            html_page = requests.get(full_url, headers=headers).text
            # Pickle and compress the page before storing it.
            zipped_page = zlib.compress(pickle.dumps(html_page))
            # Store digest -> compressed page in the hash.
            client.hset('zhihu', field_key, zipped_page)
    # Report how many pages are cached in total.
    print('Total %d question pages found.' % client.hlen('zhihu'))
def run(options): year_range = inspector.year_range(options, archive) # Can limit search to any of the components listed at the top of this script component = options.get('component') if component and component in components: source_links = {} link = urljoin(base_url, "%s.htm" % component) source_links[link] = components[component] # Otherwise, get links to each component's landing page from main page. else: starting_point = "https://oig.justice.gov/reports/components.htm" content = get_content(starting_point) source_links = {} for c in content: links = c.find_all("a") for l in links: name = l.string link = urljoin(base_url, l.get("href")) source_links[link] = name # For each component's landing page, run the processor over it keys = list(source_links.keys()) keys.sort() for link in keys: content = get_content(link) extract_info(content, source_links[link], year_range) logging.info("Found %i reports, for year %i to %i" % (len(list(report.keys())), year_range[0], year_range[-1])) for key in list(report.keys()): inspector.save_report(report[key])
def compose_url(season, year=None, sport=None):
    """Join URL with season[/year[/sport]].

    *sport* is only appended when *year* is also given, matching the
    original branch structure.
    """
    segments = [season]
    if year:
        segments.append(year)
        if sport:
            segments.append(sport)
    return urljoin(URL, '/'.join(segments))
def getStreamURLs(self):
    """Populate self.streamUrls with one fragment URL per index, advancing
    the fragment timestamp by the per-fragment deltas; returns self."""
    time = self.time
    logging.debug("%s: Starting update of streamURL array", threading.current_thread().name)
    for i in range(0, self.length):
        # Helios VOD streams use the IRDETO-HSS-H path prefix.
        if re.findall(r"(^.*Helios-HSS.*$)", self.playlist.getPlaylistUrl()):
            url = urljoin(
                self.baseUrl,
                "IRDETO-HSS-H/QualityLevels(" + str(self.qualityLevels) + ")/Fragments(video=" + str(int(time)) + ")",
            )
            # print(self.baseUrl, "IS Helios VOD")
        # Orion VOD streams use the IRDETO-HSS-O path prefix.
        elif re.findall(r"(^.*\.vod.*$)", self.baseUrl):
            url = urljoin(
                self.baseUrl,
                "IRDETO-HSS-O/QualityLevels(" + str(self.qualityLevels) + ")/Fragments(video=" + str(int(time)) + ")",
            )
            # print(self.baseUrl, "IS Orion VOD")
        else:
            # Live streams address fragments directly under the base URL.
            url = urljoin(
                self.baseUrl,
                "QualityLevels(" + str(self.qualityLevels) + ")/Fragments(video=" + str(int(time)) + ")",
            )
            # print(self.baseUrl, "IS LIVE")
        self.streamUrls.append(url)
        # Advance to the next fragment's timestamp.
        time = time + int(self.deltaArray[i])
        # print(self.streamUrls[i], 'index : ', i)
    logging.debug("%s: Completed updating streamURL array", threading.current_thread().name)
    return self
def get_ENCODE(obj_id, connection, frame="object"):
    '''GET an ENCODE object as JSON and return as dict

    :param obj_id: object identifier or query path (may already contain '?')
    :param connection: provides .server, .auth and .headers
    :param frame: JSON-LD frame to request; None omits the frame parameter
    :return: decoded JSON response (also logged at debug level)
    '''
    # Build the query URL, appending limit/frame with the right separator.
    if frame is None:
        if '?' in obj_id:
            url = urljoin(connection.server, obj_id+'&limit=all')
        else:
            url = urljoin(connection.server, obj_id+'?limit=all')
    elif '?' in obj_id:
        url = urljoin(connection.server, obj_id+'&limit=all&frame='+frame)
    else:
        url = urljoin(connection.server, obj_id+'?limit=all&frame='+frame)
    logging.debug('GET %s' % (url))
    response = requests.get(url, auth=connection.auth, headers=connection.headers)
    logging.debug('GET RESPONSE code %s' % (response.status_code))
    try:
        if response.json():
            logging.debug('GET RESPONSE JSON: %s' % (json.dumps(response.json(), indent=4, separators=(',', ': '))))
    # Narrowed from a bare `except:`: .json() raises ValueError (or its
    # JSONDecodeError subclass) when the body is not JSON.
    except ValueError:
        logging.debug('GET RESPONSE text %s' % (response.text))
    # `!= 200` replaces the awkward `not ... == 200` comparison.
    if response.status_code != 200:
        if response.json().get("notification"):
            logging.warning('%s' % (response.json().get("notification")))
        else:
            logging.warning('GET failure. Response code = %s' % (response.text))
    return response.json()
def adaptionset(element, url, baseurl=None, offset_sec=None, duration_sec=None):
    """Parse a DASH AdaptationSet into {bitrate_kbps: {"segments": bool, "files": [...]}}.

    :param element: list whose first item is the AdaptationSet XML element
    :param url: manifest URL; its directory is the base for relative paths
    :param baseurl: optional BaseURL to resolve against the manifest directory
    :param offset_sec: start offset passed through to template expansion
    :param duration_sec: duration passed through to template expansion
    """
    streams = {}
    dirname = os.path.dirname(url) + "/"
    if baseurl:
        dirname = urljoin(dirname, baseurl)
    # SegmentTemplate at the AdaptationSet level applies to all representations.
    template = element[0].find("{urn:mpeg:dash:schema:mpd:2011}SegmentTemplate")
    represtation = element[0].findall(".//{urn:mpeg:dash:schema:mpd:2011}Representation")
    for i in represtation:
        files = []
        segments = False
        filename = dirname
        # bandwidth is in bits/s; stream keys are kbit/s.
        bitrate = int(i.attrib["bandwidth"]) / 1000
        idnumber = i.attrib["id"]
        if i.find("{urn:mpeg:dash:schema:mpd:2011}BaseURL") is not None:
            filename = urljoin(filename, i.find("{urn:mpeg:dash:schema:mpd:2011}BaseURL").text)
        # SegmentBase: the single file itself is the segment.
        if i.find("{urn:mpeg:dash:schema:mpd:2011}SegmentBase") is not None:
            segments = True
            files.append(filename)
        # A template (set-level, or per-representation) expands to many files.
        if template is not None:
            segments = True
            files = templateelemt(template, filename, idnumber, offset_sec, duration_sec)
        elif i.find("{urn:mpeg:dash:schema:mpd:2011}SegmentTemplate") is not None:
            segments = True
            files = templateelemt(i.find("{urn:mpeg:dash:schema:mpd:2011}SegmentTemplate"), filename, idnumber, offset_sec, duration_sec)
        if files:
            streams[bitrate] = {"segments": segments, "files": files}
    return streams
def MyParser(url, index):
    """Recursively crawl NTUT pages, recording the link graph.

    Appends newly seen URLs to the global `links` list (capped at `num`)
    and sets A[target, source] = 1 in the global adjacency matrix. Follows
    both meta-refresh redirects and <a href> links.
    """
    global links, A, num
    if (not IsInTheList(url, links)) and (len(links) <= num) and Is_ntut_web(url):
        try:
            soup = BeautifulSoup(urlopen(url), "lxml")
            result = soup.find("meta", attrs={"http-equiv": "refresh"})
            meta = str(soup.html.head.meta)
            if result:
                # Page is a meta-refresh redirect: follow its target URL.
                links.append(url)
                wait, text = result["content"].split(";")
                if text.lower().startswith("url="):
                    pice = text[4:]
                    tempUrl = urljoin('http://www.ntut.edu.tw', pice)
                    print(url)
                    MyParser(tempUrl, FindIndex(url, links))
                    if index != FindIndex(url, links):
                        A[FindIndex(url, links), index] = 1
            elif meta.find('text/html;') >= 0:
                # Regular HTML page: recurse into every anchor.
                links.append(url)
                for link in soup.findAll('a'):
                    # print(A[:,0])
                    tempUrl = link.get('href')
                    tempUrl = urljoin("http://www.ntut.edu.tw", tempUrl)
                    MyParser(tempUrl, FindIndex(url, links))
                    if index != FindIndex(url, links):
                        A[FindIndex(url, links), index] = 1
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit can
        # still stop the crawl; fetch/parse failures stay best-effort ignored.
        except Exception:
            pass
    elif IsInTheList(url, links) and (len(links) <= num + 1):
        # Already-visited page: just record the edge.
        if index != FindIndex(url, links):
            A[FindIndex(url, links), index] = 1
def parse_homework(words):
    """Resolve a homework row (n, gist, id, time) to its notebook content.

    Tries nbviewer first; falls back to fetching the raw gist. `good` marks
    whether a proper "Download Notebook" link was found on nbviewer.
    Returns a Bunch with dirname/name/content/good/time/title/author.
    """
    n, gist, id, time = words
    dirname = os.path.join(OUTPUT, 'homework', n)
    name = id
    url = 'http://nbviewer.ipython.org/%s' % gist
    text = infopen(url)
    if text is None:
        # nbviewer unavailable: scrape the raw file link off the gist page.
        url = 'http://gist.github.com/%s' % gist
        text = infopen(url)
        assert text is not None
        soup = BS(text)
        a = soup.find('a', title='View Raw')
        assert a is not None
        content = infopen(urljoin(url, a['href']))
        assert content is not None
        good = False
    else:
        soup = BS(text)
        a = soup.find('a', text='Download Notebook')
        if a is None:
            # No download link: keep the rendered page itself.
            content = text
            good = False
        else:
            content = infopen(urljoin(url, a['href']))
            assert content is not None
            good = True
    return Bunch(
        dirname=dirname,
        name=name,
        content=content,
        good=good,
        time=time,
        title='homework %s' % n,
        author=id
    )
def startElementNS(self, name, qname, attrs):
    """SAX start-element hook: push a fresh ElementHandler and resolve the
    element's effective xml:base and xml:lang (inheriting from the parent
    or the document's system/public id when absent)."""
    stack = self.stack
    stack.append(ElementHandler())
    current = self.current
    parent = self.parent
    base = attrs.get(BASE, None)
    if base is not None:
        # Explicit xml:base: strip any fragment, then resolve it against the
        # parent's base (or the document locator when there is no parent base).
        base, frag = urldefrag(base)
        if parent and parent.base:
            base = urljoin(parent.base, base)
        else:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base = urljoin(systemId, base)
    else:
        # No xml:base: inherit from the parent, else fall back to the
        # document's system/public id (fragment stripped).
        if parent:
            base = parent.base
        if base is None:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base, frag = urldefrag(systemId)
    current.base = base
    language = attrs.get(LANG, None)
    # xml:lang inherits from the parent when not set on this element.
    if language is None:
        if parent:
            language = parent.language
    current.language = language
    current.start(name, qname, attrs)
def add_lista_IPs(lista_add_ips):
    """Append a dict of IPs to the configured Akamai network list and print
    the decoded reply."""
    append_path = '/network-list/v2/network-lists/' + uniqID + '/append'
    response = connection.post(urljoin(baseurl_main, append_path),
                               json=lista_add_ips)
    parsed_reply = json.loads(response.text)
    print(parsed_reply)
    print(response.content)
class MainClass(NexusPHP):
    """Sign-in handler for u2.dmhy.org.

    Drives the showup.php workflow and solves the image captcha by running
    OCR (BaiduOcr) on the two candidate images and fuzzy-matching the result
    (fuzzywuzzy) against the captcha answer buttons scraped from the page.
    """

    URL = 'https://u2.dmhy.org/'
    # Success banner: logged-in username followed by the UCoin reward notice.
    USERNAME_REGEX = '<bdo dir=\'ltr\'>{username}</bdo>'
    SUCCEED_REGEX = '.{0,500}奖励UCoin: <b>\\d+'
    # Thresholds used by user-class promotion checks.
    USER_CLASSES = {
        'downloaded': [3298534883328],
        'share_ratio': [4.55],
        'days': [700]
    }
    # Form fields scraped from the sign-in page. 'regex_keys' patterns yield
    # (captcha_field_name, answer_text) pairs; the rest are hidden inputs.
    DATA = {
        'regex_keys': ['<input type="submit" name="(captcha_.*?)" value="(.*?)" />'],
        'req': '<input type="hidden" name="req" value="(.*?)" />',
        'hash': '<input type="hidden" name="hash" value="(.*?)" />',
        'form': '<input type="hidden" name="form" value="(.*?)" />'
    }

    def __init__(self):
        # NOTE(review): super(NexusPHP, ...) skips NexusPHP.__init__ and calls
        # its parent's — looks deliberate but unusual; confirm.
        super(NexusPHP, self).__init__()
        # OCR attempt counter used by build_data's reload/retry loop.
        self.times = 0

    @classmethod
    def build_sign_in(cls, entry, config):
        """Prepare the entry: URL, workflow, user classes, OCR defaults, headers."""
        site_config = entry['site_config']
        succeed_regex = [cls.USERNAME_REGEX.format(username=site_config.get('username')) + cls.SUCCEED_REGEX,
                         '<a href="showup.php">已[签簽]到</a>']
        entry['url'] = cls.URL
        entry['workflow'] = cls.build_workflow(succeed_regex)
        entry['user_classes'] = cls.USER_CLASSES
        # Fill in OCR defaults without overriding user-supplied values.
        site_config.setdefault('ocr_config', {})
        ocr_config = site_config.get('ocr_config')
        ocr_config.setdefault('retry', _RETRY)
        ocr_config.setdefault('char_count', _CHAR_COUNT)
        ocr_config.setdefault('score', _SCORE)
        entry['headers'] = {
            'cookie': site_config.get('cookie'),
            'user-agent': config.get('user-agent'),
            'referer': entry['url']
        }

    @classmethod
    def build_workflow(cls, succeed_regex):
        """Three steps: check sign-in state, solve the captcha, verify success."""
        return [
            Work(
                url='/showup.php?action=show',
                method='get',
                succeed_regex=succeed_regex,
                check_state=('sign_in', SignState.NO_SIGN_IN),
                is_base_content=True
            ),
            Work(
                url='/showup.php?action=show',
                method='anime',
                data=cls.DATA,
                check_state=('network', NetworkState.SUCCEED),
                img_regex='image\\.php\\?action=adbc2&req=.+?(?=&imagehash)',
                reload_regex='image\\.php\\?action=reload_adbc2&div=showup&rand=\\d+'
            ),
            Work(
                url='/showup.php?action=show',
                method='get',
                succeed_regex=succeed_regex,
                fail_regex='这是一个杯具。<br />验证码已过期。',
                check_state=('final', SignState.SUCCEED)
            ),
        ]

    def sign_in_by_anime(self, entry, config, work, last_content):
        """Custom 'anime' workflow method: build the captcha form and POST it."""
        if not fuzz or not process:
            # fuzzywuzzy is an optional dependency; fail the entry cleanly.
            entry.fail_with_prefix('Dependency does not exist: [fuzzywuzzy]')
            return
        ocr_config = entry['site_config'].get('ocr_config')
        data = self.build_data(entry, config, work, last_content, ocr_config)
        if not data:
            entry.fail_with_prefix('Can not build_data')
            return
        logger.info(data)
        return self._request(entry, 'post', work.url, data=data)

    def build_data(self, entry, config, work, base_content, ocr_config):
        """Assemble the captcha POST data from *base_content*.

        OCRs the captcha image pair, fuzzy-matches the text against the
        answer buttons, and retries with a reloaded captcha (up to
        ocr_config['retry'] times) when no confident match is found.
        Returns the data dict, or None on failure.
        """
        if entry.failed:
            return None
        img_url_match = re.search(work.img_regex, base_content)
        if not img_url_match:
            entry.fail_with_prefix('Can not found img_url')
            return None
        img_url = img_url_match.group()
        logger.info('attempts: {} / {}, url: {}', self.times, ocr_config.get('retry'),
                    urljoin(entry['url'], img_url))
        data = {}
        found = False
        # save_iamge (sic) — misspelled helper name kept as defined elsewhere.
        if images := self.get_image(entry, config, img_url, ocr_config.get('char_count')):
            image1, image2 = images
            self.save_iamge(image1, 'step3_a_diff.png')
            self.save_iamge(image2, 'step3_b_diff.png')
            # OCR both candidate images; keep the longer (richer) result.
            ocr_text1 = BaiduOcr.get_jap_ocr(image1, entry, config)
            ocr_text2 = BaiduOcr.get_jap_ocr(image2, entry, config)
            oct_text = ocr_text1 if len(ocr_text1) > len(ocr_text2) else ocr_text2
            logger.debug('jap_ocr: {}', oct_text)
            # Only trust OCR output above the configured length threshold.
            if oct_text and len(oct_text) > ocr_config.get('char_count'):
                for key, regex in work.data.items():
                    if key == 'regex_keys':
                        for regex_key in regex:
                            regex_key_search = re.findall(regex_key, base_content, re.DOTALL)
                            select = {}
                            ratio_score = 0
                            if regex_key_search:
                                # Pick the answer button whose cleaned text best
                                # matches the OCR output.
                                for captcha, value in regex_key_search:
                                    answer_list = list(
                                        filter(lambda x2: len(x2) > 0,
                                               map(lambda x: re.sub('[^\\w]|[a-zA-Z\\d]', '', x),
                                                   value.split('\n'))))
                                    if answer_list:
                                        split_value, partial_ratio = process.extractOne(
                                            oct_text, answer_list, scorer=fuzz.partial_ratio)
                                    else:
                                        partial_ratio = 0
                                    if partial_ratio > ratio_score:
                                        select = (captcha, value)
                                        ratio_score = partial_ratio
                                    logger.debug('value: {}, ratio: {}',
                                                 value.replace('\n', '\\'), partial_ratio)
                            else:
                                entry.fail_with_prefix(
                                    'Cannot find regex_key: {}, url: {}'.format(regex_key, work.url))
                                return None
                            # Accept the best match only above the score threshold.
                            if ratio_score and ratio_score > ocr_config.get('score'):
                                captcha, value = select
                                data[captcha] = value
                                found = True
                    else:
                        # Plain hidden form fields (req/hash/form).
                        value_search = re.search(regex, base_content, re.DOTALL)
                        if value_search:
                            data[key] = value_search.group(1)
                        else:
                            entry.fail_with_prefix('Cannot find key: {}, url: {}'.format(key, work.url))
                            return None
        if not found:
            # No confident captcha answer: reload a fresh captcha and retry.
            if self.times < ocr_config.get('retry'):
                self.times += 1
                reload_url = re.search(work.reload_regex, base_content).group()
                real_reload_url = urljoin(entry['url'], reload_url)
                reload_response = self._request(entry, 'get', real_reload_url)
                reload__net_state = self.check_network_state(entry, real_reload_url, reload_response)
                if reload__net_state != NetworkState.SUCCEED:
                    return None
                reload_content = NetUtils.decode(reload_response)
                return self.build_data(entry, config, work, reload_content, ocr_config)
            else:
                return None
        site_config = entry['site_config']
        data['message'] = site_config.get('comment')
        return data
def build_data(self, entry, config, work, base_content, ocr_config): if entry.failed: return None img_url_match = re.search(work.img_regex, base_content) if not img_url_match: entry.fail_with_prefix('Can not found img_url') return None img_url = img_url_match.group() logger.info('attempts: {} / {}, url: {}', self.times, ocr_config.get('retry'), urljoin(entry['url'], img_url)) data = {} found = False if images := self.get_image(entry, config, img_url, ocr_config.get('char_count')): image1, image2 = images self.save_iamge(image1, 'step3_a_diff.png') self.save_iamge(image2, 'step3_b_diff.png') ocr_text1 = BaiduOcr.get_jap_ocr(image1, entry, config) ocr_text2 = BaiduOcr.get_jap_ocr(image2, entry, config) oct_text = ocr_text1 if len(ocr_text1) > len(ocr_text2) else ocr_text2 logger.debug('jap_ocr: {}', oct_text) if oct_text and len(oct_text) > ocr_config.get('char_count'): for key, regex in work.data.items(): if key == 'regex_keys': for regex_key in regex: regex_key_search = re.findall(regex_key, base_content, re.DOTALL) select = {} ratio_score = 0 if regex_key_search: for captcha, value in regex_key_search: answer_list = list(filter(lambda x2: len(x2) > 0, map(lambda x: re.sub('[^\\w]|[a-zA-Z\\d]', '', x), value.split('\n')))) if answer_list: split_value, partial_ratio = process.extractOne(oct_text, answer_list, scorer=fuzz.partial_ratio) else: partial_ratio = 0 if partial_ratio > ratio_score: select = (captcha, value) ratio_score = partial_ratio logger.debug('value: {}, ratio: {}', value.replace('\n', '\\'), partial_ratio) else: entry.fail_with_prefix( 'Cannot find regex_key: {}, url: {}'.format(regex_key, work.url)) return None if ratio_score and ratio_score > ocr_config.get('score'): captcha, value = select data[captcha] = value found = True else: value_search = re.search(regex, base_content, re.DOTALL) if value_search: data[key] = value_search.group(1) else: entry.fail_with_prefix('Cannot find key: {}, url: {}'.format(key, work.url)) return None
"""URL objects

Extends the `url` submodule from the ETLAssist package.
"""
try:
    # Python 3 location.
    from urllib.parse import urljoin
except ImportError:
    # Python 2 fallback.
    from urlparse import urljoin

from etlassist.url import *  # pylint: disable=wildcard-import, unused-wildcard-import


# SFTP endpoints for e-permitting data exchange.
EPERMITTING_ACCELA_FTP = "sftp://64.74.214.187:/"
EPERMITTING_ADDRESSES_FTP = "sftp://imd20.cbs.state.or.us:/home/lane_co/"
# HTTPS file sharing and RLID application endpoints.
OEM_FILE_SHARING = "https://upload.oregonem.com/"
RLID = "https://www.rlid.org/"
RLID_MAPS = "https://open.maps.rlid.org/"
# NOTE: a leading "/" path makes urljoin replace any path on the base URL.
RLID_IMAGE_SHARE = urljoin(RLID, "/ImageShare")
RLID_PROPERTY_SEARCH = urljoin(
    RLID, "/property_search/standard.cfm?do=propsearch_standard.reprocess")
def get_frame_src(session, baseurl, url, framename):
    """Fetch ``url`` (resolved against ``baseurl``) through ``session`` and
    return the ``src`` of the frame named ``framename`` in the response."""
    absolute = urljoin(baseurl, url)
    response = session.get(absolute)
    return extract_frame_src(framename, response)
def scrap_biblis_book_lents(account_config: dict):
    """Scrape the currently borrowed media for one biblis.de OPAC account.

    Logs in with ``account_config["user"]`` / ``account_config["password"]``,
    navigates the frame-based UI to the borrowed-media listing, and returns a
    list of dicts with ``account``, ``name``, ``date`` (a ``datetime.date``)
    and ``remarks`` keys.  Returns an empty list when nothing is borrowed.

    NOTE: the parameter annotation was ``dir`` (the builtin function) --
    corrected to ``dict``.
    """
    with requests.Session() as session:
        # all requests through session now have User-Agent header set
        session.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
        }
        starturl = 'https://biblis.de/FOR/lissy/lissy.ly?pg=bnrlogin'
        # Descend through the nested frameset to the login-form frame.
        topframeurl = get_frame_src(session, starturl, starturl, 'topeframe')
        rightframeurl = get_frame_src(session, starturl, topframeurl,
                                      'toprighteframe')
        rightframe = session.get(urljoin(starturl, rightframeurl))
        tree = html.fromstring(rightframe.text)
        # Collect every named input so hidden session fields are re-posted.
        inputs = list(set(tree.xpath("//form//input[@value]/@name")))
        formparams = dict([
            (i, str(
                list(
                    set(
                        tree.xpath("//form//input[@name='{}']/@value".format(
                            i))))[0])) for i in inputs
        ])
        formposturl = str(
            list(set(tree.xpath("//form[@name='form1']/@action")))[0])
        formparams["bnr"] = account_config["user"]
        formparams["gd"] = account_config["password"]
        loggedin = session.post(urljoin(starturl, formposturl),
                                data=formparams)
        # After login, walk the menu frames to the "borrowed media" view.
        topframeurl = extract_frame_src('topframe', loggedin)
        toplefturl = get_frame_src(session, starturl, topframeurl,
                                   'topleftframe')
        menu = session.get(urljoin(starturl, toplefturl))
        tree = html.fromstring(menu.text)
        itemlisturl = list(
            tree.xpath(
                "//td//a[img/@alt='Entliehene Medien anzeigen']/@href"))[0]
        tmp = session.get(urljoin(starturl, itemlisturl))
        tree = html.fromstring(tmp.text)
        # The page redirects via JavaScript; pull the target URL out of the
        # inline window.location.replace("...") call.
        script = list(tree.xpath("//head/script"))[0].text
        leftmarker = 'window.location.replace("'
        leftcuturl = script[script.find(leftmarker) + len(leftmarker):]
        listurl = leftcuturl[:leftcuturl.find('"')]
        tmp = session.get(urljoin(starturl, listurl))
        tree = html.fromstring(tmp.text)
        noresult = list(tree.xpath("//font/h3[@align='center']"))
        if len(noresult) == 1 and noresult[
                0].text == '\r\n*** Sie haben zur Zeit keine Medien entliehen! ***\r\n':
            return []
        # rows = list(tree.xpath('//table/tr[td]'))
        # BUG FIX: the predicate used count(/td) -- an *absolute* location
        # path that counts <td> children of the document root (always 0),
        # so no row ever matched and the function always returned [].
        # The relative path count(td) selects data rows with > 5 cells.
        rows = list(tree.xpath('//table/tr[count(td) > 5]'))
        itemslist = [{
            "account":
            account_config,
            "name":
            item.xpath('td[4]')[0].text.replace('\u200b', ''),
            "date":
            datetime.strptime(
                item.xpath('td[5]')[0].text.replace('\u200b', ''),
                '%d.%m.%Y').date(),
            "remarks":
            item.xpath('td[6]')[0].text.replace('\u200b',
                                                '').replace('---', ''),
        } for item in rows]
        return itemslist
def del_unique_IP(lista_del_ips):
    """Delete one IP or CIDR block from the network list.

    ``lista_del_ips`` is a single IP/block string; the module-level
    ``connection``, ``baseurl_main`` and ``uniqID`` identify the API
    session and the target list.  Prints the JSON response and the raw
    response object.
    """
    endpoint = ('/network-list/v2/network-lists/' + uniqID +
                '/elements?element=' + lista_del_ips)
    result = connection.delete(urljoin(baseurl_main, endpoint))
    saida_json = json.loads(result.text)
    print(saida_json)
    print(result)
# coding=utf-8 from urllib.parse import urljoin from pulp_smash.constants import PULP_FIXTURES_BASE_URL from pulp_smash.pulp3.constants import ( BASE_DISTRIBUTION_PATH, BASE_REMOTE_PATH, BASE_REPO_PATH, BASE_CONTENT_PATH, ) CONTAINER_MANIFEST_PATH = urljoin(BASE_CONTENT_PATH, "container/manifests/") CONTAINER_TAG_PATH = urljoin(BASE_CONTENT_PATH, "container/tags/") CONTAINER_BLOB_PATH = urljoin(BASE_CONTENT_PATH, "container/blobs/") CONTAINER_CONTENT_NAME = "container.blob" CONTAINER_DISTRIBUTION_PATH = urljoin(BASE_DISTRIBUTION_PATH, "container/container/") CONTAINER_REPO_PATH = urljoin(BASE_REPO_PATH, "container/container/") CONTAINER_REMOTE_PATH = urljoin(BASE_REMOTE_PATH, "container/container/") CONTAINER_IMAGE_URL = urljoin(PULP_FIXTURES_BASE_URL, "container/busybox:latest.tar") """The URL to a Container image as created by ``docker save``.""" # hello-world is the smalest container image available on docker hub 1.84kB
def handle_starttag(self, tag, attrs):
    """Collect the absolute target of every <a href=...> tag encountered,
    resolving relative hrefs against the crawler's base URL."""
    if tag != 'a':
        return
    for attribute, value in attrs:
        if attribute == 'href':
            self.links.add(parse.urljoin(self.base_url, value))
def _local_task(task_args: Dict[str, Any]) -> None:
    """POST a task payload to the local development server.

    ``task_args`` must provide ``relative_uri`` (path on localhost:5000)
    and ``body`` (a JSON-serializable payload).
    """
    target = urljoin("http://localhost:5000", task_args["relative_uri"])
    payload = json.dumps(task_args["body"]).encode()
    requests.post(target, data=payload)
def test_unpublished(self):
    """Test permissions on an unpublished layer.

    Uploads the layer twice: first with default settings (published) and
    asserts it is advertised in WMS GetCapabilities; then, with
    RESOURCE_PUBLISHING enabled and ``is_published=False``, asserts it is
    absent until explicitly published.
    """
    thefile = os.path.join(
        gisdata.VECTOR_DATA,
        'san_andres_y_providencia_highway.shp')
    layer = file_upload(thefile, overwrite=True)
    layer.set_default_permissions()
    check_layer(layer)

    # we need some time to have the service up and running
    time.sleep(20)

    try:
        # request getCapabilities: layer must be there as it is published and
        # advertised: we need to check if in response there is
        # <Name>geonode:san_andres_y_providencia_water</Name>
        geoserver_base_url = settings.OGC_SERVER['default']['LOCATION']
        get_capabilities_url = 'ows?' \
            'service=wms&version=1.3.0&request=GetCapabilities'
        url = urljoin(geoserver_base_url, get_capabilities_url)
        str_to_check = '<Name>geonode:san_andres_y_providencia_highway</Name>'
        request = Request(url)
        response = urlopen(request)

        # by default the uploaded layer is published
        # BUG FIX: assertTrue's second argument is the failure *message*,
        # not an expected value -- assertTrue(x, True) never compared
        # anything against True.
        self.assertTrue(layer.is_published)
        self.assertTrue(
            any(str_to_check in ensure_string(s)
                for s in response.readlines()))
    finally:
        # Clean up and completely delete the layer
        layer.delete()

    # with settings disabled
    with self.settings(RESOURCE_PUBLISHING=True):
        layer = file_upload(thefile,
                            overwrite=True,
                            is_approved=False,
                            is_published=False)
        layer.set_default_permissions()
        check_layer(layer)

        # we need some time to have the service up and running
        time.sleep(20)

        try:
            # by default the uploaded layer must be unpublished
            self.assertEqual(layer.is_published, False)

            # check the layer is not in GetCapabilities
            request = Request(url)
            response = urlopen(request)
            # BUG FIX: the response was fetched but never inspected, so the
            # "not advertised" case was silently untested.
            self.assertFalse(
                any(str_to_check in ensure_string(s)
                    for s in response.readlines()))

            # now test with published layer
            layer = Layer.objects.get(pk=layer.pk)
            layer.is_published = True
            layer.save()

            # we need some time to have the service up and running
            time.sleep(20)

            request = Request(url)
            response = urlopen(request)
            self.assertTrue(
                any(str_to_check in ensure_string(s)
                    for s in response.readlines()))
        finally:
            # Clean up and completely delete the layer
            layer.delete()
def job_url(self, test_job):
    """Return the absolute URL for *test_job* on the backend, built from
    the group/project/uid encoded in its job id."""
    result_type, tux_project, tux_uid = self.parse_job_id(test_job.job_id)
    group, user = tux_project.split('@')
    # Endpoint segment is pluralized: "build" -> "builds", etc.
    path = 'groups/%s/projects/%s/%ss/%s' % (
        group, user, result_type.lower(), tux_uid)
    return urljoin(self.data.url, path)
def files(self):
    """Return File objects for every "Click here to download" link on the
    page, renaming each so it carries this item's name but keeps its own
    extension."""
    pattern = re.compile(r'^\s*Click here to download')
    anchors = [paragraph.find('a')
               for paragraph in self._soup('p', string=pattern)]
    result = []
    for anchor in anchors:
        item = File(urljoin(self.url, anchor['href']))
        extension = os.path.splitext(item.filename)[1]
        item.filename = strictSplitext(self.name)[0] + extension
        result.append(item)
    return result
def receive(cls, xml, c_from, u_to):
    """Handle an incoming Diaspora "reshare" message.

    Resolves the reshared post by its root GUID -- importing the original
    author's public posts, then falling back to fetching the post directly
    from the origin server -- and creates a local share post authored by
    ``c_from``.

    :param xml: parsed XML payload of the reshare message
    :param c_from: contact the reshare came from
    :param u_to: local recipient user, or None for a public reshare
    :raises TryLater: when the reshared post cannot be resolved yet
    """
    data = cls.as_dict(xml)
    shared = DiasporaPost.get_by_guid(data['root_guid'])
    if not shared:  # Try to pull it from the Atom feed
        author = DiasporaContact.get_by_username(
            data['root_diaspora_id'], True, True)
        if not author:
            raise TryLater()
        author.import_public_posts()
        shared = DiasporaPost.get_by_guid(data['root_guid'])
        if not shared:
            # Fall back to poking the origin server
            post_url = urljoin(
                author.server, "/p/{0}.xml".format(data['root_guid']))
            resp = urlopen(post_url, timeout=10)
            current_app.logger.debug(
                'Injecting downloaded message into processing loop')
            process_incoming_message(resp.read(), author.contact, None)
            shared = DiasporaPost.get_by_guid(data['root_guid'])
        if not shared:  # Failed
            current_app.logger.warning(
                'Could not find post being reshared (with GUID {0})'.format(
                    data['root_guid']))
            raise TryLater()
    shared = shared.post
    created = datetime.strptime(data['created_at'], '%Y-%m-%d %H:%M:%S %Z')
    post = Post(author=c_from, created_at=created)
    share_part = MimePart(
        type='application/x-pyaspora-share',
        body=dumps({
            'post': {'id': shared.id},
            'author': {
                'id': shared.author_id,
                'name': shared.author.realname,
            }
        }).encode('utf-8'),
        text_preview=u"shared {0}'s post".format(shared.author.realname))
    post.add_part(share_part, order=0, inline=True)
    order = 0
    # Copy the reshared post's parts, skipping nested share markers.
    for part in shared.parts:
        if part.mime_part.type != 'application/x-pyaspora-share':
            order += 1
            post.add_part(part.mime_part, inline=part.inline, order=order)
    if not post.tags:
        post.tags = shared.tags
    if u_to:
        post.share_with([c_from])
        if u_to.contact.subscribed_to(c_from):
            # BUG FIX: was "p.share_with(...)" -- "p" is undefined here and
            # raised NameError whenever the recipient was subscribed to the
            # sender.
            post.share_with([u_to.contact])
    else:
        post.share_with([c_from], show_on_wall=True)
    post.thread_modified()

    post.diasp = DiasporaPost(
        guid=data['guid'],
        type='limited' if u_to else 'public')
    db.session.add(post)
    db.session.commit()
def songs(self):
    """Return a Song for each data row's link in the content table;
    header rows (those containing <th>) are skipped."""
    table = self._contentSoup.find('table')
    result = []
    for row in table('tr'):
        if row.find('th'):
            continue
        href = row.find('a')['href']
        result.append(Song(urljoin(self.url, href)))
    return result
delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow([ "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Notes" ]) for p in page_lines: raw_text = unspace(p.get_text()) pdf = None title = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) title = get_title(p) #print(pdf) else: print(p) continue presentation = None authors = has_italic(p) try: authors = remove_parenthesised(authors) except: print(authors) continue authors = namify(authors)
def media_url(self):
    """Return this object's relative url resolved against the configured
    MEDIA_URL base."""
    base = settings.MEDIA_URL
    return urljoin(base, self.url)
def images(self):
    """Return File objects for every link inside the page's second
    paragraph."""
    second_paragraph = self._contentSoup('p')[1]
    return [
        File(urljoin(self.url, anchor['href']))
        for anchor in second_paragraph('a')
    ]
def as_uri(self, scheme='file'): return urlparse.urljoin( str(scheme) + ':', urllib.pathname2url(str(self)))
def __init__(self, soundtrackId):
    """Store the soundtrack id and derive its album page URL from it."""
    self.id = soundtrackId
    album_path = 'game-soundtracks/album/' + soundtrackId
    self.url = urljoin(BASE_URL, album_path)
def loadfeed(category="faculty", subtype=None):
    """Scrape the CS department directory page for *category* and return a
    list of person records (dicts), or None for an invalid category/subtype.

    Each record may carry: full_name, title, degree, photo_link, email,
    net_id, phone, address.  Returns None silently when the HTTP request
    fails.
    """
    # Validate the category and subtype
    category = category.lower().strip()
    if not category in CATEGORIES:
        return None
    if category == "faculty" and subtype != None:
        subtype = subtype.lower().strip()
        if not subtype in SUBTYPES:
            return None

    # Make the request and retrieve DOM with BeautifulSoup4
    req = requests.get(urljoin(CS_BASE_ENDPOINT, category))
    if not req.ok:
        return
    try:
        dom = bs4.BeautifulSoup(req.content, features="html5lib")
    except bs4.FeatureNotFound:
        # Fall back to the stdlib parser when html5lib is not installed.
        dom = bs4.BeautifulSoup(req.content, features="html.parser")
    people_divs = dom.find_all(name="div", attrs={"class": "person"})
    people = []

    # Process each record individual and parse out data
    for p in people_divs:
        record = {}

        # Extract full name (prefer the anchor text when present).
        fullname_tag = p.find("h2", attrs={"class": "person-name"})
        if fullname_tag.find("a"):
            record["full_name"] = fullname_tag.find("a").text.strip()
        else:
            record["full_name"] = fullname_tag.text.strip()
        if fullname_tag.find("small"):
            # Strip any trailing <small> annotation from the name.
            maintext = fullname_tag.text
            badtext = fullname_tag.find("small").text
            record["full_name"] = maintext.replace(badtext, "").strip()

        # Process other standard directory information
        record["title"] = p.find("div", attrs={
            "class": "person-title"
        }).text.strip()
        degree_tag = p.find("div", attrs={"class": "person-degree"})
        if degree_tag:
            record["degree"] = degree_tag.text.strip()

        # Process image (resolve relative src against the site base).
        photo_imgtag = p.find("div", attrs={
            "class": "person-photo"
        }).find("img")
        if photo_imgtag and photo_imgtag.has_attr("src"):
            photo_rel_url = photo_imgtag.get("src", None)
            if photo_rel_url:
                record["photo_link"] = urljoin(CS_BASE_ENDPOINT,
                                               photo_rel_url)

        # Process email / CS NetID
        raw_address_items = p.find_all("span",
                                       attrs={"class": "person-address-item"})
        for item in raw_address_items:
            if item == None or not item.find("span", "glyphicon"):
                continue
            # The glyphicon class name encodes the kind of contact item.
            item_type = list(
                filter(lambda s: "glyphicon-" in s,
                       item.find("span", "glyphicon").get("class", [])))
            text = item.text.strip()

            # Email
            if "glyphicon-envelope" in item_type:
                # Bad hack for bad HTML parsers
                if "&commat" in text:
                    text = text.replace("&commat", "@")
                username = re.sub(r'\W+', '', text.split("@")[0])
                domain = text.split("@")[1].strip().strip(')')
                record["email"] = "{}@{}".format(username, domain)
                record["net_id"] = username

            # Phone
            if "glyphicon-earphone" in item_type:
                record["phone"] = text

            # Address
            if "glyphicon-briefcase" in item_type:
                record["address"] = text

        # Add to processed records
        people.append(record)
    return people
output_folder = "output" # Open and parse page request = requests.get(url_padron) page = BeautifulSoup(request.text, 'lxml') # Find all PDF files links main_content = page.find(class_='tab-content') links = main_content.find_all('a', href=re.compile("pdf")) full_links = [] for link in links: full_link = urljoin(url_padron, link['href']) full_links.append(full_link) print("Found " + str(full_links.count) + " links") # Download each file in the output directory cwd = os.getcwd() output_path = cwd + '/' + output_folder Path(output_path).mkdir(exist_ok=True) for full_link in full_links: print("Downloading " + full_link) local_filename = full_link.split('/')[-1] with requests.get(full_link, stream=True) as r:
def save(self, *args, **kwargs):
    """Persist the model, deriving its canonical URL from its own id.

    On first save the row is written once so the database assigns an
    auto-generated ``id``; ``url`` is then built from that id and the row
    is saved a second time.

    NOTE(review): the original source was whitespace-mangled; this assumes
    the url assignment and second save live inside the ``if not self.id``
    branch (the common first-save idiom) -- confirm against history.
    """
    if not self.id:
        models.Model.save(self, *args, **kwargs)
        self.url = up.urljoin(UM.ARTICLE_INSIDE, '?id={id}'.format(id=self.id))
        models.Model.save(self, *args, **kwargs)
def is_safe_url(target):
    """Return True when *target* resolves to an http(s) URL on this host.

    Used to vet redirect targets against open-redirect abuse.
    """
    host_url = request.host_url
    reference = urlparse(host_url)
    candidate = urlparse(urljoin(host_url, target))
    same_host = reference.netloc == candidate.netloc
    return candidate.scheme in ('http', 'https') and same_host
def __init__(self, base_url, login_url=None, useragent=None, debug=False, insecure=False, openid_insecure=False, username=None, session_id=None, session_name='session', openid_session_id=None, openid_session_name='FAS_OPENID', cache_session=True, retries=None, timeout=None): """Client for interacting with web services relying on fas_openid auth. :arg base_url: Base of every URL used to contact the server :kwarg login_url: The url to the login endpoint of the application. If none are specified, it uses the default `/login`. :kwarg useragent: Useragent string to use. If not given, default to "Fedora OpenIdBaseClient/VERSION" :kwarg debug: If True, log debug information :kwarg insecure: If True, do not check server certificates against their CA's. This means that man-in-the-middle attacks are possible against the `BaseClient`. You might turn this option on for testing against a local version of a server with a self-signed certificate but it should be off in production. :kwarg openid_insecure: If True, do not check the openid server certificates against their CA's. This means that man-in-the- middle attacks are possible against the `BaseClient`. You might turn this option on for testing against a local version of a server with a self-signed certificate but it should be off in production. :kwarg username: Username for establishing authenticated connections :kwarg session_id: id of the user's session :kwarg session_name: name of the cookie to use with session handling :kwarg openid_session_id: id of the user's openid session :kwarg openid_session_name: name of the cookie to use with openid session handling :kwarg cache_session: If set to true, cache the user's session data on the filesystem between runs :kwarg retries: if we get an unknown or possibly transient error from the server, retry this many times. Setting this to a negative number makes it try forever. Defaults to zero, no retries. :kwarg timeout: A float describing the timeout of the connection. 
The timeout only affects the connection process itself, not the downloading of the response body. Defaults to 120 seconds. """ # These are also needed by OpenIdProxyClient self.useragent = useragent or 'Fedora BaseClient/%(version)s' % { 'version': __version__} self.base_url = base_url self.login_url = login_url or urljoin(self.base_url, '/login') self.debug = debug self.insecure = insecure self.openid_insecure = openid_insecure self.retries = retries self.timeout = timeout self.session_name = session_name self.openid_session_name = openid_session_name # These are specific to OpenIdBaseClient self.username = username self.cache_session = cache_session # Make sure the database for storing the session cookies exists if cache_session: self._db = self._initialize_session_cache() if not self._db: self.cache_session = False # Session cookie that identifies this user to the application self._session_id_map = defaultdict(str) if session_id: self.session_id = session_id if openid_session_id: self.openid_session_id = openid_session_id # python-requests session. Holds onto cookies self._session = requests.session()
def _get_content_hls_url(self, content_id):
    """Resolve the HLS stream URL for *content_id* via the content API."""
    api_url = urljoin(self.url, self.content_api.format(id=content_id))
    response = self.session.http.get(api_url)
    info = self.session.http.json(response, schema=self.content_api_schema)
    # Prefer the explicit service URL, falling back to the default one.
    service_url = info["serviceUrl"] or info["defaultServiceUrl"]
    return urljoin(service_url, info["securePath"])
continue cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) ) cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url ) ) conn.commit() # Retrieve all of the anchor tags tags = soup('a') count = 0 for tag in tags: href = tag.get('href', None) if ( href is None ) : continue # Resolve relative references like href="/contact" up = urlparse(href) if ( len(up.scheme) < 1 ) : href = urljoin(url, href) ipos = href.find('#') if ( ipos > 1 ) : href = href[:ipos] if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue if ( href.endswith('/') ) : href = href[:-1] # print href if ( len(href) < 1 ) : continue # Check if the URL is in any of the webs found = False for web in webs: if ( href.startswith(web) ) : found = True break if not found : continue
def parse_page(self, response):
    """Yield a Request for every news-item link on the listing page."""
    hrefs = response.xpath('//div[@class="news-item__top"]/a/@href').extract()
    for href in hrefs:
        absolute = urljoin(response.url, href)
        yield Request(url=absolute, callback=self.parse_news)
def get_recipe_detail(recipe_url):
    """Fetch and parse a recipe detail page.

    :param recipe_url: str recipe URL, e.g.
        https://www.xinshipu.com/zuofa/598775;
        https://www.xinshipu.com//zuofa/749342
    :return: dict with name, url, img, intro, material, method, classify
    """
    response = requests.get(recipe_url, headers=get_header())
    html = BeautifulSoup(response.text, 'lxml')
    # Recipe name.
    name = html.find("div", {"class": "re-up"}).h1.text
    # Main image; may be relative, so resolve against the site root.
    img = html.find("div", {"class": "gallery"}).a['href']
    img = urljoin(HOME_URL, img)
    all_info = html.find_all("div", {"class": "dd"})
    # With four info blocks the first is an introduction; otherwise the
    # page has no intro and the section indices shift down by one.
    if len(all_info) == 4:
        intro = re.sub('\n|\t|\r| ', '', all_info[0].text)
        material_i = 1
        method_i = 2
    else:
        intro = None
        material_i = 0
        method_i = 1
    # Ingredients.
    material = all_info[material_i].text.strip()
    material = re.sub('\r\n|\r\n \n|\n\n\n', '\n', material)
    # Preparation steps: prefer the illustrated step list when present.
    # BUG FIX: the fallback used a bare "except:", which also swallowed
    # SystemExit/KeyboardInterrupt and hid unrelated parsing errors.  The
    # intended trigger is the AttributeError raised when find() returns
    # None (no illustrated step list on the page).
    try:
        method_steps = html.find("ol", {
            "class": "re-step-wpic"
        }).find_all('li')
        method = []
        for i, m in enumerate(method_steps, 1):
            step = dict(step_num=i)
            step['text'] = m.text.strip()
            if m.img:
                step['img_url'] = urljoin(HOME_URL, m.img['src'])
            method.append(step)
    except AttributeError:
        # Plain-text fallback for recipes without step images.
        method = all_info[method_i].text.strip()
        method = re.sub('\r\n|\r\n \n|\n\n\n\n', '\n', method)
    # Related categories.
    classify = all_info[-1].text.strip()
    if '\xa0\xa0' in classify:
        classify = classify.replace('\xa0\xa0', ' | ')
    else:
        classify = ""
    return {
        "name": name,
        "url": recipe_url,
        "img": img,
        "intro": intro,
        "material": material,
        "method": method,
        "classify": classify
    }
def find_url(self, url): """find link tag in html Args: url: url take address of website finding the link tag """ #arguments url null check if not url: log.info("find_url() Line="+str(inspect.currentframe().f_lineno)+" args: url does not exist") return #arguments url type check if type(url) is not str: log.info("find_url() Line="+str(inspect.currentframe().f_lineno)+" args: url tpye is not string") return try: url = urllib.parse.quote(url, safe=':/&?=') req = Request(url, headers = self.headers) site = urlopen(req, timeout=2) soup = BeautifulSoup(site.read(),'lxml') except Exception as e: log.error("find_url() line="+str(inspect.currentframe().f_lineno)+' Error: '+str(e)) return None for link in soup.findAll('a'): temp_url = str(link.get('href')) allow = self.allowed_url_check(temp_url) if not allow: continue #만약 상대주소로 되어있다면 not_allowed_url_check에 넣은게 들어갈 수 도 있다. # 가령 /policy/privacy.html 같은 경우 절대주소로 바꾸면 www.naver.com/policy/privacy.html 로 된다. # 따라서 걸러지지 않는데 이는 처음 시작 주소를 잘선택하면 문제없다. disallow = self.not_allowed_url_check(temp_url) if not disallow: continue #request q보내려는 url가 실행파일, 집파일, rmp, deb, gz인 경우 건너뛴다. if 'Content-Type' not in site.headers: continue if 'text/html' not in site.headers['Content-Type']: continue #if re.search('(exe)$|(zip)$|(rpm)$|(gz)$|(deb)$|(txt)$|(csv)$|(pdf)$|(ppt)$', temp_url): # continue if 'https' in temp_url: #절대주소 pass elif re.match('/.+|\.\..+' , temp_url): #match 함수는 시작부터 일치하는지 검사한다. search는 문자열 내에 존재하면 찾아준다. #상대주소를 절대주소로 url 변경 temp_url = urljoin(url,temp_url) else: #그 외는 None 처리 temp_url=None if temp_url and temp_url not in self.visited: self.visited.setdefault(temp_url,True) self.links.append(temp_url) return soup #deque로 넘겨주어 popleft()로 앞에서부터 뺀다.