def _get_sudo_session(self):
    user, password = self.AUTH
    session = requests.Session()

    # Get initial cookies
    url = urljoin(self.HOST, '/secure/admin/user/AddUser!default.jspa')
    r = session.get(url=url, auth=self.AUTH)
    r.raise_for_status()

    # Login form
    url_login = urljoin(self.HOST, '/login.jsp')
    payload_login = {
        "os_username": user,
        "os_password": password,
        "os_destination": "/secure/admin/user/AddUser!default.jspa",
        "user_role": "ADMIN",
        "atl_token": session.cookies.get('atlassian.xsrf.token'),
        "login": "******",
    }
    r = session.post(url=url_login, data=payload_login)
    r.raise_for_status()

    # Sudo login form
    url_login_sudo = urljoin(self.HOST, '/secure/admin/WebSudoAuthenticate.jspa')
    payload_login_sudo = {
        "webSudoPassword": password,
        "webSudoDestination": "/secure/admin/user/AddUser!default.jspa",
        "webSudoIsPost": "false",
        "atl_token": session.cookies.get('atlassian.xsrf.token'),
    }
    r = session.post(url=url_login_sudo, data=payload_login_sudo)
    r.raise_for_status()

    return session
def put(self, document, doc_id=None, **options):
    if not isinstance(document, dict):
        document = document.__dict__
    document = {key: val for key, val in document.items()
                if not key.startswith('__')}
    if not doc_id:
        doc_id = document.get('_id', document.get('id'))
    if 'id' in document:
        del document['id']
    if '_id' in document:
        del document['_id']
    if '_rev' in document and not document['_rev']:
        del document['_rev']
    if not doc_id:
        doc_id = str(uuid4())
    posturl = urljoin(self._dburl, doc_id)
    response = yield from aiohttp.request(
        'POST', self._dburl,
        data=json_dumps(document),
        headers={
            'Accept': 'application/json',
            'content-type': 'application/json'
        })
    data = yield from response.read()
    data = json_loads(data)
    if 'ok' in data and data['ok'] is True:
        document['_id'] = data['id']
        document['_rev'] = data['rev']
    return Bunch(**data)
def fetch(endpoint, safe=":,", **parameters):
    """
    Fetch JSON data with the given endpoint and parameters.

    Arguments:
        endpoint: The API endpoint as a string.
        safe: A string containing characters exempt from URL encoding.
        parameters: A variable number of keyword arguments containing the URL
            parameters to send with the request.

    Returns:
        The response as a dictionary containing JSON data.

    Raises:
        ValueError: when no API token is set.
    """
    if API_TOKEN is None:
        raise ValueError("no API token found")
    for key, value in parameters.items():
        if isinstance(value, (datetime.datetime, datetime.date)):
            parameters[key] = value.strftime("%Y-%m-%d")
        elif isinstance(value, (list, tuple)):
            parameters[key] = ",".join(map(str, value))
        elif not isinstance(value, str):
            parameters[key] = str(value)
    url = urljoin(BASE_URL, endpoint) + "?" + urlencode(parameters, safe=safe)
    request = Request(url, headers={"token": API_TOKEN})
    response = urlopen(request)
    return json.loads(response.read().decode("utf-8"))
def crawl(self, pages, depth=2):
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = request.urlopen(page)
            except Exception as e:
                print("Could not open %s" % page)
                print("Due to:" + e.__str__())
                continue
            soup = BeautifulSoup(c.readall(), from_encoding="utf-8")
            self.addtoindex(page, soup)
            links = soup("a")
            for link in links:
                if "href" in dict(link.attrs):
                    url = request.urljoin(page, link["href"])
                    if url.find("'") != -1:
                        continue
                    url = url.split("#")[0]  # remove url portion
                    if url[0:4] == "http" and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
            self.dbcommit()
        pages = newpages
def _make_link(self, *, key, link):
    url = urljoin(self.HOST, '/rest/api/2/issueLink')
    link_type = self.LINK_TYPES.get(link['type']['name'])
    if 'outwardIssue' in link:
        inwardIssue = key
        outwardIssue = link['outwardIssue']['key']
    elif 'inwardIssue' in link:
        inwardIssue = link['inwardIssue']['key']
        outwardIssue = key
    payload = {
        "type": {"name": link_type},
        "inwardIssue": {"key": inwardIssue},
        "outwardIssue": {"key": outwardIssue},
    }
    r = requests.post(url=url, json=payload, auth=self.AUTH)
    r.raise_for_status()
def run(self, baseUri='', context={}, session=None):
    """Run the action's request and perform assertion testing."""
    self.request['url'] = urljoin(baseUri, self.request['url'])

    # apply template to request and assertions
    self.request = apply_context(self.request, context)
    self.asserts = apply_context(self.asserts, context)

    # create assertions
    assertion = assertions.Assert(self.asserts)

    # get response
    request = requests.Request(**self.request).prepare()
    http = session or requests.Session()
    self.response = http.send(request)

    # test assertions; this will raise an exception on failure
    try:
        assertion.test(self.response)
    except Exception as error:
        # save error
        self.error = error
        # reraise
        raise

    # save context vars
    if self.vars:
        try:
            data = {
                'json': self.response.json(),
                'headers': self.response.headers
            }
            for name, path in self.vars.items():
                self.vars[name] = json_path(path, data)
        except ValueError:
            # no json in the response; that can be ok
            pass

    return self
def create_versions(self):
    print('Creating Versions')
    url = urljoin(self.HOST, '/rest/api/2/version')
    for version in Version.objects.all():
        print(version)
        data = version.json
        payload = {
            "name": data['name'],
            "archived": data['archived'],
            "released": data['released'],
            "project": self.PKEY,
            "projectId": self.PID
        }
        description = data.get('description')
        if description:
            payload['description'] = description
        userStartDate = data.get('userStartDate')
        if userStartDate:
            payload['userStartDate'] = userStartDate
        userReleaseDate = data.get('userReleaseDate')
        if userReleaseDate:
            payload['userReleaseDate'] = userReleaseDate
        r = requests.post(url=url, json=payload, auth=self.AUTH)
        r.raise_for_status()
def __init__(self, username: str, password: str or None = None):
    """
    :param username: The account username.
    :type username: str
    :param password: Required for quick connection, instead of calling change_password later.
    :type password: str or None
    """
    from pymal.account_objects import account_animes, account_mangas

    self.__username = username
    self.__password = password
    self.connect = global_functions.connect
    self.__user_id = None
    self.__auth_object = None
    self._main_profile_url = request.urljoin(HOST_NAME, "profile/{0:s}".format(self.username))
    self.__animes = account_animes.AccountAnimes(self)
    self.__mangas = account_mangas.AccountMangas(self)
    self.__friends = None
    self.__image_url = ""
    self.__session = global_functions.generate_session()
    self._is_loaded = False

    if password is not None:
        self.change_password(password)
def catcher_remote_image(request):
    """Remote image catcher: when catcherRemoteImageEnable is true and the front end
    inserts an image whose address is not on the same domain as the current site,
    this function downloads the remote image to the local server."""
    if not request.method == "POST":
        return HttpResponse(json.dumps(u"{'state:'ERROR'}"), content_type="application/javascript")

    state = "SUCCESS"
    allow_type = list(request.GET.get("catcherAllowFiles", USettings.UEditorUploadSettings.get("catcherAllowFiles", "")))
    max_size = int(request.GET.get("catcherMaxSize", USettings.UEditorUploadSettings.get("catcherMaxSize", 0)))

    remote_urls = request.POST.getlist("source[]", [])
    catcher_infos = []
    path_format_var = get_path_format_vars()

    for remote_url in remote_urls:
        # Original name of the uploaded file
        remote_file_name = os.path.basename(remote_url)
        remote_original_name, remote_original_ext = os.path.splitext(remote_file_name)
        # Check the file type
        if remote_original_ext in allow_type:
            path_format_var.update({
                "basename": remote_original_name,
                "extname": remote_original_ext[1:],
                "filename": remote_original_name
            })
            # Work out the file name to save under
            o_path_format, o_path, o_file = get_output_path(request, "catcherPathFormat", path_format_var)
            o_filename = os.path.join(o_path, o_file).replace("\\", "/")
            # Read the remote image file
            try:
                remote_image = urllib.urlopen(remote_url)
                # Write the fetched content to the local file
                try:
                    f = open(o_filename, 'wb')
                    f.write(remote_image.read())
                    f.close()
                    state = "SUCCESS"
                except Exception as e:
                    state = u"写入抓取图片文件错误:%s" % e
            except Exception as e:
                state = u"抓取图片错误:%s" % e

            catcher_infos.append({
                "state": state,
                "url": urljoin(USettings.gSettings.MEDIA_URL, o_path_format),
                "size": os.path.getsize(o_filename),
                "title": os.path.basename(o_file),
                "original": remote_file_name,
                "source": remote_url
            })

    return_info = {
        "state": "SUCCESS" if len(catcher_infos) > 0 else "ERROR",
        "list": catcher_infos
    }
    return HttpResponse(json.dumps(return_info, ensure_ascii=False), content_type="application/javascript")
def get(self, doc_id, **options):
    response = yield from aiohttp.request(
        'GET', urljoin(self._dburl, doc_id),
        headers={
            'Accept': 'application/json'
        })
    data = yield from response.read()
    return json_loads(data)
def _get_versions(self):
    versions_endpoint = '/rest/api/2/project/%s/versions' % self.PKEY
    response = requests.get(url=urljoin(self.HOST, versions_endpoint), auth=self.AUTH).json()
    bulk_versions = [Version(name=v['name'], uid=v['id'], link=v['self'], json=v)
                     for v in response]
    Version.objects.all().delete()
    Version.objects.bulk_create(bulk_versions)
    print('Versions loaded')
def all(self, **options):
    response = yield from aiohttp.request(
        'GET', urljoin(self._dburl, '_all_docs'),
        headers={
            'Accept': 'application/json'
        })
    data = yield from response.read()
    return Bunch(**json_loads(data))
def _make_subtask_relation(self, *, issue):
    uid_dest = str(issue.uid_dest)
    session = requests.Session()

    url0 = urljoin(self.HOST, '/secure/ConvertIssueSetIssueType.jspa?id=' + uid_dest)
    r = session.get(url=url0, auth=self.AUTH)
    soup = BeautifulSoup(r.text)
    guid = soup.find_all("input", type="hidden", id="guid")[0]['value']

    # Step 1: Select Parent and Sub-task Type
    url_s1 = urljoin(self.HOST, '/secure/ConvertIssueSetIssueType.jspa')
    payload_s1 = {
        "parentIssueKey": issue.json['fields']['parent']['key'],
        "issuetype": "10000",
        "id": uid_dest,
        "guid": guid,
        "Next >>": "Next >>",
        "atl_token": session.cookies.get('atlassian.xsrf.token'),
    }
    r = session.post(url=url_s1, data=payload_s1, headers={"Referer": url0})
    r.raise_for_status()

    # Step 2: Update Fields
    url_s2 = urljoin(self.HOST, '/secure/ConvertIssueUpdateFields.jspa')
    payload_s2 = {
        "id": uid_dest,
        "guid": guid,
        "Next >>": "Next >>",
        "atl_token": session.cookies.get('atlassian.xsrf.token'),
    }
    r = session.post(url=url_s2, data=payload_s2)
    r.raise_for_status()

    # Step 3: Confirm the conversion with all of the details just configured
    url_s3 = urljoin(self.HOST, '/secure/ConvertIssueConvert.jspa')
    payload_s3 = {
        "id": uid_dest,
        "guid": guid,
        "Finish": "Finish",
        "atl_token": session.cookies.get('atlassian.xsrf.token'),
    }
    r = session.post(url=url_s3, data=payload_s3)
    r.raise_for_status()
def start(self):
    div_tag = self.soup.find('div', {'class': 'box3'})
    a_tags = div_tag.findAll('a')
    href_list = map(lambda x: urljoin(self.menu_url, x['href']), a_tags)
    pool = Pool(8)
    pool.map(self.download_mp3, href_list)
    pool.close()
def __getLinks(self, html, url):
    """ Extract links from an HTML response. """
    soup = BeautifulSoup(html.read(), "html.parser")
    myset = set()
    for link in soup.find_all("a"):
        # Normalize the link
        myset.add(urljoin(url, link.get("href")).strip("/").split("#")[0])
    return myset
def action(self, url, method='GET', headers=None, body=None, json=None):
    # create Request
    request = requests.Request(method, urljoin(self.baseUri, url), headers=headers)
    if json:
        request.json = json
    elif body:
        request.data = body
    prepared = request.prepare()
    return self.session.send(prepared)
def _get_issues_details(self):
    for issue in Issue.objects.all():
        print('Processing:', issue)
        response = requests.get(
            url=urljoin(self.HOST, Issue.API.format(uid=issue.uid)),
            auth=self.AUTH,
        ).json()
        issue.json = response
        issue.save()
def get_design_doc(self, ddoc_name, **options):
    posturl = urljoin(self._dburl, "_design/%s/" % ddoc_name)
    response = yield from aiohttp.request(
        'GET', posturl,
        headers={
            'Accept': 'application/json',
        })
    data = yield from response.read()
    return Bunch(**json_loads(data))
def _do_transition(self, *, issue):
    url = urljoin(self.HOST, '/rest/api/2/issue/{key}/transitions'.format(key=issue.key))
    status = issue.json['fields']['status']['name']
    payload = {
        'transition': {'id': self.STATUSES[status]},
    }
    r = requests.post(url=url, json=payload, auth=self.AUTH)
    r.raise_for_status()
def delete_db(self, dbname=None, **options):
    if not dbname:
        dbname = self._dbname
    response = yield from aiohttp.request(
        'DELETE', urljoin(self._url, dbname),
        headers={
            'Accept': 'application/json'
        })
    data = yield from response.read()
    return Bunch(**json_loads(data))
def put_design_doc(self, ddoc_name, ddoc, **options):
    posturl = urljoin(self._dburl, "_design/%s/" % ddoc_name)
    response = yield from aiohttp.request(
        'PUT', posturl,
        data=json_dumps(ddoc),
        headers={
            'Accept': 'application/json',
            'content-type': 'application/json'
        })
    data = yield from response.read()
    return Bunch(**json_loads(data))
def load_files_for_url(url, dst, extension=None):
    if not dst.exists():
        print("creating: {}".format(str(dst)))
        dst.mkdir()
    for filename, fileurl in get_files(url, extension):
        full_url = request.urljoin(EXUA_URL, fileurl)
        filepath = dst.joinpath(filename)
        print("downloading: {} -> {}".format(full_url, filepath))
        retrieve_file(full_url, filepath)
def read_index(language):
    index = {}
    base_url = 'https://devdocs.io/'
    request = Request(base_url + 'docs/' + language + '/index.json')
    with urlopen(request) as response:
        index = json.loads(response.read().decode('utf-8'))
    for entry in index['entries']:
        path = '/'.join([language, entry['path']])
        entry['path'] = urljoin(base_url, path)
    return index
def info(self, doc_id=None):
    url = self._dburl
    if doc_id:
        url = urljoin(url, doc_id)
    response = yield from aiohttp.request(
        'GET', url,
        headers={
            'Accept': 'application/json'
        })
    data = yield from response.read()
    return Bunch(**json_loads(data))
def load_gabi_set(index, cname="csc-default", version='', savedir='.'):
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    # grab links to database browser pages
    base, links = grab_db_browser_links(index, cname=cname)
    for link in links:
        print('Attempting to load %s...\n' % link)
        load_gabi_collection(urljoin(base, link), version=version, savedir=savedir)
def plan_page(self, response):
    soup = BeautifulSoup(response.text)
    next_link = ''
    t = soup('a')
    for i in t:
        if i.get_text() == '上一页':  # '上一页' == "previous page"
            link = urljoin(response.url, i['href'])
            next_link = link
            self.crawl(link, callback=self.plan_page, age=1, save=response.save)
            break
    lists = soup.find('table', {'width': '100%'}).find_all('a', href=True)
    print(len(lists))
    for i in lists:
        if i.get_text() != '下一页':  # '下一页' == "next page"
            link = urljoin(response.url, i['href'])
            if link != next_link:
                print(link)
                self.crawl(link, callback=self.content_page, save=response.save, fetch_type='js')
def main_page(self):
    raw = self.bs4markup(self.do_get(M['main']))
    npc = raw.find('div', id='NanoPCSeries')
    _table = npc.next_sibling.next_sibling
    dat = []
    for li_ in _table.find_all('li'):
        dat.append({
            'name': li_.text,
            'url': urljoin(M['index'], li_.a.get('href') + '/zh')
        })
    return dat
def page_data(self, response):
    html = response.text
    href_s = re.findall(r'<span class="post"><a href="(.*?)"', html)
    for href in href_s:
        yield scrapy.Request(href, callback=self.page_info)
    try:
        # '下一页' == "next page"
        next_href = re.findall(r'<a href="(.*?)" rel=\'nofollow\'>下一页', html)[0]
        next_href1 = request.urljoin(response.url, next_href)
        yield scrapy.Request(next_href1, callback=self.page_data)
    except Exception as e:
        print(e)
def get_all_website_links(self, url: str = None) -> None:
    """
    Collects all URLs found on `url` that belong to the same website.
    """
    url = url or self.url
    logger.debug(f"get_all_website_links: {url}")
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            self.invalid_links.add((url, href))
            continue
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        href = (href
                if (parsed_href.netloc.endswith("youtube.com")
                    or parsed_href.netloc.endswith("youtu.be"))
                else parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path)
        if not self.is_valid(href):
            if href not in self.urls_to_ignore:
                self.invalid_links.add((url, href))
            continue
        if (
                # href in self.internal_urls
                # or href in self.external_urls
                href in self.urls_to_ignore):
            continue
        if domain_name not in href and self.url not in href:
            if href not in self.external_urls:
                self.external_urls.add(href)
                if not self.check_external_url(href):
                    logger.debug(f"INVALID {(url, href)}")
                    logger.debug("\tretrying")
                    for _ in range(self.RETRY_NR):
                        sleep(1)
                        if self.check_external_url(href):
                            logger.debug("\tseems to work !!!")
                            continue
                        self.invalid_links.add((url, href))
        else:
            if href not in self.internal_urls:
                self.internal_urls.add(href)
                if not self.check_internal_url(href):
                    self.invalid_links.add((url, href))
                    logger.debug(f"INVALID {(url, href)}")
                try:
                    self.get_all_website_links(href)
                except:
                    self.invalid_links.add((url, href))
                    logger.debug(f"INVALID {(url, href)}")
def get_links(text, url):
    time.sleep(1)
    content = {}
    linkList = []
    soup = BeautifulSoup(text, "html.parser")
    links = soup.findAll("a", href=True)
    for link in links:
        href = link['href']
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if href.endswith("/"):
            href = href[:-1]
        if "https" not in href:
            href = href.replace("http", "https")
        if href not in urls_list:
            if "uic.edu" in href and "@" not in href and is_valid(href):
                urls_queue.put(href)
                urls_list.append(href)
                linkList.append(href)
        else:
            linkList.append(href)

    # remove JS & CSS content
    for script in soup(["script", "style"]):
        script.extract()

    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    doc = preprocess(text)

    if url.endswith("/"):
        url = url[:-1]
    if "https" not in url:
        url = url.replace("http", "https")

    content["page"] = url
    content["text"] = doc
    content["link"] = linkList
    pagesList.append(content)
    return
def tobii(company_name, companies_details):
    career_page_url = companies_details[company_name]['career_page_url']
    sector = companies_details[company_name]['sector']
    print(company_name)
    df = pd.DataFrame(columns=fields_needed)
    try:
        html = requests.get(career_page_url).text
        soup = BS(html, 'lxml')
        for item in soup.findAll("div", {"class": "job-listing-container"}):
            for ul in item.findAll('ul'):
                for li in ul.findAll('li'):
                    for a in li.findAll('a'):
                        job_specific_url = urljoin(career_page_url, a.get('href'))
                    for j in li.findAll("span", {"class": "title"}):
                        job_title = j.text.strip()
                    for t in li.findAll("span", {"class": "meta"}):
                        ty = t.text.strip()
                        ty_split = ty.split('-', 1)
                        job_type = ty_split[0]
                        job_location = ty_split[1]
                    job_description = np.nan
                    years_of_experience = np.nan
                    job_department = np.nan
                    df = df.append(pd.Series(data=[
                        company_name, job_title, job_description, job_location,
                        job_type, years_of_experience, job_department,
                        job_specific_url, career_page_url, sector
                    ], index=fields_needed), ignore_index=True)
    except Exception as error:
        print(error)
        print("<<<<<<<<<<<<<<<<<<<<< This company got an issue %s >>>>>>>>>>>>>>>>>>>>>>>" % career_page_url)
    return df
def fetch_random_video(videos_list):
    video_name = random.choice(videos_list)
    cache_path = os.path.join(CACHE_DIR, video_name)
    if not os.path.exists(cache_path):
        url = request.urljoin(UCF_ROOT, video_name)
        response = request.urlopen(url, context=UNVERIFIED_CONTEXT).read()
        with open(cache_path, 'wb') as f:
            f.write(response)
    return cache_path
def delete(self, doc_id, rev=None, **options):
    if rev is None:
        rev = yield from self.info(doc_id)
        rev = rev._rev
    url = '%s?rev=%s' % (urljoin(self._dburl, doc_id), quote(rev))
    response = yield from aiohttp.request(
        'DELETE', url,
        headers={
            'Accept': 'application/json'
        })
    data = yield from response.read()
    return Bunch(**json_loads(data))
async def mailboxes(self):
    headers = {'Authorization': 'bearer {}'.format(self.token_renew.token)}
    url = urljoin(self.host, self.endpoints['mailboxes'])
    try:
        async with aiohttp.ClientSession(raise_for_status=True) as session:
            async with session.get(url, headers=headers) as resp:
                body = await resp.json()
                return body.get('_embedded', {}).get('mailboxes', [])
    except aiohttp.ClientConnectionError as e:
        msg = 'Error making request %s. Reason: %s'
        logger.error(msg, url, str(e))
def parse(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('h1').get_text().strip()
    # title = soup.find('title')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    page_urls = set([urljoin(base_url, url['href']) for url in urls])  # remove duplication
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, url, page_urls
def parse(html): """ 获取网站中网页的title, 页面url""" soup = BeautifulSoup(html, 'lxml') # 页面中的链接是/开头和结尾/,因此这里使用了RE进行提取 urls = soup.find_all('a', {"href": re.compile('^/.+?/$')}) # 获取页面标题,去除空格 title = soup.find('h1').get_text().strip() # 获取页面的url url = soup.find('meta', {'property': "og:url"})['content'] # 构建一个集合set,保存该页面内的url, 好处是可以去重, 使用urljoin函数拼接url page_urls = set([urljoin(base_url, url['href']) for url in urls]) return title, page_urls, url
def normalize_url(url):
    resource = re.search(r'/[^/]+$', url)
    end = resource.group()
    address = url[0:resource.start()]
    address = address.casefold()
    url = address + end
    url = url.strip()
    url = url.split('www.')
    url = url[-1].split('//')
    url = urljoin('https://', ('//' + url[-1]))
    return url
def test__renew_token(self):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    token = TokenRenew(urljoin(settings.HELPSCOUT_HOST, '/v2/oauth2/token'))
    loop.run_until_complete(token.renew_token(token))
    self.assertEqual(len(self.cassette), 1)
    self.assertEqual(self.cassette.responses[0]['status']['code'], 200)
    self.assertEqual(self.cassette.requests[0].uri, 'https://api.helpscout.net/v2/oauth2/token')
def useful_links(url):
    parsed_uri = urlparse(url)
    base_url = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
    try:
        page = urlopen(url)
    except:
        return []
    query = '.main-content a[href]'
    soup = BeautifulSoup(page, 'html.parser')
    findings = soup.select(query)
    links = {url_no_params(urljoin(base_url, anchor['href'])) for anchor in findings}
    return {link for link in links if link.startswith(root_domain)}
def parse(self, response):
    privance = response.xpath(
        '//div[@class="topcity"]/dl[@id="clist"]//dd//a/@href').extract()
    for city in privance:
        if 'http://g.58.com/' in city:
            continue
        for i in range(1, 71):
            addVal = "job/pn{}".format(str(i))
            fullUrl = request.urljoin(city, addVal)
            yield scrapy.Request(fullUrl, callback=self.parseList, meta={'baseurl': city})
def get_all_website_links(url: str) -> list:
    """
    Gets all urls that are found on `url` webpage hosted on medium.com.

    :param url: The url for the webpage
    :return: List of urls for the 'url' parameter
    """
    # All urls of `url` parameter
    # Using a set for unique elements
    urls = set()
    # Domain name of the url without the protocol (https)
    domain_name = urlparse(url).netloc
    # HTTP GET request to the url
    result = requests.get(url)
    # BeautifulSoup gets the html of the url's webpage
    soup = BeautifulSoup(result.content, "html.parser")
    # Begin looking for all links ie. <a href=""> on the current webpage
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag, links nowhere
            continue
        # Join the url if it's relative (not absolute link)
        href = urljoin(url, href)
        # urlparse() returns a ParseResult object with [scheme, netloc, path, params, query]
        parsed_href = urlparse(href)
        # Combines url pieces to get ready to insert into final list of all the urls
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # Not a valid URL
            continue
        if href in internal_urls:
            # Already in the set
            # We only want unique url links
            continue
        if domain_name not in href:
            # External link; links to domain outside of medium.com
            continue
        urls.add(href)
        internal_urls.add(href)
    return urls
def read_index(lang='css'):
    index = {}
    base_url = 'https://devdocs.io/'
    request = Request(base_url + 'docs/' + lang + '/index.json')
    with urlopen(request) as response:
        index = json.loads(response.read().decode('utf-8'))
    # convert each path into an absolute url
    for entry in index['entries']:
        path = '/'.join([lang, entry['path']])
        entry['path'] = urljoin(base_url, path)
    return index
def rest_upload_file(self, _file, username=GEONODE_USER, password=GEONODE_PASSWD):
    """ function that uploads a file, or a collection of files, to the GeoNode"""
    assert authenticate(username=username, password=password)
    self.assertTrue(self.client.login(username=username, password=password))
    spatial_files = ("dbf_file", "shx_file", "prj_file")
    base, ext = os.path.splitext(_file)
    params = {
        # make public since wms client doesn't do authentication
        'permissions': '{ "users": {"AnonymousUser": ["view_resourcebase"]} , "groups":{}}',
        'time': 'false',
        'charset': 'UTF-8'
    }
    # deal with shapefiles
    if ext.lower() == '.shp':
        for spatial_file in spatial_files:
            ext, _ = spatial_file.split('_')
            file_path = f"{base}.{ext}"
            # sometimes a shapefile is missing an extra file,
            # allow for that
            if os.path.exists(file_path):
                params[spatial_file] = open(file_path, 'rb')

    with open(_file, 'rb') as base_file:
        params['base_file'] = base_file
        for name, value in params.items():
            if isinstance(value, IOBase):
                params[name] = (os.path.basename(value.name), value)

        url = urljoin(f"{reverse('uploads-list')}/", 'upload/')
        logger.error(f" ---- UPLOAD URL: {url}")
        response = self.client.post(url, data=params)

    # Closes the files
    for spatial_file in spatial_files:
        if isinstance(params.get(spatial_file), IOBase):
            params[spatial_file].close()

    try:
        logger.error(
            f" -- response: {response.status_code} / {response.json()}")
        return response, response.json()
    except (ValueError, TypeError):
        logger.exception(
            ValueError(
                f"probably not json, status {response.status_code} / {response.content}"
            ))
        return response, response.content
def getdata(url):
    domain_name = urlparse(url).netloc
    ua = UserAgent()
    headers = {"User-Agent": ua.random}
    while True:
        try:
            session = requests.Session()
            session.headers.update(headers)
            retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503])
            adapter = HTTPAdapter(max_retries=retries)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            res = session.get(url, allow_redirects=True, timeout=15)
            res.raise_for_status()
        except:
            pass
        else:
            internal_urls = []
            external_urls = []
            soup = BeautifulSoup(res.content, "html.parser")
            for a_tag in soup.findAll("a"):
                href = a_tag.attrs.get("href")
                if href == "" or href is None:
                    continue
                if href == "#" or href == "/":
                    continue
                href = urljoin(url, href)
                parsed_href = urlparse(href)
                href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
                if parsed_href.scheme == 'invalid-title':
                    continue
                if not is_valid(href):
                    continue
                if href in internal_urls:
                    continue
                if domain_name not in href:
                    if href not in external_urls:
                        encode_href = unquote(href)
                        external_urls.append(encode_href)
                    continue
                encode_href = unquote(href)
                internal_urls.append(encode_href)
            print('External Urls = ', external_urls, '\n')
            print('Internal Urls = ', internal_urls, '\n')
            break
def get_list(url):
    base_url = "https://so.gushiwen.org/"
    response = requests.get(url)
    sel = Selector(response)
    links = sel.css(".typecont a::attr(href)").extract()
    links = [urljoin(base_url, url) for url in links]
    return links
def scrape(self, url):
    r = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    x = urlopen(r)
    codebase = BeautifulSoup(x, 'html.parser')
    images = codebase.findAll("img")
    imageUrls = []
    for i in images:
        relativeUrl = i.get("src")
        if not relativeUrl:
            relativeUrl = i.get("data-src")
        if relativeUrl:
            if "http" in relativeUrl:
                imageUrls.append(relativeUrl)
            else:
                tempUrl = urljoin(url, relativeUrl)
                imageUrls.append(tempUrl)
    iconLink = codebase.find("link", rel="shortcut icon")
    if not iconLink:
        iconLink = ' '
    else:
        iconLink = urljoin(url, iconLink.get('href'))
    title = codebase.title.string
    if not title:
        domain = urlparse(url)
        title = domain.hostname
    for i in imageUrls:
        keywords, label = self.detect_web_uri(i)
        x = ''
        for j in keywords:
            x = x + j + ", "
        imageDB.objects.create(keywords=x, dateTime=timezone.now(), sourceUrl=url,
                               imageUrl=i, label=label, icon=iconLink, title=title)
def oneorigin(company_name, companies_details):
    # company_name = 'ONEORIGIN'
    # career_page_url = 'https://www.oneorigin.us/careers/'
    print(company_name)
    career_page_url = companies_details[company_name]['career_page_url']
    sector = companies_details[company_name]['sector']
    headers = {'User-Agent': 'Mozilla/5.0'}
    html = requests.get(career_page_url, headers=headers).text
    soup = BS(html, 'lxml')
    divs = soup.find('div', {'class': 'column medium-12 col-md-12'})
    divs_all = divs.find_all('div', {'class': 'job-preview clearfix'})
    df = pd.DataFrame(columns=fields_needed)
    for div in divs_all:
        job_title = div.find('div', {"class": "job-content"}).find('h5').find('span').get_text().strip()
        job_description = div.find('div', {"class": "job_custom_message"}).get_text().strip()
        job_location = np.nan
        job_type = div.find('div', {"class": "job-additional-information"}).find('span').get_text().strip()
        years_of_experience = np.nan
        job_department = np.nan
        job_specific_url = urljoin(
            career_page_url,
            div.find('div', {"class": "job-content"}).find('h5').find('a')['href'])
        df = df.append(pd.Series(data=[
            company_name, job_title, job_description, job_location, job_type,
            years_of_experience, job_department, job_specific_url,
            career_page_url, sector
        ], index=fields_needed), ignore_index=True)
    return df
def get_view_history_link(search_word):
    """Get the link to the article's edit-history page."""
    root_url = 'https://en.wikipedia.org'
    url = root_url + '/wiki/' + search_word
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    relavtiv_link = soup.find('li', {'id': 'ca-history'}).span.a['href']
    return urljoin(root_url, relavtiv_link)
def crawl(thread_name, url, linksToCrawl):
    try:
        link = urljoin(Crawler.base_url, url)
        if (urlparse(link).netloc == 'tutorialedge.net') and (link not in Crawler.crawledLinks):
            request = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
            response = urlopen(request, context=Crawler.myssl)
            Crawler.crawledLinks.add(link)
            print('Url {} Crawled with Status: {} : {} Crawled In Total'.format(
                response.geturl(), response.getcode(), len(Crawler.crawledLinks)))
            soup = BeautifulSoup(response.read(), 'html.parser')
            Crawler.enqueueLinks(soup.find_all('a'), linksToCrawl)
    except URLError as e:
        print('URL {} threw this error when trying to parse: {}'.format(link, e.reason))
        Crawler.errorLinks.add(link)
def parse(body):
    parser = html.parse(body)
    for quote_block in parser.xpath("//div[@class='quote']"):
        quote = quote_block.xpath("./span[@class='text']/text()")
        author = quote_block.xpath("./span[2]/small/text()")
        data.append({'quote': quote, 'author': author})
    next_page = parser.xpath("//li[@class='next']/a/@href")
    if next_page:
        return request.urljoin("http://quotes.toscrape.com", next_page[0])
    else:
        return None
def parse(self, response):
    pagetitle = response.xpath('/html/head/title').extract_first()
    print("content parsing:", pagetitle)
    products = response.xpath('//*[@class="product_top"]')
    for product in products:
        if product is not None:
            page = product.xpath('./a/@href').extract_first()
            detailPage = request.urljoin(response.url, page)
            pictureUrl = product.xpath('./a/img/@src').extract_first()
            yield scrapy.Request(detailPage, callback=self.page_parse, meta={'pictureUrl': pictureUrl})
def delete_design_doc(self, ddoc_name, rev=None, **options):
    if rev is None:
        rev = yield from self.get_design_doc(ddoc_name)
        rev = rev._rev
    posturl = urljoin(self._dburl, "_design/%s/" % ddoc_name)
    url = '%s?rev=%s' % (posturl, quote(rev))
    response = yield from aiohttp.request(
        'DELETE', url,
        headers={
            'Accept': 'application/json'
        })
    data = yield from response.read()
    return Bunch(**json_loads(data))
def scrap_hero_links(page, baselink):
    soup = BeautifulSoup(page, "html.parser")
    heroes = soup.find("div", {"class": "hero-grid"})
    children = heroes.findChildren("a")
    links = []
    for child in children:
        link = urljoin(baselink, child['href'])
        name = child.find("div", {"class": "name"}).text
        l = child['href'].split("/")
        h_id = l.pop()
        hero = {"id": h_id, "name": name, "url": link}
        links.append(hero)
    return links
def get_ip(search_word):
    """Collect the IP addresses behind anonymous edits."""
    url = get_view_history_link(search_word)
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    page_500_link = urljoin(
        'https://en.wikipedia.org',
        soup.find_all('a', {'class': 'mw-numlink'})[-1]['href'])
    soup = BeautifulSoup(requests.get(page_500_link).text, 'lxml')
    ips = soup.find_all('a', {'class': 'mw-anonuserlink'})
    set_ips = set()
    for ip in ips:
        set_ips.add(ip.text)
    return set_ips
def GetUrl(url, home_url='http://www.yhdfa.com/index.php?s=/home/type/index.html'):
    global url_set
    html = requests.get(url).content.decode('utf-8')
    for url in re.findall(r'/index.php\?s=/home/series/series/typeno/\w+.html', html):
        url = urljoin(home_url, url)
        htmls = requests.get(url).content.decode('utf-8')
        for var in re.findall(r'/index.php\?s=/home/detail/detail/xlno/\w+.html', htmls):
            img_list = []
            img_url = {}
            var = urljoin(home_url, var)
            file_name = os.path.splitext(var)[0]
            file_list = file_name.split('/')
            brandname = file_list.pop()
            html = requests.get(var).content.decode('utf-8')
            for index, img in enumerate(re.findall(r'Uploads/Picture/Code/.*?.jpg', html)):
                img = urljoin(home_url, img)
                filename = DownloadImg(img, path)
                img_url[index] = filename
                img_list.append(img_url)
            sql = 'update zhoucheng set img="' + pymysql.escape_string(json.dumps(img_list)) + '" where brand = "' + brandname + '"'
            cursor.execute(sql)
            print(brandname)
def parse_list(self, response):
    doc = PyQuery(response.text)
    data_list = doc('.rank-item .info a')
    for index, data in enumerate(data_list.items()):
        if not data.attr('href'):
            continue
        item = {
            'url': urljoin(self.config['url'], data.attr('href')),
            'title': data.text(),
            'source': self.config_id,
            'real_pos': index
        }
        yield item
def parse(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Extract all links (hrefs that start and end with '/')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    # Extract the title; strip() removes leading/trailing spaces and newlines
    title = soup.find('h1').get_text().strip()
    # set() builds an unordered collection without duplicates;
    # urljoin() assembles a complete absolute url for each link
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    # URL of the current page
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url
def download_query_results(self, query_id: str, api_key: str, filename: str = 'results.csv') -> None:
    """
    Args:
        query_id (str): specific query id
        api_key (str): api authorization key
        filename (str): output file name

    Returns:
        None
    """
    opener = request.build_opener()
    opener.addheaders = [('Authorization', f'Key {api_key}')]
    request.install_opener(opener=opener)
    url = request.urljoin(self.url, f'{query_id}/')
    url = request.urljoin(url, 'results.csv')
    request.urlretrieve(url=url, filename=filename)
def _set_password(self, *, user, password):
    session = self._get_sudo_session()
    url = urljoin(self.HOST, '/secure/admin/user/SetPassword.jspa')
    payload = {
        "inline": "true",
        "decorator": "dialog",
        "password": password,
        "confirm": password,
        "name": user.name,
        "atl_token": session.cookies.get('atlassian.xsrf.token'),
    }
    r = session.post(url=url, data=payload)
    r.raise_for_status()