Example #1
    def _get_sudo_session(self):
        user, password = self.AUTH
        session = requests.Session()

        # Get initial cookies
        url = urljoin(self.HOST, '/secure/admin/user/AddUser!default.jspa')
        r = session.get(url=url, auth=self.AUTH)
        r.raise_for_status()

        # Login form
        url_login = urljoin(self.HOST, '/login.jsp')
        payload_login = {
            "os_username": user,
            "os_password": password,
            "os_destination": "/secure/admin/user/AddUser!default.jspa",
            "user_role": "ADMIN",
            "atl_token": session.cookies.get('atlassian.xsrf.token'),
            "login": "******",
        }
        r = session.post(url=url_login, data=payload_login)
        r.raise_for_status()

        # Sudo login form
        url_login_sudo = urljoin(self.HOST, '/secure/admin/WebSudoAuthenticate.jspa')
        payload_login_sudo = {
            "webSudoPassword": password,
            "webSudoDestination": "/secure/admin/user/AddUser!default.jspa",
            "webSudoIsPost": "false",
            "atl_token": session.cookies.get('atlassian.xsrf.token'),
        }
        r = session.post(url=url_login_sudo, data=payload_login_sudo)
        r.raise_for_status()

        return session
Example #2
 def put(self, document, doc_id=None, **options):
     if not isinstance(document, dict):
         document = document.__dict__
     document = {key: val for key, val in document.items()
                 if not key.startswith('__')}
     if not doc_id:
         doc_id = document.get('_id', document.get('id'))
     if 'id' in document:
         del document['id']
     if '_id' in document:
         del document['_id']
     if '_rev' in document and not document['_rev']:
         del document['_rev']
     if not doc_id:
         doc_id = str(uuid4())
     posturl = urljoin(self._dburl, doc_id)
     response = yield from aiohttp.request(
         'PUT', posturl, data=json_dumps(document),
         headers={
             'Accept': 'application/json',
             'content-type': 'application/json'
         })
     data = yield from response.read()
     data = json_loads(data)
     if 'ok' in data and data['ok'] is True:
         document['_id'] = data['id']
         document['_rev'] = data['rev']
     return Bunch(**data)
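For readers unfamiliar with the style above: `yield from aiohttp.request(...)` is the pre-async/await coroutine form from early aiohttp. A minimal modern sketch of the same document write, assuming a CouchDB-style endpoint; the URL, doc id, and document below are illustrative placeholders, not from the original code:

# Hedged sketch only: an async/await rough equivalent of the PUT above.
# db_url, doc_id and the document are illustrative placeholders.
import asyncio
import json
from urllib.parse import urljoin

import aiohttp


async def put_doc(db_url, doc_id, document):
    url = urljoin(db_url, doc_id)
    headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
    async with aiohttp.ClientSession() as session:
        async with session.put(url, data=json.dumps(document), headers=headers) as resp:
            return await resp.json()

# asyncio.run(put_doc('http://localhost:5984/mydb/', 'doc-1', {'title': 'example'}))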
Example #3
def fetch(endpoint, safe=":,", **parameters):
    """
    Fetch JSON data with the given endpoint and parameters.

    Arguments:
        endpoint: The API endpoint as a string.
        safe: A string containing characters exempt from URL encoding.
        parameters: A variable number of keyword arguments containing the
                    URL parameters to send with the request.

    Returns:
        The response as a dictionary containing JSON data.

    Raises:
        ValueError: when no API token is set.
    """
    if API_TOKEN is None:
        raise ValueError("no API token found")

    for key, value in parameters.items():
        if isinstance(value, (datetime.datetime, datetime.date)):
            parameters[key] = value.strftime("%Y-%m-%d")

        elif isinstance(value, (list, tuple)):
            parameters[key] = ",".join(map(str, value))

        elif not isinstance(value, str):
            parameters[key] = str(value)

    url = urljoin(BASE_URL, endpoint) + "?" + urlencode(parameters, safe=safe)
    request = Request(url, headers={"token": API_TOKEN})

    response = urlopen(request)
    return json.loads(response.read().decode("utf-8"))
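A quick, self-contained sketch of how the parameter serialization in fetch() above plays out; the base URL and endpoint here are made up for illustration:

# Illustrative only: mirrors the conversion rules used by fetch() above.
import datetime
from urllib.parse import urljoin, urlencode

parameters = {"start": datetime.date(2021, 6, 1), "ids": [1, 2, 3], "limit": 10}
for key, value in parameters.items():
    if isinstance(value, (datetime.datetime, datetime.date)):
        parameters[key] = value.strftime("%Y-%m-%d")
    elif isinstance(value, (list, tuple)):
        parameters[key] = ",".join(map(str, value))
    elif not isinstance(value, str):
        parameters[key] = str(value)

url = urljoin("https://api.example.com/", "reports") + "?" + urlencode(parameters, safe=":,")
print(url)  # https://api.example.com/reports?start=2021-06-01&ids=1,2,3&limit=10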
Example #4
    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = request.urlopen(page)
                except Exception as e:
                    print("Could not open %s" % page)
                    print("Due to:"+e.__str__())
                    continue
                soup = BeautifulSoup(c.read(), "html.parser", from_encoding="utf-8")
                self.addtoindex(page,soup)

                links = soup("a")
                for link in links:
                    if "href" in dict(link.attrs):
                        url = request.urljoin(page,link["href"])
                        if url.find("'")!=-1:continue
                        url = url.split("#")[0]  # drop the fragment part of the URL
                        if url[0:4]=="http" and not self.isindexed(url):
                            newpages.add(url)

                        linkText = self.gettextonly(link)
                        self.addlinkref(page,url,linkText)
                self.dbcommit()
            pages = newpages
Example #5
    def _make_link(self, *, key, link):
        url = urljoin(self.HOST, '/rest/api/2/issueLink')

        link_type = self.LINK_TYPES.get(link['type']['name'])
        if 'outwardIssue' in link:
            inwardIssue = key
            outwardIssue = link['outwardIssue']['key']
        elif 'inwardIssue' in link:
            inwardIssue = link['inwardIssue']['key']
            outwardIssue = key

        payload = {
            "type": {
                "name": link_type
            },
            "inwardIssue": {
                "key": inwardIssue
            },
            "outwardIssue": {
                "key": outwardIssue
            },
        }

        r = requests.post(url=url, json=payload, auth=self.AUTH)
        r.raise_for_status()
Example #6
 def run(self, baseUri='', context={}, session=None):
     """Run actions' request, perform assertion testing"""
     self.request['url'] = urljoin(baseUri, self.request['url'])
     # apply template to request and assertions
     self.request = apply_context(self.request, context)
     self.asserts = apply_context(self.asserts, context)
     # create assertions
     assertion = assertions.Assert(self.asserts)
     # get response
     request = requests.Request(**self.request).prepare()
     http = session or requests.Session()
     self.response = http.send(request)
     # test the assertions; raises an exception on failure
     try:
         assertion.test(self.response)
     except Exception as error:
         # save error
         self.error = error
         # reraise
         raise
     # save context vars
     if self.vars:
         try:
             data = {
                 'json': self.response.json(),
                 'headers': self.response.headers
             }
             for name, path in self.vars.items():
                 self.vars[name] = json_path(path, data)
         except ValueError:
             # no JSON in the response, which can be OK
             pass
     return self
Example #7
    def create_versions(self):
        print('Creating Versions')
        url = urljoin(self.HOST, '/rest/api/2/version')

        for version in Version.objects.all():
            print(version)
            data = version.json
            payload = {
                "name": data['name'],
                "archived": data['archived'],
                "released": data['released'],
                "project": self.PKEY,
                "projectId": self.PID
            }
            description = data.get('description')
            if description:
                payload['description'] = description
            userStartDate = data.get('userStartDate')
            if userStartDate:
                payload['userStartDate'] = userStartDate
            userReleaseDate = data.get('userReleaseDate')
            if userReleaseDate:
                payload['userReleaseDate'] = userReleaseDate

            r = requests.post(url=url, json=payload, auth=self.AUTH)
            r.raise_for_status()
Example #8
    def __init__(self, username: str, password: str or None = None):
        """
        :param username: The account username.
        :type username: str
        :param password: Optional; if given, connect immediately instead of calling change_password later.
        :type password: str or None
        """
        from pymal.account_objects import account_animes, account_mangas

        self.__username = username
        self.__password = password
        self.connect = global_functions.connect
        self.__user_id = None
        self.__auth_object = None

        self._main_profile_url = request.urljoin(HOST_NAME, "profile/{0:s}".format(self.username))

        self.__animes = account_animes.AccountAnimes(self)
        self.__mangas = account_mangas.AccountMangas(self)
        self.__friends = None
        self.__image_url = ""

        self.__session = global_functions.generate_session()

        self._is_loaded = False

        if password is not None:
            self.change_password(password)
Example #9
def catcher_remote_image(request):
    """远程抓图,当catchRemoteImageEnable:true时,
        如果前端插入图片地址与当前web不在同一个域,则由本函数从远程下载图片到本地
    """
    if not request.method == "POST":
        return HttpResponse(json.dumps({"state": "ERROR"}), content_type="application/javascript")

    state = "SUCCESS"

    allow_type = list(request.GET.get("catcherAllowFiles", USettings.UEditorUploadSettings.get("catcherAllowFiles", "")))
    max_size = int(request.GET.get("catcherMaxSize", USettings.UEditorUploadSettings.get("catcherMaxSize", 0)))

    remote_urls = request.POST.getlist("source[]", [])
    catcher_infos = []
    path_format_var = get_path_format_vars()

    for remote_url in remote_urls:
        # Get the original name of the remote file
        remote_file_name = os.path.basename(remote_url)
        remote_original_name, remote_original_ext = os.path.splitext(remote_file_name)
        # Check the file extension against the allowed types
        if remote_original_ext in allow_type:
            path_format_var.update({
                "basename": remote_original_name,
                "extname": remote_original_ext[1:],
                "filename": remote_original_name
            })
            # Compute the output file name
            o_path_format, o_path, o_file = get_output_path(request, "catcherPathFormat", path_format_var)
            o_filename = os.path.join(o_path, o_file).replace("\\", "/")
            # Fetch the remote image
            try:
                remote_image = urllib.urlopen(remote_url)
                # Write the fetched content to a local file
                try:
                    f = open(o_filename, 'wb')
                    f.write(remote_image.read())
                    f.close()
                    state = "SUCCESS"
                except Exception as e:
                    state = u"Error writing the fetched image to file: %s" % e
            except Exception as e:
                state = u"Error fetching the image: %s" % e

            catcher_infos.append({
                "state": state,
                "url": urljoin(USettings.gSettings.MEDIA_URL, o_path_format),
                "size": os.path.getsize(o_filename),
                "title": os.path.basename(o_file),
                "original": remote_file_name,
                "source": remote_url
            })

    return_info = {
        "state": "SUCCESS" if len(catcher_infos) > 0 else "ERROR",
        "list": catcher_infos
    }

    return HttpResponse(json.dumps(return_info, ensure_ascii=False), content_type="application/javascript")
Example #10
 def get(self, doc_id, **options):
     response = yield from aiohttp.request(
         'GET', urljoin(self._dburl, doc_id),
         headers={
             'Accept': 'application/json'
         })
     data = yield from response.read()
     return json_loads(data)
Example #11
    def _get_versions(self):
        versions_endpoint = '/rest/api/2/project/%s/versions' % self.PKEY
        response = requests.get(url=urljoin(self.HOST, versions_endpoint), auth=self.AUTH).json()

        bulk_versions = [Version(name=v['name'], uid=v['id'], link=v['self'], json=v) for v in response]
        Version.objects.all().delete()
        Version.objects.bulk_create(bulk_versions)
        print('Versions loaded')
Example #12
 def all(self, **options):
     response = yield from aiohttp.request(
         'GET', urljoin(self._dburl, '_all_docs'),
         headers={
             'Accept': 'application/json'
         })
     data = yield from response.read()
     return Bunch(**json_loads(data))
Example #13
    def _make_subtask_relation(self, *, issue):
        uid_dest = str(issue.uid_dest)
        session = requests.Session()

        url0 = urljoin(self.HOST, '/secure/ConvertIssueSetIssueType.jspa?id=' + uid_dest)
        r = session.get(url=url0, auth=self.AUTH)
        soup = BeautifulSoup(r.text, "html.parser")
        guid = soup.find_all("input", type="hidden", id="guid")[0]['value']

        # Step 1: Select Parent and Sub-task Type
        url_s1 = urljoin(self.HOST, '/secure/ConvertIssueSetIssueType.jspa')
        payload_s1 = {
            "parentIssueKey": issue.json['fields']['parent']['key'],
            "issuetype": "10000",
            "id": uid_dest,
            "guid": guid,
            "Next >>": "Next >>",
            "atl_token": session.cookies.get('atlassian.xsrf.token'),
        }

        r = session.post(url=url_s1, data=payload_s1, headers={"Referer": url0})
        r.raise_for_status()

        # Step 2: Update Fields
        url_s2 = urljoin(self.HOST, '/secure/ConvertIssueUpdateFields.jspa')
        payload_s2 = {
            "id": uid_dest,
            "guid": guid,
            "Next >>": "Next >>",
            "atl_token": session.cookies.get('atlassian.xsrf.token'),
        }

        r = session.post(url=url_s2, data=payload_s2)
        r.raise_for_status()

        # Step 3: Confirm the conversion with all of the details you have just configured
        url_s3 = urljoin(self.HOST, '/secure/ConvertIssueConvert.jspa')
        payload_s3 = {
            "id": uid_dest,
            "guid": guid,
            "Finish": "Finish",
            "atl_token": session.cookies.get('atlassian.xsrf.token'),
        }

        r = session.post(url=url_s3, data=payload_s3)
        r.raise_for_status()
Example #14
    def start(self):
        div_tag = self.soup.find('div', {'class': 'box3'})
        a_tags = div_tag.findAll('a')
        href_list = map(lambda x: urljoin(self.menu_url, x['href']), a_tags)

        pool = Pool(8)
        pool.map(self.download_mp3, href_list)
        pool.close()
Example #15
 def __getLinks(self, html, url):
     """ Extract Links from a Soup object. """
     soup = BeautifulSoup(html.read(), "html.parser")
     myset = set()
     for link in soup.find_all("a"):
         # Normalize the link
         myset.add(urljoin(url, link.get("href")).strip("/").split("#")[0])
     return myset
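A hedged illustration of the normalization applied inside the loop above, using made-up URLs:

# Illustrative only: the same urljoin/strip/split normalization as __getLinks above.
from urllib.parse import urljoin

page = "https://example.com/docs/"
for raw in ("intro.html#setup", "/about/", "https://example.com/faq/"):
    print(urljoin(page, raw).strip("/").split("#")[0])
# https://example.com/docs/intro.html
# https://example.com/about
# https://example.com/faq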
Example #16
 def action(self, url, method='GET', headers=None, body=None, json=None):
     # create Request
     request = requests.Request(method, urljoin(self.baseUri, url), headers=headers)
     if json:
         request.json = json
     elif body:
         request.data = body
     prepared = request.prepare()
     return self.session.send(prepared)
Example #17
 def _get_issues_details(self):
     for issue in Issue.objects.all():
         print('Processing:', issue)
         response = requests.get(
             url=urljoin(self.HOST, Issue.API.format(uid=issue.uid)),
             auth=self.AUTH,
         ).json()
         issue.json = response
         issue.save()
Example #18
 def get_design_doc(self, ddoc_name, **options):
     posturl = urljoin(self._dburl, "_design/%s/" % ddoc_name)
     response = yield from aiohttp.request(
         'GET', posturl,
         headers={
             'Accept': 'application/json',
         })
     data = yield from response.read()
     return Bunch(**json_loads(data))
Example #19
    def _do_transition(self, *, issue):
        url = urljoin(self.HOST, '/rest/api/2/issue/{key}/transitions'.format(key=issue.key))

        status = issue.json['fields']['status']['name']
        payload = {
            'transition': {'id': self.STATUSES[status]},
        }

        r = requests.post(url=url, json=payload, auth=self.AUTH)
        r.raise_for_status()
Example #20
 def delete_db(self, dbname=None, **options):
     if not dbname:
         dbname = self._dbname
     response = yield from aiohttp.request(
         'DELETE', urljoin(self._url, dbname),
         headers={
             'Accept': 'application/json'
         })
     data = yield from response.read()
     return Bunch(**json_loads(data))
Example #21
 def put_design_doc(self, ddoc_name, ddoc, **options):
     posturl = urljoin(self._dburl, "_design/%s/" % ddoc_name)
     response = yield from aiohttp.request(
         'PUT', posturl, data=json_dumps(ddoc),
         headers={
             'Accept': 'application/json',
             'content-type': 'application/json'
         })
     data = yield from response.read()
     return Bunch(**json_loads(data))
Example #22
def load_files_for_url(url, dst, extension=None):

    if not dst.exists():
        print("creating: {}".format(str(dst)))
        dst.mkdir()

    for filename, fileurl in get_files(url, extension):
        full_url = request.urljoin(EXUA_URL, fileurl)
        filepath = dst.joinpath(filename)
        print("downloading: {} -> {}".format(full_url, filepath))
        retrieve_file(full_url, filepath)
Example #23
    def read_index(language):
        index = {}
        base_url = 'https://devdocs.io/'
        request = Request(base_url + 'docs/' + language + '/index.json')
        with urlopen(request) as response:
            index = json.loads(response.read().decode('utf-8'))

        for entry in index['entries']:
            path = '/'.join([language, entry['path']])
            entry['path'] = urljoin(base_url, path)
        return index
Example #24
 def info(self, doc_id=None):
     url = self._dburl
     if doc_id:
         url = urljoin(url, doc_id)
     response = yield from aiohttp.request(
         'GET', url,
         headers={
             'Accept': 'application/json'
         })
     data = yield from response.read()
     return Bunch(**json_loads(data))
Example #25
def load_gabi_set(index, cname="csc-default", version='', savedir='.'):
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    # grab links to database browser pages

    base, links = grab_db_browser_links(index, cname=cname)

    for link in links:
        print('Attempting to load %s...\n' % link)
        load_gabi_collection(urljoin(base, link), version=version, savedir=savedir)
Example #26
    def plan_page(self, response):
        soup = BeautifulSoup(response.text)

        next_link = ''
        t = soup('a')
        for i in t:
            if i.get_text() == '上一页':
                link = urljoin(response.url, i['href'])
                next_link = link
                self.crawl(link, callback=self.plan_page, age=1, save=response.save) 
                break

        lists = soup.find('table', {'width':'100%'}).find_all('a', href=True)
        print(len(lists))
        for i in lists:
            if i.get_text() != '下一页':
                link = urljoin(response.url, i['href'])
                if link != next_link:
                    print(link)
                    self.crawl(link, callback=self.content_page, save=response.save, fetch_type='js')
Example #27
 def main_page(self):
     raw = self.bs4markup(self.do_get(M['main']))
     npc = raw.find('div', id='NanoPCSeries')
     _table = npc.next_sibling.next_sibling
     dat = []
     for li_ in _table.find_all('li'):
         dat.append({
             'name': li_.text,
             'url': urljoin(M['index'],
                            li_.a.get('href') + '/zh')
         })
     return dat
Example #28
 def page_data(self, response):
     html = response.text
     href_s = re.findall(r'<span class="post"><a href="(.*?)"', html)
     for href in href_s:
         yield scrapy.Request(href, callback=self.page_info)
     try:
         next_href = re.findall(r'<a href="(.*?)" rel=\'nofollow\'>下一页',
                                html)[0]
         next_href1 = request.urljoin(response.url, next_href)
         yield scrapy.Request(next_href1, callback=self.page_data)
     except Exception as e:
         print(e)
Example #29
    def get_all_website_links(self, url: str = None) -> None:
        """
        Returns all URLs that is found on `url` in which it belongs to the same website
        """
        url = url or self.url
        logger.debug(f"get_all_website_links: {url}")
        domain_name = urlparse(url).netloc
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                self.invalid_links.add((url, href))
                continue
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            href = (href if (parsed_href.netloc.endswith("youtube.com")
                             or parsed_href.netloc.endswith("youtu.be")) else
                    parsed_href.scheme + "://" + parsed_href.netloc +
                    parsed_href.path)

            if not self.is_valid(href):
                if href not in self.urls_to_ignore:
                    self.invalid_links.add((url, href))
                continue
            if (
                    # href in self.internal_urls
                    # or href in self.external_urls
                    href in self.urls_to_ignore):
                continue

            if domain_name not in href and self.url not in href:
                if href not in self.external_urls:
                    self.external_urls.add(href)
                    if not self.check_external_url(href):
                        logger.debug(f"INVALID {(url, href)}")
                        logger.debug("\tretrying")
                        for _ in range(self.RETRY_NR):
                            sleep(1)
                            if self.check_external_url(href):
                                logger.debug("\tseems to work !!!")
                                break
                        else:
                            # only mark the link invalid if no retry succeeded
                            self.invalid_links.add((url, href))
            else:
                if href not in self.internal_urls:
                    self.internal_urls.add(href)
                    if not self.check_internal_url(href):
                        self.invalid_links.add((url, href))
                        logger.debug(f"INVALID {(url, href)}")
                    try:
                        self.get_all_website_links(href)
                    except Exception:
                        self.invalid_links.add((url, href))
                        logger.debug(f"INVALID {(url, href)}")
Example #30
def get_links(text, url):
    time.sleep(1)
    content = {}
    linkList = []
    soup = BeautifulSoup(text, "html.parser")
    links = soup.findAll("a", href=True)
    for link in links:
        href = link['href']
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

        if href.endswith("/"):
            href = href[:-1]

        if "https" not in href:
            href = href.replace("http", "https")

        if href not in urls_list:
            if "uic.edu" in href and "@" not in href and is_valid(href):
                urls_queue.put(href)
                urls_list.append(href)
                linkList.append(href)
        else:
            linkList.append(href)

    # remove JS & CSS content
    for script in soup(["script", "style"]):
        script.extract()

    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    doc = preprocess(text)

    if url.endswith("/"):
        url = url[:-1]

    if "https" not in url:
        url = url.replace("http", "https")

    content["page"] = url
    content["text"] = doc
    content["link"] = linkList
    pagesList.append(content)

    return
Example #31
def tobii(company_name, companies_details):
    career_page_url = companies_details[company_name]['career_page_url']
    sector = companies_details[company_name]['sector']

    print(company_name)

    df = pd.DataFrame(columns=fields_needed)

    try:
        html = requests.get(career_page_url).text
        soup = BS(html, 'lxml')

        for item in soup.findAll("div", {"class": "job-listing-container"}):
            for ul in item.findAll('ul'):
                for li in ul.findAll('li'):
                    for a in li.findAll('a'):
                        job_specific_url = urljoin(career_page_url,
                                                   a.get('href'))

                    for j in li.findAll("span", {"class": "title"}):
                        job_title = j.text.strip()

                    for t in li.findAll("span", {"class": "meta"}):
                        ty = t.text.strip()

                    ty_split = ty.split('-', 1)

                    job_type = ty_split[0]

                    job_location = ty_split[1]

                    job_description = np.nan

                    years_of_experience = np.nan

                    job_department = np.nan

                    df = df.append(pd.Series(data=[
                        company_name, job_title, job_description, job_location,
                        job_type, years_of_experience, job_department,
                        job_specific_url, career_page_url, sector
                    ],
                                             index=fields_needed),
                                   ignore_index=True)
    except Exception as error:

        print(error)

        print(
            "<<<<<<<<<<<<<<<<<<<<< This company got an issue %s >>>>>>>>>>>>>>>>>>>>>>>"
            % career_page_url)

    return df
Example #32
def fetch_random_video(videos_list):
    video_name = random.choice(videos_list)
    cache_path = os.path.join(CACHE_DIR, video_name)

    if not os.path.exists(cache_path):
        url = request.urljoin(UCF_ROOT, video_name)

        response = (request.urlopen(url, context=UNVERIFIED_CONTEXT).read())
        with open(cache_path, 'wb') as f:
            f.write(response)

    return cache_path
Example #33
 def delete(self, doc_id, rev=None, **options):
     if rev is None:
         rev = yield from self.info(doc_id)
         rev = rev._rev
     url = '%s?rev=%s' % (urljoin(self._dburl, doc_id), quote(rev))
     response = yield from aiohttp.request(
         'DELETE', url,
         headers={
             'Accept': 'application/json'
         })
     data = yield from response.read()
     return Bunch(**json_loads(data))
Example #34
    async def mailboxes(self):
        headers = {'Authorization': 'bearer {}'.format(self.token_renew.token)}
        url = urljoin(self.host, self.endpoints['mailboxes'])

        try:
            async with aiohttp.ClientSession(raise_for_status=True) as session:
                async with session.get(url, headers=headers) as resp:
                    body = await resp.json()
                    return body.get('_embedded', {}).get('mailboxes', [])
        except aiohttp.ClientConnectionError as e:
            msg = 'Error making request %s. Reason: %s'
            logger.error(msg, url, str(e))
Example #35
def parse(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')

    title = soup.find('h1').get_text().strip()
    # title = soup.find('title')

    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    page_urls = set([urljoin(base_url, url['href'])
                     for url in urls])  # remove duplication

    url = soup.find('meta', {'property': "og:url"})['content']
    return title, url, page_urls
Example #36
def parse(html):
    """ 获取网站中网页的title, 页面url"""
    soup = BeautifulSoup(html, 'lxml')
    # 页面中的链接是/开头和结尾/,因此这里使用了RE进行提取
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    # 获取页面标题,去除空格
    title = soup.find('h1').get_text().strip()
    # 获取页面的url
    url = soup.find('meta', {'property': "og:url"})['content']
    # 构建一个集合set,保存该页面内的url, 好处是可以去重, 使用urljoin函数拼接url
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    return title, page_urls, url
Example #37
def normalize_url(url):
    resource = re.search(r'/[^/]+$', url)
    end = resource.group()
    address = url[0:resource.start()]
    address = address.casefold()
    url = address + end
    url = url.strip()
    url = url.split('www.')
    url = url[-1].split('//')
    url = urljoin('https://', ('//' + url[-1]))

    return url
Example #38
    def test__renew_token(self):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        token = TokenRenew(urljoin(settings.HELPSCOUT_HOST,
                                   '/v2/oauth2/token'))

        loop.run_until_complete(token.renew_token(token))

        self.assertEqual(len(self.cassette), 1)
        self.assertEqual(self.cassette.responses[0]['status']['code'], 200)
        self.assertEqual(self.cassette.requests[0].uri,
                         'https://api.helpscout.net/v2/oauth2/token')
Example #39
 def useful_links(url):
     parsed_uri = urlparse(url)
     base_url = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
     try:
         page = urlopen(url)
     except:
         return []
     query = '.main-content a[href]'
     soup = BeautifulSoup(page, 'html.parser')
     findings = soup.select(query)
     links = {url_no_params(urljoin(base_url, anchor['href'])) for anchor in findings}
     return {link for link in links if link.startswith(root_domain)}
Example #40
 def parse(self, response):
     privance = response.xpath(
         '//div[@class="topcity"]/dl[@id="clist"]//dd//a/@href').extract()
     for city in privance:
         if 'http://g.58.com/' in city:
             continue
         for i in range(1, 71):
             addVal = "job/pn{}".format(str(i))
             fullUrl = request.urljoin(city, addVal)
             yield scrapy.Request(fullUrl,
                                  callback=self.parseList,
                                  meta={'baseurl': city})
Example #41
def get_all_website_links(url: str) -> list:
    """
    Gets all urls that are found on `url` webpage hosted on medium.com.

    :param url: The url for the webpage
    :return:    List of urls for the 'url' parameter
    """

    # All urls of `url` parameter
    # Using a set for unique elements
    urls = set()

    # Domain name of the url without the protocol (https)
    domain_name = urlparse(url).netloc

    # HTTP GET request to the url
    result = requests.get(url)

    # BeautifulSoup gets the html of the url's webpage
    soup = BeautifulSoup(result.content, "html.parser")

    # Begin looking for all links, i.e. <a href=""> tags, on the current webpage
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        
        if href == "" or href is None:
            # href empty tag, links nowhere
            continue

        # Join the url if it's relative (not absolute link)
        href = urljoin(url, href)

        # urlparse() returns a ParseResult object with [scheme, netloc, path, params, query]
        parsed_href = urlparse(href)

        # Combines url pieces to get ready to insert into final list of all the urls
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

        if not is_valid(href):
            # Not a valid URL
            continue
        if href in internal_urls:
            # Already in the set
            # We only want unique url links
            continue
        if domain_name not in href:
            # External link; links to domain outside of medium.com
            continue

        urls.add(href)
        internal_urls.add(href)

    return urls
Example #42
def read_index(lang='css'):
    index = {}
    base_url = 'https://devdocs.io/'
    request = Request(base_url + 'docs/' + lang + '/index.json')
    with urlopen(request) as response:
        index = json.loads(response.read().decode('utf-8'))

    # convert the path into an absolute URL
    for entry in index['entries']:
        path = '/'.join([lang, entry['path']])
        entry['path'] = urljoin(base_url, path)
    return index
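For context, a small sketch of the path-to-URL conversion the loop above performs; the entry path is hypothetical:

# Illustrative only: how an index entry path becomes an absolute devdocs.io URL.
from urllib.parse import urljoin

base_url = 'https://devdocs.io/'
path = '/'.join(['css', 'animation'])   # hypothetical entry path
print(urljoin(base_url, path))          # https://devdocs.io/css/animation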
Example #43
    def rest_upload_file(self,
                         _file,
                         username=GEONODE_USER,
                         password=GEONODE_PASSWD):
        """ function that uploads a file, or a collection of files, to
        the GeoNode"""
        assert authenticate(username=username, password=password)
        self.assertTrue(self.client.login(username=username,
                                          password=password))
        spatial_files = ("dbf_file", "shx_file", "prj_file")
        base, ext = os.path.splitext(_file)
        params = {
            # make public since wms client doesn't do authentication
            'permissions':
            '{ "users": {"AnonymousUser": ["view_resourcebase"]} , "groups":{}}',
            'time': 'false',
            'charset': 'UTF-8'
        }

        # deal with shapefiles
        if ext.lower() == '.shp':
            for spatial_file in spatial_files:
                ext, _ = spatial_file.split('_')
                file_path = f"{base}.{ext}"
                # sometimes a shapefile is missing an extra file,
                # allow for that
                if os.path.exists(file_path):
                    params[spatial_file] = open(file_path, 'rb')

        with open(_file, 'rb') as base_file:
            params['base_file'] = base_file
            for name, value in params.items():
                if isinstance(value, IOBase):
                    params[name] = (os.path.basename(value.name), value)
            url = urljoin(f"{reverse('uploads-list')}/", 'upload/')
            logger.error(f" ---- UPLOAD URL: {url}")
            response = self.client.post(url, data=params)

        # Closes the files
        for spatial_file in spatial_files:
            if isinstance(params.get(spatial_file), IOBase):
                params[spatial_file].close()

        try:
            logger.error(
                f" -- response: {response.status_code} / {response.json()}")
            return response, response.json()
        except (ValueError, TypeError):
            logger.exception(
                ValueError(
                    f"probably not json, status {response.status_code} / {response.content}"
                ))
            return response, response.content
Example #44
def getdata(url):
    domain_name = urlparse(url).netloc
    ua = UserAgent()
    headers = {"User-Agent": ua.random}

    while True:
        try:
            session = requests.Session()
            session.headers.update(headers)
            retries = Retry(total=5,
                            backoff_factor=1,
                            status_forcelist=[429, 500, 502, 503])
            adapter = HTTPAdapter(max_retries=retries)
            session.mount('http://', adapter)
            session.mount('https://', adapter)

            res = session.get(url, allow_redirects=True, timeout=15)
            res.raise_for_status()
        except requests.exceptions.RequestException:
            pass

        else:
            internal_urls = []
            external_urls = []
            soup = BeautifulSoup(res.content, "html.parser")
            for a_tag in soup.findAll("a"):
                href = a_tag.attrs.get("href")
                if href == "" or href is None:
                    continue
                if href == "#" or href == "/":
                    continue
                href = urljoin(url, href)
                parsed_href = urlparse(href)
                href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

                if (parsed_href.scheme == 'invalid-title'):
                    continue
                if not is_valid(href):
                    continue
                if href in internal_urls:
                    continue
                if domain_name not in href:
                    if href not in external_urls:
                        encode_href = unquote(href)
                        external_urls.append(encode_href)
                    continue
                encode_href = unquote(href)
                internal_urls.append(encode_href)

            print('External Urls = ', external_urls, '\n')
            print('Internal Urls = ', internal_urls, '\n')

        break
Example #45
def get_list(url):
    base_url = "https://so.gushiwen.org/"

    response = requests.get(url)

    sel = Selector(response)

    links = sel.css(".typecont a::attr(href)").extract()

    links = [urljoin(base_url, url) for url in links]

    return links
Example #46
    def scrape(self, url):
        r = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        x = urlopen(r)
        codebase = BeautifulSoup(x, 'html.parser')
        images = codebase.findAll("img")
        imageUrls = []
        for i in images:
            relativeUrl = i.get("src")
            if (not relativeUrl):
                relativeUrl = i.get("data-src")
            if relativeUrl:
                if "http" in relativeUrl:
                    imageUrls.append(relativeUrl)
                else:
                    tempUrl = urljoin(url, relativeUrl)
                    imageUrls.append(tempUrl)

        iconLink = codebase.find("link", rel="shortcut icon")
        if not iconLink:
            iconLink = ' '
        else:
            iconLink = urljoin(url, iconLink.get('href'))

        title = codebase.title.string
        if not title:
            domain = urlparse(url)
            title = domain.hostname

        for i in imageUrls:
            keywords, label = self.detect_web_uri(i)
            x = ''
            for j in keywords:
                x = x + j + ", "
            imageDB.objects.create(keywords=x,
                                   dateTime=timezone.now(),
                                   sourceUrl=url,
                                   imageUrl=i,
                                   label=label,
                                   icon=iconLink,
                                   title=title)
Example #47
def oneorigin(company_name, companies_details):
    # company_name = 'ONEORIGIN'
    # career_page_url = 'https://www.oneorigin.us/careers/'

    print(company_name)

    career_page_url = companies_details[company_name]['career_page_url']
    sector = companies_details[company_name]['sector']

    headers = {'User-Agent': 'Mozilla/5.0'}

    html = requests.get(career_page_url, headers=headers).text
    soup = BS(html, 'lxml')

    divs = soup.find('div', {'class': 'column medium-12 col-md-12'})
    divs_all = divs.find_all('div', {'class': 'job-preview clearfix'})

    df = pd.DataFrame(columns=fields_needed)

    for div in divs_all:
        job_title = div.find('div', {
            "class": "job-content"
        }).find('h5').find('span').get_text().strip()

        job_description = div.find('div', {
            "class": "job_custom_message"
        }).get_text().strip()

        job_location = np.nan

        job_type = div.find('div', {
            "class": "job-additional-information"
        }).find('span').get_text().strip()

        years_of_experience = np.nan

        job_department = np.nan

        job_specific_url = urljoin(
            career_page_url,
            div.find('div', {
                "class": "job-content"
            }).find('h5').find('a')['href'])

        df = df.append(pd.Series(data=[
            company_name, job_title, job_description, job_location, job_type,
            years_of_experience, job_department, job_specific_url,
            career_page_url, sector
        ],
                                 index=fields_needed),
                       ignore_index=True)
    return df
Example #48
def get_view_history_link(search_word):
    "该函数用来获取历史编辑连接地址"
    root_url = 'https://en.wikipedia.org'
    url = root_url + '/wiki/' + search_word
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    relative_link = soup.find('li', {'id': 'ca-history'}).span.a['href']
    return urljoin(root_url, relative_link)
Example #49
	def crawl( thread_name , url , linksToCrawl ):
		try:
			link = urljoin( Crawler.base_url , url )
			if(urlparse(link).netloc == 'tutorialedge.net') and (link not in Crawler.crawledLinks):
				request = Request(link , headers = {'User-Agent':'Mozilla/5.0'})
				response = urlopen(request , context = Crawler.myssl )
				Crawler.crawledLinks.add( link )
				print('Url {} Crawled with Status : {} : {} Crawled In Total'.format(response.geturl() , response.getcode(),len(Crawler.crawledLinks)))
				soup = BeautifulSoup(response.read() , 'html.parser')
				Crawler.enqueueLinks(soup.find_all('a') , linksToCrawl)
		except URLError as e:
			print('URL {} threw this error when trying to parse : {}'.format(link , e.reason ))
			Crawler.errorLinks.add(link)
Example #50
def parse(body):
    parser = html.parse(body)

    for quote_block in parser.xpath("//div[@class='quote']"):
        quote = quote_block.xpath("./span[@class='text']/text()")
        author = quote_block.xpath("./span[2]/small/text()")
        data.append({'quote': quote, 'author': author})

    next_page = parser.xpath("//li[@class='next']/a/@href")
    if next_page:
        return request.urljoin("http://quotes.toscrape.com", next_page[0])
    else:
        return None
Example #51
    def parse(self, response):
        pagetitle = response.xpath('/html/head/title').extract_first()
        print("content parsing:", pagetitle)

        products = response.xpath('//*[@class="product_top"]')
        for product in products:
            if product is not None:
                page = product.xpath('./a/@href').extract_first()
                detailPage = request.urljoin(response.url, page)
                pictureUrl = product.xpath('./a/img/@src').extract_first()
                yield scrapy.Request(detailPage,
                                     callback=self.page_parse,
                                     meta={'pictureUrl': pictureUrl})
Example #52
 def delete_design_doc(self, ddoc_name, rev=None, **options):
     if rev is None:
         rev = yield from self.get_design_doc(ddoc_name)
         rev = rev._rev
     posturl = urljoin(self._dburl, "_design/%s/" % ddoc_name)
     url = '%s?rev=%s' % (posturl, quote(rev))
     response = yield from aiohttp.request(
         'DELETE', url,
         headers={
             'Accept': 'application/json'
         })
     data = yield from response.read()
     return Bunch(**json_loads(data))
Example #53
def scrap_hero_links(page, baselink):
    soup = BeautifulSoup(page, "html.parser")
    heroes = soup.find("div", {"class": "hero-grid"})
    children = heroes.findChildren("a")
    links = []
    for child in children:
        link = urljoin(baselink, child['href'])
        name = child.find("div", {"class": "name"}).text
        l = child['href'].split("/")
        h_id = l.pop()
        hero = {"id": h_id, "name": name, "url": link}
        links.append(hero)
    return links
Example #54
def get_ip(search_word):
    """该函数用来获取匿名修改的IP地址"""
    url = get_view_history_link(search_word)
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    page_500_link = urljoin(
        'https://en.wikipedia.org',
        soup.find_all('a', {'class': 'mw-numlink'})[-1]['href'])
    soup = BeautifulSoup(requests.get(page_500_link).text, 'lxml')
    ips = soup.find_all('a', {'class': 'mw-anonuserlink'})
    set_ips = set()
    for ip in ips:
        set_ips.add(ip.text)
    return set_ips
Example #55
def GetUrl(url,home_url = 'http://www.yhdfa.com/index.php?s=/home/type/index.html'):
    global url_set
    html = requests.get(url).content.decode('utf-8')
    for url in re.findall(r'/index.php\?s=/home/series/series/typeno/\w+.html', html):
        url = urljoin(home_url,url)
        htmls = requests.get(url).content.decode('utf-8')
        for var in re.findall(r'/index.php\?s=/home/detail/detail/xlno/\w+.html', htmls):
            img_list = []
            img_url = {}
            var = urljoin(home_url,var)
            file_name = os.path.splitext(var)[0]
            file_list = file_name.split('/')
            brandname = file_list.pop()
            html = requests.get(var).content.decode('utf-8')
            for index,img in enumerate(re.findall('Uploads/Picture/Code/.*?.jpg',html)):
                img = urljoin(home_url,img)
                filename = DownloadImg(img,path)
                img_url[index] = filename
            img_list.append(img_url)
            sql = 'update zhoucheng set img="'+pymysql.escape_string(json.dumps(img_list))+'" where brand = "'+brandname+'"'
            cursor.execute(sql)
            print(brandname)
Example #56
 def parse_list(self, response):
     doc = PyQuery(response.text)
     data_list = doc('.rank-item .info a')
     for index, data in enumerate(data_list.items()):
         if not data.attr('href'):
             continue
         item = {
             'url': urljoin(self.config['url'], data.attr('href')),
             'title': data.text(),
             'source': self.config_id,
             'real_pos': index
         }
         yield item
Example #57
def load_gabi_set(index, cname="csc-default", version='', savedir='.'):
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    # grab links to database browser pages

    base, links = grab_db_browser_links(index, cname=cname)

    for link in links:
        print('Attempting to load %s...\n' % link)
        load_gabi_collection(urljoin(base, link),
                             version=version,
                             savedir=savedir)
Example #58
def parse(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Extract all the links
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    # Extract the title
    title = soup.find('h1').get_text().strip()  # strip() removes leading/trailing whitespace and newlines

    # set() creates an unordered, deduplicated collection; urljoin() builds complete absolute URLs
    page_urls = set([urljoin(base_url, url['href']) for url in urls])

    # URL of the current page
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url
Example #59
    def download_query_results(self,
                               query_id: str,
                               api_key: str,
                               filename: str = 'results.csv') -> None:
        """

        Args:
            query_id (str) : specific query id
            api_key (str) : api authorization key
            filename (str) : output file name

        Returns:

        """

        opener = request.build_opener()
        opener.addheaders = [('Authorization', f'Key {api_key}')]
        request.install_opener(opener=opener)

        url = request.urljoin(self.url, f'{query_id}/')
        url = request.urljoin(url, 'results.csv')
        request.urlretrieve(url=url, filename=filename)
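Both joins above rely on the base URL ending in a trailing slash; a brief, self-contained reminder of how urljoin treats the last path segment (the URLs are illustrative):

# Illustrative only: urljoin replaces the last path segment unless the base ends with "/".
from urllib.parse import urljoin

base = "https://app.example.com/api/queries/"          # hypothetical API root
print(urljoin(base, "42/"))                            # https://app.example.com/api/queries/42/
print(urljoin(urljoin(base, "42/"), "results.csv"))    # https://app.example.com/api/queries/42/results.csv
print(urljoin("https://app.example.com/api/queries", "42/"))  # https://app.example.com/api/42/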
Example #60
    def _set_password(self, *, user, password):
        session = self._get_sudo_session()

        url = urljoin(self.HOST, '/secure/admin/user/SetPassword.jspa')
        payload = {
            "inline": "true",
            "decorator": "dialog",
            "password": password,
            "confirm": password,
            "name": user.name,
            "atl_token": session.cookies.get('atlassian.xsrf.token'),
        }
        r = session.post(url=url, data=payload)
        r.raise_for_status()