Example No. 1
 def get_attribute_from_html(self, data):
     result = {}
     tag_name = []
     for key, value in data.items():
         if "url" != key:
             pars = BeautifulSoup(value, "html.parser").find()
             if pars.get("id", False):
                 parsed_data = "#" + pars.get("id")
                 result[key] = parsed_data
             elif pars.get("src", False):
                 parsed_data = "@" + pars.get("src")
                 result[key] = parsed_data + '|' + pars.name
             elif pars.get("class", False):
                 parsed_data = "." + " ".join(pars.get("class"))
                 result[key] = parsed_data
             elif pars.get('itemprop', False):
                 parsed_data = "&" + pars.get("itemprop")
                 result[key] = parsed_data + "|" + pars.name
             elif pars.get('href', False):
                 parsed_data = "^" + pars.get("href")
                 result[key] = parsed_data + "|" + pars.name
             else:
                 tag_name.append(key)
         else:
             result[key] = value
     if tag_name:
         return [result, tag_name]
     return result
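Below is a minimal, hedged usage sketch (not part of the original project; the sample dict and its HTML values are invented) showing how the Tag.get() precedence above turns snippets into prefixed selectors:

from bs4 import BeautifulSoup

def demo_prefixes(samples):
    # Mirrors the id / src / class precedence used in get_attribute_from_html above.
    out = {}
    for key, value in samples.items():
        if key == "url":
            out[key] = value
            continue
        tag = BeautifulSoup(value, "html.parser").find()
        if tag.get("id"):
            out[key] = "#" + tag.get("id")
        elif tag.get("src"):
            out[key] = "@" + tag.get("src") + "|" + tag.name
        elif tag.get("class"):
            out[key] = "." + " ".join(tag.get("class"))
    return out

print(demo_prefixes({
    "url": "https://example.com",                  # passed through unchanged
    "title": '<h1 id="main-title">Hello</h1>',     # -> '#main-title'
    "image": '<img src="/logo.png">',              # -> '@/logo.png|img'
    "body": '<p class="lead intro">text</p>',      # -> '.lead intro'
}))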
Example No. 2
def repetitive_rate_by_playlistlink(link1, link2):
    # Collect all songs from each playlist into string lists
    lists1 = []
    lists2 = []
    #linklist=[link1,link2]
    #for url in linklist:
    #    s = requests.session()
    #    s = BeautifulSoup(s.get(url, headers=headers).content, 'lxml')
    #    main = s.find('ul', {'class': 'f-hide'})
    #    for music in main.find_all('a'):
    #        lists.append(music.text)
    s1 = requests.session()
    s1 = BeautifulSoup(s1.get(link1, headers=headers).content, 'lxml')
    main = s1.find('ul', {'class': 'f-hide'})
    for music in main.find_all('a'):
        lists1.append(music.text)
    s2 = requests.session()
    s2 = BeautifulSoup(s2.get(link2, headers=headers).content, 'lxml')
    main = s2.find('ul', {'class': 'f-hide'})
    for music in main.find_all('a'):
        lists2.append(music.text)
    myset1 = set(lists1)
    myset2 = set(lists2)
    pattern = re.compile('\Wu\'')
    intersectionset = re.sub(pattern, '<br>\'', str(myset1 & myset2))
    length = len(myset1 | myset2) + len(myset1 & myset2)
    print intersectionset
    return (u"你们的歌单重合率为:%f%%<br><br>重复歌曲共%d首如下:%s" %
            (len(myset1 & myset2) * 200 / length, len(myset1 & myset2),
             intersectionset.decode('unicode-escape')))
Example No. 3
def evaluateDiv(contentJson):
    contentBase = BeautifulSoup(contentJson, "lxml")

    divs = contentBase.find_all('div')
    totalQuantity = len(divs)

    failed = 0
    badTags = []
    good = 0
    flag = 0
    numId = -1

    for div in divs:
        numId += 1

        divRole = div.get('role')
        divAria = div.get('aria-level')
        nextAux = div.findNext()
        divNested = BeautifulSoup(str(nextAux), 'lxml').find('div')
        divParent = div.parent

        if (divAria):
            # nested ARIA headings - 100%
            parentAria = divParent.get('aria-level')
            parentRole = divParent.get('role')
            if (divParent.name == "div" and parentAria
                    and parentRole == "heading"):
                flag = 1
                typeError = 1

            # nested ARIA headings - 100%
            elif (divNested):
                divNestedRole = divNested.get('role')
                divNestedAria = divNested.get('aria-level')
                if (divNestedAria and (divNestedRole == "heading")):
                    flag = 1
                    typeError = 1

            # ARIA role=heading - 25%
            elif (divRole and divRole == "heading" and not divNested):
                good += 1
        else:
            good += 1

        if (flag == 1):
            failed += 1
            pos = position(contentBase, div)
            pos.append(numId)
            pos.append(typeError)
            badTags.append(pos)
            flag = 0

    dataResponse = {
        'tag': 'div',
        'totalTagsAnalyzed': totalQuantity,
        'totalTagsFailed': failed,
        'positionsBadTags': badTags
    }
    response = json.dumps({'status': 'success', 'data': dataResponse})
    return response
Example No. 4
    def search_id_by_username(self, username):
        """Look up a user ID by username (nickname).

        :param str username: the user's nickname
        :return int: the user's ID
        """
        if not username:
            cprint('Must give an <user id> or <username>!', 'yellow')
            sys.exit(1)

        search_url = urljoin(HOST_PAGE,
                             SEARCH_DESIGNER_SUFFIX.format(word=username))
        try:
            response = session_request(search_url)
        except requests.exceptions.ProxyError:
            cprint('Cannot connect to proxy.', 'red')
            sys.exit(1)
        except Exception as e:
            cprint(f'Failed to connect to {search_url}, {e}', 'red')
            sys.exit(1)

        author_1st = BeautifulSoup(response.text,
                                   'html.parser').find(name='div',
                                                       class_='author-info')
        if (not author_1st) or (author_1st.get('data-name') != username):
            cprint(f'Username「{username}」does not exist!', 'yellow')
            sys.exit(1)

        return author_1st.get('data-id')
Example No. 5
def encodeScript(line):
    sc = BeautifulSoup(line, "html.parser").find("script")
    if(sc.get("src")):
        sc["src"] = encodeBase64(sc.get("src"))
    else:
        sc.string = pattern.sub(
            lambda x:  repr(encodeBase64(x.group(2), dirname)), sc.string)
    return sc.prettify()
Example No. 6
def process_heading(heading: BeautifulSoup, textf: str, is_toplevel: bool,
                    single_file: bool) -> TocItem:
    """
	Generate and return a TocItem from this heading.

	INPUTS:
	heading: a BeautifulSoup tag representing a heading tag
	textf: the path to the file
	is_toplevel: is this heading at the top-most level in the file?
	single_file: is there only one content file in the production (like some Poetry volumes)?

	OUTPUTS:
	a qualified ToCItem object
	"""

    toc_item = TocItem()
    parent_sections = heading.find_parents(["section", "article"])
    if parent_sections:
        toc_item.level = len(parent_sections)
    else:
        toc_item.level = 1

    try:
        toc_item.division = get_book_division(heading)
    except se.InvalidInputException:
        raise se.InvalidInputException(
            f"Couldn’t identify parent section in file: [path][link=file://{textf}]{textf}[/][/]."
        )

    # This stops the first heading in a file from getting an anchor id; we don't generally want that.
    # The exceptions are things like poems within a single-file volume.
    toc_item.id = get_parent_id(heading)  # pylint: disable=invalid-name
    if toc_item.id == "":
        toc_item.file_link = textf
    else:
        if not is_toplevel:
            toc_item.file_link = f"{textf}#{toc_item.id}"
        elif single_file:  # It IS the first heading in the file, but there's only a single content file?
            toc_item.file_link = f"{textf}#{toc_item.id}"
        else:
            toc_item.file_link = textf

    toc_item.lang = heading.get("xml:lang") or ""

    # A heading may include z3998:roman directly,
    # eg <h5 epub:type="title z3998:roman">II</h5>.
    attribs = heading.get("epub:type") or ""

    if "z3998:roman" in attribs:
        toc_item.roman = extract_strings(heading)
        toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
        return toc_item

    process_heading_contents(heading, toc_item)

    return toc_item
Example No. 7
File: msg.py Project: amxtaut/SkPy
 def rawToFields(cls, raw={}):
     fields = super(SkypeLocationMsg, cls).rawToFields(raw)
     locTag = BeautifulSoup(raw.get("content"), "html.parser").find("location")
     # Exponent notation produces a float, meaning lat/long will always be floats too.
     fields.update({"latitude": int(locTag.get("latitude")) / 1e6,
                    "longitude": int(locTag.get("longitude")) / 1e6,
                    "altitude": int(locTag.get("altitude")),
                    "address": locTag.get("address"),
                    "mapUrl": locTag.find("a").get("href")})
     return fields
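A small self-contained sketch of the same Tag.get() calls; the <location> markup is invented for illustration and is not a real Skype payload:

from bs4 import BeautifulSoup

content = ('<location latitude="48858370" longitude="2294481" altitude="35" '
           'address="Champ de Mars, Paris">'
           '<a href="https://example.com/map">map</a></location>')
locTag = BeautifulSoup(content, "html.parser").find("location")
print(int(locTag.get("latitude")) / 1e6,     # 48.85837
      int(locTag.get("longitude")) / 1e6,    # 2.294481
      locTag.get("address"),                 # 'Champ de Mars, Paris'
      locTag.find("a").get("href"))          # 'https://example.com/map'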
Example No. 8
def evaluateLabel(contentJson):
	contentBase= BeautifulSoup(contentJson, "lxml")

	labels = contentBase.find_all('label')
	totalQuantity = len(labels)

	failed = 0
	good = 0
	badTags = []
	flag = 0
	numId = -1

	for label in labels:
		numId += 1
		labelNext = label.findNext()
		nextInput = BeautifulSoup(str(labelNext), 'lxml').find('input')
		labelAttrFor = label.get('for')
		labelPlaceholder = label.text
		if (labelPlaceholder):
			labelPlaceholder = re.sub(r"[ \t\n\x0B\f\r]", '', labelPlaceholder, flags=0)


		#*INPUT type=text inside blank LABEL - 100%
		if (not labelPlaceholder and nextInput):
			inputType = nextInput.get('type')
			if (inputType == "text"):
				flag = 1
				typeError = 1

		#*INPUT type=text with blank LABEL FOR - 100%
		elif (labelAttrFor and nextInput):
			inputType = nextInput.get('type')
			if (inputType == "text"):
				flag = 1
				typeError = 1

		else: 
			good += 1

		if (flag == 1):
			failed += 1
			pos = position(contentBase, label)
			pos.append(numId)
			pos.append(typeError)
			badTags.append(pos)
			flag = 0

	dataResponse ={
		'tag' : 'label',
		'totalTagsAnalyzed': totalQuantity,
		'totalTagsFailed': failed,
		'positionsBadTags': badTags
	}
	response = json.dumps({'status': 'success', 'data': dataResponse})
	return response
Example No. 9
def findPages(uri,uris):
    # Recursively finds the links to each page
    import requests
    from bs4 import BeautifulSoup
    link = BeautifulSoup(requests.get(uri).text,'html.parser').find('a',text='Next')
    if link != None:
        uris.append('https://www.washingtonpost.com'+link.get('href'))
        findPages('https://www.washingtonpost.com'+link.get('href'),uris)
        return uris
    else:
        return []
Example No. 10
    def get_entry_img_url(cls, element: BeautifulSoup):
        if element.img is not None:
            element = element.img

        img_url = element.get("data-src", None)
        if img_url is None:
            img_url = element.get("src", None)

        if img_url is None:
            return None
        return cls.abs_url(img_url)
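For reference, a short standalone illustration of the data-src/src fallback above (the <img> snippets are made up):

from bs4 import BeautifulSoup

lazy = BeautifulSoup('<img data-src="/lazy.jpg" src="/placeholder.gif">', 'html.parser').img
eager = BeautifulSoup('<img src="/eager.jpg">', 'html.parser').img

for img in (lazy, eager):
    img_url = img.get("data-src", None)   # prefer the lazy-loading attribute
    if img_url is None:
        img_url = img.get("src", None)    # fall back to the plain src
    print(img_url)                        # '/lazy.jpg', then '/eager.jpg'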
Example No. 11
    def chrollPostList(self, postListUrl):
        self.driver.movePage(postListUrl)
        postList = PostList(postListUrl)
        soup = BeautifulSoup(self.driver.getPageSource(), 'html.parser')
        trList = soup.find_all('tr', attrs={'class': 'ub-content us-post'})

        soup.get('')
        for trElement in trList:
            if len(trElement.find_all('b')) > 0: continue
            postList.addPost(self._initPost(trElement))
        self.logger.print('[done] ' + postListUrl + ' 게시글 목록 chrolling 완료')
        return postList
Example No. 12
File: msg.py Project: amxtaut/SkPy
 def rawToFields(cls, raw={}):
     fields = super(SkypeFileMsg, cls).rawToFields(raw)
     # BeautifulSoup converts tag names to lower case, and find() is case-sensitive.
     file = BeautifulSoup(raw.get("content"), "html.parser").find("uriobject")
     if file:
         fileFields = {"name": (file.find("originalname") or {}).get("v"),
                       "size": (file.find("filesize") or {}).get("v"),
                       "urlFull": file.get("uri"),
                       "urlThumb": file.get("url_thumbnail"),
                       "urlView": (file.find("a") or {}).get("href")}
         fields["file"] = SkypeFileMsg.File(**fileFields)
     return fields
Example No. 13
    def get_tags(article: BeautifulSoup) -> List[Optional[str]]:
        """
        Get tags from an article.

        Args:
            article (BeautifulSoup): the article element to read tags from.
        """
        article_tags = (
            article.get("data-tags").split(",") if article.get("data-tags") else []
        )
        article_tags = map(str.strip, article_tags)
        return [_ for _ in article_tags]
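A quick standalone illustration of the data-tags handling above (the <article> snippet is an assumption):

from bs4 import BeautifulSoup

article = BeautifulSoup('<article data-tags="python, bs4 , scraping"></article>',
                        'html.parser').article
tags = article.get("data-tags").split(",") if article.get("data-tags") else []
print([t.strip() for t in tags])   # ['python', 'bs4', 'scraping']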
Example No. 14
 def rawToFields(cls, raw={}):
     fields = super(SkypeLocationMsg, cls).rawToFields(raw)
     locTag = BeautifulSoup(raw.get("content"), "html.parser").find("location")
     for attr in ("latitude", "longitude", "altitude", "speed", "course"):
         fields[attr] = int(locTag.get(attr)) if locTag.get(attr) else None
     # Exponent notation produces a float, meaning lat/long will always be floats too.
     for attr in ("latitude", "longitude"):
         if fields[attr]:
             fields[attr] /= 1e6
     fields.update({"address": locTag.get("address"),
                    "mapUrl": locTag.find("a").get("href")})
     return fields
Example No. 15
def should_skip_element(element: BeautifulSoup) -> bool:
    attributes = element.get_attribute_list('class')
    if element.get('data-no-tax-price') is None:
        return True

    if 'hidden' in attributes:
        return True

    if not is_number(element.get('data-no-tax-price')):
        return True

    return False
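A hedged, self-contained sketch of the attribute checks above; the sample HTML is invented, and is_number/should_skip_element refer to the snippet just shown:

from bs4 import BeautifulSoup

element = BeautifulSoup(
    '<span class="price hidden" data-no-tax-price="19.99">19.99</span>',
    'html.parser').span

print(element.get_attribute_list('class'))   # ['price', 'hidden']
print(element.get('data-no-tax-price'))      # '19.99'
# should_skip_element(element) would return True here: the price attribute is
# present and numeric, but the 'hidden' class triggers the second check.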
Example No. 16
 def rawToFields(cls, raw={}):
     fields = super(SkypeFileMsg, cls).rawToFields(raw)
     # BeautifulSoup converts tag names to lower case, and find() is case-sensitive.
     file = BeautifulSoup(raw.get("content"), "html.parser").find("uriobject")
     if file:
         fileFields = {"name": (file.find("originalname") or {}).get("v"),
                       "size": (file.find("filesize") or {}).get("v"),
                       "urlFull": file.get("uri"),
                       "urlThumb": file.get("url_thumbnail"),
                       "urlView": (file.find("a") or {}).get("href")}
         fields["file"] = SkypeFileMsg.File(**fileFields)
     return fields
Example No. 17
 def rawToFields(cls, raw={}):
     fields = super(SkypeLocationMsg, cls).rawToFields(raw)
     locTag = BeautifulSoup(raw.get("content"), "html.parser").find("location")
     for attr in ("latitude", "longitude", "altitude", "speed", "course"):
         fields[attr] = int(locTag.get(attr)) if locTag.get(attr) else None
     # Exponent notation produces a float, meaning lat/long will always be floats too.
     for attr in ("latitude", "longitude"):
         if fields[attr]:
             fields[attr] /= 1e6
     fields.update({"address": locTag.get("address"),
                    "mapUrl": locTag.find("a").get("href")})
     return fields
Example No. 18
    def get_guanzhu(self, url):
        count = 1
        response = self.session.get(url)
        pages = re.search(r'(<div class=\\"W_pages\\">[\s\S]*?<\\/div>)',
                          response.text).group(1)

        soup = BeautifulSoup(pages, 'lxml').find_all('a')[-1]
        next_page = None
        if '下一页' in str(soup):
            next_page = 'https://weibo.com' + soup.get('href').replace(
                '\\"', '').replace('\\', '')

        uids = []
        guanzhu_li = re.findall(
            r'(<div class=\\"info_name[\s\S]*?>[\s\S]*?<\\/div>)',
            response.text)
        for i in guanzhu_li:
            soup = BeautifulSoup(i, 'lxml')
            a = soup.find('a', attrs={'class': '\\"S_txt1\\"'})
            userid = a.get('usercard')
            if userid:
                uids.append(re.search(r'id=(\d+)&', userid).group(1))
            else:
                uids.append(None)

        while next_page:
            count += 1
            # Sina only shows the first five pages of the follow list unless the two users follow each other
            if count > 5:
                break
            response = self.session.get(next_page)
            pages = re.search(r'(<div class=\\"W_pages\\">[\s\S]*?<\\/div>)',
                              response.text).group(1)
            soup = BeautifulSoup(pages, 'lxml').find_all('a')[-1]
            next_page = None
            if '下一页' in str(soup) and soup.get('href'):
                next_page = 'https://weibo.com' + soup.get('href').replace(
                    '\\"', '').replace('\\', '')

            guanzhu_li = re.findall(
                r'(<div class=\\"info_name[\s\S]*?>[\s\S]*?<\\/div>)',
                response.text)
            for i in guanzhu_li:
                soup = BeautifulSoup(i, 'lxml')
                a = soup.find('a', attrs={'class': '\\"S_txt1\\"'})
                userid = a.get('usercard')
                if userid:
                    uids.append(re.search(r'id=(\d+)&', userid).group(1))
                else:
                    uids.append(None)
        print(len(uids))
        return uids
Example No. 19
 def rawToFields(cls, raw={}):
     fields = super(SkypeLocationMsg, cls).rawToFields(raw)
     locTag = BeautifulSoup(raw.get("content"),
                            "html.parser").find("location")
     fields.update({
         # Exponent notation produces a float, meaning lat/long will always be floats too.
         "latitude": int(locTag.get("latitude")) / 1e6,
         "longitude": int(locTag.get("longitude")) / 1e6,
         "altitude": int(locTag.get("altitude")),
         "address": locTag.get("address"),
         "mapUrl": locTag.find("a").get("href")
     })
     return fields
Example No. 20
def re_the_src(img_src):
    '''
    Extract the base64 payload, image type, height and width from an <img> tag string.
    '''
    # print(img_src)
    this_img = BeautifulSoup(img_src, features="lxml").find('img')
    src = str(this_img.get('src'))
    height = str(this_img.get('height'))
    width = str(this_img.get('width'))
    # print(src)
    b64_data = src.split("base64,")[1]
    image_type = src.split("base64,")[0].split("/")[1][:-1]
    return b64_data, image_type, height, width
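A minimal standalone sketch of the src-splitting logic above; the data URI is a stand-in payload, not a real image:

from bs4 import BeautifulSoup

# An <img> tag carrying a stand-in base64 data URI (the payload is not a real image).
img_src = ('<img src="data:image/gif;base64,R0lGODlhAQABAAAAACw=" '
           'height="1" width="1">')
this_img = BeautifulSoup(img_src, features="lxml").find('img')
src = str(this_img.get('src'))
b64_data = src.split("base64,")[1]                        # 'R0lGODlhAQABAAAAACw='
image_type = src.split("base64,")[0].split("/")[1][:-1]   # 'gif'
print(b64_data, image_type, this_img.get('height'), this_img.get('width'))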
Example No. 21
def get_session(url):
    if "bestbuy.com" in url: # beautifulsoup is faster and headless so I prefer this method for compatible websites
        with open('headers.json') as file:
            data = json.load(file)
        header = data[choice(list(data.keys()))] # random header picker -- originally used as a clever way to prevent amazon blocking scrape, but that no longer works so I switched to Selenium
        r = requests.Session()
        response = r.get(url, headers = header)
        page = BeautifulSoup(response.content, 'html.parser')
    else: # selenium is used for target and amazon
        PATH = os.getcwd() + driver
        page = webdriver.Chrome(PATH)
        page.get(url)
    return page
Example No. 22
 def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
     """Get a single job attribute from a soup object by JobField
     TODO: impl div class=compactStars value somewhere.
     """
     if parameter == JobField.TITLE:
         # TODO: we should instead get what user sees in the <span>
         return soup.get('data-normalize-job-title')
     elif parameter == JobField.COMPANY:
         return soup.find(
             'div', attrs={'class', 'jobInfoItem jobEmpolyerName'}
         ).text.strip()
     elif parameter == JobField.LOCATION:
         return soup.get('data-job-loc')
     # FIXME: impl.
     # elif parameter == JobField.TAGS:
     #     labels = soup.find_all('div', attrs={'class', 'jobLabel'})
     #     if labels:
     #         return [
     #             l.text.strip() for l in labels if l.text.strip() != 'New'
     #         ]
     #     else:
     #         return []
     # FIXME: impl JobField.REMOTE
     elif parameter == JobField.POST_DATE:
         return calc_post_date_from_relative_str(
             soup.find(
                 'div', attrs={
                     'class': 'd-flex align-items-end pl-std css-mi55ob'
                 }
             ).text.strip()
         )
     elif parameter == JobField.WAGE:
         # NOTE: most jobs don't have this so we wont raise a warning here
         # and will fail silently instead
         wage = soup.find('span', attrs={'class': 'gray salary'})
         if wage is not None:
             return wage.text.strip()
         else:
             return ''
     elif parameter == JobField.KEY_ID:
         return soup.get('data-id')
     elif parameter == JobField.URL:
         part_url = soup.find(
             'div', attrs={'class', 'logoWrap'}
         ).find('a').get('href')
         return (
             f'https://www.glassdoor.{self.config.search_config.domain}'
             f'{part_url}'
         )
     else:
         raise NotImplementedError(f"Cannot get {parameter.name}")
Example No. 23
def getFirstTitle():
    response = requests.get(
        'https://testerhome.com/topics/last')  # store the fetched page in response
    response.encoding = 'utf-8'
    # find the <a> tag that holds the title
    a = BeautifulSoup(response.text, 'lxml') \
            .find("div", class_="panel-body item-list") \
            .find("div", class_="title media-heading") \
            .find('a')
    # create an empty dict to hold the title and href
    firstContent = {}
    firstContent['title'] = a.get('title')
    firstContent['href'] = a.get('href')
    # return both values
    return firstContent
Example No. 24
    def get_voters(self):
        """ Upvoters of this answer.
            return: name and profile URL of each upvoter
            rtype: (username, url) Iterable
        """
        get_url = None
        while True:
            if get_url == "":
                break
            voter_soup = self.get_voter_page(get_url)
            get_url = ZHI_HU_URL + voter_soup.json()['paging']['next']

            for item in voter_soup.json()['payload']:
                soup = BeautifulSoup(item, "lxml").find('a')
                yield (soup.get('title'), ZHI_HU_URL + soup.get('href'))
Example No. 25
 def __init__(self):
     super(GoogleSignIn, self).__init__('google')
     # googleinfo = urllib2.urlopen('https://accounts.google.com/.well-known/openid-configuration')
     # google_params = json.load(googleinfo)
     res = requests.get(
         'https://accounts.google.com/.well-known/openid-configuration')
     google_params = BeautifulSoup(res.text, "html.parser")
     google_params = json.loads(google_params.text)
     self.service = OAuth2Service(
         name='google',
         client_id=self.consumer_id,
         client_secret=self.consumer_secret,
         authorize_url=google_params.get('authorization_endpoint'),
         base_url=google_params.get('userinfo_endpoint'),
         access_token_url=google_params.get('token_endpoint'))
Example No. 26
 def getView(self, url):
     headers = {
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
         "Accept-Encoding": "gzip, deflate, sdch",
         "Accept-Language": "zh-CN,zh;q=0.8",
         "Cache-Control": "no-cache",
         "Connection": "keep-alive",
         "Host": "jw.hzau.edu.cn",
         "Pragma": "no-cache",
         "Referer": "http://jw.hzau.edu.cn/xs_main.aspx?xh=2013307201006",
         "Upgrade-Insecure-Requests": "1",
         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36",
     }
     viewstate = None
     req = urllib.request.Request(url, headers=headers)
     while viewstate is None:
         try:
             html = urllib.request.urlopen(req, timeout=3).read().decode("gbk")
             soup = BeautifulSoup(html, "html.parser")
             try:
                 soup = soup.find_all(attrs={"name": "__VIEWSTATE"})[0]
                 viewstate = soup.get("value")
             except:
                 print("get response but cant get VIEWSTATE")
                 continue
         except:
             print("get available course table has not accepted response")
             continue
     return viewstate
Example No. 27
def videos(url):
	response = get_html(url)
	if response == False:
		xbmcgui.Dialog().notification(name, 'No Episodes Available', defaultimage, 5000, False)
		sys.exit()
		xbmc.log('PAGE NOT FOUND')
	soup = BeautifulSoup(response,'html.parser').find_all('div',{'class': 'cattombstone'})
	xbmc.log('SOUP: ' + str(len(soup)))
	nxt = BeautifulSoup(response,'html.parser').find_all('a',{'class': 'nextpostslink'})
	xbmc.log('NEXT: ' + str(len(nxt)))
	if len(nxt) > 0:
		nxt = nxt[0]
		nurl = nxt.get('href')
		xbmc.log('NURL: ' + str(nurl))
	for item in soup:
		title = striphtml(str(item.find('a')))
		thumbnail = item.find('img')['src']
		url = item.find('a')['href']
		purl = 'plugin://plugin.video.bnwmovies?mode=637&url=' + url + "&name=" + urllib.quote_plus(title) + "&iconimage=" + urllib.quote_plus(thumbnail)
		li = xbmcgui.ListItem(title, iconImage=thumbnail, thumbnailImage=thumbnail)
		li.setProperty('fanart_image', defaultfanart)
		li.addContextMenuItems([('Download File', 'XBMC.RunPlugin(%s?mode=80&url=%s)' % (sys.argv[0], url)),('Plot Info', 'XBMC.RunPlugin(%s?mode=81&url=%s)' % (sys.argv[0], url))])
		xbmcplugin.addDirectoryItem(handle=addon_handle, url=purl, listitem=li)
		xbmcplugin.setContent(addon_handle, 'movies')
	if len(nxt) > 0:
		add_directory2('Next Page',nurl,636,defaultfanart,defaultimage,plot='Next Page')
	xbmcplugin.endOfDirectory(addon_handle)
Example No. 28
 def getView(self,url):
     headers={
         'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Accept-Encoding':'gzip, deflate, sdch',
         'Accept-Language':'zh-CN,zh;q=0.8',
         'Cache-Control':'no-cache',
         'Connection':'keep-alive',
         'Host':'jw.hzau.edu.cn',
         'Pragma':'no-cache',
         'Referer':'http://jw.hzau.edu.cn/xs_main.aspx?xh=2013307201006',
         'Upgrade-Insecure-Requests':'1',
         'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
     }
     viewstate=None
     req=urllib.request.Request(url,headers=headers)
     while viewstate is None:
         try:
             html=urllib.request.urlopen(req, timeout=3).read().decode('gbk')
             soup=BeautifulSoup(html,'html.parser')
             try:
                 soup=soup.find_all(attrs={'name':'__VIEWSTATE'})[0]
                 viewstate=soup.get('value')
             except:
                 print('get response but cant get VIEWSTATE')
                 continue
         except:
             print('get schedule has not accepted response')
             continue
     return viewstate
Example No. 29
    def get_playlist(self, playlist_id):
        self.view_capture(int(playlist_id))
        url = default.playlist_api.format(playlist_id)
        s = requests.session()
        s = BeautifulSoup(
            s.get(url, headers=self.__headers).content, "html.parser")
        playlist = json.loads(s.text)['result']

        print("《" + playlist['name'].encode('utf-8') + "》")
        author = playlist['creator']['nickname'].encode('utf-8')
        pc = str(playlist['playCount'])
        sc = str(playlist['subscribedCount'])
        rc = str(playlist['shareCount'])
        cc = str(playlist['commentCount'])
        print("维护者:{}  播放:{} 关注:{} 分享:{} 评论:{}".format(author, pc, sc, rc, cc))
        print("描述:{}".format(playlist['description'].encode('utf-8')))
        print("标签:{}".format(",".join(playlist['tags']).encode("utf-8")))

        tb = [["ID", "歌曲名字", "艺术家", "唱片"]]
        for music in playlist['tracks']:
            artists = []
            for s in music['artists']:
                artists.append(s['name'])
            ms = music['name'].encode("utf-8")
            ar = ",".join(artists).encode("utf-8")
            ab = music['album']['name'].encode("utf-8")
            id = music['id']
            tb.append([id, ms, ar, ab])
        print(AsciiTable(tb).table)
Example No. 30
 def sendCreds(self, user, pwd, params):
     # Now pass the login credentials over.
     loginResp = self.conn(
         "POST",
         "{0}/ppsecure/post.srf".format(SkypeConnection.API_MSACC),
         params={
             "wa":
             "wsignin1.0",
             "wp":
             "MBI_SSL",
             "wreply":
             "https://lw.skype.com/login/oauth/proxy?client_id=578134&site_name="
             "lw.skype.com&redirect_uri=https%3A%2F%2Fweb.skype.com%2F"
         },
         cookies={
             "MSPRequ": params["MSPRequ"],
             "MSPOK": params["MSPOK"],
             "CkTst": str(int(time.time() * 1000))
         },
         data={
             "login": user,
             "passwd": pwd,
             "PPFT": params["PPFT"]
         })
     tField = BeautifulSoup(loginResp.text, "html.parser").find(id="t")
     if tField is None:
         err = re.search(r"sErrTxt:'([^'\\]*(\\.[^'\\]*)*)'",
                         loginResp.text)
         errMsg = "Couldn't retrieve t field from login response"
         if err:
             errMsg = re.sub(r"<.*?>", "", err.group(1)).replace(
                 "\\'", "'").replace("\\\\", "\\")
         raise SkypeAuthException(errMsg, loginResp)
     return tField.get("value")
Example No. 31
 def view_capture(self, page, type="全部"):
     s = requests.session()
     play_url = self.__play_url.format(type, page * 35)
     try:
         acmsk = {'class': 'msk'}
         scnb = {'class': 'nb'}
         dcu = {'class': 'u-cover u-cover-1'}
         ucm = {'class': 'm-cvrlst f-cb'}
         s = BeautifulSoup(
             s.get(play_url, headers=self.__headers).content, "html.parser")
         lst = s.find('ul', ucm)
         for play in lst.find_all('div', dcu):
             title = play.find('a', acmsk)['title'].encode('utf-8')
             link = play.find('a', acmsk)['href'].encode('utf-8').replace(
                 "/playlist?id=", "")
             cnt = play.find('span', scnb).text.encode('utf-8').replace(
                 '万', '0000')
             if pysql.single("playlist163", "link", link) is True:
                 pl = pysql.Playlist163(title=title,
                                        link=link,
                                        cnt=int(cnt),
                                        dsc="曲风:{}".format(type))
                 self.session.add(pl)
                 self.session.commit()
     except Exception as e:
         pylog.log.error("抓取歌单出现问题:{} 歌单类型:{} 页码:{}".format(e, type, page))
         raise
Example No. 32
    def _extract_upload_policies_asset_authenticity_token(
        self,
        html: str,
    ) -> Optional[str]:
        file_attachment_tag = BeautifulSoup(
            html,
            features='lxml',
        ).find('file-attachment', )

        if not file_attachment_tag:
            return None

        authenticity_token = file_attachment_tag.get(
            'data-upload-policy-authenticity-token', None)
        if authenticity_token is None:
            csrf_policy_input = file_attachment_tag.find(
                'input',
                {
                    'class': 'js-data-upload-policy-url-csrf',
                },
            )
            if csrf_policy_input is not None:
                authenticity_token = csrf_policy_input.get('value', None)

        return cast(str, authenticity_token)
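A short, hedged sketch of the two lookup paths above (data attribute first, then the hidden CSRF input); both HTML snippets are invented:

from bs4 import BeautifulSoup

# Case 1: the token sits directly on the <file-attachment> element.
html_attr = '<file-attachment data-upload-policy-authenticity-token="tok-1"></file-attachment>'
# Case 2: the token sits in a nested hidden CSRF input instead.
html_input = ('<file-attachment>'
              '<input class="js-data-upload-policy-url-csrf" value="tok-2">'
              '</file-attachment>')

for html in (html_attr, html_input):
    tag = BeautifulSoup(html, features='lxml').find('file-attachment')
    token = tag.get('data-upload-policy-authenticity-token', None)
    if token is None:
        csrf_input = tag.find('input', {'class': 'js-data-upload-policy-url-csrf'})
        if csrf_input is not None:
            token = csrf_input.get('value', None)
    print(token)   # 'tok-1', then 'tok-2'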
Example No. 33
def get_news(link):
	length = len(user_agents)
	index=random.randint(0,length-1)
	user_agent = user_agents[index]
	headers={
		'Referer': 'http://www.jrzj.com',
		'Host':'www.jrzj.com',
		'User-Agent':user_agent,
		'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
	}
	bsObj=requests.session()
	bsObj=BeautifulSoup(bsObj.get(link,headers=headers).content,'html.parser')
	title=bsObj.h1.get_text()
	#print('标题:',title)
	tags_list=bsObj.find('meta',{'name':'keywords'}).attrs['content']
	l=re.split(',',tags_list)
	tags=[item for item in filter(lambda x:x != '', l)]
	#print('标签:',tags)
	category=bsObj.title.get_text().split('_')[1]
	#print('分类',category)
	#content=bsObj.find('div',{'class':'news_content'}).prettify()
	content=bsObj.find('div',{'class':'news_content'})
	#print('内容:',content)
	# find the image
	a_tag=content.find('img')
	#print(a_tag)
	image_url=a_tag.attrs['src']
	image_name=os.path.basename(image_url).split('!')[0]
	# download the image
	get_image(image_url,image_name)
	# remove the <img> tag
	a_tag.extract()
	news=News(title,tags,category,content.prettify(),image_name)
	return news
Example No. 34
def get_music(Songlistid, id, tables):
    songslist_id = id
    url = 'https://music.163.com/playlist?id=' + str(songslist_id)
    insert_list = list()
    s = requests.session()
    s = BeautifulSoup(s.get(url, headers=header).content, 'lxml')

    id_list = s.select('ul.f-hide li a')

    # query the playlist table
    joe = SongsList.objects.get(id=Songlistid)
    i = 1
    previous_songs_num = len(Songs.objects.all())
    for songs in id_list:
        name = songs.text
        id = songs.get('href')[9:]
        url = 'http://music.163.com/song/media/outer/url?id=' + id
        insert_list.append(tables(song_id=id, name=name, url=url))
        i += 1
    tables.objects.bulk_create(insert_list)
    print(f"成功创建{i}个数据表")
    SONGS = Songs.objects.all()[previous_songs_num:]
    for song in SONGS:
        song.songslist.add(joe)
    print(f"{tables}{Songlistid}数据更新完毕")
Example No. 35
    def fetch_individual_page(self, isbn: str) -> BeautifulSoup:
        """Fetch the HTML of the individual book page for the given ISBN."""
        # run the search
        params = {
            "detailFlg": 1,
            "isbn": isbn,
            "seldt": r"2023%2Fall%2Fall%2Fbefore",
            "srchf": 1,
            "store": 1
        }

        # go from the search results to the individual book page
        soup = BeautifulSoup(self._fetch_html(self.url, params=params),
                             features="html.parser")
        dytitle = soup.find("a", class_="dyTitle")

        if dytitle is None:
            dytitle = BeautifulSoup(self._fetch_html(self.extended_url, params=params),
                                    features="html.parser").find("a", class_="dyTitle")
            # if searching e-books only gives no hit, also include paper books in the search
        if dytitle is None:    # if there is still no hit
            raise HontoDoesNotHaveDataError(f"Honto not have the book data. {isbn=}")

        individual_page_url = dytitle.get("href")    # if there are multiple hits, take the first one

        soup = BeautifulSoup(self._fetch_html(individual_page_url),
                             features="html.parser")
        return soup
Example No. 36
 def view_capture(self, link):
     self.session.query(pysql.Playlist163).filter(
         pysql.Playlist163.link == link).update({'over': 'Y'})
     url = self.__url + str(link)
     s = requests.session()
     try:
         s = BeautifulSoup(
             s.get(url, headers=self.__headers).content, "html.parser")
         musics = json.loads(s.text)['result']['tracks']
         exist = 0
         for music in musics:
             name = music['name'].encode('utf-8')
             author = music['artists'][0]['name'].encode('utf-8')
             if pysql.single("music163", "song_id", (music['id'])) == True:
                 self.session.add(
                     pysql.Music163(song_id=music['id'],
                                    song_name=name,
                                    author=author))
                 self.session.commit()
                 exist = exist + 1
             else:
                 pylog.log.info('{} : {} {}'.format("重复抓取歌曲", name,
                                                    "取消持久化"))
         print("歌单包含歌曲 {} 首,数据库 merge 歌曲 {} 首 \r\n".format(
             len(musics), exist))
     except Exception:
         pylog.log.error('{} : {}'.format("抓取歌单页面存在问题", url))
         raise
Example No. 37
def get_mryb_text():
    # get the link to the latest article of the daily news account
    newAticle = requests.get("http://weixin.sogou.com/weixin?query=每日读报时间")
    newAticle_aflag = BeautifulSoup(
        newAticle.text, "html.parser").find(uigs="account_article_0")
    mryb_link = newAticle_aflag.get('href')

    # fetch the HTML of the latest article
    mryb = requests.get(mryb_link)

    aticle_soup = BeautifulSoup(mryb.text, "html.parser")
    # scrape the article title
    aticle_title = aticle_soup.find(id="activity-name")
    # scrape the article content
    aticle_content = aticle_soup.find("div", id="js_content")

    # lstrip(), strip() and rstrip() can remove a given character (whitespace by default)

    text = ''
    # page title
    for string in aticle_title.stripped_strings:
        text += string
        text += '\n'
    # page content
    for string in aticle_content.stripped_strings:
        text += string
        text += '\n'

    return text
Example No. 38
    def parse_element_message(driver, element_message, id_user_write,
                              id_user_read):
        html = element_message.get_attribute('outerHTML')
        bs_obj = BeautifulSoup(html, 'html.parser')

        message_type = HelperMessage.get_message_type(driver, bs_obj)
        # print(message_type)
        # message_source =
        # class, 'message-out'

        if message_type == 'message_text':
            message = HelperMessage.parse_message_text(driver, bs_obj)
        elif message_type == 'message_file':
            message = HelperMessage.parse_message_file(driver, bs_obj)
        elif message_type == 'message_audio':
            message = HelperMessage.parse_message_audio(driver, bs_obj)
        elif message_type == 'message_image':
            message = HelperMessage.parse_message_image(driver, bs_obj)
        elif message_type == 'message_video':
            message = HelperMessage.parse_message_video(driver, bs_obj)
        else:
            raise RuntimeError('unknown message type')

        # filling in the blanks
        if not 'id_user' in message:
            # print('add id_user')
            if 'message-out' in bs_obj.get('class'):
                message['id_user'] = id_user_write
            else:
                message['id_user'] = id_user_read

        if not 'timestamp' in message:
            message['timestamp'] = -1
        return message
Example No. 39
 def item_enclosure_url(self, item):
     """Returns an image for enclosure"""
     if item.image:
         url = item.image.url
     else:
         img = BeautifulSoup(item.html_content).find('img')
         url = img.get('src') if img else None
     return urljoin(self.site_url, url) if url else None
Example No. 40
 def item_enclosure_url(self, item):
     """Returns an image for enclosure"""
     if item.image:
         url = item.image.url
     else:
         img = BeautifulSoup(item.html_content).find("img")
         url = img.get("src") if img else None
     self.cached_enclosure_url = url
     return urljoin(self.site_url, url) if url else None
Example No. 41
def getAvatarByUrl(url):
    try:
        res = getContentByUrl(url)
        soup = BeautifulSoup(res)
        soup = soup.find('div', attrs={'class': 'avatar'}).find('img')
        url = 'http://www.oiegg.com/' + soup.get('src')
        return url
    except:
        return 'http://www.oiegg.com/images/avatars/noavatar.gif'
Example No. 42
    def wrap(html):
        soup = BeautifulSoup(html).find('div', attrs={"node-type": "feed_merge_list_item"})
        weiboId = soup.get('mid')

        contentNode = soup.find('span', attrs={"node-type": "feed_list_forwardContentAgg"})
        userName = contentNode.get('nick-name')
        content = contentNode.get_text()

        return Forward(weiboId, userName, content)
Example No. 43
 def item_enclosure_url(self, item):
     """
     Return an image for enclosure.
     """
     if item.image:
         url = item.image.url
     else:
         img = BeautifulSoup(item.html_content, 'html.parser').find('img')
         url = img.get('src') if img else None
     self.cached_enclosure_url = url
     return urljoin(self.site_url, url) if url else None
Example No. 44
  def _save_details(self, details):
    self.downloads = int(details[0].split()[0])
    self.views = int(details[1].split()[0])

    link = Soup(details[4]).find('a').get('href')
    self.link = str(link)

    tag = Soup(details[5]).find('a')
    teacher_name = str(tag.text)
    teacher_page = str(tag.get('href'))
    self.teacher = Teacher(name=teacher_name, page=teacher_page)
Example No. 45
    def _populate_latest(self):
        """
        Populate version data for the latest release available for download
        """
        if self.license is None:
            self.log.debug('No license specified, not retrieving latest version information')
            return

        # Submit a request to the client area
        response = self.session.get(self.license.license_url)
        self.log.debug('Response code: %s', response.status_code)
        response.raise_for_status()

        # Load our license page
        soup = BeautifulSoup(response.text, "html.parser")
        script_tpl = soup.find('script', id='download_form')
        form = BeautifulSoup(script_tpl.text, "html.parser").find('form')

        # Parse the response for a download link to the latest IPS release
        version = Version(form.find('label', {'for': 'version_latest'}).text)
        self.log.info('Latest IPS version: %s', version.vstring)
        url = form.get('action')

        # Parse the response for a download link to the latest development release
        try:
            dev_version = Version(form.find('label', {'for': 'version_dev'}).text)
            if dev_version:
                self.log.info('Latest IPS development version: %s', version.vstring)
                dev_url = form.get('action')
                self.dev_version = IpsMeta(self, dev_version, request=('post', dev_url, {'version': 'latestdev'}), dev=True)
        except AttributeError:
            self.log.info('No development release available for download')

        # If we have a cache for this version, just add our url to it
        if version.vtuple in self.versions:
            self.log.debug('Latest IPS version already downloaded, applying URL to cache entry')
            self.versions[version.vtuple].request = ('post', url, {'version': 'latest'})
            return

        self.versions[version.vtuple] = IpsMeta(self, version, request=('post', url, {'version': 'latest'}))
Example No. 46
 def sendToken(self, token):
     # Send the existing token over.
     loginResp = self.conn("GET", "{0}/login".format(SkypeConnection.API_LOGIN),
                           params={"client_id": "578134", "redirect_uri": "https://web.skype.com"},
                           cookies={"refresh-token": token})
     tField = BeautifulSoup(loginResp.text, "html.parser").find(id="t")
     if tField is None:
         err = re.search(r"sErrTxt:'([^'\\]*(\\.[^'\\]*)*)'", loginResp.text)
         errMsg = "Couldn't retrieve t field from login response"
         if err:
             errMsg = re.sub(r"<.*?>", "", err.group(1)).replace("\\'", "'").replace("\\\\", "\\")
         raise SkypeAuthException(errMsg, loginResp)
     return tField.get("value")
Example No. 47
def get_product_details(product):
    http = urllib3.PoolManager()
    response = http.request('GET', product.get('url'))

    if response.status != 200:
        print("requesting url failed with code: {0}".format(response.status))
        return None
    else:
        dom_img   = BeautifulSoup(response.data).find('a', {'class': 'zoom-img1'})
        dom_price = BeautifulSoup(response.data).find('div', {'itemprop': 'price'})
        product['details'] = {
            'price'       : dom_price.text.strip(),
            'image_href'  : dom_img.get('href'),
            'image_title' : dom_img.get('title'),
            'content_info': {}
        }

        for content_info in BeautifulSoup(response.data).findAll('div', {'class': 'cnt-info'}):
            content_parts = content_info.text.split(':')
            if len(content_parts) > 1:
                info_label = content_parts[0].strip().split('\n')[0]
                info_value = content_parts[1].strip()
                product['details']['content_info'][info_label] = info_value
        return product
Example No. 48
 def item_enclosure_url(self, item):
     """
     Return an image for enclosure.
     """
     try:
         url = item.image.url
     except (AttributeError, ValueError):
         img = BeautifulSoup(item.html_content, 'html.parser').find('img')
         url = img.get('src') if img else None
     self.cached_enclosure_url = url
     if url:
         url = urljoin(self.site_url, url)
         if self.feed_format == 'rss':
             url = url.replace('https://', 'http://')
     return url
Example No. 49
 def deserialize(self, world_xml):
     # noinspection PyBroadException
     try:
         world_xml = BeautifulSoup(world_xml, features="xml").world
         res_world = World(random.randint)
         res_world.setN(int(world_xml.get('size')))
         org_mapper = MapOrganism()
         for org in world_xml.findAll('organism'):
             tmp = org_mapper.deserialize(org)
             if tmp is not None:
                 res_world.add_organism(tmp)
         return res_world
     except:
         print("XML file corrupted")
         return None
Example No. 50
 def run(self):
     while True:
         try:
             poem_url = self.queue.get(False)
             poem_page = urllib.urlopen(poem_url).read()
             # SoupStrainer
             poetry_soup = BeautifulSoup(poem_page).find('div', class_='poem')
             title = poetry_soup.get('data-text')
             poem = poetry_soup.p.get_text(strip=True)
             body = poem
             self.complete.acquire()
             self.complete.poems.append([title, body])
             self.complete.release()
             self.queue.task_done()
         except Queue.Empty:
             break
Example No. 51
def main():
    linkdict = get_tiku()  # get the list of all courses from the home page
    for name, links in linkdict.items():
        for link in links:
            if link is not None:
                soup = BeautifulSoup("%s" % link).find("a")
                link = soup.get("href")  # URL of a single course
                title = soup.text  # title of a single course, e.g. "第1章 集合与函数概念" (Chapter 1: Sets and Function Concepts)
                types = get_types(link)  # [(multiple-choice, http://www.tikubaba.com/class-69-1.html), ...]
                for tp in types:
                    type_page = tp[1].split("-")[0] + "-" + tp[1].split("-")[1] + "-%s.html"
                    page_num = get_pages(tp[1])
                    for page in range(page_num):
                        for url in get_page_items(type_page % str(page + 1)):
                            # import pdb;pdb.set_trace()
                            result = get_details(url)  # answer for a single question
Example No. 52
 def sendCreds(self, user, pwd, params):
     # Now pass the login credentials over.
     loginResp = self.conn("POST", "{0}/ppsecure/post.srf".format(SkypeConnection.API_MSACC),
                           params={"wa": "wsignin1.0", "wp": "MBI_SSL",
                                   "wreply": "https://lw.skype.com/login/oauth/proxy?client_id=578134&site_name="
                                             "lw.skype.com&redirect_uri=https%3A%2F%2Fweb.skype.com%2F"},
                           cookies={"MSPRequ": params["MSPRequ"],
                                    "MSPOK": params["MSPOK"],
                                    "CkTst": str(int(time.time() * 1000))},
                           data={"login": user, "passwd": pwd, "PPFT": params["PPFT"]})
     tField = BeautifulSoup(loginResp.text, "html.parser").find(id="t")
     if tField is None:
         err = re.search(r"sErrTxt:'([^'\\]*(\\.[^'\\]*)*)'", loginResp.text)
         errMsg = "Couldn't retrieve t field from login response"
         if err:
             errMsg = re.sub(r"<.*?>", "", err.group(1)).replace("\\'", "'").replace("\\\\", "\\")
         raise SkypeAuthException(errMsg, loginResp)
     return tField.get("value")
Example No. 53
def login():
    loginURL="http://jw.hzau.edu.cn/default2.aspx"
    checkURL="http://jw.hzau.edu.cn/default2.aspx"
    codeURL='http://jw.hzau.edu.cn/CheckCode.aspx'
    user_agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
    f=urllib.request.urlopen(loginURL)
    html=f.read().decode('gbk')
    soup=BeautifulSoup(html,'html.parser')
    soup=soup.find_all(attrs={'name':'__VIEWSTATE'})[0]
    viewstate=soup.get('value')
    f=urllib.request.urlopen(codeURL)
    path='D:\\aaaaa\\code.gif'
    fl=open(path,'wb')
    fl.write(f.read())
    fl.close()
    img_code=input("please input code: ")
    headers={
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding':'gzip, deflate',
            'Accept-Language':'zh-CN,zh;q=0.8',
            'Cache-Control':'no-cache',
            'Connection':'keep-alive',
            'Content-Type':'application/x-www-form-urlencoded',
            'Host':'jw.hzau.edu.cn',
            'Origin':'http://jw.hzau.edu.cn',
            'Pragma':'no-cache',
            'Referer':'http://jw.hzau.edu.cn/default2.aspx',
            'Upgrade-Insecure-Requests':1,
            'User-Agent':user_agent,
        }
    form={
        '__VIEWSTATE':viewstate,
        'txtUserName':'******',
        'TextBox2':'qq520.1314',
        'txtSecretCode':img_code,
        'RadioButtonList1':'学生',
        'Button1':'',
        'lbLanguage':'',
        'hidPdrs':'',
        'hidsc':'',
    }
    post_data=urllib.parse.urlencode(form).encode(encoding='utf-8')
    req=urllib.request.Request(checkURL,post_data,headers)
    f=urllib.request.urlopen(req)
Example No. 54
    def download_file(self, book_url: str):
        ## pass the URL to the request function; it returns a response
        html = self.request(book_url)
        a_tag_list = BeautifulSoup(html.text, 'lxml').find_all('a', class_='current')

        for a_tag in a_tag_list:
            down_url = "http://www.en8848.com.cn/" + a_tag.get('href')
            html = self.request(down_url)

            title = BeautifulSoup(html.text, 'lxml').title.contents[0]
            title = str(title).replace("/", "-")
            a_tag_file = BeautifulSoup(html.text, 'lxml').find('a', id='dload')

            if a_tag_file is not None :
                file_url = "http://www.en8848.com.cn/e/DownSys/" + a_tag_file.get('href')[3:]
                print(title + " : " + file_url)
                html = self.request(file_url)
                f = open('/home/laomie/temp/enbooks/' + title + '.rar', 'ab')
                f.write(html.content)
                f.close()
Example No. 55
 def getViewAndCode(self):
     while self.viewstate is None:
         try:
             html = urllib.request.urlopen(self.loginAndCheckUrl, timeout=3).read().decode("gbk")
             soup = BeautifulSoup(html, "html.parser")
             soup = soup.find_all(attrs={"name": "__VIEWSTATE"})[0]
             self.viewstate = soup.get("value")
         except:
             print("the login.aspx has not response")
             continue
     while self.img_code is None:
         fl = open(self.codeImg, "wb")
         try:
             fl.write(urllib.request.urlopen(self.codeURL, timeout=3).read())
         except:
             print("the code.aspx has not response")
             continue
         finally:
             fl.close()
         self.img_code = input("please input code: ")  # must wait for fl.close()
Example No. 56
 def start_requests(self):
     print 'Preparing login...'
     # xsrf = Selector(response).xpath('//input[@name="_xsrf"]/@value').extract()[0]
     # print xsrf
     # FormRequeset.from_response is a function of Scrapy, to post data
     self.driver.get('https://www.douban.com/login')
     captcha_url = BeautifulSoup(self.driver.page_source, 'lxml').find(id='captcha_image')
     if captcha_url:
         captcha_url = captcha_url.get('src')
         code = requests.get(captcha_url)
         with open('/Users/shichangtai/Desktop/douban/code.jpg', 'wb') as f:
             f.write(code.content)
         captcha = raw_input('请输入图中的验证码:')
         captcha_field = self.driver.find_element_by_id("captcha_field")
         captcha_field.send_keys(captcha)
     username = self.driver.find_element_by_id("email")
     password = self.driver.find_element_by_id("password")
     username.send_keys("******")
     password.send_keys("******")
     self.driver.find_element_by_name("login").click()
     # return [FormRequest.from_response]
     #some pages cannot display without logging in.
     if self.start_urls:
         print self.start_urls
         print '------------Review crawl mode...----------------'
         self.driver.get(self.start_urls)
         review_url=BeautifulSoup(self.driver.page_source,'lxml').find(id='comments-section').find('span',class_='pl').a['href']
         while(1):
             yield Request(review_url, self.parse_review, headers=self.headers, errback=self.errback_review)
             self.driver.get(review_url)
             if BeautifulSoup(self.driver.page_source, 'lxml').find(id='paginator').find_all('a')[-1].get_text()!=u'\u540e\u9875 >':
                 break
             review_url=self.start_urls+'/comments'+BeautifulSoup(self.driver.page_source, 'lxml').find(id='paginator').find_all('a')[-1]['href']
     else:
         print '---------------Top250 crawl mode...---------------'
         for i in range(10):
             temp_url='https://movie.douban.com/top250?start=%s&filter=' % (i*25)
             yield Request(temp_url,self.parse,headers=self.headers)
Example No. 57
def getView():
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'Cache-Control':'no-cache',
        'Connection':'keep-alive',
        'Content-Type':'application/x-www-form-urlencoded',
        'Host':'jw.hzau.edu.cn',
        'Origin':'http://jw.hzau.edu.cn',
        'Pragma':'no-cache',
        'Referer':'http://jw.hzau.edu.cn/xs_main.aspx?xh=2013307201006',
        'Upgrade-Insecure-Requests':1,
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    }
    xkURL='http://jw.hzau.edu.cn/xf_xsqxxxk.aspx?xh=2013307201006&xm=%B3%CC%CA%E9%D2%E2&gnmkdm=N121111'
    req=urllib.request.Request(xkURL,headers=headers)
    f=urllib.request.urlopen(req)
    html=f.read().decode('gbk')
    print(html)
    soup=BeautifulSoup(html,'html.parser')
    soup=soup.find_all(attrs={'name':'__VIEWSTATE'})[0]
    viewstate=soup.get('value')
    return viewstate
Example No. 58
 def rawToFields(cls, raw={}):
     fields = super(SkypeCallMsg, cls).rawToFields(raw)
     listTag = BeautifulSoup(raw.get("content"), "html.parser").find("partlist")
     fields.update({"state": {"started": cls.State.Started, "ended": cls.State.Ended}[listTag.get("type")],
                    "userIds": [], "userNames": []})
     for partTag in listTag.find_all("part"):
         fields["userIds"].append(partTag.get("identity"))
         fields["userNames"].append(partTag.find("name").text)
     return fields
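A hedged, self-contained sketch of the <partlist> parsing above; the markup is invented and not a real Skype payload:

from bs4 import BeautifulSoup

content = ('<partlist type="ended">'
           '<part identity="8:alice"><name>Alice</name></part>'
           '<part identity="8:bob"><name>Bob</name></part>'
           '</partlist>')
listTag = BeautifulSoup(content, "html.parser").find("partlist")
print(listTag.get("type"))                                       # 'ended'
print([p.get("identity") for p in listTag.find_all("part")])     # ['8:alice', '8:bob']
print([p.find("name").text for p in listTag.find_all("part")])   # ['Alice', 'Bob']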
Example No. 59
loginUrl = "https://www.facebook.com/login.php?login_attempt=1&lwv=110"
# req = urllib2.Request(hosturl)
soup = BeautifulSoup('<input class="boldest" Extremely bold >').input
print soup.name
  
logResult =  urllib2.urlopen(hosturl,timeout=1000).read() 
soup = BeautifulSoup(logResult)
#print soup.select("#login_form input")
logInfo = {
	"email": "*****@*****.**",
	"pass":"******"
}
for i in  soup.select("#login_form input"):
	print i
	tag =BeautifulSoup(str(i)).input
	if tag.get("name") and tag.get("value"):
		logInfo[tag['name']] =  tag.get("value")
		print tag['name'] + "\t" + str(tag.get("value"))


postData = urllib.urlencode(logInfo)
print postData

headers = {
			"user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36",
			"Host": "202.203.222.202",
			"content-type":"application/x-www-form-urlencoded",
			"referer":"https://www.facebook.com/?stype=lo&jlou=AfeSfaeiFvsh5xaSSLIuKbHS0F-1AdSjGCnU5GOP8CBizaJ4XrXli8EJH3r_Ws4vPMeDAHJhu_D63lhaUKVQke3GIbC-kcxT1244kgOo7mcROw&smuh=41755&lh=Ac_3mzPw-4ESY0H_",
			"Upgrade-Insecure-Requests": 1
	      } 
Example No. 60
def check_best_practices(domain, protocol):
	
	print '<h2>General SEO Best Practices</h2><ul>',
	
	# check for robots.txt
	url = protocol + '://' + domain + '/robots.txt'	
	r = requests.get(url,allow_redirects=False, verify=False)	
	if r.status_code==200:
		print '<li class="good">Robots.txt: Found</li>',	
	else:
		print '<li class="bad">Robots.txt: Not Found</li>',

	# check for sitemap.xml
	url = protocol + '://' + domain + '/sitemap.xml'	
	r = requests.get(url,allow_redirects=False,verify=False)		
	if r.status_code==200:
		print '<li class="good">sitemap.xml: Found</li>',
	else:
		print '<li class="bad">sitemap.xml: Not Found</li>',

	# check for sitemap.xml.gz
	url = protocol + '://' + domain + '/sitemap.xml.gz'	
	r = requests.get(url,allow_redirects=False,verify=False)		
	if r.status_code==200:
		print '<li class="good">sitemap.xml.gz: Found</li>',
	else:
		print '<li class="bad">sitemap.xml.gz: Not Found</li>',		

	# check for sitemap.gz
	url = protocol + '://' + domain + '/sitemap.gz'	
	r = requests.get(url,allow_redirects=False,verify=False)		
	if r.status_code==200:
		print '<li class="good">sitemap.gz: Found</li>',
	else:
		print '<li class="bad">sitemap.gz: Not Found</li>',

	#fetch the home page response for the rest of analysis
	url = protocol + '://' + domain + '/'
	r = requests.get(url,allow_redirects=False,verify=False)
	soup = BeautifulSoup(r.content, 'lxml')
	
	# check for responsive setup in <meta> tag
	meta_viewport = False
	for meta in soup.find_all('meta'):
			if meta.get('name') == 'viewport':
				meta_viewport = True
				print '<li class="good">Meta-Viewport found: ' + meta.get('content') + '</li>',
	if meta_viewport== False:
		print '<li class="bad">Meta-Viewport is not defined</li>',
	
	# check for unicode content type
	meta_unicode = False
	if 'Content-Type' in r.headers and r.headers['Content-Type'].find('charset') > -1:
		print '<li class="good">Character Encoding detected in response headers.</li>',
		meta_unicode = True
	else:
		for meta in soup.find_all('meta'):
			if meta.get('charset'):
				meta_unicode = True
				print '<li class="good">Character Encoding detected in &lt;meta&gt; tags</li>',
	if meta_unicode == False:
		print '<li class="bad">Charset definition not found.</li>',
	
	# check for language setting
	if soup.get('lang'):
		print '<li class="good">Language details specific in &lt;html&gt; tag.</li>',
	else:
		print '<li class="bad">Language is unspecified in &lt;html&gt; tag.</li>',

	# check for vary:user-agent
	if 'Vary' in r.headers and r.headers['Vary'].find('User-Agent') > -1:
		print '<li class="good"><i>Vary: User-Agent</i> header detected.</li>',
	else:
		print '<li class="bad"><i>Vary: User-Agent</i> header not detected.</li>',

	print '</ul>',