def SeparateWithHtml(html):
        tagList = []
        elemList = []
        attrList = []
        soup = BeautifulSoup(html,"html5lib")
        txt = re.compile("[^-~]")
        for elem in soup.body(text = txt):
            dname=""
            for parent in elem.parents:
                dname = dname + "-" + str(parent.name)
            if elem !=u'\n' and str(elem.parent.name) != "script":
                #print "dname:",dname
                #print "elem:",elem
                #print elem.parent.attrs
                tagList.append(dname)
                elemList.append(elem)
                attrList.append(elem.parent.attrs)
        for elem in soup.body(text =""):
            dname=elem.name
            for parent in elem.parents:
                dname = dname + "-" + str(parent.name)
            if str(elem.parent.name) != "script" and str(elem.name) != "script":
                #print "dname:",dname
                #print "elem:",""
                #print elem.attrs
                tagList.append(dname)
                elemList.append("")
                attrList.append(elem.attrs)
        return tagList,elemList,attrList
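A minimal usage sketch for SeparateWithHtml above, assuming BeautifulSoup, re and the html5lib parser are available as in the snippet's source module; the HTML string is made up:

html = "<html><body><div class='intro'><p>Hello</p></div></body></html>"
tags, elems, attrs = SeparateWithHtml(html)
print(tags[0])    # "-p-div-body-html-[document]": the tag path from the text node up to the document root
print(elems[0])   # "Hello", the text node itself
print(attrs[0])   # {}, the attributes of the text node's parent (the <p> tag)
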
def parse_url(parse_url):
    print("Starting the process now")

    #parse_url = sys.stdin.readline()
    #print("URL: " + parse_url)
    feed = feedparser.parse(parse_url)
    print("Successfully parsed the url")

    new_feed = etree.Element('rss', version="2.0")
    channel = etree.SubElement(new_feed, 'channel')
    title = etree.SubElement(channel, 'title')
    title.text = feed.feed.title
    link = etree.SubElement(channel, 'link')
    link.text = feed.feed.link
    desc = etree.SubElement(channel, 'description')
    desc.text = feed.feed.description

    for entry in feed.entries:
        response = requests.get(entry.link,
                                headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        soup = BeautifulSoup(response.content, features="html5lib")

        c = soup.body("div", "chapter-inner chapter-content")  #Royal Road
        c2 = soup.body("div", "fr-view")  #WuxiaWorld
        c3 = soup.body("div", "chp_raw")  #ScribbleHub
        c4 = soup.body("div", "entry-content")  #WordPress

        if c is not None and len(c) > 0:
            children = c[0].children
        elif c2 is not None and len(c2) > 0:
            children = c2[0].children
        elif c3 is not None and len(c3) > 0:
            children = c3[0].children
        elif c4 is not None and len(c4) > 0:
            children = c4[0].children
        else:
            children = []  # no known content container found on this page

        item = etree.SubElement(channel, 'item')

        item_title = etree.SubElement(item, 'title')
        item_title.text = entry.title

        item_link = etree.SubElement(item, 'link')
        item_link.text = entry.link

        item_desc = etree.SubElement(item, 'description')
        desc = "".join(str(child) for child in children if str(child).strip())
        item_desc.text = desc

        guid = etree.SubElement(item, 'guid', isPermaLink='false')

        try:
            guid.text = entry.id
        except AttributeError:
            guid.text = ""

        pubDate = etree.SubElement(item, 'pubDate')
        pubDate.text = entry.published

    return (etree.tostring(new_feed, encoding='utf-8', method="xml"))
Example #3
def productPrice(product_id):
	api = "http://www.hobbyking.com/hobbyking_api.asp?id=" + str(product_id) + "+&switch=3"
	output = urllib2.urlopen(api).read();
	soup = BeautifulSoup(output, 'html5lib')
	price = ''.join(soup.body(text=True)[0])
	api = "http://www.hobbyking.com/hobbyking_api.asp?id=" + str(product_id) + "+&switch=1"
	output = urllib2.urlopen(api).read();
	soup = BeautifulSoup(output, 'html5lib')
	stock = ''.join(soup.body(text=True)[0])
	return toJson({"price" : price, "stock" : stock})
Example #4
def ReadEmailDetails(service, user_id, msg_id):

  temp_dict = { }

  try:

      message = service.users().messages().get(userId=user_id, id=msg_id).execute() # fetch the message using API
      payLoad = message['payload'] # get payload of the message
      headr = payLoad['headers'] # get header of the payload

      for one in headr: # getting the Subject
          if one['name'] == 'Subject':
              msg_subject = one['value']
              temp_dict['Subject'] = msg_subject
          else:
              pass

      for two in headr: # getting the date
          if two['name'] == 'Date':
              msg_date = two['value']
              # date_parse = (parser.parse(msg_date))
              # m_date = (date_parse.datetime())
              temp_dict['DateTime'] = msg_date
          else:
              pass

      # Fetching message body

      part_body = None

      if 'parts' in payLoad:
        email_parts = payLoad['parts'] # fetching the message parts
        part_one  = email_parts[0] # fetching first element of the part
        part_body = part_one['body'] # fetching body of the message
      elif 'body' in payLoad:
        part_body = payLoad['body'] # fetching body of the message

      if part_body['size'] == 0:
        #print(payLoad)
        return None

      part_data = part_body['data'] # fetching data from the body
      clean_one = part_data.replace("-","+") # convert URL-safe Base64 to the standard alphabet
      clean_one = clean_one.replace("_","/")
      clean_two = base64.b64decode(bytes(clean_one, 'UTF-8')) # decode the Base64 payload to bytes
      soup = BeautifulSoup(clean_two , "lxml" )
      message_body = soup.body()

      # message_body is a readible form of message body
      # depending on the end user's requirements, it can be further cleaned
      # using regex, beautiful soup, or any other method
      temp_dict['Message_body'] = message_body

  except Exception as e:
      print('Email read error: %s' % e)
      temp_dict = None
      pass

  finally:
      return temp_dict
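The replace("-","+") / replace("_","/") steps in ReadEmailDetails convert Gmail's URL-safe Base64 alphabet back to the standard one before calling base64.b64decode. Python's base64.urlsafe_b64decode accepts the URL-safe alphabet directly, so the same decode can be written in one call. A minimal sketch; decode_gmail_body is a hypothetical helper name and the sample payload is made up:

import base64

def decode_gmail_body(part_data):
    # urlsafe_b64decode understands the '-'/'_' alphabet, so no manual replacement is needed;
    # re-add padding in case the API stripped the trailing '='.
    part_data += '=' * (-len(part_data) % 4)
    return base64.urlsafe_b64decode(part_data)

sample = base64.urlsafe_b64encode(b'<html><body>Hi</body></html>').decode('ascii')
assert decode_gmail_body(sample) == base64.b64decode(sample.replace('-', '+').replace('_', '/'))
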
Example #5
def spider(url, searchText):
    flag = 0

    try:
        wp = urllib.urlopen(url)  #opens the URL
        urlContent = wp.read()  # read the HTML content
    except:
        #print 'Please check URL, Cannot load url'
        return

    # To check if the page is HTML
    if urlContent.find('<html') == -1 and urlContent.find('<HTML') == -1:
        print 'The Page is not a html page'
        return

    soup = BeautifulSoup(''.join(urlContent), "html.parser")

    c = soup.findAll('script')
    for i in c:
        i.extract()

    try:
        bodyText = soup.body(text=True)
    except:
        return
    text = ''.join(bodyText)

    if text.find(searchText) > -1:
        flag = 1
        print 'The String "' + searchText + '" is found in the URL  : ' + url + '\n'
    else:
        return

    if flag == 0:
        print 'The given text was not found in the whole Website'
Example #6
def getMeiziTu(page):
    meizi_url = base_url + str(page) + "#comments"
    respone = requests.get(meizi_url, headers=headers)
    respone.encoding = 'utf-8'
    soup = BeautifulSoup(respone.text, "html5lib")
    if soup.find(text=re.compile('屏蔽')) == None:
        print('=============================')
        print('Downloading page ' + str(page))
        # collect the tags that contain the image URLs
        img = []
        imgall = soup.body('li', id=re.compile("comment-"))
        for tmp in imgall:
            img += tmp.div.find(
                'div', class_='row').find(
                'div', class_='text').find_all(
                'img', src=True)
        for n, girl in enumerate(img):
            print('       image ' + str(n), end='')
            if not girl.has_attr('org_src'):
                url = girl['src']
                with open(Directory + '妹纸图' + str(page) + '-' + str(n)
                                  + url[-4:], 'wb') as f:
                    f.write(requests.get(url).content)
            else:
                url = girl['org_src']
                with open(Directory + '妹纸图' + str(page) + '-' + str(n)
                                  + url[-4:], 'wb') as f:
                    f.write(requests.get(url).content)
            print('...OK!')
        print('Page ' + str(page) + ' finished downloading!')
        return True
def strip_html(path, i, label_xid=True):
    """Strip the HTML: get rid of scripts and interactions"""
    print '[{}] Reading {} ...'.format(i, path)
    with open(path, 'r', 'utf8') as fin:  # assumes codecs.open/io.open, where the third argument is the encoding
        # TODO: Handle encodings
        soup = BeautifulSoup(fin.read(), 'html5lib')
    # Add doctype if missing
    if not has_doctype(soup):
        soup.insert(0, Doctype('html'))
    # Remove dangerous tags
    for x in soup('script'):
        x.extract()
    for x in soup('noscript'):
        x.extract()
    for x in soup('link'):
        if x.get('as') == 'script':
            x.extract()
    for x in soup('iframe'):
        x['src'] = ''
    # Fix styles
    for x in soup('style'):
        x.string = H.unescape(u"".join(unicode(y) for y in x.contents))
    # Label all tags
    i = 1
    for x in soup.body(True):
        for attr in list(x.attrs):
            if attr.startswith('on') or attr == 'srcset':
                del x[attr]
        if label_xid:
            x['data-xid'] = i
            i += 1
    # Return
    return soup.prettify()
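To make the labeling loop in strip_html concrete, here is a small self-contained sketch (Python 3; it assumes html5lib is installed, and the HTML string is made up). It drops <script> tags and stamps a data-xid attribute on every tag under <body>, like the loop above, except the id is stored as a string to be explicit:

from bs4 import BeautifulSoup

html = "<html><body><div><script>alert(1)</script><p>Hi</p><a href='#'>x</a></div></body></html>"
soup = BeautifulSoup(html, "html5lib")

for x in soup("script"):          # remove dangerous tags, as strip_html does
    x.extract()

# soup.body(True) is shorthand for soup.body.find_all(True): every tag under <body>.
for i, x in enumerate(soup.body(True), start=1):
    x['data-xid'] = str(i)

print(soup.body.prettify())       # <div data-xid="1"> ... <p data-xid="2"> ... <a data-xid="3" href="#"> ...
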
Example #8
def highlight(url):
    r = requests.get(url)
    html_text = r.text
    soup = BeautifulSoup(html_text, "lxml")
    headSnippetSoup = BeautifulSoup(SNIPPET_HEADER, "lxml")
    bodySnippetSoup = BeautifulSoup(SNIPPET_BODY, "lxml")

    head_snippet = removeTags(headSnippetSoup)
    body_snippet = removeTags(bodySnippetSoup)

    head = soup.head
    head.insert(1, soup.new_tag('style', type='text/css'))
    head.style.append(highlight_css)
    head.insert(0, head_snippet)
    soup.head = head

    # soup.body = add_text(soup, " I didn't find any helpful answers here")
    body = soup.body
    body.insert(0, body_snippet)
    soup.body = body

    newsoup = Markup(soup)
    html = soup.prettify("utf-8")

    templates = "/SnippetIQ/templates/output_template.html"
    pwd = os.getcwd()
    filename = pwd + templates

    with open(filename, "wb") as file:
        file.write(html)

    return newsoup
Example #9
def getFollowees(urls,cur,total):
    if cur<total:
       cur+=1
       for url in urls: 
            temp=[]
            try:
                FolloweesPage=requests.get(url+'/followees',cookies=cookie,headers=header).text
            except(TypeError,ConnectionResetError,requests.packages.urllib3.exceptions.ProtocolError,requests.exceptions.ConnectionError):
                print('Error while fetching the followee list, reconnecting...')
                try:
                    FolloweesPage=requests.get(url+'/followees',cookies=cookie,headers=header).text
                except(TypeError,ConnectionResetError,requests.packages.urllib3.exceptions.ProtocolError,requests.exceptions.ConnectionError):
                    print('Remote host closed the connection, skipping the rest of this URL!')
                    continue
            source = BeautifulSoup(FolloweesPage,"html5lib")
            for everyone in source.body('div',class_='zm-profile-card zm-profile-section-item zg-clear no-hovercard'):
                href = everyone.find('a',class_='zg-link')['href']
                name=everyone.find('a',class_='zg-link')['title']
                mylock.acquire() # acquire the lock
                temp.append(href)
                temps[0]+=1
                if href not in set_urls:
                     set_urls.add(href)
                     list_urls.append(href)
                     print('Added: '+href+'    '+name)
                mylock.release()
            getFollowees(temp,cur,total)
Example #10
def getFollowees(urls,cur,total):
    if cur<total:
       cur+=1
       for url in urls: 
            temp=[]
            try:
                FolloweesPage=requests.get(url+'/followees',cookies=cookie,headers=header).text
            except(TypeError,ConnectionResetError,requests.packages.urllib3.exceptions.ProtocolError,requests.exceptions.ConnectionError):
                print('Error while fetching the followee list, reconnecting...')
                FolloweesPage=requests.get(url+'/followees',cookies=cookie,headers=header).text
            # selector=etree.HTML(FolloweesPage)
            source = BeautifulSoup(FolloweesPage,"html5lib")
            # Guanzhu_a=selector.xpath('//*[@id="zh-profile-follows-list"]/div/div/div[2]/h2/a')
            # for everyone in Guanzhu_a:
            for everyone in source.body('div',class_='zm-profile-card zm-profile-section-item zg-clear no-hovercard'):
                #href=everyone.xpath('@href')[0]
                href = everyone.find('a',class_='zg-link')['href']
                #name=everyone.xpath('text()')[0]
                name=everyone.find('a',class_='zg-link')['title']
                mylock.acquire() # acquire the lock
                temp.append(href)
                temps.append(href)
                #list_urls.append(href)
                if href not in set_urls:
                     set_urls.add(href)
                     list_urls.append(href)
                    # temp.append(href)
                     print('Added: '+href+'    '+name)
                # else:
                #      print('Already exists: '+everyone.xpath('@href')[0]+'    '+everyone.xpath('text()')[0])
                mylock.release()
                # print('Found: '+everyone.xpath('@href')[0]+'    '+everyone.xpath('text()')[0])
            getFollowees(temp,cur,total)
Example #11
    def parse_email(self, message):
        temp_dict = {}
        payld = message['payload']  # get payload of the message
        headr = payld['headers']  # get header of the payload

        for one in headr:  # getting the Subject
            if one['name'] == 'Subject':
                msg_subject = one['value']
                temp_dict['Subject'] = msg_subject


#             elif one['name'] == 'Date':
#                 msg_date = one['value']
#                 date_parse = (parser.parse(msg_date))
#                 m_date = (date_parse.date())
#                 temp_dict['Date'] = str(m_date)
            elif one['name'] == 'Date':
                try:
                    msg_date = one['value']
                    date_parse = parse(msg_date)
                    m_date = date_parse.date()
                    temp_dict['Date'] = str(m_date)

                except:
                    pass
                #temp_dict['Date'] = str(m_date)
            elif one['name'] == 'From':
                msg_from = one['value']
                temp_dict['Sender'] = msg_from
            elif one['name'] == 'To':
                msg_from = one['value']
                temp_dict['Receiver'] = msg_from
            else:
                pass

        temp_dict['Snippet'] = message['snippet']  # fetching message snippet

        try:

            # Fetching message body
            mssg_parts = payld['parts']  # fetching the message parts
            part_one = mssg_parts[0]  # fetching first element of the part
            part_body = part_one['body']  # fetching body of the message
            part_data = part_body['data']  # fetching data from the body
            clean_one = part_data.replace("-", "+")  # convert URL-safe Base64 to the standard alphabet
            clean_one = clean_one.replace("_", "/")
            clean_two = base64.b64decode(bytes(clean_one, 'UTF-8'))  # decode the Base64 payload to bytes
            soup = BeautifulSoup(clean_two, "lxml")
            mssg_body = soup.body()
            # mssg_body is a readible form of message body
            # depending on the end user's requirements, it can be further cleaned
            # using regex, beautiful soup, or any other method
            temp_dict['Message_body'] = mssg_body

        except:
            pass
        return temp_dict
Example #12
 def suburl(self, personalurl, recdepth, connectiontrytimes):
     if (recdepth > 0) and (connectiontrytimes >= 0):
         try:
             r = requests.get(personalurl + '/followees',
                              headers=self.header,
                              cookies=self.cookies)
         except (TypeError, ConnectionResetError,
                 requests.packages.urllib3.exceptions.ProtocolError,
                 requests.exceptions.ConnectionError):
             if connectiontrytimes - 1 > 0:
                 print('ConnectionError: connection dropped! Retrying, ' + str(
                     connectiontrytimes - 1) + ' retries left')
             elif connectiontrytimes - 1 == 0:
                 print('ConnectionError: connection dropped! Making the last retry')
             self.suburl(personalurl, recdepth, connectiontrytimes - 1)
         else:
             source = BeautifulSoup(r.text, "html5lib")
             for temp in source.body(
                     'div',
                     class_='zm-profile-card zm-profile-section-item zg-clear no-hovercard'):
                 url = temp.find('a', class_='zg-link')['href']
                 self.countnum += 1
                 if url not in self._alreadyurl:
                     print('Found a new user! */' + url[28:])
                     self._allurl.add(url)
                 else:
                     print('User already seen: */' + url[28:])
                 self.suburl(url, recdepth - 1, connectiontrytimes)
     elif connectiontrytimes < 0:
         print('The last retry failed! Giving up on reconnecting!')
    def _add_instant_tags(self, request, response):
        if hasattr(response, "content") and getattr(settings,
                                                    "WTM_INJECT_TAGS", True):
            strategy = TagStrategy(request)
            content = response.content.decode(response.charset)
            doc = BeautifulSoup(content, "html.parser")
            head = getattr(doc, "head", [])
            body = getattr(doc, "body", [])

            for tag in strategy.result:
                obj = tag.get("object")
                element = tag.get("element")

                if head and obj.tag_location == Tag.TOP_HEAD:
                    head.insert(1, element)
                elif head and obj.tag_location == Tag.BOTTOM_HEAD:
                    head.append(element)
                elif body and obj.tag_location == Tag.TOP_BODY:
                    body.insert(1, element)
                elif body and obj.tag_location == Tag.BOTTOM_BODY:
                    body.append(element)

            doc.head = head
            doc.body = body
            response.content = doc.encode(formatter=None)
            return response

        return response
Example #14
 def collect_single_level_part(self, part):
     data = part['body']['data'].replace('-','+').replace('_','/')
     clean_data = str(base64.b64decode(bytes(data,'utf-8')))
     soup = BeautifulSoup(clean_data, "lxml")
     text = soup.body()[0].text[2:-1]
     del soup
     return str(text)
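The str() call above turns the decoded bytes into a "b'...'" literal, which is why the [2:-1] slice is needed to trim the b' prefix and the trailing quote. Decoding the bytes explicitly sidesteps that. A minimal sketch under the assumption that the part is UTF-8 text; collect_part_text is a hypothetical name for the same idea:

import base64
from bs4 import BeautifulSoup

def collect_part_text(part):
    data = part['body']['data']
    data += '=' * (-len(data) % 4)                 # restore padding if it was stripped
    raw = base64.urlsafe_b64decode(data)           # handles the '-'/'_' alphabet directly
    soup = BeautifulSoup(raw.decode('utf-8', errors='replace'), "lxml")
    return soup.body.get_text() if soup.body else ""
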
Example #15
    def _parse(self, html):
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        self.real_article = True
        if soup.title != None: self.title = soup.title.string
        if soup.time != None: self.date = soup.time.string

        if soup.body != None:
            self.body = "\n".join([p.string for p in soup.body(text=True)])
def get_car_list_from_list_page(page_number):

    # scrape the page with selenium
    url = 'http://www.encar.com/fc/fc_carsearchlist.do?carType=for&searchType=model&wtClick_index=251#!%7B%22action' \
          '%22%3A%22%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22' \
          'page%22%3A{0}%2C%22limit%22%3A20%7D'.format(page_number)
    phantomjs_path = r'D:\Workspace\[LIBRARY]\[WEB]\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    try:
        # driver = webdriver.Remote("http://localhost:4444/wd/hub", webdriver.DesiredCapabilities.HTMLUNIT.copy())
        driver = webdriver.PhantomJS(phantomjs_path)
    except (ValueError, KeyError) as e:
        print('Driver error with Chrome browser')
        return None
    else:
        try:
            driver.get(url)
        except (ValueError, KeyError) as e:
            print('URL open error with car page')
            return None
    list_page = driver.page_source
    try:
        # the PhantomJS 'service' sometimes raises on quit(), so swallow that error
        driver.quit()
    except AttributeError:
        pass
    soup = BeautifulSoup(list_page, 'lxml')

    # validate the page
    # no_car_message = soup('p', {'class': 'message', 'title': '등록차량이 없습니다. 다른 조건으로 검색하세요.'})
    # if 0 < len(no_car_message):
    #     print('there is no car list')
    #     return None

    # get car list table
    car_list = soup.body('tbody', {'id': 'sr_normal'})
    if 0 == len(car_list):
        return None

    # get URLs of each car page and parse each page into 'result_car' instance
    result_car_list = []
    for tr in car_list[0]('td', {'class': 'inf'}):
        page_url = 'http://www.encar.com' + tr.a.get("href")
        result_car = get_car_info_from_car_detail_page(page_url)
        if result_car is not None:
            result_car_list.append(result_car)

    current_time_string = time.strftime('%Y%m%d_%H%M%S')
    with open(
            'car_instances_at_page_{0}_{1}.txt'.format(str(page_number),
                                                       current_time_string),
            'w') as outfile:
        json.dump(
            [car_spec_class.__dict__ for car_spec_class in result_car_list],
            outfile)

    return result_car_list
Example #17
	def corr_heatmap_with_dual_zoombar(data,file_name='Pairwise_Heatmap.html'):
		# add horizontal and vertical zoom bars based on the corr_heatmap function
		report=True
		corr_heatmap(data=data,report=report,file_name=file_name)
		soup = BeautifulSoup(open(file_name))
		p = re.compile('"dataZoom":')
		the_iter = p.finditer(str(soup.body()[1]))
		result = max(enumerate(the_iter))[1]
		idx = result.span()[1]
		s_new = str(soup.body()[1])[:idx+2] + \
		'\n        {\n            "show": true,\n            "type": "slider",\n            "start": 50,\n            "end": 100,\n            "orient": "horizontal",\n            "xAxisIndex": null,\n            "yAxisIndex": null\n        },' + \
		'\n        {\n            "show": true,\n            "type": "slider",\n            "start": 50,\n            "end": 100,\n            "orient": "vertical",\n            "xAxisIndex": null,\n            "yAxisIndex": null\n        },' + \
		'\n        {\n            "show": true,\n            "type": "inside",\n            "start": 50,\n            "end": 100,\n            "orient": "vertical",\n            "xAxisIndex": null,\n            "yAxisIndex": null\n        },' + str(soup.body()[1])[idx+227:]
		s_new = s_new[31:]
		s_new = s_new[:-9]
		soup.body()[1].string = s_new
		Html_file= open(file_name,"w")
		Html_file.write(soup.prettify())
		Html_file.close()
Example #18
 def collect_multi_level_part(self, part):
     full_body = ""
     for p in part['parts']:
         data = p['body']['data'].replace('-','+').replace('_','/')
         clean_data = str(base64.urlsafe_b64decode(bytes(data,'UTF-8')))
         soup = BeautifulSoup(clean_data, "html")
         text = soup.body()[0].text[2:-1]
         del soup
         full_body+=str(text)
     return full_body
Example #19
    def _parse(self, html):
        soup = BeautifulSoup(html,
                             convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')
        self.real_article = True
        if soup.title != None: self.title = soup.title.string
        if soup.time != None: self.date = soup.time.string

        if soup.body != None:
            self.body = "\n".join([p.string for p in soup.body(text=True)])
Example #20
def pars_title(link, readmanga=True, mangalib=False):
    f = open('title.txt','rb') 
    content = f.read() 
    soup = BeautifulSoup(content, "lxml")
    if readmanga:   
        title_name = soup.find("meta", itemprop="name")['content'] \
            if soup.find("meta", itemprop="name") else None
        genres = {el.a.text for el in soup.body(
            "span", {"class": "elem_genre"})}
        category = soup.body.find("span", {"class": "elem_category"}).a.text \
            if soup.body.find("span", {"class": "elem_category"}) else None
        year = soup.body.find("span", {"class": "elem_year"}).a.text \
            if soup.body.find("span", {"class": "elem_year"}) else None
        for div in soup.body.findAll("div", {"class": "rightBlock"}):
            if div.h5:
                if div.h5.text == 'Количество закладок':
                    for i, strong in enumerate(div.findAll('strong')):
                        if i == 0:
                            status_process = strong.text
                        if i == 1:
                            status_readed = strong.text
                        if i == 2:
                            status_loved = strong.text
        return {'title_name': title_name, 'genres': genres, 'category': category, 'year': year, 
            'status_process': status_process, 'status_readed': status_readed, 'status_loved': status_loved}
    elif mangalib:
        title_name = soup.find("meta", itemprop="name")['content'] \
            if soup.find("meta", itemprop="name") else None
        for info in soup.body("div", {"class": "info-list__row"}):
            info_name = info.strong.text
            info_value = info.span.text if info.span else None
            if info_name == "Тип":
                category = info_value
            elif info_name == "Дата релиза":
                year = info_value
            elif info_name == "Жанры":
                genres = {el.text for el in info.findAll("a")}
            elif info_name == "Просмотров":
                views_count = info_value
        bookmarks_count_text = soup.body.find("h3", {"class": "aside__title"}).text
        bookmarks_match = re.search(r"\((.*?)\)", bookmarks_count_text)
        bookmarks_count = bookmarks_match.group(1) if bookmarks_match else None
        return {'title_name': title_name, 'genres': genres, 'category': category, 'year': year, 'views_count': views_count}
Example #21
def course_scraper(url):
    courses = []
    html = urlopen(url)
    bs_obj = BeautifulSoup(html,"html.parser")
    for item in bs_obj.body("li"):
        if "data-subject" in item.attrs:
            if(item["data-subject"] != "[]"):
                course = item.get_text()
                courses.append(course)

    return courses
def get_csv_name(year):
    generic_ftp_name = "ftp://ftp.geonet.org.nz/strong/processed/Summary/"
    ftp_link = request.urlopen(generic_ftp_name + str(year))
    soup = BeautifulSoup(ftp_link, 'lxml')
    soupstring = soup.body()[0].text
    string_io = io.StringIO(soupstring)
    table = pd.read_table(string_io, delim_whitespace = True, names = ["blah", "blah2", "blah3","blah4",
                                                                       "blah5", "blah6", "blah7", "blah8", "CSV_link"])
    list_of_csv = table.CSV_link.tolist()
    quake_id_list = [i[0:11] for i in list_of_csv]
    return list_of_csv
Example #23
    def processBody(self, payload):
        parts = payload.get('parts')[0]
        data = parts['body']['data']
        data = data.replace("-", "+").replace("_", "/")
        decoded_data = base64.b64decode(data)
        soup = BeautifulSoup(decoded_data, "lxml")
        body = str(soup.body())
        body = body.replace("[<p>", "")
        cleanBody = body.replace("</p>]", "")

        # Should return body information in form of string if all went well
        return cleanBody
Example #24
def get_wikipedia_langs(count):
    resp = requests.get(
        'https://en.wikipedia.org/wiki/List_of_programming_languages')
    page = Soup(resp.text, features='html.parser')
    listings = list()

    for section in page.body('div',
                             attrs={'class': 'div-col columns column-width'}):
        for listing in section('li'):
            listings.append(listing.a.text.strip())

    return choices(listings, k=count)
def parsing_message(message):
    temp_dict = {}

    payld = message['payload']
    headr = payld['headers']

    for one in headr:  # getting the Subject
        if one['name'] == 'Subject':
            msg_subject = one['value']
            temp_dict['Subject'] = msg_subject
        else:
            pass

    for two in headr:  # getting the date
        if two['name'] == 'Date':
            msg_date = two['value']
            date_parse = (parser.parse(msg_date))
            m_date = (date_parse.date())
            temp_dict['Date'] = str(m_date)
        else:
            pass

    for three in headr:  # getting the Sender
        if three['name'] == 'From':
            msg_from = three['value']
            temp_dict['Sender'] = msg_from
        else:
            pass
    temp_dict['Snippet'] = message['snippet']  # fetching message snippet

    try:
        # Fetching message body
        mssg_parts = payld['parts']  # fetching the message parts
        part_one = mssg_parts[0]  # fetching first element of the part
        part_body = part_one['body']  # fetching body of the message
        part_data = part_body['data']  # fetching data from the body
        clean_one = part_data.replace("-", "+")  # convert URL-safe Base64 to the standard alphabet
        clean_one = clean_one.replace("_", "/")
        clean_two = base64.b64decode(bytes(clean_one, 'UTF-8'))  # decode the Base64 payload to bytes
        soup = BeautifulSoup(clean_two, "lxml")
        mssg_body = soup.body()
        # mssg_body is a readible form of message body
        # depending on the end user's requirements, it can be further cleaned
        # using regex, beautiful soup, or any other method
        temp_dict['Message_body'] = mssg_body

    except:
        temp_dict['Message_body'] = "N/A"

    return temp_dict  # This will create a dictonary item in the final list
Example #26
    def get_messages(self, labels):
        """Get a list of emails for the corresponding labels where each email is in dict format"""
        messages = self.get_labelled_messages(labels)
        messages_list = []

        for message in messages:
            message_dict = {}
            m_id = message['id']  # get id of individual message
            message = self.service.users().messages().get(
                userId=self.user_id,
                id=m_id).execute()  # fetch the message using API
            payload = message['payload']  # get payload of the message

            for header in payload['headers']:  # getting the Subject
                if header['name'] == 'Subject':
                    msg_subject = header['value']
                    message_dict['Subject'] = msg_subject
                if header['name'] == 'Date':
                    date_parse = (parser.parse(header['value']))
                    message_dict['Date'] = str(date_parse.date())
                if header['name'] == 'From':
                    message_dict['Sender'] = header['value']
            message_dict['Snippet'] = message["snippet"]

            try:
                # Fetching message body
                parts = payload['parts']  # fetching the message parts
                part_one = parts[0]  # fetching first element of the part
                part_body = part_one['body']  # fetching body of the message
                part_data = part_body['data']  # fetching data from the body
                clean_one = part_data.replace("-", "+")  # convert URL-safe Base64 to the standard alphabet
                clean_one = clean_one.replace("_", "/")
                clean_two = base64.b64decode(bytes(clean_one, 'UTF-8'))  # decode the Base64 payload to bytes
                soup = BeautifulSoup(clean_two, "lxml")
                message_dict['Body'] = soup.body()
            except:
                pass

            print(message_dict)
            messages_list.append(message_dict)

            # This will mark the message as read
            self.service.users().messages().modify(userId=self.user_id,
                                                   id=m_id,
                                                   body={
                                                       'removeLabelIds':
                                                       ['UNREAD']
                                                   }).execute()

        return messages_list
def preprocess_html(text, preprocessor, forcePeriod):
    """
        Options:
        preprocessor: justext, bs4, boilerpipe, None
        forcePeriod: True, False.

        Use forcePeriod to force a sentence-ending period at every line break.
    """

    if not preprocessor or not (type(text) == str or type(text)
                                == unicode) or len(text.strip()) == 0:
        print("TEXT IS NOT BEING PRE PROCESSED")
        print("type(text) == %s ; Size Text: %d" %
              (type(text), len(text.strip())))
        return text

    elif preprocessor == "bs4":
        soup = BeautifulSoup(text, "html.parser")
        tags_to_remove = ["script"]
        for tag in tags_to_remove:
            for x in soup.body(tag):
                x.decompose()
        if forcePeriod:
            return soup.body.get_text().replace("\n", ".\n")
        else:
            return soup.body.get_text()

    elif preprocessor == "justext":
        paragraphs = justext.justext(text, justext.get_stoplist('English'))
        text = "\n"
        for paragraph in paragraphs:
            if not paragraph.is_boilerplate:  # and not paragraph.is_header:
                if forcePeriod:
                    text = text + paragraph.text + ".\n"
                else:
                    text = text + paragraph.text + "\n"
        return text

    # At the moment that this code was updated, boilerpipe was not available for download via pip.
    elif preprocessor == "boilerpipe" or preprocessor == "boi":
        text = Extractor(extractor='ArticleExtractor', html=text).getText()
        #print("Text before: %s" % text)
        if forcePeriod:
            #print("Text after: %s" % text.replace("\n", ".\n"))
            return text.replace("\n", ".\n")
        else:
            return text

    else:
        print("PRE PROCESSING OPTION %s NOT FOUND. IGNORING PRE PROCESSING." %
              (preprocessor))
        return text
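A minimal usage sketch for the bs4 branch of preprocess_html above (the HTML string is made up; it assumes the function is in scope and BeautifulSoup is importable). The <script> body is stripped, and with forcePeriod=True every line break in the extracted text becomes ".\n":

html = "<html><body>\n<script>var x = 1;</script>\n<p>First paragraph</p>\n<p>Second paragraph</p>\n</body></html>"
text = preprocess_html(html, "bs4", forcePeriod=True)
print(text)   # the script text is gone; each newline in the extracted text is now followed by a period
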
Example #28
    def get_message_text(self, msg_id):
        # possible formats: ['full', 'metadata', 'minimal', 'raw']
        msg = self.service.users().messages().get(userId='me', id=msg_id, format='full', prettyPrint=True).execute()

        parts = msg['payload']['parts']
        if self.is_contains_attachment(parts):
            data = parts[0]['parts'][0]['body']['data']
        else:
            data = parts[0]['body']['data']
        clean = base64.urlsafe_b64decode(data)
        soup = BeautifulSoup(clean, "lxml")
        mssg_body = soup.body()
        return mssg_body
Example #29
    def preprocess_html(self, text, preprocessor, forcePeriod):
        """
            Options:
            preprocessor: justext, bs4, None
            forcePeriod: True, False. True will force a period whenever a linebreak is found.

        """

        if not preprocessor or type(text) != str or len(text.strip()) == 0:
            return text

        elif preprocessor == "bs4":
            soup = BeautifulSoup(text, "html.parser")
            # This html text has no body!
            if soup.find("body") is None:
                return text

            tags_to_remove = ["script"]
            for tag in tags_to_remove:
                for x in soup.body(tag):
                    x.decompose()
            if forcePeriod:
                return soup.body.get_text().replace("\n", ".\n")
            else:
                return soup.body.get_text()

        elif preprocessor == "justext" or preprocessor == "jst":
            paragraphs = justext.justext(text, justext.get_stoplist('English'))
            text = "\n"
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate: # and not paragraph.is_header:
                    if forcePeriod:
                        text = text + paragraph.text + ".\n"
                    else:
                        text = text + paragraph.text + "\n"
            return text

        # Boilerpipe install is not always working. If you cannot install it, just comment the following code
        # and remove the import
        elif preprocessor == "boilerpipe" or preprocessor == "boi":
            text = Extractor(extractor='ArticleExtractor', html=text).getText()
            #print("Text before: %s" % text)
            if forcePeriod:
                #print("Text after: %s" % text.replace("\n", ".\n"))
                return text.replace("\n", ".\n")
            else:
                return text

        else:
            print("PRE PROCESSING OPTION %s NOT FOUND. IGNORING PRE PROCESSING." % preprocessor)
            return text
def get_car_list_from_list_page(page_number):

    # scrape the page with selenium
    url = 'http://www.encar.com/fc/fc_carsearchlist.do?carType=for&searchType=model&wtClick_index=251#!%7B%22action' \
          '%22%3A%22%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22' \
          'page%22%3A{0}%2C%22limit%22%3A20%7D'.format(page_number)
    phantomjs_path = r'D:\Workspace\[LIBRARY]\[WEB]\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    try:
        # driver = webdriver.Remote("http://localhost:4444/wd/hub", webdriver.DesiredCapabilities.HTMLUNIT.copy())
        driver = webdriver.PhantomJS(phantomjs_path)
    except (ValueError, KeyError) as e:
        print('Driver error with Chrome browser')
        return None
    else:
        try:
            driver.get(url)
        except (ValueError, KeyError) as e:
            print('URL open error with car page')
            return None
    list_page = driver.page_source
    try:
        # the PhantomJS 'service' sometimes raises on quit(), so swallow that error
        driver.quit()
    except AttributeError:
        pass
    soup = BeautifulSoup(list_page, 'lxml')

    # validate the page
    # no_car_message = soup('p', {'class': 'message', 'title': '등록차량이 없습니다. 다른 조건으로 검색하세요.'})
    # if 0 < len(no_car_message):
    #     print('there is no car list')
    #     return None

    # get car list table
    car_list = soup.body('tbody', {'id': 'sr_normal'})
    if 0 == len(car_list):
        return None

    # get URLs of each car page and parse each page into 'result_car' instance
    result_car_list = []
    for tr in car_list[0]('td', {'class': 'inf'}):
        page_url = 'http://www.encar.com' + tr.a.get("href")
        result_car = get_car_info_from_car_detail_page(page_url)
        if result_car is not None:
            result_car_list.append(result_car)

    current_time_string = time.strftime('%Y%m%d_%H%M%S')
    with open('car_instances_at_page_{0}_{1}.txt'.format(str(page_number), current_time_string), 'w') as outfile:
        json.dump([car_spec_class.__dict__ for car_spec_class in result_car_list], outfile)

    return result_car_list
Example #31
def scrape_player_page(video):
    """
    Try to scrape the site for video and download. 
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get(
                "http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get(
                    "http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not 'title' in video:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video and soup.find_all(datetime=True):
        xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
        if xmldate_str:
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = Path(video['title']).with_suffix('.ts')
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
Example #32
def process_as_html(contents, charset):
    if type(contents) == BeautifulSoup:
        soup = contents
    else:
        soup = BeautifulSoup(contents, "html5lib", from_encoding = charset) 
        
    # Get references to all strings
    if not soup.body: return soup

    strings = find_string_elements(soup.body)
    images = soup.body("img")
    iframes = soup.body("iframe")
    #flash = soup.body("param")
    
    for string_el in strings:
        # Run regex on each string
        s = unicode(string_el)
        censored = currency_pattern.sub(censor_currency_match, s)
        if s is not censored:
            
            already_link = change_link_href(string_el, "http://www.slowerinternet.com")
            if not already_link:
                #print "Must create a link!"
                link = soup.new_tag("a", href="http://www.slowerinternet.com")
                link.string = censored
                string_el.replace_with(link)
            else:
                string_el.replace_with(censored)

    for img in images:
        if img.get("src") and adblock_filter.match(img["src"]):
            img["src"] = "https://docs.python.org/favicon.ico"

    for iframe in iframes:
        if iframe.get("src") and adblock_filter.match(iframe["src"]):
            iframe["src"] = "http://example.com"

    return soup
 def key_sector(key_sector):
     url_sec = 'https://de.finance.yahoo.com/quote/' + entry_list + '/holdings?p=' + entry_list
     req_sec = r.get(url_sec)
     dat_sec = BeautifulSoup(req_sec.content, 'html.parser')
     cont_sec = dat_sec.body('div', {'class': 'Mb(25px)'})
     df_sec = pd.DataFrame(cont_sec[1])
     sec = df_sec[0].astype(str).str.split('</span>').to_list()
     df_sec2 = pd.DataFrame(sec).dropna().transpose()
     sec2 = df_sec2[1].astype(str).str.split('">').to_list()
     df_sec3 = pd.DataFrame(sec2)
     sec_industry = df_sec3[4][1:].dropna().reset_index().drop(columns=['index'])
     sec_percent = df_sec3[1].str.replace(',', '.').str.replace('%', '').apply(pd.to_numeric, errors='coerce').dropna().reset_index().drop(columns=['index'])
     df_merge = pd.merge(sec_industry, sec_percent, left_index=True, right_index=True).rename(columns={4: 'Sektor', 1: 'Gewichtung in %'}).sort_values(by=['Gewichtung in %'], ascending=False).reset_index().drop(columns=['index'])
     return df_merge
Example #34
def json_from_full_listing(listing_url):
    print(f'fetching {listing_url}...')
    resp = requests.get(listing_url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    script = [
        s for s in soup.body('script')
        if s.string is not None and 'dataLayer' in s.string
    ][0]
    match = re.search('dataLayer = (.*);', script.string)
    jsonstr = match.groups()[0]
    last = json.loads(jsonstr)[-1]
    listing = {**last.get('property', {}), **last.get('sold_property', {})}
    listing['link'] = listing_url
    return pd.json_normalize(listing).to_dict(orient='records')[0]
Example #35
    def __init__(self, list_url):

        self.playlist_video_list = []
        r = urlopen(list_url)
        soup = BeautifulSoup(r, "html.parser")
        #print 'lolololololl soup'
        [s.extract() for s in soup.body('script')]

        tag_list = soup.body.find("div", "pl-video-list")
        if not tag_list:
            return

        tag_titles = tag_list.find_all("tr", "pl-video yt-uix-tile ")
        tag_imgs = tag_list.find_all("span", "yt-thumb-clip")

        tag_times = tag_list.find_all('div', 'timestamp')
        tag_uploaders = tag_list.find_all("a", " yt-uix-sessionlink spf-link ")
        ##print soup.prettify()

        #print len(tag_imgs)
        #print len(tag_titles)
        #print len(tag_times)
        #print len(tag_uploaders)
        have_to_be_modified = []
        for n in range(0, len(tag_imgs), 1):
            if "no_thumbnail" in tag_imgs[n].img['data-thumb']:
                have_to_be_modified.append(n)
                ##print "n "+str(n)
        have_to_be_modified = sorted(have_to_be_modified, reverse=True)
        for target_list in have_to_be_modified:
            ##print target_list
            del tag_titles[target_list]
            del tag_imgs[target_list]
        #print "modified"
        #print len(tag_imgs)
        #print len(tag_titles)

        if len(tag_imgs) == len(tag_titles) == len(tag_times) == len(
                tag_uploaders):
            for i in range(0, len(tag_titles)):
                ##print tag_imgs[i].img['data-thumb']
                ##print tag_titles[i]['data-title']
                ##print tag_times[i].string
                ##print tag_uploaders[i].string
                a_pref = playlist_video(i, tag_titles[i]['data-title'],
                                        tag_imgs[i].img['data-thumb'],
                                        tag_times[i].string,
                                        tag_uploaders[i].string)
                ##print a_pref.img
                self.playlist_video_list.append(a_pref)
def searchUrl(url, level, searchText): # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return

    global urlList
    if url in urlList: # prevent using the same URL again
        return

    try:
        context = ssl._create_unverified_context()  # requires "import ssl"; skips certificate verification
        urlContent = urllib2.urlopen(url, context=context).read()
#        soup = BeautifulSoup(''.join(urllib2.urlopen(url,context=context).read()), verify=False)
        urlList.append(url)
    except:
        return

    soup = BeautifulSoup(''.join(urlContent))

    # remove script tags
    c=soup.findAll('script')
    for i in c:
        i.extract() 
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
        
    except:
        return
    text = ''.join(body_texts)
    

    # search
    if text.find(searchText) > -1:
        print url
        print

    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        if len(linkTags) > 0:
            for linkTag in linkTags:
                try:
                    linkUrl = linkTag['href']
                    searchUrl(linkUrl, level - 1, searchText)
                except:
                    pass
Example #37
def main(page):
    for i in range(len(header)):
        if header_Available[i]!=False:
            url = start_url+'/page-'+str(page)+'#comments'
            r = requests.get(url,headers=header[i])
            r.encoding = 'utf-8'
            soup = BeautifulSoup(r.text,"lxml")
            if soup.find(text=re.compile('屏蔽'))==None:
                print('=============================')
                print('Downloading page '+str(page))
#               collect the tags that contain the image URLs
                img = []

#               alternative plan A for filtering the img tags:
#               comparetolist = ['p','div','div','div','li','ol','div','div','div','div','body']
#               goodjob(soup.find_all('img',src=True),img,comparetolist)

                imgall = soup.body('li',id = re.compile("comment-"))
                for tmp in imgall:
                    img+=tmp.div.find('div',class_ = 'row').find('div',class_ = 'text').find_all('img',src=True)

                for n,girl in enumerate(img):
                    print('       image '+str(n),end='')
                    if not girl.has_attr('org_src'):
                        url = girl['src']
                        with open('妹纸图'+str(page)+'-'+str(n)+url[-4:],'wb') as f:
                            f.write(requests.get(url).content)
                    else:
                        url = girl['org_src']
                        with open('妹纸图'+str(page)+'-'+str(n)+url[-4:],'wb') as f:
                            f.write(requests.get(url).content)
                    print('...OK!')
                print('Page '+str(page)+' finished downloading!')
                return True
            else:
                if header_Available[i]!=False:
                    header_Available[i]=False
                    print('Blocked, attempting to bypass.....\n        User-Agent availability:')
                    show_info()
                    if header_Available[len(header)-1]==False:
                        print('Bypass failed, terminating the thread!\nUser-Agent availability:')
                        show_info()
                        return False
 def fuckingweather(self, irc, msg, args, text):
     """
     <zip code>: Displays the weather from http://www.thefuckingweather.com/
     """
     url = 'http://www.thefuckingweather.com/Where/%s' % utils.web.urlquote(text)
     try:
         soup = BeautifulSoup(utils.web.getUrl(url))
         
         find=lambda x,y:soup.body(x,y,limit=1)[0].text
         
         temperature = find('span', {'class':'temperature'})
         remark = find('p',{'class':'remark'})
         flavor = find('p',{'class':'flavor'})
         location = find('span',{'id':'locationDisplay'})
         
         celsius = math.floor((int(temperature)-32)*5/9)
         
         res = "%s\u00B0F / %s\u00B0C in %s?! %s.  %s." % (temperature, celsius, location, remark, flavor)
         irc.reply(res, prefixNick=True)
     except:
         irc.reply("ERROR: IT'S F*****G BROKEN.", prefixNick=True)
Example #39
def fetchImage(hostname):

    while(container.empty() == False):
        url,depth = container.get()
        if url in visited_url or depth>3 or hostname!=urlparse.urlparse(url).hostname:
            continue
        visited_url.append(url)
        print 'visiting '+url +' at depth:',depth
        htmlData = urllib2.urlopen(url).read()
        soup = BeautifulSoup(htmlData)
        
        #search imgs in this body section
        
        articles = soup.body("article")
        count = 0
        for article in articles:
            if(article.img['src']==""):
                continue
            imgUrl = urlparse.urljoin(url,article.img['src'])
            appName = article.section.h3.a.string
            category = article.section.h3.span.string
            if(imgUrl not in loaded_image_url):
                print 'imgUrl:' + imgUrl + ' appName:' +appName.encode('big5','ignore') + ' category:'+category.encode('big5','ignore')+' found'
                loaded_image_url.append(imgUrl)
                count = count +1;
                #print "downloading img:" + imgUrl + "	..."
                #cmd = "wget -P ./../res "+imgUrl
                #os.system(cmd)
                #urllib.urlretrieve(imgUrl, "./../res/url"+str(count)+".jpg")
        
        #search for adjancy pages
        links = soup('a')
        for link in links:
            if('href' in dict(link.attrs)):
                newurl = urlparse.urljoin(url,link['href'])
                container.put((newurl,depth+1))
    if line.startswith("#UID:"):
        uid = line.strip().split("#UID:")[1]
    elif line.startswith("#DATE:"):
        date = line.strip().split("#DATE:")[1]
    elif line.startswith("#URL:"):
        url = line.strip().split("#URL:")[1]
    elif line.strip() == "#EOR":
        print uid, date, url

        content = unidecode(content.decode("utf8"))

        if html_parser == "bs4":
            soup = BeautifulSoup(content, "html.parser")
            tags_to_remove = ["script"]
            for tag in tags_to_remove:
                for x in soup.body(tag):
                    x.decompose()
            text = soup.body.get_text()

        elif html_parser == "justext":
            paragraphs = justext.justext(content, justext.get_stoplist('English'))
            text = "\n"
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate: # and not paragraph.is_header:
                    text = text + paragraph.text + "\n"

        elif html_parser == "boilerpipe":
            extractor = Extractor(extractor='ArticleExtractor', html=content)
            text = extractor.getText()

        elif html_parser == "html":
Example #41
    print tag.name, tag.string

print '############## search by attribute (1):'
# locate the tags whose class is "sister" (note: since class is a Python keyword, it must be written as class_='*')
# searching tags by CSS class via the class_ parameter is only supported since Beautiful Soup 4.1.1;
for tag in soup.find_all(class_="sister"):
    print tag.name, tag.string

print '############## search by attribute (2):'
for tag in soup.find_all(id=re.compile("link"), limit=3):
    print tag.name, tag.string

# find_all() is just about the most frequently used search method in Beautiful Soup, so a shorthand is defined for it:
# a BeautifulSoup object or a Tag object can be called as if it were a method
# shorthand for find_all
print '############## find_all shorthand:'
for tag in soup(class_="sister", limit=1):
    print tag.name, tag.string

# with limit=1 this is equivalent to using the find() method
# the differences:
# the only difference is that find_all() returns a list containing the single element (which can be traversed with for ... in), while find() returns the result directly.
# when nothing matches, find_all() returns an empty list, whereas find() returns None.
print '############## find:'
item_tag = soup.find(class_="sister")
print item_tag, item_tag.string

print "tags in body with id='link2':", soup.body.find_all(id="link2")
# shorthand:
print "tags in body with id='link2':", soup.body(id="link2")
def get_car_info_from_car_detail_page(page_url):

    # car detail page
    url_request = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        connection = urlopen(url_request)
    except (ValueError, KeyError) as e:
        print('URL open error with car page')
        return None
    car_page = connection.read()
    connection.close()
    soup = BeautifulSoup(car_page, 'lxml')

    # page validation
    find_iter = soup.find_all('div', attrs={'class': 'car_info'})
    if 0 == len(find_iter):
        return None

    # ================================================================
    # fetch the basic vehicle information
    # ================================================================
    find_iter = soup.head.find_all('meta', {'name': 'WT.z_CarId'})
    if 0 == len(find_iter):
        return None
    car_id = int(find_iter[0].attrs.get('content'))

    current_car = CarInfo(car_id)
    # current_car.dealer_ = 'unknown'
    # for i in range(len(find_iter)):
    #     attribute_name = find_iter[i].attrs.get('name')
    #     if attribute_name is None:
    #         continue
    #     # blank in the class order: dealer
    #     if 'WT.z_state' == attribute_name:  # vehicle location
    #         current_car.state_ = find_iter[i].attrs.get('content')
    #     # blank in the class order: plate number
    #     elif 'WT.z_price' == attribute_name:  # total purchase cost
    #         current_car.price_ = find_iter[i].attrs.get('content')
    #     # blank in the class order: manufacturer warranty
    #     elif 'WT.z_make' == attribute_name:  # manufacturer
    #         current_car.set_maker(find_iter[i].attrs.get('content'))
    #     # blank in the class order: vehicle type
    #     # blank in the class order: vehicle code
    #     # blank in the class order: model
    #     # blank in the class order: trim
    #     elif 'WT.trns' == attribute_name:  # transmission
    #         current_car.transmission_ = find_iter[i].attrs.get('content')
    #     elif 'WT.whatfuel' == attribute_name:  # fuel
    #         current_car.fuel_ = find_iter[i].attrs.get('content')
    #     elif'WT.z_cat' == attribute_name:  # category
    #         current_car.category_ = find_iter[i].attrs.get('content')
    #     elif 'WT.z_year' == attribute_name:  # model year
    #         current_car.year_ = find_iter[i].attrs.get('content')
    #     elif 'WT.z_month' == attribute_name:  # release month
    #         current_car.month_ = find_iter[i].attrs.get('content')
    #     elif 'WT.mileage' == attribute_name:  # mileage
    #         current_car.mileage_ = find_iter[i].attrs.get('content')
    #     elif 'WT.z_vehcat' == attribute_name:  # condition
    #         current_car.condition_ = find_iter[i].attrs.get('content')
    #     elif 'WT.color' == attribute_name:  # color
    #         current_car.color_ = find_iter[i].attrs.get('content')
    #     # blank in the class order: parts replacement history

    # current_car.type_ = soup.body('span', class_='cls')[0].em.string
    # current_car.model_ = soup.body('span', class_='dtl')[0]('strong')[-1].text
    # current_car.modelCode_ = soup.body('span', class_='dtl')[0].em.string

    # Roughly scrape the detail info at the bottom of the page
    for input_field in soup.body('form', {'name': 'carDetail'})[0]('input'):
        if not input_field.has_attr('id'):
            continue
        current_car.set_info(input_field.attrs.get('id'), input_field.attrs.get('value'))

    # Fetch info from the middle of the page, in particular the manufacturer-warranty status
    stat_detail = soup.body('ul', class_='stat_detail')[0]('li')
    for stat in stat_detail:
        if '차량번호' == stat.span.string:
            current_car.plateNumber_ = stat.text.split()[1]
        elif '배기량:' == stat.span.string:
            current_car.displacement_ = stat.text.split(':')[1]
        elif '연비:' == stat.span.string:
            current_car.fuelEfficiency_ = stat.text.split(':')[1]
        elif '수입형태:' == stat.span.string:    # manufacturer warranty status
            if 'X' in stat.text:
                current_car.warranty_ = False
            else:
                current_car.warranty_ = True

    # If lease information is present, read it
    for lease_table in soup.body('ul', class_='brd_price'):
        for dl in lease_table('dl'):
            if '인수비용' == dl.dt.text:
                current_car.leaseCost_ = dl.dd.text
            elif '월리스료' == dl.dt.text:
                current_car.leaseMonthlyPay_ = dl.dd.text
            elif '잔여개월' == dl.dt.text:
                months = dl.dd.text.split('/')
                current_car.leaseLeftMonths_ = months[0]
                current_car.leaseTotalMonths_ = months[1].replace('개월', '')
            else:
                print('unknown information for lease')

    # ================================================================
    # Fetch dealer and dealership information
    # ================================================================
    # The dealership info has to be crawled from the popup link below
    url = 'http://www.encar.com/dc/dc_carsearchpop.do?method=companyInfoPop&carTypeCd=1&carid={0}'.format(str(car_id))
    url_request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        connection = urlopen(url_request)
    except (ValueError, KeyError) as e:
        print('URL open error with dealer page')
        return None
    dealer_page = connection.read()
    connection.close()
    dealer_soup = BeautifulSoup(dealer_page, 'lxml')
    dealer_company = dealer_soup.body('table', {'class': 'viewinfo'})[0].find_all('td')[0].string

    # Fetch the dealer's own name
    dealer_name = soup.body('div', {'class': 'dealer'})[0]('div', {'class': 'info'})[0].strong.string
    current_car.dealer_ = '{0}({1})'.format(dealer_name, dealer_company)

    # ================================================================
    # Fetch the option information
    # ================================================================
    options = soup.body('div', {'class': 'box_opt'})
    options_basic = options[0]('dd', {'class': 'on'})
    for i in range(len(options_basic)):
        current_car.option_.set_option(options_basic[i].a.string)

    # Other options
    if 1 < len(options):
        options_etc = options[1]('dd')
        for i in range(len(options_etc)):
            current_car.option_.set_option(options_etc[i].string)

    # Additionally entered options (presumably options the user can request on top)
    if 2 < len(options):
        current_car.option_.additionalOptions_ = options[2].p.string

    # ================================================================
    # Fetch the performance-inspection record
    # ================================================================
    # Check whether the car page has a button for the inspection record
    if 0 < len(soup.find_all('a', {'class': 'btn_detail'}, text='성능점검 자세히 보기')):

        # If an inspection record is registered, the page below should load properly
        url = 'http://www.encar.com/md/sl/mdsl_regcar.do?method=inspectionView&carid={0}'.format(str(car_id))
        url_request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            connection = urlopen(url_request)
        except (ValueError, KeyError) as e:
            current_car.inspection_.bExist_ = False
        else:
            current_car.inspection_.bExist_ = True
            inspection_page = connection.read()
            connection.close()
            inspection_soup = BeautifulSoup(inspection_page, 'lxml')
            table_body = inspection_soup.body('table', class_='ckst')[0].tbody
            table_field_names = table_body('th')
            table_field_values = table_body('td')
            # Depending on the vehicle type, some tables show accident and flood damage in one field and omit the engine type
            for i in range(len(table_field_names)):
                if '연식' in table_field_names[i].text:
                    current_car.inspection_.strYear_ = table_field_values[i].text
                elif '차대번호' == table_field_names[i].text: # overlaps with the identity-check field, so compare with ==
                    current_car.inspection_.strVIN_ = table_field_values[i].text
                elif '최초등록일' in table_field_names[i].text:
                    current_car.inspection_.strFirstRegistrationDate_ = table_field_values[i].text
                elif '동일성확인' in table_field_names[i].text:
                    current_car.inspection_.strVINMatching_ = remove_legacy_characters(table_field_values[i].text)
                elif '주행거리' in table_field_names[i].text:
                    current_car.inspection_.strMileage_ = table_field_values[i].text.split()[0].replace(',', '')
                elif '변속기종류' in table_field_names[i].text:
                    current_car.inspection_.strVIN_ = remove_legacy_characters(table_field_values[i].text)
                elif '사고유무' == table_field_names[i].text:
                    if '무' not in table_field_values[i].text:
                        current_car.inspection_.bDamaged_ = True
                elif '침수유무' == table_field_names[i].text:
                    if '무' not in table_field_values[i].text:
                        current_car.inspection_.bSubmerged_ = True
                elif '사고/침수유무' == table_field_names[i].text:
                    if '무' not in table_field_values[i].text:
                        print('accident notation format: ' + table_field_values[i].text)  # this case is rare, so report it when found during crawling
                elif '원동기형식' in table_field_names[i].text:
                    current_car.inspection_.strMotorType_ = table_field_values[i].text
                elif '보증유형' in table_field_names[i].text:
                    current_car.inspection_.strWarrantyType_ = table_field_values[i].text
                elif '불법구조변경' in table_field_names[i].text:
                    if '없음' not in table_field_values[i].text:
                        current_car.inspection_.bIllegalRemodeling_ = True
                elif '검사유효기간' in table_field_names[i].text:
                    current_car.inspection_.strTermOfValidity_ = table_field_values[i].text

            # Find the parts that have a replacement history
            repair_inspections = inspection_soup.body('dl', class_='section_cktxt')[0]('dd')
            if 1 < len(repair_inspections):
                structure_repair_list = repair_inspections[1]('span', {'class': 'on'})
                for repair_inst in structure_repair_list:
                    current_car.inspection_.listStructureRepairs_.append(repair_inst.text)
            if 0 < len(repair_inspections):
                exterior_repair_list = repair_inspections[0]('span', {'class':'on'})
                for repair_inst in exterior_repair_list:
                    current_car.inspection_.listExteriorRepairs_.append(repair_inst.text)

            # Revised performance record table
            status_name_list = []
            status_value_list = []
            new_inspection_tables = inspection_soup.body('table', class_='ckstl ckdata')[0]('tbody')
            for table in new_inspection_tables:
                cur_section_name = table.th.text + '_'
                cur_mid_section_name = ''
                for tr in table('tr'):
                    for td in tr('td'):
                        if td.has_attr('rowspan'):        # middle section
                            cur_mid_section_name = td.text + '_'
                        else:
                            if td.has_attr('colspan'):    # inspection item (when there is no middle section)
                                if 3 == int(td.attrs.get('colspan')):  # inspection result spanning several rows, e.g. exhaust gas
                                    status_name_list.append(cur_section_name[:-1])  # strip the trailing underscore
                                    status_value_list.append(td.text)
                                else:
                                    status_name_list.append(cur_section_name + td.text)
                                    cur_mid_section_name = ''  # mark that the rowspan has ended
                            else:
                                cur_status = td('span', {'class': 'on'})
                                if 0 < len(cur_status):   # inspection result
                                    status_value_list.append(cur_status[0].text)
                                else:                     # inspection item
                                    status_name_list.append(cur_section_name + cur_mid_section_name + td.text)

            # Organize by item and store
            for i in range(len(status_name_list)):
                current_car.inspection_.set_item(status_name_list[i], status_value_list[i])


    # ================================================================
    # Fetch the insurance records
    # ================================================================
    # If an insurance record is registered, the page below loads properly and its table can be read
    # (the page itself has been confirmed to exist for every car)
    url = 'http://www.encar.com/dc/dc_cardetailview.do?method=kidiFirstPop&carid={0}'.format(str(car_id))
    url_request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    current_car.insurance_.bExist_ = False
    try:
        connection = urlopen(url_request)
    except (ValueError, KeyError) as e:
        print('no insurance page')
    else:
        insurance_page = connection.read()
        connection.close()
        insurance_soup = BeautifulSoup(insurance_page, 'lxml')
        smlist = insurance_soup.body('div', class_='smlist')
        if 0 < len(smlist):
            current_car.insurance_.bExist_ = True
            tr_list = smlist[0]('tr')
            for tr in tr_list:
                image_src = tr.img.get('src')
                if '/images/es/car_num2_2.gif' == image_src:  # vehicle usage history
                    current_car.insurance_.set_change_purpose(tr('td')[1].text.split()[0])
                elif '/images/es/car_num2_3.gif' == image_src:  # plate number / owner change history
                    input_numbers = tr('td')[1].text.split('/ ')
                    current_car.insurance_.set_change_plate_number(input_numbers[0].replace('회', ''))
                    current_car.insurance_.set_change_owner(input_numbers[1].replace('회', ''))
                elif '/images/es/car_num2_4.gif' == image_src:  # damage history
                    current_car.insurance_.set_damages(tr('td')[1].text)
                elif '/images/es/car_num2_5.gif' == image_src:
                    current_car.insurance_.set_compensation_self(tr('td')[1].text)
                elif '/images/es/car_num2_6.gif' == image_src:
                    current_car.insurance_.set_compensation_others(tr('td')[1].text)

    # ================================================================
    # Fetch the vehicle description
    # ================================================================
    current_car.description_ = soup.body('div', {'class': 'wrp_car_info'})[0]('div')[0].pre.string

    return current_car
Example #43
0
def render():
    # magic number for unique file name
    magicnum = '_' + str(int(time.time()))

    from urllib import unquote
    query = request.args.get('q')
    url = unquote(query)

    if 'htmlkepdf.com' in url:
        return redirect('/', 301)

    from urlparse import urlparse
    addr = urlparse(url).netloc
    filename = addr.replace('.', '_')

    from readability.readability import Document
    import urllib2
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        html = opener.open(url).read()
    #except HTTPError:
    #    html = urllib2.urlopen(url).read()
    except:
        html = requests.get(url).content
    readable_article = Document(html).summary()
    readable_title = Document(html).short_title()

    # clean up text with bs4
    soup = BeautifulSoup(readable_article)
    readable_article = soup.body(text=True)
    readable_article = ' '.join(readable_article)
    # find meta, just in case readable article is too short
    soup2 = BeautifulSoup(html)
    try:
        metadesc = soup2.find('meta', {'name': 'description'})['content']
    except:
        metadesc = ''
    # error list
    # 1. facebook.com must be accessed over https

    p1 = Popen('xvfb-run --auto-servernum --server-num=1 python gistfile2.py ' + '"' + url + '"' + ' ' + filename + '.pdf', shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
    stdout, stderr = p1.communicate() # wait

    retcode = p1.returncode
    if retcode == 0:
        with open(filename + '.pdf') as f:
            # filter the incoming data to avoid duplicates
            oid = fs.put(f, content_type='application/pdf', filename=filename+magicnum, title=readable_title, article=readable_article, update=datetime.datetime.now(), url=url, metadesc=metadesc)
        os.remove(filename + '.pdf')
        return redirect('/view/' + str(oid))

    else:
        # retcode 139 (SIGSEGV) or any other failure: fall back to phantomjs
        p2 = Popen('phantomjs rasterize.js ' + '"' + url + '"' + ' ' + filename + '.pdf', shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
        stdout, stderr = p2.communicate() # wait

        retcode = p2.returncode
        if retcode == 0:
            with open(filename + '.pdf') as f:
                # filter the incoming data to avoid duplicates
                oid = fs.put(f, content_type='application/pdf', filename=filename+magicnum, title=readable_title, article=readable_article, update=datetime.datetime.now(), url=url, metadesc=metadesc)
            os.remove(filename + '.pdf')
            return redirect('/view/' + str(oid))

    # otherwise it is an error; show it on the error page
    stderror = stderr

    return render_template("error.html", query=query, addr=addr, title=readable_title, stderror=stderror, retcode=retcode)
Example #44
0
def crawl_espn_projection_page(projections=None, next_page=None, **params):
    """Crawls the ESPN Page and returns the projection data as a dict"""
    if next_page:
        response = requests.get(next_page)
    else:
        response = requests.get(ESPN_PROJECTIONS_URL, params)
    print response.url

    soup = BeautifulSoup(response.content, 'html.parser')
    pagination_nav = soup.body.find(class_='paginationNav')
    for item in pagination_nav.find_all('a'):
        if 'NEXT' in item.contents:
            next_page = item['href']
    projections = {} if projections is None else projections
    player_rows = soup.body(class_='pncPlayerRow')
    for row in player_rows:
        projection = {}
        valid = True
        for i, cell in enumerate(row.find_all('td')):
            if i == 0:
                # Find Name, Team, and Position
                name = cell.a.string
                if 'D/ST' in cell.contents[1]:
                    team = get_team(cell.contents[0].string.split()[0].strip().lower())
                    projection['name'] = get_name(name, team, 'dst')
                    projection['team'] = get_team(team)
                    projection['position'] = 'dst'
                else:
                    splits = cell.contents[1].split()
                    team = splits[1]
                    position = splits[2]

                    # No Free Agents
                    if team == 'FA':
                        valid = False
                        break
                    projection['name'] = get_name(name, team, position)
                    projection['team'] = get_team(team)
                    try:
                        projection['position'] = get_position(position)
                    except Exception:
                        # Remove kickers and the like.
                        valid = False
                        break

            if i == 1:
                # Find opponent and whether or not team is home or away
                if cell.a is None:
                    valid = False
                    break
                text = cell.a.string
                if text[0] == '@':
                    projection['home'] = False
                    projection['opponent'] = get_team(text[1:])
                else:
                    projection['home'] = True
                    projection['opponent'] = get_team(text)
            elif i == 3:
                projection['receptions'] = float(cell.string.split('/', 1)[0])
            elif i in range(4, 14):
                _populate_stats(i, cell, projection)
        if valid:
            calculate_ppr(projection)
            projections[projection['name']] = projection
    if next_page and len(projections) < 500:
        time.sleep(0.250)
        return crawl_espn_projection_page(projections=projections, next_page=next_page)
    else:
        return projections
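# Minimal usage sketch (an assumption, not part of the original module): keyword
# arguments are forwarded as query-string parameters to ESPN_PROJECTIONS_URL.
if __name__ == '__main__':
    all_projections = crawl_espn_projection_page()
    print len(all_projections), 'player projections scraped'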
	temp_dict['Snippet'] = message['snippet'] # fetching message snippet


	try:
		
		# Fetching message body
		mssg_parts = payld['parts'] # fetching the message parts
		part_one  = mssg_parts[0] # fetching first element of the part 
		part_body = part_one['body'] # fetching body of the message
		part_data = part_body['data'] # fetching data from the body
		clean_one = part_data.replace("-","+") # map URL-safe Base64 characters back to the standard alphabet
		clean_one = clean_one.replace("_","/") # map URL-safe Base64 characters back to the standard alphabet
		clean_two = base64.b64decode(bytes(clean_one, 'UTF-8')) # decode from Base64 to bytes
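		# (Alternative sketch, an assumption rather than part of the original script:
		# base64.urlsafe_b64decode would perform the same "-"/"_" translation in one call.)
		# clean_two = base64.urlsafe_b64decode(bytes(part_data, 'UTF-8'))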
		soup = BeautifulSoup(clean_two , "lxml" )
		mssg_body = soup.body()
		# mssg_body is a readable form of the message body
		# depending on the end user's requirements, it can be further cleaned 
		# using regex, beautiful soup, or any other method
		temp_dict['Message_body'] = mssg_body
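		# (A sketch of the optional further cleaning mentioned above -- an assumption,
		# not part of the original script: collapse the parsed body to plain text.)
		# temp_dict['Message_body'] = soup.body.get_text(separator=' ', strip=True)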

	except :
		pass

	print (temp_dict)
	final_list.append(temp_dict) # This will create a dictionary item in the final list
	
	# This will mark the message as read
	GMAIL.users().messages().modify(userId=user_id, id=m_id,body={ 'removeLabelIds': ['UNREAD']}).execute() 
	
# find can navigate the parse tree as well. findParents, findNextSiblings and findPreviousSiblings all work
# similarly to findAll, but search only within those branches of the tree.
# findNext, findPrevious, findAllNext and findAllPrevious can be used to find matches starting from
# a specified point.

# Let's say you want the text of the first paragraph after the first occurrence of the text "Google" 

soup.find(text="Google").findNext('p').text
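# (A sketch, not from the original notebook:) the parent/previous-direction variants
# follow the same calling pattern; for example, the tag enclosing that text node and
# the last paragraph before the first "Google" occurrence:
soup.find(text="Google").findParent()
soup.find(text="Google").findPrevious('p')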


# In[ ]:

# A little shortcut to using findAll - if you call the tag itself as a function, you can use it in place of findAll
# with the same arguments 

soup.body('p')


# In[ ]:

soup.findAll('p')
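# (A sketch, not in the original:) the shortcut really is interchangeable with findAll,
# since both calls return the same ResultSet of tags:
assert soup.body('p') == soup.body.findAll('p')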


# In[ ]:

# BeautifulSoup makes parsing HTML or XML very intuitive and elegant. Doing the same thing with regular expressions
# is likely to leave you pulling your hair out :) In most screen-scraping projects, BeautifulSoup
# is a life-saver!


# In[ ]:
Example #47
0
import urllib2

#Handle Arguments
if len(sys.argv) < 5:
	print "USAGE: " + sys.argv[0] + " TARGET_URL FIRSTNAME_FILE LASTNAME_FILE OUTPUT_FILE"
	sys.exit()

target = sys.argv[1]
firstNameLocation = sys.argv[2]
lastNameLocation = sys.argv[3]
outLocation = sys.argv[4]

#Grab URL Text
page = urllib2.urlopen(target)
soup = BeautifulSoup(page)
pageText = soup.body(text = True)

#Split URL Text into Words
pageTextString = ""
for each in pageText:
	pageTextString += each

pageTextString = pageTextString.split(" ")
trimmedPageText = []
for each in pageTextString:
	trimmedPageText.append(each.lower().replace("\n", " "))

#Load Name Lists
firstNameFile = open(firstNameLocation, 'r')
firstNameList = []
for each in firstNameFile:
Example #48
0
 def extractText(self):
     soup = BeautifulSoup(self.request.content)
     return "".join(soup.body(text=True))
Example #49
0
 for script in soup.findAll('script'):
   script.extract()
 for link in soup.findAll('a', href=True):
   if len(link['href']) > 9:
     pat = re.compile(r'^http').findall(link['href'])
     if pat:
       href=re.compile(r"/$").sub('',link['href'])
       temp=re.compile(r"\.").split( href.lower())
       size = len(temp)
       size = size -1
       ext=temp[size]
       if mime.has_key(ext):
         err=1
       else:
         urls.append(href)
 body = soup.body(text=True)
 body = ' '.join(body)
 body=convertAccents(body)
 # body=cleanHTML(body)
 title=convertAccents(title)
 title=cleanHTML(title)
 try:
   body=unicodedata.normalize('NFKD',body).encode('ascii', 'ignore')
 except:
   err=1
 try:
   title=unicodedata.normalize('NFKD',title).encode('ascii', 'ignore')
 except:
   err=1
 body=re.compile(r'\n').sub(' ',body)
 body=re.compile(r'[ ]+').sub(' ',body)