def SeparateWithHtml(html):
    """Split an HTML document into parallel lists of tag paths, texts and attrs.

    Returns three lists (tagList, elemList, attrList) where entry i describes
    one text node (or empty-text element) found under <body>: its
    "-tag1-tag2..." ancestor path, the text itself, and the parent's
    (or element's own) attribute dict.
    """
    tagList = []
    elemList = []
    attrList = []
    soup = BeautifulSoup(html, "html5lib")
    # Matches any character outside the printable ASCII range '-'..'~'.
    non_ascii = re.compile("[^-~]")
    # Pass 1: text nodes matching the pattern, skipping newlines and scripts.
    for elem in soup.body(text=non_ascii):
        path = ""
        for parent in elem.parents:
            path = path + "-" + str(parent.name)
        if elem != u'\n' and str(elem.parent.name) != "script":
            tagList.append(path)
            elemList.append(elem)
            attrList.append(elem.parent.attrs)
    # Pass 2: elements with empty text; record the element's own attrs.
    for elem in soup.body(text=""):
        path = elem.name
        for parent in elem.parents:
            path = path + "-" + str(parent.name)
        if str(elem.parent.name) != "script" and str(elem.name) != "script":
            tagList.append(path)
            elemList.append("")
            attrList.append(elem.attrs)
    return tagList, elemList, attrList
def parse_url(parse_url):
    """Fetch an RSS feed, scrape each entry's full chapter content and rebuild
    the feed with the scraped HTML as the item description.

    parse_url -- URL of the source RSS feed.
    Returns the rebuilt feed serialized as UTF-8 XML bytes.
    """
    print("Starting the process now")
    feed = feedparser.parse(parse_url)
    print("Successfully parsed the url")
    new_feed = etree.Element('rss', version="2.0")
    channel = etree.SubElement(new_feed, 'channel')
    title = etree.SubElement(channel, 'title')
    title.text = feed.feed.title
    link = etree.SubElement(new_feed, 'link')
    link.text = feed.feed.link
    desc = etree.SubElement(new_feed, 'description')
    desc.text = feed.feed.description
    # Content containers used by the supported sites, tried in order.
    content_selectors = (
        ("div", "chapter-inner chapter-content"),  # Royal Road
        ("div", "fr-view"),                        # WuxiaWorld
        ("div", "chp_raw"),                        # ScribbleHub
        ("div", "entry-content"),                  # WordPress
    )
    for entry in feed.entries:
        response = requests.get(entry.link, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        soup = BeautifulSoup(response.content, features="html5lib")
        # BUG FIX: 'children' used to leak from the previous loop iteration
        # (or be undefined on the first one) when no selector matched.
        children = []
        for tag_name, tag_class in content_selectors:
            matches = soup.body(tag_name, tag_class)
            if matches:
                children = matches[0].children
                break
        item = etree.SubElement(channel, 'item')
        item_title = etree.SubElement(item, 'title')
        item_title.text = entry.title
        item_link = etree.SubElement(item, 'link')
        item_link.text = entry.link
        item_desc = etree.SubElement(item, 'description')
        item_desc.text = "".join(str(child) for child in children if str(child).strip())
        guid = etree.SubElement(item, 'guid', isPermaLink='false')
        try:
            guid.text = entry.id
        except AttributeError:
            # Entry has no id; fall back to an empty guid.
            guid.text = ""
        pubDate = etree.SubElement(item, 'pubDate')
        pubDate.text = entry.published
    return (etree.tostring(new_feed, encoding='utf-8', method="xml"))
def productPrice(product_id):
    """Query the HobbyKing API for a product's price (switch=3) and stock
    (switch=1) and return both JSON-encoded via toJson."""
    def _api_field(switch):
        # One API round trip; the first text node of the body is the value.
        api = "http://www.hobbyking.com/hobbyking_api.asp?id=" + str(product_id) + "+&switch=" + str(switch)
        resp = urllib2.urlopen(api)
        try:
            output = resp.read()
        finally:
            # BUG FIX: the original never closed the HTTP response.
            resp.close()
        soup = BeautifulSoup(output, 'html5lib')
        return ''.join(soup.body(text=True)[0])
    return toJson({"price": _api_field(3), "stock": _api_field(1)})
def ReadEmailDetails(service, user_id, msg_id):
    """Fetch one Gmail message and return a dict with Subject, DateTime and
    Message_body, or None when the message is empty or cannot be read.

    service -- authorized Gmail API service object
    user_id -- Gmail user id ('me' for the authenticated user)
    msg_id  -- id of the message to fetch
    """
    temp_dict = {}
    try:
        # Fetch the message using the Gmail API.
        message = service.users().messages().get(userId=user_id, id=msg_id).execute()
        payLoad = message['payload']
        for header in payLoad['headers']:
            if header['name'] == 'Subject':
                temp_dict['Subject'] = header['value']
            elif header['name'] == 'Date':
                temp_dict['DateTime'] = header['value']
        # Locate the body: multipart messages keep it in the first part.
        part_body = None
        if 'parts' in payLoad:
            part_body = payLoad['parts'][0]['body']
        elif 'body' in payLoad:
            part_body = payLoad['body']
        if part_body is None or part_body['size'] == 0:
            # BUG FIX: the original 'return None' here was silently overridden
            # by 'return temp_dict' in a finally block.
            return None
        part_data = part_body['data']
        # BUG FIX: re-pad before decoding (Gmail strips base64 padding) and
        # use urlsafe_b64decode, which handles the '-'/'_' alphabet directly.
        part_data += '=' * (-len(part_data) % 4)
        decoded = base64.urlsafe_b64decode(part_data)
        soup = BeautifulSoup(decoded, "lxml")
        # A readable form of the message body; further cleaning (regex,
        # Beautiful Soup, etc.) is left to the caller.
        temp_dict['Message_body'] = soup.body()
    except Exception as e:
        print('Email read error: %s' % e)
        temp_dict = None
    return temp_dict
def spider(url, searchText):
    """Fetch a URL and report whether searchText occurs in its visible text.

    Python 2 code (urllib.urlopen). Prints a message when the text is found;
    always returns None.
    """
    try:
        wp = urllib.urlopen(url)   # open the URL
        urlContent = wp.read()     # read the HTML content
    except IOError:
        # BUG FIX: was a bare 'except:' that also swallowed KeyboardInterrupt.
        return
    # Only process pages that look like HTML.
    if urlContent.find('<html') == -1 and urlContent.find('<HTML') == -1:
        print('The Page is not a html page')
        return
    soup = BeautifulSoup(''.join(urlContent), "html.parser")
    # Drop scripts so they do not pollute the visible text.
    for script in soup.findAll('script'):
        script.extract()
    try:
        bodyText = soup.body(text=True)
    except (AttributeError, TypeError):
        # No <body> in the document.
        return
    text = ''.join(bodyText)
    if text.find(searchText) > -1:
        print('The String "' + searchText + '" is found in the URL : ' + url + '\n')
    # NOTE: the original trailing 'flag == 0' message was unreachable (every
    # miss returned early), so it has been removed.
def getMeiziTu(page):
    """Download all images from one comment page into Directory.

    page -- 1-based page number appended to base_url.
    Returns True. Skips the page entirely when it contains the blocked
    marker ('屏蔽').
    """
    meizi_url = base_url + str(page) + "#comments"
    respone = requests.get(meizi_url, headers=headers)
    respone.encoding = 'utf-8'
    soup = BeautifulSoup(respone.text, "html5lib")
    # BUG FIX (idiom): compare to None with 'is', not '=='.
    if soup.find(text=re.compile('屏蔽')) is None:
        print('=============================')
        print('正在下载第 ' + str(page) + ' 页')
        # Collect every <img src=...> tag nested in the comment list items.
        img = []
        for tmp in soup.body('li', id=re.compile("comment-")):
            img += tmp.div.find(
                'div', class_='row').find(
                'div', class_='text').find_all(
                'img', src=True)
        for n, girl in enumerate(img):
            print(' 第 ' + str(n) + ' 张', end='')
            # BUG FIX (duplication): the two download branches were identical
            # except for the attribute read; prefer org_src when present.
            url = girl['org_src'] if girl.has_attr('org_src') else girl['src']
            with open(Directory + '妹纸图' + str(page) + '-' + str(n) + url[-4:], 'wb') as f:
                f.write(requests.get(url).content)
            print('...OK!')
        print('第 ' + str(page) + ' 页下载完成啦!!!')
    return True
def strip_html(path, i, label_xid=True): """Strip the HTML: get rid of scripts and interactions""" print '[{}] Reading {} ...'.format(i, path) with open(path, 'r', 'utf8') as fin: # TODO: Handle encodings soup = BeautifulSoup(fin.read(), 'html5lib') # Add doctype if missing if not has_doctype(soup): soup.insert(0, Doctype('html')) # Remove dangerous tags for x in soup('script'): x.extract() for x in soup('noscript'): x.extract() for x in soup('link'): if x.get('as') == 'script': x.extract() for x in soup('iframe'): x['src'] = '' # Fix styles for x in soup('style'): x.string = H.unescape(u"".join(unicode(y) for y in x.contents)) # Label all tags i = 1 for x in soup.body(True): for attr in list(x.attrs): if attr.startswith('on') or attr == 'srcset': del x[attr] if label_xid: x['data-xid'] = i i += 1 # Return return soup.prettify()
def highlight(url):
    """Fetch a page, inject the highlight CSS plus head/body snippets, write
    the result to the output template file and return it as Markup."""
    page_html = requests.get(url).text
    soup = BeautifulSoup(page_html, "lxml")
    head_snippet = removeTags(BeautifulSoup(SNIPPET_HEADER, "lxml"))
    body_snippet = removeTags(BeautifulSoup(SNIPPET_BODY, "lxml"))
    head = soup.head
    head.insert(1, soup.new_tag('style', type='text/css'))
    head.style.append(highlight_css)
    head.insert(0, head_snippet)
    soup.head = head
    body = soup.body
    body.insert(0, body_snippet)
    soup.body = body
    newsoup = Markup(soup)
    html = soup.prettify("utf-8")
    # NOTE(review): output path is hard-coded relative to the working dir.
    filename = os.getcwd() + "/SnippetIQ/templates/output_template.html"
    with open(filename, "wb") as file:
        file.write(html)
    return newsoup
def getFollowees(urls, cur, total):
    """Recursively crawl each user's followees page up to 'total' levels.

    Newly seen profile URLs are pushed into the shared set_urls/list_urls
    structures and temps[0] is incremented, all under mylock.
    """
    if cur < total:
        cur += 1
        net_errors = (TypeError, ConnectionResetError,
                      requests.packages.urllib3.exceptions.ProtocolError,
                      requests.exceptions.ConnectionError)
        for url in urls:
            temp = []
            try:
                FolloweesPage = requests.get(url + '/followees', cookies=cookie, headers=header).text
            except net_errors:
                print('获取关注列表出错,正在重连。。。')
                try:
                    FolloweesPage = requests.get(url + '/followees', cookies=cookie, headers=header).text
                except net_errors:
                    print('远程主机关闭连接,结束本条URL后续操作!')
                    # BUG FIX: without this 'continue' a stale (or unbound)
                    # FolloweesPage from a previous iteration was parsed.
                    continue
            source = BeautifulSoup(FolloweesPage, "html5lib")
            for card in source.body('div', class_='zm-profile-card zm-profile-section-item zg-clear no-hovercard'):
                profile_link = card.find('a', class_='zg-link')
                href = profile_link['href']
                name = profile_link['title']
                mylock.acquire()  # guard the shared collections
                temp.append(href)
                temps[0] += 1
                if href not in set_urls:
                    set_urls.add(href)
                    list_urls.append(href)
                    print('加入:' + href + ' ' + name)
                mylock.release()
            getFollowees(temp, cur, total)
def getFollowees(urls, cur, total):
    """Recursively crawl each user's followees page up to 'total' levels.

    Every discovered profile URL is appended to temps; unseen ones are also
    added to set_urls/list_urls, all under mylock.
    """
    if cur < total:
        cur += 1
        net_errors = (TypeError, ConnectionResetError,
                      requests.packages.urllib3.exceptions.ProtocolError,
                      requests.exceptions.ConnectionError)
        for url in urls:
            temp = []
            try:
                FolloweesPage = requests.get(url + '/followees', cookies=cookie, headers=header).text
            except net_errors:
                print('获取关注列表出错,正在重连。。。')
                # Single retry; a second failure propagates to the caller.
                FolloweesPage = requests.get(url + '/followees', cookies=cookie, headers=header).text
            source = BeautifulSoup(FolloweesPage, "html5lib")
            for card in source.body('div', class_='zm-profile-card zm-profile-section-item zg-clear no-hovercard'):
                profile_link = card.find('a', class_='zg-link')
                href = profile_link['href']
                name = profile_link['title']
                mylock.acquire()  # guard the shared collections
                temp.append(href)
                temps.append(href)
                if href not in set_urls:
                    set_urls.add(href)
                    list_urls.append(href)
                    print('加入:' + href + ' ' + name)
                mylock.release()
            getFollowees(temp, cur, total)
def parse_email(self, message):
    """Flatten a Gmail API message into a dict with Subject, Date, Sender,
    Receiver, Snippet and (when decodable) Message_body.

    message -- message resource dict as returned by the Gmail API.
    """
    temp_dict = {}
    payld = message['payload']
    for header in payld['headers']:
        name = header['name']
        if name == 'Subject':
            temp_dict['Subject'] = header['value']
        elif name == 'Date':
            # BUG FIX: was a bare 'except:'; keep the best-effort parsing but
            # stop swallowing KeyboardInterrupt/SystemExit.
            try:
                temp_dict['Date'] = str(parse(header['value']).date())
            except Exception:
                pass
        elif name == 'From':
            temp_dict['Sender'] = header['value']
        elif name == 'To':
            temp_dict['Receiver'] = header['value']
    temp_dict['Snippet'] = message['snippet']  # message snippet
    try:
        # The first MIME part carries the body; decode its URL-safe base64.
        part_data = payld['parts'][0]['body']['data']
        clean_one = part_data.replace("-", "+").replace("_", "/")
        clean_two = base64.b64decode(bytes(clean_one, 'UTF-8'))
        soup = BeautifulSoup(clean_two, "lxml")
        # Readable form of the body; further cleaning is the caller's job.
        temp_dict['Message_body'] = soup.body()
    except Exception:
        # BUG FIX: narrowed from a bare 'except:'.
        pass
    return temp_dict
def suburl(self, personalurl, recdepth, connectiontrytimes):
    """Recursively crawl a user's followees up to recdepth levels.

    personalurl        -- profile URL of the user to expand
    recdepth           -- remaining recursion depth; stops at 0
    connectiontrytimes -- remaining connection retries for this URL

    On a connection failure the same URL is retried with one fewer retry
    credit; when the credit drops below zero the crawl of this URL is
    abandoned.  New profile URLs are added to self._allurl.
    """
    if (recdepth > 0) and (connectiontrytimes >= 0):
        try:
            r = requests.get(personalurl + '/followees', headers=self.header, cookies=self.cookies)
        except (TypeError, ConnectionResetError, requests.packages.urllib3.exceptions.ProtocolError, requests.exceptions.ConnectionError):
            # Connection dropped: announce how many retries remain, then
            # retry the same URL with one fewer credit.
            if connectiontrytimes - 1 > 0:
                print('ConnectionError: 断开连接!进行重试,还剩' + str(
                    connectiontrytimes - 1) + '次重试机会')
            elif connectiontrytimes - 1 == 0:
                print('ConnectionError: 断开连接!进行最后一次重试')
            self.suburl(personalurl, recdepth, connectiontrytimes - 1)
        else:
            source = BeautifulSoup(r.text, "html5lib")
            # Each profile card on the followees page links one user.
            for temp in source.body(
                    'div',
                    class_='zm-profile-card zm-profile-section-item zg-clear no-hovercard'):
                url = temp.find('a', class_='zg-link')['href']
                self.countnum += 1  # total cards seen, including duplicates
                if url not in self._alreadyurl:
                    print('发现新用户! */' + url[28:])
                    self._allurl.add(url)
                else:
                    print('已存在用户: */' + url[28:])
                # NOTE(review): recurses even into already-seen users; the
                # depth bound (recdepth - 1) is what terminates the crawl.
                self.suburl(url, recdepth - 1, connectiontrytimes)
    elif connectiontrytimes < 0:
        # All retry credits exhausted for this URL.
        print('最后一次重试失败!放弃尝试重新连接!')
def _add_instant_tags(self, request, response):
    """Inject the tags selected by TagStrategy into the response HTML at
    their configured head/body positions; returns the response."""
    if not (hasattr(response, "content") and getattr(settings, "WTM_INJECT_TAGS", True)):
        return response
    strategy = TagStrategy(request)
    markup = response.content.decode(response.charset)
    doc = BeautifulSoup(markup, "html.parser")
    head = getattr(doc, "head", [])
    body = getattr(doc, "body", [])
    for tag in strategy.result:
        obj = tag.get("object")
        element = tag.get("element")
        # Place the element according to its configured location; a missing
        # head/body means the tag is silently dropped.
        if head and obj.tag_location == Tag.TOP_HEAD:
            head.insert(1, element)
        elif head and obj.tag_location == Tag.BOTTOM_HEAD:
            head.append(element)
        elif body and obj.tag_location == Tag.TOP_BODY:
            body.insert(1, element)
        elif body and obj.tag_location == Tag.BOTTOM_BODY:
            body.append(element)
    doc.head = head
    doc.body = body
    response.content = doc.encode(formatter=None)
    return response
def collect_single_level_part(self, part):
    """Decode the base64 body of a single (non-nested) MIME part and return
    its text content as a string."""
    raw = part['body']['data'].replace('-', '+').replace('_', '/')
    # str() of the bytes yields "b'...'"; the [2:-1] below strips that wrapper.
    decoded = str(base64.b64decode(bytes(raw, 'utf-8')))
    soup = BeautifulSoup(decoded, "lxml")
    text = soup.body()[0].text[2:-1]
    del soup
    return str(text)
def _parse(self, html):
    """Populate self.title, self.date and self.body from raw HTML.

    Marks the article as real; each field is only set when the matching
    tag exists in the document.
    """
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    self.real_article = True
    if soup.title is not None:
        self.title = soup.title.string
    # BUG FIX: the original tested soup.date but then read soup.time,
    # crashing when a <date> tag existed without a <time> tag.
    if soup.time is not None:
        self.date = soup.time.string
    if soup.body is not None:
        self.body = "\n".join([p.string for p in soup.body(text=True)])
def get_car_list_from_list_page(page_number):
    """Scrape one encar.com foreign-car list page with PhantomJS and return a
    list of parsed car instances, or None when nothing could be scraped."""
    url = 'http://www.encar.com/fc/fc_carsearchlist.do?carType=for&searchType=model&wtClick_index=251#!%7B%22action' \
          '%22%3A%22%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22' \
          'page%22%3A{0}%2C%22limit%22%3A20%7D'.format(page_number)
    phantomjs_path = r'D:\Workspace\[LIBRARY]\[WEB]\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    try:
        driver = webdriver.PhantomJS(phantomjs_path)
    except (ValueError, KeyError) as e:
        print('Driver error with Chrome browser')
        return None
    try:
        driver.get(url)
    except (ValueError, KeyError) as e:
        print('URL open error with car page')
        return None
    list_page = driver.page_source
    try:
        # because of bugs in the 'service', pass for quit method is demanded
        driver.quit()
    except AttributeError:
        pass
    soup = BeautifulSoup(list_page, 'lxml')
    # get car list table
    car_list = soup.body('tbody', {'id': 'sr_normal'})
    if 0 == len(car_list):
        return None
    # follow each car's detail link and parse it into a 'result_car' instance
    result_car_list = []
    for cell in car_list[0]('td', {'class': 'inf'}):
        detail_url = 'http://www.encar.com' + cell.a.get("href")
        parsed_car = get_car_info_from_car_detail_page(detail_url)
        if parsed_car is not None:
            result_car_list.append(parsed_car)
    stamp = time.strftime('%Y%m%d_%H%M%S')
    with open('car_instances_at_page_{0}_{1}.txt'.format(str(page_number), stamp), 'w') as outfile:
        json.dump([car.__dict__ for car in result_car_list], outfile)
    return result_car_list
def corr_heatmap_with_dual_zoombar(data,file_name='Pairwise_Heatmap.html'):
    # Add horizontal + vertical zoom sliders to the chart emitted by
    # corr_heatmap() by splicing extra "dataZoom" entries into its HTML.
    report=True
    corr_heatmap(data=data,report=report,file_name=file_name)
    soup = BeautifulSoup(open(file_name))
    p = re.compile('"dataZoom":')
    # Locate the LAST '"dataZoom":' occurrence inside the second <body> child.
    the_iter = p.finditer(str(soup.body()[1]))
    result = max(enumerate(the_iter))[1]
    idx = result.span()[1]
    # Splice in two slider zoom bars and one "inside" zoom after that point.
    # NOTE(review): the magic offsets (idx+2, idx+227, [31:], [:-9]) are tied
    # to the exact markup corr_heatmap emits -- verify against its output.
    s_new = str(soup.body()[1])[:idx+2] + \
    '\n {\n "show": true,\n "type": "slider",\n "start": 50,\n "end": 100,\n "orient": "horizontal",\n "xAxisIndex": null,\n "yAxisIndex": null\n },' + \
    '\n {\n "show": true,\n "type": "slider",\n "start": 50,\n "end": 100,\n "orient": "vertical",\n "xAxisIndex": null,\n "yAxisIndex": null\n },' + \
    '\n {\n "show": true,\n "type": "inside",\n "start": 50,\n "end": 100,\n "orient": "vertical",\n "xAxisIndex": null,\n "yAxisIndex": null\n },' + str(soup.body()[1])[idx+227:]
    s_new = s_new[31:]
    s_new = s_new[:-9]
    soup.body()[1].string = s_new
    # Rewrite the HTML file with the patched document.
    Html_file= open(file_name,"w")
    Html_file.write(soup.prettify())
    Html_file.close()
def collect_multi_level_part(self, part):
    """Decode every sub-part of a multipart MIME part and return their text
    contents concatenated into one string."""
    chunks = []
    for p in part['parts']:
        raw = p['body']['data'].replace('-', '+').replace('_', '/')
        # str() of the bytes yields "b'...'"; [2:-1] strips that wrapper.
        decoded = str(base64.urlsafe_b64decode(bytes(raw, 'UTF-8')))
        soup = BeautifulSoup(decoded, "html")
        chunks.append(str(soup.body()[0].text[2:-1]))
        del soup
    return "".join(chunks)
def _parse(self, html):
    """Populate self.title, self.date and self.body from raw HTML
    (BeautifulSoup 3 API)."""
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, fromEncoding='utf-8')
    self.real_article = True
    if soup.title is not None:
        self.title = soup.title.string
    # BUG FIX: the original tested soup.date but then read soup.time,
    # crashing when a <date> tag existed without a <time> tag.
    if soup.time is not None:
        self.date = soup.time.string
    if soup.body is not None:
        self.body = "\n".join([p.string for p in soup.body(text=True)])
def pars_title(link, readmanga=True, mangalib=False):
    """Parse a locally saved manga title page (title.txt) and return its
    metadata dict.

    link -- unused here; kept for interface compatibility.
    readmanga / mangalib -- selects which site layout to parse.
    """
    # BUG FIX: close the file handle (it was left open).
    with open('title.txt', 'rb') as f:
        content = f.read()
    soup = BeautifulSoup(content, "lxml")
    if readmanga:
        title_name = soup.find("meta", itemprop="name")['content'] \
            if soup.find("meta", itemprop="name") else None
        genres = {el.a.text for el in soup.body("span", {"class": "elem_genre"})}
        category = soup.body.find("span", {"class": "elem_category"}).a.text \
            if soup.body.find("span", {"class": "elem_category"}) else None
        year = soup.body.find("span", {"class": "elem_year"}).a.text \
            if soup.body.find("span", {"class": "elem_year"}) else None
        # BUG FIX: the status_* names raised NameError when the bookmark
        # block was missing; default them to None.
        status_process = status_readed = status_loved = None
        for div in soup.body.findAll("div", {"class": "rightBlock"}):
            if div.h5 and div.h5.text == 'Количество закладок':
                for i, strong in enumerate(div.findAll('strong')):
                    if i == 0:
                        status_process = strong.text
                    elif i == 1:
                        status_readed = strong.text
                    elif i == 2:
                        status_loved = strong.text
        return {'title_name': title_name, 'genres': genres,
                'category': category, 'year': year,
                'status_process': status_process,
                'status_readed': status_readed,
                'status_loved': status_loved}
    elif mangalib:
        title_name = soup.find("meta", itemprop="name")['content'] \
            if soup.find("meta", itemprop="name") else None
        category = year = views_count = None
        genres = set()
        for info in soup.body("div", {"class": "info-list__row"}):
            info_name = info.strong.text
            info_value = info.span.text if info.span else None
            if info_name == "Тип":
                category = info_value
            elif info_name == "Дата релиза":
                year = info_value
            elif info_name == "Жанры":
                genres = {el.text for el in info.findAll("a")}
            elif info_name == "Просмотров":
                views_count = info_value
        # BUG FIX: re.match() was called on a Tag object and the match object
        # itself was sliced; search the tag's text and take the group instead.
        bookmarks_tag = soup.body.find("h3", {"class": "aside__title"})
        bookmarks_match = re.search(r"\((.*?)\)", bookmarks_tag.text) if bookmarks_tag else None
        bookmarks_count = bookmarks_match.group(1) if bookmarks_match else None
        return {'title_name': title_name, 'genres': genres,
                'category': category, 'year': year,
                'views_count': views_count}
def course_scraper(url):
    """Return the text of every <li> in the page that carries a non-empty
    data-subject attribute."""
    html = urlopen(url)
    page = BeautifulSoup(html, "html.parser")
    return [
        item.get_text()
        for item in page.body("li")
        if "data-subject" in item.attrs and item["data-subject"] != "[]"
    ]
def get_csv_name(year):
    """Return the list of CSV link names from the GeoNet FTP summary listing
    for the given year."""
    generic_ftp_name = "ftp://ftp.geonet.org.nz/strong/processed/Summary/"
    ftp_link = request.urlopen(generic_ftp_name + str(year))
    soup = BeautifulSoup(ftp_link, 'lxml')
    # The FTP listing renders as plain text; parse it as a whitespace table
    # whose last column is the CSV file name.
    string_io = io.StringIO(soup.body()[0].text)
    table = pd.read_table(string_io, delim_whitespace=True,
                          names=["blah", "blah2", "blah3", "blah4", "blah5",
                                 "blah6", "blah7", "blah8", "CSV_link"])
    # BUG FIX (idiom): dropped the unused quake_id_list computation.
    return table.CSV_link.tolist()
def processBody(self, payload):
    """Decode the first MIME part of a Gmail payload and return its body as
    a cleaned-up string (list/paragraph wrappers stripped)."""
    first_part = payload.get('parts')[0]
    encoded = first_part['body']['data'].replace("-", "+").replace("_", "/")
    decoded_data = base64.b64decode(encoded)
    soup = BeautifulSoup(decoded_data, "lxml")
    # str() of the result list looks like "[<p>...</p>]"; strip the wrappers.
    cleanBody = str(soup.body()).replace("[<p>", "").replace("</p>]", "")
    # Should return body information in form of string if all went well
    return cleanBody
def get_wikipedia_langs(count):
    """Return `count` programming-language names randomly chosen (with
    replacement) from Wikipedia's list of programming languages."""
    resp = requests.get(
        'https://en.wikipedia.org/wiki/List_of_programming_languages')
    page = Soup(resp.text, features='html.parser')
    names = [
        li.a.text.strip()
        for section in page.body('div', attrs={'class': 'div-col columns column-width'})
        for li in section('li')
    ]
    return choices(names, k=count)
def parsing_message(message):
    """Flatten a Gmail API message into a dict with Subject, Date, Sender,
    Snippet and Message_body ('N/A' when the body cannot be decoded)."""
    temp_dict = {}
    payld = message['payload']
    # BUG FIX (duplication): the original scanned the headers three times;
    # one pass produces the same final dict.
    for header in payld['headers']:
        name = header['name']
        if name == 'Subject':
            temp_dict['Subject'] = header['value']
        elif name == 'Date':
            date_parse = (parser.parse(header['value']))
            temp_dict['Date'] = str(date_parse.date())
        elif name == 'From':
            temp_dict['Sender'] = header['value']
    temp_dict['Snippet'] = message['snippet']  # message snippet
    try:
        # The first MIME part carries the body; decode its URL-safe base64.
        part_data = payld['parts'][0]['body']['data']
        clean_one = part_data.replace("-", "+").replace("_", "/")
        clean_two = base64.b64decode(bytes(clean_one, 'UTF-8'))
        soup = BeautifulSoup(clean_two, "lxml")
        # Readable form of the body; further cleaning is the caller's job.
        temp_dict['Message_body'] = soup.body()
    except Exception:
        # BUG FIX: narrowed from a bare 'except:'; keep the N/A fallback.
        temp_dict['Message_body'] = "N/A"
    return temp_dict
def get_messages(self, labels):
    """Get a list of emails for the corresponding labels where each
    email is in dict format; each fetched message is marked as read."""
    messages = self.get_labelled_messages(labels)
    messages_list = []
    for message in messages:
        message_dict = {}
        m_id = message['id']  # id of the individual message
        # BUG FIX: was the (likely global) 'user_id'; the rest of this
        # method consistently uses self.user_id.
        message = self.service.users().messages().get(
            userId=self.user_id, id=m_id).execute()
        payload = message['payload']
        for header in payload['headers']:
            if header['name'] == 'Subject':
                message_dict['Subject'] = header['value']
            if header['name'] == 'Date':
                date_parse = (parser.parse(header['value']))
                message_dict['Date'] = str(date_parse.date())
            if header['name'] == 'From':
                message_dict['Sender'] = header['value']
        message_dict['Snipet'] = message["snippet"]
        try:
            # The first MIME part carries the body; decode its base64 data.
            part_data = payload['parts'][0]['body']['data']
            clean_one = part_data.replace("-", "+").replace("_", "/")
            clean_two = base64.b64decode(bytes(clean_one, 'UTF-8'))
            soup = BeautifulSoup(clean_two, "lxml")
            message_dict['Body'] = soup.body()
        except Exception:
            # BUG FIX: narrowed from a bare 'except:'.
            pass
        print(message_dict)
        messages_list.append(message_dict)
        # This will mark the message as read
        self.service.users().messages().modify(
            userId=self.user_id, id=m_id,
            body={'removeLabelIds': ['UNREAD']}).execute()
    return messages_list
def preprocess_html(text, preprocessor, forcePeriod):
    """
    Options:
    preprocessor: justext, bs4, boilerpipe, None
    forcePeriod: True forces a period whenever a linebreak is found.
    """
    # BUG FIX: the original compared against the Python 2-only name
    # 'unicode', which raises NameError on Python 3 for non-str input.
    if not preprocessor or not isinstance(text, str) or len(text.strip()) == 0:
        size = len(text.strip()) if isinstance(text, str) else 0
        print("TEXT IS NOT BEING PRE PROCESSED")
        print("type(text) == %s ; Size Text: %d" % (type(text), size))
        return text
    elif preprocessor == "bs4":
        soup = BeautifulSoup(text, "html.parser")
        # Drop executable content before extracting visible text.
        tags_to_remove = ["script"]
        for tag in tags_to_remove:
            for x in soup.body(tag):
                x.decompose()
        if forcePeriod:
            return soup.body.get_text().replace("\n", ".\n")
        else:
            return soup.body.get_text()
    elif preprocessor == "justext":
        paragraphs = justext.justext(text, justext.get_stoplist('English'))
        text = "\n"
        for paragraph in paragraphs:
            if not paragraph.is_boilerplate:  # and not paragraph.is_header
                if forcePeriod:
                    text = text + paragraph.text + ".\n"
                else:
                    text = text + paragraph.text + "\n"
        return text
    # At the moment this code was updated, boilerpipe was not available via pip.
    elif preprocessor == "boilerpipe" or preprocessor == "boi":
        text = Extractor(extractor='ArticleExtractor', html=text).getText()
        if forcePeriod:
            return text.replace("\n", ".\n")
        else:
            return text
    else:
        print("PRE PROCESSING OPTION %s NOT FOUND. IGNORING PRE PROCESSING." % (preprocessor))
        return text
def get_message_text(self, msg_id):
    """Fetch a Gmail message by id and return its parsed body content."""
    # possible formats: ['full', 'metadata', 'minimal', 'raw']
    msg = self.service.users().messages().get(userId='me', id=msg_id,
                                              format='full',
                                              prettyPrint=True).execute()
    parts = msg['payload']['parts']
    # With an attachment present the text lives one level deeper.
    target = parts[0]['parts'][0] if self.is_contains_attachment(parts) else parts[0]
    decoded = base64.urlsafe_b64decode(target['body']['data'])
    soup = BeautifulSoup(decoded, "lxml")
    return soup.body()
def preprocess_html(self, text, preprocessor, forcePeriod):
    """
    Options:
    preprocessor: justext, bs4, boilerpipe, None
    forcePeriod: True, False. True will force a period whenever a
    linebreak is found.
    """
    if not preprocessor or type(text) != str or len(text.strip()) == 0:
        return text
    if preprocessor == "bs4":
        soup = BeautifulSoup(text, "html.parser")
        # This html text has no body!
        if soup.find("body") is None:
            return text
        # Drop executable content before extracting visible text.
        for node in soup.body("script"):
            node.decompose()
        plain = soup.body.get_text()
        return plain.replace("\n", ".\n") if forcePeriod else plain
    if preprocessor in ("justext", "jst"):
        out = "\n"
        for paragraph in justext.justext(text, justext.get_stoplist('English')):
            if not paragraph.is_boilerplate:  # and not paragraph.is_header
                out = out + paragraph.text + (".\n" if forcePeriod else "\n")
        return out
    # Boilerpipe install is not always working. If you cannot install it,
    # just comment the following branch and remove the import.
    if preprocessor in ("boilerpipe", "boi"):
        extracted = Extractor(extractor='ArticleExtractor', html=text).getText()
        return extracted.replace("\n", ".\n") if forcePeriod else extracted
    print("PRE PROCESSING OPTION %s NOT FOUND. IGNORING PRE PROCESSING." % preprocessor)
    return text
def get_car_list_from_list_page(page_number):
    """Scrape one encar.com foreign-car list page with PhantomJS; returns the
    parsed car instances, or None when the page yields nothing."""
    url = 'http://www.encar.com/fc/fc_carsearchlist.do?carType=for&searchType=model&wtClick_index=251#!%7B%22action' \
          '%22%3A%22%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22' \
          'page%22%3A{0}%2C%22limit%22%3A20%7D'.format(page_number)
    phantomjs_path = r'D:\Workspace\[LIBRARY]\[WEB]\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    try:
        driver = webdriver.PhantomJS(phantomjs_path)
    except (ValueError, KeyError) as e:
        print('Driver error with Chrome browser')
        return None
    try:
        driver.get(url)
    except (ValueError, KeyError) as e:
        print('URL open error with car page')
        return None
    page_html = driver.page_source
    try:
        # because of bugs in the 'service', pass for quit method is demanded
        driver.quit()
    except AttributeError:
        pass
    soup = BeautifulSoup(page_html, 'lxml')
    # get car list table
    table_rows = soup.body('tbody', {'id': 'sr_normal'})
    if 0 == len(table_rows):
        return None
    # get URLs of each car page and parse each into a 'result_car' instance
    cars = []
    for info_cell in table_rows[0]('td', {'class': 'inf'}):
        detail_page = 'http://www.encar.com' + info_cell.a.get("href")
        car = get_car_info_from_car_detail_page(detail_page)
        if car is not None:
            cars.append(car)
    now_string = time.strftime('%Y%m%d_%H%M%S')
    out_name = 'car_instances_at_page_{0}_{1}.txt'.format(str(page_number), now_string)
    with open(out_name, 'w') as outfile:
        json.dump([spec.__dict__ for spec in cars], outfile)
    return cars
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    # Pick the JSON metadata endpoint matching the hosting site.
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get(
            "http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get(
                "http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not 'title' in video:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        # BUG FIX: "-o" + filename raised TypeError (str + Path); wrap in str().
        print(Popen(["rtmpdump", "-o" + str(filename), "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", str(filename), rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video and soup.find_all(datetime=True):
        xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
        if xmldate_str:
            # naive in utc, then converted to local time
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = Path(video['title']).with_suffix('.ts')
    # NOTE(review): assumed to be independent of the 'video' key -- the
    # original indentation was ambiguous here; confirm against callers.
    if 'statistics' in flashvars:
        video['category'] = flashvars['statistics']['category']
    if not download_from_playlist(video):
        return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
def process_as_html(contents, charset): if type(contents) == BeautifulSoup: soup = contents else: soup = BeautifulSoup(contents, "html5lib", from_encoding = charset) # Get references to all strings if not soup.body: return soup strings = find_string_elements(soup.body) images = soup.body("img") iframes = soup.body("iframe") #flash = soup.body("param") for string_el in strings: # Run regex on each string s = unicode(string_el) censored = currency_pattern.sub(censor_currency_match, s) if s is not censored: already_link = change_link_href(string_el, "http://www.slowerinternet.com") if not already_link: #print "Must create a link!" link = soup.new_tag("a", href="http://www.slowerinternet.com") link.string = censored string_el.replace_with(link) else: string_el.replace_with(censored) for img in images: if img.get("src") and adblock_filter.match(img["src"]): img["src"] = "https://docs.python.org/favicon.ico" for iframe in iframes: if iframe.get("src") and adblock_filter.match(iframe["src"]): iframe["src"] = "http://example.com" return soup
def key_sector(key_sector): url_sec = 'https://de.finance.yahoo.com/quote/' + entry_list + '/holdings?p=' + entry_list req_sec = r.get(url_sec) dat_sec = BeautifulSoup(req_sec.content, 'html.parser') cont_sec = dat_sec.body('div', {'class': 'Mb(25px)'}) df_sec = pd.DataFrame(cont_sec[1]) sec = df_sec[0].astype(str).str.split('</span>').to_list() df_sec2 = pd.DataFrame(sec).dropna().transpose() sec2 = df_sec2[1].astype(str).str.split('">').to_list() df_sec3 = pd.DataFrame(sec2) sec_industry = df_sec3[4][1:].dropna().reset_index().drop(columns=['index']) sec_percent = df_sec3[1].str.replace(',', '.').str.replace('%', '').apply(pd.to_numeric, errors='coerce').dropna().reset_index().drop(columns=['index']) df_merge = pd.merge(sec_industry, sec_percent, left_index=True, right_index=True).rename(columns={4: 'Sektor', 1: 'Gewichtung in %'}).sort_values(by=['Gewichtung in %'], ascending=False).reset_index().drop(columns=['index']) return df_merge
def json_from_full_listing(listing_url): print(f'fetching {listing_url}...') resp = requests.get(listing_url) soup = BeautifulSoup(resp.text, 'html.parser') script = [ s for s in soup.body('script') if s.string is not None and 'dataLayer' in s.string ][0] match = re.search('dataLayer = (.*);', script.string) jsonstr = match.groups()[0] last = json.loads(jsonstr)[-1] listing = {**last.get('property', {}), **last.get('sold_property', {})} listing['link'] = listing_url return pd.json_normalize(listing).to_dict(orient='records')[0]
def __init__(self, list_url): self.playlist_video_list = [] r = urlopen(list_url) soup = BeautifulSoup(r, "html.parser") #print 'lolololololl soup' [s.extract() for s in soup.body('script')] tag_list = soup.body.find("div", "pl-video-list") if not tag_list: return tag_titles = tag_list.find_all("tr", "pl-video yt-uix-tile ") tag_imgs = tag_list.find_all("span", "yt-thumb-clip") tag_times = tag_list.find_all('div', 'timestamp') tag_uploaders = tag_list.find_all("a", " yt-uix-sessionlink spf-link ") ##print soup.prettify() #print len(tag_imgs) #print len(tag_titles) #print len(tag_times) #print len(tag_uploaders) have_to_be_modified = [] for n in range(0, len(tag_imgs), 1): if "no_thumbnail" in tag_imgs[n].img['data-thumb']: have_to_be_modified.append(n) ##print "n "+str(n) have_to_be_modified = sorted(have_to_be_modified, reverse=True) for target_list in have_to_be_modified: ##print target_list del tag_titles[target_list] del tag_imgs[target_list] #print "modified" #print len(tag_imgs) #print len(tag_titles) if len(tag_imgs) == len(tag_titles) == len(tag_times) == len( tag_uploaders): for i in range(0, len(tag_titles)): ##print tag_imgs[i].img['data-thumb'] ##print tag_titles[i]['data-title'] ##print tag_times[i].string ##print tag_uploaders[i].string a_pref = playlist_video(i, tag_titles[i]['data-title'], tag_imgs[i].img['data-thumb'], tag_times[i].string, tag_uploaders[i].string) ##print a_pref.img self.playlist_video_list.append(a_pref)
def searchUrl(url, level, searchText): # the root URL is level 0 # do not go to other websites global website netloc = urlparse.urlsplit(url).netloc.split('.') if netloc[-2] + netloc[-1] != website: return global urlList if url in urlList: # prevent using the same URL again return try: # context = ssl._create_unverified_context() urlContent = urllib2.urlopen(url,context=context, verify=False).read() # soup = BeautifulSoup(''.join(urllib2.urlopen(url,context=context).read()), verify=False) urlList.append(url) except: return soup = BeautifulSoup(''.join(urlContent)) # remove script tags c=soup.findAll('script') for i in c: i.extract() # get text content of the URL try: body_texts = soup.body(text=True) except: return text = ''.join(body_texts) # search if text.find(searchText) > -1: print url print # if there are links on the webpage then recursively repeat if level > 0: linkTags = soup.findAll('a') if len(linkTags) > 0: for linkTag in linkTags: try: linkUrl = linkTag['href'] searchUrl(linkUrl, level - 1, searchText) except: pass
def main(page): for i in range(len(header)): if header_Available[i]!=False: url = start_url+'/page-'+str(page)+'#comments' r = requests.get(url,headers=header[i]) r.encoding = 'utf-8' soup = BeautifulSoup(r.text,"lxml") if soup.find(text=re.compile('屏蔽'))==None: print('=============================') print('正在下载第 '+str(page)+' 页') # 存储包含图片地址的标签 img = [] # 筛选img标签的替代方案A: # comparetolist = ['p','div','div','div','li','ol','div','div','div','div','body'] # goodjob(soup.find_all('img',src=True),img,comparetolist) imgall = soup.body('li',id = re.compile("comment-")) for tmp in imgall: img+=tmp.div.find('div',class_ = 'row').find('div',class_ = 'text').find_all('img',src=True) for n,girl in enumerate(img): print(' 第 '+str(n)+' 张',end='') if not girl.has_attr('org_src'): url = girl['src'] with open('妹纸图'+str(page)+'-'+str(n)+url[-4:],'wb') as f: f.write(requests.get(url).content) else: url = girl['org_src'] with open('妹纸图'+str(page)+'-'+str(n)+url[-4:],'wb') as f: f.write(requests.get(url).content) print('...OK!') print('第 '+str(page)+' 页下载完成啦!!!') return True else: if header_Available[i]!=False: header_Available[i]=False print('被屏蔽,正在反屏蔽.....\n User-Agent 可用信息:') show_info() if header_Available[len(header)-1]==False: print('反屏蔽失败,线程终止!\nUser-Agent 可用信息:') show_info() return False
def fuckingweather(self, irc, msg, args, text): """ <zip code>: Displays the weather from http://www.thefuckingweather.com/ """ url = 'http://www.thefuckingweather.com/Where/%s' % utils.web.urlquote(text) try: soup = BeautifulSoup(utils.web.getUrl(url)) find=lambda x,y:soup.body(x,y,limit=1)[0].text temperature = find('span', {'class':'temperature'}) remark = find('p',{'class':'remark'}) flavor = find('p',{'class':'flavor'}) location = find('span',{'id':'locationDisplay'}) celsius = math.floor((int(temperature)-32)*5/9) res = "%s\u00B0F / %s\u00B0C in %s?! %s. %s." % (temperature, celsius, location, remark, flavor) irc.reply(res, prefixNick=True) except: irc.reply("ERROR: IT'S F*****G BROKEN.", prefixNick=True)
def fetchImage(hostname): while(container.empty() == False): url,depth = container.get() if url in visited_url or depth>3 or hostname!=urlparse.urlparse(url).hostname: continue visited_url.append(url) print 'visiting '+url +' at depth:',depth htmlData = urllib2.urlopen(url).read() soup = BeautifulSoup(htmlData) #search imgs in this body section articles = soup.body("article") count = 0 for article in articles: if(article.img['src']==""): continue imgUrl = urlparse.urljoin(url,article.img['src']) appName = article.section.h3.a.string category = article.section.h3.span.string if(imgUrl not in loaded_image_url): print 'imgUrl:' + imgUrl + ' appName:' +appName.encode('big5','ignore') + ' category:'+category.encode('big5','ignore')+' found' loaded_image_url.append(imgUrl) count = count +1; #print "downloading img:" + imgUrl + " ..." #cmd = "wget -P ./../res "+imgUrl #os.system(cmd) #urllib.urlretrieve(imgUrl, "./../res/url"+str(count)+".jpg") #search for adjancy pages links = soup('a') for link in links: if('href' in dict(link.attrs)): newurl = urlparse.urljoin(url,link['href']) container.put((newurl,depth+1))
if line.startswith("#UID:"): uid = line.strip().split("#UID:")[1] elif line.startswith("#DATE:"): date = line.strip().split("#DATE:")[1] elif line.startswith("#URL:"): url = line.strip().split("#URL:")[1] elif line.strip() == "#EOR": print uid, date, url content = unidecode(content.decode("utf8")) if html_parser == "bs4": soup = BeautifulSoup(content, "html.parser") tags_to_remove = ["script"] for tag in tags_to_remove: for x in soup.body(tag): x.decompose() text = soup.body.get_text() elif html_parser == "justext": paragraphs = justext.justext(content, justext.get_stoplist('English')) text = "\n" for paragraph in paragraphs: if not paragraph.is_boilerplate: # and not paragraph.is_header: text = text + paragraph.text + "\n" elif html_parser == "boilerpipe": extractor = Extractor(extractor='ArticleExtractor', html=content) text = extractor.getText() elif html_parser == "html":
print tag.name, tag.string print '##############根据属性名查找(1):' # 定位属性名为sister的标签(注意:由于class是python的关键词,所以需要写成class_='*') # 但是通过 class_ 参数搜索有指定CSS类名的tag 是从Beautiful Soup的4.1.1版本才开始; for tag in soup.find_all(class_="sister"): print tag.name, tag.string print '##############根据属性名查找(2):' for tag in soup.find_all(id=re.compile("link"), limit=3): print tag.name, tag.string # find_all() 几乎是Beautiful Soup中最常用的搜索方法, 所以我们定义了它的简写方法. # BeautifulSoup对象 和 tag对象可以被当作一个方法来使用 # find_all的简写方法 print '##############简写find_all:' for tag in soup(class_="sister", limit=1): print tag.name, tag.string # 当设置limit=1,等价于使用find()方法 # 区别: # 唯一的区别是 find_all() 方法的返回结果是值包含一个元素的列表(能够 for in 遍历),而find()方法直接返回结果. # find_all() 方法没有找到目标是返回空列表, find()方法找不到目标时,返回 None . print '##############find:' item_tag = soup.find(class_="sister") print item_tag, item_tag.string print "body中id='link2'的标签:", soup.body.find_all(id="link2") # 简写: print "body中id='link2'的标签:", soup.body(id="link2")
def get_car_info_from_car_detail_page(page_url):
    """Scrape a single Encar car-detail page into a CarInfo object.

    Follows three auxiliary pop-up pages (dealer/company info, inspection
    record, insurance record) keyed by the car id found in the page's
    WT.z_CarId meta tag.  Returns None when the page cannot be opened or is
    not a valid car page.  All original Korean comments translated to English.
    """
    url_request = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        connection = urlopen(url_request)
    except (ValueError, KeyError) as e:
        print('URL open error with car page')
        return None
    car_page = connection.read()
    connection.close()
    soup = BeautifulSoup(car_page, 'lxml')
    # Page validation: a real car page has a 'car_info' div.
    find_iter = soup.find_all('div', attrs={'class': 'car_info'})
    if 0 == len(find_iter):
        return None
    # ----------------------------------------------------------------
    # Basic vehicle info: car id from the WT.z_CarId meta tag.
    # ----------------------------------------------------------------
    find_iter = soup.head.find_all('meta', {'name': 'WT.z_CarId'})
    if 0 == len(find_iter):
        return None
    car_id = int(find_iter[0].attrs.get('content'))
    current_car = CarInfo(car_id)
    # (A large commented-out block that read state/price/maker/transmission/
    #  fuel/category/year/month/mileage/condition/color from the other WT.*
    #  meta tags was removed here for readability; restore from VCS if the
    #  meta-tag path is ever needed again.)
    current_car.type_ = soup.body('span', class_='cls')[0].em.string
    current_car.model_ = soup.body('span', class_='dtl')[0]('strong')[-1].text
    current_car.modelCode_ = soup.body('span', class_='dtl')[0].em.string
    # Harvest the hidden inputs of the carDetail form at the page bottom.
    for input_field in soup.body('form', {'name': 'carDetail'})[0]('input'):
        if not input_field.has_attr('id'):
            continue
        current_car.set_info(input_field.attrs.get('id'), input_field.attrs.get('value'))
    # Mid-page stat list: plate number, displacement, fuel efficiency and
    # manufacturer-warranty flag.
    stat_detail = soup.body('ul', class_='stat_detail')[0]('li')
    for stat in stat_detail:
        if '차량번호' == stat.span.string:
            current_car.plateNumber_ = stat.text.split()[1]
        elif '배기량:' == stat.span.string:
            current_car.displacement_ = stat.text.split(':')[1]
        elif '연비:' == stat.span.string:
            current_car.fuelEfficiency_ = stat.text.split(':')[1]
        elif '수입형태:' == stat.span.string:
            # 'X' in the text means no manufacturer warranty.
            if 'X' in stat.text:
                current_car.warranty_ = False
            else:
                current_car.warranty_ = True
    # Lease information, when the listing has a lease table.
    for lease_table in soup.body('ul', class_='brd_price'):
        for dl in lease_table('dl'):
            if '인수비용' == dl.dt.text:
                current_car.leaseCost_ = dl.dd.text
            elif '월리스료' == dl.dt.text:
                current_car.leaseMonthlyPay_ = dl.dd.text
            elif '잔여개월' == dl.dt.text:
                months = dl.dd.text.split('/')
                current_car.leaseLeftMonths_ = months[0]
                current_car.leaseTotalMonths_ = months[1].replace('개월', '')
            else:
                print('unknown information for lease')
    # ----------------------------------------------------------------
    # Dealer and company info (requires opening a separate pop-up page).
    # ----------------------------------------------------------------
    url = 'http://www.encar.com/dc/dc_carsearchpop.do?method=companyInfoPop&carTypeCd=1&carid={0}'.format(str(car_id))
    url_request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        connection = urlopen(url_request)
    except (ValueError, KeyError) as e:
        print('URL open error with dealer page')
        return None
    dealer_page = connection.read()
    connection.close()
    dealer_soup = BeautifulSoup(dealer_page, 'lxml')
    dealer_company = dealer_soup.body('table', {'class', 'viewinfo'})[0].find_all('td')[0].string
    # Dealer's own name comes from the main page.
    dealer_name = soup.body('div', {'class', 'dealer'})[0]('div', {'class', 'info'})[0].strong.string
    current_car.dealer_ = '{0}({1})'.format(dealer_name, dealer_company)
    # ----------------------------------------------------------------
    # Options: [0] basic (only 'on' ones), [1] etc., [2] free-text extras.
    # ----------------------------------------------------------------
    options = soup.body('div', {'class', 'box_opt'})
    options_basic = options[0]('dd', {'class', 'on'})
    for i in range(len(options_basic)):
        current_car.option_.set_option(options_basic[i].a.string)
    if 1 < len(options):
        options_etc = options[1]('dd')
        for i in range(len(options_etc)):
            current_car.option_.set_option(options_etc[i].string)
    if 2 < len(options):
        current_car.option_.additionalOptions_ = options[2].p.string
    # ----------------------------------------------------------------
    # Inspection record — only when the detail page shows the button.
    # ----------------------------------------------------------------
    if 0 < len(soup.find_all('a', {'class': 'btn_detail'}, text='성능점검 자세히 보기')):
        url = 'http://www.encar.com/md/sl/mdsl_regcar.do?method=inspectionView&carid={0}'.format(str(car_id))
        url_request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            connection = urlopen(url_request)
        except (ValueError, KeyError) as e:
            current_car.inspection_.bExist_ = False
        else:
            current_car.inspection_.bExist_ = True
            inspection_page = connection.read()
            connection.close()
            inspection_soup = BeautifulSoup(inspection_page, 'lxml')
            table_body = inspection_soup.body('table', class_='ckst')[0].tbody
            table_field_names = table_body('th')
            table_field_values = table_body('td')
            # Depending on the vehicle, accident and submersion may be one
            # combined row and the motor-type row may be absent.
            for i in range(len(table_field_names)):
                if '연식' in table_field_names[i].text:
                    current_car.inspection_.strYear_ = table_field_values[i].text
                elif '차대번호' == table_field_names[i].text:
                    # Exact == match: overlaps the VIN-matching row otherwise.
                    current_car.inspection_.strVIN_ = table_field_values[i].text
                elif '최초등록일' in table_field_names[i].text:
                    current_car.inspection_.strFirstRegistrationDate_ = table_field_values[i].text
                elif '동일성확인' in table_field_names[i].text:
                    current_car.inspection_.strVINMatching_ = remove_legacy_characters(table_field_values[i].text)
                elif '주행거리' in table_field_names[i].text:
                    current_car.inspection_.strMileage_ = table_field_values[i].text.split()[0].replace(',', '')
                elif '변속기종류' in table_field_names[i].text:
                    # NOTE(review): transmission row is stored into strVIN_ —
                    # looks like a copy/paste slip in the original; confirm.
                    current_car.inspection_.strVIN_ = remove_legacy_characters(table_field_values[i].text)
                elif '사고유무' == table_field_names[i].text:
                    if '무' not in table_field_values[i].text:
                        current_car.inspection_.bDamaged_ = True
                elif '침수유무' == table_field_names[i].text:
                    if '무' not in table_field_values[i].text:
                        current_car.inspection_.bSubmerged_ = True
                elif '사고/침수유무' == table_field_names[i].text:
                    if '무' not in table_field_values[i].text:
                        # Rare combined row — report it when seen while crawling.
                        print('유사고 표기방법: ' + table_field_values[i].text)
                elif '원동기형식' in table_field_names[i].text:
                    current_car.inspection_.strMotorType_ = table_field_values[i].text
                elif '보증유형' in table_field_names[i].text:
                    current_car.inspection_.strWarrantyType_ = table_field_values[i].text
                elif '불법구조변경' in table_field_names[i].text:
                    if '없음' not in table_field_values[i].text:
                        current_car.inspection_.bIllegalRemodeling_ = True
                elif '검사유효기간' in table_field_names[i].text:
                    current_car.inspection_.strTermOfValidity_ = table_field_values[i].text
            # Repaired-part history: [1] structural, [0] exterior.
            repair_inspections = inspection_soup.body('dl', class_='section_cktxt')[0]('dd')
            if 1 < len(repair_inspections):
                structure_repair_list = repair_inspections[1]('span', {'class': 'on'})
                for repair_inst in structure_repair_list:
                    current_car.inspection_.listStructureRepairs_.append(repair_inst.text)
            if 0 < len(repair_inspections):
                exterior_repair_list = repair_inspections[0]('span', {'class':'on'})
                for repair_inst in exterior_repair_list:
                    current_car.inspection_.listExteriorRepairs_.append(repair_inst.text)
            # Revised inspection sheet: walk the tables collecting
            # section_item names and their checked ('on') values.
            status_name_list = []
            status_value_list = []
            new_inspection_tables = inspection_soup.body('table', class_='ckstl ckdata')[0]('tbody')
            for table in new_inspection_tables:
                cur_section_name = table.th.text + '_'
                cur_mid_section_name = ''
                for tr in table('tr'):
                    for td in tr('td'):
                        if td.has_attr('rowspan'):
                            # Mid-level section header.
                            cur_mid_section_name = td.text + '_'
                        else:
                            if td.has_attr('colspan'):
                                # Item cell with no mid section.
                                if 3 == int(td.attrs.get('colspan')):
                                    # Multi-line result (e.g. exhaust gas).
                                    status_name_list.append(cur_section_name[:-1])  # drop trailing '_'
                                    status_value_list.append(td.text)
                                else:
                                    status_name_list.append(cur_section_name + td.text)
                                cur_mid_section_name = ''  # rowspan ended
                            else:
                                cur_status = td('span', {'class': 'on'})
                                if 0 < len(cur_status):
                                    # Checked result value.
                                    status_value_list.append(cur_status[0].text)
                                else:
                                    # Item name cell.
                                    status_name_list.append(cur_section_name + cur_mid_section_name + td.text)
            # Store the collected name/value pairs.
            for i in range(len(status_name_list)):
                current_car.inspection_.set_item(status_name_list[i], status_value_list[i])
    # ----------------------------------------------------------------
    # Insurance record.  The pop-up page always exists; the record does not,
    # so bExist_ is only set when the 'smlist' table is present.
    # ----------------------------------------------------------------
    url = 'http://www.encar.com/dc/dc_cardetailview.do?method=kidiFirstPop&carid={0}'.format(str(car_id))
    url_request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    current_car.insurance_.bExist_ = False
    try:
        connection = urlopen(url_request)
    except (ValueError, KeyError) as e:
        print('no insurance page')
    else:
        insurance_page = connection.read()
        connection.close()
        insurance_soup = BeautifulSoup(insurance_page, 'lxml')
        smlist = insurance_soup.body('div', class_='smlist')
        if 0 < len(smlist):
            current_car.insurance_.bExist_ = True
            tr_list = smlist[0]('tr')
            for tr in tr_list:
                # Each row is identified by its bullet image.
                image_src = tr.img.get('src')
                if '/images/es/car_num2_2.gif' == image_src:
                    # Vehicle-purpose change history.
                    current_car.insurance_.set_change_purpose(tr('td')[1].text.split()[0])
                elif '/images/es/car_num2_3.gif' == image_src:
                    # Plate-number / owner change history.
                    input_numbers = tr('td')[1].text.split('/ ')
                    current_car.insurance_.set_change_plate_number(input_numbers[0].replace('회', ''))
                    current_car.insurance_.set_change_owner(input_numbers[1].replace('회', ''))
                elif '/images/es/car_num2_4.gif' == image_src:
                    # Damage history.
                    current_car.insurance_.set_damages(tr('td')[1].text)
                elif '/images/es/car_num2_5.gif' == image_src:
                    current_car.insurance_.set_compensation_self(tr('td')[1].text)
                elif '/images/es/car_num2_6.gif' == image_src:
                    current_car.insurance_.set_compensation_others(tr('td')[1].text)
    # Free-text seller description.
    current_car.description_ = soup.body('div', {'class', 'wrp_car_info'})[0]('div')[0].pre.string
    return current_car
def render():
    """Flask view: render the URL in ?q= to a PDF and store it in GridFS.

    Tries wkhtml-style rendering via gistfile2.py under xvfb first; on a
    segfault (retcode 139) or other failure falls back to phantomjs
    rasterize.js.  Redirects to /view/<oid> on success, otherwise renders the
    error page.  Python 2 / legacy-stack code (urllib2, urlparse,
    readability).  NOTE(review): branch nesting reconstructed from a
    collapsed source line — confirm against the original.
    """
    # magic number for unique file name
    magicnum = '_' + str(int(time.time()))
    from urllib import unquote
    query = request.args.get('q')
    url = unquote(query)
    # Refuse to render our own site.
    if 'htmlkepdf.com' in url:
        return redirect('/', 301)
    from urlparse import urlparse
    addr = urlparse(url).netloc
    filename = addr.replace('.', '_')
    from readability.readability import Document
    import urllib2
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        html = opener.open(url).read()
    except:
        # Fall back to requests when urllib2 fails.
        html = requests.get(url).content
    readable_article = Document(html).summary()
    readable_title = Document(html).short_title()
    # clean up text with bs4
    soup = BeautifulSoup(readable_article)
    readable_article = soup.body(text=True)
    readable_article = ' '.join(readable_article)
    # find meta, just in case readable article is too short
    soup2 = BeautifulSoup(html)
    try:
        metadesc = soup2.find('meta', {'name': 'description'})['content']
    except:
        metadesc = ''
    # Known issue: facebook.com must be fetched over https.
    p1 = Popen('xvfb-run --auto-servernum --server-num=1 python gistfile2.py ' + '"' + url + '"' + ' ' + filename + '.pdf', shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
    stdout, stderr = p1.communicate()  # wait
    retcode = p1.returncode
    if retcode == 0:
        with open(filename + '.pdf') as f:
            # Filenames get the timestamp suffix to avoid duplicates.
            oid = fs.put(f, content_type='application/pdf', filename=filename+magicnum, title=readable_title, article=readable_article, update=datetime.datetime.now(), url=url, metadesc=metadesc)
            os.remove(filename + '.pdf')
            return redirect('/view/' + str(oid))
    elif retcode == 139:
        # Renderer segfaulted — retry with phantomjs.
        p2 = Popen('phantomjs rasterize.js ' + '"' + url + '"' + ' ' + filename + '.pdf', shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
        stdout, stderr = p2.communicate()  # wait
        retcode = p2.returncode
        if retcode == 0:
            with open(filename + '.pdf') as f:
                oid = fs.put(f, content_type='application/pdf', filename=filename+magicnum, title=readable_title, article=readable_article, update=datetime.datetime.now(), url=url, metadesc=metadesc)
                os.remove(filename + '.pdf')
                return redirect('/view/' + str(oid))
    else:
        # Any other failure — also retry with phantomjs.
        p2 = Popen('phantomjs rasterize.js ' + '"' + url + '"' + ' ' + filename + '.pdf', shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
        stdout, stderr = p2.communicate()  # wait
        retcode = p2.returncode
        if retcode == 0:
            with open(filename + '.pdf') as f:
                oid = fs.put(f, content_type='application/pdf', filename=filename+magicnum, title=readable_title, article=readable_article, update=datetime.datetime.now(), url=url, metadesc=metadesc)
                os.remove(filename + '.pdf')
                return redirect('/view/' + str(oid))
    # Otherwise show the error page with the subprocess stderr.
    stderror = stderr
    return render_template("error.html", query=query, addr=addr, title=readable_title, stderror=stderror, retcode=retcode)
def crawl_espn_projection_page(projections=None, next_page=None, **params): """Crawls the ESPN Page and returns the projection data as a dict""" if next_page: response = requests.get(next_page) else: response = requests.get(ESPN_PROJECTIONS_URL, params) print response.url soup = BeautifulSoup(response.content, 'html.parser') pagination_nav = soup.body.find(class_='paginationNav') for item in pagination_nav.find_all('a'): if 'NEXT' in item.contents: next_page = item['href'] projections = {} if projections is None else projections player_rows = soup.body(class_='pncPlayerRow') for row in player_rows: projection = {} valid = True for i, cell in enumerate(row.find_all('td')): if i == 0: # Find Name, Team, and Position name = cell.a.string if 'D/ST' in cell.contents[1]: team = get_team(cell.contents[0].string.split()[0].strip().lower()) projection['name'] = get_name(name, team, 'dst') projection['team'] = get_team(team) projection['position'] = 'dst' else: splits = cell.contents[1].split() team = splits[1] position = splits[2] # No Free Agents if team == 'FA': valid = False break projection['name'] = get_name(name, team, position) projection['team'] = get_team(team) try: projection['position'] = get_position(position) except Exception: # Remove kickers and the like. valid = False break if i == 1: # Find opponent and whether or not team is home or away if cell.a is None: valid = False break text = cell.a.string if text[0] == '@': projection['home'] = False projection['opponent'] = get_team(text[1:]) else: projection['home'] = True projection['opponent'] = get_team(text) elif i == 3: projection['receptions'] = float(cell.string.split('/', 1)[0]) elif i in range(4, 14): _populate_stats(i, cell, projection) if valid: calculate_ppr(projection) projections[projection['name']] = projection if next_page and len(projections) < 500: time.sleep(0.250) return crawl_espn_projection_page(projections=projections, next_page=next_page) else: return projections
temp_dict['Snippet'] = message['snippet'] # fetching message snippet try: # Fetching message body mssg_parts = payld['parts'] # fetching the message parts part_one = mssg_parts[0] # fetching first element of the part part_body = part_one['body'] # fetching body of the message part_data = part_body['data'] # fetching data from the body clean_one = part_data.replace("-","+") # decoding from Base64 to UTF-8 clean_one = clean_one.replace("_","/") # decoding from Base64 to UTF-8 clean_two = base64.b64decode (bytes(clean_one, 'UTF-8')) # decoding from Base64 to UTF-8 soup = BeautifulSoup(clean_two , "lxml" ) mssg_body = soup.body() # mssg_body is a readible form of message body # depending on the end user's requirements, it can be further cleaned # using regex, beautiful soup, or any other method temp_dict['Message_body'] = mssg_body except : pass print (temp_dict) final_list.append(temp_dict) # This will create a dictonary item in the final list # This will mark the messagea as read GMAIL.users().messages().modify(userId=user_id, id=m_id,body={ 'removeLabelIds': ['UNREAD']}).execute()
# find can navigate the parse tree as well. findParents, findNextSiblings,findPreviousSiblings all work # similar to findAll, but will search only within those branches of the tree. # findNext, findPrevious and findAllNext and findAllPrevious can be used to find matches starting from # a specified point. # Let's say you want the text of the first paragraph after the first occurrence of the text "Google" soup.find(text="Google").findNext('p').text # In[ ]: # A little shortcut to using findAll - if you call the tag itself as a function, you can use it in place of findAll # with the same arguments soup.body('p') # In[ ]: soup.findAll('p') # In[ ]: #BeautifulSoup makes parsing html or xml very intuitive and elegant. Doing the same thing with regular expressions # is prone to leading you to pulling your hair out :) In most situations for screen-scraping projects, BeautifulSoup # is a life-saver! # In[ ]:
import urllib2 #Handle Arguments if len(sys.argv) < 5: print "USAGE: " + sys.argv[0] + " TARGET_URL FIRSTNAME_FILE LASTNAME_FILE OUTPUT_FILE" sys.exit() target = sys.argv[1] firstNameLocation = sys.argv[2] lastNameLocation = sys.argv[3] outLocation = sys.argv[4] #Grab URL Text page = urllib2.urlopen(target) soup = BeautifulSoup(page) pageText = soup.body(text = True) #Split URL Text into Words pageTextString = "" for each in pageText: pageTextString += each pageTextString = pageTextString.split(" ") trimmedPageText = [] for each in pageTextString: trimmedPageText.append(each.lower().replace("\n", " ")) #Load Name Lists firstNameFile = open(firstNameLocation, 'r') firstNameList = [] for each in firstNameFile:
def extractText(self): soup = BeautifulSoup(self.request.content) return "".join(soup.body(text=True))
for script in soup.findAll('script'): script.extract() for link in soup.findAll('a', href=True): if len(link['href']) > 9: pat = re.compile(r'^http').findall(link['href']) if pat: href=re.compile(r"/$").sub('',link['href']) temp=re.compile(r"\.").split( href.lower()) size = len(temp) size = size -1 ext=temp[size] if mime.has_key(ext): err=1 else: urls.append(href) body = soup.body(text=True) body = ' '.join(body) body=convertAccents(body) # body=cleanHTML(body) title=convertAccents(title) title=cleanHTML(title) try: body=unicodedata.normalize('NFKD',body).encode('ascii', 'ignore') except: err=1 try: title=unicodedata.normalize('NFKD',title).encode('ascii', 'ignore') except: err=1 body=re.compile(r'\n').sub(' ',body) body=re.compile(r'[ ]+').sub(' ',body)