def comment():
    url = 'http://www.espncricinfo.com/australia-v-india-2015-16/engine/match/895815.html?innings='
    url += str(Team_id)
    url += ';view=commentary'
    source_code = requests.get(url)  # fetch the page source
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    soup.encode('UTF-8')
    # Collect the ball-by-ball commentary text
    for link in soup.findAll('div', {'class': 'commentary-text'}):
        title = link.contents[1].encode('UTF-8')
        text.append(str(title))
        #print(title)
        #overs.append(title)
    # Collect the matching over numbers
    for link in soup.findAll('div', {'class': 'commentary-overs'}):
        title1 = link.string
        over.append(str(title1))
    clean_up_list(text)
def play(url):
    source_code = requests.get(url)  # source code of page
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    soup.encode("UTF-8")
    h, m, s = 0, 0, 0
    count = 0
    for link in soup.findAll("div", {"class": "timestamp"}):
        count += 1
        time = link.string.split(":")
        if len(time) == 3:
            h += int(time[0])
            m += int(time[1])
            s += int(time[2])
        else:
            m += int(time[0])
            s += int(time[1])
    ts = h * 60 * 60 + m * 60 + s
    m, s = divmod(ts, 60)
    h, m = divmod(m, 60)
    d, h = divmod(h, 24)
    print("Total Videos : ", count)
    print(d, "Days", h, "Hours", m, "Minutes", s, "Seconds")
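# A small, self-contained check (added) of the divmod arithmetic used in play() above:
# 1 day, 2 hours, 3 minutes and 4 seconds expressed as total seconds and converted back.
ts = ((24 + 2) * 60 + 3) * 60 + 4
m, s = divmod(ts, 60)
h, m = divmod(m, 60)
d, h = divmod(h, 24)
assert (d, h, m, s) == (1, 2, 3, 4)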
def scrape(self, links=[], ads=True, translator=False):
    print "Scraping ad pages..."
    responses = []
    data = []
    urls = self.generate_pages(self.base_url)
    for url in urls:
        print "Scraping URL:", url
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        soup.encode('utf-8')
        values = {}  # fresh dict per ad so earlier entries are not overwritten
        values["title"] = self.get_ad_title(soup)
        # values["phone_numbers"]
        values["text_body"] = self.get_ad_text(soup)
        values["images"] = self.get_ad_images(soup)
        values["link"] = url
        values["posted_at"] = self.get_ad_date(soup)
        values["scraped_at"] = str(datetime.datetime.now())
        values["language"] = "Spanish"  # Being lazy here.
        # values["polarity"]
        # values["translated_body"]
        # values["translated_title"]
        # values["subjectivity"]
        time.sleep(3)
        data.append(values)
    return data
def generate_pages(self, url):
    """ Creates a list of URLs containing ads for further scraping. """
    print "Fetching pages..."
    urls = []
    while True:
        print url
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        soup.encode('utf-8')
        ad_links = self.get_ad_links(soup)
        for link in ad_links:
            urls.append(link)
        next = soup.find_all("a", {"class": "num_next"})
        if next:
            # Fetches the current page.
            current_page = int(soup.find_all("div", {"class": "num_sel"})[0].text)
            # Generates the link for the next page.
            new_url = str(''.join([url.split("?")[0], "?p=", str(current_page + 1)]))
            url = new_url
            time.sleep(3)
        else:
            break
    # Note: List needs to be uniquified.
    return urls
def getHTML():
    url = "https://en.wikipedia.org/wiki/List_of_Super_Bowl_champions"
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    soup.encode('utf8')
    soup.prettify()
    extract_info(soup)
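# Note: BeautifulSoup's encode() and prettify() return a new bytes/str object rather than
# modifying the soup in place, so bare calls like soup.encode('utf8') in the snippets above
# discard their result. A minimal, self-contained sketch (hypothetical markup) showing the
# return values being captured:
from bs4 import BeautifulSoup

doc = BeautifulSoup("<p>hello</p>", "html.parser")
encoded = doc.encode("utf-8")   # bytes: b"<p>hello</p>"
pretty = doc.prettify()         # str with one node per line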
def seating_details(request, lan_id, seating_id=None, seat_id=None):
    lan = get_object_or_404(LAN, pk=lan_id)
    seatings = Seating.objects.filter(lan=lan)
    if not seatings:
        return render(request, 'seating/seating.html')
    if seating_id:
        seating = get_object_or_404(Seating, pk=seating_id, lan=lan)
    else:
        seating = seatings[0]
        return redirect(seating)
    users = seating.get_user_registered()
    seats = seating.get_total_seats()
    dom = BeautifulSoup(seating.layout.template, "html.parser")
    counter = 0
    for tag in dom.find_all('a'):
        children = tag.find_all('rect')
        children[0]['seat-number'] = seats[counter].pk
        children[0]['seat-display'] = seats[counter].placement
        if not seats[counter].user:
            children[0]['class'] = ' seating-node-free'
            children[0]['status'] = "free"
        else:
            if seats[counter].user == request.user:
                children[0]['class'] = ' seating-node-self'
                children[0]['status'] = "mine"
            else:
                children[0]['class'] = ' seating-node-occupied'
                children[0]['status'] = "occupied"
            children[0]['seat-user'] = unicode(seats[counter].user.get_full_name())
            # Separate title element for Chrome support
            title = dom.new_tag("title")
            title.string = unicode(seats[counter].user.get_full_name())
            tag.append(title)
        counter += 1
    dom.encode("utf-8")
    context = {}
    context['seatings'] = seatings
    context['seating'] = seating
    context['seat'] = seat_id
    context['hide_sidebar'] = True
    context['template'] = dom.__str__
    return render(request, 'seating/seating.html', context)
def wp_reformat(content):
    content = BeautifulSoup(content)
    for img in content.find_all('img'):
        src = img['src']
        src = re.sub('http://kecebongsoft.files.wordpress.com/(\d+)/(\d+)/',
                     r'/img/wordpress/\1-\2-', src)
        img.replace_with('![image](%s)' % src)
    #content = re.sub('http://kecebongsoft.files.wordpress.com/(\d+)/(\d+)/', r'/img/wordpress/\1-\2-', content)
    #content = re.sub('<im.*src=(\'|")(.*)["\'].*>', r"![image](\2)", content)
    content = re.sub('\[caption.*?\](.|\n)*?\!(.*?\))(.|\n)*?caption]', r'\2',
                     content.encode('ascii', 'ignore'))
    content = re.sub('\[source.*?\]((.|\n)*?)\[/sourcecode\]', r'\t:::txt\1',
                     content.encode('ascii', 'ignore'))
    return content
def parse(source):
    """Pulls out the paste"""
    soup = BeautifulSoup(source, "html5lib")
    soup.encode("utf8")
    uri = soup.find("title")
    uri = uri.encode("utf-8")
    uri = uri[7:-19]
    #print uri
    title = soup.find('div', {'class': 'modal-body'}).get_text().strip()
    newline = title.index('\n')
    title = title[38:newline - 1]
    paste = soup.find('div', {'id': 'code'}).get_text().strip()
    return title, paste
def serverlist(url):
    link = urllib2.urlopen(url).read()
    soup = BeautifulSoup(link.decode("utf-8"))
    epis = soup("p", {"class": "epi"})
    for i in range(0, len(epis)):
        etitle = BeautifulSoup(str(epis[i]))("b")[0].contents[0]
        addDir(etitle.encode("utf-8"), url, 3, iconimage, False, i, gname)
def doctoText(filepath):
    """
    Returns a string of text from the input file.
    Created the if statement for future file formats.
    Link below provided partial code.
    http://davidmburke.com/2014/02/04/python-convert-documents-doc-docx-odt-pdf-to-plain-text-without-libreoffice/
    """
    if filepath[-4:] == ".pdf":
        return convertpdftoText(filepath)
    elif filepath[-5:] == ".docx":
        document = opendocx(filepath)
        paratextlist = getdocumenttext(document)
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        return "\n\n".join(newparatextlist)
    else:
        with open(filepath, "rb") as myfile:
            try:
                # cleans html, removes tags
                htmldata = myfile.read()
                edata = htmldata.decode("utf-8", "strict")
                raw = BeautifulSoup(edata).get_text()
                cleanedhtml = raw.encode("utf-8", "strict")
                return cleanedhtml
            except:
                data = myfile.read()
                return str(data)
def search_weibo(url):
    #url = 'http://s.weibo.com/weibo/%25E5%2591%25A8%25E7%25A5%2589%25E6%2580%2580&Refer=index'
    url = 'http://s.weibo.com/weibo/%E8%91%A3%E5%B4%87%E6%B4%8B&Refer=index'
    #url = 'http://s.weibo.com/weibo/%25E5%25BC%25A0%25E5%25A9%25B7?topnav=1&wvr=6&b=1'
    #url = 'http://www.weibo.com/u/3075975003?from=myfollow_all'
    page = urllib.urlopen(url)
    content = page.read()
    #print content
    transcode = content.decode('utf-8', 'ignore').encode('utf-8', 'ignore')
    #print type(content)
    #transcode = content.decode('gbk', 'ignore').encode('utf-8', 'ignore')
    o = BeautifulSoup(transcode)
    #print o
    #print type(o)
    #print dir(o)
    #print o.find_all('p')
    #print o.findAll('p')
    #print o.title
    f = open('web.txt', 'w+')
    '''
    for item in o:
        f.write("%s" % item)
    f.close()
    '''
    open('origin.txt', 'w+').write(transcode)
    f.write(o.encode('gbk'))
    f.close()
def get_solution(url):
    #url = 'https://community.topcoder.com/stat?c=problem_solution&cr=40440099&rd=16747&pm=14278'
    #url = 'https://community.topcoder.com/stat?c=problem_solution&rm=329103&rd=16775&pm=14340&cr=23089515'
    #url = 'https://community.topcoder.com/stat?c=problem_solution&cr=40364957&rd=16747&pm=14278'
    print url
    #tcsso = 'b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'
    #cookies = dict()
    #cookies['tcsso'] = '40451530|b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'
    #cookies['JSESSIONID'] = 'UYKd7Rv1-OY-6bmewBWJDw**.tomcat_tc01'
    print cookies
    page = requests.get(url, cookies=cookies)
    #print page
    if str(page) == "<Response [503]>":
        while str(page) == "<Response [503]>":
            time.sleep(1)
            page = requests.get(url, cookies=cookies)
    html_content = page.text
    #print html_content[0:100000]
    #soup = BeautifulSoup(html_content, "html.parser")
    #text = soup.select("body > table > tbody > tr > td.bodyText > table.paddingTable > tbody > tr:nth-child(1) > td > table:nth-child(4) > tbody > tr:nth-child(13) > td")
    body = re.findall('<TD CLASS="problemText" COLSPAN="8" VALIGN="middle" class="alignMiddle" ALIGN="left">\n (.+?)<BR>\n </TD>',
                      html_content, flags=re.S)
    text = body[0]
    text = text.replace("<BR>", "\n")
    #print repr(text)
    print text
    failed_to_download = None
    solution = None
    if len(text) == 0:
        failed_to_download = solution_id
    else:
        body = BeautifulSoup(str(text), "html.parser").get_text()
        body = body.replace("\\", "\\\\")
        solution = body.encode('utf-8').decode('string-escape')
        #print repr(solution)
        #print solution
    return solution
def updateMahoyo(self, progress):
    progress = BeautifulSoup(progress)
    exception = self.master.modules["commands"].exception
    user = yield self.config.get("user")
    passwd = yield self.config.get("pass")
    if user is None or passwd is None:
        raise exception(u"No blog username or password in config")
    blog = XMLRPC("http://commiesubs.com/xmlrpc.php")
    post = yield blog.callRemote("wp.getPost", 0, user, passwd, 8367)
    content = BeautifulSoup(post["post_content"])
    old = content.find(class_="progress")
    new = progress.find(class_="progress")
    old.replace_with(new)
    content = content.encode(formatter="html")
    try:
        yield blog.callRemote("wp.editPost", 0, user, passwd, 8367, {"post_content": content})
    except:
        raise exception(u"Couldn't update post")
def search_pmc(self, pmc_id):
    """Fetch a document's content by its PMC id."""
    query = self.baseURL + "efetch.fcgi?db=pmc&id={id}".format(id=pmc_id)
    self.logger.info("search the content of pmc document based on its 'id'={id}. searching url={url}".format(id=pmc_id, url=query))
    try:
        content = BS(urlopen(query, timeout=TIMEOUT).read(), 'lxml')
    except:
        content = ""
    if "<?properties open_access?>" in content.encode('utf-8'):
        self.logger.info("access the open_access document. pmc_id={pid}".format(pid=pmc_id))
        parsed_doc = self.eutils_parse_doc(content)
        return {
            "pmc_id": pmc_id,
            "abstract": parsed_doc[0],
            "body": parsed_doc[1],
            "entry_created_date": str(datetime.datetime.utcnow())
        }
    else:
        return ""
    # NOTE: the web-crawler fallback below is unreachable because both branches above return.
    self.logger.info("cannot access the document (pmc_id={pid}). Crawl it by web crawler!".format(pid=pmc_id))
    parsed_doc = self.advanced_parse_doc(pmc_id)
    return {
        "pmc_id": pmc_id,
        "abstract": parsed_doc[0],
        "keyword": parsed_doc[1],
        "body": parsed_doc[2],
        "bib": parsed_doc[3],
        "entry_created_date": str(datetime.datetime.utcnow())
    }
def scrapSatCen():
    print("#========================= SatCen SCRAPING =========================")
    SatCenData = satcen.returnAgency('SATCEN')
    SatCen_link = SatCenData['link'][0]
    SatCen_id = SatCenData['id'][0]
    SatCen_source = urllib.request.urlopen(SatCen_link)
    # Retrieve the list of jobs as a bs4 navigable string
    soup = BeautifulSoup(SatCen_source, 'html.parser')
    # Convert to bytes
    bytesEncoded = soup.encode('utf-8')
    # Convert to string
    stringDecoded = bytesEncoded.decode('utf-8')
    # Convert to dictionary
    jobsdict = json.loads(stringDecoded)
    # Browse the dictionary and select available positions
    for job in jobsdict:
        if (job['Status'] == 'OPEN') and (job['InternalOnly'] == False):
            link = 'https://apps.satcen.europa.eu/recruitment/#/vacancy/' + job['Id']
            print(job['Reference'], job['ExpireOn'][:10], job['Title'],
                  format.typeOfPost(job['TypePost']), job['WorkUnit'], link)
            satcen.persist(SatCen_id, job['Title'], job['Reference'], job['WorkUnit'], '',
                           job['ExpireOn'][:10], link, '', format.typeOfPost(job['TypePost']))
    print("#========================SATCEN SCRAPING COMPLETE=================================")
def get_correctedFiles(path, save, url, img):
    if not os.path.exists(save):
        os.makedirs(save)
    for f in os.listdir(path):
        print "correcting file %s" % f
        infile = open(os.path.join(path, f)).read()
        soup = BeautifulSoup(infile, "html5lib")
        for tag in soup.find_all(lambda t: 'href' in t.attrs or 'src' in t.attrs):
            if 'href' in tag.attrs:
                url_parts = urlparse.urlsplit(tag.attrs["href"])
                full_path = tag.attrs["href"]
                hrefpath = url_parts.path
                if full_path[0:4] != "http" or full_path[0:5] != " http":
                    # for wiki conversion (moin moin wikis)
                    # hrefpath = hrefpath.replace("/", "|")
                    if hrefpath[0:6] == "|wiki|":
                        hrefpath = hrefpath[6:]
                    tag.attrs["href"] = urlparse.urljoin(url, hrefpath)
            else:
                url_parts = urlparse.urlsplit(tag.attrs["src"])
                srcpath = url_parts.path
                srcparts = srcpath.split("/")
                srcpath = srcparts[len(srcparts) - 1]
                tag.attrs["src"] = urlparse.urljoin(img, srcpath)
        outfile = open(os.path.join(save, f), "w")
        outfile.write(soup.encode("ascii", "xmlcharrefreplace"))
        outfile.close()
def get_solution(contest, solution_id):
    url = 'http://codeforces.com/contest/' + str(contest[0]) + '/submission/' + str(solution_id)
    print url
    page = requests.get(url)
    if str(page) == "<Response [503]>":
        while str(page) == "<Response [503]>":
            time.sleep(1)
            page = requests.get(url)
    html_content = page.text
    #print html_content
    soup = BeautifulSoup(html_content, "html.parser")
    text = soup.select("body > div > div > div > div > pre")
    failed_to_download = None
    solution = None
    if len(text) == 0:
        failed_to_download = solution_id
    else:
        body = BeautifulSoup(str(text[0]), "html.parser").get_text()
        body = body.replace("\\", "\\\\")
        solution = body.encode('utf-8').decode('string-escape')
    return solution_id, solution, failed_to_download
def get_solution(solution_id):
    #solutions = []
    #failed_to_download_s = []
    #for i in solution_ids:
    url = "https://www.codechef.com/viewplaintext/" + str(solution_id)
    page = requests.get(url)
    if str(page) == "<Response [503]>":
        while str(page) == "<Response [503]>":
            time.sleep(1)
            page = requests.get(url)
    html_content = page.text
    if html_content == None:
        failed_to_download_s.append(i)
    text = BeautifulSoup(html_content, "html.parser").get_text()
    #'''figure out if escape_lt needs to go here'''
    print len(text)
    #print text
    failed_to_download = None
    solution = None
    if len(text) == 0 or re.search('var _sf_startpt = (new Date()).getTime()', text) != None:
        failed_to_download = solution_id
    else:
        text = text.replace("\\", "\\\\")
        solution = text.encode('utf-8').decode('string-escape')
    return solution_id, solution, failed_to_download
def formatabbr(page):
    srd = SoupStrainer('div', id='container')
    div = BeautifulSoup(page, parse_only=srd).div
    nav = div.find('div', id='navigation')
    nav.decompose()
    tbl = div.find('table')
    tbl.name = 'div'
    tbl.attrs.clear()
    tbl['class'] = 'oH1'
    tdr = div.find_all(name=re.compile(r't[dr]', re.I))
    for t in tdr:
        t.unwrap()
    for p in div.find_all('p'):
        p['class'] = 'ZFY'
        p.name = 'div'
    blank = div.find('div', class_='blank')
    if blank:
        blank.decompose()
    ft = div.find('div', id='footer')
    if ft:
        ft.decompose()
    formatcontent(div)
    div.attrs.clear()
    div['class'] = 'RmY'
    text = cleansp(div.encode('iso-8859-1'))
    div.decompose()
    return ''.join(['<link rel="stylesheet"href="ety.css"type="text/css">', text])
def serverlist(url):
    link = urllib2.urlopen(url).read()
    soup = BeautifulSoup(link.decode('utf-8'))
    epis = soup('p', {'class': 'epi'})
    for i in range(0, len(epis)):
        etitle = BeautifulSoup(str(epis[i]))('b')[0].contents[0]
        addDir(etitle.encode('utf-8'), url, 3, iconimage, False, i)
def handle_html_content(self, content):
    soup = BeautifulSoup(content, 'html.parser')
    for p_elem in soup.find_all('p'):
        css = None
        if 'style' in p_elem.attrs:
            css = cssutils.parseStyle(p_elem.attrs['style'])
        text_list = p_elem.text.split()
        p_new = soup.new_tag('p', style=css.cssText if css else None)
        for idx, word in enumerate(text_list):
            if len(self.dorks) <= 0:
                self.dorks = yield from self.get_dorks()
            word += ' '
            if idx % 5 == 0:
                a_tag = soup.new_tag(
                    'a',
                    href=self.dorks.pop(),
                    style='color:{color};text-decoration:none;cursor:text;'.format(
                        color=css.color if css and 'color' in css.keys() else '#000000'
                    )
                )
                a_tag.string = word
                p_new.append(a_tag)
            else:
                p_new.append(soup.new_string(word))
        p_elem.replace_with(p_new)
    content = soup.encode('utf-8')
    return content
def makeappdx(page):
    srd = SoupStrainer('div', id='container')
    div = BeautifulSoup(page, parse_only=srd).div
    nav = div.find('div', id='navigation')
    nav.decompose()
    title = div.center.get_text(strip=True)
    div.center.decompose()
    font = div.find_all('font', size='2', color=None)
    for f in font:
        f.unwrap()
    for p in div.find_all('p'):
        p['class'] = 'ZFY'
        p.name = 'div'
    blank = div.find('div', class_='blank')
    if blank:
        blank.decompose()
    ft = div.find('div', id='footer')
    if ft:
        ft.decompose()
    div.attrs.clear()
    div['class'] = 'oH1'
    formatcontent(div)
    text = cleansp(div.encode('iso-8859-1'))
    div.decompose()
    return ''.join(['<div class="xsv">', title, '</div>', text])
def text_cleaner(stripped_advertiser, website):
    # clean up html and read words
    soup_obj = BeautifulSoup(website, "html.parser")
    for script in soup_obj(["script", "style"]):  # get rid of that nasty javascript
        script.extract()  # Remove these two elements from the BS4 object
    text = soup_obj.get_text().replace(".", "")  # we convert text in place several times coming up
    with open('sites/{}.html'.format(stripped_advertiser.lower()), 'w') as f:
        f.write(soup_obj.encode("utf-8"))  # write html out to file
    lines = (line.strip() for line in text.splitlines())  # break into lines
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))

    def chunk_space(chunk):
        # individual utf-8 encoded word chunks
        chunk_out = chunk + " "
        return chunk_out

    text = "".join(chunk_space(chunk) for chunk in chunks if chunk).encode("utf-8")
    try:
        text = text.decode("unicode_escape").encode("ascii", "ignore")
    except Exception as error:
        return  # just in case there are some weird characters here, don't kill
    text = re.sub("[^a-zA-Z.+3]", " ", text)  # only regular words, no numbers
    text = text.lower().split()  # lower case so the dict can be populated
    stop_words = set(stopwords.words("english"))
    text = [w for w in text if w not in stop_words]  # gets rid of stop words
    text = list(set(text))  # no repeats!
    for word in text:
        if word not in fashion_dict:
            fashion_dict[word] = 1  # populate fashion dict
        else:
            fashion_dict[word] += 1  # increment entry
    return text
def downloadthread(fil):
    apps_links = read_from_file(fil)
    for link in apps_links[1:3]:
        # link = apps_links[1]  # dev version of for loop
        req = urllib2.Request("%s%s" % (base_url, link), None, headers)
        html_doc = urllib2.urlopen(req).read()
        ################ Title extract
        startstring = '<title>'
        start = html_doc.find(startstring) + len(startstring)
        endstring = '| AppBrain Android Market</title>'
        end = html_doc.find(endstring)
        Title = html_doc[start:end]
        ################ Description extract
        startstring = '<div class="app_descriptiontab">'
        start = html_doc.find(startstring) + len(startstring)
        endstring = '<div style="position: absolute; right: 0px; bottom: 0px">'
        end = html_doc.find(endstring)
        description = html_doc[start:end]
        ################
        description = description.strip()  # get rid of whitespace
        description = BeautifulSoup(description)
        description = description.get_text()  # get rid of html
        subdir = "\\fulldescriptions\\" + fil[1:fil.find(".")] + "\\"
        path = basedir + subdir
        filename = path + Title + ".txt"
        if not os.path.exists(path):  # if folder does not exist, create it.
            os.makedirs(path)
        with open(filename, "w") as txtfile:
            txtfile.write(description.encode('utf8'))
        print("link:%s done" % link)
def serializeLabContent(labContent):
    #print labContent
    f = open('template.html', "r")
    labHtml = f.read()
    f.close()
    labTemplate = BeautifulSoup(labHtml)
    articleSection = labTemplate.find_all('div', id="experiment-article-sections")[0]
    sectionNumber = 1
    for sectionName, sectionContent in labContent:
        sectionTag = labTemplate.new_tag('section', id="experiment-article-section-" + str(sectionNumber))
        articleSection.append(sectionTag)
        iconTag = labTemplate.new_tag('div', id="experiment-article-section-" + str(sectionNumber) + "-icon")
        iconTag['class'] = 'icon'
        sectionTag.append(iconTag)
        headingTag = labTemplate.new_tag('div', id="experiment-article-section-" + str(sectionNumber) + "-heading")
        headingTag['class'] = 'heading'
        headingTag.append(sectionName)
        sectionTag.append(headingTag)
        contentTag = labTemplate.new_tag('div', id="experiment-article-section-" + str(sectionNumber) + "-content")
        contentTag['class'] = 'content'
        contentTag.append(sectionContent)
        sectionTag.append(contentTag)
        sectionNumber += 1
    f = open('content.html', "w+")
    labTemplate = labTemplate.prettify()
    f.write(labTemplate.encode('utf-8'))
    f.close()
def extractCourseCatalog(School):
    # accesses the url & takes html
    url = urllib2.urlopen(School.url)
    html = url.read()
    # parsing html web content
    txt_content = BeautifulSoup(html, 'html.parser').get_text()
    # creating a file, converting text to unicode, saving file
    txt_name = School.name + '.txt'
    txt_file = open(txt_name, 'w')
    encoded_content = txt_content.encode('utf-8')
    txt_file.write(encoded_content)
    txt_file.close()
    # use regex to find courses, remove duplicates and sort
    course_names = re.findall('COMP [0-5]\d*', html)
    uniq_names = list(set(course_names))
    uniq_names.sort()
    if (School.name == 'Carleton'):
        getCourseDescriptions(School, uniq_names, txt_content)
    else:
        getCourseDescriptions(School, uniq_names, encoded_content)
    names_file = open(School.name + '_courses.txt', 'w')
    # put each course in school_courses file exactly once
    for course in (uniq_names):
        names_file.write(course + '\n')
    names_file.close()
def getNewestData():
    file = open("Data/vextir.txt", "w")
    soup = BeautifulSoup(urllib2.urlopen("http://www.landsbankinn.is/Vextir").read())
    file.write(soup.encode("utf-8"))
    file.close()  # original called file.close without parentheses, which never closes the file
    return soup
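# A minimal Python 3 sketch of the same fetch-parse-save pattern as getNewestData() above;
# the URL and output path are placeholders, not taken from the original snippet. The file is
# opened in binary mode because soup.encode() returns bytes.
import urllib.request
from bs4 import BeautifulSoup

def save_page_bytes(url="http://example.com", path="page.html"):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    with open(path, "wb") as fh:  # binary mode for the bytes returned by encode()
        fh.write(soup.encode("utf-8"))
    return soup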
def downloadCourse(session, c, sem):
    global files
    global sections
    files = itertools.count()
    sections = itertools.count()
    name = c['key'].replace('/', '-') + u'/'
    path = root + sem.replace('/', '-') + u'/' + name
    path = urllib.url2pathname(path.encode('utf-8')).replace(':', '-').replace('"', '')
    if not os.path.exists(path):
        os.makedirs(path)
    print ' +--' + colors.BOLD + name + colors.ENDC
    r = session.get(c['url'])
    if (r.status_code == 200):
        soup = BeautifulSoup(r.text, 'html.parser')
        if not os.path.exists(path + '.dump'):
            os.makedirs(path + '.dump')
        dst = path + '.dump/' + c['key'].replace('/', '-').encode('utf-8') + '-' + c['type'] + '-' + str(datetime.date.today()) + '-full.html'
        dst = dst.replace(':', '-').replace('"', '')
        with open(dst, 'wb') as f:
            f.write(soup.encode('utf-8'))
        for s in soup.find_all(class_='section main clearfix'):
            downloadSection(session, s, path)
        #print 'Saved ' + str(files.next()) + ' Files in ' + str(sections.next()) + ' Sections'
    else:
        print 'ERROR: ' + str(r.status) + ' ' + r.reason
        sys.exit()
def __get_item_description(self, soup, path):
    pattern = re.compile(r'//dsc\.taobaocdn\.com.*?,')
    scripts = soup.find_all("script")
    for script in scripts:
        if len(script.contents) == 0:
            continue
        match = pattern.findall(script.contents[0])
        if len(match) != 1:
            continue
        description_url = match[0].split(":")[0][:-2]
        # description_url = description_url.replace('"', '')
        # description_url = description_url.replace(' ', '')
        protocol = 'http:'
        description = requests.get(protocol + description_url)
        description = description.text
        pattern = re.compile(r'<.*>')
        match = pattern.findall(description)
        s = BeautifulSoup(match[0], 'html.parser')
        img_list = s.find_all('img')
        # Create the directory where description images are stored
        if not os.path.exists(path):
            os.makedirs(path)
        if len(img_list) != 0:
            self.__download_and_replace(img_list, path)
        return str(s.encode('utf-8').replace('\n', ''))
    raise Exception(ERROR_DESCRIPTION_URL)
def crawlUrl(url):
    print url
    global i
    global fDict
    global fList
    print len(fDict)
    print len(fList)
    print len(visited)
    try:
        response = urllib2.urlopen(url)
        visited[url] = True
        header = response.info()
        type = header.getheader('content-type')
        if 'text/html' in type:
            content = response.read()
            #print content
            soup = BeautifulSoup(content)
            data = soup.encode('utf8').lower()
            if (('world war' in data) | ('stalingrad' in data)):
                hList = getLinks(url, soup)
                updateFile(soup, url, hList, header, content)
                i += 1
                print 'i is', i
            else:
                pass
    except:
        print 'Unable to open URL', url
        pass
def strip_sphinx_documentation(source_dir, generated_dir, lang_destination_dir, lang, version):
    # Go through each file, and if it is a .html, extract the .document object contents
    for subdir, dirs, all_files in os.walk(generated_dir):
        for file in all_files:
            subpath = os.path.join(subdir, file)[len(generated_dir):]
            if not subpath.startswith('/.') and not subpath.startswith('/_static') and not subpath.startswith('/_doctrees'):
                new_path = lang_destination_dir + subpath
                if '.html' in file or '_images' in subpath or '.txt' in file or '.json' in file:
                    if not os.path.exists(os.path.dirname(new_path)):
                        os.makedirs(os.path.dirname(new_path))
                    if '.html' in file:
                        # Soup the body of the HTML file.
                        # Check if this HTML was generated from Markdown
                        original_md_path = get_original_markdown_path(source_dir, subpath[1:])
                        if original_md_path:
                            # If this html file was generated from Sphinx MD, we need to regenerate it using python's
                            # MD library. Sphinx MD library is limited and doesn't support tables
                            markdown_file(original_md_path, version, '', new_path)
                            # Since we are ignoring SPHINX's generated HTML for MD files (and generating HTML using
                            # python's MD library), we must fix any image links that start with 'src/'.
                            image_subpath = None
                            parent_paths = subpath.split('/')
                            if '' in parent_paths:
                                parent_paths.remove('')
                            image_subpath = ''
                            # -1 because we nest it 1 further levels? No idea.
                            for i in range(len(parent_paths) - 1):
                                image_subpath = image_subpath + '../'
                            # hardcode the sphinx '_images' dir
                            image_subpath += '_images'
                            with open(new_path) as original_html_file:
                                soup = BeautifulSoup(original_html_file, 'lxml')
                            prepare_internal_urls(soup, lang, version)
                            image_links = soup.find_all('img', src=re.compile(r'^(?!http).*'))
                            if len(image_links) > 0:
                                for image_link in image_links:
                                    image_file_name = os.path.basename(image_link['src'])
                                    if image_subpath:
                                        image_link['src'] = '%s/%s' % (image_subpath, image_file_name)
                                    else:
                                        image_link['src'] = '_images/%s' % (image_file_name)
                            with open(new_path, 'w') as new_html_partial:
                                new_html_partial.write(soup.encode("utf-8"))
                        else:
                            with open(os.path.join(subdir, file)) as original_html_file:
                                soup = BeautifulSoup(original_html_file, 'lxml')
                            document = None
                            # Find the .document element.
                            if version == '0.9.0':
                                document = soup.select('div.body')[0]
                            else:
                                document = soup.select('div.document')[0]
                            with open(new_path, 'w') as new_html_partial:
                                new_html_partial.write(document.encode("utf-8"))
                    elif '_images' in subpath or '.txt' in file or '.json' in file:
                        # Copy to images directory.
                        copyfile(os.path.join(subdir, file), new_path)
def zdf2pdf(entries, opts):
    from bs4 import BeautifulSoup
    import urllib, urlparse
    import xhtml2pdf.pisa as pisa
    try:
        import cStringIO as SIO
    except ImportError:
        import StringIO as SIO

    # Save the current directory so we can go back once done
    startdir = os.getcwd()

    # Start the xhtml to be converted
    data = '<head>\n'

    # Normalize all of the given paths to absolute paths
    opts['output_file'] = os.path.abspath(opts['output_file'])
    opts['work_dir'] = os.path.abspath(opts['work_dir'])
    attach_dir = os.path.join(opts['work_dir'], 'attach')

    # Check for and create working directory
    if not os.path.isdir(opts['work_dir']):
        os.makedirs(opts['work_dir'])

    # Check for and create a directory for attachments and images
    if not os.path.isdir(attach_dir):
        os.makedirs(attach_dir)

    # Save the running configuration for rerunning
    parser = configparser.SafeConfigParser()
    config_opts = dict((k, v) for k, v in opts.iteritems()
                       if (k != 'json_file' and k != 'categories' and k != 'forums' and
                           k != 'topics' and k != 'run_section' and k != 'list_zdf' and
                           k != 'work_dir' and k != 'delete' and k != 'url' and
                           k != 'mail' and k != 'password' and k != 'is_token' and
                           v != None))
    if config_opts.has_key('style_file'):
        config_opts['style_file'] = os.path.basename(config_opts['style_file'])
    if config_opts.has_key('output_file'):
        config_opts['output_file'] = os.path.basename(config_opts['output_file'])
    config_opts['json_file'] = 'entries.json'
    parser.add_section('zdf2pdf')
    for k, v in config_opts.iteritems():
        parser.set('zdf2pdf', k, unicode(v))
    with codecs.open(os.path.join(opts['work_dir'], 'zdf2pdf.cfg'), 'w', 'utf-8') as config_file:
        parser.write(config_file)

    if opts['style_file']:
        # Save the style file in the working directory
        shutil.copy(opts['style_file'], opts['work_dir'])
        data += """<link rel="stylesheet" type="text/css" href="{}" />\n""".format(
            os.path.basename(opts['style_file']))

    data += '</head>\n<body>\n'

    # Add PDF header if given
    if opts['header']:
        data += opts['header'] + '\n'
    if opts['footer']:
        data += opts['footer'] + '\n'

    # Build anything provided that should go on the title page
    if opts['title'] or opts['author'] or opts['date'] or opts['copyright']:
        if opts['title_class']:
            title_class = ' class="{}"'.format(opts['title_class'])
        else:
            title_class = ''
        data += '<div{}>\n'.format(title_class)
        if opts['title']:
            data += '<h1>{}</h1>\n'.format(opts['title'])
        if opts['author']:
            data += '<div>{}</div>\n'.format(opts['author'])
        if opts['date']:
            data += '<div>{}</div>\n'.format(opts['date'])
        if opts['copyright']:
            data += '<div>{}</div>\n'.format(opts['copyright'])
        data += '</div>\n'

    # Go through the JSON and build a toc and body to add to the html data
    entry_ids, body, toc = process_entries(entries)

    # Put all of the body after the table of contents
    if opts['toc']:
        if opts['toc_class']:
            toc_class = ' class="{}"'.format(opts['toc_class'])
        else:
            toc_class = ''
        data += '<div{}>\n<h2>{}</h2>\n<ol>\n'.format(toc_class, opts['toc_title'])
        data += toc
        data += '</ol>\n</div>\n'
    data += body

    # Change to working directory to begin file output
    os.chdir(opts['work_dir'])

    # Save entries
    with open('entries.json', "w") as outfile:
        outfile.write(json.dumps(entries))

    # Make the data a traversable beautifulsoup
    soup = BeautifulSoup(data)

    if opts['pre_width']:
        # Monkey patch TextWrapper for splitting on any whitespace and add
        # splitting on commas. Save the old one for when we're done.
        old_wordsep_simple_re = textwrap.TextWrapper.wordsep_simple_re
        new_wordsep_simple_re = re.compile(r'(\s+|\,)')
        textwrap.TextWrapper.wordsep_simple_re = new_wordsep_simple_re
        w = textwrap.TextWrapper(width=opts['pre_width'],
                                 replace_whitespace=False,
                                 drop_whitespace=False,
                                 break_on_hyphens=False,
                                 break_long_words=True)
        for pre in soup.find_all('pre'):
            pre_str = ''
            try:
                for line in pre.string.splitlines():
                    pre_str += '\n'.join(w.wrap(line)) + '\n'
                pre.string = pre_str
            except AttributeError:
                # pre tag has no content
                pass
        # Put the original wordsep_simple_re back
        textwrap.TextWrapper.wordsep_simple_re = old_wordsep_simple_re

    # Get images and display them inline
    for img in soup.find_all('img'):
        # Handle relative and absolute img src
        src = urlparse.urljoin(opts['url'], img['src'])
        # Normalize the local filename
        srcfile = os.path.join(attach_dir, src.replace('/', '_'))
        # Get this image if not already present
        if not os.path.isfile(srcfile):
            urllib.urlretrieve(src, srcfile)
        # Update the tag for the relative filepath
        img['src'] = srcfile

    # Make relative links to entries and absolute links to entries point to PDF
    # anchors. e.g.
    # http://example.zendes.com/entries/21473796-title
    # /entries/21473796-title
    # TODO /entries/21473796-title#anchor
    r = re.compile('(?:' + opts['url'] + ')?/entries/([0-9]*)-.*')
    for a in soup.find_all('a'):
        try:
            m = r.match(a['href'])
            # modify the link if we have a match and the entry is in the PDF
            if m and int(m.group(1)) in entry_ids:
                a['href'] = '#{}'.format(m.group(1))
        except KeyError:
            # this a tag doesn't have an href. named anchor only?
            pass

    if opts['strip_empty']:
        soup = strip_empty_tags(soup)

    html = soup.encode('utf-8')

    # Save generated html
    with open('entries.html', "w") as outfile:
        outfile.write(html)

    pdf = pisa.CreatePDF(SIO.StringIO(html), file(opts['output_file'], "wb"), encoding='utf-8')
    if pdf.err and pdf.log:
        for mode, line, msg, code in pdf.log:
            print "%s in line %d: %s" % (mode, line, msg)
    if pdf.warn:
        print "*** %d WARNINGS OCCURED" % pdf.warn
    os.chdir(startdir)
soup = BeautifulSoup(page.content, 'html.parser')
tr_elements = soup.find_all("tr", class_='ranking-list')
j = 0
for tr in tr_elements:
    link = tr.find("td", class_='title al va-t word-break').find('a', href=True)
    title = tr.find("div", class_='di-ib clearfix').find('a').text
    print(title)
    link = link['href']
    newURL = link + "/characters"
    newPage = requests.get(newURL)
    newSoup = BeautifulSoup(newPage.content, 'html.parser')
    soupStr = str(newSoup.encode('utf-8'))
    index1 = soupStr.rindex("<h2")
    index2 = soupStr.rindex("</td")
    soupStr = soupStr[:index1] + soupStr[index2:]
    newSoup = BeautifulSoup(soupStr, 'html.parser')
    tables = newSoup.find('table').find('td', valign="top", style='padding-left: 5px;')
    rows = tables.find_all('tr')
    for row in rows:
        std = row.find('td', align="right")
        for names in std.find_all('tr'):
            subStd = names.find('td')
            if (subStd.find('small').text == 'Japanese'):
                name = subStd.find('a').text
def parse(self, response):
    base_url = 'http://sh.lianjia.com'
    items = []
    res = response.body
    soup = BeautifulSoup(res, 'html.parser')
    soup.encode('utf-8')
    for fang in soup.select('.info-panel'):
        item = FangSpiderItem()
        item['fang_key'] = fang.select('h2')[0].a['key'].strip()
        item['fang_desc'] = fang.select('h2')[0].text.strip()
        item['fang_url'] = base_url + fang.select('h2')[0].a['href'].strip()
        item['price'] = fang.select('.price')[0].text.strip()
        item['price_pre'] = fang.select('.price-pre')[0].text.strip()
        item['xiaoqu'] = fang.select('.where')[0].a.text.strip()
        item['huxing'] = fang.select('.where')[0].contents[3].text.strip()
        item['mianji'] = fang.select('.where')[0].contents[5].text.strip()
        item['bankuai'] = ''
        item['chaoxiang'] = ''
        item['age'] = ''
        item['subway'] = ''
        item['taxfree'] = ''
        item['haskey'] = ''
        item['col_look'] = ''
        item['quyu'] = fang.select('.con')[0].contents[1].text.strip()
        #item['bankuai'] = fang.select('.con')[0].contents[3].text.strip()
        if len(fang.select('.con')[0].contents) >= 4:
            item['louceng'] = fang.select('.con')[0].contents[4].string.strip()
        if len(fang.select('.con')[0].contents) >= 6:
            item['chaoxiang'] = fang.select('.con')[0].contents[6].string.strip()
        if len(fang.select('.con')[0].contents) >= 8:
            item['age'] = fang.select('.con')[0].contents[8].string.strip()
        if len(fang.select('.con')[0].contents) > 9:
            item['age'] = fang.select('.con')[0].contents[-1].string.strip()
        if len(fang.select('.fang-subway-ex')) > 0:
            item['subway'] = fang.select('.fang-subway-ex')[0].text.strip()
        if len(fang.select('.taxfree-ex')) > 0:
            item['taxfree'] = fang.select('.taxfree-ex')[0].text.strip()
        if len(fang.select('.haskey-ex')) > 0:
            item['haskey'] = fang.select('.haskey-ex')[0].text.strip()
        if len(fang.select('.square')) > 0:
            item['col_look'] = fang.select('.square')[0].span.text.strip()
        # Commented-out debug prints of the scraped listing fields (listing id, description,
        # district, block, floor, orientation, age, neighbourhood, layout, area, total price,
        # unit price, number of viewers, listing url) omitted here for brevity.
        items.append(item)
    # A second commented-out debug loop printed the same fields for every collected item.
    return items
        continue
    soup = ''
    HTMLFILE = str(line[1]) + '.htm'
    TEXTFILE = str(line[1]) + '.txt'
    HADOOP_HTMLFILE = 'user/root/crawls/' + str(ANET) + '/' + str(BNET) + '/' + HTMLFILE
    HADOOP_TEXTFILE = 'user/root/texts/' + str(ANET) + '/' + str(BNET) + '/' + TEXTFILE
    print "-======= site: " + str(url) + " =======-"
    try:
        soup = BeautifulSoup(html)
    except:
        print " soup exception"
        continue
    HFP = open(HTMLFILE, 'w')
    HFP.write(soup.encode('utf-8'))
    HFP.close()
    with open(HTMLFILE) as hfp:
        try:
            client.create_file(HADOOP_HTMLFILE, hfp)
        except:
            client.delete_file_dir(HADOOP_HTMLFILE)
            client.create_file(HADOOP_HTMLFILE, hfp)
    TFP = open(TEXTFILE, 'w')
    WRITEOUT = unicode(soup.get_text())
    WORDLIST = re.sub(r'[^a-zA-Z0-9 ]', r' ', WRITEOUT)
    WORDLIST = WORDLIST.strip().split()
    TFP.write(WRITEOUT.encode('utf-8'))
    TFP.close()
    PAGETITLE = ''
def link_modifier(self, search_string, page_location):
    """
    Find a word and link it to the page location. Add to list of updated items
    :param search_string: string to be linked
    :param page_location: string of the name of linked page
    :return:
    """
    for response in self.response['results']:
        if response['type'] != 'page':
            continue
        # copy the response
        response_copy = {
            'id': response['id'],
            'type': response['type'],
            'title': response['title'],
            'version': {},
            'body': {}
        }
        response_copy['body']['storage'] = {}
        response_copy['body']['storage']['representation'] = response['body']['storage']['representation']
        response_copy['body']['storage']['value'] = response['body']['storage']['value']
        response_copy['version']['number'] = response['version']['number'] + 1
        response_body = response_copy['body']['storage']['value']
        bs = BeautifulSoup(response_body, "html.parser")
        matches = bs.findAll(text=re.compile(r'\b' + search_string + r'\b'))
        if not matches:
            return
        change_count = 0
        for match in matches:
            grand_parent = match.parent.parent.name
            # check if word is part of a markdown
            if "ac:" in grand_parent:
                if grand_parent == "ac:link":
                    try:
                        existing_link = match.parent.previous_sibling['ri:content-title']
                    except:
                        print "Error: detected self referencing link at: {}".format(response['title'])
                        continue
                    if existing_link != page_location:
                        match.parent.previous_sibling['ri:content-title'] = page_location
                        change_count += 1
                    else:
                        continue
                else:
                    continue
            else:
                # don't add links in tables
                # for parent in match.parents:
                #     if "table" in parent:
                #         continue
                substituted = re.sub(
                    r'\b' + search_string + r'\b',
                    self.LINK1 + page_location + self.LINK2 + search_string + self.LINK3,
                    match)
                match.replaceWith(BeautifulSoup(substituted, "html.parser"))
                change_count += 1
        if change_count:
            # do replacement
            response_copy['body']['storage']['value'] = bs.encode('utf-8')
            self.to_be_updated.append(response_copy)
            self.responses.append(response)
        else:
            continue
# request the html and find specific content every 10 seconds
import time

url = 'https://weather.yahoo.co.jp/weather/jp/13/4410.html'
#url = input("enter url to request :")
flag = True
counter = 0
while flag:
    result = get_html(url)
    if result:
        counter += 1
        soup = BeautifulSoup(result, 'html.parser')
        print(soup)
        soupText = str(soup.encode('utf-8'))
        f = open('result.txt', 'w')
        f.write(soupText)
        # access the div by class attribute, way 1
        divs = soup.findAll('div', {'class': 'forecastCity'})
        print(id(divs))
        # access the div by class attribute, way 2
        divs2 = soup.select('div[class*=forecast]')
        print(id(divs2))
    time.sleep(10)
def crawl_episode_info(self):
    mdir = os.path.dirname(os.path.abspath(__file__))
    fname = mdir + '/feed.xml'
    url = 'http://dataskeptic.com/feed.rss'
    if not (os.path.isfile(fname)):
        print('EP:fetching')
        r = requests.get(url)
        f = open(fname, 'wb')
        f.write(r.text.encode('utf-8'))
        f.close()
    with open(fname) as fd:
        xml = xmltodict.parse(fd.read())
    episodes = xml['rss']['channel']['item']
    descriptions = []
    descToTitle = {}
    descToLink = {}
    descToNum = {}
    l = len(episodes)
    for episode in episodes:
        enclosure = episode['enclosure']
        desc = episode['description']
        desc = desc.replace(u'\xa0', u' ')
        desc = desc.replace(u'\n', u' ')
        desc = desc.replace(u'\xc2', u' ')
        desc = BeautifulSoup(desc, "lxml").text
        if len(desc) >= 5:
            descriptions.append(desc)
            descToTitle[desc] = episode['title']
            descToLink[desc] = episode['link']
            descToNum[desc] = l
        l = l - 1
    result = {}
    for desc in descriptions:
        info = {}
        info["link"] = descToLink[desc]
        info["title"] = descToTitle[desc]
        info["num"] = descToNum[desc]
        result[desc] = info
    mdir = os.path.dirname(os.path.abspath(__file__))
    if not os.path.exists(mdir + '/text/'):
        os.makedirs(mdir + '/text/')
    with open(mdir + '/text/episodes_json.txt', 'w') as outfile:
        json.dump(result, outfile)
    with open(mdir + '/text/episode_titles.txt', 'w') as thefile:
        for i in range(len(descriptions)):
            desc = descriptions[i]
            title = descToTitle[desc]
            title = title.replace('[MINI]', "")
            title = title.encode('utf-8').strip()
            title = str(title).replace('\n', "")
            thefile.write("%s\n" % str(title))
    with open(mdir + '/text/episode_descs_titles.txt', 'w') as thefile:
        for i in range(len(descriptions)):
            desc = descriptions[i]
            title = descToTitle[desc]
            desc = desc.encode('utf-8').strip()
            desc = str(desc).replace('\n', "")
            title = title.replace('[MINI]', "")
            title = title.encode('utf-8').strip()
            title = str(title).replace('\n', "")
            thefile.write("%s\n" % str(title + ", " + desc))
    self.descriptions = descriptions
def get_descriptions(problem):
    descriptions = []
    left_out = []
    failed_to_download_d = []
    #print problem_list
    #for i in problem_list:
    url = 'https://www.hackerearth.com/problem/algorithm/' + problem
    #url = 'https://www.hackerearth.com/problem/algorithm/' + i
    #url = 'https://www.hackerearth.com/problem/algorithm/' + 'ways-of-seeing-circuits'
    #url = "https://www.codechef.com/api/contests/PRACTICE/problems/" + str(i)
    print url
    page = requests.get(url)
    if str(page) == "<Response [503]>":
        while str(page) == "<Response [503]>":
            time.sleep(1)
            page = requests.get(url)
    html_content_all = page.text
    if re.search('"message":"requests limit exhausted"', html_content_all) != None:
        while re.search('message":"requests limit exhausted', html_content_all) != None:
            time.sleep(1)
            page = requests.get(url)
            html_content_all = page.text
    if html_content_all == None:
        failed_to_download_d.append(i)
    #print html_content_all
    soup = BeautifulSoup(html_content_all)
    #html_content = soup.findAll("div", {"class": "starwars-lab"})
    html_content_1 = soup.findAll("div", {"class": "starwars-lab"})
    html_content_2 = soup.findAll("div", {"class": "less-margin-2 input-output-container"})
    html_content_3 = soup.findAll("div", {"class": "standard-margin"})
    #raw = BeautifulSoup(str(html_content[0]).replace("</p>", "\n</p>").replace("<sup>", "<sup>^").replace("\le", u"≤").replace("\ge", u"≥").replace("\lt", "<").replace("\gt", ">"), "html.parser").get_text()
    raw = (BeautifulSoup(str(html_content_1[0]).replace("</p>", "\n</p>").replace("<sup>", "<sup>^"), "html.parser").get_text()
           + BeautifulSoup(str(html_content_2[0]).replace("</p>", "\n</p>").replace("<sup>", "<sup>^"), "html.parser").get_text()
           + BeautifulSoup(str(html_content_3[0]).replace("</p>", "\n</p>").replace("<sup>", "<sup>^"), "html.parser").get_text())
    if (re.search("https://d320jcjashajb2.cloudfront.net/media/uploads", html_content_all) == None
            and re.search('"message":"Problem is not visible now. Please try again later."', html_content_all) == None
            and re.search('Statement is not available', html_content_all) == None):
        raw = raw.replace("\n\n\n\n\n\n", "")
        #raw = raw.replace("\n\n\n\n\n", "\n")
        raw = raw.replace("\n\n\n", "\n")
        raw = raw.replace("\n\n\n", "\n\n")
        raw = raw.replace("\n\n\n", "\n\n")
        raw = raw.replace("<sup>", "<sup>^")
        raw = raw.replace("\in", u"∈").replace('$$', '')
        raw = (raw.replace(" <=", u" ≤").replace(" >=", u" ≥")
                  .replace("<=", u" ≤ ").replace(">=", u" ≥ ")
                  .replace(u"≤ ", u"≤ ").replace(u"≥ ", u"≥ ")
                  .replace("\le", u"≤").replace("\ge", u"≥")
                  .replace("\lt", "<").replace("\gt", ">"))
        raw = re.sub('Subtasks(.+?)SAMPLE INPUT', 'SAMPLE INPUT', raw, flags=re.S)
        raw = re.sub('Time Limit:(.+)', '', raw, flags=re.S)
        raw = re.sub('See Russian translation\n\n', '', raw, flags=re.S)
        raw = re.sub('See Russian translation', '', raw, flags=re.S)
        raw = raw.replace("\\", "\\\\")
        descriptions.append(raw.encode('utf-8').decode('string-escape'))
    else:
        #left_out.append(i)
        #descriptions.append(raw.encode('utf-8').decode('string-escape'))
        left_out.append(problem)
    #print 'descriptions'
    #print descriptions[0]
    return descriptions, left_out, failed_to_download_d
    )
    meta2["name"] = "viewport"
    headTag.append(meta2)
    content.body.insert_before(headTag)
    return content


def changeImgSrcAttr(soup):
    # Use s attribute instead of src for images
    for imgElement in soup.select('img[src]'):
        imgElement["s"] = imgElement["src"]
        del imgElement["src"]
    return soup


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print "Usage: " + sys.argv[0] + " <inFile> [outFile]"
        exit(1)
    file = sys.argv[1]
    soup = BeautifulSoup(open(file))
    soup = cleanUp(soup)
    soup = changeImgSrcAttr(soup)
    file = sys.stdout
    if len(sys.argv) > 2:
        file = open(sys.argv[2], 'w')
    file.write(soup.encode('utf-8'))
        continue
    hud = hud.replace("Dinero", "Mono")
    hud = hud.replace("Día ", "Tago ")
    if hud:
        hud += "\n"
    passage = passages[name]
    # print(Fore.RED + str(passage))
    if not esperanto:
        continue
    passage.string = hud + esperanto
    print(name)
    eo_all_links = re.findall('\[\[([^\[\]]*)\]\]', esperanto)
    for r in eo_all_links:
        assert 1 <= len(r.split('|')) <= 2
    es_links = [r.split('|')[-1] for r in re.findall('\[\[([^\[\]]*)\]\]', spanish)]
    eo_links = [r.split('|')[-1] for r in re.findall('\[\[([^\[\]]*)\]\]', esperanto)]
    if es_links != eo_links:
        print(es_links, eo_links)
    assert es_links == eo_links

with open(output_file, "wb") as file:
    file.write(soup.encode(formatter="html"))
from bs4 import BeautifulSoup
import requests

URL = 'https://en.wikipedia.org/wiki/Google'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser').prettify()
text = str(soup.encode("UTF-8"))
file = open("input.txt", "w")
file.write(text)
file.close()

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')

for k in text.split("\n"):
    text1 = str(nltk.re.sub(r"[^a-zA-Z0-9]+", ' ', k))
    file = open("input1.txt", "w")
    file.write(text1)
    stokens = nltk.sent_tokenize(text1)
    wtokens = nltk.word_tokenize(text1)
    for s in stokens:
        print(s)
    tagged = nltk.pos_tag(wtokens)
    filepath = os.path.join(args.source, relpath, filename)
    dstpath = os.path.join(doc_dir, relpath, filename)
    logging.info('Processing file {}'.format(filepath))
    indexs = list()
    try:
        assert filepath.endswith('.html')
        with open(filepath, 'rb') as src:
            soup = BeautifulSoup(src.read().decode('utf8'), 'lxml')
        for index in extract_cppmodule(soup):
            indexs.append(index)
        for index in extract_sectionlink(soup):
            indexs.append(index)
        remove_navbar(soup)
        with open(dstpath, 'wb') as dst:
            dst.write(soup.encode('utf8'))
    except AssertionError:
    # except:
        shutil.copy(filepath, dstpath)
    for name, typ, pos in indexs:
        name = re.sub('\s+', ' ', name)
        assert '\n' not in name
        if '#' in pos[1:] or '/' in pos:
            continue
        # print(name, typ, pos)
        # assert('#' not in pos[1:])
        if not pos.startswith('#'):
            pos = '#' + pos
        pos = os.path.join(relpath, filename) + pos
        cur.execute(
from bs4 import BeautifulSoup

html_encoding = 'gbk'

wanted_html_part = r'''
<td style="width: 700px; height: 20px;font-size:13.5px; " valign="middle">
<a href="https://example.com/dist/standalone.html?eid=xxx" target="_blank">xxx</a>
</td>
'''.strip().decode("utf-8")  # type: unicode

with open("Default.aspx") as fp:
    html_content = fp.read()
wanted_html = html_content.decode(html_encoding)  # type: unicode

# https://blog.csdn.net/adinlead/article/details/53897409
# do NOT use 'lxml' or the '.aspx' tags will be lost, though this is still not good enough
soup = BeautifulSoup(wanted_html, 'html.parser')
wanted_res = soup.find('td', style="width: 700px; height: 20px;font-size:13.5px; ")

# Beautiful Soup replaces < with &lt;
# https://stackoverflow.com/questions/52040260/beautiful-soup-replaces-with-lt
wanted_res.replace_with(wanted_html_part)  # keep html tag in replace procedure

# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#output-formatters
original_html_content = soup.encode(html_encoding, formatter=None)
with open("Default1.aspx", 'w') as fp:
    fp.write(original_html_content)
kursor.execute("delete from corona")
# r = requests.get("https://pomber.github.io/covid19/timeseries.json").json()
# number = 1
# for y in r['Indonesia']:
#     kursor.execute("insert into corona(id,negara,date,confirmed,deaths,recovered,datetime) values(%s,%s,%s,%s,%s,%s,%s)",
#                    (number, 'Indonesia', y['date'], y['confirmed'], y['deaths'], y['recovered'],
#                     datetime.datetime.now(pytz.timezone('Asia/Jakarta'))))
#     number += 1
page = requests.get('https://www.worldometers.info/coronavirus/country/indonesia/').text
soup = BeautifulSoup(page, 'html.parser')
soup.encode('utf-8')
# cases = soup.find("div", {"class": "maincounter-number"}).find("span", {"style", {"color": "#aaa"}})[0].get_text().strip()
cases = soup.find_all("div", {"class": "maincounter-number"})
hasil = []
for x in cases:
    children = x.findChildren("span", recursive=True)
    for y in children:
        iwant = y.text.split(' ')[0].strip()
# A basic crawler written with requests and bs4
# https://www.cnblogs.com/baojinjin/p/6819389.html
import requests
from bs4 import BeautifulSoup

res = requests.get('http://book.zongheng.com/chapter/734213/40615154.html')
res.encoding = 'utf-8'
# print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')
# print(soup.encode('gb18030'))
print(soup.encode('utf-8', 'ignore'))
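# Note: soup.encode(...) returns bytes, so printing it directly (as above) shows a b'...'
# literal under Python 3. A small sketch, reusing the soup object from the snippet above,
# that decodes back to text for readable console output:
body = soup.encode('utf-8')      # bytes
print(body.decode('utf-8'))      # readable text again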
# 2) Replace the main picture with a picture of yourself.
# 3) Replace any local images with the image I provided in media. (You
#    must keep the image in a separate folder than your html code.)
# Deliverables
# Make sure the new page is uploaded to your GitHub account.
from bs4 import BeautifulSoup
import urllib
import requests
import re

base_url = 'http://collemc.people.si.umich.edu/data/bshw3StarterFile.html'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
str_soup = soup.encode("ascii", "ignore").decode("utf-8")  # encode and decode will convert to string
main_img_replace = str_soup.replace(
    "https://testbed.files.wordpress.com/2012/09/bsi_exposition_041316_192.jpg",
    "media/IMG_6293.jpg")  # replacing main picture with personal picture
local_img_replace = main_img_replace.replace(
    "logo2.png", "media/logo.png")  # replacing logo pictures with photo from media file
x = local_img_replace.replace(
    "student", "AMAZING student")  # replaces student with AMAZING student
fout = open("new_html.html", "w")  # creating a write-able html file
fout.write(x)  # writing into html file
fout.close()
def get_release():
    req = requests.get('https://github.com/redis/redis/releases')
    html = req.text.encode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    print(soup.encode('utf-8'))
class Flipkart:
    def __init__(self, url):
        # self.url = "https://www.amazon.in/Bose-SoundLink-Wireless-Around-Ear-Headphones/dp/B0117RGG8E/ref=sr_1_11?qid=1562395272&refinements=p_89%3ABose&s=electronics&sr=1-11"
        self.url = url
        self.old_price = 99999999
        self.count = 0
        self.product_details = ""

    def request_with_ua(self):
        self.error = ""
        lines = open("user_agents.txt").read().splitlines()
        user_agent = random.choice(lines)
        self.headers = {'User-Agent': user_agent}
        print("New Flipkart request with : ", user_agent)
        response = requests.get(self.url, headers=self.headers)
        self.soup = BeautifulSoup(response.content, 'html.parser')
        self.soup.encode('utf-8')
        try:
            # check whether the browser version is supported or not
            self.error = self.soup.find("div", {"class": "popup-header"}).text.strip()
            print("Browser is no longer supported")
        except Exception as be:
            print("Browser is supported")
            # print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(be).__name__, be)
            pass
        # f = open("soup.html", "w", encoding="utf-8")
        # f.write(str(self.soup))
        # f.close()

    def check_price(self):
        try:
            self.request_with_ua()
            while (self.error == "Your browser is no longer supported"):
                print("User agent switching\n")
                self.request_with_ua()
            try:
                # print("current price block")
                self.title = self.soup.find("span", {"class": "_35KyD6"}).text
                self.current_price = self.soup.find("div", {"class": "_1vC4OE _3qQ9m1"}).get_text().replace(',', '').replace('₹', '').replace(' ', '').strip()
                # print(self.current_price)
            except:
                # self.current_price = soup.find(id="priceblock_dealprice").get_text().replace(',', '').replace('₹', '').replace(' ', '').strip()
                print("self.current_price exception")
            try:
                self.current_price = int(self.current_price.split(".")[0])
                self.review_count = self.soup.find("span", {"class": "_38sUEc"}).get_text()
                self.stars = self.soup.find("div", {"class": "hGSR34"}).text
                try:
                    self.product_dict = {'Product Name': self.title,
                                         'price': self.current_price,
                                         'stars': self.stars,
                                         'Number of reviews and ratings': self.review_count}
                except NameError as e:
                    self.product_dict = {'Product Name': self.title,
                                         'price': self.current_price,
                                         'stars': "Unable to fetch",
                                         'Number of reviews and ratings': "Unable to fetch"}
                for key, value in self.product_dict.items():
                    self.product_details = self.product_details + str(key) + " : " + str(value) + "\n"
                # print(json.dumps(jsonObject, indent=2))
                if (self.current_price < self.old_price):
                    self.old_price = self.current_price
                    if self.count == 0:
                        return self.product_details
                    self.count = 1
                    return False
                else:
                    return self.product_details
            except Exception as qq:
                print("second end block")
                print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(qq).__name__, qq)
        except Exception as ww:
            print("end block")
            print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(ww).__name__, ww)
from bs4 import BeautifulSoup

waybillNo_list = ['810131162977', '810131167088', '810131151219', '810131166299']
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en,en-US;q=0.8,zh;q=0.6,zh-CN;q=0.4',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Length': '46',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'trace.yto.net.cn:8022',
    'Origin': 'http://www.yto.net.cn',
    'Referer': 'http://www.yto.net.cn/gw/index/index.html',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

for waybillNo in waybillNo_list:
    payload = {'waybillNo': waybillNo}
    res = requests.post("http://trace.yto.net.cn:8022/TraceSimple.aspx", data=payload, headers=headers)
    result = BeautifulSoup(res.text, "html5lib").select('.data')[-1].text.strip()
    print waybillNo + "; " + str(result.encode('utf8')).replace("感谢使用圆通速递,期待再次为您服务", "")
def test_reporting(rf, admin_user):
    product_price = 100
    product_count = 2
    tax_rate = Decimal("0.10")
    line_count = 1

    expected_taxful_total, expected_taxless_total, shop, order = initialize_report_test(
        product_price, product_count, tax_rate, line_count)

    with override_provides("reports", [__name__ + ":SalesTestReport"]):
        data = {
            "report": SalesTestReport.get_name(),
            "shop": shop.pk,
            "date_range": DateRangeChoices.THIS_YEAR.value,
            "writer": "json",
            "force_download": 1,
        }
        view = ReportView.as_view()
        request = apply_request_middleware(rf.post("/", data=data), user=admin_user)
        response = view(request)
        if hasattr(response, "render"):
            response.render()
        assert response.status_code == 200
        json_data = json.loads(response.content.decode("utf-8"))
        assert force_text(SalesTestReport.title) in json_data.get("heading")
        totals = json_data.get("tables")[0].get("totals")
        return_data = json_data.get("tables")[0].get("data")[0]

        assert int(totals.get("product_count", 0)) == product_count
        assert int(return_data.get("product_count", 0)) == product_count
        assert int(totals.get("order_count", 0)) == 1
        assert int(return_data.get("order_count", 0)) == 1
        assert str(expected_taxless_total) in totals.get("taxless_total", "0")
        assert str(expected_taxful_total) in totals.get("taxful_total", "0")

        today = date.today()
        last_year = date(today.year - 1, 1, 1)
        next_year = date(today.year + 1, 1, 1)

        # test report without downloading it
        data = {
            "report": SalesTestReport.get_name(),
            "shop": shop.pk,
            "date_range": DateRangeChoices.CUSTOM.value,
            "start_date": last_year.strftime("%Y-%m-%d"),
            "end_date": next_year.strftime("%Y-%m-%d"),
            "writer": "json",
        }
        request = apply_request_middleware(rf.post("/", data=data), user=admin_user)
        response = view(request)
        assert response.status_code == 200

        soup = BeautifulSoup(response.render().content)
        response_text = str(six.u(soup.encode('ascii')))
        assert force_text(SalesTestReport.title) in response_text
        assert str(expected_taxless_total) in response_text
        assert str(expected_taxful_total) in response_text
# f1, headerrow, j, errorpage, nexterrorpage and consterror are defined earlier in the surrounding script.
writer = csv.writer(f1, lineterminator='\n')
writer.writerow(headerrow)

for page in range(1, 10000000):
    page_url = ("https://www.thredup.com/products/women/shorts?department_tags=women&page="
                + str(page) + "&search_tags=women-shorts&sort=Newest+First")
    # uClient = uReq(page_url)
    print(page_url)
    scrappage = requests.get(page_url)
    html_doc = scrappage.text
    # Parse the HTML into a soup data structure so it can be traversed like a tree.
    page_soup = BeautifulSoup(html_doc, 'lxml')
    # page_soup = BeautifulSoup(uClient.read(), "html.parser")
    # print(page_soup.prettify())

    errtxt1 = page_soup.encode("utf-8")
    errtxt = page_soup.prettify()
    errfound = False
    errfound1 = False
    if errtxt.find('Try removing some filters to see more items.') >= 0:
        errfound = True
    if errtxt.find('We couldn\'t find anything matching your search.') >= 0:
        errfound1 = True
    if errfound and errfound1:
        # Both "no results" messages are present, so this page has no listings.
        errorpage = errorpage + '\n' + 'Error or no data found Record No : ' + str(j) + ' Page No : ' + str(page)
        nexterrorpage = nexterrorpage + 1
        if nexterrorpage > consterror:
            print(errorpage)
            break
        # print("The ERROR IS RAISED")
def __init__(self, username, downloadPhotos):
    self.useragents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'
    ]
    self.username = username

    # Make the directory that we are putting the files into
    self.make_directory()
    print(colors.OKGREEN + f"[*] Starting Scan on {self.username}" + colors.ENDC)

    # Get the html data with the requests module
    r = requests.get(
        f'http://instagram.com/{self.username}',
        headers={'User-Agent': random.choice(self.useragents)})
    soup = BeautifulSoup(r.text, 'html.parser')
    # To prevent a unicode error, we need the following line...
    soup.encode('utf-8')

    # Find the tags that hold the data we want to parse
    general_data = soup.find_all('meta', attrs={'property': 'og:description'})
    more_data = soup.find_all('script', attrs={'type': 'text/javascript'})
    description = soup.find('script', attrs={'type': 'application/ld+json'})

    # Try to parse the content -- if it fails then the program exits
    try:
        self.text = general_data[0].get('content').split()
        # This is the profile description data
        self.description = json.loads(description.get_text())
        # This is the javascript json that is passed into json.loads()
        self.profile_meta = json.loads(more_data[3].get_text()[21:].strip(';'))
    except:
        print(colors.FAIL + f"Username {self.username} not found" + colors.ENDC)
        sys.exit()

    self.profile_data = {
        "Username": self.profile_meta['entry_data']['ProfilePage'][0]['graphql']['user']['username'],
        "Profile name": self.description['name'],
        "URL": self.description['mainEntityofPage']['@id'],
        "Followers": self.text[0],
        "Following": self.text[2],
        "Posts": self.text[4],
        "Bio": str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']['user']['biography']),
        "profile_pic_url": str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']['user']['profile_pic_url_hd']),
        "is_business_account": str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']['user']['is_business_account']),
        "connected_to_fb": str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']['user']['connected_fb_page']),
        "externalurl": str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']['user']['external_url']),
        "joined_recently": str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']['user']['is_joined_recently']),
        "business_category_name": str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']['user']['business_category_name']),
        "is_private": str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']['user']['is_private']),
        "is_verified": str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']['user']['is_verified'])
    }

    # Tries to scrape posts if it is a public profile
    self.save_data()
    if downloadPhotos == True:
        self.scrape_posts()
    self.print_data()
# Imports required by this snippet.
import random
import smtplib
import sys

import requests
from bs4 import BeautifulSoup


class Amazon:
    def __init__(self, url):
        # self.url = "https://www.amazon.in/Bose-SoundLink-Wireless-Around-Ear-Headphones/dp/B0117RGG8E/ref=sr_1_11?qid=1562395272&refinements=p_89%3ABose&s=electronics&sr=1-11"
        self.url = url
        self.old_price = 99999999
        self.count = 0
        self.product_details = ""

    def request_with_ua(self):
        """Fetch the product page with a random user agent and detect error pages."""
        self.error = ""
        lines = open("user_agents.txt").read().splitlines()
        user_agent = random.choice(lines)
        self.headers = {'User-Agent': user_agent}
        print("New Amazon request with : ", user_agent)
        response = requests.get(self.url)
        # response = requests.get(self.url, headers=self.headers)
        self.soup = BeautifulSoup(response.content, 'html.parser')
        self.soup.encode('utf-8')
        try:
            # Check whether the "browser no longer supported" popup was served.
            self.error = self.soup.find("div", {"class": "popup-header"}).text.strip()
            print("Browser is no longer supported")
        except Exception:
            try:
                # Browser is supported; check for a captcha challenge page instead.
                self.error = self.soup.find("div", {"class": "a-box a-alert a-alert-info a-spacing-base"}).text.strip()
                print("Browser Captcha error")
                print("Self.error:", self.error)
            except Exception:
                print("Browser is supported")
        f = open("soup.html", "w", encoding="utf-8")
        f.write(str(self.soup))
        f.close()

    def check_price(self):
        """Scrape title, price, rating and review count; return details or False."""
        try:
            self.request_with_ua()
            # Retry with a new user agent while an unsupported-browser or captcha page is returned.
            while (self.error == "Your browser is no longer supported"
                   or self.error.startswith("Enter the characters you see below")):
                print("User agent switching\n")
                self.request_with_ua()
            try:
                self.title = self.soup.find(id="productTitle").get_text().strip()
                self.current_price = (self.soup.find(id="priceblock_ourprice")
                                      .get_text()
                                      .replace(',', '').replace('₹', '').replace(' ', '').strip())
            except:
                print("It's a special deal price")
                self.current_price = (self.soup.find(id="priceblock_dealprice")
                                      .get_text()
                                      .replace(',', '').replace('₹', '').replace(' ', '').strip())
            try:
                self.current_price = int(self.current_price.split(".")[0])
                self.review_count = self.soup.find(id="acrCustomerReviewText").get_text().split()[0]
                self.stars = self.soup.find(id="acrPopover").get_text().strip()
                try:
                    self.product_dict = {'Product Name': self.title,
                                         'price': self.current_price,
                                         'stars': self.stars,
                                         'Number of reviews': self.review_count}
                except NameError:
                    self.product_dict = {'Product Name': self.title,
                                         'price': self.current_price,
                                         'stars': "Unable to fetch",
                                         'Number of reviews': "Unable to fetch"}
                for key, value in self.product_dict.items():
                    self.product_details = self.product_details + str(key) + " : " + str(value) + "\n"
                if self.current_price < self.old_price:
                    self.old_price = self.current_price
                    if self.count == 0:
                        return self.product_details
                    self.count = 1
                    return False
                else:
                    return self.product_details
            except Exception as pqpq:
                print("second end block")
                print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(pqpq).__name__, pqpq)
        except Exception as qq:
            print("end block")
            print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(qq).__name__, qq)


# Function that sends an email if the price falls.
def send_mail():
    server = smtplib.SMTP('smtp.gmail.com', 587)
    server.ehlo()
    server.starttls()
    server.ehlo()
    server.login('*****@*****.**', 'password')

    subject = 'Price Fell Down'
    body = "Check the amazon link https://www.amazon.in/Bose-SoundLink-Wireless-Around-Ear-Headphones/dp/B0117RGG8E/ref=sr_1_11?qid=1562395272&refinements=p_89%3ABose&s=electronics&sr=1-11 "
    msg = f"Subject: {subject}\n\n{body}"

    server.sendmail('*****@*****.**', '*****@*****.**', msg)
    # Print a message to confirm the email has been sent.
    print('Hey Email has been sent')
    # Quit the server.
    server.quit()
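A possible way to wire the Amazon tracker to send_mail; the product URL, the polling interval, and the exact drop-detection condition below are assumptions added for illustration, not part of the original snippet.

# Hypothetical glue between Amazon.check_price() and send_mail(); URL and interval are placeholders,
# and treating a False return as "price dropped on a repeat check" reflects the reconstruction above.
import time

def watch_amazon(url, interval=3600):
    tracker = Amazon(url)
    print(tracker.check_price())        # initial snapshot of the product
    while True:
        time.sleep(interval)
        if tracker.check_price() is False:
            send_mail()                  # notify on a detected price drop

# watch_amazon("https://www.amazon.in/dp/B0117RGG8E")  # placeholder product URL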
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")

    # =============================================================================
    # SCRAPE PART - START
    #  - this should be the primary section of code that changes
    #  - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - TOTAL COUNT ========================================================
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    # The total job count is embedded in the page <title>; strip commas and keep the digits.
    nbr = re.search('<title>(.*?)</title>', str(soup.encode("utf-8"))).group(1)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]
    facet_count = int(facet_count)

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],      #[0]
        g['MSMT_DTE_ID'],  #[1]
        g['DATA_TYPE'],    #[2]
        g['CNTRY_CDE'],    #[3]
        g['SITE_CDE'],     #[4]
        facet_type,        #[5]
        facet_desc,        #[6]
        facet_count,       #[7]
        g['STARTED_AT'],   #[8]
        ''                 #[9]
    )
    dbmgr.query(q)

    # PASS 2 - INDUSTRY COUNT =====================================================
    for ul in soup.find_all('ul', class_='facet'):
        for li in ul.find_all('li'):
            # Return the facet text (section title); assumes the first row of the
            # facet is the "title" row - breaks if it isn't.
            facet = li.find('strong')
            if facet:
                facet_type = facet.text.upper()
            else:
                # If None is found, carry the current facet_type value forward.
                facet_type = facet_type.upper()
            facet_desc = li.find('a')
            if facet_desc:
                # An "a" anchor exists, so this row is a facet entry rather than a section title.
                facet_desc = facet_desc.text.upper()
                facet_desc = re.sub(r"[!@#$']", '', str(facet_desc))  # remove special characters
                facet_count = li.find('span')
                facet_count = int(facet_count.text.replace(',', ''))

                # =============================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],      #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],    #[2]
                    g['CNTRY_CDE'],    #[3]
                    g['SITE_CDE'],     #[4]
                    facet_type,        #[5]
                    facet_desc,        #[6]
                    facet_count,       #[7]
                    g['STARTED_AT'],   #[8]
                    ''                 #[9]
                )
                dbmgr.query(q)
            else:
                # No "a" anchor found; ignore the row.
                pass

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # =============================================================================
    # SCRAPE PART - END
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
url = "https://www.soccerstats.com/homeaway.asp?league=denmark" data = requests.get(url, time.sleep(2)) soup = BeautifulSoup(data.content) div = soup.find("div", id="h2h-team2") table = div.find("table", id="btable") f = open("denmark1/awayRows.txt", "w") f.write(str(table)) f.close() print("Away Rows Complete") counter += 1 print(str(counter) + " out of " + str(total)) url = "https://www.soccerstats.com/results.asp?league=denmark&pmtype=bygameweek" data = requests.get(url, time.sleep(2)) soup = BeautifulSoup(data.content) soup.encode(formatter=None) div = soup.find("div", class_="tabbertabdefault") table = div.find("table", id="btable") f = open("denmark1/fixtures.txt", "w") f.write(str(table)) f.close() print("Fixtures Complete") counter += 1 print(str(counter) + " out of " + str(total)) url = "https://www.soccerstats.com/results.asp?league=denmark&pmtype=bydate" data = requests.get(url, time.sleep(2)) soup = BeautifulSoup(data.content) div = soup.find_all("table", id="btable") f = open("denmark1/seasonFixtures.txt", "w") f.write(str(div[0]))
# result = requests.get("https://www.amazon.com/HP-EliteDesk-800-G1-Refurbished/dp/B0784F3NHF", headers=headers)
result = requests.get("https://www.amazon.com/HP-EliteDesk-800-G1-Refurbished/dp/B0784F82Q5", headers=headers)

# You can get the status code and headers for the page:
# print(result.status_code)
# print(result.headers)

src = result.content

# lxml needs to be installed separately
soup = BeautifulSoup(src, 'lxml')

# Change from soup object to byte object
textable = soup.encode('utf-8')

# Change from byte object to string object
encoding = 'utf-8'
transformToString = textable.decode(encoding)

# Print object type
print(type(transformToString))

# ------- this works for "In Stock"
# foundStringIndex = transformToString.find("In Stock")
# print(foundStringIndex)
# shownString = transformToString[foundStringIndex : foundStringIndex + 8]
# print(shownString)
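The commented-out lines above sketch an "In Stock" lookup; a minimal, hedged completion of that idea follows. Treating a missing match (find() returning -1) as "not in stock" is an assumption added here, not part of the original snippet.

# Hypothetical helper built from the commented-out idea above; the -1 check is an assumption.
def is_in_stock(page_text):
    found_index = page_text.find("In Stock")
    if found_index == -1:
        return False
    # Echo the matched slice, mirroring the original shownString debugging output.
    print(page_text[found_index:found_index + len("In Stock")])
    return True

# print(is_in_stock(transformToString))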
def consolidate_report():
    pass_count = 0
    fail_count = 0
    # Count bolded PASS/FAIL markers in the existing HTML results file.
    with open(test_results_file, 'r') as html_file:
        for line in html_file:
            if 'PASS' in line:
                if '<b>' in line:
                    pass_count += 1
            if 'FAIL' in line:
                if '<b>' in line:
                    fail_count += 1
    total_count = pass_count + fail_count

    consolidate_table = """
    <table border="2">
      <col width="150">
      <col width="150">
      <col width="150">
      <tr bgcolor="#b3ffff">
        <th colspan="4" style="font-size:19px">Consolidated Report</th>
      </tr>
      <tr>
        <th style="font-size:17px">Total</th>
        <th style="font-size:17px">Passed</th>
        <th style="font-size:17px">Failed</th>
      </tr>
      <tr align="center">
        <td style="font-size:17px">{0}</td>
        <td><font color=green style="font-size:17px"><b>{1}</b></td>
        <td><font color=red style="font-size:17px"><b>{2}</b></td>
      </tr>
    </table>
    <br>
    """.format(total_count, pass_count, fail_count)

    # Rewrite the report so the summary table and wrapping style come before the original content.
    with open(test_results_file, 'r') as f2:
        ogcontent = f2.read()
    with open(test_results_file, 'w') as f3:
        f3.write(consolidate_table)
    styl = '''
    <style>
    pre {
        overflow-x: auto;
        white-space: pre-wrap;
        white-space: -moz-pre-wrap;
        white-space: -pre-wrap;
        white-space: -o-pre-wrap;
        word-wrap: break-word;
    }
    </style>
    '''
    with open(test_results_file, 'a') as f4:
        f4.write(styl)
        f4.write(ogcontent)

    # Number the test-case labels inside each bordered results table.
    from bs4 import BeautifulSoup
    with open(test_results_file, 'r') as f:
        soup = BeautifulSoup(f, 'html.parser')
    l1 = soup.findAll('table', {'border': '1'})
    for each in l1:
        i = 1
        children = each.findChildren('b')
        for child in children:
            if child.string != 'FAIL' and child.string != 'PASS':
                child.string = "{}. ".format(i) + child.string
                i += 1
    with open(test_results_file, "wb") as f_output:
        f_output.write(soup.encode('utf8'))
import os
import sys
reload(sys)
sys.setdefaultencoding('utf8')

from selenium import webdriver
from bs4 import BeautifulSoup
import time

browser = webdriver.Chrome()
url = 'http://loudong.360.cn/Loo/index/search/%E4%BA%91/p/{page}.html'
dirpath = os.getcwd()
filepath = os.path.join(dirpath, 'butian_{page}.html')

# Save the rendered source of each results page to a local HTML file.
for i in range(1, 73):
    browser.get(url.format(page=i))
    soup = BeautifulSoup(browser.page_source, 'lxml')
    print soup.encode("utf8")
    with open(filepath.format(page=i), 'w') as f:
        f.write(browser.page_source.encode("utf8"))
    # browser.delete_all_cookies()
    time.sleep(2)
browser.close()

# Re-parse the saved pages and append results to a single output file.
respath = os.path.join(dirpath, 'res_butian')
for i in range(1, 73):
    soup = BeautifulSoup(open(filepath.format(page=i)), 'lxml')
    f = open(respath, 'a')
start_time = time.time()
startpoint = 1

for full in fulls:
    site = full[2]
    org = full[0]
    smo_id = full[3]
    term = full[1]
    year = full[4]

    driver = webdriver.Chrome("CHROMEDRIVER DIRECTORY")
    driver.get(site)

    # Get original site info
    source = driver.page_source
    # text = requests.get(site).text     # text version of getting requests
    # soup = BeautifulSoup(text)         # use text version to get site soup
    soup = BeautifulSoup(source, "html.parser")
    soup2 = soup.encode("utf-8")

    try:
        # resultno = re.findall('English</a><span class="resultscount"> (.*?)\xe2\x80\x8e</span>', soup2)
        # resultno = re.findall('<h1 id="pqResultsCount">\n(.*?) results\n</h1>', soup2)
        resultno = re.findall('<h1 id="pqResultsCount">\n(.*?) result', soup2)
        resultno = ''.join(resultno)
        resultno = resultno.translate(None, "(){}<>,")
        resultno = int(resultno)
    except ValueError, e:
        resultno = int(0)

    no_pages = int(math.ceil(resultno / 20))

    # encrypt = re.findall('href="https://search.proquest.com/docview/(.*?)/', soup2)
    an = re.findall('{"(.*?)markedlistcheckbox:markallitems', soup2)
    an = ''.join(an)
    an = re.findall('markAll":false,"formats":{(.*?)},"markURL"', an)
    an = ''.join(an)