def scraper():
    '''
    This script scrapes http://usesthis.com/interviews/ and, for each interview,
    saves the interviewee's name, product name, product description and link to
    the product to a csv file named everyone.csv
    '''
    outputFile = open('everyone.csv', 'a')
    scrapecount = 0
    response = requests.request('get', 'http://usesthis.com/interviews/')
    html = soup(response.text)
    interviewLinks = html.findSelect('#interviews li h2 a')
    linkLength = len(interviewLinks)
    while scrapecount < linkLength:
        response = requests.request('get', interviewLinks[scrapecount]['href'])
        html = soup(response.text)
        person = html.findSelect('.person')[0].text
        product = html.findSelect('#contents article.contents p a')
        productLength = len(product)
        csvWriter = csv.writer(outputFile)
        for x in range(0, productLength, 1):
            try:
                print person, product[x].text, product[x]['title'], product[x]['href']
                csvWriter.writerow([person, product[x].text, product[x]['title'], product[x]['href']])
            except Exception as e:
                print '%s, %s, %s, %s' % ('Exception', 'Exception', 'Exception', e)
        scrapecount += 1
def __actinit(mech, paswd):
    mech.open(PLAT)
    mech.select_form(nr=0)
    mech["user"] = "******"
    mech["passwd"] = b6d(paswd)
    results = mech.submit().read()
    soup(results)
    print (PLAT + 'file/memit/')
def getLinks(self, page):
    # in later versions i plan to make a more comprehensive interpretation of the page ;)
    exploit_table = soup(page).findAll("tr")
    for exploit in exploit_table:
        ex = exploit.contents[1]
        ex = soup(str(ex))
        anchor = str(ex.a)
        if anchor != 'None':
            descr = anchor[anchor.index('>') + 1:-4]
            link = anchor.split(" ")[1].replace("href=", '').replace("\"", '')
            sys.stderr.write("[%s]%shttp://www.1337day.com%s\n" % (descr, ' ' * (90 - (len(descr) + len(link))), link))
def __actinit(mech, paswd):
    cj = cookielib.LWPCookieJar()
    mech.set_cookiejar(cj)
    mech.open(PLAT)
    mech.select_form(nr=0)
    mech["user"] = "******"
    mech["passwd"] = b6d(paswd)
    results = mech.submit().read()
    cookies = mech._ua_handlers['_cookies'].cookiejar
    soup(results)
    xsrf = [ck for ck in cookies if ck.name == '_xsrf'][0]
    print (PLAT + STORAGE, xsrf)
    return xsrf.value
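# Both __actinit() variants above expect `mech` to be a mechanize.Browser-style
# object with an open login form at PLAT.  A minimal sketch of how such a
# browser is typically constructed (this setup is an assumption, not shown in
# the snippets themselves):
import mechanize

mech = mechanize.Browser()
mech.set_handle_robots(False)   # many login pages disallow robots
# xsrf_token = __actinit(mech, paswd)   # hypothetical call site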
def readhorse(horsename):
    '''
    This function reads the primary web page for each horse.
    '''
    horsename1 = horsename.replace(" ", "+")
    webstring = "http://www.pedigreequery.com/" + horsename1
    horsename = horsename.strip()
    print repr(horsename)
    hsoup = soup(urllib.urlopen(webstring)).findAll(text=re.compile("{2-d} DP =", re.IGNORECASE))
    hsoup2 = soup(urllib.urlopen(webstring)).findAll(text=re.compile("Earnings", re.IGNORECASE))
    #hsoup = soup(urllib.urlopen(webstring))
    hsoup = str(hsoup)
    hsoup2 = str(hsoup2)
    dhorsename = horsename.rstrip() + ".txt"
    with open(dhorsename, 'w') as webout:
        webout.write(hsoup)
        webout.write(hsoup2)
def search(self, item):
    item = quote(item)
    my_url = self.url + "search/?query=" + item
    # Opens connection, grabs the webpage and downloads it
    response = requests.get(my_url)
    page_html = response.content
    # Parsing html
    page_soup = soup(page_html)
    # grabs each product
    containers = page_soup.findAll(
        "div", {"class": "search-result-gridview-item-wrapper"})
    # print(len(containers))
    storeObj = Store()
    for container in containers:
        # check this container's own out-of-stock marker
        out_of_stock = len(container.findAll(
            "div", {"class": "product-sub-title-block product-out-of-stock"})) != 0
        if not out_of_stock:
            store = Store()
            store.store_name = 'Walmart'
            store.title = container.img["alt"]
            store.image_url = container.img["data-image-src"]
            store.product_url = self.url + container.a["href"]
            store.price = container.findAll(
                "span", {"class": "visuallyhidden"})[-1].text
            storeObj.add_item(store)
    return storeObj.generate_json()
def parse_packet(self):
    '''
    This function will parse the needed data from the packet PSML XML
    definition and send the data to the API.
    '''
    # If the counter timer is set to 0, then this is the first packet
    # we have parsed.  Set the counter to the current time so that we
    # dont send a single packet stat to the API.
    if self.counter == 0:
        self.counter = int(time.time())

    # Next we instantiate a BeautifulSoup object to parse the packet and
    # pull out the protocol name.
    packet = soup(self.packet)
    proto = packet.findAll('section')[4].text

    # If we dont see the protocol yet in the protos dictionary, we need
    # to initialize it.  After that, we can then increment regardless.
    if proto not in self.protos:
        self.protos[proto] = 0
    self.protos[proto] += 1

    # Once we reach 60 seconds, we need to purge out the protocol counts
    # that we have counted.  Make an API call for each proto we have,
    # then reset the counter timer and the protos dictionary.
    if (int(time.time()) - self.counter) >= 60:
        for proto in self.protos:
            log.debug('TSHARK: sending %s=%s' % (proto, self.protos[proto]))
            self.api.stat(proto, self.protos[proto])
        self.counter = int(time.time())
        self.protos = {}
def get_soup_from_url(self, url_in):
    """
    Return data loaded from an URL, as a BeautifulSoup(3) object.

    Wrapper helper function around self.get_data_from_url()
    """
    return soup(self.get_data_from_url(url_in))
def get_input_register(self):
    """
    Collect the input names and default values from the registration form.
    :return:
    """
    response = requests.get(self.form_url)
    resp = soup(response.text)
    password = resp.find('input', {'type': 'password'})
    if self.form_name:
        form = resp.find('form', {'name': self.form_name})
    else:
        form = password.findParent('form')
    self.attrs = {}
    for input_text in form.findAll('input'):
        input_name = input_text.get('name', None)
        value = input_text.get('value', None)
        if input_text and input_name:
            if input_text['type'] == 'checkbox':
                self.attrs.update({input_name: ['on']})
            elif input_text['type'] == 'radio':
                self.attrs.update({input_name: [value]})
            else:
                self.attrs.update({input_name: value})
    for input_select in form.findAll('select'):
        values = input_select.findAll('option')[1:]
        self.attrs.update({input_select.get('name', None):
                           random.choice([[value['value']] for value in values])})
def proxy(request, path):
    """
    Answer requests for webalizer images and monthly reports.

    If an image is requested, let django's static.serve view do the work; if an
    html file is requested, just insert the content of the <body> into the
    django template.
    """
    context = {'title': 'Webalizer'}
    if path is None or path == u'':
        path = 'index.html'
    if webalizer_dir is not None:
        if path.endswith('.png'):
            # webalizer generates png images
            return serve(request, path, document_root=webalizer_dir)
        else:
            try:
                webalizer_index = open(os.path.join(webalizer_dir, path)).read()
                webalizer_soup = soup(webalizer_index)
                context.update({'data': ' '.join([unicode(x) for x in webalizer_soup.body.contents])})
            except:
                context.update({'data': None})
    return direct_to_template(request, 'webalizer/index.html', context)
def top_ten():
    httpreq = requests.get('https://news.ycombinator.com')
    dom = soup(httpreq.text)
    outer = dom.find("table")
    inner = outer.findAll("table")[1]
    rowlist = inner.findAll("tr")
    stories = []
    for row in range(0, len(rowlist) - 3, 3):
        rowmain = rowlist[row].findAll("td")
        rowsub = rowlist[row + 1].findAll("td")
        listitem = {"link": rowmain[2].find("a")["href"],
                    "title": rowmain[2].find("a").string,
                    "domain": rowmain[2].findAll("span")[1].string}
        try:
            listitem["poster"] = rowsub[1].findAll('a')[0].string
            listitem["posted"] = rowsub[1].findAll('a')[1].string
            listitem["comment"] = re.findall(r'\d+', rowsub[1].findAll('a')[2].string)[0]
        except:
            continue
        stories.append(listitem)
    response = "HackerNews Top 10\n"
    for i in range(0, 10):
        response += ('[' + str(i + 1) + '] ' + stories[i]['title'] + stories[i]['domain'] +
                     ' (' + stories[i]["posted"] + ' | ' + stories[i]["comment"] + ' comment)' + '\n')
    return response
def get_th(_num):
    num = 1
    if _num - 1 in range(0, 29):
        num = _num - 1
    else:
        num = randint(0, 29)
    httpreq = requests.get('https://news.ycombinator.com')
    dom = soup(httpreq.text)
    outer = dom.find("table")
    inner = outer.findAll("table")[1]
    row = inner.findAll("tr")
    rowmain = row[(num * 3) - 3].findAll("td")
    rowsub = row[(num * 3) - 2].findAll("td")
    returnitem = {"link": rowmain[2].find("a")["href"],
                  "title": rowmain[2].find("a").string,
                  "domain": rowmain[2].findAll("span")[1].string}
    try:
        returnitem["poster"] = rowsub[1].findAll('a')[0].string
        returnitem["posted"] = rowsub[1].findAll('a')[1].string
        returnitem["comment"] = re.findall(r'\d+', rowsub[1].findAll('a')[2].string)[0]
        returnitem["posttype"] = 's'
    except:
        returnitem["posttype"] = 'j'
    response = '[->] ' + returnitem['title'] + returnitem['domain']
    if returnitem["posttype"] == 'j':
        response += ' (Job) \n'
    else:
        response += ' (' + returnitem["posted"] + ' | ' + returnitem["comment"] + ' comment)' + '\n'
    response += returnitem["link"]
    return response
def anagram(self):
    if not self.values:
        self.chat("Enter a word or phrase")
        return
    word = '+'.join(self.values)
    url = "http://wordsmith.org/anagram/anagram.cgi?anagram=" + word + "&t=50&a=n"
    urlbase = pageopen(url)
    if not urlbase:
        self.chat("Fail")
        return
    cont = soup(urlbase)
    if len(cont.findAll("p")) == 6:
        self.chat("No anagrams found.")
        return
    try:
        paragraph = cont.findAll("p")[3]
        content = ','.join(paragraph.findAll(text=True))
        content = content[2:-4]
        content = content.replace(": ,", ": ")
        self.chat(content)
    # Usually not concerned with exceptions
    # in mongo, but this is bound to come up
    # again.
    except Exception as e:
        print e
        self.chat("Got nothin")
def handle_ly(filename, orderno):
    site = "http://www.ly.gov.tw"
    htmlfile = open(filename, encoding='UTF-8')
    start_keystring = u'屆 立法委員名單'
    end_keystring = u'查詢結果共'
    state = "none"  # none, inline
    print u'第' + str(orderno) + u'屆 清單'
    state = "none"  # none, inline
    cnt = 0
    for line in htmlfile:
        if line.count(start_keystring) >= 1:
            state = "inline"
        if line.count(end_keystring) >= 1 and state == "inline":
            state = "none"
        if state == "inline":
            cnt += 1
            html_line = soup(line, fromEncoding="UTF-8")
            for tag in html_line.findAll('a', {'href': True}):
                if tag.string is not None and tag['href'] is not None:
                    url = site + tag['href']
                    legislator_name = tag.string.encode('UTF-8')
                    # print type(legislator_name.encode("UTF-8"))
                    inject_legislator_tag(legislator_name, url, orderno)
def extract(self):
    _html = soup(self.RESPONSE)
    for cite in _html.findAll("p", attrs={'class': 'PartialSearchResults-item-url'}):
        sub = re.search(self.regexp, cite.text, re.IGNORECASE)
        if sub:
            _sub = sub.group()
            if _sub not in self.SUBDOMAINS:
                self.SUBDOMAINS.append(_sub)
def extract(self):
    _html = soup(self.RESPONSE)
    for cite in _html.findAll("cite"):
        sub = re.search(self.regexp, cite.text, re.IGNORECASE)
        if sub:
            _sub = sub.group()
            if _sub not in self.SUBDOMAINS:
                self.SUBDOMAINS.append(_sub)
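# The two extract() variants above differ only in the element they scan for
# result URLs.  A minimal standalone sketch of the same idea, with an invented
# HTML fragment, regexp and target domain (the real classes keep RESPONSE,
# regexp and SUBDOMAINS as instance attributes):
import re
from BeautifulSoup import BeautifulSoup as soup

RESPONSE = '<cite>mail.example.com/inbox</cite><cite>www.example.com</cite>'
regexp = r'[a-z0-9.-]+\.example\.com'
SUBDOMAINS = []
for cite in soup(RESPONSE).findAll('cite'):
    match = re.search(regexp, cite.text, re.IGNORECASE)
    if match and match.group() not in SUBDOMAINS:
        SUBDOMAINS.append(match.group())
print SUBDOMAINS   # collects mail.example.com and www.example.com once each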
def _get_all_slunk_installer_links(self, page_text):
    logger.debug('Parsing page in search for links')
    html = soup(page_text)
    for tag in html.findAll('a', {'data-link': True}):
        link = tag.attrMap['data-link']
        logger.debug('Found link: %s' % link[:-4])
        if link.startswith(self.link_prefix):
            yield link
def findFontGUID(fontTable):
    xmlStr = open(fontTable, "r").read()
    xml = soup(xmlStr)
    for i, k in enumerate(xml.findAll("w:font")):
        fName = k["w:name"]
        x = soup(str(k))
        try:
            fontKey = x.findAll("w:embedregular")[0]["w:fontkey"]
            fontKey = "".join(fontKey[1:-1].split("-"))
        except:
            continue
        print "\tFont: {0}\n\tKey : {1}".format(fName, fontKey)
        print
        return fontKey
def _getData(url):
    try:
        data = download_page(url)
        data = data.decode('utf-8')
        data = soup(data)
    except Exception as e:
        print '[Letterboxd][_getData] %s' % (e)
        return None, None
    else:
        return data, _getNextPage(data)
def homepage_is_in_linkspanel(self, response):
    """ Checks whether the homepage link is displayed """
    html = soup(response.content)
    links = html.findAll("a")
    for l in links:
        if self.homepage in l['href']:
            return True
    return False
def linker(self, urls):
    for url in urls:
        # Special behaviour for Twitter URLs
        match_twitter_urls = re.compile("http[s]?://(www.)?twitter.com/.+/status/([0-9]+)")
        twitter_urls = match_twitter_urls.findall(url)
        if len(twitter_urls):
            self.tweet(twitter_urls)
            return
        fubs = 0
        title = "Couldn't get title"
        roasted = "Couldn't roast"
        urlbase = pageopen(url)
        if not urlbase:
            fubs += 1
        try:
            opener = urllib2.build_opener()
            roasted = opener.open(SHORTENER + url).read()
        except:
            fubs += 1
        ext = url.split(".")[-1]
        images = ["gif", "png", "jpg", "jpeg"]
        if ext in images:
            title = "Image"
        elif ext == "pdf":
            title = "PDF Document"
        else:
            try:
                cont = soup(urlbase, convertEntities=soup.HTML_ENTITIES)
                title = cont.title.string
            except:
                self.chat("Page parsing error")
                return
        deli = "https://api.del.icio.us/v1/posts/add?"
        data = urllib.urlencode({"url": url,
                                 "description": title,
                                 "tags": "okdrink," + self.lastsender})
        if DELICIOUS_USER:
            base64string = base64.encodestring("%s:%s" % (DELICIOUS_USER, DELICIOUS_PASS))[:-1]
            try:
                req = urllib2.Request(deli, data)
                req.add_header("Authorization", "Basic %s" % base64string)
                send = urllib2.urlopen(req)
            except:
                self.chat("(delicious is down)")
        if fubs == 2:
            self.chat("Total fail")
        else:
            self.chat(unescape(title) + " @ " + roasted)
def panel_is_in_response(self, response):
    """
    Checks whether the versioned links panel is found in the rendered
    HTML response.
    """
    html = soup(response.content)
    panels = html.findAll("div", {'class': 'panel-heading'})
    for panel in panels:
        if 'versioned links' in str(panel):
            return True
    return False
def scrape(self, url):
    if not any(url in s for s in self.visited):
        print("scanning %s" % url)
        r = requests.get(url)
        self.visited.append(url)
        if r.status_code != 200:
            self.dead.append(url)
        else:
            s = soup(r.content)
            for link in s.findAll('a'):
                self.unvisited.append(link)
def get_general_info_link_panel(self, response):
    """
    Checks whether the links panel is found in the rendered HTML response.
    """
    html = soup(response.content)
    panels = html.findAll("div", {'class': 'panel-heading'})
    for panel in panels:
        if 'links' in str(panel) and 'versioned links' not in str(panel):
            return panel
    return False
def build_metadata(filename):
    list_html_file = open(filename, encoding="UTF-8")
    keyword = "public/Attachment"
    for line in list_html_file:
        if line.count(keyword) > 0:
            html_line = soup(line)
            print html_line.a["href"]
            print html_line.a.img["title"]
            wget_pdf(html_line)
    list_html_file.close()
def GetFileHosts(self, url, list, lock, message_queue):
    import base64
    import re
    from entertainment.net import Net
    net = Net(do_not_cache_if_any=do_no_cache_keywords_list)

    def unpack_js(p, k):
        k = k.split('|')
        for x in range(len(k) - 1, -1, -1):
            if k[x]:
                p = re.sub('\\b%s\\b' % base36encode(x), k[x], p)
        return p

    def base36encode(number, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'):
        if not isinstance(number, (int, long)):
            raise TypeError('number must be an integer')
        if number == 0:
            return alphabet[0]
        base36 = ''
        sign = ''
        if number < 0:
            sign = '-'
            number = -number
        while number != 0:
            number, i = divmod(number, len(alphabet))
            base36 = alphabet[i] + base36
        return sign + base36

    sources = []
    final = []
    html = soup(net.http_GET(url).content)
    links = html.findAll('script')
    for link in links:
        try:
            if 'eval' in link.contents[0]:
                r = re.search('return p}\(\'(.+?);\',\d+,\d+,\'(.+?)\'\.split', link.contents[0])
                if r:
                    p, k = r.groups()
                    try:
                        sources.append((base64.b64decode(unpack_js(p, k).split('"')[1]).split('>')[1].split('<')[0]))
                    except:
                        pass
        except IndexError:
            pass
    for link in sources:
        if 'www' not in link.split('//')[1].split('.')[0]:
            final.append((link.split('//')[1].split('.')[0], link))
        else:
            final.append((link.split('//')[1].split('.')[1], link))
    for title, blob in final:
        if 'watchfreeinhd' not in title.lower():
            self.AddFileHost(list, 'SD', blob)
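# A small, self-contained illustration of the packed-JavaScript trick that
# GetFileHosts() undoes: the packer replaces words with base-36 tokens and
# ships a '|'-separated dictionary, so unpacking is just substituting token i
# with word i.  The packed string below is invented for the example.
import re

def base36encode(number, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'):
    if number == 0:
        return alphabet[0]
    out = ''
    while number:
        number, i = divmod(number, len(alphabet))
        out = alphabet[i] + out
    return out

def unpack_js(p, k):
    k = k.split('|')
    for x in range(len(k) - 1, -1, -1):
        if k[x]:
            p = re.sub(r'\b%s\b' % base36encode(x), k[x], p)
    return p

print unpack_js('0.1("2")', 'document|write|hello')   # document.write("hello")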
def main():
    files = os.popen('dir /B *.html').read()
    files = files.split('\n')
    for file in files:
        if len(file) > 0:
            infile = open(file, 'r')
            data = infile.read()
            infile.close()
            soupped = soup(data)
            outfile = open(file, 'w')
            outfile.write(soupped.prettify())
            outfile.close()
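# The `dir /B` pipe in main() above is Windows-only.  As an aside (not part of
# the original script), the same file list can be collected portably with the
# standard library:
import glob

html_files = glob.glob('*.html')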
def term_to_link(term):
    urlopener = urllib2.build_opener()
    urlopener.addheaders = [('User-agent',
                             'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                             'AppleWebKit/535.19 (KHTML, like Gecko) '
                             'Chrome/18.0.1025.168 Safari/535.19')]
    if term.find(' ') > 0:
        term = ('"%s"' % term).replace(' ', '+')
    google_results = urlopener.open('https://www.google.com/search?q=%s' % term)
    page = soup(google_results.read())
    try:
        first_result = page.find('div', id='ires').find('h3', 'r').find('a')['href']
    except AttributeError:
        return None
    return first_result
def run(self, target_directory, start_chapter=1, start_page=1, failure_threshold=2):
    """
    :param target_directory:
    """
    if not exists(target_directory):
        makedirs(target_directory)
    current_chapter = start_chapter
    current_page = start_page
    current_directory = join(target_directory, str(current_chapter))
    if not exists(current_directory):
        makedirs(current_directory)
    failure = 0
    visited = []
    _logger.info('Downloading chapter %s' % current_chapter)
    next_page_url = self._url_builder(current_chapter, current_page)
    while next_page_url is not None:
        response = get(next_page_url)
        if response.status_code == 200:
            html = soup(response.text)
            image_url = self._image_extractor(html)
            if image_url is None or image_url in visited:
                failure += 1
            else:
                path = join(current_directory, str(current_page)) + '.jpg'
                success = _download(image_url, path)
                if success:
                    failure = 0
                    visited.append(image_url)
                else:
                    _logger.warn('Failed downloading page %s' % current_page)
                    failure += 1
        else:
            failure += 1
            _logger.warn('Failed downloading page %s' % current_page)
        if failure == 0:
            current_page += 1
        if failure > failure_threshold:
            current_chapter += 1
            current_page = start_page
            visited = []
            current_directory = join(target_directory, str(current_chapter))
            if not exists(current_directory):
                makedirs(current_directory)
            _logger.info('Downloading chapter %s' % current_chapter)
        next_page_url = self._url_builder(current_chapter, current_page)
def __parse_meal_ratty(html):
    parsed = soup(html)
    table = parsed.find('table', {'id': 'tblMain'})
    rows = table.findAll('tr')[1:]
    cols = [urllib2.unquote(col.text) for col in rows[0].findAll('td')[1:]]
    print 'Cols: ' + str(cols)
    data = {col: [] for col in cols}
    for row in rows[1:-1]:
        row_cols = row.findAll('td')[1:]
        for ix, c in enumerate(row_cols):
            if c.text:
                data[cols[ix]].append(c.text)
    data['Other'] = [col.text for col in rows[-1].findAll('td') if col.text and col.text != '.']
    return data
def getDom(self, url, host, data):
    opener = urllib2.build_opener(urllib2.HTTPHandler)
    opener.addheaders = [('User-Agent', ua), ('Host', host)]
    urllib2.install_opener(opener)
    try:
        response = urllib2.urlopen(url, data, timeout=120)
        result = response.read()
    except:
        try:
            response = urllib2.urlopen(url, data, timeout=120)
            result = response.read()
        except:
            return False
    return soup(result)
def parse_packet(self):
    '''
    This function will parse the needed data from the packet XML
    definition and send the data to the API.
    '''
    packet = soup(self.packet)  # The BeautifulSoup parser object of the XML
    proto = packet.findAll('section')[-2].text
    if proto not in self.protos:
        self.protos[proto] = 0
    self.protos[proto] += 1
    if (int(time.time()) - self.wait_timer) >= self.interval:
        for proto in self.protos:
            log.debug('%s Stats: %s: %s' % (self.stanza, proto, self.protos[proto]))
            dofler.api.stat(proto, self.protos[proto])
        self.wait_timer = int(time.time())
        self.protos = {}
def episodes(self, show_title):
    show_url = SHOW_URL + show_title.lower().replace(' ', '-')
    self.browser.open(show_url)
    show_html = soup(self.browser.response(), convertEntities=soup.HTML_ENTITIES)
    episodes = []
    for season in show_html('tr', {'class': 'episodes-list'}):
        for episode in season('td'):
            if not episode.has_key('data-id'):
                continue
            episodes.append({
                'wid': episode['data-id'],
                'id': episode('span', {'class': 'e-count'})[0].text,
                'name': episode('span', {'class': 'e-title'})[0].text
            })
    return episodes
def parse_links(self, source):
    """
    A bit hacky: parses all href attributes from the html, then "finds all
    urls" in them.  The second step is needed because some href attributes in
    a template can be placeholders etc., which we don't need.
    """
    all_links = set()
    for tag in soup(source).findAll("a", {"href": True}):
        val = tag.attrMap["href"]
        urls = re.findall("""http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+""", val)
        if len(urls) == 1:
            all_links.add(urls[0])
    return sorted(list(all_links))
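# A quick standalone check of the idea behind parse_links(): only fully-formed
# http(s) URLs survive, while template placeholders in href attributes do not.
# The HTML fragment and the simplified regex below are illustrative
# assumptions, not taken from the method's real call sites.
import re
from BeautifulSoup import BeautifulSoup as soup

source = '<a href="https://example.com/a">a</a> <a href="{{ next_page }}">next</a>'
links = set()
for tag in soup(source).findAll('a', {'href': True}):
    urls = re.findall(r'http[s]?://[^\s"{}]+', tag['href'])
    if len(urls) == 1:
        links.add(urls[0])
print sorted(links)   # only https://example.com/a survives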
def main(argv):
    global verbose
    argc = len(argv)
    html = ''
    if argc and not os.path.exists(argv[0]):
        print __doc__
        return 20
    if argc > 0:
        html = open(argv[0]).read()
    else:
        while 1:
            try:
                line = sys.stdin.readline()
            except KeyboardInterrupt:
                sys.stderr.write("Interrupted by user!\n")
                return 5
            if not line:
                break
            html += line
    lcount0 = html.count('\n') + (1 if html[-1:] != '\n' else 0)
    ccount0 = len(html)
    if argc > 1 and os.path.exists(argv[1]):
        root, ext = os.path.splitext(argv[1])
        backup = "%s.bak" % root
        if verbose:
            print "Overwriting %s, original backed up to %s" % (argv[1], backup)
        oldhtml = open(argv[1]).read()
        open(backup, 'w').write(oldhtml)
    formatted = soup(html).prettify()
    lcount1 = formatted.count('\n') + (1 if formatted[-1:] != '\n' else 0)
    ccount1 = len(formatted)
    if argc > 1:
        open(argv[1], 'w').write(formatted)
    else:
        sys.stdout.write(formatted)
    if verbose:
        print "Original:   %d lines, %d characters" % (lcount0, ccount0)
        print "Prettified: %d lines, %d characters" % (lcount1, ccount1)
    return 0
def maybe_get_icon(self):
    u"""
    Get icon for the site as a QImage if we haven’t already.

    Get the site icon, either the 'rel="icon"' or the favicon, for the
    web page at url or passed in as page_html and store it as a QImage.
    This function can be called repeatedly and loads the icon only once.
    """
    if self.site_icon:
        return
    if not with_pyqt:
        self.site_icon = None
        return
    page_request = urllib2.Request(self.icon_url)
    if self.user_agent:
        page_request.add_header('User-agent', self.user_agent)
    page_response = urllib2.urlopen(page_request)
    if 200 != page_response.code:
        self.get_favicon()
        return
    page_soup = soup(page_response)
    try:
        icon_url = page_soup.find(name='link', attrs={'rel': 'icon'})['href']
    except (TypeError, KeyError):
        self.get_favicon()
        return
    # The url may be absolute or relative.
    if not urlparse.urlsplit(icon_url).netloc:
        icon_url = urlparse.urljoin(self.url, urllib.quote(icon_url.encode('utf-8')))
    icon_request = urllib2.Request(icon_url)
    if self.user_agent:
        icon_request.add_header('User-agent', self.user_agent)
    icon_response = urllib2.urlopen(icon_request)
    if 200 != icon_response.code:
        self.site_icon = None
        return
    self.site_icon = QImage.fromData(icon_response.read())
    max_size = QSize(self.max_icon_size, self.max_icon_size)
    icon_size = self.site_icon.size()
    if icon_size.width() > max_size.width() \
            or icon_size.height() > max_size.height():
        self.site_icon = self.site_icon.scaled(max_size, Qt.KeepAspectRatio, Qt.SmoothTransformation)
def parse_links(self, source):
    """
    A bit hacky: parses all href attributes from the html, then "finds all
    urls" in them.  The second step is needed because some href attributes in
    a template can be placeholders etc., which we don't need.
    """
    all_links = set()
    for tag in soup(source).findAll("a", {"href": True}):
        val = tag.attrMap["href"]
        urls = re.findall(
            """http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+""",
            val)
        if len(urls) == 1:
            all_links.add(urls[0])
    return sorted(list(all_links))
def getContent(url, patterns={}, ignore_pattern=[]):
    """
    Arguments:
        url (String): target url
        patterns (dict): maps a name to a (tag, attrs) extraction pattern
        ignore_pattern: (tag, attrs) pair that marks a page to skip (e.g. 404)
    """
    ret = {}
    url_text = requests.get(url).text
    bs = soup(url_text)
    # 404 case
    if bs.find(ignore_pattern[0], attrs=ignore_pattern[1]) is not None:
        return None
    if not type(patterns) == dict:
        raise TypeError("Error type of patterns, should be (name,extract_function)!")
    for p in patterns.items():
        ret[p[0]] = bs.find(p[1][0], attrs=p[1][1]).text
    return ret
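# How getContent() would typically be called, under the assumption that each
# value in `patterns` is a (tag_name, attrs_dict) pair and `ignore_pattern`
# marks the site's "not found" page.  The selectors and URL here are
# hypothetical, chosen only to show the expected shapes.
patterns = {
    'title': ('h1', {'class': 'article-title'}),
    'body': ('div', {'class': 'article-body'}),
}
ignore_pattern = ('div', {'class': 'error-404'})
# content = getContent('http://example.com/article/1', patterns, ignore_pattern)
# -> {'title': u'...', 'body': u'...'}, or None when the 404 marker is present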
def grab_listings(query_url):
    raw_html = urlopen(query_url)
    parsed_html = soup(raw_html)
    postings = parsed_html('p')
    all_details = []
    for post in postings:
        new_id = str(post('a')[1]['data-id'])
        if len(all_ids) == 1:
            all_ids.append(new_id)
            return []
        if new_id in all_ids:
            break
        else:
            all_ids.append(new_id)
        details = {}
        details['id'] = new_id
        details['title'] = post('a')[1].contents[0]
        #details['time'] = post('span')[0]('time')[0]
        try:
            details['nhood'] = scrap_html(post('span')[0]('small')[0])
        except:
            details['nhood'] = 'Not listed'
        details['price'] = scrap_html(post.find('span', {'class': 'price'}))
        all_details.append(details)
        if len(all_details) > 19:
            break
    return all_details
def buildDict(self, urlx):
    jr = jutils.getHttpJson(urlx)
    rdict = {}
    vmap = {}
    prefix = '/taxonomy/term/'
    countDataset = 0
    for item in jr:
        atag = soup(item[u'提供機關名稱']).a
        cd = int(item[u'資料集總數'])
        countDataset += cd
        vmap[atag['href'].replace(prefix, '')] = {'name': atag.string, 'dataset': cd}
    rdict['data'] = vmap
    rdict['meta'] = {'countTerm': len(jr),
                     'countDataset': countDataset,
                     'source': 'http://data.gov.tw/data_usage/orgfullname/json',
                     'host': 'http://data.gov.tw',
                     'build': 'http://console.tw',
                     'script': 'https://github.com/y12studio/console-tw/tree/master/projects/datachart/build-org-term.py',
                     'prefix': prefix,
                     'time': datetime.datetime.utcnow().isoformat()}
    return rdict
def filter_html(in_html):
    doc = soup(in_html)
    #recs = doc.findAll("div", { "class": "class_name"})
    # remove unwanted tags
    for div in doc.findAll('head'):
        div.extract()
    for div in doc.findAll(['i', 'h1', 'script']):
        div.extract()
    for div in doc.findAll('div', 'top'):
        div.extract()
    for div in doc.findAll('div', 'bot'):
        div.extract()
    for div in doc.findAll('div', 'line'):
        div.extract()
    for div in doc.findAll('div', 'mainmenu'):
        div.extract()
    for div in doc.findAll('div', 'banner'):
        div.extract()
    for div in doc.findAll('div', 'maintainers'):
        div.extract()
    #for div in doc.findAll('div', {'style':'clear:both;margin-left:200px;'}):
    #    div.extract()
    # remove html comments
    comments = doc.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    out_html = doc.body.prettify()
    #out_html = re.sub('(\\n *)','\\n',out_html)
    # a little more cleaning
    out_html = re.sub('(<dd>)\\n', '', out_html)
    out_html = re.sub('(</dd>)\\n', '', out_html)
    out_html = re.sub('<br />', '<br/>', out_html)
    return out_html
def GPLastImg(rename=""):
    url = "http://10.5.5.9:8080/DCIM/100GOPRO/"
    page = urllib2.urlopen(url).read()
    html = soup(page)
    jpegs = []
    for href in html.findAll("a", {"class": "link"}):
        x = re.search('.*href="(.+\.JPG)".*', str(href))
        if x:
            jpegs.append(x.group(1))
    lastImg = jpegs[-1]
    imgUrl = url + "/" + lastImg
    import PIL
    from PIL import Image
    import urllib2 as urllib
    import io
    image_file = io.BytesIO(urllib2.urlopen(imgUrl).read())
    img = Image.open(image_file)
    img = img.resize((400, 300), PIL.Image.ANTIALIAS)
    img.save(lastImg)
    return lastImg
def child_html_parse(data, web_url):
    """
    About:
        This is a helper function which parses the html using the keys and
        values of the python dict named html_classes
    Args:
        data (str), web_url (str)
    Returns:
        None
    """
    client_info = {
        'name': None,
        'address': None,
        'phone_number': None,
        'link': None
    }
    #t= BeautifulSoup(html).findChildren("p", { "class" : "sc-Exhibitor_PhoneFax" })
    page_soup = soup(data)
    for key in client_info.keys():
        try:
            client_info[key] = page_soup.findAll(
                html_classes[key][1],
                {"class": html_classes[key][0]})[0].text.encode('utf-8')
        except Exception as e:
            print "{0} key failed for link {1}".format(key, web_url)
    client_data.append(client_info)
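# child_html_parse() assumes a module-level dict `html_classes` mapping each
# client_info key to a (css_class, tag_name) pair, plus a module-level
# `client_data` list that collects the results.  A hypothetical example of
# that setup (the class names are assumptions based on the commented-out
# "sc-Exhibitor_PhoneFax" selector above):
html_classes = {
    'name': ('sc-Exhibitor_Name', 'h2'),
    'address': ('sc-Exhibitor_Address', 'p'),
    'phone_number': ('sc-Exhibitor_PhoneFax', 'p'),
    'link': ('sc-Exhibitor_Website', 'a'),
}
client_data = []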
def parse_table(self):
    docsoup = soup(self.region_page)
    table = docsoup.body.findAll('table')
    data = table[4]
    self.tr_data = data.findAll('tr')
    self.num_records = len(self.tr_data)
def _doSearch(self, search_strings, search_mode='eponly', epcount=0, age=0, epObj=None): results = [] items = {'Season': [], 'Episode': [], 'RSS': []} if not self._doLogin(): return results for mode in search_strings.keys(): logger.log(u"Search Mode: %s" % mode, logger.DEBUG) for search_string in search_strings[mode]: if mode != 'RSS': searchURL = self.urls['search'] % (urllib.quote_plus( search_string.replace('.', ' ')), self.categories) else: searchURL = self.urls['rss'] % self.categories logger.log(u"Search URL: %s" % searchURL, logger.DEBUG) if mode != 'RSS': logger.log(u"Search string: %s" % search_string, logger.DEBUG) data = self.getURL(searchURL) if not data: logger.log("No data returned from provider", logger.DEBUG) continue html = soup(data) if not html: logger.log("No html data parsed from provider", logger.DEBUG) continue empty = html.find('No torrents here') if empty: logger.log( u"Data returned from provider does not contain any torrents", logger.DEBUG) continue tables = html.find('table', attrs={'class': 'mainblockcontenttt'}) if not tables: logger.log( u"Could not find table of torrents mainblockcontenttt", logger.ERROR) continue torrents = tables.findChildren('tr') if not torrents: continue # Skip column headers for result in torrents[1:]: try: cells = result.findChildren( 'td', attrs={ 'class': re.compile( r'(green|yellow|red|mainblockcontent)') }) if not cells: continue title = download_url = seeders = leechers = None size = 0 for cell in cells: try: if None is title and cell.get( 'title') and cell.get( 'title') in 'Download': title = re.search( 'f=(.*).torrent', cell.a['href']).group(1).replace( '+', '.') download_url = self.urls['home'] % cell.a[ 'href'] if None is seeders and cell.get( 'class')[0] and cell.get('class')[ 0] in 'green' 'yellow' 'red': seeders = int(cell.text) if not seeders: seeders = 1 elif None is leechers and cell.get( 'class')[0] and cell.get('class')[ 0] in 'green' 'yellow' 'red': leechers = int(cell.text) if not leechers: seeders = 0 # Need size for failed downloads handling if re.match( r'[0-9]+,?\.?[0-9]* [KkMmGg]+[Bb]+', cells[7].text): size = self._convertSize(cells[7].text) if not size: size = -1 except: logger.log( u"Failed parsing provider. Traceback: %s" % traceback.format_exc(), logger.ERROR) if not all([title, download_url]): continue #Filter unseeded torrent if seeders < self.minseed or leechers < self.minleech: if mode != 'RSS': logger.log( u"Discarding torrent because it doesn't meet the minimum seeders or leechers: {0} (S:{1} L:{2})" .format(title, seeders, leechers), logger.DEBUG) continue item = title, download_url, size, seeders, leechers if mode != 'RSS': logger.log(u"Found result: %s " % title, logger.DEBUG) items[mode].append(item) except (AttributeError, TypeError, KeyError, ValueError): continue #For each search mode sort all the items by seeders if available items[mode].sort(key=lambda tup: tup[3], reverse=True) results += items[mode] return results
def _doSearch(self, search_params, search_mode='eponly', epcount=0, age=0, epObj=None): results = [] if not self._doLogin(): return results for search_string in search_params if search_params else '': if isinstance(search_string, unicode): search_string = unidecode(search_string) searchURL = self.urls['search'] % (urllib.quote_plus(search_string.replace('.', ' ')), self.categories) logger.log(u"Search string: " + searchURL, logger.DEBUG) data = self.getURL(searchURL) if not data: logger.log(u'No grabs for you', logger.DEBUG) continue html = soup(data) if not html: continue empty = html.find('No torrents here') if empty: logger.log(u"Could not find any torrents", logger.ERROR) continue tables = html.find('table', attrs={'class': 'mainblockcontenttt'}) if not tables: logger.log(u"Could not find table of torrents mainblockcontenttt", logger.ERROR) continue torrents = tables.findChildren('tr') if not torrents: continue # Skip column headers for result in torrents[1:]: try: cells = result.findChildren('td', attrs={'class': re.compile(r'(green|yellow|red|mainblockcontent)')}) if not cells: continue title = url = seeders = leechers = None size = 0 for cell in cells: try: if None is title and cell.get('title') and cell.get('title') in 'Download': title = re.search('f=(.*).torrent', cell.a['href']).group(1).replace('+', '.') url = self.urls['home'] % cell.a['href'] if None is seeders and cell.get('class')[0] and cell.get('class')[0] in 'green' 'yellow' 'red': seeders = int(cell.text) elif None is leechers and cell.get('class')[0] and cell.get('class')[0] in 'green' 'yellow' 'red': leechers = int(cell.text) # Skip torrents released before the episode aired (fakes) if re.match('..:..:.. ..:..:....', cells[6].text): if (datetime.strptime(cells[6].text, '%H:%M:%S %m/%d/%Y') - datetime.combine(epObj.airdate, datetime.min.time())).days < 0: continue # Need size for failed downloads handling if re.match('[0-9]+,?\.?[0-9]* [KkMmGg]+[Bb]+', cells[7].text): size = self._convertSize(cells[7].text) if not title or not url or not seeders or leechers is None or not size or \ seeders < self.minseed or leechers < self.minleech: continue item = title, url, seeders, leechers, size logger.log(u"Found result: " + title + " (" + searchURL + ")", logger.DEBUG) results.append(item) except: raise except (AttributeError, TypeError, KeyError, ValueError): continue results.sort(key=lambda tup: tup[3], reverse=True) return results
# nope, doesn't work:
# coding: utf-8

# but this does:
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from BeautifulSoup import BeautifulSoup as soup, Comment, Tag
import urllib

if 1:
    url = 'http://eracks.com/customers'
    print url

    s = urllib.urlopen(url).read()
    s = soup(s)
    cont = s.body

    # extract text and a few select tags:
    snip = ''
    for e in cont.recursiveChildGenerator():
        if isinstance(e, unicode) or \
           isinstance(e, Tag) and (str(e).startswith(u'<a ') or
                                   str(e).startswith(u'<b>') or
                                   str(e).startswith(u'<img')):
            s = unicode(e)
            if s.strip() or len(e):
                snip += s  # unicode(e)  # .extract() nope, confuses the generator
                e.contents = []
def main(): urls = load_urls() state = load_state() i = 0 while i < len(urls): url = urls[i] if isinstance(url, basestring): url = { 'url': url, 'type': 'html', 'referer': url, } urls[i] = url i += 1 try: body,hash = get(url) except Exception as e: continue if url['url'] not in state: state[url['url']] = { 'previous_hash': None, 'permutations': 0, } if hash != state[url['url']]['previous_hash']: state[url['url']]['previous_hash'] = hash state[url['url']]['permutations'] = state[url['url']]['permutations'] + 1 print "Permutation %s: %s was updated on %s" % (state[url['url']]['permutations'],url['url'], time.ctime()) store_body(url,body) if url['type'] == 'img': send_mail(config.email,'Updated image url %s' % url['url'],'Not sending the image, lolz') continue else: send_mail(config.email,'Updated site url %s' % url['url'],body) try: s = soup(body) for anchor in s('a'): try: new_url = { 'url': get_url(url['url'],anchor['href']), 'type':'html', 'referer':url['referer'] } if new_url not in urls: print "Permutation %s: Found URL in %s. New URL: %s" % (state[url['url']]['permutations'],url['url'], new_url['url']) urls.append(new_url) send_mail(config.email,'Found new website %s' % new_url['url'],'Hoi') # Send e-mail except Exception as e: print e for img in s('img'): try: new_url = { 'url': get_url(url['url'],img['src']), 'type':'img', 'referer':url['referer'] } if new_url not in urls: print "Permutation %s: Found URL in %s. New URL: %s" % (state[url['url']]['permutations'],url['url'], new_url['url']) urls.append(new_url) send_mail(config.email,'Found new image %s' % new_url['url'],'Hoi') except Exception as e: print e except Exception as e: continue store_urls(urls) store_state(state)
import urllib2
from BeautifulSoup import BeautifulSoup as soup

teams = {'csk': "Chennai Super Kings", "mi": "Mumbai Indians", "kxip": "Kings XI Punjab",
         "kkr": "Kolkata Knight Riders", "rcb": "Royal Challengers Bangalore",
         "srh": "Sunrisers Hyderabad", "rr": "Rajasthan Royals", "dd": "Delhi Daredevils"}
teamsAbbr = {"Chennai Super Kings": "csk", "Mumbai Indians": "mi", "Kings XI Punjab": "kxip",
             "Kolkata Knight Riders": "kkr", "Royal Challengers Bangalore": "rcb",
             "Sunrisers Hyderabad": "srh", "Rajasthan Royals": "rr", "Delhi Daredevils": "dd"}

page = urllib2.urlopen(
    "http://www.cricbuzz.com/cricket-series/2676/indian-premier-league-2018/matches").read()
soupedPage = soup(page)
teamsPage = soupedPage.findAll("div", {"class": "cb-col-75 cb-col"})
#datePage = soupedPage.findAll("div", {"class": '"cb-col-25 cb-col pad10"'})

MatchesDataInOrder = []
for i in range(len(teamsPage)):
    AvsB = teamsPage[i].find("span")
    WinningTeam = teamsPage[i].find("a", {"class": "cb-text-link"})
    team1, team2 = map(teamsAbbr.get,
                       map(str.strip, map(str, AvsB.contents[0].split(",")[0].split(" vs "))))
    if WinningTeam is not None:
        winner = teamsAbbr[str(WinningTeam.contents[0].split("won")[0]).strip()]
    else:
        winner = "NP"
    MatchesDataInOrder.append([team1, team2, winner])

#for i in MatchesDataInOrder:
#    print i

pointsPerTeam = {}
for key in teams.keys():
def prettyhtml(value, autoescape=None):
    value = str(soup(value))
    if autoescape and not isinstance(value, SafeData):
        from django.utils.html import escape
        value = escape(value)
    return mark_safe(value)
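# prettyhtml() reads like a Django template filter (it takes `autoescape` and
# returns mark_safe(...)), so it would normally be registered roughly as below.
# The Library instance and registration call are assumptions, not taken from
# the snippet's own module.
from django import template

register = template.Library()
# register.filter('prettyhtml', prettyhtml, needs_autoescape=True)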
def get_source(link):
    r = requests.get(link)
    if r.status_code == 200:
        return soup(r.text)
    else:
        sys.exit("[~] Invalid Response Received.")
import scraperwiki
from BeautifulSoup import BeautifulSoup as soup

urls = (
    ('event', 'http://www.leedsdigitalfestival.com/'),
    ('about', 'http://www.leedsdigitalfestival.com/about/'),
    ('news', 'http://www.leedsdigitalfestival.com/news/'),
    ('video', 'http://www.leedsdigitalfestival.com/video/'),
    ('location', 'http://www.leedsdigitalfestival.com/locations/'),
    ('resource', 'http://www.leedsdigitalfestival.com/resources/'),
)

data = []
for content_type, url in urls:
    s = soup(scraperwiki.scrape(url))
    page = s.find('div', 'page')
    for block in page.findAll('div', 'block'):
        if 'small_footer' in block['class'].split(' '):
            continue
        title = block.find('div', {'class': 'post-title'})
        title = title and title.h2.text
        ps = block.findAll('p')
        dt = ''
        venue = ''
        by = ''
        if hasattr(ps[0], 'span') and ps[0].span:
            dt = ps[0].span.text
            _venue = ps[0].text.replace(dt, '')
            if _venue.startswith('by '):
                by = _venue
            else:
def setUp(self):
    self.base_url = u'http://www.mddep.gouv.qc.ca/sol/terrains/terrains-contamines/recherche.asp'
    self.test_text = open('tests/fixtures/testfile.html', 'r').readlines()
    self.parsed_file = soup(StringIO(self.test_text))
def scrape(host, number, paths, emails, wait, first): #setup signals and verbose flag signal(SIGINT, sig_handle) global verbose #check if the number of emails we requested has been met if len(emails) >= number: return True #get host site path hosturl = urlparse.urlparse(host) if hosturl.path in paths: return True paths.append(hosturl.path) #get page content sleep(wait) try: content = soup(urlopen(host).read()) if verbose: print("Link read: {}".format(host)) except Exception as e: if isinstance(e, KeyboardInterrupt): #write the file and quit if SigInt sig_handle(1, 2) else: if verbose: print("Link read failed: {}".format(host)) return False #grab all email addresses from the page and then add it to the list pageemails = mailreg.findall(content.prettify()) for pemail in pageemails: if not pemail in emails: print(pemail) emails.append(pemail) #scrape for URLs on the site links = [] for x in content('a'): #find all anchors try: links.append(x['href']) except: pass for link in links: plink = urlparse.urlparse(link) if plink.path is u'': #if there is no path then just skip it continue if (hosturl.netloc in plink.netloc or plink.netloc is "") and plink.path not in paths: if plink.path[0] == '/': #absolute path url = slashreg.sub("/", "{}/{}".format(hosturl.netloc, plink.path)) elif first is True: #first page check url = slashreg.sub( "/", "{}/{}/{}".format(hosturl.netloc, hosturl.path, plink.path)) url = "{}://{}".format(hosturl.scheme, url) if (scrape(url, number, paths, emails, wait, False)): continue else: #we are at a full path, subtract the file name and start from the path url = slashreg.sub( "/", "{}/{}/{}".format( hosturl.netloc, "/".join(hosturl.path.split("/")[:-1]), plink.path)) else: #all other pages, subtract filename and go from the path url = slashreg.sub( "/", "{}/{}/{}".format(hosturl.netloc, "/".join(hosturl.path.split("/")[:-1]), plink.path)) url = "{}://{}".format(hosturl.scheme, url) scrape(url, number, paths, emails, wait, False) #recursion! return True
def get_post_params(post): post_updated = dateutils.from_iso_format(post['updated']) post_published = dateutils.from_iso_format(post['published']) post_id = post['id'] permalink = post['url'] item = post['object'] if post['verb'] == 'post': content = [item['content']] elif post['verb'] == 'share': content = [post.get('annotation', 'Shared:')] if 'actor' in item: content.append('<br/><br/>') if 'url' in item['actor'] and 'displayName' in item['actor']: content.append( '<a href="%s">%s</a>' % (item['actor']['url'], item['actor']['displayName'])) content.append(' originally shared this post: ') elif 'displayName' in item['actor']: content.append(item['actor']['displayName']) content.append(' originally shared this post: ') content.append('<br/><blockquote>') content.append(item['content']) content.append('</blockquote>') elif post['verb'] == 'checkin': content = [item['content']] place = post.get('placeName', '') if place: if item['content']: # Add some spacing if there's actually a comment content.append('<br/><br/>') content.append('Checked in at %s' % place) else: content = [] if 'attachments' in item: # attached content for attach in item['attachments']: content.append('<br/><br/>') if attach['objectType'] == 'article': # Attached link content.append('<a href="%s">%s</a>' % (attach['url'], attach.get('displayName', 'attached link'))) elif attach['objectType'] == 'photo': # Attached image content.append('<img src="%s" alt="%s" />' % (attach['image']['url'], attach['image'].get( 'displayName', 'attached image')) ) # G+ doesn't always supply alt text... elif attach['objectType'] == 'photo-album' or attach[ 'objectType'] == 'album': # Attached photo album link content.append('Album: <a href="%s">%s</a>' % (attach['url'], attach.get('displayName', 'attached album'))) elif attach['objectType'] == 'video': # Attached video content.append('Video: <a href="%s">%s</a>' % (attach['url'], attach.get('displayName', 'attached video'))) else: # Unrecognized attachment type content.append('[unsupported post attachment of type "%s"]' % attach['objectType']) # If no actual parseable content was found, just link to the post post_content = u''.join(content) or permalink # Generate the post title out of just text [max: 100 characters] post_title = u' '.join(x.string for x in soup(post_content).findAll(text=True)) post_title = space_compress_regex.sub(' ', post_title) if len(post_title) > 100: if post_title == permalink: post_title = u"A public G+ post" else: candidate_title = post_title[:97] if '&' in candidate_title[ -5:]: # Don't risk cutting off HTML entities candidate_title = candidate_title.rsplit('&', 1)[0] if ' ' in candidate_title[ -5:]: # Reasonably avoid cutting off words candidate_title = candidate_title.rsplit(' ', 1)[0] post_title = u"%s..." % candidate_title return { 'title': post_title, 'permalink': xhtml_escape(permalink), 'postatomdate': dateutils.to_atom_format(post_updated), 'postatompubdate': dateutils.to_atom_format(post_published), 'postdate': post_published.strftime('%Y-%m-%d'), 'id': xhtml_escape(post_id), 'summary': xhtml_escape(post_content), }