def GetBlogHistory(url, direction):
    global totalnum
    global currentnum
    # Pause for 30 seconds every 16 page fetches (threshold 15, reset to 0)
    # to avoid being blocked by the site for too-frequent access.
    if currentnum > 15:
        print "sleep 30 seconds before continuing retrieving new instances"
        time.sleep(30)
        currentnum = 0
    currentnum += 1
    try:
        driver.get(url)
    except:
        if printtimeoutexcept == 1:
            print "retry for timeout(20sec) exception, url: " + url
        GetBlogHistory(url, direction)
        return  # do not fall through and re-process the failed fetch
    content = driver.page_source
    soup = BeautifulSoup.BeautifulSOAP(content)
    if direction == 0 or direction == -1:
        llist = soup.find("div", attrs={"class": "pleft thide"})
        if llist is not None:
            lurl = dict(llist.contents[0].attrs)['href']
            if printblognum == 1:
                totalnum += 1
                print "%d. %s %s" % (totalnum, lurl,
                                     html_parser.unescape(dict(llist.attrs)['a']))
            else:
                print lurl + ' ' + html_parser.unescape(dict(llist.attrs)['a'])
            GetBlogHistory(lurl, -1)
    if direction == 0 or direction == 1:
        rlist = soup.find("div", attrs={"class": "pright thide"})
        if rlist is not None:
            rurl = dict(rlist.contents[0].attrs)['href']
            if printblognum == 1:
                totalnum += 1
                print "%d. %s %s" % (totalnum, rurl,
                                     html_parser.unescape(dict(rlist.attrs)['a']))
            else:
                print rurl + ' ' + html_parser.unescape(dict(rlist.attrs)['a'])
            GetBlogHistory(rurl, 1)
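# A minimal sketch of the module-level setup GetBlogHistory assumes. The
# global names (driver, totalnum, currentnum, printtimeoutexcept,
# printblognum, html_parser) are taken from the function body above; the
# Firefox driver and the 20-second timeout are assumptions inferred from the
# "timeout(20sec)" message, not confirmed by the source.
import time
import HTMLParser
import BeautifulSoup
from selenium import webdriver

driver = webdriver.Firefox()
driver.set_page_load_timeout(20)
html_parser = HTMLParser.HTMLParser()
totalnum = 0            # running count of blog posts printed
currentnum = 0          # fetches since the last 30-second pause
printtimeoutexcept = 1  # 1 = log timeout retries
printblognum = 1        # 1 = number each printed post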
def rpm_list(urllink):
    # Collect every <a> link from a directory-style index page. In
    # BeautifulSoup 3, tag.attrs is a list of (name, value) tuples, so
    # attrs[0][1] is the value of the tag's first attribute -- this assumes
    # href is always listed first on these pages.
    download_list = []
    html_doc = urllib2.urlopen(urllink)
    html_bs = bs.BeautifulSOAP(html_doc)
    for i in html_bs.findAll('a'):
        download_list.append(i.attrs[0][1])
    return download_list[1:]  # skip the first link (the parent-directory entry)
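# A hedged usage sketch for rpm_list: the mirror URL is purely illustrative,
# and the imports assume the module uses "import urllib2" and
# "import BeautifulSoup as bs" as the function body implies.
if __name__ == '__main__':
    for rpm in rpm_list('http://mirror.example.com/centos/6/os/x86_64/Packages/'):
        print rpm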
def loadConfig(self, configPath):
    "Initialize the state of the fairy from XML config files"
    self.configPath = os.path.join(
        os.path.dirname(os.path.abspath(sys.argv[0])), configPath)
    printAndLog("Merge Fairy loading configuration file " + self.configPath)
    soup = BeautifulSoup.BeautifulSOAP(open(self.configPath))
    dom = soup.first("fairy")
    for attr in ["urlbase", "pathbase", "buildcmd"]:
        self.__dict__[attr] = dom[attr]
    self.pollinterval = int(float(dom["pollinterval"]))
    self.branches = []
    self.branchMap = {}  # map between URLs and branch objects
    for branchNode in dom.fetch("branch"):
        branch = SvnBranch(self.buildcmd, urlbase=self.urlbase,
                           pathbase=self.pathbase).initFromXml(branchNode)
        self.branches.append(branch)
        self.branchMap[branch.url] = branch
    self.branches.sort(key=lambda b: b.url)
    self.dependencyMap = DependencyMap()
    for dependencyNode in dom.fetch("dependency"):
        dependency = Dependency().initFromXml(dependencyNode, self)
        self.dependencyMap.addDependency(dependency)
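# A hedged sketch of the config file shape loadConfig expects, reconstructed
# from the reads above. The tag and attribute names (fairy, urlbase, pathbase,
# buildcmd, pollinterval, branch, dependency) come from the code; the values
# and the branch/dependency attributes are purely illustrative, since those
# elements are consumed by SvnBranch.initFromXml and Dependency.initFromXml,
# which are not shown here.
#
# <fairy urlbase="http://svn.example.com/repos"
#        pathbase="/var/fairy/checkouts"
#        buildcmd="make test"
#        pollinterval="60">
#   <branch ... />
#   <dependency ... />
# </fairy>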
def _reddit(self, args):
    """Usage: `{cmd_prefix}reddit [*subreddits]`"""
    output = []
    args = args if args else ['']
    for arg in args:
        if arg:
            site = 'http://www.reddit.com/r/{}'.format(arg)
            logger.log((site, ), (None, ))
        else:
            site = 'http://www.reddit.com/'
        bs = BeautifulSoup.BeautifulSOAP(
            urlgrabber.urlread(site, size=2097152 * 10))
        output.extend(bs.findAll('a', 'title'))
    return '\n'.join('{}: {} {}'.format(i + 1, o.string, o.get('href'))
                     for i, o in enumerate(output[:5]))
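# A hedged usage sketch: _reddit reads like a bot command handler, so it is
# assumed to live on a bot class whose dispatcher passes the user's words as
# a list. The class name Bot is hypothetical and not from the source.
bot = Bot()
print bot._reddit(['python', 'programming'])  # top link titles from two subreddits
print bot._reddit([])                         # empty args fall back to the front page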
def parse_resp(content):
    # Pull the action URL, HTTP method, and input name/value pairs out of
    # the first <form> in the response body.
    params = []
    soup = BeautifulSoup.BeautifulSOAP(content)
    form = soup.find('form')
    form_attr_dict = dict(form.attrs)
    for e in form.findAll('input'):
        e_attr_dict = dict(e.attrs)
        params.append((e_attr_dict['name'], e_attr_dict['value']))
    return form_attr_dict['action'], form_attr_dict['method'].upper(), dict(params)
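# A hedged sketch of replaying the form parse_resp extracts, as one might for
# a login or redirect page. The URL is illustrative, and "import urllib" /
# "import urllib2" are assumed, consistent with the urllib2 usage elsewhere
# in these snippets.
content = urllib2.urlopen('http://auth.example.com/login').read()
action, method, params = parse_resp(content)
if method == 'POST':
    resp = urllib2.urlopen(action, urllib.urlencode(params))
else:
    resp = urllib2.urlopen(action + '?' + urllib.urlencode(params))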
def parse_full(xml):
    '''Given the XML returned from a comp search, try to interpret the
    returned data from a search query.
    '''
    soup = BeautifulSoup.BeautifulSOAP(xml)
    data = soup.find('data')
    if not data:
        raise ParseException('unable to find data element')
    totals = data.find('totals')
    results = data.find('results')
    if not totals:
        raise ParseException('unable to find totals element')
    if not results:
        raise ParseException('unable to find results element')
    # totals
    by_category = {}
    for tab in totals.findAll('tab'):
        table_name = tab.find('table').text
        count = int(tab.find('total').text)
        by_category[table_name] = count
    # results. Since we're not exactly sure what's coming back (other than by
    # inspecting the category the query was made with), be defensive and skip
    # anything that isn't a tag.
    result_rows = []
    for child in results.childGenerator():
        if not isinstance(child, BeautifulSoup.Tag):
            continue
        result_rows.append(_soupdict(child))
    return {
        'totals': by_category,
        'rows': result_rows,
    }
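# parse_full calls a _soupdict helper that is not shown in the source. A
# minimal sketch of what it plausibly does, assuming each result row is a tag
# whose children are <field>value</field> pairs (the same childGenerator /
# .text BeautifulSoup 3 idioms used above):
def _soupdict(tag):
    d = {}
    for child in tag.childGenerator():
        if not isinstance(child, BeautifulSoup.Tag):
            continue
        d[child.name] = child.text
    return d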
def GetBlogHistory(url, direction):
    # Same crawl as the version above, minus the 30-second throttling counter.
    global totalnum
    try:
        driver.get(url)
    except:
        if printtimeoutexcept == 1:
            print "Retry for timeout(20sec) exception, url: " + url
        GetBlogHistory(url, direction)
        return  # do not fall through and re-process the failed fetch
    content = driver.page_source
    soup = BeautifulSoup.BeautifulSOAP(content)
    if direction == 0 or direction == -1:
        llist = soup.find("div", attrs={"class": "pleft thide"})
        if llist is not None:
            lurl = dict(llist.contents[0].attrs)['href']
            if printblognum == 1:
                totalnum += 1
                print "%d. %s %s" % (totalnum, lurl,
                                     html_parser.unescape(dict(llist.attrs)['a']))
            else:
                print lurl + ' ' + html_parser.unescape(dict(llist.attrs)['a'])
            GetBlogHistory(lurl, -1)
    if direction == 0 or direction == 1:
        rlist = soup.find("div", attrs={"class": "pright thide"})
        if rlist is not None:
            rurl = dict(rlist.contents[0].attrs)['href']
            if printblognum == 1:
                totalnum += 1
                print "%d. %s %s" % (totalnum, rurl,
                                     html_parser.unescape(dict(rlist.attrs)['a']))
            else:
                print rurl + ' ' + html_parser.unescape(dict(rlist.attrs)['a'])
            GetBlogHistory(rurl, 1)