def get_onepage_poclist(page):
    """Fetch one listing page of http://beebeeto.com/pdb and return PoC links.

    Args:
        page: 1-based page number of the pdb listing.

    Returns:
        A list of href strings containing "poc-", or the original sentinel
        '' / "" when the page is empty, unreachable, or past the last page.
    """
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''
    bt = BeautifulSoup(info)
    end = bt.find('a', {'style': "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"})
    # Bug fix: bt.find() returns None when the styled pager anchor is absent;
    # the original crashed on end.renderContents(). Treat that as "no data".
    if end is None:
        return ''
    # The pager highlights "1" once we have paged past the last real page.
    if '1' == end.renderContents() and page != 1:
        return ''
    ret = bt.find('div', {'class': 'mainlist'})
    # Bug fix: same None-dereference hazard for the main listing container.
    if ret is None:
        return ""
    ret = ret.renderContents()
    if ret == "":
        return ""
    # Collect anchors that point at PoC entries and strip the tag syntax.
    retlist = []
    for one in re.findall('<a href=.*?>', ret):
        if "poc-" in one:
            retlist.append(one.replace('<a href="', "").replace('">', "").strip())
    return retlist
def get_onepage_poclist(page):
    """Return the list of "poc-" hrefs found on one pdb listing page.

    NOTE(review): this duplicates an identical definition earlier in the
    file; only the last one bound wins at import time — consider removing one.

    Args:
        page: 1-based listing page number.

    Returns:
        List of href strings, or '' / "" (original sentinels) on failure,
        empty page, or when paging past the end.
    """
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''
    bt = BeautifulSoup(info)
    end = bt.find(
        'a', {
            'style': "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"
        })
    # Bug fix: guard against find() returning None before calling
    # renderContents(); the original raised AttributeError on odd HTML.
    if end is None:
        return ''
    # The active pager link reads "1" once we are past the last page.
    if '1' == end.renderContents() and page != 1:
        return ''
    ret = bt.find('div', {'class': 'mainlist'})
    if ret is None:  # Bug fix: missing listing container -> no results.
        return ""
    ret = ret.renderContents()
    if ret == "":
        return ""
    retlist = []
    # Keep only PoC anchors; peel off the surrounding tag markup.
    for one in re.findall('<a href=.*?>', ret):
        if "poc-" in one:
            one = one.replace('<a href="', "")
            one = one.replace('">', "")
            retlist.append(one.strip())
    return retlist
def parse_locations_from_preferences_body(self, response_body):
    """Extract the driver's location profiles from the preferences page HTML.

    Args:
        response_body: raw HTML of the preferences page.

    Returns:
        A list of LocationProfile objects; the one whose radio button is
        pre-checked has is_default set to True.

    Raises:
        ScreenscrapeParseError: when the expected profiles <tbody> is missing.
    """
    doc = BeautifulSoup(response_body)
    body = doc.find('tbody', {'id': 'dpref_driver_pk__preferences_pk__driver_locations_pk__profiles'})
    if body is None:
        raise ScreenscrapeParseError('No tbody found: %r' % response_body)
    profiles = []
    for row in body.findAll('tr'):
        name_cell = row.findAll('td', {'class': 'profile_name'})[0]
        desc_cell = row.findAll('td', {'class': 'profile_descr'})[0]
        radio = row.findAll('input', {'class': 'profile_default'})[0]
        profile = LocationProfile(name_cell.text, radio['value'], desc_cell.text)
        # The pre-selected radio marks the account's default profile.
        profile.is_default = (radio.get('checked', None) == 'checked')
        profiles.append(profile)
    return profiles
def getPoc(poc):
    """Download the PoC source for *poc* from beebeeto.com/pdb.

    Args:
        poc: PoC identifier used as the URL path segment (e.g. "poc-...").

    Returns:
        The PoC code with HTML tags stripped, or '' when the page is empty,
        image-only, paywalled, or parsing fails.
    """
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    # A placeholder image means the PoC body was not published as text.
    if '<img src="/static/img/test.jpg"' in info:
        return ''
    # "立即查看" ("view now") marks entries with no inline code available.
    if "立即查看" in info:
        return ''
    try:
        bt = BeautifulSoup(info)
        ret = bt.find('pre', {'class': "brush: python;"})
        ret = ret.renderContents()
        if ret:
            return strip_tags(ret)
        else:
            return ''
    # Bug fix: the original bare `except:` also swallowed SystemExit and
    # KeyboardInterrupt; catch only ordinary errors (e.g. None.renderContents
    # when the <pre> block is missing).
    except Exception:
        return ''
def getPoc(poc):
    """Fetch and return the stripped PoC source for *poc*, or '' on failure.

    NOTE(review): duplicates an identical getPoc defined earlier in the file;
    only the last binding survives — consider deleting one copy.

    Args:
        poc: PoC identifier appended to http://beebeeto.com/pdb/.

    Returns:
        Tag-stripped PoC text, or '' for empty/image-only/paywalled pages
        and on any parsing error.
    """
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    # Placeholder image: no textual PoC was published.
    if '<img src="/static/img/test.jpg"' in info:
        return ''
    # "立即查看" ("view now") indicates a paywalled entry.
    if "立即查看" in info:
        return ''
    try:
        bt = BeautifulSoup(info)
        ret = bt.find('pre', {'class': "brush: python;"})
        ret = ret.renderContents()
        if ret:
            return strip_tags(ret)
        else:
            return ''
    # Bug fix: replace the bare `except:` (which also caught SystemExit and
    # KeyboardInterrupt) with the narrower Exception.
    except Exception:
        return ''
def getstatus(code, count=None):
    """Scrape tracking events for parcel *code* and format them as text lines.

    Args:
        code: tracking code interpolated into the module-level `baseurl`.
        count: optional cap on the number of events returned.

    Returns:
        A list of "AGE ago - STATUS - LOCATION" strings, in page order.
    """
    url = baseurl % code
    f = urllib2.urlopen(url)
    try:
        d = f.read()
    finally:
        # Bug fix: the original leaked the connection when read() raised;
        # close unconditionally.
        f.close()
    bs = BeautifulSoup(d)
    res = []
    statuslist = (
        bs.find("div", {"class": "result_up"})
        .find("table", {"width": "500"})
        .findAll("p", {"class": "resulttext"})
    )
    for status in statuslist:
        # Each result paragraph holds exactly three children:
        # date text, status tag, location text.
        date, statustext, location = status.contents
        statustext = statustext.string
        # Finnish timestamp format ("klo" = "at"), e.g. "31.12.2014, klo 23:59 ".
        date = time.strptime(date, "%d.%m.%Y, klo %H:%M ")
        # Drop the fixed 6-character label prefix before the location text
        # — TODO(review): confirm the label the page actually emits.
        location = location[6:].strip()
        dt = datetime.datetime(*date[0:6])
        age = datetime.datetime.now() - dt
        # Build a compact age string such as "2d 3h 14m".
        agestr = []
        if age.days > 0:
            agestr.append("%dd" % age.days)
        secs = age.seconds
        # Unused `seconds` local removed; output never showed seconds.
        hours, minutes = secs // 3600, secs // 60 % 60
        if hours > 0:
            agestr.append("%dh" % hours)
        if minutes > 0:
            agestr.append("%dm" % minutes)
        res.append("%s - %s - %s" % (" ".join(agestr) + " ago", statustext, location))
    if count:
        return res[:count]
    else:
        return res
def buildStoryFromString(data, stories):
    """Look up *data* in *stories*; if absent, build a Story from a Google search.

    Scrapes the first plausible external result link from a Google search for
    *data*, builds a Story entity from it, and persists it via story.put().
    Returns None (the dead code in the trailing string suggests a return was
    once intended — TODO(review): confirm callers do not expect a value).
    """
    story=findStory(data, stories)
    if not story:
        url="http://www.google.com/search?q="+data.replace(' ', '+')
        logging.info(url)
        try:
            raw_data = http.getHttp(url)
            soup = BeautifulSoup(raw_data)
            story=None
            # Old-BeautifulSoup positional attrs: tag.attrs[0] is the first
            # (name, value) pair; this accepts anchors whose FIRST attribute
            # is an absolute, non-Google href — a result link rather than
            # Google chrome.
            a=soup.find(lambda tag: tag.name=='a' and tag.attrs[0][0]=='href' and not tag.attrs[0][1].startswith('/') and not 'google' in tag.attrs[0][1])
            if a and a.text:
                story=Story()
                story.deleteFlag=False
                story.mykey=data
                story.title=''
                # Flatten mixed Tag/NavigableString children into plain text.
                for c in a.contents:
                    if type(c) == Tag:
                        story.title+=c.text
                    else:
                        story.title+=c
                story.link=a.attrs[0][1]
                story.text=''
                # contents[4] is presumably the result-snippet node in
                # Google's markup of the time — TODO(review): fragile index.
                for c in a.parent.contents[4].contents:
                    if type(c) == Tag:
                        story.text+=c.text
                    else:
                        story.text+=c
                story.put()
        except DownloadError: #@UndefinedVariable
            logging.error(url + ' failed to load')
    '''
    scraper=SearchScraper()
    scraper.feed(raw_data)
    return scraper.story
    '''
# NOTE(review): fragment of a larger function — `url`, `payload`, `keywords`,
# `request`, `new_keyword`, and `fetch` are defined outside this view.
# Look up the URL with its trailing character stripped (presumably a
# trailing slash — TODO confirm); if known, serve metadata from the DB.
dburl = TallstreetUrls.get_url(url[0:-1])
if dburl:
    url = url[0:-1]
if dburl:
    # Known URL: populate the payload straight from stored metadata.
    payload["url"] = dburl.url
    payload["title"] = dburl.title
    payload["description"] = dburl.description
    payload["new"] = False
    # Tag weight scales with invested money, clamped to the range [10, 30].
    for keyword in dburl.related_keywords:
        payload["tags"][keyword.tag.tag] = min(keyword.money / 1000 + 10, 30)
else:
    # Unknown URL: fetch the page and scrape title/description from the HTML.
    page = fetch(url)
    soup = BeautifulSoup(page.content)
    payload["title"] = soup.html.head.title.string
    desc = soup.find("meta", {"name": "description"})
    if desc:
        payload["description"] = desc["content"]
    payload["url"] = url
    payload["new"] = True
if keywords == []:
    # No keywords supplied: pre-fill from the user's existing investments,
    # removing each invested tag from the suggested-tags payload.
    invested = TallstreetPortfolio.get_keywords(request.user, dburl)
    for keyword in invested:
        if payload["tags"].has_key(keyword.keyword.tag):
            del payload["tags"][keyword.keyword.tag]
        # Clear new_keyword if it is already invested — presumably consumed
        # later in the enclosing function; TODO(review) confirm.
        if keyword.keyword.tag == new_keyword:
            new_keyword = ""
        keywords.append({"keyword": keyword.keyword.tag, "amount": keyword.money, 'edit': False})
        # Invested money is already committed, so subtract it from the
        # spendable balance shown to the user.
        payload["base_balance"] -= keyword.money