# Scrape person names from one page and append them to the module-level
# `all_name` accumulator. Assumes a module-level `headers` dict.
import requests as rq
from bs4 import BeautifulSoup as _s

def crawl(ul, t_name):
    global all_name
    res = rq.get(ul, headers=headers)
    soup = _s(res.text, 'lxml')
    name_tags = soup.findAll('h4', class_='name')
    names = '\n'.join([a.text.strip() for a in name_tags])
    print(ul, t_name)
    all_name = all_name + names
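
# Usage sketch; the URL, team name, and headers below are hypothetical, and
# the module-level accumulator must exist before the first call:
headers = {'User-Agent': 'Mozilla/5.0'}
all_name = ''
crawl('https://example.com/teams/roster', 'Example Team')
print(all_name)
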
# Fetch the seed page and return the set of absolute URLs it links to.
# Assumes `import requests`, `from urllib.parse import urljoin`, and `_s`
# (BeautifulSoup) at module level; this is a method of a crawler class.
def crawl_init(self, page):
    print('getting initial page')
    newpages = set()
    c = requests.get(page)
    soup = _s(c.text, features="html.parser")
    links = soup.findAll('a')
    for link in links:
        if 'href' in dict(link.attrs):
            # Resolve relative links against the page URL.
            url = urljoin(page, link['href'])
            newpages.add(url)
    return newpages
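
# A minimal driver sketch, assuming both methods live on a crawler class
# (the class name `Crawler`, its constructor, and the seed URL are
# hypothetical). `crawl_init` seeds the frontier; `crawl` (below) processes
# each page and returns the links it queued:
crawler = Crawler()
frontier = crawler.crawl_init('https://bkrs.info/taolun/')
for url in frontier:
    newlinks = crawler.crawl(url) or set()
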
# Crawl one page: index it, record it in the `urlcheck` table, and queue
# outbound links that pass the site filters. Assumes `import requests`,
# `import sqlite3 as sqlite`, `from urllib.parse import urljoin`, and `_s`
# (BeautifulSoup) at module level.
def crawl(self, page):
    newpages = set()
    try:
        c = requests.get(page)
        print('soup for crawling %s' % page)
    except requests.exceptions.RequestException:
        print('cannot open %s' % page)
        return
    soup = _s(c.text, features="html.parser")
    self.addtoindex(page, soup)
    links = soup.findAll('a')
    print('got %s links from crawled url from %s' % (len(links), page))
    q = (page, "", "Y")
    try:
        self.con.execute('insert into urlcheck values (?, ?, ?)', q)
        print("put %s into urlcheck" % page)
    except sqlite.IntegrityError:
        # The page is already in urlcheck: skip it if it was crawled before,
        # otherwise mark it as indexed now. Parameterized queries avoid the
        # quoting problems of interpolating URLs into SQL strings.
        isindexed = self.con.execute(
            'select indexed from urlcheck where url = ?', (page,)).fetchone()
        if isindexed[0] == 'Y':
            print('%s is already crawled' % page)
            return
        else:
            self.con.execute('update urlcheck set indexed = "Y" where url = ?', (page,))
            print("change %s n to y" % page)
    for link in links:
        if 'href' in dict(link.attrs):
            url = urljoin(page, link['href'])
            if url.find("'") != -1:
                continue
            url = url.split('#')[0]  # drop fragment identifiers
            # if url[0:4] == 'http' and 'bkrs.info' in url and not self.isindexed(url):
            isindexed = self.con.execute(
                'select indexed from urlcheck where url = ?', (url,)).fetchone()
            if url[0:4] == 'http' and 'bkrs.info' in url and 'taolun' in url and not isindexed:
                if 'slovo' not in url:
                    newpages.add(url)
                    q = (url, link.string, "N")
                    self.con.execute('insert into urlcheck values (?, ?, ?)', q)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
                    # topicId = re.search(r'.*taolun/(forum|thread)-(\d+)(-.*)*\.html', url)
                    self.dbcommit()
                    print('commit links from one url')
    self.dbcommit()
    print('commit set of urls')
    print('added %s links for newpages from %s' % (len(newpages), page))
    # Return the new links so a caller can continue the crawl.
    return newpages
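
# A sketch of the `urlcheck` table this method relies on, inferred from its
# insert and select statements: url / link text / indexed flag, with `url`
# as primary key so re-inserting a known page raises IntegrityError. The
# helper name is hypothetical.
def createurlchecktable(self):
    self.con.execute('create table urlcheck(url text primary key, linktext text, indexed text)')
    self.dbcommit()
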
import requests
from bs4 import BeautifulSoup as _s

data = requests.get("https://www.crummy.com/").content
# `stripped_strings` yields each whitespace-trimmed text node in the document.
data = [x for x in _s(data, 'html.parser').stripped_strings]
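
# When a single text blob is enough, bs4's get_text() concatenates all text
# nodes with a chosen separator; a minimal sketch using the same page:
text = _s(requests.get("https://www.crummy.com/").content, 'html.parser').get_text(' ', strip=True)
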
# Populate a demo movie-review app end to end: create an account, log in,
# post a review, then search and vote. Assumes `import random`, `import re`,
# `import requests`, a module-level `HOST` base URL, and `_s` (BeautifulSoup).
def run_app_populate(movie, name, review, email):
    session = requests.Session()

    # create account
    print('creating account for: {}'.format(email))
    r = session.get(HOST + '/add_user')
    csrf_token = _s(r.text, 'html.parser').find('input', {'name': 'csrf-token'})['value']
    session.post(HOST + '/add_user', data={
        'email': email,
        'name': name,
        'password': '******',
        'csrf-token': csrf_token
    })
    print('account created')

    # log in
    print('logging in')
    r = session.get(HOST + '/login')
    csrf_token = _s(r.text, 'html.parser').find('input', {'name': 'csrf-token'})['value']
    session.post(HOST + '/login', data={
        'email': email,
        'password': '******',
        'csrf-token': csrf_token
    })
    print('logged in')

    # create review
    print('creating review')
    r = session.get(HOST + '/create_review')
    csrf_token = _s(r.text, 'html.parser').find('input', {'name': 'csrf-token'})['value']
    session.post(HOST + '/create_review', data={
        'title': movie,
        'rating': random.randint(1, 10),
        'review': review,
        'csrf-token': csrf_token
    })
    print('review created')

    # search for a keyword and up- or down-vote a matching review
    keyword = random.choice(review.split())
    print('searching keyword: {}'.format(keyword))
    r = session.get(HOST + '/search?keyword={}'.format(keyword))
    a_tags = _s(r.text, 'html.parser').find_all('a', {'href': re.compile('/review/.*')})
    if len(a_tags):
        a = random.choice(a_tags)['href']
        print('voting on review: {}'.format(a))
        session.get(HOST + '/vote?id={}&ud={}'.format(
            re.search('[0-9]+', a).group(), random.randint(0, 1)))
    else:
        print('nothing found')
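
# Usage sketch with a hypothetical host and sample data:
HOST = 'http://localhost:5000'
run_app_populate(
    movie='Example Movie',
    name='Test User',
    review='a short sample review with a few distinct words',
    email='test.user@example.com',
)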