Example 1
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # assumed headers; the original's definition is not shown
all_name = ''  # accumulates the scraped names across calls

def crawl(url, t_name):
    # fetch the page and collect the text of every <h4 class="name"> tag
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    name_tags = soup.find_all('h4', class_='name')  # renamed to avoid shadowing the built-in all()
    names = '\n'.join(tag.text.strip() for tag in name_tags)
    print(url, t_name)
    global all_name
    all_name = all_name + names
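A minimal driver for the function above, as a sketch; the thread URLs and names here are hypothetical, since the original call site is not shown:

# hypothetical call site: crawl each thread page, then dump the accumulated names
threads = [
    ('https://example.com/thread/1', 'thread-1'),
    ('https://example.com/thread/2', 'thread-2'),
]
for thread_url, thread_name in threads:
    crawl(thread_url, thread_name)
print(all_name)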
Example 2
    # requires: import requests; from bs4 import BeautifulSoup;
    # from urllib.parse import urljoin
    def crawl_init(self, page):
        print('getting initial page')
        newpages = set()
        c = requests.get(page)
        soup = BeautifulSoup(c.text, features="html.parser")

        # collect the absolute URL of every link on the seed page
        links = soup.find_all('a')
        for link in links:
            if 'href' in dict(link.attrs):
                url = urljoin(page, link['href'])
                newpages.add(url)

        return newpages
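A sketch of how the returned frontier might drive a breadth-first crawl; crawl_all is a hypothetical method on the same assumed class, not part of the original:

    # hypothetical driver: seed the frontier with crawl_init, then expand it level by level
    def crawl_all(self, seed, depth=2):
        visited = set()
        frontier = self.crawl_init(seed)
        for _ in range(depth):
            next_frontier = set()
            for url in frontier - visited:
                visited.add(url)
                next_frontier |= self.crawl_init(url)  # reuse the per-page link extractor
            frontier = next_frontier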
Example 3
	# requires: import requests; import sqlite3 as sqlite;
	# from bs4 import BeautifulSoup; from urllib.parse import urljoin
	def crawl(self, page):

		newpages = set()
		try:
			c = requests.get(page)
			print('fetched %s for crawling' % page)
		except requests.exceptions.RequestException:
			print('cannot open %s' % page)
			return

		soup = BeautifulSoup(c.text, features="html.parser")

		self.addtoindex(page, soup)

		links = soup.find_all('a')
		print('got %s links from crawled url %s' % (len(links), page))
		# record the current page as indexed, or flip its flag if it was queued earlier
		q = (page, "", "Y")
		try:
			self.con.execute('insert into urlcheck values (?, ?, ?)', q)
			print("put %s into urlcheck" % page)
		except sqlite.IntegrityError:
			isindexed = self.con.execute('select indexed from urlcheck where url = ?', (page,)).fetchone()
			if isindexed[0] == 'Y':
				print('%s is already crawled' % page)
				return
			else:
				self.con.execute('update urlcheck set indexed = "Y" where url = ?', (page,))
				print("changed %s from N to Y" % page)

		for link in links:
			if 'href' in dict(link.attrs):
				url = urljoin(page, link['href'])
				if url.find("'") != -1: continue  # skip URLs that would break quoting
				url = url.split('#')[0]  # drop fragment identifiers
				# if url[0:4] == 'http' and 'bkrs.info' in url and not self.isindexed(url):
				isindexed = self.con.execute('select indexed from urlcheck where url = ?', (url,)).fetchone()
				if url[0:4] == 'http' and 'bkrs.info' in url and 'taolun' in url and not isindexed:
					if 'slovo' not in url:
						newpages.add(url)
						q = (url, link.string, "N")
						self.con.execute('insert into urlcheck values (?, ?, ?)', q)
				linkText = self.gettextonly(link)
				self.addlinkref(page, url, linkText)

				# topicId = re.search(url, '.*taolun\/(forum|thread)-(\d+)(-.*)*\.html')

		# commit once after processing all links (the original committed on every iteration)
		self.dbcommit()
		print('commit links from one url')
		print('added %s links for newpages from %s' % (len(newpages), page))
		return newpages
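The inserts and the IntegrityError handler above imply a three-column urlcheck table whose url column is unique. A plausible schema, reconstructed from the queries rather than taken from the original (the middle column name and database file are guesses):

# reconstructed schema; the original's table definition is not shown
import sqlite3 as sqlite

con = sqlite.connect('searchindex.db')  # hypothetical database file
con.execute('''create table if not exists urlcheck
               (url text primary key, linktext text, indexed text)''')
con.commit()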
Example 4
import random
import re
import requests
from bs4 import BeautifulSoup

data = requests.get("https://www.crummy.com/").content
# bs4 has no block_text(); stripped_strings yields each non-empty text fragment
data = [x for x in BeautifulSoup(data, 'html.parser').stripped_strings]

HOST = 'http://localhost:5000'  # assumed base URL; the original leaves HOST undefined

def run_app_populate(movie, name, review, email):
    session = requests.Session()

    # create account
    print('creating account for: {}'.format(email))
    r = session.get(HOST + '/add_user')
    csrf_token = BeautifulSoup(r.text, 'html.parser').find(
        'input', {'name': 'csrf-token'})['value']
    session.post(HOST + '/add_user',
                 data={
                     'email': email,
                     'name': name,
                     'password': '******',
                     'csrf-token': csrf_token
                 })
    print('account created')

    # log in
    print('logging in')
    r = session.get(HOST + '/login')
    csrf_token = BeautifulSoup(r.text, 'html.parser').find(
        'input', {'name': 'csrf-token'})['value']
    session.post(HOST + '/login',
                 data={
                     'email': email,
                     'password': '******',
                     'csrf-token': csrf_token
                 })
    print('logged in')

    # create review
    print('creating review')
    r = session.get(HOST + '/create_review')
    csrf_token = BeautifulSoup(r.text, 'html.parser').find(
        'input', {'name': 'csrf-token'})['value']
    session.post(HOST + '/create_review',
                 data={
                     'title': movie,
                     'rating': random.randint(1, 10),
                     'review': review,
                     'csrf-token': csrf_token
                 })
    print('review created')

    # search for a key word and up or down vote
    keyword = random.choice(review.split())
    print('searching keyword: {}'.format(keyword))
    r = session.get(HOST + '/search?keyword={}'.format(keyword))
    a_tags = BeautifulSoup(r.text, 'html.parser').find_all(
        'a', {'href': re.compile('/review/.*')})
    if a_tags:
        a = random.choice(a_tags)['href']
        print('voting on review: {}'.format(a))
        session.get(HOST + '/vote?id={}&ud={}'.format(
            re.search('[0-9]+', a).group(), random.randint(0, 1)))
    else:
        print('nothing found')
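A hypothetical invocation of the routine above; the movie, reviewer, review text, and email are made up for illustration:

# sample run: signs up, logs in, posts a review, then votes on a search hit
run_app_populate(
    movie='The Matrix',
    name='Alice',
    review='a mind bending classic worth rewatching',
    email='alice@example.com',
)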