def get_title(text):
    """Return the substring between the first ':' and the next '.'.

    DBLP entry strings look like "<authors>: <title>. <venue ...>"; this
    extracts the title part (leading space included, exactly as in the raw
    text). If either marker is missing, the input is returned unchanged
    instead of raising ValueError.
    """
    try:
        after_colon = text[text.index(':') + 1:]
        return after_colon[:after_colon.index('.')]
    except ValueError:  # ':' or '.' not present in the string
        return text


def main():
    """Search DBLP for an author and print '<authors> -> <title>' per row."""
    # Imported lazily so the module (and get_title) can be imported even
    # when the project-local crawler dependency is unavailable.
    from crawler import Crawler

    localhost = False
    search_term = 'Giuseppe Vizzari'

    if localhost:
        dblp = Crawler(base_url='http://localhost', port=50001)
        dblp.visit('/dblp.html')
    else:
        dblp = Crawler(base_url='http://www.informatik.uni-trier.de', port=50001)
        dblp.visit('/~ley/db/indices/a-tree/index.html')

    # Fill the author search box and submit the form.
    dblp.search('//*[@name="author"]', search_term)

    trs = dblp.sess.xpath('//p[1]/table/tbody/tr')
    print(trs)
    for tr in trs:
        tds = tr.xpath('./td')
        if tds:
            # Column 0 holds the authors, column 2 the full entry string.
            print("%s -> %s" % (tds[0].text(), get_title(tds[2].text())))


if __name__ == '__main__':
    main()
def main():
    """Search Google Scholar for an author; print paper titles and cited-by links."""
    # Imported lazily so the module stays importable without the
    # project-local crawler dependency installed.
    from crawler import Crawler

    localhost = False
    search_term = 'Giuseppe Vizzari'

    if localhost:
        scholar = Crawler(port=50002)
        scholar.visit('http://localhost/scholar.htm')
    else:
        scholar = Crawler(base_url='http://scholar.google.com', port=50002)
        scholar.visit('/')

    # Fill the query box and submit the search form.
    scholar.search('//*[@name="q"]', search_term)

    # NOTE(review): new_page is never set back to True inside the loop, so
    # only the first result page is ever processed — pagination is a stub.
    new_page = True
    while new_page:
        new_page = False
        # Each result is wrapped in a div with class "gs_r".
        divs = scholar.sess.xpath('//*[@class="gs_r"]')
        for div in divs:
            papers = div.xpath('./*[@class="gs_rt"]/a')
            print("papers: %s" % papers)
            if papers:
                print(papers[0].text())
            # "Cited by N" links live in the gs_fl footer of each result.
            citedbies = div.xpath('./*[@class="gs_fl"]/a')
            print("citedbies: %s" % citedbies)
            if citedbies:
                print(citedbies[0]["href"])


if __name__ == '__main__':
    main()
def main():
    """Demonstrate the Crawler API: log in to GitHub, list repos, dump session state."""
    # Imported lazily so the module stays importable without the
    # project-local crawler / third-party bs4 dependencies installed.
    from crawler import Crawler
    from bs4 import BeautifulSoup

    crawler = Crawler()

    # Returns a list of the forms available on the login page.
    forms = crawler.get_forms('https://github.com/login')

    # Submits the login form (index 1 on this page).
    # NOTE(review): placeholder credentials — load real ones from config or
    # environment variables; never commit secrets to source control.
    crawler.submit(forms[1], {'login': '******', 'password': '******'})

    # A plain GET request; returns a response object.
    response = crawler.visit('https://github.com')

    # Parse the response body with BeautifulSoup's lxml backend.
    html = BeautifulSoup(response.text, 'lxml')

    # Scrape the repository names from the repo listing element.
    for repo in html.find(id='repo_listing').find_all('span', {'class': 'repo'}):
        print(repo.text)

    # Dump the session's cookies and headers for inspection.
    print('\n', crawler.get_cookies())
    print('\n', crawler.get_headers())


if __name__ == '__main__':
    main()