Code Example #1
File: dblp.py  Project: layinah/bibliographyScraping
from crawler import Crawler

def get_title(text):
    # Keep the text between the first ':' and the first '.' that follows it,
    # i.e. the title portion of a DBLP entry line.
    a = text[text.index(':') + 1:]
    return a[:a.index('.')]

localhost = False
search_term = 'Giuseppe Vizzari'

if localhost:
    dblp = Crawler(base_url='http://localhost', port=50001)
    dblp.visit('/dblp.html')
else:
    dblp = Crawler(base_url='http://www.informatik.uni-trier.de', port=50001)
    dblp.visit('/~ley/db/indices/a-tree/index.html')
    dblp.search('//*[@name="author"]', search_term)

# Each row of the result table is one publication entry.
trs = dblp.sess.xpath('//p[1]/table/tbody/tr')
print(trs)

for tr in trs:
    tds = tr.xpath('./td')
    if tds:
        print("%s -> %s" % (tds[0].text(), get_title(tds[2].text())))
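
The get_title helper above is pure string slicing, so it can be checked without running the crawler at all. A minimal sketch reusing the same logic on a made-up DBLP-style cell text of the form 'Authors: Title. Venue'; the sample string is hypothetical:

def get_title(text):
    # Same slicing as above: keep the text between the first ':' and the
    # first '.' that follows it.
    a = text[text.index(':') + 1:]
    return a[:a.index('.')]

# Hypothetical cell text in the "Authors: Title. Venue" shape assumed above.
sample = 'Giuseppe Vizzari: An Example Paper Title. Example Venue 2010'
print(get_title(sample))  # -> ' An Example Paper Title'
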
Code Example #2
from crawler import Crawler

localhost = False
# Pagination flags: npage is never advanced and new_page is never set back to
# True below, so only the first page of results is processed.
npage = 1
new_page = True
search_term = 'Giuseppe Vizzari'

if localhost:
    scholar = Crawler(port=50002)
    scholar.visit('http://localhost/scholar.htm')
else:
    scholar = Crawler(base_url='http://scholar.google.com', port=50002)
    scholar.visit('/')
    scholar.search('//*[@name="q"]', search_term)

while new_page:
    new_page = False
    # Each element with class "gs_r" is one search result entry.
    divs = scholar.sess.xpath('//*[@class="gs_r"]')

    for div in divs:
        # Paper title link inside the gs_rt block.
        papers = div.xpath('./*[@class="gs_rt"]/a')
        print("papers: %s" % papers)

        if papers:
            print(papers[0].text())

        # Footer links of the result; the first is usually the "Cited by N" link.
        citedbies = div.xpath('./*[@class="gs_fl"]/a')
        print("citedbies: %s" % citedbies)

        if citedbies:
            print(citedbies[0]["href"])
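
The loop above only prints the raw href of the first gs_fl link. On Google Scholar that link is typically the "Cited by N" link, so the count can be pulled out of its text with a small helper. A minimal sketch, independent of the Crawler API; parse_cited_by and the sample link texts are hypothetical:

import re

def parse_cited_by(link_text):
    # Extract N from a 'Cited by N' label; return 0 for any other link text
    # (e.g. 'Related articles').
    match = re.search(r'Cited by (\d+)', link_text)
    return int(match.group(1)) if match else 0

# Hypothetical link texts mirroring the gs_fl block scraped above.
for text in ['Cited by 42', 'Related articles', 'Cited by 7']:
    print(text, '->', parse_cited_by(text))
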
Code Example #3
File: main.py  Project: sathoro/python-crawler
from crawler import Crawler
from bs4 import BeautifulSoup

crawler = Crawler()

# returns a list of the available forms
forms = crawler.get_forms('https://github.com/login')

# submits the login form
crawler.submit(forms[1], {'login': '******', 'password': '******'})

# just a simple GET request, returns a response object
response = crawler.visit('https://github.com')

# let's use BeautifulSoup to parse the response text
html = BeautifulSoup(response.text, 'lxml')

# get a list of our repositories by scraping the html
for repo in html.find(id='repo_listing').find_all('span', {'class': 'repo'}):
    print(repo.text)

print('\n', crawler.get_cookies())
print('\n', crawler.get_headers())
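
One fragile spot above: html.find(id='repo_listing') returns None when the login failed or the markup differs, and the chained find_all then raises AttributeError. A minimal sketch of the same parsing step with a guard, run against an inline HTML snippet that only mimics the selectors used above; the real GitHub markup is an assumption here:

from bs4 import BeautifulSoup

# Inline HTML that mimics the id/class selectors from the example above;
# the actual GitHub page structure may differ.
sample_html = """
<div id="repo_listing">
  <span class="repo">example-repo-one</span>
  <span class="repo">example-repo-two</span>
</div>
"""

html = BeautifulSoup(sample_html, 'lxml')
listing = html.find(id='repo_listing')

if listing is None:
    print('repo listing not found (not logged in?)')
else:
    for repo in listing.find_all('span', {'class': 'repo'}):
        print(repo.text)
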