Beispiel #1
0
def start(url, depth, regex):
    """Fetch *url* in a WebKit browser and parse the HTML into an lxml tree.

    ``depth`` and ``regex`` are accepted but unused in the visible code;
    they are kept so existing callers keep working.
    """
    b = Browser()
    try:
        b.open(url)
        print("parse html ...: %d" % len(b.content))
        # Parsed tree is built but discarded here; presumably a crawl step
        # was meant to follow — TODO confirm with the caller.
        dom = html.fromstring(b.content)
    finally:
        # Close the browser even when open()/fromstring() raises; the
        # original leaked the browser on any error.
        b.close()
    print("END")
Beispiel #2
0
def scrap(url):
    """Render *url* in a headless WebKit browser inside a virtual display.

    Returns the page HTML as a str on success, or None when anything
    fails (callers treat None as "scrape failed").
    """
    display = None
    b = None
    try:
        # Python-2-only hack forcing a process-wide default encoding;
        # kept because downstream str() conversion relies on it.
        reload(sys)
        sys.setdefaultencoding('utf-8')

        display = Display(visible=0, size=(800, 600))
        display.start()

        b = Browser()
        b.open(url)
        content = b.main_frame['content'].read()
        return str(content)
    except Exception as e:
        # Broad catch is deliberate best-effort; log and fall through to
        # return None.
        print("===Scrapping Exception=====")
        print(str(e))
    finally:
        # Original cleanup lived in both the try and except paths and could
        # hit NameError when Browser()/Display() themselves failed (b or
        # display unbound). Guarded finally releases both exactly once.
        if b is not None:
            b.close()
        if display is not None:
            display.stop()
Beispiel #3
0
def scrap(url):
    """Fetch *url* with a WebKit browser on a virtual X display.

    Returns str(page content), or None if the scrape raised.
    """
    browser = None
    virt_display = None
    try:
        # Python 2 idiom to force utf-8 as the default encoding; required
        # for the str() call below on non-ASCII pages.
        reload(sys)
        sys.setdefaultencoding('utf-8')

        virt_display = Display(visible=0, size=(800, 600))
        virt_display.start()

        browser = Browser()
        browser.open(url)
        page = browser.main_frame['content'].read()
        return str(page)
    except Exception as e:
        # Intentionally broad: any failure is reported and mapped to None.
        print("===Scrapping Exception=====")
        print(str(e))
    finally:
        # The original called b.close()/display.stop() inside except even
        # when those names were never bound (NameError risk); clean up
        # here instead, guarding each resource.
        if browser is not None:
            browser.close()
        if virt_display is not None:
            virt_display.stop()
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

from webkit_browser import Browser
from lxml import html

import csv

# Load the second column (stock id) of every row; the file handle from the
# original `file(...)` call was never closed — `with open` fixes the leak.
# 'rb' mode matches the Python 2 csv module's expectation for input files.
with open('stock_list.csv', 'rb') as _stock_file:
    stockids = [line[1] for line in csv.reader(_stock_file)]

# Drop the first 13 entries (header/junk rows); the original did this with
# thirteen O(n) pop(0) calls.
del stockids[:13]
b = Browser()
for stockid in stockids:
	print 'http://stockhtm.finance.qq.com/sstock/ggcx/' + stockid +'.shtml'
	b.open('http://stockhtm.finance.qq.com/sstock/ggcx/' + stockid +'.shtml')
	content = b.main_frame['content'].read()
	dom = html.fromstring(content)
	results = dom.xpath('//*[@class="col-2 fr"]/ul/li')
	for result in results:
	    print result.text_content()
	results = dom.xpath('//h1[@class="col-1-1"]')
	name = ""
	for result in results:
	    name = result
	    print result.text_content()
	results = dom.xpath('//table[@class="l20"]//tr/td[contains(@id,"main-")]')
	for result in results:
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

from webkit_browser import Browser
from lxml import html
import re

# Scrape the full stock listing page and write (name, id) pairs to CSV.
b = Browser()
b.open('http://quote.eastmoney.com/stocklist.html')
content = b.main_frame['content'].read()
dom = html.fromstring(content)
results = dom.xpath('//*[@id="quotesearch"]/ul/li')

import csv

# Compile once, outside the loop — the original recompiled the pattern for
# every <li> element.
paren_pattern = re.compile('[()]')

# `with` guarantees the CSV file is flushed and closed even if a row write
# raises (original used a bare open()/close() pair).
with open("stock_list.csv", "w") as cf:
    writer = csv.writer(cf)
    writer.writerow(['name', 'id'])

    for result in results:
        # Each entry reads like "name(id)"; splitting on the parentheses
        # yields ['name', 'id', ''] — drop the trailing empty field.
        pricePair = paren_pattern.split(result.text_content())
        pricePair.pop()
        writer.writerow(pricePair)

print('The result has been saved into stock_list.csv')