# tweet_hr.py
from scrapy.spider import Spider
from scrapy.selector import HtmlXPathSelector
import lxml.html as lh
from selenium import webdriver
from selenium import selenium
import time
import html2text
class tweethr(Spider):
    """Spider that prints two ticker elements from a Moneycontrol quote page.

    The page is opened in a Selenium-driven Firefox instance, reloaded
    ``refresh_count`` times, and after every reload the elements with ids
    ``Bse_Prc_tick`` and ``Nse_Prc_tick`` (presumably the BSE and NSE price
    ticks -- confirm against the live page) are converted to plain text and
    printed, each preceded by a separator line and a running counter.
    """

    name = "tweet"
    allowed_domains = ["moneycontrol.com"]
    start_urls = [
        "http://www.moneycontrol.com/india/stockpricequote/bankspublicsector/statebankindia/SBI"
    ]

    # Seconds to wait after the initial page load before the first refresh.
    page_load_wait = 10
    # Number of refresh-and-print rounds performed for each response.
    refresh_count = 6
    # XPath expressions of the two elements that get printed, in print order.
    bse_xpath = "//div/span[@id='Bse_Prc_tick']"
    nse_xpath = "//div/span[@id='Nse_Prc_tick']"

    def __init__(self, *args, **kwargs):
        """Forward any spider arguments to the Scrapy base class."""
        super(tweethr, self).__init__(*args, **kwargs)

    def parse(self, response):
        """Print the ticker elements of the page after each browser refresh.

        Args:
            response: The Scrapy response for the quote page; only its URL is
                used, the markup itself is read from the browser.

        Returns:
            The raw HTML of the last ``Bse_Prc_tick`` element that was found,
            or ``None`` when no such element ever matched.
        """
        # NOTE(review): Scrapy callbacks normally return Requests or items;
        # the raw string return is kept only for backward compatibility.
        last_bse = None
        counter = 0
        converter = html2text.HTML2Text()
        converter.ignore_links = True

        driver = webdriver.Firefox()
        try:
            driver.get(response.url)
            time.sleep(self.page_load_wait)
            for _ in range(self.refresh_count):
                driver.refresh()
                # Build the selector from the browser's current DOM so that
                # every refresh is actually reflected in what gets printed.
                doc = HtmlXPathSelector(text=driver.page_source)
                counter, found = self._print_matches(
                    doc, self.bse_xpath, converter, counter)
                if found is not None:
                    last_bse = found
                counter, _ = self._print_matches(
                    doc, self.nse_xpath, converter, counter)
        finally:
            # Always close the browser, even if loading or parsing fails.
            driver.quit()
        return last_bse

    def _print_matches(self, doc, xpath, converter, counter):
        """Print every element matching *xpath*; return (counter, last match).

        The second item of the returned tuple is the raw HTML of the last
        matched element, or ``None`` when nothing matched.
        """
        last = None
        for last in doc.xpath(xpath).extract():
            counter += 1
            # Single-argument parenthesised print behaves the same on
            # Python 2 and Python 3.
            print("\n*******************************************************\n")
            print(counter)
            print(converter.handle(last))
        return counter, last