-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler_nasdaq.py
executable file
·61 lines (48 loc) · 1.74 KB
/
crawler_nasdaq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from boilerpipe.extract import Extractor
from BeautifulSoup import BeautifulSoup
import datetime
import hashlib
import json
import urllib2
import os
import DBOperation
import FYPsetting
def NASDAQ_crawler():
    """Crawl NASDAQ news for every company listed in target_companies.json.

    The JSON file is looked up in the directory that contains this script
    (symlinks resolved). NASDAQ_get_data is called once for each entry
    found under the file's "company_code" key.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    config_path = "%s/target_companies.json" % script_dir

    with open(config_path, "r") as config_file:
        targets = json.load(config_file)

    for code in targets["company_code"]:
        NASDAQ_get_data(code)
def _fetch_page(url):
    """Return the raw response body for *url*, always closing the connection."""
    conn = urllib2.urlopen(url)
    try:
        return conn.read()
    finally:
        conn.close()


def NASDAQ_get_data(company_code):
    """Scrape the NASDAQ news-headlines page of one company and store the articles.

    The headline page for ``company_code`` is downloaded and every anchor
    that sits directly inside a ``<span>`` within the ``news-headlines``
    div is treated as a link to an article. Each article page is fetched,
    its text is extracted with boilerpipe's ArticleExtractor, and one record
    per article is collected. The collected list is handed to
    ``DBOperation.save_db``.

    Args:
        company_code: symbol interpolated into the NASDAQ headlines URL and
            stored as the ``"target"`` of every record.

    Returns:
        None. Returns early, without calling ``DBOperation.save_db``, when
        the page has no ``news-headlines`` div.

    Raises:
        Whatever ``urllib2.urlopen`` raises if the headline page itself
        cannot be fetched; failures on individual articles are skipped.
    """
    url = 'http://www.nasdaq.com/symbol/%s/news-headlines' % company_code
    html = _fetch_page(url)
    soup = BeautifulSoup(html)
    content_div = soup.find("div", {'class': "news-headlines"})
    # No news found?
    if content_div is None:
        return

    content_list = list()
    for tag in content_div.findAll('a'):
        # Only anchors whose direct parent is a <span> are headline links.
        if tag.parent.name != "span":
            continue
        link = tag.get('href', None)
        # An anchor without a target or without any content cannot yield an
        # article record; skip it instead of failing the whole company.
        if not link or not tag.contents:
            continue
        title = tag.contents[0]
        try:
            news_page = _fetch_page(link)
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except Exception:
            # Best effort: one unreachable or unparsable article must not
            # abort the crawl. KeyboardInterrupt/SystemExit still propagate.
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({"title": title,
                             "article": content,
                             "link": link,
                             "source": "NASDAQ",
                             "target": company_code,
                             # Local scrape date formatted as YYYYMMDD.
                             "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                             # SHA-224 hex digest of the UTF-8 encoded title.
                             "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})
    DBOperation.save_db(content_list)
# Start a full crawl only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    NASDAQ_crawler()