import datetime
import hashlib
import urllib
import urllib2
import cookielib

import feedparser
import requests
from boilerpipe.extract import Extractor
from bs4 import BeautifulSoup
from twitter import Twitter, OAuth

import DBOperation
import FYPsetting


def GOOGLE_get_data(company):
    # Query the Google News RSS feed for the company; URL-encode the name so
    # multi-word queries work.
    google_news_rss_url = "https://news.google.com/news/?q=%s&output=rss" % urllib.quote_plus(company)
    rss_feed = feedparser.parse(google_news_rss_url)
    content_list = []
    for entry in rss_feed['entries']:
        title = entry['title']
        link = entry['link']
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except Exception:
            # Skip articles that cannot be fetched or parsed.
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({
            "title": title,
            "article": content,
            "link": link,
            "source": "GOOGLE",
            "target": company,
            "date": "%04d%02d%02d" % (now.year, now.month, now.day),
            "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest(),
        })
    DBOperation.save_db(content_list)
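
# Usage sketch: each collector persists its own results via DBOperation.save_db,
# so a driver only needs to loop over a watchlist. FYPsetting.COMPANY_LIST is a
# hypothetical name for that watchlist -- the real setting may differ:
#
#     for company in FYPsetting.COMPANY_LIST:
#         GOOGLE_get_data(company)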


def Twitter_get_data(company):
    config = FYPsetting.TWITTER_CONFIG
    twitter = Twitter(auth=OAuth(config["access_key"], config["access_secret"],
                                 config["consumer_key"], config["consumer_secret"]))
    query = twitter.search.tweets(q=company, lang="en", result_type="recent",
                                  count="%d" % FYPsetting.QUERY_PAGE)
    urllist = []
    content_list = []
    for result in query["statuses"]:
        cur_text = result["text"].split(" ")
        # Pre-process a readable title: drop URLs, hashtags and mentions.
        title_list = [value for value in cur_text
                      if not value.startswith("http")
                      and not value.startswith("#")
                      and not value.startswith("@")]
        final_title = ' '.join(title_list)
        # Follow the first link in the tweet and extract the linked article.
        for word in cur_text:
            if word.startswith("http"):
                ascii_word = word.encode('latin-1', 'ignore')
                if ascii_word in urllist:
                    break  # URL already processed for an earlier tweet
                urllist.append(ascii_word)
                try:
                    extractor = Extractor(extractor='ArticleExtractor', url=ascii_word)
                except Exception:
                    break
                content = extractor.getText()
                if content != "":
                    now = datetime.datetime.now()
                    content_list.append({
                        "title": final_title,
                        "article": content,
                        "link": ascii_word,
                        "source": "TWITTER",
                        "target": company,
                        "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                        "hash": hashlib.sha224(result["text"].encode("UTF-8")).hexdigest(),
                    })
                break
    DBOperation.save_db(content_list)
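
# The OAuth call above assumes FYPsetting.TWITTER_CONFIG is a dict with these
# four keys (values shown are placeholders, not real credentials):
#
#     TWITTER_CONFIG = {
#         "access_key": "<access-token>",
#         "access_secret": "<access-token-secret>",
#         "consumer_key": "<consumer-key>",
#         "consumer_secret": "<consumer-secret>",
#     }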


def NYT_get_data(company):
    raw_response_list = []
    API_base_url = "http://api.nytimes.com/svc/search/v2/articlesearch.json?"
    config = FYPsetting.NYT_CONFIG
    now = datetime.datetime.now()
    past = now - datetime.timedelta(hours=72)
    now_str = "%04d%02d%02d" % (now.year, now.month, now.day)
    past_str = "%04d%02d%02d" % (past.year, past.month, past.day)
    # Page through Article Search results for the last 72 hours.
    for page in range(FYPsetting.QUERY_PAGE // 3):
        url = "%sbegin_date=%s&sort=newest&page=%d&q=%s&api-key=%s" % (
            API_base_url, past_str, page, company, config["API_key"])
        response = requests.get(url).json()
        raw_response_list += response["response"]["docs"]
    content_list = []
    for doc in raw_response_list:
        url = doc["web_url"]
        title = doc["headline"]["main"]
        try:
            # Fetch through a cookie-aware opener so the NYT site serves the page.
            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            html = opener.open(url).read()
            extractor = Extractor(extractor='ArticleExtractor', html=html)
        except Exception:
            continue
        content = extractor.getText()
        content_list.append({
            "title": title,
            "article": content,
            "link": url,
            "source": "NYT",
            "target": company,
            "date": now_str,
            "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest(),
        })
    DBOperation.save_db(content_list)
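
# For reference, the fields consumed from each Article Search "doc" above look
# roughly like this (abridged; see the NYT API documentation for the full
# schema):
#
#     {
#         "web_url": "http://www.nytimes.com/...",
#         "headline": {"main": "Some headline"},
#         ...
#     }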


def NASDAQ_get_data(company_code):
    url = 'http://www.nasdaq.com/symbol/%s/news-headlines' % company_code
    conn = urllib2.urlopen(url)
    html = conn.read()
    soup = BeautifulSoup(html, "html.parser")
    content_div = soup.find("div", {'class': "news-headlines"})
    # No news found for this symbol?
    if content_div is None:
        return
    links = content_div.findAll('a')
    content_list = []
    for tag in links:
        # Headline anchors sit inside <span> tags; skip the rest.
        if tag.parent.name != "span":
            continue
        link = tag.get('href', None)
        title = tag.contents[0]
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except Exception:
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({
            "title": title,
            "article": content,
            "link": link,
            "source": "NASDAQ",
            "target": company_code,
            "date": "%04d%02d%02d" % (now.year, now.month, now.day),
            "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest(),
        })
    DBOperation.save_db(content_list)
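
# A minimal sketch of what DBOperation.save_db might look like, assuming a
# MongoDB backend (hypothetical -- the actual implementation is not shown
# here). The "hash" field computed by every collector is a natural dedup key:
#
#     import pymongo
#
#     def save_db(content_list):
#         collection = pymongo.MongoClient().fyp.articles
#         for item in content_list:
#             # Upsert on the title hash so re-runs don't duplicate articles.
#             collection.update({"hash": item["hash"]}, item, upsert=True)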