Example #1
# Imports shared by the snippets on this page (DBOperation and FYPsetting
# are project-local modules; BeautifulSoup is assumed to be bs4).
import datetime
import hashlib
import cookielib
import urllib2

import feedparser
import requests
from boilerpipe.extract import Extractor
from bs4 import BeautifulSoup
from twitter import Twitter, OAuth

import DBOperation
import FYPsetting


def GOOGLE_get_data(company):
    # Fetch the Google News RSS feed for the company and extract each linked article.
    google_news_rss_url = "https://news.google.com/news/?q=%s&output=rss" % company
    rss_feed = feedparser.parse(google_news_rss_url)

    content_list = list()

    for entry in rss_feed['entries']:
        title = entry['title']
        link = entry['link']
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except Exception:
            # Skip entries whose pages fail to download or parse.
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({"title": title,
                            "article": content,
                            "link": link,
                            "source": "GOOGLE",
                            "target": company,
                            "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                            "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})
                            

    DBOperation.save_db(content_list)
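
All four collectors on this page assemble the same record shape before handing it to DBOperation.save_db. A minimal sketch of that shared pattern as a standalone helper (the make_record name is hypothetical, not part of the original project):

import datetime
import hashlib

def make_record(title, article, link, source, target):
    # Mirrors the dict built in each snippet: date is today as YYYYMMDD,
    # hash is the SHA-224 digest of the title (or tweet text).
    now = datetime.datetime.now()
    return {"title": title,
            "article": article,
            "link": link,
            "source": source,
            "target": target,
            "date": "%04d%02d%02d" % (now.year, now.month, now.day),
            "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()}
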
Example #2
def Twitter_get_data(company):

    config = FYPsetting.TWITTER_CONFIG
    
    twitter = Twitter(auth=OAuth(config["access_key"], config["access_secret"],
                                    config["consumer_key"], config["consumer_secret"]))
    query = twitter.search.tweets(q=company, lang="en", result_type="recent", count="%d" % FYPsetting.QUERY_PAGE)
    
    urllist = list()
    content_list = list()
    
    for result in query["statuses"]:
        #print "@%s %s" % (result["user"]["screen_name"].encode("UTF-8"), result["text"].encode("UTF-8"))
        cur_text = result["text"].split(" ")

        # Pre-process a readable title: drop links, hashtags and mentions.
        title_list = [value for value in cur_text
                      if not value.startswith(("http", "#", "@"))]
        final_title = ' '.join(title_list)

        # Parse and extract the article behind the first link in the tweet.
        for word in cur_text:
            if word.startswith("http"):
                # Normalize to a byte string, dropping non-latin-1 characters.
                utf_word = word.encode('latin-1', 'ignore')
                # Skip URLs already processed in this batch.
                if utf_word in urllist:
                    break
                urllist.append(utf_word)
    
                try:
                    extractor = Extractor(extractor='ArticleExtractor', url=utf_word)
                except Exception:
                    # Skip URLs that boilerpipe cannot fetch or parse.
                    break
                content = extractor.getText()
                if content != "":
                    now = datetime.datetime.now()
                    content_list.append({"title": final_title,
                                        "article": content,
                                        "link": utf_word,
                                        "source": "TWITTER",
                                        "target": company,
                                        "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                                        "hash": hashlib.sha224(result["text"].encode("UTF-8")).hexdigest()})
                break
    
    DBOperation.save_db(content_list)
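
The title pre-processing above only drops tokens that look like links, hashtags, or mentions. A standalone sketch of that filter, runnable without Twitter credentials (clean_title is an illustrative name):

def clean_title(tweet_text):
    # Keep plain words; drop URLs, hashtags and @mentions,
    # matching the list comprehension in Twitter_get_data.
    words = [w for w in tweet_text.split(" ")
             if not w.startswith(("http", "#", "@"))]
    return " ".join(words)

print(clean_title("Apple beats estimates #AAPL @WSJ http://t.co/abc"))
# -> Apple beats estimates
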
Example #3
def NYT_get_data(company):
    raw_response_list = list()
    API_base_url = "http://api.nytimes.com/svc/search/v2/articlesearch.json?"
    config = FYPsetting.NYT_CONFIG
    
    now = datetime.datetime.now()
    past = now - datetime.timedelta(hours=72)
    now_str = "%04d%02d%02d" % (now.year, now.month, now.day)
    past_str = "%04d%02d%02d" % (past.year, past.month, past.day)
    
    for page in range(FYPsetting.QUERY_PAGE//3):
        url = "%sbegin_data=%s&sort=newest&page=%d&q=%s&api-key=%s" % (API_base_url, past_str, page, company, config["API_key"])
        response = requests.get(url).json()
        raw_response_list += response["response"]["docs"]
    
    content_list = list()

    for doc in raw_response_list:
        url = doc["web_url"]
        title = doc["headline"]["main"]
        #print title
        try:
            # Use a cookie-aware opener so the article page loads correctly.
            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            html = opener.open(url).read()
            extractor = Extractor(extractor='ArticleExtractor', html=html)
        except Exception:
            # Skip articles that fail to download or parse.
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({"title": title,
                            "article": content,
                            "link": url,
                            "source": "NYT",
                            "target": company,
                            "date": now_str,
                            "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})
    
    DBOperation.save_db(content_list)
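
The query string above is spliced together by hand, which leaves the company name unencoded. requests can build the same Article Search query from a params dict and URL-encode the values; a sketch under the same assumptions (v2 endpoint, begin_date as YYYYMMDD, nyt_search_page is an illustrative name):

import requests

def nyt_search_page(company, begin_date, page, api_key):
    # Same query as above, but requests assembles and encodes the query string.
    url = "http://api.nytimes.com/svc/search/v2/articlesearch.json"
    params = {"q": company,
              "begin_date": begin_date,
              "sort": "newest",
              "page": page,
              "api-key": api_key}
    return requests.get(url, params=params).json()["response"]["docs"]
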
Example #4
def NASDAQ_get_data(company_code):

    url = 'http://www.nasdaq.com/symbol/%s/news-headlines' % company_code
    
    conn = urllib2.urlopen(url)
    html = conn.read()
    
    soup = BeautifulSoup(html, "html.parser")
    content_div = soup.find("div", {'class': "news-headlines"})
    
    # No news found?
    if content_div is None:
        return
        
    links = content_div.find_all('a')
    
    content_list = list()
    
    for tag in links:
        # Only anchors whose parent is a <span> are news headlines; skip the rest.
        if tag.parent.name != "span":
            continue
        link = tag.get('href', None)
        title = tag.contents[0]
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except Exception:
            # Skip headlines whose pages fail to download or parse.
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({"title": title,
                            "article": content,
                            "link": link,
                            "source": "NASDAQ",
                            "target": company_code,
                            "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                            "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})
    
    DBOperation.save_db(content_list)
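
Taken together, the four snippets share one call shape: pass a company name (or its ticker, for NASDAQ_get_data) and the function scrapes, extracts, and persists in one step. A usage sketch with illustrative company names:

for name in ("Apple", "Tesla", "Microsoft"):
    GOOGLE_get_data(name)
    Twitter_get_data(name)
    NYT_get_data(name)

NASDAQ_get_data("aapl")  # the NASDAQ page is keyed by ticker symbol
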