def runDatumBoxSentimentAnalysis(self, articles):
    """Check the DatumBox classifier against articles whose sentiment is already known."""
    numWrong = 0
    #torThread = TorThread(self)
    #torThread.start()
    #time.sleep(5)
    datumBox = DatumBox("08fe94b761715219d636bd338b1cd984")
    for article in articles:
        sentimentModule = SentimentThread(article=article, articleNumber=0,
                                          datum_box=datumBox, proxy="127.0.0.1:8118")
        guessedSentiment = sentimentModule.getSentimentOfArticle()
        #print(article.getText())
        print("Guess: " + str(guessedSentiment) + ", actual: " + str(article.sentiment))
        if int(article.sentiment) != int(guessedSentiment):
            numWrong += 1
    print("\nCorrectly analyzed " + str(len(articles) - numWrong) +
          " out of " + str(len(articles)) + " articles.")
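
# A minimal sketch of how the accuracy check above might be driven. It assumes
# only that the article objects expose a `sentiment` attribute and a `getText()`
# method; the LabeledArticle name, the sample texts, and the 1/-1 sentiment
# encoding are hypothetical, not part of this project.
class LabeledArticle(object):
    def __init__(self, text, sentiment):
        self.text = text
        self.sentiment = sentiment  # assumed encoding: 1 = positive, -1 = negative

    def getText(self):
        return self.text

#articles = [LabeledArticle("Shares surged after a strong earnings report.", 1),
#            LabeledArticle("The company cut guidance and the stock slid.", -1)]
#crawler.runDatumBoxSentimentAnalysis(articles)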
# Earlier Python 2 version of crawl (urllib2, print statements, gevent Pool);
# superseded by the Python 3 rewrite below.
def crawl(self, userInfo, userNumber, dateToSearch, daysToSearch, fileName):
    global proxies
    user = userInfo[userNumber]
    datum_box = DatumBox(user['APIKey'])
    sentimentsFile = open(fileName, "a")
    daysToGoBack = 1
    #googleHttp = urllib3.PoolManager()
    while (daysToGoBack <= daysToSearch):
        #toSleep = 10
        #print "sleeping for " + str(toSleep) + " seconds in a pathetic attempt to bypass Google...."
        #time.sleep(toSleep)
        print "Searching on date " + dateToSearch.strftime("%m/%d/%Y") + " (day " + str(daysToGoBack) + " of " + str(daysToSearch) + ")"
        #try:
        # Whole Julian day number for the daterange: search operator.
        julianDate = trunc(sum(jdcal.gcal2jd(dateToSearch.year, dateToSearch.month, dateToSearch.day)) + .5)
        keyword = "investing"
        sites = ["http://online.wsj.com/",
                 "http://www.bloomberg.com/news/",
                 "http://www.rttnews.com/",
                 "http://www.reuters.com/finance",
                 "http://www.usatoday.com/",
                 "money.usnews.com",
                 "www.ft.com/home/us",
                 "http://www.cnbc.com/"]
        query = "site:money.cnn.com"
        for site in sites:
            query = query + " OR site:" + site
        query = query + " " + keyword + " daterange:" + str(julianDate) + "-" + str(julianDate)
        print "Query: " + query
        query = quote_plus(query)
        #gs = GoogleSearch(query)
        #gs.results_per_page = 50
        #results = gs.get_results()
        headers = {'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6'}
        startPageQuery = "https://startpage.com/do/search?cat=web&cmd=process_search&language=english&engine0=v1all&abp=1&x=-843&y=-302&prfh=lang_homepageEEEs%2Fair%2Feng%2FN1Nenable_post_methodEEE0N1NsslEEE1N1Nfont_sizeEEEmediumN1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE1N1Nnum_of_resultsEEE50N1N&suggestOn=0&query=" + query
        #request = urllib2.Request('GET', startPageQuery, None, headers)
        request = urllib2.Request(url=startPageQuery, data=None, headers=headers)
        f = urllib2.urlopen(request)
        resultsPage = f.read()
        f.close()
        soup = BeautifulSoup(resultsPage)
        resultDivs = soup.findAll("div", {"class": "result"})
        results = []
        for resultDiv in resultDivs:
            if ('none' in resultDiv.get('class')):
                continue
            h3 = resultDiv.find("h3")
            url = h3.find("a", href=True)['href']
            results.append(url)
        #except SearchError, e:
        #    print "Search failed: %s" % e
        #    continue

        # Crawl to each web site and extract web articles
        try:
            numberOfArticlePages = 0
            articles = []
            articleURLs = []
            total = len(results)

            def fetch(url):
                #request = urllib2.Request(url)
                #f = urllib2.urlopen(request)
                articles.append(webarticle2text.extractFromURL(url, timeout=60))
                articleURLs.append(url)
                print(articleURLs[len(articleURLs) - 1])
                print(articles[len(articles) - 1])
                #f.close()
                print "\r" + str(len(articles)) + " / " + str(len(results)) + " articles crawled to.",
                sys.stdout.flush()
                #response = requests.request('GET', url, hooks = {'response' : do_something}, timeout=60.0)
                #articlePages.append(response)

            pool = Pool(50)
            for res in results:
                pool.spawn(fetch, res)
            pool.join()
            del pool
            # I was using grequests, but it appeared that they were leaking file descriptors... :(
            #articleRequests = (grequests.get(res, hooks = {'response' : do_something}, timeout=60) for res in results)
            #articlePages = grequests.map(articleRequests)
            print
        except gevent.Timeout:
            if (numberOfArticlePages >= len(results) * self.MIN_ARTICLES_PERCENT):
                print("Timeout, but continuing because we have " + str(numberOfArticlePages) + " articles.")
            else:
                print("Timeout, going to next day because we have only " + str(numberOfArticlePages) + " articles.")
                # Advance the date before retrying; otherwise this day would loop forever.
                dateToSearch = dateToSearch - timedelta(1)
                daysToGoBack += 1
                continue
        '''articles = []
        articleURLs = []
        for articlePage in articlePages:
            if (articlePage is not None):
                articles.append(webarticle2text.extractFromHTML(articlePage.text))
                articleURLs.append(articlePage.url)
        '''
        print "All articles (" + str(len(articles)) + " of 'em) for date " + dateToSearch.strftime("%m/%d/%Y") + " returned, now being analyzed..."
        i = 1
        self.threads = []
        mutex_writefile = threading.Lock()
        #for res in results:
        self.dbError = True
        # This might get some articles twice, but it's okay: once they are in the db it won't matter.
        while (self.dbError == True):
            self.dbError = False
            for articleText, articleURL in zip(articles, articleURLs):
                #articleURL = res.url.encode("utf8")
                thread = SentimentThread(articleURL, articleText, i, fileName, dateToSearch, mutex_writefile, datum_box, "127.0.0.1:8118", self, user['subscriptionID'])
                #thread.daemon = True
                thread.start()
                self.threads.append(thread)
                i = i + 1
                if (len(self.threads) >= self.MAX_THREADS):
                    for thread in self.threads:
                        thread.join()
            for thread in self.threads:
                thread.join()
            print
            self.articlesCompleted = 0
            if (self.dbError == True):
                # The current API key is exhausted or failing; fall back to the previous user.
                userNumber = userNumber - 1
                if (userNumber < 0):  # was <= 0, which skipped the valid user at index 0
                    print("Out of Users")
                    return
                user = userInfo[userNumber]
                datum_box = DatumBox(user['APIKey'])
        dateToSearch = dateToSearch - timedelta(1)
        daysToGoBack += 1
    sentimentsFile.close()
    print("Congratulations! All articles analyzed. Shutting down...")
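
# Both versions of crawl() restrict results with the search engines' daterange:
# operator, which takes whole Julian day numbers. A standalone sketch of the
# conversion used above: jdcal.gcal2jd returns the date as a 2-tuple split at
# the MJD epoch (its first element is always 2400000.5), so summing the parts
# and adding .5 before truncating yields the integer Julian day number.
# Assumes `trunc` and `jdcal` are imported as elsewhere in this module; the
# sample date is illustrative.
def julianDayNumber(dateToSearch):
    return trunc(sum(jdcal.gcal2jd(dateToSearch.year, dateToSearch.month, dateToSearch.day)) + .5)

#from datetime import date
#jd = julianDayNumber(date(2014, 3, 15))                   # -> 2456732
#querySuffix = " daterange:" + str(jd) + "-" + str(jd)     # single-day restriction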
def crawl(self, userInfo, userNumber, dateToSearch, daysToSearch, fileName, useTor):
    self.useTor = useTor
    global proxies
    user = userInfo[userNumber]
    datum_box = DatumBox(user['APIKey'])
    sentimentsFile = open(fileName, "a")
    daysToGoBack = 1
    if useTor:
        print("Starting Tor...")
        self.torThread = TorThread(self)
        self.torThread.start()
        time.sleep(5)
        proxyIP = '127.0.0.1:8118'
    else:
        proxyIP = ''
    #googleHttp = urllib3.PoolManager()
    while (daysToGoBack <= daysToSearch):
        print("Searching on date " + dateToSearch.strftime("%m/%d/%Y") + " (day " + str(daysToGoBack) + " of " + str(daysToSearch) + ")")
        # Whole Julian day number for the daterange: search operator.
        julianDate = trunc(sum(jdcal.gcal2jd(dateToSearch.year, dateToSearch.month, dateToSearch.day)) + .5)
        #keyword = '"3M Co" OR "American Express" OR "AT&T" OR "Boeing" OR "Caterpillar" OR "Chevron" OR "Cisco" OR "Dupont E I De Nemours" OR "Exxon" OR "General Electric" OR "Goldman Sachs" OR "Home Depot" OR "Intel" OR "IBM"'# OR "Johnson & Johnson" OR "JPMorgan Chase" OR "McDonald\'s" OR "Merck and Co" OR "Microsoft" OR "Nike" OR "Pfizer" OR "Procter & Gamble" OR "Coca-Cola" OR "Travelers Companies" OR "United Technologies" OR "UnitedHealth" OR "Verizon" OR "Visa" OR "Wal-Mart" OR "Walt Disney"'
        #keywords = ["\"3M Co\"", "\"American Express\"", "\"AT&T Inc\"", "\"Boeing Co\"", "\"Caterpillar Inc\"", "\"Chevron\"", "\"Cisco Systems Inc\"", "\"DuPont\"","\"Exxon Mobil\"","\"General Electric\"","\"Goldman Sachs Group\"","\"Home Depot\"","\"Intel Corp\"","\"International Business Machines\"","\"johnson and johnson\" OR \"Johnson & Johnson\"","\"JP Morgan\"","\"McDonald's Corp\"","\"Merck & Co\"","\"Microsoft Corp\" OR \"Microsoft Corporation\"","\"Nike\"","\"Pfizer\"","\"Procter & Gamble\" OR \"Procter And Gamble\"","\"Coca-Cola\"","\"United Technologies Corp\"","\"UnitedHealth Group\"","\"Verizon Communications\"","\"Visa Inc\"","\"Wal-Mart\"","\"Walt Disney Company\""]
        # One search per Dow component.
        keywords = ["\"3M Co\"", "\"American Express\"", "\"AT&T\"", "\"Boeing\"",
                    "\"Caterpillar Inc\"", "\"Chevron\"", "\"Cisco Systems\"", "\"DuPont\"",
                    "\"Exxon Mobil\"", "\"General Electric\"", "\"Goldman Sachs Group\"",
                    "\"Home Depot\"", "\"Intel Corp\"", "\"International Business Machines\"",
                    "\"Johnson and Johnson\" OR \"Johnson & Johnson\"", "\"JP Morgan\"",
                    "\"McDonald's Corp\"", "\"Merck & Co\"",
                    "\"Microsoft Corp\" OR \"Microsoft Corporation\"", "\"Nike\"", "\"Pfizer\"",
                    "\"Procter & Gamble\" OR \"Procter And Gamble\"", "\"Coca-Cola\"",
                    "\"United Technologies Corp\"", "\"UnitedHealth Group\"",
                    "\"Verizon Communications\"", "\"Visa Inc\"", "\"Wal-Mart\"",
                    "\"Walt Disney Company\""]
        sites = ["http://money.cnn.com/" + dateToSearch.strftime("%Y"),
                 "http://www.bloomberg.com/news/",
                 "http://www.rttnews.com/",
                 "http://www.reuters.com/finance",
                 "money.usnews.com",
                 "www.ft.com/home/us",
                 "http://www.cnbc.com/",
                 "http://www.fool.com",
                 "http://www.thestreet.com",
                 "http://www.zacks.com",
                 "http://www.seekingalpha.com/article"]
        baseQuery = "site:" + sites[0]
        first = True
        for site in sites:
            if (first):
                first = False
                continue
            baseQuery = baseQuery + " OR site:" + site
        baseQuery = baseQuery + " daterange:" + str(julianDate) + "-" + str(julianDate) + " "
        numberOfResults = 50
        for keyword in keywords:
            query = quote_plus(baseQuery + keyword)
            startPageQuery = "https://startpage.com/do/search?cat=web&abp=1&prfh=lang_homepageEEEs%2Fair%2Feng%2FN1Nenable_post_methodEEE0N1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE1N1Nnum_of_resultsEEE" + str(numberOfResults) + "N1N&suggestOn=0&query=" + query
            #startPageQuery = "http://ipinfo.io/"
            # Prepared POST body; currently unused (the request below sends data=None).
            binaryData = urlencode({'language': 'english', 'cmd': 'process_search', 'engine0': 'v1all'}).encode('utf-8')
            print(startPageQuery)
            ssl_context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)  # monkey patch...
            proxy_support = urllib.request.ProxyHandler({'http': proxyIP})
            opener = urllib.request.build_opener(proxy_support, urllib.request.HTTPSHandler(context=ssl_context))  # monkey patch...
            urllib.request.install_opener(opener)  # monkey patch...
            #headers = {'User-agent': 'Mozilla/5.0', 'Connection': 'close'}  # No longer used because StartPage blocked it for a while
            #headers = {'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6', 'Connection': 'close'}
            # Rotate through user-agent strings to look less like a bot.
            browser = random.choice(BROWSERS)
            headers = {'User-agent': browser}
            request = urllib.request.Request(url=startPageQuery, data=None, headers=headers)
            response = opener.open(request)
            resultsPage = response.read().decode('utf-8')  #.decode('iso-8859-1')
            #print(resultsPage)
            soup = BeautifulSoup(resultsPage)
            resultDivs = soup.findAll("div", {"class": "result"})
            results = []
            for resultDiv in resultDivs:
                if ('none' in resultDiv.get('class')):
                    continue
                # Skip ads: sponsored results live in a separate container div.
                parent = resultDiv.findParent('div')
                if parent.get('id') == 'sponsored_container':
                    continue
                h3 = resultDiv.find("h3")
                if (h3 is None):
                    continue
                url = h3.find("a", href=True)['href']
                results.append(url)
            try:
                numberOfArticlePages = 0
                articles = []
                articleURLs = []
                articleRanks = []
                total = len(results)
                # Was using gevent, but then switched to python3.3, which does not yet support gevent
                crawlThreads = []
                for index, res in enumerate(results):
                    t = threading.Thread(target=FetchArticle.fetch, args=(keyword, res, index, articles, len(results)))
                    t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
                    t.start()
                    crawlThreads.append(t)
                for thread in crawlThreads:
                    thread.join(60)
                    if (thread.isAlive()):
                        print("A URL has timed out")
                # I was using grequests, but it appeared that they were leaking file descriptors... :(
                #articleRequests = (grequests.get(res, hooks = {'response' : do_something}, timeout=60) for res in results)
                #articlePages = grequests.map(articleRequests)
                print()
            except gevent.Timeout:  # leftover from the gevent version; nothing above raises it anymore
                if (numberOfArticlePages >= len(results) * self.MIN_ARTICLES_PERCENT):
                    print("Timeout, but continuing because we have " + str(numberOfArticlePages) + " articles.")
                else:
                    print("Timeout, going to next day because we have only " + str(numberOfArticlePages) + " articles.")
                    continue  # note: inside the keyword loop, this advances to the next keyword, not the next day
            print("All articles (" + str(len(articles)) + " of 'em) for date " + dateToSearch.strftime("%m/%d/%Y") + " and keyword " + keyword + " returned, now being analyzed...")
            i = 1
            mutex_writefile = threading.Lock()
            #for res in results:
            self.dbError = True
            # This might get some articles twice, but it's okay: once they are in the db it won't matter.
            while (self.dbError == True):
                print("Gathering articles")
                self.dbError = False
                self.threads = []
                for article in articles:
                    #articleURL = res.url.encode("utf8")
                    thread = SentimentThread(article, i, datum_box, proxyIP, self, keyword, fileName, dateToSearch, mutex_writefile, user['subscriptionID'])
                    #thread.daemon = True
                    thread.start()
                    self.threads.append(thread)
                    i = i + 1
                    if (len(self.threads) >= self.MAX_THREADS):
                        for thread in self.threads:
                            thread.join()
                for thread in self.threads:
                    thread.join()
                self.articlesCompleted = 0
                if (self.dbError == True):
                    # The current API key is exhausted or failing; fall back to the previous user.
                    userNumber = userNumber - 1
                    if (userNumber < 0):
                        print("Out of Users")
                        return
                    user = userInfo[userNumber]
                    datum_box = DatumBox(user['APIKey'])
                    print("Switching users and trying again.")
                    if self.useTor:
                        # Cycle Tor to get a fresh exit node along with the fresh key.
                        self.torThread.stop()
                        self.torThread.join()
                        time.sleep(5)
                        print("Tor stopped. Restarting...")
                        self.torThread = TorThread(self)
                        self.torThread.start()
                        time.sleep(5)
                        print("Tor restarted. Continuing...")
            print()
        # Next day!
        print()
        dateToSearch = dateToSearch - timedelta(1)
        daysToGoBack += 1
    sentimentsFile.close()
    print("Congratulations! All articles analyzed. Shutting down...")
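
# The gevent-to-threads switch in the crawl above reduces to a fan-out /
# bounded-join pattern that can be isolated as below. This is a sketch, not
# the project's code: the fetch() stand-in replaces FetchArticle.fetch, and
# the commented-out URLs are placeholders.
def fetchAll(urls, timeout=60):
    import threading
    import urllib.request
    articles = []

    def fetch(url, index):
        # Stand-in for FetchArticle.fetch: download one page, record its body.
        # list.append is atomic under CPython's GIL, so no lock is needed here.
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                articles.append((index, response.read()))
        except Exception as e:
            print("Failed to fetch " + url + ": " + str(e))

    crawlThreads = []
    for index, url in enumerate(urls):
        t = threading.Thread(target=fetch, args=(url, index))
        t.daemon = True  # a hung download should not block interpreter exit
        t.start()
        crawlThreads.append(t)
    # Bounded join: wait up to `timeout` seconds per thread, then move on;
    # daemon threads that outlive the join are abandoned, not cancelled.
    for t in crawlThreads:
        t.join(timeout)
        if t.is_alive():
            print("A URL has timed out")
    return articles

#fetchAll(["http://example.com/", "http://example.org/"])
# Note: joining each thread with its own timeout means the worst-case wait
# grows with the thread count; a single shared deadline, decremented across
# the joins, would bound the total wait instead.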