def runDatumBoxSentimentAnalysis(self, articles):
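     # Quick accuracy check: run the DatumBox classifier over articles whose sentiment is
     # already known and count how many guesses match. Assumes each article object exposes
     # .sentiment and that SentimentThread.getSentimentOfArticle() returns a comparable
     # integer code.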
     numWrong = 0
     #torThread = TorThread(self)
     #torThread.start()
     #time.sleep(5)
     datumBox = DatumBox("08fe94b761715219d636bd338b1cd984") 
     for article in articles:
         sentimentModule = SentimentThread(article=article,articleNumber=0, datum_box=datumBox, proxy="127.0.0.1:8118")
         guessedSentiment = sentimentModule.getSentimentOfArticle()
         #print(article.getText())
         print("Guess: " + str(guessedSentiment) + ", actual: " + str(article.sentiment))
         if int(article.sentiment) != int(guessedSentiment):
             numWrong += 1
     print("\nCorrectly analyzed " + str(len(articles) - numWrong) + " out of " + str(len(articles)) + " articles.")
    def crawl(self, userInfo, userNumber, dateToSearch, daysToSearch, fileName):
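        # Search day by day, starting at dateToSearch and walking back daysToSearch days,
        # scrape the resulting articles, and hand them to SentimentThreads for analysis.
        # userInfo is assumed to be a list of dicts carrying 'APIKey' and 'subscriptionID'
        # for the DatumBox accounts to fall back through when a key stops working (dbError).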
        global proxies
        user = userInfo[userNumber]
        datum_box = DatumBox(user['APIKey'])  
        sentimentsFile = open(fileName, "a")
        daysToGoBack = 1
        #googleHttp = urllib3.PoolManager()
        while(daysToGoBack <= daysToSearch):
            #toSleep = 10
            #print "sleeping for " + str(toSleep) + " seconds in a pathetic attempt to bypass Google...."
            #time.sleep(toSleep)
            print "Searching on date " + dateToSearch.strftime("%m/%d/%Y") + " (day " + str(daysToGoBack) + " of " + str(daysToSearch) + ")"
            
            #try:
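            # The daterange: operator expects Julian day numbers; jdcal.gcal2jd returns a
            # two-part Julian date, so summing the parts and rounding gives the day number.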
            julianDate = trunc(sum(jdcal.gcal2jd(dateToSearch.year, dateToSearch.month, dateToSearch.day)) + .5)
            keyword = "investing"
            sites = ["http://online.wsj.com/", "http://www.bloomberg.com/news/", "http://www.rttnews.com/", "http://www.reuters.com/finance",  "http://www.usatoday.com/", "money.usnews.com", "www.ft.com/home/us", "http://www.cnbc.com/" ]
            query = "site:money.cnn.com"
            for site in sites:
                query = query + " OR site:" + site
            query = query + " " + keyword + " daterange:" + str(julianDate) + "-" + str(julianDate)
            print "Query: " + query
            query = quote_plus(query)
            #gs = GoogleSearch(query);
            #gs.results_per_page = 50
            #results = gs.get_results()
            headers={'User-agent' : 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6'}
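            # Query StartPage (which proxies Google results) rather than Google itself; the
            # prfh parameter is StartPage's encoded preferences blob (English UI, 50 results,
            # SSL on, etc.), and a browser-like User-agent is sent to avoid being blocked.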
            startPageQuery = "https://startpage.com/do/search?cat=web&cmd=process_search&language=english&engine0=v1all&abp=1&x=-843&y=-302&prfh=lang_homepageEEEs%2Fair%2Feng%2FN1Nenable_post_methodEEE0N1NsslEEE1N1Nfont_sizeEEEmediumN1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE1N1Nnum_of_resultsEEE50N1N&suggestOn=0&query=" + query
            #request = urllib2.Request('GET', startPageQuery, None, headers)
            request = urllib2.Request(url=startPageQuery, data=None, headers=headers)
            f = urllib2.urlopen(request)
            resultsPage = f.read()
            f.close()
            soup = BeautifulSoup(resultsPage)
            resultDivs = soup.findAll("div", { "class" : "result" })
            results = []
            for resultDiv in resultDivs:
                if ('none' in resultDiv.get('class')):
                    continue
                h3 = resultDiv.find("h3")
                url = h3.find("a", href=True)['href']
                results.append(url)
            #except SearchError, e:
            #    print "Search failed: %s" % e
            #    continue
            #Crawl to each web site and extract web articles
            try:
                numberOfArticlePages = 0
                articles = []
                articleURLs = []
                total = len(results)
                def fetch(url):
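                    # Pull the page at url, extract the readable article text with
                    # webarticle2text, and record the URL alongside it; the gevent Pool
                    # below runs up to 50 of these fetches concurrently.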
                    #request = urllib2.Request(url)
                    #f = urllib2.urlopen(request)
                    articles.append(webarticle2text.extractFromURL(url, timeout=60))
                    articleURLs.append(url)
                    print(articleURLs[len(articleURLs) - 1])
                    print(articles[len(articles) - 1])
                    #f.close()
                    print "\r" + str(len(articles)) + " / " + str(len(results)) + " articles crawled to.",
                    sys.stdout.flush()
                    #response = requests.request('GET', url, hooks = {'response' : do_something}, timeout=60.0)
                    #articlePages.append(response)
            
                pool = Pool(50)
                for res in results:
                    pool.spawn(fetch, res)
                pool.join()
                del pool
                # I was using grequests, but it appeared that they were leaking file descriptors... :(
                #articleRequests = (grequests.get(res, hooks = {'response' : do_something}, timeout=60) for res in results)
                #articlePages = grequests.map(articleRequests)
                print
            except gevent.Timeout:
                if (numberOfArticlePages >= len(results) * self.MIN_ARTICLES_PERCENT):
                    print ("Timeout, but continuing because we have " + str(numberOfArticlePages) + " articles.")
                else:
                    print ("Timeout, going to next day because we have only " + str(numberOfArticlePages) + " articles.")
                    dateToSearch = dateToSearch - timedelta(1)
                    daysToGoBack += 1
                    continue
            '''articles = []
            articleURLs = []
            for articlePage in articlePages:
                if (articlePage is not None):
                    articles.append(webarticle2text.extractFromHTML(articlePage.text))
                    articleURLs.append(articlePage.url)
            '''
            print "All articles (" + str(len(articles)) + " of 'em) for date " + dateToSearch.strftime("%m/%d/%Y") + " returned, now being analyzed..."

            i = 1
            self.threads = []
            mutex_writefile = threading.Lock()
            #for res in results:
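            # dbError is presumably flipped by a SentimentThread when a DatumBox call fails
            # (e.g. the API key runs out of credit); when that happens the batch is re-run
            # with the previous user's key.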
            self.dbError = True
            while (self.dbError == True): # This might get some articles twice, but it's okay because once they are in the db it won't matter
                self.dbError = False
                for articleText, articleURL in zip(articles, articleURLs):
                    #articleURL = res.url.encode("utf8")
                    thread = SentimentThread(articleURL, articleText, i, fileName, dateToSearch, mutex_writefile, datum_box, "127.0.0.1:8118", self, user['subscriptionID'])
                    #thread.daemon = True
                    thread.start()
                    self.threads.append(thread)
                    i = i + 1
                    if (len(self.threads) >= self.MAX_THREADS):
                        for thread in self.threads:
                            thread.join()
                        self.threads = []  # start a fresh batch so the MAX_THREADS cap keeps applying
                for thread in self.threads:
                    thread.join()
                print
                self.articlesCompleted = 0
                if (self.dbError == True):
                    userNumber = userNumber - 1
                    if (userNumber < 0):
                        print("Out of Users")
                        return
                    user = userInfo[userNumber]
                    datum_box = DatumBox(user['APIKey']) 
                    
            dateToSearch = dateToSearch - timedelta(1)
            daysToGoBack += 1
        sentimentsFile.close()
        print("Congratulations! All articles analyzed. Shutting down...")
 def crawl(self, userInfo, userNumber, dateToSearch, daysToSearch, fileName, useTor):
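      # Python 3 rewrite of crawl(): same day-by-day StartPage search, but run once per
      # company keyword (roughly the Dow 30 constituents), optionally routed through the
      # local Tor proxy (127.0.0.1:8118) when useTor is set, and using plain threads
      # instead of gevent.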
     self.useTor = useTor
     global proxies
     user = userInfo[userNumber]
     datum_box = DatumBox(user['APIKey'])  
     sentimentsFile = open(fileName, "a")
      daysToGoBack = 1
     if useTor:
         print("Starting Tor...")
         self.torThread = TorThread(self)
         self.torThread.start()
         time.sleep(5)
         proxyIP = '127.0.0.1:8118'
     else:
         proxyIP = ''
     #googleHttp = urllib3.PoolManager()
     while(daysToGoBack <= daysToSearch):
         print("Searching on date " + dateToSearch.strftime("%m/%d/%Y") + " (day " + str(daysToGoBack) + " of " + str(daysToSearch) + ")")
         
         julianDate = trunc(sum(jdcal.gcal2jd(dateToSearch.year, dateToSearch.month, dateToSearch.day)) + .5)
         #keyword = '"3M Co" OR "American Express" OR "AT&T" OR "Boeing" OR "Caterpillar" OR "Chevron" OR "Cisco" OR "Dupont E I De Nemours" OR "Exxon" OR "General Electric" OR "Goldman Sachs" OR "Home Depot" OR "Intel" OR "IBM"'# OR "Johnson & Johnson" OR "JPMorgan Chase" OR "McDonald\'s" OR "Merck and Co" OR "Microsoft" OR "Nike" OR "Pfizer" OR "Procter & Gamble" OR "Coca-Cola" OR "Travelers Companies" OR "United Technologies" OR "UnitedHealth" OR "Verizon" OR "Visa" OR "Wal-Mart" OR "Walt Disney"'
         #keywords = ["\"3M Co\"", "\"American Express\"", "\"AT&T Inc\"", "\"Boeing Co\"", "\"Caterpillar Inc\"", "\"Chevron\"", "\"Cisco Systems Inc\"", "\"DuPont\"","\"Exxon Mobil\"","\"General Electric\"","\"Goldman Sachs Group\"","\"Home Depot\"","\"Intel Corp\"","\"International Business Machines\"","\"johnson and johnson\" OR \"Johnson & Johnson\"","\"JP Morgan\"","\"McDonald's Corp\"","\"Merck & Co\"","\"Microsoft Corp\" OR \"Microsoft Corporation\"","\"Nike\"","\"Pfizer\"","\"Procter & Gamble\" OR \"Procter And Gamble\"","\"Coca-Cola\"","\"United Technologies Corp\"","\"UnitedHealth Group\"","\"Verizon Communications\"","\"Visa Inc\"","\"Wal-Mart\"","\"Walt Disney Company\""];
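          # One search per company keyword (roughly the Dow 30 constituents), each combined
          # with the same site: filters and daterange below.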
         keywords = ["\"3M Co\"", "\"American Express\"", "\"AT&T\"", "\"Boeing\"", "\"Caterpillar Inc\"", "\"Chevron\"", "\"Cisco Systems\"", "\"DuPont\"","\"Exxon Mobil\"","\"General Electric\"","\"Goldman Sachs Group\"","\"Home Depot\"","\"Intel Corp\"","\"International Business Machines\"","\"Johnson and Johnson\" OR \"Johnson & Johnson\"","\"JP Morgan\"","\"McDonald's Corp\"","\"Merck & Co\"","\"Microsoft Corp\" OR \"Microsoft Corporation\"","\"Nike\"","\"Pfizer\"","\"Procter & Gamble\" OR \"Procter And Gamble\"","\"Coca-Cola\"","\"United Technologies Corp\"","\"UnitedHealth Group\"","\"Verizon Communications\"","\"Visa Inc\"","\"Wal-Mart\"","\"Walt Disney Company\""];
         sites = ["http://money.cnn.com/" + dateToSearch.strftime("%Y"), "http://www.bloomberg.com/news/", "http://www.rttnews.com/", "http://www.reuters.com/finance", "money.usnews.com", "www.ft.com/home/us", "http://www.cnbc.com/", "http://www.fool.com", "http://www.thestreet.com", "http://www.zacks.com", "http://www.seekingalpha.com/article" ]
          baseQuery = " OR ".join("site:" + site for site in sites)
          baseQuery = baseQuery + " daterange:" + str(julianDate) + "-" + str(julianDate) + " "
         
         numberOfResults = 50
         for keyword in keywords:
              query = quote_plus(baseQuery + keyword)
             startPageQuery = "https://startpage.com/do/search?cat=web&abp=1&prfh=lang_homepageEEEs%2Fair%2Feng%2FN1Nenable_post_methodEEE0N1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE1N1Nnum_of_resultsEEE" + str(numberOfResults) + "N1N&suggestOn=0&query=" + query 
             #startPageQuery = "http://ipinfo.io/"
             binaryData = urlencode({'language' : 'english', 'cmd' : 'process_search', 'engine0' :'v1all'}).encode('utf-8')
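              # Note: binaryData is prepared for a POST but never attached to the request
              # below (data=None), so the search is issued as a plain GET.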
             print (startPageQuery)
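              # Build an opener that forces an SSLv3 context and the local proxy. SSLv3 is
              # insecure and PROTOCOL_SSLv3 is unavailable in modern Python/OpenSSL builds,
              # so this workaround only runs on older interpreters.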
             ssl_context = ssl.SSLContext(ssl.PROTOCOL_SSLv3) #monkey patch...
             proxy_support = urllib.request.ProxyHandler({'http': proxyIP})
             opener = urllib.request.build_opener(proxy_support, urllib.request.HTTPSHandler(context=ssl_context)) #monkey patch...
             urllib.request.install_opener(opener) #monkey patch...
             #headers={'User-agent' : 'Mozilla/5.0', 'Connection':'close'} #No longer used because StartPage blocked it for a while
             #headers={'User-agent' : 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6', 'Connection':'close'}
             browser = random.choice(BROWSERS)
             headers={'User-agent' : browser}
             request = urllib.request.Request(url=startPageQuery, data = None, headers=headers)
             response = opener.open(request)
             resultsPage = response.read().decode('utf-8')#.decode('iso-8859-1')
             #print(resultsPage)
             soup = BeautifulSoup(resultsPage)
             resultDivs = soup.findAll("div", { "class" : "result" })
             results = []
             for resultDiv in resultDivs:
                 if ('none' in resultDiv.get('class')):
                     continue
                 parent = resultDiv.findParent('div')
                 if parent.get('id') == 'sponsored_container':
                     continue
                 h3 = resultDiv.find("h3")
                 if (h3 is None):
                     continue
                 url = h3.find("a", href=True)['href']
                 results.append(url)
             try:
                 numberOfArticlePages = 0
                 articles = []
                 articleURLs = []
                 articleRanks = []
                 total = len(results)
         
                 # Was using gevent, but then switched to python3.3, which does not yet support gevent
                 crawlThreads = []
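                  # One daemon thread per result URL; FetchArticle.fetch (defined elsewhere)
                  # is assumed to append the extracted article text to the shared articles
                  # list, with each thread given up to 60 seconds before being abandoned.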
                 for index,res in enumerate(results):
                     t = threading.Thread(target=FetchArticle.fetch, args=(keyword, res, index, articles, len(results)))
                     t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
                     t.start()
                     crawlThreads.append(t)
                 for thread in crawlThreads:
                     thread.join(60)
                      if (thread.is_alive()):
                         print("A URL has timed out")
                 # I was using grequests, but it appeared that they were leaking file descriptors... :(
                 #articleRequests = (grequests.get(res, hooks = {'response' : do_something}, timeout=60) for res in results)
                 #articlePages = grequests.map(articleRequests)
                 print()
              except gevent.Timeout:  # leftover from the gevent version; the plain threads above will not raise this
                  if (numberOfArticlePages >= len(results) * self.MIN_ARTICLES_PERCENT):
                      print("Timeout, but continuing because we have " + str(numberOfArticlePages) + " articles.")
                  else:
                      print("Timeout, skipping this keyword because we have only " + str(numberOfArticlePages) + " articles.")
                      continue
             print("All articles (" + str(len(articles)) + " of 'em) for date " + dateToSearch.strftime("%m/%d/%Y") + " and keyword " + keyword + " returned, now being analyzed...")
 
             i = 1
             mutex_writefile = threading.Lock()
             #for res in results:
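              # Same retry scheme as the Python 2 version: if a SentimentThread reports a
              # DatumBox error, switch to the previous user's API key, restart Tor when in
              # use (presumably to pick up a fresh circuit), and re-run the whole batch.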
             self.dbError = True
              while (self.dbError == True): # This might get some articles twice, but it's okay because once they are in the db it won't matter
                 print("Gathering articles")
                 self.dbError = False
                 self.threads = []
                 for article in articles:
                     #articleURL = res.url.encode("utf8")
                     thread = SentimentThread(article, i, datum_box, proxyIP, self, keyword, fileName, dateToSearch, mutex_writefile, user['subscriptionID'])
                     #thread.daemon = True
                     thread.start()
                     self.threads.append(thread)
                     i = i + 1
                      if (len(self.threads) >= self.MAX_THREADS):
                          for thread in self.threads:
                              thread.join()
                          self.threads = []  # start a fresh batch so the MAX_THREADS cap keeps applying
                 for thread in self.threads:
                     thread.join()
                 self.articlesCompleted = 0
                 if (self.dbError == True):
                     userNumber = userNumber - 1
                     if (userNumber < 0):
                         print("Out of Users")
                         return
                     user = userInfo[userNumber]
                     datum_box = DatumBox(user['APIKey']) 
                     print("Switching users and trying again.")
                     if self.useTor:
                         self.torThread.stop()
                         self.torThread.join()
                         time.sleep(5)
                         print("Tor stopped. Restarting...")
                         self.torThread = TorThread(self)
                         self.torThread.start()
                         time.sleep(5)
                         print("Tor restarted. Continuing...")
             print()        
         #Next day!
         print()        
         dateToSearch = dateToSearch - timedelta(1)
         daysToGoBack += 1
     sentimentsFile.close()
     print("Congratulations! All articles analyzed. Shutting down...")