def test():
    """Smoke-test: fetch one known dt.ua article and report its text quality."""
    root = downloader_common.rootPath
    dl = Downloader(root)
    # configure logging after the downloader is constructed, as before
    logging.basicConfig(
        filename='downloader_dt_news.log',
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s\t%(module)s\t%(message)s',
        datefmt='%d.%m.%Y %H:%M:%S')
    article = dl.loadArticle(
        'https://dt.ua/macrolevel/nbu-bezturbotniy-vibir-meti-zayava-nacionalnogo-banku-pro-zminu-monetarnoyi-politiki-ta-pochatok-zhittya-za-novimi-pravilami-inflyaciynogo-targetuvannya-zalishaye-bez-vidpovidi-bagato-gostrih-pitan-yak-po-suti-samogo-povidomlennya-tak-i-nashog'
    )
    print(article.info())
    body_text = " ".join(article.body).strip()
    ret, retMsg = stats.TextStats(body_text).isStoreText()
    if not ret:
        print("WARNING: " + retMsg)
def getNewsForDate(self, date):
    """Download all Ukrainian articles linked from the archive page for *date*.

    Runs ``self.getLinksCmd`` (shell pipeline) against the archive URL,
    loads every relative link once, and keeps only articles whose text is
    detected as Ukrainian (or undetected but containing Ukrainian-only
    letters).  Returns the kept articles sorted by their time string.
    """
    print('get news for ' + date.strftime('%d.%m.%Y'))
    url = self.baseUrl + '/archives/date_' + date.strftime('%d%m%Y') + '/'
    print('url: ' + url)
    # replace {0} with url
    articleList = list()
    downloadedUrls = set()
    cmd = self.getLinksCmd.format(url)
    #print('cmd: ' +cmd)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    for ln in p.stdout:
        line = ln.decode('utf-8').strip()
        if len(line) > 0 and not line.startswith('http') and line.startswith('/') and line not in downloadedUrls:
            print('load article: ' + self.baseUrl + line)
            try:
                article = self.loadArticle(self.baseUrl + line)
                if article is not None:
                    bAddToList = True
                    text = " ".join(article.body).strip()
                    if len(text) > 0:
                        textStats = stats.TextStats(text)
                        # run each language probe once instead of re-evaluating
                        # it in every elif branch
                        isUkr = textStats.isUkr()
                        isRus = textStats.isRus()
                        isEng = textStats.isEng()
                        if isUkr and isRus:
                            bAddToList = False
                            logging.warning("IGNORE: Article is Ukr and Rus. URL: " + line)
                            logging.info(" stats: " + str(textStats.common_text_20))
                        elif isRus:
                            bAddToList = False
                            logging.warning("IGNORE: Article is Rus. URL: " + line)
                        elif isEng:
                            bAddToList = False
                            logging.warning("IGNORE: Article is Eng. URL: " + line)
                        elif isUkr:
                            bAddToList = True
                        else:
                            # Language undetected: by now none of Ukr/Rus/Eng
                            # holds, so the original trailing else (which used
                            # the deprecated logging.warn) was unreachable and
                            # has been removed.
                            if textStats.hasUkrLetter():
                                bAddToList = True
                            else:
                                bAddToList = False
                                logging.warning("IGNORE: Article language not detected. Has no only-ukr chars. URL: " + line)
                    else:
                        bAddToList = False
                        logging.error("IGNORE: Article is empty. URL: " + self.baseUrl + line)
                    if bAddToList:
                        articleList.append(article)
                        downloadedUrls.add(line)
                else:
                    #exit
                    logging.error("Article can not be loaded from URL: " + self.baseUrl + line)
                    #sys.exit("Article can not be loaded from URL: "+self.baseUrl + line)
            except SystemExit:
                raise
            except Exception:
                # narrowed from a bare except so KeyboardInterrupt propagates,
                # matching the sitemap variant of this method
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("Unexpected error: ", exc_type)
                traceback.print_exception(exc_type, exc_value, exc_traceback)
        else:
            print('ignore url: ' + line)
    # order articles by time
    return sorted(articleList, key=lambda x: x.timeStr)
def getNewsForDate(self, date):
    """Download all Ukrainian articles linked from the archive page for *date*.

    Like the sibling archive variant, but with stricter handling of
    undetected languages (Rus letters / short texts / hard-coded ignore
    lists) and a hard stop (``sys.exit``) on unexplained cases so they can
    be triaged manually.  Returns kept articles sorted by time string.
    """
    print('get news for ' + date.strftime('%d.%m.%Y'))
    url = self.baseUrl + '/archives/date_' + date.strftime('%d%m%Y') + '/'
    print('url: ' + url)
    articleList = list()
    downloadedUrls = set()
    # replace {0} with url
    cmd = self.getLinksCmd.format(url)
    #print('cmd: ' +cmd)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    for ln in p.stdout:
        line = ln.decode('utf-8').strip()
        if len(line) > 0 and not line.startswith('http') and line.startswith('/') and line not in downloadedUrls:
            print('load article: ' + self.baseUrl + line)
            try:
                article = self.loadArticle(self.baseUrl + line)
                if article is not None:
                    bAddToList = True
                    text = " ".join(article.body).strip()
                    if len(text) > 0:
                        textStats = stats.TextStats(text)
                        # run each language probe once instead of per branch
                        isUkr = textStats.isUkr()
                        isRus = textStats.isRus()
                        isEng = textStats.isEng()
                        if isUkr and isRus:
                            bAddToList = False
                            logging.warning("IGNORE: Article is Ukr and Rus. URL: " + self.baseUrl + line)
                            logging.info(" stats: " + str(textStats.common_text_20))
                        elif isRus:
                            bAddToList = False
                            logging.warning("IGNORE: Article is Rus. URL: " + self.baseUrl + line)
                        elif isEng:
                            bAddToList = False
                            logging.warning("IGNORE: Article is Eng. URL: " + self.baseUrl + line)
                        elif not (isUkr or isRus or isEng):
                            if textStats.hasRusLetter():
                                bAddToList = False
                                logging.warning("IGNORE: Article (language not detected) has Rus letters. URL: " + self.baseUrl + line)
                            elif textStats.hasUkrLetter():
                                bAddToList = True
                            elif len(text) < 450:
                                #ignore article
                                bAddToList = False
                                logging.warning("IGNORE: Article language not detected. URL: " + self.baseUrl + line)
                                logging.info(" text length: " + str(len(text)))
                                logging.info(" stats: " + str(textStats.common_text_20))
                            elif line in [
                                    '/articles/2012/10/28/6975576/',
                                    '/articles/2014/01/23/7011063/',
                                    '/articles/2014/01/28/7011761/'
                            ]:
                                # known articles with no detectable language
                                bAddToList = False
                                logging.error("IGNORE: Article has not language. URL: " + self.baseUrl + line)
                            else:
                                # long, unlisted, language unknown: stop for manual triage
                                logging.error("Article language not detected. URL: " + self.baseUrl + line)
                                logging.info(" text length: " + str(len(text)))
                                logging.info(" stats: " + str(textStats.common_text_20))
                                print(article.info())
                                sys.exit("Article language not detected. URL: " + self.baseUrl + line)
                    else:
                        if line in [
                                '/articles/2005/11/17/3019729/',
                                '/articles/2006/02/8/3061761/',
                                '/articles/2007/01/31/3203836/',
                                '/articles/2007/03/15/3216901/',
                                '/articles/2007/03/28/3221114/',
                                '/articles/2007/03/30/3222055/',
                                '/articles/2007/03/31/3222674/',
                                '/articles/2007/04/3/3224158/',
                                '/articles/2007/04/3/3224119/',
                                '/articles/2007/04/11/3227795/',
                                '/articles/2007/04/11/3227746/',
                                '/articles/2007/09/30/3292450/',
                                '/articles/2008/05/26/3448561/',
                                '/articles/2008/05/26/3448546/',
                                '/articles/2009/01/12/3668969/',
                                '/articles/2009/06/10/4013079/',
                                '/news/2010/01/15/4621064/',
                                '/articles/2010/01/18/4630133/',
                                '/news/2010/05/31/5093418/',
                                '/news/2010/06/18/5152762/',
                                '/news/2010/06/22/5161355/',
                                '/news/2010/07/12/5216065/',
                                '/news/2010/10/12/5471544/',
                                '/news/2011/01/18/5801413/',
                                '/news/2011/01/28/5847095/',
                                '/news/2011/02/8/5893563/',
                                '/articles/2011/02/28/5968537/',
                                '/news/2011/03/23/6044026/',
                                '/news/2011/03/25/6051379/',
                                '/news/2011/06/16/6302922/',
                                '/articles/2011/11/16/6758771/',
                                '/articles/2012/04/5/6962138/',
                                '/articles/2012/04/20/6963082/',
                                '/articles/2012/04/20/6963077/',
                                '/articles/2012/07/30/6969816/',
                                '/articles/2012/07/30/6967948/',
                                '/articles/2012/08/1/6969957/',
                                '/news/2012/08/1/6969973/',
                                '/news/2013/12/3/7004679/',
                        ]:
                            # hard-coded list of known-empty articles
                            bAddToList = False
                            logging.error("IGNORE: Article is empty. URL: " + self.baseUrl + line)
                        elif len(article.timeStr) > 0 and len(article.title) > 0:
                            bAddToList = False
                            logging.error("IGNORE: Empty article with title and time. URL: " + self.baseUrl + line)
                        else:
                            bAddToList = False
                            logging.error("Article is empty. URL: " + self.baseUrl + line)
                            print(article.info())
                            #sys.exit("Article is empty. URL: "+self.baseUrl + line)
                    if len(article.body) == 1:
                        # NOTE(review): the collapsed source is ambiguous about
                        # this check's nesting; here it applies to every loaded
                        # article — confirm against history if possible
                        logging.warning("Article has one paragraph. URL: " + self.baseUrl + line)
                    if bAddToList:
                        articleList.append(article)
                        downloadedUrls.add(line)
                else:
                    #exit
                    logging.error("Article can not be loaded from URL: " + self.baseUrl + line)
                    #sys.exit("Article can not be loaded from URL: "+self.baseUrl + line)
            except SystemExit:
                raise
            except Exception:
                # narrowed from a bare except; the traceback is still printed
                # and the exception re-raised, as before
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("Unexpected error: ", exc_type)
                traceback.print_exception(exc_type, exc_value, exc_traceback)
                raise
        else:
            print('ignore url: ' + line)
    # order articles by time
    return sorted(articleList, key=lambda x: x.timeStr)
def getNewsForDate(self, date):
    """Download every article listed in the text sitemap for *date*.

    ``loadArticle`` may return an ``Article``, the marker string
    ``'reload'`` (timeout — retried up to 5 times), or ``None`` (fatal).
    Articles failing ``stats.TextStats.isStoreText`` are skipped with a
    warning.  Returns the accepted articles sorted by their time string.
    """
    url = self.baseUrl + '/sitemap/text/%d/%d/%d/index.html' % (date.year, date.month, date.day)
    print('get news for %d.%d.%d, url: %s' % (date.day, date.month, date.year, url))
    articleList = list()
    cmd = self.getLinksCmd.format(url)
    #print('cmd: ' +cmd)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    for ln in p.stdout:
        line = ln.decode('utf-8').strip()
        if len(line) > 0 and not line.startswith('/sitemap/text'):
            try:
                retryCount = 0
                while True:
                    # print ('[%d.%d.%d] ' % (date.day, date.month, date.year) + 'load article: '+self.baseUrl + line)
                    article = self.loadArticle(self.baseUrl + line)
                    if article is None:
                        #exit
                        sys.exit("Article can not be loaded from URL: " + self.baseUrl + line)
                    if isinstance(article, str):
                        if article == 'reload':
                            if retryCount > 4:
                                # BUGFIX: the give-up message used to repeat
                                # "try to reload article", identical to the
                                # retry warning, even though we abandon here
                                logging.error('[%s] Timeout, giving up on article. RetryCount = %d' % (str(date), retryCount))
                                break
                                # sys.exit("Timeout: Article can not be loaded. from URL: %s" % (self.baseUrl + line))
                            retryCount += 1
                            logging.warning('[%s] Timeout, try to reload article. RetryCount = %d' % (str(date), retryCount))
                            continue
                        # any other marker string: skip this URL silently
                        break
                    if isinstance(article, Article):
                        text = " ".join(article.body).strip()
                        textStats = stats.TextStats(text)
                        (ret, retMsg) = textStats.isStoreText()
                        if ret:
                            articleList.append(article)
                        else:
                            logging.warning(retMsg + " date %s, URL: %s" % (str(date), self.baseUrl + line))
                        break
                    #exit
                    sys.exit("Unknown article type : " + str(article))
            except (SystemExit, KeyboardInterrupt):
                raise
            except Exception:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("Unexpected error: ", exc_type)
                traceback.print_exception(exc_type, exc_value, exc_traceback)
    # order articles by time
    return sorted(articleList, key=lambda x: x.timeStr)
def getNewsForDate(self, date):
    """Download all Ukrainian articles from the archive page for *date*.

    Telekritika variant: archive URL is built from year/month/day query
    parameters and links are absolute (must start with ``self.baseUrl``).
    Empty/Russian/English articles and a hard-coded ignore list are
    skipped; a failed load is fatal.  Returns articles sorted by time.
    """
    print('get news for ' + date.strftime('%d.%m.%Y'))
    url = self.baseUrl + '/archivedate.php?AYear=' + str(date.year) + '&AMonth=' + str(date.month) + '&ADay=' + str(date.day)
    print('url: ' + url)
    articleList = list()
    downloadedUrls = set()
    # replace {0} with url
    cmd = self.getLinksCmd.format(url)
    #print('cmd: ' +cmd)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    for ln in p.stdout:
        line = ln.decode('utf-8').strip()
        if len(line) > 0 and line.startswith(self.baseUrl) and line not in downloadedUrls:
            print('load article: ' + line)
            try:
                article = self.loadArticle(line)
                if article is not None:
                    bAddToList = True
                    text = " ".join(article.body).strip()
                    if len(text) > 0:
                        textStats = stats.TextStats(text)
                        # run each language probe once instead of per branch
                        isUkr = textStats.isUkr()
                        isRus = textStats.isRus()
                        isEng = textStats.isEng()
                        if isUkr and isRus:
                            bAddToList = False
                            logging.warning("IGNORE: Article is Ukr and Rus. URL: " + line)
                            logging.info(" stats: " + str(textStats.common_text_20))
                        elif isRus:
                            bAddToList = False
                            logging.warning("IGNORE: Article is Rus. URL: " + line)
                        elif isEng:
                            bAddToList = False
                            logging.warning("IGNORE: Article is Eng. URL: " + line)
                        elif not (isUkr or isRus or isEng):
                            if textStats.hasRusLetter():
                                bAddToList = False
                                logging.warning("IGNORE: Article (language not detected) has Rus letters. URL: " + line)
                            elif len(text) < 450:
                                #ignore article
                                bAddToList = False
                                logging.warning("IGNORE: Article language not detected. URL: " + line)
                                logging.info(" text length: " + str(len(text)))
                                logging.info(" stats: " + str(textStats.common_text_20))
                            elif textStats.hasUkrLetter():
                                bAddToList = True
                            else:
                                logging.error("Article language not detected. URL: " + line)
                                logging.info(" text length: " + str(len(text)))
                                logging.info(" stats: " + str(textStats.common_text_20))
                                bAddToList = False
                                #sys.exit("Article language not detected. URL: "+ line)
                    else:
                        if line in [
                                'http://www.telekritika.ua/knigi-tk/2009-06-17/46263',
                                'http://www.telekritika.ua/medialiteracy/2010-10-01/56304',
                                'http://www.telekritika.ua/medialiteracy/2010-10-07/56435',
                                'http://www.telekritika.ua/notices/2010-10-08/56475',
                                'http://www.telekritika.ua/medialiteracy/2010-10-12/56540',
                                'http://www.telekritika.ua/tel/2010-10-22/56827',
                                'http://www.telekritika.ua/news/2010-11-05/57249',
                                'http://www.telekritika.ua/news/2010-11-08/57319',
                                'http://www.telekritika.ua/tel/2010-11-22/57742',
                                'http://www.telekritika.ua/profesiya/2010-11-29/57931'
                        ]:
                            # hard-coded list of known-empty articles
                            bAddToList = False
                            logging.error("IGNORE: Article is empty. URL: " + line)
                        elif len(article.timeStr) > 0 and len(article.title) > 0:
                            bAddToList = False
                            logging.error("IGNORE: Empty article with title and time. URL: " + line)
                        else:
                            bAddToList = False
                            logging.error("Article is empty. URL: " + line)
                            # BUGFIX: the info() return value was silently
                            # discarded here; sibling variants print it
                            print(article.info())
                            #sys.exit("Article is empty. URL: "+ line)
                    if bAddToList:
                        if len(article.body) == 1:
                            logging.warning("Article (length = " + str(len(text)) + ") has one paragraph. URL: " + line)
                        articleList.append(article)
                        downloadedUrls.add(line)
                else:
                    #exit
                    logging.error("Article can not be loaded from URL: " + line)
                    sys.exit("Article can not be loaded from URL: " + line)
            except SystemExit:
                raise
            except Exception:
                # narrowed from a bare except; this variant logs and continues
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("Unexpected error: ", exc_type)
                traceback.print_exception(exc_type, exc_value, exc_traceback)
        else:
            print('ignore url: ' + line)
    # order articles by time
    return sorted(articleList, key=lambda x: x.timeStr)