Ejemplo n.º 1
0
def test():
    """Smoke-run the downloader against one hard-coded dt.ua article.

    Builds a ``Downloader`` for ``downloader_common.rootPath``, configures
    DEBUG logging to ``downloader_dt_news.log``, loads the article, prints
    its ``info()`` and prints a warning when ``TextStats.isStoreText()``
    reports that the text should not be stored.
    """
    news_loader = Downloader(downloader_common.rootPath)

    log_options = {
        'filename': 'downloader_dt_news.log',
        'level': logging.DEBUG,
        'format': '%(asctime)s %(levelname)s\t%(module)s\t%(message)s',
        'datefmt': '%d.%m.%Y %H:%M:%S',
    }
    logging.basicConfig(**log_options)

    sample_url = 'https://dt.ua/macrolevel/nbu-bezturbotniy-vibir-meti-zayava-nacionalnogo-banku-pro-zminu-monetarnoyi-politiki-ta-pochatok-zhittya-za-novimi-pravilami-inflyaciynogo-targetuvannya-zalishaye-bez-vidpovidi-bagato-gostrih-pitan-yak-po-suti-samogo-povidomlennya-tak-i-nashog'
    loaded = news_loader.loadArticle(sample_url)
    print(loaded.info())

    # Paragraphs are joined with single spaces before the statistics run.
    joined_body = " ".join(loaded.body).strip()
    should_store, reason = stats.TextStats(joined_body).isStoreText()
    if should_store:
        return
    print("WARNING: " + reason)
Ejemplo n.º 2
0
 def getNewsForDate(self, date):
     """Load the articles linked from the archive page of ``date``.

     The archive URL is ``self.baseUrl + '/archives/date_DDMMYYYY/'``.  The
     shell command ``self.getLinksCmd`` (its ``{0}`` placeholder replaced by
     that URL) is run and each line it prints is treated as a link.  Only
     links starting with ``/`` that were not already accepted are loaded
     through ``self.loadArticle``.

     An article is kept when ``stats.TextStats`` reports ``isUkr()`` but
     neither ``isRus()`` nor ``isEng()``, or when none of the three checks
     is true but ``hasUkrLetter()`` is.  Rejected, empty or unloadable
     articles are logged and skipped.  Exceptions raised while handling one
     link are printed with their traceback and the loop continues;
     ``SystemExit`` and ``KeyboardInterrupt`` propagate.

     Args:
         date: value providing ``strftime``; selects the archive page.

     Returns:
         list: the kept articles sorted by their ``timeStr`` attribute.
     """
     print('get news for ' + date.strftime('%d.%m.%Y'))
     url = self.baseUrl + '/archives/date_' + date.strftime('%d%m%Y') + '/'
     print('url: ' + url)
     articleList = []
     downloadedUrls = set()
     # getLinksCmd is a template; {0} is replaced with the archive URL.
     cmd = self.getLinksCmd.format(url)
     # The context manager closes the pipe and waits for the child process,
     # so no file descriptor or zombie process is left behind.
     with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) as p:
         for ln in p.stdout:
             line = ln.decode('utf-8').strip()
             # A link starting with '/' is necessarily non-empty and cannot
             # start with 'http', so this single test covers all three.
             if not line.startswith('/') or line in downloadedUrls:
                 print('ignore url: ' + line)
                 continue
             print('load article: ' + self.baseUrl + line)
             try:
                 article = self.loadArticle(self.baseUrl + line)
                 if article is None:
                     logging.error("Article can not be loaded from URL: " + self.baseUrl + line)
                     continue
                 text = " ".join(article.body).strip()
                 if not text:
                     logging.error("IGNORE: Article is empty. URL: " + self.baseUrl + line)
                     continue
                 textStats = stats.TextStats(text)
                 isUkr = textStats.isUkr()
                 isRus = textStats.isRus()
                 bAddToList = False
                 if isUkr and isRus:
                     logging.warning("IGNORE: Article is Ukr and Rus. URL: " + line)
                     logging.info("   stats: " + str(textStats.common_text_20))
                 elif isRus:
                     logging.warning("IGNORE: Article is Rus. URL: " + line)
                 elif textStats.isEng():
                     logging.warning("IGNORE: Article is Eng. URL: " + line)
                 elif isUkr:
                     bAddToList = True
                 elif textStats.hasUkrLetter():
                     # No language detected, but Ukrainian-only letters exist.
                     bAddToList = True
                 else:
                     logging.warning("IGNORE: Article language not detected. Has no only-ukr chars. URL: " + line)
                 if bAddToList:
                     articleList.append(article)
                     downloadedUrls.add(line)
             except Exception:
                 # Best effort: report the failure and go on with the next link.
                 print("Unexpected error: ", sys.exc_info()[0])
                 traceback.print_exc()
     # order articles by time
     return sorted(articleList, key=lambda x: x.timeStr)
Ejemplo n.º 3
0
    def getNewsForDate(self, date):
        """Load the articles linked from the archive page of ``date``.

        The archive URL is ``self.baseUrl + '/archives/date_DDMMYYYY/'``.
        The shell command ``self.getLinksCmd`` (``{0}`` replaced by that URL)
        is run and each printed line is treated as a link.  Links starting
        with ``/`` that were not already accepted are loaded through
        ``self.loadArticle``.

        An article is kept when ``stats.TextStats`` reports ``isUkr()`` but
        neither ``isRus()`` nor ``isEng()``, or when no language is reported,
        ``hasRusLetter()`` is false and ``hasUkrLetter()`` is true.  Other
        articles are logged and skipped.  The process exits via
        ``sys.exit`` when a text of 450+ characters has no detectable
        language and is not on the hard-coded skip list.

        Args:
            date: value providing ``strftime``; selects the archive page.

        Returns:
            list: the kept articles sorted by their ``timeStr`` attribute.

        Raises:
            SystemExit: for an article whose language cannot be determined.
            Exception: any error raised while handling a link is printed
                with its traceback and re-raised.
        """
        print('get news for ' + date.strftime('%d.%m.%Y'))
        url = self.baseUrl + '/archives/date_' + date.strftime('%d%m%Y') + '/'
        print('url: ' + url)

        # Links whose language cannot be detected; skipped instead of
        # aborting the run.  Built once rather than on every iteration.
        noLanguageUrls = frozenset((
            '/articles/2012/10/28/6975576/',
            '/articles/2014/01/23/7011063/',
            '/articles/2014/01/28/7011761/',
        ))
        # Links listed in the original code as articles with empty text.
        knownEmptyUrls = frozenset((
            '/articles/2005/11/17/3019729/',
            '/articles/2006/02/8/3061761/',
            '/articles/2007/01/31/3203836/',
            '/articles/2007/03/15/3216901/',
            '/articles/2007/03/28/3221114/',
            '/articles/2007/03/30/3222055/',
            '/articles/2007/03/31/3222674/',
            '/articles/2007/04/3/3224158/',
            '/articles/2007/04/3/3224119/',
            '/articles/2007/04/11/3227795/',
            '/articles/2007/04/11/3227746/',
            '/articles/2007/09/30/3292450/',
            '/articles/2008/05/26/3448561/',
            '/articles/2008/05/26/3448546/',
            '/articles/2009/01/12/3668969/',
            '/articles/2009/06/10/4013079/',
            '/news/2010/01/15/4621064/',
            '/articles/2010/01/18/4630133/',
            '/news/2010/05/31/5093418/',
            '/news/2010/06/18/5152762/',
            '/news/2010/06/22/5161355/',
            '/news/2010/07/12/5216065/',
            '/news/2010/10/12/5471544/',
            '/news/2011/01/18/5801413/',
            '/news/2011/01/28/5847095/',
            '/news/2011/02/8/5893563/',
            '/articles/2011/02/28/5968537/',
            '/news/2011/03/23/6044026/',
            '/news/2011/03/25/6051379/',
            '/news/2011/06/16/6302922/',
            '/articles/2011/11/16/6758771/',
            '/articles/2012/04/5/6962138/',
            '/articles/2012/04/20/6963082/',
            '/articles/2012/04/20/6963077/',
            '/articles/2012/07/30/6969816/',
            '/articles/2012/07/30/6967948/',
            '/articles/2012/08/1/6969957/',
            '/news/2012/08/1/6969973/',
            '/news/2013/12/3/7004679/',
        ))

        articleList = []
        downloadedUrls = set()
        # getLinksCmd is a template; {0} is replaced with the archive URL.
        cmd = self.getLinksCmd.format(url)
        # The context manager closes the pipe and waits for the child, also
        # when sys.exit or an exception leaves the loop early.
        with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) as p:
            for ln in p.stdout:
                line = ln.decode('utf-8').strip()
                # A link starting with '/' is non-empty and cannot start
                # with 'http', so one test replaces the original three.
                if not line.startswith('/') or line in downloadedUrls:
                    print('ignore url: ' + line)
                    continue
                fullUrl = self.baseUrl + line
                print('load article: ' + fullUrl)
                try:
                    article = self.loadArticle(fullUrl)
                    if article is None:
                        logging.error("Article can not be loaded from URL: " +
                                      fullUrl)
                        continue
                    text = " ".join(article.body).strip()
                    bAddToList = False
                    if text:
                        textStats = stats.TextStats(text)
                        isUkr = textStats.isUkr()
                        isRus = textStats.isRus()
                        if isUkr and isRus:
                            logging.warning(
                                "IGNORE: Article is Ukr and Rus. URL: " +
                                fullUrl)
                            logging.info("   stats: " +
                                         str(textStats.common_text_20))
                        elif isRus:
                            logging.warning(
                                "IGNORE: Article is Rus. URL: " + fullUrl)
                        elif textStats.isEng():
                            logging.warning(
                                "IGNORE: Article is Eng. URL: " + fullUrl)
                        elif isUkr:
                            bAddToList = True
                        # From here on no language was detected at all.
                        elif textStats.hasRusLetter():
                            logging.warning(
                                "IGNORE: Article (language not detected) has Rus letters. URL: "
                                + fullUrl)
                        elif textStats.hasUkrLetter():
                            bAddToList = True
                        elif len(text) < 450:  # too short to matter: ignore
                            logging.warning(
                                "IGNORE: Article language not detected. URL: "
                                + fullUrl)
                            logging.info("   text length: " + str(len(text)))
                            logging.info("   stats: " +
                                         str(textStats.common_text_20))
                        elif line in noLanguageUrls:
                            logging.error(
                                "IGNORE: Article has not language. URL: " +
                                fullUrl)
                        else:
                            logging.error(
                                "Article language not detected. URL: " +
                                fullUrl)
                            logging.info("   text length: " + str(len(text)))
                            logging.info("   stats: " +
                                         str(textStats.common_text_20))
                            print(article.info())
                            sys.exit("Article language not detected. URL: " +
                                     fullUrl)
                    elif line in knownEmptyUrls:
                        logging.error("IGNORE: Article is empty. URL: " +
                                      fullUrl)
                    elif len(article.timeStr) > 0 and len(article.title) > 0:
                        logging.error(
                            "IGNORE: Empty article with title and time. URL: "
                            + fullUrl)
                    else:
                        logging.error("Article is empty. URL: " + fullUrl)
                        print(article.info())
                    # Reported for rejected articles as well.
                    if len(article.body) == 1:
                        logging.warning("Article has one paragraph. URL: " +
                                        fullUrl)
                    if bAddToList:
                        articleList.append(article)
                        downloadedUrls.add(line)
                except Exception:
                    # Show what failed, then abort: errors are fatal here.
                    print("Unexpected error: ", sys.exc_info()[0])
                    traceback.print_exc()
                    raise
        # order articles by time
        return sorted(articleList, key=lambda x: x.timeStr)
Ejemplo n.º 4
0
    def getNewsForDate(self, date):
        """Load the articles listed in the text sitemap of ``date``.

        The sitemap URL is
        ``self.baseUrl + '/sitemap/text/<year>/<month>/<day>/index.html'``.
        The shell command ``self.getLinksCmd`` (``{0}`` replaced by that URL)
        is run; every non-empty printed line that does not start with
        ``/sitemap/text`` is loaded through ``self.loadArticle``.

        ``loadArticle`` may return:
          * the string ``'reload'`` - the load is retried, giving up on the
            link after five retries;
          * any other string - the link is skipped silently;
          * an ``Article`` - kept when ``TextStats.isStoreText()`` returns a
            true first element, otherwise the returned message is logged;
          * ``None`` or any other type - the process exits.

        Exceptions raised while handling one link are printed with their
        traceback and the loop continues with the next link.

        Args:
            date: value with ``year``, ``month`` and ``day`` attributes.

        Returns:
            list: the kept articles sorted by their ``timeStr`` attribute.

        Raises:
            SystemExit: when an article cannot be loaded or has an
                unexpected type.
        """
        url = self.baseUrl + '/sitemap/text/%d/%d/%d/index.html' % (
            date.year, date.month, date.day)
        print('get news for %d.%d.%d, url: %s' %
              (date.day, date.month, date.year, url))
        maxRetries = 5
        articleList = []
        # getLinksCmd is a template; {0} is replaced with the sitemap URL.
        cmd = self.getLinksCmd.format(url)
        # The context manager closes the pipe and waits for the child, also
        # when sys.exit leaves the loop early.
        with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) as p:
            for ln in p.stdout:
                line = ln.decode('utf-8').strip()
                if not line or line.startswith('/sitemap/text'):
                    continue
                articleUrl = self.baseUrl + line
                try:
                    retryCount = 0
                    while True:
                        article = self.loadArticle(articleUrl)
                        if article is None:
                            sys.exit("Article can not be loaded from URL: " +
                                     articleUrl)
                        if isinstance(article, str):
                            if article != 'reload':
                                # Any other string marker: skip this link.
                                break
                            if retryCount >= maxRetries:
                                logging.error(
                                    '[%s] Timeout, try to reload article. RetryCount = %d'
                                    % (str(date), retryCount))
                                break
                            retryCount += 1
                            logging.warning(
                                '[%s] Timeout, try to reload article. RetryCount = %d'
                                % (str(date), retryCount))
                            continue
                        if not isinstance(article, Article):
                            sys.exit("Unknown article type : " + str(article))
                        text = " ".join(article.body).strip()
                        (ret, retMsg) = stats.TextStats(text).isStoreText()
                        if ret:
                            articleList.append(article)
                        else:
                            logging.warning(retMsg + " date %s, URL: %s" %
                                            (str(date), articleUrl))
                        break
                except Exception:
                    # Best effort: report the failure and go on with the
                    # next link.  SystemExit/KeyboardInterrupt propagate.
                    print("Unexpected error: ", sys.exc_info()[0])
                    traceback.print_exc()
        # order articles by time
        return sorted(articleList, key=lambda x: x.timeStr)
 def getNewsForDate(self, date):
     """Load the articles linked from the archive page of ``date``.

     The archive URL is ``self.baseUrl +
     '/archivedate.php?AYear=<year>&AMonth=<month>&ADay=<day>'``.  The shell
     command ``self.getLinksCmd`` (``{0}`` replaced by that URL) is run and
     each printed line is treated as an absolute link.  Non-empty links that
     start with ``self.baseUrl`` and were not already accepted are loaded
     through ``self.loadArticle``.

     An article is kept when ``stats.TextStats`` reports ``isUkr()`` but
     neither ``isRus()`` nor ``isEng()``, or when no language is reported,
     ``hasRusLetter()`` is false, the text has at least 450 characters and
     ``hasUkrLetter()`` is true.  Other articles are logged and skipped.
     Exceptions raised while handling one link are printed with their
     traceback and the loop continues with the next link.

     Args:
         date: value providing ``strftime`` and ``year``/``month``/``day``.

     Returns:
         list: the kept articles sorted by their ``timeStr`` attribute.

     Raises:
         SystemExit: when ``loadArticle`` returns ``None``.
     """
     print('get news for ' + date.strftime('%d.%m.%Y'))
     url = (self.baseUrl + '/archivedate.php?AYear=' + str(date.year) +
            '&AMonth=' + str(date.month) + '&ADay=' + str(date.day))
     print('url: ' + url)

     # Links listed in the original code as articles with empty text.
     # Built once rather than on every iteration.
     knownEmptyUrls = frozenset((
         'http://www.telekritika.ua/knigi-tk/2009-06-17/46263',
         'http://www.telekritika.ua/medialiteracy/2010-10-01/56304',
         'http://www.telekritika.ua/medialiteracy/2010-10-07/56435',
         'http://www.telekritika.ua/notices/2010-10-08/56475',
         'http://www.telekritika.ua/medialiteracy/2010-10-12/56540',
         'http://www.telekritika.ua/tel/2010-10-22/56827',
         'http://www.telekritika.ua/news/2010-11-05/57249',
         'http://www.telekritika.ua/news/2010-11-08/57319',
         'http://www.telekritika.ua/tel/2010-11-22/57742',
         'http://www.telekritika.ua/profesiya/2010-11-29/57931',
     ))

     articleList = []
     downloadedUrls = set()
     # getLinksCmd is a template; {0} is replaced with the archive URL.
     cmd = self.getLinksCmd.format(url)
     # The context manager closes the pipe and waits for the child, also
     # when sys.exit leaves the loop early.
     with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) as p:
         for ln in p.stdout:
             line = ln.decode('utf-8').strip()
             if (not line or not line.startswith(self.baseUrl)
                     or line in downloadedUrls):
                 print('ignore url: ' + line)
                 continue
             print('load article: ' + line)
             try:
                 article = self.loadArticle(line)
                 if article is None:
                     logging.error("Article can not be loaded from URL: " +
                                   line)
                     sys.exit("Article can not be loaded from URL: " + line)
                 text = " ".join(article.body).strip()
                 bAddToList = False
                 if text:
                     textStats = stats.TextStats(text)
                     isUkr = textStats.isUkr()
                     isRus = textStats.isRus()
                     if isUkr and isRus:
                         logging.warning(
                             "IGNORE: Article is Ukr and Rus. URL: " + line)
                         logging.info("   stats: " +
                                      str(textStats.common_text_20))
                     elif isRus:
                         logging.warning("IGNORE: Article is Rus. URL: " +
                                         line)
                     elif textStats.isEng():
                         logging.warning("IGNORE: Article is Eng. URL: " +
                                         line)
                     elif isUkr:
                         bAddToList = True
                     # From here on no language was detected at all.
                     elif textStats.hasRusLetter():
                         logging.warning(
                             "IGNORE: Article (language not detected) has Rus letters. URL: "
                             + line)
                     elif len(text) < 450:  # too short to matter: ignore
                         logging.warning(
                             "IGNORE: Article language not detected. URL: " +
                             line)
                         logging.info("   text length: " + str(len(text)))
                         logging.info("   stats: " +
                                      str(textStats.common_text_20))
                     elif textStats.hasUkrLetter():
                         bAddToList = True
                     else:
                         logging.error(
                             "Article language not detected. URL: " + line)
                         logging.info("   text length: " + str(len(text)))
                         logging.info("   stats: " +
                                      str(textStats.common_text_20))
                 elif line in knownEmptyUrls:
                     logging.error("IGNORE: Article is empty. URL: " + line)
                 elif len(article.timeStr) > 0 and len(article.title) > 0:
                     logging.error(
                         "IGNORE: Empty article with title and time. URL: " +
                         line)
                 else:
                     logging.error("Article is empty. URL: " + line)
                     # The value returned by info() was discarded here; the
                     # sibling implementations in this file print it.
                     print(article.info())
                 if bAddToList:
                     if len(article.body) == 1:
                         logging.warning("Article (length = " +
                                         str(len(text)) +
                                         ") has one paragraph. URL: " + line)
                     articleList.append(article)
                     downloadedUrls.add(line)
             except Exception:
                 # Best effort: report the failure and go on with the next
                 # link.  SystemExit/KeyboardInterrupt propagate.
                 print("Unexpected error: ", sys.exc_info()[0])
                 traceback.print_exc()
     # order articles by time
     return sorted(articleList, key=lambda x: x.timeStr)