Example 1
def parse_unparsed(session, crawlers):
    session.flush()
    unparsed_rawurl_count = session.query(Rawurl).filter_by(state=Rawurl.STATE_UNPARSED).count()

    if unparsed_rawurl_count == 0:
        print('nothing to parse')
        return
    else:
        print('Trying to parse %d unparsed urls...' % unparsed_rawurl_count)
    
    parse_success = 0
    parse_failed = 0
    
    before = time.time()
    
    rawurl = session.query(Rawurl).filter_by(state=Rawurl.STATE_UNPARSED).first()
    rawurl_index = 0
    while rawurl is not None:
        foundparser = False
        for _, parser in crawlers.items():
            try:
                parser.parse_rawurl(rawurl)
                parse_success += 1
                foundparser = True
                session.commit()
                break
            except InvalidArticleException:
                continue
            
        if not foundparser:
            rawurl.state = Rawurl.STATE_INVALID
            parse_failed += 1
            
        rawurl_index += 1
        
        if rawurl_index % 10 == 0:
            percent, etl = statistics_get_percent_estimated_time(before, rawurl_index, unparsed_rawurl_count)
            print('\r %0.2f%% | %d urls parsed, %d urls failed | ETL: %s' % (percent, parse_success, parse_failed, etl), end='')
        rawurl = session.query(Rawurl).filter_by(state=Rawurl.STATE_UNPARSED).first()
    session.commit()
    
    print('')
    print('from %d urls, %d failed to parse (%d%%)' % 
          (unparsed_rawurl_count, parse_failed, round(100 * parse_failed / unparsed_rawurl_count)))
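
Example 1 and the examples below report progress through a statistics_get_percent_estimated_time helper that is not included in the snippets. A minimal sketch of what such a helper might look like, assuming it takes the start timestamp, the number of items processed so far and the total, and returns a percentage together with a human-readable estimate of the time left (the name and signature come from the call sites; the body is an assumption):

import time

def statistics_get_percent_estimated_time(time_before, done, total):
    # hypothetical helper: the project's real implementation may differ
    percent = 100.0 * done / total if total else 100.0
    elapsed = time.time() - time_before
    remaining = elapsed * (total - done) / done if done else 0
    etl = '%02d:%02d:%02d' % (remaining // 3600, (remaining % 3600) // 60, remaining % 60)
    return percent, etl
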
Example 2
def process_articles(session):
    """Processes (analyzes) articles for each day
    """
    print("Fetching words...", end="")
    words = get_all_words_in_db(session)
    print("done.")
    print("Fetching dates...", end="")
    dates = get_articles_dates(session)
    print("done.")
    day_index = 0
    max_index = len(dates)
    time_before = time.time()

    statistics = {
        "day": 0,
        "maxday": len(dates),
        "timebefore": time.time(),
        "articles_processed": 0,
        "new_words": 0,
        "new_occurences": 0,
    }

    print("Processing articles...")
    for date in dates:
        process_daily_articles(session, date, words, statistics)
        statistics["day"] += 1

        percent, etl = statistics_get_percent_estimated_time(
            statistics["timebefore"], statistics["day"], statistics["maxday"]
        )
        print("\r%0.2f%% | ETL: %s | %s" % (percent, etl, str(date)), end="")
        session.commit()
    print("\nPerforming final commit...", end="")
    session.commit()
    print("done.\n")

    print("Articles processed: %d" % statistics["articles_processed"])
    print("New words added: %d" % statistics["new_words"])
    print("Occurences added: %d" % statistics["new_occurences"])
    print("Dates spanned: %d" % max_index)

    seconds = round(time.time() - time_before)
    print("Time taken: %s" % seconds_to_string(seconds))
    print("Articles per second: %0.2f" % (statistics["articles_processed"] / seconds))
Example 3
def process_daily_articles(session, article_date, all_words, statistics):
    daily_articles = session.query(Article).filter_by(state=0).filter_by(date=article_date)

    verbose = False
    # this query is often empty,
    # but it is also often the most expensive step because it triggers a large flush
    occurences = session.query(Occurence).filter_by(date=article_date)
    if verbose:
        print("\npocet nacitanych zavislosti: %d " % occurences.count())

    # will map word_text to occurence (for each source)
    all_occurences = defaultdict(dict)
    for occ in occurences:
        all_occurences[occ.source][occ.word.text] = occ

    # statistics
    cnt_of_new_words = 0
    cnt_of_new_occurences = 0
    cnt_of_existing_occurences = 0
    cnt_of_existing_words = 0
    articles_processed = 0

    words_dict = defaultdict(Counter)
    if verbose:
        print("Pocet clankov pre datum %s je %d" % (str(article_date), daily_articles.count()))
    start = time.time()

    for article in daily_articles:
        words_dict[article.source].update(re.split(r"\W+", article.content))
        article.state = 1  # we can't commit until this change is reflected in the occurrences
        articles_processed += 1

    count_sources = len(words_dict)
    index_source = 0
    for source in words_dict:
        iterator = words_dict[source].items()

        first = True
        index_word = 0
        for word_text, frequency in iterator:

            # adds word
            if word_text in all_words:
                cnt_of_existing_words += 1
            else:
                new_word = Word(word_text, strip_accents(word_text).lower())
                cnt_of_new_words += 1
                session.add(new_word)
                all_words[word_text] = new_word

            # adds occurence
            all_occurence_current = all_occurences[source]
            if word_text in all_occurence_current:
                all_occurence_current[word_text].count += frequency
                all_occurence_current[word_text].article_count += 1
                cnt_of_existing_occurences += 1
            else:
                new_occurence = Occurence(all_words[word_text], frequency, article_date, source, 1)
                # the following line can be omitted
                all_occurence_current[word_text] = new_occurence
                session.add(new_occurence)
                cnt_of_new_occurences += 1
            index_word += 1

            if index_word % 100 == 0:
                session.flush()
                precise_progress = (
                    statistics["day"] + (index_source + index_word / len(words_dict[source])) / count_sources
                )
                percent, etl = statistics_get_percent_estimated_time(
                    statistics["timebefore"], precise_progress, statistics["maxday"]
                )
                print("\r%0.2f%% | ETL: %s | %s" % (percent, etl, str(article_date)), end="")
        index_source += 1
    # session.commit()
    statistics["articles_processed"] += articles_processed
    statistics["new_words"] += cnt_of_new_words
    statistics["new_occurences"] += cnt_of_new_occurences
    stop = time.time()
    if verbose:
        print("pocet spracovanych clankov    %d" % articles_processed)
        print("pocet pridanych slov          %d" % cnt_of_new_words)
        print("pocet existujucich slov       %d" % cnt_of_existing_words)
        print("pocet novych zavislosti       %d" % cnt_of_new_occurences)
        print("pocet existujucich zavislosti %d" % cnt_of_existing_occurences)
        print("spracovanie jedneho clanku:   %0.2f " % round((stop - start) / articles_processed, 2))
        print("")
Example 4
 def crawl(self, start=1, maxindex=None, settings=None):
     """ Crawls the provided links for articles,
     then downloads and parses them.
     Prints statistics.
     """
     count_links_already_in_db = 0
     count_links_new_found = 0
     count_links_new_added = 0
     count_links_download_error = 0
     count_links_parse_error = 0
     count_lists_download_error = 0
     count_lists_checked = 0
     
     #repair settings
     if settings is None:
         settings = {}
     if 'stop_on_oldpage' not in settings:
         settings['stop_on_oldpage'] = False
     if 'stop_on_lastpage' not in settings:
         settings['stop_on_lastpage'] = False
     
     #pageurls = self.get_urls_to_search_for_links()
     pageurls = self.get_url_iterator(start=start, maxindex=maxindex)
     
     page_index = 0
     time_before = time.time()
     last_page_warning = False
     
     for pageurl in pageurls:
         count_lists_checked += 1
         try:
             #print('\nopening %s \n' % pageurl)
             pagehtml = my_urlopen(pageurl)
             last_page_warning = False
         except (PageNotFoundException, FailedAttemptsException, ConnectionResetError):
             print('\ncould not open article list %s' % pageurl, file=sys.stderr)
             count_lists_download_error += 1
             if last_page_warning:
                 pageurls.stop()
                 print('\nStopping because of two 404s in a row\n')
                 last_page_warning = False
                 continue
             
             last_page_warning = True
             continue
         soup = BeautifulSoup(pagehtml)
         
         # parse for links
         links = self.get_links_from_soup(soup, pageurl)
         
         if settings['stop_on_lastpage'] and self.stopping_criterion(soup):
             pageurls.stop()
             print('\nStopping because individual criterion fired (no nextpage link)\n')
         
         link_index = 0  # number of links processed so far on this page
         link_count = len(links)
         
         localnewlinks = 0
         for link in links: # approx. 10 links
             if not self.link_exists_in_db(link):
                 count_links_new_found += 1
                 localnewlinks += 1
                 
                 #download article
                 try:
                     htmlcontent = self.download_link(link)
                 except Exception:  # includes PageNotFoundException, FailedAttemptsException and ConnectionResetError
                     print('\nnot able to download article %s\n' % link, file=sys.stderr)
                     count_links_download_error += 1
                     continue
                 
                 #add link
                 rawurl = self.store_link(link, htmlcontent)
                 
                 #parse page
                 try:
                     self.parse_rawurl(rawurl)
                 except InvalidArticleException:
                     rawurl.state = Rawurl.STATE_INVALID
                     print('\ninvalid article %s' % rawurl.url, file=sys.stderr)
                     count_links_parse_error += 1
                 
                 count_links_new_added += 1
             else:
                 count_links_already_in_db += 1
             link_index += 1
             
             percent, etl = statistics_get_percent_estimated_time(time_before, page_index + link_index/link_count, len(pageurls))
             #print('\r%0.2f%% | ETL: %s' % (percent, etl), end='')
             print('\r%0.2f%% | ETL: %s | new: %d | old: %d | pix: %d' % (percent, etl, count_links_new_added, count_links_already_in_db, page_index), end='')
         
         if settings['stop_on_oldpage'] and localnewlinks == 0:
             pageurls.stop()
             print('\nStopping because no new links were found on the last page\n')
             #print('stopping topic, no new articles')
             
         self.session.commit()
         page_index += 1
         if page_index % 100 == 0:
             print('\nwhoa! %s | %s!' % (strftime("%H:%M:%S"), pageurl))
         
         percent, etl = statistics_get_percent_estimated_time(time_before, page_index, len(pageurls))
         print('\r%0.2f%% | ETL: %s | new: %d | old: %d | pix: %d' % (percent, etl, count_links_new_added, count_links_already_in_db, page_index), end='')
         
     print('')
     print('Checked %d article lists (originally: %d, %d failed to download)' % (count_lists_checked, len(pageurls), count_lists_download_error))
     print('Found %d links' % (count_links_already_in_db + count_links_new_found))
     print('Article lists failed to download: %d' % count_lists_download_error)
     print('-Already in db: %d' % count_links_already_in_db)
     print('-New links found: %d' % count_links_new_found)
     print('--Failed to download: %d' % count_links_download_error)
     print('--Downloaded and stored, but unable to parse as article: %d' % count_links_parse_error)
     print('--Downloaded, stored & parsed successfully: %d' % count_links_new_added)
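
Example 4 iterates over the pageurls object returned by self.get_url_iterator(start=start, maxindex=maxindex), calls pageurls.stop() to end the crawl early, and passes the object to len(). The iterator class itself is not shown; a sketch of how such a stoppable iterator could be built (the class name, the url_template attribute and the example URL are hypothetical; only the stop()/len()/iteration behaviour is taken from the call sites):

class StoppableUrlIterator:
    """Hypothetical iterable of listing-page URLs that supports early termination."""

    def __init__(self, url_template, start=1, maxindex=None):
        self.url_template = url_template  # e.g. 'https://example.com/news?page=%d' (hypothetical)
        self.start = start
        # hypothetical fallback: crawl a single page when maxindex is not given
        self.maxindex = maxindex if maxindex is not None else start
        self.stopped = False

    def __len__(self):
        # used by crawl() for the progress estimate
        return self.maxindex - self.start + 1

    def __iter__(self):
        for index in range(self.start, self.maxindex + 1):
            if self.stopped:
                break
            yield self.url_template % index

    def stop(self):
        # called by crawl() when a stopping criterion fires
        self.stopped = True

Under these assumptions, a crawler's get_url_iterator() could simply return StoppableUrlIterator(listing_url_template, start, maxindex), where listing_url_template is whatever page pattern that crawler uses.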