Example #1
 def test_create_exist_article_warc(self):
     """
     crawling a exist article warc will replace new
     """
     wc.create_article_warc(ARTICLE_HTML)
     os.chdir("..")
     self.assertTrue(os.path.isfile(WARC_ARTICLE_DIRECTORY +
                                    "/http:__www.cbc.ca_news_world_nor-"
                                    + "easter-storm-blasts-eastern-u-s-with"
                                    + "-heavy-rain-and-snow-1.2851711"
                                    + ".warc.gz"))
Example #2
 def test_create_wrong_url_article_warc(self):
     """
     crawling a wrong url article should be going to success because there
     is 404 page
     """
     try:
         os.chdir("..")
         os.remove(WARC_ARTICLE_DIRECTORY +
                   "/http:__www.cbc.ca_news_world_nor-easter-storm-blasts-"
                   + "eastern-u-s-with.warc.gz")
     except OSError:
         pass
     self.setUp()
     wc.create_article_warc(WRONG_ARTICLE_HTML)
     os.chdir("..")
     time.sleep(1)
     self.assertTrue(os.path.isfile(WARC_ARTICLE_DIRECTORY +
                                    "/http:__www.cbc.ca_news_world_nor-" +
                                    "easter-storm-blasts-eastern-u-s-with" +
                                    ".warc.gz"))
Example #3
 def test_create_article_warc(self):
     """
     create a real article url warc should work
     """
     try:
         os.chdir("..")
         os.remove(WARC_ARTICLE_DIRECTORY +
                   "/http:__www.cbc.ca_news_world_nor-easter-storm-blasts" +
                   "-eastern-u-s-with-heavy-rain-and-snow-1.2851711.warc.gz"
                   )
     except OSError:
         pass
     self.setUp()
     wc.create_article_warc(ARTICLE_HTML)
     os.chdir("..")
     time.sleep(1)
     self.assertTrue(os.path.isfile(WARC_ARTICLE_DIRECTORY +
                                    "/http:__www.cbc.ca_news_world_nor-" +
                                    "easter-storm-blasts-eastern-u-s-with" +
                                    "-heavy-rain-and-snow-1.2851711.warc.gz"
                                    ))
Example #4
        for line in article_file:
            if (len(line.split(' ')) == 2
                    and not line.split(' ') in article_queue):
                article_queue.append(line.split(' '))
        article_file.seek(0)
        article_file.truncate()
        article_file.close()

        if (len(article_queue) > 0):
            # get first element in the queue
            line = article_queue.pop(0)
            url = line[0]
            warc_file_name = line[1].strip()
            print 'processing: ' + url + ' : ' + warc_file_name
            article_processes.append(
                warc_creator.create_article_warc(url, warc_file_name))

            # set time out for pdf generator
            p = warc_creator.create_article_pdf(url, warc_file_name)
            # wait for 200 seconds, if timeout, kill the process
            num_polls = 0
            while p.poll() is None:
                # Waiting for the process to finish.
                time.sleep(0.1)  # Avoid being a CPU busy loop.
                num_polls += 1
                if num_polls > 2000:  # after 200 secs it is considered a failure:
                    # the process is terminated and the URL is recorded in the failure list
                    p.terminate()
                    fail_name = "article_warc.stream.failure"
                    fail = open(fail_name, "a")
                    fail.write(url + "\n")
Example #5
def parse_articles(referring_sites, db_keywords, source_sites, twitter_accounts_explorer):
    """ (list of [str, newspaper.source.Source, str],
         list of str, list of str, str) -> None
    Downloads each db_article in the site, extracts, compares
    with Foreign Sites and Keywords provided.
    Then the db_article which had a match will be stored into the Django database

    Keyword arguments:
    referring_sites     -- List of [name, 'built_article'] of each site
    db_keywords         -- List of keywords
    source_sites       -- List of foreign sites
    """
    added, updated, failed, no_match = 0, 0, 0, 0

    # for each db_article in each sites, download and parse important data
    for site in referring_sites:
        # print "\n%s" % site[0]

        article_count = 0
        newspaper_articles = []
        crawlersource_articles = []
        logging.info("Site: %s Type:%i"%(site['name'], site['type']))
        #0 = newspaper, 1 = crawler, 2 = both
        if(site["type"] == 0 or site["type"] == 2):
            logging.disable(logging.ERROR)
            newspaper_source = newspaper.build(site["url"],
                                             memoize_articles=False,
                                             keep_article_html=True,
                                             fetch_images=False,
                                             language='en',
                                             number_threads=1)
            logging.disable(logging.NOTSET)
            newspaper_articles = newspaper_source.articles
            article_count += newspaper_source.size()
            logging.info("populated {0} articles using newspaper".format(article_count))
        if(site["type"] == 1 or site["type"] == 2):
            crawlersource_articles = Crawler.CrawlerSource(site["url"])
            article_count += crawlersource_articles.probabilistic_n
            logging.debug("expecting {0} from plan b crawler".format(crawlersource_articles.probabilistic_n))
        article_iterator = itertools.chain(iter(newspaper_articles), crawlersource_articles)
        processed = 0
        for article in article_iterator:
            logging.info("Looking: %s"%article.url)
            # Check for any new command on communication stream
            check_command()

            url = article.url
            if url.startswith('http://www.'):
                url = url[:7] + url[11:]
            elif url.startswith('https://www.'):
                url = url[:8] + url[12:]

            # Try to download and extract the useful data
            try:
                if(not article.is_downloaded):
                    article.download()
                if(not article.is_parsed):
                    article.parse()
                title = article.title
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                logging.warning("Could not parse article")
                title = ""
            # If downloading/parsing the page fails,
            # stop here and move on to next db_article
            if not ((title == "") or (title == "Page not found")):
                logging.debug(u"found title: {0}".format(title))
                
                # Regex the keyword from the article's text
                keywords = get_keywords(article, db_keywords)
                logging.debug("keywords: {0}".format(str(keywords)))
                # Regex the links within article's html
                sources = get_sources_sites(article.article_html, source_sites)
                logging.debug("sources: {0}".format(str(sources)))
                twitter_accounts = get_sources_twitter(article.article_html, twitter_accounts_explorer)
                logging.debug("twitter_accounts: {0}".format(str(twitter_accounts[0])))
                # Store parsed author
                authors = article.authors
                # Try to parse the published date
                pub_date = get_pub_date(article)

                # If neither of keyword nor sources matched,
                # then stop here and move on to next article

                if not (keywords == [] and sources[0] == [] and twitter_accounts[0] ==[]):
                    logging.info("Found Match")
                    try:
                        logging.info("Requesting canonical url")
                        url = requests.get(url).url
                    except requests.RequestException:
                        logging.warning("Raised requests.RequestException")
                    except:
                        logging.warning("Could not resolve canonical url")
                    logging.debug("canonical url found")
                    # Check if the entry already exists
                    db_article_list = Article.objects.filter(url=url)
                    if not db_article_list:
                        logging.info("Adding new Article to the DB")
                        # If the db_article is new to the database,
                        # add it to the database
                        db_article = Article(title=title, url=url,
                                          domain=site["url"],
                                          date_added=timezone.localtime(
                                              timezone.now()),
                                          date_published=pub_date)
                        db_article.save()

                        db_article = Article.objects.get(url=url)

                        for key in keywords:
                            db_article.keyword_set.create(name=key)

                        for author in authors:
                            db_article.author_set.create(name=author)
                        for account in twitter_accounts[0]:

                            db_article.sourcetwitter_set.create(name = account, matched = True)

                        for account in twitter_accounts[1]:
                            db_article.sourcetwitter_set.create(name = account, matched = False)

                        for source in sources[0]:
                            db_article.sourcesite_set.create(url=source[0],
                                                      domain=source[1], matched=True, local=(source[1] in site["url"]))

                        for source in sources[1]:
                            db_article.sourcesite_set.create(url=source[0],
                                                      domain=source[1], matched=False, local=(source[1] in site["url"]))
                        added += 1

                    else:
                        logging.info("Modifying existing Article in the DB")
                        # If the db_article already exists,
                        # update all fields except date_added
                        db_article = db_article_list[0]
                        db_article.title = title
                        db_article.url = url
                        db_article.domain = site["url"]
                        # Do not update the added date
                        # db_article.date_added = today
                        db_article.date_published = pub_date
                        db_article.save()

                        for key in keywords:
                            if not db_article.keyword_set.filter(name=key):
                                db_article.keyword_set.create(name=key)

                        for author in authors:
                            if not db_article.author_set.filter(name=author):
                                db_article.author_set.create(name=author)

                        for account in twitter_accounts[0]:
                            if not db_article.sourcetwitter_set.filter(name=account):
                                db_article.sourcetwitter_set.create(name = account, matched = True)

                        for account in twitter_accounts[1]:
                            if not db_article.sourcetwitter_set.filter(name=account):
                                db_article.sourcetwitter_set.create(name = account, matched = False)

                        for source in sources[0]:
                            if not db_article.sourcesite_set.filter(url=source[0]):
                                db_article.sourcesite_set.create(url=source[0],
                                                      domain=source[1], matched=True, local=(source[1] in site["url"]))

                        for source in sources[1]:
                            if not db_article.sourcesite_set.filter(url=source[0]):
                                db_article.sourcesite_set.create(url=source[0],
                                                      domain=source[1], matched=False, local=(source[1] in site["url"]))

                    warc_creator.create_article_warc(url)
                else:
                    logging.info("No matches")
            else:
                logging.info("No title found")

            processed += 1
            print(
                "%s (Article|%s) %i/%i          \r" %
                (str(timezone.localtime(timezone.now()))[:-13],
                 site["name"], processed, article_count))

            # Null the db_article data to free the memory
            #newspaper_source.articles[db_article] = None

            # Null the article object's content to free the memory
            article.article_html = None
            article.text = None
            article.title = None
            article.source_url = None
            article.url = None
            article.top_img = None
            article.meta_img = None
            article.imgs = None
            article.movies = None
            article.keywords = None
            article.meta_keywords = None
            article.tags = None
            article.authors = None
            article.publish_date = None
            article.summary = None
            article.html = None
            article.is_parsed = None
            article.is_downloaded = None
            article.meta_description = None
            article.meta_lang = None
            article.meta_favicon = None
            article.meta_data = None
            article.canonical_link = None
            article.top_node = None
            article.clean_top_node = None
            article.doc = None
            article.clean_doc = None
            article.additional_data = None

            logging.info("(%s | %i/%i) Finished looking: %s"%(article.url, processed, article_count, article.url))
        logging.info("Finished Site: %s"%site['name'])
        print(
            "%s (Article|%s) %i/%i          " %
            (str(timezone.localtime(timezone.now()))[:-13], site["name"],
             processed, article_count))
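The slicing near the top of the article loop (url[:7] + url[11:] and url[:8] + url[12:]) only drops a leading "www." after the scheme so URLs are stored in a canonical form. A standalone sketch of that normalization, with a hypothetical helper name:

    def strip_www(url):
        # "http://" is 7 characters and "https://" is 8; skipping 4 more
        # characters removes the "www." that follows the scheme.
        if url.startswith('http://www.'):
            return url[:7] + url[11:]
        if url.startswith('https://www.'):
            return url[:8] + url[12:]
        return url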
Example #6
        # read the file and get all the urls
        article_file = open(article_file_name, "r+")
        for line in article_file:
            if (len(line.split(' ')) == 2 and not line.split(' ') in article_queue):
                article_queue.append(line.split(' '))
        article_file.seek(0)
        article_file.truncate()
        article_file.close()

        if (len(article_queue) > 0):
            # get first element in the queue
            line = article_queue.pop(0)
            url = line[0]
            warc_file_name = line[1].strip()
            print 'processing: ' + url + ' : ' + warc_file_name
            article_processes.append(warc_creator.create_article_warc(url, warc_file_name))

            # set time out for pdf generator
            p = warc_creator.create_article_pdf(url, warc_file_name)
            # wait for 200 seconds, if timeout, kill the process
            num_polls = 0
            while p.poll() is None:
                # Waiting for the process to finish.
                time.sleep(0.1)  # Avoid being a CPU busy loop.
                num_polls += 1
                if num_polls > 2000:  # after 200 secs it is considered a failure:
                    # the process is terminated and the URL is recorded in the failure list
                    p.terminate()
                    fail_name = "article_warc.stream.failure"
                    fail = open(fail_name, "a")
                    fail.write(url + "\n")
Example #7
        # if no tasks remain, retry those urls that failed before
        if os.stat(article_file_name).st_size == 0:
            article_file_name = "article_warc.stream.failure"
        # read the file and get all the urls
        article_file = open(article_file_name, "r+")
        for url in article_file:
            if (url.strip() != "" and (not url.strip() in article_queue)):
                article_queue.append(url.strip())
        article_file.seek(0)
        article_file.truncate()
        article_file.close()

        if (len(article_queue) > 0):
            # get first element in the queue
            url = article_queue.pop(0)
            article_processes.append(warc_creator.create_article_warc(url))

            # set time out for pdf generator
            p = warc_creator.create_article_pdf(url)
            # wait for 60 seconds, if timeout, kill the process
            num_polls = 0
            while p.poll() is None:
                # Waiting for the process to finish.
                time.sleep(0.1)  # Avoid being a CPU busy loop.
                num_polls += 1
                if num_polls > 600:  # after 60 secs it is considered a failure:
                    # the process is terminated and the URL is recorded in the failure list
                    p.terminate()
                    fail_name = "article_warc.stream.failure"
                    fail = open(fail_name, "a")
                    fail.write(url + "\n")
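Examples #4, #6 and #7 share the same poll-and-terminate pattern: sleep in 0.1-second steps, count the polls, and kill the PDF process once the time budget is exhausted, appending the URL to a failure file so a later pass can retry it. A minimal sketch of that pattern factored into helpers (the function names are hypothetical, not part of warc_creator):

    import time

    def wait_or_terminate(process, timeout_secs, poll_interval=0.1):
        # Poll until the subprocess exits on its own or the timeout elapses.
        # Returns True if it finished in time, False if it had to be terminated.
        elapsed = 0.0
        while process.poll() is None:
            time.sleep(poll_interval)  # avoid a CPU busy loop
            elapsed += poll_interval
            if elapsed >= timeout_secs:
                process.terminate()
                return False
        return True

    def record_failure(url, fail_name="article_warc.stream.failure"):
        # Append the failed URL so the retry pass in Example #7 picks it up
        # once the main stream file is empty.
        with open(fail_name, "a") as fail:
            fail.write(url + "\n")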
Example #8
def parse_articles(referring_sites, db_keywords, source_sites, twitter_accounts_explorer):
    """ (list of [str, newspaper.source.Source, str],
         list of str, list of str, str) -> None
    Downloads each db_article in the site, extracts, compares
    with Foreign Sites and Keywords provided.
    Then the db_article which had a match will be stored into the Django database

    Keyword arguments:
    referring_sites     -- List of [name, 'built_article'] of each site
    db_keywords         -- List of keywords
    source_sites       -- List of foreign sites
    """
    added, updated, failed, no_match = 0, 0, 0, 0

    # for each db_article in each sites, download and parse important data
    for site in referring_sites:
        # print "\n%s" % site[0]

        article_count = 0
        newspaper_articles = []
        crawlersource_articles = []
        logging.info("Site: %s Type:%i"%(site['name'], site['type']))
        #0 = newspaper, 1 = crawler, 2 = both

        if(site["type"] == 0 or site["type"] == 2):
            logging.disable(logging.ERROR)
            newspaper_source = newspaper.build(site["url"],
                                             memoize_articles=False,
                                             keep_article_html=True,
                                             fetch_images=False,
                                             language='en',
                                             number_threads=1)
            logging.disable(logging.NOTSET)
            newspaper_articles = newspaper_source.articles
            article_count += newspaper_source.size()
            logging.info("populated {0} articles using newspaper".format(article_count))
        if(site["type"] == 1 or site["type"] == 2):
            crawlersource_articles = Crawler.Crawler(site["url"], site["filter"])
            article_count += crawlersource_articles.probabilistic_n
            logging.debug("expecting {0} from plan b crawler".format(crawlersource_articles.probabilistic_n))
        article_iterator = itertools.chain(iter(newspaper_articles), crawlersource_articles)
        processed = 0
        for article in article_iterator:
            #have to put all the iteration stuff at the top because I used continue extensively in this loop
            processed += 1
            # Check for any new command on communication stream
            check_command()        

            if url_in_filter(article.url, site["filter"]):
                logging.info("Matches with filter, skipping the {0}".format(article.url))
                continue

            print(
                "%s (Article|%s) %i/%i          \r" %
                (str(timezone.localtime(timezone.now()))[:-13],
                 site["name"], processed, article_count))
            logging.info("Processing %s"%article.url)

            url = article.url
            if url.startswith('http://www.'):
                url = url[:7] + url[11:]
            elif url.startswith('https://www.'):
                url = url[:8] + url[12:]

            article = ExplorerArticle(article.url)
            # Try to download and extract the useful data
            if(not article.is_downloaded):
                if(not article.download()):
                    logging.warning("article skipped because download failed")
                    continue

            article.preliminary_parse()

            if not article.title:
                logging.info("article missing title, skipping")
                continue

            if not article.text:
                logging.info("article missing text, skipping")
                continue
                
            # Regex the keyword from the article's text
            keywords = get_keywords(article, db_keywords)
            logging.debug(u"matched keywords: {0}".format(repr(keywords)))
            # Regex the links within article's html
            sources = get_sources_sites(article, source_sites)
            logging.debug(u"matched sources: {0}".format(repr(sources)))
            twitter_accounts = get_sources_twitter(article, twitter_accounts_explorer)
            logging.debug(u"matched twitter_accounts: {0}".format(repr(twitter_accounts[0])))

            if (not keywords) and (not sources[0]) and (not twitter_accounts[0]):  # empty lists are falsy
                logging.debug("skipping article because nothing matched")
                continue
            logging.info("match found")

            article.newspaper_parse()

            authors = article.authors
            pub_date = get_pub_date(article)
            # Check if the entry already exists
            db_article_list = Article.objects.filter(url=url)
            if not db_article_list:
                logging.info("Adding new Article to the DB")
                # If the db_article is new to the database,
                # add it to the database
                db_article = Article(title=article.title, url=url,
                                  domain=site["url"],
                                  date_added=timezone.localtime(
                                      timezone.now()),
                                  date_published=pub_date)
                db_article.save()

                db_article = Article.objects.get(url=url)

                for key in keywords:
                    db_article.keyword_set.create(name=key)

                for author in authors:
                    db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:

                    db_article.sourcetwitter_set.create(name = account, matched = True)

                for account in twitter_accounts[1]:
                    db_article.sourcetwitter_set.create(name = account, matched = False)

                for source in sources[0]:
                    db_article.sourcesite_set.create(url=source[0],
                                              domain=source[1], matched=True, local=(source[1] in site["url"]))

                for source in sources[1]:
                    db_article.sourcesite_set.create(url=source[0],
                                              domain=source[1], matched=False, local=(source[1] in site["url"]))
                added += 1

            else:
                logging.info("Modifying existing Article in the DB")
                # If the db_article already exists,
                # update all fields except date_added
                db_article = db_article_list[0]
                db_article.title = article.title
                db_article.url = url
                db_article.domain = site["url"]
                # Do not update the added date
                # db_article.date_added = today
                db_article.date_published = pub_date
                db_article.save()

                for key in keywords:
                    if not db_article.keyword_set.filter(name=key):
                        db_article.keyword_set.create(name=key)

                for author in authors:
                    if not db_article.author_set.filter(name=author):
                        db_article.author_set.create(name=author)

                for account in twitter_accounts[0]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name = account, matched = True)

                for account in twitter_accounts[1]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name = account, matched = False)

                for source in sources[0]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0],
                                              domain=source[1], matched=True, local=(source[1] in site["url"]))

                for source in sources[1]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0],
                                              domain=source[1], matched=False, local=(source[1] in site["url"]))

            warc_creator.create_article_warc(url)
        logging.info("Finished Site: %s"%site['name'])
        print(
            "%s (Article|%s) %i/%i          " %
            (str(timezone.localtime(timezone.now()))[:-13], site["name"],
             processed, article_count))
def parse_articles(populated_sites, db_keywords, foreign_sites):
    """ (list of [str, newspaper.source.Source, str],
         list of str, list of str, str) -> None
    Downloads each article in the site, extracts, compares
    with Foreign Sites and Keywords provided.
    Then the article which had a match will be stored into the Django database

    Keyword arguments:
    populated_sites     -- List of [name, 'built_article'] of each site
    db_keywords         -- List of keywords
    foreign_sites       -- List of foreign sites
    """
    added, updated, failed, no_match = 0, 0, 0, 0

    # for each article in each sites, download and parse important data
    for site in populated_sites:
        # print "\n%s" % site[0]
        article_count = site[1].size()
        processed = 0
        for k in range(len(site[1].articles)):
            art = site[1].articles[k]
            # Stop any print statements, even newspaper's warning messages
            sys.stdout = open(os.devnull, "w")
            sys.stderr = open(os.devnull, "w")

            # Check for any new command on communication stream
            check_command()

            url = art.url
            if "http://www." in url:
                url = url[:7] + url[11:]
            elif "https://www." in url:
                url = url[:8] + url[12:]

            # Try to download and extract the useful data
            try:
                art.download()
                art.parse()
                title = art.title
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception:
                title = ""
            # If downloading/parsing the page fails,
            # stop here and move on to next article
            if not ((title == "") or (title == "Page not found")):
                # Regex the keyword from the article's text
                keywords = get_keywords(art, db_keywords)
                # Regex the links within article's html
                sources = get_sources(art.article_html, foreign_sites)
                # Store parsed author
                authors = art.authors
                # Try to parse the published date
                pub_date = get_pub_date(art)

                # If neither of keyword nor sources matched,
                # then stop here and move on to next article
                if not (keywords == [] and sources == []):

                    # Check if the entry already exists
                    article_list = Article.objects.filter(url=url)
                    if not article_list:
                        # If the article is new to the database,
                        # add it to the database
                        article = Article(
                            title=title,
                            url=url,
                            url_origin=site[2],
                            date_added=timezone.localtime(timezone.now()),
                            date_published=pub_date,
                        )
                        article.save()

                        article = Article.objects.get(url=url)

                        for key in keywords:
                            article.keyword_set.create(keyword=key)

                        for author in authors:
                            article.author_set.create(author=author)

                        for source in sources:
                            article.source_set.create(url=source[0], url_origin=source[1])

                        added += 1

                    else:
                        # If the article already exists,
                        # update all fields except date_added
                        article = article_list[0]
                        article.title = title
                        article.url = url
                        article.url_origin = site[2]
                        # Do not update the added date
                        # article.date_added = today
                        article.date_published = pub_date
                        article.save()

                        for key in keywords:
                            if not article.keyword_set.filter(keyword=key):
                                article.keyword_set.create(keyword=key)

                        for author in authors:
                            if not article.author_set.filter(author=author):
                                article.author_set.create(author=author)

                        for source in sources:
                            if not article.source_set.filter(url=source[0]):
                                article.source_set.create(url=source[0],
                                                          url_origin=source[1])

                    warc_creator.create_article_warc(url)

            processed += 1

            # Let the output print back to normal for minimal ui
            sys.stdout = sys.__stdout__
            # Print out minimal information
            sys.stdout.write(
                "%s (Article|%s) %i/%i          \r"
                % (str(timezone.localtime(timezone.now()))[:-13], site[0], processed, article_count)
            )
            sys.stdout.flush()
            # Null the article data to free the memory
            site[1].articles[k] = None
        print(
            "%s (Article|%s) %i/%i          "
            % (str(timezone.localtime(timezone.now()))[:-13], site[0], processed, article_count)
        )
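In the update branches above, each related object is created only after a filter() check so that re-parsing an article does not insert duplicates. Assuming these are standard Django related managers, the same intent can be expressed with get_or_create; a sketch for the keyword case only:

    for key in keywords:
        # get_or_create looks the row up and inserts it only if it is missing,
        # replacing the separate filter()-then-create() pair.
        db_article.keyword_set.get_or_create(name=key)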