Example No. 1
0
import os
from datetime import date

from pke.unsupervised import TfIdf, KPMiner, YAKE
from pke.unsupervised import SingleRank, TopicRank, PositionRank, MultipartiteRank
from pke.utils import compute_document_frequency, load_document_frequency_file
from spacy.lang.en.stop_words import STOP_WORDS

from kargo import logger, corpus, scraping, terms, evaluation

# Pipeline data directories: raw scrapes -> interim -> processed -> results.
SCRAPED_DIR = "data/scraped"
INTERIM_DIR = "data/interim"
PROCESSED_DIR = "data/processed"
MANUAL_DIR = "data/annotations"
RESULTS_DIR = "results"
# Derived paths built with os.path.join so they stay OS-portable.
# NOTE: `import os` was missing in the original, which made these lines
# raise NameError at import time.
RELEVANT_DIR = os.path.join(PROCESSED_DIR, "news", "relevant")
CORE_NLP_DIR = os.path.join(RELEVANT_DIR, "dev")
EXTRACTED_DIR = os.path.join(RESULTS_DIR, "extracted_terms", "dev")
PLOT_DIR = os.path.join(RESULTS_DIR, "plots")
# Module-level logger for this pipeline script.
log = logger.get_logger(__name__, logger.INFO)


def scraping_news_sites():
    log.info("Begin scraping processes")
    air_cargo_news_spider = scraping.AirCargoNewsSpider(
        seed_url="https://www.aircargonews.net/news-by-date/page/",
        output_folder=os.path.join(SCRAPED_DIR, "aircargonews.net"))
    log.info("Begin scraping aircargonews.net")
    air_cargo_news_spider.start(1, 2)
    air_cargo_week_spider = scraping.AirCargoWeekSpider(
        seed_url="https://www.aircargoweek.com/category/news-menu/page/",
        output_folder=os.path.join(SCRAPED_DIR, "aircargoweek.com"))
    log.info("Begin scraping aircargoweek.com")
    air_cargo_week_spider.start(1, 2)
    air_cargo_world_spider = scraping.AirCargoWorldSpider(
Example No. 2
0
import unittest
import scraping
from kargo import logger
# Replace the scraping module's `log` attribute with a WARNING-level logger
# so the spiders' INFO-level chatter does not clutter the test output.
scraping.log = logger.get_logger(__name__, logger.WARNING)


class TestScraping(unittest.TestCase):
    def test_aircargonews(self):
        """Scraping page 1 of aircargonews.net yields at least one article."""
        spider = scraping.AirCargoNewsSpider(
            seed_url="https://www.aircargonews.net/news-by-date/page/",
            output_folder="../../data/interim/")
        articles = spider.scrape(1)
        self.assertGreater(len(articles), 0)

    def test_aircargoweek(self):
        """Scraping page 1 of aircargoweek.com yields at least one article."""
        spider = scraping.AirCargoWeekSpider(
            seed_url="https://www.aircargoweek.com/category/news-menu/page/",
            output_folder="../../data/interim/")
        articles = spider.scrape(1)
        self.assertGreater(len(articles), 0)

    def test_aircargoworld(self):
        """Scraping aircargoworld.com yields at least one article.

        Page 10 is requested because articles on page 1 are behind a paywall.
        """
        spider = scraping.AirCargoWorldSpider(
            seed_url="https://aircargoworld.com/category/news/page/",
            output_folder="../../data/interim/")
        articles = spider.scrape(10)
        self.assertGreater(len(articles), 0)

    def test_stattimes(self):
        stat_times_spider = scraping.StatTimesSpider(