Code example #1
import os
import time
import unittest
from multiprocessing import Queue
import scrape_logger

import scrapeKL_gui

level = "WARNING"
#level = "INFO"
#level = "DEBUG"
logger = scrape_logger.setup_logger(level)

storage_directory = "scrapes"


class Test(unittest.TestCase):
    def test_scrapeThread(self):
        company_names = {
            2048: "talenom",
            1102: "cramo",
            1091: "sanoma",
            1196: "afarak group"
        }
        #company_names = None # For scraping every company
        showProgress = False
        queue = Queue()

        time0 = time.time()
        qThread = scrapeKL_gui.scrapeThread(storage_directory, company_names,
                                            showProgress, queue)
        qThread.start()
        qThread.wait()
        # Report elapsed time in the same format as the CLI (assumed
        # completion; the excerpt was truncated here).
        print("Scraping took: {:.2f} s".format(time.time() - time0))


if __name__ == '__main__':
    unittest.main()
Code example #2
import unittest
import scrape_logger

import speed

level = "WARNING"
#level = "INFO"
#level = "DEBUG"
logger_root = scrape_logger.setup_logger(level)
logger_speed = scrape_logger.setup_logger(level, name="speed")


class Test(unittest.TestCase):
    def test_speed(self):
        # smoketest
        speed.run_speedtest(1)


if __name__ == '__main__':
    unittest.main()
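
A note on the imports: scrape_logger is a project-local helper whose source is not included in these excerpts. Judging only from its call sites (setup_logger(level), setup_logger(level, name="speed"), set_logger_level(logger, "WARNING")), a minimal sketch could look like the following; the body is an assumption, not the project's actual code.

import logging

def setup_logger(level="INFO", name="root"):
    # Hypothetical sketch of the project's scrape_logger.setup_logger.
    logger = logging.getLogger(name)
    logger.setLevel(level)  # logging accepts "INFO"/"DEBUG" names or ints
    if not logger.handlers:  # avoid stacking handlers on repeated setup
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(levelname)s:%(name)s: %(message)s"))
        logger.addHandler(handler)
    return logger

def set_logger_level(logger, level):
    # Hypothetical counterpart of scrape_logger.set_logger_level.
    logger.setLevel(level)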
Code example #3
                    scrapeKL.print_calculations(self.filename, c_id, c_name)

                # TODO: The filter and organize functions do not work at the moment
                elif sender.text() == self.filter_str:
                    scrapeKL.filter_companies(self.filename)
                elif sender.text() == self.organize_str:
                    scrapeKL.organize_companies(self.filename)

                else:
                    raise ScrapeGuiException(
                        'Unexpected "sender.text()": [{}]'.format(
                            sender.text()))
            else:
                logger.info("No file selected")


if __name__ == '__main__':
    logger = scrape_logger.setup_logger("DEBUG")

    # Storage
    storage_directory = "scrapes"
    if not os.path.isdir(storage_directory):
        os.makedirs(storage_directory)
        logger.debug("storage-folder created: [{}]".format(storage_directory))

    # GUI
    app = QApplication(sys.argv)
    window = Window(storage_directory)  # avoid shadowing the Window class
    window.show()
    sys.exit(app.exec_())
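
ScrapeGuiException is raised above but defined outside this excerpt. Going by ScrapeException in code example #5, it is presumably just a bare Exception subclass:

class ScrapeGuiException(Exception):
    # Presumed definition, mirroring ScrapeException in code example #5.
    pass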
Code example #4
def main(arguments):
    # Logging
    if arguments["--debug"]:
        level = "DEBUG"
    else:
        level = "INFO"
    logger = scrape_logger.setup_logger(level)

    logger.debug(arguments)

    # Storage
    storage_directory = "scrapes"
    if not os.path.isdir(storage_directory):
        os.makedirs(storage_directory)
        logger.debug("storage-folder created: [{}]".format(storage_directory))

    # Shared arguments
    if arguments["<file>"]:
        filename = os.path.join(storage_directory, arguments["<file>"])
    else:
        filename = storage.get_latest_metrics_filename(storage_directory)
    c_name = arguments["--name"]
    c_id_list_in = arguments["--id"]
    c_id_list = []
    if c_id_list_in:
        for c_id in c_id_list_in:
            try:
                c_id_list.append(int(c_id))
            except ValueError:
                raise ScrapeKLException(
                    "Id {} is not an integer.".format(c_id))
    logger.debug("c_id_list: {}; c_name: {}".format(c_id_list, c_name))

    # Function calling
    if arguments["scrape"]:
        company_names = None
        if arguments["--id"]:
            company_names = {}
            for c_id in arguments["--id"]:
                try:
                    company_names[int(c_id)] = None
                except ValueError:
                    raise ScrapeKLException(
                        "Id {} is not an integer.".format(c_id))
        logger.debug("company_names to scrape: {}".format(company_names))
        time0 = time.time()
        scrape_companies(storage_directory, company_names)
        print("Scraping took: {:.2f} s".format(time.time() - time0))

    elif arguments["names"]:
        if arguments["<file>"]:
            names_filename = arguments["<file>"]
            print_all_names(names_filename=names_filename)
        else:
            print_all_names(storage_directory=storage_directory)

    elif arguments["metrics"]:
        print_metrics(filename, c_id_list, c_name)

    elif arguments["collection"]:
        print_collection(filename, c_id_list, c_name)

    elif arguments["filtered"]:
        print_filtered(filename, c_id_list, c_name)

    elif arguments["passed"]:
        print_passed_names(filename)

    elif arguments["list_files"]:
        all_filenames = os.listdir(storage_directory)
        for f in sorted(all_filenames):
            if f.endswith(".json"):
                print(f)

    elif arguments["speed"]:
        times = arguments["<times>"]
        if times:
            try:
                times = int(times)
            except ValueError:
                raise ScrapeKLException(
                    "Times {} is not an integer.".format(times))
            assert times > 0
        else:
            logger.info("Using default: times = 5")
            times = 5
        """
        The speed testing needs its own logger, so the print is clean.
        The speed logger level is gotten from the user input,
        and the scrape logger, "root", level is set to WARNING.
        """
        _speed_logger = scrape_logger.setup_logger(logger.level, "speed")
        scrape_logger.set_logger_level(logger, "WARNING")

        speed.run_speedtest(times)
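
main() evidently receives a docopt-style arguments dictionary: commands map to boolean keys ("scrape", "metrics", ...), options and positionals to value keys ("--id", "--name", "<file>"). The project's real usage string is not shown in these excerpts; a hypothetical entry point compatible with the keys accessed above might be:

"""Hypothetical usage string, reconstructed from the keys used in main().

Usage:
    scrapeKL.py scrape [--id=<id>...] [--debug]
    scrapeKL.py names [<file>] [--debug]
    scrapeKL.py (metrics | collection | filtered) [<file>]
                [--id=<id>...] [--name=<name>] [--debug]
    scrapeKL.py passed [<file>] [--debug]
    scrapeKL.py list_files [--debug]
    scrapeKL.py speed [<times>] [--debug]
"""
from docopt import docopt

if __name__ == '__main__':
    main(docopt(__doc__))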
Code example #5
import logging
import re
import time
import traceback

import requests
from bs4 import BeautifulSoup
from datetime import date
from multiprocessing import Process, Queue
import scrape_logger

#logger = logging.getLogger('root')

level = "INFO"
#level = "DEBUG"
logger = scrape_logger.setup_logger(level, "scraping")
scrape_logger.set_logger_level(logger, level)

url_basic = "http://www.kauppalehti.fi/5/i/porssi/"
osingot_url = url_basic + "osingot/osinkohistoria.jsp"
osingot_yritys_url = url_basic + "osingot/osinkohistoria.jsp?klid={}"
kurssi_url = url_basic + "porssikurssit/osake/index.jsp?klid={}"
kurssi_tulostiedot_url = url_basic + \
    "porssikurssit/osake/tulostiedot.jsp?klid={}"

date_format = "%Y-%m-%d"  # YYYY-MM-DD
date_short_format = "%y-%m-%d"  # YY-MM-DD
datetime_format = "%y-%m-%d_%H-%M-%S"  # YY-MM-DD_HH-MM-SS: for filename
date_pattern_0 = re.compile(r"^\d{4}-\d{2}-\d{2}$")  # YYYY-MM-DD
date_pattern_1 = re.compile(r"^\d{2}\.\d{2}\.\d{4}$")  # DD.MM.YYYY
date_pattern_2 = re.compile(r"^\d{2}/\d{2}$")  # MM/YY


class ScrapeException(Exception):
    pass
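
As a usage illustration of the compiled patterns, here is a hypothetical helper (not part of the project) that normalizes a scraped date string to the module's canonical date_format; date_pattern_2 is left out because an MM/YY value carries no day component:

from datetime import datetime

def normalize_date(raw):
    # Hypothetical helper illustrating the date patterns above.
    if date_pattern_0.match(raw):  # already YYYY-MM-DD
        return raw
    if date_pattern_1.match(raw):  # DD.MM.YYYY -> YYYY-MM-DD
        return datetime.strptime(raw, "%d.%m.%Y").strftime(date_format)
    raise ScrapeException("Unrecognized date: [{}]".format(raw))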
Code example #6
File: speed.py  Project: miitcher/OsakkeetScraper
    whole_run_time = time.time() - time0

    avg_time = whole_run_time / times
    compared = avg_time - old_avg_time
    if old_avg_time:
        compared_percent = 100 * compared / old_avg_time
    else:
        compared_percent = 0
    logger.info(
        "- Scraping took:\t\t{:6.2f} s\n".format(whole_run_time) +
        "Average time per scraping:\t{:6.2f} s\n".format(avg_time) +
        "Compared to old average:\t{:+6.2f} s --> {:+.1f} %".format(
            compared, compared_percent
        )
    )
    return whole_run_time


if __name__ == '__main__':
    logger_root = scrape_logger.setup_logger("WARNING")
    logger = scrape_logger.setup_logger(name="speed")

    times = 5  # default
    if len(sys.argv) == 2:
        try:
            times = int(sys.argv[1])
        except ValueError:
            pass

    run_speedtest(times)
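
The top of run_speedtest is cut off in this excerpt. A minimal sketch of a loop that would produce the time0, times, and old_avg_time values used above, assuming the scrape under test is scrapeKL.scrape_companies from code example #4 (everything else here is hypothetical):

import time
import scrapeKL

def run_speedtest(times, old_avg_time=0.0):
    # Hypothetical reconstruction of the truncated part: run the full
    # scrape `times` times and time the whole batch.
    time0 = time.time()
    for _ in range(times):
        scrapeKL.scrape_companies("scrapes", None)  # None: every company
    # ...followed by the reporting shown at the top of this example.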