Example #1
def main():
    if not isinstance(config.period, int):
        main_logger.info('Period ENV variable must be an integer.')
        exit(1)
    sc = Scraper()

    while True:
        try:
            sc.scrape()
            main_logger.info('Finished scraping.')
            main_logger.info('Halting for {} seconds.'.format(config.period))
            sleep(config.period)
        except KeyboardInterrupt:
            main_logger.info('Finished Scraper.')
            exit(1)
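
The snippet above depends on names defined elsewhere in its project (config, main_logger, Scraper, sleep). A minimal, self-contained sketch of those assumed pieces, purely illustrative and not the original project's code, could be:

# Hypothetical stand-ins for the project-specific names used by main() above.
import logging
import os
from time import sleep


class Config:
    def __init__(self):
        # Assumption: the period comes from a PERIOD environment variable.
        try:
            self.period = int(os.environ.get("PERIOD", "60"))
        except ValueError:
            self.period = None  # main() will reject a non-integer value


class Scraper:
    def scrape(self):
        pass  # placeholder for the real scraping logic


config = Config()
logging.basicConfig(level=logging.INFO)
main_logger = logging.getLogger("main")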
Example #2
def perform_search(search_type, data, num_threads=10):
    results = {}
    scrapers = [
        s for s in Scraper.get_scrapers(
            os.path.join(ADDON_PATH, "resources", "providers.json"),
            timeout=get_int_setting("scraper_timeout"))
        if get_boolean_setting(s.id)
    ]
    with ScraperRunner(scrapers, num_threads=num_threads) as runner:
        if search_type == "query":
            runner_data = runner.parse_query(data)
        else:
            runner_data = runner.parse(search_type, data)

        for scraper, scraper_results in runner_data:
            logging.debug("processing %s scraper results", scraper.name)
            for scraper_result in scraper_results:
                magnet = Magnet(scraper_result["magnet"])
                try:
                    info_hash = magnet.parse_info_hash()
                except InvalidMagnet:
                    continue
                if info_hash == "0" * 40:
                    continue

                magnet_result = results.get(info_hash)  # type: Result
                if magnet_result is None:
                    results[info_hash] = Result(scraper, scraper_result)
                else:
                    magnet_result.add_result(scraper, scraper_result)

    # noinspection PyTypeChecker
    return [
        r.to_provider_result()
        for r in sorted(results.values(), key=Result.get_factor, reverse=True)
    ]
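
The Result class used above is not shown. A rough sketch of such an aggregate, where only the method names add_result, get_factor and to_provider_result are taken from the call sites and everything else is an assumption, could be:

class Result:
    # Hypothetical aggregate for one info hash: it collects the same magnet
    # as reported by several scrapers and ranks it for sorting.
    def __init__(self, scraper, scraper_result):
        self._results = []
        self.add_result(scraper, scraper_result)

    def add_result(self, scraper, scraper_result):
        self._results.append((scraper, scraper_result))

    def get_factor(self):
        # Assumed ranking: prefer magnets found by more scrapers, then by the
        # best seed count any of them reported (if seeds are reported at all).
        seeds = max(int(r.get("seeds", 0)) for _, r in self._results)
        return len(self._results) * 1000 + seeds

    def to_provider_result(self):
        # Assumed shape: hand back the first scraper's raw result.
        return self._results[0][1]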
Example #3
def readFromFile(sc: Scraper, pathIn, pathOut="info.txt"):
    with open(pathIn, 'r') as urlList:
        for url in urlList:
            sc.setUrl(url)
            sc.sendRequest()
            res = sc.extraction()
            print(res)
            with open(pathOut, 'a') as outputFile:
                for element in res:
                    if isinstance(res[element], list):
                        outputFile.write(element+": ")
                        for cat in res[element]:
                            outputFile.write("position "+str(cat[0])+" in category "+cat[1]+"\n")
                    else:
                        outputFile.write(element+": "+str(res[element])+"\n")
                outputFile.write("\n\n")
Example #4
    def run_scraper(self, target_url, target_element_name):
        """
        Run the scraper, check the cache, and log the differences.

        """

        # fire up scraper and cache objects
        scraper = Scraper()
        cache = Cache()

        # define the target and cached content
        target_content = scraper.fetch_site_content(
            target_url,
            target_element_name
        )
        cached_content = cache.fetch_cache(target_url)

        # check the cache and report our findings
        diff = u''
        if target_content is not None:
            diff = cache.diff_cache(target_content, cached_content)
            if diff:
                logging.info('The target differs from the cache.')
                logging.info(diff)

                logging.info('Updating cache...')
                cache.update_cache(target_url, target_content)
                logging.info('Cache updated.')

                logging.info('Sending mail...')
                email_result = self.send_email(target_url, diff)
                logging.info(email_result)

                message = 'Success! Cache updated.'
            else:
                logging.info('The target and cache match. Not altering cache.')
                message = 'Success! Cache not altered.'
        else:
            logging.warning('Unable to fetch requested page! D:')
            logging.error('Scraping failure.')
            message = 'Failure!'

        logging.info('Scraper finished.')

        return message, diff
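
Cache.diff_cache is not shown either. A plausible sketch, assuming it simply produces a unified diff between the cached text and the freshly scraped text (the name and arguments come from the call above; the body is an assumption), is:

import difflib


def diff_cache(target_content, cached_content):
    # Hypothetical Cache.diff_cache: return a unified diff between the cached
    # text and the newly scraped text, or an empty string if they match.
    cached_lines = (cached_content or u'').splitlines(True)
    target_lines = (target_content or u'').splitlines(True)
    return u''.join(difflib.unified_diff(cached_lines, target_lines,
                                         fromfile='cache', tofile='target'))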
Example #5
    def run_scraper(self, target_url, target_element_name):
        """
        Run the scraper, check the cache, and log the differences.

        """

        # fire up scraper and cache objects
        scraper = Scraper()
        cache = Cache()

        # define the target and cached content
        target_content = scraper.fetch_site_content(target_url,
                                                    target_element_name)
        cached_content = cache.fetch_cache(target_url)

        # check the cache and report our findings
        diff = u''
        if target_content is not None:
            diff = cache.diff_cache(target_content, cached_content)
            if diff:
                logging.info('The target differs from the cache.')
                logging.info(diff)

                logging.info('Updating cache...')
                cache.update_cache(target_url, target_content)
                logging.info('Cache updated.')

                logging.info('Sending mail...')
                email_result = self.send_email(target_url, diff)
                logging.info(email_result)

                message = 'Success! Cache updated.'
            else:
                logging.info('The target and cache match. Not altering cache.')
                message = 'Success! Cache not altered.'
        else:
            logging.warning('Unable to fetch requested page! D:')
            logging.error('Scraping failure.')
            message = 'Failure!'

        logging.info('Scraper finished.')

        return message, diff
Example #6
def verify(providers_path, schema_path, settings_path):
    if not os.path.exists(providers_path):
        raise ValidationError(
            "providers.json file ({}) does not exist!".format(providers_path))

    with open(providers_path) as f:
        data = json.load(f)
    with open(schema_path) as f:
        schema = json.load(f)

    jsonschema.validate(data, schema)

    settings = ElementTree.parse(settings_path)
    providers_root = os.path.dirname(providers_path)

    for provider in data:
        scraper = Scraper.from_data(provider)

        icon = scraper.get_attribute("icon", default=None)
        if icon:
            if not os.path.exists(os.path.join(providers_root, icon)):
                logging.warning(
                    "attributes.icon for provider '%s' is defined (%s) but is not a valid file",
                    scraper.name, icon)
        else:
            logging.debug("No icon attributes.icon defined for provider '%s'",
                          scraper.name)

        color = scraper.get_attribute("color", default=None)
        if color:
            if not COLOR_REGEX.match(color):
                logging.warning(
                    "attributes.color for provider '%s' is defined (%s) but is not a valid color (%s)",
                    scraper.name, color, COLOR_REGEX.pattern)
        else:
            logging.debug("No icon attributes.color defined for provider '%s'",
                          scraper.name)

        setting = settings.find(".//setting[@id='{}']".format(scraper.id))
        if setting is not None:
            if setting.attrib.get("type") != "bool":
                logging.warning(
                    "settings.xml setting with id '%s' must have type 'bool'",
                    scraper.id)
            if setting.attrib.get("label") is None:
                logging.warning(
                    "settings.xml setting with id '%s' must have label attribute defined",
                    scraper.id)
        else:
            logging.warning("settings.xml setting with id '%s' is not defined",
                            scraper.id)
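
For reference, data that would pass all of the checks above might look like the following sketch. The providers.json layout and the COLOR_REGEX pattern are inferred from the lookups in verify() and are assumptions, not the addon's actual schema:

import re
from xml.etree import ElementTree

# Assumed hex-color pattern; the real COLOR_REGEX is defined elsewhere.
COLOR_REGEX = re.compile(r"^#(?:[0-9a-fA-F]{2}){3,4}$")

# Hypothetical providers.json entry with an icon and a color attribute.
provider = {
    "id": "example_provider",
    "attributes": {
        "icon": "icons/example_provider.png",
        "color": "#1E90FFFF",
    },
}

# Hypothetical settings.xml entry satisfying the type/label checks.
setting = ElementTree.fromstring(
    '<setting id="example_provider" type="bool" label="32100" default="true"/>')

assert COLOR_REGEX.match(provider["attributes"]["color"])
assert setting.attrib.get("type") == "bool"
assert setting.attrib.get("label") is not None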
Example #7
from time import sleep
from lib.scraper import Scraper

def readFromFile(sc: Scraper, pathIn, pathOut="info.txt"):
    with open(pathIn, 'r') as urlList:
        for url in urlList:
            sc.setUrl(url)
            sc.sendRequest()
            res = sc.extraction()
            print(res)
            with open(pathOut, 'a') as outputFile:
                for element in res:
                    if isinstance(res[element], list):
                        outputFile.write(element+": ")
                        for cat in res[element]:
                            outputFile.write("position "+str(cat[0])+" in category "+cat[1]+"\n")
                    else:
                        outputFile.write(element+": "+str(res[element])+"\n")
                outputFile.write("\n\n")


if __name__ == "__main__":
    sel = "selettore.yml"
    sc = Scraper(sel)
    readFromFile(sc, "list.txt")
Example #8
    def __init__(self):
        action, key, artist, url, nfo, settings = self._parse_argv()
        Scraper(action, key, artist, url, nfo, settings)
Example #9
def format_results(shows):
    # Group results by day
    days = {item: [] for item in conf['days']}
    for show in shows:
        day = show.pop('day')
        days[day].append(show)
    # Order results by time
    for day, item in days.items():
        days[day] = sorted(item, key=lambda x: x['time'])
    # Return the schedule grouped by day and sorted by time
    return days


LOGGER.info('Scraping movie schedule...')
results = Scraper.run()

LOGGER.info('Generating HTML content...')
environment = Environment(loader=FileSystemLoader('templates'),
                          undefined=StrictUndefined)
html = environment.get_template('email.html').render(
    results=format_results(results))
with open('conf/mailer.yaml') as f:
    payload = yaml.safe_load(f)['payload']
payload['Messages'][0]['HTMLPart'] = html

LOGGER.info('Sending to mailing list...')
mailer = Mailer(**credentials)
mailer.send(payload)

LOGGER.info('Done')
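
To illustrate what format_results() produces, here is a small stand-in conf and sample input (both hypothetical; the real values come from the project's configuration and scraper):

# Stand-in configuration and sample data for format_results().
conf = {'days': ['monday', 'tuesday']}

shows = [
    {'day': 'tuesday', 'time': '21:00', 'title': 'Late show'},
    {'day': 'monday', 'time': '18:30', 'title': 'Early show'},
    {'day': 'monday', 'time': '15:00', 'title': 'Matinee'},
]

# format_results(shows) would then return (the 'day' key is popped off):
# {'monday': [{'time': '15:00', 'title': 'Matinee'},
#             {'time': '18:30', 'title': 'Early show'}],
#  'tuesday': [{'time': '21:00', 'title': 'Late show'}]}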
Example #10
    def __init__(self):
        action, artist, album, url = self._parse_argv()
        Scraper(action, artist, album, url)
Example #11
import sys
import requests
import json
from lxml import html

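# Note: reload() and sys.setdefaultencoding() exist only in Python 2;
# drop the two lines below when running under Python 3.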
reload(sys)
sys.setdefaultencoding("utf-8")

print("Building data structure.")

from lib.scraper import Scraper
data = Scraper.build()

print("Building data structure complete.")

with open('data/results3.json', 'r') as f:
    content = json.load(f)

for president in data:
    for category in data[president]:
        for entry in data[president][category]:

            if entry['content'] is not None:
                continue

            pid = entry['pid']

            if str(pid) in content:
                entry['content'] = content[str(pid)]
                print("Success: " + str(pid))
            else:
                print("Not Found: " + str(pid))
def get_scrapers(args):
    return [
        s for s in Scraper.get_scrapers(args.providers_path)
        if not args.provider_id or args.provider_id == s.id
    ]
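
For context, an argparse setup matching the attributes this helper reads could look like this (the argument names are inferred from args.providers_path and args.provider_id and are purely illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("providers_path", help="path to providers.json")
parser.add_argument("--provider-id", dest="provider_id", default=None,
                    help="only keep the scraper with this id")
args = parser.parse_args()

scrapers = get_scrapers(args)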