def main():
    if not isinstance(config.period, int):
        main_logger.info('Period ENV variable needs to be an integer.')
        exit(1)
    sc = Scraper()
    while True:
        try:
            sc.scrape()
            main_logger.info('Finished scraping.')
            main_logger.info('Halting for {} seconds.'.format(config.period))
            sleep(config.period)
        except KeyboardInterrupt:
            main_logger.info('Finished Scraper.')
            exit(1)
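# A minimal sketch, not part of the original, of the `config` and `main_logger`
# objects that main() above assumes. The PERIOD environment variable name and the
# _Config class are assumptions for illustration: the value is only converted when
# it looks like an integer, so main() can reject anything else.
import logging
import os


class _Config:
    def __init__(self):
        raw = os.environ.get("PERIOD", "60")
        # keep the raw string when it is not a valid integer; main() exits in that case
        self.period = int(raw) if raw.isdigit() else raw


config = _Config()
logging.basicConfig(level=logging.INFO)
main_logger = logging.getLogger("scraper")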
def perform_search(search_type, data, num_threads=10):
    results = {}
    scrapers = [
        s for s in Scraper.get_scrapers(
            os.path.join(ADDON_PATH, "resources", "providers.json"),
            timeout=get_int_setting("scraper_timeout"))
        if get_boolean_setting(s.id)
    ]

    with ScraperRunner(scrapers, num_threads=num_threads) as runner:
        runner_data = (runner.parse_query(data) if search_type == "query"
                       else runner.parse(search_type, data))
        for scraper, scraper_results in runner_data:
            logging.debug("processing %s scraper results", scraper.name)
            for scraper_result in scraper_results:
                magnet = Magnet(scraper_result["magnet"])
                try:
                    info_hash = magnet.parse_info_hash()
                except InvalidMagnet:
                    continue
                if info_hash == "0" * 40:
                    continue
                magnet_result = results.get(info_hash)  # type: Result
                if magnet_result is None:
                    results[info_hash] = Result(scraper, scraper_result)
                else:
                    magnet_result.add_result(scraper, scraper_result)

    # noinspection PyTypeChecker
    return [
        r.to_provider_result()
        for r in sorted(results.values(), key=Result.get_factor, reverse=True)
    ]
def run_scraper(self, target_url, target_element_name):
    """ Run the scraper, check the cache, and log the differences. """
    # fire up scraper and cache objects
    scraper = Scraper()
    cache = Cache()

    # define the target and cached content
    target_content = scraper.fetch_site_content(
        target_url,
        target_element_name
    )
    cached_content = cache.fetch_cache(target_url)
    diff = ''

    # check the cache and report our findings
    if target_content is not None:
        diff = cache.diff_cache(target_content, cached_content)
        if diff:
            logging.info('The target differs from the cache.')
            logging.info(diff)
            logging.info('Updating cache...')
            cache.update_cache(target_url, target_content)
            logging.info('Cache updated.')
            logging.info('Sending mail...')
            email_result = self.send_email(target_url, diff)
            logging.info(email_result)
            message = 'Success! Cache updated.'
        else:
            logging.info('The target and cache match. Not altering cache.')
            message = 'Success! Cache not altered.'
    else:
        logging.warning('Unable to fetch requested page! D:')
        logging.error('Scraping failure.')
        message = 'Failure!'

    logging.info('Scraper finished.')
    return message, diff
def verify(providers_path, schema_path, settings_path):
    if not os.path.exists(providers_path):
        raise ValidationError(
            "providers.json file ({}) does not exist!".format(providers_path))
    with open(providers_path) as f:
        data = json.load(f)
    with open(schema_path) as f:
        schema = json.load(f)
    jsonschema.validate(data, schema)

    settings = ElementTree.parse(settings_path)
    providers_root = os.path.dirname(providers_path)

    for provider in data:
        scraper = Scraper.from_data(provider)

        icon = scraper.get_attribute("icon", default=None)
        if icon:
            if not os.path.exists(os.path.join(providers_root, icon)):
                logging.warning(
                    "attributes.icon for provider '%s' is defined (%s) but is not a valid file",
                    scraper.name, icon)
        else:
            logging.debug("No attributes.icon defined for provider '%s'", scraper.name)

        color = scraper.get_attribute("color", default=None)
        if color:
            if not COLOR_REGEX.match(color):
                logging.warning(
                    "attributes.color for provider '%s' is defined (%s) but is not a valid color (%s)",
                    scraper.name, color, COLOR_REGEX.pattern)
        else:
            logging.debug("No attributes.color defined for provider '%s'", scraper.name)

        setting = settings.find(".//setting[@id='{}']".format(scraper.id))
        if setting is not None:
            if setting.attrib.get("type") != "bool":
                logging.warning(
                    "settings.xml setting with id '%s' must have type 'bool'", scraper.id)
            if setting.attrib.get("label") is None:
                logging.warning(
                    "settings.xml setting with id '%s' must have label attribute defined", scraper.id)
        else:
            logging.warning("settings.xml setting with id '%s' is not defined", scraper.id)
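# Illustrative call site for verify() above; the three paths are assumptions made
# for this sketch, not taken from the original project layout.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    verify(
        "resources/providers.json",         # hypothetical providers file
        "resources/providers.schema.json",  # hypothetical JSON schema
        "resources/settings.xml",           # hypothetical settings file
    )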
from time import sleep

from lib.scraper import Scraper


def readFromFile(sc: Scraper, pathIn, pathOut="info.txt"):
    # Scrape each URL listed in pathIn and append the extracted fields to pathOut.
    with open(pathIn, 'r') as urlList:
        for url in urlList:
            sc.setUrl(url)
            sc.sendRequest()
            res = sc.extraction()
            print(res)
            with open(pathOut, 'a') as outputFile:
                for element in res:
                    if type(res[element]).__name__ == 'list':
                        outputFile.write(element + ": ")
                        for cat in res[element]:
                            outputFile.write("position " + str(cat[0]) + " in category " + cat[1] + "\n")
                    else:
                        outputFile.write(element + ": " + str(res[element]) + "\n")
                outputFile.write("\n\n")


if __name__ == "__main__":
    sel = "selettore.yml"
    sc = Scraper(sel)
    readFromFile(sc, "list.txt")
def __init__(self):
    action, key, artist, url, nfo, settings = self._parse_argv()
    Scraper(action, key, artist, url, nfo, settings)
def format_results(shows):
    # Group results by day
    days = {item: [] for item in conf['days']}
    for show in shows:
        day = show.pop('day')
        days[day].append(show)

    # Order results by time
    for day, item in days.items():
        days[day] = sorted(item, key=lambda x: x['time'])

    # Join days together and return
    return days


LOGGER.info('Scraping movie schedule...')
results = Scraper.run()

LOGGER.info('Generating HTML content...')
environment = Environment(loader=FileSystemLoader('templates'),
                          undefined=StrictUndefined)
html = environment.get_template('email.html').render(
    results=format_results(results))

payload = yaml.safe_load(open('conf/mailer.yaml'))['payload']
payload['Messages'][0]['HTMLPart'] = html

LOGGER.info('Sending to mailing list...')
mailer = Mailer(**credentials)
mailer.send(payload)

LOGGER.info('Done')
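# Hedged sketch (assumptions, not from the original) of the module-level names the
# mailer script above relies on: LOGGER, conf, and the Mailer credentials. The file
# paths and the 'credentials' key are hypothetical.
import logging

import yaml

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

conf = yaml.safe_load(open('conf/scraper.yaml'))                        # hypothetical config file
credentials = yaml.safe_load(open('conf/mailer.yaml'))['credentials']   # hypothetical key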
def __init__(self):
    action, artist, album, url = self._parse_argv()
    Scraper(action, artist, album, url)
import json

import requests
from lxml import html

from lib.scraper import Scraper

print("Building data structure.")
data = Scraper.build()
print("Building data structure complete.")

content = json.load(open('data/results3.json', 'r'))

for president in data.keys():
    for category in data[president].keys():
        for i in range(0, len(data[president][category])):
            # skip entries that already have content attached
            if data[president][category][i]['content'] is not None:
                continue
            pid = data[president][category][i]['pid']
            if str(pid) in content:
                data[president][category][i]['content'] = content[str(pid)]
                print("Success: " + str(pid))
            else:
                print("Not Found: " + str(pid))
def get_scrapers(args):
    return [
        s for s in Scraper.get_scrapers(args.providers_path)
        if not args.provider_id or args.provider_id == s.id
    ]
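# Hedged sketch (not from the original) of the argparse namespace that
# get_scrapers() above expects: a positional providers_path plus an optional
# --provider-id filter; the argument names are assumptions for illustration.
import argparse


def build_parser():
    parser = argparse.ArgumentParser(description="Filter scrapers by provider id")
    parser.add_argument("providers_path", help="path to providers.json")
    parser.add_argument("--provider-id", dest="provider_id", default=None,
                        help="only keep the scraper with this id")
    return parser

# usage: scrapers = get_scrapers(build_parser().parse_args())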