def __init__(self, list=None):
    self.gathered_links = list if Queue.instance is not None else []  # FIFO
    self.dump_seperater = ",\n"
    self.dump_file = CONFIG.get("QUEUE", "dumpFile")
    # note: bool() of a non-empty config string is always truthy; a
    # boolean-aware getter is needed if these options should honour "False"
    self.overwrite = bool(CONFIG.get("QUEUE", "overwrite"))
    self.direct_dump = bool(CONFIG.get("QUEUE", "directDump"))
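# A hypothetical illustration (not the project's actual method) of how the
# dump_file / dump_seperater / overwrite fields above could be consumed:
def dump(self):
    mode = "w" if self.overwrite else "a"
    with open(self.dump_file, mode) as fh:
        fh.write(self.dump_seperater.join(self.gathered_links))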
def setup_logging():
    loghandlers = [logging.handlers.RotatingFileHandler(CONFIG.get("log_location"), mode="w")]
    if CONFIG.get("log_use_stdout"):
        loghandlers.append(logging.StreamHandler(sys.stdout))
    logging.basicConfig(
        format="%(asctime)s [%(processName)-14.14s] [%(levelname)-7.7s] %(message)s",
        level=getattr(logging, CONFIG.get("log_level").upper(), None),
        handlers=loghandlers,
    )
    logging.debug("Logging Initialised!")
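# Minimal usage sketch, assuming a dict-backed stand-in for the flat-key CONFIG
# read above (log_location, log_use_stdout, log_level are the only keys used);
# the stub class and "example.log" path are hypothetical, for illustration only.
import logging
import logging.handlers
import sys


class _StubConfig:
    _values = {"log_location": "example.log", "log_use_stdout": True, "log_level": "debug"}

    def get(self, key):
        return self._values[key]


CONFIG = _StubConfig()
setup_logging()
logging.info("file handler plus optional stdout handler are now attached")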
def save_rgb(self, out_path):
    # img = self._get_01_image().convert("RGB")
    # Min-max normalise the frame to [0, 1], run it through the configured
    # colormap, and scale to 8-bit values before saving the image.
    normalised = self.df.applymap(lambda x: (x - self.min) / (self.max - self.min)).to_numpy()
    img = Image.fromarray(np.uint8(CONFIG.colormap(normalised) * 255))
    img.save(out_path)
def main():
    setup_logging()
    start = time.time()
    with database.DB() as db:
        found_links = init_page_queue(CONFIG.get("parser_processes"))
        found_links = list(filter(needs_refresh, found_links))
        found_links = found_links[:CONFIG.get("max_search")]
        shuffle(found_links)
        logging.info("found {} mods to use".format(len(found_links)))
        scraped_data = scrape_results(found_links, CONFIG.get("parser_processes"))
        if len(scraped_data) > 0:
            logging.debug("everything scraped")
            # TODO - would be nice to have a db writer thread and some kind of
            # mpsc for other threads to send to it
            for mod_record in scraped_data:
                db.update_or_create(mod_record)
    logging.info("completed in: {}".format(time.time() - start))
def __init__(self, reproduction_rate, start_url=None):
    super().__init__()
    self.last_pos = None
    self.current_pos = None
    self.current_urls = list()
    self.reproduced = False
    self.die_reason = "unknown"
    self.id = Crawler.id
    Crawler.id += 1
    self.reproduction_rate = reproduction_rate
    self.per_page = int(CONFIG.get("CRAWLER", "reproductionPerPage"))
    self.hop_count = int(CONFIG.get("CRAWLER", "hopCount"))
    self.retries = int(CONFIG.get("CRAWLER", "retries"))
    if start_url is None:
        self.start_url = CONFIG.get("CRAWLER", "startUrl")
    else:
        self.start_url = start_url
    CrawlerAPI.get_instance().register(self)
def needs_refresh(link: str) -> bool:
    with DB() as db:
        found_link = db.get_cache_info(link)
        if not found_link:
            logging.debug("added mod %s", link)
            return True
        if int(time.time()) - found_link[1] > CONFIG.get("cache_timeout"):
            logging.debug("cache expired on %s", link)
            return True
        logging.debug("ignoring ID already in db %d", found_link[0])
        return False
def __init__(self, worker_type):
    self.thread_count = int(CONFIG.get("WORKER", "threads"))
    self.worker_type = worker_type
    self.workers = list()
    self.subqueues = list()
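# A hedged sketch of how thread_count / workers / subqueues might fit together;
# the spawn() name, a queue.Queue per worker, and a worker_type that accepts a
# queue and exposes start() are assumptions, not the project's real wiring.
def spawn(self):
    for _ in range(self.thread_count):
        subqueue = queue.Queue()
        worker = self.worker_type(subqueue)
        worker.start()
        self.subqueues.append(subqueue)
        self.workers.append(worker)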
def __enter__(self):
    self.conn = sqlite3.connect(CONFIG.get("db_location"))
    self.cur = self.conn.cursor()
    self.create()
    return self
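# __enter__ above implies a matching __exit__ (main() does "with database.DB() as db:").
# The real teardown is not shown; a minimal sketch, assuming the usual
# commit-then-close pattern, would be:
def __exit__(self, exc_type, exc_value, traceback):
    self.conn.commit()  # flush any pending update_or_create writes
    self.conn.close()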