Example #1
 def __init__(self, list=None):
     self.gathered_links = list if Queue.instance is not None else []  # FIFO
     self.dump_seperater = ",\n"
     self.dump_file = CONFIG.get("QUEUE", "dumpFile")
     self.overwrite = bool(CONFIG.get("QUEUE", "overwrite"))
     self.direct_dump = bool(CONFIG.get("QUEUE", "directDump"))
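If CONFIG here is a standard configparser.ConfigParser (an assumption; only its get() calls are visible), note that get() always returns a string, so bool(CONFIG.get("QUEUE", "overwrite")) is True even when the ini file says overwrite = false. A minimal sketch of the safer getboolean() variant:

import configparser

CONFIG = configparser.ConfigParser()
CONFIG.read_string("[QUEUE]\ndumpFile = queue.txt\noverwrite = false\ndirectDump = true\n")

print(bool(CONFIG.get("QUEUE", "overwrite")))    # True - any non-empty string is truthy
print(CONFIG.getboolean("QUEUE", "overwrite"))   # False - parsed as a boolean
print(CONFIG.getboolean("QUEUE", "directDump"))  # True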
Example #2
def setup_logging():
	loghandlers = [logging.handlers.RotatingFileHandler(CONFIG.get("log_location"), mode="w")]
	if CONFIG.get("log_use_stdout"):
		loghandlers.append(logging.StreamHandler(sys.stdout))
	logging.basicConfig(
		format="%(asctime)s [%(processName)-14.14s] [%(levelname)-7.7s]  %(message)s",
		level=getattr(logging, CONFIG.get("log_level").upper(), None),
		handlers=loghandlers
	)
	logging.debug("Logging Initialised!")
Example #3
 def save_rgb(self, out_path):
     # img = self._get_01_image().convert("RGB")
     # Normalise each value to [0, 1], apply the colormap, then scale to 8-bit RGB.
     normalised = self.df.applymap(lambda x: (x - self.min) / (self.max - self.min))
     img = Image.fromarray(np.uint8(CONFIG.colormap(normalised.to_numpy()) * 255))
     img.save(out_path)
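Assuming CONFIG.colormap is a matplotlib colormap (it is not defined in this snippet), it maps values in [0, 1] to RGBA floats in [0, 1], which is why the result is multiplied by 255 and cast to uint8 before Image.fromarray. The same pipeline on a plain NumPy array:

import numpy as np
from PIL import Image
from matplotlib import cm

data = np.random.rand(64, 64)                # values already in [0, 1]
rgba = cm.viridis(data)                      # shape (64, 64, 4), floats in [0, 1]
img = Image.fromarray(np.uint8(rgba * 255))  # 8-bit RGBA image
img.save("heatmap.png")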
Example #4
def main():
	setup_logging()
	start = time.time()

	with database.DB() as db:
		found_links = init_page_queue(CONFIG.get("parser_processes"))
		found_links = list(filter(needs_refresh, found_links))
		found_links = found_links[:CONFIG.get("max_search")]
		shuffle(found_links)
		logging.info("found {} mods to use".format(len(found_links)))

		scraped_data = scrape_results(found_links, CONFIG.get("parser_processes"))
		if len(scraped_data) > 0:
			logging.debug("everything scraped")
		for mod_record in scraped_data:
		# TODO - would be nice to have a db writer thread and some kind of mpsc for other threads to send to it (see the sketch after this example)
			db.update_or_create(mod_record)

	logging.info("completed in: {}".format(time.time() - start))
Example #5
    def __init__(self, reproduction_rate, start_url=None):
        super().__init__()

        self.last_pos = None
        self.current_pos = None
        self.current_urls = list()
        self.reproduced = False
        self.die_reason = "unknown"

        self.id = Crawler.id
        Crawler.id += 1

        self.reproduction_rate = reproduction_rate
        self.per_page = int(CONFIG.get("CRAWLER", "reproductionPerPage"))
        self.hop_count = int(CONFIG.get("CRAWLER", "hopCount"))
        self.retries = int(CONFIG.get("CRAWLER", "retries"))

        if start_url is None:
            self.start_url = CONFIG.get("CRAWLER", "startUrl")
        else:
            self.start_url = start_url

        CrawlerAPI.get_instance().register(self)
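CrawlerAPI is not shown in this listing; the get_instance() call suggests a lazily created singleton that keeps a registry of live crawlers. A minimal sketch of what such a class might look like (its internal crawler list is an assumption):

class CrawlerAPI:
    _instance = None

    @classmethod
    def get_instance(cls):
        # Create the single shared instance on first use.
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self.crawlers = []

    def register(self, crawler):
        self.crawlers.append(crawler)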
Example #6
def needs_refresh(link: str) -> bool:
    with DB() as db:
        found_link = db.get_cache_info(link)

        if not found_link:
            logging.debug("added mod %s" % link)
            return True
        elif int(time.time()) - found_link[1] > CONFIG.get("cache_timeout"):
            logging.debug("cache expired on %s" % link)
            return True
        else:
            logging.debug("ignoring ID already in db %d" % found_link[0])
            return False
Example #7
    def __init__(self, worker_type):
        self.thread_count = int(CONFIG.get("WORKER", "threads"))
        self.worker_type = worker_type

        self.workers = list()
        self.subqueues = list()
Example #8
 def __enter__(self):
     self.conn = sqlite3.connect(CONFIG.get("db_location"))
     self.cur = self.conn.cursor()
     self.create()
     return self
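Only __enter__ is shown here; for the with database.DB() as db: usage in main() to release the connection, the class also needs a matching __exit__. A sketch of the likely counterpart (the commit-on-success / rollback-on-error policy is an assumption):

 def __exit__(self, exc_type, exc_value, traceback):
     if exc_type is None:
         self.conn.commit()    # persist changes on a clean exit
     else:
         self.conn.rollback()  # discard partial work if an exception escaped
     self.conn.close()
     return False              # do not swallow the exception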