def initialize():
    ''' Check what we have to check and set parameters '''
    global __INITIALIZED__
    if __INITIALIZED__:
        return False
    with INIT_LOCK:
        db_check()
        if load_config():
            # Create the working directories if they do not exist yet
            dirs = [COMIC_DIR, LOG_DIR, COMIC_INT, COMIC_STATIC]
            OPS_LIST = [WEB_PORT, LOG_DIR, COMIC_DIR, COMIC_DB, COMIC_INT, COMIC_STATIC]
            for opts in OPS_LIST:
                logger.log(u'Loaded: ' + opts)
            for folder in dirs:
                dir_check(os.path.join(DATA_DIR, folder))
        __INITIALIZED__ = True
        # Report success so callers can tell a fresh init from an already-initialized state
        return True
def action(self, query, args=None):
    with db_lock:
        if query is None:
            return
        sqlResult = None
        attempt = 0
        while attempt < 5:
            try:
                if args is None:
                    logger.log(self.filename + ": " + query)
                    sqlResult = self.connection.execute(query)
                else:
                    logger.log(self.filename + ": " + query + " with args " + str(args))
                    sqlResult = self.connection.execute(query, args)
                self.connection.commit()
                # Get out of the retry loop since the query succeeded
                break
            except sqlite3.OperationalError, e:
                if "unable to open database file" in e.message or "database is locked" in e.message:
                    # The database is busy: wait a second and retry
                    attempt += 1
                    time.sleep(1)
                else:
                    raise
            except sqlite3.DatabaseError, e:
                # Fatal error executing the query; let the caller handle it
                raise
        return sqlResult
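# Hedged usage sketch for action(): DBConnection and the comic_list table come
# from this codebase, but the concrete values below are made up for
# illustration. Parameterised args avoid quoting bugs, and the returned cursor
# can be fetched like any sqlite3 cursor.
#
#   myDB = DBConnection()
#   myDB.action('UPDATE comic_list SET end_page = ? WHERE id = ?',
#               ('http://example.com/last', 1))
#   rows = myDB.action('SELECT id, path FROM comic_list').fetchall()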
def StopAllTasks(self):
    for task in comicstrip.TASK_LIST:
        logger.log(u'Stopping task ' + str(task))
        task.stop()
        task.join()
    logger.log(u'Stopped')
def upsert(self, tableName, valueDict, keyDict):
    logger.log(u'TB: ' + str(tableName) + ' val: ' + str(valueDict) + ' key: ' + str(keyDict))
    changesBefore = self.connection.total_changes
    genParams = lambda myDict: [x + " = ?" for x in myDict.keys()]
    # Try an UPDATE first; if no rows changed, fall back to an INSERT
    query = "UPDATE " + tableName + " SET " + ", ".join(genParams(valueDict)) + \
            " WHERE " + " AND ".join(genParams(keyDict))
    self.action(query, valueDict.values() + keyDict.values())
    if self.connection.total_changes == changesBefore:
        query = "INSERT INTO " + tableName + " (" + ", ".join(valueDict.keys() + keyDict.keys()) + ")" + \
                " VALUES (" + ", ".join(["?"] * len(valueDict.keys() + keyDict.keys())) + ")"
        self.action(query, valueDict.values() + keyDict.values())
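# Usage sketch for upsert(): the column names match the comic_strips queries
# used by update_engine(); the values themselves are hypothetical.
#
#   myDB.upsert('comic_strips',
#               {'page_url': 'http://example.com/42', 'location': 'mycomic/42.png'},
#               {'comic_id': 1, 'strip_no': 42})
#
# total_changes only advances when the UPDATE actually modified a row, which is
# what lets the INSERT branch detect a missing record.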
def __init__(self, **kwargs):
    ''' Set up the scheduler thread (start() actually runs it) '''
    self.action = kwargs.get('action')
    self.cycleTime = kwargs.get('cycleTime')
    self.args = kwargs.get('args')
    self.runImmediatly = kwargs.get('runImmediatly')
    self.running = 1
    if self.runImmediatly:
        # A last-run date far in the past forces the first cycle to fire straight away
        self.lastRun = datetime.datetime.fromordinal(1)
    else:
        self.lastRun = datetime.datetime.now()
    threading.Thread.__init__(self)
    logger.log(u'Thread Init: ' + str(threading.current_thread().name))
def page_find(comic_url):
    # Walk all the links on this page looking for one that leads to the next strip
    try:
        parse_page = bs(requests.get(comic_url).text)
    except Exception:
        return None
    parsed_url = urlparse.urlparse(comic_url)
    # First find all the links on the page
    for links in parse_page.findAll(['a']):
        if re.compile(".*next.*", re.IGNORECASE).match(str(links)):
            if links['href'].lower().startswith('http'):
                # Absolute link: use it as-is
                next_page = links['href']
                logger.log(u'CUR: ' + comic_url + ' NEXT: ' + next_page)
                return next_page
            if parsed_url.path and not parsed_url.query:
                parts = list(parsed_url)
                # Catch those useless '#' references back to the same page
                if not re.compile('.*#$').match(links['href']):
                    parts[2] = links['href']
                    next_page = urlparse.urlunparse(parts)
                    logger.log(u'CUR: ' + comic_url + ' NEXT: ' + next_page)
                else:
                    next_page = None
            else:
                logger.log(u'DEBUG: ' + str(parsed_url) + ' ' + str(links['href']))
                parts = list(parsed_url)
                # Catch those useless '#' references back to the same page
                if not re.compile('.*#$').match(links['href']):
                    if parsed_url.path.strip('/') not in links['href']:
                        # Looks like a query string for the same path
                        parts[4] = links['href'].strip('?')
                    else:
                        parts[2] = links['href']
                        parts[4] = None
                    next_page = urlparse.urlunparse(parts)
                    logger.log(u'CUR: ' + comic_url + ' NEXT: ' + next_page)
                else:
                    next_page = None
            return next_page
    # No 'next' link anywhere on the page
    logger.log(u'FAILED: ' + comic_url)
    return None
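# Illustrative walk using page_find(): starting from a known page it returns
# the absolute URL behind the first link whose markup mentions 'next', or None
# when no such link exists. The URL below is a placeholder, not a real comic.
#
#   page = 'http://example.com/comic/1'
#   while page is not None:
#       print page
#       page = page_find(page)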
def load_config():
    global WEB_PORT, WEB_HOST, LOG_DIR, COMIC_DIR, COMIC_DB, COMIC_INT, COMIC_STATIC
    conf = ConfigParser()
    conf.read(CFG_FILE)
    try:
        WEB_PORT = conf.get('General', 'web_port')
        WEB_HOST = conf.get('General', 'web_host')
        LOG_DIR = conf.get('General', 'log_dir')
        COMIC_DIR = conf.get('General', 'comic_dir')
        COMIC_INT = conf.get('General', 'comic_int')
        COMIC_STATIC = conf.get('General', 'comic_static')
        COMIC_DB = conf.get('General', 'comic_db')
        return True
    except Exception:
        logger.log(u'Could not load config file; creating a default one')
        save_config()
        return False
def grab_strip(comic_id, outpath, strip_no, current_url, replace=False):
    parsed = list(urlparse.urlparse(current_url))
    soup = bs(requests.get(current_url).text)
    # Loop through all the images soup finds
    for image in soup.findAll('img'):
        filename = image['src'].split('/')[-1]
        if filename.endswith(('.jpg', '.png', '.gif')):
            if image['src'].lower().startswith('http'):
                strip_img = requests.get(image['src'])
            else:
                # Relative link: rebuild an absolute URL from the page URL
                parsed[2] = image['src']
                parsed[4] = None
                strip_img = requests.get(urlparse.urlunparse(parsed))
            if strip_img.status_code == requests.codes.ok:
                s = StringIO(strip_img.content)
                strip = Image.open(s)
                w, h = strip.size
                # Size heuristic: a real strip is large in both dimensions,
                # which filters out banners, buttons and icons
                if (w > 249 and h > 320) or (w > 320 and h > 249):
                    filename = "%s%s" % (strip_no, os.path.splitext(filename)[-1])
                    save_path = os.path.join(comicstrip.COMIC_DIR, outpath)
                    path_exists(save_path)
                    save_path = os.path.join(save_path, filename)
                    db_path = os.path.join(outpath, filename)
                    logger.log(u'PAGE: ' + current_url + ' IMAGE: ' + filename)
                    # Check if the image exists already and skip saving
                    if not os.path.exists(save_path):
                        strip.save(save_path)
                    else:
                        logger.log(u'FOUND IMAGE: ' + save_path)
                    return {'strip_no': strip_no, 'page_url': current_url, 'location': db_path}
    # No suitable image found on the page
    return {'strip_no': strip_no, 'page_url': current_url, 'location': 'SKIPPED'}
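# Sketch of grab_strip()'s contract: it saves the first sufficiently large
# image from the given page and returns a dict ready to feed into
# DBConnection.upsert(). The values shown are hypothetical.
#
#   row = grab_strip(1, 'mycomic', 42, 'http://example.com/comic/42')
#   # -> {'strip_no': 42, 'page_url': 'http://example.com/comic/42',
#   #     'location': 'mycomic/42.png'}   or 'location': 'SKIPPED'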
def update_engine(comic_id=None, que=None):
    # Connect to the db
    myDB = db.DBConnection(row_type="dict")
    if comic_id is not None:
        # Bound parameter instead of string interpolation to avoid injection/quoting bugs
        results = myDB.select('SELECT id,path,first_page,end_page FROM comic_list WHERE id = (?)', (comic_id,))
    else:
        results = myDB.select('SELECT id,path,first_page,end_page FROM comic_list')
    for info in results:
        # Map of strip number -> page URL discovered this run
        url_list = dict()
        # Grab the ending page if there is one (the page where we stop looking for next pages)
        end_page = info['end_page']
        last_url = myDB.select('SELECT strip_no,page_url FROM comic_strips WHERE comic_id = (?) ORDER BY strip_no DESC LIMIT 1', (info['id'],))
        logger.log(u'LAST URL: ' + str(last_url))
        if last_url:
            # Resume from the page after the last stored strip
            page_url = page_find(last_url[0]['page_url'])
            strip_no = last_url[0]['strip_no']
        else:
            # Nothing stored yet; start from the comic's first page
            page_url = info['first_page']
            strip_no = 0
        # Follow 'next' links until we run out, loop back on ourselves, or hit the end page
        while page_url is not None and page_url not in url_list.values() and page_url != end_page:
            strip_no += 1
            url_list[strip_no] = page_url
            page_url = page_find(page_url)
        logger.log(u'URL LIST: ' + str(len(url_list)))
        for strip_no in url_list.keys():
            myDB.upsert('comic_strips',
                        grab_strip(info['id'], info['path'], strip_no, url_list[strip_no]),
                        {'comic_id': info['id'], 'strip_no': strip_no})
    myDB.connection.close()
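# Typical entry points for update_engine(), assuming comic_list is already
# populated. The scheduler thread below drives the second form on a timer.
#
#   update_engine(comic_id=3)   # refresh a single comic
#   update_engine()             # refresh every comic in comic_list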
def run(self):
    ''' Run the scheduled action whenever a cycle elapses '''
    logger.log(u'Thread Started: ' + str(threading.current_thread().name))
    while True:
        currentTime = datetime.datetime.now()
        if currentTime - self.lastRun > self.cycleTime:
            logger.log(u'Running task: ' + str(threading.current_thread().name))
            self.lastRun = currentTime
            try:
                if self.args is not None:
                    self.action(self.args)
                else:
                    self.action()
            except Exception, e:
                # Log before re-raising; raising first made this line unreachable
                logger.log(u'Exception generated in thread ' + str(e))
                raise
        if not self.running:
            return
        time.sleep(1)
def stop(self):
    logger.log(u'Sending stop signal')
    self.running = 0
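# Putting the scheduler together: a minimal sketch of starting and stopping a
# task, assuming this class subclasses threading.Thread (see __init__) and is
# registered in comicstrip.TASK_LIST so StopAllTasks() can find it. The class
# name Task and the six-hour cycle are illustrative assumptions.
#
#   task = Task(action=update_engine,
#               cycleTime=datetime.timedelta(hours=6),
#               runImmediatly=True)
#   comicstrip.TASK_LIST.append(task)
#   task.start()    # run() fires update_engine() once per cycle
#   ...
#   task.stop()     # sets running = 0; run() exits on its next tick
#   task.join()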