class TestDatabase(unittest.TestCase): @classmethod def setUpClass(cls): json_raw = open(os.path.dirname(os.path.realpath(__file__)) + "/config.json") cls._config = json.loads(json_raw.read()) cls._config['database']['reset'] = True cls.db = Database(cls._config['database']) @classmethod def tearDownClass(cls): os.remove("test.db") def testInit(self): self.assertIsInstance(self.db, Database) self.assertIsInstance(self.db._dbh, sqlite3.Connection) self.assertTrue(self.db._tables_exist()) cfg2 = self._config['database'] cfg2['server'] = "test2.db" cfg2['reset'] = False with Database(cfg2) as db2: self.assertIsInstance(db2, Database) self.assertTrue(db2._tables_exist()) cfg2['type'] = 'badtype' self.assertRaises(Exception, Database, cfg2) os.remove('test2.db') def testGetComicConfig(self): if not self.db.comic_exists("test"): self.db.insert_comic("test", "test", "test", "test", "test", 0, 0, "test", "test") result = self.db.get_comic_config("test") self.assertEquals(result['name'], "test") result2 = self.db.get_comic_config("bad") self.assertIsNone(result2) def testComicInsert(self): self.db = Database(self._config['database']) self.assertFalse(self.db.comic_exists("bad")) self.db.insert_comic("test", "test", "test", "test", "test", 0, 0, "test", "test") self.assertTrue(self.db.comic_exists("test")) def testFileInsert(self): if not self.db.comic_exists("test"): self.db.insert_comic("test", "test", "test", "test", "test", 0, 0, "test", "test") self.db.insert_file("test", "test", "test", "test") result = self.db._dbh.execute("select * from files where comic='test'").fetchall() self.assertEquals(len(result), 1) self.assertEquals(result[0][0], 'test') def testSetLast(self): if not self.db.comic_exists("test"): self.db.insert_comic("test", "test", "test", "test", "test", 0, 0, "test", "test") self.db.set_last("test", "lasturl") result = self.db._dbh.execute("select * from comics").fetchall() self.assertEquals(result[0][9], "lasturl")
class Download: def __init__(self, config, database): self.config = config self._dbh = Database(database) if not os.path.exists(self.config['folder']): logging.info("Creating folder '%s'", self.config['folder']) os.makedirs(self.config['folder']) elif not os.path.isdir(self.config['folder']): raise Exception("Comic folder {0} is not a directory".format(self.config['folder'])) def create_static(self, perPage, template): logging.info("Creating static site for %s", self.config['name']) self.staticFiles = set() current = self.config['startUrl'] last = "" pageComics = [] pageNum = 1 while True: logging.info("Searching '%s' for files", current) resp = urllib2.urlopen(current) page = resp.read() files = self._download(current, page, True) logging.debug("Downloaded %d images", len(files)) pageComics.extend(files) nxt = self._get_next(current, page) logging.debug("Found next page '%s'", nxt) last = current current = nxt if len(pageComics) > perPage: self._write_page(pageNum, pageComics) pageComics = [] pageNum += 1 if last == self.config['baseUrl'] or last == current or current == '' or current == None or re.search("#$", current): logging.debug("Ending crawl_comic: last: %s, current: %s, nxt: %s", last, current, nxt) if len(pageComics) > 0: self._write_page(pageNum, pageComics, True) break; def crawl_comic(self): comicConfig = self._dbh.get_comic_config(self.config['name']) # Create a new entry for new comics if comicConfig == None: self._dbh.insert_comic( self.config['name'], self.config['description'], self.config['folder'], self.config['nextRegex'], self.config['comicRegex'], self.config['notesRegex'], self.config['altText'], self.config['baseUrl'], self.config['startUrl'] ) # If the comic already exists, use the data in the databse else: self.config = comicConfig # If there is a last comic to start from, start from there rather than from the beginning current = self.config['lastUrl'] if "lastUrl" in self.config.keys() else self.config['startUrl'] last = "" while True: logging.info("Searching '%s' for files", current) resp = urllib2.urlopen(current) page = resp.read() files = self._download(current, page) logging.debug("Downloaded %d images", len(files)) nxt = self._get_next(current, page) logging.debug("Found next page '%s'", nxt) last = current current = nxt self._dbh.set_last(self.config['name'], last) if last == self.config['baseUrl'] or last == current or current == '' or current == None or re.search("#$", current): logging.debug("Ending crawl_comic: last: %s, current: %s, nxt: %s", last, current, nxt) break; def _get_next(self, url, page): linkSearch = re.search( #TODO: Fix multiline search to be properly non-greedy (may be a bug in python's re implementation) r"<a[^>]+?href\s*=\s*[\'\"](.+?)[\'\"].*?" + self.config['nextRegex'] + r".*?</a>", page, re.IGNORECASE ) logging.debug("_get_next searching for %s", linkSearch.re.pattern) #logging.debug("_get_next searching through:\n %s", linkSearch.string) logging.debug("_get_next matched whole string '%s'", linkSearch.group(0)) if linkSearch != None: logging.debug("_get_next found: %s", pprint.pformat(linkSearch.groups())) link = linkSearch.group(1) absSearch = re.match(r"http:", link) rootSearch = re.match(r"/", link) if absSearch: return link elif rootSearch: stripped = re.match(r"(http://[^/]*?)/", url) return stripped.group(1) + link else: stripped = re.match(r"(http://.*/)[^/]*", url) return stripped.group(1) + link else: return None def _download(self, url, page, static=False): logging.debug("Downloaing from " + page) comics = OrderedSet(re.findall(self.config['comicRegex'], page, re.MULTILINE)) files = [] for c in comics: if re.search(r"\.ico$", c): continue dlUrl = "" absSearch = re.match("http:", c) relSearch = re.match(r"^\/?([^/]*?)", c) if absSearch: dlUrl = c elif relSearch: stripped = re.match(r"(http:\/\/[^\/]+?)", url) dlUrl = stripped.group(1) + relSearch.group(1) else: stripped = re.match(r"(http:\/\/.*\/)[^\/]*", url) dlUrl = stripped.group(1) + relSearch.group(1) fileMatch = re.search(r".*\/([^\/?]+)", c) filename = os.path.join(self.config['folder'], fileMatch.group(1)) # For static download, skip existing files if static and filename in self.staticFiles: continue logging.info("Downloading '%s' as '%s'", dlUrl, filename) urllib.urlretrieve(dlUrl, filename) if static: self.staticFiles.add(filename) altText = None # Grab alt text from the image if the config specifies it if self.config['altText'] == True: altSearch = re.search(r"<img.*?alt(?:\s*=\s*[\'\"](.*?)[\'\"])[^>]*" + self.config['comicRegex'] + r"[^>]*/\s*>", page) if altSearch != None: altText = altSearch.group(1) # Last two params are for alt text and notes on the file if not static and not self._dbh.insert_file(self.config['name'], filename, altText, None): raise Exception("Unable to continue due to a database error") files.append({"filename": filename, "alt": altText}) return files def _write_page(self, page, comics, last=False): logging.info("Writing static page %s for %s", page, self.config['name']) filename = "page-{0}.html".format(page) f = open(self.config['static']['template'], "r") template = f.read() f.close() template = re.sub(r"@@@title@@@", self.config['name'], template) template = re.sub(r"@@@page@@@", "Page {0}".format(page), template) html = "<ul>" for c in comics: nameSearch = re.search(".*/(.*)$", c['filename']) html += "<li>" html += '<img src="../{0}" alt="{1}"/>'.format(nameSearch.group(1), c['alt']) html += "</li>" html += "</ul>" template = re.sub(r"@@@comics@@@", html, template) if page == 1: filename = "index.html" template = re.sub(r"@@@first@@@", "disabled", template) template = re.sub(r"@@@prev_page@@@", "#", template) else: template = re.sub(r"@@@first@@@", "", template) if page == 2: template = re.sub(r"@@@prev_page@@@", "index.html", template) else: template = re.sub(r"@@@prev_page@@@", "page-{0}.html".format(page - 1), template) if last: template = re.sub(r"@@@next_page@@@", "#", template) template = re.sub(r"@@@last@@@", "disabled", template) else: template = re.sub(r"@@@next_page@@@", "page-{0}.html".format(page + 1), template) template = re.sub(r"@@@last@@@", "", template) filename = os.path.join(self.config['static']['htmlDir'], filename) f = open(filename, "w") f.write(template) f.close()