def add_stats_record(cls, statsd):
    """Insert one row of crawl statistics for the current project
    into the ``project_stats`` table of the user's crawls.db.

    statsd is a dict of crawl counters; its keys used here are:
    links, processed, filtered, fatal, broken, filesinrepos,
    extservers, extdirs, files, bytes and fetchtime.

    Silently returns if the sqlite3 module cannot be imported.
    """
    sqlite3 = cls.try_import()
    if sqlite3 is None:
        return

    logconsole('Writing project statistics to crawl database...')
    dbfile = os.path.join(objects.config.userdbdir, "crawls.db")
    conn = sqlite3.connect(dbfile)
    # Ensure the connection is released even if the insert fails
    # (the original leaked the connection, closing only the cursor).
    try:
        c = conn.cursor()
        t = (cls.projid,
             statsd['links'],
             statsd['processed'],
             statsd['filtered'],
             statsd['fatal'],
             statsd['broken'],
             statsd['filesinrepos'],
             # NOTE(review): the +1 offsets were in the original code —
             # presumably to count the starting server/directory itself;
             # confirm against the stats producer.
             statsd['extservers'] + 1,
             statsd['extdirs'] + 1,
             statsd['files'],
             statsd['bytes'],
             '%.2f' % statsd['fetchtime'])
        c.execute("insert into project_stats values(?,?,?,?,?,?,?,?,?,?,?,?)", t)
        conn.commit()
        c.close()
    finally:
        conn.close()
def find_broken_links(self, event, *args, **kwargs):
    """Collect all URLs with HTTP status 404 from the crawl's URL
    database into ``self.broken`` and write them to a report file
    named ``404#<hash-of-base-url>.txt`` in the current directory.

    Always returns False (event-handler convention — presumably
    "do not consume the event"; confirm against the event dispatcher).
    """
    urldb = objects.datamgr.get_urldb()
    for node in urldb.preorder():
        urlobj = node.get()
        if urlobj.status == 404:
            self.broken.append(urlobj.get_full_url())

    # Write the collected broken links to a report file.
    baseurl = objects.queuemgr.get_base_url()
    fname = '404#' + str(hash(baseurl)) + '.txt'
    logconsole('Writing broken links to',fname)
    # Context manager guarantees the file handle is closed even if a
    # write raises (the original leaked the handle on error).
    with open(fname, 'w') as f:
        f.write("Broken links for crawl starting with URL %s\n\n" % baseurl)
        for link in self.broken:
            f.write(link + '\n')

    return False
def create_user_database(cls):
    """Create the user's crawl database file (crawls.db) with the
    ``projects`` and ``project_stats`` tables.

    Silently returns if the sqlite3 module cannot be imported.
    """
    sqlite3 = cls.try_import()
    if sqlite3 is None:
        return

    logconsole("Creating user's crawl database file in %s..." % objects.config.userdbdir)
    dbfile = os.path.join(objects.config.userdbdir, "crawls.db")
    conn = sqlite3.connect(dbfile)
    # Close the connection even on error (the original closed only
    # the cursor, leaking the connection) and commit the DDL.
    try:
        c = conn.cursor()
        # Create table for projects
        # This line is causing a problem in darwin
        # c.execute("drop table if exists projects")
        c.execute("""create table projects (id integer primary key autoincrement default 0, time real, name text, url str, config str)""")
        # Create table for project statistics
        # We are storing the information for
        # 1. number of urls scanned
        # 2. number of urls processed (fetched/crawled)
        # 3. number of URLs which were crawl-filtered
        # 4. number of urls failed to fetch
        # 5. number of urls with 404 errors
        # 6. number of URLs which hit the cache
        # 7. number of servers scanned
        # 8. number of unique directories scanned
        # 9. number of files saved
        # 10. Amount of data fetched in bytes
        # 11. the total time for the crawl.
        # This line is causing a problem in darwin
        # c.execute("drop table project_stats")
        c.execute("""create table project_stats (project_id integer primary key, urls integer, procurls integer, filteredurls integer, failedurls integer, brokenurls integer, cacheurls integer, servers integer, directories integer, files integer, data real, duration text)""")
        conn.commit()
        c.close()
    finally:
        conn.close()
def create_user_database(cls):
    """Create the user's crawl database file (crawls.db) with the
    ``projects`` and ``project_stats`` tables.

    Silently returns if the sqlite3 module cannot be imported.

    NOTE(review): this is a duplicate definition of create_user_database
    (an identical one appears earlier in the file); the later definition
    wins at class-creation time — consider removing one copy.
    """
    sqlite3 = cls.try_import()
    if sqlite3 is None:
        return

    logconsole("Creating user's crawl database file in %s..." % objects.config.userdbdir)
    dbfile = os.path.join(objects.config.userdbdir, "crawls.db")
    conn = sqlite3.connect(dbfile)
    # Close the connection even on error (the original closed only
    # the cursor, leaking the connection) and commit the DDL.
    try:
        c = conn.cursor()
        # Create table for projects
        # This line is causing a problem in darwin
        # c.execute("drop table if exists projects")
        c.execute("""create table projects (id integer primary key autoincrement default 0, time real, name text, url str, config str)""")
        # Create table for project statistics
        # We are storing the information for
        # 1. number of urls scanned
        # 2. number of urls processed (fetched/crawled)
        # 3. number of URLs which were crawl-filtered
        # 4. number of urls failed to fetch
        # 5. number of urls with 404 errors
        # 6. number of URLs which hit the cache
        # 7. number of servers scanned
        # 8. number of unique directories scanned
        # 9. number of files saved
        # 10. Amount of data fetched in bytes
        # 11. the total time for the crawl.
        # This line is causing a problem in darwin
        # c.execute("drop table project_stats")
        c.execute("""create table project_stats (project_id integer primary key, urls integer, procurls integer, filteredurls integer, failedurls integer, brokenurls integer, cacheurls integer, servers integer, directories integer, files integer, data real, duration text)""")
        conn.commit()
        c.close()
    finally:
        conn.close()