def _check_general_url(self, url):
    """Checks a URL for sanity.

    Returns True if the URL is sane, otherwise False.

    >>> from ResourceChecker import ResourceChecker
    >>> resource_checker = ResourceChecker()
    >>> resource_checker._check_general_url("http://example.com")
    True
    """
    url_validator = UrlValidator()
    sanity = url_validator.validate(url)
    return sanity
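# Usage sketch (assumes ResourceChecker is importable as in the doctest
# above): filter a list of candidate URLs down to the sane ones before
# fetching them.
if __name__ == "__main__":
    from ResourceChecker import ResourceChecker
    checker = ResourceChecker()
    candidates = ["http://example.com", "not a url"]
    print [u for u in candidates if checker._check_general_url(u)]
    # expected: ['http://example.com']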
import csv
import sys
import uuid

import xlrd
import xlwt

# ImageCompressor and UrlValidator are project-local helpers; their
# imports are assumed to be available in this module.


class FileProcessor:
    filetype = None
    filepath = None
    output_filename = None
    output_filepath = None
    output_imagepath = None
    img_comp = None
    url_valid = None

    def __init__(self, _filepath, _filetype, _output_filepath):
        self.filepath = _filepath
        self.filetype = _filetype
        self.output_filepath = _output_filepath
        self.output_imagepath = _output_filepath
        # Results go to a uniquely named file under tmp_files/.
        self.output_filename = str(uuid.uuid4()) + "." + self.filetype
        self.output_filepath += "tmp_files/" + self.output_filename
        self.img_comp = ImageCompressor()
        self.url_valid = UrlValidator()
        # Instance attribute, so the list is not shared across instances.
        self.url_list = []

    def process(self):
        """Dispatch on file type and return the output file path."""
        if self.filetype == 'txt':
            self.processTextFile()
        elif self.filetype == 'csv':
            self.processCsvFile()
        elif self.filetype == 'xls' or self.filetype == 'xlsx':  # was self.fileType (typo)
            self.processXlsFile()
        return self.output_filepath

    def processTextFile(self):
        # One URL per line; keep only the lines that validate as URLs.
        with open(self.filepath, 'r') as f:
            self.url_list = f.read().split("\n")
        self.url_list = [x.strip() for x in self.url_list
                         if self.url_valid.isUrl(x)]
        # Write links to the compressed images, one per line.
        with open(self.output_filepath, 'w') as f:
            for url in self.url_list:
                output_url = self.img_comp.compress(url, str(uuid.uuid4()),
                                                    self.output_imagepath)
                f.write(output_url + "\n")

    def processCsvFile(self):
        # Copy the CSV through, replacing URL cells with links to the
        # compressed images.
        fw = open(self.output_filepath, 'w')
        with open(self.filepath, 'rb') as f:
            reader = csv.reader(f)
            try:
                for i, row in enumerate(reader):
                    for j, element in enumerate(row):
                        if self.url_valid.isUrl(element):
                            output_url = self.img_comp.compress(
                                element, str(uuid.uuid4()), self.output_imagepath)
                            fw.write(output_url)
                        else:
                            fw.write(element)
                        # Compare positions, not values: the original
                        # `element == row[-1]` check broke rows where an
                        # inner cell equalled the last cell.
                        if j == len(row) - 1:
                            fw.write("\n")
                        else:
                            fw.write(",")
            except csv.Error as e:
                sys.exit('file %s, line %d: %s'
                         % (self.filepath, reader.line_num, e))
        fw.close()

    def processXlsFile(self):
        # Read URLs from column A of 'Sheet1' and write compressed-image
        # links to the same cells of a new workbook.
        workbook = xlrd.open_workbook(self.filepath)
        worksheet = workbook.sheet_by_name('Sheet1')
        new_workbook = xlwt.Workbook()
        new_sheet = new_workbook.add_sheet('Sheet1')
        # Iterate over the actual rows (the original hard-coded 100 rows
        # and swallowed IndexError for shorter sheets).
        for row in range(worksheet.nrows):
            element = worksheet.cell(row, 0).value
            if element == xlrd.empty_cell.value:
                continue
            if self.url_valid.isUrl(element):
                output_url = self.img_comp.compress(
                    element, str(uuid.uuid4()), self.output_imagepath)
                new_sheet.write(row, 0, output_url)
        # Save to the full output path so process() returns the right file
        # (was self.output_filename, which wrote into the working directory).
        new_workbook.save(self.output_filepath)
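# Usage sketch (illustrative; not part of the original module). Processes a
# plain-text file of URLs. Assumptions: the output path ends with a slash
# (the constructor concatenates "tmp_files/" onto it) and the tmp_files/
# directory already exists, since __init__ only builds the path string.
if __name__ == "__main__":
    processor = FileProcessor("urls.txt", "txt", "/var/www/output/")
    result_path = processor.process()
    # result_path is /var/www/output/tmp_files/<uuid>.txt, containing one
    # compressed-image link per valid URL in the input file.
    print result_path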
import sys
import traceback

import psycopg2

# Config, NetShelve, and UrlValidator are project-local modules; their
# imports are assumed to be available in this module.


class CrawlerConfig(Config):

    def __init__(self):
        Config.__init__(self)
        self.UserAgentString = "UCI Inf141-CS121 crawler 63393716 32393047 22863530 82181685"
        self.PolitenessDelay = 600
        # Timeout (seconds) for trying to get the next url from the frontier.
        self.FrontierTimeOut = 60
        # Timeout (seconds) for trying to get a free worker thread
        # (worker is taking too long, maybe?).
        self.WorkerTimeOut = 60
        # Timeout (seconds) for getting data from the output queue.
        self.OutBufferTimeOut = 60
        self.MaxQueueSize = 100
        self.urlValidator = UrlValidator(verbose=False)
        self.dbConf = open('db.conf').read()
        self.conn = self.connectDatabase()
        print "Using Postgres shelve implementation..."
        # The shelve gets its own connection, separate from self.conn.
        self.PersistenceObject = NetShelve.PgShelve(self.connectDatabase())

    def connectDatabase(self):
        try:
            conn = psycopg2.connect(self.dbConf)
            # This message was unreachable after the original `return`.
            print "Connected to database..."
            return conn
        except Exception:
            traceback.print_exc()
            print "Could not connect to database, exiting."
            print "Please close manually if it doesn't exit..."
            sys.exit(1)

    def GetSeeds(self):
        '''Returns the first set of urls to start crawling from.'''
        return ["http://www.ics.uci.edu/"]

    def HandleData(self, parsedData):
        '''Function to handle url data. Guaranteed to be thread safe.
        parsedData = {"url" : "url", "text" : "text data from html",
                      "html" : "raw html data"}
        Advisable to make this function light. Data can be massaged
        later. Storing data probably is more important.'''
        cur = None
        # Bind url before the try block so the generic error handler
        # below can always reference it.
        url = str(parsedData["url"])
        try:
            self.conn.rollback()
            text = str(parsedData["text"].encode('utf-8'))
            cur = self.conn.cursor()
            query = cur.mogrify("UPDATE PAGES SET TEXT = %s WHERE URL = %s",
                                (text, url))
            cur.execute(query)
            self.conn.commit()
            print "Saved data: " + url
        except psycopg2.IntegrityError:
            traceback.print_exc()
            self.conn.rollback()
        except psycopg2.InterfaceError:
            print "Connection reset"
            self.conn = self.connectDatabase()
        except Exception:
            print "Error saving URL: " + url
            traceback.print_exc()
            try:
                self.conn.rollback()
                print "Rolled back transaction"
            except Exception:
                print "Failed to rollback transaction"
        finally:
            if cur is not None:
                cur.close()

    def ValidUrl(self, url):
        '''Function to determine if the url is a valid url that should
        be fetched or not.'''
        return self.urlValidator.allows(url)
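# Usage sketch (hypothetical driver; the crawler framework that consumes
# this Config subclass is not shown in this excerpt). Assumes a db.conf
# file with a psycopg2 DSN (e.g. "dbname=crawler user=crawler") in the
# working directory and a PAGES(URL, TEXT) table in the database.
if __name__ == "__main__":
    config = CrawlerConfig()
    seeds = config.GetSeeds()
    print [url for url in seeds if config.ValidUrl(url)]
    config.HandleData({"url": seeds[0],
                       "text": u"sample page text",
                       "html": "<html></html>"})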