Example no. 1
    def _check_general_url(self, url):
        """Checks an URL for sanity. Returns True if the URL is sane, otherwise
        False.
        >>> from ResourceChecker import ResourceChecker
        >>> resource_checker = ResourceChecker()
        >>> resource_checker._check_general_url("http://example.com")
        True"""

        url_validator = UrlValidator()
        sanity = url_validator.validate(url)

        return sanity
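A minimal sketch of exercising the doctest embedded above, assuming the method belongs to a ResourceChecker class defined in a ResourceChecker module (as the docstring implies) and that UrlValidator.validate returns a boolean:

    import doctest
    import ResourceChecker

    # Runs every doctest found in the module's docstrings, including the
    # _check_general_url example above; failures are reported on stdout.
    doctest.testmod(ResourceChecker, verbose=True)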
Example no. 2
    def __init__(self, _filepath, _filetype, _output_filepath):

        self.filepath = _filepath
        self.filetype = _filetype
        self.output_filepath = _output_filepath
        self.output_imagepath = _output_filepath

        # Write output to a uniquely named temporary file under tmp_files/.
        self.output_filename = str(uuid.uuid4()) + "." + self.filetype
        self.output_filepath += "tmp_files/" + self.output_filename

        self.img_comp = ImageCompressor()
        self.url_valid = UrlValidator()
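A minimal usage sketch for this constructor, assuming it belongs to the FileProcessor class shown in a later example and that the output directory path ends with a slash (the constructor appends "tmp_files/" to it directly); the paths below are hypothetical:

    # Hypothetical paths; a tmp_files/ directory must already exist under /data/out/.
    processor = FileProcessor("/data/links.txt", "txt", "/data/out/")
    # output_filepath now looks like "/data/out/tmp_files/<uuid4>.txt",
    # while output_imagepath still points at "/data/out/".
    print processor.output_filepath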
Example no. 3
    def _check_general_url(self, url):
        """Checks an URL for sanity. Returns True if the URL is sane, otherwise
        False.
        >>> from ResourceChecker import ResourceChecker
        >>> resource_checker = ResourceChecker()
        >>> resource_checker._check_general_url("http://example.com")
        True"""

        url_validator = UrlValidator()
        sanity = url_validator.validate(url)
        
        return sanity
Example no. 4
	def __init__(self, _filepath, _filetype, _output_filepath):
		
		self.filepath = _filepath
		self.filetype = _filetype
		self.output_filepath  = _output_filepath
		self.output_imagepath = _output_filepath

		self.output_filename = str(uuid.uuid4()) + "." + self.filetype
		self.output_filepath += "tmp_files/" + self.output_filename
		
		self.img_comp = ImageCompressor()
		self.url_valid = UrlValidator()
Example no. 5
    def __init__(self):
        Config.__init__(self)
        self.UserAgentString = "UCI Inf141-CS121 crawler 63393716 32393047 22863530 82181685"
        self.PolitenessDelay = 600

        # Timeout (seconds) for trying to get the next URL from the frontier.
        self.FrontierTimeOut = 60

        # Timeout (seconds) for trying to get a free worker thread (a worker may be taking too long).
        self.WorkerTimeOut = 60

        # Timeout (seconds) for getting data from the output queue.
        self.OutBufferTimeOut = 60

        self.MaxQueueSize = 100

        self.urlValidator = UrlValidator(verbose=False)
        self.dbConf = open('db.conf').read()
        self.conn = self.connectDatabase()
        print "Using Postgres shelve implementation..."
        self.PersistenceObject = NetShelve.PgShelve(self.connectDatabase())
Example no. 6
class FileProcessor:

    filetype = None
    filepath = None
    output_filename = None
    output_filepath = None
    output_imagepath = None
    img_comp = None
    url_valid = None

    url_list = []

    def __init__(self, _filepath, _filetype, _output_filepath):

        self.filepath = _filepath
        self.filetype = _filetype
        self.output_filepath = _output_filepath
        self.output_imagepath = _output_filepath

        # Write output to a uniquely named temporary file under tmp_files/.
        self.output_filename = str(uuid.uuid4()) + "." + self.filetype
        self.output_filepath += "tmp_files/" + self.output_filename

        self.img_comp = ImageCompressor()
        self.url_valid = UrlValidator()

    def process(self):
        if self.filetype == 'txt':
            self.processTextFile()
        elif self.filetype == 'csv':
            self.processCsvFile()
        elif self.filetype == 'xls' or self.filetype == 'xlsx':
            self.processXlsFile()
        return self.output_filepath

    def processTextFile(self):
        with open(self.filepath, 'r') as f:
            self.url_list = f.read().split("\n")
        self.url_list = [
            x.strip() for x in self.url_list if self.url_valid.isUrl(x)
        ]

        # open output file for writing links to compressed images
        f = open(self.output_filepath, 'w')
        for url in self.url_list:
            output_url = self.img_comp.compress(url, str(uuid.uuid4()),
                                                self.output_imagepath)
            f.write(output_url + "\n")
        f.close()

    def processCsvFile(self):
        fw = open(self.output_filepath, 'w')
        with open(self.filepath, 'rb') as f:
            reader = csv.reader(f)
            try:
                for row in reader:
                    # Compare column indexes rather than values so that a
                    # duplicated cell value cannot end the output row early.
                    for i, element in enumerate(row):
                        if self.url_valid.isUrl(element):
                            output_url = self.img_comp.compress(
                                element, str(uuid.uuid4()),
                                self.output_imagepath)
                            fw.write(output_url)
                        else:
                            fw.write(element)
                        if i == len(row) - 1:
                            fw.write("\n")
                        else:
                            fw.write(",")
            except csv.Error as e:
                sys.exit('file %s, line %d: %s' %
                         (self.filepath, reader.line_num, e))
        fw.close()

    def processXlsFile(self):
        workbook = xlrd.open_workbook(self.filepath)
        worksheet = workbook.sheet_by_name('Sheet1')

        new_workbook = xlwt.Workbook()
        new_sheet = new_workbook.add_sheet('Sheet1')

        for row in range(0, 100):
            try:
                element = worksheet.cell(row, 0).value
                if element == xlrd.empty_cell.value:
                    continue
                if self.url_valid.isUrl(element):
                    output_url = self.img_comp.compress(
                        element, str(uuid.uuid4()), self.output_imagepath)
                    new_sheet.write(row, 0, output_url)
            except IndexError:
                # Row index beyond the sheet's last row; skip it.
                pass

        # Save to the full output path so that process() returns a file that exists.
        new_workbook.save(self.output_filepath)
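A minimal end-to-end sketch of driving the class above, assuming the module imports uuid, csv, sys, xlrd and xlwt and provides ImageCompressor and UrlValidator, and that a tmp_files/ subdirectory already exists under the output directory (all paths are hypothetical):

    # Pick the processing branch from the file extension and run it.
    processor = FileProcessor("/data/urls.csv", "csv", "/data/out/")
    result_path = processor.process()
    print "Links to compressed images written to: " + result_path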
Example no. 7
class CrawlerConfig(Config):
    def __init__(self):
        Config.__init__(self)
        self.UserAgentString = "UCI Inf141-CS121 crawler 63393716 32393047 22863530 82181685"
        self.PolitenessDelay = 600

        # Timeout (seconds) for trying to get the next URL from the frontier.
        self.FrontierTimeOut = 60

        # Timeout (seconds) for trying to get a free worker thread (a worker may be taking too long).
        self.WorkerTimeOut = 60

        # Timeout (seconds) for getting data from the output queue.
        self.OutBufferTimeOut = 60

        self.MaxQueueSize = 100

        self.urlValidator = UrlValidator(verbose=False)
        self.dbConf = open('db.conf').read()
        self.conn = self.connectDatabase()
        print "Using Postgres shelve implementation..."
        self.PersistenceObject = NetShelve.PgShelve(self.connectDatabase())

    def connectDatabase(self):
        try:
            conn = psycopg2.connect(self.dbConf)
            print "Connected to database..."
            return conn
        except Exception:
            traceback.print_exc()
            print "Could not connect to database, exiting."
            print "Please close manually if it doesn't exit..."
            sys.exit(1)

    def GetSeeds(self):
        '''Returns the first set of urls to start crawling from'''
        return ["http://www.ics.uci.edu/"]

    def HandleData(self, parsedData):
        '''Handle url data. Guaranteed to be thread safe.
        parsedData = {"url" : "url", "text" : "text data from html", "html" : "raw html data"}
        Keep this function light: data can be massaged later, so storing it quickly is what matters most.'''
        cur = None
        try:
            self.conn.rollback()
            url = str(parsedData["url"])
            text = str(parsedData["text"].encode('utf-8'))
            cur = self.conn.cursor()
            query = cur.mogrify("UPDATE PAGES SET TEXT = %s WHERE URL = %s", (text, url))
            cur.execute(query)
            self.conn.commit()
            print "Saved data: "+parsedData["url"]
        except psycopg2.IntegrityError:
            traceback.print_exc()
            self.conn.rollback()
        except psycopg2.InterfaceError:
            print "Connection reset"
            self.conn = self.connectDatabase()
        except Exception:
            print "Error saving URL: "+url
            traceback.print_exc()
            try:
                self.conn.rollback()
                print "Rolled back transaction"
            except Exception:
                print "Failed to rollback transaction"
        finally:
            if cur is not None:
                cur.close()

    def ValidUrl(self, url):
        '''Function to determine if the url is a valid url that should be fetched or not.'''
        return self.urlValidator.allows(url)
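A minimal sketch of exercising this config on its own, outside the crawler framework that normally consumes it, assuming a db.conf file with a valid psycopg2 connection string sits in the working directory and the PAGES table exists:

    # Constructing the config reads db.conf and opens the Postgres connection.
    config = CrawlerConfig()
    print config.GetSeeds()
    print config.ValidUrl("http://www.ics.uci.edu/about/")
    # Store the text for a URL that is already present in the PAGES table.
    config.HandleData({"url": "http://www.ics.uci.edu/",
                       "text": u"Example page text",
                       "html": "<html></html>"})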
Example no. 8
class FileProcessor:

	filetype = None
	filepath = None
	output_filename = None
	output_filepath = None
	output_imagepath = None
	img_comp = None
	url_valid = None
	
	url_list = []

	def __init__(self, _filepath, _filetype, _output_filepath):
		
		self.filepath = _filepath
		self.filetype = _filetype
		self.output_filepath  = _output_filepath
		self.output_imagepath = _output_filepath

		self.output_filename = str(uuid.uuid4()) + "." + self.filetype
		self.output_filepath += "tmp_files/" + self.output_filename
		
		self.img_comp = ImageCompressor()
		self.url_valid = UrlValidator()
		
		
	def process(self):
		if self.filetype == 'txt':
			self.processTextFile()
		elif self.filetype == 'csv':
			self.processCsvFile()
		elif self.filetype == 'xls' or self.filetype == 'xlsx':
			self.processXlsFile()
		return self.output_filepath
		
		
	def processTextFile(self):
		with open(self.filepath, 'r') as f:
			self.url_list = f.read().split("\n")
		self.url_list = [x.strip() for x in self.url_list if self.url_valid.isUrl(x)]
		
		# open output file for writing links to compressed images
		f = open(self.output_filepath, 'w')
		for url in self.url_list:
			output_url = self.img_comp.compress(url, str(uuid.uuid4()), self.output_imagepath)
			f.write(output_url + "\n")
		f.close()
			


	def processCsvFile(self):
		fw = open(self.output_filepath, 'w')
		with open(self.filepath, 'rb') as f:
			reader = csv.reader(f)
			try:
				for row in reader:
					# Compare column indexes rather than values so that a
					# duplicated cell value cannot end the output row early.
					for i, element in enumerate(row):
						if self.url_valid.isUrl(element):
							output_url = self.img_comp.compress(element, str(uuid.uuid4()), self.output_imagepath)
							fw.write(output_url)
						else:
							fw.write(element)
						if i == len(row) - 1:
							fw.write("\n")
						else:
							fw.write(",")
			except csv.Error as e:
				sys.exit('file %s, line %d: %s' % (self.filepath, reader.line_num, e))
		fw.close()
		
	def processXlsFile(self):
		workbook = xlrd.open_workbook(self.filepath)
		worksheet = workbook.sheet_by_name('Sheet1')

		new_workbook = xlwt.Workbook()
		new_sheet = new_workbook.add_sheet('Sheet1')

		for row in range(0, 100):
			try:
				element = worksheet.cell(row, 0).value
				if element == xlrd.empty_cell.value:
					continue
				if self.url_valid.isUrl(element):
					output_url = self.img_comp.compress(element, str(uuid.uuid4()), self.output_imagepath)
					new_sheet.write(row, 0, output_url)
			except IndexError:
				# Row index beyond the sheet's last row; skip it.
				pass

		# Save to the full output path so that process() returns a file that exists.
		new_workbook.save(self.output_filepath)