def save_xhtml(self, xhtml_d):
    """Write one fetched page into ``self.save_path`` and report the outcome.

    The file is named ``<netloc>.<file_counter>.html``. ``self.file_counter``
    is incremented on every call, including calls that end in an error.

    Args:
        xhtml_d: Mapping read for the keys ``'netloc'`` (used to build the
            file name) and ``'xhtml_s'`` (the text written to the file).

    Returns:
        True when the page was written; None when the file could not be
        created or written, so the caller can decide what to do about it.

    Raises:
        AssertionError: If ``self.save_path`` is not an existing directory.
    """
    assert os.path.isdir(self.save_path)
    self.file_counter += 1

    # Read/write permission for user, group and others (subject to the umask).
    rw_all = (stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH |
              stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
    try:
        file_path = (self.save_path + str(xhtml_d['netloc']) + "." +
                     str(self.file_counter) + ".html")
        # NOTE(review): no O_TRUNC, so writing over an existing, longer file
        # leaves its old tail in place - confirm whether that is intended.
        fd = os.open(file_path, os.O_CREAT | os.O_WRONLY, rw_all)
    except Exception as e:
        # Stop the process first, then report the failure. Returning here
        # (instead of falling through to a shared cleanup step) avoids closing
        # a file object that was never created.
        self.kill_evt.set()
        print("SCSpider error while Creating file - Error: %s" % e)
        return None

    try:
        # Wrap the descriptor in a line-buffered file object; the ``with``
        # block closes it (and the descriptor) whether or not the write works.
        with os.fdopen(fd, "w", 1) as fobj:
            fobj.write(xhtml_d['xhtml_s'])  # Write the source to file
    except Exception as e:
        print("SCSpider Error while Writing file - Error: %s" % e)
        return None
    return True
def __load_dict(self, filename=None):
    """Load a file of seen URLs into a dictionary.

    Every line of ``self.filespath + filename`` becomes a key (with trailing
    whitespace removed) mapped to True.

    Args:
        filename: Name of the file, relative to ``self.filespath``.

    Returns:
        The dictionary of seen URLs, or None when the file could not be
        opened or read (including when ``filename`` is left as None).
    """
    import io  # local import: the file's top-level import block is not visible here

    try:
        # Decode as UTF-8 while reading, so lines arrive as text directly.
        fobj = io.open(self.filespath + filename, "r", encoding="utf-8")
    except Exception as e:
        print("DUE Unit: Error while Opening file - Error: %s" % e)
        # Return None instead of a dictionary. Returning here avoids touching
        # a file object that was never created.
        return None

    seen_dict = dict()
    try:
        for fileline in fobj:
            # Remove trailing whitespace before using the line as a key
            seen_dict[fileline.rstrip()] = True
    except Exception as e:
        print("DUE Unit: Exception occurred while loading file - Error: %s" % e)
        # Notify the caller that something went wrong
        seen_dict = None
    finally:
        # Close the file in any case
        fobj.close()
    return seen_dict
def open_tail(self, path, go_to_end=False):
    """Open ``path`` for tailing and register a watch for it.

    Args:
        path: Path of the file to open and watch.
        go_to_end: When true, move the read position to the end of the file
            before returning.

    Returns:
        A ``Tail.FileTail`` with ``file_descriptor``, ``path`` and
        ``watch_descriptor`` set.

    Raises:
        OSError: If the file cannot be opened or positioned.
        Exception: Whatever ``add_watch`` raises, or KeyError when its result
            has no entry for ``path``. The descriptor opened here is closed
            before the exception propagates.
    """
    tail = Tail.FileTail()
    tail.file_descriptor = os.open(path, os.O_RDONLY | os.O_NONBLOCK)
    tail.path = path
    try:
        if go_to_end:
            os.lseek(tail.file_descriptor, 0, os.SEEK_END)
        # add_watch's result is used as a mapping keyed by path
        # (presumably path -> watch descriptor; verify against the watch manager).
        watches = self.watch_manager.add_watch(
            path, INOTIFY_FILE_MASK, proc_fun=self._inotify_file)
        tail.watch_descriptor = watches.pop(path)
    except Exception:
        # A raw descriptor is not closed automatically; without this it would
        # leak every time setting up the watch fails.
        os.close(tail.file_descriptor)
        raise
    return tail
def savetofile(self, filename=None, file_headers=True):
    """Store the whole seen-URL dictionary on disk and empty it.

    Intended to be called from outside (e.g. by a monitoring process) when
    the crawler is short of main memory; the number of dictionary records is
    the recommended criterion.

    On success the file name is appended to ``self.filelist`` and
    ``self.seen`` is cleared. On failure neither is modified.

    Args:
        filename: Name of the file, relative to ``self.filespath``. When
            empty, ``<netloc>.<len(self.filelist)>.seenurls`` is used.
        file_headers: When true, a ``BASE URL: <netloc>/`` line is written
            before the URLs.

    Returns:
        True when everything went OK; None when the file could not be
        created or written, so the caller can decide what to do about it.
    """
    import io  # local import: the file's top-level import block is not visible here

    if not filename:
        filename = (str(self.base_url['netloc']) + "." +
                    str(len(self.filelist)) + ".seenurls")

    # Read/write permission for user, group and others (subject to the umask).
    rw_all = (stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH |
              stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
    try:
        # NOTE(review): no O_TRUNC, so writing over an existing, longer file
        # leaves its old tail in place - confirm whether that is intended.
        fd = os.open(self.filespath + filename, os.O_CREAT | os.O_WRONLY, rw_all)
    except Exception as e:
        print("DUE Unit: Error while Creating file - Error: %s" % e)
        # Returning here avoids closing a file object that was never created.
        return None

    try:
        # Wrap the descriptor in a text file that encodes to UTF-8 on write;
        # the ``with`` block closes it (and the descriptor) in every case.
        with io.open(fd, "w", encoding="utf-8") as fobj:
            if file_headers:
                fobj.write("BASE URL: " + str(self.base_url['netloc']) + "/\n")
            fobj.writelines(str(url) + "\n" for url in self.seen)
    except Exception as e:
        print("DUE Unit: Error while Saving file - Error: %s" % e)
        return None

    # Only after the data is safely written and the file closed:
    # record the new file name and clear the seen dictionary.
    self.filelist.append(str(filename))
    self.seen.clear()
    return True
else: os._exit(0) print "(%s) now daemonized" % (os.getpid(), ) # Close _all_ open (and othewise!) files. import resource maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1] if maxfd == resource.RLIM_INFINITY: maxfd = 4096 for fdnum in xrange(maxfd): try: os.close(fdnum) except OSError, e: if e.errno != errno.EBADF: raise # Remap std{in,out,err} devnull = os.open(os.path.devnull, os.O_RDWR) oflags = os.O_WRONLY | os.O_CREAT | os.O_APPEND if devnull != 0: # stdin os.dup2(devnull, 0) if options.stdout: stdout_fd = os.open(options.stdout, oflags) if stdout_fd != 1: os.dup2(stdout_fd, 1) os.close(stdout_fd) else: os.dup2(devnull, 1) if options.stderr: stderr_fd = os.open(options.stderr, oflags) if stderr_fd != 2: os.dup2(stderr_fd, 2) os.close(stderr_fd)
else: os._exit(0) print "(%s) now daemonized" % (os.getpid(),) # Close _all_ open (and othewise!) files. import resource maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1] if maxfd == resource.RLIM_INFINITY: maxfd = 4096 for fdnum in xrange(maxfd): try: os.close(fdnum) except OSError, e: if e.errno != errno.EBADF: raise # Remap std{in,out,err} devnull = os.open(os.path.devnull, os.O_RDWR) oflags = os.O_WRONLY | os.O_CREAT | os.O_APPEND if devnull != 0: # stdin os.dup2(devnull, 0) if options.stdout: stdout_fd = os.open(options.stdout, oflags) if stdout_fd != 1: os.dup2(stdout_fd, 1) os.close(stdout_fd) else: os.dup2(devnull, 1) if options.stderr: stderr_fd = os.open(options.stderr, oflags) if stderr_fd != 2: os.dup2(stderr_fd, 2) os.close(stderr_fd)