def __init__(self, N, R, width):
    ###################################
    # SET UP INFRASTRUCTURE
    ##################################
    self.running = True
    self.schedule = RandomActivation(self)
    self.grid = MultiGrid(width, width, True)
    self.positions = list(product(range(width), repeat=2))

    ##########################################
    # SET UP AGENTS - Resources and Countries
    #########################################
    for i in range(N):
        a = c.Country(i, self)
        self.schedule.add(a)
        pos = self.get_position()
        self.grid.place_agent(a, pos)
    self.area_owned = R
    for i in range(R):
        res = r.Resource(i, self, random.randint(1, 6), "")
        pos = self.get_position()
        self.grid.place_agent(res, pos)

    ##########################################
    # SET UP DATA COLLECTOR
    ###########################################
    self.datacollector = DataCollector(
        agent_reporters={"Scaling A": "scaling_a",
                         "Scaling B": "scaling_b",
                         "Capacity": "capacity",
                         "Land Owned": "land_owned",
                         "Conquered": "conquered"})
def parse(self, line, batch=0):
    """Parse a line to get a resource.

    Make sure there is no ending '\n' in the input line.

    Args:
        line: input string without ending '\n'
    Returns:
        a resource to crawl (None on failure)
    """
    if line.endswith('\n'):
        return None
    parts = line.split('\t')
    if len(parts) == 1:
        url = parts[0]
        code = None
    elif len(parts) == 2:
        # e.g. "12345<TAB>http://www.cse.psu.edu"
        code, url = parts
    else:
        return None
    try:
        r = resource.Resource(code, None, url, True, 0, batch)
    except BadResourceError:
        r = None
    return r
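# Usage sketch (not part of the original source). The class that owns
# parse() is not shown, so "LineParser" is a hypothetical name, and the
# Resource signature is assumed from the call inside parse() itself.
parser = LineParser()
r1 = parser.parse('http://www.cse.psu.edu')         # bare URL, code = None
r2 = parser.parse('12345\thttp://www.cse.psu.edu')  # "code<TAB>url" pair
r3 = parser.parse('one\ttwo\tthree')                # 3+ fields -> None
r4 = parser.parse('http://www.cse.psu.edu\n')       # trailing newline -> None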
def __init__(self):
    self.backup = BackUp("screen")
    self.stdscr = curses.initscr()
    self.MAX_Y, self.MAX_X = self.stdscr.getmaxyx()
    self.window = self.stdscr.subwin(self.MAX_Y, self.MAX_X, 0, 0)
    self.main_window = self.window.subwin(
        self.MAX_Y - 2, self.MAX_X - MENU_W - 3, 1, 1)
    self.menu_window = self.window.subwin(
        self.MAX_Y - 2, MENU_W, 1, self.MAX_X - MENU_W - 2)
    self._max_y = self.MAX_Y - 4
    self._max_x = self.MAX_X - MENU_W - 6
    self.screen = {}
    # create the time scale
    self.TS = TimeScale()
    # create the resource
    self.R = resource.Resource(self._max_x, self._max_y)
    self.loadData()
    # create the objects
    self.Obj = []
    for _i in range(5):
        self.Obj.append(
            obj.Obj("%d" % _i, 0, 0,
                    obj_pattern[_i % len(obj_pattern)],
                    (_i % 7) + 1))
def __init__(self, fontDescription, aParent=None):
    self._parent = aParent
    self._family = ''
    self._faceName = ''
    self._size = wx.NORMAL_FONT.GetPointSize()
    self._style = 'regular'
    self._underline = 0
    if fontDescription is not None:
        if isinstance(fontDescription, dict):
            fontDescription = resource.Resource(fontDescription)
        # copy over whichever optional attributes the description provides,
        # keeping the defaults set above for the missing ones
        for attr in ('family', 'faceName', 'size', 'style'):
            try:
                setattr(self, '_' + attr, getattr(fontDescription, attr))
            except AttributeError:
                pass
def __init__(self, app_name, app_dict):
    self.name = app_name
    self.type = app_dict['type']
    self.category = app_dict['category']
    self.risklevel = app_dict['risklevel']
    self.socket_list = socket.gen_socket_list('public', 'server',
                                              app_dict['nips'])
    self.curr_socket = 99999
    self.resource = resource.Resource(app_name)
def add_resource(self, url, format=u'', description=u'', hash=u'', **kw):
    import resource
    self.resources_all.append(
        resource.Resource(package_id=self.id, url=url, format=format,
                          description=description, hash=hash, **kw))
def add_resource(self, url, format=u'', description=u'', hash=u'', **kw):
    import resource
    self.resources.append(resource.Resource(
        resource_group_id=self.resource_groups[0].id,
        url=url, format=format, description=description, hash=hash, **kw))
def get_resources_in_stack(self, stack_id):
    # original resources from the stack
    stack_resources = self.heat.resources.list(stack_id=stack_id)
    # a list of simpler Resource objects
    resources = [
        resource.Resource(res.physical_resource_id, res.resource_type)
        for res in stack_resources
    ]
    return resources
def addResources(self, args):
    for name in args:
        self.args["name"] = name
        self.activeResources[name] = resource.Resource(self.args)
        self.activeResources[name].report()
    try:
        del self.args["name"]
    except KeyError:
        print "KeyError"
def __init__(self, use_resource=True):
    # TODO: docstring (what does use_resource control?)
    # create a resource for the conf file
    self._conf_resource = resource.Resource(Conf.CLINAME, Conf.ENVNAME)
    # list of sections
    self._sections = {}
    self._configuration_file_path = None
    # load the configuration file if requested
    if use_resource:
        self._load_config()
def get(self, section, option, default=None, fail_if_missing=False):
    """Get one option from a section.

    Return the default if the option is not found and fail_if_missing is
    False; otherwise raise NoOptionError.

    :param section: section in which to look for the option
    :type section: str
    :param option: option to get
    :param default: default value to return when fail_if_missing is False
    :param fail_if_missing: raise an exception when the option is not found
                            and fail_if_missing is True
    :returns: the option as a string
    :except NoOptionError: raised only when fail_if_missing is True
    """
    # all options are kept in lowercase
    opt = self.optionxform(option)
    if section not in self._sections:
        # check whether it is an ENV or CLI section
        dummy = None
        if section == Conf._ENVGROUP:
            r = resource.Resource(CliArgument=None, EnvVariable=opt)
            dummy = r.getValue()
        elif section == Conf._CLIGROUP:
            r = resource.Resource(CliArgument=opt, EnvVariable=None)
            dummy = r.getValue()
        # return the default if dummy is None, otherwise return dummy
        return (self._get_defaults(section, opt, default, fail_if_missing)
                if dummy is None else dummy)
    elif opt in self._sections[section]:
        return self._replace_vars(self._sections[section][opt],
                                  "%s[%s]" % (section, option), -1)
    else:
        return self._get_defaults(section, opt, default, fail_if_missing)
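# Usage sketch (not part of the original source), assuming a Conf instance
# with a loaded configuration file; the section and option names are made up.
conf = Conf()
host = conf.get('server', 'host', default='localhost')   # default fallback
port = conf.get('server', 'port', fail_if_missing=True)  # NoOptionError if absent
home = conf.get(Conf._ENVGROUP, 'HOME')                  # resolved via resource.Resource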
def on_resource_fetched(self, r):
    self.stat_lock.acquire()
    try:
        self.fetched_count += 1
    finally:
        self.stat_lock.release()
    msg = "[%s] [%d] [%s] [%s] [%s]" % (
        r.crawl_date, r.hop, r.content_type, r.url, r.parent_url)
    logging.getLogger('resource.fetched').info(msg)
    if r.content_type == 'text/html':
        if self.before_parse_filter.check(r):
            if isinstance(r.html, unicode):
                logging.warning(
                    'messaging.py >> unexpected unicode html [%s]' % r.url)
            links = html_helper.get_links(r.html)
            parent_url = r.url
            if isinstance(parent_url, unicode):
                logging.warning(
                    'messaging.py >> unexpected unicode parent_url [%s]'
                    % parent_url)
            hop = r.hop + 1
            for link, anchor_text in links:
                if isinstance(link, unicode):
                    logging.warning(
                        'messaging.py >> unexpected unicode link [%s]'
                        % link)
                url = urlparse.urljoin(parent_url, link)
                if isinstance(url, unicode):
                    logging.warning(
                        'messaging.py >> unexpected unicode url [%s]' % url)
                try:
                    new_r = resource.Resource(None, parent_url, url, False,
                                              hop, r.batch, anchor_text)
                except BadResourceError:
                    pass  # ignore unparsable links
                else:
                    self.discover_resource(new_r)
def hit(self):
    """Spawn a resource, using a random choice of ammo/fuel/shield."""
    returnval = False
    if randint(0, 1) < 1:
        new = resource.Resource()
        self.choicelist = [new.shield, new.fuel, new.shieldandammo,
                           new.fuelandammo, new.ammo]
        spawn = self.choicelist[randint(0, 4)]
        spawn(self.pos)
        returnval = new
    self.kill()
    return returnval
def handle(self):  # I CAN HANDLE MYSELF OKAY
    self.data = self.request.recv(1024).strip()
    print("Processing data from client {} in thread {}".format(
        self.client_address[0], threading.current_thread().name))
    decoded = pickle.loads(self.data)
    print(decoded)
    try:
        r = resource.Resource(decoded[0]["label"], decoded[0]["serial_no"],
                              decoded[0]["key"], None)
        u = update.Update(r, decoded[1], decoded[0]["value"])
    except KeyError:
        # received a malformed dict; bail out before using r and u
        print("Malformed dict from {}!".format(self.client_address[0]))
        return
    print("Decoded pickle into update object and resource object")
    u.update_resource()
    resources[decoded[0]["label"]] = r.toDict()
    self.finish()
def can_be_instanciated(cls):
    """Class method used by the Resource to check that the Conf can be
    instantiated.

    These two objects have a special contract, as they are strongly
    coupled: a Resource can use the Conf to check for a resource, and the
    Conf uses a Resource to read the Conf file path.

    :returns: True if the Conf has a configuration file.
    :except Error: base Conf error
    """
    # no conf info is passed to the resource, so the Resource will not
    # look into the conf (avoids a recursive search)
    the_res = resource.Resource(cls.CLINAME, cls.ENVNAME)
    filepath = the_res.getValue(aRaiseException=False)
    return (filepath is not None) and os.path.exists(filepath)
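# Call-pattern sketch (assumed from the docstring above, not from the
# original source): check the Conf/Resource contract before constructing a
# Conf, so the two classes do not end up searching each other recursively.
if Conf.can_be_instanciated():
    conf = Conf()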
def __init__(self, platforms, executableList, workerReqDict):
    self.platforms = platforms
    self.executableList = executableList
    self.workerReqDict = workerReqDict
    # get the platform with the biggest number of cores
    maxPlatform = platforms[0]
    ncores_max = 0
    for platform in platforms:
        if platform.hasMaxResource('cores'):
            ncores_now = platform.getMaxResource('cores')
            if ncores_now > ncores_max:
                ncores_max = ncores_now
                maxPlatform = platform
    self.usePlatform = maxPlatform
    # construct a list of all max. resources with settings for the
    # platform we use
    self.used = dict()
    for rsrc in self.usePlatform.getMaxResources().itervalues():
        self.used[rsrc.name] = resource.Resource(rsrc.name, 0)
    self.type = None
    self.depleted = False
def move_loser(self):
    if self.pos in self.land_owned:
        self.land_owned.remove(self.pos)
    neighbors = self.model.grid.get_neighborhood(self.pos, True)
    for cell in neighbors:
        moved = False
        contents = self.model.grid.get_cell_list_contents(cell)
        if len(contents) == 1 and isinstance(contents[0], r.Resource):
            if contents[0].owned == self.unique_id:
                self.model.grid.move_agent(self, cell)
                moved = True
            elif contents[0].owned == "":
                self.model.grid.move_agent(self, cell)
                moved = True
                contents[0].owned = self.unique_id
                self.land_owned.append(cell)
        elif len(contents) == 0:
            self.model.grid.move_agent(self, cell)
            moved = True
            # bump the unique_id counter for resources
            res = r.Resource(self.model.area_owned + 1, self.model, 0,
                             self.unique_id)
            self.model.area_owned += 1
            self.model.grid.place_agent(res, cell)
            self.land_owned.append(cell)
        if not moved:
            # nowhere to go: the country surrenders (conquered)
            #self.model.schedule.remove(self)    # remove from schedule
            #self.model.grid.remove_agent(self)  # remove from grid
            self.conquered.append(cell)
def parse(self, line, batch=0):
    """Parse a line to get a resource.

    Make sure there is no ending '\n' in the input line.

    Args:
        line: input string without ending '\n'
    Returns:
        a resource to crawl (None on failure)
    """
    if line.endswith('\n'):
        return None
    parts = line.split('\t')
    if len(parts) == 1:
        url = parts[0]
        parent_url = None
    elif len(parts) == 2:
        # e.g. "http://www.cse.psu.edu/~shzheng/sigkdd-2007.pdf<TAB>http://www.cse.psu.edu/~shzheng/"
        url = parts[0]
        parent_url = parts[1] if parts[1] else None
    else:
        return None
    try:
        r = resource.Resource(None, parent_url, url, True, 0, batch)
    except BadResourceError:
        r = None
    return r
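# Usage sketch (not part of the original source). Unlike the earlier parse()
# variant, this one expects "url<TAB>parent_url"; "SeedParser" is a
# hypothetical owner class.
parser = SeedParser()
r1 = parser.parse('http://www.cse.psu.edu/~shzheng/sigkdd-2007.pdf'
                  '\thttp://www.cse.psu.edu/~shzheng/')  # url + parent_url
r2 = parser.parse('http://www.cse.psu.edu/~shzheng/sigkdd-2007.pdf\t')
# an empty second field leaves parent_url as None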
def __init__(self, src_dir):
    resource_mgr = resource.Resource()
    resource_mgr.load(src_dir)
    self._graphic_mgr = graphics.Graphics(resource_mgr)
    self._last_tick = 0
    self._current_turn = 0
    self._last_tile = None
    self._all_tiles = []
    self._clock = pygame.time.Clock()
    self._players = [
        players.HumanPlayer(self._graphic_mgr,
                            players.Player.POSITION.SOUTH, u'南大'),
        players.AIPlayer(self._graphic_mgr,
                         players.Player.POSITION.EAST, u'东大'),
        players.AIPlayer(self._graphic_mgr,
                         players.Player.POSITION.NORTH, u'北大'),
        players.AIPlayer(self._graphic_mgr,
                         players.Player.POSITION.WEST, u'西大'),
    ]
    self._graphic_mgr.catch_players(self._players)
    self._graphic_mgr.clock = self._clock
    self._can_draw = False
    self._cache_text = None
    self.reset()
print "You cannot have less than 1 thread. The current number of threads is " + str(len(threads)) + "." else: for i in range(0, argument): threads.pop().stopThread() print "Stopped " + str(argument) + " threads." except ValueError: print "The stopThread command only accepts integers as a parameter." print "Welcome to the Spanish Inquisition's implementation of the gossip protocol." looper = True inCommunity = False resourcesMap = {} with open('resources/directory.txt', 'rb') as cvsfile: text = csv.reader(cvsfile, delimiter = '|') for row in text: res = resource.Resource(row[0], row[1]) resourcesMap[res.id.getAsHex()] = res mainThread = main.Main(threadName = "monty", resourcesMap = resourcesMap) map = {"alerts": alerts, "createThread": createThread, "completedResources": completedResources, "countThreads": countThreads, "download": download, "exit": exit, "find": find, "foundResources": foundResources, "help": help, "join": join, "leave": leave, "query": query, "stopThread": stopThread} threads = [mainThread] while looper: if not mainThread.alertQueue.empty(): print "You have " + str(mainThread.alertQueue.qsize()) + " alerts. Type alerts to see them." #TODO see if there's a better way to let the user know about new alerts input = raw_input(">") command = input argument = "" if " " in input: input = input.partition(" ") #puts the input into an array with anything before the first space in [0], a space in [1], and the rest in [2] command = input[0] argument = input[2] if command in map:
def __init__(self, dir, executable, args, addPriority=0, minVersion=None,
             maxVersion=None, id=None, task=None, running=False,
             workerServer=None, env=None, outputFiles=None):
    """Create a command.

    dir = the directory relative to the task directory
    executable = the name of the executable object
    args = the arguments for the executable
    files = the needed input files (as a list of CommandInputFile objects)
    minVersion = the minimum version for the command's executable
    maxVersion = the maximum version for the executable
    id = the command's ID (or None to generate one at random)
    task = the task associated with the command
    running = whether the cmd is running
    workerServer = the server the worker executing this command is
                   connected to
    env = environment variables to set: a dict of values (or None)
    outputFiles = the list of any expected output files
    """
    #self.taskID=taskID
    if dir is not None and task is not None:
        # use the task argument here; self.task has not been set yet
        self.dir = os.path.relpath(dir,
                                   task.activeInstance.getFullBasedir())
    else:
        self.dir = dir
    self.executable = executable
    self.files = []
    self.outputFiles = outputFiles
    self.args = args
    self.minVersion = minVersion
    self.maxVersion = maxVersion
    self.addPriority = addPriority
    self.task = None
    self.running = running  # whether the command is running
    self.id = id
    self.workerServer = workerServer
    # the return code
    self.returncode = None
    # the cpu time in seconds used by this command
    self.cputime = 0.
    # dictionary of reserved resource objects
    self.reserved = {}
    # dictionary of required resource objects;
    # all commands need a CPU to run on
    cores = resource.Resource('cores', 1)
    self.minRequired = {cores.name: cores}
    # dictionary of max allowed resource objects
    self.maxAllowed = {}
    self.env = env
    cpc.server.queue.cmdqueue.QueueableItem.__init__(self)
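# Construction sketch (not part of the original source): the argument values
# below are made up, only the keyword names come from the signature above.
# Note that every Command starts out requiring one core via minRequired.
cmd = Command('run/step1', 'mdrun', ['-deffnm', 'md'], task=myTask)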
def setReserved(self, name, value):
    """Add a resource to the reserved list."""
    rsrc = resource.Resource(name, value)
    self.reserved[name] = rsrc
def main(confc):
    start = time.time()
    logger = logging.getLogger("main")
    # global configurations
    config = Config_global(confc["jobid"], skip_check=True)
    config_cdicsv = Config_cdicsv()
    # create document writer
    writer = output.CiteSeerWriter([config.crawlrepo, config_cdicsv.crawler])
    # create URL filter
    urlfilter = urlfilters.URLFilter(
        blacklistfile=config.blacklistfile,
        domainblacklistfile=config.domainblacklistfile)
    # create document type filter
    mimetypefilter = Mime_Type_Filter(config.mtypes)
    # create document logger (for this middleware)
    doclogger = Doc_Logger(os.getenv('HOSTNAME'), mimetypefilter)
    # parse csv file
    logger.info('parsing csv file...')
    gs = load_cdicsv(confc["csv_file"])
    # number counters
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all', len(gs))
    counters.newCounter('saved_New')
    counters.newCounter('saved_Duplicate')
    counters.newCounter('saved_bitDuplicate')      # bitwise duplicate
    counters.newCounter('filtered_all')
    counters.newCounter('filtered_URLFilter')
    counters.newCounter('filtered_MimetypeFilter')
    counters.newCounter('failed_all')
    counters.newCounter('failed_FileNotFound')     # if inputs are pdf/ps
    counters.newCounter('failed_PDFFileNotFound')  # if inputs are gzipped
    counters.newCounter('failed_BadURL')           # bad URL
    counters.newCounter('failed_SaveError')        # errors when saving docs
    # if required to visit the database, check that the tables are created
    if config_cdicsv.save_toDB:
        cdb = crawldb.CrawlDB()
        logger.info("database: " + cdb.dbname)
        # create document and parent tables if they do not exist
        cdb.createTables()
    # loop over each document from the csv file
    doci = 0
    max_length = len(str(len(gs)))
    for g in gs:
        accepted = False
        doci += 1
        # get resource variable
        is_seed = (g["hop"] == 0)
        try:
            r = resource.Resource(None, g['parenturl'], g['url'],
                                  is_seed, g['hop'], batch=0, anchor_text="")
        except TypeError, e:
            logger.error(e)
            sys.exit(1)  # was os.exit(1), which does not exist
        except BadResourceError, e:
            logger.error("Error parsing Url : " + g["url"])
            counters.addCounter('failed_BadURL')
            continue
def move(self):
    best_move = [[], [], [], []]
    neighbors = self.model.grid.get_neighborhood(self.pos, True)
    for cell in neighbors:
        contents = self.model.grid.get_cell_list_contents(cell)
        # collect possible moves to empty cells
        if len(contents) == 0:
            best_move[2].append(cell)
        elif len(contents) == 1 and isinstance(contents[0], r.Resource):
            # cell is not owned
            if contents[0].owned == "":
                if len(best_move[0]) == 0:
                    best_move[0].append((cell, contents[0]))
                else:
                    if best_move[0][0][1].value < contents[0].value:
                        best_move[0][0] = (cell, contents[0])
            # cell is owned by you
            elif contents[0].owned == self.unique_id:
                best_move[3].append((cell, contents[0]))
            # cell is owned by someone else
            elif contents[0].owned != self.unique_id and contents[0].owned != "":
                if len(best_move[1]) > 0:
                    if best_move[1][0][1].value < contents[0].value:
                        best_move[1][0] = (cell, contents[0])
                else:
                    best_move[1].append((cell, contents[0]))
        # if contents > 1, there must be another country on the cell
        elif len(contents) > 1:
            # account for the situation where a foreigner is on the cell
            better = None
            country = False
            res = False
            for item in contents:
                if type(item) is type(self):
                    better = (item.unique_id,)
                    country = True
                if isinstance(item, r.Resource):
                    res = True
                    if len(best_move[1]) > 0:
                        if best_move[1][0][1].value < item.value:
                            best_move[1][0] = (cell, item)
                    else:
                        best_move[1].append((cell, item))
            # prevent issues when two countries are on the same cell
            if country and res:
                best_move[1][0] += better
            else:
                print("ISSUES ", contents)
    if len(best_move[0]) > 0:
        # claim thy land
        best_move[0][0][1].owned = self.unique_id
        # move to the new spot
        self.model.grid.move_agent(self, best_move[0][0][0])
        self.land_owned.append(best_move[0][0][0])
    elif len(best_move[1]) > 0:
        # to war!
        self.negotiate(best_move[1])
    elif len(best_move[2]) > 0:
        self.model.grid.move_agent(self, best_move[2][0])
        # bump the unique_id counter for resources
        res = r.Resource(self.model.area_owned + 1, self.model, 0,
                         self.unique_id)
        self.model.area_owned += 1
        self.model.grid.place_agent(res, best_move[2][0])
        self.land_owned.append(best_move[2][0])
    else:
        self.model.grid.move_agent(self, best_move[3][0][0])
display_width = game_maze.get_display_width()
display_height = game_maze.get_display_length()
tile_size = game_maze.get_tile_size()
map_grid = game_maze.gridit()
map_obstacles = game_maze.map_obstacles()

GRASS = pygame.image.load(r'./assets/grass.png')
GRASS = pygame.transform.scale(GRASS, (tile_size, tile_size))
GLASS = pygame.image.load(r'./assets/glass.png')
GLASS = pygame.transform.scale(GLASS, (tile_size, tile_size))
FIRE = pygame.image.load(r'./assets/fire.png')
FIRE = pygame.transform.scale(FIRE, (tile_size, tile_size))

BLACK = (0, 0, 0)
WHITE = (255, 255, 255)

resource_maze = resource.Resource("Glass", GLASS, 10, True)
resource_location = resource_maze.random_resource_spread(game_maze)

agent = character.Character()
agent.transform_avatar(tile_size)
car = agent.get_avatar()
car_length, car_width = agent.get_avatar_dim()
crashed = agent.get_state()

gameDisplay = pygame.display.set_mode((display_width, display_height))
pygame.display.set_caption('reinforced game')
clock = pygame.time.Clock()


def draw_rectangles(point, texture):
def _replace_vars(self, a_str, location, lineno=-1):
    """Private method replacing all variables.

    A variable has the form %(group[option]). Multiple variables are
    supported, e.g. /foo/%(group1[opt1])/%(group2[opt2])/bar. Nested
    variables are also supported, e.g. /foo/%(group[%(group1[opt1])]).
    Note that the group part cannot be substituted; only the option can.
    This is because the regular expression _SUBSGROUPRE accepts only
    words as values.

    Args:
        a_str: the string to parse
        location: where the string comes from (used in error messages)
        lineno: the line number (-1 if unknown)

    Returns:
        the final string with the replacements

    Raises:
        NoSectionError if the section cannot be found
    """
    toparse = a_str
    index = toparse.find("%(")
    # if an opening %( is found, look for the closing bracket
    if index >= 0:
        # look for closing brackets while counting openings
        closing_brack_index = self._get_closing_bracket_index(
            index, a_str, location, lineno)
        var = toparse[index:closing_brack_index + 1]
        dummy = None
        m = self._SUBSGROUPRE.match(var)
        if m is None:
            raise SubstitutionError(
                lineno, location,
                "Cannot match a group[option] in %s but found an opening "
                "bracket (. Malformed expression" % var)
        else:
            # recursive calls
            g = self._replace_vars(m.group('group'), location, -1)
            o = self._replace_vars(m.group('option'), location, -1)
            try:
                # if it is in ENVGROUP, check ENV variables with a Resource
                # object; if it is in CLIGROUP, check the CLI argument with
                # a Resource object; otherwise check in the standard groups
                if g == Conf._ENVGROUP:
                    r = resource.Resource(CliArgument=None, EnvVariable=o)
                    dummy = r.getValue()
                elif g == Conf._CLIGROUP:
                    r = resource.Resource(CliArgument=o, EnvVariable=None)
                    dummy = r.getValue()
                else:
                    dummy = self._sections[g][self.optionxform(o)]
            except KeyError, _:  #IGNORE:W0612
                raise SubstitutionError(
                    lineno, location,
                    "Property %s[%s] doesn't exist in this configuration "
                    "file\n" % (g, o))
            toparse = toparse.replace(var, dummy)
            return self._replace_vars(toparse, location, -1)
    # base case: no variable found, nothing left to substitute
    # (the original snippet ends without this return, but the recursion
    # above relies on it)
    return toparse
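# Substitution sketch (not part of the original source): the section and
# option names are made up. Assuming a Conf whose section "dirs" has
# opt1 = data, the call below would return "/foo/data/bar"; a group naming
# Conf._ENVGROUP or Conf._CLIGROUP would instead be resolved through
# resource.Resource, as shown in the method above.
conf = Conf()
expanded = conf._replace_vars("/foo/%(dirs[opt1])/bar", "example", -1)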
def get_resource(self, name):
    return resource.Resource(self, name)
def startup(verbal=False):
    # record start time
    tic = time.time()
    # create on-screen information print object
    infoprinter = printinfo.printInfo()
    # create document writer
    writer = output.CiteSeerWriter([runconfig.cdilite['docdir'],
                                    runconfig.cdilite['crawler']])
    # create document logger (for this middleware)
    doclogger = Doc_Logger(os.getenv('HOSTNAME'))
    # create general log configers and config logs
    logconfiger = Log_Configer()
    logconfiger.config_loggers()
    # parse log file
    g = create_instance(runconfig.cdilite['logparser'],
                        runconfig.cdilite['doclist'])
    g.extract_info(logsummaryfile=runconfig.cdilite['logsummaryfile'])
    # prepare to write xml file
    impl = getDOMImplementation()
    xDoc = impl.createDocument(None, "response", None)
    root = xDoc.documentElement
    root.setAttribute("location", runconfig.cdilite['docdir'])
    # number counters
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all', g.nline['parsed'])
    counters.newCounter('failed_BadURL')
    counters.newCounter('failed_FileNotFound')
    # save the current path
    currentPath = os.getcwd()
    # loop over each information tuple extracted from the document list
    # file; each tuple contains the name of the pdf file
    if verbal:
        print "counters.all = ", counters.all
    for i in range(0, counters.all):
        print ''
        sys.stdout.write("\r")
        sys.stdout.write("%9d/%-9d " % (i + 1, counters.all))
        sys.stdout.write("\n")
        infoprinter.printPara('URL', g.rel_path[i])
        code = None
        # get resource variable "r"
        if verbal:
            print 'g.parent_url[i] = ', g.parent_url[i]
            print 'g.url[i] = ', g.url[i]
        try:
            r = resource.Resource(code, g.parent_url[i], g.url[i],
                                  g.is_seed[i], g.hop[i], runconfig.batch,
                                  g.anchor_text[i])
        except BadResourceError, e:
            infoprinter.printStatus('URL Parse', 'fail')
            counters.addCounter('failed_BadURL')
            continue
        r.crawl_date = g.crawl_date[i]
        r.content_type = g.content_type[i]
        infoprinter.printPara('mime-type', r.content_type)
        # where crawled documents are saved;
        # retrieve the local hard copy of the document
        infile = os.path.join(currentPath, runconfig.cdilite['docdir'],
                              g.rel_path[i])
        inpdf = infile  # e.g., filepath/file.pdf
        if '%' in inpdf:
            inpdf = urllib.unquote(inpdf)  # unquote escapes, e.g., %7E -> ~
        # if the document file cannot be found, write into the log and
        # skip it
        inpdfpath = inpdf
        if not os.path.exists(inpdfpath):
            msg = doclogger.generator('FileNotFound', infile, r)
            logging.getLogger('document').info(msg)
            counters.addCounter('failed_FileNotFound')
            infoprinter.printStatus('Document file found', 'no')
            continue  # skip missing files (the original fell through here)
        # inpdfpath is the "corrected" file path
        inpdf = inpdfpath
        infoprinter.printStatus('Document file found', 'yes')
        # load pdf file content to calculate the hash
        f = open(inpdf, 'r')
        data = f.read()
        f.close()
        # calculate SHA1
        r.content_sha1 = hashlib.sha1(data).hexdigest()
        try:
            # only save the metadata file
            writer.save_met(r, inpdf)
        except IOError, e:
            msg = doclogger.generator('IOErrorSave', infile, r)
            logging.getLogger('document').info(msg)
def startup(verbal=True):
    # record start time
    tic = time.time()
    # create on-screen information print object
    infoprinter = printinfo.printInfo()
    # check configurations
    if not checkConfig():
        infoprinter.printStatus('Configuration check', 'fail')
        raise SystemExit("Change your configurations in runconfig.py")
    else:
        infoprinter.printStatus('Configuration check', 'ok')
    # create document writer
    writer = output.CiteSeerWriter([runconfig.outputdir, runconfig.crawler])
    # create URL filter
    urlfilter = urlfilters.URLFilter(
        blacklistfile=runconfig.blacklistfile,
        domainblacklistfile=runconfig.domainblacklistfile)
    # create document type filter
    mimetypefilter = Mime_Type_Filter(runconfig.allow_doc_type)
    # create document content filter
    doccontentfilter = filter_doc.Doc_Content_Filter(runconfig.tempdir)
    # create text extractor
    textextractor = textextract.Text_Extractor()
    # create document logger (for this middleware)
    doclogger = Doc_Logger(os.getenv('HOSTNAME'), mimetypefilter)
    # create general log configers and config logs
    logconfiger = Log_Configer()
    logconfiger.config_loggers()
    # parse log file
    print 'parsing log file...'
    g = create_instance(runconfig.logparser, runconfig.logfile)
    g.extract_info(logsummaryfile=runconfig.logsummaryfile,
                   skip=runconfig.skip, nloglines=runconfig.nloglines)
    print 'parsing log file finished'
    # number counters
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all', g.nline['parsed'])
    counters.newCounter('saved_New')
    counters.newCounter('saved_Duplicate')
    counters.newCounter('filtered')
    counters.newCounter('filtered_URLFilter')
    counters.newCounter('filtered_MimetypeFilter')
    counters.newCounter('filtered_DocContentFilter')
    counters.newCounter('failed')
    counters.newCounter('failed_TextExtract')
    counters.newCounter('failed_FileNotFound')     # if inputs are pdf/ps
    counters.newCounter('failed_PDFFileNotFound')  # if inputs are gzipped
    counters.newCounter('failed_BadURL')           # bad URL
    counters.newCounter('failed_SaveError')        # errors when saving docs
    # create output directory if it does not exist
    if not os.path.exists(runconfig.outputdir):
        os.makedirs(runconfig.outputdir)
    # create temp directory if it does not exist
    if not os.path.exists(runconfig.tempdir):
        os.makedirs(runconfig.tempdir)
    # a mapping file is automatically generated if we only export files
    # (no db input)
    if runconfig.toggle_save_doc_separate:
        open(runconfig.tempdir + 'mapping.csv', 'w')
    # if required to visit the database, make sure that the database and
    # tables are created
    if runconfig.toggle_save_to_db:
        cdb = crawldb.CrawlDB()
        # print database name
        infoprinter.printPara('Database name', cdb.dbname)
        # create document and parent tables if they do not exist
        cdb.createTables()
    # save the current path
    savedPath = os.getcwd()
    # loop over each information tuple extracted from the crawler log file
    for i in range(0, counters.all):
        print ''
        sys.stdout.write("\r")
        sys.stdout.write("%9d/%-9d " % (i + 1, counters.all))
        sys.stdout.write("\n")
        infoprinter.printPara('URL', g.url[i])
        # apply the URL filter
        if runconfig.toggle_urlfilter:
            if not urlfilter.check(g.url[i]):
                msg = "%s %s %s" % ('URLRejected', urlfilter.rejectreason,
                                    g.url[i])
                logging.getLogger('document').info(msg)
                counters.addCounter('filtered_URLFilter')
                if verbal:
                    infoprinter.printStatus('URL accepted', 'no')
                continue
        # get resource variable "r"
        try:
            code = None
            r = resource.Resource(code, g.parent_url[i], g.url[i],
                                  g.is_seed[i], g.hop[i], runconfig.batch,
                                  g.anchor_text[i])
        except BadResourceError, e:
            infoprinter.printStatus('URL Parse', 'fail')
            counters.addCounter('failed_BadURL')
            continue  # url length cannot be longer th
        r.crawl_date = g.crawl_date[i]
        r.content_type = g.content_type[i]
        infoprinter.printPara('mime-type', r.content_type)
        # where crawled documents are saved;
        # retrieve the local hard copy of the document.
        # If files were downloaded using "lftp", the input file path is
        # constructed by appending the relative file path to "conf.inputdir"
        if runconfig.crawler.lower() == 'lftp':
            infile = runconfig.inputdir + g.rel_path[i]
        elif (runconfig.crawler.lower() == 'heritrix'
              and runconfig.saver.lower() == 'mirror'):
            infile = runconfig.inputdir + r.host + r.path
        else:
            infile = runconfig.inputdir + g.rel_path[i]
        # apply the doctype filter, which checks the document mimetype
        mimetypefilter_ok = mimetypefilter.check(r)
        if not mimetypefilter_ok:
            msg = doclogger.generator('DocumentTypeNotAccepted', infile, r)
            logging.getLogger('document').info(msg)
            counters.addCounter('filtered_MimetypeFilter')
            if verbal:
                infoprinter.printStatus('Accepted document type', 'no')
            continue
        else:
            if verbal:
                infoprinter.printStatus('Accepted document type', 'yes')
        r.ext = mimetypefilter.ext
        # check if the document is already in the db:
        # if it returns False, continue to the next step;
        # if it returns True, log it and skip this one.
        # However, if the overwrite_file toggle is set, we continue to the
        # next step anyway.
        if runconfig.toggle_save_to_db:
            recordExist = cdb.checkRecord(runconfig.dbt_document, md5=r.md5)
            if not recordExist:
                infoprinter.printStatus('New document', 'yes')
            else:
                msg = doclogger.generator('saved_Duplicate', infile, r)
                logging.getLogger('document').info(msg)
                counters.addCounter('saved_Duplicate')
                infoprinter.printStatus('New document', 'no')
                if not runconfig.overwrite_file:
                    continue
        # Check the existence of the input file; if the name part of
        # "infile" contains wild card characters, e.g. %, try to recover
        # it to normal.
        # "infile" is the original full file path from the crawl log (it
        # may contain escape characters and may be in zipped format).
        # "inpdf" contains the original file name saved on disk (no escape
        # characters, in an acceptable format, e.g. PDF/postscript).
        # "inpdfpath" contains the corrected path of the input file name:
        # in some cases, url paths are not correctly normalized and need to
        # be corrected. For example, if the last segment does not contain
        # ".", it is taken as a directory and a "/" is added, which is
        # incorrect.
        inpdf = infile  # e.g., filepath/file.pdf
        if '%' in inpdf:
            inpdf = urllib.unquote(inpdf)  # unquote escapes, e.g., %7E -> ~
        # try to remove the last slash from the full path,
        # or try fullpath/index.html: maybe that is the file.
        # If the document file still cannot be found, write into the log
        # and skip it.
        inpdfpath = inpdf
        if not os.path.exists(inpdfpath):
            inpdfpath = inpdf[:-1]
        if not os.path.exists(inpdfpath):
            inpdfpath = inpdf + 'index.html'
        if not os.path.exists(inpdfpath):
            # try to download the paper using "wget"; the downloaded paper
            # is saved to the temp directory and renamed "wget.pdf". Note
            # that we just temporarily add an extension of ".pdf": it may
            # not be a PDF file, in which case it will be filtered out by
            # the doc_type_filter later.
            # add quotes to the url;
            # if the download is not successful, we mark this document as
            # "FileNotFound"
            wgeturl = '"' + r.url + '"'
            wgetfile = os.path.join(runconfig.tempdir, "wget." + r.ext)
            wgetcmd = "wget " + wgeturl + " -O " + wgetfile
            # first remove the existing "wget.pdf" if it exists
            if os.path.exists(wgetfile):
                rmcmd = "rm -rf " + wgetfile
                cmdoutput = commands.getoutput(rmcmd)
            # download the document using "wget"; time out is 5 min.
            # if the call returns -9, the download failed: skip this doc
            cmdoutput = timeoutpython.run(wgetcmd, shell=True, timeout=300)
            # check if the file downloaded successfully
            if (not os.path.exists(wgetfile)) or (cmdoutput[0] == -9):
                msg = doclogger.generator('FileNotFound', infile, r)
                logging.getLogger('document').info(msg)
                counters.addCounter('failed_FileNotFound')
                if verbal:
                    infoprinter.printStatus('Document file found', 'no')
                    infoprinter.printPara('infile', infile)
                continue
            else:
                inpdfpath = wgetfile
        # inpdfpath is the "corrected" file path
        inpdf = inpdfpath
        if verbal:
            infoprinter.printStatus('Document file found', 'yes')
            infoprinter.printPara('Document file path', inpdf)
        # If the input file is in zipped format (assuming a .tar.gz file),
        # we do the following:
        # * copy the .tar.gz file to a temp directory
        # * decompress it using tar -xvzf
        # * find the .pdf file inside the unzipped directory
        # * do whatever we want ...
        # * remove everything in the temp directory
        cmd_file = 'file -i "' + infile + '"'
        cmdoutput = commands.getoutput(cmd_file)
        if 'application/x-gzip' in cmdoutput:
            infoprinter.printStatus('MIME-type', 'application/x-gzip')
            cmd_rm = 'rm -rf ' + runconfig.tempdir + '*'
            cmdoutput = commands.getoutput(cmd_rm)
            cmd_cp = 'cp "' + infile + '" ' + runconfig.tempdir
            cmdoutput = commands.getoutput(cmd_cp)
            # sometimes, for some (unknown) reason, the "-C" option does
            # not work well for the "tar" command, so we cd to the temp
            # directory, extract the files from the .tar.gz, and return to
            # the main directory.
            # obtain the file name from the full path: infilename
            infilename = os.path.split(infile)[1]
            os.chdir(runconfig.tempdir)
            cmd_tar = 'tar -xvzf "' + infilename + '"'
            cmdoutput = commands.getoutput(cmd_tar)
            os.chdir(savedPath)
            # only look for pdf files
            inpdffound = False
            for root, dirs, files in os.walk(runconfig.tempdir):
                for f in files:
                    if f.endswith('pdf'):
                        inpdf = os.path.join(root, f)
                        inpdffound = True
                        break
                if inpdffound:
                    break
            if not inpdffound:
                msg = doclogger.generator('PDFFileNotFound', infile, r)
                logging.getLogger('document').info(msg)
                counters.addCounter('failed_PDFFileNotFound')
                infoprinter.printStatus('PDF Document file found', 'no')
                continue
        # the document file is found;
        # check whether we need to use the doc_content_filter
        if runconfig.toggle_doc_content_filter:
            # extract text from the document
            filefmt = mimetypefilter.doctype
            if verbal:
                infoprinter.printPara('Mime type', filefmt)
            # acceptable formats: e.g., "application/pdf",
            # "application/postscript"
            textextractmsg = textextractor.extract(inpdf, filefmt)
            # classify the document if text was extracted successfully
            if 'Success' in textextractmsg:
                infoprinter.printStatus('Extract text', 'success')
                # not a paper: log it and proceed to the next document
                if doccontentfilter.Decider(textextractor.outtxtfile,
                                            inpdf) == -1:
                    counters.addCounter('filtered_DocContentFilter')
                    msg = doclogger.generator('NotAcademic', infile, r)
                    logging.getLogger('document').info(msg)
                    infoprinter.printStatus('Accepted document content',
                                            'no')
                    continue
                else:
                    infoprinter.printStatus('Accepted document content',
                                            'yes')
            else:
                # text extraction failed: report the error and log it
                infoprinter.printStatus('Extract text', 'fail')
                counters.addCounter('failed_TextExtract')
                msg = doclogger.generator(textextractmsg, infile, r)
                logging.getLogger('document').info(msg)
                continue
        # determine the FINAL mimetype of this document: if it is
        # "application/pdf", use ".pdf" as the extension; if it is
        # "application/postscript", use ".ps".
        # "inpdf" is the final file to be accepted (after re-download and
        # after the filters).
        if mimetypefilter.doctype == 'application/pdf':
            r.ext = 'pdf'
        elif mimetypefilter.doctype == 'application/postscript':
            r.ext = 'ps'
        else:
            cmd_file = 'file -i "' + inpdf + '"'
            cmdoutput = commands.getoutput(cmd_file)
            if 'application/postscript' in cmdoutput:
                r.ext = 'ps'
            else:
                infoprinter.printStatus('Recognizable mimetype', 'no')
                sys.exit(cmdoutput)
        # write document information into the database
        # (database settings can be found in settings.py);
        # read the file content and calculate the SHA1 value.
        # In some cases the actual PDF was downloaded but the URL ends with
        # a slash, for example
        # dial.academielouvain.be/vital/access/services/Download/boreal:12685/PDF_01/
        # where the downloaded file is renamed "index.html" though it is a
        # PDF file. In this case we try "inpdf/index.html" to see if we can
        # identify the file. If that does not work, it could be that
        # Heritrix downloaded the file as "PDF_01"; this happens for the
        # URL below, where the actual file is named "75" under the 78/
        # directory:
        # www.br-ie.org/pub/index.php/rbie/article/viewFile/78/75/
        # If we still cannot find any file, we have to skip it.
        try:
            f = open(inpdf, 'r')
            data = f.read()
            f.close()
        except IOError:
            # just remove the last "slash"
            try:
                f = open(inpdf[:-1], 'r')
                data = f.read()
                f.close()
            except IOError:
                try:
                    f = open(inpdf + 'index.html', 'r')
                    data = f.read()
                    f.close()
                except IOError:
                    msg = doclogger.generator('FileNotFound', infile, r)
                    logging.getLogger('document').info(msg)
                    counters.addCounter('failed_FileNotFound')
                    infoprinter.printStatus('Document file found', 'no')
                    continue
        # If required to save crawled documents separately, do not save to
        # the db; only save the document to outputdir. Files are named
        # using numbers starting from 1, and a mapping file is
        # automatically generated.
        filenamebody = id_to_fname(i + 1, r.ext)
        outdoc = runconfig.outputdir + filenamebody
        if runconfig.toggle_save_doc_separate:
            mappingline = outdoc + ',' + infile  # may not be inpdf
            ff = open(outdoc, 'w')
            ff.write(data)
            ff.close()  # was "ff.close" without parentheses, a no-op
            try:
                f = open(outdoc)
                msg = doclogger.generator('saved_New', infile, r)
                logging.getLogger('document').info(msg)
                infoprinter.printStatus('Document saved', 'yes')
                # number of saved documents
                counters.addCounter('saved_New')
            except IOError, e:
                infoprinter.printStatus('Document saved', 'no')
                raise SystemExit(e)