Example #1
 def __init__(self, N, R, width):
     
     ###################################
     # SET UP INFRASTRUCTURE
     ##################################
     self.running = True
     self.schedule = RandomActivation(self)
     self.grid = MultiGrid(width, width, True)
     self.positions = list(product(range(width), repeat=2))
     ##########################################
     #  SET UP AGENTS - Resources and Countries
     #########################################
     for i in range(N): 
         a = c.Country(i, self)
         self.schedule.add(a)
         pos = self.get_position()
         self.grid.place_agent(a, pos)
     self.area_owned = R    
     for i in range(R): 
         res = r.Resource(i, self, random.randint(1,6), "")
         pos = self.get_position()
         self.grid.place_agent(res,pos)
     ##########################################
     # SET UP DATA COLLECTOR
     ###########################################
     self.datacollector = DataCollector(agent_reporters = {"Scaling A": "scaling_a" ,\
                                                           "Scaling B" : "scaling_b", \
                                                           "Capacity": "capacity", \
                                                           "Land Owned": "land_owned", \
                                                           "Conquered": "conquered"})
Example #2
 def parse(self, line, batch=0):
     """Parse a line to get a resource.
     
     Make sure there is no ending '\n' in the input line
     
     Args:
         line: input string without ending '\n'
     
     Returns:
         a resource to crawl (return None if fail)
     """
     
     if line.endswith('\n'):
         return None
         
     parts = line.split('\t')
     if len(parts) == 1:
         url = parts[0]
         code = None
     elif len(parts) == 2:
         # 12345   http://www.cse.psu.edu
         url = parts[1]
         code = parts[0]
     else:
         return None
            
     try: 
         r = resource.Resource(code, None, url, True, 0, batch)
     except BadResourceError:
         r = None
          
     return r
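A hedged usage sketch for parse() above; loader stands for an instance of the (unnamed) surrounding class and seeds.txt is a made-up input file.

# Hypothetical caller; strip the trailing '\n' before calling, as the docstring asks.
to_crawl = []
with open('seeds.txt') as f:
    for line in f:
        res = loader.parse(line.rstrip('\n'), batch=0)
        if res is not None:
            to_crawl.append(res)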
Example #3
 def __init__(self):
     self.backup = BackUp("screen")
     self.stdscr = curses.initscr()
     self.MAX_Y, self.MAX_X = self.stdscr.getmaxyx()
     self.window = self.stdscr.subwin(self.MAX_Y, self.MAX_X, 0, 0)
     self.main_window = self.window.subwin(self.MAX_Y - 2,
                                           self.MAX_X - MENU_W - 3, 1, 1)
     self.menu_window = self.window.subwin(self.MAX_Y - 2, MENU_W, 1,
                                           self.MAX_X - MENU_W - 2)
     self._max_y = self.MAX_Y - 4
     self._max_x = self.MAX_X - MENU_W - 6
     self.screen = {}
     """创建【时标】
     """
     self.TS = TimeScale()
     """创建【资源】
     """
     self.R = resource.Resource(self._max_x, self._max_y)
     self.loadData()
     """创建【对象】
     """
     self.Obj = []
     for _i in range(5):
         self.Obj.append(
             obj.Obj("%d" % _i, 0, 0, obj_pattern[_i % len(obj_pattern)],
                     (_i % 7) + 1))
Example #4
 def __init__( self, fontDescription, aParent=None ) :
     self._parent = aParent
     self._family = ''
     self._faceName = ''
     self._size = wx.NORMAL_FONT.GetPointSize()
     self._style = 'regular'
     self._underline = 0
     if fontDescription is not None:
         if isinstance(fontDescription, dict):
             fontDescription = resource.Resource(fontDescription)
         try:
             self._family = fontDescription.family
         except:
             pass
         try:
             self._faceName = fontDescription.faceName
         except:
             pass
         try:
             self._size = fontDescription.size
         except:
             pass
         try:
             self._style = fontDescription.style
         except:
             pass
Example #5
 def __init__(self, app_name, app_dict):
     self.name = app_name
     self.type = app_dict['type']
     self.category = app_dict['category']
     self.risklevel = app_dict['risklevel']
     self.socket_list = socket.gen_socket_list('public', 'server',
                                               app_dict['nips'])
     self.curr_socket = 99999
     self.resource = resource.Resource(app_name)
Example #6
 def add_resource(self, url, format=u'', description=u'', hash=u'', **kw):
     import resource
     self.resources_all.append(
         resource.Resource(package_id=self.id,
                           url=url,
                           format=format,
                           description=description,
                           hash=hash,
                           **kw))
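A hedged call sketch for add_resource() above; pkg stands for the package object the method is defined on, and the URL and metadata values are invented.

# Hypothetical usage; any extra keyword argument is forwarded to resource.Resource via **kw.
pkg.add_resource(url=u'http://example.org/data.csv',
                 format=u'CSV',
                 description=u'Monthly CSV export',
                 name=u'data-2024')  # "name" only illustrates the **kw passthrough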
Example #7
 def add_resource(self, url, format=u'', description=u'', hash=u'', **kw):
     import resource
     self.resources.append(resource.Resource(
         resource_group_id=self.resource_groups[0].id,
         url=url,
         format=format,
         description=description,
         hash=hash,
         **kw))
Example #8
 def get_resources_in_stack(self, stack_id):
     # original resources from the stack
     stack_resources = self.heat.resources.list(stack_id=stack_id)
     # a list of simpler Resource objects
     resources = [
         resource.Resource(res.physical_resource_id, res.resource_type)
         for res in stack_resources
     ]
     return resources
Example #9
 def addResources(self, args):
     for name in args:
         self.args["name"] = name
         self.activeResources[name] = resource.Resource(self.args)
         self.activeResources[name].report()
     try:
         del self.args["name"]
     except KeyError:
         print "KeyError"
Example #10
    def __init__(self, use_resource=True):
        #TODO docstring ==> use resource ????

        # create resource for the conf file
        self._conf_resource = resource.Resource(Conf.CLINAME, Conf.ENVNAME)

        # list of sections
        self._sections = {}

        self._configuration_file_path = None

        # create config object
        if use_resource:
            self._load_config()
Example #11
    def get(self, section, option, default=None, fail_if_missing=False):
        """ get one option from a section.
        
            return the default if it is not found and if fail_if_missing is False, otherwise return NoOptionError
          
            :param section: Section where to find the option
            :type  section: str
            :param option:  Option to get
            :param default: Default value to return if fail_if_missing is False
            :param fail_if_missing: Will throw an exception when the option is not found and fail_if_missing is true
               
            :returns: the option as a string
            
            :except NoOptionError: Raised only when fail_is_missing set to True
        
        """
        # all options are kept in lowercase
        opt = self.optionxform(option)

        if section not in self._sections:
            # check if it is an ENV section
            dummy = None
            if section == Conf._ENVGROUP:
                r = resource.Resource(CliArgument=None, EnvVariable=opt)
                dummy = r.getValue()
            elif section == Conf._CLIGROUP:
                r = resource.Resource(CliArgument=opt, EnvVariable=None)
                dummy = r.getValue()
            #return default if dummy is None otherwise return dummy
            return ((self._get_defaults(section, opt, default,
                                        fail_if_missing))
                    if dummy is None else dummy)
        elif opt in self._sections[section]:
            return self._replace_vars(self._sections[section][opt],
                                      "%s[%s]" % (section, option), -1)
        else:
            return self._get_defaults(section, opt, default, fail_if_missing)
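A hedged usage sketch for get() above; conf stands for an instance of the class, the section and option names are made up, and NoOptionError is assumed to come from ConfigParser.

# Hypothetical usage; section/option names are made up.
from ConfigParser import NoOptionError  # assumed source of the exception

host = conf.get('database', 'host', default='localhost')
try:
    password = conf.get('database', 'password', fail_if_missing=True)
except NoOptionError:
    password = None  # the option is genuinely absent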
Example #12
    def on_resource_fetched(self, r):
        self.stat_lock.acquire()
        try:
            self.fetched_count += 1
        finally:
            self.stat_lock.release()

        msg = "[%s] [%d] [%s] [%s] [%s]" % (
            r.crawl_date, r.hop, r.content_type, r.url, r.parent_url)
        logging.getLogger('resource.fetched').info(msg)

        if r.content_type == 'text/html':
            if self.before_parse_filter.check(r):
                if isinstance(r.html, unicode):
                    logging.warning(
                        'messaging.py >> unexpected unicode html [%s]' % r.url)

                links = html_helper.get_links(r.html)
                parent_url = r.url

                if isinstance(parent_url, unicode):
                    logging.warning(
                        'messaging.py >> unexpected unicode parent_url [%s]' %
                        parent_url)

                hop = r.hop + 1
                for link_pair in links:
                    link = link_pair[0]
                    anchor_text = link_pair[1]

                    if isinstance(link, unicode):
                        logging.warning(
                            'messaging.py >> unexpected unicode link [%s]' %
                            link)

                    url = urlparse.urljoin(parent_url, link)

                    if isinstance(url, unicode):
                        logging.warning(
                            'messaging.py >> unexpected unicode url [%s]' %
                            url)

                    try:
                        new_r = resource.Resource(None, parent_url, url, False,
                                                  hop, r.batch, anchor_text)
                    except BadResourceError:
                        pass  # ignored
                    else:
                        self.discover_resource(new_r)
Example #13
    def hit(self):
        '''
        spawn resource using random choice for ammo/fuel/shield
        '''
        returnval = 0
        if randint(0, 1) < 1:
            new = resource.Resource()
            self.choicelist = [
                new.shield, new.fuel, new.shieldandammo, new.fuelandammo,
                new.ammo
            ]
            l = self.choicelist[randint(0, 4)]
            l(self.pos)
            returnval = new

        else:
            returnval = False

        self.kill()
        return returnval
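A hedged caller sketch for hit() above; crate stands for the sprite instance the method is defined on.

# Hypothetical caller; hit() returns either a freshly spawned Resource or False,
# and always kills the sprite it was called on.
dropped = crate.hit()
if dropped:
    print("a pickup was spawned")
else:
    print("nothing dropped")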
Example #14
 def handle(self):
     # I CAN HANDLE MYSELF OKAY
     self.data = self.request.recv(1024).strip()
     print("Processing data from client {} in thread {}".format(
         self.client_address[0],
         threading.current_thread().name))
     decoded = pickle.loads(self.data)
     print(decoded)
     r, u = None, None
     try:
         r = resource.Resource(decoded[0]["label"], decoded[0]["serial_no"],
                               decoded[0]["key"], None)
         u = update.Update(r, decoded[1], decoded[0]["value"])
     except KeyError:
         # received a malformed dict; bail out rather than dereference None below
         print("Malformed dict from {}!".format(self.client_address[0]))
         return
     print("Decoded pickle into update object and resource object")
     u.update_resource()
     resources[decoded[0]["label"]] = r.toDict()
     self.finish()
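A hedged client-side sketch producing the payload the handler above expects: a pickled two-element list whose first item carries the resource fields and whose second item is consumed as decoded[1]. Host, port and field values are assumptions.

# Hypothetical client; the field names mirror what the handler reads from decoded[0].
import pickle
import socket

payload = pickle.dumps([
    {"label": "pump-1", "serial_no": "SN-42", "key": "secret", "value": 17},
    "increment",  # consumed as decoded[1]; the real update type is unknown
])
with socket.create_connection(("127.0.0.1", 9999)) as sock:
    sock.sendall(payload)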
Example #15
    def can_be_instanciated(cls):
        """Class method used by the Resource to check that the Conf can be instantiated. 
        
        These two objects have a special contract as they are strongly coupled. 
        A Resource can use the Conf to check for a Resource and the Conf uses a Resource to read Conf filepath.
        
        :returns: True if a Conf file path is defined and the file exists.
           
        :except Error: Base Conf Error
        
        """
        #No conf info passed to the resource so the Resource will not look into the conf (to avoid recursive search)
        the_res = resource.Resource(cls.CLINAME, cls.ENVNAME)

        filepath = the_res.getValue(aRaiseException=False)

        if (filepath is not None) and os.path.exists(filepath):
            return True

        return False
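A hedged usage sketch combining this guard with the constructor shown in Example #10.

# Hypothetical usage; fall back to an unconfigured Conf when no file can be located.
if Conf.can_be_instanciated():
    conf = Conf()
else:
    conf = Conf(use_resource=False)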
Example #16
 def __init__(self, platforms, executableList, workerReqDict):
     self.platforms=platforms
     self.executableList=executableList
     self.workerReqDict=workerReqDict
     maxPlatform=platforms[0]
     # get the platform with the biggest number of cores.
     ncores_max=0
     for platform in platforms:
         if (platform.hasMaxResource('cores')):
             ncores_now=platform.getMaxResource('cores')
             if ncores_now > ncores_max:
                 ncores_max=ncores_now
                 maxPlatform=platform
     self.usePlatform=maxPlatform
     # construct a list of all max. resources with settings for the
     # platform we use.
     self.used=dict()
     for rsrc in self.usePlatform.getMaxResources().itervalues():
         self.used[rsrc.name]=resource.Resource(rsrc.name, 0)
     self.type=None
     self.depleted=False
Example #17
    def move_loser(self):

        if self.pos in self.land_owned:
            self.land_owned.remove(self.pos)
        neighbors = self.model.grid.get_neighborhood(self.pos, True)

        moved = False
        for cell in neighbors:
            contents = self.model.grid.get_cell_list_contents(cell)

            if len(contents) == 1 and isinstance(contents[0], r.Resource):
                if contents[0].owned == self.unique_id:
                    self.model.grid.move_agent(self, cell)
                    moved = True
                elif contents[0].owned == "":
                    self.model.grid.move_agent(self, cell)
                    moved = True
                    contents[0].owned = self.unique_id
                    self.land_owned.append(cell)

            elif len(contents) == 0:
                self.model.grid.move_agent(self, cell)
                moved = True
                res = r.Resource(self.model.area_owned + 1, self.model, 0,
                                 self.unique_id)
                #iterate up the unique_id for resources
                self.model.area_owned += 1
                self.model.grid.place_agent(res, cell)
                self.land_owned.append(cell)

        if not moved:
            #print ("NOWHERE TO GO I SURRENDER")
            #conquered --remove form schedule
            #self.model.schedule.remove(self)
            #remove from grid
            #self.model.grid.remove_agent(self)
            self.conquered.append(cell)
Example #18
 def parse(self, line, batch=0):
     """Parse a line to get a resource.
     
     Make sure there is no ending '\n' in the input line
     
     Args:
         line: input string without ending '\n'
     
     Returns:
         a resource to crawl (return None if fail)
     """
     
     if line.endswith('\n'):
         return None
         
     parts = line.split('\t')
     if len(parts) == 1:
         url = parts[0]
         parent_url = None
     elif len(parts) == 2:
         # http://www.cse.psu.edu/~shzheng/sigkdd-2007.pdf	http://www.cse.psu.edu/~shzheng/            
         url = parts[0]
         if parts[1] == '':
             parent_url = None
         else:
             parent_url = parts[1]
     else:
         return None
     
     try:        
         r = resource.Resource(None, parent_url, url, True, 0, batch)
     except BadResourceError:
         r = None
     
     return r
     
Example #19
 def __init__(self, src_dir):
     resource_mgr = resource.Resource()
     resource_mgr.load(src_dir)
     self._graphic_mgr = graphics.Graphics(resource_mgr)
     self._last_tick = 0
     self._current_turn = 0
     self._last_tile = None
     self._all_tiles = []
     self._clock = pygame.time.Clock()
     self._players = [
         players.HumanPlayer(self._graphic_mgr,
                             players.Player.POSITION.SOUTH, u'南大'),
         players.AIPlayer(self._graphic_mgr, players.Player.POSITION.EAST,
                          u'东大'),
         players.AIPlayer(self._graphic_mgr, players.Player.POSITION.NORTH,
                          u'北大'),
         players.AIPlayer(self._graphic_mgr, players.Player.POSITION.WEST,
                          u'西大')
     ]
     self._graphic_mgr.catch_players(self._players)
     self._graphic_mgr.clock = self._clock
     self._can_draw = False
     self._cache_text = None
     self.reset()
Example #20
			print "You cannot have less than 1 thread. The current number of threads is " + str(len(threads)) + "."
		else:
			for i in range(0, argument):
				threads.pop().stopThread()
			print "Stopped " + str(argument) + " threads."
	except ValueError:
		print "The stopThread command only accepts integers as a parameter."

print "Welcome to the Spanish Inquisition's implementation of the gossip protocol."
looper = True
inCommunity = False
resourcesMap = {}
with open('resources/directory.txt', 'rb') as csvfile:
	text = csv.reader(csvfile, delimiter = '|')
	for row in text:
		res = resource.Resource(row[0], row[1])
		resourcesMap[res.id.getAsHex()] = res
mainThread = main.Main(threadName = "monty", resourcesMap = resourcesMap)
map = {"alerts": alerts, "createThread": createThread, "completedResources": completedResources, "countThreads": countThreads, "download": download, "exit": exit, "find": find, "foundResources": foundResources, "help": help, "join": join, "leave": leave, "query": query, "stopThread": stopThread}
threads = [mainThread]
while looper:
	if not mainThread.alertQueue.empty():
		print "You have " + str(mainThread.alertQueue.qsize()) + " alerts. Type alerts to see them." #TODO see if there's a better way to let the user know about new alerts
	input = raw_input(">")
	command = input
	argument = ""
	if " " in input:
		input = input.partition(" ") #puts the input into an array with anything before the first space in [0], a space in [1], and the rest in [2]
		command = input[0]
		argument = input[2]
	if command in map:
Example #21
 def __init__(self,
              dir,
              executable,
              args,
              addPriority=0,
              minVersion=None,
              maxVersion=None,
              id=None,
              task=None,
              running=False,
              workerServer=None,
              env=None,
              outputFiles=None):
     """Create a command
         dir = the directory relative to the task directory
         executable = the name of the executable object
         args = the arguments for the executable
         files = the needed input files (as a list of CommandInputFile     
                                         objects) 
         minVersion = the minimum version for the command's executable
         maxVersion = the maximum version for the executable
         id = the command's ID (or None to generate one at random).
         task = the task associated with the command
         running = whether the cmd is running
         workerServer = the server the worker executing this command is 
                        connected to.
         env = environment variables to set: a dict of values (or None).
         outputFiles = the list of any expected output files.
        """
     #self.taskID=taskID
     if dir is not None and task is not None:
         self.dir = os.path.relpath(
             dir, task.activeInstance.getFullBasedir())
     else:
         self.dir = dir
     self.executable = executable
     self.files = []
     self.outputFiles = outputFiles
     self.args = args
     self.minVersion = minVersion
     self.maxVersion = maxVersion
     self.addPriority = addPriority
     self.task = None
     self.running = running  # whether the command is running
     self.id = id
     self.workerServer = workerServer
     # the return code
     self.returncode = None
     # the cpu time in seconds used by this command
     self.cputime = 0.
     # dictionary of reserved resource objects
     self.reserved = {}
     # dictionary of required resource objects
     # all commands need a CPU to run on
     cores = resource.Resource('cores', 1)
     self.minRequired = {cores.name: cores}
     # dictionary of max allowed resource objects
     self.maxAllowed = {}
     self.task = None
     self.env = env
     cpc.server.queue.cmdqueue.QueueableItem.__init__(self)
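A hedged construction sketch for the command class above; the class name Command, the directory and the arguments are assumptions, and the surrounding cpc modules must be importable.

# Hypothetical construction; "Command" and all values below are assumptions.
cmd = Command(dir='run/01',
              executable='mdrun',
              args=['-deffnm', 'prod'],
              env={'OMP_NUM_THREADS': '4'})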
Example #22
 def setReserved(self, name, value):
     """Add a resource to the reserved list."""
     rsrc = resource.Resource(name, value)
     self.reserved[name] = rsrc
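A hedged usage sketch for setReserved() above; cmd stands for a command instance and the resource names and values are invented.

# Hypothetical usage; resource names and values are invented.
cmd.setReserved('cores', 4)
cmd.setReserved('memory', 2048)  # units are whatever the platform reports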
Example #23
def main(confc):

    start = time.time()
    logger = logging.getLogger("main")

    # global configurations
    config = Config_global(confc["jobid"], skip_check=True)
    config_cdicsv = Config_cdicsv()

    # create document writer
    writer = output.CiteSeerWriter([config.crawlrepo, config_cdicsv.crawler])

    # create URL filter
    urlfilter = urlfilters.URLFilter(
        blacklistfile=config.blacklistfile,
        domainblacklistfile=config.domainblacklistfile)

    # create document type filter
    mimetypefilter = Mime_Type_Filter(config.mtypes)

    # create document logger (for this middleware)
    doclogger = Doc_Logger(os.getenv('HOSTNAME'), mimetypefilter)

    # parse csv file
    logger.info('parsing csv file...')
    gs = load_cdicsv(confc["csv_file"])

    # number counter
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all', len(gs))
    counters.newCounter('saved_New')
    counters.newCounter('saved_Duplicate')
    counters.newCounter('saved_bitDuplicate')  # bitwise duplicate
    counters.newCounter('filtered_all')
    counters.newCounter('filtered_URLFilter')
    counters.newCounter('filtered_MimetypeFilter')
    counters.newCounter('failed_all')
    counters.newCounter('failed_FileNotFound')  # if inputs are pdf/ps
    counters.newCounter('failed_PDFFileNotFound')  # if inputs are gzipped
    counters.newCounter('failed_BadURL')  # Bad URL
    counters.newCounter('failed_SaveError')  # errors when saving docs

    # if required to visit database, check that tables are created
    if config_cdicsv.save_toDB:
        cdb = crawldb.CrawlDB()
        logger.info("database: " + cdb.dbname)
        # create document and parent table if they do not exist
        cdb.createTables()

    # loop over each document from csv file
    doci = 0
    max_length = len(str(len(gs)))
    for g in gs:
        accepted = False
        doci += 1

        # get resource variable
        is_seed = True if g["hop"] == 0 else False
        try:
            r = resource.Resource(None, g['parenturl'],g['url'],\
                is_seed,g['hop'],batch=0,anchor_text="")
        except TypeError, e:
            logger.error(e)
            os._exit(1)
        except BadResourceError, e:
            logger.error("Error parsing Url : " + g["url"])
            counters.addCounter('failed_BadURL')
            continue
Example #24
    def move(self):

        best_move = [[], [], [], []]

        neighbors = self.model.grid.get_neighborhood(self.pos, True)

        for cell in neighbors:
            contents = self.model.grid.get_cell_list_contents(cell)

            #get list of possible moves with no one
            if len(contents) == 0:
                best_move[2].append(cell)

            elif len(contents) == 1 and isinstance(contents[0], r.Resource):
                # Grid is not owned
                if contents[0].owned == "":
                    if len(best_move[0]) == 0:
                        best_move[0].append((cell, contents[0]))
                    else:
                        if best_move[0][0][1].value < contents[0].value:
                            best_move[0][0] = (cell, contents[0])
                            #print ("best replaced")
                #grid cell is owned by you
                elif contents[0].owned == self.unique_id:
                    best_move[3].append((cell, contents[0]))
                #grid cell is owned by someone else
                elif contents[0].owned != self.unique_id and contents[
                        0].owned != "":
                    if len(best_move[1]) > 0:
                        if best_move[1][0][1].value < contents[0].value:
                            best_move[1][0] = (cell, contents[0])
                    else:
                        best_move[1].append((cell, contents[0]))
            #if contents > 1 must be another country
            elif len(contents) > 1:
                #print (len(contents), contents)
                #account for situation where foreigner is on grid
                better = None
                country = False
                res = False
                for item in contents:
                    if type(item) is type(self):
                        #print (type(self))
                        #print (item.unique_id)
                        better = (item.unique_id, )
                        country = True
                    if isinstance(item, r.Resource):
                        res = True
                        if len(best_move[1]) > 0:
                            #print (best_move[1][0])
                            if best_move[1][0][1].value < item.value:
                                best_move[1][0] = (cell, item)
                        else:
                            best_move[1].append((cell, item))

                #to prevent issues of two countries being on same grid
                if country and res:
                    best_move[1][0] += better
                else:
                    print("ISSUES ", contents)

        if len(best_move[0]) > 0:
            #claim thy land
            best_move[0][0][1].owned = self.unique_id
            #move to new spot
            self.model.grid.move_agent(self, best_move[0][0][0])
            #print ("Agent has seized resource ", best_move[0][0][1].value )
            self.land_owned.append(best_move[0][0][0])

        elif len(best_move[1]) > 0 and len(best_move[0]) == 0:
            #print (best_move[1])
            self.negotiate(best_move[1])
            #print ("Too War!")

        elif len(best_move[2]) > 0:
            self.model.grid.move_agent(self, best_move[2][0])
            res = r.Resource(self.model.area_owned + 1, self.model, 0,
                             self.unique_id)
            #iterate up the unique_id for resources
            self.model.area_owned += 1
            self.model.grid.place_agent(res, best_move[2][0])
            self.land_owned.append(best_move[2][0])
            #print ("Agent has claimed ", best_move[2][0] )
        else:
            self.model.grid.move_agent(self, best_move[3][0][0])
Example #25
display_width = game_maze.get_display_width()
display_height = game_maze.get_display_length()
tile_size = game_maze.get_tile_size()
map_grid = game_maze.gridit()
map_obstacles = game_maze.map_obstacles()

GRASS = pygame.image.load(r'./assets/grass.png')
GRASS = pygame.transform.scale(GRASS, (tile_size, tile_size))
GLASS = pygame.image.load(r'./assets/glass.png')
GLASS = pygame.transform.scale(GLASS, (tile_size, tile_size))
FIRE = pygame.image.load(r'./assets/fire.png')
FIRE = pygame.transform.scale(FIRE, (tile_size, tile_size))
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)

resource_maze = resource.Resource("Glass", GLASS, 10, True)
resource_location = resource_maze.random_resource_spread(game_maze)

agent = character.Character()
agent.transform_avatar(tile_size)
car = agent.get_avatar()
car_length, car_width = agent.get_avatar_dim()
crashed = agent.get_state()

gameDisplay = pygame.display.set_mode((display_width, display_height))
pygame.display.set_caption('reinforced game')

clock = pygame.time.Clock()


def draw_rectangles(point, texture):
Example #26
    def _replace_vars(self, a_str, location, lineno=-1):
        """ private replacing all variables. A variable will be in the from of %(group[option]).
            Multiple variables are supported, ex /foo/%(group1[opt1])/%(group2[opt2])/bar
            Nested variables are also supported, ex /foo/%(group[%(group1[opt1]].
            Note that the group part cannot be substituted, only the option can. This is because of the Regular Expression _SUBSGROUPRE that accepts only words as values.
            
            Args:
               index. The index from where to look for a closing bracket
               s. The string to parse
               
            Returns: the final string with the replacements
        
            Raises:
               exception NoSectionError if the section cannot be found
        """

        toparse = a_str

        index = toparse.find("%(")

        # if found opening %( look for end bracket)
        if index >= 0:
            # look for closing brackets while counting openings one
            closing_brack_index = self._get_closing_bracket_index(
                index, a_str, location, lineno)

            #print "closing bracket %d"%(closing_brack_index)
            var = toparse[index:closing_brack_index + 1]

            dummy = None

            m = self._SUBSGROUPRE.match(var)

            if m is None:
                raise SubstitutionError(
                    lineno, location,
                    "Cannot match a group[option] in %s but found an opening bracket (. Malformed expression "
                    % (var))
            else:

                # recursive calls
                g = self._replace_vars(m.group('group'), location, -1)
                o = self._replace_vars(m.group('option'), location, -1)

                try:
                    # if it is in ENVGROUP then check ENV variables with a Resource object
                    # if it is in CLIGROUP then check CLI argument with a Resource object
                    # otherwise check in standard groups
                    if g == Conf._ENVGROUP:
                        r = resource.Resource(CliArgument=None, EnvVariable=o)
                        dummy = r.getValue()
                    elif g == Conf._CLIGROUP:
                        r = resource.Resource(CliArgument=o, EnvVariable=None)
                        dummy = r.getValue()
                    else:
                        dummy = self._sections[g][self.optionxform(o)]
                except KeyError, _:  #IGNORE:W0612
                    raise SubstitutionError(
                        lineno, location,
                        "Property %s[%s] doesn't exist in this configuration file \n"
                        % (g, o))

            toparse = toparse.replace(var, dummy)

            return self._replace_vars(toparse, location, -1)
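A hedged illustration of the %(group[option]) syntax handled above; conf stands for a Conf instance, the section and option names are made up, and the spelling of the ENV group is an assumption.

# Hypothetical call; assume a section "paths" whose option "data" holds "archive".
expanded = conf._replace_vars("/foo/%(paths[data])/bar", "example.cfg")
# -> "/foo/archive/bar"
# Assuming Conf._ENVGROUP is spelled "ENV", this variant would read $HOME instead:
# conf._replace_vars("%(ENV[HOME])/cache", "example.cfg")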
Example #27
 def get_resource(self, name):
     return resource.Resource(self, name)
Example #28
def startup(verbal=False):

  # record start time 
  tic = time.time()

  # create on-screen information print object
  infoprinter = printinfo.printInfo()

  # create document writer
  writer = output.CiteSeerWriter([runconfig.cdilite['docdir'],runconfig.cdilite['crawler']])

  # create document logger (for this middleware)
  doclogger = Doc_Logger(os.getenv('HOSTNAME'))

  # create general log configers and config logs
  logconfiger = Log_Configer()
  logconfiger.config_loggers()

  # parse log file
  g = create_instance(runconfig.cdilite['logparser'],runconfig.cdilite['doclist'])
  g.extract_info(logsummaryfile=runconfig.cdilite['logsummaryfile'])


  # prepare to write xml file
  impl = getDOMImplementation()
  xDoc = impl.createDocument(None, "response", None)
  root = xDoc.documentElement
  root.setAttribute("location", runconfig.cdilite['docdir'])



  # number counter
  counters = counter.Counter()
  counters.newCounter('all')
  counters.setCounter('all',g.nline['parsed'])
  counters.newCounter('failed_BadURL')
  counters.newCounter('failed_FileNotFound')

  # save the current path 
  currentPath = os.getcwd()

  # loop over each information tuple extracted from document list file 
  # each tuple contains the name of the pdf files
  if verbal: print "counters.all = ",counters.all
  for i in range(0,counters.all):
    print ''
    sys.stdout.write("\r")
    sys.stdout.write("%9d/%-9d  " % (i+1,counters.all))
    sys.stdout.write("\n")
    infoprinter.printPara('URL',g.rel_path[i])

    code = None
    
    # get resource variable "r"
    if verbal: print 'g.parent_url[i] = ',g.parent_url[i]
    if verbal: print 'g.url[i] = ',g.url[i]
    try:
        r = resource.Resource(code,g.parent_url[i],g.url[i],\
            g.is_seed[i],g.hop[i],runconfig.batch,g.anchor_text[i])
    except BadResourceError,e:
        infoprinter.printStatus('URL Parse','fail')
        counters.addCounter('failed_BadURL')
        continue

    r.crawl_date = g.crawl_date[i]
    r.content_type = g.content_type[i]
    infoprinter.printPara('mime-type',r.content_type)

    # where crawled documents are saved
    # retrieve the local hard copy of document
    infile = os.path.join(currentPath,runconfig.cdilite['docdir'],g.rel_path[i])

    inpdf = infile # e.g., filepath/file.pdf 
    if '%' in inpdf: 
      inpdf = urllib.unquote(inpdf) #unquote escapes, e.g., %7 -> ~

    # try to remove the last back slash from the full path 
    # or try to see if fullpath/index.html exists, maybe that is the file
    # if document file still cannot be found, write into log and skip it
    inpdfpath = inpdf
    if not os.path.exists(inpdfpath):
        msg = doclogger.generator('FileNotFound',infile,r)
        logging.getLogger('document').info(msg)
        counters.addCounter('failed_FileNotFound')
        infoprinter.printStatus('Document file found','no')

    # inpdfpath is the "corrected" file path
    inpdf = inpdfpath
    infoprinter.printStatus('Document file found','yes')

    # load pdf file content to calculate encryption
    f = open(inpdf,'r')
    data = f.read()
    f.close()

    # calculate SHA1
    r.content_sha1 = hashlib.sha1(data).hexdigest() 
  
    try:
        # only save metadata file
        writer.save_met(r,inpdf) 
    except IOError,e:
        msg = doclogger.generator('IOErrorSave',infile,r)
        logging.getLogger('document').info(msg)
Example #29
def startup(verbal=True):

  # record start time 
  tic = time.time()

  # create on-screen information print object
  infoprinter = printinfo.printInfo()

  # check configurations
  if not checkConfig():
    infoprinter.printStatus('Configuration check','fail')
    raise SystemExit("Change your configurations in runconfig.py")
  else:
    infoprinter.printStatus('Configuration check','ok')

  # create document writer
  writer = output.CiteSeerWriter([runconfig.outputdir,runconfig.crawler])

  # create URL filter
  urlfilter = urlfilters.URLFilter(blacklistfile=runconfig.blacklistfile,domainblacklistfile=runconfig.domainblacklistfile)

  # create document type filter
  mimetypefilter = Mime_Type_Filter(runconfig.allow_doc_type)

  # create document content filter
  doccontentfilter = filter_doc.Doc_Content_Filter(runconfig.tempdir)

  # create text extractor 
  textextractor = textextract.Text_Extractor()

  # create document logger (for this middleware)
  doclogger = Doc_Logger(os.getenv('HOSTNAME'),mimetypefilter)

  # create general log configers and config logs
  logconfiger = Log_Configer()
  logconfiger.config_loggers()

  # parse log file
  print 'parsing log file...'
  g = create_instance(runconfig.logparser,runconfig.logfile)
  g.extract_info(logsummaryfile=runconfig.logsummaryfile,skip=runconfig.skip,nloglines=runconfig.nloglines)
  print 'parsing log file finished'

  # number counter
  counters = counter.Counter()
  counters.newCounter('all')
  counters.setCounter('all',g.nline['parsed'])
  counters.newCounter('saved_New') 
  counters.newCounter('saved_Duplicate')
  counters.newCounter('filtered')
  counters.newCounter('filtered_URLFilter')
  counters.newCounter('filtered_MimetypeFilter')
  counters.newCounter('filtered_DocContentFilter')
  counters.newCounter('failed')
  counters.newCounter('failed_TextExtract')
  counters.newCounter('failed_FileNotFound')    # if inputs are pdf/ps
  counters.newCounter('failed_PDFFileNotFound') # if inputs are gzipped
  counters.newCounter('failed_BadURL') 		# Bad URL
  counters.newCounter('failed_SaveError')	# if error occurs when saving docs

  # create output directory if it does not exist
  if not os.path.exists(runconfig.outputdir):
      os.makedirs(runconfig.outputdir)

  # create temp directory if it does not exist
  if not os.path.exists(runconfig.tempdir):
      os.makedirs(runconfig.tempdir)

  # a mapping file is automatically generated if only export files 
  # (no db input) 
  if runconfig.toggle_save_doc_separate:
    open(runconfig.tempdir+'mapping.csv','w')

  # if required to visit database, make sure that database and tables 
  # are created
  if runconfig.toggle_save_to_db:
    cdb = crawldb.CrawlDB()
    # print database name
    infoprinter.printPara('Database name',cdb.dbname)
    # create document and parent table if they do not exist
    cdb.createTables()

  # save the current path 
  savedPath = os.getcwd()

  # loop over each information tuple extracted from crawler log file 
  for i in range(0,counters.all):
    print ''
    sys.stdout.write("\r")
    sys.stdout.write("%9d/%-9d  " % (i+1,counters.all))
    sys.stdout.write("\n")
    infoprinter.printPara('URL',g.url[i])

    # apply the URL filter
    if runconfig.toggle_urlfilter:
        if not urlfilter.check(g.url[i]):
            msg = "%s %s %s" % ('URLRejected',urlfilter.rejectreason,g.url[i])
            logging.getLogger('document').info(msg)
            counters.addCounter('filtered_URLFilter')
            if verbal: infoprinter.printStatus('URL accepted','no')
            continue
    
    # get resource variable "r"
    try:
        code = None
        r = resource.Resource(code,g.parent_url[i],g.url[i],\
            g.is_seed[i],g.hop[i],runconfig.batch,g.anchor_text[i])
    except BadResourceError,e:
        infoprinter.printStatus('URL Parse','fail')
        counters.addCounter('failed_BadURL')
        continue

    # url length cannot be longer th
    r.crawl_date = g.crawl_date[i]
    r.content_type = g.content_type[i]
    infoprinter.printPara('mime-type',r.content_type)

    # where crawled documents are saved
    # retrieve the local hard copy of document
    # If files are downloaded using "lftp", input file path should be 
    # constructed by appending the relative file path to "conf.inputdir"
    if runconfig.crawler.lower() == 'lftp':
        infile = runconfig.inputdir+g.rel_path[i]   
    elif runconfig.crawler.lower() == 'heritrix' and runconfig.saver.lower() == 'mirror':
        infile = runconfig.inputdir+r.host+r.path   
    else: 
        infile = runconfig.inputdir+g.rel_path[i]

    # apply doctype_filter, which checks the document mimetype type
    mimetypefilter_ok = mimetypefilter.check(r)
    if not mimetypefilter_ok: 
      msg = doclogger.generator('DocumentTypeNotAccepted',infile,r)
      logging.getLogger('document').info(msg)
      counters.addCounter('filtered_MimetypeFilter')
      if verbal: infoprinter.printStatus('Accepted document type','no')
      continue
    else:
      if verbal: infoprinter.printStatus('Accepted document type','yes')

    r.ext = mimetypefilter.ext

    # check if document is already in db
    # if it returns False, continue to next step
    # if it returns True,log it and skip processing this one
    # However, if the overwrite_file toggle is set, we need to continue to the
    # next step anyway
    if runconfig.toggle_save_to_db:
        recordExist = cdb.checkRecord(runconfig.dbt_document,md5=r.md5)
        if not recordExist:
            infoprinter.printStatus('New document','yes')
        else:
            msg = doclogger.generator('saved_Duplicate',infile,r)
            logging.getLogger('document').info(msg)
            counters.addCounter('saved_Duplicate')
            infoprinter.printStatus('New document','no')
            if not runconfig.overwrite_file:
                continue
   
    # check existence of input file, if the name part of "infile" 
    # contains wild card characters e.g., %, 
    # try to recover it to normal 
    # "infile" is the original full file path from crawl log (may contain 
    # escape characters and may by in zipped format) 
    # "inpdf" contains original file names saved in disk (no escape characters, 
    # and in acceptable file format, e.g., PDF/postscript)
    # "inpdfpath" contains the correct path of input file name, see below. in 
    # some cases, url paths are not correctly normalized 
    # and need to be corrected. For example, if the last segment does not 
    # contain ".", it is taken as a directory and a "/" is 
    # added, while this is incorrect. 
    inpdf = infile # e.g., filepath/file.pdf 
    if '%' in inpdf: 
      inpdf = urllib.unquote(inpdf) #unquote escapes, e.g., %7 -> ~

    # try to remove the last back slash from the full path 
    # or try to see if fullpath/index.html exists, maybe that is the file
    # if document file still cannot be found, write into log and skip it
    inpdfpath = inpdf
    if not os.path.exists(inpdfpath):
        inpdfpath = inpdf[:-1]
        if not os.path.exists(inpdfpath):
            inpdfpath = inpdf+'index.html'
            if not os.path.exists(inpdfpath):
                # try to download the paper using "wget"
                # downloaded paper is saved to temporary directory and renamed
                # to "wget.pdf". Note that we just temporarily add an extension
                # of ".pdf", but it may not be a PDF file. If it is not,
                # it will be filtered out by the doc_type_filter later.
                # add quotes to url
                # if download is not successful, we mark this document as
                # "FileNotFound"
                wgeturl = '"'+r.url+'"'
                wgetfile = os.path.join(runconfig.tempdir,"wget."+r.ext)
                wgetcmd = "wget "+wgeturl+" -O "+wgetfile

                # first remove the existing "wget.pdf" if it exists
                if os.path.exists(wgetfile):
                    rmcmd = "rm -rf "+wgetfile
                    cmdoutput = commands.getoutput(rmcmd)
                # download document using "wget", time out is 5 min
                cmdoutput = timeoutpython.run(wgetcmd, shell=True, timeout=300)
                # if function returns "-9", download failed, skip this doc
                #if cmdoutput[0] == -9:
                #    print cmdoutput
                #cmdoutput = commands.getoutput(wgetcmd)
                #print 'cmdoutput = ',cmdoutput

                # Check if file downloaded successfully
                if (not os.path.exists(wgetfile)) or (cmdoutput[0] == -9):
                    msg = doclogger.generator('FileNotFound',infile,r)
                    logging.getLogger('document').info(msg)
                    counters.addCounter('failed_FileNotFound')
                    if verbal:
                        infoprinter.printStatus('Document file found','no')
                        infoprinter.printPara('infile',infile)
                    continue
                else:
                    inpdfpath = wgetfile

    # inpdfpath is the "corrected" file path
    inpdf = inpdfpath
    if verbal:
        infoprinter.printStatus('Document file found','yes')
        infoprinter.printPara('Document file path',inpdf)
    
    # If input file is in zipped format, assuming it is a .tar.gz file
    # we do the following things
    # * copy the .tar.gz file to a temp directory 
    # * decompress it using tar -xvzf 
    # * find the .pdf file inside the unzipped 
    # * do whatever we want ...
    # * remove everything in the temp directory 
    cmd_file = 'file -i "'+infile+'"'
    cmdoutput = commands.getoutput(cmd_file)
    #t = cmdoutput.split(' ')
    #infilemimetype = t[-1]
    #infoprinter.printStatus('MIME-type',infilemimetype)
    #print cmdoutput
    if 'application/x-gzip' in cmdoutput:
      infoprinter.printStatus('MIME-type','application/x-gzip')
      cmd_rm = 'rm -rf '+runconfig.tempdir+'*'
      cmdoutput = commands.getoutput(cmd_rm)

      cmd_cp = 'cp "'+infile+'" '+runconfig.tempdir
      cmdoutput = commands.getoutput(cmd_cp)

      # sometimes, for some (unknown) reasons, the "-C" option
      # does not work well for "tar" command, so we cd to the
      # temp directory, extract files from the .tar.gz and return
      # to the main directory
      #
      # obtain the file name from the full path: infilename
      infilename = os.path.split(infile)[1]
      os.chdir(runconfig.tempdir)
      cmd_tar = 'tar -xvzf "'+infilename+'"'
      cmdoutput = commands.getoutput(cmd_tar)
      os.chdir(savedPath)
  
      # only look for pdf files
      for root,dirs,files in os.walk(runconfig.tempdir):
        inpdffound = False
        for f in files:
          if f.endswith('pdf'):
            inpdf = os.path.join(root,f)
            inpdffound = True
            break
        if inpdffound:
          break
      if not inpdffound: 
        msg = doclogger.generator('PDFFileNotFound',infile,r)
        logging.getLogger('document').info(msg)
        counters.addCounter('failed_PDFFileNotFound')
        infoprinter.printStatus('PDF Document file found','no')
        continue
    
    # document file is found
    # check if need to use doc_content_filter
    if runconfig.toggle_doc_content_filter:
      
      # extract text from documents 
      filefmt = mimetypefilter.doctype

      if verbal: infoprinter.printPara('Mime type',filefmt)
      # acceptable formats: e.g., "application/pdf","application/postscript" 
      textextractmsg = textextractor.extract(inpdf,filefmt) 

      # classify document if text is extracted successfully
      if 'Success' in textextractmsg:
          infoprinter.printStatus('Extract text','success')
          # not a paper, log it and proceed it to the next
          if doccontentfilter.Decider(textextractor.outtxtfile,inpdf) == -1:
              counters.addCounter('filtered_DocContentFilter')
              msg = doclogger.generator('NotAcademic',infile,r)
              logging.getLogger('document').info(msg)
              infoprinter.printStatus('Accepted document content','no')
              continue
          else:
              infoprinter.printStatus('Accepted document content','yes')
      else: # text extraction fails, report error and write it into log file
          infoprinter.printStatus('Extract text','fail')
          counters.addCounter('failed_TextExtract')
          msg = doclogger.generator(textextractmsg,infile,r)
          logging.getLogger('document').info(msg)
          continue

    # determine the FINAL mimetype of this document, if it is 
    # "application/pdf", use ".pdf" as the extension, if it is 
    # "application/postscript", use ".ps" as the extension
    # "inpdf" is the final pdf file to be accepted (after re-download, after
    # filters)
    if mimetypefilter.doctype == 'application/pdf':
        r.ext = 'pdf'
    elif mimetypefilter.doctype == 'application/postscript':
        r.ext = 'ps'
    else:
        cmd_file = 'file -i "'+inpdf+'"'
        cmdoutput = commands.getoutput(cmd_file)
        if 'application/postscript' in cmdoutput:
            r.ext = 'ps'
        else:
            infoprinter.printStatus('Recognizable mimetype','no')
            sys.exit(cmdoutput)


    # write document information into database
    # database settings can be found at settings.py
    # read file content and calculate the SHA1 value
    # read PDF document information
    # In some cases, the actual PDF was downloaded but the URL ends with a 
    # slash: for example
    # dial.academielouvain.be/vital/access/services/Download/boreal:12685/PDF_01/
    # the downloaded file is renamed as "index.html" though it is PDF file. In this case,
    # we try "inpdf/index.html" to see if we can actually identify this file.
    # If this does not work, it could be that Heritrix downloads the file as "PDF_01", this
    # happens for the URL below, when the actual file is named "75" under the 78/ directory
    # www.br-ie.org/pub/index.php/rbie/article/viewFile/78/75/
    #
    # If we still cannot find any file, we have to skip it
    try:
        f = open(inpdf,'r')
        data = f.read()
        f.close()
    except IOError:
        # just remove the last "slash"
        try:
            f = open(inpdf[:-1],'r')
            data = f.read()
            f.close()
        except IOError:
            try:
                f = open(inpdf+'index.html','r')
                data = f.read()
                f.close()
            except IOError:
                msg = doclogger.generator('FileNotFound',infile,r)
                logging.getLogger('document').info(msg)
                counters.addCounter('failed_FileNotFound')
                infoprinter.printStatus('Document file found','no')
                continue

    # If required to save crawled documents separately,
    # do not save to db, only save document to outputdir
    # Files are named using numbers starting from 1
    # A mapping file is automatically generated
    filenamebody = id_to_fname(i+1,r.ext)
    outdoc = runconfig.outputdir+filenamebody
    if runconfig.toggle_save_doc_separate:
      mappingline = outdoc+','+infile # may not be inpdf
      ff = open(outdoc,'w')
      ff.write(data)
      ff.close()
      try:
        f = open(outdoc)
        msg = doclogger.generator('saved_New',infile,r)
        logging.getLogger('document').info(msg)
        infoprinter.printStatus('Document saved','yes')
        # number of saved documents 
        counters.addCounter('saved_New')
      except IOError,e:
        infoprinter.printStatus('Document saved','no')
        raise SystemExit(e)