Esempio n. 1
0
    def __init__(self, entry_point='http://learnair.media.mit.edu:8000/', \
            crawl_delay=1000, filter_keywords=('previous','next')):
        '''Initialize crawler state for a crawl that starts at entry_point.

        entry_point = starting URL for crawl
        crawl_delay = how long, in ms, before accessing/crawling a new resource
        filter_keywords = extra keywords added after the built-in filter
              words; the argument is only read, never modified (the default
              is a tuple so it cannot be shared/mutated between instances)'''

        self.entry_point = entry_point #entry point URI

        #initialize crawl variables
        self.current_uri = entry_point #keep track of current location
        self.current_uri_type = 'entry_point'
        self.crawl_delay = crawl_delay #in milliseconds
        #NOTE(review): presumably the depth limit used by the search methods,
        #overwritten by the caller before a search starts -- confirm
        self.degrees = 0
        self.return_if_found = False
        self.createform_type = None

        #NOTE(review): 0 is passed as the set's persistence argument;
        #presumably found resources are forgotten immediately -- confirm
        #against TimeDecaySet
        self.found_resources = TimeDecaySet(0)

        #initialize filter word list for crawling: built-ins first, then the
        #caller supplied keywords in their original order
        self.filter_keywords = ['edit','create','self','curies','websocket']
        self.filter_keywords.extend(filter_keywords)
        log.debug( "filter keywords %s", self.filter_keywords)

        log.info( "-----------------------------------------------" )
        log.info( "Crawler Initialized." )
        log.info( "Entry Point: %s", self.entry_point )
        log.info( "-----------------------------------------------" )
Esempio n. 2
0
    def query_link_array(self, crawl_links):
        '''Return the hrefs of the links in crawl_links that satisfy the
        active query, in the order they were given.

        Type filter (only when qry_resource_type is set): the lowercased
        link type must equal qry_resource_type, or the link must come from
        an item list and its lowercased type must be `in` one of the
        qry_resource_plural entries. A matching 'createform' link is
        rejected when createform_type is set and the lowercased type of the
        current node is not in it.

        Title filter (only when qry_resource_title is set): the lowercased
        link title must equal qry_resource_title.

        Both filters are always evaluated, so a title match is logged even
        for a link whose type was rejected.'''

        wanted_type = self.qry_resource_type
        wanted_title = self.qry_resource_title

        if wanted_type is not None:
            log.info('SEARCH_LIST: looking for singular: %s', wanted_type)
            log.info('SEARCH_LIST: looking for plural as item_list: %s', self.qry_resource_plural)
        if wanted_title is not None:
            log.info('SEARCH_LIST: looking for title: %s', wanted_title)

        hits = []

        for candidate in crawl_links:

            log.debug('SEARCH_LIST: checking if %s matches query criteria', candidate['href'])

            #---- type filter ----
            type_ok = True
            if wanted_type is not None:

                #plural names only count for links taken from an item list
                plural_hit = False
                for plural in self.qry_resource_plural:
                    if candidate['type'].lower() in plural:
                        plural_hit = True
                        break
                from_list_hit = plural_hit and candidate['from_item_list']

                if not (from_list_hit or candidate['type'].lower() == wanted_type):
                    #searching on type, and this one is something else
                    type_ok = False
                else:
                    #createForms additionally need the right parent type
                    wrong_parent = (
                        'createform' == candidate['type'].lower()
                        and self.createform_type is not None
                        and self.current_uri_type.lower() not in self.createform_type)

                    if wrong_parent:
                        type_ok = False
                    else:
                        log.info('SEARCH_LIST: matched search_type %s', candidate['type'])

            #---- title filter ----
            title_ok = True
            if wanted_title is not None:
                if candidate['title'].lower() != wanted_title:
                    #searching on title, and this one is something else
                    title_ok = False
                else:
                    log.info('SEARCH_LIST: matched search_title %s', candidate['title'])

            #only links that passed every active filter are reported
            if type_ok and title_ok:
                hits.append(candidate['href'])

        return hits
Esempio n. 3
0
    def query_link_array(self, crawl_links):
        '''Filter crawl_links down to the hrefs that were queried for.

        When qry_resource_type is set, a link is kept only if its
        lowercased type equals it, or if the link comes from an item list
        and its lowercased type is `in` one of the qry_resource_plural
        entries. When qry_resource_title is set, a link is kept only if its
        lowercased title equals it. Unset (None) criteria are ignored.
        Returns the surviving hrefs in input order.'''

        type_query = self.qry_resource_type
        title_query = self.qry_resource_title

        if type_query is not None:
            log.info('SEARCH_LIST: looking for singular: %s', type_query)
            log.info('SEARCH_LIST: looking for plural as item_list: %s', self.qry_resource_plural)
        if title_query is not None:
            log.info('SEARCH_LIST: looking for title: %s', title_query)

        found = []

        for entry in crawl_links:

            log.debug('SEARCH_LIST: checking if %s matches query criteria', entry['href'])

            #start optimistic; each active criterion may veto the link
            keep = True

            if type_query is not None:
                #plural names are only accepted for item-list links
                listed_as_plural = any(
                    entry['type'].lower() in plural
                    for plural in self.qry_resource_plural
                ) and entry['from_item_list']

                if listed_as_plural or entry['type'].lower() == type_query:
                    log.info('SEARCH_LIST: matched search_type %s', entry['type'])
                else:
                    keep = False

            #the title is checked (and a hit logged) even after a type veto
            if title_query is not None:
                if entry['title'].lower() == title_query:
                    log.info('SEARCH_LIST: matched search_title %s', entry['title'])
                else:
                    keep = False

            if keep:
                found.append(entry['href'])

        return found
Esempio n. 4
0
    def __init__(self, entry_point='http://learnair.media.mit.edu:8000/', \
            cache_table_mask_length=8, track_search_depth=5, \
            found_set_persistence=720, crawl_delay=1000, filter_keywords=('previous','next')):
        '''Initialize crawler state for a crawl that starts at entry_point.

        entry_point = starting URL for crawl
        cache_table_mask_length = handed to CrawlerCacheWithCollisionHistory
              (NOTE(review): presumably the number of hash bits used to index
              the cache table -- confirm)
        track_search_depth = how many steps in path we save to retrace when
              at a dead end
        found_set_persistence = how long to keep a resource URI in memory
              before it is allowed to be returned as a new resource again,
              i.e. before the crawler 'forgets' it has seen something and
              resubmits it in the queue to be processed.
              NOTE(review): earlier notes disagreed on the unit (minutes,
              with 720 = 12 hours, vs. seconds) -- confirm against
              TimeDecaySet
        crawl_delay = how long, in ms, before accessing/crawling a new resource
        filter_keywords = extra keywords added after the built-in filter
              words; the argument is only read, never modified (the default
              is a tuple so it cannot be shared/mutated between instances)'''

        self.entry_point = entry_point #entry point URI

        #initialize crawl variables
        self.current_uri = entry_point #keep track of current location
        self.current_uri_type = 'entry_point'
        self.current_uri_title = 'entry_point'
        self.crawl_history = LeakyLIFO(track_search_depth) #keep track of past
        self.crawl_delay = crawl_delay #in milliseconds
        self.found_resources = TimeDecaySet(found_set_persistence)

        #initialize cache
        self.cache = CrawlerCacheWithCollisionHistory(cache_table_mask_length)

        #initialize queue/zmq variables
        self.q = None
        self.zmq = None

        self.find_called = False

        #initialize filter word list for crawling: built-ins first, then the
        #caller supplied keywords in their original order
        self.filter_keywords = ['edit','create','self','curies','websocket']
        self.filter_keywords.extend(filter_keywords)
        log.debug( "filter keywords %s", self.filter_keywords)

        log.info( "-----------------------------------------------" )
        log.info( "Crawler Initialized." )
        log.info( "Entry Point: %s", self.entry_point )
        log.info( "-----------------------------------------------" )
Esempio n. 5
0
    def apply_hal_curies(json, del_curies=True):
        '''Find and apply CURIES relationship shortcuts (namespace/rel
        definitions) to other links in the json object. I.E., if we have
        a CURIES "http://learnair.media.mit.edu/rels/{rel}" with name "ch",
        and a link further called 'ch:sites', the link is moved so that
        'ch:sites' is now "http://learnair.media.mit.edu/rels/sites".

        json = parsed HAL document; it is modified in place and returned.
        del_curies = whether to remove the CURIES section of _links after
              applying it to the document (True), or leave it in (False).

        The returned document always has a '_links' entry: when it is
        missing or is not a dict it is replaced by an empty dict. A document
        that has links but no CURIES keeps its links untouched.'''

        #make sure there is a '_links' mapping to work on
        try:
            links = json['_links']
        except (KeyError, TypeError, IndexError):
            links = None

        if not isinstance(links, dict):
            log.warning( "CURIES: No CURIES found" )
            json['_links'] = {}
            return json

        if 'curies' not in links:
            #nothing to expand; the existing links are left as they are
            log.warning( "CURIES: No CURIES found" )
            return json

        curies = links['curies']
        if isinstance(curies, dict):
            #tolerate a single CURIE given as an object instead of a list
            curies = [curies]

        for curie in curies: #compare each curies name...
            try:
                prefix = curie['name'] + ':'
                template = curie['href']
            except (KeyError, TypeError):
                log.warning( 'CURIES: skipping malformed CURIE %s', curie )
                continue

            #...with each link relationship. Iterate over a snapshot of the
            #keys: relations are moved while looping, and changing a dict
            #during iteration fails or skips keys.
            for key in list(links):

                if not key.startswith(prefix):
                    continue

                #combine the curies & key to make the full resource link.
                #A function is used as replacement so the relation name is
                #inserted literally (no backslash/group interpretation).
                rel_name = key[len(prefix):]
                newIndex = re.sub(r"\{.*\}", lambda _match: rel_name, template)

                #move the resource to the full resource link
                links[newIndex] = links.pop(key)
                log.debug( 'CURIES: %s moved to %s', key, newIndex )

        #delete curies section of json if desired
        if del_curies:
            del links['curies']
            log.debug( 'CURIES: CURIES Resource applied fully & removed.' )

        return json
Esempio n. 6
0
    def apply_hal_curies(json, del_curies=True):
        '''Find and apply CURIES relationship shortcuts (namespace/rel
        definitions) to other links in the json object. I.E., if we have
        a CURIES "http://learnair.media.mit.edu/rels/{rel}" with name "ch",
        and a link further called 'ch:sites', the link is moved so that
        'ch:sites' is now "http://learnair.media.mit.edu/rels/sites".

        json = parsed HAL document; it is modified in place and returned.
        del_curies = whether to remove the CURIES section of _links after
              applying it to the document (True), or leave it in (False).

        A document without a usable '_links' dict, or without a 'curies'
        entry in it, is returned unchanged after logging a warning.'''

        #locate the link section; anything that is not a dict holding a
        #'curies' entry means there is nothing to apply
        try:
            links = json['_links']
        except (KeyError, TypeError, IndexError):
            links = None

        if not isinstance(links, dict) or 'curies' not in links:
            log.warning( "CURIES: No CURIES found" )
            return json

        curies = links['curies']
        if isinstance(curies, dict):
            #tolerate a single CURIE given as an object instead of a list
            curies = [curies]

        for curie in curies: #compare each curies name...
            try:
                prefix = curie['name'] + ':'
                template = curie['href']
            except (KeyError, TypeError):
                log.warning( 'CURIES: skipping malformed CURIE %s', curie )
                continue

            #...with each link relationship. A snapshot of the keys is
            #iterated because relations are moved inside the loop, and
            #changing a dict while iterating it fails or skips keys.
            for key in list(links):

                #only link relations that use this curies are rewritten
                if not key.startswith(prefix):
                    continue

                #combine the curies & key to make the full resource link;
                #the replacement is a function so the relation name is
                #inserted literally
                rel_name = key[len(prefix):]
                newIndex = re.sub(r"\{.*\}", lambda _match: rel_name, template)

                #move the resource to the full resource link
                links[newIndex] = links.pop(key)
                log.debug( 'CURIES: %s moved to %s', key, newIndex )

        #delete curies section of json if desired
        if del_curies:
            del links['curies']
            log.debug( 'CURIES: CURIES Resource applied fully & removed.' )

        return json
Esempio n. 7
0
    def crawl_node(self):
        '''Crawl the current URI once and pick the URI to crawl next.

        The current URI is recorded in the cache and downloaded, its links
        are extracted (CURIES applied, then get_external_links), matching
        resources are pushed to the queue, and current_uri / current_uri_type
        / current_uri_title are updated to the next node: a random uncached
        link if there is one, otherwise the previous node from crawl_history
        (or the entry point when no history is left).

        Returns True when the caller should keep crawling, False when the
        crawl should stop (entry point unreachable, entry point without any
        crawlable link, or a match was pushed while find_called is set).'''

        #put uri in cache now that we're crawling it, make a note of collisions
        if self.cache.put_and_collision(self.current_uri):
            log.info( 'HASH COLLISION: value overwritten in hash table.' )

        #debug: print state of cache after updating
        log.debug('CACHE STATE: %s', self.cache._cache)

        #download the current resource
        try:
            req = requests.get(self.current_uri)
            log.info( '%s downloaded.', self.current_uri )

        #downloading the current resource failed
        except requests.exceptions.ConnectionError:

            log.warning( 'URI "%s" unresponsive, moving back to previous link...',\
                    self.current_uri )

            #if we failed to download the entry point, give up
            if self.current_uri == self.entry_point:
                log.error( 'URI is entry point, no previous link.  Try again when' \
                        + ' the entry point URI is available.' )
                return False

            #if it wasn't the entry point, go back in our search history
            try:
                prev = self.crawl_history.pop()
                self.current_uri = prev['href']
                self.current_uri_type = prev['type']
                self.current_uri_title = prev['title']
                return True

            #if we don't have any history left, go back to the entry point.
            #The failure mode of an exhausted history is not visible here,
            #so any ordinary exception is treated as 'no history'.
            except Exception:
                log.info( 'exhausted depth of search history, back to entry point' )
                self.current_uri = self.entry_point
                self.current_uri_type = "entry_point"
                self.current_uri_title = "entry_point"
                return True

        #end downloading resource

        #put request in JSON form, apply CURIES, get links
        resource_json = req.json()
        log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)

        req_links = self.apply_hal_curies(resource_json)['_links']
        crawl_links = self.get_external_links(req_links)

        #crawl_links is a 'flat' list list[:][fields]
        #fields are href, type, title, in_cache, from_item_list

        log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history,' + \
                'self, create/edit, ws, itemlist flattened): %s', crawl_links)

        #find the uris/resources that match search criteria!
        if self.qry_extra is None:
            #we don't need to actually download the link to see if it matches
            matching_uris = self.query_link_array(crawl_links)
        else:
            #we only have enough information to tell if the current node matches
            matching_uris = self.query_current_node(resource_json)

        #... and send them out!!
        if (self.push_uris_to_queue(matching_uris) and self.find_called):
            return False #end crawl if we found one and 'find' was called

        #select next link!!!!

        #get uncached links
        uncached_links = [x for x in crawl_links if not x['in_cache']]
        log.info('CRAWL: %s LINKS UNCACHED OF %s LINKS FOUND', \
                len(uncached_links), len(crawl_links) )

        #what we leave behind when moving forward to a new node
        here = {'href':self.current_uri, 'type':self.current_uri_type, 'title':self.current_uri_title}

        if uncached_links:
            #we have uncached link(s) to follow! randomly pick one.
            target = random.choice(uncached_links)

            self.crawl_history.push(here)
            self.current_uri = target['href']
            self.current_uri_type = target['type']
            self.current_uri_title = target['title']

        else:
            #we don't have any uncached options from this node. Damn.
            log.info('CRAWL: no new links available here, crawling back up history')

            #special case of being at the entry point
            if (self.current_uri_type == 'entry_point'):
                #double check we have something to crawl
                if (len(crawl_links) > 0):

                    log.info('CRAWL: no uncached links from entrypoint, resetting cache')
                    self.cache.clear() # clear cache

                    #randomly select node from crawl_links
                    target = random.choice(crawl_links)

                    self.crawl_history.push(here)
                    self.current_uri = target['href']
                    self.current_uri_type = target['type']
                    self.current_uri_title = target['title']

                else:
                    log.error('CRAWL: NO CRAWLABLE LINKS DETECTED AT ENTRY_POINT!!!!')
                    return False

            #not at entry point, time to try and move back up in history.
            #This must not run after the entry-point branch above: it would
            #pop the entry we just pushed and undo the move.
            else:
                try:
                    prev = self.crawl_history.pop()
                    self.current_uri = prev['href']
                    self.current_uri_type = prev['type']
                    self.current_uri_title = prev['title']

                except Exception: #no history left, not at entry point- jump to entry point
                    log.info('CRAWL: crawling back up history, but exhausted history.  Jump to entrypoint.')
                    self.current_uri= self.entry_point
                    self.current_uri_type = 'entry_point'
                    self.current_uri_title = 'entry_point'

        log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        log.info('CRAWL: crawling to %s : %s', self.current_uri_title.upper(), self.current_uri)
        log.info('CRAWL: type: %s', self.current_uri_type)
        log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

        #tell the caller to keep going
        return True
Esempio n. 8
0
    def bfs(self):
        '''Breadth-first crawl starting at current_uri.

        Each round sleeps for crawl_delay ms, downloads current_uri,
        extracts its links (CURIES applied, then flatten_filter_link_array),
        pushes the links matching the query to the queue and moves on to the
        shallowest link still waiting. Links found on a node at depth d are
        queued in link_tree[d]; nodes at depth self.degrees are queried but
        their links are not queued, which bounds the crawl depth.

        Returns None when there is nothing left to crawl, when the entry
        point cannot be reached, or when a match was pushed while
        return_if_found is set. Every URI is downloaded at most once.'''

        current_depth = 0
        visited = set()
        link_tree = [[] for k in range(self.degrees)]

        while True:

            time.sleep(self.crawl_delay/1000.0)

            #download the current resource
            try:
                req = requests.get(self.current_uri)
                log.info( '%s downloaded.', self.current_uri )

                #put request in JSON form, apply CURIES, get links
                resource_json = req.json()
                log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)

            #downloading the current resource failed
            except requests.exceptions.ConnectionError:

                log.warning( 'URI "%s" unresponsive, ignoring',\
                        self.current_uri )

                #if we failed to download the entry point, give up
                if self.current_uri == self.entry_point:
                    log.error( 'URI is entry point, no previous link.  Try again when' \
                            + ' the entry point URI is available.' )
                    return

                #otherwise carry on as if the resource had no links
                resource_json = {'_links':[]}

            #end downloading resource

            #get links from this resource
            req_links = self.apply_hal_curies(resource_json)['_links']
            crawl_links = self.flatten_filter_link_array(req_links)

            #crawl_links is a 'flat' list list[:][fields]
            #fields are href, type, title, in_cache, from_item_list

            log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history,' + \
                    'self, create/edit, ws, itemlist flattened): %s', crawl_links)

            #find the uris/resources that match search criteria!
            matching_uris = self.query_link_array(crawl_links)
            #... and send them out!!
            if (self.push_uris_to_queue(matching_uris) and self.return_if_found):
                return #return if we are using find_first and we found one

            #push all uris that don't match visited to proper depth list
            visited.add(self.current_uri)

            if current_depth < self.degrees:
                link_tree[current_depth].extend(
                        x for x in crawl_links if x['href'] not in visited)

            log.debug('BFS Array: %s', link_tree)
            log.debug('VISITED: %s', visited)

            #select next current_uri and current_uri_type by looking through
            #link_tree (shallowest level first), if empty return
            next_link = None

            for index, level in enumerate(link_tree):

                #a link may have been queued by several parents before it was
                #crawled; drop the copies that have been visited since
                while level and level[0]['href'] in visited:
                    del level[0]

                if level:
                    next_link = level.pop(0)
                    current_depth = index + 1
                    break

            if next_link is None:
                return

            self.current_uri = next_link['href']
            self.current_uri_type = next_link['type']

            log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
            log.info('CRAWL: moving to %s', self.current_uri)
            log.info('CRAWL: type: %s', self.current_uri_type)
            log.info('CRAWL: depth: %s', current_depth)
            log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
Esempio n. 9
0
    def crawl_node(self):
        '''Perform one crawl step from current_uri and select the next node.

        Steps: record current_uri in the cache, download it, extract its
        links (CURIES applied, then get_external_links), push the resources
        that match the query to the queue, then update current_uri /
        current_uri_type / current_uri_title. The next node is a randomly
        chosen uncached link when one exists; otherwise the crawler steps
        back through crawl_history, falling back to the entry point when the
        history is used up.

        Returns True to continue crawling and False to stop (entry point
        unreachable, no crawlable links at the entry point, or a match was
        pushed while find_called is set).'''

        #put uri in cache now that we're crawling it, make a note of collisions
        if self.cache.put_and_collision(self.current_uri):
            log.info('HASH COLLISION: value overwritten in hash table.')

        #debug: print state of cache after updating
        log.debug('CACHE STATE: %s', self.cache._cache)

        #download the current resource
        try:
            req = requests.get(self.current_uri)
            log.info('%s downloaded.', self.current_uri)

        #downloading the current resource failed
        except requests.exceptions.ConnectionError:

            log.warning( 'URI "%s" unresponsive, moving back to previous link...',\
                    self.current_uri )

            #if we failed to download the entry point, give up
            if self.current_uri == self.entry_point:
                log.error( 'URI is entry point, no previous link.  Try again when' \
                        + ' the entry point URI is available.' )
                return False

            #if it wasn't the entry point, go back in our search history
            try:
                prev = self.crawl_history.pop()
                self.current_uri = prev['href']
                self.current_uri_type = prev['type']
                self.current_uri_title = prev['title']
                return True

            #if we don't have any history left, go back to the entry point.
            #How an empty history fails is not visible here, so any ordinary
            #exception counts as 'no history left'.
            except Exception:
                log.info(
                    'exhausted depth of search history, back to entry point')
                self.current_uri = self.entry_point
                self.current_uri_type = "entry_point"
                self.current_uri_title = "entry_point"
                return True

        #end downloading resource

        #put request in JSON form, apply CURIES, get links
        resource_json = req.json()
        log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)

        req_links = self.apply_hal_curies(resource_json)['_links']
        crawl_links = self.get_external_links(req_links)

        #crawl_links is a 'flat' list list[:][fields]
        #fields are href, type, title, in_cache, from_item_list

        log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history,' + \
                'self, create/edit, ws, itemlist flattened): %s', crawl_links)

        #find the uris/resources that match search criteria!
        if self.qry_extra is None:
            #we don't need to actually download the link to see if it matches
            matching_uris = self.query_link_array(crawl_links)
        else:
            #we only have enough information to tell if the current node matches
            matching_uris = self.query_current_node(resource_json)

        #... and send them out!!
        if (self.push_uris_to_queue(matching_uris) and self.find_called):
            return False  #end crawl if we found one and 'find' was called

        #select next link!!!!

        #get uncached links
        uncached_links = [x for x in crawl_links if not x['in_cache']]
        log.info('CRAWL: %s LINKS UNCACHED OF %s LINKS FOUND', \
                len(uncached_links), len(crawl_links) )

        if uncached_links:
            #we have uncached link(s) to follow! randomly pick one.
            chosen = random.choice(uncached_links)

            self.crawl_history.push({
                'href': self.current_uri,
                'type': self.current_uri_type,
                'title': self.current_uri_title
            })
            self.current_uri = chosen['href']
            self.current_uri_type = chosen['type']
            self.current_uri_title = chosen['title']

        elif self.current_uri_type == 'entry_point':
            #no uncached options, and we are at the entry point (special
            #case: there is nothing above us to go back to)
            log.info(
                'CRAWL: no new links available here, crawling back up history')

            #double check we have something to crawl
            if not len(crawl_links) > 0:
                log.error(
                    'CRAWL: NO CRAWLABLE LINKS DETECTED AT ENTRY_POINT!!!!'
                )
                return False

            log.info(
                'CRAWL: no uncached links from entrypoint, resetting cache'
            )
            self.cache.clear()  # clear cache

            #randomly select node from crawl_links
            chosen = random.choice(crawl_links)

            self.crawl_history.push({
                'href': self.current_uri,
                'type': self.current_uri_type,
                'title': self.current_uri_title
            })
            self.current_uri = chosen['href']
            self.current_uri_type = chosen['type']
            self.current_uri_title = chosen['title']

        else:
            #no uncached options and not at entry point, time to try and
            #move back up in history. Kept separate from the entry-point
            #case: popping there would undo the move that was just made.
            log.info(
                'CRAWL: no new links available here, crawling back up history')

            try:
                prev = self.crawl_history.pop()
                self.current_uri = prev['href']
                self.current_uri_type = prev['type']
                self.current_uri_title = prev['title']

            except Exception:  #no history left- jump to entry point
                log.info(
                    'CRAWL: crawling back up history, but exhausted history.  Jump to entrypoint.'
                )
                self.current_uri = self.entry_point
                self.current_uri_type = 'entry_point'
                self.current_uri_title = 'entry_point'

        log.debug(
            '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        log.info('CRAWL: crawling to %s : %s', self.current_uri_title.upper(),
                 self.current_uri)
        log.info('CRAWL: type: %s', self.current_uri_type)
        log.debug(
            '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

        #tell the caller to keep going
        return True