Example #1
# NOTE: this example targets Python 2 (urllib2, robotparser, dict.iteritems).
# The Indexer class is assumed to be defined elsewhere in the project.
import re
import robotparser
import urllib2
from collections import Counter
from urlparse import urljoin

from bs4 import BeautifulSoup


class Crawler(object):
    def __init__(self):
        self.visited_url = set()
        self.root_url = None
        self.indexer = Indexer()

    def pass_robot_txt(self, url):
        # Consult the site's robots.txt before fetching a page.
        robot = robotparser.RobotFileParser()
        robot.set_url(urljoin(self.root_url, '/robots.txt'))
        robot.read()

        return robot.can_fetch('*', url)

    def define_root_url(self, url):
        self.root_url = url

    def add_included_suburls(self, soup):
        # Collect site-relative links found on the page and resolve them
        # against the root URL.
        urls = set()

        refs = soup.findAll('a')

        for ref in refs:
            try:
                href = ref['href']
            except KeyError:
                print("Anchor tag has no href attribute")
                continue

            # Skip empty or single-character hrefs such as '/' or '#'.
            if len(href) < 2:
                continue

            # Skip absolute and protocol-relative links.
            if '//' in href:
                continue

            # Keep only site-relative links.
            if href[0] != '/':
                continue

            if self.root_url in href:
                urls.add(href)
            else:
                urls.add(self.root_url + href)

        return urls

    def get_pair_word_and_count(self, soup):
        """Return a Counter of the visible words on the page."""
        def visible(element):
            if element.parent.name in [
                    'head', 'script', 'style', '[document]'
            ]:
                return False

            if re.match('<!--.*-->', str(element)):  # skip HTML comments
                return False

            if element == '\n':
                return False

            return True

        data = soup.findAll(text=True)

        visible_text = filter(visible, data)
        words = list()

        for text in visible_text:
            result = re.findall(r'[0-9a-z]+', text.lower())

            for res in result:
                words.append(res)

        self.indexer.add_words(set(words))

        return Counter(words)

    def visit(self, url, width, depth):
        """Fetch and index `url`, then follow at most `width` links per page,
        descending at most `depth` levels."""

        if depth < 0:
            return

        if not self.pass_robot_txt(url):
            raise Exception("robots.txt disallows fetching this URL")

        current_url = url
        self.indexer.add_url(current_url)

        depth = depth - 1

        try:
            html = urllib2.urlopen(url).read()
        except Exception:
            print("Can't open this URL: " + url)
            return

        soup = BeautifulSoup(html, 'html.parser')

        urls = self.add_included_suburls(soup)

        for suburl in urls:
            if suburl in self.visited_url:
                continue

            # Stop following links on this page once the width budget is spent.
            if width == 0:
                break

            self.visited_url.add(suburl)
            width = width - 1
            self.visit(suburl, width, depth)

        words = self.get_pair_word_and_count(soup).iteritems()

        self.indexer.create_index(words, current_url)

    def run(self, url, width, depth):
        self.define_root_url(url)
        self.visit(url, width, depth)
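
The crawler relies on an Indexer class that is not part of this listing. Below is a minimal sketch of how the class above might be driven: the stub Indexer is an assumption, with only its method names (add_url, add_words, create_index) taken from the calls made by the crawler; its internals are purely illustrative.

# Hypothetical driver (Python 2). The stub Indexer is an assumption; it only
# records what the crawler hands it, so the example can run without the
# project's real Indexer implementation.
from collections import defaultdict


class Indexer(object):
    def __init__(self):
        self.urls = []                 # every URL visited by the crawler
        self.words = set()             # vocabulary seen so far
        self.index = defaultdict(set)  # word -> set of URLs containing it

    def add_url(self, url):
        self.urls.append(url)

    def add_words(self, words):
        self.words.update(words)

    def create_index(self, word_counts, url):
        # word_counts is an iterable of (word, count) pairs for one page.
        for word, _count in word_counts:
            self.index[word].add(url)


if __name__ == '__main__':
    crawler = Crawler()
    # Follow at most 3 links per page and descend at most 2 levels.
    crawler.run('http://example.com', width=3, depth=2)
    print("%d pages indexed" % len(crawler.indexer.urls))

A real indexer would presumably persist these structures; the stub only keeps them in memory for the sake of the example.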