Example #1
import re

def crawl_web(seed):
    # get_page(), get_all_links(), and Storage are assumed to be defined
    # elsewhere; a sketch of plausible implementations follows this example.
    tocrawl = set([seed])   # frontier of URLs still to visit
    crawled = []            # already-visited URLs (a set would make the lookup O(1))
    database = Storage()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            text = content.get_text()
            text = re.sub(r"[-/']", ' ', text)
            text = re.sub(
                ur'[\u0932\u094b\u0917\u092a\u0930\u093f\u0926\u0943\u0936\u094d\u092f\u0938\u094d\u0925\u093e\u0928\u0915\u0930\u094d\u092e\u091a\u093e\u0930\u0940\u201c\u2013\u2019\u092b\u201d]',
                ' ', text)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                database.add_link(url, outlink)
            punctuation = ".?;,!()|:\""
            for word in text.split():
                word = word.lstrip(punctuation).rstrip(punctuation).lower()
                search = re.search(r"[^a-z0-9]", word)
                if not search and word != "":
                    database.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.append(url)
    return database
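
All of the examples call three helpers that are defined elsewhere: get_page(url), get_all_links(content), and Storage. The .get_text() and .title usage suggests the page object is BeautifulSoup-like; under that assumption (requests + BeautifulSoup, with a purely in-memory Storage), a minimal sketch of the helpers might look like this:

import requests
from bs4 import BeautifulSoup
from collections import defaultdict

def get_page(url):
    # Fetch and parse a page; the examples call .get_text() and .title
    # on the result, so return a BeautifulSoup object.
    return BeautifulSoup(requests.get(url, timeout=10).text, "html.parser")

def get_all_links(content):
    # Every href on the page; a real crawler would also resolve relative
    # URLs (urllib.parse.urljoin) and filter out non-HTTP schemes.
    return [a["href"] for a in content.find_all("a", href=True)]

class Storage:
    # In-memory stand-in for whatever store the examples write into.
    def __init__(self):
        self.links = defaultdict(list)   # url -> outgoing links
        self.index = defaultdict(set)    # word -> urls it occurs on
        self.titles = {}                 # url -> page title (Example #2 only)

    def add_link(self, url, outlink):
        self.links[url].append(outlink)

    def add_word_occurrence(self, url, word):
        self.index[word].add(url)

    def add_title(self, url, title):
        self.titles[url] = title

One design note: tocrawl.pop() on a set removes an arbitrary element, so the crawl order is unspecified; a collections.deque popped from the left would give breadth-first order instead.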
Example #2
import re

def crawl_web(seed):
    tocrawl = set([seed])
    crawled = []
    database = Storage()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            print(url)   # progress trace
            soup = get_page(url)
            text = soup.get_text()
            text = re.sub(r"[-/']", ' ', text)
            # Strip Devanagari characters plus curly quotes and the en dash
            # (Python 3's re understands \uXXXX escapes in patterns).
            text = re.sub(r'[\u0932\u094b\u0917\u092a\u0930\u093f\u0926\u0943\u0936\u094d\u092f\u0938\u094d\u0925\u093e\u0928\u0915\u0930\u094d\u092e\u091a\u093e\u0930\u0940\u201c\u2013\u2019\u092b\u201d]', ' ', text)
            outlinks = get_all_links(soup)
            # soup.title is None on pages without a <title> tag.
            url_title = str(soup.title.string) if soup.title else ""
            database.add_title(url, url_title)
            for outlink in outlinks:
                database.add_link(url, outlink)
            punctuation = ".?;,!()|:\""
            for word in text.split():
                word = word.lstrip(punctuation).rstrip(punctuation).lower()
                search = re.search(r"[^a-z0-9]", word)
                if not search and word != "":   # != compares values; "is not" tests identity
                    database.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.append(url)
    return database
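
With the sketched helpers above in scope, a run might be driven like this (the seed URL is hypothetical, and .titles/.index belong to the sketched Storage, not any real backend; the crawl is unbounded, so real use would cap depth or page count):

db = crawl_web("http://example.com/")           # hypothetical seed URL
print(db.titles.get("http://example.com/"))     # title recorded by add_title
print(sorted(db.index.get("python", set())))    # pages where "python" occurs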
Example #3
import re

def crawl_web(seed):
    tocrawl = set([seed])
    crawled = []
    database = Storage()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            print(url)   # progress trace
            content = get_page(url)
            text = content.get_text()
            text = re.sub(r"[-/']", ' ', text)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                database.add_link(url, outlink)
            punctuation = ".?;,!()|:\""
            for word in text.split():
                word = word.lstrip(punctuation).rstrip(punctuation).lower()
                search = re.search(r"[^a-z0-9]", word)
                if not search and word != "":
                    database.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.append(url)
    return database
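
The guard above uses != rather than is not: is tests object identity, and an equal string built at runtime is generally a different object, so an identity check against "" only works by accident of CPython's string interning. A quick illustration:

x = "hel"
y = x + "lo"           # built at runtime, not interned
print(y == "hello")    # True: equal by value
print(y is "hello")    # False: distinct objects (SyntaxWarning on Python 3.8+)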