Code Example #1
File: urlutils.py  Project: moorthys/searchengine
import robotparser
import urlparse

import canonicalization

def iscrawlable(page_url):
    """Return True if the host's robots.txt allows fetching page_url."""
    try:
        rp = robotparser.RobotFileParser()
        parsedurl = urlparse.urlsplit(page_url)
        # Canonical robots.txt URL for the page's host.
        robotsurl = canonicalization.mycanonicalization(parsedurl.scheme + "://" + parsedurl.netloc + "/robots.txt")
        rp.set_url(robotsurl)
        rp.read()
        return rp.can_fetch("*", page_url)
    except IOError:
        # If robots.txt cannot be fetched, err on the side of not crawling.
        return False
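
A quick usage sketch (the URL is only a placeholder): iscrawlable consults the host's robots.txt and returns False when robots.txt cannot be read.

print iscrawlable("http://www.example.com/index.html")  # True if "*" may fetch it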
Code Example #2
File: testcanon.py  Project: moorthys/searchengine
__author__ = 'Moorthy'

import canonicalization

print canonicalization.mycanonicalization("HTTP://www.Example.com/SomeFile.html")
print canonicalization.mycanonicalization("http://www.example.com:80")
print canonicalization.mycanonicalization("http://www.example.com/a/../c.html")
print canonicalization.mycanonicalization("http://www.example.com/a.html#anything")
print canonicalization.mycanonicalization("http://www.example.com//a.html")
print canonicalization.mycanonicalization("http://www.example.com/c.html?a=10")
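
The canonicalization module itself does not appear on this page. The sketch below is one plausible reading of mycanonicalization, inferred only from the six test cases above: lowercase the scheme and host, drop the default port, resolve dot segments, collapse duplicate slashes, drop the fragment, and keep the query string. The real implementation in moorthys/searchengine may differ.

import posixpath
import urlparse

def mycanonicalization(url):
    parts = urlparse.urlsplit(url)
    scheme = parts.scheme.lower()      # HTTP -> http
    netloc = parts.netloc.lower()      # www.Example.com -> www.example.com
    # Drop the default port for the scheme.
    if (scheme == 'http' and netloc.endswith(':80')) or \
       (scheme == 'https' and netloc.endswith(':443')):
        netloc = netloc.rsplit(':', 1)[0]
    # Resolve "." and ".." segments; an empty path becomes "/".
    path = posixpath.normpath(parts.path) if parts.path else '/'
    # posixpath.normpath keeps a leading "//", so collapse it by hand.
    if path.startswith('//'):
        path = '/' + path.lstrip('/')
    # Keep the query, drop the fragment.
    return urlparse.urlunsplit((scheme, netloc, path, parts.query, ''))

Under this sketch, the six prints above would output, in order: http://www.example.com/SomeFile.html, http://www.example.com/, http://www.example.com/c.html, http://www.example.com/a.html, http://www.example.com/a.html, and http://www.example.com/c.html?a=10.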
Code Example #3
File: urlutils.py  Project: moorthys/searchengine
import traceback
import urllib
import urlparse

from bs4 import BeautifulSoup  # older revisions may use: from BeautifulSoup import BeautifulSoup

import canonicalization
import fileutils

# isvaliddomain, isvalidheader, isrelavantpage, visible and restrictedurlvalue
# are defined elsewhere in urlutils.py.

def downloadandgeturls(page_url, docid):
    """Fetch page_url, store the parsed document under docid, and return its out-links."""
    returnval = {}
    urlfilenamelist = page_url.rsplit('/', 1)
    urlfilename = urlfilenamelist[-1]
    parsedurl = urlparse.urlsplit(page_url)

    if isvaliddomain(parsedurl.netloc):
        try:
            # Reject URLs containing non-ASCII characters.
            page_url.decode('ascii')
        except UnicodeError:
            # A byte string raises UnicodeDecodeError here and a unicode
            # string raises UnicodeeEncodeError; UnicodeError covers both.
            returnval['iserror'] = True
            returnval['ispoliteenabled'] = False
            returnval['errormsg'] = "Non-ASCII characters in URL"
            return returnval

        if iscrawlable(page_url):
            urllist = []
            try:
                f = urllib.urlopen(page_url)
            except IOError:
                returnval['iserror'] = True
                returnval['ispoliteenabled'] = False
                returnval['errormsg'] = "URL failed to open."
                return returnval
            # Raw HTTP response header lines.
            headers = f.info().headers
            if isvalidheader(headers):
                try:
                    html_page = f.read()
                    soup = BeautifulSoup(html_page)
                    for link in soup.findAll('a'):
                        urlvalue = link.get('href')
                        if urlvalue is not None:
                            if urlvalue.startswith("http"):
                                try:
                                    urllist.append(canonicalization.mycanonicalization(urlvalue))
                                except AttributeError:
                                    print "Not valid URL."
                                except ValueError:
                                    print "Invalid IPv6 URL."
                            else:
                                try:
                                    if urlvalue.startswith("//"):
                                        # Protocol-relative link: reuse the page's scheme.
                                        finalurl = canonicalization.mycanonicalization(parsedurl.scheme + ":" + urlvalue)
                                    elif urlvalue.startswith("/"):
                                        # Root-relative link; urlvalue already starts with "/",
                                        # so no extra slashes after the host.
                                        finalurl = canonicalization.mycanonicalization(parsedurl.scheme + "://" + parsedurl.netloc + urlvalue)
                                    else:
                                        # Relative link: swap the last path segment of the page URL.
                                        new_page_url = page_url.replace(urlfilename, urlvalue)
                                        finalurl = canonicalization.mycanonicalization(new_page_url)
                                    if urlvalue not in restrictedurlvalue:
                                        urllist.append(finalurl)
                                except AttributeError:
                                    print "Not valid URL."

                    docdict = {}
                    docdict['page_url'] = page_url
                    docdict['raw_html'] = unicode(html_page, errors='ignore')
                    docdict['outlinks'] = set(urllist)
                    docdict['headerdata'] = headers
                    # Keep only human-visible text (the "visible" helper
                    # filters out script/style/comment nodes).
                    texts = soup.findAll(text=True)
                    visible_texts = filter(visible, texts)
                    single_text = '\n'.join(visible_texts)

                    if not isrelavantpage(single_text):
                        returnval['iserror'] = True
                        returnval['errormsg'] = "No keywords"
                        returnval['ispoliteenabled'] = True
                        return returnval

                    docdict['clean_text'] = single_text
                    fileutils.writedocument(docid, docdict)
                    uniqueurllist = set(urllist)
                    returnval['iserror'] = False
                    returnval['urllist'] = uniqueurllist
                    returnval['ispoliteenabled'] = True
                    return returnval
                except IOError:
                    print traceback.format_exc()
                    returnval['iserror'] = True
                    returnval['errormsg'] = "Invalid URL."
                    returnval['ispoliteenabled'] = False
                    return returnval
            else:
                returnval['iserror'] = True
                returnval['errormsg'] = "Header is invalid."
                returnval['ispoliteenabled'] = False
                return returnval
        else:
            returnval['iserror'] = True
            returnval['errormsg'] = "Not crawlable URL."
            returnval['ispoliteenabled'] = False
            return returnval
    else:
        returnval['iserror'] = True
        returnval['errormsg'] = "Domain is banned"
        returnval['ispoliteenabled'] = False
        return returnval
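
A usage sketch for the function above (the URL and docid are placeholders). The returned dict always carries 'iserror' and 'ispoliteenabled', plus 'errormsg' on failure or 'urllist' (a set of canonicalized out-links) on success.

result = downloadandgeturls("http://www.example.com/index.html", 1)
if result['iserror']:
    print result['errormsg']
else:
    for outlink in result['urllist']:
        print outlink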