Example #1
def join_urls(baseurl, url):
    if url.startswith('//'):
        # protocol-relative URL: assume http
        url = 'http:' + url
        return norms(url)
    elif url.startswith('www.'):
        # bare www host: prepend the http scheme before normalizing
        http_url = SCHEME_HTTP + url
        if is_absolute(http_url):
            return norms(http_url)
        # note: falls through to an implicit None if the result is still not absolute
    elif is_absolute(url):
        return norms(url)
    else:
        # relative URL: resolve against the base before normalizing
        return norms(urljoin(baseurl, url))
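A minimal usage sketch for the function above. SCHEME_HTTP and is_absolute are not shown in the original, so the definitions here are assumptions:

from urllib.parse import urljoin, urlsplit
from urlnorm import norms

SCHEME_HTTP = 'http://'  # assumed constant, not part of the original snippet

def is_absolute(url):
    # assumed helper: absolute means the URL carries both a scheme and a host
    parts = urlsplit(url)
    return bool(parts.scheme and parts.netloc)

print(join_urls('http://example.com/a/', 'b.html'))             # relative: joined with the base
print(join_urls('http://example.com/', '//cdn.example.com/x'))  # protocol-relative: http: prepended
print(join_urls('http://example.com/', 'www.example.org/y'))    # bare www host: scheme prepended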
Example #3
import urlnorm

def normalize(self, item):
    normal = urlnorm.norms(item)
    # remove anchor references
    normal = normal.split('#', 1)[0]
    # remove Apache directory-sorting parameters (?[NDMS]=[AD])
    parm = normal.split('?', 1)[-1]
    if parm and len(parm) == 3 and parm[1] == '=' and parm[2] in 'AD' and parm[0] in 'NDMS':
        normal = normal[:-4]  # strip the trailing "?X=Y", e.g. "?N=A"
    return normal
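Since self is never used here, the method can be exercised as a plain function. The expected outputs assume urlnorm.norms leaves these already-clean URLs unchanged:

print(normalize(None, 'http://example.com/page#intro'))  # -> 'http://example.com/page'
print(normalize(None, 'http://example.com/docs/?N=D'))   # -> 'http://example.com/docs/'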
Example #4
from urllib.parse import urljoin
import urlnorm

def parseForUrlsInHtml(data, location):
    # search for a possible base reference
    bases = searchInForTag(BasePattern, data)
    baseRef = None
    if len(bases) >= 1:
        baseRef = bases[0][0]
        if len(bases) > 1:
            print("more than one base tag found")

    # search for link tags and collect the found URLs (dict used as a set to dedupe)
    ret = {}
    for pattern in LinkPatterns:
        urls = searchInForTag(pattern, data)
        for url, name in urls:
            url = urlnorm.norms(urljoin(get_absolute_url(url, baseRef, location), url))
            ret[url] = 1
    return list(ret.keys())
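None of the helpers (BasePattern, LinkPatterns, searchInForTag, get_absolute_url) appear in the original, so the following is only a rough sketch of what they might look like, assuming the patterns are compiled regexes whose first group captures the URL:

import re

# hypothetical patterns; the real BasePattern and LinkPatterns are not shown
BasePattern = re.compile(r'<base\s+href\s*=\s*["\']([^"\']+)', re.I)
LinkPatterns = [re.compile(r'<a\s[^>]*href\s*=\s*["\']([^"\']+)', re.I)]

def searchInForTag(pattern, data):
    # assumed contract: return (url, name) pairs; name is unused above
    return [(m.group(1), '') for m in pattern.finditer(data)]

def get_absolute_url(url, baseRef, location):
    # assumed behavior: prefer an explicit <base href> over the page's own location
    return baseRef if baseRef else location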
Example #5
import re
import urlnorm
from bs4 import BeautifulSoup

def parse_page(dbinfo, html):
    soup = BeautifulSoup(html, "lxml")
    blog_count = 0
    done = False
    for table in soup.find_all('table'):
        rank = None
        href = None
        auth = None
        for tr in table.find_all('tr'):
            for string in tr.strings:
                if re.match(rank_regex, string) is not None:
                    rank = string.strip()[:-1]
            for td in tr.find_all('td'):
                td_class = td.get('class')
                if td_class is not None and 'site-details' in td_class:
                    site_details = parse_site_details(td)
                    if site_details is not None:
                        href = site_details
                if td_class is not None and 'statistics' in td_class:
                    statistics = parse_statistics(td)
                    if statistics is not None:
                        auth = statistics
        if (rank is not None or href is not None or auth is not None):
            print((rank, href, auth))
        if (rank is not None and href is not None and auth is not None):
            if int(auth) > AUTH_SCORE_THRESHOLD:
                blog = {
                    'link': urlnorm.norms(href),
                    # Alternative construction for archive.org URLs:
                    #               blog = {'link': 'https://web.archive.org' + urlnorm.norms(href),
                    'rank': rank,
                    'auth_score': auth
                }
                store_blog_ranking(dbinfo, blog)
                blog_count += 1
            else:
                done = True
    return (blog_count, done)
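rank_regex, AUTH_SCORE_THRESHOLD, parse_site_details, parse_statistics, and store_blog_ranking come from elsewhere in the project; the stubs below are assumptions, just enough to run the function against a saved page:

import re

rank_regex = re.compile(r'^\s*\d+\.\s*$')  # assumed: rank cells look like "17."
AUTH_SCORE_THRESHOLD = 50                  # assumed cutoff, not from the original

def parse_site_details(td):
    a = td.find('a', href=True)            # assumed: the first link holds the blog URL
    return a['href'] if a else None

def parse_statistics(td):
    text = td.get_text(strip=True)         # assumed: the cell holds a bare integer score
    return text if text.isdigit() else None

def store_blog_ranking(dbinfo, blog):
    print('would store:', blog)            # stand-in for the real database write

# with the stubs in place:
# blog_count, done = parse_page(dbinfo=None, html=open('ranking.html').read())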
Example #7
from urllib.parse import urlsplit, urlunsplit
import urlnorm
def clean_url(url):
    # normalize, then drop the fragment; the query string is kept
    (scheme, netloc, path, query, fragment) = urlsplit(urlnorm.norms(url))
    return urlunsplit((scheme, netloc, path, query, ''))
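For example, assuming urlnorm.norms leaves this already-normalized URL unchanged:

print(clean_url('http://example.com/a?x=1#top'))  # -> 'http://example.com/a?x=1'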
import urllib.parse
import urlnorm
def mycanonicalization(urllink):
    parsedurl = urllib.parse.urlsplit(urllink)
    # rebuild from scheme, host, and path only, dropping query and fragment
    return urlnorm.norms(parsedurl.scheme + "://" + parsedurl.netloc + parsedurl.path)
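And a quick check of this stricter variant, which drops the query string as well:

print(mycanonicalization('http://example.com/a/b?x=1#top'))  # -> 'http://example.com/a/b'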