Example #1
0
File: url.py Project: theduke/jenni
def get_results(text):
    """Collect title and short-link details for every URL found in ``text``.

    URLs are located with the module-level ``url_finder`` pattern and
    normalised through the ``unicode`` helper module and
    ``remove_nonprint``. URLs that start with ``EXCLUSION_CHAR`` are
    skipped.

    Returns a list holding one ``[page_title, url, bitly]`` entry per
    remaining URL, in match order:

    * ``page_title`` is the value returned by ``find_title(url)``, or
      ``None`` when that call raises.
    * ``bitly`` is ``short(url)[0][1]`` when ``bitly_loaded`` is truthy,
      otherwise the URL itself.

    An empty list is returned when ``text`` is empty or ``None``.
    """
    if not text:
        return []

    display = []
    for match in re.findall(url_finder, text):
        # Only the first element of each findall() result is used
        # (presumably the capture group holding the whole URL -- confirm
        # against url_finder).
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)

        # NOTE(review): ``domain`` is not read again in this function. The
        # computation is kept as-is in case getTLD is relied on to reject
        # malformed URLs -- confirm and remove if it is not.
        domain = getTLD(url)
        if "//" in domain:
            domain = domain.split('//')[1]

        if url.startswith(EXCLUSION_CHAR):
            continue

        try:
            page_title = find_title(url)
        except Exception:
            # Best effort: if the site cannot be accessed, fail silently.
            # Catching Exception (rather than a bare except) still lets
            # KeyboardInterrupt and SystemExit propagate.
            page_title = None

        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url

        display.append([page_title, url, bitly])

    return display
Example #2
0
def get_results(text):
    """Collect title and short-link details for every URL found in ``text``.

    URLs are located with the module-level ``url_finder`` pattern and
    normalised through the ``unicode`` helper module. For each URL the
    host part is resolved with ``socket.getaddrinfo``:

    * if resolution fails, that URL is skipped;
    * if any resolved address is a loopback address (starts with ``127``,
      or equals ``::1`` / ``0:0:0:0:0:0:0:1``), processing stops and the
      entries gathered so far are returned.

    URLs that start with ``EXCLUSION_CHAR`` are skipped. Processing also
    stops when a fetched title contains ``IPv4_HOST`` or ``IPv6_HOST``.

    Returns a list holding one ``[page_title, url, bitly]`` entry per
    accepted URL, in match order. ``page_title`` is ``None`` when
    ``find_title`` raises; ``bitly`` is ``short(url)[0][1]`` when
    ``bitly_loaded`` is truthy, otherwise the URL itself. An empty list is
    returned when ``text`` is empty or ``None``.
    """
    if not text:
        return []

    display = []
    for match in re.findall(url_finder, text):
        # Only the first element of each findall() result is used
        # (presumably the capture group holding the whole URL -- confirm
        # against url_finder).
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)

        domain = getTLD(url).strip()
        if "//" in domain:
            # Drop the scheme so only the host part is resolved.
            domain = domain.split('//')[1]

        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except Exception:
            # Unresolvable host: skip this URL but keep going. Catching
            # Exception (rather than a bare except) still lets
            # KeyboardInterrupt and SystemExit propagate.
            continue

        # Each getaddrinfo() entry is (family, type, proto, canonname,
        # sockaddr); sockaddr[0] is the address string.
        if any(
            info[4][0].startswith('127')
            or info[4][0] in ('::1', '0:0:0:0:0:0:0:1')
            for info in ips
        ):
            # A loopback target aborts the whole batch, not just this URL.
            break

        if url.startswith(EXCLUSION_CHAR):
            continue

        try:
            page_title = find_title(url)
        except Exception:
            # Best effort: if the site cannot be accessed, fail silently.
            page_title = None

        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url

        # NOTE(review): IPv4_HOST / IPv6_HOST are presumably this host's
        # own addresses, so a title echoing them would leak them --
        # confirm. As with the loopback check, this aborts the batch.
        if page_title and (IPv4_HOST in page_title or IPv6_HOST in page_title):
            break

        display.append([page_title, url, bitly])

    return display
Example #3
0
def get_results(text):
    """Collect title and short-link details for every URL found in ``text``.

    URLs are located with the module-level ``url_finder`` pattern and
    normalised through the ``unicode`` helper module and
    ``remove_nonprint``. For each URL the host part is resolved with
    ``socket.getaddrinfo``:

    * if resolution fails, that URL is skipped;
    * if any resolved address is a loopback address (starts with ``127``,
      or equals ``::1`` / ``0:0:0:0:0:0:0:1``), processing stops and the
      entries gathered so far are returned.

    URLs that start with ``EXCLUSION_CHAR`` are skipped. Processing also
    stops when a fetched title contains ``IPv4_HOST`` or ``IPv6_HOST``.

    Returns a list holding one ``[page_title, url, bitly]`` entry per
    accepted URL, in match order. ``page_title`` is ``None`` when
    ``find_title`` raises; ``bitly`` is ``short(url)[0][1]`` when
    ``bitly_loaded`` is truthy, otherwise the URL itself. An empty list is
    returned when ``text`` is empty or ``None``.
    """
    if not text:
        return []

    display = []
    for match in re.findall(url_finder, text):
        # Only the first element of each findall() result is used
        # (presumably the capture group holding the whole URL -- confirm
        # against url_finder).
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)

        domain = getTLD(url)
        if "//" in domain:
            # Drop the scheme so only the host part is resolved.
            domain = domain.split('//')[1]

        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except Exception:
            # Unresolvable host: skip this URL but keep going. Catching
            # Exception (rather than a bare except) still lets
            # KeyboardInterrupt and SystemExit propagate.
            continue

        # Each getaddrinfo() entry is (family, type, proto, canonname,
        # sockaddr); sockaddr[0] is the address string.
        if any(
            info[4][0].startswith('127')
            or info[4][0] in ('::1', '0:0:0:0:0:0:0:1')
            for info in ips
        ):
            # A loopback target aborts the whole batch, not just this URL.
            break

        if url.startswith(EXCLUSION_CHAR):
            continue

        try:
            page_title = find_title(url)
        except Exception:
            # Best effort: if the site cannot be accessed, fail silently.
            page_title = None

        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url

        # NOTE(review): IPv4_HOST / IPv6_HOST are presumably this host's
        # own addresses, so a title echoing them would leak them --
        # confirm. As with the loopback check, this aborts the batch.
        if page_title and (IPv4_HOST in page_title or IPv6_HOST in page_title):
            break

        display.append([page_title, url, bitly])

    return display
Example #4
0
File: url.py Project: hodiapa/jenni
def get_results(text):
    """Collect title and short-link details for every URL found in ``text``.

    URLs are located with the module-level ``url_finder`` pattern and
    normalised through the ``unicode`` helper module. URLs that start with
    ``EXCLUSION_CHAR`` are skipped.

    Returns a list holding one ``[page_title, url, bitly]`` entry per
    remaining URL, in match order:

    * ``page_title`` is the value returned by ``find_title(url)``, or
      ``None`` when that call raises.
    * ``bitly`` is ``short(url)[0][1]`` when ``bitly_loaded`` is truthy,
      otherwise the URL itself.

    An empty list is returned when ``text`` is empty or ``None``.
    """
    # Guard against None/empty input: re.findall() raises TypeError when
    # handed None instead of a string.
    if not text:
        return []

    display = []
    for match in re.findall(url_finder, text):
        # Only the first element of each findall() result is used
        # (presumably the capture group holding the whole URL -- confirm
        # against url_finder).
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)

        if url.startswith(EXCLUSION_CHAR):
            continue

        try:
            page_title = find_title(url)
        except Exception:
            # Best effort: if the site cannot be accessed, fail silently.
            # Catching Exception (rather than a bare except) still lets
            # KeyboardInterrupt and SystemExit propagate.
            page_title = None

        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url

        display.append([page_title, url, bitly])

    return display
Example #5
0
def get_results(text):
    """Look up a title for every URL found in ``text``.

    URLs are located with the module-level ``url_finder`` pattern and
    normalised through the ``uc`` helper module and ``remove_nonprint``.
    URLs that start with ``EXCLUSION_CHAR`` are skipped.

    Returns a ``(status, display)`` tuple:

    * ``display`` is a list of ``[page_title, url]`` entries, one per
      processed URL, in match order.
    * ``status`` is the first value returned by ``find_title_lite`` for
      the last processed URL, or ``False`` when no URL was processed.

    Empty or ``None`` text yields ``(False, [])`` so that callers can
    always unpack two values.
    """
    status = False
    display = []
    if not text:
        # Same two-tuple shape as the normal return path.
        return status, display

    for match in re.findall(url_finder, text):
        # Only the first element of each findall() result is used
        # (presumably the capture group holding the whole URL -- confirm
        # against url_finder).
        url = uc.encode(match[0])
        url = uc.decode(url)
        url = uc.iriToUri(url)
        url = remove_nonprint(url)

        # NOTE(review): ``domain`` is not read again in this function. The
        # computation is kept as-is in case getTLD is relied on to reject
        # malformed URLs -- confirm and remove if it is not.
        domain = getTLD(url)
        if '//' in domain:
            domain = domain.split('//')[1]

        if url.startswith(EXCLUSION_CHAR):
            continue

        # The status of a later URL overwrites that of an earlier one.
        status, page_title = find_title_lite(url)
        display.append([page_title, url])

    return status, display