def get_results(text):
    """Collect title and short-link information for each URL found in text.

    Every match of ``url_finder`` in ``text`` is normalised (encode/decode
    round trip, IRI-to-URI conversion, ``remove_nonprint``).  URLs that
    start with ``EXCLUSION_CHAR`` are skipped.

    Returns a list with one ``[page_title, url, bitly]`` entry per reported
    URL, in order of appearance.  ``page_title`` is ``None`` when
    ``find_title`` raised; ``bitly`` is taken from ``short(url)`` when
    ``bitly_loaded`` is true and is the URL itself otherwise.  Empty or
    missing ``text`` yields an empty list.
    """
    if not text:
        return []

    display = []
    for match in re.findall(url_finder, text):
        # Each match is indexed with [0] -- presumably the first capture
        # group of url_finder holds the full URL (confirm against the regex).
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)

        if url.startswith(EXCLUSION_CHAR):
            continue

        try:
            page_title = find_title(url)
        except Exception:
            # If the site can't be accessed, fail silently.  Exception (not
            # a bare except) so KeyboardInterrupt/SystemExit still propagate.
            page_title = None

        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url

        display.append([page_title, url, bitly])
    return display
def get_results(text):
    """Collect title and short-link information for each URL found in text.

    Every match of ``url_finder`` in ``text`` is normalised (encode/decode
    round trip, IRI-to-URI conversion).  The host part returned by
    ``getTLD`` is resolved first:

    * if the lookup fails, that URL is skipped;
    * if any resolved address is a loopback address, scanning stops and
      whatever was collected so far is returned.

    URLs that start with ``EXCLUSION_CHAR`` are skipped.  If a fetched page
    title contains ``IPv4_HOST`` or ``IPv6_HOST``, scanning also stops
    without reporting that URL.

    Returns a list with one ``[page_title, url, bitly]`` entry per reported
    URL, in order of appearance.  ``page_title`` is ``None`` when
    ``find_title`` raised; ``bitly`` is taken from ``short(url)`` when
    ``bitly_loaded`` is true and is the URL itself otherwise.  Empty or
    missing ``text`` yields an empty list.
    """
    if not text:
        return []

    display = []
    for match in re.findall(url_finder, text):
        # Each match is indexed with [0] -- presumably the first capture
        # group of url_finder holds the full URL (confirm against the regex).
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)

        domain = getTLD(url).strip()
        if "//" in domain:
            # Drop the scheme so only the host is handed to the resolver.
            domain = domain.split('//')[1]

        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except Exception:
            # Unresolvable host: skip this URL.  Exception (not a bare
            # except) so KeyboardInterrupt/SystemExit still propagate.
            continue

        # Element [4] of each getaddrinfo entry is the sockaddr tuple; its
        # first item is the address string.
        resolves_to_localhost = any(
            host.startswith('127') or host in ('::1', '0:0:0:0:0:0:0:1')
            for host in (info[4][0] for info in ips)
        )
        if resolves_to_localhost:
            # Deliberately abandons ALL remaining URLs, not just this one.
            break

        if url.startswith(EXCLUSION_CHAR):
            continue

        try:
            page_title = find_title(url)
        except Exception:
            # If the site can't be accessed, fail silently.
            page_title = None

        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url

        # NOTE(review): IPv4_HOST / IPv6_HOST presumably identify the bot's
        # own host, so such titles are never reported -- confirm.
        if page_title and (IPv4_HOST in page_title or IPv6_HOST in page_title):
            break

        display.append([page_title, url, bitly])
    return display
def get_results(text):
    """Collect title and short-link information for each URL found in text.

    Every match of ``url_finder`` in ``text`` is normalised (encode/decode
    round trip, IRI-to-URI conversion, ``remove_nonprint``).  The host part
    returned by ``getTLD`` is resolved first:

    * if the lookup fails, that URL is skipped;
    * if any resolved address is a loopback address, scanning stops and
      whatever was collected so far is returned.

    URLs that start with ``EXCLUSION_CHAR`` are skipped.  If a fetched page
    title contains ``IPv4_HOST`` or ``IPv6_HOST``, scanning also stops
    without reporting that URL.

    Returns a list with one ``[page_title, url, bitly]`` entry per reported
    URL, in order of appearance.  ``page_title`` is ``None`` when
    ``find_title`` raised; ``bitly`` is taken from ``short(url)`` when
    ``bitly_loaded`` is true and is the URL itself otherwise.  Empty or
    missing ``text`` yields an empty list.
    """
    if not text:
        return []

    display = []
    for match in re.findall(url_finder, text):
        # Each match is indexed with [0] -- presumably the first capture
        # group of url_finder holds the full URL (confirm against the regex).
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)

        domain = getTLD(url)
        if "//" in domain:
            # Drop the scheme so only the host is handed to the resolver.
            domain = domain.split('//')[1]

        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except Exception:
            # Unresolvable host: skip this URL.  Exception (not a bare
            # except) so KeyboardInterrupt/SystemExit still propagate.
            continue

        # Element [4] of each getaddrinfo entry is the sockaddr tuple; its
        # first item is the address string.
        resolves_to_localhost = any(
            host.startswith('127') or host in ('::1', '0:0:0:0:0:0:0:1')
            for host in (info[4][0] for info in ips)
        )
        if resolves_to_localhost:
            # Deliberately abandons ALL remaining URLs, not just this one.
            break

        if url.startswith(EXCLUSION_CHAR):
            continue

        try:
            page_title = find_title(url)
        except Exception:
            # If the site can't be accessed, fail silently.
            page_title = None

        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url

        # NOTE(review): IPv4_HOST / IPv6_HOST presumably identify the bot's
        # own host, so such titles are never reported -- confirm.
        if page_title and (IPv4_HOST in page_title or IPv6_HOST in page_title):
            break

        display.append([page_title, url, bitly])
    return display
def get_results(text):
    """Collect title and short-link information for each URL found in text.

    Every match of ``url_finder`` in ``text`` is normalised (encode/decode
    round trip, IRI-to-URI conversion).  URLs that start with
    ``EXCLUSION_CHAR`` are skipped.

    Returns a list with one ``[page_title, url, bitly]`` entry per reported
    URL, in order of appearance.  ``page_title`` is ``None`` when
    ``find_title`` raised; ``bitly`` is taken from ``short(url)`` when
    ``bitly_loaded`` is true and is the URL itself otherwise.  Empty or
    missing ``text`` yields an empty list.
    """
    if not text:
        # Guard against None/empty input, which re.findall cannot search.
        return []

    display = []
    for match in re.findall(url_finder, text):
        # Each match is indexed with [0] -- presumably the first capture
        # group of url_finder holds the full URL (confirm against the regex).
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)

        if url.startswith(EXCLUSION_CHAR):
            continue

        try:
            page_title = find_title(url)
        except Exception:
            # If the site can't be accessed, fail silently.  Exception (not
            # a bare except) so KeyboardInterrupt/SystemExit still propagate.
            page_title = None

        if bitly_loaded:
            bitly = short(url)[0][1]
        else:
            bitly = url

        display.append([page_title, url, bitly])
    return display
def get_results(text):
    """Collect page titles for each URL found in text.

    Every match of ``url_finder`` in ``text`` is normalised (encode/decode
    round trip, IRI-to-URI conversion, ``remove_nonprint``).  URLs that
    start with ``EXCLUSION_CHAR`` are skipped; for the rest,
    ``find_title_lite`` supplies a status flag and a title.

    Returns a ``(passs, display)`` tuple.  ``display`` is a list with one
    ``[page_title, url]`` entry per reported URL, in order of appearance.
    ``passs`` is the status returned by ``find_title_lite`` for the last
    reported URL, or ``False`` if none was reported.  Empty or missing
    ``text`` yields ``(False, [])`` so callers can always unpack two values.
    """
    if not text:
        return False, []

    passs = False
    display = []
    for match in re.findall(url_finder, text):
        # Each match is indexed with [0] -- presumably the first capture
        # group of url_finder holds the full URL (confirm against the regex).
        url = uc.encode(match[0])
        url = uc.decode(url)
        url = uc.iriToUri(url)
        url = remove_nonprint(url)

        if url.startswith(EXCLUSION_CHAR):
            continue

        # passs is overwritten on every iteration; only the last status
        # survives to the return value.
        passs, page_title = find_title_lite(url)
        display.append([page_title, url])
    return passs, display