def e(m):
    entity = m.group()
    if entity.startswith('&#x'):
        cp = int(entity[3:-1], 16)
        meep = unichr(cp)
    elif entity.startswith('&#'):
        cp = int(entity[2:-1])
        meep = unichr(cp)
    else:
        char = name2codepoint[entity[1:-1]]
        meep = unichr(char)
    try:
        return uc.decode(meep)
    except:
        return uc.decode(uc.encode(meep))
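# Hedged usage sketch (not part of the original module): the callback above is
# meant to be fed to re.sub over HTML-entity matches. The module's real
# r_entity pattern is defined elsewhere; the pattern below is only an
# illustrative assumption covering named, decimal and hex entities.
def _demo_entity_decode():
    r_entity_demo = re.compile(r'&(#x?[0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);')
    return r_entity_demo.sub(e, 'Caf&eacute; &#8212; &#x263A;')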
def short(text):
    """
    This function creates a bitly url for each url in the provided "text" string.
    The return type is a list.
    """
    if not bitly_loaded: return list()
    if not text: return list()
    bitlys = list()
    try:
        a = re.findall(url_finder, text)
        k = len(a)
        i = 0
        while i < k:
            b = unicode.decode(a[i][0])
            # skip URLs that are already bit.ly / j.mp short links
            if not b.startswith("http://bit.ly") and not b.startswith("http://j.mp/"):
                # check to see if the url is valid
                try:
                    c = web.head(b)
                except:
                    return [[None, None]]
                url = "http://api.j.mp/v3/shorten?login=%s&apiKey=%s&longUrl=%s&format=txt" % (bitly_user, bitly_api_key, urllib2.quote(b))
                shorter = web.get(url)
                shorter = shorter.strip()
                bitlys.append([b, shorter])
            i += 1
        return bitlys
    except:
        return
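# Hedged usage sketch (assumes the module's bitly_user / bitly_api_key
# configuration is loaded; the example message is a placeholder): short()
# scans a message for URLs and returns a list of [long_url, short_url] pairs.
def _demo_short():
    pairs = short('see http://www.example.com/some/long/path')
    return dict(pairs)  # {long_url: short_url}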
def get_results(text):
    if not text: return list()
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = list()
    while i < k:
        url = unicode.encode(a[i][0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)
        domain = getTLD(url)
        if "//" in domain:
            domain = domain.split('//')[1]
        if not url.startswith(EXCLUSION_CHAR):
            try:
                page_title = find_title(url)
            except:
                page_title = None  # if it can't access the site fail silently
            if bitly_loaded:  # and (page_title is not None or page_title == INVALID_WEBSITE):
                bitly = short(url)
                bitly = bitly[0][1]
            else:
                bitly = url
            display.append([page_title, url, bitly])
        i += 1
    return display
def get_results(text):
    if not text: return list()
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = list()
    while i < k:
        url = unicode.encode(a[i][0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        domain = getTLD(url)
        domain = domain.strip()
        if "//" in domain:
            domain = domain.split('//')[1]
        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except:
            i += 1
            continue
        localhost = False
        for x in ips:
            y = x[4][0]
            if y.startswith('127') or '::1' == y or '0:0:0:0:0:0:0:1' == y:
                localhost = True
                break
        if localhost:
            break
        if not url.startswith(EXCLUSION_CHAR):
            try:
                page_title = find_title(url)
            except:
                page_title = None  # if it can't access the site fail silently
            if bitly_loaded:  # and (page_title is not None or page_title == INVALID_WEBSITE):
                bitly = short(url)
                bitly = bitly[0][1]
            else:
                bitly = url
            if page_title:
                if IPv4_HOST in page_title or IPv6_HOST in page_title:
                    break
            display.append([page_title, url, bitly])
        i += 1
    return display
def get_results(text):
    if not text: return list()
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = list()
    while i < k:
        url = unicode.encode(a[i][0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)
        domain = getTLD(url)
        if "//" in domain:
            domain = domain.split('//')[1]
        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except:
            i += 1
            continue
        localhost = False
        for x in ips:
            y = x[4][0]
            if y.startswith('127') or '::1' == y or '0:0:0:0:0:0:0:1' == y:
                localhost = True
                break
        if localhost:
            break
        if not url.startswith(EXCLUSION_CHAR):
            try:
                page_title = find_title(url)
            except:
                page_title = None  # if it can't access the site fail silently
            if bitly_loaded:  # and (page_title is not None or page_title == INVALID_WEBSITE):
                bitly = short(url)
                bitly = bitly[0][1]
            else:
                bitly = url
            if page_title:
                if IPv4_HOST in page_title or IPv6_HOST in page_title:
                    break
            display.append([page_title, url, bitly])
        i += 1
    return display
def get_results(text):
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = []
    while i < k:
        url = unicode.encode(a[i][0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        if not url.startswith(EXCLUSION_CHAR):
            try:
                page_title = find_title(url)
            except:
                page_title = None  # if it can't access the site fail silently
            if bitly_loaded:  # and (page_title is not None or page_title == INVALID_WEBSITE):
                bitly = short(url)
                bitly = bitly[0][1]
            else:
                bitly = url
            display.append([page_title, url, bitly])
        i += 1
    return display
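# Hedged usage sketch for get_results() above (the message text is a
# placeholder): each entry in the returned list is [page_title, url, bitly],
# where bitly falls back to the original URL when bitly support is not loaded.
def _demo_get_results():
    lines = []
    for page_title, url, bitly in get_results('reading http://www.example.com/post'):
        lines.append('%s [ %s ]' % (page_title, bitly))
    return lines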
def find_title(url):
    """
    This finds the title when provided with a string of a URL.
    """
    uri = url

    for item in IGNORE:
        if item in uri:
            return False, 'ignored'

    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri
    if 'twitter.com' in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    uri = uc.decode(uri)

    ## proxy the lookup of the headers through .py
    pyurl = u'https://tumbolia.appspot.com/py/'
    code = 'import simplejson;'
    code += "req=urllib2.Request(u'%s', headers={'Accept':'text/html'});"
    code += "req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1;"
    code += "rv:17.0) Gecko/20100101 Firefox/17.0'); u=urllib2.urlopen(req);"
    code += "rtn=dict();"
    code += "rtn['headers'] = u.headers.dict;"
    code += "contents = u.read();"
    code += "con = str();"
    code += r'''exec "try: con=(contents).decode('utf-8')\n'''
    code += '''except: con=(contents).decode('iso-8859-1')";'''
    code += "rtn['read'] = con;"
    code += "rtn['url'] = u.url;"
    code += "rtn['geturl'] = u.geturl();"
    code += r"print simplejson.dumps(rtn)"
    query = code % uri
    try:
        temp = web.quote(query)
        u = web.get(pyurl + temp)
    except Exception, e:
        return False, e
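# Hedged sketch (assumption: the appspot service runs the generated snippet
# verbatim and returns its stdout): the proxied lookup above prints a JSON
# object with 'headers', 'read', 'url' and 'geturl' keys, so a caller picking
# up after the web.get() call might unpack the response roughly like this.
def _demo_unpack_proxy_response(u):
    import json
    data = json.loads(u)
    return data['headers'], data['read'], data['geturl']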
def get_results(text):
    if not text: return list()
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = list()
    passs = False
    while i < k:
        url = uc.encode(a[i][0])
        url = uc.decode(url)
        url = uc.iriToUri(url)
        url = remove_nonprint(url)
        domain = getTLD(url)
        if '//' in domain:
            domain = domain.split('//')[1]
        if not url.startswith(EXCLUSION_CHAR):
            #passs, page_title = find_title(url)
            passs, page_title = find_title_lite(url)
            display.append([page_title, url])
        i += 1
    return passs, display
def readcsv_entires(path):
    with open(path, 'r', encoding='utf-8') as csvfile:
        next(csvfile)  # skip the header line
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            e = Entry()
            e.strong_id = row[0]
            e.id = e.strong_id[1:]  # strong_id without its leading letter
            merged_split = row[1].split(';')
            words = merged_split[0].split(' ')
            e.word_uni = decode(words[1].strip())
            e.word_ascii = decode(words[2].strip('[]'))
            e.part_of_speech = merged_split[1].strip()
            e.meaning1 = decode(row[2])
            e.meaning2 = decode(row[3])
            e.meaning3 = decode(row[4])
            e.origin = replace_strongid_by_link(decode(row[5]))
            e.occ_count = row[6]
            e.occ = decode(row[7])
            yield e
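# Hedged usage sketch for the CSV reader above (the path is a placeholder;
# Entry, decode and replace_strongid_by_link are assumed to be defined
# elsewhere in this module):
def _demo_readcsv(path='strongs.csv'):
    return [(entry.strong_id, entry.word_uni, entry.part_of_speech)
            for entry in readcsv_entires(path)]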
def find_title(url):
    """
    This finds the title when provided with a string of a URL.
    """
    uri = url
    if not uri and hasattr(self, 'last_seen_uri'):
        uri = self.last_seen_uri.get(origin.sender)

    for item in IGNORE:
        if item in uri:
            return

    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri
    if "twitter.com" in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    redirects = 0
    ## follow re-directs, if someone pastes a bitly of a tinyurl, etc..
    page = str()
    while True:
        req = urllib2.Request(uri, headers={'Accept': 'text/html'})
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0')
        u = urllib2.urlopen(req)
        info = u.info()
        page = u.read()
        u.close()

        if not isinstance(info, list):
            status = '200'
        else:
            status = unicode.encode(info[1])
            info = info[0]
        if status.startswith('3'):
            uri = urlparse.urljoin(uri, info['Location'])
        else:
            break

        redirects += 1
        if redirects >= 50:
            return "Too many re-directs."

    try:
        mtype = info['content-type']
    except:
        return
    if not (('/html' in mtype) or ('/xhtml' in mtype)):
        return

    if not page:
        u = urllib2.urlopen(req)
        page = u.read(262144)
        u.close()
    content = page

    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>', content)
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('', content)

    start = content.find('<title>')
    if start == -1:
        return
    end = content.find('</title>', start)
    if end == -1:
        return
    content = content[start+7:end]
    content = content.strip('\n').rstrip().lstrip()
    title = content

    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        entity = m.group()
        if entity.startswith('&#x'):
            cp = int(entity[3:-1], 16)
            return unichr(cp).encode('utf-8')
        elif entity.startswith('&#'):
            cp = int(entity[2:-1])
            return unichr(cp).encode('utf-8')
        else:
            char = name2codepoint[entity[1:-1]]
            return unichr(char).encode('utf-8')

    title = r_entity.sub(e, title)

    if title:
        title = unicode.decode(title)
    else:
        title = 'None'

    title = title.replace('\n', '')
    title = title.replace('\r', '')

    def remove_spaces(x):
        if "  " in x:
            x = x.replace("  ", " ")
            return remove_spaces(x)
        else:
            return x

    title = remove_spaces(title)

    re_dcc = re.compile(r'(?i)dcc\ssend')
    title = re.sub(re_dcc, '', title)

    if title:
        return title
    else:
        return 'No title'
def find_title(url):
    """
    This finds the title when provided with a string of a URL.
    """
    uri = url
    if not uri and hasattr(self, 'last_seen_uri'):
        uri = self.last_seen_uri.get(origin.sender)

    for item in IGNORE:
        if item in uri:
            return

    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri
    if "twitter.com" in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    content = web.get(uri)

    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>', content)
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('', content)

    start = content.find('<title>')
    if start == -1:
        return
    end = content.find('</title>', start)
    if end == -1:
        return
    content = content[start+7:end]
    content = content.strip('\n').rstrip().lstrip()
    title = content

    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        entity = m.group()
        if entity.startswith('&#x'):
            cp = int(entity[3:-1], 16)
            return unichr(cp).encode('utf-8')
        elif entity.startswith('&#'):
            cp = int(entity[2:-1])
            return unichr(cp).encode('utf-8')
        else:
            char = name2codepoint[entity[1:-1]]
            return unichr(char).encode('utf-8')

    title = r_entity.sub(e, title)

    if title:
        title = unicode.decode(title)
    else:
        title = 'None'

    title = title.replace('\n', '')
    title = title.replace('\r', '')

    def remove_spaces(x):
        if "  " in x:
            x = x.replace("  ", " ")
            return remove_spaces(x)
        else:
            return x

    title = remove_spaces(title)

    re_dcc = re.compile(r'(?i)dcc\ssend')
    title = re.sub(re_dcc, '', title)

    if title:
        return title
def find_title(url):
    """
    This finds the title when provided with a string of a URL.
    """
    uri = url
    if not uri and hasattr(self, 'last_seen_uri'):
        uri = self.last_seen_uri.get(origin.sender)

    for item in IGNORE:
        if item in uri:
            return

    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri
    if "twitter.com" in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    redirects = 0
    ## follow re-directs, if someone pastes a bitly of a tinyurl, etc..
    while True:
        req = urllib2.Request(uri, headers={'Accept': 'text/html'})
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0')
        u = urllib2.urlopen(req)
        info = u.info()
        u.close()

        if not isinstance(info, list):
            status = '200'
        else:
            status = unicode.encode(info[1])
            info = info[0]
        if status.startswith('3'):
            uri = urlparse.urljoin(uri, info['Location'])
        else:
            break

        redirects += 1
        if redirects >= 50:
            return "Too many re-directs."

    try:
        mtype = info['content-type']
    except:
        return
    if not (('/html' in mtype) or ('/xhtml' in mtype)):
        return

    u = urllib2.urlopen(req)
    bytes = u.read(262144)
    u.close()
    content = bytes

    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>', content)
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('', content)

    start = content.find('<title>')
    if start == -1:
        return
    end = content.find('</title>', start)
    if end == -1:
        return
    content = content[start+7:end]
    content = content.strip('\n').rstrip().lstrip()
    title = content

    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        entity = m.group()
        if entity.startswith('&#x'):
            cp = int(entity[3:-1], 16)
            return unichr(cp).encode('utf-8')
        elif entity.startswith('&#'):
            cp = int(entity[2:-1])
            return unichr(cp).encode('utf-8')
        else:
            char = name2codepoint[entity[1:-1]]
            return unichr(char).encode('utf-8')

    title = r_entity.sub(e, title)

    if title:
        title = unicode.decode(title)
    else:
        title = 'None'

    title = title.replace('\n', '')
    title = title.replace('\r', '')

    def remove_spaces(x):
        if "  " in x:
            x = x.replace("  ", " ")
            return remove_spaces(x)
        else:
            return x

    title = remove_spaces(title)

    re_dcc = re.compile(r'(?i)dcc\ssend')
    title = re.sub(re_dcc, '', title)

    if title:
        return title
    else:
        return 'No title'
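# Hedged usage sketch for find_title() above (the URL is a placeholder): the
# helper returns the cleaned <title> text, 'No title' when the tag is empty,
# and None for ignored, unreachable or non-HTML pages.
def _demo_find_title():
    return find_title('www.example.com/article')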