Exemple #1
0
 def e(m):
     """Convert one matched HTML entity into its decoded character.

     ``m`` is a regex match whose full text is expected to be an entity such
     as ``&#x41;`` (hex), ``&#65;`` (decimal) or ``&amp;`` (named).  The
     resulting character is passed through ``uc.decode``; if that raises,
     the character is run through ``uc.encode`` first and decoded again.

     An entity that cannot be resolved (unknown name, malformed number or
     a code point ``unichr`` rejects) is returned unchanged, so one bad
     entity does not abort the whole substitution.
     """
     entity = m.group()
     try:
         if entity.startswith('&#x'):
             # hexadecimal reference: drop the leading '&#x' and trailing ';'
             cp = int(entity[3:-1], 16)
         elif entity.startswith('&#'):
             # decimal reference: drop the leading '&#' and trailing ';'
             cp = int(entity[2:-1])
         else:
             # named reference: drop the leading '&' and trailing ';'
             cp = name2codepoint[entity[1:-1]]
         meep = unichr(cp)
     except (KeyError, ValueError, OverflowError):
         return entity
     try:
         return uc.decode(meep)
     except Exception:
         return uc.decode(uc.encode(meep))
Exemple #2
0
def short(text):
    """
    This function creates a bitly url for each url in the provided "text" string.
    The return type is a list.

    Each element is an ``[original_url, short_url]`` pair, one per url
    matched by ``url_finder``.  Urls that already start with
    ``http://bit.ly`` or ``http://j.mp/`` are paired with themselves instead
    of being shortened again, so callers still get one entry per url.

    Special results:
      * an empty list when bitly support is not loaded or "text" is empty;
      * ``[[None, None]]`` as soon as ``web.head`` fails for a url;
      * ``None`` when any other error occurs while shortening.
    """

    if not bitly_loaded: return list()
    if not text: return list()
    bitlys = list()
    try:
        for match in re.findall(url_finder, text):
            b = unicode.decode(match[0])
            # startswith() accepts a tuple of prefixes; the old
            # "not A or not B" test was true for every url.
            if b.startswith(("http://bit.ly", "http://j.mp/")):
                bitlys.append([b, b])
                continue

            # check to see if the url is valid
            try: web.head(b)
            except Exception: return [[None, None]]

            url = "http://api.j.mp/v3/shorten?login=%s&apiKey=%s&longUrl=%s&format=txt" % (bitly_user, bitly_api_key, urllib2.quote(b))
            shorter = web.get(url)
            # strip() returns a new string, so its result has to be kept
            bitlys.append([b, shorter.strip()])
        return bitlys
    except Exception:
        return
Exemple #3
0
def get_results(text):
    """Collect title and short-link information for each url in ``text``.

    Every match of ``url_finder`` is normalised with the ``unicode`` helper
    in scope (encode, decode, ``iriToUri``) and ``remove_nonprint``.  Urls
    starting with ``EXCLUSION_CHAR`` are skipped.

    Returns a list of ``[page_title, url, bitly]`` entries; the list is
    empty when ``text`` is empty.  ``page_title`` is ``None`` when
    ``find_title`` raises.  ``bitly`` is the short link from the first pair
    returned by ``short(url)`` when bitly is loaded, otherwise the url
    itself.
    """
    if not text: return list()
    display = list()
    for match in re.findall(url_finder, text):
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)
        # NOTE(review): domain is computed here but not used further below.
        domain = getTLD(url)
        if "//" in domain:
            domain = domain.split('//')[1]
        if url.startswith(EXCLUSION_CHAR):
            continue
        try:
            page_title = find_title(url)
        except Exception:
            page_title = None # if it can't access the site fail silently
        bitly = url
        if bitly_loaded:
            shortened = short(url)
            # short() may give back None or an empty list; indexing that
            # would raise, so keep the plain url in that case.
            if shortened:
                bitly = shortened[0][1]
        display.append([page_title, url, bitly])
    return display
Exemple #4
0
def short(text):
    """
    This function creates a bitly url for each url in the provided "text" string.
    The return type is a list.

    Each element is an ``[original_url, short_url]`` pair, one per url
    matched by ``url_finder``.  Urls that already start with
    ``http://bit.ly`` or ``http://j.mp/`` are paired with themselves instead
    of being shortened again, so callers still get one entry per url.

    Special results:
      * an empty list when bitly support is not loaded or "text" is empty;
      * ``[[None, None]]`` as soon as ``web.head`` fails for a url;
      * ``None`` when any other error occurs while shortening.
    """

    if not bitly_loaded: return list()
    if not text: return list()
    bitlys = list()
    try:
        for found in re.findall(url_finder, text):
            b = unicode.decode(found[0])
            already_short = b.startswith(("http://bit.ly", "http://j.mp/"))
            if already_short:
                # the old "not A or not B" test could never be false, so
                # short links were sent to the shortener again
                bitlys.append([b, b])
                continue

            # check to see if the url is valid
            try: web.head(b)
            except Exception: return [[None, None]]

            url = "http://api.j.mp/v3/shorten?login=%s&apiKey=%s&longUrl=%s&format=txt" % (bitly_user, bitly_api_key, urllib2.quote(b))
            # keep the stripped text; a bare shorter.strip() discards it
            shorter = web.get(url).strip()
            bitlys.append([b, shorter])
        return bitlys
    except Exception:
        return
Exemple #5
0
def get_results(text):
    """Collect title and short-link information for each url in ``text``.

    Every match of ``url_finder`` is normalised with the ``unicode`` helper
    in scope (encode, decode, ``iriToUri``).  The host derived from
    ``getTLD`` is resolved with ``socket.getaddrinfo``:

      * urls whose host does not resolve are skipped;
      * if any resolved address is a loopback address, scanning stops and
        whatever was collected so far is returned;
      * scanning also stops when a page title contains ``IPv4_HOST`` or
        ``IPv6_HOST``.

    Urls starting with ``EXCLUSION_CHAR`` are skipped.

    Returns a list of ``[page_title, url, bitly]`` entries (empty when
    ``text`` is empty).  ``page_title`` is ``None`` when ``find_title``
    raises; ``bitly`` is the short link from the first pair returned by
    ``short(url)`` when bitly is loaded, otherwise the url itself.
    """
    if not text: return list()
    display = list()
    for match in re.findall(url_finder, text):
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        domain = getTLD(url)
        domain = domain.strip()
        if "//" in domain:
            domain = domain.split('//')[1]
        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except Exception:
            continue
        # each getaddrinfo entry keeps the address string at [4][0]
        localhost = False
        for x in ips:
            y = x[4][0]
            if y.startswith('127') or y in ('::1', '0:0:0:0:0:0:0:1'):
                localhost = True
                break
        if localhost: break
        if url.startswith(EXCLUSION_CHAR):
            continue
        try:
            page_title = find_title(url)
        except Exception:
            page_title = None # if it can't access the site fail silently
        bitly = url
        if bitly_loaded:
            shortened = short(url)
            # short() may give back None or an empty list; indexing that
            # would raise, so keep the plain url in that case.
            if shortened:
                bitly = shortened[0][1]
        if page_title:
            if IPv4_HOST in page_title or IPv6_HOST in page_title:
                break
        display.append([page_title, url, bitly])
    return display
Exemple #6
0
def get_results(text):
    """Collect title and short-link information for each url in ``text``.

    Every match of ``url_finder`` is normalised with the ``unicode`` helper
    in scope (encode, decode, ``iriToUri``) and ``remove_nonprint``.  The
    host derived from ``getTLD`` is resolved with ``socket.getaddrinfo``:

      * urls whose host does not resolve are skipped;
      * if any resolved address is a loopback address, scanning stops and
        whatever was collected so far is returned;
      * scanning also stops when a page title contains ``IPv4_HOST`` or
        ``IPv6_HOST``.

    Urls starting with ``EXCLUSION_CHAR`` are skipped.

    Returns a list of ``[page_title, url, bitly]`` entries (empty when
    ``text`` is empty).  ``page_title`` is ``None`` when ``find_title``
    raises; ``bitly`` is the short link from the first pair returned by
    ``short(url)`` when bitly is loaded, otherwise the url itself.
    """
    if not text: return list()
    display = list()
    for found in re.findall(url_finder, text):
        url = unicode.encode(found[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        url = remove_nonprint(url)
        domain = getTLD(url)
        if "//" in domain:
            domain = domain.split('//')[1]
        try:
            ips = socket.getaddrinfo(domain, 80, 0, 0, socket.SOL_TCP)
        except Exception:
            continue
        # each getaddrinfo entry keeps the address string at [4][0]
        localhost = False
        for x in ips:
            y = x[4][0]
            if y.startswith('127') or y in ('::1', '0:0:0:0:0:0:0:1'):
                localhost = True
                break
        if localhost: break
        if url.startswith(EXCLUSION_CHAR):
            continue
        try:
            page_title = find_title(url)
        except Exception:
            page_title = None # if it can't access the site fail silently
        if bitly_loaded:
            shortened = short(url)
            # short() may give back None or an empty list; fall back to
            # the plain url instead of raising on the index lookup.
            bitly = shortened[0][1] if shortened else url
        else:
            bitly = url
        if page_title:
            if IPv4_HOST in page_title or IPv6_HOST in page_title:
                break
        display.append([page_title, url, bitly])
    return display
Exemple #7
0
def get_results(text):
    """Collect title and short-link information for each url in ``text``.

    Every match of ``url_finder`` is normalised with the ``unicode`` helper
    in scope (encode, decode, ``iriToUri``).  Urls starting with
    ``EXCLUSION_CHAR`` are skipped.

    Returns a list of ``[page_title, url, bitly]`` entries; the list is
    empty when ``text`` is empty or ``None``.  ``page_title`` is ``None``
    when ``find_title`` raises.  ``bitly`` is the short link from the first
    pair returned by ``short(url)`` when bitly is loaded, otherwise the url
    itself.
    """
    # re.findall cannot scan None, so bail out early on empty input
    if not text: return [ ]
    display = [ ]
    for match in re.findall(url_finder, text):
        url = unicode.encode(match[0])
        url = unicode.decode(url)
        url = unicode.iriToUri(url)
        if url.startswith(EXCLUSION_CHAR):
            continue
        try:
            page_title = find_title(url)
        except Exception:
            page_title = None # if it can't access the site fail silently
        bitly = url
        if bitly_loaded:
            shortened = short(url)
            # short() may give back None or an empty list; indexing that
            # would raise, so keep the plain url in that case.
            if shortened:
                bitly = shortened[0][1]
        display.append([page_title, url, bitly])
    return display
Exemple #8
0
def find_title(url):
    """
    This finds the title when provided with a string of a URL.

    In the lines shown here the function only builds and sends the lookup
    request:
      * returns ``(False, 'ignored')`` when any ``IGNORE`` item occurs in
        the url;
      * returns ``(False, exception)`` when quoting the query or fetching
        it through the proxy raises.
    NOTE(review): nothing is returned after a successful fetch in this
    excerpt -- the rest of the definition may lie outside it; confirm.
    """
    uri = url

    for item in IGNORE:
        if item in uri:
            return False, 'ignored'

    # default to http when no http/https/ftp scheme is given
    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri

    # rewrite '#!' fragments so the lookup uses the
    # '?_escaped_fragment_=' query form instead
    if 'twitter.com' in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    uri = uc.decode(uri)

    ## proxy the lookup of the headers through .py
    # The snippet below is Python source assembled as text; it is sent to
    # pyurl, which presumably executes it and returns what it prints.
    # NOTE(review): uri is interpolated unescaped into the u'%s' literal
    # below, so a quote character in the url would terminate that literal
    # -- confirm the input is sanitised or escape it.
    pyurl = u'https://tumbolia.appspot.com/py/'
    code = 'import simplejson;'
    code += "req=urllib2.Request(u'%s', headers={'Accept':'text/html'});"
    code += "req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1;"
    code += "rv:17.0) Gecko/20100101 Firefox/17.0'); u=urllib2.urlopen(req);"
    code += "rtn=dict();"
    code += "rtn['headers'] = u.headers.dict;"
    code += "contents = u.read();"
    code += "con = str();"
    # the snippet tries utf-8 first and falls back to iso-8859-1
    code += r'''exec "try: con=(contents).decode('utf-8')\n'''
    code += '''except: con=(contents).decode('iso-8859-1')";'''
    code += "rtn['read'] = con;"
    code += "rtn['url'] = u.url;"
    code += "rtn['geturl'] = u.geturl();"
    code += r"print simplejson.dumps(rtn)"
    # fill the single %s placeholder with the target url
    query = code % uri
    try:
        temp = web.quote(query)
        u = web.get(pyurl + temp)
    except Exception, e:
        return False, e
Exemple #9
0
def get_results(text):
    """Look up a title for each url found in ``text``.

    Every match of ``url_finder`` is normalised with ``uc`` (encode,
    decode, ``iriToUri``) and ``remove_nonprint``; urls starting with
    ``EXCLUSION_CHAR`` are skipped.

    Returns a ``(passs, display)`` tuple.  ``display`` is a list of
    ``[page_title, url]`` entries and ``passs`` is the flag returned by
    ``find_title_lite`` for the last url that was looked up (``False`` if
    none was).  Empty ``text`` yields ``(False, [])`` so callers can always
    unpack two values.
    """
    if not text:
        # same shape as the normal return; a bare list cannot be unpacked
        return False, list()
    display = list()
    passs = False
    for match in re.findall(url_finder, text):
        url = uc.encode(match[0])
        url = uc.decode(url)
        url = uc.iriToUri(url)
        url = remove_nonprint(url)
        # NOTE(review): domain is computed here but not used further below.
        domain = getTLD(url)
        if '//' in domain:
            domain = domain.split('//')[1]
        if not url.startswith(EXCLUSION_CHAR):
            passs, page_title = find_title_lite(url)
            display.append([page_title, url])
    return passs, display
Exemple #10
0
def readcsv_entires(path):
    """Yield one ``Entry`` per data row of the UTF-8 CSV file at ``path``.

    The first line of the file is skipped as a header.  Blank rows are
    ignored.  An empty or header-only file yields nothing.

    Fields are read by column index: 0 is the strong id, 1 holds the word
    forms and the part of speech separated by ';', 2-4 are the meanings,
    5 is the origin (passed through ``replace_strongid_by_link``), 6 is the
    occurrence count and 7 the occurrences.
    """
    # newline='' is what the csv module asks for, so that it can handle
    # line endings inside quoted fields itself
    with open(path, 'r', encoding='utf-8', newline='') as csvfile:
        # The default keeps an empty file from raising StopIteration,
        # which would surface as RuntimeError inside a generator.
        next(csvfile, None)
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # csv.reader returns [] for a blank line
                continue
            entry = Entry()
            entry.strong_id = row[0]
            # the id is the strong id without its first character
            entry.id = entry.strong_id[1:]
            merged_split = row[1].split(';')
            words = merged_split[0].split(' ')
            entry.word_uni = decode(words[1].strip())
            entry.word_ascii = decode(words[2].strip('[]'))
            entry.part_of_speech = merged_split[1].strip()
            entry.meaning1 = decode(row[2])
            entry.meaning2 = decode(row[3])
            entry.meaning3 = decode(row[4])
            entry.origin = replace_strongid_by_link(decode(row[5]))
            entry.occ_count = row[6]
            entry.occ = decode(row[7])
            yield entry
Exemple #11
0
def find_title(url):
    """
    This finds the title when provided with a string of a URL.

    Returns:
      * the cleaned-up title text;
      * 'No title' when nothing is left after clean-up;
      * "Too many re-directs." after 50 redirects;
      * None when the url contains an IGNORE item, the response has no
        content-type or is not (x)html, or no <title>...</title> pair is
        found.

    Clean-up limits the title to 200 characters (plus '[...]'), decodes
    HTML entities, removes line breaks, collapses repeated spaces and
    removes "dcc send" (any case).
    """
    uri = url

    # NOTE(review): self and origin are not parameters of this function;
    # this fallback only works if they exist as globals -- confirm.
    if not uri and hasattr(self, 'last_seen_uri'):
        uri = self.last_seen_uri.get(origin.sender)

    for item in IGNORE:
        if item in uri:
            return

    # default to http when no http/https/ftp scheme is given
    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri

    if "twitter.com" in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    redirects = 0
    ## follow re-directs, if someone pastes a bitly of a tinyurl, etc..
    page = str()
    while True:
        req = urllib2.Request(uri, headers={'Accept':'text/html'})
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:10.0) Gecko/20100101 Firefox/10.0')
        u = urllib2.urlopen(req)
        try:
            info = u.info()
            page = u.read()
        finally:
            # release the connection even when reading fails
            u.close()

        # a status is only taken from info when it is a list; otherwise
        # the response counts as '200' and the loop ends
        if not isinstance(info, list):
            status = '200'
        else:
            status = unicode.encode(info[1])
            info = info[0]
        if status.startswith('3'):
            uri = urlparse.urljoin(uri, info['Location'])
        else: break

        redirects += 1
        if redirects >= 50:
            return "Too many re-directs."

    try: mtype = info['content-type']
    except Exception: return
    if not (('/html' in mtype) or ('/xhtml' in mtype)):
        return

    if not page:
        # nothing was read above: fetch again, capped at 256 KiB
        u = urllib2.urlopen(req)
        try:
            page = u.read(262144)
        finally:
            u.close()
    content = page
    # normalise <title ...> / </TITLE> variants to plain <title> tags
    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>',content)
    # drop quoted '<title>' strings so they are not taken for the tag
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('',content)
    start = content.find('<title>')
    if start == -1: return
    end = content.find('</title>', start)
    if end == -1: return
    # 7 == len('<title>')
    title = content[start+7:end].strip()

    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        """Return the utf-8 bytes for one matched HTML entity."""
        entity = m.group()
        try:
            if entity.startswith('&#x'):
                cp = int(entity[3:-1],16)
            elif entity.startswith('&#'):
                cp = int(entity[2:-1])
            else:
                cp = name2codepoint[entity[1:-1]]
            return unichr(cp).encode('utf-8')
        except (KeyError, ValueError, OverflowError):
            # unknown or malformed entity: leave the text as it was
            return entity

    title = r_entity.sub(e, title)

    if title:
        title = unicode.decode(title)
    else: title = 'None'

    title = title.replace('\n', '')
    title = title.replace('\r', '')

    # collapse runs of spaces into a single space
    while "  " in title:
        title = title.replace("  ", " ")

    re_dcc = re.compile(r'(?i)dcc\ssend')
    title = re_dcc.sub('', title)

    if title:
        return title
    else:
        return 'No title'
Exemple #12
0
def find_title(url):
    """
    This finds the title when provided with a string of a URL.

    The page is fetched with ``web.get``.  Returns the cleaned-up title
    text, or None when the url contains an IGNORE item, no
    <title>...</title> pair is found, or nothing is left after clean-up.

    Clean-up limits the title to 200 characters (plus '[...]'), decodes
    HTML entities, removes line breaks, collapses repeated spaces and
    removes "dcc send" (any case).
    """
    uri = url

    # NOTE(review): self and origin are not parameters of this function;
    # this fallback only works if they exist as globals -- confirm.
    if not uri and hasattr(self, 'last_seen_uri'):
        uri = self.last_seen_uri.get(origin.sender)

    for item in IGNORE:
        if item in uri:
            return

    # default to http when no http/https/ftp scheme is given
    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri

    if "twitter.com" in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    content = web.get(uri)
    # normalise <title ...> / </TITLE> variants to plain <title> tags
    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>',content)
    # drop quoted '<title>' strings so they are not taken for the tag
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('',content)
    start = content.find('<title>')
    if start == -1: return
    end = content.find('</title>', start)
    if end == -1: return
    # 7 == len('<title>')
    title = content[start+7:end].strip()

    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        """Return the utf-8 bytes for one matched HTML entity."""
        entity = m.group()
        try:
            if entity.startswith('&#x'):
                cp = int(entity[3:-1],16)
            elif entity.startswith('&#'):
                cp = int(entity[2:-1])
            else:
                cp = name2codepoint[entity[1:-1]]
            return unichr(cp).encode('utf-8')
        except (KeyError, ValueError, OverflowError):
            # unknown or malformed entity: leave the text as it was
            return entity

    title = r_entity.sub(e, title)

    if title:
        title = unicode.decode(title)
    else: title = 'None'

    title = title.replace('\n', '')
    title = title.replace('\r', '')

    # collapse runs of spaces into a single space
    while "  " in title:
        title = title.replace("  ", " ")

    re_dcc = re.compile(r'(?i)dcc\ssend')
    title = re_dcc.sub('', title)

    if title:
        return title
Exemple #13
0
def find_title(url):
    """
    This finds the title when provided with a string of a URL.

    Returns:
      * the cleaned-up title text;
      * 'No title' when nothing is left after clean-up;
      * "Too many re-directs." after 50 redirects;
      * None when the url contains an IGNORE item, the response has no
        content-type or is not (x)html, or no <title>...</title> pair is
        found in the first 262144 bytes.

    Clean-up limits the title to 200 characters (plus '[...]'), decodes
    HTML entities, removes line breaks, collapses repeated spaces and
    removes "dcc send" (any case).
    """
    uri = url

    # NOTE(review): self and origin are not parameters of this function;
    # this fallback only works if they exist as globals -- confirm.
    if not uri and hasattr(self, 'last_seen_uri'):
        uri = self.last_seen_uri.get(origin.sender)

    for item in IGNORE:
        if item in uri:
            return

    # default to http when no http/https/ftp scheme is given
    if not re.search('^((https?)|(ftp))://', uri):
        uri = 'http://' + uri

    if "twitter.com" in uri:
        uri = uri.replace('#!', '?_escaped_fragment_=')

    redirects = 0
    ## follow re-directs, if someone pastes a bitly of a tinyurl, etc..
    while True:
        req = urllib2.Request(uri, headers={'Accept':'text/html'})
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0')
        u = urllib2.urlopen(req)
        try:
            info = u.info()
        finally:
            # release the connection even when reading the headers fails
            u.close()

        # a status is only taken from info when it is a list; otherwise
        # the response counts as '200' and the loop ends
        if not isinstance(info, list):
            status = '200'
        else:
            status = unicode.encode(info[1])
            info = info[0]
        if status.startswith('3'):
            uri = urlparse.urljoin(uri, info['Location'])
        else: break

        redirects += 1
        if redirects >= 50:
            return "Too many re-directs."

    try: mtype = info['content-type']
    except Exception: return
    if not (('/html' in mtype) or ('/xhtml' in mtype)):
        return

    # second request: read at most 256 KiB of the body
    u = urllib2.urlopen(req)
    try:
        content = u.read(262144)
    finally:
        u.close()
    # normalise <title ...> / </TITLE> variants to plain <title> tags
    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>',content)
    # drop quoted '<title>' strings so they are not taken for the tag
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('',content)
    start = content.find('<title>')
    if start == -1: return
    end = content.find('</title>', start)
    if end == -1: return
    # 7 == len('<title>')
    title = content[start+7:end].strip()

    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        """Return the utf-8 bytes for one matched HTML entity."""
        entity = m.group()
        try:
            if entity.startswith('&#x'):
                cp = int(entity[3:-1],16)
            elif entity.startswith('&#'):
                cp = int(entity[2:-1])
            else:
                cp = name2codepoint[entity[1:-1]]
            return unichr(cp).encode('utf-8')
        except (KeyError, ValueError, OverflowError):
            # unknown or malformed entity: leave the text as it was
            return entity

    title = r_entity.sub(e, title)

    if title:
        title = unicode.decode(title)
    else: title = 'None'

    title = title.replace('\n', '')
    title = title.replace('\r', '')

    # collapse runs of spaces into a single space
    while "  " in title:
        title = title.replace("  ", " ")

    re_dcc = re.compile(r'(?i)dcc\ssend')
    title = re_dcc.sub('', title)

    if title:
        return title
    else:
        return 'No title'