Example #1
1
 def replaceHTMLCodes(self, text=''):
     # Code from Lambda's ParseDOM file.
     if text is None or len(text) < 1:
         txt = self.txt
     else:
         txt = text
     txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
     txt = HTMLParser().unescape(txt)
     txt = txt.replace("&quot;", "\"")
     txt = txt.replace("&amp;", "&")
     txt = txt.strip()
     return txt
Example #2
def _cleanTitle(title, html=True):
    if html:
        title = HTMLParser().unescape(title)
        if sys.version_info[0] < 3:  # for Python 2
            if isinstance(title, unicode):
                title = title.encode('utf-8')
        return title
    else:
        title = title.replace("&lt;", "<").replace("&gt;", ">").replace(
            "&amp;",
            "&").replace("&#034;", "\"").replace("&#039;", "'").replace(
                "&quot;", "\"").replace("&szlig;",
                                        "ß").replace("&ndash;", "-")
        title = title.replace("&Auml;", "Ä").replace("&Uuml;", "Ü").replace(
            "&Ouml;",
            "Ö").replace("&auml;", "ä").replace("&uuml;", "ü").replace(
                "&ouml;", "ö").replace("&eacute;",
                                       "é").replace("&egrave;", "è")
        title = title.replace("&#x00c4;", "Ä").replace(
            "&#x00e4;",
            "ä").replace("&#x00d6;", "Ö").replace("&#x00f6;", "ö").replace(
                "&#x00dc;", "Ü").replace("&#x00fc;",
                                         "ü").replace("&#x00df;", "ß")
        title = title.replace("&apos;", "'").strip()
        return title
Example #3
0
    def texify(self, text_str, leave_nl=False):
        """
            Wiki TeX like strings to real TeX.
        """
        # bug with %\n ...
        text_str = u"\n".join([x.strip().rstrip(u"%") for x in text_str.splitlines()])

        changed = True
        while changed:
            changed = False
            for (k, v) in [
                # before the others
                (u"&amp;gt;", u" \gt "),
                (u"&amp;lt;", u" \lt "),
                (u"&amp;minus;", u" - "),
                #
                (u"&amp;", u"&"),

                (u"\;", u" "),
                (u"\!", u" "),
                (u"\,", u" "),

                #
                (u"&lt;", u" \lt "),
                (u"&gt;", u" \gt "),
                (u"<", u" \lt "),
                (u">", u" \gt "),
                (u"&lt", u" \lt "),
                (u"&gt", u" \gt "),

            ]:
                old_str = text_str
                text_str = old_str.replace(k, v)
                if old_str != text_str:
                    changed = True

        # special chars?
        text_str = HTMLParser().unescape(text_str)
        for (k, v) in [
            # failsafe
            (u"&", u" "),
        ]:
            text_str = text_str.replace(k, v)

        #replace
        # USE CAREFULLY because of f(x,a) problem
        #
        changed = True
        while changed:
            old_str = text_str

            if not leave_nl:
                text_str = text_str.replace(u"\n", u" ")
                text_str = text_str.replace(u"\r", u" ")

            # some of the interpunction math does really make sense e.g., ...
            text_str = text_str.strip()
            text_str = re.compile(ur'\.\s*\.\s*\.').sub(ur'\ldots ', text_str)
            text_str = re.compile(r'(^[.,]*|[.,\\\\]*$)').sub('', text_str)
            changed = (old_str != text_str)

        return text_str.strip()
Example #4
0
def _replaceHTMLCodes(txt):
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    txt = txt.strip()
    return txt
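The snippets in this listing generally assume `re` and `HTMLParser` are already imported and were written for Python 2; `HTMLParser.unescape()` was deprecated in Python 3.4 and removed in 3.9. A minimal sketch of the same cleanup on modern Python, using the standard-library `html.unescape()` instead (the name `replace_html_codes_py3` is just an illustrative choice):

    import re
    from html import unescape  # the Python 3 replacement for HTMLParser().unescape


    def replace_html_codes_py3(txt):
        # Add the missing ";" to bare numeric entities such as "&#39 " before unescaping.
        txt = re.sub(r"(&#[0-9]+)([^;^0-9]+)", r"\1;\2", txt)
        txt = unescape(txt)
        # Keep the explicit fallbacks from Example #4 for already-unescaped fragments.
        txt = txt.replace("&quot;", "\"").replace("&amp;", "&")
        return txt.strip()


    print(replace_html_codes_py3("Tom &amp; Jerry &#39s"))  # Tom & Jerry 's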
Example #5
0
    def texify(self, text_str, leave_nl=False):
        """
            Wiki TeX like strings to real TeX.
        """
        # bug with %\n ...
        text_str = u"\n".join(
            [x.strip().rstrip(u"%") for x in text_str.splitlines()])

        changed = True
        while changed:
            changed = False
            for (k, v) in [
                    # before the others
                (u"&amp;gt;", u" \gt "),
                (u"&amp;lt;", u" \lt "),
                (u"&amp;minus;", u" - "),
                    #
                (u"&amp;", u"&"),
                (u"\;", u" "),
                (u"\!", u" "),
                (u"\,", u" "),

                    #
                (u"&lt;", u" \lt "),
                (u"&gt;", u" \gt "),
                (u"<", u" \lt "),
                (u">", u" \gt "),
                (u"&lt", u" \lt "),
                (u"&gt", u" \gt "),
            ]:
                old_str = text_str
                text_str = old_str.replace(k, v)
                if old_str != text_str:
                    changed = True

        # special chars?
        text_str = HTMLParser().unescape(text_str)
        for (k, v) in [
                # failsafe
            (u"&", u" "),
        ]:
            text_str = text_str.replace(k, v)

        #replace
        # USE CAREFULLY because of f(x,a) problem
        #
        changed = True
        while changed:
            old_str = text_str

            if not leave_nl:
                text_str = text_str.replace(u"\n", u" ")
                text_str = text_str.replace(u"\r", u" ")

            # some of the interpunction math does really make sense e.g., ...
            text_str = text_str.strip()
            text_str = re.compile(ur'\.\s*\.\s*\.').sub(ur'\ldots ', text_str)
            text_str = re.compile(r'(^[.,]*|[.,\\\\]*$)').sub('', text_str)
            changed = (old_str != text_str)

        return text_str.strip()
Example #7
0
def normalize(s):
    # Unescape html
    s = s.decode("utf-8")
    s = HTMLParser().unescape(s)
    s = s.replace("\n", " ")
    s = s.replace("\r", " ")

    while "  " in s:
        s = s.replace("  ", " ")

    return s.strip()
Example #9
0
def replace_html_codes(txt):
    try:
        from HTMLParser import HTMLParser
    except ImportError:
        from html.parser import HTMLParser
    import re
    txt = to_utf8(txt)
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt
Example #10
  def convert(self, nameToClean):
    if (nameToClean == None):
        return None

    try:
        s = nameToClean.decode('utf8')
    except UnicodeDecodeError:
        s = nameToClean
    except UnicodeEncodeError:
        s = nameToClean

    s = HTMLParser().unescape(s)
    s = s.replace('<', '{').replace('>', '}')
    s = s.replace('\n', ' ').replace('\r', '').replace('\t', ' ')

    return s
Example #11
0
    def convert(self, nameToClean):
        if (nameToClean == None):
            return None

        try:
            s = nameToClean.decode('utf8')
        except UnicodeDecodeError:
            s = nameToClean
        except UnicodeEncodeError:
            s = nameToClean

        s = HTMLParser().unescape(s)
        s = s.replace('<', '{').replace('>', '}')
        s = s.replace('\n', ' ').replace('\r', '').replace('\t', ' ')

        return s
Example #12
0
def getlyrics1(artist, name):
    urlpath = artist + ':' + name
    urlpath = urlpath.replace(' ', '_')
    urlpath_ascii = urlpath.translate(toascii)
    # prefer Gracenote, so no set() to remove duplicates
    paths = ['Gracenote:'+urlpath, urlpath]
    if urlpath_ascii != urlpath:
        paths.extend(['Gracenote:'+urlpath.translate(toascii), urlpath.translate(toascii)])

    for p in paths:
        if verbose: print 'Trying wikia ' + p
        url = 'http://lyrics.wikia.com/' + p
        try:
            html = ''.join(urllib.urlopen(url.encode('utf-8')).readlines())
        except:
            if verbose: print 'Unable to connect'
            return None
        if html.find('<meta name="description" content="Instrumental" />') >= 0:
            return 'Instrumental'
        res = re.search('height=\'17\'/></a></div>\s*(?:<p>)?(.*?)<!--\s*(?:<p>)?NewPP limit report', html,
                        re.S | re.I | re.U)
        if res:
            txt = res.group(1)
            while txt.find('&#') >= 0:
                txt = HTMLParser().unescape(txt)
            return txt.replace('<br />', '\n')
    return None
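Examples #12, #15, and #27 all unescape in a loop (`while txt.find('&#') >= 0`) because the scraped pages sometimes double-encode entities. A standalone sketch of the same idea for Python 3, with a round cap added as an assumption so a stray literal "&#" cannot loop forever:

    from html import unescape


    def unescape_fully(text, max_rounds=5):
        # Repeatedly unescape to collapse double-encoded entities,
        # e.g. "&amp;#233;" -> "&#233;" -> "é".
        for _ in range(max_rounds):
            if '&#' not in text and '&amp;' not in text:
                break
            text = unescape(text)
        return text


    print(unescape_fully("Beyonc&amp;#233; &amp;amp; Jay-Z"))  # Beyoncé & Jay-Z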
Example #13
0
def insert_to(project_url, destination, find_what, indent=0):
  url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
  response = urllib2.urlopen(url)
  if response.getcode() == 200:
    with open(destination, 'r') as dest:
      dest_contents = dest.readlines()
      lines = ''.join(dest_contents)
      content = HTMLParser().unescape(response.read())
      if content.replace(' ', '') in lines.replace(' ', ''):
        print_out('IGNORED', destination)
        return

    generated = []
    for line in dest_contents:
      generated.append(line)
      if line.lower().find(find_what.lower()) >= 0:
        spaces = len(line) - len(line.lstrip())
        for l in content.split('\n'):
          if l:
            generated.append('%s%s\n' % (' ' * (spaces + indent), l))

    with open(destination, 'w') as dest:
      for line in generated:
        dest.write(line)
      print_out('INSERT', destination)
Example #14
0
def insert_to(project_url, destination, find_what, indent=0):
	url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
	response = urllib2.urlopen(url)
	if response.getcode() == 200:
		with open(destination, 'r') as dest:
			dest_contents = dest.readlines()
			lines = ''.join(dest_contents)
			content = HTMLParser().unescape(response.read())
			if content.replace(' ', '') in lines.replace(' ', ''):
				print_out('IGNORED', destination)
				return

		generated = []
		for line in dest_contents:
			generated.append(line)
			if line.lower().find(find_what.lower()) >= 0:
				spaces = len(line) - len(line.lstrip())
				for l in content.split('\n'):
					if l:
						generated.append('%s%s\n' % (' ' * (spaces + indent), l))

		with open(destination, 'w') as dest:
			for line in generated:
				dest.write(line)
			print_out('INSERT', destination)
Example #15
0
def getlyrics3(artist, name):
    if verbose: print 'Trying letras.mus.br'
    url = 'http://letras.mus.br/' + artist.replace(' ', '-').lower() + '/'
    try:
        songs = ''.join(urllib.urlopen(url.encode('utf-8')).readlines())
    except:
        if verbose: print 'Unable to connect'
        return None
    res = re.search('<li><a href="([^"]*)">' + name + '</a></li>', songs, re.S | re.I | re.U)
    if not res:
        return None
    url = 'http://letras.mus.br' + res.group(1)
    try:
        html = ''.join(urllib.urlopen(url.encode('utf-8')).readlines())
    except:
        if verbose: print 'Unable to connect'
        return None
    res = re.search('<div id="div_letra" data-linhas="\d+">\s+(.*?)</div>',
                    html, re.S | re.I | re.U)
    if res:
        txt = res.group(1)
        while txt.find('&#') >= 0:
            txt = HTMLParser().unescape(txt)
        return txt.replace('<br/>', '').replace('<p>', '').replace('</p>', '').decode('utf-8')
    return None
Example #16
0
def create_tweet(catalyst=''):
    b = Brain(os.path.join(os.path.dirname(__file__), 'cobe.brain'))

    # get a reply from brain, encode as UTF-8
    i = 0

    while True:
        tweet = b.reply(catalyst).encode('utf-8', 'replace')
        if(config['filter_urls']):
            tweet = remove_url(tweet)
        tweet = smart_truncate(tweet)


        # check if last words of tweet are less than 4 and remove them
        last_words_twert = tweet.split(' ')
        while last_words_twert and len(last_words_twert[-1]) < 4:
            print "[debug] Removing last word:"+last_words_twert[-1]
            del(last_words_twert[-1])
        tweet = ' '.join(last_words_twert)



        #make sure we're not tweeting something close to something else in the txt files
        #or we can just give up after 100 tries
        if check_tweet(tweet) or i >= 100:
            break
        i += 1
    
    tweet = HTMLParser().unescape(tweet)
    tweet = tweet.upper()
    # clean up miscellaneous characters INCLUDING NUMBERS?
    for ch in ['(',')','1','2','3','4','5','6','7','8','9','0','.',', ,','-,','-;','-.',',,',' ;' ]:
        if ch in tweet:
            tweet=tweet.replace(ch,"")
        if ' TH ' in tweet:
            tweet = tweet.replace(' TH ',' ')
        if ' ND ' in tweet:
            tweet = tweet.replace(' ND ',' ')
        if ' RD ' in tweet:
            tweet = tweet.replace(' RD ',' ')
        if 'THE OF' in tweet:
            tweet = tweet.replace('THE OF ',' ')
        if "  " in tweet:
            tweet = tweet.replace("  "," ")
        if " - " in tweet:
            tweet = tweet.replace(" - "," ")
        if " , " in tweet:
            tweet = tweet.replace(" , ",", ")
    tweet = tweet.rstrip(" ,;=-")
    tweet = tweet.lstrip(" ,;=-?{}[]/_=+")

        
    #put the tweet in the db
    db_manager.insert_tweet(tweet)

    return tweet
Example #17
0
    def clean_keywords(self):
        try:
            import urllib.parse
        except ImportError:  # python2 compatible
            import urllib

        try:  # python2 compatible
            from HTMLParser import HTMLParser
        except ImportError:
            from html.parser import HTMLParser

        def unicode_escape(unistr):
            """
            Tidys up unicode entities into HTML friendly entities
            Takes a unicode string as an argument
            Returns a unicode string
            """
            try:  # python2 compatible
                import htmlentitydefs
                codepoint2name = htmlentitydefs.codepoint2name
            except ImportError:
                import html.entities
                codepoint2name = html.entities.codepoint2name
            escaped = ""
            for char in unistr:
                if ord(char) in codepoint2name:
                    name = codepoint2name[ord(char)]
                    escaped += '&%s;' % name if 'nbsp' not in name else ' '
                else:
                    escaped += char
            return escaped
        keywords = self.cleaned_data['keywords']
        _unsescaped_kwds = []
        for k in keywords:
            try:  # python2 compatible
                _k = urllib.unquote(('%s' % k)).split(",")
            except AttributeError:
                _k = urllib.parse.unquote(('%s' % k)).split(",")
            if not isinstance(_k, six.string_types):
                for _kk in [x.strip() for x in _k]:
                    _kk = HTMLParser().unescape(unicode_escape(_kk))
                    # Simulate JS Unescape
                    _kk = _kk.replace('%u', r'\u').decode('unicode-escape') if '%u' in _kk else _kk
                    _hk = HierarchicalKeyword.objects.filter(name__iexact='%s' % _kk.strip())
                    if _hk and len(_hk) > 0:
                        _unsescaped_kwds.append(_hk[0])
                    else:
                        _unsescaped_kwds.append(_kk)
            else:
                _hk = HierarchicalKeyword.objects.filter(name__iexact=_k.strip())
                if _hk and len(_hk) > 0:
                    _unsescaped_kwds.append(_hk[0])
                else:
                    _unsescaped_kwds.append(_k)
        return _unsescaped_kwds
Example #18
0
 def run(self):
     self.progressbar_show.emit(True)
     self.info_label.emit(translate("AddonsInstaller", "Retrieving description..."))
     if len(self.macros[self.idx]) > 2:
         desc = self.macros[self.idx][2]
         url = self.macros[self.idx][4]
     else:
         mac = self.macros[self.idx][0].replace(" ","_")
         mac = mac.replace("&","%26")
         mac = mac.replace("+","%2B")
         url = "https://www.freecadweb.org/wiki/Macro_"+mac
         self.info_label.emit("Retrieving info from " + str(url))
         if ctx:
             u = urllib2.urlopen(url,context=ctx)
         else:
             u = urllib2.urlopen(url)
         p = u.read()
         if sys.version_info.major >= 3 and isinstance(p, bytes):
             p = p.decode("utf-8")
         u.close()
         code = re.findall("<pre>(.*?)<\/pre>",p.replace("\n","--endl--"))
         if code:
             # code = code[0]
             # take the biggest code block
             code = sorted(code,key=len)[-1]
             code = code.replace("--endl--","\n")
         else:
             self.info_label.emit(translate("AddonsInstaller", "Unable to fetch the code of this macro."))
             self.progressbar_show.emit(False)
             self.stop = True
             return
         desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>",p.replace("\n"," "))
         if desc:
             desc = desc[0]
         else:
             self.info_label.emit(translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
             desc = "No description available"
         # clean HTML escape codes
         try:
             from HTMLParser import HTMLParser
         except ImportError:
             from html.parser import HTMLParser
         try:
             code = code.decode("utf8")
             code = HTMLParser().unescape(code)
             code = code.encode("utf8")
             code = code.replace("\xc2\xa0", " ")
         except:
             FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ")+mac+"\n")
         self.update_macro.emit(self.idx,self.macros[self.idx]+[desc,code,url])
     if self.macros[self.idx][1] == 1 :
         message = "<strong>" + translate("AddonsInstaller", "<strong>This addon is already installed.") + "</strong><br>" + desc + ' - <a href="' + url + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">' + url + '</span></a>'
     else:
         message = desc + ' - <a href="' + url + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">' + url + '</span></a>'
     self.info_label.emit( message )
     self.progressbar_show.emit(False)
     self.stop = True
Example #19
0
def get_filename_from_title(title, ext='.m4a'):
    """
    Creates a filename from title
    """
    if not title:
        return 'music' + ext
    title = HTMLParser().unescape(title)
    for _ in FILENAME_EXCLUDE:
        title = title.replace(_, ' ')  # provide readability with space
    return title + ext  # TODO - smart hunt
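Example #19 depends on a module-level FILENAME_EXCLUDE collection that is not shown here; a hypothetical value (just a guess at characters that are awkward in filenames) is enough to try the function:

    # Hypothetical stand-in for the FILENAME_EXCLUDE constant Example #19 expects;
    # the real project may exclude a different set of characters.
    FILENAME_EXCLUDE = set('\\/:*?"<>|')

    # Requires get_filename_from_title from Example #19 (and its HTMLParser import).
    print(get_filename_from_title('AC&#47;DC &amp; Friends'))
    # -> roughly 'AC DC & Friends.m4a' once the entities are unescaped
    #    and the excluded characters are replaced with spaces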
Example #20
0
 def run(self):
     self.progressbar_show.emit(True)
     self.info_label.emit(translate("AddonsInstaller", "Retrieving description..."))
     if len(self.macros[self.idx]) > 2:
         desc = self.macros[self.idx][2]
         url = self.macros[self.idx][4]
     else:
         mac = self.macros[self.idx][0].replace(" ","_")
         mac = mac.replace("&","%26")
         mac = mac.replace("+","%2B")
         url = "https://www.freecadweb.org/wiki/Macro_"+mac
         self.info_label.emit("Retrieving info from " + str(url))
         if ctx:
             u = urllib2.urlopen(url,context=ctx)
         else:
             u = urllib2.urlopen(url)
         p = u.read()
         if sys.version_info.major >= 3 and isinstance(p, bytes):
             p = p.decode("utf-8")
         u.close()
         code = re.findall("<pre>(.*?)<\/pre>",p.replace("\n","--endl--"))
         if code:
             code = code[0]
             code = code.replace("--endl--","\n")
         else:
             self.info_label.emit(translate("AddonsInstaller", "Unable to fetch the code of this macro."))
             self.progressbar_show.emit(False)
             self.stop = True
             return
         desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>",p.replace("\n"," "))
         if desc:
             desc = desc[0]
         else:
             self.info_label.emit(translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
             desc = "No description available"
         # clean HTML escape codes
         try:
             from HTMLParser import HTMLParser
         except ImportError:
             from html.parser import HTMLParser
         try:
             code = code.decode("utf8")
             code = HTMLParser().unescape(code)
             code = code.encode("utf8")
             code = code.replace("\xc2\xa0", " ")
         except:
             FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ")+mac+"\n")
         self.update_macro.emit(self.idx,self.macros[self.idx]+[desc,code,url])
     if self.macros[self.idx][1] == 1 :
         message = "<strong>" + translate("AddonsInstaller", "<strong>This addon is already installed.") + "</strong><br>" + desc + ' - <a href="' + url + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">' + url + '</span></a>'
     else:
         message = desc + ' - <a href="' + url + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">' + url + '</span></a>'
     self.info_label.emit( message )
     self.progressbar_show.emit(False)
     self.stop = True
Example #21
    def parse_result_page(self, page, soup):
        ''' Parse each bird on the result page '''
        results = []
        for result in soup.findAll('div', 'search-result-text'):
            self.bird_counter += 1

            # extract data from the results page
            name = HTMLParser().unescape(result.find('h3', 'search-result-title').find('a')\
                .contents[0]).encode('ascii', 'replace')
            name = name.replace("Stirton?s", "Stirton's") # TODO - make more robust

            url = self.base_url + result.find('h3', 'search-result-title').find('a')\
                .attrs[0][1].encode('ascii', 'replace')

            scientific = HTMLParser().unescape(result.find('p', 'search-result-scientific').\
                contents[0]).encode('ascii', 'replace')

            con_status = result.find('p', 'search-result-status').contents[1].encode('ascii', 'replace')
            tries = 0

             # Get detail page for each bird
            detail = None
            while tries < self.max_tries:
                try:
                    print('  Getting details for "{}" (attempt {}) {}'.format(name, tries + 1, url))
                    detail = bs(urllib2.urlopen(url).read())
                    tries += 1
                    break
                except urllib2.URLError:
                    # sleep briefly if there's an error
                    tries += 1
                    time.sleep(self.sleep_dur)

            if not detail:
                print('* Exiting early as {} failed {} times...'.format(url, tries + 1))
                sys.exit()

            # extract additional data from details page
            order       = self.get_infobox_text('Order: ', detail)
            family      = self.get_infobox_text('Family: ', detail)
            nz_status   = self.get_infobox_text('New Zealand status: ', detail)

            # compile results into a single line for export to CSV
            results.append([
                name.strip(),
                order.strip(),
                family.strip(),
                scientific.strip(),
                con_status.strip(),
                nz_status.strip(),
                url.strip()
            ])
            print('  Got {} ({})'.format(name, self.bird_counter))
        return results
Example #22
0
def sort_out_html_like_tags(string):
    
    string_unescaped = HTMLParser().unescape(string)
    string_without_nbsp = string_unescaped.replace('&nbsp;', '')
    string_without_ndash = string_without_nbsp.replace('&ndash;','')
    string_without_sup_tag = re.sub('<sup>.*?</sup>', '', string_without_ndash)
    string_without_sub_tag = re.sub('<sub>.*?</sub>', '', string_without_sup_tag)
    string_without_small_tag = re.sub('<small>.*?</small>', '', string_without_sub_tag)
    string_without_references_0 = re.sub('<ref.*?>.*?</ref>', '', string_without_small_tag, flags=re.DOTALL)
    string_without_references = re.sub('<ref.*?>', '', string_without_references_0)

    return string_without_references
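In re.sub the fourth positional argument is count, not flags, which is why the DOTALL flag above has to be passed as a keyword for the `<ref>` pattern to span newlines. A quick check with made-up wikitext:

    import re

    sample = "mass&nbsp;spectrometry<ref name=x>a\nmulti-line\nreference</ref> text"
    print(re.sub('<ref.*?>.*?</ref>', '', sample, flags=re.DOTALL))
    # -> 'mass&nbsp;spectrometry text'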
Example #23
0
def extract_tweets(raw_html):
	''' This function scrapes the specified HTML string for Tweets and some related information.
	    Returns a list of lists(username, friendly_time, timestamp, tweet_text). '''
		
	if (len(raw_html)==0): raise TypeError("No raw_html specified");
		
	#Set up some temporary and holding variables for later
	retrieved_tweets = []; active_tweet= []; to_append="";

	#Query for username UNION time UNION timestamp UNION text
	xpath_query = "//span[starts-with(@class,'username')] | //small[@class = 'time']/a/@title | //span[starts-with(@class, '_timestamp')]/@data-time-ms | //p[contains(@class,'js-tweet-text')]"
	tree = html.fromstring(raw_html)
	query_results = tree.xpath(xpath_query)

	#Walk through query results
	for q in query_results:
		#We can extract all elements directly, EXCEPT for tweet text, because that's not an actual text element yet
		#See http://stackoverflow.com/questions/29398751 for why we query it like this (it's because of formatting)
		if (type(q) is lxml.html.HtmlElement):
			to_append = q.text_content()
		else: to_append = q;

		#Clean the extracted element up a little, make sure it's UTF-8 encoded and contains no linebreaks
		to_append = HTMLParser().unescape(to_append)
		to_append = to_append.encode('utf-8', errors='replace')
		to_append = to_append.replace('\n', ' ')
		to_append = to_append.replace(';', ',')

		#Append the cleaned-up string to the active element
		active_tweet.append(to_append)

		#Each tweet item contains (username, time, timestamp, text), so:
		#if we have reached a length of 4, the current item is finished and can be appended to the result set
		if (len(active_tweet) == 4):
			retrieved_tweets.append(active_tweet)
			active_tweet = []

	#Once we've walked through all query elements, the analysis is finished and we return the list-of-lists
	return retrieved_tweets
Example #24
0
 def fill_details_from_wiki(self, url):
     try:
         if ctx:
             u = urllib2.urlopen(url, context=ctx)
         else:
             u = urllib2.urlopen(url)
     except urllib2.HTTPError:
         return
     p = u.read()
     if sys.version_info.major >= 3 and isinstance(p, bytes):
         p = p.decode('utf-8')
     u.close()
     code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--'))
     if code:
         # code = code[0]
         # take the biggest code block
         code = sorted(code, key=len)[-1]
         code = code.replace('--endl--', '\n')
     else:
         FreeCAD.Console.PrintWarning(
             translate("AddonsInstaller",
                       "Unable to fetch the code of this macro."))
     # Clean HTML escape codes.
     try:
         from HTMLParser import HTMLParser
     except ImportError:
         from html.parser import HTMLParser
     try:
         code = code.decode('utf8')
         code = HTMLParser().unescape(code)
         code = code.encode('utf8')
         code = code.replace('\xc2\xa0', ' ')
     except:
         FreeCAD.Console.PrintWarning(
             translate("AddonsInstaller", "Unable to clean macro code: ") +
             mac + '\n')
     desc = re.findall(
         "<td class=\"ctEven left macro-description\">(.*?)<\/td>",
         p.replace('\n', ' '))
     if desc:
         desc = desc[0]
     else:
         FreeCAD.Console.PrintWarning(
             translate("AddonsInstaller",
                       "Unable to retrieve a description for this macro."))
         desc = "No description available"
     self.desc = desc
     self.url = url
     self.code = code
     self.parsed = True
Example #25
0
def normalize_string(string, charset=None, replacing=False):
    """
    Decode and Convert to Unicode any string
    :param charset: encoding
    :type charset: str
    :param string: string to convert
    :type string: str or unicode
    :param replacing: Whether is ' is replaced
    :type replacing: bool
    :return: converted unicode
    :rtype: unicode
    """
    if not isinstance(string, unicode):
        try:
            if re.search(u'=[0-9a-fA-F]{2}', string):
                string = string.decode('Quoted-printable')

            string = json.loads(u'%s' % string, encoding=charset)

        except ValueError:
            try:
                string = unicode(eval(string), 'raw_unicode_escape')

            except (SyntaxError, NameError):
                string = string.decode('latin-1')
                pass

            except TypeError:
                string = unicode(string, errors='ignore')
                pass

        except LookupError:
            return u''

        except TypeError:
            string = unicode(string, errors='ignore')
            pass

    string = remove_control_chars(string)
    string = fix_bad_unicode(string)
    string = unquote(string)
    string = string.replace(u'<![CDATA[', u'').replace(u']]', u'')
    string = HTMLParser().unescape(string)
    if replacing:
        string = string.replace(u"'", '')

    string = string.lower()

    return string
Example #26
0
def sort_out_html_like_tags(string):

    string_unescaped = HTMLParser().unescape(string)
    string_without_nbsp = string_unescaped.replace('&nbsp;', '')
    string_without_ndash = string_without_nbsp.replace('&ndash;', '')
    string_without_sup_tag = re.sub('<sup>.*?</sup>', '', string_without_ndash)
    string_without_sub_tag = re.sub('<sub>.*?</sub>', '',
                                    string_without_sup_tag)
    string_without_small_tag = re.sub('<small>.*?</small>', '',
                                      string_without_sub_tag)
    string_without_references_0 = re.sub('<ref.*?>.*?</ref>', '',
                                         string_without_small_tag,
                                         flags=re.DOTALL)
    string_without_references = re.sub('<ref.*?>', '',
                                       string_without_references_0)

    return string_without_references
Example #27
0
def getlyrics2(artist, name):
    path = name + '_lyrics_' + artist
    path = path.lower().replace(' ', '_')
    for p in set([path, path.translate(toascii)]):
        if verbose: print 'Trying lyricsmania ' + p
        url = 'http://www.lyricsmania.com/' + p + '.html'
        try:
            html = ''.join(urllib.urlopen(url.encode('utf-8')).readlines())
        except:
            if verbose: print 'Unable to connect'
            return None
        res = re.search("<div id='songlyrics_h' class='dn'>(.*?)</div>", html, re.S | re.I | re.U)
        if res:
            txt = res.group(1)
            while txt.find('&#') >= 0:
                txt = HTMLParser().unescape(txt)
            return txt.replace('<br />', '').decode('utf-8')
    return None
Example #28
0
    def update(self, creator, project, progress_bar, force=False):
        """Adds a project to the project cache if it hasn't already been cached."""

        sanitizer = [('\\\\"', "'"),]
        project_url = 'http://www.kickstarter.com/projects/{}/{}'.format(creator, project)

        if project in self.project_cache and force == False:
            progress_bar.draw('c')
            progress_bar.increment()
            return  # Exit quickly if we can
        else:
            response = self.session.get(project_url)
            response_tree = etree.parse(StringIO(response.text), etree.HTMLParser(encoding = 'UTF-8'))
            current_project = HTMLParser().unescape(response_tree.xpath('//head/script[contains(text(), "window.current_project")]')[0].text.split('"')[1])
            for rule in sanitizer:
                current_project = current_project.replace(*rule)
            self.project_cache[project] = json.loads(current_project)
            progress_bar.draw('u')
            progress_bar.increment()
Example #29
0
 def fill_details_from_wiki(self, url):
     try:
         if ctx:
             u = urllib2.urlopen(url, context=ctx)
         else:
             u = urllib2.urlopen(url)
     except urllib2.HTTPError:
         return
     p = u.read()
     if sys.version_info.major >= 3 and isinstance(p, bytes):
         p = p.decode('utf-8')
     u.close()
     code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--'))
     if code:
         # code = code[0]
         # take the biggest code block
         code = sorted(code, key=len)[-1]
         code = code.replace('--endl--', '\n')
     else:
         FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to fetch the code of this macro."))
     # Clean HTML escape codes.
     try:
         from HTMLParser import HTMLParser
     except ImportError:
         from html.parser import HTMLParser
     try:
         code = code.decode('utf8')
         code = HTMLParser().unescape(code)
         code = code.encode('utf8')
         code = code.replace('\xc2\xa0', ' ')
     except:
         FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ") + mac + '\n')
     desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>", p.replace('\n', ' '))
     if desc:
         desc = desc[0]
     else:
         FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
         desc = "No description available"
     self.desc = desc
     self.url = url
     self.code = code
     self.parsed = True
Example #30
	def run(self, edit):
		if self.view.sel()[0].empty():
			region = sublime.Region(0, self.view.size())
			sublime.status_message('Beautifying Entire File')
			rawcode = self.view.substr(region)
			# print region
		else:
			region = self.view.line(self.view.sel()[0])
			sublime.status_message('Beautifying Selection Only')
			rawcode = self.view.substr(self.view.sel()[0])
			# print region
		rawcode = rawcode.replace('\n','').replace('{','~').replace('}','`')
		soup = BeautifulSoup(u''.join(rawcode))

		unformatted_tag_list = []

		for i, tag in enumerate(soup.find_all(['span', 'a', 'strong', 'em', 'b', 'i', 'input', 'button', 'script', 'option', 'label'])):
			unformatted_tag_list.append(unicode(str(tag), 'utf-8'))
			tag.replace_with('{' + u"unformatted_tag_list[{0}]".format(i) + '}')
		rawcode = soup.prettify().format(unformatted_tag_list=unformatted_tag_list)
		rawcode = HTMLParser().unescape(rawcode.replace('~','{').replace('`','}').replace('> <','><'))
		self.view.replace(edit, region, r.sub(r'\1' * indent_width, rawcode))
Example #31
0
 tries = 0
 while tries < max_tries:
     print tries
     try:
         soup = bs(urllib2.urlopen(nzbirds_page % page).read())
         tries = max_tries
         break
     except urllib2.URLError:
         tries += 1
 for result in soup.findAll('div', 'search-result-text'):
     count += 1
     name = HTMLParser().unescape(
         result.find('h3', 'search-result-title').find('a').contents[0]).encode('ascii', 'replace')
     name = name.replace("Stirton?s", "Stirton's")
     url = nzbirds_base + result.find(
         'h3', 'search-result-title').find('a').attrs[0][1].encode('ascii', 'replace')
     scientific = HTMLParser().unescape(
         result.find('p', 'search-result-scientific').contents[0]).encode('ascii', 'replace')
     status = result.find('p', 'search-result-status').contents[1].encode('ascii', 'replace')
     tries = 0
     while tries < max_tries:
         try:
             print tries
             detail = bs(urllib2.urlopen(url).read())
Example #32
0
 def getTitleFromSoup(self, video_url):
     req = requests.get(video_url)
     soup = BeautifulSoup(req.text)
     html = soup.title.string
     clean_title = HTMLParser().unescape(html)
     return clean_title.replace('|', '-')  # | (pipe characters) screw with reddit's table formatting
Example #33
0
class FocraSpider(Spider):
	name = 'focras'
	'''
	To access scrapy's core API. basically can modify anything in the 'crawler'
	'''
	@classmethod
	def from_crawler(cls, crawler, **kwargs):
		print "focras - from crawler"
		spider = cls(stats=crawler.stats, settings=crawler.settings, **kwargs)
		crawler.signals.connect(spider.stopped, signals.engine_stopped)
		crawler.signals.connect(spider.idle, signals.spider_idle)
		return spider
	
	def __init__(self, stats=None, settings=None, **kwargs):
		super(FocraSpider, self).__init__(**kwargs)
		try:
			self.start_time = time.time()
			print 'focras init(' + self.cname + ') kwargs seeds ' + kwargs.get('seeds')
			print 'focras init(' + self.cname + ') kwargs template '+ self.template
			self.queue = Queue.Queue()
			self.queue_counter = 0
			self.queue_reload_counter = 0
			# to save the state of the pagination
			self.next_page_link = None
			self.end_of_data = False
			self.template = json.loads(self.template, object_pairs_hook=collections.OrderedDict)
			self.item = Item()
			self.pager = HTMLParser().unescape(self.pager)
			self.base_url = kwargs.get('seeds').split(',')
			self.crawled_pages = 0
			self.status = None
			self.lcam = None
			
			# non chain crawler dont have a queue, check for pager only
			# chain crawler url does not start with http
			if self.base_url[0].startswith('http'):
				# for request_url of chain crawler
				self.parentname = None
				if self.runtype == 'resume' and self.pager != 'null':
					db = client['FocraDB']
					collection = db['crawler']
					cursor_focra = collection.find_one({'_id':self.cname})
					self.base_url = [cursor_focra.get('next_page_link')]
					self.crawled_pages = cursor_focra.get('crawled_pages')
					self.start_time = self.start_time - cursor_focra.get('time_executed')
					client.close()
					print self.cname + " - Resume page is: " + self.base_url[0]
					self.start_urls = self.base_url
				else:
					print self.cname + " - Start page is: " + self.base_url[0]
					self.start_urls = self.base_url
			else:
				# chain crawler
				# get parent and field info from seeds
				self.parentname = self.base_url.pop()
				self.fieldname = self.base_url.pop()
				# connect using parent name and get first 100 of the field name
				self.crawler_db = settings['CRAWLER_DB']
				db = client[self.crawler_db]
				collection = db[self.parentname]
				if self.runtype == 'resume':
					db_focra = client['FocraDB']
					cursor_focra = db_focra['crawler'].find_one({'_id': self.cname})
					self.queue_counter = cursor_focra.get('queue_counter')
					self.next_page_link = cursor_focra.get('next_page_link')
					self.crawled_pages = cursor_focra.get('crawled_pages')
					self.start_time = self.start_time - cursor_focra.get('time_executed')
					print self.cname + " - Loading Queue from " + str(self.queue_counter)
					cursor = collection.find({}, {self.fieldname: 1}).skip(self.queue_counter).limit(LINK_NUMBER)
					self.queue_reload_counter = self.queue_reload_counter + LINK_NUMBER + self.queue_counter
				else:
					cursor = collection.find({}, {self.fieldname: 1}).limit(LINK_NUMBER)
					# set the queue reload counter
					self.queue_reload_counter += LINK_NUMBER
				client.close()
				
				if cursor.count() <= self.queue_reload_counter:
					print self.cname + '- No more links to load'
					self.end_of_data = True
						
				# put it into queue
				for link in cursor:
					if link.get(self.fieldname):
						soup = BeautifulSoup(link.get(self.fieldname))
						# to see the links added to queue
						#print soup.a['href']
						self.queue.put(soup.a['href'])
				
				# if resume
				if self.next_page_link:
					self.base_url = [self.next_page_link]
					print self.cname + " - Resume page is: " + self.base_url[0]
					self.start_urls = self.base_url
				else:
					self.base_url = [self.queue.get()]
					if self.queue_counter == 0:
						self.queue_counter += 1
						print self.cname + " - Start page is: " + self.base_url[0]
					else:
						print self.cname + " - Resume page is: " + self.base_url[0]
					self.start_urls = self.base_url
		except Exception as error:
			print error
	
	# interrupted state, crawler status determined by views.py
	# it is stopped or paused
	def stopped(self):
		try:
			if self.runtype != 'complete':
				print self.cname + " - Stopped"
				db = client['FocraDB']
				collection = db['crawler']
				# chain crawler queue from parent crawler
				if self.queue_counter != 0:
					collection.update({"_id": self.cname}, {"$set":{'queue_counter': self.queue_counter, 
																 	'crawled_pages': self.crawled_pages,
																 	'time_executed': time.time() - self.start_time}})
					print self.cname + " - Saved queue counter is: " + str(self.queue_counter)
				# main or chained crawler pager state
				if self.pager != 'null' and self.next_page_link:
					collection.update({"_id": self.cname}, {"$set":{'next_page_link': self.next_page_link,
																 	'crawled_pages': self.crawled_pages,
																 	'time_executed': time.time() - self.start_time}})
					print self.cname + " - Saved Page link is: " + str(self.next_page_link)
				client.close()
		except Exception as err:
			print err
	
	# closed gracefully, crawler status complete
	def idle(self):
		try:
			# crawl completed
			if self.status == 'running':
				db = client['FocraDB']
				collection = db['crawler']
				collection.update({"_id": self.cname}, {"$set":{'crawlerAddr': '',
																'crawlerStatus': 'completed',
																'crawled_pages': self.crawled_pages,
																'time_executed': time.time() - self.start_time}})
				print self.cname + " - Crawl completed, closing gracefully"
				self.runtype = 'complete'
				client.close()
		except Exception as err:
			print err
			
	def parse(self, response):		
		try:
			self.crawled_pages += 1
			db = client['FocraDB']
			db['crawler'].update({"_id": self.cname}, {"$set":{'crawled_pages': self.crawled_pages,
																'time_executed': time.time()-self.start_time}})
			print self.cname + " - Parsing items"
			body = BeautifulSoup(response.body)
			
			for tag in body.find_all('a', href=True):
				if 'http' not in tag['href']:
					tag['href'] = urljoin(self.base_url[0], tag['href'])
			for tag in body.find_all('img', src=True):
				if 'http' not in tag['src']:
					tag['src'] = urljoin(self.base_url[0], tag['src'])
			for t in body.find_all('tbody'):
				t.unwrap()
			
			response = response.replace(body=body.prettify(encoding='ascii'))
			
			dynamicItemLoader = ItemLoader(item=self.item, response=response)

			if self.parentname is not None:
				self.item.clear()
				self.item.fields['request_url'] = Field()
				dynamicItemLoader.add_value("request_url", response.url)

			'''
			new codes
			'''
			r = None
			d = {}
			for k, v in self.template.iteritems():
				d[k] = v.split('/')

			lca = None
			if self.lcam:
				lca = self.lcam
			else:
				lca = self.longest_common_ancestor(d)
				self.lcam = lca
				print lca
			
			if lca:
				r = response.xpath(lca).extract()				
				if r:
					if len(r) <= 1:
						for key, value in self.template.iteritems():
							self.item.fields[key] = Field()
							dynamicItemLoader.add_xpath(key, value)
					else:
						for i in range(len(r)):	
							# data region
							#print r[i].encode('ascii', 'ignore')
							sel = Selector(text=r[i])
							
							for key, value in self.template.iteritems():
								
								self.item.fields[key] = Field()
								
								#print self.get_xpath_tail(lca, value)
								
								x = sel.xpath(self.get_xpath_tail(lca, value)).extract()
								
								x = ''.join(x)
								if x.startswith('<a') or x.startswith('<img'):
									dynamicItemLoader.add_value(key, x)
								else:
									sb = ""
									for string in BeautifulSoup(x).stripped_strings:
										sb += "\n" + string
									dynamicItemLoader.add_value(key, sb)
								
			else:
				for key, value in self.template.iteritems():
					#print value
					self.item.fields[key] = Field()
					dynamicItemLoader.add_xpath(key, value)
			
			print "yielded dynamic loader"
			yield dynamicItemLoader.load_item()
			
			# after scraping the page, check status to see whether we should stop
			self.status = db['crawler'].find_one({"_id":self.cname}).get('crawlerStatus')
			if self.status == 'stopped' or self.status == 'paused':
				raise CloseSpider('stopped')
			
			# check for pagination
			if self.pager != 'null':
				next_link = None
				# if the pager is in html format
				if bool(BeautifulSoup(self.pager, "html.parser").find()):
					# remove the \r for 'end of line' diff
					self.pager = self.pager.replace('\r', '')
					a_tags = response.xpath('//a').extract()
					for tag in a_tags:
						if self.pager in tag:
							tag = BeautifulSoup(tag)
							next_link = tag.a.get('href')
							break
				# if the pager is in text format
				else:
					if response.xpath('//a[text()[normalize-space()="'+ self.pager +'"]]/@href').extract():
						next_link = response.xpath('//a[text()[normalize-space()="'+ self.pager +'"]]/@href').extract()[0]
					
				
				if next_link:
					self.next_page_link = next_link
					print self.cname + ' - Next page is: ' + self.next_page_link
					print "yielded request top"
					yield Request(self.next_page_link, callback=self.parse, dont_filter=True)
					
				else:
					# chained crawler WITH pagination
					# check for more links from parent column
					if not self.queue.empty():
						k = self.queue.get()
						print "yielded request middle ---"+k
						yield Request(k, callback=self.parse, dont_filter=True)
						self.queue_counter += 1
						if self.queue.qsize() <= LINK_NUMBER and self.end_of_data == False:
							self.check_queue()
			else:
				# chained crawler WITHOUT pagination
				# check for more links from parent column
				if not self.queue.empty():
					l = self.queue.get()
					print "yielded request btm ---"+l
					yield Request(l, callback=self.parse, dont_filter=True)
					self.queue_counter += 1
					if self.queue.qsize() <= LINK_NUMBER and self.end_of_data == False:
						self.check_queue()
						
		except Exception as err:
			print err

	def check_queue(self):
		try:
			print self.cname + '- Reload counter ' + str(self.queue_reload_counter)
			print self.cname + '- Queue less than ' + str(LINK_NUMBER) + ', querying for more links'
			db = client[self.crawler_db]
			collection = db[self.parentname]
			cursor = collection.find({}, {self.fieldname: 1}).skip(self.queue_reload_counter).limit(LINK_NUMBER)
			client.close()
			self.queue_reload_counter += LINK_NUMBER
			# cursor count returns the total row
			if cursor.count() <= self.queue_reload_counter:
				print self.cname + '- No more links to load'
				self.end_of_data = True
			# put it into queue
			for link in cursor:
				if link.get(self.fieldname):
					soup = BeautifulSoup(link.get(self.fieldname))
					# uncomment below to see queue links
					#print soup.a['href']
					self.queue.put(soup.a['href'])
		except Exception as err:
			print err	
	
	'''
	find the lowest common ancestor
	'''
	def longest_common_ancestor(self, d):
		
		if len(d) < 1:
			return None
		
		p = None
		for l in d.values():
			if p is None or len(l) < p:
				p = len(l)
	
		diff_index = None
		
		for i in range(p):
			check = None
			for v in d.itervalues():
				if check is None or check == v[i]:
					check = v[i]
				elif check != v[i]:
					diff_index = i
					break
			if diff_index:
				break
					
		if diff_index:
			# return None if root node is '/body' which is 2
			# return None if root node is '/html' which is 1
			# return None if root node is '/'  which is 0
			if diff_index < 3:
				return None
			sb = ""
			for i in range(diff_index):
				if i != 0:	
					sb += "/" + d.values()[0][i]
			return sb
		
		return None
	
	def get_xpath_tail(self, lca, value):
		last = lca.split("/")
		return '//' + re.sub('[^A-Za-z]+', '', last[len(last)-1]) + value.replace(lca, "", 1)
Example #34
0
def welcomemsg(html):
    query = urlparse.parse_qs(html.geturl())
    messg = HTMLParser().unescape(query['msg'][0])
    messg = messg.replace('<b>', '').replace('WELCOME ', '')
    return messg.split('</b>')
Example #35
0
def toASCII(s):
  s2 = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
  s2 = HTMLParser().unescape(s2)
  s2 = s2.replace(u'\xa0', u' ')
  return s2
Example #36
0
def populate_restaurants(c):
    print 'Populating Restaurants table...'

    if not (os.access('restaurants', os.R_OK)
            and os.path.isdir('restaurants')):
        print >> sys.stderr, "Error: cannot access raw data directory 'restaurants'"
        sys.exit(1)

    if not (os.access('suburbs.txt', os.R_OK)
            and os.path.isfile('suburbs.txt')):
        print >> sys.stderr, "Error: cannot access raw data file 'suburbs.txt'"
        sys.exit(1)

    #get postcodes from file and cache in dict
    suburbs = open('suburbs.txt').readlines()
    postcodes = {}
    for suburb in suburbs:
        lat, lng, pst, sub = suburb.strip().split('\t')
        postcodes[sub] = pst
    postcodes['CBD'] = 2000  #special case not in data file

    users = c.execute('SELECT username FROM Users').fetchall()
    num_users = c.execute('SELECT COUNT(*) FROM Users').fetchone()[0]

    i = 0
    for restaurant in glob.glob('restaurants/*'):
        r = open(restaurant).readlines()

        #extract info from file
        try:
            name = r[0].strip()
            name = HTMLParser().unescape(name)
            address = r[1].strip()
            address = HTMLParser().unescape(address)
            address = re.sub(r'nsw', 'NSW', address, flags=re.I)
            if not address.endswith(', NSW'):
                address = address + ', NSW'
            suburb = re.match(r'.*, (.+), Sydney', r[1]).group(1)
            suburb = HTMLParser().unescape(suburb)
            phone = r[2].strip().replace('(', '').replace(')', '')
            if re.match('Not available', phone):
                phone = 'Not provided'
            hours = r[3].strip()
            hours = re.sub(r'\s*,\s*', ', ', hours)
            hours = HTMLParser().unescape(hours)
            cuisine = r[4].strip()
            cuisine = HTMLParser().unescape(cuisine)
            cost = r[5].strip()
            image = r[6].strip()
        except:
            print >> sys.stderr, "Error: skipping '%s'" % restaurant
            continue

        #lookup postcode using suburb
        postcode = ''
        if not suburb in postcodes:
            continue
        else:
            postcode = postcodes[suburb]

        #and append it to the address
        address = address + ' ' + str(postcode)

        #choose a random protocol for the website
        protocol = 'http://'
        if random.randint(0, 1) == 1:
            protocol = 'https://'

        #make site of the form protocol://www.lowercase.name.of.restaurant.fake.com
        website = name.replace('  ', ' ').replace(' ', '.').replace(
            '-', '').strip() + '.fake.com'
        website = HTMLParser().unescape(website)
        website = urllib.quote(website)  #encode as url
        website = protocol + 'www.' + website  #avoid encoding the protocol
        website = website.lower().replace('..', '.')

        #ensure only some restaurants have owners
        owner = None
        if random.randint(0, 3) == 0:
            owner = users[random.randint(0, num_users - 1)][0]

        i += 1
        data = (i, name, suburb, address, postcode, phone, hours, cuisine,
                owner, website, cost, image)
        c.execute(
            '''INSERT INTO Restaurants
				(id, name, suburb, address, postcode, phone, hours, cuisine, owner, website, cost, image)
				VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', data)
Example #37
0
def welcomemsg(html):
	query = urlparse.parse_qs(html.geturl())
	messg = HTMLParser().unescape(query['msg'][0])
	messg = messg.replace('<b>', '').replace('WELCOME ', '')
	return messg.split('</b>')
Example #38
 def fill_details_from_wiki(self, url):
     code = ""
     try:
         u = urlopen(url)
     except:
         print("AddonManager: Debug: unable to open URL", url)
         return
     p = u.read()
     if sys.version_info.major >= 3 and isinstance(p, bytes):
         p = p.decode('utf-8')
     u.close()
     # check if the macro page has its code hosted elsewhere, download if needed
     if "rawcodeurl" in p:
         rawcodeurl = re.findall("rawcodeurl.*?href=\"(http.*?)\">", p)
         if rawcodeurl:
             rawcodeurl = rawcodeurl[0]
             try:
                 u2 = urlopen(rawcodeurl)
             except:
                 print("AddonManager: Debug: unable to open URL",
                       rawcodeurl)
                 return
             # code = u2.read()
             # github is slow to respond... We need to use this trick below
             response = ""
             block = 8192
             #expected = int(u2.headers['content-length'])
             while 1:
                 #print("expected:",expected,"got:",len(response))
                 data = u2.read(block)
                 if not data:
                     break
                 if sys.version_info.major >= 3 and isinstance(data, bytes):
                     data = data.decode('utf-8')
                 response += data
             if response:
                 code = response
             u2.close()
     if not code:
         code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--'))
         if code:
             # code = code[0]
             # take the biggest code block
             code = sorted(code, key=len)[-1]
             code = code.replace('--endl--', '\n')
         else:
             FreeCAD.Console.PrintWarning(
                 translate("AddonsInstaller",
                           "Unable to fetch the code of this macro."))
         # Clean HTML escape codes.
         try:
             from HTMLParser import HTMLParser
         except ImportError:
             from html.parser import HTMLParser
         if sys.version_info.major < 3:
             code = code.decode('utf8')
         try:
             code = HTMLParser().unescape(code)
             code = code.replace(b'\xc2\xa0'.decode("utf-8"), ' ')
         except:
             FreeCAD.Console.PrintWarning(
                 translate("AddonsInstaller", "Unable to clean macro code")
                 + ": " + code + '\n')
         if sys.version_info.major < 3:
             code = code.encode('utf8')
     desc = re.findall(
         "<td class=\"ctEven left macro-description\">(.*?)<\/td>",
         p.replace('\n', ' '))
     if desc:
         desc = desc[0]
     else:
         FreeCAD.Console.PrintWarning(
             translate("AddonsInstaller",
                       "Unable to retrieve a description for this macro."))
         desc = "No description available"
     self.desc = desc
     self.url = url
     self.code = code
     self.parsed = True
Example #39
0
 def fill_details_from_wiki(self, url):
     code = ""
     try:
         u = urlopen(url)
     except urllib2.HTTPError:
         return
     p = u.read()
     if sys.version_info.major >= 3 and isinstance(p, bytes):
         p = p.decode('utf-8')
     u.close()
     # check if the macro page has its code hosted elsewhere, download if needed
     if "rawcodeurl" in p:
         rawcodeurl = re.findall("rawcodeurl.*?href=\"(http.*?)\">",p)
         if rawcodeurl:
             rawcodeurl = rawcodeurl[0]
             try:
                 u2 = urlopen(rawcodeurl)
             except urllib2.HTTPError:
                 return
             # code = u2.read()
             # github is slow to respond... We need to use this trick below
             response = ""
             block = 8192
             #expected = int(u2.headers['content-length'])
             while 1:
                 #print("expected:",expected,"got:",len(response))
                 data = u2.read(block)
                 if not data:
                     break
                 if sys.version_info.major >= 3 and isinstance(data, bytes):
                     data = data.decode('utf-8')
                 response += data
             if response:
                 code = response
             u2.close()
     if not code:
         code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--'))
         if code:
             # code = code[0]
             # take the biggest code block
             code = sorted(code, key=len)[-1]
             code = code.replace('--endl--', '\n')
         else:
             FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to fetch the code of this macro."))
         # Clean HTML escape codes.
         try:
             from HTMLParser import HTMLParser
         except ImportError:
             from html.parser import HTMLParser
         if sys.version_info.major < 3:
             code = code.decode('utf8')
         try:
             code = HTMLParser().unescape(code)
             code = code.replace(b'\xc2\xa0'.decode("utf-8"), ' ')
         except:
             FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ") + code + '\n')
         if sys.version_info.major < 3:
             code = code.encode('utf8')
     desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>", p.replace('\n', ' '))
     if desc:
         desc = desc[0]
     else:
         FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
         desc = "No description available"
     self.desc = desc
     self.url = url
     self.code = code
     self.parsed = True