Example #1
 def get(self, *args):
     query = base.collapse(urllib.unquote(args[1]))
     if not query:
         return self.ok("Please provide two Wikipedia article titles.")
     query = StringIO.StringIO(query)
     try:
         query_tokens = csv.reader(query, delimiter=" ").next()
     except:
         return self.ok("Please use proper quoting for arguments.")
     try:
         from_name = query_tokens[0] or ""
     except:
         from_name = ""
     if not from_name:
         return self.ok("Please name a starting Wikipedia article title.")
     try:
         to_name = query_tokens[1] or ""
     except:
         to_name = ""
     if not to_name:
         return self.ok("Please name an ending Wikipedia article title.")
     query = urllib.urlencode({"from": from_name, "to": to_name})
     uri = API_URI + "?" + query
     try:
         html = api.urlfetch.fetch(uri).content
         html = unescape.unescape(html.decode("latin1"))
         tree = BeautifulSoup.BeautifulSoup(html)
     except Exception, error:
         return self.ok("Timeout fetching Wikipedia distance.")
Example #2
 def get(self, *args):
     query = base.collapse(urllib.unquote(args[1]))
     query = urllib.urlencode({"key": query, "type": "Books", "page":"1"})
     uri = API_URI + "?" + query
     try:
         html = api.urlfetch.fetch(uri).content
         html = unescape.unescape(html.decode("latin1"))
         tree = BeautifulSoup.BeautifulSoup(html)
     except Exception, error:
         return self.ok("Timeout fetching ISBN information.")
Example #3
 def get(self, *args):
     word = args[1]
     if not word:
         self.ok("Please provide a word.")
     word = urllib.unquote(word)
     payload = urllib.urlencode({"q": word})
     try:
         headers = {"Content-Type": "application/x-www-form-urlencoded"}
         html = api.urlfetch.fetch(API_URI,
                                   method=api.urlfetch.POST,
                                   payload=payload,
                                   headers=headers).content
     except Exception:
         return self.ok("Error fetching results.")
     tree = BeautifulSoup.BeautifulSoup(html)
     try:
         message = base.collapse(tree.find("blockquote").string)
         message = unescape.unescape(message)
     except:
         return self.ok("Error parsing results.")
     return self.ok(message)
Example #4
    def _repopulate(self):
        print "Regenerating DOM"
        self.dom = xml.dom.minidom.parseString(urllib2.urlopen(self.url).read())

        print "Repopulating items array"
        self.index = 0
        # this monster achieves:
        #   - slice the first description node off (it's a channel desc)
        #   - crop each string to a max of 300 chars
        #   - URI decode text
        #   - unescape HTML entities
        self.items = map(lambda x: unescape(urllib2.unquote(x.firstChild.data))[0:300],
                         self.dom.getElementsByTagName("description")[1:])

        # No unicode support in puredata (at least not via pyext; it claims it can't convert)
        print "Filtering unicode characters..."
        def maybe_delete(c):
            try:    return str(c)
            except: return " "
        self.items = map(lambda x: ''.join(x), map(lambda y: map(maybe_delete, y), self.items))
        
        print "Items now has %s entries" % len(self.items)
Example #5
def get_all_text(s):
    t = s.findAll(text=True)
    t = unescape(' '.join(t))
    t = ' '.join(t.split())
    return t
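
A minimal usage sketch (assumed context, not from the original): get_all_text expects a BeautifulSoup element; findAll(text=True) collects its text nodes, unescape decodes any HTML entities, and the final split/join collapses whitespace.

# Hypothetical call, assuming BeautifulSoup and the unescape helper are
# imported as in the earlier examples.
soup = BeautifulSoup.BeautifulSoup("<p>fish &amp; chips and <b>peas</b></p>")
print(get_all_text(soup))  # expected: "fish & chips and peas"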
Example #6
#python3

import lxml.html

filename = 'petrol.html'
with open(filename, 'r') as f:
    text = f.read()

html = lxml.html.fromstring(text)

newtext = lxml.html.tostring(html)
newfile = 'petrol2.html'
with open(newfile, 'wb') as f:
    f.write(newtext)
#https://stackoverflow.com/questions/9487133/python-convert-html-ascii-encoded-text-to-utf8

import unescape
new2 = unescape.unescape(text)

newfile2 = 'petrol_unescape.html'
with open(newfile2, 'wt') as f:
    f.write(new2)

#http://effbot.org/zone/re-sub.htm#unescape-html
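
For comparison (an addition, not in the original), Python 3.4+ ships the same entity decoding in the standard library as html.unescape, so the local unescape module can be swapped out; the output filename below is made up for the sketch.

# Standard-library alternative to the unescape module (Python 3.4+).
import html

with open('petrol.html', 'r') as f:
    text = f.read()

with open('petrol_unescape_stdlib.html', 'wt') as f:
    f.write(html.unescape(text))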
Example #7
def clean(text, hasDebugFlag=False):
    """
    Transforms wiki markup.
    @see https://www.mediawiki.org/wiki/Help:Formatting
    """
    # Drop transclusions (template, parser functions)
    text = dropNested(text, r'{{', r'}}')

    # Drop tables
    text = dropNested(text, r'{\|', r'\|}')

    # Remove any found signatures and timestamps
    text = removeSignature(text)

    # replace external links
    text = replaceExternalLinks(text)

    # replace internal links
    text = replaceInternalLinks(text)

    # drop MagicWords behavioral switches
    text = magicWordsRE.sub('', text)

    ################ Process HTML ###############

    # turn into HTML, except for the content of <syntaxhighlight>
    res = ''
    cur = 0
    for m in syntaxhighlight.finditer(text):
        end = m.end()
        res += unescape(text[cur:m.start()]) + m.group(1)
        cur = end
    text = res + unescape(text[cur:])

    # Handle bold/italic/quote
    text = bold_italic.sub(r'\1', text)
    text = bold.sub(r'\1', text)
    text = italic_quote.sub(r'"\1"', text)
    text = italic.sub(r'"\1"', text)
    text = quote_quote.sub(r'"\1"', text)
    # residuals of unbalanced quotes
    text = text.replace("'''", '').replace("''", '"')

    # Collect spans
    spans = []
    # Drop HTML comments
    for m in comment.finditer(text):
        spans.append((m.start(), m.end()))

    # Drop self-closing tags
    for pattern in selfClosing_tag_patterns:
        for m in pattern.finditer(text):
            spans.append((m.start(), m.end()))

    # Drop ignored tags
    for left, right in ignored_tag_patterns:
        for m in left.finditer(text):
            spans.append((m.start(), m.end()))
        for m in right.finditer(text):
            spans.append((m.start(), m.end()))

    # Bulk remove all spans
    text = dropSpans(spans, text)

    # Drop discarded elements
    for tag in discardElements:
        text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)

    # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
    text = unescape(text)

    # Expand placeholders
    for pattern, placeholder in placeholder_tag_patterns:
        index = 1
        for match in pattern.finditer(text):
            text = text.replace(match.group(), '%s_%d' % (placeholder, index))
            index += 1

    text = text.replace('<<', '«').replace('>>', '»')

    #############################################

    # Cleanup text
    text = text.replace('\t', ' ')
    text = re.sub(' (,:\.\)\]»)', r'\1', text)
    text = re.sub('(\[\(«) ', r'\1', text)
    text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations

    # Remove lists, tables and such
    text = compact(text)

    # Remove symbols and reduce multiple successive spaces to one
    text = removeSymbols(text)
    text = spaces.sub(' ', text)

    if hasDebugFlag:
        print(text)
    return text