def export_news(offset, limit, force, export_path):
    for (dirpath, dirnames, filenames) in os.walk("to_import"):
        for filename in filenames:
            with open(os.sep.join((dirpath, filename))) as opened_file:
                parser = BeautifulSoup(opened_file, 'xml')
                if not os.path.exists(export_path):
                    os.makedirs(export_path)
                rows = parser.find_all("content")
                for row in rows:
                    title = row.find("title").text
                    url = row.find("url").text.replace("/administrator", "")
                    html_parser = HTMLParser()
                    url = html_parser.unescape(html_parser.unescape(url))
                    pub_date = row.find("publish_up").text
                    mod_date = row.find("modified").text
                    featured = bool(int(row.find("featured").text))
                    if mod_date == "0000-00-00 00:00:00":
                        mod_date = pub_date
                    res = prepare_dict(url)
                    if not res:
                        logger.warning("error for url %s" % url)
                        continue
                    res["title"] = title
                    res["id"] = normalize(title, max_length=200)
                    res["category"] = row.find("catid").text
                    res["pub_date"] = pub_date
                    res["mod_date"] = mod_date
                    res["featured"] = featured
                    res["hits"] = row.find("hits").text
                    save_json(export_path, res)

        break
Example #2
    def getSets(self):
        """
        set names are always mined from the same, trusted source: magiccards.info
        @return:
        """
        os.environ['http_proxy'] = ''
        proxies = {
            #        "http": "http://*****:*****@3.187.59.236:9400",
            # "https": "http://*****:*****@3.187.59.236:9400",
        }
        results = requests.get("http://magiccards.info/search.html", proxies=proxies,
                               timeout=10)
        # print results.text
        setstag = re.search(r'<label\s+for="edition">.*?<option value=""></option>(.*?)</select>', results.text, re.DOTALL)

        setsraw = setstag.group(1).strip()
        # print setsraw
        setnames = []
        hp = HTMLParser()
        print setsraw
        for st in striplist(setsraw.split(r'option>')):
            print 'stmax', st
            rawElem = re.search(r'value="(.*)">(.*?)<', st)
            if rawElem:
                setkey = hp.unescape(rawElem.group(1))
                setname = hp.unescape(rawElem.group(2))
                print setkey, setname
                setnames.append(setname)
        setnames.remove('All Sets')
        self.setnames = setnames
Example #3
def get_user_realname(user):
    from ckanext.dgu.drupalclient import DrupalClient
    from HTMLParser import HTMLParser

    if user.name.startswith('user_d'):
        user_id = user.name[len('user_d'):]

        html_parser = HTMLParser()

        try:
            dc = DrupalClient()
            properties = dc.get_user_properties(user_id)
        except Exception, ex:
            return user.fullname

        try:
            first_name = properties['field_first_name']['und'][0]['safe_value']
            first_name = html_parser.unescape(first_name)
        except:
            first_name = ''

        try:
            surname = properties['field_surname']['und'][0]['safe_value']
            surname = html_parser.unescape(surname)
        except:
            surname = ''
Example #4
 def iter_movies(self, pattern):
     res = self.readurl("http://www.imdb.com/xml/find?json=1&nr=1&tt=on&q=%s" % pattern.encode("utf-8"))
     jres = json.loads(res)
     htmlparser = HTMLParser()
     for cat in ["title_popular", "title_exact", "title_approx"]:
         if cat in jres:
             for m in jres[cat]:
                 tdesc = unicode(m["title_description"])
                 if "<a" in tdesc and ">" in tdesc:
                     short_description = u"%s %s" % (
                         tdesc.split("<")[0].strip(", "),
                         tdesc.split(">")[1].split("<")[0],
                     )
                 else:
                     short_description = tdesc.strip(", ")
                 movie = Movie(m["id"], htmlparser.unescape(m["title"]))
                 movie.other_titles = NotLoaded
                 movie.release_date = NotLoaded
                 movie.duration = NotLoaded
                 movie.short_description = htmlparser.unescape(short_description)
                 movie.pitch = NotLoaded
                 movie.country = NotLoaded
                 movie.note = NotLoaded
                 movie.roles = NotLoaded
                 movie.all_release_dates = NotLoaded
                 movie.thumbnail_url = NotLoaded
                 yield movie
Example #5
 def logger(self, message, msg_type, operation_name, counter, msg_id):  # method that logs SOAP packets
     try:
         try:
             if msg_type == 'RESPONSE' and operation_name != 'GetStats' and operation_name != 'GetEvents':
                 pars = HTMLParser()
                 message = pars.unescape(message)
                 message = unicode(message, 'utf-8')
             if msg_type == 'RESPONSE' and operation_name == 'GetEvents':
                 message = unicode(message, 'utf-8')
             else:
                 message = unicode(message, 'utf-8')
         except:
             pars = HTMLParser()
             message = pars.unescape(message)
         if not os.path.exists(self.logs_directory):
             os.makedirs(self.logs_directory)
         log = codecs.open(os.path.join(self.logs_directory, msg_id + "_" + self.appdate + "_" + operation_name +
                           "_OperationLog.xml"), 'a', encoding='utf8')
         log.write("--------------Start of " + msg_type + " - Transaction Number is " + str(counter) +
                   "--------------------\n" + message)
         log.write("\n--------------End of " + msg_type + "- Transaction Number is " + str(counter) +
                   "--------------------\n")
         log.close()
         return "Success creating LOG " + operation_name + " - " + msg_type + ": "
     except Exception:
         return "Failure creating LOG " + operation_name + " - " + msg_type + ": " + '\n' + traceback.format_exc()
Example #6
def getImageLocation(comicRequest):

    titleString = 'id="ctitle">'
    captionString = 'title="'
    imageString = '//imgs.xkcd.com/comics/'

    response = urllib2.urlopen(parseComicRequest(comicRequest))
    html = response.read()

    titleStart = html.find(titleString) + len(titleString)
    titleEnd = html[titleStart:].find('<') + titleStart
    title = html[titleStart:titleEnd]

    imageAddressStart = html.find(imageString)
    imageAddressEnd = html[imageAddressStart:].find('"') + imageAddressStart
    imageAddress = html[imageAddressStart:imageAddressEnd]

    captionStart = (
        html[imageAddressEnd:].find(captionString) + imageAddressEnd +
        len(captionString)
    )
    captionEnd = html[captionStart:].find('"') + captionStart
    caption = html[captionStart:captionEnd]

    parser = HTMLParser()
    caption = parser.unescape(caption)
    title = parser.unescape(title)

    return '*' + title + "*\nhttp:" + str(imageAddress) + '\n' + caption
Example #7
    def getSets(self):
        """
        @return: Set names found
        """
        results = requests.get("http://magiccards.info/search.html", proxies=self.proxies,
                               timeout=10)
        # print results.text
        setstag = re.search(r'<label\s+for="edition">.*?<option value=""></option>(.*?)</select>', results.text,
                            re.DOTALL)

        setsraw = setstag.group(1).strip()
        setnames = []
        hp = HTMLParser()
        print setsraw
        for st in striplist(setsraw.split(r'option>')):
            print 'stmax', st
            rawElem = re.search(r'value="(.*)">(.*?)<', st)
            if rawElem:
                setkey = hp.unescape(rawElem.group(1))
                setname = hp.unescape(rawElem.group(2))
                if not self.db.session.query(Edition).filter(Edition.name == setname).count():
                    ed = Edition(setname)
                    ed.sexps.append(EditionSexp(setkey, 'http://magiccards.info'))
                    ed.save(self.db.session)
                    print setkey, setname
                    setnames.append(setname)
        try:
            self.db.session.commit()
        except IntegrityError as ie:
            print ie.message
            logger.severe('', ie)

        return setnames
Example #8
def main():
    html_parser = HTMLParser()

    soup = BeautifulSoup(urlopen("http://www.amazon.com/gp/bestsellers/").read())

    categories = []

    # Scrape list of category names and urls
    for category_li in soup.find(attrs={'id':'zg_browseRoot'}).find('ul').findAll('li'):
        category = {}
        category['name'] = html_parser.unescape(category_li.a.string)
        category['url'] = category_li.a['href']

        categories.append(category)

    del soup

    # Loop through categories and print out each product's name, rank, and url.
    for category in categories:
        print category['name']
        print '-'*50

        soup = BeautifulSoup(urlopen(category['url']))

        i = 1
        for title_div in soup.findAll(attrs={'class':'zg_title'}):
            if i == 1:
                print "%d. %s\n    %s" % (i, html_parser.unescape(title_div.a.string), title_div.a['href'].strip())
            i += 1

        print ''
Example #9
class IssuesScraper(Scraper):
    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/issues/feed/"
        self.html = HTMLParser()
        self.issue_provider = IssueProvider()

    def collect_urls(self):
        records = []
        items = self.get(self.url).findAll("item")
        for item in items:
            record = {
                "title": self.html.unescape(item.title.text),
                "timestamp_publish": parser.parse(item.pubdate.text),
                "site": "berniesanders.com",
                "lang": "en",
                "description_html": item.description.text,
                "description": self.html.unescape(BeautifulSoup(item.description.text).p.text),
                "url": item.link.nextSibling,
            }
            records.append(record)
        return records

    def retrieve(self, record):

        soup = self.get(record["url"])

        # retrieve image from <meta property="og:image" content="https://berniesanders.com/wp-content/uploads/2015/07/072615_Bernie_NewOrleans-4382.jpg"/>
        meta_image = soup.findAll(attrs={"property": "og:image"})
        record["image_url"] = meta_image[0]["content"].encode("utf8")

        # reset soup to content
        soup = self.sanitize_soup(soup.find("section", {"id": "content"}))
        while soup.article.style is not None:
            soup.article.style.extract()
        record["body_html"] = str(soup.article)
        text = []
        for elem in soup.article.recursiveChildGenerator():
            if isinstance(elem, types.StringTypes):
                text.append(self.html.unescape(elem.strip()))
            elif elem.name == "br":
                text.append("")
        record["body"] = "\n".join(text)

        return record

    def go(self):
        urls = self.collect_urls()
        if not urls:
            logging.critical("Could not retrieve issues.")
            sys.exit(1)
        for url in urls:
            record = self.retrieve(url)
            if self.issue_provider.exists_by_url(record["url"]):
                print "found"
            else:
                msg = "Inserting record for '{0}'."
                logging.info(msg.format(record["title"].encode("utf8")))
                record["timestamp_creation"] = datetime.now()
                self.issue_provider.create(record)
Example #10
 def iter_movies(self, pattern):
     res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&tt=on&q=%s' % pattern.encode('utf-8'))
     jres = json.loads(res)
     htmlparser = HTMLParser()
     for cat in ['title_popular', 'title_exact', 'title_approx']:
         if cat in jres:
             for m in jres[cat]:
                 tdesc = unicode(m['title_description'])
                 if '<a' in tdesc and '>' in tdesc:
                     short_description = u'%s %s' % (tdesc.split('<')[
                                                     0].strip(', '), tdesc.split('>')[1].split('<')[0])
                 else:
                     short_description = tdesc.strip(', ')
                 movie = Movie(m['id'], htmlparser.unescape(m['title']))
                 movie.other_titles = NotLoaded
                 movie.release_date = NotLoaded
                 movie.duration = NotLoaded
                 movie.short_description = htmlparser.unescape(short_description)
                 movie.pitch = NotLoaded
                 movie.country = NotLoaded
                 movie.note = NotLoaded
                 movie.roles = NotLoaded
                 movie.all_release_dates = NotLoaded
                 movie.thumbnail_url = NotLoaded
                 yield movie
Example #11
 def original_unescape(self, s):
     """Since we need to use this sometimes"""
     if isinstance(s, basestring):
         return unicode(HTMLParser.unescape(self, s))
     elif isinstance(s, list):
         return [unicode(HTMLParser.unescape(self, item)) for item in s]
     else:
         return s
Example #12
class IssuesScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/issues/feed/"
        self.html = HTMLParser()

    def collect_urls(self):
        recs = []
        items = self.get(self.url).findAll("item")
        for item in items:
            rec = {
                "inserted_at": datetime.now(),
                "title": self.html.unescape(item.title.text),
                "created_at": parser.parse(item.pubdate.text),
                "site": "berniesanders.com",
                "lang": "en",
                "article_type": "Issues",
                "description_html": item.description.text,
                "description": self.html.unescape(
                    BeautifulSoup(item.description.text).p.text),
                "url": item.link.nextSibling
            }
            recs.append(rec)
        return recs

    def retrieve(self, rec):
        soup = self.get(rec["url"]).find("section", {"id": "content"})
        while soup.article.style is not None:
            soup.article.style.extract()
        rec["body_html"] = str(soup.article)
        text = []
        for elem in soup.article.recursiveChildGenerator():
            if isinstance(elem, types.StringTypes):
                text.append(self.html.unescape(elem.strip()))
            elif elem.name == 'br':
                text.append("")
        rec["body"] = "\n".join(text)
        return rec

    def go(self):
        urls = self.collect_urls()
        if not urls:
            logging.critical("Could not retrieve issues.")
            sys.exit(1)
        for url in urls:
            rec = self.retrieve(url)
            query = {
                "title": rec["title"],
                "article_type": rec["article_type"]
            }
            if not self.db.articles.find(query).limit(1).count():
                msg = "Inserting '{0}', created {1}"
                logging.info(msg.format(
                    rec["title"].encode("utf8"),
                    str(rec["created_at"])
                ))
                self.db.articles.insert_one(rec)
    def getStatus(self):
        data = self.queryWebInterface()

        if data is None:
            print "Connection error of some sort"
            return None

        isplaying = int(data["isPlaying"])
        ispaused = int(data["isPaused"])
        playback_mode = int(data["playbackOrder"])

        if isplaying or ispaused:
            current_song_id = data["playingItem"]
        else:
            #Currently stopped so try and use either the last playing song or the currently focused item
            if len(data["prevplayedItem"]) > 0:
                current_song_id = data["prevplayedItem"]
            else:
                current_song_id = data["focusedItem"]

        if current_song_id != "?":
            current_song_id = int(current_song_id)

        #Deriving the page ourselves because playlistPage is just whatever page is currently visible, not the page
        #that our song is actually on.
        if (data["playlistActive"] == data["playlistPlaying"]) or data["playingItem"] == "?" and current_song_id != "?":
            current_page = (current_song_id/int(data["playlistItemsPerPage"])) + 1
            cur_position_on_page = current_song_id - (current_page-1) * int(data["playlistItemsPerPage"])
            current_song_name = data["playlist"][cur_position_on_page]["t"]
            current_artist = data["playlist"][cur_position_on_page]["a"]
            try:
                next_song_in_playlist = data["playlist"][cur_position_on_page+1]["t"] + " - " + data["playlist"][cur_position_on_page+1]["a"]
            except:
                next_song_in_playlist = None
        else:
            if len(data["helper1"]) > 0:
                #Not on the correct playlist page, fall back to the less reliable helper1/helper2 fields
                current_song_name = re.match("^(.*) - $", data["helper1"]).group(1)
                current_artist = re.search("(.*) - %s" % re.escape(current_song_name), data["helper2"]).group(1)
                next_song_in_playlist = None
            else:
                return None

        return_data = {}
        return_data["isplaying"] = isplaying
        return_data["ispaused"] = ispaused
        return_data["playback_mode"] = playback_mode
        #Encountered a problem with the ajquery template returning HTML escape sequences in song/artist names
        #Hopefully this fixes it
        h = HTMLParser()
        return_data["song_name"] = unicode(h.unescape(current_song_name)).encode("utf8")
        return_data["artist_name"] = unicode(h.unescape(current_artist)).encode("utf8")
        return_data["next_song_in_playlist"] = next_song_in_playlist
        return return_data
Example #14
def escapeit(sval, EXTRAS=None):
    global _h
    _h = HTMLParser()
    # note, xmlescape and unescape do not work with utf-8 bytestrings
    # so pre-convert to full unicode and then convert back since our result xml is utf-8 encoded
    uval = sval.decode('utf-8')
    if EXTRAS:
        ures = xmlescape(_h.unescape(uval), EXTRAS)
    else:
        ures = xmlescape(_h.unescape(uval))
    return ures.encode('utf-8')
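
A hypothetical worked example for escapeit above, assuming xmlescape is xml.sax.saxutils.escape (its import is not shown in the snippet): HTML entities in the UTF-8 input are resolved first, and only XML-significant characters are re-escaped on the way out.

# Hypothetical trace (not from the original source):
# escapeit('caf&eacute; &amp; bar')
#   after unescape:  u'caf\xe9 & bar'
#   after xmlescape: u'caf\xe9 &amp; bar'
#   returned as the UTF-8 byte string 'caf\xc3\xa9 &amp; bar'
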
class ArticlesScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/daily/"
        self.html = HTMLParser()

    def retrieve_article(self, url):
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, r.url
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                content = soup.article
                paragraphs = [self.html.unescape(replace_with_newlines(p))
                              for p in content.findAll("p")]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html
        return False, False

    def go(self):
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            rec = {
                "inserted_at": datetime.now(),
                "created_at": parser.parse(article.time["datetime"]),
                "source": "berniesanders.com",
                "type": "DemocracyDaily",
                "excerpt_html": str(article.find(
                    "div", {"class": "excerpt"}).p),
                "excerpt": self.html.unescape(
                    article.find(
                        "div", {"class": "excerpt"}).p.text),
                "title": article.h2.text,
                "article_category": article.h1.string.strip(),
                "url": article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]
            query = {"title": rec["title"], "type": "DemocracyDaily"}
            if not self.db.articles.find(query).limit(1).count():
                text, html = self.retrieve_article(rec["url"])
                rec["body"], rec["body_html"] = text, html
                msg = "Inserting '{0}', created {1}"
                logging.info(msg.format(
                    rec["title"].encode("utf8"),
                    str(rec["created_at"])
                ))
                self.db.articles.insert_one(rec)
    def isHTMLEntity(self, word):
        if (len(word) == 1):
            return False, word

        if (word[0] == '&'):
            h = HTMLParser()
            if (h.unescape(word) != '' and self.dict.check(h.unescape(word)) == True):
                word = unicode.encode(h.unescape(word), 'utf-8')
                return True, word
            else:
                return False, word
        else:
            return False, word
 def fixEntities(self, metadata, names):
     # fix the escaped entities or we end up with things like:
     # '&amp;' in the title
     htmlparser = HTMLParser()
     for name in names:
         value = metadata.get(name)
         if isinstance(value, list):
             l = []
             for v in value:
                 l.append(htmlparser.unescape(v))
             metadata[name] = l
         elif value:
             metadata[name] = htmlparser.unescape(value)
     return metadata
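
A quick illustration of fixEntities above, with a hypothetical metadata dict (obj stands in for whatever object defines the method): plain string values and lists of strings are both unescaped.

# meta = {'title': 'Tom &amp; Jerry', 'tags': ['A &amp; B', 'C &lt; D']}
# obj.fixEntities(meta, ['title', 'tags'])
#   -> {'title': u'Tom & Jerry', 'tags': [u'A & B', u'C < D']}
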
Example #18
def main():
    if len(sys.argv) > 1:
        query = " ".join(sys.argv[1:])
    else:
        # no query on the command line; exit early instead of hitting a NameError below
        print('usage: %s <query>' % sys.argv[0])
        return

    url = u'http://api.wolframalpha.com/v2/query?input={q}&appid={API_KEY}&format=plaintext'.format(API_KEY = wolfram_alpha_key, q = quote(query))

    resp = requests.get(url)

    for pod in re.findall(r'<pod.+?>.+?</pod>', resp.text, re.S):
        title = re.findall(r'<pod.+?title=[\'"](.+?)[\'"].*>', pod, re.S)
        parser = HTMLParser()
        print(Fore.GREEN + parser.unescape("".join(title).strip()) + Fore.RESET)
        for inner in re.findall(r'<plaintext>(.*?)</plaintext>', pod, re.S):
            print(parser.unescape(inner.strip()))
        print('')
Example #19
def parse_summary(string, isReversed=False):
    if string is None: string = ''
    if type(string) == list: string = str(string)
    if not isReversed:
        val = string.split('\n')
        for i, v in enumerate(val):
            val[i] = '%s<br/>' % v
            val[i] = val[i].replace('Step:', '<strong>&emsp;Step:</strong>')
            val[i] = val[i].replace('Checkpoint:',
                                    '<strong>&emsp;Checkpoint:</strong>')
            val[i] = val[i].replace('Verify point:',
                                    '<strong>&emsp;Verify point:</strong>')
            val[i] = val[i].replace('*TC Steps:*',
                                    '<strong>&emsp;*TC Steps:*</strong>')
            val[i] = val[i].replace('*VP:*', '<strong>&emsp;*VP:*</strong>')
        return ''.join(val)
    else:
        ps = HTMLParser()
        val = string.split('<br/>')
        for i, v in enumerate(val):
            val[i] = re.sub(r'\n\t', '', val[i])
            val[i] = remove_tags(val[i])
            val[i] = ps.unescape(val[i])
            val[i] = val[i].encode('ascii', errors='ignore')
        val = filter(None, val)
        return '<br/>'.join(val).strip()
Example #20
def cleaner(dummy, value, *_):
    """Cleans out unsafe HTML tags.

  Uses bleach and unescape until it reaches a fix point.

  Args:
    dummy: unused, sqlalchemy will pass in the model class
    value: html (string) to be cleaned
  Returns:
    Html (string) without unsafe tags.
  """
    # Some cases like Request don't use the title value
    #  and it's nullable, so check for that
    if value is None:
        return value
    if not isinstance(value, basestring):
        # no point in sanitizing non-strings
        return value

    parser = HTMLParser()
    value = unicode(value)
    while True:
        lastvalue = value
        value = parser.unescape(
            bleach.clean(value, BLEACH_TAGS, BLEACH_ATTRS, strip=True))
        if value == lastvalue:
            break
    return value
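
Why cleaner loops to a fix point: one unescape pass is not enough for double-escaped markup. A minimal, self-contained Python 2 illustration using only HTMLParser (separate from the author's bleach-based code):

from HTMLParser import HTMLParser

_p = HTMLParser()
s = u'&amp;lt;script&amp;gt;'       # '<script>' escaped twice
print _p.unescape(s)                # u'&lt;script&gt;'  -- still escaped once
print _p.unescape(_p.unescape(s))   # u'<script>'        -- markup now visible to a sanitizer
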
Example #21
def get_tags(html, tag_name):
    parser = HTMLParser()
    for m in re.findall('<%s(\s+[^>]*)/*>' % tag_name, html, re.IGNORECASE):
        attrs = {}

        for x in re.findall('(?:(%s))' % TAG_ATTRIBUTES_REGEX, m, re.UNICODE):
            if x[1]:
                attrs[x[1]] = parser.unescape(x[2])
            elif x[3]:
                attrs[x[3]] = parser.unescape(x[4])
            elif x[5]:
                attrs[x[5]] = parser.unescape(x[6])
            elif x[7]:
                attrs[x[7]] = parser.unescape(x[7])

        yield attrs
def get_imdb_movie_reviews(id,title,year):
    score_max = 10.0
    link = "http://www.imdb.com/title/tt%0.7d/" % id
    url = web.URL(link)
    dom = web.DOM(url.download(cached=True))
    overall = float(dom.by_class("titlePageSprite star-box-giga-star")[0].content.strip()) / score_max
    # try to get year directly from page; this isn't present in every entry
    try:
        year = dom('span.itemprop[itemprop=name]')[0].next.next.by_tag('a')[0].content
        year = int(year)
    except:
        pass
    rc = dom.by_attr(itemprop="reviewCount")[0].content.split(" ")[0].replace(",","")
    revlink = link + 'reviews?count=%s&start=0' % rc # get at most 20 reviews
    url = web.URL(revlink)
    dom = web.DOM(url.download(cached=True))
    parser = HTMLParser()
    lst = []
    hrs = dom.by_id('tn15main').by_tag('hr')
    for hr in hrs:
        div = hr.next.next
        try:
            score = float(div.by_tag("img")[1].attrs["alt"].split("/")[0]) / score_max
            date = div.by_tag("small")[2].content
        except:
            continue
        user = div.by_tag("a")[1].content
        p = div.next.next
        review = parser.unescape(p.content.replace("<br />","\n"))
        lst.append(dict(critic=user,norm_score=score,quote=review,
                        id=id,title=title,source="IMDB",overall_score=overall,year=year,date=date))
    return lst
Example #23
 def get_jobs(self):
     try:
         jobs_start_time = time.time()
         h = HTMLParser()
         html = h.unescape(self.browser.page_source).encode('utf-8').decode('ascii', 'ignore')
         soup = BeautifulSoup(html, 'html.parser')
         data = soup.findAll('a', id=lambda x: x and x.startswith('popup'))
         counter = 0
         for a in data:
             if a.has_attr('href'):
                 counter = counter + 1
                 #self.DrawSpinner(counter)
                 try:
                     return_code = self.get_job_info(self.browser, self.base_job_url + a['href'].split('?')[1])
                     if return_code == 1:
                         #In case the error pages starts to come
                         jobs_end_time = time.time()
                         print 'All jobs scraping time =', str(jobs_end_time - jobs_start_time)
                         return
                         
                 except Exception:
                     continue
         jobs_end_time = time.time()
         print 'All jobs scraping time =', str(jobs_end_time - jobs_start_time)
     except Exception as e:
         print 'exception= ', str(e)
         #print 'stacktrace= ', traceback.print_exc()
         print 'Line Number= ' + str(sys.exc_traceback.tb_lineno)
Example #24
def cleaner(dummy, value, *_):
  """Cleans out unsafe HTML tags.

  Uses bleach and unescape until it reaches a fix point.

  Args:
    dummy: unused, sqlalchemy will pass in the model class
    value: html (string) to be cleaned
  Returns:
    Html (string) without unsafe tags.
  """
  # Some cases don't use the title value and it's nullable, so check for that
  if value is None:
    return value
  if not isinstance(value, basestring):
    # no point in sanitizing non-strings
    return value

  parser = HTMLParser()
  value = unicode(value)
  while True:
    lastvalue = value
    value = parser.unescape(
        bleach.clean(value, BLEACH_TAGS, BLEACH_ATTRS, strip=True)
    )
    if value == lastvalue:
      break
  return value
def tj_striphtml(value):
	parser = HTMLParser()

	s = MLStripper()
	s.feed(parser.unescape(value))

	return s.get_data()
Example #26
def _process_title(title):
    """处理feed.entries中的title"""
    html_parser = HTMLParser()
    title = html_parser.unescape(title)  # 进行2次HTML反转义
    title = html_parser.unescape(title)
    title = title.replace('\r', '').replace('\n', '')  # 去除换行符
    return remove_html_tag(title)
    def get(self, response, page, api_type, api_value):

        channel = {
            'page': page,
            'page_patten': None,
            'movies': []
        }

        response = json.loads(response)
        movies = response['data']
        if 'total' in response and response['total'] > 24:
            channel['page'] = int(round(response['total']/24))
        channel['page_patten'] = '%s|%s' % (api_type, api_value)

        h = HTMLParser()
        for movie in movies:
            type = self.get_quality(int(movie['quality']))
            label = "[%s] %s" % (type, movie['name'])
            if not movie['is_movie']:
                label = "[%s/%s] %s" % (movie['meta']['max_episode_name'], movie['time'], movie['name'])

            channel['movies'].append({
                'id': movie['slug'],
                'label': label.encode("utf-8"),
                'title': movie['name'].encode("utf-8"),
                'realtitle': movie['name'].encode("utf-8"),
                'thumb': movie['thumbnail'],
                'poster': movie['poster'],
                'type': type,
                'intro': h.unescape(movie['description'])
            })

        return channel
class ArkTweetNLP:

    def __init__(self, data=[]):
        # Lookup cache (constantly rerunning tagger takes time)
        self.cache = Cache('ark_tweet')

        # Unescape data
        self.h = HTMLParser()

        # Resolve and cache all currently uncached tweets
        self.resolve(data)


    def normalizeKey(self, tweet):
        clean = lambda txt: self.h.unescape(txt).strip()
        try:
            tmp = tweet.decode('utf-8')
        except UnicodeEncodeError, e:
            # Didn't want to resort to this, but get each character one at a time
            ctmp = []
            for c in tweet:
                try:
                    c.decode('utf-8')
                    ctmp.append(c)
                except UnicodeEncodeError, e:
                    continue
            tmp = ''.join(ctmp)
Example #29
    def getMovie(html):
        hxs = lxml.html.document_fromstring(html)
        hp = HTMLParser()
        movie = {}
        try:
            movie['title'] = hp.unescape(hxs.xpath('//*[@id="overview-top"]/h1/span[1]/text()')[0].strip())
        except IndexError:
            movie['title'] = ""
        try:
            original_title = hxs.xpath('//*[@id="overview-top"]/h1/span[3]/text()')[0].strip()
            movie['original_title'] = original_title.replace('"', '')
        except:
            movie['original_title'] = ""
            
        try:
            movie['type'] = hxs.xpath('//*[@id="overview-top"]/div[1]/text()')[0].strip()        
        except:
            movie['type'] = ""
        try:
            movie['year'] = int(hxs.xpath('//*[@id="overview-top"]/h1/span[2]/a/text()')[0].strip())
        except:
            try:
                movie['year'] = int(hxs.xpath('//*[@id="overview-top"]/h1/span[3]/a/text()')[0].strip())
            except:
                try:
                    movie['year'] = int(hxs.xpath('//*[@id="overview-top"]/h1/span[2]/text()')[0].strip().replace('(', '').replace(')', ''))
                except:
                    movie['year'] = 0
        try:
            duration = hxs.xpath('//*[@id="overview-top"]/div[2]/time/text()')[0].strip()
            movie['duration'] = int(duration.replace('min', '').strip())
        except:
            movie['duration'] = 0
        try:
            movie['genres'] = hxs.xpath('//*[@id="overview-top"]/div[2]/a/span/text()')
        except:
            movie['genres'] = []
        try:
            movie['release_date'] = hxs.xpath('//*[@id="overview-top"]/div[2]/span[3]/a/text()')[0].strip()
        except:
            try:
                movie['release_date'] = hxs.xpath('//*[@id="overview-top"]/div[2]/span[4]/a/text()')[0].strip()
            except:
                movie['release_date'] = ""
        try:
            movie['rating'] = float(hxs.xpath('//*[@id="overview-top"]/div[3]/div[3]/strong/span/text()')[0].strip())
        except:
            movie['rating'] = 0

        try:
            movie['poster'] = hxs.xpath('//*[@id="img_primary"]/div/a/img/@src')[0].strip()
        except:
            movie['poster'] = ""

        try:
            movie['actors'] = hxs.xpath('//*[@id="overview-top"]/div[6]/a/span/text()')
        except:
            movie['actors'] = ""

        return movie
Example #30
def update_event_description(event_id, description, analyst):
    """
    Update event description.

    :param event_id: The ObjectId of the Event to update.
    :type event_id: str
    :param description: The new description.
    :type description: str
    :param analyst: The user updating this Event.
    :type analyst: str
    :returns: dict with keys "success" (boolean) and "message" (str)
    """

    if not description:
        return {'success': False, 'message': "No description to change"}
    event = Event.objects(id=event_id).first()
    if not event:
        return {'success': False, 'message': "No event found"}
    # Have to unescape the submitted data. Use unescape() to convert
    # &lt; and friends back; urllib2.unquote() would handle %3C and friends.
    h = HTMLParser()
    description = h.unescape(description)
    event.description = description
    try:
        event.save(username=analyst)
        return {'success': True}
    except ValidationError, e:
        return {'success': False, 'message': e}
 def first_selected_option_text(self):
     if not self.is_patternfly:
         return text(self.first_selected_option)
     else:
         parser = HTMLParser()
         return parser.unescape(
             execute_script("return arguments[0].innerHTML;", self.first_selected_option))
def wolfplex(options):
    # clean events
    Event.objects.filter(source="wolfplex").delete()

    html_parser = HTMLParser()

    soup = BeautifulSoup(urlopen("http://www.wolfplex.org/wiki/Main_Page").read())

    events = soup.find("div", id="accueil-agenda").dl

    for date_info, event in zip(events('dt'), events('dd')[1::2]):
        if event.span:
            event.span.clear()

        title = html_parser.unescape(event.text)
        base_domain = "http://www.wolfplex.org" if not event.a["href"].startswith("http") else ""
        url = (base_domain + event.a["href"]) if event.a else "http://www.wolfplex.org"
        start = parse(date_info.span["title"])

        if "@" in title:
            title, location = title.split("@", 1)
        else:
            location = None

        Event.objects.create(
            title=title,
            source="wolfplex",
            url=url,
            start=start,
            location=location
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "wolfplex", location.encode("Utf-8") if location else "")
Example #33
def twitch_lookup(location):
    locsplit = location.split("/")
    if len(locsplit) > 1 and len(locsplit) == 3:
        channel = locsplit[0]
        type = locsplit[1]  # should be b or c
        id = locsplit[2]
    else:
        channel = locsplit[0]
        type = None
        id = None
    h = HTMLParser()
    fmt = "{}: {} playing {} ({})"  # Title: nickname playing Game (x views)
    if type and id:
        if type == "b":  # I haven't found an API to retrieve broadcast info
            soup = http.get_soup("http://twitch.tv/" + location)
            title = soup.find('span', {'class': 'real_title js-title'}).text
            playing = soup.find('a', {'class': 'game js-game'}).text
            views = soup.find('span', {'id': 'views-count'}).text + " view"
            views = views + "s" if not views[0:2] == "1 " else views
            return h.unescape(fmt.format(title, channel, playing, views))
        elif type == "c":
            data = http.get_json("https://api.twitch.tv/kraken/videos/" +
                                 type + id)
            title = data['title']
            playing = data['game']
            views = str(data['views']) + " view"
            views = views + "s" if not views[0:2] == "1 " else views
            return h.unescape(fmt.format(title, channel, playing, views))
    else:
        data = http.get_json(
            "http://api.justin.tv/api/stream/list.json?channel=" + channel)[0]
        if data:
            title = data['title']
            playing = data['meta_game']
            viewers = "\x033\x02Online now!\x02\x0f " + str(
                data["channel_count"]) + " viewer"
            print viewers
            viewers = viewers + "s" if not " 1 view" in viewers else viewers
            print viewers
            return h.unescape(fmt.format(title, channel, playing, viewers))
        else:
            data = http.get_json("https://api.twitch.tv/kraken/channels/" +
                                 channel)
            title = data['status']
            playing = data['game']
            viewers = "\x034\x02Offline\x02\x0f"
            return h.unescape(fmt.format(title, channel, playing, viewers))
Example #34
 def GetNewestCurseData(name, unused_mod):
     parser = HTMLParser()
     # Name the project.
     projectUrl = baseUrl + '/projects/' + str(name)
     projectUrl = DerefUrl(projectUrl).split('?')[0]
     # Find the project ID.
     projectPage = Get(projectUrl)
     tree = soupparser.fromstring(projectPage)
     projectID = int(
         tree.xpath('//li[@class="view-on-curse"]/a/@href')[0].split('/')
         [-1])
     projectTitle = tree.xpath(
         '//h1[@class="project-title"]//span/text()')[0]
     # Find the newest copy of the mod.
     # TODO: Filter by stability, regex, whatever. Add once needed.
     filesUrl = projectUrl + '/files?filter-game-version=2020709689%3A6170'
     filesPage = Get(filesUrl)
     tree = soupparser.fromstring(filesPage)
     files = tree.xpath(
         '//div[@class="project-file-name-container"]/a[@class="overflow-tip"]/@href'
     )
     names = tree.xpath(
         '//div[@class="project-file-name-container"]/a[@class="overflow-tip"]/text()'
     )
     stability = tree.xpath(
         '//td[@class="project-file-release-type"]/div/@class')
     assert len(files) == len(names) == len(stability)
     files_filtered = []
     names_filtered = []
     for i in xrange(len(files)):
         if 'alpha' not in stability[i]:
             files_filtered.append(files[i])
             names_filtered.append(names[i])
     if files_filtered:
         files = files_filtered
         names = names_filtered
     data = {
         PROJECTID: projectID,
         PROJECTPAGE: projectUrl,
         TITLE: projectTitle,
     }
     if files:
         # Find the URL and MD5 of that file.
         filePage = Get(baseUrl + files[0])
         tree = soupparser.fromstring(filePage)
         hash = tree.xpath('//span[@class="%s"]/text()' % HASH)
         url = tree.xpath('//a[@class="button fa-icon-download"]/@href')
         data[FILENAME] = parser.unescape(names[0])
         data[HASH] = hash[0]
         data[SRC] = baseUrl + url[0]
         # Find the dependencies for this file.
         dependencies = [
             int(url.split('/')[-1]) for url in tree.xpath(
                 '//*[text()="Required Library"]/following-sibling::ul/li/a/@href'
             )
         ]
         data[DEPENDENCIES] = dependencies
     IncProgressbar(data[TITLE])
     return FixupData(data)
Example #35
def twtt2(tweet):
    h = HTMLParser()
    for match in re.finditer("&[^\s]*;", tweet):
        try:
            tweet = tweet.replace(match.group(), str(h.unescape(match.group())))
        except UnicodeDecodeError:
            continue
    return tweet
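
A hypothetical call to twtt2 above (assuming re and HTMLParser are imported as in the surrounding snippets): references that resolve cleanly are replaced, and any match that raises UnicodeDecodeError is left untouched.

print twtt2("fish &amp; chips &gt; salad")   # prints: fish & chips > salad
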
Example #36
def load_urls_name_page():
    connect = urllib2.urlopen("http://listen.jazzradio.com/public3")
    data = connect.read()
    p = HTMLParser()
    data = p.unescape(data)
    for i in simplejson.loads(data):
        print "%s =  %s" % (i["name"].encode('utf-8'),
                            i["playlist"].encode('utf-8'))
	def parse_news(self, news):
		u_news = news.decode("utf-8")
		ascii_news=u_news.encode("ascii","ignore")

		parser = HTMLParser() #Initializes parser
		parsed_news_text = parser.unescape(ascii_news) #Parses HTML Numeric characters
		quote_removed_news_text = parsed_news_text.replace('"', '').replace("'", '') #Removes single and double quotes
		return(quote_removed_news_text)
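
A hypothetical input for parse_news above: non-ASCII bytes are dropped, character references are expanded, and straight quotes are removed.

# self.parse_news('Caf\xc3\xa9 said &quot;hello&quot;')  ->  u'Caf said hello'
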
Example #38
 def test_is_html_escaped(self):
     """Unescape the escaped response to see if it's the original content"""
     h = HTMLParser()
     content = '*****@*****.**'
     self.assertEqual(
         h.unescape(
             self.middleware.process_response(
                 None, HttpResponse(content)).content), content)
 def _process_odoo_data(self, odoo_data):
     # Concatenation of all boxes in one text
     html_parser = HTMLParser()
     fields = ('original_text', 'english_text', 'translated_text')
     for field in fields:
         if field in odoo_data:
             odoo_data[field] = html_parser.unescape(
                 BOX_SEPARATOR.join(odoo_data[field]))
Example #40
    def process_html_content(self, line):
        line_tmp = line.strip()
        #fixed_backslashes = self.regexbackslash.sub(r"\\\\", line_tmp)

        parser = HTMLParser()
        line = parser.unescape(self.unescapeHtmlChars(line))
        processed_output = self.strip_tags(line) + "\n"
        return processed_output.encode("utf-8")
Example #41
def main():
    if len(sys.argv) > 1:
        query = " ".join(sys.argv[1:])
    else:
        # no query on the command line; exit early instead of hitting a NameError below
        print('usage: %s <query>' % sys.argv[0])
        return

    url = u'http://api.wolframalpha.com/v2/query?input={q}&appid={API_KEY}&format=plaintext'.format(
        API_KEY=wolfram_alpha_key, q=quote(query))

    resp = requests.get(url)

    for pod in re.findall(r'<pod.+?>.+?</pod>', resp.text, re.S):
        title = re.findall(r'<pod.+?title=[\'"](.+?)[\'"].*>', pod, re.S)
        parser = HTMLParser()
        print(Fore.GREEN + parser.unescape("".join(title).strip()) +
              Fore.RESET)
        for inner in re.findall(r'<plaintext>(.*?)</plaintext>', pod, re.S):
            print(parser.unescape(inner.strip()))
        print('')
Example #42
 def _create_elem_tree(self, filename):
     f = open(filename, 'r')
     h = HTMLParser()
     unescaped = h.unescape(f.read())
     f.close()
     tree = ElementTree()
     tree.parse(StringIO(unescaped))
     return tree
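
A hypothetical round trip for _create_elem_tree above (assuming ElementTree, StringIO and HTMLParser are imported as the snippet expects): a file whose XML payload arrived entity-escaped is unescaped before parsing.

# contents of 'escaped.xml':  <root>&lt;item&gt;42&lt;/item&gt;</root>
# tree = self._create_elem_tree('escaped.xml')
# tree.find('item').text  ->  '42'
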
Example #43
def twtt2(text):
    """
    Takes in a tweet and replaces HTML character references with the characters they represent.
    """
    htmlparser = HTMLParser()
    #unescape resolves HTML character references to their characters
    #encode('utf-8', 'ignore') then returns the result as a UTF-8 byte string
    return htmlparser.unescape(text).encode('utf-8','ignore')
Example #44
def Request(query):
    try:
        r = requests.get(args.backendurl + query)
        r.raise_for_status()
        h = HTMLParser()
        Send(h.unescape(r.text.strip(' \n\t\r')))
    except Exception, e:
        print e
def get_distribution_metadata(resource_id):
    # 'datajson_actions' is imported inside this function to avoid a circular dependency with 'config_controller'
    json_dict = get_data_json_contents()
    html_parser = HTMLParser()
    json_dict = html_parser.unescape(json_dict)
    datajson = DataJson(json_dict)
    dist = datajson.get_distribution(resource_id)
    return dist
Example #46
 def first_selected_option_text(self):
     if not self.is_patternfly:
         return text(self.first_selected_option)
     else:
         parser = HTMLParser()
         return parser.unescape(
             execute_script("return arguments[0].innerHTML;",
                            self.first_selected_option))
Example #47
def transform_filename(filename):
    LOGGER.debug(filename)
    # first unescape
    parser = HTMLParser()
    unescaped = parser.unescape(filename)
    # replace '/' with unicode U+2215 '∕'
    new_name = unescaped.replace('/', u'\u2215')
    return new_name
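
A worked example for transform_filename above (assuming LOGGER is a configured logging.Logger): entities are resolved first, then any '/' is swapped for the Unicode division slash so the result is safe as a single path component.

# transform_filename('AC&#47;DC &amp; friends')  ->  u'AC\u2215DC & friends'
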
Example #48
 def get_mail_recipient(self):
     mail_recipient = ''
     groups_ids = self._get_group_ids()
     for record in groups_ids:
         for records in record.users:
             mail_recipient = ('' if records.partner_id.email == False else records.partner_id.email  + ',') + mail_recipient 
     parser = HTMLParser()
     return parser.unescape(mail_recipient)    
Example #49
def _to_smart(verse):
    verse = verse.replace(",`", ", '")
    verse = verse.replace("`", "'")
    out = smartypants(verse)
    parser = HTMLParser()
    out = parser.unescape(out)

    return out
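
Assuming smartypants here is the smartypants() function from the PyPI smartypants package (which emits numeric character references for typographic punctuation), the final unescape turns those references back into literal Unicode characters; a sketch of the flow:

# _to_smart("`quoted` words")
#   after the backtick fixups:  "'quoted' words"
#   after smartypants:          "&#8216;quoted&#8217; words"
#   after unescape:             u'\u2018quoted\u2019 words'
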
Example #50
def main():
    opts = webdriver.ChromeOptions()
#     opts.binary_location('/Applications/Google Chrome 2.app/Contents/MacOS/Google Chrome')
#   the binary location was written directly into ChromeOptions.__init__ because setting it here did not work on this machine
    driver = webdriver.Chrome(chrome_options=opts)
    driver.get("https://www.zhihu.com/question/28481779")

    def execute_times(times):
        for i in range(times + 1):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            try:
                driver.find_element_by_css_selector('button.QuestionMainAction').click()
                print "page" + str(i)
                time.sleep(1)
            except:
                break

    execute_times(5)

    result_raw = driver.page_source
    result_soup = BeautifulSoup(result_raw, 'html.parser')

    result_bf = result_soup.prettify()

    with open("/Users/zhanglei/crawler/zhihu/raw_result.txt", 'w') as girls:
        girls.write(result_bf)

    print 'store raw data successfully!'

    with open("/Users/zhanglei/crawler/zhihu/noscript_meta.txt", 'w') as noscript_meta:
        noscript_nodes = result_soup.find_all('noscript')
        noscript_inner_all = ""
        for noscript in noscript_nodes:
            noscript_inner = noscript.get_text()
            noscript_inner_all += noscript_inner + '\n'

        h = HTMLParser()
        noscript_all = h.unescape(noscript_inner_all)
        noscript_meta.write(noscript_all)

    print 'store noscript meta data successfully!'


    img_soup = BeautifulSoup(noscript_all, 'html.parser')
    img_nodes = img_soup.find_all('img')
    with open("/Users/zhanglei/crawler/zhihu/img_meta.txt", 'w') as img_meta:
        count = 0
        for img in img_nodes:
            if img.get('src') is not None:
                img_url = img.get('src')

                line = str(count) + "\t" + img_url + "\n"
                img_meta.write(line)
                urllib.urlretrieve(img_url, "/Users/zhanglei/crawler/zhihu/image/" + str(count) + ".jpg")
                count += 1

    print 'store meta data and image successfully!'
    def check_request(self, request, context):
        h = HTMLParser()
        unescaped_trimmed_request = h.unescape(
            request.replace(' ', '').replace('\\n', '').lower())

        for item in UnvalidatedRedirectsChecker.FORBIDDEN_RESOURCES:
            if item in unescaped_trimmed_request:
                return None
        return request
Example #52
def sovet(args, message):
    sovet = url.urlopen("http://f*****g-great-advice.ru/api/random").read()

    parser = HTMLParser()
    params = {
        'message': parser.unescape(json.loads(sovet)["text"]).encode('utf8')
    }
    params.update(DIALOG_PARAM)
    return execute_in_vk(SEND_MESSAGE_COMMAND, params)
Example #53
def html_unescape(html):
    """html转义字符 逆转"""
    if type(html) == str:
        html = html.decode('utf-8')

    html_parser = HTMLParser()
    html = html_parser.unescape(html)

    return html
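
A hypothetical call to html_unescape above: byte strings are decoded as UTF-8 first, then entities are resolved.

print html_unescape("R&amp;D &gt; expectations")   # prints: R&D > expectations
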
Example #54
    def normalize_html(self, html):
        """Strip HTML Tags and normalize gratuitous newlines."""

        parser = HTMLParser()
        results = parser.unescape('\n'.join(
            ' '.join(line.split()) for line in strip_tags(html).splitlines()
            if line))

        return '\n'.join(filter(bool, results.split('\n\n')))
Example #55
def send_request(url, command, headers=None, data=None):
    # Request files don't specify HTTP vs HTTPS, so try both instead of asking
    # the user: if the URL has no scheme, try http first and fall back to https
    # on error.  Request files can be used for GETs too, so that case is handled as well.

    if 'http' not in url:
        try:
            http_url = 'http://%s' % url
            if (data):
                response = requests.post(http_url, headers=headers, data=data, verify=False)
            else:
                response = requests.get(http_url, headers=headers, verify=False)
        except Exception as error:
            print error
            try:
                https_url = 'https://%s' % url
                if (data):
                    response = requests.post(https_url, headers=headers, data=data, verify=False)
                else:
                    response = requests.get(https_url, headers=headers, verify=False)
            except Exception as error:
                print error
    else:
        try:
            response = requests.get(url, headers=headers, verify=False)
        except Exception as error:
            print "[!] Failed to establish connection"
            # print error
            exit()
    # print response.headers
    # print response.content
    match = re.search('([---------------------------------------------------][\n])(.*)', response.content)
    try:
        command_output = str(match.group(0))
        print '\n{}\nOUTPUT OF: {}\n{}'.format('-' * 30, command, '-' * 30)
        # print command_output.replace('\\n','\n')
        command_output = command_output.replace('\\n', '\n')
        h = HTMLParser()
        print(h.unescape(command_output))

        # print command_output
    except Exception as error:
        print "\n[!] Could not find command output.  Debug info:\n"
        print "---------------Response Headers---------------"
        print response.headers
        print "---------------Response Content---------------"
        print response.content
        return error
Example #56
def cleanInput(str):
    if type(str) is not unicode:
        str = unicode(str, "iso-8859-15")
        xmlc = re.compile('&#(.+?);', re.DOTALL).findall(str)
        for c in xmlc:
            str = str.replace("&#"+c+";", unichr(int(c)))
    p = HTMLParser()
    str = p.unescape(str)
    return str
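
A worked example for cleanInput above: byte-string input is decoded as ISO-8859-15, numeric character references are expanded by hand, and anything left goes through unescape.

# cleanInput('caf&#233; &amp; bar')  ->  u'caf\xe9 & bar'
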
def execute():

    h = HTMLParser()

    for name, rule in frappe.db.sql(
            """SELECT name, rule FROM `tabItem Variant Restrictions` WHERE rule is not null""",
            as_list=1):
        frappe.db.set_value("Item Variant Restrictions", name, 'rule',
                            h.unescape(rule))
Example #58
def loadCatemapSF(csvfile):
    from HTMLParser import HTMLParser
    htmlparser = HTMLParser()

    schema = StructType([
        StructField("Category", StringType(), False),
        StructField("Descript", StringType(), False),
        StructField("count", IntegerType(), False),
        StructField("New_Class", StringType(), False),
    ])

    df = spark.read.csv(csvfile, header=True, schema=schema)
    pairs = df.select("Category", "Descript", "New_Class").collect()
    catemap = {(htmlparser.unescape(s["Category"]),
                htmlparser.unescape(s["Descript"])):
               htmlparser.unescape(s["New_Class"])
               for s in pairs}
    return catemap
Example #59
def getText(textElement):
    regex = re.compile('\ +')
    html = HTMLParser()

    text = regex.sub(' ', textElement)
    text = text.strip()
    text = html.unescape(text)

    return text
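
A quick check of getText above: runs of spaces are collapsed, the string is trimmed, and entities are resolved.

# getText('  Fish   &amp;   Chips ')  ->  u'Fish & Chips'
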
Example #60
def change_ref(text):  # resolve HTML character references found in the page text
    datas = re.findall(REF_PATTERN, text)
    if len(datas) > 0:
        parser = HTMLParser()
        datas = set(datas)
        for data in datas:
            replace = parser.unescape(data)
            text = text.replace(data, replace)
    return text
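
A closing note that is not part of the collected examples: the snippets above target Python 2, where HTMLParser.HTMLParser exposed an unescape() method. Current Python 3 releases no longer provide that method; the equivalent is the html.unescape() function.

# Python 3 equivalent of the HTMLParser().unescape(...) calls used above
from html import unescape

print(unescape("Tom &amp; Jerry &lt;3"))   # Tom & Jerry <3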