def reply(self, slots: Dict[Slot, str], user_id=None) -> Union[SingleTextResponse, SingleImageResponse]:
    name, profession = self._initial_slots[Slot.Name], self._initial_slots[Slot.NameProfession]
    output = self._es.search(index='collection-index', body={
        "query": {
            "match": {
                "about_author.name": {
                    "query": name,
                    "fuzziness": "2"
                }
            }
        }
    })['hits']['hits']
    output = random.choice(output)
    hall = output["_source"]["hall"] if output["_source"]["hall"] else random.randint(1, 25)
    picture_name = "'{}'".format(output["_source"]["art_name"])
    text = f'{name}, основная отрасль искусства: {profession}. Страна {output["_source"]["country"]}. ' \
           f'Одно из популярных произведений {picture_name}. Посмотреть на шедевр можно в {hall} зале'
    raw_text = clean_html(output['_source']['text']).split('.')
    summary = '.'.join(raw_text[0:2]) if len(raw_text) >= 2 else '.'.join(raw_text) + '.'
    descr = clean_html(output['_source']['annotation']) if output['_source']['annotation'] != 'empty' \
        else summary
    if output["_source"]['img']:
        yield SingleImageResponse(is_finished=True,
                                  is_successful=True,
                                  text=text,
                                  img_url=f'https://pushkinmuseum.art{output["_source"]["img"]}',
                                  img_description=descr)
    else:
        yield SingleTextResponse(is_finished=True, is_successful=True, text=text)
def test():
    with codecs.open(text_path, 'r', encoding='utf-8') as fd:
        text = fd.read()
    with codecs.open(html_path, 'r', encoding='utf-8') as fd:
        expected_html = fd.read()
    actual_html = self.r(text)
    expected_result = clean_html(expected_html)
    actual_result = clean_html(actual_html)
    ok(actual_result).diff(expected_result)
def crawlPage(self, url):
    task = wrap_url(url)
    h = httplib2.Http()
    response, content = h.request(task.url, method="GET", headers=task.headers)
    return clean_html(content, task.encoding)
def __init__(self, input, url, **options):
    self.input = input
    self.url = url
    self.options = options
    self.doc = clean_html(input, url, return_doc=True)
    self.html = doc2html(self.doc)
def __init__(self, input, **options):
    self.input = input
    self.options = options
    self.debug = options.get('debug', True)
    self.title = options.get('title', '+^_^+')
    self.pages = options.get('pages', [])
    self.texts = options.get('texts', [])
    self.doc = clean_html(input, return_doc=True)
def __init__(self, input, **options):
    self.input = input
    self.options = options
    self.debug = options.get('debug', False)
    self.title = options.get('title', '+^_^+')
    self.pages = options.get('pages', [])
    self.texts = options.get('texts', None)
    self.doc = clean_html(input, return_doc=True)
def reply(self, slots: Dict[Slot, str], user_id=None) -> Union[SingleTextResponse, SingleImageResponse]:
    event_name = self._initial_slots[Slot.EventName]
    output = self._es.search(index='event-index', body={
        "query": {
            "match": {
                "event_name": {
                    "query": event_name,
                    "fuzziness": "2"
                }
            }
        }
    })['hits']['hits'][0]['_source']
    data_begin, data_end = output['dateBegin'], output['dateEnd']
    name = output['event_name']
    halls = output['halls'] if output['halls'] else 'уточняется'
    event_type = output['type']
    price = clean_html(output['price']) if output['price'] else 'уточняется'
    raw_text = clean_html(output['text']).split('.')
    summary = '.'.join(raw_text[0:5]) if len(raw_text) >= 5 else '.'.join(raw_text) + '.'
    text = (f"{name}. Тип мероприятия: {event_type}. Будет проходить с {data_begin} по {data_end}. "
            f"Место проведения: {halls}. Стоимость билетов: {price}.\n"
            f"Коротко о событии: {summary}.").replace('\n<br />', '').replace(' 00:00:00', '')
    img = output['img'] if output['img'] else output['extra_img']
    if img:
        yield SingleImageResponse(is_finished=True,
                                  is_successful=True,
                                  text=text,
                                  img_url=f'https://pushkinmuseum.art/{img}',
                                  img_description='')
    else:
        yield SingleTextResponse(is_finished=True, is_successful=True, text=text)
async def inline_exchange_rates(inline_query: InlineQuery, amount: [int, float], from_coins: str, to_coins: str):
    compare = get_price(from_coins, to_coins)
    text = _get_text(compare, amount)
    input_content = InputTextMessageContent(text)
    item = InlineQueryResultArticle(id=generate_inline_id(inline_query.query),
                                    title='Cryptocurrency',
                                    description=clean_html(text),
                                    thumb_url='https://shorturl.at/dkrtD',
                                    input_message_content=input_content)
    await bot.answer_inline_query(inline_query.id, results=[item], cache_time=1)
def __init__(self, input, url, **options):
    self.input = input
    self.url = url
    self.options = options
    if 'title' in options:
        self._title = options.get('title')
    if 'pages' in options:
        self._pages = options.get('pages')
    self.doc = clean_html(input, url, return_doc=True)
    self.html = doc2html(self.doc)
def reply(self, slots: Dict[Slot, str], user_id=None) -> Union[SingleTextResponse, SingleImageResponse]:
    art_name = self._initial_slots[Slot.ArtName]
    output = self._es.search(index='collection-index', body={
        "query": {
            "match": {
                "art_name": {
                    "query": art_name,
                    "fuzziness": "2"
                }
            }
        }
    })['hits']['hits'][0]
    author = output['_source']['about_author']['name'] if 'about_author' in output['_source'] else 'неизвестен'
    hall = output["_source"]["hall"] if output["_source"]["hall"] != 'empty' else random.randint(1, 25)
    text = f'Работа {art_name}. Автор {author}. Посмотреть на шедевр можно в зале {hall}'
    raw_text = clean_html(output['_source']['text']).split('.')
    summary = '.'.join(raw_text[0:2]) if len(raw_text) >= 2 else '.'.join(raw_text) + '.'
    descr = clean_html(output['_source']['annotation']) if output['_source']['annotation'] != 'empty' \
        else summary
    if output["_source"]['img']:
        yield SingleImageResponse(is_finished=True,
                                  is_successful=True,
                                  text=text,
                                  img_url=f'https://pushkinmuseum.art{output["_source"]["img"]}',
                                  img_description=descr)
    else:
        yield SingleTextResponse(is_finished=True, is_successful=True, text=f'{text}\n{descr}')
def __init__(self, input, **options):
    self.input = input
    self.url = options.get('url', '')
    self.debug = options.get('debug', False)
    self.title = options.get('title', '^^')
    self.pages = options.get('pages', None)
    self.texts = options.get('texts', None)
    self.domain = get_domain(self.url)
    self.options = options
    self.doc = clean_html(input, return_doc=True)
    self.text = self.doc.text_content()
    self.len = word_count(self.text) if self.text else 0
def search_query(self, searcher, query, output_fn, collection='robust04', K=1000):
    output_dir = os.path.dirname(output_fn)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_fn, 'w', encoding="utf-8") as out:
        sentid2text = {}
        hits = searcher.search(self.JString(query), K)
        for i in range(len(hits)):
            sim = hits[i].score
            docno = hits[i].docid
            content = hits[i].content
            if collection == 'core18':
                content_json = json.loads(content)
                content = ''
                for each in content_json['contents']:
                    if each is not None and 'content' in each.keys():
                        content += '{}\n'.format(each['content'])
            if collection == 'robust04':
                content = parse_doc_from_index(content)
            clean_content = clean_html(content, collection=collection)
            tokenized_content = tokenizer.tokenize(clean_content)
            sentid = 0
            for sent in tokenized_content:
                # Split sentence if it's longer than BERT's maximum input length
                if len(sent.strip().split()) > MAX_INPUT_LENGTH:
                    seq_list = chunk_sent(sent, MAX_INPUT_LENGTH)
                    for seq in seq_list:
                        sentno = docno + '_' + str(sentid)
                        out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                            0, round(float(sim), 16), query, seq, 0, sentno, 0, self.didx))
                        out.flush()
                        sentid += 1
                        self.didx += 1
                        sentid2text[sentno] = seq
                else:
                    sentno = docno + '_' + str(sentid)
                    out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        0, round(float(sim), 16), query, sent, 0, sentno, 0, self.didx))
                    out.flush()
                    sentid += 1
                    self.didx += 1
                    sentid2text[sentno] = sent
    return sentid2text
def getHTML(url: str, wdfId: str, connection: MySQL):
    with connection as db:
        lastDay = db.getLastDayContents(url)
        if lastDay:
            return
    if ("http://" in url or "https://" in url) and (url[:17] != "http://localhost:" and url[:17] != "http://localhost/"):
        # Detect if content is even HTML
        customHeaders = {
            'User-Agent': 'server:ch.sdipi.wdf:v3.1.0 (by /u/protectator)',
        }
        try:
            contentHead = requests.head(url, headers=customHeaders)
        except requests.exceptions.SSLError as e:
            print("SSL Exception while getHTML of " + url)
            print(e)
            return
        if 'content-type' in contentHead.headers:
            if 'html' not in contentHead.headers['content-type']:
                return
        chrome_process = subprocess.run(["node", "./js/index.js", url], stdout=subprocess.PIPE)
        if chrome_process.returncode == 0:
            htmlContentRaw = chrome_process.stdout.decode('utf8')
            htmlContent = re.sub("<", " <", htmlContentRaw)
            with connection as db:
                htmlParsed = lxml.html.fromstring(htmlContent)
                try:
                    title = htmlParsed.find(".//title").text
                except:
                    title = ""
                cleaner = Cleaner()
                cleaner.javascript = True
                cleaner.style = True
                textClean = cleaner.clean_html(htmlParsed).text_content()
                lang = detect(textClean)
                try:
                    stop_words = get_stop_words(lang)
                except:
                    stop_words = []
                bestText = clean_html(htmlContent, stop_words, (lang == 'en'))
                db.content(wdfId, url, htmlContent, lang, title)
                db.setContentText(url, bestText, title, lang)
        else:
            print(chrome_process.stdout)
def clean_texts(self, html):
    doc = clean_html(html, return_doc=True)
    html = ''
    for child in doc.getchildren():
        if child.getchildren():
            html += doc2html(child)
            continue
        text = child.text_content() or ''
        text = self.clean_text(text.strip())
        if text:
            child.text = text
            html += doc2html(child)
    return html
def scan_entry(self, entry):
    """Test an entry against the trie to see if any entities are found.
    """
    # Search the trie.
    matches = []
    try:
        summary = clean_html(entry["summary"])
    except KeyError:
        # This entry has no summary. Skip.
        return entry, []
    matches += trie_scan(self.trie, summary)
    return entry, matches
def scan_entry(self, entry):
    '''Test an entry against the trie to see if any entities are found.
    '''
    # Search the trie.
    matches = []
    try:
        summary = clean_html(entry['summary'])
    except KeyError:
        # This entry has no summary. Skip.
        return entry, []
    matches += trie_scan(self.trie, summary)
    return entry, matches
def run(self):
    html = get_or_cache(self.url)
    doc = clean_html(html, self.url, return_doc=True)
    urls = html2urls(html, self.url, name=False)
    urls = sorted(urls, key=self.score)
    name = tag2text(doc, 'meta', property="og:site_name")
    if name:
        self.name = self.text = name
    else:
        cnt = 10
        while cnt <= 100:
            if self.get_name(urls[:cnt]):
                print self.domain, cnt
                break
            cnt += 10
    if self.name is not None:
        self.get_sub(urls)
def process_entry(self, entry):
    '''Given an entry, add a mongo id and other top-level attributes,
    then run it through scan_feed to recognize any entities mentioned.
    '''
    abbr = self.abbr
    third = itemgetter(2)
    entry, matches = self.scan_entry(entry)
    matches = self.extract_entities(matches)
    ids = map(third, matches)
    strings = [m.group() for m, _, _ in matches]
    assert len(ids) == len(strings)

    # Add references and save in mongo.
    entry['state'] = abbr  # list probably wiser
    entry['entity_ids'] = ids or []
    entry['entity_strings'] = strings or []
    entry['save_time'] = datetime.datetime.utcnow()

    try:
        entry['_id'] = new_feed_id(entry)
    except BogusEntry:
        # This entry appears to be malformed somehow. Skip.
        msg = 'Skipping malformed feed: %s'
        msg = msg % repr(entry)[:100] + '...'
        self.logger.info(msg)
        return

    entry['_type'] = 'feedentry'
    entry['summary'] = clean_html(entry['summary'])
    try:
        entry['summary_detail']['value'] = clean_html(
            entry['summary_detail']['value'])
    except KeyError:
        pass

    # Kill any keys that contain dots.
    entry = dict((k, v) for (k, v) in entry.items() if '.' not in k)

    # Bail if the feed contains any banned key-value pairs.
    entry_set = self._dictitems_to_set(entry)
    for keyval_set in self._banned_keyvals:
        if entry_set & keyval_set:
            msg = 'Skipped story containing banned key values: %r'
            self.logger.info(msg % keyval_set)
            return

    # Skip any entries that are missing required keys:
    required = set('summary source host link published_parsed'.split())
    if required not in set(entry):
        if 'links' not in entry:
            msg = 'Skipped story lacking required keys: %r'
            self.logger.info(msg % (required - set(entry)))
            return
        else:
            source = entry['links'][-1].get('href')
            if source:
                host = urlparse.urlparse(entry['links'][0]['href']).netloc
                entry['source'] = source
                entry['host'] = host
            else:
                msg = 'Skipped story lacking required keys: %r'
                self.logger.info(msg % (required - set(entry)))
                return

    # Save
    msg = 'Found %d related entities in %r'
    self.logger.info(msg % (len(ids), entry['title']))
    return entry
#print(abstract_command)
abstract_result = subprocess.getoutput(abstract_command)
try:
    abstract_result = json.loads(abstract_result)
except JSONDecodeError as ex:
    print(ex)
    print('-----------abs--------')
    print(abstract_result)
    update_cookies()
    inner_fail_time += 1
    sleep(inner_fail_time ** 2)
    continue
try:
    abstract = abstract_result['abstractInfoDTO']['abIndexList'][0]['value']  # abstract text
    abstract = clean_html(abstract)
    CPC = ''
    for item in abstract_result['abstractInfoDTO']['abstractItemList']:
        if item['indexCode'] == 'CPC':
            CPC = item['value']
    #figure_id = abstract_result['abstractInfoDTO']['figureRid']
except KeyError as ex:
    print(ex)
    print('---------abs--key-------')
    #print(abstract_result)
    inner_fail_time += 1
    sleep(inner_fail_time ** 2)
    continue
'''
if figure_id != None:
def html2meta(html, url):
    doc = clean_html(html, url, return_doc=True)
    return doc2meta(doc, url)
def search_document(self, searcher, qid2docid, qid2text, output_fn, collection='robust04', K=1000, topics=None, cv_fold=None):
    output_dir = os.path.dirname(output_fn)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_fn, 'w', encoding="utf-8") as out:
        if 'core' in collection:
            # Robust04 provides CV topics
            topics = qid2text
        for qid in topics:
            text = qid2text[qid]
            hits = searcher.search(self.JString(text), K)
            for i in range(len(hits)):
                sim = hits[i].score
                docno = hits[i].docid
                label = 1 if qid in qid2docid and docno in qid2docid[qid] else 0
                content = hits[i].content
                if collection == 'core18':
                    content_json = json.loads(content)
                    content = ''
                    for each in content_json['contents']:
                        if each is not None and 'content' in each.keys():
                            content += '{}\n'.format(each['content'])
                if collection == 'robust04':
                    content = parse_doc_from_index(content)
                clean_content = clean_html(content, collection=collection)
                tokenized_content = tokenizer.tokenize(clean_content)
                sentid = 0
                for sent in tokenized_content:
                    # Split sentence if it's longer than BERT's maximum input length
                    if len(sent.strip().split()) > MAX_INPUT_LENGTH:
                        seq_list = chunk_sent(sent, MAX_INPUT_LENGTH)
                        for seq in seq_list:
                            sentno = docno + '_' + str(sentid)
                            if cv_fold == '5':
                                out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                    label, round(float(sim), 11), text, seq, qid, sentno, qid, self.didx - 1))
                            else:
                                out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                    label, round(float(sim), 16), text, seq, qid, sentno, self.qidx, self.didx))
                            out.flush()
                            sentid += 1
                            self.didx += 1
                    else:
                        sentno = docno + '_' + str(sentid)
                        if cv_fold == '5':
                            out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                label, round(float(sim), 11), text, sent, qid, sentno, qid, self.didx - 1))
                        else:
                            out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                label, round(float(sim), 16), text, sent, qid, sentno, self.qidx, self.didx))
                        out.flush()
                        sentid += 1
                        self.didx += 1
            self.qidx += 1
def process_entry(self, entry):
    """Given an entry, add a mongo id and other top-level attributes,
    then run it through scan_feed to recognize any entities mentioned.
    """
    abbr = self.abbr
    third = itemgetter(2)
    entry, matches = self.scan_entry(entry)
    matches = self.extract_entities(matches)
    ids = map(third, matches)
    strings = [m.group() for m, _, _ in matches]
    assert len(ids) == len(strings)

    # Add references and save in mongo.
    entry["state"] = abbr  # list probably wiser
    entry["entity_ids"] = ids or []
    entry["entity_strings"] = strings or []
    entry["save_time"] = datetime.datetime.utcnow()

    try:
        entry["_id"] = new_entry_id(entry)
    except BogusEntry:
        # This entry appears to be malformed somehow. Skip.
        msg = "Skipping malformed feed: %s"
        msg = msg % repr(entry)[:100] + "..."
        self.logger.info(msg)
        return

    entry["_type"] = "feedentry"

    try:
        entry["summary"] = clean_html(entry["summary"])
    except KeyError:
        return
    try:
        entry["summary_detail"]["value"] = clean_html(entry["summary_detail"]["value"])
    except KeyError:
        pass

    # Kill any keys that contain dots.
    entry = dict((k, v) for (k, v) in entry.items() if "." not in k)

    # Bail if the feed contains any banned key-value pairs.
    entry_set = self._dictitems_to_set(entry)
    for keyval_set in self._banned_keyvals:
        if entry_set & keyval_set:
            msg = "Skipped story containing banned key values: %r"
            self.logger.info(msg % keyval_set)
            return

    # Skip any entries that are missing required keys:
    required = set("summary source host link published_parsed".split())
    if required not in set(entry):
        if "links" not in entry:
            msg = "Skipped story lacking required keys: %r"
            self.logger.info(msg % (required - set(entry)))
            return
        else:
            source = entry["links"][-1].get("href")
            if source:
                host = urlparse.urlparse(entry["links"][0]["href"]).netloc
                entry["source"] = source
                entry["host"] = host
            else:
                msg = "Skipped story lacking required keys: %r"
                self.logger.info(msg % (required - set(entry)))
                return

    # Save
    msg = "Found %d related entities in %r"
    if ids:
        self.logger.info(msg % (len(ids), entry["title"]))
    else:
        self.logger.debug(msg % (len(ids), entry["title"]))
    return entry
str = '''<b>redy</b>
<img src="data:image/jpeg;base64,">
<<script></script>script>
alert("Haha, I hacked your page.");
<<script></script>script>
'''

html = '''
both <em id="foo" style="color: black">can</em> have <img id="bar" src="foo"/>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
<style>
body {background-image: url(javascript:do_evil)};
div {color: expression(evil)};
</style>
< onload="evil_function()">
<!-- I am interpreted for EVIL! -->
<img src="">
<a href="javascript:evil_function()">a link</a>
<a href="#" onclick="evil_function()">another link</a>
<p onclick="evil_function()">a paragraph</p>
<div style="display: none">secret EVIL!</div>
<object> of EVIL! </object>
<iframe src="evil-site"></iframe>
<form action="evil-site">
    Password: <input type="password" name="password">
</form>
<blink>annoying EVIL!</blink>
<a href="evil-site">spam spam SPAM!</a>
<image src="evil!">
</html>''' + str

print(clean_html(html))