def reply(self, slots: Dict[Slot, str], user_id=None) -> Union[SingleTextResponse, SingleImageResponse]:
    name, profession = self._initial_slots[Slot.Name], self._initial_slots[Slot.NameProfession]
    output = self._es.search(index='collection-index', body={
        "query": {
            "match": {
                "about_author.name": {
                    "query": name,
                    "fuzziness": "2"
                }
            }
        }
    })['hits']['hits']
    output = random.choice(output)
    hall = output["_source"]["hall"] if output["_source"]["hall"] else random.randint(1, 25)
    picture_name = "'{}'".format(output["_source"]["art_name"])
    text = f'{name}, основная отрасль искусства: {profession}. Страна {output["_source"]["country"]}. ' \
           f'Одно из популярных произведений {picture_name}. Посмотреть на шедевр можно в {hall} зале'
    raw_text = clean_html(output['_source']['text']).split('.')
    summary = '.'.join(raw_text[0:2]) if len(raw_text) >= 2 else '.'.join(raw_text) + '.'
    descr = clean_html(output['_source']['annotation']) if output['_source']['annotation'] != 'empty' \
        else summary
    if output["_source"]['img']:
        yield SingleImageResponse(is_finished=True,
                                  is_successful=True,
                                  text=text,
                                  img_url=f'https://pushkinmuseum.art{output["_source"]["img"]}',
                                  img_description=descr)
    else:
        yield SingleTextResponse(is_finished=True, is_successful=True, text=text)
def test():
    with codecs.open(text_path, 'r', encoding='utf-8') as fd:
        text = fd.read()
    with codecs.open(html_path, 'r', encoding='utf-8') as fd:
        expected_html = fd.read()
    actual_html = self.r(text)
    expected_result = clean_html(expected_html)
    actual_result = clean_html(actual_html)
    ok(actual_result).diff(expected_result)
def crawlPage(self, url):
    task = wrap_url(url)
    h = httplib2.Http()
    response, content = h.request(task.url, method="GET", headers=task.headers)
    return clean_html(content, task.encoding)
def __init__(self, input, url, **options):
    self.input = input
    self.url = url
    self.options = options
    self.doc = clean_html(input, url, return_doc=True)
    self.html = doc2html(self.doc)
def __init__(self, input, **options):
    self.input = input
    self.options = options
    self.debug = options.get('debug', True)
    self.title = options.get('title', '+^_^+')
    self.pages = options.get('pages', [])
    self.texts = options.get('texts', [])
    self.doc = clean_html(input, return_doc=True)
def __init__(self, input, **options):
    self.input = input
    self.options = options
    self.debug = options.get('debug', False)
    self.title = options.get('title', '+^_^+')
    self.pages = options.get('pages', [])
    self.texts = options.get('texts', None)
    self.doc = clean_html(input, return_doc=True)
def reply(self, slots: Dict[Slot, str], user_id=None) -> Union[SingleTextResponse, SingleImageResponse]:
    event_name = self._initial_slots[Slot.EventName]
    output = self._es.search(index='event-index', body={
        "query": {
            "match": {
                "event_name": {
                    "query": event_name,
                    "fuzziness": "2"
                }
            }
        }
    })['hits']['hits'][0]['_source']
    data_begin, data_end = output['dateBegin'], output['dateEnd']
    name = output['event_name']
    halls = output['halls'] if output['halls'] else 'уточняется'
    event_type = output['type']
    price = clean_html(output['price']) if output['price'] else 'уточняется'
    raw_text = clean_html(output['text']).split('.')
    summary = '.'.join(raw_text[0:5]) if len(raw_text) >= 5 else '.'.join(raw_text) + '.'
    text = (f"{name}. Тип мероприятия: {event_type}. Будет проходить с {data_begin} по {data_end}. "
            f"Место проведения: {halls}. Стоимость билетов: {price}.\n"
            f"Коротко о событии: {summary}.").replace('\n<br />', '').replace(' 00:00:00', '')
    img = output['img'] if output['img'] else output['extra_img']
    if img:
        yield SingleImageResponse(is_finished=True,
                                  is_successful=True,
                                  text=text,
                                  img_url=f'https://pushkinmuseum.art/{img}',
                                  img_description='')
    else:
        yield SingleTextResponse(is_finished=True, is_successful=True, text=text)
async def inline_exchange_rates(inline_query: InlineQuery, amount: [int, float], from_coins: str, to_coins: str):
    compare = get_price(from_coins, to_coins)
    text = _get_text(compare, amount)
    input_content = InputTextMessageContent(text)
    item = InlineQueryResultArticle(id=generate_inline_id(inline_query.query),
                                    title='Cryptocurrency',
                                    description=clean_html(text),
                                    thumb_url='https://shorturl.at/dkrtD',
                                    input_message_content=input_content)
    await bot.answer_inline_query(inline_query.id, results=[item], cache_time=1)
def __init__(self, input, url, **options):
    self.input = input
    self.url = url
    self.options = options
    if 'title' in options:
        self._title = options.get('title')
    if 'pages' in options:
        self._pages = options.get('pages')
    self.doc = clean_html(input, url, return_doc=True)
    self.html = doc2html(self.doc)
def reply(self, slots: Dict[Slot, str], user_id=None) -> Union[SingleTextResponse, SingleImageResponse]:
    art_name = self._initial_slots[Slot.ArtName]
    output = self._es.search(index='collection-index', body={
        "query": {
            "match": {
                "art_name": {
                    "query": art_name,
                    "fuzziness": "2"
                }
            }
        }
    })['hits']['hits'][0]
    author = output['_source']['about_author']['name'] if 'about_author' in output['_source'] else 'неизвестен'
    hall = output["_source"]["hall"] if output["_source"]["hall"] != 'empty' else random.randint(1, 25)
    text = f'Работа {art_name}. Автор {author}. Посмотреть на шедевр можно в зале {hall}'
    raw_text = clean_html(output['_source']['text']).split('.')
    summary = '.'.join(raw_text[0:2]) if len(raw_text) >= 2 else '.'.join(raw_text) + '.'
    descr = clean_html(output['_source']['annotation']) if output['_source']['annotation'] != 'empty' \
        else summary
    if output["_source"]['img']:
        yield SingleImageResponse(is_finished=True,
                                  is_successful=True,
                                  text=text,
                                  img_url=f'https://pushkinmuseum.art{output["_source"]["img"]}',
                                  img_description=descr)
    else:
        yield SingleTextResponse(is_finished=True, is_successful=True, text=f'{text}\n{descr}')
def __init__(self, input, **options):
    self.input = input
    self.url = options.get('url', '')
    self.debug = options.get('debug', False)
    self.title = options.get('title', '^^')
    self.pages = options.get('pages', None)
    self.texts = options.get('texts', None)
    self.domain = get_domain(self.url)
    self.options = options
    self.doc = clean_html(input, return_doc=True)
    self.text = self.doc.text_content()
    self.len = word_count(self.text) if self.text else 0
def search_query(self, searcher, query, output_fn, collection='robust04', K=1000):
    output_dir = os.path.dirname(output_fn)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_fn, 'w', encoding="utf-8") as out:
        sentid2text = {}
        hits = searcher.search(self.JString(query), K)
        for i in range(len(hits)):
            sim = hits[i].score
            docno = hits[i].docid
            content = hits[i].content
            if collection == 'core18':
                content_json = json.loads(content)
                content = ''
                for each in content_json['contents']:
                    if each is not None and 'content' in each.keys():
                        content += '{}\n'.format(each['content'])
            if collection == 'robust04':
                content = parse_doc_from_index(content)
            clean_content = clean_html(content, collection=collection)
            tokenized_content = tokenizer.tokenize(clean_content)
            sentid = 0
            for sent in tokenized_content:
                # Split sentence if it's longer than BERT's maximum input length
                if len(sent.strip().split()) > MAX_INPUT_LENGTH:
                    seq_list = chunk_sent(sent, MAX_INPUT_LENGTH)
                    for seq in seq_list:
                        sentno = docno + '_' + str(sentid)
                        out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                            0, round(float(sim), 16), query, seq, 0, sentno, 0, self.didx))
                        out.flush()
                        sentid += 1
                        self.didx += 1
                        sentid2text[sentno] = seq
                else:
                    sentno = docno + '_' + str(sentid)
                    out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        0, round(float(sim), 16), query, sent, 0, sentno, 0, self.didx))
                    out.flush()
                    sentid += 1
                    self.didx += 1
                    sentid2text[sentno] = sent
    return sentid2text
def getHTML(url: str, wdfId: str, connection: MySQL):
    with connection as db:
        lastDay = db.getLastDayContents(url)
        if lastDay:
            return
    if ("http://" in url or "https://" in url) and (url[:17] != "http://localhost:" and url[:17] != "http://localhost/"):
        # Detect if content is even HTML
        customHeaders = {
            'User-Agent': 'server:ch.sdipi.wdf:v3.1.0 (by /u/protectator)',
        }
        try:
            contentHead = requests.head(url, headers=customHeaders)
        except requests.exceptions.SSLError as e:
            print("SSL Exception while getHTML of " + url)
            print(e)
            return
        if 'content-type' in contentHead.headers:
            if 'html' not in contentHead.headers['content-type']:
                return
        chrome_process = subprocess.run(["node", "./js/index.js", url], stdout=subprocess.PIPE)
        if chrome_process.returncode == 0:
            htmlContentRaw = chrome_process.stdout.decode('utf8')
            htmlContent = re.sub("<", " <", htmlContentRaw)
            with connection as db:
                htmlParsed = lxml.html.fromstring(htmlContent)
                try:
                    title = htmlParsed.find(".//title").text
                except:
                    title = ""
                cleaner = Cleaner()
                cleaner.javascript = True
                cleaner.style = True
                textClean = cleaner.clean_html(htmlParsed).text_content()
                lang = detect(textClean)
                try:
                    stop_words = get_stop_words(lang)
                except:
                    stop_words = []
                bestText = clean_html(htmlContent, stop_words, (lang == 'en'))
                db.content(wdfId, url, htmlContent, lang, title)
                db.setContentText(url, bestText, title, lang)
        else:
            print(chrome_process.stdout)
def clean_texts(self, html):
    doc = clean_html(html, return_doc=True)
    html = ''
    for child in doc.getchildren():
        if child.getchildren():
            html += doc2html(child)
            continue
        text = child.text_content() or ''
        text = self.clean_text(text.strip())
        if text:
            child.text = text
            html += doc2html(child)
    return html
def scan_entry(self, entry):
    """Test an entry against the trie to see if any entities are found.
    """
    # Search the trie.
    matches = []
    try:
        summary = clean_html(entry["summary"])
    except KeyError:
        # This entry has no summary. Skip.
        return entry, []
    matches += trie_scan(self.trie, summary)
    return entry, matches
def scan_entry(self, entry):
    '''Test an entry against the trie to see if any entities are found.
    '''
    # Search the trie.
    matches = []
    try:
        summary = clean_html(entry['summary'])
    except KeyError:
        # This entry has no summary. Skip.
        return entry, []
    matches += trie_scan(self.trie, summary)
    return entry, matches
def run(self):
    html = get_or_cache(self.url)
    doc = clean_html(html, self.url, return_doc=True)
    urls = html2urls(html, self.url, name=False)
    urls = sorted(urls, key=self.score)
    name = tag2text(doc, 'meta', property="og:site_name")
    if name:
        self.name = self.text = name
    else:
        cnt = 10
        while cnt <= 100:
            if self.get_name(urls[:cnt]):
                print self.domain, cnt
                break
            cnt += 10
    if self.name is not None:
        self.get_sub(urls)
def process_entry(self, entry):
    '''Given an entry, add a mongo id and other top-level attributes,
    then run it through scan_feed to recognize any entities mentioned.
    '''
    abbr = self.abbr
    third = itemgetter(2)
    entry, matches = self.scan_entry(entry)
    matches = self.extract_entities(matches)
    ids = map(third, matches)
    strings = [m.group() for m, _, _ in matches]
    assert len(ids) == len(strings)

    # Add references and save in mongo.
    entry['state'] = abbr  # list probably wiser
    entry['entity_ids'] = ids or []
    entry['entity_strings'] = strings or []
    entry['save_time'] = datetime.datetime.utcnow()

    try:
        entry['_id'] = new_feed_id(entry)
    except BogusEntry:
        # This entry appears to be malformed somehow. Skip.
        msg = 'Skipping malformed feed: %s'
        msg = msg % repr(entry)[:100] + '...'
        self.logger.info(msg)
        return

    entry['_type'] = 'feedentry'
    entry['summary'] = clean_html(entry['summary'])
    try:
        entry['summary_detail']['value'] = clean_html(
            entry['summary_detail']['value'])
    except KeyError:
        pass

    # Kill any keys that contain dots.
    entry = dict((k, v) for (k, v) in entry.items() if '.' not in k)

    # Bail if the feed contains any banned key-value pairs.
    entry_set = self._dictitems_to_set(entry)
    for keyval_set in self._banned_keyvals:
        if entry_set & keyval_set:
            msg = 'Skipped story containing banned key values: %r'
            self.logger.info(msg % keyval_set)
            return

    # Skip any entries that are missing required keys:
    required = set('summary source host link published_parsed'.split())
    if required not in set(entry):
        if 'links' not in entry:
            msg = 'Skipped story lacking required keys: %r'
            self.logger.info(msg % (required - set(entry)))
            return
        else:
            source = entry['links'][-1].get('href')
            if source:
                host = urlparse.urlparse(entry['links'][0]['href']).netloc
                entry['source'] = source
                entry['host'] = host
            else:
                msg = 'Skipped story lacking required keys: %r'
                self.logger.info(msg % (required - set(entry)))
                return

    # Save
    msg = 'Found %d related entities in %r'
    self.logger.info(msg % (len(ids), entry['title']))
    return entry
#print(abstract_command)
abstract_result = subprocess.getoutput(abstract_command)
try:
    abstract_result = json.loads(abstract_result)
except JSONDecodeError as ex:
    print(ex)
    print('-----------abs--------')
    print(abstract_result)
    update_cookies()
    inner_fail_time += 1
    sleep(inner_fail_time ** 2)
    continue
try:
    abstract = abstract_result['abstractInfoDTO']['abIndexList'][0]['value']  # abstract text
    abstract = clean_html(abstract)
    CPC = ''
    for item in abstract_result['abstractInfoDTO']['abstractItemList']:
        if item['indexCode'] == 'CPC':
            CPC = item['value']
    #figure_id = abstract_result['abstractInfoDTO']['figureRid']
except KeyError as ex:
    print(ex)
    print('---------abs--key-------')
    #print(abstract_result)
    inner_fail_time += 1
    sleep(inner_fail_time ** 2)
    continue
'''
if figure_id != None:
def html2meta(html, url):
    doc = clean_html(html, url, return_doc=True)
    return doc2meta(doc, url)
def search_document(self, searcher, qid2docid, qid2text, output_fn, collection='robust04', K=1000, topics=None, cv_fold=None):
    output_dir = os.path.dirname(output_fn)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_fn, 'w', encoding="utf-8") as out:
        if 'core' in collection:
            # Robust04 provides CV topics
            topics = qid2text
        for qid in topics:
            text = qid2text[qid]
            hits = searcher.search(self.JString(text), K)
            for i in range(len(hits)):
                sim = hits[i].score
                docno = hits[i].docid
                label = 1 if qid in qid2docid and docno in qid2docid[qid] else 0
                content = hits[i].content
                if collection == 'core18':
                    content_json = json.loads(content)
                    content = ''
                    for each in content_json['contents']:
                        if each is not None and 'content' in each.keys():
                            content += '{}\n'.format(each['content'])
                if collection == 'robust04':
                    content = parse_doc_from_index(content)
                clean_content = clean_html(content, collection=collection)
                tokenized_content = tokenizer.tokenize(clean_content)
                sentid = 0
                for sent in tokenized_content:
                    # Split sentence if it's longer than BERT's maximum input length
                    if len(sent.strip().split()) > MAX_INPUT_LENGTH:
                        seq_list = chunk_sent(sent, MAX_INPUT_LENGTH)
                        for seq in seq_list:
                            sentno = docno + '_' + str(sentid)
                            if cv_fold == '5':
                                out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                    label, round(float(sim), 11), text, seq, qid, sentno, qid, self.didx - 1))
                            else:
                                out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                    label, round(float(sim), 16), text, seq, qid, sentno, self.qidx, self.didx))
                            out.flush()
                            sentid += 1
                            self.didx += 1
                    else:
                        sentno = docno + '_' + str(sentid)
                        if cv_fold == '5':
                            out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                label, round(float(sim), 11), text, sent, qid, sentno, qid, self.didx - 1))
                        else:
                            out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                label, round(float(sim), 16), text, sent, qid, sentno, self.qidx, self.didx))
                        out.flush()
                        sentid += 1
                        self.didx += 1
            self.qidx += 1
def process_entry(self, entry):
    """Given an entry, add a mongo id and other top-level attributes,
    then run it through scan_feed to recognize any entities mentioned.
    """
    abbr = self.abbr
    third = itemgetter(2)
    entry, matches = self.scan_entry(entry)
    matches = self.extract_entities(matches)
    ids = map(third, matches)
    strings = [m.group() for m, _, _ in matches]
    assert len(ids) == len(strings)

    # Add references and save in mongo.
    entry["state"] = abbr  # list probably wiser
    entry["entity_ids"] = ids or []
    entry["entity_strings"] = strings or []
    entry["save_time"] = datetime.datetime.utcnow()

    try:
        entry["_id"] = new_entry_id(entry)
    except BogusEntry:
        # This entry appears to be malformed somehow. Skip.
        msg = "Skipping malformed feed: %s"
        msg = msg % repr(entry)[:100] + "..."
        self.logger.info(msg)
        return

    entry["_type"] = "feedentry"

    try:
        entry["summary"] = clean_html(entry["summary"])
    except KeyError:
        return
    try:
        entry["summary_detail"]["value"] = clean_html(entry["summary_detail"]["value"])
    except KeyError:
        pass

    # Kill any keys that contain dots.
    entry = dict((k, v) for (k, v) in entry.items() if "." not in k)

    # Bail if the feed contains any banned key-value pairs.
    entry_set = self._dictitems_to_set(entry)
    for keyval_set in self._banned_keyvals:
        if entry_set & keyval_set:
            msg = "Skipped story containing banned key values: %r"
            self.logger.info(msg % keyval_set)
            return

    # Skip any entries that are missing required keys:
    required = set("summary source host link published_parsed".split())
    if required not in set(entry):
        if "links" not in entry:
            msg = "Skipped story lacking required keys: %r"
            self.logger.info(msg % (required - set(entry)))
            return
        else:
            source = entry["links"][-1].get("href")
            if source:
                host = urlparse.urlparse(entry["links"][0]["href"]).netloc
                entry["source"] = source
                entry["host"] = host
            else:
                msg = "Skipped story lacking required keys: %r"
                self.logger.info(msg % (required - set(entry)))
                return

    # Save
    msg = "Found %d related entities in %r"
    if ids:
        self.logger.info(msg % (len(ids), entry["title"]))
    else:
        self.logger.debug(msg % (len(ids), entry["title"]))
    return entry
str = '''<b>redy</b>
<img src="data:image/jpeg;base64,">
<<script></script>script>
alert("Haha, I hacked your page.");
<<script></script>script>
'''

html = '''
both <em id="foo" style="color: black">can</em> have <img id="bar" src="foo"/>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
<style>
body {background-image: url(javascript:do_evil)};
div {color: expression(evil)};
</style>
< onload="evil_function()">
<!-- I am interpreted for EVIL! -->
<img src="">
<a href="javascript:evil_function()">a link</a>
<a href="#" onclick="evil_function()">another link</a>
<p onclick="evil_function()">a paragraph</p>
<div style="display: none">secret EVIL!</div>
<object> of EVIL! </object>
<iframe src="evil-site"></iframe>
<form action="evil-site">
    Password: <input type="password" name="password">
</form>
<blink>annoying EVIL!</blink>
<a href="evil-site">spam spam SPAM!</a>
<image src="evil!">
</html>''' + str

print(clean_html(html))