def extract_one(self, item, response):
    name = extract_text(item.css('.child_title').get(), guess_layout=False)
    link = item.css('.click_for_more_container>a::attr(href)').get()
    if link:
        link = urljoin(response.url, link)
    else:
        link = ''
    price = item.css('.centered .styleColor::text').get()
    price = price.split('€')[-1].strip().replace(',', '')
    uid = item.xpath(
        './/div[@class="item-box-desc"]/p/i[contains(text(), "Product Code")]/text()'
    ).get()
    if uid:
        uid = uid.split(':')[-1].strip()
    ean = item.xpath(
        './/div[@class="item-box-desc"]/p/i[contains(text(), "Barcode/EAN")]/text()'
    ).get()
    if ean:
        ean = ean.split(':')[-1].strip()
    return {
        'ean': ean,
        'link': link,
        'name': name,
        'price': float(price),
        'spider': self.name,
        'T': utc_time(),
        'uid': uid,
    }
def printContentOnConsole(text):
    if "html-text" not in sys.modules:
        import html_text
    print(html_text.extract_text(text))
    #subprocess.Popen(["echo", html_text.extract_text(text)])
    #sys.stdout.flush()
    return text
def post(masto, body, instance, title=None, direction='ltr', in_reply_to=None):
    # Markdown more than we need, to [hopefully] discard chopped markup.
    summary = extract_text(markdown(body.strip()))[:140]
    hashtags = get_hashtags(body, ignore=summary)
    mentions = get_mentions(body, ignore=summary)
    irt_id = in_reply_to and in_reply_to.get('id') or None
    body = linkify_hashtags(linkify_mentions(body), instance)
    if direction == 'rtl':
        body = u"""<div dir="rtl">

{}

</div>""".format(markdown(body))
    if in_reply_to:
        body = u"""#### In reply to [@{}]({}):

{}""".format(in_reply_to['account']['username'], in_reply_to['url'], body)
    gist = make_gist(
        title or u"A gistodon toot, {} GMT".format(time.asctime(time.gmtime())),
        body + u"""

###### Generated by [Gistodon](https://github.com/thedod/gistodon/#readme).""")
    if NO_TOOTING:
        return gist
    status = u'{}... {}'.format(summary, gist)
    if hashtags or mentions:
        status += u'\n' + u' '.join(hashtags.union(mentions))
    return masto.status_post(status, spoiler_text=title, in_reply_to_id=irt_id)['url']
def switchToCli():
    if "html-text" not in sys.modules:
        import html_text
    # Print the content of the window from which the context menu was called
    if config.pluginContext == "study":
        print(html_text.extract_text(config.studyWindowContent))
    else:
        print(html_text.extract_text(config.bibleWindowContent))
    # Hide gui
    if platform.system() == "Darwin":
        config.mainWindow.showMinimized()
    else:
        config.mainWindow.hide()
    # Cli input
    config.cli = True
    toQuit = False
    #config.printContentOnConsole = printContentOnConsole
    config.bibleWindowContentTransformers.append(printContentOnConsole)
    config.studyWindowContentTransformers.append(printContentOnConsole)
    while config.cli:
        print("--------------------")
        print("Enter '.bible' to read bible text, '.study' to read study resource, '.help' to read UBA command reference, '.gui' to launch gui, '.quit' to quit,")
        command = input("or UBA command: ").strip()
        if command == ".gui":
            del config.bibleWindowContentTransformers[-1]
            del config.studyWindowContentTransformers[-1]
            config.cli = False
        elif command == ".bible":
            print(html_text.extract_text(config.bibleWindowContent))
        elif command == ".study":
            print(html_text.extract_text(config.studyWindowContent))
        elif command in (".help", ".command"):
            getCommandDocumentation()
        elif command == ".quit":
            toQuit = True
            config.cli = False
        else:
            config.mainWindow.runTextCommand(command)
    if toQuit:
        config.mainWindow.quitApp()
    else:
        app.setApplicationName("UniqueBible.app")
        if platform.system() == "Darwin":
            config.mainWindow.showMaximized()
            config.mainWindow.raise_()
        else:
            config.mainWindow.show()
def get_body(self, response):
    xpath = "//div[@id = 'article']/descendant::p[not(ancestor::div/@class='aboutAuthor')]"
    body = response.xpath(xpath).getall()
    body = [extract_text(para) for para in body]
    # check if first line is real text
    if len(body[0].split()) < 5:
        body = body[1:]
    return "\n\n".join(body)
def get_paragraphs_html_text(str_text, mode):
    """ using html_text """
    try:
        text = html_text.extract_text(str_text, guess_layout=False)
    except TypeError:
        return ['']
    return re.split("\n", text)
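# A minimal usage sketch for get_paragraphs_html_text(); the sample HTML string and
# the mode=None argument are assumptions for illustration (the function above
# accepts `mode` but does not use it), not part of the original project.
sample_paragraphs = get_paragraphs_html_text(
    '<p>First paragraph.</p><p>Second paragraph.</p>', mode=None)
print(sample_paragraphs)  # the extracted text, split on any literal newlines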
def test_webpages():
    webpages = sorted(glob.glob('./test_webpages/*.html'))
    extracted = sorted(glob.glob('./test_webpages/*.txt'))
    for page, extr in zip(webpages, extracted):
        with open(page, 'r', encoding='utf8') as f_in:
            html = f_in.read()
        with open(extr, 'r', encoding='utf8') as f_in:
            expected = f_in.read()
        assert extract_text(html) == expected
async def security_check(page, response):
    html = await page.content()
    text = html_text.extract_text(html)
    error_level = len([r for r in error_regs if r.search(text)])
    if len(text) < 1000:
        error_level += 1
    if response and not response.ok:
        error_level += 1
    return error_level
def main():
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        output[item_id] = {'articleBody': html_text.extract_text(html)}
    (Path('output') / 'html-text.json').write_text(json.dumps(
        output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
def _test(train_response):
    model = decode_object(train_response.pop('model'))  # type: BaseModel
    pprint(train_response)
    pprint(json.loads(train_response['quality']))
    page_neg, page_pos = pages[:2]
    pred_proba = lambda page: \
        model.predict_proba([{'text': extract_text(page['html'])}])[0][1]
    assert pred_proba(page_pos) > 0.5
    assert pred_proba(page_neg) < 0.5
    return train_response
def parse_dataprovider_publicmessagedetail(self, response):
    # TODO
    # print(self, response.text)
    response_json = response.json()
    msg = Message(
        oid=response.meta['messageOID'],
        date=response_json["time"],  # TODO: Parse
        title=html_text.extract_text(response_json["subject"]),
        body=html_text.extract_text(response_json["body"]),
        publication_id=response_json["tenderOID"],
        data={
            "authorityKey": response_json["authorityKey"],
            "tender": response_json["tender"],
        },
    )
    if "attachmentLink" in response_json:
        msg['file_name'] = response_json["fileName"]
        msg['file_urls'] = [response.urljoin(response_json["attachmentLink"])]
    return msg
def text_worker(item, html_field):
    url = item.get('url')
    html = item.get(html_field)
    try:
        text = html_text.extract_text(html)
    except UnicodeEncodeError:
        return None
    text_item = {'text': text}
    if url is not None:
        text_item['url'] = url
    return text_item
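# A minimal sketch of how text_worker() might be invoked; the field name
# 'raw_html' and the item dict are hypothetical, chosen only for illustration.
sample_item = {'url': 'https://example.com/page', 'raw_html': '<p>Hello <b>world</b>!</p>'}
print(text_worker(sample_item, 'raw_html'))
# Expected shape: {'text': ..., 'url': 'https://example.com/page'}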
def relevancy(self, response: Response) -> float:
    if not isinstance(response, TextResponse):
        # XXX: only text responses are supported
        return 0.0
    if self.classifier_input == 'vector':
        x = self._page_vector(response)
    elif self.classifier_input == 'text':
        x = html_text.extract_text(response.text)
    elif self.classifier_input == 'text_url':
        x = {
            'text': html_text.extract_text(response.text),
            'url': response.url,
        }
    elif self.classifier_input == 'html':
        x = response.text
    else:
        raise ValueError("self.classifier_input is invalid")
    return float(self.relevancy_clf.predict_proba([x])[0, 1])
def main():
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        doc = Document(html)
        text = html_text.extract_text(doc.summary(html_partial=True))
        output[item_id] = {'articleBody': text}
    (Path('output') / 'readability.json').write_text(json.dumps(
        output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
def test_webpages(page, extracted):
    html = _load_file(page)
    if not six.PY3:
        # FIXME: produces '\xa0' in Python 2, but ' ' in Python 3
        # this difference is ignored in this test.
        # What is the correct behavior?
        html = html.replace('&nbsp;', ' ')
    expected = _load_file(extracted)
    assert extract_text(html) == expected
    tree = cleaner.clean_html(parse_html(html))
    assert etree_to_text(tree) == expected
def test_guess_layout():
    html = (u'<title> title </title><div>text_1.<p>text_2 text_3</p>'
            '<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>'
            '<p>text_6<em>text_7</em>text_8</p>text_9</div>'
            '<script>document.getElementById("demo").innerHTML = '
            '"This should be skipped";</script> <p>...text_10</p>')

    text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \
           'text_8 text_9 ...text_10'
    assert extract_text(html, guess_punct_space=False, guess_layout=False) == text

    text = ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5'
            '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')
    assert extract_text(html, guess_punct_space=False, guess_layout=True) == text

    text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \
           'text_8 text_9...text_10'
    assert extract_text(html, guess_punct_space=True, guess_layout=False) == text

    text = 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5\n\n' \
           'text_6 text_7 text_8\n\ntext_9\n\n...text_10'
    assert extract_text(html, guess_punct_space=True, guess_layout=True) == text
def htmlToPlainText(content):
    if isHtmlTextInstalled:
        content = html_text.extract_text(content)
    elif isBeautifulsoup4Installed:
        content = re.sub("(</th>|</td>)", r"\1 ", content)
        content = re.sub("(<br>|<br/>|</tr>)", r"\1\n", content)
        content = re.sub("(</h[0-9]>|</p>|</div>|<hr>)", r"\1\n\n", content)
        content = BeautifulSoup(content, "html5lib").get_text()
    else:
        content = re.sub("<br/?>|<br>", "\n", content)
        content = re.sub('<[^<]+?>', '', content)
    return content
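# A small usage sketch for htmlToPlainText(); the sample markup is invented, and
# which branch runs depends on the isHtmlTextInstalled / isBeautifulsoup4Installed
# flags set elsewhere in the module.
sample_html = "<h2>Psalm 23</h2><p>The LORD is my shepherd;<br>I shall not want.</p>"
print(htmlToPlainText(sample_html))
# With html_text available this goes through extract_text(); otherwise the
# BeautifulSoup or bare-regex fallbacks approximate the same line breaks.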
def get_text_html(href):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
    #req = requests.get(href, headers=headers)
    req = reliable_request(href)
    html = req.text
    text = html_text.extract_text(html)
    return text, html
def parse(self, response, date_check=True):
    articles = response.css("div.m-headlineTease__info")
    for article in articles:
        url = article.xpath(".//a/@href").get()
        if self.in_urls(url):
            continue
        dt = extract_text(article.xpath(".//time").get()).strip()
        if "days" in dt and dt[0].isnumeric():
            dt = self.today - timedelta(int(dt[0]))
            if self.cutoff > dt and date_check:
                break
        yield scrapy.Request(url=url, callback=self.art_parse, headers=self.headers)
def extract_one(self, item):
    name = extract_text(item.css('.desc-zone a[itemprop="url"]').get(),
                        guess_layout=False)
    link = item.css('.desc-zone h5 a::attr(href)').get()
    mfr = item.css('.skus .sku[data-selenium="sku"]::text').get()
    idata = json.loads(item.css('::attr(data-itemdata)').get())
    return {
        'T': utc_time(),
        'link': link,
        'mfr': mfr,
        'name': name.strip(),
        'price': float(idata['price']),
        'spider': self.name,
        'uid': idata['sku'],
    }
async def fetch(spider, url, **spider_kwargs):
    _, page = await spider.get(url, **spider_kwargs)
    is_login = await is_login_page(page)
    if is_login:
        print("Detected login page.")
        await login(page)
    html = await page.content()
    text = extract_text(html)
    if "Let's do a quick security check" in text:
        input("SOLVE SECURITY CHECK")
    elif 'You’ve made too many requests in too short a time. Please try again later.' in text:
        print("TOO MANY REQUESTS! sleeping")
        await asyncio.sleep(60 * 11)
        return await fetch(spider, url, **spider_kwargs)
    return page
def startWithCli():
    if "html-text" not in sys.modules:
        import html_text
    # Cli input
    #config.mainWindow.hide()
    #config.cli = True
    #config.printContentOnConsole = printContentOnConsole
    config.bibleWindowContentTransformers.append(printContentOnConsole)
    config.studyWindowContentTransformers.append(printContentOnConsole)
    while config.cli:
        print("--------------------")
        print("Enter '.bible' to read bible text, '.study' to read study resource, '.help' to read UBA command reference, '.gui' to launch gui, '.quit' to quit,")
        command = input("or UBA command: ").strip()
        if command == ".gui":
            del config.bibleWindowContentTransformers[-1]
            del config.studyWindowContentTransformers[-1]
            config.cli = False
        elif command == ".bible":
            print(html_text.extract_text(config.bibleWindowContent))
            # The following line does not work on Windows on UBA startup
            #config.mainWindow.mainPage.runJavaScript("document.documentElement.outerHTML", 0, printContentOnConsole)
        elif command == ".study":
            print(html_text.extract_text(config.studyWindowContent))
            # The following line does not work on Windows on UBA startup
            #config.mainWindow.studyPage.runJavaScript("document.documentElement.outerHTML", 0, printContentOnConsole)
        elif command in (".help", ".command"):
            getCommandDocumentation()
        elif command == ".quit":
            exit()
        else:
            config.mainWindow.runTextCommand(command)
    config.mainWindow.show()
    if platform.system() == "Windows":
        config.mainWindow.showMaximized()
def extract_features(doc):
    html = doc['html'] or ''
    if not doc_is_extra_sampled(doc):
        try:
            html = gzip.decompress(base64.b64decode(html)).decode('utf8')
        except Exception:
            pass  # support not compressed html too
    text = html_text.extract_text(html)
    try:
        lang = langdetect.detect(text)
    except LangDetectException:
        lang = None
    return {
        'text': text,
        'language': lang,
    }
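# A hedged round-trip sketch for extract_features(); it assumes doc_is_extra_sampled()
# (not shown here) returns False for this document, and the sample HTML is invented.
import base64
import gzip

raw_html = '<html><body><p>Hello world, this is a compressed document.</p></body></html>'
sample_doc = {'html': base64.b64encode(gzip.compress(raw_html.encode('utf8'))).decode('ascii')}
features = extract_features(sample_doc)
# features['text'] holds the extracted text; features['language'] is whatever
# langdetect guesses for it (or None if detection fails).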
def extract_text_from_pdf(pdf_path):
    text = ""
    resource_manager = PDFResourceManager()
    fake_file_handle = io.BytesIO()
    converter = HTMLConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)
        text = fake_file_handle.getvalue().decode()
    converter.close()
    fake_file_handle.close()
    text = html_text.extract_text(text)
    file = open(pdf_path.replace('.pdf', '.txt'), 'w')
    file.write(text)
    file.close()
def parse(self, response, date_check=True):
    articles = response.css(".archive-item-component__info")
    for article in articles:
        dt = extract_text(article.css("time").get())
        dt = self.strptime(dt, "%B %d, %Y")
        if dt and date_check:
            if dt < self.cutoff:
                break
        url = urljoin(self.baseurl, article.xpath("a/@href").get())
        if self.in_urls(url):
            continue
        yield scrapy.Request(
            url=url,
            callback=self.art_parse,
            headers=self.headers,
            cb_kwargs=dict(dt=dt),
        )
def parse(self, response, date_check=True):
    articles = response.css(".post_wrapper")
    for art in articles:
        dt = extract_text(
            art.xpath(".//*[contains(@class, 'post_detail')]/a").get())
        dt = self.strptime(dt, "%B %d, %Y")
        if dt and date_check:
            if dt < self.cutoff:
                break
        url = art.xpath(".//h3/a/@href").get()
        if self.in_urls(url):
            continue
        yield scrapy.Request(
            url=url,
            headers=self.headers,
            callback=self.art_parse,
            cb_kwargs=dict(dt=dt),
        )
def get_info(self, callback):
    items = MovieScrapItem3()
    name = callback.css('#firstHeading::text')[0].extract()
    try:
        image = 'http:' + callback.css(
            '.vcard img::attr(src)')[0].extract()
    except Exception as e:
        image = None
    born = callback.css('tr:nth-child(3) td span::text')[1].extract()
    content = callback.css('.vcard+ p')
    for para in content:
        html = para.get()
        bio = extract_text(html)
        items['name'] = name
        items['image'] = image
        items['born'] = born
        items['bio'] = bio
        yield items
def parse(self, response):
    cols = "rank name gain pips drawdown trades type monthly chart price age added action"
    skip = [7, 8, 11, 12]

    def age_to_months(t):
        t = t.replace('m', 'M')
        d = durations.Duration(t)
        return d.to_months()

    postprocess = {
        'age': lambda t: age_to_months(t)
    }
    td = dict()
    for i, col in enumerate(cols.split()):
        td[i] = col
    Behold().show('td')
    for provider in response.xpath("//div[@class='row']//tr"):
        data_row = dict()
        Behold().show('provider')
        details_url = None
        for i, datum in enumerate(provider.xpath('td')):
            Behold().show('i', 'datum')
            if i == 1:  # name
                details_url = datum.css("a::attr(href)").get()
            if i in skip:
                print(".....skipping")
                continue
            text = html_text.extract_text(datum.get())
            column_name = td[i]
            if column_name in postprocess:
                text = postprocess[column_name](text)
            data_row[column_name] = text
        if details_url:
            yield scrapy.Request(url=details_url,
                                 callback=self.parse_details,
                                 meta={'data_row': data_row})
    # next_page = response.css('.fa-angle-right').get()
    # if next_page is not None:
    #     yield response.follow(next_page, self.parse)
async def _get_probability_blocked(self, response: Any) -> int:
    """Return an integer representing the probability that this worker
    encountered a blocking-related error, i.e. the server admin has blocked
    the IP address for making too many requests.

    Any value > 1 can be assumed to mean that the page is blocked.
    """
    try:
        self._text = html_text.extract_text(self._html)
        error_level = len(
            [r for r in self.block_error_regs if r.search(self._text)])
        if len(self._text) < 1000:
            error_level += 1
        if response.status > 399:
            error_level += 1
        if error_level > 1:
            self.logger.warning(
                f"You may have been blocked! (Block probability level {error_level})")
        return error_level
    except Exception as e:
        self.logger.error(f"Could not get block probability!: {e}")
def parse(self, response):
    y = response.css(".article-thumb")
    for el in y:
        # meta is a string of how many days ago the article was published,
        # e.g. "1 week ago", "5 days ago", etc.
        meta = extract_text(el.css(".article-thumb__meta").get())
        # discard any article that is more than 4 days old
        words = ["6", "week"]
        if any([k in meta for k in words]):
            break
        url = el.css("a::attr(href)").get()
        if self.in_urls(url):
            continue
        yield scrapy.Request(
            url=url,
            callback=self.art_parse,
            headers=self.headers,
            cb_kwargs=dict(dt=None),
        )