Example 1
 def extract_one(self, item, response):
     name = extract_text(item.css('.child_title').get(), guess_layout=False)
     link = item.css('.click_for_more_container>a::attr(href)').get()
     if link:
         link = urljoin(response.url, link)
     else:
         link = ''
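     # Keep whatever follows the '€' sign and strip commas before converting the price to float.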
     price = item.css('.centered .styleColor::text').get()
     price = price.split('€')[-1].strip().replace(',', '')
     uid = item.xpath(
         './/div[@class="item-box-desc"]/p/i[contains(text(), "Product Code")]/text()'
     ).get()
     if uid:
         uid = uid.split(':')[-1].strip()
     ean = item.xpath(
         './/div[@class="item-box-desc"]/p/i[contains(text(), "Barcode/EAN")]/text()'
     ).get()
     if ean:
         ean = ean.split(':')[-1].strip()
     return {
         'ean': ean,
         'link': link,
         'name': name,
         'price': float(price),
         'spider': self.name,
         'T': utc_time(),
         'uid': uid,
     }
Example 2
def printContentOnConsole(text):
    if not "html-text" in sys.modules:
        import html_text
    print(html_text.extract_text(text))
    #subprocess.Popen(["echo", html_text.extract_text(text)])
    #sys.stdout.flush()
    return text
Example 3
def post(masto, body, instance, title=None, direction='ltr', in_reply_to=None):
    # Render the Markdown before extracting text, so the 140-char summary [hopefully] contains no chopped markup.
    summary = extract_text(markdown(body.strip()))[:140]
    hashtags = get_hashtags(body, ignore=summary)
    mentions = get_mentions(body, ignore=summary)
    irt_id = in_reply_to and in_reply_to.get('id') or None
    body = linkify_hashtags(linkify_mentions(body), instance)
    if direction == 'rtl':
        body = u"""<div dir="rtl">
{}
</div>""".format(markdown(body))
    if in_reply_to:
        body = u"""#### In reply to [@{}]({}):

{}""".format(in_reply_to['account']['username'], in_reply_to['url'], body)
    gist = make_gist(
        title
        or u"A gistodon toot, {} GMT".format(time.asctime(time.gmtime())),
        body + u"""

###### Generated by [Gistodon](https://github.com/thedod/gistodon/#readme).""")
    if NO_TOOTING:
        return gist
    status = u'{}... {}'.format(summary, gist)
    if hashtags or mentions:
        status += u'\n' + u' '.join(hashtags.union(mentions))
    return masto.status_post(status, spoiler_text=title,
                             in_reply_to_id=irt_id)['url']
Example 4
def switchToCli():
    if not "html-text" in sys.modules:
        import html_text
    # Print the content of the window from which the context menu was called
    if config.pluginContext == "study":
        print(html_text.extract_text(config.studyWindowContent))
    else:
        print(html_text.extract_text(config.bibleWindowContent))
    # Hide gui
    if platform.system() == "Darwin":
        config.mainWindow.showMinimized()
    else:
        config.mainWindow.hide()
    # Cli input
    config.cli = True
    toQuit = False
    #config.printContentOnConsole = printContentOnConsole
    config.bibleWindowContentTransformers.append(printContentOnConsole)
    config.studyWindowContentTransformers.append(printContentOnConsole)
    while config.cli:
        print("--------------------")
        print("Enter '.bible' to read bible text, '.study' to read study resource, '.help' to read UBA command reference, '.gui' to launch gui, '.quit' to quit,")
        command = input("or UBA command: ").strip()
        if command == ".gui":
            del config.bibleWindowContentTransformers[-1]
            del config.studyWindowContentTransformers[-1]
            config.cli = False
        elif command == ".bible":
            print(html_text.extract_text(config.bibleWindowContent))
        elif command == ".study":
            print(html_text.extract_text(config.studyWindowContent))
        elif command in (".help", ".command"):
            getCommandDocumentation()
        elif command == ".quit":
            toQuit = True
            config.cli = False
        else:
            config.mainWindow.runTextCommand(command)
    if toQuit:
        config.mainWindow.quitApp()
    else:
        app.setApplicationName("UniqueBible.app")
        if platform.system() == "Darwin":
            config.mainWindow.showMaximized()
            config.mainWindow.raise_()
        else:
            config.mainWindow.show()
Example 5
 def get_body(self, response):
     xpath = "//div[@id = 'article']/descendant::p[not(ancestor::div/@class='aboutAuthor')]"
     body = response.xpath(xpath).getall()
     body = [extract_text(para) for para in body]
     # drop the first paragraph if it is too short to be real body text
     if body and len(body[0].split()) < 5:
         body = body[1:]
     return "\n\n".join(body)
Example 6
def get_paragraphs_html_text(str_text, mode):
    """
    Extract text with html_text and split it into paragraphs on newlines.
    """
    try:
        text = html_text.extract_text(str_text, guess_layout=False)
    except TypeError:
        return ['']
    return re.split("\n", text)
Example 7
def test_webpages():
    webpages = sorted(glob.glob('./test_webpages/*.html'))
    extracted = sorted(glob.glob('./test_webpages/*.txt'))
    for page, extr in zip(webpages, extracted):
        with open(page, 'r', encoding='utf8') as f_in:
            html = f_in.read()
        with open(extr, 'r', encoding='utf8') as f_in:
            expected = f_in.read()
        assert extract_text(html) == expected
Example 8
async def security_check(page, response):
    html = await page.content()
    text = html_text.extract_text(html)
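    # Count how many known error patterns occur in the page text; a very short page or a bad HTTP status also raises the level.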
    error_level = len([r for r in error_regs if r.search(text)])
    if len(text) < 1000:
        error_level += 1
    if response and not response.ok:
        error_level += 1
    return error_level
Example 9
def main():
    output = {}
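    # Extract plain text from every gzipped HTML file and collect it into one JSON mapping.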
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        output[item_id] = {'articleBody': html_text.extract_text(html)}
    (Path('output') / 'html-text.json').write_text(json.dumps(
        output, sort_keys=True, ensure_ascii=False, indent=4),
                                                   encoding='utf8')
Example 10
    def _test(train_response):
        model = decode_object(train_response.pop('model'))  # type: BaseModel
        pprint(train_response)
        pprint(json.loads(train_response['quality']))

        page_neg, page_pos = pages[:2]
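        # pages[0] is a known-negative page and pages[1] a known-positive one; the trained model should separate them.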
        pred_proba = lambda page: \
            model.predict_proba([{'text': extract_text(page['html'])}])[0][1]
        assert pred_proba(page_pos) > 0.5
        assert pred_proba(page_neg) < 0.5
        return train_response
Example 11
    def parse_dataprovider_publicmessagedetail(self, response):
        # TODO
        # print(self, response.text)
        response_json = response.json()
        msg = Message(
            oid=response.meta['messageOID'],
            date=response_json["time"],  # TODO: Parse
            title=html_text.extract_text(response_json["subject"]),
            body=html_text.extract_text(response_json["body"]),
            publication_id=response_json["tenderOID"],
            data={
                "authorityKey": response_json["authorityKey"],
                "tender": response_json["tender"],
            },
        )
        if "attachmentLink" in response_json:
            msg['file_name'] = response_json["fileName"]
            msg['file_urls'] = [response.urljoin(response_json["attachmentLink"])]

        return msg
Example 12
def text_worker(item, html_field):
    url = item.get('url')
    html = item.get(html_field)
    try:
        text = html_text.extract_text(html)
    except UnicodeEncodeError:
        return None
    text_item = {'text': text}
    if url is not None:
        text_item['url'] = url
    return text_item
Example 13
    def relevancy(self, response: Response) -> float:
        if not isinstance(response, TextResponse):
            # XXX: only text responses are supported
            return 0.0

        if self.classifier_input == 'vector':
            x = self._page_vector(response)
        elif self.classifier_input == 'text':
            x = html_text.extract_text(response.text)
        elif self.classifier_input == 'text_url':
            x = {
                'text': html_text.extract_text(response.text),
                'url': response.url
            }
        elif self.classifier_input == 'html':
            x = response.text
        else:
            raise ValueError("self.classifier_input is invalid")

        return float(self.relevancy_clf.predict_proba([x])[0, 1])
Example 14
def main():
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        doc = Document(html)
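        # summary(html_partial=True) keeps only the main article markup, which is then flattened to plain text.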
        text = html_text.extract_text(doc.summary(html_partial=True))
        output[item_id] = {'articleBody': text}
    (Path('output') / 'readability.json').write_text(json.dumps(
        output, sort_keys=True, ensure_ascii=False, indent=4),
                                                     encoding='utf8')
Example 15
def test_webpages(page, extracted):
    html = _load_file(page)
    if not six.PY3:
        # FIXME: &nbsp; produces '\xa0' in Python 2, but ' ' in Python 3
        # this difference is ignored in this test.
        # What is the correct behavior?
        html = html.replace('&nbsp;', ' ')
    expected = _load_file(extracted)
    assert extract_text(html) == expected

    tree = cleaner.clean_html(parse_html(html))
    assert etree_to_text(tree) == expected
Example 16
def test_guess_layout():
    html = (u'<title>  title  </title><div>text_1.<p>text_2 text_3</p>'
            '<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>'
            '<p>text_6<em>text_7</em>text_8</p>text_9</div>'
            '<script>document.getElementById("demo").innerHTML = '
            '"This should be skipped";</script> <p>...text_10</p>')

    text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \
           'text_8 text_9 ...text_10'
    assert extract_text(html, guess_punct_space=False, guess_layout=False) == text

    text = ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5'
            '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')
    assert extract_text(html, guess_punct_space=False, guess_layout=True) == text

    text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \
           'text_8 text_9...text_10'
    assert extract_text(html, guess_punct_space=True, guess_layout=False) == text

    text = 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5\n\n' \
           'text_6 text_7 text_8\n\ntext_9\n\n...text_10'
    assert extract_text(html, guess_punct_space=True, guess_layout=True) == text
Example 17
 def htmlToPlainText(content):
     if isHtmlTextInstalled:
         content = html_text.extract_text(content)
     elif isBeautifulsoup4Installed:
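         # Approximate html_text's layout handling with regexes (spacing after table cells,
         # newlines after block-level tags) before stripping the remaining markup.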
         content = re.sub("(</th>|</td>)", r"\1&emsp;", content)
         content = re.sub("(<br>|<br/>|</tr>)", r"\1\n", content)
         content = re.sub("(</h[0-9]>|</p>|</div>|<hr>)", r"\1\n\n",
                          content)
         content = BeautifulSoup(content, "html5lib").get_text()
     else:
         content = re.sub("<br/?>|<br>", "\n", content)
         content = re.sub('<[^<]+?>', '', content)
     return content
Example 18
def get_text_html(href):

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }

    #req = requests.get(href, headers=headers)
    req = reliable_request(href)
    html = req.text
    text = html_text.extract_text(html)

    return text, html
Example 19
 def parse(self, response, date_check=True):
     articles = response.css("div.m-headlineTease__info")
     for article in articles:
         url = article.xpath(".//a/@href").get()
         if self.in_urls(url):
             continue
         dt = extract_text(article.xpath(".//time").get()).strip()
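         # dt is relative text such as "3 days ago"; convert it to an absolute date before the cutoff check.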
         if "days" in dt and dt[0].isnumeric():
             dt = self.today - timedelta(int(dt[0]))
             if self.cutoff > dt and date_check:
                 break
         yield scrapy.Request(url=url,
                              callback=self.art_parse,
                              headers=self.headers)
Example 20
 def extract_one(self, item):
     name = extract_text(item.css('.desc-zone a[itemprop="url"]').get(), guess_layout=False)
     link = item.css('.desc-zone h5 a::attr(href)').get()
     mfr = item.css('.skus .sku[data-selenium="sku"]::text').get()
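     # Price and SKU are embedded as JSON in the item's data-itemdata attribute.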
     idata = json.loads(item.css('::attr(data-itemdata)').get())
     return {
         'T': utc_time(),
         'link': link,
         'mfr': mfr,
         'name': name.strip(),
         'price': float(idata['price']),
         'spider': self.name,
         'uid': idata['sku'],
     }
Example 21
async def fetch(spider, url, **spider_kwargs):
    _, page = await spider.get(url, **spider_kwargs)
    is_login = await is_login_page(page)
    if is_login:
        print("Detected login page.")
        await login(page)
    html = await page.content()
    text = extract_text(html)
    if "Let's do a quick security check" in text:
        input("SOLVE SECURITY CHECK")
    elif 'You’ve made too many requests in too short a time. Please try again later.' in text:
        print("TOO MANY REQUESTS! sleeping")
        await asyncio.sleep(60 * 11)
        return await fetch(spider, url, **spider_kwargs)
    return page
Example 22
def startWithCli():
    if not "html-text" in sys.modules:
        import html_text
    # Cli input
    #config.mainWindow.hide()
    #config.cli = True
    #config.printContentOnConsole = printContentOnConsole
    config.bibleWindowContentTransformers.append(printContentOnConsole)
    config.studyWindowContentTransformers.append(printContentOnConsole)
    while config.cli:
        print("--------------------")
        print(
            "Enter '.bible' to read bible text, '.study' to read study resource, '.help' to read UBA command reference, '.gui' to launch gui, '.quit' to quit,"
        )
        command = input("or UBA command: ").strip()
        if command == ".gui":
            del config.bibleWindowContentTransformers[-1]
            del config.studyWindowContentTransformers[-1]
            config.cli = False
        elif command == ".bible":
            print(html_text.extract_text(config.bibleWindowContent))
            # The following line does not work on Windows on UBA startup
            #config.mainWindow.mainPage.runJavaScript("document.documentElement.outerHTML", 0, printContentOnConsole)
        elif command == ".study":
            print(html_text.extract_text(config.studyWindowContent))
            # The following line does not work on Windows on UBA startup
            #config.mainWindow.studyPage.runJavaScript("document.documentElement.outerHTML", 0, printContentOnConsole)
        elif command in (".help", ".command"):
            getCommandDocumentation()
        elif command == ".quit":
            exit()
        else:
            config.mainWindow.runTextCommand(command)
    config.mainWindow.show()
    if platform.system() == "Windows":
        config.mainWindow.showMaximized()
Example 23
def extract_features(doc):
    html = doc['html'] or ''
    if not doc_is_extra_sampled(doc):
        try:
            html = gzip.decompress(base64.b64decode(html)).decode('utf8')
        except Exception:
            pass  # support not compressed html too
    text = html_text.extract_text(html)
    try:
        lang = langdetect.detect(text)
    except LangDetectException:
        lang = None
    return {
        'text': text,
        'language': lang,
    }
Example 24
def extract_text_from_pdf(pdf_path):
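    # Render each PDF page to HTML (pdfminer's HTMLConverter), then strip the markup with html_text.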
    text = ""
    resource_manager = PDFResourceManager()
    fake_file_handle = io.BytesIO()
    converter = HTMLConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
        text = fake_file_handle.getvalue().decode()
    converter.close()
    fake_file_handle.close()
    text = html_text.extract_text(text)
    with open(pdf_path.replace('.pdf', '.txt'), 'w') as txt_file:
        txt_file.write(text)
Example 25
 def parse(self, response, date_check=True):
     articles = response.css(".archive-item-component__info")
     for article in articles:
         dt = extract_text(article.css("time").get())
         dt = self.strptime(dt, "%B %d, %Y")
         if dt and date_check:
             if dt < self.cutoff:
                 break
         url = urljoin(self.baseurl, article.xpath("a/@href").get())
         if self.in_urls(url):
             continue
         yield scrapy.Request(
             url=url,
             callback=self.art_parse,
             headers=self.headers,
             cb_kwargs=dict(dt=dt),
         )
Example 26
 def parse(self, response, date_check=True):
     articles = response.css(".post_wrapper")
     for art in articles:
         dt = extract_text(
             art.xpath(".//*[contains(@class, 'post_detail')]/a").get())
         dt = self.strptime(dt, "%B %d, %Y")
         if dt and date_check:
             if dt < self.cutoff:
                 break
         url = art.xpath(".//h3/a/@href").get()
         if self.in_urls(url):
             continue
         yield scrapy.Request(
             url=url,
             headers=self.headers,
             callback=self.art_parse,
             cb_kwargs=dict(dt=dt),
         )
Example 27
 def get_info(self, callback):
     items = MovieScrapItem3()
     name = callback.css('#firstHeading::text')[0].extract()
     try:
         image = 'http:' + callback.css(
             '.vcard img::attr(src)')[0].extract()
     except Exception as e:
         image = None
     born = callback.css('tr:nth-child(3) td span::text')[1].extract()
     content = callback.css('.vcard+ p')
     for para in content:
         html = para.get()
         bio = extract_text(html)
     items['name'] = name
     items['image'] = image
     items['born'] = born
     items['bio'] = bio
     yield items
Example 28
    def parse(self, response):

        cols = "rank name gain pips drawdown trades type monthly chart price age added action"

        skip = [7, 8, 11, 12]
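        # i.e. skip the 'monthly', 'chart', 'added' and 'action' columns.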

        def age_to_months(t):
            # normalise 'm' to 'M' so the value is parsed as months
            t = t.replace('m', 'M')
            d = durations.Duration(t)
            return d.to_months()

        postprocess = {
            'age': age_to_months
        }

        td = dict()
        for i, col in enumerate(cols.split()):
            td[i] = col

        Behold().show('td')

        for provider in response.xpath("//div[@class='row']//tr"):
            data_row = dict()
            Behold().show('provider')
            details_url = None

            for i, datum in enumerate(provider.xpath('td')):
                Behold().show('i', 'datum')
                if i == 1: # name
                    details_url = datum.css("a::attr(href)").get()
                if i in skip:
                    print(".....skipping")
                    continue
                text = html_text.extract_text(datum.get())
                column_name = td[i]
                if column_name in postprocess:
                    text = postprocess[column_name](text)
                data_row[column_name] = text
            if details_url:
                yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})

        # next_page = response.css('.fa-angle-right').get()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
Example 29
 async def _get_probability_blocked(self, response: Any) -> int:
     """Return an ingeger representing the probability of this worker encountered an blocking-related error. 
     i.e. server admin has blocked IP address for making too many requests.
     Any value > 1 can be assumed to mean that the page is blocked.
     """
     try:
         self._text = html_text.extract_text(self._html)
         error_level = len(
             [r for r in self.block_error_regs if r.search(self._text)])
         if len(self._text) < 1000:
             error_level += 1
         if response.status > 399:
             error_level += 1
         if error_level > 1:
             self.logger.warning(
                 f"You may have been blocked! (Block probability level {error_level})")
         return error_level
     except Exception as e:
         self.logger.error(f"Could not get block probability!: {e}")
Example 30
 def parse(self, response):
     y = response.css(".article-thumb")
     for el in y:
         # meta is a string of how many days ago the article was published
         # e.g. "1 week ago," "5 days ago" etc.
         meta = extract_text(el.css(".article-thumb__meta").get())
         # stop once we reach an article that is 6+ days (or weeks) old; newer ones are kept
         words = ["6", "week"]
         if any([k in meta for k in words]):
             break
         url = el.css("a::attr(href)").get()
         if self.in_urls(url):
             continue
         yield scrapy.Request(
             url=url,
             callback=self.art_parse,
             headers=self.headers,
             cb_kwargs=dict(dt=None),
         )