Example #1
    async def check_update(self, data) -> Page:
        html = await self.session(id=data)
        parser = HTMLParser(html)
        thread = await self.thread_selector(parser)
        user = [
            await self.pretty_text(p.html)
            for p in parser.css("div.mfd-post-top-0 > a")
        ]
        link = [
            await self.pretty_text(p.html)
            for p in parser.css("div.mfd-post-top-1")
        ]
        posts = [
            await self.pretty_text(p.html)
            for p in parser.css("div.mfd-post-body-right")
        ]
        ids = [
            int(p.attributes['data-id']) for p in parser.css(self.id_selector)
        ]

        if thread:
            # zip_longest comes from itertools
            tuple_title = tuple(
                zip_longest(thread, user, link, fillvalue=thread[0]))
            titles = [
                f"{t[0]}\n{t[1]}\n{t[2]}" for t in tuple_title
            ]
            return Page([
                SinglePost(title=t, md=p, id=i)
                for t, p, i in zip(titles, posts, ids)
            ])

        return Page()
Example #2
    def html_to_text(self, html, *args):
        tree = HTMLParser(html)
        for tag in tree.css('script'):
            tag.decompose()
        for tag in tree.css('style'):
            tag.decompose()
        text = tree.body.text(separator='\n')
        text = ' '.join(text.split())  # collapse all whitespace runs into single spaces
        return text
Example #3
def get_text(html):
    # appends the page's visible text to the global page_text buffer;
    # the argument must be an HTML string, not a URL
    global page_text
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    page_text += tree.body.text()
Example #4
def get_text_from_html(html):
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    text = tree.body.text(separator='\n')
    return text
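A quick usage sketch for the helper above; the URL and the requests dependency are assumptions for illustration, not part of the original example:

import requests

html = requests.get("https://example.com").text
print(get_text_from_html(html))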
Example #5
def get_text_content(html) -> Optional[str]:
    tree = HTMLParser(html)

    if tree.body:
        for tag in tree.css("script"):
            tag.decompose()
        for tag in tree.css("style"):
            tag.decompose()

        text = tree.body.text(separator="\n", strip=True)
        return text

    return None
Example #6
def parse_text(html):
    tree = HTMLParser(html)

    if tree.body is None:
        return None

    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()

    return tree
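Since parse_text returns the parser itself rather than extracted text, the caller does the extraction; a minimal sketch, assuming html already holds the page source:

tree = parse_text(html)
if tree is not None:
    print(tree.body.text(separator='\n'))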
Example #7
def htmltotxt(url):
    # html = urlopen(url).read()
    html = requests.get(url).text
    tree = HTMLParser(html)
    for tag in tree.css('script') + tree.css('style'):
        tag.decompose()
    text = re.sub(" +", " ", tree.body.text().replace("\n", ""))
    title = tree.css_first('title').text().strip()
    hrefs = set(
        prepareUrl(url, match[1]) for match in HREF_REGEX.findall(html)
        if not isUrlNa(match[1]))
    return title, text, hrefs
Example #8
def worker():
    for i in range(500):
        html = "<span></span><div><p class='p3'>text</p><p class='p3'>sd</p></div><p></p>"
        selector = "p.p3"
        tree = HTMLParser(html)

        assert tree.css_first(selector).text() == 'text'

        for tag in tree.css('p'):
            tag.decompose()

        for tag in tree.css('span'):
            tag.decompose()
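worker() looks like a stress test for parser construction and node decomposition; a minimal harness, assuming the intent is to exercise it from several threads at once:

import threading

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()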
Example #9
File: bot.py Project: 0xCN/MailBot
def get_text_selectolax(html):
    """
    parsing HTML from email and returning crucial parts as TEXT
    """
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    text = tree.body.text(separator='\n')
    return text
Example #10
def get_text_selectolax(html):
    html_parser = HTMLParser(html)

    if html_parser.body is None:
        return None

    for tag in html_parser.css('script'):
        tag.decompose()
    for tag in html_parser.css('style'):
        tag.decompose()

    parsed_text = html_parser.body.text(separator='\n')
    return parsed_text
Example #11
def get_text_selectolax(html):
    html = html.strip()

    if len(html) == 0:
        return None

    tree = HTMLParser(html)
    if tree.body is None:  # guard against markup with no <body>, as in the other variants
        return None

    for tag in tree.css('script'):
        tag.decompose()
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()

    text = tree.body.text(separator='\n')
    return text
Example #12
def parse_selectolax(html):
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    tree = HTMLParser(html)
    paragraphs = []
    heads = []
    links = []
    for node in tree.css('p'):
        paragraphs.append(node.text())
    for t in heading_tags:
        for node in tree.css(t):
            heads.append(node.text())
    for node in tree.css('a'):
        if 'href' in node.attributes and 'title' in node.attributes:
            links.append(
                str(node.attributes['href']) + r"\|" +
                str(node.attributes['title']))
            # url | title of the news item, etc., that the url points to

    return "<p>".join(paragraphs), "<h>".join(heads), "<t>".join(links)
Example #13
    def core_course_get_courses(self, username, password):
        page = HTMLParser(
            self.session.get("http://moodle.fbtuit.uz/my/").text
        )  # TODO: this could be made more efficient :/
        if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
            if not self.core_auth_confirm_user(username, password):
                return []  # return an empty list if the password is wrong
            page = HTMLParser(
                self.session.get("http://moodle.fbtuit.uz/my/").text)
        # Each list element has the form "<course name> <id>", e.g. "dasturlash 136".
        course_list = []
        # Parsing works on fragments like this:
        # <div class="media-body">
        # <h4 class="h5"><a href="http://moodle.fbtuit.uz/course/view.php?id=173" class="">Kurs nomi</a></h4>
        # </div>
        for node in page.css("div"):
            if node.attributes.get('class') != 'media-body' or node.text() == '':
                continue
            href = node.css_first('a').attributes['href']
            entry = node.text().strip() + " " + href[href.find("=") + 1:]
            if entry in course_list:
                break
            course_list.append(entry)
        return course_list
Example #14
    def core_course_get_grades(self, username, password):
        # TODO: take a course_id later and show detailed grades inside each course
        """Returns the grades for each course; make this a bit prettier :)"""
        page = HTMLParser(
            self.session.get(
                "http://moodle.fbtuit.uz/grade/report/overview/index.php").text)
        if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
            if not self.core_auth_confirm_user(username, password):
                return ""
            page = HTMLParser(
                self.session.get(
                    "http://moodle.fbtuit.uz/grade/report/overview/index.php").text)
        grades = "Kurs nomi|Baho\n"  # header row: "Course name|Grade"
        # td - table cells; the grades are read out of the overview table
        counter = 0
        for node in page.css('td'):
            if node.text() == "":
                return grades
            grades += node.text()
            counter += 1
            grades += "|" if counter % 2 else "\n"
        return grades
Example #15
    def parse_topics_list(self, page_content: str):
        results = []

        root = HTMLParser(page_content)
        topic_parent_elements = root.css('.bg1,.bg2')
        for element in topic_parent_elements:
            topic_href_element = element.css_first('.topictitle')
            topic_title = topic_href_element.text()

            rel_url = topic_href_element.attributes.get('href')

            author = element.css_first('.author > a').text()
            author_profile_rel_url = element.css_first(
                '.author > a').attributes.get('href')
            answers = element.css_first('.posts').child.html
            views = element.css_first('.views').child.html
            last_post_dt_string = element.css_first(
                '.lastpost span time').attributes.get('datetime')

            data_container = TopicMetaInfo(
                domain=self.domain,
                topic_id=0,
                url=rel_url,
                title=topic_title,
                author=author,
                author_profile_link=author_profile_rel_url,
                posts_count=answers,
                views_count=views,
                last_post_timestamp=last_post_dt_string)
            data_container.process()

            results.append(data_container)
        return results
Example #16
def listHome():
    html = requests.get(HOST).text
    body = HTMLParser(html)

    selector = "div.sdc-site-tile--has-link"

    for node in body.css(selector):
        for a in node.css('a.sdc-site-tile__headline-link'):
            attributes = a.attributes
            label = a.text(strip=True)
            url = build_url({'action': 'playVoD', 'path': attributes['href']})
        for img in node.css('img.sdc-site-tile__image'):
            attributes = img.attributes
            icon = attributes['src']
        print('label>' + label + '>url>' + url + '>icon>' + icon)
        #addVideo(label, url, icon)
    '''
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup('div', 'sdc-site-tile--has-link'):
        videoitem = item.find('span', {'class': 'sdc-site-tile__badge'})
        if videoitem is not None and videoitem.find('path') is not None:
            headline = item.find('h3', {'class': 'sdc-site-tile__headline'})
            label = headline.span.string
            url = build_url({'action': 'playVoD', 'path': headline.a.get('href')})
            icon = item.img.get('src')
            addVideo(label, url, icon)
    '''

    xbmcplugin.endOfDirectory(addon_handle, cacheToDisc=True)
Example #17
def get_post_data(thread_url):
    print(f'getting post data from {thread_url}')
    local_path = parse_url(thread_url)['filepath']
    html = open(local_path, "br").read()
    p = HTMLParser(html)
    posts = p.css("li.b-post")

    for post in posts:
        try:
            post_data = {}
            post_data['content'] = post.css_first(
                "div.b-post__content").text().replace('\t',
                                                      '').replace('\n', '')
            post_data['date'] = format_post_date(
                post.css_first("div.b-post__timestamp").text())
            post_data['author'] = post.css_first("div.author").text().replace(
                '\t', '').replace('\n', '')
            post_data['author_url'] = post.css_first("div.author").css_first(
                "a").attrs['href']
            post_data['author_title'] = post.css_first(
                "div.usertitle").text().replace('\t', '').replace('\n', '')
            logging.info(f"Post data values: {post_data.values()}")
            f = open('data/posts.csv', 'a')
            writer = csv.writer(f, delimiter=',')
            writer.writerow(post_data.values())
            f.close()
            #get_html(post_data['author_url'],author=True)
        except:
            logging.info(f"Error getting post data")
Example #18
def get_text_from_html(html):
    """
    Uses Selectolax to parse the HTML
    """
    tree = HTMLParser(html)

    if tree.body is None:
        return None

    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()

    text = tree.body.text(separator='\n')
    return text
Example #19
def get_subforum_data():
    '''Parse data for each subforum list present on main page'''
    html = open("scrape/_main.txt", "br").read()
    p = HTMLParser(html)
    subforum_lists = p.css("tr.subforum-list")

    for subforum_list in subforum_lists:
        subforum_elements = subforum_list.css("div.subforum-info")
        for subforum_element in subforum_elements:
            try:
                subforum_data = {}
                subforum_title_element = subforum_element.css_first(
                    'a.subforum-title')
                subforum_data['title'] = subforum_title_element.text()
                subforum_data['url'] = subforum_title_element.attrs['href']
                subforum_title_list.append(
                    parse_url(subforum_data['url'])['forum'])
                counts_text = subforum_element.css_first('span.counts').text()
                counts = counts_text.replace('(', '').replace(')', '').replace(
                    ',', '').split('/')
                subforum_data['topics'] = counts[0]
                subforum_data['posts'] = counts[1]
                logging.info(f"Subforum data is: {subforum_data}")
                with open('data/subforums.csv', 'a') as f:
                    writer = csv.writer(f, delimiter=',')
                    writer.writerow(subforum_data.values())
            except Exception as e:
                logging.info(f"Error getting subforum data: {e}")
Example #20
def extract_text_from_html(html_file):
    with open(html_file, "r", encoding="utf-8") as rf:
        html = rf.read().strip()

    tree = HTMLParser(html)

    if tree.body is None:
        return None

    for tag in tree.css("script"):
        tag.decompose()
    for tag in tree.css("style"):
        tag.decompose()

    text = tree.body.text(separator="\n")
    return text
Example #21
def get_imdb_page(show):
    global requests  # running request counter, not the requests library

    logging.info("Scraping information for show: " + show)
    # We want to query imdb one time
    url = 'https://www.imdb.com/search/title?title=' + show + '&title_type=tv_series,tv_miniseries&sort=popularity'
    # Making a response and parsing it
    response = get(url, headers=headers)

    if response.status_code != 200:
        logging.warning('Received status code, ' + str(response.status_code))
        raise Exception("Received a non-200 status code!")

    parser = HTMLParser(response.text)

    # Update progress bar and wait
    requests += 1
    elapsed_time = time() - start_time
    os.system('clear')
    print('Request: {}; Frequency: {} requests/s'.format(
        requests, requests / elapsed_time))

    # We only care about the divs that have the movie name
    # imdb_page has the link to the tv show's imdb page
    if not parser.css(".lister-item-header a"):
        logging.warning('Did not find any results for: ' + show)
        raise Exception("Did not find a valid imdb page")
    imdb_page = "https://www.imdb.com" + parser.css_first(
        ".lister-item-header a").attributes['href']

    return imdb_page
Example #22
def Text2(url, tokenize):
    global tree
    global html
    global parsed
    if Url != url:  # Url is presumably a module-level global tracking the last-parsed URL
        parsed = False
    if not parsed:
        html = requests.get(url).content
        tree = HTMLParser(html)
        parsed = True
    """
    soup = BeautifulSoup(html, 'html.parser')
    #soup = soup.find_all(string=lambda text:isinstance(text,Comment))

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    text = soup.
    """

    if tree.body is None:
        return None

    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()

    text = tree.body.text(separator='\n')
    text = text.replace("\n", "")
    text = re.sub("[\(\[].*?[\)\]]", "", text)
    text = re.sub(r"\s+", " ", text)
    if tokenize == True:
        tokens = nltk.sent_tokenize(text)
        return tokens
    else:
        return text
Example #23
def remove_html_tags(cell):
    tree = HTMLParser(cell)
    for tag in tree.css('script, style'):
        tag.decompose()

    text_content = tree.text(deep=True)

    return text_content
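Taking a single "cell" suggests per-cell use over tabular data; a hedged sketch with pandas, where the DataFrame and column names are hypothetical:

import pandas as pd

df = pd.DataFrame({"body": ["<p>Hello <b>world</b></p>"]})
df["body_text"] = df["body"].apply(remove_html_tags)
print(df["body_text"][0])  # "Hello world"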
Example #24
def get_text_selectolax(html):
    if isinstance(html, str):
        tree = HTMLParser(html)

        if tree.body is None:
            return None

        for tag in tree.css('script'):
            tag.decompose()
        for tag in tree.css('style'):
            tag.decompose()

        text = tree.body.text(separator=' ')
        text = text.replace('\n', '').replace('\t', ' ').replace('\xa0', ' ')
        return text

    else:
        return np.nan
Example #25
def _parse_quote_page(url):
    data = requests.get(url)
    dom = HTMLParser(data.text)
    for tag in dom.css('a.authorOrTitle'):
        if 'href' in tag.attributes:
            return (tag.text(), tag.attributes['href'])

    return None
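A usage sketch for _parse_quote_page; judging by the authorOrTitle selector the target looks like a Goodreads-style quote page, though the URL below is only a placeholder:

result = _parse_quote_page("https://www.goodreads.com/quotes/...")
if result is not None:
    author, href = result
    print(author, href)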
Example #26
def post_about_dangerous_content(build_directory: Path, **config):

    OK_URL_PREFIXES = [
        "https://github.com/mdn/",
    ]

    comments = []

    for doc in get_built_docs(build_directory):
        rendered_html = "\n".join(
            x["value"]["content"]
            for x in doc["body"]
            if x["type"] == "prose" and x["value"]["content"]
        )
        tree = HTMLParser(rendered_html)
        external_urls = defaultdict(int)
        for node in tree.css("a[href]"):
            href = node.attributes.get("href")
            href = href.split("#")[0]
            # We're only interested in external URLs at the moment
            if href.startswith("//") or "://" in href:
                if any(href.lower().startswith(x.lower()) for x in OK_URL_PREFIXES):
                    # exceptions are skipped
                    continue
                external_urls[href] += 1

        if external_urls:
            external_urls_list = []
            for url in sorted(external_urls):
                count = external_urls[url]

                external_urls_list.append(
                    f"  - {'🚨 ' if url.startswith('http://') else ''}"
                    f"<{url}> ({count} time{'' if count==1 else 's'})"
                )
            comments.append((doc, "\n".join(external_urls_list)))
        else:
            comments.append((doc, "No external URLs"))

    heading = "## External URLs\n\n"
    if comments:
        per_doc_comments = []
        for doc, comment in comments:
            lines = []
            if config["prefix"]:
                url = mdn_url_to_dev_url(config["prefix"], doc["mdn_url"])
                lines.append(f"URL: [`{doc['mdn_url']}`]({url})")
            else:
                lines.append(f"URL: `{doc['mdn_url']}`")
            lines.append(f"Title: `{doc['title']}`")
            lines.append(f"[on GitHub]({doc['source']['github_url']})")
            lines.append("")
            lines.append(comment)

            per_doc_comments.append("\n".join(lines))
        return heading + "\n---\n".join(per_doc_comments)
    else:
        return heading + "*no external links in the built pages* 👱🏽"
Example #27
    def get_all_website_links_selectolax(self, url):
        urls = set()

        domain_name = urlparse(url).netloc
        domain_list = [
            'news.cgtn.com', 'newsus.cgtn.com', 'newsaf.cgtn.com',
            'newseu.cgtn.com'
        ]
        domain_list.append(domain_name)

        try:
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
            }
            req = urllib.request.Request(url=url, headers=headers)
            r = urllib.request.urlopen(req)
            sll = HTMLParser(r.read())
        except Exception as e:
            print(f"request failed: {e}")
            return urls

        for a_tag in sll.css("a"):
            if not "href" in a_tag.attributes:
                continue
            href = a_tag.attrs["href"]
            if href == "" or href is None:
                continue

            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # Rebuild the URL from scheme, host, path and query; this drops any
            # fragment (e.g. #section) that urlparse split off.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if parsed_href.query:  # re-attach the query only when one exists, avoiding a trailing "?"
                href += "?" + parsed_href.query

            if not self.is_valid(href):
                # not a valid URL
                continue
            if href in self.internal_urls:
                # already in the set
                continue
            if not (any(domain in href for domain in domain_list)):
                # external link
                #             if href not in external_urls:
                #                 external_urls.add(href)
                continue

            a_tag_text = a_tag.text(deep=True, separator='', strip=False)
            if not (any(word in a_tag_text for word in self.keywords)
                    or any(word in href for word in self.keywords)):
                #check if A tag text OR A tag href doesn't contain keyword
                continue

            # print(a_tag_text)
            urls.add(href)
            self.internal_urls.add(href)
        return urls
Example #28
    def core_calendar_get_days(self, username, password, _time=''):
        page = HTMLParser(
            self.session.get(
                f"http://moodle.fbtuit.uz/calendar/view.php?view=month&lang=uz{f'&time={_time}' if len(_time) else ''}"
            ).text)
        if page.css_first(
                'title').text() == "TATUFF Masofaviy ta'lim: Saytga kirish":
            if not self.core_auth_confirm_user(username, password):
                return 0xff
            page = HTMLParser(
                self.session.get(
                    f"http://moodle.fbtuit.uz/calendar/view.php?view=month&lang=uz{f'&time={_time}' if _time!='' else ''}"
                ).text)
        data = {}
        for tag in page.css("h2"):  #Kalendar ko'rsatayotgan oyni olamiz
            if tag.attributes.get("class", {}) == "current":
                data['current'] = tag.text()
                break

        for tag in page.css("a"):
            if "arrow_link" in tag.attributes.get(
                    "class", {}
            ):  #if 'class' tag.attributes da va tag.attributes['class'] == ...ni qisqartirilgani
                for i in tag.css("span"):
                    if i.attributes['class'] == 'arrow_text': name = i.text()
                data[tag.attributes['class'].split()[-1]] = {
                    'name':
                    name,
                    '_time':
                    tag.attributes['href'][tag.attributes['href'].rfind("=") +
                                           1:]
                }
        data['days'] = {}
        for table in page.css("table"):
            if not ('calendar' in table.attributes.get('class', {})): continue
            for td in table.css("td"):
                if td.css_first("a"):
                    if td.css_first("a").attributes.get("class", {}) == 'day':
                        data['days'][td.css_first('a').text(
                        )] = td.css_first('a').attributes['href'][
                            td.css_first('a').attributes['href'].rfind("=") +
                            1:]
        return data
Example #29
def clear_text(text: str, rm_strong=True) -> str:
    selector = "strong"
    text = unicodedata.normalize("NFKD", text)
    text = text.replace("\n", " ")
    tree = HTMLParser(text)
    if rm_strong:
        for node in tree.css(selector):
            node.decompose()
    return tree.text().strip()
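A made-up input to show the behavior; note that decompose() removes the <strong> node together with its text:

print(clear_text("Total:\n<strong>42</strong> items"))  # "Total:  items" (doubled space where <strong>42</strong> was)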
Example #30
    def core_calendar_get_tasks(self, username, password, _time):
        page = HTMLParser(
            self.session.get(
                f"http://moodle.fbtuit.uz/calendar/view.php?view=day&lang=uz&time={_time}"
            ).text)
        if page.css_first(
                'title').text() == "TATUFF Masofaviy ta'lim: Saytga kirish":
            if not self.core_auth_confirm_user(username, password):
                return 0xff
            page = HTMLParser(
                self.session.get(
                    f"http://moodle.fbtuit.uz/calendar/view.php?view=month&lang=uz&time={_time}"
                ).text)
        data = {'tasks': []}
        for h2 in page.css("h2"):
            if h2.attributes.get("class", {}) == "current":
                data['current'] = h2.text()
        counter = 1
        for div in page.css("div"):
            if div.attributes.get("data-type", '') == 'event':
                data['tasks'].append({
                    'name':
                    "\n├" + str(counter) + ". " + div.css_first("h3").text() +
                    "\n| Oxirgi muddat: " + div.css_first("span").text()
                })
                for i in div.css("a"):
                    if i.text() == "Go to activity":
                        link = i.attributes['href']
                        break
                if "quiz" in link:
                    data['tasks'][-1][
                        'name'] = "\n├ " + f"<a href='{div.css_first('a').attributes['href']}'>{div.css_first('h3').text()}</a>\
					\n| Oxirgi muddat: {div.css_first('span').text()}"

                    data['tasks'][-1]['callback_data'] = None
                else:
                    data['tasks'][-1]['callback_data'] = link[link.rfind("=") +
                                                              1:]
                    counter += 1
        return data

