Code example #1
File: test_export_pdf.py  Project: betagouv/zam
def test_generate_pdf_with_amendement_content_gouvernemental(
        app, lecture_senat, article1_senat, amendements_senat):
    from zam_repondeur.models import DBSession
    from zam_repondeur.services.import_export.pdf import generate_html_for_pdf

    amendement_6666 = amendements_senat[0]
    amendement_6666.auteur = "LE GOUVERNEMENT"
    amendement_6666.user_content.reponse = "La présentation"
    DBSession.add(amendement_6666)

    parser = HTMLParser(
        generate_html_for_pdf(DummyRequest(), "print/all.html",
                              {"lecture": lecture_senat}))

    assert (
        parser.css_first(".first-page .lecture").text() ==
        "Sénat, session 2017-2018, Séance publique, Numéro lecture, texte nº\xa063"
    )
    assert _html_page_titles(parser) == [
        "Article 1", "Réponse", "Amendement nº 6666"
    ]
    response_node = parser.css_first(".reponse")
    assert _cartouche_to_list(response_node) == [
        "Article",
        "Art. 1",
        "Amendement",
        "6666",
        "Auteur",
        "Gouvernement",
    ]
    assert response_node.css_first("div h5").text() == "Réponse"
    assert "La présentation" in response_node.css_first("div p").text()
Code example #2
    def scrape(self):
        super().scrape()
        articles = []

        for i, URL in enumerate(self.links):
            try:
                r = urllib.request.urlopen(URL)
            except Exception:  # network/HTTP error: skip this link
                print('Skipping:', URL)
                continue
            sll = HTMLParser(r.read())

            print(i + 1, '/', len(self.links), URL)

            headline = sll.css_first(
                'meta[name="dc.title"]').attributes['content']
            timestamp = parse(
                sll.css_first(
                    'meta[name="dcterms.created"]').attributes['content'])
            main_article = sll.css_first('.article-body')

            story = {}
            story['content'] = []
            story['headline'] = headline
            story['time-stamp'] = timestamp.strftime("%m/%d/%Y, %H:%M:%S")
            story['url'] = URL
            story['journal'] = self.journal

            for paragraph in main_article.css('p'):
                story['content'].append(paragraph.text(deep=True,
                                                       separator=''))

            articles.append(story)

        self.output = articles
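
Both css_first calls above raise AttributeError when a page is missing one of the expected meta tags. A small defensive helper, sketched here (not part of the original scraper), that falls back to a default instead:

def meta_content(tree, selector, default=None):
    """Return the 'content' attribute of the first matching node, or a default."""
    node = tree.css_first(selector)
    if node is None:
        return default
    return node.attributes.get('content', default)

# e.g. headline = meta_content(sll, 'meta[name="dc.title"]', default='')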
Code example #3
import re
import urllib.parse

import requests as r  # assumed alias: the original calls r.get(...)
from selectolax.parser import HTMLParser

def getDefinition(terme):
    termeHttp = urllib.parse.quote_plus(terme, encoding='iso-8859-1')
    url = 'http://www.jeuxdemots.org/rezo-dump.php?gotermsubmit=Chercher&gotermrel=' + termeHttp + '&rel=1'
    html = r.get(url)
    tree = HTMLParser(html.text)

    try:
        definition = tree.css_first('def').text()
        code = tree.css_first('CODE').text()
    except AttributeError as error:
        print("Le mot " + terme + " n'existe pas", error)
        return

    print("Definition pour " + terme + ": \n" + definition)

    # re.escape guards against regex metacharacters in the search term
    match = re.findall("(" + re.escape(terme) + r">[^0-9].*)", code)
    match.reverse()

    for m in match:
        m = m[:-1]  # TODO: strip the trailing quote in the regex instead (m')
        getDefinition(m)
Code example #4
 def scrape(self):
     super().scrape()
     articles = []
     
     for i, URL in enumerate(self.links):
         try:
             r = urllib.request.urlopen(URL)
             sll = HTMLParser(r.read())
             
             print(i+1,'/',len(self.links),URL)
             
             headline = sll.css_first('meta[property="og:title"]').attributes['content']
             main_article = sll.css_first('section[id="body-text"],.Article__content')
             timestamp = parse(sll.css_first('meta[name="pubdate"],meta[property="og:pubdate"]').attributes['content'])
             
             story = {}
             story['content'] = []
             story['headline'] = headline
             story['time-stamp'] = timestamp.strftime("%m/%d/%Y, %H:%M:%S")
             story['url'] = URL
             story['journal'] = self.journal
             
             
             for paragraph in main_article.css('.zn-body__paragraph,.Paragraph__component'):
                 story['content'].append(paragraph.text(deep=True, separator=''))
             
             articles.append(story)
         except Exception:  # any failure while scraping this page: skip it
             print('Skipping:',URL)
             continue 
         
     self.output = articles
Code example #5
 async def resolve_link(self, url):
     parser = HTMLParser(await self.session(custom_url=url))
     tid = int(
         parser.css_first("a.mfd-link-dotted").attributes['href'].split(
             '?threadId=')[1])
     name = parser.css_first("div.mfd-header").text().strip()
     return tid, name
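
Splitting the href on a literal '?threadId=' breaks as soon as the link carries any other query parameter. A more robust variant of the same method, sketched with the standard library (selector and parameter name taken from the example above):

from urllib.parse import urlparse, parse_qs

async def resolve_link(self, url):
    parser = HTMLParser(await self.session(custom_url=url))
    href = parser.css_first("a.mfd-link-dotted").attributes['href']
    tid = int(parse_qs(urlparse(href).query)['threadId'][0])  # order-independent
    name = parser.css_first("div.mfd-header").text().strip()
    return tid, name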
Code example #6
def detect_lang_worker(filepath):
    # read file
    with open(filepath, 'rb') as f:
        page_raw = f.read()

    tree = HTMLParser(page_raw)
    text = ""

    # detect lang by description
    description = tree.css_first("meta[property=\"og:description\"]")
    if description:
        text = description.attributes['content'].strip()

    # or by title
    if not text:
        title = tree.css_first("meta[property=\"og:title\"]")
        text = title.attributes['content']

    # detect the language precisely
    try:
        lang = simpletools.detectLang(text, detect_all=True)
    except Exception:  # detection failed
        return None, None

    return lang, os.path.basename(filepath)
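
The _worker suffix suggests this function is meant to be mapped over many files from a process pool. A minimal usage sketch (the pages/ directory is hypothetical):

import glob
from multiprocessing import Pool

if __name__ == '__main__':
    files = glob.glob('pages/*.html')  # hypothetical dump of saved HTML files
    with Pool() as pool:
        for lang, name in pool.map(detect_lang_worker, files):
            if lang is not None:
                print(name, lang)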
Code example #7
    def core_course_get_contents(self, username, password, course_id):
        "Kursdagi mavzularni ko'rsatish uchun govno kod"
        page = HTMLParser(
            self.session.get("http://moodle.fbtuit.uz/course/view.php?id=" +
                             str(course_id)).text)
        if page.css_first(
                'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
            if not self.core_auth_confirm_user(username, password):
                return []
            page = HTMLParser(
                self.session.get(
                    "http://moodle.fbtuit.uz/course/view.php?id=" +
                    str(course_id)).text)
        counter = 1
        contents = []
        "Kursdagi barcha mavzularni olamiz"
        '''
		>>> s=''
		>>> for node in page.css("li"):
		...  if 'id' in node.attributes:
		...   if node.attributes['id'][:7]=='section' and node.attributes['id']!='sectio
		n-0': print(node.child.css_first("span").text()); s+= node.child.css_first('span').text()
		'''
        for tag in page.tags("li"):
            if 'id' in tag.attributes:
                if tag.attributes['id'][:7] == "section" and tag.attributes[
                        'id'] != "section-0":
                    if 'resource' not in tag.html: continue
                    section = HTMLParser(tag.html)
                    contents.append(
                        str(counter) + ". " + section.css_first("span").text())
                    counter += 1
        if contents == []:
            contents = ["Bu yerda yuklanadigan hech narsa yo'q :/"]
        return contents
Code example #8
 async def resolve_link(self, url):
     parser = HTMLParser(await self.session(custom_url=url))
     name = parser.css_first("div.mfd-header h1").text().strip().split(
         ' ')[-1]
     tid = int(
         parser.css_first("div.mfd-header div a").attributes['href'].split(
             '?id=')[1])
     return tid, name
Code example #9
    def core_course_get_tasks(self, username, password,
                              course_id):  # lists the assignment topics
        "Returns the assignments as a list, e.g.: ['1-deadline 6280', ..., 'task task_id']"
        page = HTMLParser(
            self.session.get("http://moodle.fbtuit.uz/course/view.php?id=" +
                             str(course_id)).text)
        if page.css_first(
                'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
            if not self.core_auth_confirm_user(username, password):
                return []
            page = HTMLParser(
                self.session.get(
                    "http://moodle.fbtuit.uz/course/view.php?id=" +
                    str(course_id)).text)
        tasks = []
        counter = 1
        ids = []
        # Walk the <li> tags again: assignments are nested inside the sections.
        # The nested ifs pile up here; a cleaner approach is needed.
        for tag in page.tags('li'):
            if 'id' in tag.attributes:
                if tag.attributes['id'][:7] == 'section':
                    if not ('http://moodle.fbtuit.uz/mod/assign/view.php?'
                            in tag.html):
                        continue
                    section = HTMLParser(tag.html)
                    theme = section.css_first('span').text() + '\n'
                    for tag1 in section.tags('a'):
                        if not ('http://moodle.fbtuit.uz/mod/assign/view.php?'
                                in tag1.attributes['href']):
                            continue
                        if tag1.attributes['href'][tag1.attributes['href'].
                                                   rfind("=") + 1:] in ids:
                            continue
                        tasks.append(theme + "├" + str(counter) + ". " +
                                     tag1.text() + " " +
                                     tag1.attributes['href']
                                     [tag1.attributes['href'].rfind("=") + 1:])
                        ids.append(tag1.attributes['href']
                                   [tag1.attributes['href'].rfind("=") + 1:])
                        counter += 1
                        theme = ''

                    tasks[-1] = tasks[-1].replace("├", "└")

        if tasks == []: return ["Bu yerda topshiriqlar yo'q :)"]
        return tasks
Code example #10
import sys

from selectolax.parser import HTMLParser

def parse_html(path: str) -> dict:
    """
    Parses from HTML:

    - key
    - title
    - issns (list)
    - wikidata_qid
    - homepage_url
    - acronym (?)

    TODO: publisher?
    """
    key = path.replace('.html', '')
    if not len(key.split('/')) == 2:
        print(key, file=sys.stderr)
        return {}
    meta = dict(dblp_prefix=key, issns=[])

    try:
        with open(path, 'r') as html_file:
            doc = HTMLParser(html_file.read())
    except FileNotFoundError:
        return {}

    elem = doc.css_first('header#headline h1')
    if elem and elem.text():
        meta['title'] = elem.text()
        if meta['title'].endswith(')') and meta['title'].count('(') == 1:
            meta['acronym'] = meta['title'].split('(')[-1][:-1]
            meta['title'] = meta['title'].split('(')[0].strip()

    # <a href="https://portal.issn.org/resource/issn/2624-8212" itemprop="sameAs">
    # <a href="https://www.wikidata.org/entity/Q15753736" itemprop="sameAs">
    elems = doc.css('header#headline a[itemprop="sameAs"]') or []
    for elem in elems:
        if not elem.attributes.get('href'):
            continue
        url = elem.attributes['href']
        if "://portal.issn.org/" in url:
            issn = url.split('/')[-1].strip()
            if len(issn) == 9:
                meta['issns'].append(issn)
            else:
                print(issn, file=sys.stderr)
        elif "://www.wikidata.org/entity/Q" in url:
            meta['wikidata_qid'] = url.split('/')[-1]
            assert 'Q' in meta['wikidata_qid']

    # <a href="https://journals.sagepub.com/home/hfs" itemprop="url"><img alt="" src="https://dblp.org/img/home.dark.16x16.png" class="icon" />web page @ sagepub.com</a>
    elem = doc.css_first('header#headline a[itemprop="url"]')
    if elem and elem.attributes.get('href'):
        meta['homepage_url'] = elem.attributes['href']

    return meta
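
A quick usage sketch: the function expects a two-component relative path such as 'journals/cacm.html' (the local mirror layout is an assumption):

import json

meta = parse_html('journals/cacm.html')
print(json.dumps(meta, indent=2))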
Code example #11
File: over_parser.py  Project: Charnelx/Overbot
    @staticmethod  # the method takes no self; decorator restored from context
    def parse_topic_content(page_content: str, url: str):
        root = HTMLParser(page_content)
        topic_content = root.css_first(
            '.bg1 > .inner > .postbody > * > .content').text()
        topic_closed = bool(root.css_first('.fa-lock'))

        data_container = TopicData(content=topic_content,
                                   url=url,
                                   closed=topic_closed)

        data_container.process()
        return data_container
Code example #12
File: test_export_pdf.py  Project: betagouv/zam
def test_generate_pdf_with_amendement_content_factor_only_groups(
        app, lecture_senat, article1_senat, amendements_senat):
    from zam_repondeur.models import DBSession
    from zam_repondeur.services.import_export.pdf import generate_html_for_pdf

    amendement_6666 = amendements_senat[0]
    amendement_6666.auteur = "M. JEAN"
    amendement_6666.groupe = "Les Indépendants"
    amendement_6666.user_content.avis = "Favorable"
    amendement_6666.user_content.objet = "L’objet"
    amendement_6666.user_content.reponse = "La réponse"
    DBSession.add(amendement_6666)

    amendement_9999 = amendements_senat[1]
    amendement_9999.auteur = "M. CLAUDE"
    amendement_9999.groupe = "Les Indépendants"
    amendement_9999.user_content.avis = "Favorable"
    amendement_9999.user_content.objet = "L’objet"
    amendement_9999.user_content.reponse = "La réponse"
    DBSession.add(amendement_9999)

    parser = HTMLParser(
        generate_html_for_pdf(DummyRequest(), "print/all.html",
                              {"lecture": lecture_senat}))

    assert (
        parser.css_first(".first-page .lecture").text() ==
        "Sénat, session 2017-2018, Séance publique, Numéro lecture, texte nº\xa063"
    )
    assert _html_page_titles(parser) == [
        "Article 1",
        "Réponse",
        "Amendement nº 6666",
        "Amendement nº 9999",
    ]
    response_node = parser.css_first(".reponse")
    assert _cartouche_to_list(response_node) == [
        "Article",
        "Art. 1",
        "Amendements",
        "6666 et 9999",
        "Auteurs",
        "M. CLAUDE et M. JEAN",
        "Groupes",
        "Les Indépendants",
        "Avis",
        "Favorable",
    ]
    assert response_node.css_first("div h5").text() == "Objet"
    assert "L’objet" in response_node.css_first("div p").text()
    assert response_node.css("div h5")[-1].text() == "Réponse"
    assert "La réponse" in response_node.css("div p")[-1].text()
Code example #13
File: cgtn.py  Project: picaguo1997/ECS272-Winter2020
    def scrape(self):
        super().scrape()
        articles = []

        for i, URL in enumerate(self.links):
            try:
                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
                }
                req = urllib.request.Request(url=URL, headers=headers)
                r = urllib.request.urlopen(req)
            except Exception:  # request failed: skip this link
                print('Skipping:', URL)
                continue
            sll = HTMLParser(r.read())

            print(i + 1, '/', len(self.links), URL)

            headline = sll.css_first('.news-title')

            main_article = sll.css('.text.en > p')

            timestamp = sll.css_first('.date')

            try:
                headline = headline.text(deep=True, separator='').strip()
                timestamp = timestamp.text(deep=True, separator='').strip()
                timestamp = timestamp.encode('ascii', 'ignore').decode('utf-8')
            except Exception:  # headline or date node missing
                headline = ''
                timestamp = ''
                main_article = ''

            story = {}
            story['content'] = []
            story['headline'] = headline
            story['time-stamp'] = timestamp  # 2020-03-11 16:24:48
            story['url'] = URL
            story['journal'] = self.journal

            for paragraph in main_article:
                line = paragraph.text(deep=True, separator='')
                line = line.strip()
                line = line.encode('ascii', 'ignore').decode('utf-8')
                if line:
                    story['content'].append(line)

            articles.append(story)

        self.output = articles
Code example #14
File: weather.py  Project: AntonyMoes/cyberhelper
async def get_weather(location: str) -> str:
    async with aiohttp.request('GET', domain + weather_request + quote(location), headers={'User-Agent': user_agent}) as resp:
        search_text = await resp.text()
        title = HTMLParser(search_text).css_first('title').text()
        possible_href = str(resp.url)

    if title != 'Яндекс.Погода':
        # if we got rerouted to weather
        weather_text = search_text
        exact_location = ''
        for node in HTMLParser(weather_text).css('span.breadcrumbs__title'):
            exact_location += node.text() + ','
        exact_location = exact_location[:-1]
        href = possible_href
    else:
        # if we got location list as we expected
        node = HTMLParser(search_text).css_first('div.grid__cell')
        if node is None:
            return f'По запросу "{location}" ничего не найдено'
        node = node.css_first('li.place-list__item')
        node = node.css_first('a')
        href = domain + node.attributes['href']
        exact_location = node.text()
        async with aiohttp.request('GET', href, headers={'User-Agent': user_agent}) as resp:
            weather_text = await resp.text()

    # parsing weather
    card = HTMLParser(weather_text).css_first('div.content__main').css_first('div.content__row').css_first('div.card')
    temp_info = card.css_first('div.fact__temp-wrap').css_first('a')
    now_temp = temp_info.css_first('div.fact__temp').css_first('span.temp__value').text()
    now_condition = temp_info.css_first('div.fact__feelings').css_first('div.link__condition').text()
    wind_info = card.css_first('div.fact__props').css_first('dl.fact__wind-speed').css_first('dd.term__value')
    now_wind = wind_info.css_first('span.wind-speed').text() + ' ' + wind_info.css_first('span.fact__unit').text()

    day_info = HTMLParser(weather_text).css_first('div.forecast-briefly').css_first('div.swiper-wrapper')
    # print(day_info.html)
    slide = None
    for day in day_info.css('div.swiper-slide'):
        text: str = day.text()
        if text.find('Сегодня') != -1:
            slide = day.css_first('a')

    day_temp = slide.css_first('div.forecast-briefly__temp_day').css_first('span.temp__value').text()
    night_temp = slide.css_first('div.forecast-briefly__temp_night').css_first('span.temp__value').text()
    condition = slide.css_first('div.forecast-briefly__condition').text()

    return f'Место: {exact_location}' \
           f'\n\nСЕЙЧАС:\nТемпература: {now_temp}\nСостояние: {now_condition}\nВетер: {now_wind}' \
           f'\n\nСЕГОДНЯ:\nТемпература днем: {day_temp}\nТемпература ночью: {night_temp}\nСостояние: {condition}' \
           f'\n\nПолный прогноз: {href}'
Code example #15
async def google_it(query: str, how_many: int = 1) -> str:
    async with aiohttp.request('GET',
                               base_query + quote(query),
                               headers={'User-Agent': user_agent}) as resp:
        text = await resp.text()
        i = 0

        results = []
        search_result_node = HTMLParser(text).css_first('div[eid]')
        if search_result_node is None:
            return 'Ничего не нашел'

        nodes = search_result_node.css_first('div > div.srg').css_first(
            'div.srg').css('div.g')
        for node in nodes:
            node = node.css_first('div[data-ved]').css_first('div.rc')
            header_node = node.css_first('div.r').css_first('a')

            url = header_node.attributes['href']

            header_node = node.css_first('h3').css_first('div')

            title = header_node.text().strip()

            print(f'{i}: {title} {url}')
            results.append(f'Описание: {title}\nСсылка: {url}\n')
            i += 1

    if len(results) > 0:
        return '\n'.join(results[:how_many])
    else:
        return 'Ничего не нашел'
Code example #16
 async def check_update(self) -> Page:
     parser = HTMLParser(await self.session())  # selectolax takes just the HTML; no bs4-style parser name
     post = parser.css_first("div.trt").html
     return Page([
         SinglePost(md=(await self.pretty_text(post)).strip(),
                    title=self.title)
     ])
Code example #17
    def process(self, query):
        html = r.get(self.buildUrl(query.term))
        tree = HTMLParser(html.text)
        code_tag = tree.css_first('CODE')

        if not code_tag:
            return None

        code_text = code_tag.text()
        definition_tag = tree.css_first('def')
        definition = ''
        if definition_tag:
            definition = definition_tag.text()

        return self.processGet(query.term, query.properties, code_text,
                               definition, query)
Code example #18
def get_imdb_page(show):
    global requests

    logging.info("Scraping information for show: " + show)
    # We want to query imdb one time
    url = 'https://www.imdb.com/search/title?title=' + show + '&title_type=tv_series,tv_miniseries&sort=popularity'
    # Making a response and parsing it
    response = get(url, headers=headers)

    if response.status_code != 200:
        logging.warning('Received status code, ' + str(response.status_code))
        raise Exception("Received a non-200 status code!")

    parser = HTMLParser(response.text)

    # Update progress bar and wait
    requests += 1
    elapsed_time = time() - start_time
    os.system('clear')
    print('Request: {}; Frequency: {} requests/s'.format(
        requests, requests / elapsed_time))

    # We only care about the divs that have the movie name
    # imdb_page has the link to the tv show's imdb page
    if not parser.css(".lister-item-header a"):
        logging.warning('Did not find any results for: ' + show)
        raise Exception("Did not find a valid IMDb page")
    imdb_page = "https://www.imdb.com" + parser.css_first(
        ".lister-item-header a").attributes['href']

    return imdb_page
Code example #19
def test_relative_url():
    tree = HTMLParser('<html><a href="/test/relative">Testing</a></html>')
    href_node = tree.css_first("a")
    base = "https://www.google.com/tester1"

    ahref = Ahref(href_node, base)
    assert ahref.absolute_url == "https://www.google.com/test/relative"
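
The Ahref class under test is not shown in this excerpt. A minimal sketch consistent with the assertion, assuming urllib.parse.urljoin semantics for resolving relative links (the is_internal property anticipates the test in example #28):

from urllib.parse import urljoin, urlparse

class Ahref:
    """Hypothetical wrapper around a selectolax <a> node."""
    def __init__(self, node, base):
        self.href = node.attributes.get('href', '')
        self.base = base

    @property
    def absolute_url(self):
        return urljoin(self.base, self.href)

    @property
    def is_internal(self):
        # Internal means: same host as the base URL
        return urlparse(self.absolute_url).netloc == urlparse(self.base).netloc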
Code example #20
    def core_course_get_courses(self, username, password):
        page = HTMLParser(
            self.session.get("http://moodle.fbtuit.uz/my/").text
        )  # TODO: this could be made more efficient :/
        if page.css_first(
                'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
            if not self.core_auth_confirm_user(username, password):
                return []  # wrong password: return an empty list
            page = HTMLParser(
                self.session.get("http://moodle.fbtuit.uz/my/").text)
        "dasturlash 136 shaklida bo'ladi har bir list elementi"
        "kursnomi idsi"
        course_list = []
        '''
        Parsing is keyed on this fragment:
        <div class="media-body">
        <h4 class="h5"><a href="http://moodle.fbtuit.uz/course/view.php?id=173" class="">Course name</a></h4>
        </div>
        '''
        for node in page.css("div"):
            if 'class' in node.attributes:
                if node.attributes['class'] == 'media-body' and node.text(
                ) != '':
                    if node.text().strip(
                    ) + " " + node.css_first('a').attributes['href'][
                            node.css_first('a').attributes['href'].find("=") +
                            1:] in course_list:
                        break
                    course_list.append(
                        node.text().strip() + " " +
                        node.css_first('a').attributes['href']
                        [node.css_first('a').attributes['href'].find("=") +
                         1:])
        return course_list
Code example #21
 def core_course_get_files(self, username, password, course_id, section):
     page = HTMLParser(
         self.session.get("http://moodle.fbtuit.uz/course/view.php?id=" +
                          str(course_id)).text)
     if page.css_first(
             'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
         if not self.core_auth_confirm_user(username, password):
             return 0xff
         page = HTMLParser(
             self.session.get(
                 "http://moodle.fbtuit.uz/course/view.php?id=" +
                 str(course_id)).text)
      # Scan the <li> tags; the topics and files live in there
     for tag in page.tags('li'):
         if 'id' in tag.attributes:
             if tag.attributes['id'] == 'section-' + str(section):
                  page = HTMLParser(tag.html)  # reparse just this section
                 break
     links = []
     for tag in page.tags('a'):
         if not (tag.attributes['href'] in links):
             links.append(tag.attributes['href'])
      try:
          os.mkdir(os.getcwd() + "/temp")
      except FileExistsError:
          pass
     for link in links:
          if 'resource' not in link: continue
         resp = self.session.get(link, allow_redirects=True)
         file_name = resp.url[resp.url.rfind("/") + 1:]
         file_name = unquote(file_name)
         if 'view.php' in file_name: continue
         with open(os.getcwd() + "/temp/" + file_name, 'wb') as file:
             file.write(resp.content)
     return 0
Code example #22
 def core_course_get_grades(
     self, username, password
 ):  # (a course_id parameter could later show per-course grade detail)
     "Returns the grades from the grade overview report"
     "TODO: make the output a bit prettier :)"
     page = HTMLParser(
         self.session.get(
             "http://moodle.fbtuit.uz/grade/report/overview/index.php").text
     )
     if page.css_first(
             'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
         if not self.core_auth_confirm_user(username, password):
             return ""
         page = HTMLParser(
             self.session.get(
                 "http://moodle.fbtuit.uz/grade/report/overview/index.php").
             text)
     grades = "Kurs nomi|Baho\n"
     # Grades come from the <td> cells of the overview table
     counter = 0
     for node in page.css('td'):
         if node.text() == "": return grades
         grades += node.text()
         counter += 1
         if counter % 2: grades += "|"
         else: grades += "\n"
     return grades  # reached only if no empty cell terminates the table
Code example #23
    def SelectTitle(self):
        """
        Ask for CSS title selector
        """
        titleSelect = self.CONFIG.GetTitleCSS()
        if titleSelect == "":
            print(
                "Title Selector unspecified, please add a css selector for the game titles under SiteInfo > TitleSelector in the configuration file: {}"
                .format(self.CONFIG.filename))
            quit()

        #print("TODO: PREVIEW THIS SELECTOR")
        pageContents, last_status_code = "", -1
        while last_status_code != 200:
            target = self.links[0]
            try:
                last_status_code, pageContents = self.getContents(target)
            except requests.exceptions.ConnectionError as e:
                print("Unable to connect, retrying")
                continue
        titletree = HTMLParser(pageContents)
        titlepreview = titletree.css_first(titleSelect).text()
        print("Title Preview:")
        print(titlepreview)

        confirm = ""
        while confirm != "y" and confirm != "n":
            print("Does this game's title match? {} (y)es/(n)o".format(
                self.links[0]))
            confirm = input()
            if confirm == "n":
                print(
                    "Please reconfigure the CSS selector for the game titles under SiteInfo > TitleSelector in the configuration file: {}"
                    .format(self.CONFIG.filename))
                quit()
Code example #24
def test_node_comparison_fails():
    html = """<div id="test"></div>"""
    html_parser = HTMLParser(html)
    node = html_parser.css_first('#test')

    assert node != None
    assert node != 123
    assert node != object
Code example #25
def scrape_podcast(link):
	print('[*] Scraping', base_url + link)
	selectolax = HTMLParser(requests.get(base_url + link, headers=headers).content)
	dl_link = base_url + str(selectolax.css_first('a.btn.btn-default.subscribe-btn.btn-sm').attrs['href'])
	file_name = dl_link.split('/')[-1]
	print('[+] Downloading', file_name)
	with open(output_dir + file_name, 'wb') as file:
		file.write(requests.get(dl_link, headers=headers).content)
Code example #26
def html_guess_platform(url: str, doc: HTMLParser,
                        biblio: Optional[BiblioMetadata]) -> Optional[str]:

    generator: Optional[str] = None
    generator_elem = doc.css_first("meta[name='generator']")
    if generator_elem:
        generator = generator_elem.attrs["content"]
    else:
        generator_elem = doc.css_first("a[id='developedBy']")
        if generator_elem:
            generator = generator_elem.text()
    if generator and "open journal systems 3" in generator.lower():
        return "ojs3"
    elif generator and "open journal systems" in generator.lower():
        return "ojs"
    elif generator and "plone" in generator.lower():
        return "plone"
    elif generator and "wordpress" in generator.lower():
        return "wordpress"
    elif generator and "blogger" in generator.lower():
        return "blogger"
    elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
        return "ojs"
    else:
        try:
            if ('powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>'
                    in doc.html):
                return "ojs"
            if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
                return "arpha"
            if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html:
                return "galenos"
        except UnicodeDecodeError:
            pass

    icon_elem = doc.css_first("link[type='image/x-icon']")
    if icon_elem and "href" in icon_elem.attrs:
        if "journalssystem.com" in icon_elem.attrs["href"]:
            return "journalssystem.com"
        elif "indexcopernicus.com" in icon_elem.attrs["href"]:
            return "indexcopernicus"

    if "scielo" in url:
        return "scielo"

    return None
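
A usage sketch (the URL and HTML snippet are illustrative):

from selectolax.parser import HTMLParser

html = "<html><head><meta name='generator' content='WordPress 5.8'></head></html>"
doc = HTMLParser(html)
assert html_guess_platform("https://example.com/post", doc, None) == "wordpress"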
Code example #27
def test_node_insert_after():
    html_parser = HTMLParser(
        '<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>'
    )
    html_parser2 = HTMLParser('<div>Test</div>')
    img_node = html_parser.css_first('img')
    img_node.insert_after(html_parser2.body.child)
    assert html_parser.body.child.html == '<div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
Code example #28
def test_external_url():
    tree = HTMLParser(
        '<html><a href="https://www.getevents.nl/tester">Getevents</a></html>'
    )
    href_node = tree.css_first("a")
    base = "https://www.google.com/getevents"

    ahref = Ahref(href_node, base)
    assert not ahref.is_internal
Code example #29
File: parse_wsb.py  Project: molson77/Kive
def get_html_title(path):
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        html = f.read()
        tree = HTMLParser(html)
        node = tree.css_first('title')
        if node:
            text = node.text(deep=False)
            return text if text else None  # Return title text if title tag exists
        else:
            return None  # Else, return None
Code example #30
File: test_nodes.py  Project: pushshift/selectolax
def test_attrs_test_dict_features():
    html_parser = HTMLParser('<div id="id" v data-id="foo"></div>')
    node = html_parser.css_first('div')
    node.attrs['new_att'] = 'new'
    assert list(node.attrs.keys()) == ['id', 'v', 'data-id', 'new_att']
    assert list(node.attrs.values()) == ['id', None, 'foo', 'new']
    assert len(node.attrs) == 4
    assert node.attrs.get('unknown_field', 'default_value') == 'default_value'
    assert 'id' in node.attrs
    assert 'vid' not in node.attrs