def parsePage(html):
    
    # Dictionary to store info
    athInfo = {}
    
    #Now start populating our data object
    athInfo['AthleteName'] = html.cssselect("h2")[0].text
    athInfo['DivisionRank'] = html.cssselect("#rank *")[0].tail.strip()
    athInfo['OverallRank'] = html.cssselect("#div-rank *")[0].tail.strip()    

    infoFields = ['Bib', 'Division', 'Age', 'State', 'Country', 'Profession']
    detailsFields = ['TotalSwim', 'TotalBike', 'TotalRun', 'TotalTime']
    
    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text
    
    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    #have to use xpath to get T1 and T2 data
    athInfo['T1'] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo['T2'] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo['HasResults'] = 1
    athInfo['Scraped'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraperwiki.sqlite.save(unique_keys=['Bib'], data=athInfo, table_name="RESULTS", verbose=0)
Example #2
def signIn(username, password):
    raw = requests.get(PATHS['login'])
    session_left_slice = raw.headers['set-cookie'].find('=') + 1
    session_right_slice = raw.headers['set-cookie'].find(';')
    session_id = raw.headers['set-cookie'][
        session_left_slice:session_right_slice]
    html = lxml.html.fromstring(raw.text)
    db_viewstate = html.cssselect("input#__DATABASE_VIEWSTATE")[0].value
    print db_viewstate
    ev_validation = html.cssselect("input#__EVENTVALIDATION")[0].value
    # Create the form payload
    username_key = 'txtUserID'
    password_key = 'txtPassword'
    login_button = 'btnSubmit'
    form_payload = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': db_viewstate,
        '__EVENTVALIDATION': ev_validation,
        username_key: username,
        password_key: password,
    }
    session = requests.session()
    session.post(PATHS['login'], data=form_payload)
    return session
Example #3
def get_metadata(url):
    resp = requests.head(url, headers=headers, timeout=5)
    resp.raise_for_status()

    if 'text/html' not in resp.headers.get('Content-Type', ''):
        return {'url': url}

    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()

    html = lxml.html.fromstring(resp.content.decode('utf8'))
    tags = html.cssselect('meta[property], meta[name]')

    meta = {}
    for tag in tags:
        prop = tag.attrib.get('property', tag.attrib.get('name'))
        data = tag.attrib.get('content')
        if data is not None:
            meta[prop] = data

    can = html.cssselect('link[rel="canonical"]')
    if can:
        meta['canonical'] = can[0].attrib['href']

    # Canonical data
    meta['url'] = _get(meta, 'canonical', 'og:url', default=url)
    meta['description'] = _get(meta, 'description', 'og:description',
                               'twitter:description')
    meta['title'] = _get(meta, 'og:title', 'twitter:title', url)

    return meta
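
The get_metadata() function above depends on a helper _get() that is not shown in the snippet. A minimal sketch, assuming it simply returns the value of the first key present in the meta dict and otherwise falls back to a default (the name comes from the example, but this implementation is a guess):

def _get(meta, *keys, default=None):
    # Return the value of the first key that is present and non-empty in meta,
    # otherwise the supplied default.
    for key in keys:
        value = meta.get(key)
        if value:
            return value
    return default

Note that the last call site passes url positionally, so under this sketch it would be looked up as one more key rather than used as the default.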
Example #4
 def scrape_restaurant_data(self, example):
     # get this from yelp
     
     html = obtain_html(example["url"])
     
     html.make_links_absolute(example["url"])
     
     title = html.cssselect("h1.biz-page-title")[0].text.strip()
     
     review_highlights = html.cssselect("ul.review-highlights-list")
     if len(review_highlights) > 0:
         description = tree_to_str(clean_up_highlights(review_highlights[0]))
     else:
         description = create_description_highlights(html)
     
     images = html.cssselect("img.photo-box-img")
     image_url = None
     if len(images) > 0:
         image_url = images[0].attrib["src"]

     return {
         "title": title,
         "description": description,
         "categories": example["categories"],
         "image_url": image_url,
         "rating": rating_to_string(example["rating"]),
         "price": example["price"],
     }
Example #5
def parsePage(html):

    # Dictionary to store info
    athInfo = {}

    #Now start populating our data object
    athInfo['AthleteName'] = html.cssselect("h2")[0].text
    athInfo['DivisionRank'] = html.cssselect("#rank *")[0].tail.strip()
    athInfo['OverallRank'] = html.cssselect("#div-rank *")[0].tail.strip()

    infoFields = ['Bib', 'Division', 'Age', 'State', 'Country', 'Profession']
    detailsFields = ['TotalSwim', 'TotalBike', 'TotalRun', 'TotalTime']

    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text

    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    #have to use xpath to get T1 and T2 data
    athInfo['T1'] = html.xpath(
        "//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo['T2'] = html.xpath(
        "//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo['HasResults'] = 1
    athInfo['Scraped'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraperwiki.sqlite.save(unique_keys=['Bib'],
                            data=athInfo,
                            table_name="RESULTS",
                            verbose=0)
Example #6
def scrape_daft_page(url):
    info = {'url': url}

    http_client = CachedHttpClient()
    data = http_client.get(url)
    html = lxml.html.fromstring(data)

    content_tag = html.xpath('//div[@id="content"]')[0]

    title_tag = content_tag.cssselect('.smi-info h1')[0]
    info['title'] = title_tag.text

    image_tag = html.cssselect('#smi-gallery-img-main img')[0]
    image_src = image_tag.attrib['src']
    if image_src.startswith('//'):
        image_src = 'https:' + image_src
    info['image'] = image_src

    price_tag = html.cssselect('#smi-price-string')[0]
    info['price'] = price_tag.text

    header_text = html.cssselect('#smi-summary-items .header_text')
    hdrtext = [t.text for t in header_text]

    info['beds'] = hdrtext[1]
    info['baths'] = hdrtext[2]

    info['description'] = '\n\n'.join(
        elem.text_content()
        for elem in html.cssselect('#smi-tab-overview .description_block'))
    # info['description'] = content_tag.cssselect('.overview')[0].text

    return info
Example #7
    def test_render(self):
        # Rough-and-ready button that displays "Accueil -> [ Jardins, Variétés ]"
        column = DropDownLinkColumn(links=[
            Link(text=u'Main button',
                 viewname='s5appadherant:accueil'),
            Link(text=u'Jardins',
                 viewname='s5appadherant:jardin_all',
                 args=()),
            Link(text=u'Variété',
                 viewname='s5appadherant:variete_list')
        ])

        output = column.render(G(Adherant))
        html = lxml.html.fromstring(output)

        elements = html.cssselect(".btn-group > a")
        self.assertEqual(1, len(elements))
        self.assertEqual(u'Main button', elements[0].text)
        self.assertEqual(reverse('s5appadherant:accueil'), elements[0].attrib['href'])

        elements = html.cssselect("button.dropdown-toggle")
        self.assertEqual(1, len(elements))
        self.assertEqual('dropdown', elements[0].attrib['data-toggle'])

        elements = html.cssselect("ul.dropdown-menu li a")
        self.assertEqual(2, len(elements))
        self.assertEqual(u'Jardins', elements[0].text)
        self.assertEqual(reverse('s5appadherant:jardin_all'), elements[0].attrib['href'])
        self.assertEqual(u'Variété', elements[1].text)
        self.assertEqual(reverse('s5appadherant:variete_list'), elements[1].attrib['href'])
Example #8
def get_citation(title):
    """Given a paper title, attempts to get citation
    strings for that paper from Google Scholar."""
    # Search for the paper by title
    resp = requests.get(BASE_URL, params={'q': title})
    html = lxml.html.fromstring(resp.content)
    result_els = html.cssselect('.gs_r')
    if not result_els:
        return None

    # Only consider the first match
    result_el = result_els[0]

    # result_title = result_el.cssselect('.gs_rt a')[0].text

    # Request the citations
    result_id = result_el.attrib['data-cid']
    resp = requests.get(BASE_URL,
                        params={
                            'q':
                            'info:{}:scholar.google.com/'.format(result_id),
                            'output': 'cite'
                        })
    html = lxml.html.fromstring(resp.content)
    citations = {}
    for format_el, citation_el in zip(html.cssselect('th'),
                                      html.cssselect('td .gs_citr')):
        format = format_el.text
        citation = citation_el.text_content()
        citations[format] = citation
    return citations
Example #9
def sign_in(u, p):

    session = requests.session()
    raw = session.get(LOGIN_URL)

    html = lxml.html.fromstring(raw.text)

    viewstate = html.cssselect("input#__VIEWSTATE")[0].value
    viewstate_generator = html.cssselect("input#__VIEWSTATEGENERATOR")[0].value

    payload = {
        "ctl00$cphMain$Logon1$_resolution" : "1440x900",
        "ctl00$cphMain$Logon1$_email" : u,
        "ctl00$cphMain$Logon1$_password": p,
        "ctl00$cphMain$Logon1$_login": "******",
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstate_generator,
    }

    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
    }

    r = session.post(LOGIN_URL, data=payload, headers=headers)

    assert "unread messages" in r.content

    return session
Example #10
def parsePage(html):
    
    # Dictionary to store info
    athInfo = {}
    
    #Now start populating our data object
    athInfo['ATHLETE_NAME'] = html.cssselect("h2")[0].text
    athInfo['DIVISION_RANK'] = html.cssselect("#rank *")[0].tail.strip()
    athInfo['OVERALL_RANK'] = html.cssselect("#div-rank *")[0].tail.strip()    

    #infoFields = ['BIB', 'DIVISION', 'AGE', 'STATE', 'COUNTRY', 'PROFESSION']
    infoFields = ['BIB', 'DIVISION', 'STATE', 'COUNTRY', 'PROFESSION']
    detailsFields = ['TOTAL_SWIM', 'TOTAL_BIKE', 'TOTAL_RUN', 'TOTAL_TIME']
    
    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text
    
    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    #have to use xpath to get T1 and T2 data
    athInfo['T1'] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo['T2'] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo['HAS_RESULTS'] = 1
    athInfo['SCRAPED'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraperwiki.sqlite.save(unique_keys=['BIB'], data=athInfo, table_name="RESULTS", verbose=0)
Example #11
    def test_templates_course_detail_one_open_course_run(self):
        """
        For a course with one open course run, the course run should be in the header
        and the side column should display an indication that there is no other course run.
        """
        course = CourseFactory()
        page = course.extended_object

        # Create an open course run
        now = timezone.now()
        CourseRunFactory(
            direct_course=course,
            start=now + timedelta(hours=1),
            enrollment_start=now - timedelta(hours=1),
            enrollment_end=now + timedelta(hours=1),
        )

        self.assertTrue(page.publish("fr"))

        url = page.get_absolute_url()
        response = self.client.get(url)
        self.assertEqual(response.status_code, 200)

        html = lxml.html.fromstring(response.content)

        # Check syllabus intro
        header = str(etree.tostring(html.cssselect(".subheader__intro")[0]))
        self.assertEqual(header.count("course-detail__run-descriptions"), 1)
        self.assertIn("S’inscrire maintenant", header)

        # Check syllabus aside column
        aside = str(etree.tostring(html.cssselect(".course-detail__aside")[0]))
        self.assertNotIn("course-detail__run-descriptions", aside)
        self.assertNotIn("S’inscrire maintenant", aside)
        self.assertIn("Aucune autre session ouverte", aside)
Example #12
def get_agent_by_html(html):
    # print tostring(html)
    try:
        name = html.cssselect('a')[0].get('title').encode(
            'ascii', 'ignore').decode('ascii')
        agent_url = html.cssselect('a')[0].get('href')
        estate_name = html.cssselect('a')
        estate_name = estate_name[len(estate_name) - 1].get('title')
        try:
            reg_number = re.search(
                re.escape(r'CEA Registration Number :') + '\s(.{8})',
                tostring(html), re.I).group(1)
        except AttributeError:
            reg_number = None
        try:
            lic_number = re.search(
                re.escape(r'Agency Licence Number :') + '\s(.{9})',
                tostring(html), re.I).group(1)
        except AttributeError:
            lic_number = None
        phone_number = html.cssselect('span a')
        if phone_number:
            phone_number = get_phone_number(phone_number[0])
        agent = AgentIProperty(name=name,
                               phone_number=phone_number,
                               estate_name=estate_name,
                               reg_number=reg_number,
                               lic_number=lic_number,
                               url=agent_url)
        # print agent
        return agent
    except IndexError:
        return None
Example #13
    def test_templates_course_detail_two_open_course_runs(self):
        """
        For a course with two open course runs, the course run starting next should be in the
        header and the other course run should be in the side column.
        """
        course = CourseFactory()
        page = course.extended_object
        url = page.get_absolute_url()

        # Create 2 open course runs
        now = timezone.now()
        start1, start2 = random.sample(
            [now + timedelta(days=1), now + timedelta(days=2)], 2
        )
        CourseRunFactory(
            direct_course=course,
            start=start1,
            enrollment_start=now - timedelta(hours=1),
            enrollment_end=now + timedelta(hours=1),
        )
        CourseRunFactory(
            direct_course=course,
            start=start2,
            enrollment_start=now - timedelta(hours=1),
            enrollment_end=now + timedelta(hours=1),
        )

        self.assertTrue(page.publish("fr"))
        response = self.client.get(url)
        self.assertEqual(response.status_code, 200)

        html = lxml.html.fromstring(response.content)

        # Check syllabus intro
        header = str(
            etree.tostring(
                html.cssselect(".subheader__intro")[0],
                encoding="iso8859-1",
                method="html",
            ).decode("utf-8")
        )
        self.assertEqual(header.count("course-detail__runs--open"), 1)
        self.assertIn("S’inscrire maintenant", header)
        date_string = formats.date_format(min(start1, start2))
        with translation.override("fr"):
            self.assertIn(f"Du {date_string}", header)

        # Check syllabus aside column
        aside = str(
            etree.tostring(
                html.cssselect(".course-detail__aside")[0],
                encoding="iso8859-1",
                method="html",
            ).decode("utf-8")
        )
        self.assertEqual(aside.count("course-detail__runs--open"), 1)
        self.assertIn("S’inscrire maintenant", aside)
        date_string = formats.date_format(max(start1, start2))
        with translation.override("fr"):
            self.assertIn(f"Du {date_string}", aside)
Example #14
def current_usage_info(session):
    response = session.get(current_usage_url)
    html = lxml.html.fromstring(response.text)

    def convert(text):
        m = re.search(r'(\d+(?:\.\d+)?) GB', text)
        if m:
            return float(m.group(1))

        m = re.search(r'(\d+(?:\.\d+)?) MB', text)
        if m:
            return float(m.group(1)) / 1024.0

    tds = html.cssselect('#usageInformation')[0].xpath('.//td')
    info = {
        'download_usage':
        convert(_condense_whitespace(tds[2].text_content())),
        'upload_usage':
        convert(_condense_whitespace(tds[4].text_content())),
        'total_usage':
        convert(_condense_whitespace(tds[6].text_content())),
        'allowance':
        convert(_condense_whitespace(tds[8].text_content())),
        'billing_period':
        re.sub(
            r'Details for ?', '',
            _condense_whitespace(
                html.cssselect('#currentBillingPeriod')[0].text_content())),
    }
    info['left'] = info['allowance'] - info['total_usage']

    return info
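
current_usage_info() above also relies on a _condense_whitespace() helper that is not part of the snippet. A plausible minimal version, assuming it only collapses runs of whitespace (an assumption, not the original implementation):

import re

def _condense_whitespace(text):
    # Collapse any run of whitespace (spaces, tabs, newlines) into a single
    # space and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()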
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--username', help='Username')
    parser.add_argument('-p', '--password', help='Password')
    args = parser.parse_args()



    with requests.Session() as s:
        homepage = s.get('http://trithuc.vinacontrol.com.vn/')
        html = lxml.html.fromstring(homepage.text)

        payloads = {'name': args.username,
                    'pass': args.password,
                    'form_build_id': html.cssselect('input[name=form_build_id]')[0].attrib['value'],
                    'form_id': html.cssselect('input[name=form_id]')[0].attrib['value'],
                    'op': 'Đăng nhập'
                   }
        s.post('http://trithuc.vinacontrol.com.vn/node', data=payloads)
        res = s.get('http://trithuc.vinacontrol.com.vn/ds-cauhoi?field_quiz_phanloai_tid[0]=438&items_per_page=All')
        html = lxml.html.fromstring(res.text)
    table = html.cssselect('table[data-view-name=ds_cauhoi]')[0]
    columns = ['STT', 'Phân loại', 'Câu hỏi', 'Trả lời']
    df = pd.DataFrame(columns=columns)

    for row in table.cssselect('tbody>tr'):
        df = df.append(pd.DataFrame([parse_row(row)], columns=columns))

    df.to_excel('test.xls')
Example #16
def parsePage(html):

    # Dictionary to store info
    athInfo = {}

    # Now start populating our data object
    athInfo["ATHLETE_NAME"] = html.cssselect("h2")[0].text
    athInfo["DIVISION_RANK"] = html.cssselect("#rank *")[0].tail.strip()
    athInfo["OVERALL_RANK"] = html.cssselect("#div-rank *")[0].tail.strip()

    # infoFields = ['BIB', 'DIVISION', 'AGE', 'STATE', 'COUNTRY', 'PROFESSION']
    infoFields = ["BIB", "DIVISION", "STATE", "COUNTRY", "PROFESSION"]
    detailsFields = ["TOTAL_SWIM", "TOTAL_BIKE", "TOTAL_RUN", "TOTAL_TIME"]

    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text

    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    # have to use xpath to get T1 and T2 data
    athInfo["T1"] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo["T2"] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo["HAS_RESULTS"] = 1
    athInfo["SCRAPED"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraperwiki.sqlite.save(unique_keys=["BIB"], data=athInfo, table_name="RESULTS", verbose=0)
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--username', help='Username')
    parser.add_argument('-p', '--password', help='Password')
    args = parser.parse_args()

    with requests.Session() as s:
        homepage = s.get('http://trithuc.vinacontrol.com.vn/')
        html = lxml.html.fromstring(homepage.text)

        payloads = {
            'name':
            args.username,
            'pass':
            args.password,
            'form_build_id':
            html.cssselect('input[name=form_build_id]')[0].attrib['value'],
            'form_id':
            html.cssselect('input[name=form_id]')[0].attrib['value'],
            'op':
            'Đăng nhập'
        }
        s.post('http://trithuc.vinacontrol.com.vn/node', data=payloads)
        res = s.get(
            'http://trithuc.vinacontrol.com.vn/ds-cauhoi?field_quiz_phanloai_tid[0]=438&items_per_page=All'
        )
        html = lxml.html.fromstring(res.text)
    table = html.cssselect('table[data-view-name=ds_cauhoi]')[0]
    columns = ['STT', 'Phân loại', 'Câu hỏi', 'Trả lời']
    df = pd.DataFrame(columns=columns)

    for row in table.cssselect('tbody>tr'):
        df = df.append(pd.DataFrame([parse_row(row)], columns=columns))

    df.to_excel('test.xls')
Example #18
def _result_type(html):

    if html.cssselect(".tinfodownloadbutton a"):
        return "info"
    elif html.cssselect(".tlistdownload a"):
        return "list"
    else:
        return "empty"
Example #19
 def scrape_comment(self, html, parent):
     c = HTMLDocument(
         text = html.cssselect("div.text-holder"),
         headline = html.cssselect("a.commentTitle")[0].text_content().strip(),
         section = parent.props.section,
         date = readDate(" ".join([t.text for t in html.cssselect("ul.meta li.createdate, li.createtime")])),
         author = html.cssselect("ul.meta li.by")[0].text.strip().lstrip("By").strip(),
         url = parent.props.url + "#{}".format(html.cssselect("a.commentTitle")[0].get('id')))
     c.props._parent = "{p.props.headline}, {p.props.date}".format(p = parent)
     return c
Example #20
def nextPage(html,base_url=''):
    # logger.info('have many page')
    car_body = lxml.html.tostring(html.cssselect('.text')[-1])
    while len(html.cssselect('.next')) > 0 and len(html.cssselect('.nextBtn')) == 0:
        nextpage = requests.get(base_url + html.cssselect('.next')[0].get('href'))
        nexthtml = lxml.html.fromstring(nextpage.content)
        body = lxml.html.tostring(nexthtml.cssselect('.text')[-1])
        car_body += body
        html = nexthtml
    return car_body
Example #21
def scrape_detail_page(response: requests.Response) -> dict:
    html = lxml.html.fromstring(response.text)
    ebook = {
        'url': response.url,
        'title': html.cssselect('#bookTitle')[0].text_content(),
        'price': html.cssselect('.buy')[0].text.strip(),
        'content':
        [h3.text_content() for h3 in html.cssselect('#content > h3')]
    }
    return ebook
Example #22
def scrape_detail_page(response):
    html = lxml.html.fromstring(response.text)
    ebook = {
        'url': response.url,
        'key': extract_key(response.url),
        'title': html.cssselect('#bookTitle')[0].text_content(),
        'price': html.cssselect('.buy')[0].text.strip(),
        'content': [normalize_spaces(h3.text_content()) for h3 in html.cssselect('#content>h3')],
    }
    return ebook
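
This variant calls normalize_spaces() and extract_key(), helpers that are not shown here. A minimal stand-in for normalize_spaces(), assuming it just collapses consecutive whitespace from text_content() into single spaces:

import re

def normalize_spaces(s):
    # Replace runs of whitespace, including newlines, with a single space.
    return re.sub(r'\s+', ' ', s).strip()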
Example #23
def download_councillors():
    with open(WEBPAGESTXT, 'r') as txtfile:
        urls = txtfile.readlines()
    urls = [url.strip() for url in urls]

    session = http.client.HTTPSConnection('www.berlin.de', timeout=10)
    councillors = {}
    for url in urls:
        if councillors:
            time.sleep(2)

        bezirk = bezirk_from_url(url)

        headers = {'Accept-Encoding': 'gzip', 'Connection': 'keep-alive'}
        session.request('GET', url, headers=headers)
        response = session.getresponse()

        response = response.read()
        response = zlib.decompress(response, 47)

        try:
            response = response.decode('latin-1', 'strict')
        except UnicodeDecodeError:
            response = response.decode('windows-1252', 'strict')

        html = lxml.html.fromstring(response)
        html.make_links_absolute(url)

        tablerows = html.cssselect('.zl12')
        tablerows += html.cssselect('.zl11')

        number = html.cssselect('table.tk1:nth-child(8)')[0]
        number = number.text_content()
        _, number = number.split(':')
        number = number.strip()
        if number.isdigit():
            number = int(number)
            if not number == len(tablerows):
                print('%s:' % bezirk,
                      '%s councillors were found.' % len(tablerows),
                      'Should be %s councillors.' % number)

        for row in tablerows:
            councillor = extract_councillor(row)
            councillor['BEZIRK'] = bezirk
            identifier = normalized_name(councillor['ANZEIGENAME'])
            try:
                councillors[bezirk][identifier] = councillor
            except KeyError:
                councillors[bezirk] = {identifier: councillor}
    session.close()
    return councillors
Example #24
def parse(page_text):
    urls = []
    html = get_html(page_text)
    url = html.cssselect('link[rel = "canonical"]')[0].get('href')
    print("URL:", url)
    breadcrumbs = html.cssselect('div.breadcrumbs>span.almost_bold')[0].text
    if (breadcrumbs == "History"):
        pagination = int(html.cssselect('div.rating_pagination.pagination>span')[0].text)
        if(pagination == 1):
            rating_pagination = html.cssselect("div.rating_pagination.pagination")[0]
            a_list = rating_pagination.cssselect('div.rating_pagination.pagination>a')
            for a in a_list:
                urls.append(main_type + a.get("href"))
        href_list = []
        a_list = html.cssselect('table.rating.responsive>tr:not([class])>td[style="text-align:left"]>a')
        for a in a_list:
            href_list.append(main_type + a.get("href"))

        href_list = list(set(href_list))
        for href in href_list:
            urls.append(href)
    else:
        type = html.cssselect('div.breadcrumbs>span[itemprop = "itemListElement"]')[1].cssselect('a')[0].get('title')
        group_list = html.cssselect('div.tbt2.row_heading>div>h2')[1:]
        title = html.cssselect('meta[property = "og:title"]')[0].get("content")
        # print("Название", title)
        id_device = add_device(title, type, url)
        # print("Идентификатор продукта", id_device)
        tables = html.cssselect('div.tbt1.single>div.table')
        # print(len(tables))
        for index, table in enumerate(tables):
            group = group_list[index].text
            tbts = table.cssselect('div.tbt5')
            for tbt in tbts:
                divs = tbt.cssselect('div')
                one_block = divs[1].text
                # Break out of malformed tables
                if(one_block == None):
                    # add = False
                    break
                two_block = divs[2].text
                if(two_block == None):
                    try:
                        two_block = divs[2].cssselect('span')[0].text
                    except:
                        two_block = divs[2].cssselect('a')[0].text
                if(two_block == "+"):
                    two_block = 1
                else:
                    if(two_block == "-"):
                        two_block = 0
                # print("Группа:", group)
                id_device_variables = add_device_variables(one_block, group)
                # print("Идентификатор VARIABLE:", id_device_variables)
                id_device_value = add_device_value(id_device, id_device_variables, two_block)

    return urls
Example #25
    def test_render(self):
        column = self.get_column()

        output = column.render(G(Adherant))
        html = lxml.html.fromstring(output)

        elements = html.cssselect('a[data-toggle="modal"]')
        self.assertEqual(1, len(elements))
        self.assertEqual(u"Accueil", elements[0].text)

        modal_id = elements[0].attrib['data-target']
        modal = html.cssselect(u'[id="%s"]' % modal_id[1:])
        self.assertEqual(1, len(modal))
Example #26
def upload_course(session, title="Lorem Ipsum", filename="test.jpg", filedata="", filetype="image/png"):
    response = session.get("https://www.iscp.ac.uk/evidence/course.aspx")

    html = lxml.html.fromstring(response.text)

    viewstate = html.cssselect("input#__VIEWSTATE")[0].value
    viewstate_generator = html.cssselect("input#__VIEWSTATEGENERATOR")[0].value
    viewstate_encrypted = html.cssselect("input#__VIEWSTATEENCRYPTED")[0].value
    event_validation = html.cssselect("input#__EVENTVALIDATION")[0].value

    payload = {
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__EVENTVALIDATION": event_validation,
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstate_generator,
        "__VIEWSTATEENCRYPTED": viewstate_encrypted,
        "ctl00$cphMain$txtDate": "01/01/1970",
        "ctl00$cphMain$txtEndDate": "",
        "ctl00$cphMain$drpTitles": 6,  # Other
        "ctl00$cphMain$txtOtherTitle": title,
        "ctl00$cphMain$drpTypes": 0,
        "ctl00$cphMain$txtOtherType": "",
        "ctl00$cphMain$txtAwardingBody": "",
        "ctl00$cphMain$txtFeedback": "",
        "ctl00$cphMain$txtLearn": "",
        "ctl00$cphMain$txtImprove": "",
        "ctl00$cphMain$txtActionPlan": "",
        "ctl00$cphMain$topicChooser1$hidScrollTop": "",
        "ctl00$cphMain$topicChooser1$hidTpcExpanded": "True",
        "ctl00$cphMain$topicChooser1$hidSelectedTopics": "",
        "ctl00$cphMain$topicChooser1$hdnPopUpShowing": "",
        "ctl00$cphMain$topicChooser1$hidTab": "",
        "ctl00$cphMain$btnInsert": "Save Course/seminar",
        "ctl00$TraineeReport1$download_token_value_id": "17/05/2015 12:20:46",
        "ctl00$TraineeReport1$txtStartDate": "16/05/2014",
        "ctl00$TraineeReport1$txtEndDate": "16/05/2015",
        "ctl00$txtFeedbackComments": "",
    }

    files = {
        "ctl00$cphMain$fupControl1": (
            filename,
            filedata,
            filetype,  # FIXME
        ),
    }

    r = session.post("https://www.iscp.ac.uk/evidence/course.aspx", data=payload, files=files)

    pprint.pprint(r.text)
Example #27
def theme(data):
    html = lxml.html.fromstring(data)  # equivalent to the etree.HTML function
    data_theme = ""
    a = html.cssselect('#tab_sdyj > tfoot > tr')
    for i in range(len(a)):
        if (len(
                html.cssselect('#tab_sdyj > tfoot > tr:nth-child(' +
                               str(i + 1) + ') > td:nth-child(2)')) !=
                0) and (len(
                    html.cssselect('#tab_sdyj > tfoot > tr:nth-child(' +
                                   str(i + 1) + ') > td:nth-child(6)')) != 0):
            data_theme += html.cssselect('#tab_sdyj > tfoot > tr:nth-child(' +
                                         str(i + 1) + ')')[0].text_content()
    return (data_theme)
Example #28
	def download(self,link):
		parent_url='http://link.springer.com'
		source = requests.get(link).content
		html = lxml.html.fromstring(source)
		book_title= html.cssselect('h1#title')[0].text_content() 
		chapter=01
		for i in html.cssselect('li.toc-item'):
			url=urljoin(parent_url,i.cssselect('div.actions')[0].cssselect('span.action')[0].cssselect('a')[0].get('href'))
			pdf=requests.get(url).content
			f = open(book_title+str(chapter)+'.pdf', 'wb+')
			f.write(pdf)
			chapter+=1
			print url
		Download_Book().concatenate_pdf(book_title)
Example #29
    def test_cms_plugins_program_fallback_when_never_published(self):
        """
        The program plugin should render in the fallback language when the program
        page has never been published in the current language.
        """
        # Create a program
        program = ProgramFactory(
            page_title={
                "en": "public program",
                "fr": "programme publique"
            },
            fill_cover={
                "original_filename": "cover.jpg",
                "default_alt_text": "my cover",
            },
        )
        program_page = program.extended_object

        # Create a page to add the plugin to
        page = create_i18n_page({"en": "A page", "fr": "Une page"})
        placeholder = page.placeholders.get(slot="maincontent")
        add_plugin(placeholder, ProgramPlugin, "en", **{"page": program_page})
        add_plugin(placeholder, ProgramPlugin, "fr", **{"page": program_page})

        # Publish only the French version of the program
        program_page.publish("fr")

        # Check the page content in English
        page.publish("en")
        url = page.get_absolute_url(language="en")
        response = self.client.get(url)

        html = lxml.html.fromstring(response.content)

        # The program's full name should be wrapped in a link within an h2
        title = html.cssselect(".program-glimpse__title")[0]
        link = title.cssselect(".program-glimpse__link")[0]
        self.assertEqual(link.text_content().strip(), "programme publique")
        self.assertNotContains(response, "public program")

        # Program's cover should be present
        cover = html.cssselect(".program-glimpse__media")[0]
        self.assertEqual(cover.get("aria-hidden"), "true")
        img = cover.cssselect("img")[0]
        self.assertIsNotNone(
            re.search(
                r"/media/filer_public_thumbnails/filer_public/.*cover\.jpg__300x170",
                img.get("src"),
            ))
Example #30
def getpage(url, f, count):
  html = geturl(url)
  html = lxml.html.fromstring(html)
  title = html.cssselect('h2')[0].text
  
  print('Page %d (%s) downloaded' % (count, title))
  patt = '(%\w+)+'
  f.write(title + '\n\n')
  content = html.cssselect('script')[7].text
  content = content[content.find('unescape')+10 : content.find('\"))')]
  #print(content)
  content = content.replace('%3Cbr%2F%3E%3Cbr%2F%3E','\n')
  content = content.replace('%','\\').encode()
  content = content.decode('unicode_escape')
  f.write(content + '\n')
Example #31
def scrape_list_page(response: requests.Response) -> Iterator[str]:

    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        yield url
Example #32
def current_month_daily_breakdown(session):
    summary_markers = [
        '',
        'Total Usage (GB)',
        'Usage Allowance',
        'Additional Use',
        'Usage',
    ]

    response = session.get(day_to_day_url)
    html = lxml.html.fromstring(response.text)
    rows = html.cssselect(".internetUsageDataContainer .mainSection tr")
    data = []
    summary = []

    for row in rows:
        row = list(map(_format_cell_data, row.cssselect('td')))
        if row[0] in summary_markers:
            cleaned = [ td for td in row if td ]
            if len(cleaned):
                summary.append(cleaned)
        else:
            data.append(row)

    return DailyBreakdown(data, summary)
Example #33
    def get_submission_list(self, problem_name):
        self.check_problem_exist(problem_name)
        request = urllib2.Request(url=SITE_PREFIX+'judge/submission/recent/?problem='+problem_name)
        response = self.opener.open(request)

        try:
            import lxml.html
        except ImportError:
            print 'lxml library is needed for parsing HTML'
            return

        html = lxml.html.fromstring(unicode(response.read().decode('utf8')))
        context = {}
        fields = ('id', 'problem', 'user', 'language', 'length', 'state', 'stats', 'submitted_on')
        length = {'id': 9, 'problem': 15, 'user': 15, 'language': 5, 'length': 7, 'state': 15, 'stats': 7, 'submitted_on': 15}
        template = u'%(id)s %(problem)s %(user)s %(language)s %(length)s %(state)s %(stats)s %(submitted_on)s'

        def width(string):
            return sum(1+(unicodedata.east_asian_width(c) in 'WF') for c in string)

        for tr in html.cssselect('table.submission_list tr'):
            for field in fields:
                element = tr.find_class(field)
                if element:
                    context[field] = unicode(element[0].text_content().strip())
                else:
                    context[field] = u''
                context[field] = ' ' * (length[field] - width(context[field])) + context[field]
            print template % context
Example #34
def search(topic):
    url = 'https://lobste.rs/search?what=stories&order=newest&q=' + topic
    response = requests.get(url)
    html = lxml.html.fromstring(response.text)
    for item in html.cssselect('.link a'):
        text = item.text_content()
        print(text)
Example #35
def download():
    i = 0
    aux = 0
    for i in range(int(MAX_BOOKS)):
        try:
            if not os.path.isfile(get_url_from_pne(img_temp, 'DBM_', i, 'jpg')) and not os.path.isfile(get_url_from_pne(img_temp, 'DBM_', i, 'png')):
                link = 'http://www.dragonball-multiverse.com/es/page-'+str(i)+'.html'
                source = requests.get(link).content
                html = lxml.html.fromstring(source)
                book_title = html.cssselect('div')[7].cssselect('img')[0].get('src')
                url=urljoin(parent_url,book_title)
                img_format = book_title[-3:] # file ext catched (.jpg or .png)
                img_file = open(get_url_from_pne(img_temp, 'DBM_', i, img_format),'wb') 
                img_file.write(requests.get(url).content)
                img_file.close()
                time.sleep(0.5)
                print("Downloaded in: "+get_url_from_pne(img_temp, 'DBM_', i, img_format))
            else:
                print("Skipping book number %s\r" %i)


        except Exception as e:
            print(e)
            print("It cannot be downloaded")
            BLACK_LIST.append(i)
            time.sleep(.5)
Example #36
    def __enter__(self):
        # Similar to assertContains(), we verify the status code
        self.test_case.assertEqual(self.response.status_code, self.status_code)

        # TODO consider validating self.response['Content-Type']

        # Parse the response as HTML
        html = lxml.html.fromstring(self.response.content.decode('utf-8'))
        if self.selector is not None:
            # Use cssselect to filter the elements
            elements = html.cssselect(self.selector)

            # Ensure some data exists
            if len(elements) == 0:
                raise SelectorNotFound(
                    'No selector matches found for {0}'.format(self.selector)
                )

            return elements
        if self.element_id is not None:
            try:
                return html.get_element_by_id(self.element_id)
            except KeyError:
                raise ElementIDNotFound(
                    'Element with id, {0}, not present'.format(self.element_id)
                )

        # No filtering defined, return the entire parsed HTML document
        return html
Example #37
def spiderboy(url):

    page = requests.get(url,headers=headers)
    base_url ='/'.join(page.url.split('/')[:-4])

    html = lxml.html.fromstring(page.content.decode('gbk','ignore'))
    items = html.cssselect('.anewsnotitle')
    for item in items:
        car_link = base_url + item.cssselect('.newstext h3 a')[0].get('href')
        logger.info('link: '+car_link)
        try:
            Car.objects.get(car_link = car_link)
            pass
        except Exception,e:
            car_title = str(item.cssselect('.newstext h3 a')[0].text_content())
            logger.info('title: '+car_title)
            car_icon = base_url + item.cssselect('.newspic a img')[0].get('src')
            logger.info('icon_url: '+car_icon)
            car_des = str(item.cssselect('.newstext p')[0].text_content())
            logger.info('get des')

            innerpage = requests.get(car_link,headers=headers)
            innerhtml = lxml.html.fromstring(innerpage.content.decode('gbk','ignore'))

            try:
                next = innerhtml.cssselect('.cpagesizebottom a')[-1]
                if next.text_content() == u'下一页':
                    mid_body = nextPage(innerhtml,base_url)
                else:
                    mid_body = lxml.html.tostring(innerhtml.cssselect('.content')[0])
                    mid_body = cleaner.clean_html(mid_body)

            except:
                mid_body = lxml.html.tostring(innerhtml.cssselect('.content')[0])
                mid_body = cleaner.clean_html(mid_body)

            pattern = re.compile(r'(?:src|href)="([^http].*?[\.jpg])"', re.VERBOSE)

            test = pattern.findall(mid_body)
            test = list(set(test))

            for i in test:
                mid_body = mid_body.replace(i,base_url+i)

            car_body = mid_body
            logger.info('body: catch')
            car_cate = category_select(url,catechoice)
            logger.info('category: '+car_cate)

            ca = Car(car_title=car_title,
                     car_des=car_des,
                     car_link=car_link,
                     car_body=car_body,
                     car_icon=car_icon,
                     car_source="neeu",
                     car_cate=car_cate)


            ca.save()
            logger.info('done one')
Example #38
 def selectMany(self, css):
     html = self.get_html()
     selection = html.cssselect(css)
     if not len(selection):
         self.fail('No elements matching: %r' % css)
     self.state['selection'] = selection
     return selection
Example #39
async def main():
    try:
        all_urls = ALL_URLS.read_text().split()
    except FileNotFoundError:
        all_urls = []

    async with aiohttp.ClientSession(raise_for_status=True) as session:
        async with session.get(
                "https://p.eagate.573.jp/game/sdvx/") as response:
            html = lxml.html.fromstring(await response.text())

        urls = [
            img.attrib["data-original"]
            for img in html.cssselect("div.news_box img")
        ]

        if urls:
            for url in urls:
                if WEBHOOK and url not in all_urls:
                    print(url)
                    all_urls.append(url)
                    async with session.get(url) as response:
                        image_data = await response.read()
                    data = aiohttp.FormData()
                    data.add_field("file", image_data, filename=URL(url).name)
                    await session.post(WEBHOOK, data=data)

            ALL_URLS.write_text("".join(f"{url}\n" for url in all_urls))
            CURRENT_URLS.write_text("".join(f"{url}\n" for url in urls))
Example #40
def parse_article_oneline_list(url, text=None):
    '''
    XPATH: //div[@class="memo_list"]/form[@id="listForm"]/ul/dl

    source:
        <dl>
            <dt class="profile_block reply_size" >
            <div id="pimgWrap_0_6475" class="fl">
                <img src="http://fimg.daum-img.net/tenth/img/y/t/i/u/ccJT/76/96c1c9-42087-d1.bmp" width="32" height="32" alt="" onmouseover="MemoFormController.showProfileLayer(this, 'pimg_0_6475');" onmouseout="MemoFormController.hideProfileLayer();">
                <img id="pimg_0_6475" src="http://fimg.daum-img.net/tenth/img/y/t/i/u/ccJT/76/96c1c9-42087-d3.bmp" width="150" height="150" style="display: none;" alt="프로필 이미지" />
            </div>
            </dt>			
            <dd class="content_block ">
            <div id="memoViewer_0_6475" class="content_viewer ">
                <p class="nickname">
                &nbsp; <a href="#" onclick="showSideView(this, 'Zo6UMXQoclc0', '', 'Ellen[\uC774\uACBD\uBBFC]'); return false;" class="b">Ellen[이경민]</a>
                &nbsp; <span class="txt_sub num">12.07.11. 09:45</span> &nbsp;
                </p>
                <div class="content_memo">
                    7/15(일) 오후 2시에 강서구 등촌동 저희집에서 집들이 할께요! 
                    <br />
                    참석 가능하시면 댓글 달아주세요~ ㅎㅎ좀 멀긴하지만 맛있는 음식과 술이 기다리고 있을거예요~ ^^ 
                    <img src="http://i1.daumcdn.net/cafeimg/cf_img2/img_blank2.gif" width="8" height="12" alt="새글" class="icon_new" />
                    <b>								
                        <a href="#" onclick="ReplyFormController.showReplyForm('0_6475'); return false;" class="txt_point" >
                            [<span id="commentReplyCount_0_6475" class="txt_point">8</span>]
                        </a>
                    </b>						
                </div>
            </div><!-- content_viewer -->
            <div id="memoModify_0_6475" class="content_modify"></div>
            <div id="memoBtns_0_6475" class="memo_btns p11">
                <a href="#" onclick="ReplyFormController.showReplyForm('0_6475'); return false;" class="p11">답글</a>																	</div>
            </dd><!-- end content_block -->
        </dl>
    '''
    _type = namedtuple('BriefArticleInfo', 
        'article_num title post_date author path url'.split())

    # fetch
    if text is None:
        text = urlread(url, timeouts=ARTICLE_TIMEOUTS)
    html = lxml.html.fromstring(text)

    results = []
    articles = html.cssselect('div.memo_list form#listForm ul dl')
    for dl in articles:
        content = dl.cssselect('div.content_viewer div.content_memo')[0].xpath('child::text()')
        nick = dl.cssselect('div.content_viewer p.nickname a')[0]
        date = dl.cssselect('div.content_viewer p.nickname span.txt_sub.num')[0]
        article_num = dl.cssselect('div.content_viewer')[0].attrib['id'].rsplit('_', 1)[-1]
        results.append(_type(
            int(article_num),
            "\n".join(content).strip(),
            date.text.strip(),
            nick.text.strip(),
            None,
            None,
        ))
    return results
Example #41
def fetch_elements(html,lo_name,locate_source):
    """.. :py:method::
    Fetch elements from the object repository (locator map).
    """
    path = locate_source[lo_name]
    if isinstance(path,tuple):
        if len(path) == 3:
            return html.cssselect(path[0])[path[1]:path[2]]
        elif len(path) == 2:
            return html.cssselect(path[0])[path[1]]
        else:
            raise Exception
    elif isinstance(path,basestring):
        return html.cssselect(path)
    else:
        raise Exception
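
For illustration, a hypothetical locate_source mapping showing the three value shapes fetch_elements() accepts; the keys and selectors below are made up, only the shapes (plain selector string, (selector, index) pair, (selector, start, stop) triple) come from the code above:

locate_source = {
    'nav_links': 'ul.nav > li > a',      # plain selector: return every match
    'first_row': ('table tr', 0),        # (selector, index): return one element
    'data_rows': ('table tr', 1, -1),    # (selector, start, stop): return a slice
}

# e.g. rows = fetch_elements(html, 'data_rows', locate_source)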
Example #42
def download():
    i = 0
    aux = 0
    for i in range(int(MAX_BOOKS)):
        try:
            if not os.path.isfile(get_url_from_pne(
                    img_temp, 'DBM_', i, 'jpg')) and not os.path.isfile(
                        get_url_from_pne(img_temp, 'DBM_', i, 'png')):
                link = 'http://www.dragonball-multiverse.com/es/page-' + str(
                    i) + '.html'
                source = requests.get(link).content
                html = lxml.html.fromstring(source)
                book_title = html.cssselect('div')[7].cssselect('img')[0].get(
                    'src')
                url = urljoin(parent_url, book_title)
                img_format = book_title[-3:]  # file ext catched (.jpg or .png)
                img_file = open(
                    get_url_from_pne(img_temp, 'DBM_', i, img_format), 'wb')
                img_file.write(requests.get(url).content)
                img_file.close()
                time.sleep(0.5)
                print("Downloaded in: " +
                      get_url_from_pne(img_temp, 'DBM_', i, img_format))
            else:
                print("Skipping book number %s\r" % i)

        except Exception as e:
            print(e)
            print("It cannot be downloaded")
            BLACK_LIST.append(i)
            time.sleep(.5)
Example #43
	def run(self):
		base_cookie = self.get_base_cookie()

		req = requests.get(self.ENDPOINT, headers=self.HEADERS, cookies=base_cookie, verify=False, allow_redirects=False)

		if req.status_code != 200:
			print req.status_code
			print req.content
			print 'Error'
			return

		html = lxml.html.fromstring(req.content)
		categs = html.cssselect('.tabel_categorie')

		for categ in categs:
			a = categ.cssselect('a')[0]
			name = a.text
			name = name.replace('Calorii ', '').lower()

			categories = name.split(" ")
			if len(categories) > 1:
				category_name = "{}-{}".format(categories[0], categories[1])
			else:
				category_name = categories[0]

			url = a.get('href')

			req = requests.get(url, headers=self.HEADERS, cookies=base_cookie, verify=False, allow_redirects=False)

			self._process_category_table(req.content, category_name)
Example #44
def scrape_list_page(response: requests.Response) -> Iterator[str]:
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)

    for a in html.cssselect('table.cassetteitem_other > tbody > tr > td:nth-child(9) > a'):
        url = a.get('href')
        yield url
Example #45
def getpoll():
    pageURL = "http://polldaddy.com/poll/7575405/"
    html = lxml.html.parse(pageURL).getroot()
    votebutton = html.cssselect('.vote-button')
    datavote = votebutton[0].get("data-vote")
    datadict = ast.literal_eval(datavote)
    return datadict
Example #46
def parse_cafe_inner_url_from_official(url):
    '''Parse cafe official url and return real url.
    
	<frame name="down" id="down" src="http://cafe986.daum.net/_c21_/home?grpid=ccJT" width="100%" height="100%" frameborder="0" marginwidth="0" marginheight="0" title="카페 메인 프레임">
    '''
    #CAFE_HOME_PATTERN = re.compile(u'''
    #    # get src of frame#down
    #    <frame [^>]*
    #        (
    #            (id="down" [^>]*src="([^"]*)")
    #            |
    #            (src="([^"]*)" [^>]*id="down")
    #        )
    #    [^>]*>
    #''', re.S | re.X)

    site1 = urlread(url, timeouts=ARTICLE_TIMEOUTS)

    #match = CAFE_HOME_PATTERN.search(site1)
    #if not match:
    #    raise Exception("parse error")
    #url = match.group(3) or match.group(5)
    html = lxml.html.fromstring(site1)
    frame = html.cssselect('frame#down')[0]
    url = frame.get('src')

    return url
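
This and the neighbouring cafe examples call a urlread() helper that is not included in the listing. A minimal stand-in using requests, assuming ARTICLE_TIMEOUTS is a value accepted by the timeout parameter (a float or a (connect, read) tuple):

import requests

def urlread(url, timeouts=None):
    # Fetch the URL and return the body as text, raising on HTTP errors.
    response = requests.get(url, timeout=timeouts)
    response.raise_for_status()
    return response.text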
Example #47
def parse_article_album_list(url, text=None):
    ''' parse article phone list and result list of article information as a tuple:
        (article_num, title, post_date, author, path, url)
    '''
    _type = namedtuple('BriefArticleInfo', 
        'article_num title post_date author path url fldid grpid'.split())

    # fetch
    if text is None:
        text = urlread(url, timeouts=ARTICLE_TIMEOUTS)


    html = lxml.html.fromstring(text)
    articles = html.cssselect('div.albumListBox li')

    def _parse(li):
        subject = li.cssselect('dd.subject a')[0]
        author  = li.cssselect('dd.nick a')[0]
        article_num, post_date = li.cssselect('dd.txt_sub.p11 span.num')
        href = subject.get('href')
        path = unescape(href)
        query_dict = urlparse.parse_qs(urllib.splitquery(path)[-1])
        return _type(
            int(article_num.text.strip()), 
            subject.text.strip(),
            post_date.text.strip(),
            author.text.strip(),
            href,
            get_domain(url, href),
            query_dict.get('fldid', [None])[0],
            query_dict.get('grpid', [None])[0],
        )

    return [_parse(li) for li in articles if not li.cssselect('div.blank_thumb')]
Example #48
def parsing_list_product(request):
    html = get_html(request)
    a_list = html.cssselect('ul.sitemap>li>a')
    for a in a_list:
        href = a.get('href')
        url_product.append(href)
        log.info('Product link: %s', href)
Example #49
def parsing_category(request):
    global categories_data, filename_category

    parent_category_one = 0
    parent_category_two = 0

    html = get_html(request)

    ul = html.cssselect('ul.sitemap')[0]
    li_list = ul.cssselect('li')
    for li in li_list:
        li_class = li.get('class')
        if (li_class == 'level-0'):
            parent_category = 0
            a = li.cssselect('a')[0]
            category_name = a.text
            category_url = a.get('href')

            temp = [index_add(), parent_category, category_name, category_url]

            categories_data.append(temp)
            parent_category_one = categories_data[-1][0]

            log.info(
                'PARENT_CATEGORY_ID: %s, CATEGORY_NAME: %s, CATEGORY_URL: %s',
                parent_category, category_name, category_url)
        else:
            if (li_class == "level-1"):
                a = li.cssselect('a')[0]
                parent_category = parent_category_one

                category_name = a.text
                category_url = a.get('href')

                temp = [
                    index_add(), parent_category, category_name, category_url
                ]

                categories_data.append(temp)
                parent_category_two = categories_data[-1][0]
            else:
                if (li_class == "level-2"):

                    parent_category = parent_category_two

                    a = li.cssselect('a')[0]

                    category_name = a.text
                    category_url = a.get('href')

                    temp = [
                        index_add(), parent_category, category_name,
                        category_url
                    ]

                    categories_data.append(temp)

    with open(filename_category, "w", newline="", encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(categories_data)
Example #50
    def test_testing_topic_announce(self):
        """Controls topics that are of type announcement don't have sorted options"""
        # Creates posts for announcement topics
        forum = ForumFactory()
        PostFactory(topic=TopicFactory(forum=forum, type=Topic.TOPIC_ANNOUNCE))
        PostFactory(topic=TopicFactory(forum=forum, type=Topic.TOPIC_ANNOUNCE))

        user = UserFactory()
        assign_perm("can_read_forum", user, forum)
        self.client.force_login(user)

        response = self.client.get(f"/forum/forum/{forum.slug}-{forum.pk}/")

        html = lxml.html.fromstring(response.content)
        # Select the header block of the announcement block, the first block
        announce_block = str(
            etree.tostring(html.cssselect(".topiclist .card-header")[0]))

        # Controls that announce_block is about announcements and not topics
        self.assertIn("Announcements", announce_block)
        self.assertNotIn("Topics", announce_block)
        self.assertIn("Replies", announce_block)
        self.assertIn("Views", announce_block)
        self.assertIn("Last post", announce_block)

        # There's no sortable informations
        self.assertNotIn("sortable sorted", announce_block)
        # There's no column that has a sorting link on
        self.assertNotIn("<a href=", announce_block)
        # There's no toggle sorting
        self.assertNotIn("Toggle sorting", announce_block)
Example #51
def get_calendar(data: dict, limit: int = 31) -> str:
    result = []
    now = datetime.datetime.now(tz=TZ)
    for days in range(limit):
        date = now + datetime.timedelta(days=days)
        temp = data.get(date.year, {})
        temp = temp.get(date.month, {})
        temp = temp.get(date.day, {})
        if not temp: break
        info = []
        for key, value in temp.items():
            # "qdhd": 庆典活动
            # "tdz":  团队战
            # "tbhd": 特别活动
            # "jqhd": 剧情活动
            # "jssr": 角色生日
            if value and key in ['qdhd', 'tdz', 'tbhd', 'jqhd', 'jssr']:
                html = lxml.html.fromstring(value)
                nodes = html.cssselect('.cl-t')
                for node in nodes:
                    info.append(node.text)
        msg = '\n'.join(info)
        if not msg: continue
        result.append('\n'.join(['==========',
                                 date.strftime('%Y-%m-%d'), msg]))
    return '\n'.join(result)
Example #52
def get_hrefs(data,css_select):
    # soup = BeautifulSoup(data,'lxml')
    # ele = soup.select(css_select)
    html = lxml.html.fromstring(data)
    ele = html.cssselect(css_select)
    hrefs = [e for e in ele]
    cell = (hrefs[0].text,hrefs[1].get('href'))
    return cell
Example #53
def _get_metadata_citation(html, format=None):

    if format is None:
        format = 'chicago'

    citation = html.cssselect("div.citation#cite_%s" % format)
    citation = citation[0].text_content()
    return citation.replace('<', '&lt;').replace('>', '&gt;')
Example #54
def spiderboy(url):
    page = requests.get(url)
    html = lxml.html.fromstring(page.content.decode('gbk'))
    items = html.cssselect('#ATitle')

    for item in items:
        car_link = item.get('href')
        # logger.info('link: '+car_link)
        try:
            Car.objects.get(car_link=car_link)
            # logger.info('already have ' + car_link)
            pass
        except:
            car_title = str(item.text_content())
            # logger.info('title: '+car_title)
            car_icon = 'http://x.autoimg.cn/news/index/img/20110801/logo_new.png'
            car_des = ''

            innerpage = requests.get(car_link)
            innerhtml = lxml.html.fromstring(innerpage.content.decode('gbk'))

            try:
                next = base_url + innerhtml.cssselect('.page-item-readall')[0].get('href')
            except:
                next = None

            if next:
                innerpage = requests.get(next)
                innerhtml = lxml.html.fromstring(innerpage.content.decode('gbk'))

            try:
                innerhtml.cssselect('.diversion-box')[0].drop_tree()
            except:
                pass

            try:
                innerhtml.cssselect('.btn.font-normal')[0].drop_tree()
            except:
                pass


            article =innerhtml.cssselect('#articleContent')[0]
            mid_body = lxml.html.tostring(article,encoding=unicode)
            mid_body2 = cut(mid_body)
            r = re.compile(r'<a>|</a>')
            mid_body3 = cleaner.clean_html(mid_body2)
            car_body = mid_body3
            car_body = r.sub('',car_body)

            ca = Car(car_title=car_title,
                     car_des=car_des,
                     car_link=car_link,
                     car_body=car_body,
                     car_icon=car_icon,
                     car_source="autohome",
                     car_cate='car')

            ca.save()
Example #55
    def _parseResponse(self, queryText, url, html):
        for userEntry in html.cssselect('table.NoteDivWidth'):
            userInfo = userEntry.cssselect('tr table')[0]
            nickname = userInfo.cssselect('tr th')[0].text
            exchangeUrl = userInfo.cssselect('tr td')[-1].cssselect('a')[0].attrib['href']
            if any(source in exchangeUrl.lower() for source in self.sourceSubstringsToExclude):
                yield None
            else:
                shopFound = not exchangeUrl.endswith('.html')
                if shopFound:
                    cardSource = exchangeUrl
                    self.logger.warning('Found new shop: {}', exchangeUrl)
                else:
                    cardSource = self.getTitle() + '/' + nickname

                userCards = userEntry.cssselect('table.CardInfo')
                if len(userCards) > 0:
                    self.estimatedCardsCount += len(userCards) - 1

                for cardInfo in userCards:
                    cardName = cardInfo.cssselect('th.txt0')[0].text
                    cardUrl = exchangeUrl
                    if not shopFound:
                        cardUrl += '?Title={}'.format(cardName)

                    idSource = cardInfo.cssselect('nobr.txt0')[0].text
                    cardId = int(re.match(r'[^\d]*(\d+)[^\d]*', idSource).group(1)) if idSource else None

                    price = None
                    priceSource = cardInfo.cssselect('td.txt15')[-1].cssselect('b')
                    if len(priceSource) > 0:
                        possiblePrice = priceSource[-1].text
                        if possiblePrice is not None:
                            possiblePrice = possiblePrice.split()[0]
                            if possiblePrice.isdigit():
                                price = decimal.Decimal(possiblePrice)

                    foilness = len(cardInfo.cssselect('#FoilCard')) > 0

                    language = None
                    languageSource = cardInfo.cssselect('td.txt15')[0].cssselect('font')
                    if len(languageSource) > 0:
                        language = languageSource[0].text

                    setSource = cardInfo.cssselect('#table0 td img')[0].attrib['alt']

                    yield {
                        'id': cardId,
                        'name': cardName,
                        'foilness': foilness,
                        'set': setSource,
                        'language': language,
                        'price': price,
                        'currency': core.utils.Currency.RUR,
                        'count': int(cardInfo.cssselect('td.txt15 b')[0].text.split()[0]),
                        'source': self.packSource(cardSource, cardUrl),
                    }
Example #56
def _download_from_list_page(html, term):
    for title_node in html.cssselect(".tlistname a"):
        if term in title_node.text_content():
            download_node = title_node.xpath("../..")[0].cssselect(".tlistdownload a")[0]
            torrent_url = download_node.get("href")
            torrent = urlopen(torrent_url).read()
            return torrent
    logger.error("Search results from %s does not contain that term -- there probably is no torrent by that name.", term)
    return False
Example #57
def getBookInfo(str):
  html = lxml.html.fromstring(str)
  book = {}
  title = html.cssselect('title')[0].text
  book['title'] = title[0:title.find(',')]
  book['author'] = html.cssselect('a[href^="/book/author/"]')[0].get('alt')

  print("书名:", book['title'])
  print("作者:", book['author'])
  def getlink(a):
    link = a.get('href')
    return ("http://www.tadu.com"+link)
  def geta(div):
    return div.cssselect('a')[0]
  chapter = list(map(geta, html.cssselect('div.chapter_t')))
  book['links'] = list(map(getlink, chapter))
  print("共 %d 页" % len(book['links']))
  return book