def parsePage(html):
    
    # Dictionary to store info
    athInfo = {}
    
    #Now start populating our data object
    athInfo['AthleteName'] = html.cssselect("h2")[0].text
    athInfo['DivisionRank'] = html.cssselect("#rank *")[0].tail.strip()
    athInfo['OverallRank'] = html.cssselect("#div-rank *")[0].tail.strip()    

    infoFields = ['Bib', 'Division', 'Age', 'State', 'Country', 'Profession']
    detailsFields = ['TotalSwim', 'TotalBike', 'TotalRun', 'TotalTime']
    
    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text
    
    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    #have to use xpath to get T1 and T2 data
    athInfo['T1'] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo['T2'] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo['HasResults'] = 1
    athInfo['Scraped'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraperwiki.sqlite.save(unique_keys=['Bib'], data=athInfo, table_name="RESULTS", verbose=0)
Example 2
def signIn(username, password):
    raw = requests.get(PATHS['login'])
    session_left_slice = raw.headers['set-cookie'].find('=') + 1
    session_right_slice = raw.headers['set-cookie'].find(';')
    session_id = raw.headers['set-cookie'][
        session_left_slice:session_right_slice]
    html = lxml.html.fromstring(raw.text)
    db_viewstate = html.cssselect("input#__DATABASE_VIEWSTATE")[0].value
    print(db_viewstate)
    ev_validation = html.cssselect("input#__EVENTVALIDATION")[0].value
    # Create the form payload
    username_key = 'txtUserID'
    password_key = 'txtPassword'
    login_button = 'btnSubmit'
    form_payload = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': db_viewstate,
        '__EVENTVALIDATION': ev_validation,
        username_key: username,
        password_key: password,
    }
    session = requests.session()
    session.post(PATHS['login'], data=form_payload)
    return session
Example 3
def get_metadata(url):
    resp = requests.head(url, headers=headers, timeout=5)
    resp.raise_for_status()

    if 'text/html' not in resp.headers.get('Content-Type', ''):
        return {'url': url}

    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()

    html = lxml.html.fromstring(resp.content.decode('utf8'))
    tags = html.cssselect('meta[property], meta[name]')

    meta = {}
    for tag in tags:
        prop = tag.attrib.get('property', tag.attrib.get('name'))
        data = tag.attrib.get('content')
        if data is not None:
            meta[prop] = data

    can = html.cssselect('link[rel="canonical"]')
    if can:
        meta['canonical'] = can[0].attrib['href']

    # Canonical data
    meta['url'] = _get(meta, 'canonical', 'og:url', default=url)
    meta['description'] = _get(meta, 'description', 'og:description',
                               'twitter:description')
    meta['title'] = _get(meta, 'og:title', 'twitter:title', url)

    return meta
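
The _get helper used above for the canonical url, description and title is not part of this snippet. A minimal sketch consistent with its call sites (return the value of the first key that is present and non-empty, otherwise the default) might look like the following; this is an assumption, not the original implementation:

def _get(meta, *keys, default=None):
    # Hypothetical fallback helper: return the first non-empty value among
    # the given keys, or the default if none of them is set.
    for key in keys:
        value = meta.get(key)
        if value:
            return value
    return default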
    def scrape_restaurant_data(self, example):
        # get this from yelp

        html = obtain_html(example["url"])

        html.make_links_absolute(example["url"])

        title = html.cssselect("h1.biz-page-title")[0].text.strip()

        review_highlights = html.cssselect("ul.review-highlights-list")
        if len(review_highlights) > 0:
            description = tree_to_str(clean_up_highlights(review_highlights[0]))
        else:
            description = create_description_highlights(html)

        images = html.cssselect("img.photo-box-img")
        image_url = None
        if len(images) > 0:
            image_url = images[0].attrib["src"]

        return {
            "title": title,
            "description": description,
            "categories": example["categories"],
            "image_url": image_url,
            "rating": rating_to_string(example["rating"]),
            "price": example["price"],
        }
Example 5
def parsePage(html):

    # Dictionary to store info
    athInfo = {}

    #Now start populating our data object
    athInfo['AthleteName'] = html.cssselect("h2")[0].text
    athInfo['DivisionRank'] = html.cssselect("#rank *")[0].tail.strip()
    athInfo['OverallRank'] = html.cssselect("#div-rank *")[0].tail.strip()

    infoFields = ['Bib', 'Division', 'Age', 'State', 'Country', 'Profession']
    detailsFields = ['TotalSwim', 'TotalBike', 'TotalRun', 'TotalTime']

    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text

    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    #have to use xpath to get T1 and T2 data
    athInfo['T1'] = html.xpath(
        "//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo['T2'] = html.xpath(
        "//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo['HasResults'] = 1
    athInfo['Scraped'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraperwiki.sqlite.save(unique_keys=['Bib'],
                            data=athInfo,
                            table_name="RESULTS",
                            verbose=0)
Example 6
def scrape_daft_page(url):
    info = {'url': url}

    http_client = CachedHttpClient()
    data = http_client.get(url)
    html = lxml.html.fromstring(data)

    content_tag = html.xpath('//div[@id="content"]')[0]

    title_tag = content_tag.cssselect('.smi-info h1')[0]
    info['title'] = title_tag.text

    image_tag = html.cssselect('#smi-gallery-img-main img')[0]
    image_src = image_tag.attrib['src']
    if image_src.startswith('//'):
        image_src = 'https:' + image_src
    info['image'] = image_src

    price_tag = html.cssselect('#smi-price-string')[0]
    info['price'] = price_tag.text

    header_text = html.cssselect('#smi-summary-items .header_text')
    hdrtext = [t.text for t in header_text]

    info['beds'] = hdrtext[1]
    info['baths'] = hdrtext[2]

    info['description'] = '\n\n'.join(
        elem.text_content()
        for elem in html.cssselect('#smi-tab-overview .description_block'))
    # info['description'] = content_tag.cssselect('.overview')[0].text

    return info
Example 7
    def test_render(self):
        # Quick-and-dirty button that displays "Accueil -> [ Jardins, Variétés ]"
        column = DropDownLinkColumn(links=[
            Link(text=u'Main button',
                 viewname='s5appadherant:accueil'),
            Link(text=u'Jardins',
                 viewname='s5appadherant:jardin_all',
                 args=()),
            Link(text=u'Variété',
                 viewname='s5appadherant:variete_list')
        ])

        output = column.render(G(Adherant))
        html = lxml.html.fromstring(output)

        elements = html.cssselect(".btn-group > a")
        self.assertEqual(1, len(elements))
        self.assertEqual(u'Main button', elements[0].text)
        self.assertEqual(reverse('s5appadherant:accueil'), elements[0].attrib['href'])

        elements = html.cssselect("button.dropdown-toggle")
        self.assertEqual(1, len(elements))
        self.assertEqual('dropdown', elements[0].attrib['data-toggle'])

        elements = html.cssselect("ul.dropdown-menu li a")
        self.assertEqual(2, len(elements))
        self.assertEqual(u'Jardins', elements[0].text)
        self.assertEqual(reverse('s5appadherant:jardin_all'), elements[0].attrib['href'])
        self.assertEqual(u'Variété', elements[1].text)
        self.assertEqual(reverse('s5appadherant:variete_list'), elements[1].attrib['href'])
Example 8
def get_citation(title):
    """Given a paper title, attempts to get citation
    strings for that paper from Google Scholar."""
    # Search for the paper by title
    resp = requests.get(BASE_URL, params={'q': title})
    html = lxml.html.fromstring(resp.content)
    result_els = html.cssselect('.gs_r')
    if not result_els:
        return None

    # Only consider the first match
    result_el = result_els[0]

    # result_title = result_el.cssselect('.gs_rt a')[0].text

    # Request the citations
    result_id = result_el.attrib['data-cid']
    resp = requests.get(BASE_URL,
                        params={
                            'q':
                            'info:{}:scholar.google.com/'.format(result_id),
                            'output': 'cite'
                        })
    html = lxml.html.fromstring(resp.content)
    citations = {}
    for format_el, citation_el in zip(html.cssselect('th'),
                                      html.cssselect('td .gs_citr')):
        format = format_el.text
        citation = citation_el.text_content()
        citations[format] = citation
    return citations
Example 9
def sign_in(u, p):

    session = requests.session()
    raw = session.get(LOGIN_URL)

    html = lxml.html.fromstring(raw.text)

    viewstate = html.cssselect("input#__VIEWSTATE")[0].value
    viewstate_generator = html.cssselect("input#__VIEWSTATEGENERATOR")[0].value

    payload = {
        "ctl00$cphMain$Logon1$_resolution" : "1440x900",
        "ctl00$cphMain$Logon1$_email" : u,
        "ctl00$cphMain$Logon1$_password": p,
        "ctl00$cphMain$Logon1$_login": "******",
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstate_generator,
    }

    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
    }

    r = session.post(LOGIN_URL, data=payload, headers=headers)

    assert "unread messages" in r.content

    return session
def parsePage(html):
    
    # Dictionary to store info
    athInfo = {}
    
    #Now start populating our data object
    athInfo['ATHLETE_NAME'] = html.cssselect("h2")[0].text
    athInfo['DIVISION_RANK'] = html.cssselect("#rank *")[0].tail.strip()
    athInfo['OVERALL_RANK'] = html.cssselect("#div-rank *")[0].tail.strip()    

    #infoFields = ['BIB', 'DIVISION', 'AGE', 'STATE', 'COUNTRY', 'PROFESSION']
    infoFields = ['BIB', 'DIVISION', 'STATE', 'COUNTRY', 'PROFESSION']
    detailsFields = ['TOTAL_SWIM', 'TOTAL_BIKE', 'TOTAL_RUN', 'TOTAL_TIME']
    
    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text
    
    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    #have to use xpath to get T1 and T2 data
    athInfo['T1'] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo['T2'] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo['HAS_RESULTS'] = 1
    athInfo['SCRAPED'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraperwiki.sqlite.save(unique_keys=['BIB'], data=athInfo, table_name="RESULTS", verbose=0)
Example 11
    def test_templates_course_detail_one_open_course_run(self):
        """
        For a course with one open course run, the course run should be in the header
        and the side column should display an indication that there is no other course run.
        """
        course = CourseFactory()
        page = course.extended_object

        # Create an open course run
        now = timezone.now()
        CourseRunFactory(
            direct_course=course,
            start=now + timedelta(hours=1),
            enrollment_start=now - timedelta(hours=1),
            enrollment_end=now + timedelta(hours=1),
        )

        self.assertTrue(page.publish("fr"))

        url = page.get_absolute_url()
        response = self.client.get(url)
        self.assertEqual(response.status_code, 200)

        html = lxml.html.fromstring(response.content)

        # Check syllabus intro
        header = str(etree.tostring(html.cssselect(".subheader__intro")[0]))
        self.assertEqual(header.count("course-detail__run-descriptions"), 1)
        self.assertIn("S’inscrire maintenant", header)

        # Check syllabus aside column
        aside = str(etree.tostring(html.cssselect(".course-detail__aside")[0]))
        self.assertNotIn("course-detail__run-descriptions", aside)
        self.assertNotIn("S’inscrire maintenant", aside)
        self.assertIn("Aucune autre session ouverte", aside)
def get_agent_by_html(html):
    # print tostring(html)
    try:
        name = html.cssselect('a')[0].get('title').encode(
            'ascii', 'ignore').decode('ascii')
        agent_url = html.cssselect('a')[0].get('href')
        estate_name = html.cssselect('a')
        estate_name = estate_name[len(estate_name) - 1].get('title')
        try:
            reg_number = re.search(
                re.escape(r'CEA Registration Number :') + '\s(.{8})',
                tostring(html), re.I).group(1)
        except AttributeError:
            reg_number = None
        try:
            lic_number = re.search(
                re.escape(r'Agency Licence Number :') + '\s(.{9})',
                tostring(html), re.I).group(1)
        except AttributeError:
            lic_number = None
        phone_number = html.cssselect('span a')
        if phone_number:
            phone_number = get_phone_number(phone_number[0])
        agent = AgentIProperty(name=name,
                               phone_number=phone_number,
                               estate_name=estate_name,
                               reg_number=reg_number,
                               lic_number=lic_number,
                               url=agent_url)
        # print agent
        return agent
    except IndexError:
        return None
Example 13
    def test_templates_course_detail_two_open_course_runs(self):
        """
        For a course with two open course runs, the course run starting next should be in the
        header and the other course run should be in the side column.
        """
        course = CourseFactory()
        page = course.extended_object
        url = page.get_absolute_url()

        # Create 2 open course runs
        now = timezone.now()
        start1, start2 = random.sample(
            [now + timedelta(days=1), now + timedelta(days=2)], 2
        )
        CourseRunFactory(
            direct_course=course,
            start=start1,
            enrollment_start=now - timedelta(hours=1),
            enrollment_end=now + timedelta(hours=1),
        )
        CourseRunFactory(
            direct_course=course,
            start=start2,
            enrollment_start=now - timedelta(hours=1),
            enrollment_end=now + timedelta(hours=1),
        )

        self.assertTrue(page.publish("fr"))
        response = self.client.get(url)
        self.assertEqual(response.status_code, 200)

        html = lxml.html.fromstring(response.content)

        # Check syllabus intro
        header = str(
            etree.tostring(
                html.cssselect(".subheader__intro")[0],
                encoding="iso8859-1",
                method="html",
            ).decode("utf-8")
        )
        self.assertEqual(header.count("course-detail__runs--open"), 1)
        self.assertIn("S’inscrire maintenant", header)
        date_string = formats.date_format(min(start1, start2))
        with translation.override("fr"):
            self.assertIn(f"Du {date_string}", header)

        # Check syllabus aside column
        aside = str(
            etree.tostring(
                html.cssselect(".course-detail__aside")[0],
                encoding="iso8859-1",
                method="html",
            ).decode("utf-8")
        )
        self.assertEqual(aside.count("course-detail__runs--open"), 1)
        self.assertIn("S’inscrire maintenant", aside)
        date_string = formats.date_format(max(start1, start2))
        with translation.override("fr"):
            self.assertIn(f"Du {date_string}", aside)
Example 14
def current_usage_info(session):
    response = session.get(current_usage_url)
    html = lxml.html.fromstring(response.text)

    def convert(text):
        m = re.search(r'(\d+(?:\.\d+)?) GB', text)
        if m:
            return float(m.group(1))

        m = re.search(r'(\d+(?:\.\d+)?) MB', text)
        if m:
            return float(m.group(1)) / 1024.0

    tds = html.cssselect('#usageInformation')[0].xpath('.//td')
    info = {
        'download_usage':
        convert(_condense_whitespace(tds[2].text_content())),
        'upload_usage':
        convert(_condense_whitespace(tds[4].text_content())),
        'total_usage':
        convert(_condense_whitespace(tds[6].text_content())),
        'allowance':
        convert(_condense_whitespace(tds[8].text_content())),
        'billing_period':
        re.sub(
            r'Details for ?', '',
            _condense_whitespace(
                html.cssselect('#currentBillingPeriod')[0].text_content())),
    }
    info['left'] = info['allowance'] - info['total_usage']

    return info
Example 15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--username', help='Username')
    parser.add_argument('-p', '--password', help='Password')
    args = parser.parse_args()



    with requests.Session() as s:
        homepage = s.get('http://trithuc.vinacontrol.com.vn/')
        html = lxml.html.fromstring(homepage.text)

        payloads = {'name': args.username,
                    'pass': args.password,
                    'form_build_id': html.cssselect('input[name=form_build_id]')[0].attrib['value'],
                    'form_id': html.cssselect('input[name=form_id]')[0].attrib['value'],
                    'op': 'Đăng nhập'
                   }
        s.post('http://trithuc.vinacontrol.com.vn/node', data=payloads)
        res = s.get('http://trithuc.vinacontrol.com.vn/ds-cauhoi?field_quiz_phanloai_tid[0]=438&items_per_page=All')
        html = lxml.html.fromstring(res.text)
    table = html.cssselect('table[data-view-name=ds_cauhoi]')[0]
    columns = ['STT', 'Phân loại', 'Câu hỏi', 'Trả lời']
    df = pd.DataFrame(columns=columns)

    for row in table.cssselect('tbody>tr'):
        df = df.append(pd.DataFrame([parse_row(row)], columns=columns))

    df.to_excel('test.xls')
def parsePage(html):

    # Dictionary to store info
    athInfo = {}

    # Now start populating our data object
    athInfo["ATHLETE_NAME"] = html.cssselect("h2")[0].text
    athInfo["DIVISION_RANK"] = html.cssselect("#rank *")[0].tail.strip()
    athInfo["OVERALL_RANK"] = html.cssselect("#div-rank *")[0].tail.strip()

    # infoFields = ['BIB', 'DIVISION', 'AGE', 'STATE', 'COUNTRY', 'PROFESSION']
    infoFields = ["BIB", "DIVISION", "STATE", "COUNTRY", "PROFESSION"]
    detailsFields = ["TOTAL_SWIM", "TOTAL_BIKE", "TOTAL_RUN", "TOTAL_TIME"]

    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text

    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    # have to use xpath to get T1 and T2 data
    athInfo["T1"] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo["T2"] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo["HAS_RESULTS"] = 1
    athInfo["SCRAPED"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraperwiki.sqlite.save(unique_keys=["BIB"], data=athInfo, table_name="RESULTS", verbose=0)
Example 17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--username', help='Username')
    parser.add_argument('-p', '--password', help='Password')
    args = parser.parse_args()

    with requests.Session() as s:
        homepage = s.get('http://trithuc.vinacontrol.com.vn/')
        html = lxml.html.fromstring(homepage.text)

        payloads = {
            'name':
            args.username,
            'pass':
            args.password,
            'form_build_id':
            html.cssselect('input[name=form_build_id]')[0].attrib['value'],
            'form_id':
            html.cssselect('input[name=form_id]')[0].attrib['value'],
            'op':
            'Đăng nhập'
        }
        s.post('http://trithuc.vinacontrol.com.vn/node', data=payloads)
        res = s.get(
            'http://trithuc.vinacontrol.com.vn/ds-cauhoi?field_quiz_phanloai_tid[0]=438&items_per_page=All'
        )
        html = lxml.html.fromstring(res.text)
    table = html.cssselect('table[data-view-name=ds_cauhoi]')[0]
    columns = ['STT', 'Phân loại', 'Câu hỏi', 'Trả lời']
    df = pd.DataFrame(columns=columns)

    for row in table.cssselect('tbody>tr'):
        df = df.append(pd.DataFrame([parse_row(row)], columns=columns))

    df.to_excel('test.xls')
Example 18
def _result_type(html):

    if html.cssselect(".tinfodownloadbutton a"):
        return "info"
    elif html.cssselect(".tlistdownload a"):
        return "list"
    else:
        return "empty"
Example 19
    def scrape_comment(self, html, parent):
        c = HTMLDocument(
            text=html.cssselect("div.text-holder"),
            headline=html.cssselect("a.commentTitle")[0].text_content().strip(),
            section=parent.props.section,
            date=readDate(" ".join([t.text for t in html.cssselect("ul.meta li.createdate, li.createtime")])),
            author=html.cssselect("ul.meta li.by")[0].text.strip().lstrip("By").strip(),
            url=parent.props.url + "#{}".format(html.cssselect("a.commentTitle")[0].get('id')))
        c.props._parent = "{p.props.headline}, {p.props.date}".format(p=parent)
        return c
Example 20
def nextPage(html,base_url=''):
    # logger.info('have many page')
    car_body = lxml.html.tostring(html.cssselect('.text')[-1])
    while len(html.cssselect('.next')) > 0 and len(html.cssselect('.nextBtn')) == 0:
        nextpage = requests.get(base_url + html.cssselect('.next')[0].get('href'))
        nexthtml = lxml.html.fromstring(nextpage.content)
        body = lxml.html.tostring(nexthtml.cssselect('.text')[-1])
        car_body += body
        html = nexthtml
    return car_body
Example 21
def scrape_detail_page(response: requests.Response) -> dict:
    html = lxml.html.fromstring(response.text)
    ebook = {
        'url': response.url,
        'title': html.cssselect('#bookTitle')[0].text_content(),
        'price': html.cssselect('.buy')[0].text.strip(),
        'content':
        [h3.text_content() for h3 in html.cssselect('#content > h3')]
    }
    return ebook
Example 22
def scrape_detail_page(response):
    html = lxml.html.fromstring(response.text)
    ebook = {
        'url': response.url,
        'key': extract_key(response.url),
        'title': html.cssselect('#bookTitle')[0].text_content(),
        'price': html.cssselect('.buy')[0].text.strip(),
        'content': [normalize_spaces(h3.text_content()) for h3 in html.cssselect('#content>h3')],
    }
    return ebook
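
The extract_key and normalize_spaces helpers referenced above are not shown in the snippet. Plausible minimal versions, assuming the key is simply the last path segment of the detail-page URL and that whitespace runs should be collapsed (an illustration only, not the original code):

import re


def extract_key(url):
    # Hypothetical: take the final path segment of the URL as the book key.
    return url.rstrip('/').split('/')[-1]


def normalize_spaces(s):
    # Collapse runs of whitespace into single spaces and trim the ends.
    return re.sub(r'\s+', ' ', s).strip()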
Example 23
def download_councillors():
    with open(WEBPAGESTXT, 'r') as txtfile:
        urls = txtfile.readlines()
    urls = [url.strip() for url in urls]

    session = http.client.HTTPSConnection('www.berlin.de', timeout=10)
    councillors = {}
    for url in urls:
        if councillors:
            time.sleep(2)

        bezirk = bezirk_from_url(url)

        headers = {'Accept-Encoding': 'gzip', 'Connection': 'keep-alive'}
        session.request('GET', url, headers=headers)
        response = session.getresponse()

        response = response.read()
        response = zlib.decompress(response, 47)

        try:
            response = response.decode('latin-1', 'strict')
        except UnicodeDecodeError:
            response = response.decode('windows-1252', 'strict')

        html = lxml.html.fromstring(response)
        html.make_links_absolute(url)

        tablerows = html.cssselect('.zl12')
        tablerows += html.cssselect('.zl11')

        number = html.cssselect('table.tk1:nth-child(8)')[0]
        number = number.text_content()
        _, number = number.split(':')
        number = number.strip()
        if number.isdigit():
            number = int(number)
            if not number == len(tablerows):
                print('%s:' % bezirk,
                      '%s councillors were found.' % len(tablerows),
                      'Should be %s councillors.' % number)

        for row in tablerows:
            councillor = extract_councillor(row)
            councillor['BEZIRK'] = bezirk
            identifier = normalized_name(councillor['ANZEIGENAME'])
            try:
                councillors[bezirk][identifier] = councillor
            except KeyError:
                councillors[bezirk] = {identifier: councillor}
    session.close()
    return councillors
Example 24
def parse(page_text):
    urls = []
    html = get_html(page_text)
    url = html.cssselect('link[rel = "canonical"]')[0].get('href')
    print("URL:", url)
    breadcrumbs = html.cssselect('div.breadcrumbs>span.almost_bold')[0].text
    if (breadcrumbs == "History"):
        pagination = int(html.cssselect('div.rating_pagination.pagination>span')[0].text)
        if(pagination == 1):
            rating_pagination = html.cssselect("div.rating_pagination.pagination")[0]
            a_list = rating_pagination.cssselect('div.rating_pagination.pagination>a')
            for a in a_list:
                urls.append(main_type + a.get("href"))
        href_list = []
        a_list = html.cssselect('table.rating.responsive>tr:not([class])>td[style="text-align:left"]>a')
        for a in a_list:
            href_list.append(main_type + a.get("href"))

        href_list = list(set(href_list))
        for href in href_list:
            urls.append(href)
    else:
        type = html.cssselect('div.breadcrumbs>span[itemprop = "itemListElement"]')[1].cssselect('a')[0].get('title')
        group_list = html.cssselect('div.tbt2.row_heading>div>h2')[1:]
        title = html.cssselect('meta[property = "og:title"]')[0].get("content")
        # print("Название", title)
        id_device = add_device(title, type, url)
        # print("Идентификатор продукта", id_device)
        tables = html.cssselect('div.tbt1.single>div.table')
        # print(len(tables))
        for index, table in enumerate(tables):
            group = group_list[index].text
            tbts = table.cssselect('div.tbt5')
            for tbt in tbts:
                divs = tbt.cssselect('div')
                one_block = divs[1].text
                # Break out of malformed tables
                if(one_block == None):
                    # add = False
                    break
                two_block = divs[2].text
                if(two_block == None):
                    try:
                        two_block = divs[2].cssselect('span')[0].text
                    except:
                        two_block = divs[2].cssselect('a')[0].text
                if(two_block == "+"):
                    two_block = 1
                else:
                    if(two_block == "-"):
                        two_block = 0
                # print("Группа:", group)
                id_device_variables = add_device_variables(one_block, group)
                # print("Идентификатор VARIABLE:", id_device_variables)
                id_device_value = add_device_value(id_device, id_device_variables, two_block)

    return urls
Example 25
    def test_render(self):
        column = self.get_column()

        output = column.render(G(Adherant))
        html = lxml.html.fromstring(output)

        elements = html.cssselect('a[data-toggle="modal"]')
        self.assertEqual(1, len(elements))
        self.assertEqual(u"Accueil", elements[0].text)

        modal_id = elements[0].attrib['data-target']
        modal = html.cssselect(u'[id="%s"]' % modal_id[1:])
        self.assertEqual(1, len(modal))
Example 26
def upload_course(session, title="Lorem Ipsum", filename="test.jpg", filedata="", filetype="image/png"):
    response = session.get("https://www.iscp.ac.uk/evidence/course.aspx")

    html = lxml.html.fromstring(response.text)

    viewstate = html.cssselect("input#__VIEWSTATE")[0].value
    viewstate_generator = html.cssselect("input#__VIEWSTATEGENERATOR")[0].value
    viewstate_encrypted = html.cssselect("input#__VIEWSTATEENCRYPTED")[0].value
    event_validation = html.cssselect("input#__EVENTVALIDATION")[0].value

    payload = {
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__EVENTVALIDATION": event_validation,
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstate_generator,
        "__VIEWSTATEENCRYPTED": viewstate_encrypted,
        "ctl00$cphMain$txtDate": "01/01/1970",
        "ctl00$cphMain$txtEndDate": "",
        "ctl00$cphMain$drpTitles": 6,  # Other
        "ctl00$cphMain$txtOtherTitle": title,
        "ctl00$cphMain$drpTypes": 0,
        "ctl00$cphMain$txtOtherType": "",
        "ctl00$cphMain$txtAwardingBody": "",
        "ctl00$cphMain$txtFeedback": "",
        "ctl00$cphMain$txtLearn": "",
        "ctl00$cphMain$txtImprove": "",
        "ctl00$cphMain$txtActionPlan": "",
        "ctl00$cphMain$topicChooser1$hidScrollTop": "",
        "ctl00$cphMain$topicChooser1$hidTpcExpanded": "True",
        "ctl00$cphMain$topicChooser1$hidSelectedTopics": "",
        "ctl00$cphMain$topicChooser1$hdnPopUpShowing": "",
        "ctl00$cphMain$topicChooser1$hidTab": "",
        "ctl00$cphMain$btnInsert": "Save Course/seminar",
        "ctl00$TraineeReport1$download_token_value_id": "17/05/2015 12:20:46",
        "ctl00$TraineeReport1$txtStartDate": "16/05/2014",
        "ctl00$TraineeReport1$txtEndDate": "16/05/2015",
        "ctl00$txtFeedbackComments": "",
    }

    files = {
        "ctl00$cphMain$fupControl1": (
            filename,
            filedata,
            filetype,  # FIXME
        ),
    }

    r = session.post("https://www.iscp.ac.uk/evidence/course.aspx", data=payload, files=files)

    pprint.pprint(r.text)
Example 27
def theme(data):
    html = lxml.html.fromstring(data)  # equivalent to the etree.HTML function
    data_theme = ""
    a = html.cssselect('#tab_sdyj > tfoot > tr')
    for i in range(len(a)):
        if (len(
                html.cssselect('#tab_sdyj > tfoot > tr:nth-child(' +
                               str(i + 1) + ') > td:nth-child(2)')) !=
                0) and (len(
                    html.cssselect('#tab_sdyj > tfoot > tr:nth-child(' +
                                   str(i + 1) + ') > td:nth-child(6)')) != 0):
            data_theme += html.cssselect('#tab_sdyj > tfoot > tr:nth-child(' +
                                         str(i + 1) + ')')[0].text_content()
    return (data_theme)
	def download(self,link):
		parent_url='http://link.springer.com'
		source = requests.get(link).content
		html = lxml.html.fromstring(source)
		book_title= html.cssselect('h1#title')[0].text_content() 
		chapter = 1
		for i in html.cssselect('li.toc-item'):
			url=urljoin(parent_url,i.cssselect('div.actions')[0].cssselect('span.action')[0].cssselect('a')[0].get('href'))
			pdf=requests.get(url).content
			f = open(book_title+str(chapter)+'.pdf', 'wb+')
			f.write(pdf)
			chapter+=1
			print(url)
		Download_Book().concatenate_pdf(book_title)
Example 29
    def test_cms_plugins_program_fallback_when_never_published(self):
        """
        The program plugin should render in the fallback language when the program
        page has never been published in the current language.
        """
        # Create a program
        program = ProgramFactory(
            page_title={
                "en": "public program",
                "fr": "programme publique"
            },
            fill_cover={
                "original_filename": "cover.jpg",
                "default_alt_text": "my cover",
            },
        )
        program_page = program.extended_object

        # Create a page to add the plugin to
        page = create_i18n_page({"en": "A page", "fr": "Une page"})
        placeholder = page.placeholders.get(slot="maincontent")
        add_plugin(placeholder, ProgramPlugin, "en", **{"page": program_page})
        add_plugin(placeholder, ProgramPlugin, "fr", **{"page": program_page})

        # Publish only the French version of the program
        program_page.publish("fr")

        # Check the page content in English
        page.publish("en")
        url = page.get_absolute_url(language="en")
        response = self.client.get(url)

        html = lxml.html.fromstring(response.content)

        # The program's full name should be wrapped in a link within an h2
        title = html.cssselect(".program-glimpse__title")[0]
        link = title.cssselect(".program-glimpse__link")[0]
        self.assertEqual(link.text_content().strip(), "programme publique")
        self.assertNotContains(response, "public program")

        # Program's cover should be present
        cover = html.cssselect(".program-glimpse__media")[0]
        self.assertEqual(cover.get("aria-hidden"), "true")
        img = cover.cssselect("img")[0]
        self.assertIsNotNone(
            re.search(
                r"/media/filer_public_thumbnails/filer_public/.*cover\.jpg__300x170",
                img.get("src"),
            ))
Example 30
def getpage(url, f, count):
  html = geturl(url)
  html = lxml.html.fromstring(html)
  title = html.cssselect('h2')[0].text
  
  print('第 %d 页(%s)已下载' % (count, title))
  patt = '(%\w+)+'
  f.write(title + '\n\n')
  content = html.cssselect('script')[7].text
  content = content[content.find('unescape')+10 : content.find('\"))')]
  #print(content)
  content = content.replace('%3Cbr%2F%3E%3Cbr%2F%3E','\n')
  content = content.replace('%','\\').encode()
  content = content.decode('unicode_escape')
  f.write(content + '\n')
Example 31
def scrape_list_page(response: requests.Response) -> Iterator[str]:

    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        yield url
Example 32
def current_month_daily_breakdown(session):
    summary_markers = [
        '',
        'Total Usage (GB)',
        'Usage Allowance',
        'Additional Use',
        'Usage',
    ]

    response = session.get(day_to_day_url)
    html = lxml.html.fromstring(response.text)
    rows = html.cssselect(".internetUsageDataContainer .mainSection tr")
    data = []
    summary = []

    for row in rows:
        row = list(map(_format_cell_data, row.cssselect('td')))
        if row[0] in summary_markers:
            cleaned = [ td for td in row if td ]
            if len(cleaned):
                summary.append(cleaned)
        else:
            data.append(row)

    return DailyBreakdown(data, summary)
Example 33
    def get_submission_list(self, problem_name):
        self.check_problem_exist(problem_name)
        request = urllib2.Request(url=SITE_PREFIX+'judge/submission/recent/?problem='+problem_name)
        response = self.opener.open(request)

        try:
            import lxml.html
        except ImportError:
            print 'lxml library is needed for parsing HTML'
            return

        html = lxml.html.fromstring(unicode(response.read().decode('utf8')))
        context = {}
        fields = ('id', 'problem', 'user', 'language', 'length', 'state', 'stats', 'submitted_on')
        length = {'id': 9, 'problem': 15, 'user': 15, 'language': 5, 'length': 7, 'state': 15, 'stats': 7, 'submitted_on': 15}
        template = u'%(id)s %(problem)s %(user)s %(language)s %(length)s %(state)s %(stats)s %(submitted_on)s'

        def width(string):
            return sum(1+(unicodedata.east_asian_width(c) in 'WF') for c in string)

        for tr in html.cssselect('table.submission_list tr'):
            for field in fields:
                element = tr.find_class(field)
                if element:
                    context[field] = unicode(element[0].text_content().strip())
                else:
                    context[field] = u''
                context[field] = ' ' * (length[field] - width(context[field])) + context[field]
            print template % context
Example 34
def search(topic):
    url = 'https://lobste.rs/search?what=stories&order=newest&q=' + topic
    response = requests.get(url)
    html = lxml.html.fromstring(response.text)
    for item in html.cssselect('.link a'):
        text = item.text_content()
        print(text)
def download():
    i = 0
    aux = 0
    for i in range(int(MAX_BOOKS)):
        try:
            if not os.path.isfile(get_url_from_pne(img_temp, 'DBM_', i, 'jpg')) and not os.path.isfile(get_url_from_pne(img_temp, 'DBM_', i, 'png')):
                link = 'http://www.dragonball-multiverse.com/es/page-'+str(i)+'.html'
                source = requests.get(link).content
                html = lxml.html.fromstring(source)
                book_title = html.cssselect('div')[7].cssselect('img')[0].get('src')
                url=urljoin(parent_url,book_title)
                img_format = book_title[-3:] # file ext catched (.jpg or .png)
                img_file = open(get_url_from_pne(img_temp, 'DBM_', i, img_format),'wb') 
                img_file.write(requests.get(url).content)
                img_file.close()
                time.sleep(0.5)
                print("Downloaded in: "+get_url_from_pne(img_temp, 'DBM_', i, img_format))
            else:
                print("Skipping book number %s\r" %i)


        except Exception as e:
            print(e)
            print("It cannot be downloaded")
            BLACK_LIST.append(i)
            time.sleep(.5)
Example 36
    def __enter__(self):
        # Similar to assertContains(), we verify the status code
        self.test_case.assertEqual(self.response.status_code, self.status_code)

        # TODO consider validating self.response['Content-Type']

        # Parse the response as HTML
        html = lxml.html.fromstring(self.response.content.decode('utf-8'))
        if self.selector is not None:
            # Use cssselect to filter the elements
            elements = html.cssselect(self.selector)

            # Ensure some data exists
            if len(elements) == 0:
                raise SelectorNotFound(
                    'No selector matches found for {0}'.format(self.selector)
                )

            return elements
        if self.element_id is not None:
            try:
                return html.get_element_by_id(self.element_id)
            except KeyError:
                raise ElementIDNotFound(
                    'Element with id, {0}, not present'.format(self.element_id)
                )

        # No filtering defined, return the entire parsed HTML document
        return html
Example 37
def spiderboy(url):

    page = requests.get(url,headers=headers)
    base_url ='/'.join(page.url.split('/')[:-4])

    html = lxml.html.fromstring(page.content.decode('gbk','ignore'))
    items = html.cssselect('.anewsnotitle')
    for item in items:
        car_link = base_url + item.cssselect('.newstext h3 a')[0].get('href')
        logger.info('link: '+car_link)
        try:
            Car.objects.get(car_link = car_link)
            pass
        except Exception,e:
            car_title = str(item.cssselect('.newstext h3 a')[0].text_content())
            logger.info('title: '+car_title)
            car_icon = base_url + item.cssselect('.newspic a img')[0].get('src')
            logger.info('icon_url: '+car_icon)
            car_des = str(item.cssselect('.newstext p')[0].text_content())
            logger.info('get des')

            innerpage = requests.get(car_link,headers=headers)
            innerhtml = lxml.html.fromstring(innerpage.content.decode('gbk','ignore'))

            try:
                next = innerhtml.cssselect('.cpagesizebottom a')[-1]
                if next.text_content() == u'下一页':
                    mid_body = nextPage(innerhtml,base_url)
                else:
                    mid_body = lxml.html.tostring(innerhtml.cssselect('.content')[0])
                    mid_body = cleaner.clean_html(mid_body)

            except:
                mid_body = lxml.html.tostring(innerhtml.cssselect('.content')[0])
                mid_body = cleaner.clean_html(mid_body)

            pattern = re.compile(r'(?:src|href)="([^http].*?[\.jpg])"', re.VERBOSE)

            test = pattern.findall(mid_body)
            test = list(set(test))

            for i in test:
                mid_body = mid_body.replace(i,base_url+i)

            car_body = mid_body
            logger.info('body: catch')
            car_cate = category_select(url,catechoice)
            logger.info('category: '+car_cate)

            ca = Car(car_title=car_title,
                     car_des=car_des,
                     car_link=car_link,
                     car_body=car_body,
                     car_icon=car_icon,
                     car_source="neeu",
                     car_cate=car_cate)


            ca.save()
            logger.info('done one')
Example 38
    def selectMany(self, css):
        html = self.get_html()
        selection = html.cssselect(css)
        if not len(selection):
            self.fail('No elements matching: %r' % css)
        self.state['selection'] = selection
        return selection
Example 39
async def main():
    try:
        all_urls = ALL_URLS.read_text().split()
    except FileNotFoundError:
        all_urls = []

    async with aiohttp.ClientSession(raise_for_status=True) as session:
        async with session.get(
                "https://p.eagate.573.jp/game/sdvx/") as response:
            html = lxml.html.fromstring(await response.text())

        urls = [
            img.attrib["data-original"]
            for img in html.cssselect("div.news_box img")
        ]

        if urls:
            for url in urls:
                if WEBHOOK and url not in all_urls:
                    print(url)
                    all_urls.append(url)
                    async with session.get(url) as response:
                        image_data = await response.read()
                    data = aiohttp.FormData()
                    data.add_field("file", image_data, filename=URL(url).name)
                    await session.post(WEBHOOK, data=data)

            ALL_URLS.write_text("".join(f"{url}\n" for url in all_urls))
            CURRENT_URLS.write_text("".join(f"{url}\n" for url in urls))
Example 40
def parse_article_oneline_list(url, text=None):
    '''
    XPATH: //div[@class="memo_list"]/form[@id="listForm"]/ul/dl

    source:
        <dl>
            <dt class="profile_block reply_size" >
            <div id="pimgWrap_0_6475" class="fl">
                <img src="http://fimg.daum-img.net/tenth/img/y/t/i/u/ccJT/76/96c1c9-42087-d1.bmp" width="32" height="32" alt="" onmouseover="MemoFormController.showProfileLayer(this, 'pimg_0_6475');" onmouseout="MemoFormController.hideProfileLayer();">
                <img id="pimg_0_6475" src="http://fimg.daum-img.net/tenth/img/y/t/i/u/ccJT/76/96c1c9-42087-d3.bmp" width="150" height="150" style="display: none;" alt="프로필 이미지" />
            </div>
            </dt>			
            <dd class="content_block ">
            <div id="memoViewer_0_6475" class="content_viewer ">
                <p class="nickname">
                &nbsp; <a href="#" onclick="showSideView(this, 'Zo6UMXQoclc0', '', 'Ellen[\uC774\uACBD\uBBFC]'); return false;" class="b">Ellen[이경민]</a>
                &nbsp; <span class="txt_sub num">12.07.11. 09:45</span> &nbsp;
                </p>
                <div class="content_memo">
                    7/15(일) 오후 2시에 강서구 등촌동 저희집에서 집들이 할께요! 
                    <br />
                    참석 가능하시면 댓글 달아주세요~ ㅎㅎ좀 멀긴하지만 맛있는 음식과 술이 기다리고 있을거예요~ ^^ 
                    <img src="http://i1.daumcdn.net/cafeimg/cf_img2/img_blank2.gif" width="8" height="12" alt="새글" class="icon_new" />
                    <b>								
                        <a href="#" onclick="ReplyFormController.showReplyForm('0_6475'); return false;" class="txt_point" >
                            [<span id="commentReplyCount_0_6475" class="txt_point">8</span>]
                        </a>
                    </b>						
                </div>
            </div><!-- content_viewer -->
            <div id="memoModify_0_6475" class="content_modify"></div>
            <div id="memoBtns_0_6475" class="memo_btns p11">
                <a href="#" onclick="ReplyFormController.showReplyForm('0_6475'); return false;" class="p11">답글</a>																	</div>
            </dd><!-- end content_block -->
        </dl>
    '''
    _type = namedtuple('BriefArticleInfo', 
        'article_num title post_date author path url'.split())

    # fetch
    if text is None:
        text = urlread(url, timeouts=ARTICLE_TIMEOUTS)
    html = lxml.html.fromstring(text)

    results = []
    articles = html.cssselect('div.memo_list form#listForm ul dl')
    for dl in articles:
        content = dl.cssselect('div.content_viewer div.content_memo')[0].xpath('child::text()')
        nick = dl.cssselect('div.content_viewer p.nickname a')[0]
        date = dl.cssselect('div.content_viewer p.nickname span.txt_sub.num')[0]
        article_num = dl.cssselect('div.content_viewer')[0].attrib['id'].rsplit('_', 1)[-1]
        results.append(_type(
            int(article_num),
            "\n".join(content).strip(),
            date.text.strip(),
            nick.text.strip(),
            None,
            None,
        ))
    return results
Example 41
def fetch_elements(html,lo_name,locate_source):
    """.. :py:method::
    Fetch elements according to the locator repository.
    """
    path = locate_source[lo_name]
    if isinstance(path,tuple):
        if len(path) == 3:
            return html.cssselect(path[0])[path[1]:path[2]]
        elif len(path) == 2:
            return html.cssselect(path[0])[path[1]]
        else:
            raise Exception
    elif isinstance(path,basestring):
        return html.cssselect(path)
    else:
        raise Exception
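
The shape of locate_source above is only implied by the branches of fetch_elements (note the function targets Python 2, since it checks against basestring). A hypothetical illustration of how it might be driven, with the names, selectors and sample HTML invented for this sketch:

import lxml.html

sample = lxml.html.fromstring('<ul><li>a</li><li>b</li><li>c</li></ul>')

# Locator table: plain CSS selector strings, (selector, index) pairs,
# or (selector, start, stop) slices, matching the branches above.
locators = {
    'all_items': 'li',
    'first_item': ('li', 0),
    'middle_items': ('li', 1, 3),
}

print(fetch_elements(sample, 'first_item', locators).text)    # 'a'
print(len(fetch_elements(sample, 'middle_items', locators)))  # 2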
def download():
    i = 0
    aux = 0
    for i in range(int(MAX_BOOKS)):
        try:
            if not os.path.isfile(get_url_from_pne(
                    img_temp, 'DBM_', i, 'jpg')) and not os.path.isfile(
                        get_url_from_pne(img_temp, 'DBM_', i, 'png')):
                link = 'http://www.dragonball-multiverse.com/es/page-' + str(
                    i) + '.html'
                source = requests.get(link).content
                html = lxml.html.fromstring(source)
                book_title = html.cssselect('div')[7].cssselect('img')[0].get(
                    'src')
                url = urljoin(parent_url, book_title)
                img_format = book_title[-3:]  # file ext catched (.jpg or .png)
                img_file = open(
                    get_url_from_pne(img_temp, 'DBM_', i, img_format), 'wb')
                img_file.write(requests.get(url).content)
                img_file.close()
                time.sleep(0.5)
                print("Downloaded in: " +
                      get_url_from_pne(img_temp, 'DBM_', i, img_format))
            else:
                print("Skipping book number %s\r" % i)

        except Exception as e:
            print(e)
            print("It cannot be downloaded")
            BLACK_LIST.append(i)
            time.sleep(.5)
Example 43
	def run(self):
		base_cookie = self.get_base_cookie()

		req = requests.get(self.ENDPOINT, headers=self.HEADERS, cookies=base_cookie, verify=False, allow_redirects=False)

		if req.status_code != 200:
			print req.status_code
			print req.content
			print 'Error'
			return

		html = lxml.html.fromstring(req.content)
		categs = html.cssselect('.tabel_categorie')

		for categ in categs:
			a = categ.cssselect('a')[0]
			name = a.text
			name = name.replace('Calorii ', '').lower()

			categories = name.split(" ")
			if len(categories) > 1:
				category_name = "{}-{}".format(categories[0], categories[1])
			else:
				category_name = categories[0]

			url = a.get('href')

			req = requests.get(url, headers=self.HEADERS, cookies=base_cookie, verify=False, allow_redirects=False)

			self._process_category_table(req.content, category_name)
Example 44
def scrape_list_page(response: requests.Response) -> Iterator[str]:
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)

    for a in html.cssselect('table.cassetteitem_other > tbody > tr > td:nth-child(9) > a'):
        url = a.get('href')
        yield url
Example 45
def getpoll():
    pageURL = "http://polldaddy.com/poll/7575405/"
    html = lxml.html.parse(pageURL).getroot()
    votebutton = html.cssselect('.vote-button')
    datavote = votebutton[0].get("data-vote")
    datadict = ast.literal_eval(datavote)
    return datadict
Example 46
def parse_cafe_inner_url_from_official(url):
    '''Parse cafe official url and return real url.
    
	<frame name="down" id="down" src="http://cafe986.daum.net/_c21_/home?grpid=ccJT" width="100%" height="100%" frameborder="0" marginwidth="0" marginheight="0" title="카페 메인 프레임">
    '''
    #CAFE_HOME_PATTERN = re.compile(u'''
    #    # get src of frame#down
    #    <frame [^>]*
    #        (
    #            (id="down" [^>]*src="([^"]*)")
    #            |
    #            (src="([^"]*)" [^>]*id="down")
    #        )
    #    [^>]*>
    #''', re.S | re.X)

    site1 = urlread(url, timeouts=ARTICLE_TIMEOUTS)

    #match = CAFE_HOME_PATTERN.search(site1)
    #if not match:
    #    raise Exception("parse error")
    #url = match.group(3) or match.group(5)
    html = lxml.html.fromstring(site1)
    frame = html.cssselect('frame#down')[0]
    url = frame.get('src')

    return url
Example 47
def parse_article_album_list(url, text=None):
    ''' parse the article album list and return a list of article information tuples:
        (article_num, title, post_date, author, path, url)
    '''
    _type = namedtuple('BriefArticleInfo', 
        'article_num title post_date author path url fldid grpid'.split())

    # fetch
    if text is None:
        text = urlread(url, timeouts=ARTICLE_TIMEOUTS)


    html = lxml.html.fromstring(text)
    articles = html.cssselect('div.albumListBox li')

    def _parse(li):
        subject = li.cssselect('dd.subject a')[0]
        author  = li.cssselect('dd.nick a')[0]
        article_num, post_date = li.cssselect('dd.txt_sub.p11 span.num')
        href = subject.get('href')
        path = unescape(href)
        query_dict = urlparse.parse_qs(urllib.splitquery(path)[-1])
        return _type(
            int(article_num.text.strip()), 
            subject.text.strip(),
            post_date.text.strip(),
            author.text.strip(),
            href,
            get_domain(url, href),
            query_dict.get('fldid', [None])[0],
            query_dict.get('grpid', [None])[0],
        )

    return [_parse(li) for li in articles if not li.cssselect('div.blank_thumb')]
Example 48
def parsing_list_product(request):
    html = get_html(request)
    a_list = html.cssselect('ul.sitemap>li>a')
    for a in a_list:
        href = a.get('href')
        url_product.append(href)
        log.info('Ссылка на товар:%s', href)
Example 49
def parsing_category(request):
    global categories_data, filename_category

    parent_category_one = 0
    parent_category_two = 0

    html = get_html(request)

    ul = html.cssselect('ul.sitemap')[0]
    li_list = ul.cssselect('li')
    for li in li_list:
        li_class = li.get('class')
        if (li_class == 'level-0'):
            parent_category = 0
            a = li.cssselect('a')[0]
            category_name = a.text
            category_url = a.get('href')

            temp = [index_add(), parent_category, category_name, category_url]

            categories_data.append(temp)
            parent_category_one = categories_data[-1][0]

            log.info(
                'PARENT_CATEGORY_ID: %s, CATEGORY_NAME: %s, CATEGORY_URL: %s',
                parent_category, category_name, category_url)
        else:
            if (li_class == "level-1"):
                a = li.cssselect('a')[0]
                parent_category = parent_category_one

                category_name = a.text
                category_url = a.get('href')

                temp = [
                    index_add(), parent_category, category_name, category_url
                ]

                categories_data.append(temp)
                parent_category_two = categories_data[-1][0]
            else:
                if (li_class == "level-2"):

                    parent_category = parent_category_two

                    a = li.cssselect('a')[0]

                    category_name = a.text
                    category_url = a.get('href')

                    temp = [
                        index_add(), parent_category, category_name,
                        category_url
                    ]

                    categories_data.append(temp)

    with open(filename_category, "w", newline="", encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(categories_data)
Example 50
    def test_testing_topic_announce(self):
        """Controls topics that are of type announcement don't have sorted options"""
        # Creates posts for announcement topics
        forum = ForumFactory()
        PostFactory(topic=TopicFactory(forum=forum, type=Topic.TOPIC_ANNOUNCE))
        PostFactory(topic=TopicFactory(forum=forum, type=Topic.TOPIC_ANNOUNCE))

        user = UserFactory()
        assign_perm("can_read_forum", user, forum)
        self.client.force_login(user)

        response = self.client.get(f"/forum/forum/{forum.slug}-{forum.pk}/")

        html = lxml.html.fromstring(response.content)
        # Select the header block of the announcement block, the first block
        announce_block = str(
            etree.tostring(html.cssselect(".topiclist .card-header")[0]))

        # Check that announce_block is about announcements and not topics
        self.assertIn("Announcements", announce_block)
        self.assertNotIn("Topics", announce_block)
        self.assertIn("Replies", announce_block)
        self.assertIn("Views", announce_block)
        self.assertIn("Last post", announce_block)

        # There's no sortable information
        self.assertNotIn("sortable sorted", announce_block)
        # There's no column that has a sorting link on
        self.assertNotIn("<a href=", announce_block)
        # There's no toggle sorting
        self.assertNotIn("Toggle sorting", announce_block)
Example 51
def get_calendar(data: dict, limit: int = 31) -> str:
    result = []
    now = datetime.datetime.now(tz=TZ)
    for days in range(limit):
        date = now + datetime.timedelta(days=days)
        temp = data.get(date.year, {})
        temp = temp.get(date.month, {})
        temp = temp.get(date.day, {})
        if not temp: break
        info = []
        for key, value in temp.items():
            # "qdhd": 庆典活动
            # "tdz":  团队战
            # "tbhd": 特别活动
            # "jqhd": 剧情活动
            # "jssr": 角色生日
            if value and key in ['qdhd', 'tdz', 'tbhd', 'jqhd', 'jssr']:
                html = lxml.html.fromstring(value)
                nodes = html.cssselect('.cl-t')
                for node in nodes:
                    info.append(node.text)
        msg = '\n'.join(info)
        if not msg: continue
        result.append('\n'.join(['==========',
                                 date.strftime('%Y-%m-%d'), msg]))
    return '\n'.join(result)
Example 52
def get_hrefs(data,css_select):
    # soup = BeautifulSoup(data,'lxml')
    # ele = soup.select(css_select)
    html = lxml.html.fromstring(data)
    ele = html.cssselect(css_select)
    hrefs = [e for e in ele]
    cell = (hrefs[0].text,hrefs[1].get('href'))
    return cell
Example 53
def _get_metadata_citation(html, format=None):

    if format is None:
        format = 'chicago'

    citation = html.cssselect("div.citation#cite_%s" % format)
    citation = citation[0].text_content()
    return citation.replace('<', '&lt;').replace('>', '&gt;')
Example 54
def spiderboy(url):
    page = requests.get(url)
    html = lxml.html.fromstring(page.content.decode('gbk'))
    items = html.cssselect('#ATitle')

    for item in items:
        car_link = item.get('href')
        # logger.info('link: '+car_link)
        try:
            Car.objects.get(car_link=car_link)
            # logger.info('already have ' + car_link)
            pass
        except:
            car_title = str(item.text_content())
            # logger.info('title: '+car_title)
            car_icon = 'http://x.autoimg.cn/news/index/img/20110801/logo_new.png'
            car_des = ''

            innerpage = requests.get(car_link)
            innerhtml = lxml.html.fromstring(innerpage.content.decode('gbk'))

            try:
                next = base_url + innerhtml.cssselect('.page-item-readall')[0].get('href')
            except:
                next = None

            if next:
                innerpage = requests.get(next)
                innerhtml = lxml.html.fromstring(innerpage.content.decode('gbk'))

            try:
                innerhtml.cssselect('.diversion-box')[0].drop_tree()
            except:
                pass

            try:
                innerhtml.cssselect('.btn.font-normal')[0].drop_tree()
            except:
                pass


            article =innerhtml.cssselect('#articleContent')[0]
            mid_body = lxml.html.tostring(article,encoding=unicode)
            mid_body2 = cut(mid_body)
            r = re.compile(r'<a>|</a>')
            mid_body3 = cleaner.clean_html(mid_body2)
            car_body = mid_body3
            car_body = r.sub('',car_body)

            ca = Car(car_title=car_title,
                     car_des=car_des,
                     car_link=car_link,
                     car_body=car_body,
                     car_icon=car_icon,
                     car_source="autohome",
                     car_cate='car')

            ca.save()
Example 55
    def _parseResponse(self, queryText, url, html):
        for userEntry in html.cssselect('table.NoteDivWidth'):
            userInfo = userEntry.cssselect('tr table')[0]
            nickname = userInfo.cssselect('tr th')[0].text
            exchangeUrl = userInfo.cssselect('tr td')[-1].cssselect('a')[0].attrib['href']
            if any(source in exchangeUrl.lower() for source in self.sourceSubstringsToExclude):
                yield None
            else:
                shopFound = not exchangeUrl.endswith('.html')
                if shopFound:
                    cardSource = exchangeUrl
                    self.logger.warning('Found new shop: {}', exchangeUrl)
                else:
                    cardSource = self.getTitle() + '/' + nickname

                userCards = userEntry.cssselect('table.CardInfo')
                if len(userCards) > 0:
                    self.estimatedCardsCount += len(userCards) - 1

                for cardInfo in userCards:
                    cardName = cardInfo.cssselect('th.txt0')[0].text
                    cardUrl = exchangeUrl
                    if not shopFound:
                        cardUrl += '?Title={}'.format(cardName)

                    idSource = cardInfo.cssselect('nobr.txt0')[0].text
                    cardId = int(re.match(r'[^\d]*(\d+)[^\d]*', idSource).group(1)) if idSource else None

                    price = None
                    priceSource = cardInfo.cssselect('td.txt15')[-1].cssselect('b')
                    if len(priceSource) > 0:
                        possiblePrice = priceSource[-1].text
                        if possiblePrice is not None:
                            possiblePrice = possiblePrice.split()[0]
                            if possiblePrice.isdigit():
                                price = decimal.Decimal(possiblePrice)

                    foilness = len(cardInfo.cssselect('#FoilCard')) > 0

                    language = None
                    languageSource = cardInfo.cssselect('td.txt15')[0].cssselect('font')
                    if len(languageSource) > 0:
                        language = languageSource[0].text

                    setSource = cardInfo.cssselect('#table0 td img')[0].attrib['alt']

                    yield {
                        'id': cardId,
                        'name': cardName,
                        'foilness': foilness,
                        'set': setSource,
                        'language': language,
                        'price': price,
                        'currency': core.utils.Currency.RUR,
                        'count': int(cardInfo.cssselect('td.txt15 b')[0].text.split()[0]),
                        'source': self.packSource(cardSource, cardUrl),
                    }
Example 56
def _download_from_list_page(html, term):
    for title_node in html.cssselect(".tlistname a"):
        if term in title_node.text_content():
            download_node = title_node.xpath("../..")[0].cssselect(".tlistdownload a")[0]
            torrent_url = download_node.get("href")
            torrent = urlopen(torrent_url).read()
            return torrent
    logger.error("Search results from %s does not contain that term -- there probably is no torrent by that name.", term)
    return False
Example 57
def getBookInfo(str):
  html = lxml.html.fromstring(str)
  book = {}
  title = html.cssselect('title')[0].text
  book['title'] = title[0:title.find(',')]
  book['author'] = html.cssselect('a[href^="/book/author/"]')[0].get('alt')

  print("书名:", book['title'])
  print("作者:", book['author'])
  def getlink(a):
    link = a.get('href')
    return ("http://www.tadu.com"+link)
  def geta(div):
    return div.cssselect('a')[0]
  chapter = list(map(geta, html.cssselect('div.chapter_t')))
  book['links'] = list(map(getlink, chapter))
  print("共 %d 页" % len(book['links']))
  return book