def get_and_follow_links(initial_link, num_of_returned_links):
	# this function builds a simple Python dictionary that maps each URL
	# to all of the text scraped from that page
	sites={}
	r=requests.get(initial_link)
	tree=lh.fromstring(r.text)
	# using the more complicated XPath query shown in the second function
	text=tree.xpath('string(/html/head/title)')
	text+='\n'
	text+=tree.xpath('string(//body/*[not(self::script)])')
	text=text.split()
	text=' '.join(text)
	# encode as UTF-8 bytes so non-ASCII characters are handled consistently
	text=text.encode('utf-8')
	sites[initial_link]=text
	links=tree.xpath('//a/@href')
	for l in links[0:num_of_returned_links]:
		r=requests.get(l)
		tree=lh.fromstring(r.text)
		text=tree.xpath('string(/html/head/title)')
		text+='\n'
		text+= tree.xpath('string(//body/*[not(self::script)])')
		text=text.split()
		text=' '.join(text)
		text=text.encode('utf-8')
		sites[l]=text
		#taking a quick break so the websites don't get annoyed at you for too many requests
		time.sleep(random.randint(5,15))
	for i, x in enumerate(sites.items()):
		print i, x
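
A minimal usage sketch for the function above (the imports and the seed URL here are assumptions for illustration; the snippet expects requests, lxml.html imported as lh, time and random to be available):

import random
import time

import requests
import lxml.html as lh

# crawl the seed page plus the first three links found on it, then print the results
get_and_follow_links('https://example.com/', 3)
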
Example #2
    def download_sgml_doc(self, info, html_url, current_version=None):
        should_update_xml = False

        s = self.open_url(html_url, self.doc_type)
        doc = html.fromstring(s)
        # Find the link to the SGML
        el = doc.xpath(".//a[contains(., 'Rakenteinen asiakirja')]")
        if len(el) != 1:
            # retry
            self.http.nuke_cache(html_url, self.doc_type)
            s = self.open_url(html_url, self.doc_type)
            doc = html.fromstring(s)
            el = doc.xpath(".//a[contains(., 'Rakenteinen asiakirja')]")

        if current_version:
            ver_el = doc.xpath(".//div[@class='doclist-items']//div[@class='header']/span")
            assert len(ver_el) == 1, "Version element not found"
            m = re.search(r'([0-9]\.[0-9])', ver_el[0].text)
            assert m, "Version number not found (%s)" % ver_el[0].text
            doc_version = m.groups()[0]
            if doc_version != current_version:
                should_update_xml = True
                self.logger.debug("SGML document updated (version %s, stored version %s)" % (doc_version, current_version))

        if len(el) != 1:
            year = info['id'].split('/')[1]
            if int(year) <= 1999:
                return None
            raise ParseError("No link to SGML file found: %s" % html_url)
        doc.make_links_absolute(html_url)
        link = el[0].attrib['href']

        fname = link.split('/')[-1]
        m = re.match(r'^([a-z0-9_]+)\.sgm$', fname)
        if not m:
            raise ParseError("SGML filename invalid")
        fname_base = m.groups()[0]
        stored_sgml_fn = '%s/%s' % (self.sgml_storage, fname)

        if should_update_xml or not os.path.exists(stored_sgml_fn):
            self.logger.debug("downloading SGML file")
            try:
                s = self.open_url(link, self.doc_type)
            except HTTPError:
                # retry after nuking the cache
                self.http.nuke_cache(html_url, self.doc_type)
                self.open_url(html_url, self.doc_type)
                s = self.open_url(link, self.doc_type)
            f = open(stored_sgml_fn, 'w')
            f.write(s)
            f.close()

        xml_fn = '%s/%s.xml' % (self.xml_storage, fname_base)
        if should_update_xml or not os.path.exists(xml_fn):
            ret = os.spawnv(os.P_WAIT, self.sgml_to_xml,
                            [self.SGML_TO_XML, stored_sgml_fn, xml_fn])
            if ret:
                raise ParseError("SGML-to-XML conversion failed")

        return xml_fn
def processGithub(author, url, begin, end):
    assert(author['username'] != None)
    githubURL = url + '/pulls?q=is:pr+author:%s+is:%s+updated:%s..%s'
    #begin = '2014-11-01'
    #end = '2014-12-01'
    closedIssuesURL = githubURL % (author['username'], 'closed', begin, end)
    page = requests.get(closedIssuesURL)
    tree = html.fromstring(page.text)
    issueDetails = tree.xpath('//*[@class="issue-title-link js-navigation-open"]')
    closedIssues = []
    for issue in issueDetails:
        issueURL = 'https://github.com' + issue.get('href')
        issueTitle = issue.text.strip()
        closedIssues.append((issueURL, issueTitle))

    openIssuesURL = githubURL % (author['username'], 'open', begin, end)
    page = requests.get(openIssuesURL)
    tree = html.fromstring(page.text)
    openIssues = []
    issueDetails = tree.xpath('//*[@class="issue-title-link js-navigation-open"]')
    for issue in issueDetails:
        issueURL = 'https://github.com' + issue.get('href')
        issueTitle = issue.text.strip()
        openIssues.append((issueURL, issueTitle))

    return (closedIssues, openIssues)
Example #4
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.\n'+
                'URL was: '+request)
    
    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException('RoMEO did not provide the publisher\'s name.\n'+
                'URL was: '+request)

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass
def main():
    """
    Converts the Trello documentation into a data structure and prints it
    to standard output.

    """
    ep = requests.get(TRELLO_API_DOC).content
    root = html.fromstring(ep)

    links = root.xpath('//a[contains(@class, "reference internal")]/@href')
    pages = [requests.get(TRELLO_API_DOC + u)
             for u in links if u.endswith('index.html')]

    endpoints = []
    for page in pages:
        root = html.fromstring(page.content)
        sections = root.xpath('//div[@class="section"]/h2/..')
        for sec in sections:
            ep_html = etree.tostring(sec).decode('utf-8')
            ep_text = html2text(ep_html).splitlines()
            match = EP_DESC_REGEX.match(ep_text[0])
            if not match:
                continue
            ep_method, ep_url = match.groups()
            ep_text[0] = ' '.join([ep_method, ep_url])
            ep_doc = b64encode(gzip.compress('\n'.join(ep_text).encode('utf-8')))
            endpoints.append((ep_method, ep_url, ep_doc))

    print(yaml.dump(create_tree(endpoints)))
Example #6
    def test_edit_post(self):
        self.login_client()
        edit_post_url = reverse('pybb:edit_post', kwargs={'pk': self.post.id})
        response = self.client.get(edit_post_url)
        self.assertEqual(response.status_code, 200)
        tree = html.fromstring(response.content)
        values = dict(tree.xpath('//form[@method="post"]')[0].form_values())
        values['body'] = 'test edit'
        response = self.client.post(edit_post_url, data=values, follow=True)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(Post.objects.get(pk=self.post.id).body, 'test edit')
        response = self.client.get(self.post.get_absolute_url(), follow=True)
        self.assertContains(response, 'test edit')

        # Check admin form
        self.user.is_staff = True
        self.user.save()
        response = self.client.get(edit_post_url)
        self.assertEqual(response.status_code, 200)
        tree = html.fromstring(response.content)
        values = dict(tree.xpath('//form[@method="post"]')[0].form_values())
        values['body'] = 'test edit'
        values['login'] = '******'
        response = self.client.post(edit_post_url, data=values, follow=True)
        self.assertEqual(response.status_code, 200)
        self.assertContains(response, 'test edit')
def processRietveld(author, guid, begin, end):
    reitveldURL = 'https://codereview.chromium.org/search?closed=%s&owner=%s&repo_guid=%s&modified_after=%s&modified_before=%s&limit=30'
    assert(author != None and guid != None and begin != None and end != None)
    email = ''
    if type(author['email']) == list:
        email = author['email'][0]
    else:
        email = author['email']
    closedIssuesURL = reitveldURL % ('2', email, guid, begin, end)
    page = requests.get(closedIssuesURL)
    tree = html.fromstring(page.text)
    issueDetails = tree.xpath('//*[@class="subject"]/a/text()')
    closedIssues = []
    for i in xrange(0, len(issueDetails), 2):
        issueURL = 'https://codereview.chromium.org/' + issueDetails[i]
        issueTitle = issueDetails[i + 1].strip()
        closedIssues.append((issueURL, issueTitle))

    openIssuesURL = reitveldURL % ('3', email, guid, begin, end)
    page = requests.get(openIssuesURL)
    tree = html.fromstring(page.text)
    issueDetails = tree.xpath('//*[@class="subject"]/a/text()')
    openIssues = []
    for i in xrange(0, len(issueDetails), 2):
        issueURL = 'https://codereview.chromium.org/' + issueDetails[i]
        issueTitle = issueDetails[i + 1].strip()
        openIssues.append((issueURL, issueTitle))

    return (closedIssues, openIssues)
Example #8
    def test_get_imdblink(self):
        html = u"""
        <div>
            <div class="wikibase-statementview-mainsnak">
                <div>
                    <div class="wikibase-snakview-value">
                        <a class="wb-external-id" href="http://www.imdb.com/tt0433664">
                            tt0433664
                        </a>
                    </div>
                </div>
            </div>
        </div>
        """
        html_etree = fromstring(html)
        imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')

        html = u"""
        <div>
            <div class="wikibase-statementview-mainsnak">
                <div>
                    <div class="wikibase-snakview-value">
                        <a class="wb-external-id"
                           href="href="http://tools.wmflabs.org/...http://www.imdb.com/&id=nm4915994"">
                            nm4915994
                        </a>
                    </div>
                </div>
            </div>
        </div>
        """
        html_etree = fromstring(html)
        imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')
        self.assertIn('https://www.imdb.com/name/nm4915994', imdblink)
Example #9
    def test_get_geolink(self):
        html = u"""
        <div>
            <div class="wikibase-statementview-mainsnak">
                <div>
                    <div class="wikibase-snakview-value">
                        60°N, 40°E
                    </div>
                </div>
            </div>
        </div>
        """
        html_etree = fromstring(html)
        geolink = wikidata.get_geolink(html_etree)
        self.assertIn('https://www.openstreetmap.org/', geolink)
        self.assertIn('lat=60&lon=40', geolink)

        html = u"""
        <div>
            <div class="wikibase-statementview-mainsnak">
                <div>
                    <div class="wikibase-snakview-value">
                        34°35'59"S, 58°22'55"W
                    </div>
                </div>
            </div>
        </div>
        """
        html_etree = fromstring(html)
        geolink = wikidata.get_geolink(html_etree)
        self.assertIn('https://www.openstreetmap.org/', geolink)
        self.assertIn('lat=-34.59', geolink)
        self.assertIn('lon=-58.38', geolink)
Example #10
File: http.py Project: yamingd/Data
def get_htmldoc(url, encode='utf8', timeout=60):
    if url.startswith('file://'):
        with open(url[7:]) as f:
            content = f.read()
            content = content.decode(encode, 'ignore')
            print content.__class__
            try:
                content = cleaner.clean_html(content)
            except:
                pass
            doc = fromstring(content)
            return doc
    code, data = _getcontent(url, timeout=timeout)
    #print code
    #encode = 'utf8'
    if code:
        encode = code
    codedata = data.decode(encode, 'ignore')
    try:
        print codedata.__class__
        codedata = cleaner.clean_html(codedata)
    except:
        print 'Error: ', url
        log.exception('unexpected error:%s(%s)' % (url, encode))
        #raise
    #with open('error_page.html', 'w+') as fw:
    #    fw.write(codedata.encode(encode))
    doc = fromstring(codedata)
    # log.debug(codedata)
    return doc
 def get_posts_list(self):
     profile_page = self.user_url + '/profile'
     r = self.s.get(profile_page)
     self.output_html(text=r.text, filename='profilepage')   # step 2
     
     rizhi_tab = profile_page + '?v=blog_ajax&undefined'
     r = self.s.get(rizhi_tab)
     self.output_html(text=r.text, filename='rizhi_tab')   # step 3 
     
     first_blog_url = html.fromstring(r.text).cssselect('[stats="blog_blog"]')[0].attrib['href']
     first_blog_title = html.fromstring(r.text).cssselect('[stats="blog_blog"]')[0].text
     r = self.s.get(first_blog_url)
     print("Generating 《%s》" % first_blog_title)
     self.output_html(text=r.text, filename='0.'+first_blog_title)
     
     # detect the end of the list from the status code or a page element
     for i in range(1,10000):
         try:
             next_blog_url = html.fromstring(r.text).cssselect(".a-nav .float-right a")[0].attrib['href']
             next_blog_title = html.fromstring(r.text).cssselect(".a-nav .float-right a")[0].text.lstrip('较旧一篇:')
             r = self.s.get(next_blog_url)
             print("Generating 《%s》" % next_blog_title)
             next_blog_title = re.sub(r'[<>"*\\/|?]', '', next_blog_title)   # strip characters that are illegal in filenames; ':' becomes '-' below
             next_blog_title = re.sub(':', '-', next_blog_title)
             self.output_html(text=r.text, filename=str(i)+'.'+next_blog_title)
         except:
             print("Unexpected error:", sys.exc_info()[0])
             print('Exiting program...')
             break
Example #12
def parse_updates_html_str(html_str):
    course_upd_collection = []
    if html_str == '':
        return {"updates": course_upd_collection}

    try:
        course_html_parsed = html.fromstring(html_str)
    except:
        escaped = django.utils.html.escape(html_str)
        course_html_parsed = html.fromstring(escaped)

    print type(course_html_parsed)

    if course_html_parsed.tag == 'section':
        for index, update in enumerate(course_html_parsed):
            if len(update) > 0:
                content = _course_info_content(update)

                computer_id = len(course_html_parsed) - index

                payload = {
                    "id": computer_id,
                    "date": update.findtext("h2"),
                    "content": content
                }

                course_upd_collection.append(payload)

    return {"updates": course_upd_collection}
    def __init__(self, email, password):

        """
        Initiates the session and Logs in with the provided credentials

        :param email:
        :param password:
        :return:
        """

        self.session_request = requests.session()
        result = self.session_request.get(self.login_url)
        login_page = html.fromstring(result.text)
        token = list(set(login_page.xpath("//input[@name='fkey']/@value")))[0]

        self.credentials = {
            "email": email,
            "password": password,
            "fkey": token
        }

        result = self.session_request.post(
            self.login_url,
            data=self.credentials,
            headers=dict(referer=self.login_url)
        )

        self.main_page = html.fromstring(result.content)
        self.page_title = str(self.main_page.xpath("//title/text()")[0])

        if self.page_title == "Stack Overflow":
            print "Login Successful" + "\n"
            self.questions = self.main_page.xpath("//div[@id='question-mini-list']")[0]
        else:
            print "Invalid Credentials" + "\n"
def get_player_stats(site, headers):
    page1 = requests.get(site)
    tree1 = html.fromstring(page1.text)
    stats1 = tree1.xpath('//td[@align="right"]')
    stats1 = [x.text for x in stats1]
    stats1 = [xnum(x,0) for x in stats1]
    names1 = tree1.xpath('//td[@align="left"][@class=" highlight_text"]/a/text()')
    teams1 = tree1.xpath('//td[@align="left"]/a[starts-with(@href,"/teams")]/text()')

    text_addon = "&offset="
    seq_addon = range(100,401,100)
    for a in seq_addon:
        page_temp = requests.get(site+text_addon+str(a))
        tree_temp = html.fromstring(page_temp.text)
        stats_temp = tree_temp.xpath('//td[@align="right"]')
        stats_temp = [x.text for x in stats_temp]
        stats_temp = [xnum(x,0) for x in stats_temp]
        stats1.extend(stats_temp)
        names_temp = tree_temp.xpath('//td[@align="left"][@class=" highlight_text"]/a/text()')
        teams_temp = tree_temp.xpath('//td[@align="left"]/a[starts-with(@href,"/teams")]/text()')
        names1.extend(names_temp)
        teams1.extend(teams_temp)

    stats1 = np.array(stats1).reshape((len(stats1)/26,26))
    stats1 = pd.DataFrame(stats1, columns=headers)
    stats1['name'] = names1
    stats1['team'] = teams1
    return stats1
Example #15
def ShowCartoons(title, url, page_count):

	oc = ObjectContainer(title1 = title)
	thisurl = url
	thisletter = url.split("=",1)[1]
	page = scraper.get(BASE_URL + '/CartoonList' + url + '&page=' + page_count)
	page_data = html.fromstring(page.text)

	for each in page_data.xpath("//tr/td[1]"):
		content = HTML.ElementFromString(each.xpath("./@title")[0])
		url = content.xpath("./div/a[@class='bigChar']/@href")[0]
		title = content.xpath("./div/a[@class='bigChar']/text()")[0].strip()

		thumbhtml = scraper.get(BASE_URL + url)
		page_html = html.fromstring(thumbhtml.text)
		thumb = page_html.xpath("//link[@rel='image_src']/@href")[0]
		Log(thumb)

		oc.add(DirectoryObject(
			key = Callback(ShowEpisodes, title = title, url = url),
				title = title,
				thumb = thumb
				)
		)
	oc.add(NextPageObject(
		key = Callback(ShowCartoons, title = thisletter.upper(), url = thisurl, page_count = int(page_count) + 1),
		title = "More...",
		thumb = R(ICON_NEXT)
			)
		)
	return oc
Example #16
File: man.py Project: tjcsl/cslbot
def cmd(send, msg, args):
    """Gets a man page.

    Syntax: {command} [section] <command>

    """
    parser = arguments.ArgParser(args['config'])
    parser.add_argument('section', nargs='?')
    parser.add_argument('command')
    try:
        cmdargs = parser.parse_args(msg)
    except arguments.ArgumentException as e:
        send(str(e))
        return
    if cmdargs.section:
        html = get('http://linux.die.net/man/%s/%s' % (cmdargs.section, cmdargs.command))
        short = fromstring(html.text).find('.//meta[@name="description"]')
        if short is not None:
            short = short.get('content')
            send("%s -- http://linux.die.net/man/%s/%s" % (short, cmdargs.section, cmdargs.command))
        else:
            send("No manual entry for %s in section %s" % (cmdargs.command, cmdargs.section))
    else:
        for section in range(0, 8):
            html = get('http://linux.die.net/man/%d/%s' % (section, cmdargs.command))
            if html.status_code == 200:
                short = fromstring(html.text).find('.//meta[@name="description"]')
                if short is not None:
                    short = short.get('content')
                    send("%s -- http://linux.die.net/man/%d/%s" % (short, section, cmdargs.command))
                    return
        send("No manual entry for %s" % cmdargs.command)
Example #17
def extract_info(url, requester):

    response = retrieve(url, requester)

    if response:

        tree = html.fromstring(response.content)

        members = itertools.chain(parse(tree, MXPATH))

        next = parse(tree, NXPATH)[0]

        while next:

            response = retrieve(HOST + next, requester)

            if response:

                tree = html.fromstring(response.content)

                members = itertools.chain(members, parse(tree, MXPATH))

                next = parse(tree, NXPATH)[0]

            else:

                break

        return members

    else:

        return itertools.chain()
Example #18
def crawler(url_ip):
    global eqid, counter, serial, body_list
    print "Starting Crawler Service for: " + url_ip

    url = "http://" + url_ip + "/cgi-bin/dynamic/printer/config/reports/deviceinfo.html"
    urleqid = "http://" + url_ip + "/cgi-bin/dynamic/topbar.html"
    response = requests.get(url)
    tree = html.fromstring(response.text)
    # the XPath expressions were worked out by inspecting the page source in the browser (e.g. Chrome DevTools)
    counter = tree.xpath('//td[contains(p,"Count")]/following-sibling::td/p/text()')
    serial = tree.xpath('//td[contains(p, "Serial")]/following-sibling::td/p/text()')
    counter = counter[0].split(' ')[3]
    serial = serial[0].split(' ')[3]
    responseeqid = requests.get(urleqid)
    treeequid = html.fromstring(responseeqid.text)
    eqid = treeequid.xpath('//descendant-or-self::node()/child::b[contains(., "Location")]/text()')[1].split(' ')[-1]

    # print basic data
    print " -- equipment id found: " + eqid
    print " -- count found: " + counter
    print " -- serial found: " + serial


    body_of_email = "Equipment ID = " + eqid + "<br>Total Meter Count = " + counter + "<br>Serial Number = " + serial + "<br><br>"
    body_list.append(body_of_email)


    print "Stopping Crawler Service for: " + url_ip
    return
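
The XPath expressions above can be sanity-checked offline before pointing the crawler at a live device; a small sketch against synthetic markup (not the printer's real page structure):

from lxml import html

sample = html.fromstring(
    '<table>'
    '<tr><td><p>Page Count</p></td><td><p>Count: 12345</p></td></tr>'
    '<tr><td><p>Serial Number</p></td><td><p>Serial: ABC123</p></td></tr>'
    '</table>'
)
# same following-sibling pattern used in crawler() above
print(sample.xpath('//td[contains(p,"Count")]/following-sibling::td/p/text()'))
# expected: ['Count: 12345']
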
Example #19
 def get_posts_list(self):
     profile_page = self.user_url + '/profile'
     r = self.s.get(profile_page)
     self.output_html(text=r.text, filename='profilepage')   # step 2
     
     rizhi_tab = profile_page + '?v=blog_ajax&undefined'
     r = self.s.get(rizhi_tab)
     self.output_html(text=r.text, filename='rizhi_tab')   # step 3 
     
     first_blog_url = html.fromstring(r.text).cssselect('[stats="blog_blog"]')[0].attrib['href']
     first_blog_title = html.fromstring(r.text).cssselect('[stats="blog_blog"]')[0].text
     r = self.s.get(first_blog_url)
     if sys.stdout.encoding == 'UTF-8':
         print("Generating 《%s》" % first_blog_title)
     else:
         print('0')
     self.output_html(text=r.text, filename='0.'+first_blog_title)
     
     # detect the end of the list from the status code or a page element
     for i in range(1, 10000):
         next_blog_element = html.fromstring(r.text).cssselect(".a-nav .float-right a")
         if next_blog_element:
             next_blog_url = next_blog_element[0].attrib['href']
         else:
             break   # already the last blogpost
         next_blog_title = html.fromstring(r.text).cssselect(".a-nav .float-right a")[0].text.lstrip('较旧一篇:')
         r = self.s.get(next_blog_url)
         if sys.stdout.encoding == 'UTF-8':
             print("Generating 《%s》" % next_blog_title)
         else:
             print(i)
         next_blog_title = re.sub(r'[<>"*\\/|?]', '', next_blog_title)   # strip characters that are illegal in filenames; ':' becomes '-' below
         next_blog_title = re.sub(':', '-', next_blog_title)
         self.output_html(text=r.text, filename=str(i)+'.'+next_blog_title)
Example #20
    def __loadPages(self, maxPages):
        response = requests.get(self.baseUrl)
        firstpage = html.fromstring(response.text)
        pagination = firstpage.xpath('//div[@class="pagination loop-pagination"]')[0]
        pagecount = int(max(pagination.xpath('//a[@class="page-numbers"]/text()')))
        pagecount = pagecount if maxPages == 0 else maxPages

        for pagenumber in range(1, pagecount + 1):
            print "Scraping page %d... (%s)" % (pagenumber, (self.baseUrl + "/page/%d" % pagenumber))
            if pagenumber == 1:
                page = firstpage
            else:
                response = requests.get(self.baseUrl + "/page/%d" % pagenumber)
                page = html.fromstring(response.text)

            articles = page.xpath("//article")
            for article in articles:
                # Header
                header = article.xpath('.//h1[@class="entry-title"]/a/text()')
                if not header:
                    header = article.xpath('.//h1[@class="entry-title"]/a/span/text()')
                post = header[0].upper() + "\n\n"
                # Body
                post += "\n".join(article.xpath('.//div[@class="entry-content"]/p/text()')).strip()
                self.posts.append(post)
Example #21
def make_ecas_session():
    session = requests.Session()
    data = {"lgid": "en", "action": "gp"}
    res = session.get("http://ted.europa.eu/TED/browse/browseByBO.do")
    res = session.post(
        "http://ted.europa.eu/TED/main/HomePage.do?pid=secured", data=data, cookies={"lg": "en"}, allow_redirects=True
    )
    a = html.fromstring(res.content).find('.//div[@id="main_domain"]/a[@title="External"]')
    res = session.get(a.get("href"))
    form = html.fromstring(res.content).find('.//form[@id="loginForm"]')
    data = dict([(i.get("name"), i.get("value")) for i in form.findall(".//input")])
    data["username"] = ECAS_USER
    data["password"] = ECAS_PASSWORD
    data["selfHost"] = "webgate.ec.europa.eu"
    data["timeZone"] = "GMT-04:00"
    res = session.post(form.get("action"), data=data, allow_redirects=True)
    doc = html.fromstring(res.content)
    # print res.content
    form = doc.find('.//form[@id="showAccountDetailsForm"]')
    # print form
    data = dict([(i.get("name"), i.get("value")) for i in form.findall(".//input")])
    res = session.post(form.get("action"), data=data, allow_redirects=True)
    doc = html.fromstring(res.content)
    link = filter(lambda a: "redirecting-to" in a.get("href", ""), doc.findall(".//a"))
    res = session.get(link.pop().get("href"))
    log.info("ECAS Session created.")
    return session
def getFpDkData():
    sleep(randint(2, 5))
    url = "http://www.fantasypros.com/nfl/draftkings-lineup-optimizer.php"
    try:
        response = requests.get(url)
        tree = html.fromstring(response.text)
    except:
        try:
            response = requests.get(url)
            tree = html.fromstring(response.text)
        except:
            print "Couldn't scrape:", url
            return []

    names         = getDkName(tree)
    points        = getPlayerPoints(tree)
    salaries      = getPlayerSalaries(tree)
    positions     = getPlayerPosition(tree)
    opponents     = getPlayerOpponent(tree)

    players = []
    for i in xrange(min(len(names), len(points), len(salaries))):
        player = {}
        player["Name"]         = names[i]
        player["Position"]     = positions[i][0]
        player["PositionRank"] = positions[i][-1]
        player["dkPoints"]     = points[i]
        player["dkSalary"]     = salaries[i]
        # player["Opponent"]     = opponents[i]
        if player["Position"] != "NR":
        	players.append(player)
        	print "Scraped Fantasy Pros for DK Points/Salary for:", player["Name"]

    return players
Example #23
def get_match(match_id):
    f = open("data/reget.json", "a+")
    r = s.get('http://www.oddsportal.com/a/b/c/d-%s/' % match_id)
    tree = html.fromstring(r.text)
    try:
        match = {
            'match_id': match_id
        }
        print(match_id)
        match = match_dl.get_match(match)
        name = tree.xpath(
            '//div[@id="col-content"]/h1')[0].text_content().split(' - ')
        match['home'] = name[0]
        match['away'] = name[1]
        match['event'] = get_league_info(r.url)[1:]
        event_request = requests.get(
            'http://www.soccer24.com/match/' + match['match_id'])
        event_tree = html.fromstring(event_request.text)
        phrases = event_tree.xpath(
            '//table[@class="detail"]//a/text()')[0].split(' - ')[1:]
        match['event'] += phrases[::-1]
        f.write(json.dumps(match) + '\n')
    except:
        fail = open("to_reget.dat", 'a+')
        fail.write(match_id + '\n')
        fail.close()
    f.close()
Example #24
def removeWikiDuplicates(rd_file, syn_file):
	with open(rd_file,"r") as f:
		htmls = h.fromstring(f.read())
	rd_pages = htmls.xpath('page')
	with open(syn_file,"r") as f:
		htmls = h.fromstring(f.read())
	syn_pages = htmls.xpath('page')

	rd_titles = []
	for page in rd_pages:
		rd_titles.append(page.xpath("title")[0].text_content())
	syn_titles = []
	for page in syn_pages:
		syn_titles.append(page.xpath("title")[0].text_content())

	sub_syn_inds = []
	for i,title in enumerate(syn_titles):
		if not title in rd_titles:
			sub_syn_inds.append(i)

	open(syn_file,"w").close()
	f = open(syn_file,"a")	
	for i in sub_syn_inds:
		f.write(h.tostring(syn_pages[i]))
	f.close()
Example #25
    def test_process_idname(self):
        from mobilize.components import CssPath, XPath
        src_html = '''<div>
<nav>
  <a href="/A">A</a>
  <a href="/B">B</a>
</nav>
'''
        def prep_component(**kw):
            return c
        # check that default_idname is required if self.idname not defined
        c1 = CssPath('nav')
        c1.extract(html.fromstring(src_html))
        with self.assertRaises(AssertionError):
            c1.process()

        # check that an explicit idname argument is accepted
        c2 = CssPath('nav', idname='foo')
        c2.extract(html.fromstring(src_html))
        c2.process() # no AssertionError on this line means the test passes

        # check that default_idname suppresses the error
        c3 = CssPath('nav')
        c3.extract(html.fromstring(src_html))
        c3.process('foo') # no AssertionError on this line means the test passes
Example #26
def get_info_from_filmup(film_url):
	page = requests.get(film_url)
	tree = html.fromstring(page.content)

	#Fetch the infos as a list:
	info = tree.xpath('//div[@id="container"]/table/tr/td/div/table/tr/td/table/tr/td/table/tr/td/font/node()')
	print len(info)
	info = [get_text(i) for i in info]
	plot = tree.xpath('//div[@id="container"]/table/tr/td/div/table/tr/td/table/tr/td/font/text()')
	image = tree.xpath('//div[@id="container"]/table/tr/td/div/table/tr/td/table/tr/td/table/form/tr/td/a[@class="filmup"]/@href')
	plot[1] = parse_apostrophe(plot[1])
	plot[1] = substitue_accents(plot[1])
	
	#Fetch large image url
	image_page = requests.get('http://filmup.leonardo.it/'+image[0])
	tree = html.fromstring(image_page.content)
	
	image_big = tree.xpath('//div[@id="container"]/table/tr/td/div/div/img/@src')

	#Download image in local folder
	s = r'/sc_(.[^\.]*)\.htm'
	#print re.findall(s,film_url)
	img_title = "images/"+re.findall(s,film_url)[0]+".jpg"
	urlretrieve('http://filmup.leonardo.it'+image_big[0], img_title)
	#resize image:
	image = Image.open(img_title)
	image_small = resize_image(image,height=330)
	image_small.save("images/"+re.findall(s,film_url)[0]+"_small.jpg")
	image_fullsize = resize_image(image,height=600)
	image_fullsize.save("images/"+re.findall(s,film_url)[0]+".jpg")
	
	res = merge_infos(info, plot)
	
	return res
Example #27
    def test_innerhtml(self):
        from mobilize.components import XPath
        html_str = '''<table><tr><td>Hello</td></tr></table>'''
        # test for innerhtml=False
        component_f = XPath('//td', idname='foo', innerhtml=False)
        component_f.extract(html.fromstring(html_str))
        extracted = component_f.process()
        extracted_str = html.tostring(extracted)
        expected = '<div class="mwu-elem" id="foo"><td>Hello</td></div>'
        e = normxml(expected)
        a = normxml(extracted_str)
        self.assertSequenceEqual(e, a)
        
        # test for innerhtml=True
        component_t = XPath('//td', idname='foo', innerhtml=True)
        component_t.extract(html.fromstring(html_str))
        extracted = component_t.process()
        extracted_str = html.tostring(extracted)
        expected = '<div class="mwu-elem" id="foo">Hello</div>'
        self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
        
        # test for ineffectiveness of innerhtml=True with multiple matching elements
        component_t = XPath('//td', idname='foo', innerhtml=True)
        component_t.extract(html.fromstring('''
<table><tr>
<td>Hello</td>
<td>Goodbye</td>
</tr></table>
'''))
        extracted = component_t.process()
        extracted_str = html.tostring(extracted)
        expected = '<div class="mwu-elem" id="foo"><td>Hello</td><td>Goodbye</td></div>'
        self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
Example #28
def pyld_36kr():
    """<a style="color:#000000;" target="_blank" href="http://36kr.com/" title="36氪是一个关注互联网创业的科技博客,旨在帮助互联网创业者实现创业梦。我们相信每个人都可以像来氪星人超人那样强大无比。还行吧,有质有量还有料">36kr-首页</a>"""
    starttime = time.time()
    my_title = pyld_36kr.__doc__
    title_clean = re.sub("<.*?>", "", my_title)
    column = 6
    iscover = 1
    try:
        r = requests.get("http://36kr.com/")
        xpath1 = fromstring(r.text).xpath
        items = xpath1("//article")
        newurl = "http://36kr.com" + xpath1('//a[@id="info_flows_next_link"]/@href')[0]
        r = requests.get(newurl)
        items = items + fromstring(r.text).xpath("//article")
        items = [i for i in items if i.xpath("./div/div/span/time/@datetime")]
        urls = ["http://36kr.com" + i.xpath("./a/@href")[0] for i in items]
        covers = [i.xpath("./a/@data-lazyload")[0] for i in items]
        titles = [i.xpath("./div/a/text()")[0] for i in items]
        sums = [i.xpath('./div/div[@class="brief"]/text()')[0] for i in items]
        ptime = [
            '<div align="right"><br>%s</div>' % re.sub(" \+\d\d\d\d$", "", i.xpath("./div/div/span/time/@datetime")[0])
            for i in items
        ]
        sums = ["<br>".join(i) for i in list(zip(sums, ptime))]
        aa = [i for i in list(zip(covers, titles, urls, sums)) if thisday.strftime("%Y-%m-%d") in i[3]]

    except Exception as e:
        print("%s  %s" % (title_clean, e))
        aa = [["error"] * 4]
        iscover = 0
    runtime1 = round(time.time() - starttime, 3)
    print(title_clean, "finished in %s seconds" % runtime1)
    return [my_title, aa, column, iscover]
Example #29
	def open(self, url=None):
	
		try:
			if url is not None:
				if url in self.storedlinks: return -1
				
				else:
					if ("http://" in url) or ("https://" in url):
						self.current = ulib.urlopen(url)
						self.source = self.current.read()
						self.storedlinks.add(url)
						self.soup = html.fromstring(self.source)
						return 1
					else:
						return None

			else:
				self.current = ulib.urlopen(self.root)
				self.source = "".join(self.current.readlines())
				self.soup = html.fromstring(self.source)
				return 1
				
				
		except ValueError as ve:
			print ve.message
			return None
			
		except ulib.HTTPError as ht:
			print ht.message
			return None
			
		except ulib.URLError as u:
			print "Could not connect to given URL"
			return None
def main():
    session_requests = requests.session()

    # Get login csrf token
    result = session_requests.get(LOGIN_URL)
    tree = html.fromstring(result.text)
    authenticity_token = list(set(tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")))[0]

    # Create payload
    payload = {
        "username": username, 
        "password": password, 
        "csrfmiddlewaretoken": authenticity_token
    }

    # Perform login
    result = session_requests.post(LOGIN_URL, data = payload, headers = dict(referer = LOGIN_URL))

    # Scrape url
    result = session_requests.get(URL, headers = dict(referer = URL))
    tree = html.fromstring(result.content)
    bucket_elems = tree.findall(".//a[@class='execute']")
    bucket_names = [bucket_elem.text_content().replace("\n", "").strip() for bucket_elem in bucket_elems]

    print bucket_names
 }
 print('翻页###################', begin)
 query_fakeid_response = requests.get(appmsg_url,
                                      cookies=cookies,
                                      headers=header,
                                      params=query_id_data)
 fakeid_list = query_fakeid_response.json().get('app_msg_list')
 try:
     for item in fakeid_list:
         msg_link = item.get('link')
         # print(msg_link)
         msg_title = item.get('title')
         if int(item.get('update_time')) > int('1519833600'):
             print(msg_link + 'continue')
             get_msg_response = requests.get(msg_link)
             tree = html.fromstring(get_msg_response.text)
             try:
                 # if tree.xpath('//*[@id="meta_content"]/em[2]/text()')[0] == kevin:
                 print(msg_title, end='\n')
                 # process the message text
                 msg_content = tree.xpath(
                     '//*[@id="js_content"]//text()')
                 with open('sentence.txt', 'a',
                           encoding='utf-8') as f:
                     f.write(
                         '\n\n====================================================================\n\n'
                         + msg_title + '\n\n')
                     for text in msg_content:
                         f.write(text + '\n')
                     f.write(
                         "\n\n====================================================================\n\n"
Example #32
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches to a item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the origin PNG, JPG or whatever is given
    # first link per image-div contains a <img> with the data-iid for bas64 encoded image data::
    #      <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #      <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #      <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #      <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, '@alt')[0]

            img_base64_id = eval_xpath(img_node, '@data-iid')
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, '@src')
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, '@data-src')
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ''

            link_node = eval_xpath(img_node, '../../../a[2]')[0]
            url = eval_xpath(link_node, '@href')[0]

            pub_nodes = eval_xpath(link_node, './div/div')
            pub_descr = img_alt
            pub_source = ''
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            img_src_id = eval_xpath(img_node, '../../../@data-id')[0]
            src_url = scrap_img_by_id(img_src_script, img_src_id)
            if not src_url:
                src_url = thumbnail_src

            results.append({
                'url': url,
                'title': img_alt,
                'content': pub_descr,
                'source': pub_source,
                'img_src': src_url,
                # 'img_format': img_format,
                'thumbnail_src': thumbnail_src,
                'template': 'images.html'
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results
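
The comment block inside response() describes the expected layout of a result; below is a standalone sketch of the same parent/child navigation with plain lxml, run against synthetic markup (the structure only mirrors the comments, it is not Google's live markup):

from lxml import html

sample = html.fromstring(
    '<div id="islmp"><div data-id="abc123">'
    '<a><div><img class="rg_i" data-iid="0" alt="example image"></div></a>'
    '<a href="https://example.org/page">'
    '<div><div>Example description</div><div>example.org</div></div>'
    '</a></div></div>'
)
for img_node in sample.xpath('.//img[contains(@class, "rg_i")]'):
    link_node = img_node.xpath('../../../a[2]')[0]             # second <a> of the result div
    print(link_node.get('href'))                               # target URL
    print(img_node.xpath('../../../@data-id')[0])              # key into the embedded JSON data
    print([d.text for d in link_node.xpath('./div/div')])      # description and publisher
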
def parseProduct(asin, amazon_url, retrying_time):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    url = amazon_url + "/dp/" + asin

    try:
        # verify=False could be added here to avoid SSL-related issues
        response = requests.get(url, headers=headers)

        doc = html.fromstring(response.content)
        XPATH_NAME = '//h1[@id="title"]//text()'
        XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
        XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
        XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
        XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
        XPATH_RATING = '//span[@id="acrPopover"]'
        XPATH_REVIEWS_NUMBER = '//span[@id="acrCustomerReviewText"]//text()'

        raw_name = doc.xpath(XPATH_NAME)
        raw_sale_price = doc.xpath(XPATH_SALE_PRICE)
        raw_category = doc.xpath(XPATH_CATEGORY)
        raw_original_price = doc.xpath(XPATH_ORIGINAL_PRICE)
        raw_availability = doc.xpath(XPATH_AVAILABILITY)
        raw_rating_elem = doc.xpath(XPATH_RATING)
        raw_product_rating = []
        if raw_rating_elem != []:
            for elem in raw_rating_elem:
                raw_product_rating = elem.attrib['title']
        raw_number_of_review = doc.xpath(XPATH_REVIEWS_NUMBER)

        name = ' '.join(''.join(raw_name).split()) if raw_name else None
        sale_price = ' '.join(''.join(
            raw_sale_price).split()).strip() if raw_sale_price else None
        category = ' > '.join([i.strip() for i in raw_category
                               ]) if raw_category else None
        original_price = ''.join(
            raw_original_price).strip() if raw_original_price else None
        availability = ''.join(
            raw_availability).strip() if raw_availability else None
        rating = ''.join(raw_product_rating).replace(
            ' su 5 stelle', '') if raw_product_rating else None
        reviews_number = ''.join(raw_number_of_review).replace(
            ' recensioni clienti', '') if raw_number_of_review else None

        if not original_price:
            original_price = sale_price

        # retrying in case of captcha (only first time)
        if not name:
            if retrying_time:
                raise ValueError('captcha')
            return parseProduct(asin, amazon_url, True)

        data = {
            'name': name,
            'salePrice': sale_price,
            'category': category,
            'originalPrice': original_price,
            'availability': availability,
            'url': url,
            'date': datetime.datetime.now().strftime("%d-%m-%Y %H:%M"),
            'numberOfReviews': reviews_number,
            'productRating': rating
        }
        return data

    except:
        print("Error scraping product info")
Example #34
# date: 2019.05.09
# author: Bartłomiej 'furas' Burek
# https://stackoverflow.com/questions/56059703/how-can-i-make-lxml-save-two-pages-to-the-pages-so-it-can-be-read-by-the-tree

from lxml import html
import requests

data = {
    'BTC': 'id-bitcoin',
    'TRX': 'id-tron',
    # ...
    'HC': 'id-hypercash',
    'XZC': 'id-zcoin',
}

all_results = {}

for url in ('https://coinmarketcap.com/', 'https://coinmarketcap.com/2'):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    print(tree.cssselect('body'))
    for key, val in data.items():
        result = tree.xpath('//*[@id="' + val + '"]/td[4]/a/text()')
        print(key, result)
        if result:
            all_results[key] = result[0]

print('---')
print(all_results)
from lxml import html, etree
from urllib.request import urlopen

pagina = urlopen("https://www.pythonparatodos.com.br/formulario.html")
tree = html.fromstring(pagina.read())
tr = tree.xpath('//tr[2]')
print(tr)
for elemento in tr:
    print(etree.tostring(elemento))
def parseResponse(response, amazon_url, rev_index):
    p_resp = response
    reviews = p_resp.split('&&&')

    XPATH_ID = './/div[@data-hook="review"]'
    XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
    XPATH_TITLE = './/a[@data-hook="review-title"]//text()'
    XPATH_AUTHOR = './/a[@data-hook="review-author"]//text()'
    XPATH_AUTHOR_PROFILE = './/a[@data-hook="review-author"]'
    XPATH_POSTED_DATE = './/span[@data-hook="review-date"]//text()'
    XPATH_VERIFIED_PURCHASE = './/span[@data-hook="avp-badge"]//text()'
    XPATH_BODY = './/span[@data-hook="review-body"]//text()'
    XPATH_HELPFUL = './/span[@data-hook="helpful-vote-statement"]//text()'

    reviews_list = {}

    # the first 3 fields of the array can be discarded since they carry no review data
    for i in range(3, len(reviews)):
        rev_fields = reviews[i].split('\",\"')

        if len(rev_fields) == 3:
            # the 3rd field (rev_fields[2]) contains the html of the review
            review = rev_fields[2].replace("\\\"", "\"")

            is_amazon_vine = True if "Recensione Vine " in review else False

            parser = html.fromstring(review)

            # DATA COMPUTATION
            # if no review 'id' is found, this chunk does not hold a review and parsing can stop
            # (the trailing fields of the response contain no information)
            review_id = ""
            for elem in parser.xpath(XPATH_ID):
                review_id = elem.attrib['id']
            if review_id is "":
                break
            raw_review_rating = parser.xpath(XPATH_RATING)
            raw_review_header = parser.xpath(XPATH_TITLE)
            raw_review_author = parser.xpath(XPATH_AUTHOR)
            # retrieve the link to the review's author profile
            for elem in parser.xpath(XPATH_AUTHOR_PROFILE):
                raw_review_author_profile = elem.attrib['href']
            raw_review_posted_date = parser.xpath(XPATH_POSTED_DATE)
            raw_review_verified_purchase = parser.xpath(
                XPATH_VERIFIED_PURCHASE)
            raw_review_body = parser.xpath(XPATH_BODY)
            raw_review_helpful_vote = parser.xpath(XPATH_HELPFUL)

            # DATA COMPOSITION
            author_name = ' '.join(' '.join(
                raw_review_author).split()) if raw_review_author else ""
            author_profile = amazon_url + ''.join(
                raw_review_author_profile) if raw_review_author_profile else ""
            author_code = (author_profile.split("account.")[1]).split("/")[0]
            review_rating = ''.join(raw_review_rating).replace(
                ' su 5 stelle', '') if raw_review_rating else ""
            review_header = ' '.join(' '.join(
                raw_review_header).split()) if raw_review_header else ""
            review_text = ' '.join(
                ' '.join(raw_review_body).split()) if raw_review_body else ""
            review_verified_purchase = True if raw_review_verified_purchase else False
            review_posted_date = parseDate(
                raw_review_posted_date) if raw_review_posted_date else None
            try:
                review_helpful_vote = int([
                    x for x in (''.join(raw_review_helpful_vote).split(' '))
                    if x != ''
                ][1]) if raw_review_helpful_vote else 0
            except:
                review_helpful_vote = 1

            author_id, author_rank = ac.getCustomerId(author_code, amazon_url)
            if author_id is not None:
                author_helpful_votes, author_reviews_count = ac.getHelpfulVotesAndTotalReviewsCount(
                    author_id, amazon_url)
            else:
                author_helpful_votes, author_reviews_count = -1, -1
            latest_author_reviews = ac.getLatestCustomerReviewAndTextAnalisys(
                author_code, amazon_url)

            # set a sleeping time
            if execute_sleep:
                #sleep(random.uniform(2,4))
                sleep(2)

            review_summary = {
                'reviewId': review_id,
                'reviewLink': amazon_url + "/gp/customer-reviews/" + review_id,
                'reviewText': review_text,
                'reviewPostedDate': review_posted_date,
                'reviewHeader': review_header,
                'reviewRating': review_rating,
                'reviewAuthor': {
                    'name': author_name,
                    'profileLink': author_profile,
                    'code': author_code,
                    'id': author_id,
                    'rank': author_rank,
                    'helpfulVotes': author_helpful_votes,
                    'totalReviewsCount': author_reviews_count,
                    'latestReviews': latest_author_reviews
                },
                'reviewVerifiedPurchase': review_verified_purchase,
                'reviewHelpfulVote': review_helpful_vote,
                'isAmazonVineReviewer': is_amazon_vine
            }
            reviews_list[rev_index] = review_summary
            rev_index += 1

    return reviews_list
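
As the comments in parseResponse() note, the AJAX payload splits on '&&&' and only the third quote-comma-separated field of a chunk holds review HTML; a sketch against a synthetic payload (not a real Amazon response):

from lxml import html

# synthetic '&&&'-delimited payload with a single fake review in its third '","' field
fake_response = ('junk&&&junk&&&junk&&&'
                 '["append","#cm_cr-review_list","<div><div data-hook=\\"review\\" id=\\"R123\\">'
                 '<span data-hook=\\"review-body\\">Great product</span></div></div>')
for chunk in fake_response.split('&&&')[3:]:
    fields = chunk.split('","')
    if len(fields) == 3:
        review_html = fields[2].replace('\\"', '"')
        parser = html.fromstring(review_html)
        print(parser.xpath('.//div[@data-hook="review"]/@id'))            # ['R123']
        print(parser.xpath('.//span[@data-hook="review-body"]//text()'))  # ['Great product']
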
Example #37
    def parse_html(self, root, first_page=False):
        if random() > 0.8:
            if len(root.xpath("//div[@class='controls']/a/text()")):
                self.display.exit(self.display.api_error(" "))

        book_content = root.xpath("//div[@id='sbo-rt-content']")
        if not len(book_content):
            self.display.exit(
                "Parser: book content's corrupted or not present: %s (%s)" %
                (self.filename, self.chapter_title)
            )

        page_css = ""
        stylesheet_links = root.xpath("//link[@rel='stylesheet']")
        if len(stylesheet_links):
            stylesheet_count = 0
            for s in stylesheet_links:
                css_url = urljoin("https:", s.attrib["href"]) if s.attrib["href"][:2] == "//" \
                    else urljoin(self.base_url, s.attrib["href"])

                if css_url not in self.css:
                    self.css.append(css_url)
                    self.display.log("Crawler: found a new CSS at %s" % css_url)

                page_css += "<link href=\"Styles/Style{0:0>2}.css\" " \
                            "rel=\"stylesheet\" type=\"text/css\" />\n".format(stylesheet_count)
                stylesheet_count += 1

        stylesheets = root.xpath("//style")
        if len(stylesheets):
            for css in stylesheets:
                if "data-template" in css.attrib and len(css.attrib["data-template"]):
                    css.text = css.attrib["data-template"]
                    del css.attrib["data-template"]

                try:
                    page_css += html.tostring(css, method="xml", encoding='unicode') + "\n"

                except (html.etree.ParseError, html.etree.ParserError) as parsing_error:
                    self.display.error(parsing_error)
                    self.display.exit(
                        "Parser: error trying to parse one CSS found in this page: %s (%s)" %
                        (self.filename, self.chapter_title)
                    )

        # TODO: add all not covered tag for `link_replace` function
        svg_image_tags = root.xpath("//image")
        if len(svg_image_tags):
            for img in svg_image_tags:
                image_attr_href = [x for x in img.attrib.keys() if "href" in x]
                if len(image_attr_href):
                    svg_url = img.attrib.get(image_attr_href[0])
                    svg_root = img.getparent().getparent()
                    new_img = svg_root.makeelement("img")
                    new_img.attrib.update({"src": svg_url})
                    svg_root.remove(img.getparent())
                    svg_root.append(new_img)

        book_content = book_content[0]
        book_content.rewrite_links(self.link_replace)

        xhtml = None
        try:
            if first_page:
                is_cover = self.get_cover(book_content)
                if is_cover is not None:
                    page_css = "<style>" \
                               "body{display:table;position:absolute;margin:0!important;height:100%;width:100%;}" \
                               "#Cover{display:table-cell;vertical-align:middle;text-align:center;}" \
                               "img{height:90vh;margin-left:auto;margin-right:auto;}" \
                               "</style>"
                    cover_html = html.fromstring("<div id=\"Cover\"></div>")
                    cover_div = cover_html.xpath("//div")[0]
                    cover_img = cover_div.makeelement("img")
                    cover_img.attrib.update({"src": is_cover.attrib["src"]})
                    cover_div.append(cover_img)
                    book_content = cover_html

                    self.cover = is_cover.attrib["src"]

            xhtml = html.tostring(book_content, method="xml", encoding='unicode')

        except (html.etree.ParseError, html.etree.ParserError) as parsing_error:
            self.display.error(parsing_error)
            self.display.exit(
                "Parser: error trying to parse HTML of this page: %s (%s)" %
                (self.filename, self.chapter_title)
            )

        return page_css, xhtml
Example #38
SCOPES = 'https://www.googleapis.com/auth/calendar'

store = file.Storage('token.json')
creds = store.get()
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets(os.environ['CREDENTIALS_PATH'],
                                          SCOPES)
    creds = tools.run_flow(flow, store)
service = build('calendar', 'v3', http=creds.authorize(Http()))

# Call the Calendar API
now = datetime.datetime.utcnow().isoformat() + 'Z'  # 'Z' indicates UTC time

data = open('example.html', 'rb').read().decode("utf-8")
tree = html.fromstring(data)
table = tree.xpath('/html/body/table/tr[6]/td/table')[0]
for row in table.getchildren()[1:]:
    childrens = row.getchildren()
    if len(childrens) > 1:
        time_str = childrens[3].getchildren()[0].text.strip()
        date_str = childrens[4].getchildren()[0].text.strip()
        start, end = time_str.split('-')
        start = start.strip()
        end = end.strip()
        month, day, year = date_str.split('.')
        start_date = f'{year}-{day}-{month}T{start}:00+03:00'
        end_date = f'{year}-{day}-{month}T{end}:00+03:00'
        print(f'Meeting at {start_date} on {end_date}')

        event = {
Example #39
if file == "":
    file = "TestCSV.csv"

try:
    with open(file, newline='') as csvfile:
        puts(colored.yellow("lese Datei: " + file))

        logoreader = csv.DictReader(csvfile)
        for row in logoreader:
            r = requests.post(
                "http://www.vereinswappen.de/vereine.php?option=vereinssuchen",
                data={
                    'verein': row['verein'],
                    'suchen': 'Suchen'
                })
            tree = html.fromstring(r.content)
            #print(r.content)
            vereinswappen = tree.xpath(
                '//img[@style="max-width: 100px; width: 100px;"]/@src')
            i = 1
            for x in vereinswappen:
                if not os.path.exists('Wappen'):
                    os.makedirs('Wappen')
                filename = "Wappen/" + row['verein'] + str(i) + ".png"
                urllib.request.urlretrieve(x, filename)
                i += 1
            #print(vereinswappen)
            puts(colored.yellow('Lade Wappen ' + row['verein'] + '...'))
            print(
                emoji.emojize('Success! :white_check_mark:', use_aliases=True))
Example #40
0
def tree_from_url(url: str, decode: str=None):
    content = requests.get(url).content
    if decode:
        content = content.decode(decode)
    return html.fromstring(content)
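
# Usage sketch for tree_from_url (not part of the original snippet); the URL
# and XPath below are illustrative placeholders.
if __name__ == '__main__':
    tree = tree_from_url('https://example.com', decode='utf-8')
    print(tree.xpath('string(//title)'))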
Example #41
0
def fetch_rfc(number, force=False):

    url = 'https://tools.ietf.org/html/rfc%d' % number
    output_dir = 'data/%04d' % (number // 1000 % 10 * 1000)
    output_file = '%s/rfc%d.json' % (output_dir, number)

    # Exit if the output file already exists (unless the --force option is enabled)
    if not force and os.path.isfile(output_file):
        return 0

    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Fetch the DOM tree of the RFC page
    headers = {'User-agent': '', 'referer': url}
    page = requests.get(url, headers=headers)
    tree = html.fromstring(cleanhtml(page.content))

    # Extract the title
    title = tree.xpath('//title/text()')
    if len(title) == 0:
        raise RFCNotFound
    title = title[0]

    # Check whether the page actually exists
    content_h1 = tree.xpath('//div[@class="content"]/h1/text()')
    if len(content_h1) >= 1 and content_h1[0].startswith('Not found:'):
        raise RFCNotFound

    # Extract the text from the DOM tree
    contents = tree.xpath(
        '//pre/text() | '  # body text
        '//pre/a/text() | '  # links within the body text
        # section titles
        '//pre/span[@class="h1" or @class="h2" or @class="h3" or '
        '@class="h4" or @class="h5" or @class="h6"]//text() |'
        '//pre/span/a[@class="selflink"]/text() |'  # section numbers
        '//a[@class="invisible"]'  # page breaks
    )

    # Handle paragraphs that are split across a page break
    contents_len = len(contents)
    for i, content in enumerate(contents):
        # At a page break
        if (isinstance(content, html.HtmlElement)
                and content.get('class') == 'invisible'):

            contents[i - 1] = contents[i - 1].rstrip()  # strip trailing whitespace on the previous page
            contents[i + 0] = ''  # remove the page-break marker
            if i + 1 >= contents_len:
                continue
            contents[i + 1] = ''  # remove the extra newline
            if i + 2 >= contents_len:
                continue
            contents[i + 2] = ''  # remove the extra whitespace
            if i + 3 >= contents_len:
                continue
            if not isinstance(contents[i + 3], str):
                continue
            contents[i + 3] = contents[i + 3].lstrip('\n')  # strip leading newlines on the next page

            # Handle text that continues across the page break
            first, last = 0, -1
            prev_last_line = contents[i - 1].split('\n')[last]  # last line of the previous page
            next_first_line = contents[i + 3].split('\n')[first]  # first line of the next page
            indent1 = get_indent(prev_last_line)
            indent2 = get_indent(next_first_line)
            # print('newpage:', i)
            # print('  ', indent1, prev_last_line)
            # print('  ', indent2, next_first_line)

            # A paragraph is considered to span the page break when:
            #   1) the indentation of the last paragraph on the previous page matches that of the first paragraph on the next page, and
            #   2) the last paragraph on the previous page does not end with a sentence terminator such as "." or ";"
            if (not prev_last_line.endswith('.')
                    and not prev_last_line.endswith(';')
                    and re.match(r'^ *[a-zA-Z0-9(]', next_first_line)
                    and indent1 == indent2):
                # If the content spans the page break, insert BREAK.
                # BREAK is later replaced with a space for prose and with a newline for code.
                contents[i + 3] = BREAK + contents[i + 3]
            else:
                # If the content does not span the page break, insert a paragraph separator (two newlines)
                contents[i + 0] = '\n\n'

    # Remove the trailing page number
    contents[-1] = re.sub(r'.*\[Page \d+\]$', '',
                          contents[-1].rstrip()).rstrip()
    # Join all paragraphs (paragraph breaks are \n\n)
    text = ''.join(contents).strip()

    paragraphs = Paragraphs(text)

    # Convert the paragraph information to JSON
    obj = {
        'title': {
            'text': title
        },
        'number': number,
        'created_at': str(datetime.now(JST)),
        'updated_by': '',
        'contents': [],
    }
    for paragraph in paragraphs:
        obj['contents'].append({
            'indent': paragraph.indent,
            'text': paragraph.text,
        })
        if paragraph.is_section_title:
            obj['contents'][-1]['section_title'] = True
        if paragraph.is_code:
            obj['contents'][-1]['raw'] = True
        if paragraph.is_toc:
            obj['contents'][-1]['toc'] = True

    with open(output_file, 'w') as json_file:
        json.dump(obj, json_file, indent=2, ensure_ascii=False)
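
# The snippet above relies on helpers defined elsewhere in its project
# (cleanhtml, get_indent, Paragraphs, BREAK, JST, RFCNotFound). As a rough
# illustration only, get_indent is assumed to measure a line's leading
# whitespace:
def get_indent(line: str) -> int:
    """Return the number of leading space characters of a line (assumed helper)."""
    return len(line) - len(line.lstrip(' '))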
Example #42
0
    def __init__(self, args):
        self.args = args
        self.display = Display("info_%s.log" % escape(args.bookid))
        self.display.intro()

        self.cookies = {}
        self.jwt = {}

        if not args.cred:
            if not os.path.isfile(COOKIES_FILE):
                self.display.exit("Login: unable to find cookies file.\n"
                                  "    Please use the --cred option to perform the login.")

            self.cookies = json.load(open(COOKIES_FILE))

        else:
            self.display.info("Logging into Safari Books Online...", state=True)
            self.do_login(*args.cred)
            if not args.no_cookies:
                json.dump(self.cookies, open(COOKIES_FILE, "w"))

        self.book_id = args.bookid
        self.api_url = self.API_TEMPLATE.format(self.book_id)

        self.display.info("Retrieving book info...")
        self.book_info = self.get_book_info()
        self.display.book_info(self.book_info)

        self.display.info("Retrieving book chapters...")
        self.book_chapters = self.get_book_chapters()

        self.chapters_queue = self.book_chapters[:]

        if len(self.book_chapters) > sys.getrecursionlimit():
            sys.setrecursionlimit(len(self.book_chapters))

        self.book_title = self.book_info["title"]
        self.base_url = self.book_info["web_url"]

        self.clean_book_title = "".join(self.escape_dirname(self.book_title).split(",")[:2]) \
                                + " ({0})".format(self.book_id)

        books_dir = os.path.join(PATH, "Books")
        if not os.path.isdir(books_dir):
            os.mkdir(books_dir)

        self.BOOK_PATH = os.path.join(books_dir, self.clean_book_title)
        self.css_path = ""
        self.images_path = ""
        self.create_dirs()
        self.display.info("Output directory:\n    %s" % self.BOOK_PATH)

        self.chapter_title = ""
        self.filename = ""
        self.css = []
        self.images = []

        self.display.info("Downloading book contents... (%s chapters)" % len(self.book_chapters), state=True)
        self.BASE_HTML = self.BASE_01_HTML + (self.KINDLE_HTML if not args.no_kindle else "") + self.BASE_02_HTML

        self.cover = False
        self.get()
        if not self.cover:
            self.cover = self.get_default_cover()
            cover_html = self.parse_html(
                html.fromstring("<div id=\"sbo-rt-content\"><img src=\"Images/{0}\"></div>".format(self.cover)), True
            )

            self.book_chapters = [{
                "filename": "default_cover.xhtml",
                "title": "Cover"
            }] + self.book_chapters

            self.filename = self.book_chapters[0]["filename"]
            self.save_page_html(cover_html)

        self.css_done_queue = Queue(0) if "win" not in sys.platform else WinQueue()
        self.display.info("Downloading book CSSs... (%s files)" % len(self.css), state=True)
        self.collect_css()
        self.images_done_queue = Queue(0) if "win" not in sys.platform else WinQueue()
        self.display.info("Downloading book images... (%s files)" % len(self.images), state=True)
        self.collect_images()

        self.display.info("Creating EPUB file...", state=True)
        self.create_epub()

        if not args.no_cookies:
            json.dump(self.cookies, open(COOKIES_FILE, "w"))

        self.display.done(os.path.join(self.BOOK_PATH, self.book_id + ".epub"))
        self.display.unregister()

        if not self.display.in_error and not args.log:
            os.remove(self.display.log_file)

        sys.exit(0)
Example #43
0
def handler(event, context):
    url = "https://twitter.com/realDonaldTrump"
    response = requests.request("GET", url)
    tree = html.fromstring(response.content)
    vecTweets = tree.xpath('//div[@class="js-tweet-text-container"]//p')
    return vecTweets[0].text_content()
Example #44
0
def tree_from_html(url: str):
    page = requests.get(url)
    return html.fromstring(page.content)