Example #1
def interscience(url):
    '''downloads the PDF from sciencedirect given a link to an article'''

    url = str(url)
    buffer = StringIO()

    curl = pycurl.Curl()
    curl.setopt(curl.URL, url)
    curl.setopt(curl.WRITEFUNCTION, buffer.write)
    curl.setopt(curl.VERBOSE, 0)
    curl.setopt(curl.USERAGENT, user_agent)
    curl.setopt(curl.TIMEOUT, 20)
    curl.perform()
    curl.close()

    buffer = buffer.getvalue().strip()
    html = lxml.html.parse(StringIO(buffer))
    image = html.findall("//img[@name='pdf']")[0]
    link = image.getparent()
    pdf_href = link.attrib["href"]

    #now let's get the article title
    title_div = html.findall("//div[@class='articleTitle']")[0]
    paper_title = title_div.text
    paper_title = paper_title.replace("\n", "")
    if paper_title[-1] == " ": paper_title = paper_title[:-1]
    paper_title = re.sub(r'[^a-zA-Z0-9_\-.() ]+', '', paper_title)

    #now fetch the document for the user
    os.system(
        "wget --user-agent=\"pyscholar/blah\" --output-document=\"%s.pdf\" \"%s\""
        % (paper_title, pdf_href))
    print "\n\n"
Example #3
def get_realtime_title():
    """Get ALL category and source realtime news from chinatimes.
    The realtime url may change or become invalid when it is no longer *realtime*.

    return: dict{category, source, time, title, url}
    """
    
    response, content = h.request(news_list_url)

    html = lxml.html.fromstring(content.decode('big5', 'ignore'))
    html.make_links_absolute(base_url)

    # Get news-list section
    div = html.findall("*div")[1]

    # Get all title-info to list
    tr = list(div.iterdescendants("tr"))[1:]

    result_list = []
    for title_info in tr:
        news_url = list(title_info.iterlinks())[0][2]
        info_list = map(lambda x: x.text_content(), list(title_info))

        info_dict = {"title": info_list[0].strip("\r\n "), "time": info_list[1],
                     "category": info_list[2], "source": info_list[3],
                     "url": news_url}
    
        result_list.append(info_dict)
        
    return result_list
Example #4
    def get_semester_course_data(self, url, semester):
        """Inputs: url (str) to call, semester (str) the url retrieves data for."""
        print(f"Obtaining and indexing information for {semester}")
        html = self.call_url_and_get_html_object(url)
        tables = html.findall(
            ".//table[@class='courseListing basicTable courseListingSetWidths']"
        )

        # Parse html to get course offering data
        for table in tables:
            fields = table.findall(".//td")
            spans = table.findall(".//span")
            course_number = str(spans[1].text.strip())
            title = str(fields[4].text).strip()
            professor = str(fields[6].text).strip()
            status = str(fields[0].text)
            crn = str(fields[1].text)

            # Add course offering data to dictionary of course classes
            if course_number not in self.course_dict.keys():
                # If course doesn't already exist in dictionary keys, instantiate class of it
                self.course_dict[course_number] = Course(title=title,
                                                         semester=semester,
                                                         professor=professor,
                                                         crn=crn,
                                                         status=status)
            else:
                self.course_dict[course_number].add_instance_of_course(
                    semester, professor, crn, status)
Example #5
def compass(answers=None):
    answers = answers or def_answers.copy()
    questions = {}
    post_args = {}

    while post_args is not None:
        # Post previous responses, Get new questions (first post is empty, gets page 1)
        html_text = submit_page(post_args)
        html = lxml.html.fromstring(html_text)
        curr_questions = reap_questions(html)

        # If the test isn't done, prepare [post_args] for next page
        if len(curr_questions):
            # Verify test integrity
            if not all(item in def_questions.items()
                       for item in curr_questions.items()):
                raise RuntimeError(
                    "Questions have changed. Answer cache is bad!")
            questions.update(curr_questions)

            # Assemble responses
            post_args = {
                'answer_' + str(key): answers[key]
                for key in curr_questions
            }

            # Print responses
            for num in sorted(curr_questions):
                print(
                    str(num) + ":\t" + curr_questions[num] + "\n\t" +
                    values[int(answers[num])] + '\n')

            submit_tag = html.find(".//input[@type='submit']")
            post_args[
                "submit"] = submit_tag.value  # submit_tag.type == "submit"
            for tag in html.findall(".//input[@type='hidden']"):
                post_args[tag.name] = tag.value
            pageno = post_args["pageno"]
        else:
            post_args = None
            pageno = 'f'

        # with open('/Users/alex/Desktop/page' + pageno + ".html", "a+") as f:
        # f.write(html_text)

    h2 = html.find(".//h2")
    print(h2.text_content())

    lines = h2.text_content().split('\n')
    x = float(lines[0].split(":")[1])
    y = float(lines[1].split(":")[1])
    pyplot.scatter(x, y)
    pyplot.xlim(-10, 10)
    pyplot.ylim(-10, 10)
    pyplot.title("Political coordinates")
    pyplot.xlabel("Economic Left/Right")
    pyplot.ylabel("Social Libertarian/Authoritarian")
    pyplot.grid()
    pyplot.show()
    return questions
Example #6
def get_realtime_title(pages=5):
    """
    Get ALL Category Realtime news from libertytimes
    realtime url may change or invaild when it is not **realtime**
    
    get_realtime_title(pages=5, encoding="UTF-8")
    
    *pages*: get page 1 to pages, default is 5 pages
        
    return: dict{time, title, url}
    """

    result_list = []

    for page in xrange(1, pages + 1):
        response, content = h.request("%s&ipage=%d" % (news_list_url, page))
        html = lxml.html.fromstring(content.decode("utf-8", "ignore"))
        html.make_links_absolute(base_url)

        # Get news-list section
        div = html.findall("*div")[0]

        # Get all title-info to list
        tr = list(div.iterdescendants("tr"))[1:-1]

        for title_info in tr:
            news_url = list(title_info.iterlinks())[1][2]
            info_list = map(lambda x: x.text_content(), list(title_info))

            try:
                info_dict = {"title": info_list[0].strip("\r\n "), "time": info_list[1], "url": news_url}
            except IndexError:
                continue

            result_list.append(info_dict)

    return result_list
Example #7
def debate_of_term1(id):
    """Parse a debate transcript in term 1 format and return list of
    its paragraphs' text content."""
    # download the debate transcript or use a local fixed debate if there is one
    filename = os.path.join('fixed_debates', 'debate_%s.html' % id)
    if os.path.exists(filename):
        with open(filename, 'r') as f:
            content = f.read()
    else:
        url = 'http://www.nrsr.sk/dl/Browser/Document?documentId=%s' % id
        content = scrapeutils.download(url)
        if 'Unexpected error!' in content:
            raise RuntimeError("Debate with id '%s' does not exist" % id)

    # fix markup and parse to HTML tree
    content = content.replace('12. 9. 1995<o:p></o:p>', '12. septembra 1995')
    content = content.replace('<o:p></o:p>', '')
    html = lxml.html.fromstring(content)

    # extract paragraph texts, use blank line as paragraph separator
    result = []
    text = ''
    for par in html.findall('.//p'):
        line = scrapeutils.plaintext(par.text_content())
        if len(line) > 0 and not re.match(r'\w+ deň rokovania', line):
            text += '\n%s' % line
        else:
            if text:
                result.append(scrapeutils.clear_hyphens(text, '\n'))
            text = line
    result.append(scrapeutils.clear_hyphens(text, '\n'))

    return scrapeutils.plaintext(result)
Example #8
def mp(id, term):
    """Parse MP from his profile webpage."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/poslanec&PoslanecID=%s&CisObdobia=%s' % (id, term)
    content = scrapeutils.download(url)
    if 'Unexpected error!' in content:
        raise RuntimeError("MP with id '%s' does not exist in term '%s'" % (id, term))
    html = lxml.html.fromstring(content)

    result = {
        'id': str(id),
        'url': url
    }
    for div in html.findall('.//div[@class="mp_personal_data"]//div[strong]'):
        label = div.findtext('strong')
        value = div.find('span')
        result[label.lower()] = value.text_content() if value is not None else ''

    image_url = html.find('.//div[@class="mp_foto"]/img').get('src')
    image = requests.get(image_url).content
    with open(os.path.join(BASE_DIR, 'dummy-image.jpg'), 'rb') as f:
        dummy_image = f.read()
    result['fotka'] = image_url if image != dummy_image else ''

    result['členstvo'] = []
    ul = html.find('.//span[@id="_sectionLayoutContainer_ctl01_ctlClenstvoLabel"]').getparent().getnext()
    for li in ul.findall('li'):
        m = re.search(r'(.*?)\s*\((.*?)\)', li.text)
        result['členstvo'].append({'meno': m.group(1), 'rola': m.group(2)})

    return scrapeutils.plaintext(result)
Example #9
def resolve_ean(ean):
    page = requests.get(SEARCH_URL.format(ean))
    html = lxml.html.document_fromstring(page.text)

    #Jump further
    further_url = "http://www.rebuy.de/" + html.find('.//a[@class="productConversion"]').attrib["href"]
    
    page = requests.get(further_url)
    html = lxml.html.document_fromstring(page.text)
    result = dict()
    result["title"] = html.find('.//h1/span[@class="loud"]').text_content()
    result["type"] = TYPE_TRANSLATE[html.xpath('.//p[contains(@class, "category-icon")]')[0].text_content()]
    result["imgurl"] = html.find(".//img[@id='cover']").attrib["src"] 

    attribs = dict()

    for i in html.findall(".//ul[@id='main-info-facts']/li"):
        name, sep, val = i.text_content().strip().partition(":")
        attribs[name] = val

    result["created"] = defNone(attribs.get("Erscheinungsdatum"), lambda x: toDBDate(x.strip(), "%d.%m.%Y"))
    result["author"] = None
    result["artists"] = None
    result["description"] = None
    result["duration"] = None
    
    return result
Example #10
 def _parse_departures(self, html):
     ns = html.get('xmlns', '')
     rows = html.findall(
         './/*[@id="dnn_ctr1608_ModuleContent"]//{%(ns)s}table'
         '//{%(ns)s}table//{%(ns)s}tr' % ({
             'ns': ns
         }))[1:-1]
     return [Departure(self, row) for row in rows]
Example #11
def clean_up_html(html, method='html'):
    html = autolink_html(html, link_regexes=_link_regexes)
    html = lxml.html.fromstring(cleaner.clean_html(html))
    for h1 in html.findall('h1'):
        h1.tag = 'h2'
    for a in html.cssselect('a'):
        a.attrib['target'] = '_blank'
    return lxml.html.tostring(html, encoding='utf-8', method=method)
Example #12
    def handle(self, **options):
        self.stdout.write('Loading file %s...' % options['html_file'])
        self.stdout.write('Save %s...' % options['save'])
        save = options['save']
        #save = True
        html = lxml.html.parse(options['html_file'])
        tables = html.findall(".//table")
        self.stdout.write('nb tables %s' % len(tables))
        for table in html.iterfind(".//table"):
            self.error_warning=0

            # First child must be a caption
            self.caption = table[0]

            # Next row: parse brand name, url, and date
            row = self.caption.getnext()
            if row[0].get('colspan') != '5':
                self.stdout.write('\tNot a brand table')
                continue
            if row[1].get('colspan') != '5':
                self.stdout.write('\tNot a brand table')
                continue

            # Parse Brand
            self.parseBrand(row[1])
            brand_note = None
            new_company = None
            if self.brand_note != []:
                brand_note = Note(note = '. '.join(self.brand_note))
                self.stdout.write("\tNote: %s" % brand_note.note)
                if save:
                    brand_note.save()
            if save:
                new_company = Company(  name=self.caption.text, 
                                        validation_date = self.date,
                                        note = brand_note,
                                        certification = ' / '.join(self.certification) )
                new_company.save()
            #new_company = Company.objects.get( name=self.caption.text )
            for name in self.brand_name.split('/'):
                if save:
                    brand = Brand(name=name.strip(), company=new_company)
                    brand.save()
            for url in self.url:
                url = url.strip('/')
                if save:
                    new_site = Site(domain=url, company=new_company)
                    new_site.save()

            # Next row must be for table header
            row = row.getnext()
            text = row.find("td").xpath("string()")
            if text != "Description":
                self.stdout.write(red+'Header table first column %s is not Description' % text+reset)

            # Parse products
            self.parseProducts(row, new_company, options['save'])
Example #13
 def get_string_ids(self):
     """docstrig for get_string_ids"""
     print "Check available items .."
     ids = []
     data = self.fetch_url(self.string_list_url)
     html = lxml.html.fromstring(data)
     for i in html.findall('li'):
         ids.append(i.get('id').replace('pstring_', ''))
     return ids
Example #15
def resolve_ean(ean):
    page = requests.get(SEARCH_URL.format(ean=ean))
    
    #Check if something was found
    if "Ihre Suche ergab leider keine Treffer" in page.text:
        return None

    html = lxml.html.document_fromstring(page.text)
    result = defaultdict()

    transform = list()

    #Check media type
    result["type"] = html.find('.//span[@class="noCategory"]').text_content().strip()

    resolve_author = lambda: defNone(html.find('.//span[@class="oAuthorLinked"]'), lambda x: x.text_content()) 
    if result["type"].startswith("Buch"):
        result["type"] = "book"
        result["author"] = resolve_author()
        result["artists"] = None
    elif result["type"] == "Hörbuch":
        result["type"] = "audiobook"
        result["author"] = resolve_author()
        result["artists"] = None
    else:
        result["type"] = "movie"
        result["artists"] = [elm.text for elm in html.findall('.//span[@class="oAuthorLinked"]/a')]
        result["author"] = None

    #Extract simple attributes from the head of the page
    result["title"] = html.find('.//span[@class="oProductTitle"]').text.strip()
    result["imgurl"] = html.find('.//img[@id="elevateZoom"]').attrib["src"]

    result["description"] = defNone(html.find('.//dd[@class="cTypeBeschreibung"]'), lambda x: x.text_content().strip())

    #Extract attributes of the dd/dt Table next to the article picture
    attr_container = html.find('.//dl[@class="dlCols30_70"]')

    attr_list = dict()
    for elm in attr_container.getchildren():
        if elm.tag == "dt":
            curName = elm.text.strip()
        if elm.tag == "dd":
            attr_list[curName] = elm.text_content().strip()

    result["duration"] = defNone(attr_list.get("Spieldauer"), lambda x:int(x.replace("Minuten", "")))

    result["studio"] = attr_list.get("Studio")
    result["genre"] = attr_list.get("Genre") 
    import locale
    oldlocale = locale.getlocale(locale.LC_TIME)
    locale.setlocale(locale.LC_TIME, "de_DE.utf8")
    result["created"] = defNone(attr_list.get("Erscheinungsdatum"), lambda x: interpDate(x))
    locale.setlocale(locale.LC_TIME, oldlocale)

    return result 
Example #16
 def _parse_stations(self, html):
     options = html.findall(
         './/*select[@datevaluefield="StationName"]/option')
     stations = []
     for option in options[1:]:
         data = urlencode({'stationname': option.get('value')})
         name = '%s' % (option.get('value')).rsplit(' Stn', 1)[0]
         url = '%s?%s' % (self.url, data)
         stations += [Station(name, url)]
     return stations
Example #17
def get_string(html_text):
    html = lxml.html.fromstring(html_text)
    remove_tags = ('.//style', './/script', './/noscript')
    for remove_tag in remove_tags:
        for tag in html.findall(remove_tag):
            tag.drop_tree()
            # deletions here are reflected in the original parsed tree

    codeframe_list = []
    lang_list = []
    # remove code blocks
    for tag in html.findall(".//div[@class='code-frame']"):
        codeframe_list.append(tag.text_content())
        lang_list.append(tag.attrib["data-lang"])
        tag.drop_tree()

    atext_list = []
    ahref_list = []
    # remove href links
    for tag in html.cssselect('a'):
        if tag.text is not None:
            atext_list.append(tag.text)
        if tag.get('href') is not None:
            ahref_list.append(tag.get('href'))
        tag.drop_tree()

    code_list = []
    # remove inline code
    for cc in html.cssselect('code'):
        if cc.text is not None:
            code_list.append(cc.text)
        cc.drop_tree()

    text = html.text_content().strip('\n')

    return pd.Series(
        [
            "".join(text.split('\n')), ",".join(codeframe_list),
            ",".join(lang_list), ",".join(code_list), ",".join(atext_list),
            ",".join(ahref_list)
        ],
        index=['text', 'code-frame', 'lang', 'code', 'a-text', 'a-href'])
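Since get_string returns a pandas Series indexed by 'text', 'code-frame', 'lang', 'code', 'a-text' and 'a-href', it can be expanded column-wise over a DataFrame of raw HTML. A minimal sketch, assuming a hypothetical DataFrame with an 'html' column:

import pandas as pd

# Hypothetical input; the DataFrame and its 'html' column are assumptions.
df = pd.DataFrame({"html": [
    "<p>hello <code>x = 1</code> <a href='http://example.com'>link</a></p>",
]})

# Applying get_string row-wise expands each returned Series into columns.
parsed = df["html"].apply(get_string)
print(parsed[["text", "code", "a-href"]])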
Example #18
 def _parse_stations(self, html):
     ns = html.get('xmlns', '')
     options = html.findall(
         './/*[@id="EntryForm"]//{%(ns)s}select/{%(ns)s}option' %({'ns':ns}))
     stations = []
     for option in options:
         data = urlencode({'stationname': option.get('value')})
         name = '%s' %(option.get('value')).rsplit(' Stn', 1)[0]
         url = '%s?%s' %(self.url, data)
         stations += [Station(name, url)]
     return stations
Example #19
    def parse_html(self, url):
        page = url.split('articles/')[-1]
        if self.base_path.joinpath(page).exists():
            html = lxml.html.parse(page)
            logging.info('HTML page `{}` exists, and parses.'.format(url))

            # Dateline is in the first p, unless that is an image, then it is in the third.
            dateline = html.find('.//{*}p')
            if dateline.text is None:
                dateline = html.findall('.//{*}p')[2]
            if 'BLACKSBURG, Va.' in dateline.text:
                self.spatial_coverage = 'Blacksburg, Va.'
            else:
                date_issued = self.date_issued.strftime(', %b')
                self.spatial_coverage = dateline.text.split(date_issued)[0].title()
            if len(self.spatial_coverage) > 25 or '\n' in self.spatial_coverage or ' ' == self.spatial_coverage:
                # Sanity check: These are symptoms of errors. Change them to Blacksburg.
                self.spatial_coverage = 'Blacksburg, Va.'
            logging.debug('Spatial Coverage: {}'.format(self.spatial_coverage))

            # Author is in the first li of the last ul, or the one before that, if it exists.
            html_lists = html.findall('.//{*}ul')
            author = html_lists[-1].find('./{*}li').text
            if author is None:
                try:
                    author = html_lists[-2].find('./{*}li').text
                except IndexError as e:
                    logging.error('No author found.')
            if author is not None:
                author = ' '.join(author.split())
            self.author = author
            logging.debug('Author: {}'.format(self.author))

            # Any img tag is a related file.
            for image in html.iterfind('.//{*}img'):
                self.image_urls.add(image.get('src'))
            if len(self.image_urls) > 0:
                logging.debug('All image urls: {}'.format(self.image_urls))
        else:
            logging.error('Url `{}` does not map to an HTML file in the archive.'.format(url))
            self.error_urls.add(url)
Example #20
 def _parse_stations(self, html):
     select = [
         select for select
         in html.findall('.//*div[@id="divTrainLineStationOption"]//select')
         if select.get('name').endswith('TrainStation')][0]
     stations = []
     for option in select.findall('option')[1:]:
         data = urlencode({'stationname': option.get('value')})
         name = '%s' % (option.get('value')).rsplit(' Stn', 1)[0]
         url = '%s?%s' % (self.url, data)
         stations += [Station(name, url)]
     return sorted(stations)
Example #21
def compass():
	answers = def_answers.copy()
	questions = {}
	post_args = {}

	while post_args is not None:
		# Post previous responses, Get new questions (first post is empty, gets page 1)
		html_text = submit_page(post_args)
		html = lxml.html.fromstring(html_text)
		curr_questions = reap_questions(html)

		# If the test isn't done, prepare [post_args] for next page
		if len(curr_questions):
			# Verify test integrity
			if not all(item in def_questions.items() for item in curr_questions.items()):
				raise RuntimeError("Questions have changed. Answer cache is bad!")
			questions.update(curr_questions)

			# Assemble responses
			post_args = {'answer_' + str(key): answers[key] for key in curr_questions}

			# Print responses
			for num in sorted(curr_questions):
				print(str(num) + ":\t" + curr_questions[num] + "\n\t" + values[int(answers[num])] + '\n')

			submit_tag = html.find(".//input[@type='submit']")
			post_args["submit"] = submit_tag.value  # submit_tag.type == "submit"
			for tag in html.findall(".//input[@type='hidden']"):
				post_args[tag.name] = tag.value
			pageno = post_args["pageno"]
		else:
			post_args = None
			pageno = 'f'

		# with open('/Users/alex/Desktop/page' + pageno + ".html", "a+") as f:
			# f.write(html_text)

	h2 = html.find(".//h2")
	print(h2.text_content())

	lines = h2.text_content().split('\n')
	x = float(lines[0][-6:])
	y = float(lines[1][-6:])
	pyplot.scatter(x, y)
	pyplot.xlim(-10, 10)
	pyplot.ylim(-10, 10)
	pyplot.title("Political coordinates")
	pyplot.xlabel("Economic Left/Right")
	pyplot.ylabel("Social Libertarian/Authoritarian")
	pyplot.grid()
	pyplot.show()
	return questions
Example #22
def get_realtime_title(pages=5, encoding="UTF-8"):
    """
    Get ALL Category Realtime news from appledaily
    realtime url may change or invaild when it is not **realtime**
    
    get_realtime_title(pages=5, encoding="UTF-8")
    
    *pages*: get page 1 to pages, default is 5 pages
    
    *encoding*: html text encoding
    
    return: dict{time, title, url}
    
    """

    result_list = []

    for page in xrange(1, pages + 1):
        response, content = h.request("%s/index/type/apple/page/%d" % (news_list_url, page))
        html = lxml.html.fromstring(content.decode("utf-8", "ignore"))
        html.make_links_absolute(base_url)

        # Get news-list section
        div = html.findall("*div")[0]

        # Get all title-info to list
        li = list(div.iterdescendants("li"))[10:-29]

        for title_info in li:
            news_url = list(title_info.iterlinks())[0][2]
            info_list = map(lambda x: x.text_content().encode(encoding), list(title_info))
            # info_list = info_list[0].strip("\r\n ").replace("\n", "")

            # time = info_list[:5]
            # category = info_list[5:11]
            # title = info_list[11:].strip("\r\n ")
            # title = title[: title.rfind("(") - 1]

            arr = [i.strip() for i in info_list[0].split("\n")]
            time = arr[1][:5]
            category = arr[1][5:]
            title = arr[2]

            try:
                info_dict = {"title": title, "time": time, "category": category, "url": news_url}
            except IndexError:
                continue

            result_list.append(info_dict)

    return result_list
Example #23
 def get_data_urls_and_terms(self, subject):
     """Return list of urls to obtain data from available previous semesters"""
     print("Finding terms with course information available")
     html = self.call_url_and_get_html_object(
         "https://my.gwu.edu/mod/pws/")  # Call home page url
     term_elements = html.findall(".//div[@class='tableHeaderFont']"
                                  )  # Obtain list of available terms
     terms = [term.text.lower().strip() for term in term_elements]
     term_urls = []
     for term in terms:
         num_term = translate_term_to_numerical(term)
         self.terms.append(int(num_term))
         term_urls.append((self.get_url_from_term_string(num_term,
                                                         subject), term))
     return term_urls
Example #24
def read():
    import requests
    import lxml.html

    # Specify the website URL
    url = "https://status.aws.amazon.com/"

    # Fetch the web page with requests
    r = requests.get(url)

    # Parse the web page with lxml
    html = lxml.html.fromstring(r.text)

    # Use lxml's findall to get the headline titles
    elems = html.findall(".//td")

    for elem in elems:
        print(elem.text)
Example #25
def grab_cloudflare(url):
    sess = requests.Session()
    sess.headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0"}
    safe_eval = lambda s: eval(s, {"__builtins__": {}}) if "#" not in s and "__" not in s else ""
    page = sess.get(url).content
    
    if "a = $('#jschl_answer');" in page:
        # Cloudflare anti-bots is on
        html = lxml.html.fromstring(page)
        challenge = html.find(".//input[@name='jschl_vc']").attrib["value"]
        script = html.findall(".//script")[-1].text_content()
        domain = url.split("/")[2]
        math = re.search(r"a\.val\((\d.+?)\)", script).group(1)
        
        answer = str(safe_eval(math) + len(domain))
        data = {"act": "jschl", "jschl_vc": challenge, "jschl_answer": answer}
        return sess.post(url, data).content
    else:
        return page
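A short usage sketch for grab_cloudflare, feeding whatever it returns into the same lxml parsing used throughout these examples; the target URL is a placeholder assumption.

import lxml.html

# Placeholder URL; grab_cloudflare returns the page body whether or not the
# anti-bot challenge was triggered.
page = grab_cloudflare("http://example.com/protected-page")
html = lxml.html.fromstring(page)
for link in html.findall(".//a"):
    print(link.get("href"))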
Example #26
def mp_list(term=None):
    """Parse list of MPs."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)
    term = term or max(terms.keys())

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/zoznam_abc&ListType=0&CisObdobia=%s' % term
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    result = {
        'url': url,
        '_items': [{
            'id': re.search(r'PoslanecID=(\d+)', mp.get('href')).group(1),
            'meno': mp.text,
        } for mp in html.findall('.//div[@class="mps_list"]//li/a')]
    }

    return scrapeutils.plaintext(result)
Example #27
def get_queues(html):
    all_items = html.findall('.//div[@class="quote"]')
    for item in all_items:

        all_text = item.findall('./span[@class="text"]')
        for txt in all_text:
            print(txt.text_content())

        all_authors = item.findall('./span/small[@class="author"]')
        for txt in all_authors:
            print('author:', txt.text_content())

        all_urls = item.findall('./span/a')
        for txt in all_urls:
            print('url:', txt.text_content(), txt.attrib['href'])

        all_tags = item.findall('./div/a')
        for txt in all_tags:
            print('tag:', txt.text_content(), txt.attrib['href'])

        print('---')
Example #28
def debate_of_terms234(id):
    """Parse a debate transcript in terms 2-4 format and return list of
    its paragraphs' text content."""
    # download RTF file or use a local fixed debate if there is one
    filename = os.path.join('fixed_debates', 'debate_%s.rtf' % id)
    if not os.path.exists(filename):
        url = 'http://www.nrsr.sk/dl/Browser/Document?documentId=%s' % id
        rtf = scrapeutils.download(url)
        filename = os.path.join(scrapeutils.WEBCACHE_PATH, 'debate_%s.rtf' % id)
        with open(filename, 'w') as f:
            f.write(rtf)

    # convert from RTF to HTML with unoconv (LibreOffice)
    content = subprocess.check_output(['unoconv', '-f', 'html', '--stdout', filename])
    html = lxml.html.fromstring(content)

    result = []
    for par in html.findall('./body/p'):
        result.append(par.text_content())

    return scrapeutils.plaintext(result)
Example #29
def get_queues(html):
    all_items = html.findall('.//div[@class="quote"]')
    for item in all_items:
    
        all_text = item.findall('./span[@class="text"]')
        for txt in all_text:
            print(txt.text_content())
    
        all_authors = item.findall('./span/small[@class="author"]')
        for txt in all_authors:
            print('author:', txt.text_content())
    
        all_urls = item.findall('./span/a')
        for txt in all_urls:
            print('url:', txt.text_content(), txt.attrib['href'])
    
        all_tags = item.findall('./div/a')
        for txt in all_tags:
            print('tag:', txt.text_content(), txt.attrib['href'])
    
        print('---')
Example #30
def get_realtime_title(encoding="UTF-8"):
    """
    Get ALL Category Realtime news from udn news
    realtime url may change or invaild when it is not **realtime**
    
    get_realtime_title(encoding="UTF-8")
    
    *encoding*: html text encoding
    
    return: dict{category, time, title, url}
    """
    
    response, content = h.request(news_list_url)
    
    html = lxml.html.fromstring(content.decode('big5', 'ignore'))
    html.make_links_absolute(base_url)

    # Get news-list section
    table = html.findall("*table")[0]
    
    # Get all title-info to list
    tr = list(table.iterdescendants("tr"))[13: -3]
    
    result_list = []
    for title_info in tr:
        news_url = list(title_info.iterlinks())
        if not news_url or "gif" in news_url[0][2]:
            continue
        
        news_url = news_url[0][2]
        info_list = map(lambda x: x.text_content().encode(encoding), list(title_info))
        
        try:
            info_dict = {"title": info_list[1].strip("\r\n "), "time": info_list[0],
                         "category": info_list[2], "url": news_url}
        except IndexError:
            continue

        result_list.append(info_dict)

    return result_list
Example #31
def resolve_ean(ean):
    page = requests.get(SEARCH_URL.format(ean))
    html = lxml.html.document_fromstring(page.text)
    
    result = dict()
    title_elm = html.find(".//span[@itemprop='name']")

    #When the title is not found on the page, the product seems to be in the unsorted section of geizhals...
    if title_elm is None:
        return None

    result["title"] = title_elm.text_content()
    result["genre"] = html.find(".//li[@class='ghnavhi']").text_content()
    description = html.find(".//div[@id='gh_proddesc']").text_content()
    result["firstrelease"] = defNone(re.search("Ersterscheinung: (\d+)", description), lambda x: x.group(1))

    for i in html.findall(".//a[@class='revlink']"):
        if "imdb" in i.attrib["href"]:
            result["imdb_link"] = i.attrib["href"]
            break

    return result
Example #32
def grab_cloudflare(url, *args, **kwargs):
    sess = requests.Session()
    sess.headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0"}
    safe_eval = lambda s: eval(s, {"__builtins__": {}}) if "#" not in s and "__" not in s else ""
    page = sess.get(url, *args, **kwargs)

    if "a = document.getElementById('jschl-answer');" in page.content:
        logger.info("Encountered CloudFlare anti-bot wall")
        # Cloudflare anti-bots is on
        html = lxml.html.fromstring(page.content)
        challenge = html.find(".//input[@name='jschl_vc']").attrib["value"]
        script = html.findall(".//script")[-1].text_content()
        domain_parts = url.split("/")
        domain = domain_parts[2]
        math = re.search(r"a\.value = (\d.+?);", script).group(1)

        answer = str(safe_eval(math) + len(domain))
        data = {"jschl_vc": challenge, "jschl_answer": answer}
        get_url = domain_parts[0] + '//' + domain + "/cdn-cgi/l/chk_jschl"
        return sess.get(get_url, params=data, headers={'referer': url}, *args, **kwargs)
    else:
        return page
Example #33
def old_debates_list(term):
    """Parse list of debates for the given term of office from NRSR
    Digital Library.
    Appropriate for older terms (1.-4.) where debates are not split
    by speaker."""
    if term not in ['1', '2', '3', '4']:
        raise ValueError("Old style transcripts are not available for term '%s'" % term)

    base_url = 'http://www.nrsr.sk/dl/Browser/Grid?nodeType=DocType&legId=13&chamberId=0' + \
            '&categoryId=1&committeeId=0&documentTypeId=5&folderId=0&meetingNr=' + \
            '&termNr=%s' % term
    result = {
        'url': base_url,
        '_items': []
    }
    page = 0
    while True:
        url = base_url + '&pageIndex=%s' % page
        content = scrapeutils.download(url)
        html = lxml.html.fromstring(content)

        # extract all debates from the current page
        for tr in html.findall('.//table[@class="resultTable"]//tr'):
            sequence_number = tr.findtext('td[1]/a')
            title = tr.find('td[2]/a')
            doc_id = re.search(r'documentId=(\d+)', title.get('href'))
            debate = {
                'časť': sequence_number,
                'názov': title.text,
                'url': 'http://www.nrsr.sk' + title.get('href'),
                'id': doc_id.group(1)
            }
            result['_items'].append(debate)

        page += 1
        pages = html.findtext('.//div[@class="pager"]/span[last()]')
        if page >= int(pages): break

    return scrapeutils.plaintext(result)
Example #34
def session_list(term=None):
    """Parse list of sessions in one term of office of the parliament."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    url = 'http://www.nrsr.sk/web/default.aspx?sid=schodze/hlasovanie/schodze'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    # scraping for older terms requires another POST request to emulate selectbox choice
    if term:
        data = {
            '_sectionLayoutContainer$ctl01$_termsCombo': term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s' % term
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    # pick list items
    result = {
        'url': url,
        '_items': []
    }
    for li in html.findall('.//div[@id="_sectionLayoutContainer__panelContent"]//ul//li'):
        a = li.find('a')
        link = a.get('href')
        session = {
            'číslo': re.search(r'CisSchodze=(\d+)', link).group(1),
            'názov': a.text,
            'trvanie': re.search(r'\((.+?)\)', li.text_content()).group(1),
            'url': 'http://www.nrsr.sk/web/' + link,
        }
        result['_items'].append(session)

    return scrapeutils.plaintext(result)
Example #35
def deputy_speakers():
    """Parse current deputy speakers (podpredsedovia) of the chamber."""
    url = 'http://www.nrsr.sk/web/default.aspx?sid=podpredsedovia'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    result = []
    for div in html.findall(".//div[@class='vicechairman_bigbox']"):
        name = div.find('.//a')
        link = name.get('href')
        id = re.search(r'PoslanecID=(\d+)', link)
        description = div.find(".//div[@class='vicechairman_description']")

        result.append({
            'fotka': 'http://www.nrsr.sk/web/' + div.find('.//img').get('src'),
            'meno': name.text,
            'url': 'http://www.nrsr.sk/web/' + link,
            'id': id.group(1),
            'kandidoval(a) za': description.find('div[1]/strong').tail,
            'narodený(á):': description.find('div[2]/strong').tail,
            'národnosť': description.find('div[3]/strong').tail,
        })

    return scrapeutils.plaintext(result)
Example #36
def reap_questions(html):
    questions = {}
    for tag in html.findall(".//label[1]/input[@type='radio']"):
        num = int(tag.name.split('_')[-1])
        questions[num] = tag.find("....../td[1]").text_content()
    return questions
Example #37
    def format_body(self, html):
        body = html.findall('body')
        body_list = []
        footer =  self.format_footer(body[-1].getchildren())
        for b in body[:-1]:
            body_list.append(etree.tostring(b).replace('\t', '').replace('\n',''))
        html_body ='''
        <script type="text/javascript">

        var indexer = 0;
        var aryTest = %s ;
        function nextData()
            {
            if(indexer < aryTest.length -1)
                {
                indexer += 1;
                document.forms[0].prev.disabled = false;
                document.getElementById("openerp_data").innerHTML=aryTest[indexer];
                document.getElementById("counter").innerHTML= indexer + 1 + ' / ' + aryTest.length;
                }
            else
               {
                document.forms[0].next.disabled = true;
               }
            }
        function prevData()
            {
            if (indexer > 0)
                {
                indexer -= 1;
                document.forms[0].next.disabled = false;
                document.getElementById("openerp_data").innerHTML=aryTest[indexer];
                document.getElementById("counter").innerHTML=  indexer + 1 + ' / ' + aryTest.length;
                }
            else
               {
                document.forms[0].prev.disabled = true;
               }
            }
    </script>
    </head>
    <body>
        <div id="openerp_data">
            %s
        </div>
        <div>
        %s
        </div>
        <br>
        <form>
            <table>
                <tr>
                    <td align="left">
                        <input name = "prev" type="button" value="Previous" onclick="prevData();">
                    </td>
                    <td>
                        <div id = "counter">%s / %s</div>
                    </td>
                    <td align="right">
                        <input name = "next" type="button" value="Next" onclick="nextData();">
                    </td>
                </tr>
            </table>
        </form>
    </body></html>'''%(body_list,body_list[0],footer,'1',len(body_list))
        return html_body
Example #38
 def format_header(self, html):
     head = html.findall('head')
     header = ''
     for node in head:
         header += etree.tostring(node)
     return header
Example #39
    def _render(self, data, template_name=None):
        """Render output of view function to HTML.

        :param data: Data dictionary from view function
        :param template_name: Name of template file
        :return: Rendered HTML
        """

        nested = template_name is None
        template_name = template_name or self.template_name

        if nested and self.detect_render_nested:
            try:
                renderer = self.detect_renderer(None, template_name)
            except KeyError:
                renderer = self.renderer
        else:
            renderer = self.renderer

        # Catch errors and return appropriate debug divs
        # todo: add debug parameter
        try:
            rendered = renderer(self.template_dir, template_name, data)
        except IOError:
            return '<div>Template {} not found.</div>'.format(template_name)

        html = lxml.html.fragment_fromstring(rendered, create_parent='remove')

        for element in html.findall('.//*[@mod-meta]'):

            # Render nested template
            template_rendered, is_replace = self.render_element(element, data)

            original = lxml.html.tostring(element)
            if is_replace:
                replacement = template_rendered
            else:
                replacement = original
                replacement = replacement.replace(
                    '><', '>' + template_rendered + '<')

            rendered = rendered.replace(original, replacement)

        ## Parse HTML using html5lib; lxml is too strict and e.g. throws
        ## errors if missing parent container; htmlparser mangles whitespace
        ## and breaks replacement
        #parsed = BeautifulSoup(rendered, 'html5lib')
        #subtemplates = parsed.find_all(
        #    lambda tag: tag.has_attr('mod-meta')
        #)
        #
        #for element in subtemplates:
        #
        #    # Extract HTML of original element
        #    element_html = str(element)
        #
        #    # Render nested template
        #    template_rendered, is_replace = self.render_element(element, data)
        #
        #    # Build replacement
        #    if is_replace:
        #        replacement = template_rendered
        #    else:
        #        element.string = template_rendered
        #        replacement = str(element)
        #
        #    # Replace
        #    rendered = rendered.replace(element_html, replacement)

        return rendered
Example #40
    def _render(self, data, template_name=None):
        """Render output of view function to HTML.

        :param data: Data dictionary from view function
        :param template_name: Name of template file
        :return: Rendered HTML
        """

        nested = template_name is None
        template_name = template_name or self.template_name

        if nested and self.detect_render_nested:
            try:
                renderer = self.detect_renderer(None, template_name)
            except KeyError:
                renderer = self.renderer
        else:
            renderer = self.renderer

        # Catch errors and return appropriate debug divs
        # todo: add debug parameter
        try:
            # TODO: Seems like Jinja2 and handlebars renderers would not work with this call sig
            rendered = renderer(self.template_dir, template_name, data, trust=self.trust)
        except IOError:
            return '<div>Template {} not found.</div>'.format(template_name)

        html = lxml.html.fragment_fromstring(rendered, create_parent='remove')

        for element in html.findall('.//*[@mod-meta]'):

            # Render nested template
            template_rendered, is_replace = self.render_element(element, data)

            original = lxml.html.tostring(element)
            if is_replace:
                replacement = template_rendered
            else:
                replacement = original
                replacement = replacement.replace('><', '>' + template_rendered + '<')

            rendered = rendered.replace(original, replacement)

        ## Parse HTML using html5lib; lxml is too strict and e.g. throws
        ## errors if missing parent container; htmlparser mangles whitespace
        ## and breaks replacement
        #parsed = BeautifulSoup(rendered, 'html5lib')
        #subtemplates = parsed.find_all(
        #    lambda tag: tag.has_attr('mod-meta')
        #)
        #
        #for element in subtemplates:
        #
        #    # Extract HTML of original element
        #    element_html = str(element)
        #
        #    # Render nested template
        #    template_rendered, is_replace = self.render_element(element, data)
        #
        #    # Build replacement
        #    if is_replace:
        #        replacement = template_rendered
        #    else:
        #        element.string = template_rendered
        #        replacement = str(element)
        #
        #    # Replace
        #    rendered = rendered.replace(element_html, replacement)

        return rendered
Example #41
def group_list(type, term=None):
    """Parse list of groups of a given type (committee, parliamentary group, delegation, friendship group)."""
    types = {
        'committee': {
            'url': 'http://www.nrsr.sk/web/default.aspx?SectionId=77',
            'term_param_name': '_sectionLayoutContainer$ctl02$_currentTerm',
        },
        'parliamentary group': {
            'url': 'http://www.nrsr.sk/web/default.aspx?SectionId=69',
            'term_param_name': '_sectionLayoutContainer$ctl02$_currentTerm',
        },
        'delegation': {
            'url': 'http://www.nrsr.sk/web/default.aspx?sid=eu/delegacie/zoznam',
            'term_param_name': '_sectionLayoutContainer$ctl01$_currentTerm',
        },
        'friendship group': {
            'url': 'http://www.nrsr.sk/web/default.aspx?sid=eu/sp/zoznam',
            'term_param_name': '_sectionLayoutContainer$ctl01$_currentTerm',
        },
    }

    if type not in types:
        raise ValueError("unknown type of group '%s'" % type)
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    content = scrapeutils.download(types[type]['url'])
    html = lxml.html.fromstring(content)

    # scraping for older terms requires another POST request to emulate selectbox choice
    if term:
        data = {
            types[type]['term_param_name']: term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s' % term
        content = scrapeutils.download(types[type]['url'], 'POST', data, ext)
        html = lxml.html.fromstring(content)

    # pick list items
    result = {
        'url': types[type]['url'],
        '_items': []
    }
    for li in html.findall('.//ul[@class="longlist"]//li'):
        a = li.find('a')
        group = {
            'id': re.search(r'(ID|SkupinaId)=(\d+)', a.get('href')).group(2),
            'názov': a.text,
        }
        line = li.text_content()
        info = re.search(group['názov'] + r'\s*(\((.+?) - (.+?)\))?\s*(\S.*)?$', line, re.DOTALL)
        if info:
            if info.group(2):
                group['od'] = info.group(2)
                group['do'] = info.group(3)
            if info.group(4):
                group['poznámka'] = info.group(4)
        result['_items'].append(group)

    return scrapeutils.plaintext(result)
Example #42
def new_debates_list(term, since_date=None, until_date=None):
    """Parse list of debate parts for the given term of office from
    NRSR web. Appropriate for newer terms (since 5th) where split
    debates are available. If `since_date` or `until_date` is given
    in ISO format only the debate parts since/until that date are
    returned.
    """
    if term not in ['5', '6', '7']:
        raise ValueError("Parsed transcripts are not available for term '%s'" % term)

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/rozprava'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    # a POST request to emulate choice of term in second selectbox and pressing the button
    data = {
        '_sectionLayoutContainer$ctl01$_termNr': term,
        '_sectionLayoutContainer$ctl01$_search': 'Vyhľadať',
        '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
        '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
    }
    base_ext = '|new|%s' % term
    if since_date:
        data['_sectionLayoutContainer$ctl01$_dateFrom$dateInput'] = since_date + '-00-00-00'
        base_ext += '|s%s' % since_date
    if until_date:
        data['_sectionLayoutContainer$ctl01$_dateTo$dateInput'] = until_date + '-00-00-00'
        base_ext += '|u%s' % until_date
    content = scrapeutils.download(url, 'POST', data, base_ext)
    html = lxml.html.fromstring(content)

    result = {
        'url': url,
        '_items': []
    }
    page = 1
    while True:
        # extract all debate parts from the current page
        for tr in html.findall('.//table[@id="_sectionLayoutContainer_ctl01__newDebate"]/tr'):
            if tr.get('class') in ('pager', 'tab_zoznam_header'): continue
            session_number = tr.find('td[1]')
            date = tr.find('td[2]')
            time_interval = tr.find('td[3]')
            time = re.search(r'(.*?) - (.*)', time_interval.text)
            part_type = time_interval.find('em')
            speaker = tr.find('td[4]')
            speaker_label = speaker.find('br').tail.strip('( ')
            debate_part = {
                'schôdza': session_number.text.replace('.', ''),
                'dátum': date.text,
                'trvanie': {'od': time.group(1), 'do': time.group(2)},
                'druh': part_type.text or '',
                'osoba': {'meno': speaker.findtext('strong'), 'funkcia': speaker_label}
            }
            speaker_link = speaker.find('a')
            if speaker_link is not None:
                speaker_url = speaker_link.get('href')
                id = re.search(r'PoslanecID=(\d+)', speaker_url)
                debate_part['osoba']['url'] = speaker_url
                debate_part['osoba']['id'] = id.group(1)
            for a in tr.findall('td[5]/a'):
                link = a.get('href')
                src = a.find('img').get('src')
                if 'speak' in src:
                    id = re.search(r'id=(\d+)', link)
                    debate_part['video'] = {'url': link, 'id': id.group(1)}
                elif 'all' in src:
                    debate_part['video_rokovania'] = {'url': link}
                elif 'rewrite' in src:
                    id = re.search(r'id=(\d+)', link)
                    debate_part['prepis'] = {'url': link, 'id': id.group(1)}
                else:
                    raise RuntimeError('Unrecognized link in section %s/%s/%s' %
                        (session_number.text, date.text, time_interval.text))
            result['_items'].append(debate_part)

        # test if there is a link to next page
        current_page = html.find('.//table[@id="_sectionLayoutContainer_ctl01__newDebate"]//tr[1]//span')
        if current_page is None: break
        next_page = current_page.getparent().getnext()
        if next_page is None: break
        page += 1

        # a POST request to emulate pager click
        data = {
            '__EVENTTARGET': '_sectionLayoutContainer$ctl01$_newDebate',
            '__EVENTARGUMENT': 'Page$%s' % page,
            '_sectionLayoutContainer$ctl01$_termNr': term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = base_ext + '|%s' % page
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    return scrapeutils.plaintext(result)
Example #43
def get_next_page(html):
    all_items = html.findall('.//li[@class="next"]/a')
    for item in all_items:
        print('url:', item.text_content(), item.attrib['href'])
        return item.attrib['href']
Example #44
def get_top_tags(html):
    all_items = html.findall('.//span[@class="tag-item"]/a')
    for item in all_items:
        print(item.text_content(), '->', item.attrib['href'])
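The quote-page helpers above (get_queues, get_top_tags, get_next_page) can be chained into a simple pager. A minimal sketch, assuming requests is available and that the pages follow the quotes.toscrape.com layout these selectors match; both are assumptions, not part of the original examples.

import requests
import lxml.html

# Assumed start URL matching the div.quote / li.next / span.tag-item layout.
url = "http://quotes.toscrape.com/"
while url:
    html = lxml.html.fromstring(requests.get(url).text)
    html.make_links_absolute(url)   # so get_next_page returns an absolute URL
    get_top_tags(html)
    get_queues(html)
    url = get_next_page(html)       # None once there is no "next" link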