Example #1
def check_errors(html):
    # Check error message from acm.timus.ru
    mpos = html.find('color:Red')
    if mpos > -1:  # Extract red text
        spos = html.find('>', mpos) + 1
        epos = html.find('</', spos)
        raise OnlineJudje(html[spos:epos])
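check_errors assumes an OnlineJudje exception class that the snippet does not define. A minimal self-contained sketch of how it might be exercised, with a stand-in exception:

class OnlineJudje(Exception):
    # Stand-in for the exception class assumed by the snippet above.
    pass

def check_errors(html):
    # Copied from the example above so this sketch runs on its own.
    mpos = html.find('color:Red')
    if mpos > -1:  # Extract red text
        spos = html.find('>', mpos) + 1
        epos = html.find('</', spos)
        raise OnlineJudje(html[spos:epos])

try:
    check_errors('<span style="color:Red">Compilation error</span>')
except OnlineJudje as exc:
    print(exc)  # -> Compilation error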
Example #2
def resolve_ean(ean):
    page = requests.get(SEARCH_URL.format(ean))
    html = lxml.html.document_fromstring(page.text)

    #Jump further
    further_url = "http://www.rebuy.de/" + html.find('.//a[@class="productConversion"]').attrib["href"]
    
    page = requests.get(further_url)
    html = lxml.html.document_fromstring(page.text)
    result = dict()
    result["title"] = html.find('.//h1/span[@class="loud"]').text_content()
    result["type"] = TYPE_TRANSLATE[html.xpath('.//p[contains(@class, "category-icon")]')[0].text_content()]
    result["imgurl"] = html.find(".//img[@id='cover']").attrib["src"] 

    attribs = dict()

    for i in html.findall(".//ul[@id='main-info-facts']/li"):
        name, sep, val = i.text_content().strip().partition(":")
        attribs[name] = val

    result["created"] = defNone(attribs.get("Erscheinungsdatum"), lambda x: toDBDate(x.strip(), "%d.%m.%Y"))
    result["author"] = None
    result["artists"] = None
    result["description"] = None
    result["duration"] = None
    
    return result
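resolve_ean leans on module-level helpers (SEARCH_URL, TYPE_TRANSLATE, defNone, toDBDate) that are not shown. As a hedged guess at the defNone guard only, i.e. apply a function unless the value is None, a hypothetical stand-in might look like this:

def defNone(value, fn):
    # Hypothetical stand-in; the project's real helper may differ.
    return None if value is None else fn(value)

print(defNone(None, str.strip))            # -> None
print(defNone('  1.1.2020  ', str.strip))  # -> 1.1.2020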
Example #4
 def html_to_text(self, html):
     if html.find('head') is not None:
         html.head.decompose()
     if html.find('script') is not None:
         html.script.decompose()
     without_tags = html.get_text(" ", strip=True)
     return without_tags
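html_to_text expects an already-parsed BeautifulSoup tree. A small self-contained run of the same steps, assuming bs4 is installed:

from bs4 import BeautifulSoup

doc = '<html><head><title>t</title></head><body><script>var x=1;</script><p>Hello</p><p>world</p></body></html>'
html = BeautifulSoup(doc, 'html.parser')
if html.find('head') is not None:
    html.head.decompose()    # drop <head> (title, meta, ...)
if html.find('script') is not None:
    html.script.decompose()  # drop the first <script> block
print(html.get_text(' ', strip=True))  # -> Hello world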
Example #5
def txt_wrap_by(start_str, end, html):
    start = html.find(start_str)
    if start >= 0:
        start += len(start_str)
        end = html.find(end, start)
        if end >= 0:
            return html[start:end].strip()
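txt_wrap_by returns the stripped text between two markers, or None when either marker is missing. A quick self-contained check:

def txt_wrap_by(start_str, end, html):
    # Copied from the example above.
    start = html.find(start_str)
    if start >= 0:
        start += len(start_str)
        end = html.find(end, start)
        if end >= 0:
            return html[start:end].strip()

print(txt_wrap_by('<title>', '</title>', '<html><title> Hello </title></html>'))  # -> Hello
print(txt_wrap_by('<h1>', '</h1>', '<html></html>'))                              # -> None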
Example #6
def compass(answers=None):
    answers = answers or def_answers.copy()
    questions = {}
    post_args = {}

    while post_args is not None:
        # Post previous responses, Get new questions (first post is empty, gets page 1)
        html_text = submit_page(post_args)
        html = lxml.html.fromstring(html_text)
        curr_questions = reap_questions(html)

        # If the test isn't done, prepare [post_args] for next page
        if len(curr_questions):
            # Verify test integrity
            if not all(item in def_questions.items()
                       for item in curr_questions.items()):
                raise RuntimeError(
                    "Questions have changed. Answer cache is bad!")
            questions.update(curr_questions)

            # Assemble responses
            post_args = {
                'answer_' + str(key): answers[key]
                for key in curr_questions
            }

            # Print responses
            for num in sorted(curr_questions):
                print(
                    str(num) + ":\t" + curr_questions[num] + "\n\t" +
                    values[int(answers[num])] + '\n')

            submit_tag = html.find(".//input[@type='submit']")
            post_args[
                "submit"] = submit_tag.value  # submit_tag.type == "submit"
            for tag in html.findall(".//input[@type='hidden']"):
                post_args[tag.name] = tag.value
            pageno = post_args["pageno"]
        else:
            post_args = None
            pageno = 'f'

        # with open('/Users/alex/Desktop/page' + pageno + ".html", "a+") as f:
        # f.write(html_text)

    h2 = html.find(".//h2")
    print(h2.text_content())

    lines = h2.text_content().split('\n')
    x = float(lines[0].split(":")[1])
    y = float(lines[1].split(":")[1])
    pyplot.scatter(x, y)
    pyplot.xlim(-10, 10)
    pyplot.ylim(-10, 10)
    pyplot.title("Political coordinates")
    pyplot.xlabel("Economic Left/Right")
    pyplot.ylabel("Social Libertarian/Authoritarian")
    pyplot.grid()
    pyplot.show()
    return questions
Example #7
    def non_tag_chars(html):
        n = 0
        while n < len(html):
            angle = html.find('<', n)
            if angle == -1:
                yield html[n:]
                n = len(html)
                break
            yield html[n:angle]
            n = angle

            while n < len(html):
                nl = html.find('\n', n)
                angle = html.find('>', n)
                if angle == -1:
                    yield ' ' * (len(html) - n)
                    n = len(html)
                    break
                elif nl == -1 or angle < nl:
                    yield ' ' * (angle + 1 - n)
                    n = angle + 1
                    break
                else:
                    yield ' ' * (nl - n) + '\n'
                    n = nl + 1
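non_tag_chars yields the text found outside of tags and replaces every character inside a tag with a space (newlines are kept), so offsets in the joined output still line up with the original HTML. Assuming the generator above is available at module level, a quick check:

source = '<p>Hi</p>\n<p>there</p>'
masked = ''.join(non_tag_chars(source))
print(repr(masked))                # -> '   Hi    \n   there    '
print(len(masked) == len(source))  # -> True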
Example #8
    def on_response(self, resp, req):
        ctype = resp.headers.iget('content-type')
        if not ctype:
            return

        ctype = ctype.split(";", 1)[0]

        # if this is an html page, parse it
        if ctype in HTML_CTYPES:
            body = resp.body_string()

            html = lxml.html.fromstring(body)

            # rewrite links to absolute
            html.rewrite_links(self.rewrite_link)

            # add base
            old_base = html.find(".//base")
            base = etree.Element("base")
            base.attrib['href'] = self.absolute_path

            if not old_base:
                head = html.find(".//head")
                head.append(base)

            # modify response
            rewritten_body = lxml.html.tostring(html)
            try:
                resp.headers.ipop('content-length')
            except KeyError:
                pass

            resp.headers['Content-Length'] = str(len(rewritten_body))
            resp._body = StringIO(rewritten_body)
            resp._already_read = False
Example #10
def mp(id, term):
    """Parse MP from his profile webpage."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/poslanec&PoslanecID=%s&CisObdobia=%s' % (id, term)
    content = scrapeutils.download(url)
    if 'Unexpected error!' in content:
        raise RuntimeError("MP with id '%s' does not exist in term '%s'" % (id, term))
    html = lxml.html.fromstring(content)

    result = {
        'id': str(id),
        'url': url
    }
    for div in html.findall('.//div[@class="mp_personal_data"]//div[strong]'):
        label = div.findtext('strong')
        value = div.find('span')
        result[label.lower()] = value.text_content() if value is not None else ''

    image_url = html.find('.//div[@class="mp_foto"]/img').get('src')
    image = requests.get(image_url).content
    with open(os.path.join(BASE_DIR, 'dummy-image.jpg'), 'rb') as f:
        dummy_image = f.read()
    result['fotka'] = image_url if image != dummy_image else ''

    result['členstvo'] = []
    ul = html.find('.//span[@id="_sectionLayoutContainer_ctl01_ctlClenstvoLabel"]').getparent().getnext()
    for li in ul.findall('li'):
        m = re.search(r'(.*?)\s*\((.*?)\)', li.text)
        result['členstvo'].append({'meno': m.group(1), 'rola': m.group(2)})

    return scrapeutils.plaintext(result)
Example #12
def getLinks():
	url_base = open("./url_base.txt","w+")
	link_files = glob.glob("./links*.html")
	for link_file in link_files:
		filename = os.path.basename(link_file)
		#print (filename)
		f = open(filename,"r")
		html = f.read()
		glob_left_index = html.find("<div class=\"page__b-offers__guru\">")+35
		#print (html[glob_left_index:glob_left_index+15])
		glob_right_index = html.find("<div class=\"b-offers b-offers_type_guru\">",glob_left_index)
		cursor = html.find("class=\"b-offers b-offers_type_guru\"",glob_left_index,glob_right_index)
		#print (html[cursor+39:cursor+47])
		cursor = cursor+10

		while (cursor < glob_right_index) and (cursor != its_magick):
			cursor = html.find("class=\"b-offers b-offers_type_guru\"",cursor,glob_right_index)
			#print ("Еще :")
			pid = (html[cursor+39:cursor+47])
			pid = pid.replace("\"","")
			#print ("http://market.yandex.ru/model.xml?hid=90578&modelid="+pid)
			cursor = cursor+10
			url_list.append(pid)
			url_base.write("http://market.yandex.ru/model.xml?hid=90578&modelid="+pid+"\n")

		url_list.pop()		
	url_base.close()
Example #13
    def find_text(self, html, key, num_chars):

        # Find specified value in HTML tags.

        pos_begin = html.find(key) + len(key) + num_chars
        pos_end = html.find('"', pos_begin)

        return html[pos_begin:pos_end]
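find_text slices out whatever follows key plus num_chars filler characters, up to the next double quote. A standalone check of the same logic (self dropped so it runs on its own):

def find_text(html, key, num_chars):
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin:pos_end]

snippet = '<meta content="hello world" name="description">'
print(find_text(snippet, 'content', 2))  # -> hello world (num_chars=2 skips the '="')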
Example #14
def process(html):
    html = html[html.find('対戦用のアイテムと効果と入手方法'):html.find('同じわざしか出せなくなる代わりに')]
    parsed_html = lxml.html.fromstring(html)
    items_table = parsed_html.cssselect('table > tbody > tr')
    item_id = 0
    for item_tr in items_table:
        item_dict = {}
        item_id += 1
        item_dict["id"] = item_id
        item_dict["name"] = str.strip(item_tr[0].text_content())
        items_list.append(item_dict)
Example #15
    def extract_mediainfo(self, html):
        """ extract media info from onleihe search result """
        media_info = []
        if html.find('mediaInfo') != -1:
            # Search results
            LOGGER.info("extracting mediaInfo items from result")
            doc = lxml.html.document_fromstring(html)
            # <article class="list-item">
            for item in doc.xpath("//article[@class='list-item']"):
                # <a class="cover-link" title="Details zum Titel: Die Rache des Kaisers"
                media_item = {}
                details = item.xpath(".//a[@class='cover-link']")
                assert len(details) == 1, "missing article details"
                details = details[0]

                title = details.attrib['title']
                assert title.startswith(
                    TITLE_DETAIL_PREFIX), "failed to extract media title"
                title = title[len(TITLE_DETAIL_PREFIX):].strip()
                media_item["title"] = title
                media_item["href"] = details.attrib['href']

                abstract = item.xpath(".//div[@class='abstract']")
                if abstract:
                    assert len(abstract) == 1, "multiple abstracts?"
                    abstract = abstract[0]
                    media_item['abstract'] = abstract.text_content().strip()
                else:
                    media_item['abstract'] = None

                # //div[@class='media-type']
                # <svg class="svg-icon ic_ebook"><use xlink:href="#ic_ebook"></use></svg>

                author = item.xpath(".//div[@class='author']")
                if author:
                    assert len(author) == 1
                    author_text = author[0].text_content().strip()
                    if author_text.startswith('Autor:'):
                        author_text = author_text[6:]
                    media_item["author"] = author_text
                else:
                    media_item["author"] = None

                # //div[@class='available']
                # <div class="available">Verfügbar</div>

                media_info.append(media_item)

        else:
            assert html.find('Suchergebnisse') == -1
            assert html.find('contentlist resultlist') == -1

        return media_info
Example #16
def getLatLng(html,info):
    latlng1 = html.find('map.setCenter(new GLatLng(');
    latlng2 = html.find(', 13);');

    gmapsStr = html[latlng1+26:latlng2-4]

    gmaps = gmapsStr.split(',');

    info['lat'] = gmaps[0];
    info['lng'] = gmaps[1];
    
    return info;
Example #18
def convert_google_sheet(sid, gid, options):
    html = parse_google_document(
        'https://docs.google.com/spreadsheets/d/{sid}/htmlembed/sheet?gid={gid}&{options}'
            .format(sid=sid, gid=gid, options=options),
        errhelp={'sid' : sid, 'gid' : gid} )
    for script in html.iter('script'):
        v = script.get('src')
        if v is None:
            #pass #script.getparent().remove(script)
            script.text = script.text.replace("CHARTS_EXPORT_URI.push('","CHARTS_EXPORT_URI.push('https://docs.google.com")
        else:
            script.set('src',"https://docs.google.com"+v)
        
    html.find('head/link').rewrite_links(
        lambda s: 'https://docs.google.com' + s )
    html.find('head').append(lxml.html.Element( 'link',
        rel='stylesheet', href=url_for('static', filename='metatable.css'),
    ))
    html.find('body').append(lxml.html.Element( 'script',
        src="https://ajax.googleapis.com/ajax/libs/jquery/3.1.1/jquery.min.js"
    ))
    html.find('body').append(lxml.html.Element( 'script',
        src=url_for('static', filename='metatable.js')
    ))
    script = lxml.html.Element('script')
    script.text = ( "$(init); "
        "function init() { "
            "$('body').css('overflow', 'hidden'); "
            "var $table = $('#sheets-viewport table').detach(); "
            "var $metatable = create_metatable($table); "
            "$('body').empty().append($metatable); "
            "$metatable.resize(); "
        " }" 
        "$('.row-header-wrapper').remove();"  
        #"$('td').css('min-width', '100px');"
        "$(window).bind('load', function() {"
        "i=1;"
        "tableWidth=0;"
        "while (true) {  idStr = '#0C'+i.toString(); obj = $(idStr); if (obj[0]==undefined) {break;}; wstr=obj[0].style.width.replace('px', ''); tableWidth+=parseInt(wstr); i++; }"
        "tblList = $('table.waffle');"
        "tblList[1].style.width=tableWidth.toString()+'px';"   
        "tblList[3].style.width=tableWidth.toString()+'px';"   
        "initCharts();"

        "});"
        )
    html.find('body').append(script)
    # with open("output.txt", "w") as text_file:
    #     text_file.write(lxml.html.tostring(html, encoding='utf-8'))
    
    return b'<!DOCTYPE html>\n<meta charset="UTF-8">\n' + \
        lxml.html.tostring(html, encoding='utf-8')
Example #19
def compass():
	answers = def_answers.copy()
	questions = {}
	post_args = {}

	while post_args is not None:
		# Post previous responses, Get new questions (first post is empty, gets page 1)
		html_text = submit_page(post_args)
		html = lxml.html.fromstring(html_text)
		curr_questions = reap_questions(html)

		# If the test isn't done, prepare [post_args] for next page
		if len(curr_questions):
			# Verify test integrity
			if not all(item in def_questions.items() for item in curr_questions.items()):
				raise RuntimeError("Questions have changed. Answer cache is bad!")
			questions.update(curr_questions)

			# Assemble responses
			post_args = {'answer_' + str(key): answers[key] for key in curr_questions}

			# Print responses
			for num in sorted(curr_questions):
				print(str(num) + ":\t" + curr_questions[num] + "\n\t" + values[int(answers[num])] + '\n')

			submit_tag = html.find(".//input[@type='submit']")
			post_args["submit"] = submit_tag.value  # submit_tag.type == "submit"
			for tag in html.findall(".//input[@type='hidden']"):
				post_args[tag.name] = tag.value
			pageno = post_args["pageno"]
		else:
			post_args = None
			pageno = 'f'

		# with open('/Users/alex/Desktop/page' + pageno + ".html", "a+") as f:
			# f.write(html_text)

	h2 = html.find(".//h2")
	print(h2.text_content())

	lines = h2.text_content().split('\n')
	x = float(lines[0][-6:])
	y = float(lines[1][-6:])
	pyplot.scatter(x, y)
	pyplot.xlim(-10, 10)
	pyplot.ylim(-10, 10)
	pyplot.title("Political coordinates")
	pyplot.xlabel("Economic Left/Right")
	pyplot.ylabel("Social Libertarian/Authoritarian")
	pyplot.grid()
	pyplot.show()
	return questions
Example #20
def get_sector(ticker_old):
    na_count = 0
    ticker = ticker_old.replace('.', '')
    prefix = 'http://finance.yahoo.com/q/pr?s='
    # ticker = 'LMCK'
    response = urllib2.urlopen(prefix + ticker)
    html = response.read()
    response.close()
    # print html

    start_string = 'Sector:</td><td class="yfnc_tabledata1">'
    end_string = 'Full Time Employees'
    start_index = html.find(start_string)
    start_length = len(start_string)
    end_index = html.find(end_string)
    sub_string = html[start_index + start_length:end_index - 1]

    if ticker == r'JW/A':
        sector = 'Services'
        industry = 'Publishing - Books'
    elif ticker == 'PGRE':
        sector = 'Financials'
        industry = ' Real Estate Development & Operations - NEC'
    elif start_index == -1 or end_index == -1 or sub_string[0:3] == 'N/A':
        na_count = 1
        sector = 'N/A'
        industry = 'N/A'
    else:
        sub_string = sub_string[sub_string.find('">') + 2:]
        # print sub_string
        start_string_2 = 'Industry:</td><td class="yfnc_tabledata1">'
        start_index_2 = sub_string.find(start_string_2)
        start_length_2 = len(start_string_2)
        sub_string_2 = sub_string[(start_index_2 + start_length_2):]
        sub_string_2 = sub_string_2[sub_string_2.find('">') + 2:]
        # print sub_string

        h = HTMLParser.HTMLParser()
        sector = h.unescape(sub_string[0:sub_string.find('</a>')])
        industry = h.unescape(sub_string_2[0:sub_string_2.find('</a>')])
    print 'sector = %s' % sector
    print 'industry = %s ' % industry

    tmp_df = pd.DataFrame([{
        'ticker': ticker_old,
        'sector': sector,
        'industry': industry
    }])
    return tmp_df, na_count
Example #21
    def execute(self):
        rewrite, headers = self.rewrite_headers()
        if not headers:
            msg = "HTTP/1.1 502 Gateway Error\r\n\r\n bad request."
            self.resp.send(msg)
            return
        
        if rewrite:
            body = self.parser.body_string()
            if not body:
                rewritten_body = ''
            else:
                html = lxml.html.fromstring(body)

                # rewrite links to absolute 
                html.rewrite_links(self.rewrite_link)

                # add base
                absolute_path = "%s%s" % (self.local_base,
                        self.extra.get('path', ''))
                
                old_base = html.find(".//base")
                base = etree.Element("base")
                base.attrib['href'] = absolute_path 

                if not old_base:
                    head = html.find(".//head")
                    head.append(base)
            
                # modify response
                rewritten_body = bytes(lxml.html.tostring(html))
            
            # finally send response.
            headers.extend([
                'Content-Length: %s\r\n' % len(rewritten_body),
                "\r\n"])
           
            self.resp.writeall(bytes("".join(headers)))
            stream = io.BytesIO(rewritten_body)
            while True:
                data = stream.read(io.DEFAULT_BUFFER_SIZE)
                if not data:
                    break
                self.resp.writeall(data)
        else:
            self.resp.writeall(bytes("".join(headers) + "\r\n"))
            body = self.parser.body_file()
            send_body(self.resp, body, self.parser.is_chunked())
Example #22
def grow(year,quarter):
    pn=1
    DF=pd.DataFrame()
    ws._write_head()
    while True:
        try:
            ws._write_console()
            url='http://vip.stock.finance.sina.com.cn/q/go.php/vFinanceAnalyze/kind/grow/index.phtml?s_i=&s_a=&s_c=&reportdate={0}&quarter={1}&p={2}'.format(year,quarter,pn)
            r=requests.get(url,headers=hds())
            r=r.content.decode('gbk')
            html=BeautifulSoup(r,'lxml')
            text=html.find(id='dataTable')
            df=pd.read_html(str(text),header=0)[0]
            if df.empty is True:
                break
            else:
                pn = pn + 1
                DF =DF.append(df)
        except:
            break
    DF=DF.applymap(lambda x:np.where(x=='--',np.nan,x))
    DF=DF.set_index('股票代码')
    DF.index=DF.index.map(lambda x: str(x).split('.')[0].zfill(6))
    DF['date']=str(year)+'_'+str(quarter).zfill(2)
    name=list(DF.columns)
    name.remove('股票名称')
    name.remove('date')
    for label in name:
        DF[label]=DF[label].astype(float)     
    return DF
Example #23
def get_company_info(company_number):
    '''
    Get company info for a given company number.
    Returning a dict with the information, with 
    CompanyNumber set to -1 if no company is found.
    '''
    html = scraperwiki.scrape(SEARCH_SITE+str(company_number))
    if( html.find("The search did not produce any results") != -1 ):
        info = {"CompanyNumber":-1}
    else:
        root = lxml.html.fromstring(html)
        info = {}
        info["CompanyNumber"] = company_number
        tables = root.cssselect("table")
        table = tables[7]
        tds = table.cssselect("td")
        td = tds[1]
        fonts = td.cssselect("font")
        info["CompanyName"] = fonts[0].text
        brs = td.cssselect("br")
        info["Address"] = lxml.html.tostring(brs[1]).replace("<br>&#13;","")
        zip_city = lxml.html.tostring(brs[2]).replace("<br>&#13;","")
        info["PostalCode"] = zip_city.split('-')[0].strip(" ")
        info["City"] = zip_city.split('-')[1].strip(" ")
        info["ScrapedTime"] = datetime.datetime.now()
        status_code = fonts[1].text.split(":")
        info["Status"] = status_code[0]
        if( len(status_code) == 2 ):
            info["LicenceCode"] = status_code[1].strip()
        

    return info
Example #24
    def render(self, only_op):
        html = E.html(
            E.head(
                E.title(self.title),
                E.link(href='story.css', type='text/css', rel='stylesheet') #TODO: convert it to a PI.
            ),
            E.body(
                E.h1(self.title)
            ),
            xmlns='http://www.w3.org/1999/xhtml'
        )

        body = html.find('body')
        body.append(self.op.render(display_title=False))
        #calc mean length of replies
        sortmean = [i for i in self.replies if self.postlen(i) > 0]
        #sortmean.sort(key=lambda x: self.postlen(x))
        mean = sum(self.postlen(i) for i in sortmean) / len(sortmean)
        #print('mean reply length:', mean)

        if self.author.name == 'Anonymous' or self.author.name == '' or self.author.name == None:
            only_op = False
        for reply in self.replies:
            # Remove user answers if not wanted.
            if only_op and not reply.is_op(self.op):
                continue
            if not only_op and self.postlen(reply) < mean:
                continue
            
            body.append(reply.render())

        return html
Example #25
    def next5(self, irc, msg, args, optteam):
        """<optional team>
        Gets next 5 games, for a specific team if passed.
        """

        url = 'http://www.nhl.com/ice/schedulebymonth.htm'
        try:
            sched = utils.web.getUrl(url).decode('utf8')
        except URLError as e:
            print 'An error occurred fetching %s \n %s' % (url, e.reason)
        html = BeautifulSoup(sched, "lxml")

        try:
            table = html.find('table', { 'class' : 'data schedTbl'})
            rows = table.findAll('tr')
        except AttributeError as e:
            raise ValueError("No valid table found")

        # Get data
        parser = PrivFuncs()
        table_data = parser._parse_rows(rows)
        table_data = table_data[0]

        # # Print data
        # for i in table_data:
        #     print '\t'.join(i)
        irc.reply(table_data)
Example #26
    def scrape_html_title(self, url):
        """Scrape the ``<title>`` tag contents from an HTML page.
        """
        # Let's see what's on the other end...
        r = requests.get(url.geturl())
        # Only bother with 200 OK
        if r.status_code != requests.codes.ok:
            self.log.debug(u'request failed for ' + url.geturl())
            return None
        if 'html' not in r.headers['Content-Type']:
            self.log.debug(u'Content-Type not HTML-ish ({}): {}'
                           .format(r.headers['Content-Type'], url.geturl()))
            return None

        # Attempt to scrape the HTML for a <title>
        html = lxml.html.document_fromstring(r.text)
        title = html.find('.//title')

        if title is None:
            self.log.debug(u'failed to find <title>: ' + url.geturl())
            return None

        # Normalise title whitespace
        title = ' '.join(title.text.strip().split())
        nsfw = url.netloc.endswith('.xxx')

        # See if the title is in the URL
        if self._filter_title_in_url(url, title):
            return None

        # Return the scraped title
        return 'Title', nsfw, u'"{}"'.format(title)
Example #27
def FI_sina(code):
    """
    Fetch the financial indicators for an individual stock.
    ---------
    Return
    """
    DF = pd.DataFrame()
    url = 'http://money.finance.sina.com.cn/corp/go.php/vFD_FinancialGuideLine/stockid/{0}/displaytype/4.phtml'.format(
        code)
    r = requests.get(url)
    r = r.content.decode('gbk')
    html = lxml.html.parse(StringIO(r))
    urls = html.xpath("//div[@id='con02-1']/table[1]//a/@href")

    for url in urls:
        r = requests.get(url)
        r = r.content.decode('gbk')
        html = BeautifulSoup(r, 'lxml')
        text = html.find(id='BalanceSheetNewTable0')
        df = pd.read_html(str(text), header=0)[0]
        df.columns = df.iloc[0, :]
        df = df.drop(0, axis=0)
        df = df.set_index('报告日期')
        df = df.T
        #print(df)
        DF = DF.append(df)
    return DF
Example #28
def parse_html(html_file):
    html = lxml.html.parse(html_file)
    # Extract the main text
    main_text = html.find('//div[@class="main_text"]')
    if main_text is None:
        return None

    # Remove ruby annotations
    for rt in main_text.findall('.//rt'):
        rt.getparent().remove(rt)
    for rp in main_text.findall('.//rp'):
        rp.getparent().remove(rp)

    # Remove annotations and leading/trailing whitespace
    text = re.sub(
        '[#[^]]+]\n?',
        '',
        main_text.text_content(),
        flags=(re.MULTILINE)
    ).strip()

    # Normalize
    text = unicodedata.normalize('NFKC', text)
    text = text.lower()
    return text
Example #29
def xsjj(begin,end):
    """
    Query data on the lifting of share lock-ups; mainly the first 20 pages of current data.
    -----------------
    Return:
        DataFrame
    """
    pn=1
    DF=pd.DataFrame()
    ws._write_head()
    while True:
        try:
            ws._write_console()
            url='http://vip.stock.finance.sina.com.cn/q/go.php/vInvestConsult/kind/xsjj/index.phtml?bdate={0}&edate={1}&showall=%CF%D4%CA%BE%C8%AB%B2%BF&p={2}'.format(begin,end,pn)
            r=requests.get(url)
            #print(url)
            r=r.content.decode('gbk')
            html=BeautifulSoup(r,'lxml')
            sarr=html.find(id='dataTable')
            df=pd.read_html(str(sarr),header=0)[0]
            DF =DF.append(df)
            pn=pn+1
            if pn > 50:
                break
            if df.empty is True:
                break
        except:
            break
        
    DF=DF.applymap(lambda x:np.where(x=='--',np.nan,x))
    DF=DF.drop_duplicates()
    DF['代码']=DF['代码'].map(lambda x:str(x).split('.')[0].zfill(6))
    return DF
Example #30
 def getActorPhoto(self, htmltree):
     htmla = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
     names = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
     t = {}
     for name, a in zip(names, htmla):
         if name.strip() == '他':
             continue
         p = {name.strip(): a.attrib['href']}
         t.update(p)
     o = {}
     for k, v in t.items():
         if '/search_act/' not in v:
             continue
         r = self.getHtml(urljoin('https://www.caribbeancom.com', v), type='object')
         if not r.ok:
             continue
         html = r.text
         pos = html.find('.full-bg')
         if pos<0:
             continue
         css = html[pos:pos+100]
         cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
         if not cssBGjpgs or not len(cssBGjpgs[0]):
             continue
         p = {k: urljoin(r.url, cssBGjpgs[0])}
         o.update(p)
     return o
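The '.full-bg' lookup above fishes a JPEG URL out of the inline CSS that follows that class name. A self-contained check of just that step, using a made-up CSS fragment and the site base from the example:

import re
from urllib.parse import urljoin

css_page = '.full-bg { background: url(/moviepages/012345/images/main.jpg) no-repeat; }'
pos = css_page.find('.full-bg')
css = css_page[pos:pos + 100]
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
print(urljoin('https://www.caribbeancom.com', cssBGjpgs[0]))
# -> https://www.caribbeancom.com/moviepages/012345/images/main.jpg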
Example #31
    def parse_title(self, htmlstring):
        html = lxml.html.document_fromstring(htmlstring)
        title = html.find('*/title')
        if title is None:
            return

        return title.text.encode('iso-8859-1').replace('\n', '').strip()
Example #32
def add_css_js(html):

    head_sub_elements = [
        #css
        r'<link rel="stylesheet" href="/static/css/word-recall.css" type="text/css">',
        r'<link rel="stylesheet" href="/static/css/popbox.css" type="text/css" media="screen" charset="utf-8">',
        #js
        r'<script type="text/javascript" src="/static/js/jquery.min.js"></script>',
    ]


    body_sub_elements = [
        #js
        r'<script type="text/javascript" src="/static/js/recall-word.js"></script>',
        #html
        loader.render_to_string('recall/box.html'),
    ]


    add_elements_map = {'body': body_sub_elements, 'head': head_sub_elements}
    for add_tag in add_elements_map:
        element = html.find(add_tag)
        if element:
            for raw_html in add_elements_map[add_tag]:
                element.append(lxml.html.fromstring(raw_html))
Example #33
def margin_share(code, begin, end):
    """
    datetime: date, e.g. 2017-06-16
    -----------------
    Return:
        DataFrame
    """
    DF = pd.DataFrame()
    ws._write_head()

    try:
        ws._write_console()
        url = 'http://vip.stock.finance.sina.com.cn/q/go.php/vInvestConsult/kind/rzrq/index.phtml?symbol={0}&bdate={1}&edate={2}'.format(
            code, begin, end)
        #print(url)
        r = requests.get(url)
        r = r.content.decode('gbk')

        html = BeautifulSoup(r, 'lxml')
        sarr = html.find(id='dataTable')
        df = pd.read_html(str(sarr), header=None, skiprows=3)[0]
        df = df.applymap(lambda x: np.where(x == '--', np.nan, x))
        df = df.drop(0, axis=1)
        #print(df.head())

        df.columns = name
        df = df.set_index('date')
        return df
    except Exception as e:
        print(e)
        pass
Example #34
def try_accept_cookies():
    # Searches soup for the button on the cookie banner
    cookie_tag = html.find('button', { 'id' : 'onetrust-accept-btn-handler'})
    if cookie_tag:
        #print('cookie tag found')
        cookie_xpath = xpath_soup(cookie_tag)
        cookie_button = browser.find_element_by_xpath(cookie_xpath).click()
Example #35
File: web.py Project: viperfx/GSOC
def get_page():
    # request the page
    r = requests.get(request.args['url'])
    # parse the dom into python objects
    html = lxml.html.document_fromstring(r.content)
    # prase the requested url so we can form the base href
    url = urlparse(request.args['url'])
    # create the base url dom fragment
    base_url = lxml.html.fromstring("<base href='%s://%s'>" % (url.scheme, url.hostname)).find('.//base')
    # find the head element
    head = html.find(".//head")
    # insert the base href in the last place of the head elements
    head.insert(-1, base_url)
    # rewrite urls to have absolute url
    html.resolve_base_href()
    # rewrite links to load through this proxy
    for element, attribute, link, pos in html.iterlinks():
        if element.tag == "a" and attribute == "href":
            link = "http://localhost:8888/translate_url?url=%s" % (link)
            element.set("href", link)
            element.set("target", "_parent")
    # translate through DOM Traversal
    # html = translate_dom_string(html, lxml.html.tostring(html))
    # translate through HTML regex string replacement
    html = translate_html(html, lxml.html.tostring(html))
    # dump the html string for debugging
    # with open('html_dump', 'w') as f:
    #     f.write(lxml.html.tostring(html))
    # a little regex to remove any script tags
    return re.subn(r'<(script).*?</\1>(?s)', '', lxml.html.tostring(html))[0]
Example #36
def dzjy():
    """
    Query block-trade data; mainly the first 20 pages of current data.
    -----------------
    Return:
        DataFrame
    """
    pn = 1
    DF = pd.DataFrame()
    ws._write_head()
    while True:
        try:
            ws._write_console()
            url = 'http://vip.stock.finance.sina.com.cn/q/go.php/vInvestConsult/kind/dzjy/index.phtml?num=60&p={0}'.format(
                pn)
            r = requests.get(url)
            #print(url)
            r = r.content.decode('gbk')
            html = BeautifulSoup(r, 'lxml')
            sarr = html.find(id='dataTable')
            df = pd.read_html(str(sarr), header=0)[0]
            DF = DF.append(df)
            pn = pn + 1
            if pn > 20:
                break
            if df.empty is True:
                break
        except:
            break

    DF = DF.applymap(lambda x: np.where(x == '--', np.nan, x))
    DF = DF.drop_duplicates()
    return DF
Example #37
def search(bookname):
    # Search for audiobooks
    # Param page: int, e.g. 1
    # Param searchword: string; 56tingshu only accepts search terms submitted as GBK-encoded strings, e.g. %D5%C2%D3%E3
    path = '/search.asp?page=1&searchword=' + urllib.request.quote(
        bookname.encode('gb2312'))
    html = getHtmlContent(path)
    searchList = html.find_all(class_='list-ov-tw')
    pageCount = html.find(
        class_='cate-pages').find_all('li')[1].text.split('/')
    nowPageCount = pageCount[0]  # current page
    allPageCount = pageCount[1]  # total pages
    bookList = []  # search results
    for searchItem in searchList:
        bookUrl = searchItem.find(class_='bt').find('a').attrs['href']
        bookImg = searchItem.find('img').attrs['original']
        bookName = searchItem.find(class_='bt').text
        bookAuthor = searchItem.find_all(
            class_='zz')[0].text + ' ' + searchItem.find_all(
                class_='zz')[1].text
        bookContent = searchItem.find(class_='nr').text
        book = Book(bookUrl, bookImg, bookName, bookAuthor, bookContent,
                    nowPageCount, allPageCount)
        bookList.append(book)
    return bookList
Example #38
def main():
	print 'start at %s' % time.asctime()  
	users = db.select("users")
	print 'current users count %s ' % len(users)
	for user in users:
		# print 'user %s ' % user.token
		# print 'user %s ' % user.secret
		access_token = OAuthToken(user.token, user.secret) 
		
		if not user.trunk_key:
			continue
		
		t = Trunkly(user.trunk_key)
		
		sinat = Sinat(sinaConsumer, access_token=access_token)	
		statuses = sinat.statuses__user_timeline('GET')
		for status in statuses:
			weibo = status['text']
			if status.has_key('retweeted_status'):
				weibo = '%s //@%s: %s' % (weibo , 
											status['retweeted_status']['user']['name'],
											status['retweeted_status']['text'])
				
			# print 'status %s' % status['text']
			urls = p.findall(weibo)
			for url in urls:
				print 'url is %s ' % url
				title = None
				trunk = None
				
					
				try:
					html = lxml.html.parse(url)
					title = html.find(".//title").text
					url = html.getroot().base_url
					print 'title is %s' % title 
					print 'base url is %s ' % url
					
					try:
						try:
							trunk = t.get_link(parameters={'url': url})
							print 'url Already exists!!!'
							continue
						except:
							print 'error'
							pass

						if title and not trunk:
							print 'post url to trunk.ly'
							t.post_link(parameters={'url': url,
										'title': title,
										'tags' : '',
										'note' : weibo,
										'text' : weibo})
					except:
						print 'post to trunk error. url %s title %s' % (url, title)
				except:
					print 'url %s fetch error' % (url)
					
	print '---------------- end ---------------------'
Example #39
def get_actor_photo(lx, session):
    htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
    names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
    t = {}
    for name, a in zip(names, htmla):
        if name.strip() == '他':
            continue
        p = {name.strip(): a.attrib['href']}
        t.update(p)
    o = {}
    for k, v in t.items():
        if '/search_act/' not in v:
            continue
        r = session.get(urljoin(G_SITE, v))
        if not r.ok:
            continue
        html = r.text
        pos = html.find('.full-bg')
        if pos<0:
            continue
        css = html[pos:pos+100]
        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
        if not cssBGjpgs or not len(cssBGjpgs[0]):
            continue
        p = {k: urljoin(r.url, cssBGjpgs[0])}
        o.update(p)
    return o
Example #40
def scrape_pacific_plants_names():
  r = requests.get('http://www.hear.org/pier/wralist.htm')
  html = BeautifulSoup(r.text, 'lxml')

  table = html.find('table')
  # remove header row
  rows = table.find_all('tr')[1:]
  
  # find scientific names
  data = [row.find_all('td')[0] for row in rows]
  plant_scientific_names = []
  # clean parsed data
  for d in data:
    plant_scientific_names.append(parse_html(str(d)))

  # write names to json
  with open('data/plant_scientific_names.json', 'w') as datafile:
    json.dump(plant_scientific_names, datafile, indent=2, separators=(',', ':'))

  # find common names
  data = [row.find_all('td')[2] for row in rows]
  plant_common_names = []
  # clean parsed data
  for d in data:
    plant_common_names.append(parse_html(str(d)))

  # write names to json
  with open('data/plant_common_names.json', 'w') as datafile:
    json.dump(plant_common_names, datafile, indent=2, separators=(',', ':'))
Example #41
def print_url(url):
    # Get webpage content
    try:
        r = requests.get(url, timeout=2)
    except requests.exceptions.MissingSchema:
        return print_url("http://{}".format(url))
    except:
        return False

    # Print URL and title
    try:
        html = lxml.html.fromstring(r.text)
        print(url)

        try:  # Some valid webpages don't have titles
            print(html.find(".//title").text.strip())
        except:
            pass

        print('')
        sys.stdout.flush()
        return True

    except:
        return False
Example #42
def CS_sina(code):
    """
    2017-06-16: testing showed problems with this function.
    Fetch the cash flow statement for an individual stock.
    -------
    Return
    """
    DF = pd.DataFrame()
    url = 'http://money.finance.sina.com.cn/corp/go.php/vFD_CashFlow/stockid/{0}/ctrl/part/displaytype/4.phtml'.format(
        code)
    r = requests.get(url)
    r = r.content.decode('gbk')
    html = lxml.html.parse(StringIO(r))
    urls = html.xpath("//div[@id='con02-1']/table[1]//a/@href")

    for url in urls:
        r = requests.get(url)
        r = r.content.decode('gbk')
        html = BeautifulSoup(r, 'lxml')
        text = html.find(id='ProfitStatementNewTable0')
        df = pd.read_html(str(text), header=0)[0]
        df.columns = df.iloc[0, :]
        df = df.drop(0, axis=0)
        df = df.set_index('报表日期')
        df = df.T
        #print(df)
        DF = DF.append(df)

    return DF
Example #43
def speaker():
    """Parse current speaker (predseda) of the chamber."""
    url = 'http://www.nrsr.sk/web/default.aspx?sid=predseda'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    div = html.find(".//div[@id='_sectionLayoutContainer__panelContent']")
    result = {
        'url': url,
        'meno': div.find(".//h1").text_content(),
    }

    image = div.find('.//img')
    if image is not None:
        result['fotka'] = 'http://www.nrsr.sk/web/' + image.get('src')

    born = div.find("div[@class='article']")
    if born is not None:
        result['narodený'] = re.search(r'Narodený: (.*)', born.text_content()).group(1)

    bio = div.find('table')
    if bio is not None:
        result['životopis'] = lxml.html.tostring(bio, encoding='unicode', with_tail=False)

    return scrapeutils.plaintext(result)
Example #44
def get_predict_share_Sina(code, mtype):
    """
    mtype:
         eps--earnings per share,
         sales--operating revenue,
         np--net profit,
         roe--return on equity
    """
    pn = 1
    DF = pd.DataFrame()
    ws._write_head()
    while True:
        try:
            ws._write_console()
            url = 'http://vip.stock.finance.sina.com.cn/q/go.php/vPerformancePrediction/kind/{0}/index.phtml?symbol={1}&p={2}'.format(
                mtype, code, pn)
            r = requests.get(url)
            r = r.content.decode('gbk')
            html = BeautifulSoup(r, 'lxml')
            text = html.find(id='dataTable')
            df = pd.read_html(str(text), header=0)[0]
            if df.empty is True:
                break
            else:
                pn = pn + 1
                DF = DF.append(df)
        except:
            break
    DF = DF.applymap(lambda x: np.where(x == '--', np.nan, x))
    DF = DF.set_index('股票代码')
    DF.index = DF.index.map(lambda x: str(x).split('.')[0].zfill(6))
    return DF
Example #45
def current_term():
    url = 'http://www.nrsr.sk/web/default.aspx?sid=poslanci'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    option = html.find('.//select[@id="_sectionLayoutContainer_ctl01__currentTerm"]/option[@selected]')
    return option.get('value')
Example #46
def get_actor_photo(browser):
    htmla = browser.page.select(
        '#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a'
    )
    t = {}
    for a in htmla:
        if a.text.strip() == '他':
            continue
        p = {a.text.strip(): a['href']}
        t.update(p)
    o = {}
    for k, v in t.items():
        if '/search_act/' not in v:
            continue
        r = browser.open_relative(v)
        if not r.ok:
            continue
        html = browser.page.prettify()
        pos = html.find('.full-bg')
        if pos < 0:
            continue
        css = html[pos:pos + 100]
        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
        if not cssBGjpgs or not len(cssBGjpgs[0]):
            continue
        p = {k: urljoin(browser.url, cssBGjpgs[0])}
        o.update(p)
    return o
Example #47
def get_sector(ticker_old):
    na_count = 0
    ticker = ticker_old.replace('.', '')
    prefix = 'http://finance.yahoo.com/q/pr?s='
    # ticker = 'LMCK'
    response = urllib2.urlopen(prefix + ticker)
    html = response.read()
    response.close()
    # print html

    start_string = 'Sector:</td><td class="yfnc_tabledata1">'
    end_string = 'Full Time Employees'
    start_index = html.find(start_string)
    start_length = len(start_string)
    end_index = html.find(end_string)
    sub_string = html[start_index+start_length: end_index-1]

    if ticker == r'JW/A':
        sector = 'Services'
        industry = 'Publishing - Books'
    elif ticker == 'PGRE':
        sector = 'Financials'
        industry = ' Real Estate Development & Operations - NEC'
    elif start_index == -1 or end_index == -1 or sub_string[0:3] == 'N/A':
        na_count = 1
        sector = 'N/A'
        industry = 'N/A'
    else:
        sub_string = sub_string[sub_string.find('">')+2:]
        # print sub_string
        start_string_2 = 'Industry:</td><td class="yfnc_tabledata1">'
        start_index_2 = sub_string.find(start_string_2)
        start_length_2 = len(start_string_2)
        sub_string_2 = sub_string[(start_index_2+start_length_2):]
        sub_string_2 = sub_string_2[sub_string_2.find('">')+2:]
        # print sub_string

        h = HTMLParser.HTMLParser()
        sector = h.unescape(sub_string[0:sub_string.find('</a>')])
        industry = h.unescape(sub_string_2[0:sub_string_2.find('</a>')])
    print 'sector = %s' % sector
    print 'industry = %s ' % industry

    tmp_df = pd.DataFrame([{'ticker': ticker_old,
                            'sector': sector,
                            'industry': industry}])
    return tmp_df, na_count
Example #48
def scrape_haodoo():
    """
    Main
    """
    skip_stage1 = False
    try:
        print(">>> Stage 1 - Collecting all book urls {}<<<".format(
            datetime.now()))
        if not skip_stage1:
            for url in urls():
                # print(url)
                html = scrape(url)
                parse_books_from_html(html)

                page = 1
                while True:
                    suburl = get_suburl(url, page)
                    #print(suburl)
                    if html.find(urlparse(suburl).query):
                        html = scrape(suburl)
                        if html.find("<strong>404") != -1:
                            break
                        parse_books_from_html(html)
                        page = page + 1
                    else:
                        break

        print(">>> Stage 2 - Analysising all book urls {}<<<".format(
            datetime.now()))
        p = Pool()
        results = p.map(grab_and_analysis, Page.query_all())
        # results = map(grab_and_analysis, Page.query_all())

        print(">>> Stage 3 - Saving results {}<<<".format(
            datetime.now()))
        for volumes in results:
            for volume in volumes:
                save_volume(volume)

        print(">>> State 4 - done {}<<<".format(datetime.now()))

    except Exception as e:
        print("Got exception:")
        print(e)
        print(traceback.format_exc())
Example #49
def estrai(html,search,nrline,offset):
    i=html.find(search)
    if i > 0:
        i=i+len(search)
        j=i+offset
        s = html[i:j]
        return (s)
    else:
        return ''
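estrai ("extract" in Italian) returns a fixed-width slice that follows a marker string; nrline is unused, and because it tests i > 0 rather than i >= 0 it also returns '' when the marker sits at index 0. A quick check:

def estrai(html, search, nrline, offset):
    # Copied from the example above.
    i = html.find(search)
    if i > 0:
        i = i + len(search)
        j = i + offset
        s = html[i:j]
        return (s)
    else:
        return ''

print(estrai('<b>Temp:</b> 21.5 C', 'Temp:</b> ', 0, 4))  # -> 21.5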
Example #50
    def from_url(cls, url, tokenizer):
        data = fetch_url(url)

        # form the lxml tree from the data
        html = lxml.html.fromstring(data)

        # find and store the title in the instance
        title = html.find(".//title").text
        return cls(data, tokenizer, url, title)
Example #51
def main():
    urls = [
        'http://www.haodoo.net/?M=hd&P=wisdom',
        'http://www.haodoo.net/?M=hd&P=history',
        'http://www.haodoo.net/?M=hd&P=martial',
        'http://www.haodoo.net/?M=hd&P=mystery',
        'http://www.haodoo.net/?M=hd&P=romance',
        'http://www.haodoo.net/?M=hd&P=scifi',
        'http://www.haodoo.net/?M=hd&P=fiction',
        ]

    skip_stage1 = False
    try:
        print( ">>> Stage 1 - Collecting all book urls <<<" )
        if not skip_stage1:
            for url in urls:
                html = scraperwiki.scrape(url)
    
                page = 1
                while True:
                    suburl = "{0}-{1}".format( url, page )
                    if html.find( suburl[suburl.find('?'):] ):
                        html = scraperwiki.scrape( suburl )
                        if html.find("<strong>404")!=-1:
                            break
                        parse_books_from_html( html )
                        page = page + 1
                    else:
                        break
        
        print( ">>> Stage 2 - Analysising all book urls <<<" )
        for book in scraperwiki.sqlite.select("* from bookpages"):
            # grab html
            html = scraperwiki.scrape( book['url'] )

            # analysis and store information into book
            analysis_book_html_and_save( book, html )

        print( ">>> State 3 - done <<<" )

    except Exception, e:
        print( "Got exception:" )
        print( e )
        print( traceback.format_exc() )
Example #52
def artverify(art, html='', pdf=''):
    """
    Check whether HTML and PDF documents match abstract text
    Arguments:
        html (str): HTML text (optional)
        pdf (str): PDF text (optional)
    """

    # Cast article to Article
    art = toart(art)

    # Get article info
    info = artinfo({'xml' : art.xml})

    # Quit if no abstract
    if info['abstxt'] is None:
        return None, None

    # Tokenize abstract
    abstxt = info['abstxt']
    abswords = re.split('\s+', abstxt)
    abswords = [word.lower() for word in abswords]

    # Ignore punctuation
    for char in ['.', ',', ';', ':']:
        abswords = [word.strip(char) for word in abswords]

    # Load HTML
    if not html:
        html = loadhtml(art, overwrite=True)
    
    # Load PDF
    if not pdf:
        pdf = loadpdf(art)
        pdf = to_unicode(pdf)

    # To lower-case
    html = html.lower()
    pdf = pdf.lower()

    # Check HTML
    if html:
        htmlwords = [word for word in abswords if html.find(word) > -1]
        htmlprop = float(len(htmlwords)) / len(abswords)
    else:
        htmlprop = None

    # Check PDF
    if pdf:
        pdfwords = [word for word in abswords if pdf.find(word) > -1]
        pdfprop = float(len(pdfwords)) / len(abswords)
    else:
        pdfprop = None

    # Return
    return htmlprop, pdfprop
Example #53
def extract_set_title( html ):
    start_pos = html.find( 'SetTitle("' )
    if start_pos == -1:
        return ("", "")

    start_quote = html.find( '"', start_pos )
    if start_quote == -1:
        return ("", "")

    end_quote = html.find( '"', start_quote+1 )
    if end_quote == -1:
        return ("", "")

    set_title = html[ start_quote+1: end_quote-1 ]
    set_title = set_title.replace( '《', ',' )
    r = set_title.split(',')
    if len(r)!=2:
        return ("", "" )
    return r
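extract_set_title appears to target haodoo pages whose scripts contain SetTitle("Author《Title》"); under that assumption the end_quote-1 slice drops the trailing 》 and the 《 becomes the author/title separator. A one-line check, calling the function defined above:

print(extract_set_title('SetTitle("Author《Title》");'))  # -> ['Author', 'Title']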
Example #54
def resolve_ean(ean):
    page = requests.post(SEARCH_URL, data={"form[q]": ean})

    #Check if something was found
    if "keine Artikel gefunden" in page.text:
        return None

    html = lxml.html.document_fromstring(page.text)
    result = dict()

    result["type"] = html.find('.//li[@class="variant"]').text_content().strip()
    if result["type"] == "Audio CD":
        result["type"] = "audiobook"
        result["author"] = html.find('.//a[@class="author"]').text_content().strip()
        result["artists"] = None
    elif result["type"] == "Gebundenes Buch":
        result["type"] = "book"
        result["author"] = html.find('.//a[@class="author"]').text_content().strip()
        result["artists"] = None
    else:
        result["artists"] = result["author"] = None
        result["type"] = "movie"


    result["title"] = html.find('.//h1[@class="headline"]').text
    attr_field = html.find('.//ul[@class="plain"]')
    attrs = dict()
    for li in attr_field.findall(".//li"):
        data = li.text_content()
        if data:
            title, sep, val = data.partition(":") 
            attrs[title] = val.strip()
    #Extract description
    description_element = html.find('.//div[@class="product-description"]/div[2]/div[1]')

    #Convert brs to nl
    if description_element is not None:
        for br in description_element.xpath(".//br"):
            br.tail = "\n" + br.tail if br.tail else "\n"
        description = description_element.text_content()

        #Strip trailing crap
        result["description"] = description[:description.find("Bonusmaterial")]
    else:
        #Ignore this hit if there is no description
        return None

    try:
        result["duration"] = int(re.search("Gesamtlaufzeit: (\d+) Min.", page.text).group(1))
    except:
        result["duration"] = None

    result["created"] = defNone(attrs.get("Erscheinungstermin"), lambda x: interpDate(x)) 
    result["studio"] = attrs.get("Hersteller")

    result["imgurl"] = html.find('.//img[@class="cover"]').attrib["src"]

    return result 
Example #55
 def xpath_analyzer(self, content_html):
     """Use XPath to extract stats, and return stats dictionary."""
     html = lxml.html.fromstring(content_html)
     stats = dict()
     for stat_name, selector in self.stats_map.iteritems():
         stat_block = html.find(self.xpath_search_str % (selector))
         assert stat_block is not None
         logging.debug('%s: %s', stat_name, stat_block.text_content())
         stats[stat_name] = int(stat_block.text_content().replace(',', ''))
     return stats
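xpath_analyzer depends on the instance's stats_map and xpath_search_str, neither of which is shown. A rough standalone sketch of the same pattern, assuming stats_map maps stat names to element ids and xpath_search_str is something like ".//*[@id='%s']":

import lxml.html

content_html = '<div><span id="followers">1,234</span><span id="repos">56</span></div>'
stats_map = {'followers': 'followers', 'repos': 'repos'}

html = lxml.html.fromstring(content_html)
stats = {}
for stat_name, selector in stats_map.items():
    stat_block = html.find(".//*[@id='%s']" % selector)
    assert stat_block is not None
    stats[stat_name] = int(stat_block.text_content().replace(',', ''))
print(stats)  # -> {'followers': 1234, 'repos': 56}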
Example #56
    def scrape_html_title(self, url):
        """Scrape the ``<title>`` tag contents from the HTML page at *url*.

        Returns a :class:`LinkInfoResult`.
        """
        make_error = partial(LinkInfoResult, url.geturl(), is_error=True)

        # Let's see what's on the other end...
        with closing(simple_http_get(url.geturl(), stream=True)) as r:
            # Only bother with 200 OK
            if r.status_code != requests.codes.ok:
                return make_error("HTTP request failed: {}".format(r.status_code))
            # Only process HTML-ish responses
            if "Content-Type" not in r.headers:
                return make_error("No Content-Type header")
            elif "html" not in r.headers["Content-Type"]:
                return make_error("Content-Type not HTML-ish: {}".format(r.headers["Content-Type"]))
            # Don't try to process massive responses
            if "Content-Length" in r.headers:
                max_size = int(self.config_get("max_response_size"))
                if int(r.headers["Content-Length"]) > max_size:
                    return make_error(
                        "Content-Length too large: {} bytes, >{}".format(
                            r.headers["Content-Length"], self.config_get("max_response_size")
                        )
                    )

            # Get the correct parser
            if "charset=" in r.headers["content-type"]:
                # If present, HTTP Content-Type header charset takes precedence
                parser = lxml.html.HTMLParser(encoding=r.headers["content-type"].rsplit("=", 1)[1])
            else:
                parser = lxml.html.html_parser

            # Get only a chunk, in case Content-Length is absent on massive file
            chunk = next(r.iter_content(int(self.config_get("max_response_size"))))
            # Try to trim chunk to a tag end to help the HTML parser out
            try:
                chunk = chunk[: chunk.rindex(b">") + 1]
            except ValueError:
                pass

            # Attempt to get the <title> tag
            html = lxml.etree.fromstring(chunk, parser)
            title = html.find(".//title")
            if title is None:
                return make_error("failed to find <title>")

            # Normalise title whitespace
            title = " ".join(title.text.strip().split())
            # Build result
            result = LinkInfoResult(url, title, nsfw=url.netloc.endswith(".xxx"))
            # See if the title is redundant, i.e. appears in the URL
            result.is_redundant = self._filter_title_in_url(url, title)
            return result
Example #57
    def parse_page_content(self, page_content, cached_soup={}):
        page_key = hashlib.sha224(page_content.encode("utf-8")).hexdigest()
        try:
            if not cached_soup.has_key(page_key):
                import lxml.html.soupparser

                cached_soup[page_key] = lxml.html.soupparser.fromstring(page_content)
            html = deepcopy(cached_soup[page_key])

            body = html.find(".//body")
            if body is None:
                raise UnknownContentException()
        # for simplicity we decided to use BeatifulSoup parser for now
        #            html = etree.XML(page_content, etree.XMLParser())
        #            body = html.find('{%s}body' % NS['html'])
        #            if body is None:
        #                raise UnknownContentException()
        except (ExpatError, etree.XMLSyntaxError, UnknownContentException):
            raise
        #            logging.warning('Was not valid XHTML; trying with BeautifulSoup')
        #            try:
        #                import lxml.html.soupparser
        #                html = lxml.html.soupparser.fromstring(page_content)
        #                body = html.find('.//body')
        #                if body is None:
        #                    raise
        #            except:
        #                # Give up
        #                logging.error("Giving up on this content")
        #                raise UnknownContentException()
        if self.current_anchor is None:
            return html
        elements_to_remove = []
        start_elem, end_elem = find_bounding_elements(
            body,
            self.previous_anchor,
            None if self.current_anchor["id"] is None else self.current_anchor,
            self.next_anchor,
        )

        within_start_and_end_elem = True if start_elem is None else False
        for elem in body.iterdescendants():
            if elem == start_elem:
                within_start_and_end_elem = True
            elif elem == end_elem:
                within_start_and_end_elem = False
            if not within_start_and_end_elem and start_elem not in elem.iterdescendants():
                elements_to_remove.append(elem)
        for elem in elements_to_remove:
            elem.clear()
            try:
                body.remove(elem)
            except ValueError:
                pass
        return html