def login(self):
    # Check login
    response = self.connection.send(self.LOGIN_POST, data={
        'mobile': self.phone_number,
        'pin': self.pin,
        'serviceId': '19088',
        'originCountryPrefix': '353'})
    html = lxml.html.fromstring(response)
    loginpage = html.find_class("LoginHeader")
    if len(loginpage) > 0:
        # Still on the login page: the login did not go through.
        for s in html.find_class("leftdiv"):
            if "Invalid login. Please try again." in s.text_content():
                raise exceptions.LoginError(msg="Invalid Login Username or Pin",
                                            webtexter=self)
        raise exceptions.LoginError("Unknown Login Error", webtexter=self)
    return True
def calc(phenny, input):
    """Google calculator."""
    if not input.group(2):
        return phenny.reply("Nothing to calculate.")
    q = input.group(2).encode('utf-8')
    q = q.replace('\xcf\x95', 'phi')  # utf-8 U+03D5
    q = q.replace('\xcf\x80', 'pi')   # utf-8 U+03C0
    uri = 'http://www.google.com/search?q='
    bytes = web.get(uri + web.urllib.quote(q))
    html = lxml.html.fromstring(bytes)
    try:
        answer = html.get_element_by_id("cwos").text_content().strip()
    except KeyError:
        try:
            answer = lxml.etree.tostring(html.find_class("vk_ans")[0])
            answer = answer[answer.find('>') + 1:answer.rfind('<')]
        except IndexError:
            answer = None
    if answer:
        answer = web.decode(answer)
        answer = answer.replace(u'\xc2\xa0', ',')
        answer = answer.replace('<sup>', '^(')
        answer = answer.replace('</sup>', ')')
        answer = answer.encode('utf-8')
        phenny.say(answer)
    else:
        phenny.say('Sorry, no result.')
def get_grades(original_html):
    html = lxml.html.fromstring(original_html)
    table_elements = html.find_class("datadisplaytable")[0].getchildren()
    classes_elements = []
    classes_term = []
    classes = {}
    for element in table_elements:
        try:
            term = element.find_class("fieldOrangetextbold")
        except:
            pass
        # Rows with 6 or 9 children are course rows.
        if len(element.getchildren()) in [6, 9]:
            classes_elements.append(element)
            classes_term.append(term)
    for index, element in enumerate(classes_elements):
        text = []
        for subelement in element.getchildren():
            text.append(subelement.text_content())
        classes[index] = {'Name': text[2],
                          'Department': text[0],
                          'Course Number': text[1],
                          'Credits': float(text[3]),
                          'Grade': text[4],
                          'Term': classes_term[index]}
    return classes
def get_total_credits(original_html):
    html = lxml.html.fromstring(original_html)
    credits = 0
    for element in html.find_class("ddlabel"):
        if element.text_content() == 'Overall:':
            credits = float(element.getnext().getchildren()[0].text_content())
    return credits
def extract_springs(page, springFile):
    html = lxml.html.fromstring(page)
    if len(html.find_class('tablesorter')) < 1:
        return 0
    springTable = html.find_class('tablesorter')[0]
    springs = springTable.cssselect('tbody')[0].cssselect('tr')
    for spring in springs:
        line = []
        for cell in spring.getchildren():
            line.append(cell.text_content())
        toWrite = '|'.join(line)
        # Escape embedded newlines so each spring stays on one output line.
        toWrite = 'newline'.join(toWrite.split('\n'))
        toWrite = toWrite + '|\n'
        springFile.write(toWrite)
    return len(springs)
def getNews2Ru(url):
    startTime = printTime("!news2ru: " + url)
    parseTime = startTime  # so the finally block works even if fetching fails early
    data = {}
    data["name"] = url
    data["children"] = []
    try:
        req = urllib2.Request(url)
        req.add_header("User-agent", "Mozilla/5.0")
        response = urllib2.urlopen(req)
        doc = response.read()
        html = lxml.html.document_fromstring(doc)
        parseTime = printTime("!!news2ru: " + url, startTime)
        for index, news_placeholder in enumerate(html.find_class("news_placeholder")):
            id = news_placeholder.get("id").split("_")[1]
            votes = num(news_placeholder.xpath(".//div[@id='vote_num_" + id + "']/a/text()")[0])
            href = "http://news2.ru/story/" + id
            title = news_placeholder.xpath(".//h3[@id='news_title_" + id + "']/a/text()")[0]
            desc = ""
            img_elem = news_placeholder.find(".//img")
            img_elem.set("src", "http://news2.ru" + img_elem.get("src"))
            comm_elem = news_placeholder.xpath(".//div[@class='comments_ico']/a/text()")[0]
            comments = num(comm_elem.split()[0])
            data["children"].append({})
            data["children"][index]["link"] = href
            data["children"][index]["rawvotes"] = votes
            data["children"][index]["description"] = desc
            data["children"][index]["img"] = lxml.etree.tostring(img_elem)
            data["children"][index]["rawcomments"] = comments
            data["children"][index]["title"] = title
            data["children"][-1]["id"] = int(id)
            data["children"][-1]["socialactivity"] = (
                data["children"][index]["rawcomments"] + data["children"][index]["rawvotes"]
            )
            data["children"][-1]["popularity"] = len(data["children"])
        return data
    except IOError as e:
        print "I/O error({0})".format(e)
    finally:
        printTime("#news2ru: " + url, parseTime)
    return data
def scrape():
    return_matches = []
    r = requests.get("http://csgolounge.com")
    if r.status_code != 200:
        return
    html = lxml.html.document_fromstring(r.text)
    matches = html.find_class("matchmain")
    for match in matches:
        try:
            available = True
            if match.find_class("notavailable"):
                available = False
            matchleft = match.find_class("matchleft")[0]
            team_div = matchleft.find_class("team")
            teams = matchleft.find_class("teamtext")
            links = matchleft.iterlinks()
            for l in links:
                # Take the first link; its URL ends in "match?m=<id>".
                id = l[2].strip("match?m=")
                break
            team_1_name = teams[0][0].text_content()
            team_1_odds = teams[0][2].text_content().strip("%")
            team_1_won = len(team_div[0]) > 0
            team_2_name = teams[1][0].text_content()
            team_2_odds = teams[1][2].text_content().strip("%")
            team_2_won = len(team_div[1]) > 0
            # Strip the mojibake non-breaking-space padding around the time.
            when = match.find_class("whenm")[0].text_content()
            when = when.strip('Â\xa0Â\xa0\r\n').strip('Â\xa0Â\xa0a')
            live = "LIVE" in when
            event = match.find_class("eventm")[0].text_content()
            match_details = {
                "available": available,
                "id": id,
                "team_1_name": team_1_name,
                "team_1_odds": team_1_odds,
                "team_1_won": team_1_won,
                "team_2_name": team_2_name,
                "team_2_odds": team_2_odds,
                "team_2_won": team_2_won,
                "when": when,
                "live": live,
                "event": event,
            }
            return_matches.append(match_details)
        except Exception:
            print("Match format weird. Trying next one.")
    return return_matches
def get_string_translation(self, id_):
    """Return the (string, translation) pair for the given string id."""
    try:
        data = self.fetch_url("%s%s" % (self.string_translantion_url, id_))
        html = lxml.html.fromstring(data)
        winner = html.find_class('ot_row_winner')[0]
        string = winner.find_class('ot_string')[0].text_content()
        translation = winner.find_class('ot_translation')[0].text_content()
    except Exception:
        return None, None
    return (unicode(string), unicode(translation))
def parse_doc(self, html_file):
    title = ""
    doc = ""
    html = lxml.html.fromstring(html_file)
    title_el = html.xpath('//title')
    if title_el:
        title = title_el[0].text_content()
    div_el = html.find_class('freestyle-text')
    if div_el:
        doc = div_el[0].text_content()
    return (title, doc)
def get_gpa(original_html):
    html = lxml.html.fromstring(original_html)
    osu_gpa = 0
    transfer_gpa = 0
    for element in html.find_class("ddlabel"):
        # The GPA value sits four table cells to the right of the label.
        if element.text_content() == 'Total Institution:':
            osu_gpa = element.getnext().getnext().getnext().getnext() \
                             .getchildren()[0].text_content()
        if element.text_content() == 'Total Transfer:':
            transfer_gpa = element.getnext().getnext().getnext().getnext() \
                                  .getchildren()[0].text_content()
    return {'osu_gpa': osu_gpa, 'transfer_gpa': transfer_gpa}
def update():
    now_time = time.time()
    supersite = urllib.request.urlopen("http://supersupersupersuper.com")
    html = lxml.html.document_fromstring(supersite.read())
    supernum_elem = html.find_class("supernumber")[0]
    supernum = int(supernum_elem.text_content())
    json_dict = get_cur_dict()
    json_dict[now_time] = supernum
    with open("super.json", "w") as f:
        json.dump(json_dict, f)
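# update() above relies on a get_cur_dict() helper that is not shown; a
# minimal sketch of what it presumably does (hypothetical, inferred from how
# update() reads back and rewrites super.json):
import json
import os

def get_cur_dict():
    # Return the previously saved {timestamp: supernumber} history,
    # or an empty dict on the first run.
    if os.path.exists("super.json"):
        with open("super.json") as f:
            return json.load(f)
    return {}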
def get_current_classes(original_html):
    html = lxml.html.fromstring(original_html)
    classes = []
    total_classes = []
    elements = html.find_class("datadisplaytable")
    # Tables come in pairs: course details followed by meeting times.
    for index in range(0, len(elements), 2):
        classes.append([elements[index], elements[index + 1]])
    for loop_index, each_class in enumerate(classes):
        cl = {}
        title_string = each_class[0].find_class("captiontext")[0].text_content()
        string_components = title_string.split(' - ')
        cl["title"] = string_components[0]
        cl["section"] = string_components[2]
        string_components = string_components[1].split(' ')
        cl["department"] = string_components[0]
        cl["number"] = string_components[1]
        class_elements = []
        for index, element in enumerate(each_class[0].getchildren()[1:9]):
            class_elements.append(element.getchildren()[1].text_content())
        cl["term"] = class_elements[0]
        cl["crn"] = class_elements[1]
        cl["registration"] = class_elements[2]
        cl["instructor"] = class_elements[3].replace('\n', '')
        cl["grading_mode"] = class_elements[4]
        cl["credits"] = float(class_elements[5])
        cl["level"] = class_elements[6]
        cl["campus"] = class_elements[7]
        class_elements = []
        for index, element in enumerate(each_class[1].getchildren()[2].getchildren()):
            class_elements.append(element.text_content())
        cl['class_type'] = class_elements[0]
        cl['times'] = class_elements[1].split(' - ')
        for index, time in enumerate(cl['times']):
            cl['times'][index] = datetime.strptime(time, '%I:%M %p').strftime('%H:%M')
        cl['days'] = list(class_elements[2])
        cl['location'] = {'building': ' '.join(class_elements[3].split(' ')[:-1]),
                          'room': class_elements[3].split(' ')[-1]}
        cl['duration'] = class_elements[4]
        cl['type'] = class_elements[5]
        total_classes.append(cl)
    return total_classes
def get_credits(original_html):
    # Get a dictionary of institution/transfer/total credits.
    html = lxml.html.fromstring(original_html)
    institution_credits = 0
    transfer_credits = 0
    total_credits = 0
    for element in html.find_class("ddlabel"):
        if element.text_content() == 'Total Institution:':
            institution_credits = float(element.getnext().getchildren()[0].text_content())
        if element.text_content() == 'Total Transfer:':
            transfer_credits = float(element.getnext().getchildren()[0].text_content())
        if element.text_content() == 'Overall:':
            total_credits = float(element.getnext().getchildren()[0].text_content())
    credits = {'institution_credits': institution_credits,
               'transfer_credits': transfer_credits,
               'total_credits': total_credits}
    return credits
def getHabrPage(url):
    startTime = printTime("!habr: " + url)
    data = {}
    data["name"] = url
    data["children"] = []
    try:
        req = urllib2.Request(url)
        req.add_header("User-agent", "NovoMapia.com/1.0")
        response = urllib2.urlopen(req)
        doc = response.read()
        html = lxml.html.document_fromstring(doc)
        parseTime = printTime("!!habr: " + url, startTime)
        for index, news in enumerate(html.find_class("post")):
            title = news.xpath(".//a[@class='post_title']/text()")[0]
            comments_elem = news.xpath(".//div[@class='comments']//span[@class='all']/text()")
            if len(comments_elem) > 0:
                comments = num(comments_elem[0].split()[0])
            else:
                comments = 0
            href = news.xpath(".//a[@class='post_title']/@href")[0]
            id = news.get("id").split("_")[1]
            data["children"].append({})
            data["children"][-1]["link"] = href
            data["children"][-1]["socialactivity"] = comments
            data["children"][-1]["popularity"] = len(data["children"])
            data["children"][-1]["description"] = ""
            data["children"][-1]["img"] = ""
            data["children"][-1]["title"] = title
            data["children"][-1]["id"] = num(id)
        return data
    except urllib2.URLError as e:
        print repr(e)
def getMembranaPage(url):
    startTime = printTime("!membrana: " + url)
    parseTime = startTime  # so the finally block works even if fetching fails early
    data = {}
    data["name"] = url
    data["children"] = []
    try:
        req = urllib2.Request(url)
        req.add_header("User-agent", "Mozilla/5.0")
        response = urllib2.urlopen(req)
        doc = response.read()
        doc = unicode(doc, "windows-1251")
        html = lxml.html.document_fromstring(doc)
        parseTime = printTime("!!membrana: " + url, startTime)
        for index, news in enumerate(html.find_class("list-article")):
            title = news.xpath(".//h3/a/text()")[0]
            comments_elem = news.xpath(".//div[@class='extras']/a/text()")
            if len(comments_elem) > 0:
                comments = num(comments_elem[0].split()[0])
            else:
                comments = 0
            href = news.xpath(".//h3/a/@href")[0]
            id = href.split("/")[1]
            data["children"].append({})
            data["children"][-1]["link"] = "http://membrana.ru/" + href
            data["children"][-1]["socialactivity"] = comments
            data["children"][-1]["popularity"] = len(data["children"])
            data["children"][-1]["description"] = ""
            data["children"][-1]["img"] = ""
            data["children"][-1]["title"] = title
            data["children"][-1]["id"] = num(id)
        return data
    except IOError as e:
        print "I/O error({0})".format(e)
    finally:
        printTime("#membrana: " + url, parseTime)
    return data
def drop_classes(original_html, crn_list):
    html = lxml.html.fromstring(original_html)
    course_table = html.find_class("datadisplaytable")[0]
    # Make a list of CRNs in the order they appear on the website, to match
    # them up with the enumerated form values.
    infosu_crns = []
    for element in course_table.find_class('dddefault'):
        try:
            crn = element.getchildren()[1].value
            if crn:
                infosu_crns.append(crn)
        except (IndexError, AttributeError):
            pass
    # With the ordered list of CRNs from the website, build the list of form
    # values to submit for dropping.
    action_id_list = []
    for crn in crn_list:
        for index, infosu_crn in enumerate(infosu_crns):
            if crn == infosu_crn:
                action_id_list.append("action_id" + str(index + 1))  # action_id starts at 1
                break
    # Set each course to drop in the dropdown boxes (from value '' to 'DX').
    for action_id in action_id_list:
        html.get_element_by_id(action_id).value = 'DX'  # set to drop
    form = html.forms[1]
    values = form.form_values()
    values.append(('REG_BTN', 'Submit Changes'))
    return values
def get_locales(self):
    """Generate a language tag/id mapping."""
    print "Check available languages .."
    locales = {}
    langs = {
        "Chinese (China)": "zh-CN",
        "Chinese (Taiwan)": "zh-TW",
        "Czech (Czech Republic)": "cs-CZ",
        "Danish": "da",
        "Dutch": "nl",
        "English": "en",
        "Finnish": "fi",
        "French": "fr",
        "German": "de",
        "Hungarian (Hungary)": "hu-HU",
        "Italian": "it",
        "Japanese (Japan)": "ja-JP",
        "Korean": "ko",
        "Latvian": "lv",
        "Lithuanian": "lt",
        "Norwegian": "no",
        "Polish": "pl",
        "Polish (Poland)": "pl-PL",
        "Portuguese (Brazil)": "pt-BR",
        "Russian": "ru",
        "Spanish": "es",
        "Swedish": "sv",
        "Thai (Thailand)": "th-TH",
        "Turkish (Turkey)": "tr-TR",
        "Welsh": "cy",
    }
    data = self.fetch_url("%s/%s" % (self.BASE_URL, self.project))
    html = lxml.html.fromstring(data)
    for i in html.find_class('cell1'):
        a = i.find('a')
        if a is not None:
            lang = a.text_content()
            # The language id is the last path component of the link.
            code = a.get('href').replace('/', ' ').split().pop()
            if lang in langs:
                locales[langs[lang]] = code
    return locales
def get_grades(original_html):
    html = lxml.html.fromstring(original_html)
    table_elements = html.find_class("datadisplaytable")[0].getchildren()
    classes_elements = []
    classes_term = []
    classes = []
    term = None
    for element in table_elements:
        try:
            term = element.find_class("fieldOrangetextbold")[0].text_content()
        except IndexError:
            pass
        if len(element.getchildren()) in [6, 9]:
            classes_elements.append(element)
            classes_term.append(term)
    for index, element in enumerate(classes_elements):
        text = []
        for subelement in element.getchildren():
            text.append(subelement.text_content())
        term = classes_term[index]
        terms = {'01': 'Fall', '02': 'Winter', '03': 'Spring', '04': 'Winter'}
        try:
            if len(term) > 7:
                term = term.split(': ')[1]
            else:
                # Term codes look like YYYYTT, e.g. "201203" -> "Spring 2012".
                term = ' '.join([terms[term[4:6]], term[0:4]])
        except:
            pass
        classes.append({'title': text[2],
                        'department': text[0],
                        'number': text[1],
                        'credits': float(text[3]),
                        'grade': text[4],
                        'term': term})
    return classes
"Amphetamin", "Amphetami", "Coffein", "MDDMA", "4-Fa", "Buflomedil", "Amoxicillin", "m-CCP", "4-F-A", ]) if __name__ == "__main__": results = [] html = html.fromstring(open("tmp/page.html").read()) names = map(lambda _: _.text_content(), html.get_element_by_id("cc").findall("h2")) for i, tag in enumerate(html.find_class("inhalt_pillen")): infos = dict(name=names[i]) images = map(lambda _: _.get("src"), tag.findall("img")) for image in images: image_filename = os.path.join("tmp/pictures", os.path.basename(image)) if not os.path.isfile(image_filename): r = requests.get("http://www.mindzone.info/%s" % (image), stream=True) with open(image_filename, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: # filter keep-alive new chunks f.write(chunk) f.flush() infos["images"] = map(lambda _: os.path.basename(_), images) if not infos["images"]: continue for line in etree.tostring(tag).split("\n"): tag_idx = line.find("<strong>")
import lxml
get_ipython().magic(u'pwd ')
f = open("alex.0.html")
import lxml.html
html = lxml.html.parse(f)
html
html.tostring()
lxml.html.tostring(html)
help(lxml.html.find_class)
lxml.html.find_class(html, "biz_info")
html = lxml.html.fromstring("".join(f.readlines()))
html = lxml.html.fromstring("".join(f.readlines()))
f.close()
f = open("alex.0.html")
html = lxml.html.fromstring("".join(f.readlines()))
html.find_class("biz_info")
[elem.text_content() for elem in html.find_class("biz_info")]
[elem.cssselect("a").text_content() for elem in html.find_class("biz_info")]
[elem.cssselect("a")[0].text_content() for elem in html.find_class("biz_info")]
html.cssselect("div.review div.rating_info img")
[elem.get(alt) for elem in html.cssselect("div.review div.rating_info img")]
[elem.get("alt") for elem in html.cssselect("div.review div.rating_info img")]
reviews = html.cssselect("div.review")
reviews
reviews[0].cssselect("img")
reviews[0].cssselect("div.biz_info a")
reviews[0].cssselect("div.biz_info h4 a")
reviews[0].cssselect("div.biz_info h4 a")[0].get("href")
reviews[0].cssselect("div.rating img")[0].get("alt")
ratings = {}
for review in reviews:
combined_name = os.path.join(os.path.dirname(directory_name), 'dealer.com.json')
dealers = []
for file_name in file_names:
    with open(file_name, 'r') as fd:
        url = os.path.splitext(os.path.basename(file_name))[0]
        text = fd.read()
        if 'static.dealer.com' not in text:
            print 'Dealer %s has moved on ...' % url
            continue
        try:
            html = lxml.html.document_fromstring(text)
            data = {}
            data['url'] = url
            for vcard in html.find_class('vcard'):
                data['name'] = get_text(vcard, 'org')
                data['tels'] = []
                for tel_el in vcard.find_class('tel'):
                    tel = {}
                    tel['value'] = get_text(tel_el, 'value')
                    tel['type'] = get_text(tel_el, 'type')
                    data['tels'].append(tel)
                for adr in vcard.find_class('adr'):
                    data['address'] = {}
                    data['address']['street_address'] = get_text(adr, 'street-address')
                    data['address']['locality'] = get_text(adr, 'locality')
                    data['address']['region'] = get_text(adr, 'region')
                    data['address']['postal_code'] = get_text(adr, 'postal-code')
                    break
                for geo in vcard.find_class('geo'):
def add_class_has_errors(original_html):
    html = lxml.html.fromstring(original_html)
    if html.find_class("errortext"):
        return False
    return True
try:
    html = lxml.html.document_fromstring(text)
    data = {'url': url, 'address': {}, 'geo': {}, 'departments': []}
    for meta in html.iter('meta'):
        name = meta.get('name')
        content = meta.get('content')
        if name is not None:
            if name == 'geo.position':
                lat, lng = content.split(',')
                data['geo']['latitude'] = lat
                data['geo']['longitude'] = lng
            elif name == 'geo.placement':
                data['address']['addressLocality'] = content
            elif name == 'geo.region':
                data['address']['addressRegion'] = content
    for div in html.find_class('hours-page'):
        for span in div.iter('span'):
            itemprop = span.get('itemprop')
            content = span.text_content()
            if itemprop is not None and len(itemprop) > 0 and len(content) > 0:
                if itemprop == 'name':
                    if itemprop not in data:
                        data[itemprop] = content
                elif itemprop == 'email':
                    data[itemprop] = content
                elif itemprop == 'telephone':
                    data[itemprop] = content
                elif itemprop == 'streetAddress':
                    data['address'][itemprop] = content
                elif itemprop == 'postalCode':
                    data['address'][itemprop] = content
def by_class(html, class_name):
    return (html.find_class(class_name)[0]
            .text_content()
            .replace("\n", " ")
            .replace("\t", " "))
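# A minimal usage sketch for the by_class helper above; the inline HTML and
# the "price" class name are hypothetical, for illustration only.
import lxml.html

doc = lxml.html.fromstring('<div class="price">\n\t$9.99\n</div>')
print(by_class(doc, "price"))  # newlines/tabs flattened to spaces: "  $9.99 "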
    print('\t[' + str(i) + ']: ' + cat.text.split('\n')[0])
    cat_list = cat.find_element_by_xpath('following-sibling::*')
    category_html[i] = lxml.html.fromstring(cat_list.get_attribute('innerHTML'))
    category_html[i].make_links_absolute(base_url)

# Prompt user for category selection
input_numbers = input("Enter desired category numbers, separated by spaces. (Leave blank for all): ")
input_numbers = list(map(lambda x: int(x), input_numbers.split()))
if len(input_numbers) == 0:
    input_numbers = range(0, len(categories))

# Parse code solutions, print to stdout
for i in input_numbers:
    print("Scraping category " + str(i))
    html = category_html[i]
    for prob in html.find_class("problemlink"):
        href = prob.get('href')
        browser.get(href)
        print(href.split('/').pop())
        try:
            # could make this a randomized wait if we feel like Stuart Reges
            # is on to us
            solution = browser.find_element_by_id("solution")
            soln_text = py_html.unescape(solution.get_attribute('innerHTML'))
            print(soln_text)
        except:
            print("No code solution found")
        print()
        browser.back()
print("Done scraping")
def get_info_on_page(html):
    info_list = []
    lis = html.find_class('items items_V clearfix')
    for li in lis:
        info_list.append(get_info_from_li(li))
    return info_list
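# Note on the multi-token class name above: lxml's find_class() does a
# substring match on the normalized class attribute, so 'items items_V
# clearfix' only matches elements whose class attribute contains exactly that
# run of tokens, in that order. A minimal sketch with hypothetical markup:
import lxml.html

frag = lxml.html.fromstring(
    '<ul>'
    '<li class="items items_V clearfix">a</li>'   # matched
    '<li class="clearfix items items_V">b</li>'   # not matched: tokens reordered
    '</ul>')
print([li.text for li in frag.find_class('items items_V clearfix')])  # ['a']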
#!/usr/bin/env python
import glob  # to find files
import lxml.html
import csv  # to write file

repo_dir = "/home/ubuntu/dc_mayor_schedule"

rows = list()
for filename in sorted(glob.glob(repo_dir + "/html/*.html")):
    tree = lxml.html.parse(filename)
    html = tree.iter().next()
    table = html.find_class('views-table')[0]
    for tr in table.iter('tr'):
        tds = list()
        for td in tr.iter('td'):
            tds.append(td)
        span = tds[0].iter('span').next()
        datetime = span.values()[3].encode("utf-8")
        strong = tds[1].iter('strong').next()
        event = strong.text_content().encode("utf-8")
        div = tds[1].iter('div').next()
        venue = div.text_content().encode("utf-8")
        p = tds[1].iter('p').next()
        comment = p.text_content().encode("utf-8")
        rows.append([datetime, event, venue, comment])

# We're about to reverse everything
rows.append(["datetime", "event", "venue", "comment"])
combined_name = os.path.join(os.path.dirname(directory_name), 'dealerfire.json')
dealers = []
for file_name in file_names:
    with open(file_name, 'r') as fd:
        url = os.path.splitext(os.path.basename(file_name))[0]
        text = fd.read()
        if '.dealerfire.com' not in text:
            print 'Dealer %s has moved on ...' % url
            continue
        try:
            html = lxml.html.document_fromstring(text)
            data = {'url': url, 'address': {}, 'geo': {}, 'departments': []}
            suck = True
            for info in html.find_class('dealer-info'):
                suck = False
                fail = True
                for meta in info.iter('meta'):
                    itemprop = meta.get('itemprop')
                    content = meta.get('content')
                    if itemprop is not None and len(itemprop) > 0 and len(content) > 0:
                        fail = False
                        if itemprop == 'name':
                            if itemprop not in data:
                                data[itemprop] = content
                        elif itemprop == 'email':
                            data[itemprop] = content
                        elif itemprop == 'telephone':
                            data[itemprop] = content
                        elif itemprop == 'streetAddress':
def parseContents(contents):
    global totalWords
    global nextURL
    html = lxml.html.fromstring(contents)
    elems = html.find_class('leading')
    for elem in elems:
        links = elem.findall('.//a')
        title = links[0].text
        url = links[0].get('href')
        m = re.search('\/([0-9]+)\-', url)
        articleId = m.group(1)
        published = elem.find_class('published')[0].text
        m = re.search(
            '(([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{4})) ([0-9]{1,2}:[0-9]{1,2})',
            published)
        day = m.group(2)
        month = m.group(3)
        year = m.group(4)
        date = m.group(1)
        time = m.group(5)  # the HH:MM part, not the day
        print("%s: %s, %s @ %s" % (articleId, url, date, time))
        paragraphs = elem.findall('.//p')
        text = ""
        articleWords = 0
        for paragraph in paragraphs:
            if paragraph.text:
                if len(text) > 0:
                    text += u"\n\n"
                text += paragraph.text
                articleWords += paragraph.text.count(' ')
        totalWords += articleWords
        print(" %d words in this article (%d total)" % (articleWords, totalWords))
        print("")
        makeDirectory("Mystem-texts")
        makeDirectory("texts")
        makeDirectory("texts/" + year)
        makeDirectory("texts/" + year + "/" + month)
        text_file = open("texts/" + year + "/" + month + "/" + articleId + '.txt', 'wb')
        text_file.write(text.encode("UTF-8"))
        mystfile = open("Mystem-texts/" + articleId + '.txt', 'w')
        lemmas = mys.lemmatize(text)
        for item in lemmas:
            mystfile.write(item)
        text_file.close()
        mystfile.close()
    nextLink = html.find_class("pagination-next")[0].find('.//a')
    if nextLink is not None:
        nextURL = nextLink.get('href')
    else:
        nextURL = None
sock = urllib.urlopen("http://www.weather.com/weather/hourbyhour/graph/USNY0996")
wsource = sock.read()
sock.close()
html = lxml.html.fromstring(wsource)

# part 1: wx-timepart
# part 2: wx-conditions (image + description)
# part 3: temperature
# part 4: details
now = datetime.datetime.now().strftime('%H:%M:%S')
print 'now=%s' % now

temp = []
elements = html.find_class("wx-temp")
for i in range(len(elements)):
    temp.append(float(elements[i].text_content()[1:3]))
print temp

time1 = []
elements = html.find_class("wx-time")
for i in range(len(elements)):
    time1.append(elements[i].text_content())
print time1

res = []
sys.exit(2)
def search(self, html):
    results = list()
    org_element_list = html.find_class('org_full_box')
    if org_element_list:
        org_element = org_element_list.pop()
        name = org_element.find_class('org_header')[0].text_content()
        name = ' '.join(name.split())
        org_info = org_element.find_class('orginfo')
        # Map the Russian row labels to result fields:
        # Телефоны = phones, Адрес = address, Метро = metro station,
        # Район = district, Сайт = website, Время работы = working hours,
        # Рубрики = categories, Услуги и товары = services and goods.
        labels = {
            u'Телефоны:': 'phone',
            u'Адрес:': 'address',
            u'Метро:': 'metro',
            u'Район:': 'area',
            u'Сайт:': 'site',
            u'E-mail:': 'email',
            u'Время работы:': 'time',
            u'Рубрики:': 'category',
            u'Услуги и товары:': 'services',
        }
        fields = dict((field, '') for field in labels.values())
        for tr in org_info[0]:
            label = tr[0].text_content()
            if label in labels:
                # Collapse runs of whitespace to single spaces.
                fields[labels[label]] = ' '.join(tr[1].text_content().split())
        results = [[name, fields['phone'], fields['address'], fields['metro'],
                    fields['area'], fields['site'], fields['email'],
                    fields['time'], fields['category'], fields['services']]]
    return results