def login(self):
    # Check login
    response = self.connection.send(self.LOGIN_POST, data={
        'mobile': self.phone_number,
        'pin': self.pin,
        'serviceId': '19088',
        'originCountryPrefix': '353'})
    html = lxml.html.fromstring(response)
    loginpage = html.find_class("LoginHeader")
    if len(loginpage) > 0:
        # Still on the login page: the login did not go through.
        for s in html.find_class("leftdiv"):
            if "Invalid login. Please try again." in s.text_content():
                raise exceptions.LoginError(msg="Invalid Login Username or Pin",
                                            webtexter=self)
        raise exceptions.LoginError("Unknown Login Error", webtexter=self)
    return True
def calc(phenny, input):
    """Google calculator."""
    if not input.group(2):
        return phenny.reply("Nothing to calculate.")
    q = input.group(2).encode('utf-8')
    q = q.replace('\xcf\x95', 'phi')  # utf-8 U+03D5
    q = q.replace('\xcf\x80', 'pi')   # utf-8 U+03C0
    uri = 'http://www.google.com/search?q='
    bytes = web.get(uri + web.urllib.quote(q))
    html = lxml.html.fromstring(bytes)
    try:
        answer = html.get_element_by_id("cwos").text_content().strip()
    except KeyError:
        try:
            answer = lxml.etree.tostring(html.find_class("vk_ans")[0])
            answer = answer[answer.find('>') + 1:answer.rfind('<')]
        except IndexError:
            answer = None
    if answer:
        answer = web.decode(answer)
        answer = answer.replace(u'\xc2\xa0', ',')
        answer = answer.replace('<sup>', '^(')
        answer = answer.replace('</sup>', ')')
        answer = answer.encode('utf-8')
        phenny.say(answer)
    else:
        phenny.say('Sorry, no result.')
def get_grades(original_html):
    html = lxml.html.fromstring(original_html)
    table_elements = html.find_class("datadisplaytable")[0].getchildren()
    classes_elements = []
    classes_term = []
    classes = {}
    for element in table_elements:
        try:
            term = element.find_class("fieldOrangetextbold")
        except:
            pass
        # Rows with 6 or 9 children are course rows.
        if len(element.getchildren()) in [6, 9]:
            classes_elements.append(element)
            classes_term.append(term)
    for index, element in enumerate(classes_elements):
        text = []
        for subelement in element.getchildren():
            text.append(subelement.text_content())
        classes[index] = {'Name': text[2],
                          'Department': text[0],
                          'Course Number': text[1],
                          'Credits': float(text[3]),
                          'Grade': text[4],
                          'Term': classes_term[index]}
    return classes
def get_total_credits(original_html):
    html = lxml.html.fromstring(original_html)
    credits = 0
    for element in html.find_class("ddlabel"):
        if element.text_content() == 'Overall:':
            credits = float(element.getnext().getchildren()[0].text_content())
    return credits
def extract_springs(page, springFile):
    html = lxml.html.fromstring(page)
    if len(html.find_class('tablesorter')) < 1:
        return 0
    springTable = html.find_class('tablesorter')[0]
    springs = springTable.cssselect('tbody')[0].cssselect('tr')
    for spring in springs:
        line = []
        for cell in spring.getchildren():
            line.append(cell.text_content())
        toWrite = '|'.join(line)
        # Escape embedded newlines so each spring stays on one output line.
        toWrite = 'newline'.join(toWrite.split('\n'))
        toWrite = toWrite + '|\n'
        springFile.write(toWrite)
    return len(springs)
def getNews2Ru(url):
    startTime = printTime("!news2ru: " + url)
    parseTime = startTime  # so the finally block works even if fetching fails early
    data = {}
    data["name"] = url
    data["children"] = []
    try:
        req = urllib2.Request(url)
        req.add_header("User-agent", "Mozilla/5.0")
        response = urllib2.urlopen(req)
        doc = response.read()
        html = lxml.html.document_fromstring(doc)
        parseTime = printTime("!!news2ru: " + url, startTime)
        for index, news_placeholder in enumerate(html.find_class("news_placeholder")):
            id = news_placeholder.get("id").split("_")[1]
            votes = num(news_placeholder.xpath(".//div[@id='vote_num_" + id + "']/a/text()")[0])
            href = "http://news2.ru/story/" + id
            title = news_placeholder.xpath(".//h3[@id='news_title_" + id + "']/a/text()")[0]
            desc = ""
            img_elem = news_placeholder.find(".//img")
            img_elem.set("src", "http://news2.ru" + img_elem.get("src"))
            comm_elem = news_placeholder.xpath(".//div[@class='comments_ico']/a/text()")[0]
            comments = num(comm_elem.split()[0])
            data["children"].append({})
            data["children"][index]["link"] = href
            data["children"][index]["rawvotes"] = votes
            data["children"][index]["description"] = desc
            data["children"][index]["img"] = lxml.etree.tostring(img_elem)
            data["children"][index]["rawcomments"] = comments
            data["children"][index]["title"] = title
            data["children"][-1]["id"] = int(id)
            data["children"][-1]["socialactivity"] = (
                data["children"][index]["rawcomments"] + data["children"][index]["rawvotes"]
            )
            data["children"][-1]["popularity"] = len(data["children"])
        return data
    except IOError as e:
        print "I/O error({0})".format(e)
    finally:
        printTime("#news2ru: " + url, parseTime)
    return data
def scrape():
    return_matches = []
    r = requests.get("http://csgolounge.com")
    if r.status_code != 200:
        return
    html = lxml.html.document_fromstring(r.text)
    matches = html.find_class("matchmain")
    for match in matches:
        try:
            available = True
            if match.find_class("notavailable"):
                available = False
            matchleft = match.find_class("matchleft")[0]
            team_div = matchleft.find_class("team")
            teams = matchleft.find_class("teamtext")
            links = matchleft.iterlinks()
            for l in links:
                # Take the first link; its URL ends in "match?m=<id>".
                id = l[2].strip("match?m=")
                break
            team_1_name = teams[0][0].text_content()
            team_1_odds = teams[0][2].text_content().strip("%")
            team_1_won = len(team_div[0]) > 0
            team_2_name = teams[1][0].text_content()
            team_2_odds = teams[1][2].text_content().strip("%")
            team_2_won = len(team_div[1]) > 0
            # Strip the mojibake non-breaking-space padding around the time.
            when = match.find_class("whenm")[0].text_content()
            when = when.strip('Â\xa0Â\xa0\r\n').strip('Â\xa0Â\xa0a')
            live = "LIVE" in when
            event = match.find_class("eventm")[0].text_content()
            match_details = {
                "available": available,
                "id": id,
                "team_1_name": team_1_name,
                "team_1_odds": team_1_odds,
                "team_1_won": team_1_won,
                "team_2_name": team_2_name,
                "team_2_odds": team_2_odds,
                "team_2_won": team_2_won,
                "when": when,
                "live": live,
                "event": event,
            }
            return_matches.append(match_details)
        except Exception:
            print("Match format weird. Trying next one.")
    return return_matches
def get_string_translation(self, id_):
    """Return the (string, translation) pair for the given string id."""
    try:
        data = self.fetch_url("%s%s" % (self.string_translantion_url, id_))
        html = lxml.html.fromstring(data)
        winner = html.find_class('ot_row_winner')[0]
        string = winner.find_class('ot_string')[0].text_content()
        translation = winner.find_class('ot_translation')[0].text_content()
    except Exception:
        return None, None
    return (unicode(string), unicode(translation))
def parse_doc(self, html_file):
    title = ""
    doc = ""
    html = lxml.html.fromstring(html_file)
    title_el = html.xpath('//title')
    if title_el:
        title = title_el[0].text_content()
    div_el = html.find_class('freestyle-text')
    if div_el:
        doc = div_el[0].text_content()
    return (title, doc)
def get_gpa(original_html):
    html = lxml.html.fromstring(original_html)
    osu_gpa = 0
    transfer_gpa = 0
    for element in html.find_class("ddlabel"):
        # The GPA value sits four table cells to the right of the label.
        if element.text_content() == 'Total Institution:':
            osu_gpa = element.getnext().getnext().getnext().getnext() \
                             .getchildren()[0].text_content()
        if element.text_content() == 'Total Transfer:':
            transfer_gpa = element.getnext().getnext().getnext().getnext() \
                                  .getchildren()[0].text_content()
    return {'osu_gpa': osu_gpa, 'transfer_gpa': transfer_gpa}
def update():
    now_time = time.time()
    supersite = urllib.request.urlopen("http://supersupersupersuper.com")
    html = lxml.html.document_fromstring(supersite.read())
    supernum_elem = html.find_class("supernumber")[0]
    supernum = int(supernum_elem.text_content())
    json_dict = get_cur_dict()
    json_dict[now_time] = supernum
    with open("super.json", "w") as f:
        json.dump(json_dict, f)
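# update() above relies on a get_cur_dict() helper that is not shown; a
# minimal sketch of what it presumably does (hypothetical, inferred from how
# update() reads back and rewrites super.json):
import json
import os

def get_cur_dict():
    # Return the previously saved {timestamp: supernumber} history,
    # or an empty dict on the first run.
    if os.path.exists("super.json"):
        with open("super.json") as f:
            return json.load(f)
    return {}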
def get_current_classes(original_html):
    html = lxml.html.fromstring(original_html)
    classes = []
    total_classes = []
    elements = html.find_class("datadisplaytable")
    # Tables come in pairs: course details followed by meeting times.
    for index in range(0, len(elements), 2):
        classes.append([elements[index], elements[index + 1]])
    for loop_index, each_class in enumerate(classes):
        cl = {}
        title_string = each_class[0].find_class("captiontext")[0].text_content()
        string_components = title_string.split(' - ')
        cl["title"] = string_components[0]
        cl["section"] = string_components[2]
        string_components = string_components[1].split(' ')
        cl["department"] = string_components[0]
        cl["number"] = string_components[1]
        class_elements = []
        for index, element in enumerate(each_class[0].getchildren()[1:9]):
            class_elements.append(element.getchildren()[1].text_content())
        cl["term"] = class_elements[0]
        cl["crn"] = class_elements[1]
        cl["registration"] = class_elements[2]
        cl["instructor"] = class_elements[3].replace('\n', '')
        cl["grading_mode"] = class_elements[4]
        cl["credits"] = float(class_elements[5])
        cl["level"] = class_elements[6]
        cl["campus"] = class_elements[7]
        class_elements = []
        for index, element in enumerate(each_class[1].getchildren()[2].getchildren()):
            class_elements.append(element.text_content())
        cl['class_type'] = class_elements[0]
        cl['times'] = class_elements[1].split(' - ')
        for index, time in enumerate(cl['times']):
            cl['times'][index] = datetime.strptime(time, '%I:%M %p').strftime('%H:%M')
        cl['days'] = list(class_elements[2])
        cl['location'] = {'building': ' '.join(class_elements[3].split(' ')[:-1]),
                          'room': class_elements[3].split(' ')[-1]}
        cl['duration'] = class_elements[4]
        cl['type'] = class_elements[5]
        total_classes.append(cl)
    return total_classes
def get_credits(original_html):
    # Get a dictionary of institution/transfer/total credits.
    html = lxml.html.fromstring(original_html)
    institution_credits = 0
    transfer_credits = 0
    total_credits = 0
    for element in html.find_class("ddlabel"):
        if element.text_content() == 'Total Institution:':
            institution_credits = float(element.getnext().getchildren()[0].text_content())
        if element.text_content() == 'Total Transfer:':
            transfer_credits = float(element.getnext().getchildren()[0].text_content())
        if element.text_content() == 'Overall:':
            total_credits = float(element.getnext().getchildren()[0].text_content())
    credits = {'institution_credits': institution_credits,
               'transfer_credits': transfer_credits,
               'total_credits': total_credits}
    return credits
def getHabrPage(url):
    startTime = printTime("!habr: " + url)
    data = {}
    data["name"] = url
    data["children"] = []
    try:
        req = urllib2.Request(url)
        req.add_header("User-agent", "NovoMapia.com/1.0")
        response = urllib2.urlopen(req)
        doc = response.read()
        html = lxml.html.document_fromstring(doc)
        parseTime = printTime("!!habr: " + url, startTime)
        for index, news in enumerate(html.find_class("post")):
            title = news.xpath(".//a[@class='post_title']/text()")[0]
            comments_elem = news.xpath(".//div[@class='comments']//span[@class='all']/text()")
            if len(comments_elem) > 0:
                comments = num(comments_elem[0].split()[0])
            else:
                comments = 0
            href = news.xpath(".//a[@class='post_title']/@href")[0]
            id = news.get("id").split("_")[1]
            data["children"].append({})
            data["children"][-1]["link"] = href
            data["children"][-1]["socialactivity"] = comments
            data["children"][-1]["popularity"] = len(data["children"])
            data["children"][-1]["description"] = ""
            data["children"][-1]["img"] = ""
            data["children"][-1]["title"] = title
            data["children"][-1]["id"] = num(id)
        return data
    except urllib2.URLError as e:
        print repr(e)
def getMembranaPage(url):
    startTime = printTime("!membrana: " + url)
    parseTime = startTime  # so the finally block works even if fetching fails early
    data = {}
    data["name"] = url
    data["children"] = []
    try:
        req = urllib2.Request(url)
        req.add_header("User-agent", "Mozilla/5.0")
        response = urllib2.urlopen(req)
        doc = response.read()
        doc = unicode(doc, "windows-1251")
        html = lxml.html.document_fromstring(doc)
        parseTime = printTime("!!membrana: " + url, startTime)
        for index, news in enumerate(html.find_class("list-article")):
            title = news.xpath(".//h3/a/text()")[0]
            comments_elem = news.xpath(".//div[@class='extras']/a/text()")
            if len(comments_elem) > 0:
                comments = num(comments_elem[0].split()[0])
            else:
                comments = 0
            href = news.xpath(".//h3/a/@href")[0]
            id = href.split("/")[1]
            data["children"].append({})
            data["children"][-1]["link"] = "http://membrana.ru/" + href
            data["children"][-1]["socialactivity"] = comments
            data["children"][-1]["popularity"] = len(data["children"])
            data["children"][-1]["description"] = ""
            data["children"][-1]["img"] = ""
            data["children"][-1]["title"] = title
            data["children"][-1]["id"] = num(id)
        return data
    except IOError as e:
        print "I/O error({0})".format(e)
    finally:
        printTime("#membrana: " + url, parseTime)
    return data
def drop_classes(original_html, crn_list):
    html = lxml.html.fromstring(original_html)
    course_table = html.find_class("datadisplaytable")[0]
    # Make a list of CRNs in the order they appear on the website, to match
    # them up with the enumerated form values.
    infosu_crns = []
    for element in course_table.find_class('dddefault'):
        try:
            crn = element.getchildren()[1].value
            if crn:
                infosu_crns.append(crn)
        except (IndexError, AttributeError):
            pass
    # With the ordered list of CRNs from the website, build the list of form
    # values to submit for dropping.
    action_id_list = []
    for crn in crn_list:
        for index, infosu_crn in enumerate(infosu_crns):
            if crn == infosu_crn:
                action_id_list.append("action_id" + str(index + 1))  # action_id starts at 1
                break
    # Set each course to drop in the dropdown boxes (from value '' to 'DX').
    for action_id in action_id_list:
        html.get_element_by_id(action_id).value = 'DX'  # set to drop
    form = html.forms[1]
    values = form.form_values()
    values.append(('REG_BTN', 'Submit Changes'))
    return values
def get_locales(self):
    """Generate a language tag/id mapping."""
    print "Check available languages .."
    locales = {}
    langs = {
        "Chinese (China)": "zh-CN",
        "Chinese (Taiwan)": "zh-TW",
        "Czech (Czech Republic)": "cs-CZ",
        "Danish": "da",
        "Dutch": "nl",
        "English": "en",
        "Finnish": "fi",
        "French": "fr",
        "German": "de",
        "Hungarian (Hungary)": "hu-HU",
        "Italian": "it",
        "Japanese (Japan)": "ja-JP",
        "Korean": "ko",
        "Latvian": "lv",
        "Lithuanian": "lt",
        "Norwegian": "no",
        "Polish": "pl",
        "Polish (Poland)": "pl-PL",
        "Portuguese (Brazil)": "pt-BR",
        "Russian": "ru",
        "Spanish": "es",
        "Swedish": "sv",
        "Thai (Thailand)": "th-TH",
        "Turkish (Turkey)": "tr-TR",
        "Welsh": "cy",
    }
    data = self.fetch_url("%s/%s" % (self.BASE_URL, self.project))
    html = lxml.html.fromstring(data)
    for i in html.find_class('cell1'):
        a = i.find('a')
        if a is not None:
            lang = a.text_content()
            # The language id is the last path component of the link.
            code = a.get('href').replace('/', ' ').split().pop()
            if lang in langs:
                locales[langs[lang]] = code
    return locales
def get_grades(original_html):
    html = lxml.html.fromstring(original_html)
    table_elements = html.find_class("datadisplaytable")[0].getchildren()
    classes_elements = []
    classes_term = []
    classes = []
    term = None
    for element in table_elements:
        try:
            term = element.find_class("fieldOrangetextbold")[0].text_content()
        except IndexError:
            pass
        if len(element.getchildren()) in [6, 9]:
            classes_elements.append(element)
            classes_term.append(term)
    for index, element in enumerate(classes_elements):
        text = []
        for subelement in element.getchildren():
            text.append(subelement.text_content())
        term = classes_term[index]
        terms = {'01': 'Fall', '02': 'Winter', '03': 'Spring', '04': 'Winter'}
        try:
            if len(term) > 7:
                term = term.split(': ')[1]
            else:
                # Term codes look like YYYYTT, e.g. "201203" -> "Spring 2012".
                term = ' '.join([terms[term[4:6]], term[0:4]])
        except:
            pass
        classes.append({'title': text[2],
                        'department': text[0],
                        'number': text[1],
                        'credits': float(text[3]),
                        'grade': text[4],
                        'term': term})
    return classes
"Amphetamin", "Amphetami", "Coffein", "MDDMA", "4-Fa", "Buflomedil", "Amoxicillin", "m-CCP", "4-F-A", ]) if __name__ == "__main__": results = [] html = html.fromstring(open("tmp/page.html").read()) names = map(lambda _: _.text_content(), html.get_element_by_id("cc").findall("h2")) for i, tag in enumerate(html.find_class("inhalt_pillen")): infos = dict(name=names[i]) images = map(lambda _: _.get("src"), tag.findall("img")) for image in images: image_filename = os.path.join("tmp/pictures", os.path.basename(image)) if not os.path.isfile(image_filename): r = requests.get("http://www.mindzone.info/%s" % (image), stream=True) with open(image_filename, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: # filter keep-alive new chunks f.write(chunk) f.flush() infos["images"] = map(lambda _: os.path.basename(_), images) if not infos["images"]: continue for line in etree.tostring(tag).split("\n"): tag_idx = line.find("<strong>")
import lxml
get_ipython().magic(u'pwd ')
f = open("alex.0.html")
import lxml.html
html = lxml.html.parse(f)
html
html.tostring()
lxml.html.tostring(html)
help(lxml.html.find_class)
lxml.html.find_class(html, "biz_info")
html = lxml.html.fromstring("".join(f.readlines()))
html = lxml.html.fromstring("".join(f.readlines()))
f.close()
f = open("alex.0.html")
html = lxml.html.fromstring("".join(f.readlines()))
html.find_class("biz_info")
[elem.text_content() for elem in html.find_class("biz_info")]
[elem.cssselect("a").text_content() for elem in html.find_class("biz_info")]
[elem.cssselect("a")[0].text_content() for elem in html.find_class("biz_info")]
html.cssselect("div.review div.rating_info img")
[elem.get(alt) for elem in html.cssselect("div.review div.rating_info img")]
[elem.get("alt") for elem in html.cssselect("div.review div.rating_info img")]
reviews = html.cssselect("div.review")
reviews
reviews[0].cssselect("img")
reviews[0].cssselect("div.biz_info a")
reviews[0].cssselect("div.biz_info h4 a")
reviews[0].cssselect("div.biz_info h4 a")[0].get("href")
reviews[0].cssselect("div.rating img")[0].get("alt")
ratings = {}
for review in reviews:
combined_name = os.path.join(os.path.dirname(directory_name), 'dealer.com.json')
dealers = []
for file_name in file_names:
    with open(file_name, 'r') as fd:
        url = os.path.splitext(os.path.basename(file_name))[0]
        text = fd.read()
        if 'static.dealer.com' not in text:
            print 'Dealer %s has moved on ...' % url
            continue
        try:
            html = lxml.html.document_fromstring(text)
            data = {}
            data['url'] = url
            for vcard in html.find_class('vcard'):
                data['name'] = get_text(vcard, 'org')
                data['tels'] = []
                for tel_el in vcard.find_class('tel'):
                    tel = {}
                    tel['value'] = get_text(tel_el, 'value')
                    tel['type'] = get_text(tel_el, 'type')
                    data['tels'].append(tel)
                for adr in vcard.find_class('adr'):
                    data['address'] = {}
                    data['address']['street_address'] = get_text(adr, 'street-address')
                    data['address']['locality'] = get_text(adr, 'locality')
                    data['address']['region'] = get_text(adr, 'region')
                    data['address']['postal_code'] = get_text(adr, 'postal-code')
                    break
                for geo in vcard.find_class('geo'):
def add_class_has_errors(original_html):
    html = lxml.html.fromstring(original_html)
    if html.find_class("errortext"):
        return False
    return True
try:
    html = lxml.html.document_fromstring(text)
    data = {'url': url, 'address': {}, 'geo': {}, 'departments': []}
    for meta in html.iter('meta'):
        name = meta.get('name')
        content = meta.get('content')
        if name is not None:
            if name == 'geo.position':
                lat, lng = content.split(',')
                data['geo']['latitude'] = lat
                data['geo']['longitude'] = lng
            elif name == 'geo.placement':
                data['address']['addressLocality'] = content
            elif name == 'geo.region':
                data['address']['addressRegion'] = content
    for div in html.find_class('hours-page'):
        for span in div.iter('span'):
            itemprop = span.get('itemprop')
            content = span.text_content()
            if itemprop is not None and len(itemprop) > 0 and len(content) > 0:
                if itemprop == 'name':
                    if itemprop not in data:
                        data[itemprop] = content
                elif itemprop == 'email':
                    data[itemprop] = content
                elif itemprop == 'telephone':
                    data[itemprop] = content
                elif itemprop == 'streetAddress':
                    data['address'][itemprop] = content
                elif itemprop == 'postalCode':
                    data['address'][itemprop] = content
def by_class(html, class_name):
    return (html.find_class(class_name)[0]
            .text_content()
            .replace("\n", " ")
            .replace("\t", " "))
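# A minimal usage sketch for the by_class helper above; the inline HTML and
# the "price" class name are hypothetical, for illustration only.
import lxml.html

doc = lxml.html.fromstring('<div class="price">\n\t$9.99\n</div>')
print(by_class(doc, "price"))  # newlines/tabs flattened to spaces: "  $9.99 "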
    print('\t[' + str(i) + ']: ' + cat.text.split('\n')[0])
    cat_list = cat.find_element_by_xpath('following-sibling::*')
    category_html[i] = lxml.html.fromstring(cat_list.get_attribute('innerHTML'))
    category_html[i].make_links_absolute(base_url)

# Prompt user for category selection
input_numbers = input("Enter desired category numbers, separated by spaces. (Leave blank for all): ")
input_numbers = list(map(lambda x: int(x), input_numbers.split()))
if len(input_numbers) == 0:
    input_numbers = range(0, len(categories))

# Parse code solutions, print to stdout
for i in input_numbers:
    print("Scraping category " + str(i))
    html = category_html[i]
    for prob in html.find_class("problemlink"):
        href = prob.get('href')
        browser.get(href)
        print(href.split('/').pop())
        try:
            # could make this a randomized wait if we feel like Stuart Reges
            # is on to us
            solution = browser.find_element_by_id("solution")
            soln_text = py_html.unescape(solution.get_attribute('innerHTML'))
            print(soln_text)
        except:
            print("No code solution found")
        print()
        browser.back()
print("Done scraping")
def get_info_on_page(html):
    info_list = []
    lis = html.find_class('items items_V clearfix')
    for li in lis:
        info_list.append(get_info_from_li(li))
    return info_list
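# Note on the multi-token class name above: lxml's find_class() does a
# substring match on the normalized class attribute, so 'items items_V
# clearfix' only matches elements whose class attribute contains exactly that
# run of tokens, in that order. A minimal sketch with hypothetical markup:
import lxml.html

frag = lxml.html.fromstring(
    '<ul>'
    '<li class="items items_V clearfix">a</li>'   # matched
    '<li class="clearfix items items_V">b</li>'   # not matched: tokens reordered
    '</ul>')
print([li.text for li in frag.find_class('items items_V clearfix')])  # ['a']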
#!/usr/bin/env python
import glob  # to find files
import lxml.html
import csv  # to write file

repo_dir = "/home/ubuntu/dc_mayor_schedule"

rows = list()
for filename in sorted(glob.glob(repo_dir + "/html/*.html")):
    tree = lxml.html.parse(filename)
    html = tree.iter().next()
    table = html.find_class('views-table')[0]
    for tr in table.iter('tr'):
        tds = list()
        for td in tr.iter('td'):
            tds.append(td)
        span = tds[0].iter('span').next()
        datetime = span.values()[3].encode("utf-8")
        strong = tds[1].iter('strong').next()
        event = strong.text_content().encode("utf-8")
        div = tds[1].iter('div').next()
        venue = div.text_content().encode("utf-8")
        p = tds[1].iter('p').next()
        comment = p.text_content().encode("utf-8")
        rows.append([datetime, event, venue, comment])

# We're about to reverse everything
rows.append(["datetime", "event", "venue", "comment"])
combined_name = os.path.join(os.path.dirname(directory_name), 'dealerfire.json')
dealers = []
for file_name in file_names:
    with open(file_name, 'r') as fd:
        url = os.path.splitext(os.path.basename(file_name))[0]
        text = fd.read()
        if '.dealerfire.com' not in text:
            print 'Dealer %s has moved on ...' % url
            continue
        try:
            html = lxml.html.document_fromstring(text)
            data = {'url': url, 'address': {}, 'geo': {}, 'departments': []}
            suck = True
            for info in html.find_class('dealer-info'):
                suck = False
                fail = True
                for meta in info.iter('meta'):
                    itemprop = meta.get('itemprop')
                    content = meta.get('content')
                    if itemprop is not None and len(itemprop) > 0 and len(content) > 0:
                        fail = False
                        if itemprop == 'name':
                            if itemprop not in data:
                                data[itemprop] = content
                        elif itemprop == 'email':
                            data[itemprop] = content
                        elif itemprop == 'telephone':
                            data[itemprop] = content
                        elif itemprop == 'streetAddress':
def parseContents(contents):
    global totalWords
    global nextURL
    html = lxml.html.fromstring(contents)
    elems = html.find_class('leading')
    for elem in elems:
        links = elem.findall('.//a')
        title = links[0].text
        url = links[0].get('href')
        m = re.search('\/([0-9]+)\-', url)
        articleId = m.group(1)
        published = elem.find_class('published')[0].text
        m = re.search(
            '(([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{4})) ([0-9]{1,2}:[0-9]{1,2})',
            published)
        day = m.group(2)
        month = m.group(3)
        year = m.group(4)
        date = m.group(1)
        time = m.group(5)  # the HH:MM part, not the day
        print("%s: %s, %s @ %s" % (articleId, url, date, time))
        paragraphs = elem.findall('.//p')
        text = ""
        articleWords = 0
        for paragraph in paragraphs:
            if paragraph.text:
                if len(text) > 0:
                    text += u"\n\n"
                text += paragraph.text
                articleWords += paragraph.text.count(' ')
        totalWords += articleWords
        print(" %d words in this article (%d total)" % (articleWords, totalWords))
        print("")
        makeDirectory("Mystem-texts")
        makeDirectory("texts")
        makeDirectory("texts/" + year)
        makeDirectory("texts/" + year + "/" + month)
        text_file = open("texts/" + year + "/" + month + "/" + articleId + '.txt', 'wb')
        text_file.write(text.encode("UTF-8"))
        mystfile = open("Mystem-texts/" + articleId + '.txt', 'w')
        lemmas = mys.lemmatize(text)
        for item in lemmas:
            mystfile.write(item)
        text_file.close()
        mystfile.close()
    nextLink = html.find_class("pagination-next")[0].find('.//a')
    if nextLink is not None:
        nextURL = nextLink.get('href')
    else:
        nextURL = None
sock = urllib.urlopen("http://www.weather.com/weather/hourbyhour/graph/USNY0996")
wsource = sock.read()
sock.close()
html = lxml.html.fromstring(wsource)

# part 1: wx-timepart
# part 2: wx-conditions (image + description)
# part 3: temperature
# part 4: details
now = datetime.datetime.now().strftime('%H:%M:%S')
print 'now=%s' % now

temp = []
elements = html.find_class("wx-temp")
for i in range(len(elements)):
    temp.append(float(elements[i].text_content()[1:3]))
print temp

time1 = []
elements = html.find_class("wx-time")
for i in range(len(elements)):
    time1.append(elements[i].text_content())
print time1

res = []
sys.exit(2)
def search(self, html):
    results = list()
    org_element_list = html.find_class('org_full_box')
    if org_element_list:
        org_element = org_element_list.pop()
        name = org_element.find_class('org_header')[0].text_content()
        name = ' '.join(name.split())
        org_info = org_element.find_class('orginfo')
        # Map the Russian row labels to result fields:
        # Телефоны = phones, Адрес = address, Метро = metro station,
        # Район = district, Сайт = website, Время работы = working hours,
        # Рубрики = categories, Услуги и товары = services and goods.
        labels = {
            u'Телефоны:': 'phone',
            u'Адрес:': 'address',
            u'Метро:': 'metro',
            u'Район:': 'area',
            u'Сайт:': 'site',
            u'E-mail:': 'email',
            u'Время работы:': 'time',
            u'Рубрики:': 'category',
            u'Услуги и товары:': 'services',
        }
        fields = dict((field, '') for field in labels.values())
        for tr in org_info[0]:
            label = tr[0].text_content()
            if label in labels:
                # Collapse runs of whitespace to single spaces.
                fields[labels[label]] = ' '.join(tr[1].text_content().split())
        results = [[name, fields['phone'], fields['address'], fields['metro'],
                    fields['area'], fields['site'], fields['email'],
                    fields['time'], fields['category'], fields['services']]]
    return results