def read_data(page_count, filepath="./datas/job_python/"):
    """
    :param page_count: number of pages to crawl
    :param filepath: directory holding the downloaded source files
    :return: nothing; the parsed records are written to a file at the end
    """
    parser = etree.HTMLParser(encoding='utf-8')
    # page_count comes from the config file as a string, so convert it;
    # pages are numbered from 1, hence the +1 on the upper bound
    for i in range(1, int(page_count) + 1):
        html_tree = etree.parse(filepath + f"python_{i}.html", parser=parser)
        path = "//div[@class='dw_table']/div[@class='el']"
        jobs = html_tree.xpath(path)
        jobs_list = []
        for job in jobs:
            dict_job = std_job(job)
            jobs_list.append(dict_job)
            # job_title = job.xpath('./p/span/a')[0].text
            # job_company = job.xpath('./span/a')[0].text
            # job_place = job.xpath('./span[@class="t3"]')[0].text
            # job_salary = job.xpath('./span[@class="t4"]')[0].text
            # job_date = job.xpath('./span[@class="t5"]')[0].text
        # save this page's records to the CSV file
        save_csv(
            f"./handled_data/job_python_{str(datetime.datetime.now()).split(' ')[0]}.csv",
            jobs_list)
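# A minimal usage sketch tying the config reader and the parser together; the
# config path is hypothetical, while gain_config, std_job, and save_csv are the
# helpers that appear elsewhere in this collection.
if __name__ == "__main__":
    condition, page_count = gain_config("./config/spiders.xml")  # hypothetical path
    read_data(page_count)  # parses ./datas/job_python/python_1.html ... and writes the CSV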
def get_congress(cong, N):
    # POST data to pass to THOMAS
    data = {
        "database": "nominations",
        "MaxDocs": "%d" % N,
        "querytype": "phrase",
        "query": "",
        "Stemming": "Yes",
        "congress": "%d" % cong,
        "CIVcategory": "on",
        "committee": "",
        "LBDateSel": "FLD606",
        "EBSDate": "",
        "EBEDate": "",
        "sort": "sh_docid_c",
        "submit": "SEARCH"
    }
    resp = urllib.urlopen(URL, data=urllib.urlencode(data))
    results = etree.parse(StringIO.StringIO(resp.read()), parser)
    nominations = results.xpath('//div[@id="content"]/p[2]/a/@href')
    for i, nomination in enumerate(nominations):
        url = "http://thomas.loc.gov" + nomination
        try:
            get_nomination(url)
        except Exception as e:
            print url
            print e
        if i % 100 == 0:
            print i
def xml_to_example(xmlpath, imgpath):
    xml = etree.parse(xmlpath)
    root = xml.getroot()
    imgname = root.find('filename').text
    imgname = os.path.join(imgpath, imgname)
    image = tf.gfile.GFile(imgname, 'rb').read()
    size = root.find('size')
    height = int(size.find('height').text)
    width = int(size.find('width').text)
    depth = int(size.find('depth').text)
    shape = np.asarray([height, width, depth], np.int32)
    objects = xml.xpath('//object')
    ground_truth = np.zeros([len(objects), 5], np.float32)
    for i, obj in enumerate(objects):
        classid = classname_to_ids[obj.find('name').text]
        bndbox = obj.find('bndbox')
        ymin = float(bndbox.find('ymin').text)
        ymax = float(bndbox.find('ymax').text)
        xmin = float(bndbox.find('xmin').text)
        xmax = float(bndbox.find('xmax').text)
        # each ground-truth row holds the box corners plus the class id
        ground_truth[i, :] = np.asarray([ymin, ymax, xmin, xmax, classid],
                                        np.float32)
    features = {
        'image': bytes_feature(image),
        'shape': bytes_feature(shape.tobytes()),
        'ground_truth': bytes_feature(ground_truth.tobytes())
    }
    example = tf.train.Example(features=tf.train.Features(feature=features))
    return example
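# A sketch of how the examples above might be serialized to a TFRecord file,
# assuming the same TF1-era API the snippet already uses (tf.gfile); the
# write_tfrecord name and the output filename are hypothetical.
def write_tfrecord(xml_paths, imgpath, out_path="train.tfrecord"):
    with tf.python_io.TFRecordWriter(out_path) as writer:
        for xmlpath in xml_paths:
            example = xml_to_example(xmlpath, imgpath)
            writer.write(example.SerializeToString())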
def parseNoteXML(note_name):
    convert_xml(note_name, note_name + '.xml')
    context = etree.iterparse(note_name + '.xml', encoding='utf-8',
                              strip_cdata=False)
    note_dict = {}
    notes = []
    for action, elem in context:
        text = elem.text
        if elem.tag == 'content':
            # the note body is itself XML; parse it and collect its text nodes
            # (p is a parser defined elsewhere in the module)
            # x = to_valid_xml(elem.text.encode('utf-8'))
            x = elem.text.encode('utf-8')
            r = etree.parse(StringIO(x), p)
            text = []
            for e in r.iter():
                if e.text is not None:
                    text.append(e.text)
        note_dict[elem.tag] = text
        # NixNote uses "Note"; the Evernote Windows & Mac clients use "note"
        # if elem.tag == "Note" or elem.tag == 'note':
        if elem.tag == "note":
            notes.append(note_dict)
            note_dict = {}
    return notes
def resolving():
    html = etree.parse('RoomTable.html', etree.HTMLParser())
    tr_list = html.xpath("//center/table[3]")
    cell = tr_list[0].xpath("./tr/td/text() | ./tr/td/a/attribute::href")
    print(cell)
    data = []
    tmp_obj = []
    k = 0
    for i in cell:
        content = replaceCoding(i)
        if len(content) == 0:
            continue
        k += 1
        if k == 6:
            # every sixth non-empty cell is the code column
            tmp_obj.append(getCode(i))
            data.append(tmp_obj)
            tmp_obj = []
            k = 0
        else:
            tmp_obj.append(content)
    # result = html.xpath("//center/table/tr/td/a[1]/attribute::*")
    # print(result)
    sql = ""
    for i in data:
        sql += "('%s','%s',%s,'%s','%s')," % (i[1], i[2], i[3], i[4], i[5])
    # note: values are interpolated directly, so this trusts the scraped page
    sql = ("insert into c_origin_data (`city`,`location`,`num`,`category`,`code`) "
           "values " + sql[0:-1])
    print(sql)
    cur.execute(sql)
    db.commit()
def crawl(path, pid=None):
    body = download("http://petitions.whitehouse.gov" + path,
                    path.split('/')[2] + ".html")
    page = etree.parse(StringIO(body), parser)

    # catch the page text whether or not the petition is still active
    # http://stackoverflow.com/questions/5662404/how-can-i-select-an-element-with-multiple-classes-with-xpath
    text = "\n".join(
        page.xpath(
            "//div[contains(concat(' ',@class,' '),' petition-detail')]/p/text()"
        ))

    # check if expired
    if "The petition you are trying to access has expired" in text:
        return {"status": "expired"}

    # if raw_date is not found, this is probably a bad link (or a change in
    # the HTML, so we should be careful)
    try:
        raw_date = page.xpath("//div[@class='date']/text()")[0].strip()
    except IndexError:
        return {"status": "error", "reason": "no date"}
    created = datetime.strptime(raw_date, "%b %d, %Y").strftime("%Y-%m-%d")

    signatures = page.xpath("//div[@class='num-block num-block2']/text()")
    # indicates a possible response
    if len(signatures) == 0:
        signatures = page.xpath("//div[@class='num-block']/text()")
        response = page.xpath(
            "//div[contains(concat(' ',@class,' '),' petition-response')]")
        if response:
            status = "answered"
        else:
            return {"status": "error", "reason": "no signatures"}
    else:
        status = "active"
    signatures = int(signatures[0].replace(",", ''))

    if not pid:
        # no pid if fewer than 20 signatures
        try:
            pid = page.xpath(
                "//a[@class='load-next no-follow active']/@rel")[0]
        except IndexError:
            pid = "N/A"

    return {
        "id": pid,
        "status": status,
        "title": page.xpath("//h1[@class='title']/text()")[0].strip(),
        "body": text,
        "issues": page.xpath("//div[@class='issues']/a/text()"),
        "created": created,
        "visited": datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
        "signature_count": signatures,
        "url": "http://petitions.whitehouse.gov" + path
    }
def visit_page(url,path="",save=False): content = {"title" : "", "url" : "", "keywords" : "", "links" : [], "body" : ""} if urlparse(url).netloc == "": #print "partial:", url return content if urlparse(url).netloc in BANNED: #print "banned:", url return content try: resp = urllib2.urlopen(url) except URLError as e: print e return content if resp.getcode() != 200: "Bad response: ",resp.getcode() return content #resolves URL content["url"] = resp.url html = resp.read() try: tree = etree.parse(StringIO.StringIO(html), parser) except: print "LXML error" return content content["title"] = tree.xpath("//title//text()") if len(content["title"]) > 0: content["title"] = content["title"][0].strip() content["links"] = tree.xpath("//body//@href") content["keywords"] = tree.xpath("//meta[@name='keywords']/@content") if content["keywords"] == "": content["keywords"] = tree.xpath("//meta[@name='Keywords']/@content") print "caught a case ",url #content["body"] = cleaner.clean_html(etree.tostring(tree.xpath("//body")[0])) body = cleaner.clean_html(etree.tostring(tree.xpath("//body")[0])) content["word_count"] = len(body.split(" ")) #will save full html if save: filename = urllib.quote_plus(content["url"][0:60])+".txt" #filename.replace("http%3A%2F%2F", "") #if file doesn't already exist if not findInSub(filename,path): #make that day's path path = path + strftime("/%Y/%m/%d/", gmtime()) if not os.path.exists(path): os.makedirs(path) f = open(path+filename, "w+") f.write(html) f.close() print "wrote " + path+filename else: print "already had " + filename return content
def get_load_test_result(load_test_report_path):
    if not os.path.exists(load_test_report_path):
        # "the test was interrupted or the test code failed; no report was generated"
        return 'ERROR', '测试被人为中断或测试代码出错,未能生成报告'

    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse(load_test_report_path, parser=parser)

    def row_cells(row):
        # columns 1-7: name, status, TPS, CPU %, memory %, bandwidth up, bandwidth down
        cell = '/html/body/div/div[2]/div/table/tbody/tr[%d]/td[%d]/text()'
        return [html.xpath(cell % (row, col))[0] for col in range(1, 8)]

    # labels: scenario, status, transactions per second, CPU (%), memory (%),
    # upstream bandwidth (Mb/s), downstream bandwidth (Mb/s)
    labels = ['场景:', '状态:', '每秒事务数:', '处理器(%):', '内存(%):',
              '带宽上行(Mb/s):', '带宽下行(Mb/s):']
    cbft_cells = row_cells(1)
    wasm_cells = row_cells(3)
    cbft = '<br>'.join(label + value for label, value in zip(labels, cbft_cells))
    wasm = '<br>'.join(label + value for label, value in zip(labels, wasm_cells))

    if cbft_cells[1] == 'success' and wasm_cells[1] == 'success':
        TestResult = 'PASS'
    else:
        TestResult = 'FAIL'
    return TestResult, cbft + '<br><br>' + wasm
def check_waf(self, resp):
    self._xmlstr_dom = etree.parse(cwd + '/fingerprinting.xml')
    waf_doms = self._xmlstr_dom.xpath("waf")
    detect = 0
    for waf_dom in waf_doms:
        finger_dom = waf_dom.xpath("finger")
        rule_dom = finger_dom[0].xpath("rule")
        head_type = rule_dom[0].get("header").lower()
        if head_type in resp.headers:
            regx = self.regexp_header(rule_dom, waf_dom, head_type, resp)
            if regx > 0:
                detect += 1
    return detect
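# A sketch of the fingerprinting.xml layout check_waf() implies: the
# waf/finger/rule element names and the "header" attribute come from the xpaths
# above, while the WAF name, the <regexp> child, and its pattern are purely
# illustrative (regexp_header is not shown, so its rule format is an assumption).
#
# <root>
#     <waf name="ExampleWAF">
#         <finger>
#             <rule header="Server">
#                 <regexp>example-waf</regexp>
#             </rule>
#         </finger>
#     </waf>
# </root>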
def find_text_by_id(xml_file, node_id: str):
    if isinstance(xml_file, str):
        xml_file = open(xml_file)
    tree = et.parse(xml_file)
    root = tree.getroot()
    ns = {"default": root.nsmap[None]}
    # climb two levels above the matching node, then take the text of all
    # of its following siblings
    dest_elements = root.xpath(".//*[contains(@id, '%s')]//parent::*"
                               "//parent::*//following-sibling::*//text()"
                               % (node_id, ),
                               namespaces=ns)
    res = ""
    for t in dest_elements:
        res += t if t is not None else ""
    return res
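# A minimal usage sketch; "chapter.xhtml" and "chapter-3" are hypothetical.
# The input document needs a default namespace, since root.nsmap[None] would
# raise a KeyError without one.
body = find_text_by_id("chapter.xhtml", "chapter-3")
print(body)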
def crawl(path, pid=None):
    body = download("http://petitions.whitehouse.gov" + path,
                    path.split("/")[2] + ".html")
    page = etree.parse(StringIO(body), parser)

    # catch the page text whether or not the petition is still active
    # http://stackoverflow.com/questions/5662404/how-can-i-select-an-element-with-multiple-classes-with-xpath
    text = "\n".join(page.xpath(
        "//div[contains(concat(' ',@class,' '),' petition-detail')]/p/text()"))

    # check if expired
    if "The petition you are trying to access has expired" in text:
        return {"status": "expired"}

    # if raw_date is not found, this is probably a bad link (or a change in
    # the HTML, so we should be careful)
    try:
        raw_date = page.xpath("//div[@class='date']/text()")[0].strip()
    except IndexError:
        return {"status": "error", "reason": "no date"}
    created = datetime.strptime(raw_date, "%b %d, %Y").strftime("%Y-%m-%d")

    signatures = page.xpath("//div[@class='num-block num-block2']/text()")
    # indicates a possible response
    if len(signatures) == 0:
        signatures = page.xpath("//div[@class='num-block']/text()")
        response = page.xpath(
            "//div[contains(concat(' ',@class,' '),' petition-response')]")
        if response:
            status = "answered"
        else:
            return {"status": "error", "reason": "no signatures"}
    else:
        status = "active"
    signatures = int(signatures[0].replace(",", ""))

    if not pid:
        # no pid if fewer than 20 signatures
        try:
            pid = page.xpath("//a[@class='load-next no-follow active']/@rel")[0]
        except IndexError:
            pid = "N/A"

    return {
        "pid": pid,
        "status": status,
        "title": page.xpath("//h1[@class='title']/text()")[0].strip(),
        "text": text,
        "tags": page.xpath("//div[@class='issues']/a/text()"),
        "created": created,
        "visited": datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
        "signatures": signatures,
        "url": "http://petitions.whitehouse.gov" + path,
    }
def gain_config(filepath):
    """
    Parses the config file; this works the same way as parsing any XML file.
    :param filepath: path to the config file
    :return: the default configuration values
    """
    config_tree = etree.parse(filepath)
    datas = config_tree.xpath('/spiders/spider[@id="job_spider"]')
    print(len(datas))
    condition = datas[0].xpath('./condition')[0].text
    page_count = datas[0].xpath('./pageAccount')[0].text
    return condition, page_count
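# A sketch of the config layout gain_config() implies; the element names come
# straight from the xpaths above, and the values are illustrative:
#
# <spiders>
#     <spider id="job_spider">
#         <condition>python</condition>
#         <pageAccount>10</pageAccount>
#     </spider>
# </spiders>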
def smart_read(url):
    resp = urllib2.urlopen(url)
    # resolve the url
    url = resp.url
    domain = urlparse(url).netloc
    path = urlparse(url).path
    html = resp.read()
    tree = etree.parse(StringIO.StringIO(html), parser)
    links = tree.xpath("//body//@href")
    # pick the same-domain link whose path is most similar to (but not
    # identical with) this page's path
    nmax = 0
    mirror = None
    for link in links:
        if urlparse(link).netloc == domain:
            ng = NGram.compare(urlparse(link).path, path)
            #print link, ng
            if ng > nmax and ng < 1:
                nmax = ng
                mirror = link
    if mirror is None:
        return
    diffh = htmldiff(visit_page(url)["body"], visit_page(mirror)["body"])
    tree = etree.parse(StringIO.StringIO(diffh), parser)
    diff = tree.xpath("//ins//text()")
    for d in diff:
        print d
def diy_parse_content_list(content_list_file):
    if isinstance(content_list_file, str):
        content_list_file = open(content_list_file)
    tree = et.parse(content_list_file)
    root = tree.getroot()
    ns = {"default": root.nsmap[None]}
    nav_points = root.xpath(".//*[contains(@class, 'MsoHyperlink')]/default:a",
                            namespaces=ns)
    nav_points_tuples = []
    for node in nav_points:
        chapter_name = node.text
        chapter_loc = "content/" + node.attrib["href"]
        nav_points_tuples.append((chapter_name, chapter_loc))
    return nav_points_tuples
def parse_toc(toc_file):
    if isinstance(toc_file, str):
        toc_file = open(toc_file)
    tree = et.parse(toc_file)
    root = tree.getroot()
    ns = {"default": root.nsmap[None]}
    nav_points = root.xpath(".//default:navPoint", namespaces=ns)
    nav_points_tuples = []
    for node in nav_points:
        chapter_name = node.find("./default:navLabel/default:text",
                                 namespaces=ns).text
        chapter_loc = node.find("./default:content", namespaces=ns).attrib["src"]
        nav_points_tuples.append((chapter_name, chapter_loc))
    return nav_points_tuples
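# A minimal usage sketch; "toc.ncx" is a hypothetical path. The navPoint,
# navLabel/text, and content@src names queried above match the standard
# EPUB 2 NCX table-of-contents schema.
for name, loc in parse_toc("toc.ncx"):
    print(name, "->", loc)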
def check_regexp(self, content):
    if not content:
        return
    self._xmlstr_dom = etree.parse('regexp.xml')
    javaid_doms = self._xmlstr_dom.xpath("javaid")
    for javaid_dom in javaid_doms:
        self._vultype = javaid_dom.get("vultype")
        #print "vul_type " + self._vultype
        function_doms = javaid_dom.xpath("function")
        for function_dom in function_doms:
            rule_dom = function_dom.xpath("rule")
            self._function = rule_dom[0].get("name")
            self.regexp_search(rule_dom, content)
            #print "check_regexp search ..."
    return True
def get_metadata(opf_file):
    if isinstance(opf_file, str):
        opf_file = open(opf_file)
    tree = et.parse(opf_file)
    ns = {
        "dc": "http://purl.org/dc/elements/1.1/",
        "opf": "http://www.idpf.org/2007/opf",
        "calibre": "http://calibre.kovidgoyal.net/2009/metadata"
    }
    # select every child of <metadata> that sits in the Dublin Core namespace
    namespace_dc = tree.xpath(".//opf:metadata/*[namespace-uri()='%s']"
                              % (ns["dc"], ),
                              namespaces=ns)
    dc_tuples = []
    for node in namespace_dc:
        dc_tuples.append((strip_namespace(node.tag), node.text))
    return dc_tuples
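# A minimal usage sketch; "metadata.opf" is a hypothetical Calibre metadata
# file (the same dc/opf/calibre namespaces appear in the __main__ snippet
# further down), and strip_namespace is the helper defined alongside it.
for tag, value in get_metadata("metadata.opf"):
    print("%s: %s" % (tag, value))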
def petitions(start=1, mx=None):
    if mx is None:
        mx = -1
    # log objects for tracking signatures over time
    hits = 0
    # scan the WH site and add any new petitions to the DB; there is surely a
    # better way to get an indefinite number of results than a functionally
    # infinite loop that we break out of, but drawing a blank
    for pg in range(start, 1000):
        log("Loading page %d" % pg)
        # the WH site loads petitions from an external HTML doc in a JSON shell
        url = "https://petitions.whitehouse.gov/petitions/more/all/%d/2/0/" % pg
        try:
            raw = s.urlopen(url).encode('utf-8')
        except scrapelib.HTTPError:
            log("Error downloading %s" % url)
            return hits
        resp = json.loads(raw)
        if "markup" not in resp or len(resp["markup"]) == 0:
            log("No results at page %i" % pg)
            return hits
        page = etree.parse(StringIO(resp['markup']), parser)
        # there are two links to each petition in the results, but they can be
        # reduced to uniques with "nofollow"
        petitions = page.xpath("body/div[@class]")
        if len(petitions) == 0:
            return hits
        for petition in petitions:
            # get the uid for each petition from the main div id
            pid = petition.xpath("@id")[0].split('-')[1]
            path = petition.xpath("div/div/a/@href")[0]
            data = crawl(path, pid)
            # if the petition is dead (unlikely when scanned from the WH site
            # directly, but you never know):
            if data["status"] == "expired":
                scrapelog["signatures"][path.split("/")[2]] = -1
            elif data["status"] == "active":
                scrapelog["signatures"][path.split("/")[2]] = data["signature_count"]
            write(json.dumps(data, indent=2, sort_keys=True),
                  "scrape/petitions/" + data['id'] + ".json")
            hits += 1
            if mx != -1 and hits >= mx:
                return hits
def check_waf(self, resp):
    try:
        self._xmlstr_dom = etree.parse(cwd + '/dic/fingerprinting.xml')
        waf_doms = self._xmlstr_dom.xpath("waf")
        detect = 0
        for waf_dom in waf_doms:
            finger_dom = waf_dom.xpath("finger")
            rule_dom = finger_dom[0].xpath("rule")
            head_type = str(rule_dom[0].get("header").lower())
            if head_type in resp.headers:
                regx = self.regexp_header(rule_dom, waf_dom, head_type, resp)
                if regx > 0:
                    detect += 1
        return detect
    except:
        print "[+] Error getting headers from the server: " + self._url
def from_kingsoft(query: str):
    try:
        xml = urlopen(
            f"http://dict-co.iciba.com/api/dictionary.php?w={query}&key=1F287830F78CD6CFEB5E4279236CBEBB"
        )
        root = et.parse(xml)
        xpath_pos = ".//pos//text()"
        xpath_acc = ".//acceptation//text()"
        acceptations = root.xpath(xpath_acc)
        pos = root.xpath(xpath_pos)
        # pair each part of speech with its cleaned definition
        meaning = [
            p + utils_text_preprocess.clean_text(a)
            for p, a in zip(pos, acceptations)
        ]
    except BaseException as e:
        print(e)
        meaning = []
    return meaning
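# A minimal usage sketch; the iciba API key is the one embedded in the snippet
# above, and utils_text_preprocess is assumed to be a project-local module.
for entry in from_kingsoft("hello"):
    print(entry)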
def get_congress(cong):
    params = urllib.urlencode({'congress': cong})
    results = urllib.urlopen(
        'http://bioguide.congress.gov/biosearch/biosearch1.asp', params)
    page = etree.parse(StringIO.StringIO(results.read()), etree.HTMLParser())
    nas = 1
    for member in page.xpath("//table")[1].xpath("tr")[1:]:
        name = member.xpath("td/a/text()")
        print name
        if len(name) == 0:
            name = ""
            print nas
            nas += 1
            continue
        else:
            name = name[0]
        pid = member.xpath("td/a/@href")[0].split("=")[1]
        stats = member.xpath("td/text()")
        c.execute('''INSERT OR IGNORE INTO terms
                     (pid, name, dates, position, party, state, congress)
                     VALUES (?,?,?,?,?,?,?)''',
                  (pid, name, stats[0], stats[1][0], stats[2], stats[3],
                   int(stats[4])))
        conn.commit()
def check_waf(self, resp):
    if not resp.content:
        return
    self._xmlstr_dom = etree.parse('finger.xml')
    waf_doms = self._xmlstr_dom.xpath("waf")
    for waf_dom in waf_doms:
        finger_dom = waf_dom.xpath("finger")
        rule_dom = finger_dom[0].xpath("rule")
        head_type = rule_dom[0].get("header").lower()
        if head_type in resp.headers:
            if self.regexp_header(rule_dom, waf_dom, head_type, resp):
                return True
            else:
                self._nowaf = "This website has no waf or identify false!!!"
                print "get waf finger false:" + self._nowaf
        else:
            print "head type search ..."
    if check_resp(resp):
        return True
    return False
def getXMLConfiguration(path="regexp.xml"):
    config = {}
    xml_dom = etree.parse(path)
    jsid_doms = xml_dom.xpath("jsid")
    try:
        for jsid_dom in jsid_doms:
            vultype = jsid_dom.get("vultype")
            #print vultype
            config[vultype] = []
            rule_doms = jsid_dom.xpath("rule")
            for rule_dom in rule_doms:
                regexp_dom = rule_dom.xpath("regexp")[0]
                value = [rule_dom.get("name"), re.compile(regexp_dom.text)]
                config[vultype].append(value)
    except:
        print "Error when parsing xml file. Please check the format"
    return config
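# A sketch of the regexp.xml layout getXMLConfiguration() implies; the
# jsid/rule/regexp element names and the vultype/name attributes come from the
# code above, while the sample rule itself is illustrative:
#
# <root>
#     <jsid vultype="dom-xss">
#         <rule name="document-write">
#             <regexp>document\.write\s*\(</regexp>
#         </rule>
#     </jsid>
# </root>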
def check_waf(self, resp):
    if not resp.content:
        return
    self._xmlstr_dom = etree.parse('finger.xml')
    waf_doms = self._xmlstr_dom.xpath("waf")
    for waf_dom in waf_doms:
        finger_dom = waf_dom.xpath("finger")
        rule_dom = finger_dom[0].xpath("rule")
        head_type = rule_dom[0].get("header").lower()
        if head_type in resp.headers:
            if self.regexp_header(rule_dom, waf_dom, head_type, resp):
                return True
            else:
                self._nowaf = "This website has no waf or identify false!!!"
                #print "[+]【Wafid】get waf finger false: " + self._nowaf
        else:
            continue
            #print "head type search ..."
    if self.check_resp(resp):
        return True
    return False
def diff_rss(url, name, limit=-1):
    rss = feedparser.parse(url)
    links = {}
    #print rss
    if limit == -1 or limit > len(rss.entries):
        limit = len(rss.entries)
    first_index = get_first_index(rss)
    for i in range(first_index, limit + 1):
        links[rss.entries[i].link] = []
        post1 = rss.entries[i - 1].link
        if i == limit:
            post2 = rss.entries[first_index - 1].link
        else:
            post2 = rss.entries[i].link
        print post2
        # diff consecutive posts and collect the links added in the newer one
        diffh = htmldiff(get_content(post1)["body"], get_content(post2)["body"])
        tree = etree.parse(StringIO.StringIO(diffh), parser)
        diff = tree.xpath("//ins//@href")
        for d in diff:
            if (urlparse(d).netloc != urlparse(rss.feed.link).netloc
                    and urlparse(d).path != '/'):
                links[rss.entries[i].link].append(d)
    return links
def rss_search():
    r = c.execute('''select * from tips where rss=""''').fetchall()
    for line in r:
        source = line["domain"]
        #print source
        try:
            response = urllib2.urlopen(source).read()
            tree = etree.parse(StringIO(response), etree.HTMLParser())
            root = tree.getroot()
            # try progressively looser ways of spotting a feed link
            feed = tree.xpath("//link[@type='application/rss+xml']")
            if feed == []:
                feed = tree.xpath("//a[text()='RSS']")
            if feed == []:
                feed = tree.xpath("//a[@href='/feed/']")
            if feed == []:
                feed = tree.xpath("//a[@href='" + source + "/feed/']")
            if feed == []:
                feed = tree.xpath("//a[@href='/feeds/']")
            if feed == []:
                feed = tree.xpath("//a[@href='" + source + "/feeds/']")
            if feed != []:
                print feed[0]
                rss = feed[0].get("href")
                if urlparse(rss).netloc != "" and urlparse(rss).netloc != "/":
                    print rss
                    c.execute("""update tips set rss = ? where domain = ?""",
                              (rss, source))
                    conn.commit()
                else:
                    rss_feed = source + rss
                    print rss_feed
                    c.execute("""update tips set rss = ? where domain = ?""",
                              (rss_feed, source))
                    conn.commit()
        except IOError:
            print "bad call", source
def auth(user, passwd):
    os = 'Linux'
    useragent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:14.0) Gecko/20100101 Firefox/14.0.1'
    AuthCheckURL = 'http://google.com/'
    parser = etree.HTMLParser()
    config = ConfigParser.SafeConfigParser({'os': os,
                                            'useragent': useragent,
                                            'debug': 'False'})
    config.read('unCleanAccess.cfg')
    if not config.has_section('login'):
        config.add_section('login')
    debug = config.get('login', 'debug') in ('True', 'true')
    print 'Checking if Authenticated'
    if not debug:
        responseAuthCheck = urllib2.urlopen(AuthCheckURL)
        AuthCheckhtml = responseAuthCheck.read()
    else:
        f = open('unCleanAuthCheckunauthed.html', 'r')
        AuthCheckhtml = f.read()
        f.close()
    if AuthCheckhtml.find('/auth/perfigo_weblogin.jsp') != -1:
        print 'Not Authenticated Yet'
        urlSplit = AuthCheckhtml.split('URL=')
        if len(urlSplit) != 2:
            print 'Error extracting redirect URL (1)'
        else:
            urlSplit = re.split("'>|;", urlSplit[1])
            if len(urlSplit) < 2:
                print 'Error extracting redirect URL (2)'
            else:
                print 'Fetching Login Page'
                if not debug:
                    responseAuthPage = urllib2.urlopen(urlSplit[0])
                    AuthPagehtml = etree.parse(responseAuthPage, parser)
                else:
                    f = open('authPage.html', 'r')
                    AuthPagehtml = etree.parse(f, parser)
                    f.close()
                print 'Parsing Login Page'
                POSTDataItems = dict()
                for formInput in AuthPagehtml.xpath(".//form[@name='loginform']//input"):
                    if formInput.get('name'):
                        POSTDataItems[formInput.get('name')] = formInput.get('value')
                POSTDataItems['pm'] = config.get('login', 'os')
                POSTDataItems['username'] = user
                POSTDataItems['password'] = passwd
                authData = urllib.urlencode(POSTDataItems)
                authHeaders = {'Referer': urlSplit[0].split('perfigo_weblogin.jsp', 1)[0],
                               'User-Agent': config.get('login', 'useragent')}
                print 'Logging in'
                authReq = urllib2.Request(
                    urlSplit[0].split('auth/perfigo_weblogin.jsp', 1)[0]
                    + AuthPagehtml.xpath(".//form[@name='loginform']")[0]
                        .get('action').split('/', 1)[1],
                    authData, authHeaders)
                responseAuthReq = urllib2.urlopen(authReq)
                authReqhtml = responseAuthReq.read()
                if authReqhtml.find('You have been successfully logged on the network') != -1:
                    print 'Successfully Authenticated!'
                else:
                    print 'Invalid credentials'
                    # setCreds and regKeyVal are defined elsewhere in the module
                    (userName, password) = setCreds(regKeyVal)
                    auth(userName, password)
    else:
        print 'Already Authenticated'
def parse(url):
    return etree.parse(StringIO.StringIO(requests.get(url).text))
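# A minimal usage sketch; the URL is illustrative, and the fetched document
# must be well-formed XML, since no HTML parser is passed to etree.parse.
tree = parse("http://example.com/feed.xml")
print tree.xpath("//title/text()")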
def getMoreRequests(url):
    # driver is assumed to be a Selenium webdriver created elsewhere; the
    # function name and argument are taken from the call site below
    driver.get(url)
    html = driver.page_source.encode('utf-8')
    page_num = 0
    while driver.find_element_by_xpath('//*[@class="load_more mt3"]/a'):
        driver.find_element_by_xpath('//*[@class="load_more mt3"]/a').click()
        page_num += 1
        print("getting page number " + str(page_num))
        time.sleep(1)
        if page_num == 1:
            break
    return driver.page_source.encode('utf-8')


response = getMoreRequests(
    'https://www.kickstarter.com/discover/advanced?category_id=16')
htmlparser = etree.HTMLParser()
# etree.parse needs a file-like object, not raw bytes (BytesIO from io)
tree = etree.parse(BytesIO(response), htmlparser)
soup = BeautifulSoup(response, 'html.parser')
# projects_grid = soup.find('div', id="projects")
projects = soup.find_all(
    'div',
    {"class": "js-react-proj-card grid-col-12 grid-col-6-sm grid-col-4-lg"})
for project in projects:
    print(project['data-projects'])
    ## '//div[contains(@data-project)]/@data-project').getall()
outfile = '/tmp/pret.pkl'
soupurl = 'http://www.pret.com/our_food/soup.htm'


def fix_text(astr):
    """Remove undesirable characters and strings"""
    astr = astr.strip()
    return astr


html = urllib2.urlopen(soupurl).read()
html = html.replace('<br />', '')
parser = etree.HTMLParser()
doc = etree.parse(StringIO(html), parser)
soupnames = doc.xpath('//div[@class="soup_details_hidden"]'
                      '//img[contains(@src, \'souptitle\')]//@alt')
#pprint(soupnames)
allsoups = map(fix_text, soupnames)
#pprint(allsoups)
souplist = [
    [allsoups[0], allsoups[1], allsoups[2]],
    [allsoups[3], allsoups[4], allsoups[5]],
    [allsoups[6], allsoups[7], allsoups[8]],
    [allsoups[9], allsoups[10], allsoups[11]],
    [allsoups[12], allsoups[13], allsoups[14]],
]
URL_source = u'http://unicode.org/repos/cldr/trunk/common/supplemental/supplementalData.xml'
encoding_source = "utf-8"

import os

URL_path, downloaded_source = os.path.split(URL_source)
file_output = downloaded_source.split(os.extsep)[0] + ".tsv"

## Definitions of variables and their xpaths ##

## Using requests to download and lxml to parse
from lxml.html import fromstring, tostring, parse, etree
from io import StringIO, BytesIO

try:
    tree = etree.parse(downloaded_source)  # etree.parse is used to parse the xml
except:
    XML_encoding = encoding_source
    XML_src_url = URL_source
    import requests
    r = requests.get(XML_src_url, stream=True)
    r.raw.decode_content = True
    if not (r.status_code == 200):
        print("Downloading the data from {} failed. Please check the Internet connection."
              .format(XML_src_url))
        exit()
    XML_src = r.content  # r.raw.read() # r.raw # r.text
try:
    import configparser
except ImportError:
    import ConfigParser as configparser

Config = configparser.ConfigParser()
Config.read("config.ini")
dir_source = Config.get("Directory", 'source')
dir_outcome = Config.get("Directory", 'outcome')
fn_suffix = Config.get("Filename", 'suffix')
fn_output = [x.strip() for x in Config.get("Filename", 'CLDR_suppl').split(",")]
data_src = Config.get("Source", 'CLDR_suppl')
fn_operating = os.path.join(dir_source, data_src.split('/')[-1])

try:
    tree = etree.parse(fn_operating)
except:
    XML_src_url = data_src
    import requests
    r = requests.get(XML_src_url, stream=True)
    r.raw.decode_content = True
    if not (r.status_code == 200):
        logging.warning("Downloading the data from {0} failed. Please check the Internet connection.".format(XML_src_url))
        exit()
    # requests automatically decodes content from the server (as r.text);
    # the response body is also available as bytes (as r.content)
    XML_src = r.content  # r.raw.read() # r.raw # r.text
    XML_encoding = r.encoding  # 'ISO-8859-1'
import requests
import codecs
import os

path_data = u'../data'
data_src_url = u'http://unicode.org/repos/cldr/trunk/common/supplemental/supplementalData.xml'
data_src_path, data_src_local = os.path.split(data_src_url)
encoding_source = "utf-8"
fn_output1 = os.path.join(path_data, 'CLDR_web.tsv')
fn_output3 = os.path.join(path_data, 'CLDR_web_regin_country_no.tsv')

## Parsing data from remote or local sources
try:
    tree = etree.parse(data_src_local)  # etree.parse is used to parse the xml
except:
    r = requests.get(data_src_url, stream=True)
    if not (r.status_code == 200):
        print("Downloading the data from {} failed. Please check the Internet connection.".format(data_src_url))
        exit()
    r.encoding = 'utf-8'
    XML_src = r.text  # decoded unicode content
    # cache the download locally, then parse the cached copy
    with codecs.open(data_src_local, mode="w", encoding="utf-8") as file:
        file.write(XML_src)
    tree = etree.parse(data_src_local)
def __init__(self, xml_file_name):
    self._tree = etree.parse(xml_file_name)
    self.root = self._tree.getroot()
    self.content = []
# import xml.etree.ElementTree as ET
from lxml.html import etree as ET
import re


def strip_namespace(xml_tag: str):
    if "}" in xml_tag:
        return xml_tag.split("}")[-1]
    return xml_tag


if __name__ == "__main__":
    tree = ET.parse(
        '../datafield/A Clash of Kings - George R.R. Martin/metadata.opf')
    NS = {
        "dc": "http://purl.org/dc/elements/1.1/",
        "opf": "http://www.idpf.org/2007/opf",
        "calibre": "http://calibre.kovidgoyal.net/2009/metadata"
    }
    NamespaceDC = tree.xpath(".//opf:metadata/*[namespace-uri()='%s']"
                             % (NS["dc"], ),
                             namespaces=NS)
    DCTuples = []
    for node in NamespaceDC:
        DCTuples.append((strip_namespace(node.tag), node.text))
def __init__(self, page_source, elements=None):
    parser = etree.HTMLParser()
    self.tree = etree.parse(StringIO(page_source), parser)
    self.elements = elements or tuple(
        imap(self._map_lxml_element, self.tree.iter(tag=etree.Element)))
    self.root_xpath = self._get_xpath(self.tree.getroot())
from lxml.html import etree as ET

if __name__ == "__main__":
    tree = ET.parse(
        '../datafield/A Clash of Kings - George R.R. Martin/content/'
        'George R.R. Martin - Fire and Ice 02 - A Clash of Kings v4.0 (BD)_split_6.html'
    )
    root = tree.getroot()
    ns = {"default": root.nsmap[None]}
    # NavPoints = root.xpath(".//*[contains(@class, 'MsoHyperlink')]/default:a", namespaces=ns)
    # tmpNode = NavPoints[0]
    # NavPointsTuples = []
    # for node in NavPoints:
    #     chapter_name = node.xpath("./default:navLabel/default:text/text()", namespaces=ns)
    #     chapter_loc = node.xpath("./default:content/@src", namespaces=ns)
    #     NavPointsTuples.append((chapter_name, chapter_loc))
__author__ = 'mike'

import lxml.html
from lxml.html import etree
# from lxml.html.soupparser import etree

print("hello world")
parser = lxml.html.HTMLParser()
tree = etree.parse('e:\\users\\mike\\documents\\bookmarks_12_5_14_small.html',
                   parser)
root = tree.getroot()
clean_links = {}
pre_text = "<DT><A "
post_text = "</A>"
for element in root.iter("a"):
    # print("%s - %s - %s" % (element.tag, element.attrib, element.text))
    clean_links[element.text] = element.attrib
    # if (len(clean_links) > 10):
    #     break
for link in sorted(clean_links.keys()):
    # rebuild a Netscape-bookmark-style line for each link
    line = pre_text
    for attrib_key in clean_links[link].keys():
        line += " "
        line += str(attrib_key).upper() + "=" + '"'
        line += clean_links[link][attrib_key]
        line += '"'
    line += ">" + link + post_text