def read_data(page_count, filepath="./datas/job_python/"):
    """
    :param page_count: number of pages to crawl
    :param filepath: directory holding the downloaded source files
    :return: nothing; the parsed records are written to a file at the end
    """
    parser = etree.HTMLParser(encoding='utf-8')
    # page_count comes from the config file as a string, so convert it;
    # pages are numbered from 1, hence the +1 on the upper bound
    for i in range(1, int(page_count) + 1):
        html_tree = etree.parse(filepath + f"python_{i}.html", parser=parser)
        path = "//div[@class='dw_table']/div[@class='el']"
        jobs = html_tree.xpath(path)
        jobs_list = []
        for job in jobs:
            dict_job = std_job(job)
            jobs_list.append(dict_job)
            # job_title = job.xpath('./p/span/a')[0].text
            # job_company = job.xpath('./span/a')[0].text
            # job_place = job.xpath('./span[@class="t3"]')[0].text
            # job_salary = job.xpath('./span[@class="t4"]')[0].text
            # job_date = job.xpath('./span[@class="t5"]')[0].text
        # save this page's records to the CSV file
        save_csv(
            f"./handled_data/job_python_{str(datetime.datetime.now()).split(' ')[0]}.csv",
            jobs_list)
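# A minimal usage sketch tying the config reader and the parser together; the
# config path is hypothetical, while gain_config, std_job, and save_csv are the
# helpers that appear elsewhere in this collection.
if __name__ == "__main__":
    condition, page_count = gain_config("./config/spiders.xml")  # hypothetical path
    read_data(page_count)  # parses ./datas/job_python/python_1.html ... and writes the CSV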
def get_congress(cong, N):
    # POST data to pass to THOMAS
    data = {
        "database": "nominations",
        "MaxDocs": "%d" % N,
        "querytype": "phrase",
        "query": "",
        "Stemming": "Yes",
        "congress": "%d" % cong,
        "CIVcategory": "on",
        "committee": "",
        "LBDateSel": "FLD606",
        "EBSDate": "",
        "EBEDate": "",
        "sort": "sh_docid_c",
        "submit": "SEARCH"
    }
    resp = urllib.urlopen(URL, data=urllib.urlencode(data))
    results = etree.parse(StringIO.StringIO(resp.read()), parser)
    nominations = results.xpath('//div[@id="content"]/p[2]/a/@href')
    for i, nomination in enumerate(nominations):
        url = "http://thomas.loc.gov" + nomination
        try:
            get_nomination(url)
        except Exception as e:
            print url
            print e
        if i % 100 == 0:
            print i
def xml_to_example(xmlpath, imgpath):
    xml = etree.parse(xmlpath)
    root = xml.getroot()
    imgname = root.find('filename').text
    imgname = os.path.join(imgpath, imgname)
    image = tf.gfile.GFile(imgname, 'rb').read()
    size = root.find('size')
    height = int(size.find('height').text)
    width = int(size.find('width').text)
    depth = int(size.find('depth').text)
    shape = np.asarray([height, width, depth], np.int32)
    objects = xml.xpath('//object')
    ground_truth = np.zeros([len(objects), 5], np.float32)
    for i, obj in enumerate(objects):
        classid = classname_to_ids[obj.find('name').text]
        bndbox = obj.find('bndbox')
        ymin = float(bndbox.find('ymin').text)
        ymax = float(bndbox.find('ymax').text)
        xmin = float(bndbox.find('xmin').text)
        xmax = float(bndbox.find('xmax').text)
        # each ground-truth row holds the box corners plus the class id
        ground_truth[i, :] = np.asarray([ymin, ymax, xmin, xmax, classid],
                                        np.float32)
    features = {
        'image': bytes_feature(image),
        'shape': bytes_feature(shape.tobytes()),
        'ground_truth': bytes_feature(ground_truth.tobytes())
    }
    example = tf.train.Example(features=tf.train.Features(feature=features))
    return example
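# A sketch of how the examples above might be serialized to a TFRecord file,
# assuming the same TF1-era API the snippet already uses (tf.gfile); the
# write_tfrecord name and the output filename are hypothetical.
def write_tfrecord(xml_paths, imgpath, out_path="train.tfrecord"):
    with tf.python_io.TFRecordWriter(out_path) as writer:
        for xmlpath in xml_paths:
            example = xml_to_example(xmlpath, imgpath)
            writer.write(example.SerializeToString())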
def parseNoteXML(note_name):
    convert_xml(note_name, note_name + '.xml')
    context = etree.iterparse(note_name + '.xml', encoding='utf-8',
                              strip_cdata=False)
    note_dict = {}
    notes = []
    for action, elem in context:
        text = elem.text
        if elem.tag == 'content':
            # the note body is itself XML; parse it and collect its text nodes
            # (p is a parser defined elsewhere in the module)
            # x = to_valid_xml(elem.text.encode('utf-8'))
            x = elem.text.encode('utf-8')
            r = etree.parse(StringIO(x), p)
            text = []
            for e in r.iter():
                if e.text is not None:
                    text.append(e.text)
        note_dict[elem.tag] = text
        # NixNote uses "Note"; the Evernote Windows & Mac clients use "note"
        # if elem.tag == "Note" or elem.tag == 'note':
        if elem.tag == "note":
            notes.append(note_dict)
            note_dict = {}
    return notes
def resolving():
    html = etree.parse('RoomTable.html', etree.HTMLParser())
    tr_list = html.xpath("//center/table[3]")
    cell = tr_list[0].xpath("./tr/td/text() | ./tr/td/a/attribute::href")
    print(cell)
    data = []
    tmp_obj = []
    k = 0
    for i in cell:
        content = replaceCoding(i)
        if len(content) == 0:
            continue
        k += 1
        if k == 6:
            # every sixth non-empty cell is the code column
            tmp_obj.append(getCode(i))
            data.append(tmp_obj)
            tmp_obj = []
            k = 0
        else:
            tmp_obj.append(content)
    # result = html.xpath("//center/table/tr/td/a[1]/attribute::*")
    # print(result)
    sql = ""
    for i in data:
        sql += "('%s','%s',%s,'%s','%s')," % (i[1], i[2], i[3], i[4], i[5])
    # note: values are interpolated directly, so this trusts the scraped page
    sql = ("insert into c_origin_data (`city`,`location`,`num`,`category`,`code`) "
           "values " + sql[0:-1])
    print(sql)
    cur.execute(sql)
    db.commit()
def crawl(path, pid=None):
    body = download("http://petitions.whitehouse.gov" + path,
                    path.split('/')[2] + ".html")
    page = etree.parse(StringIO(body), parser)

    # catch the page text whether or not the petition is still active
    # http://stackoverflow.com/questions/5662404/how-can-i-select-an-element-with-multiple-classes-with-xpath
    text = "\n".join(
        page.xpath(
            "//div[contains(concat(' ',@class,' '),' petition-detail')]/p/text()"
        ))

    # check if expired
    if "The petition you are trying to access has expired" in text:
        return {"status": "expired"}

    # if raw_date is not found, this is probably a bad link (or a change in
    # the HTML, so we should be careful)
    try:
        raw_date = page.xpath("//div[@class='date']/text()")[0].strip()
    except IndexError:
        return {"status": "error", "reason": "no date"}
    created = datetime.strptime(raw_date, "%b %d, %Y").strftime("%Y-%m-%d")

    signatures = page.xpath("//div[@class='num-block num-block2']/text()")
    # indicates a possible response
    if len(signatures) == 0:
        signatures = page.xpath("//div[@class='num-block']/text()")
        response = page.xpath(
            "//div[contains(concat(' ',@class,' '),' petition-response')]")
        if response:
            status = "answered"
        else:
            return {"status": "error", "reason": "no signatures"}
    else:
        status = "active"
    signatures = int(signatures[0].replace(",", ''))

    if not pid:
        # no pid if fewer than 20 signatures
        try:
            pid = page.xpath(
                "//a[@class='load-next no-follow active']/@rel")[0]
        except IndexError:
            pid = "N/A"

    return {
        "id": pid,
        "status": status,
        "title": page.xpath("//h1[@class='title']/text()")[0].strip(),
        "body": text,
        "issues": page.xpath("//div[@class='issues']/a/text()"),
        "created": created,
        "visited": datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
        "signature_count": signatures,
        "url": "http://petitions.whitehouse.gov" + path
    }
def visit_page(url,path="",save=False): content = {"title" : "", "url" : "", "keywords" : "", "links" : [], "body" : ""} if urlparse(url).netloc == "": #print "partial:", url return content if urlparse(url).netloc in BANNED: #print "banned:", url return content try: resp = urllib2.urlopen(url) except URLError as e: print e return content if resp.getcode() != 200: "Bad response: ",resp.getcode() return content #resolves URL content["url"] = resp.url html = resp.read() try: tree = etree.parse(StringIO.StringIO(html), parser) except: print "LXML error" return content content["title"] = tree.xpath("//title//text()") if len(content["title"]) > 0: content["title"] = content["title"][0].strip() content["links"] = tree.xpath("//body//@href") content["keywords"] = tree.xpath("//meta[@name='keywords']/@content") if content["keywords"] == "": content["keywords"] = tree.xpath("//meta[@name='Keywords']/@content") print "caught a case ",url #content["body"] = cleaner.clean_html(etree.tostring(tree.xpath("//body")[0])) body = cleaner.clean_html(etree.tostring(tree.xpath("//body")[0])) content["word_count"] = len(body.split(" ")) #will save full html if save: filename = urllib.quote_plus(content["url"][0:60])+".txt" #filename.replace("http%3A%2F%2F", "") #if file doesn't already exist if not findInSub(filename,path): #make that day's path path = path + strftime("/%Y/%m/%d/", gmtime()) if not os.path.exists(path): os.makedirs(path) f = open(path+filename, "w+") f.write(html) f.close() print "wrote " + path+filename else: print "already had " + filename return content
def get_load_test_result(load_test_report_path):
    if not os.path.exists(load_test_report_path):
        # "the test was interrupted or the test code failed; no report was generated"
        return 'ERROR', '测试被人为中断或测试代码出错,未能生成报告'

    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse(load_test_report_path, parser=parser)

    def row_cells(row):
        # columns 1-7: name, status, TPS, CPU %, memory %, bandwidth up, bandwidth down
        cell = '/html/body/div/div[2]/div/table/tbody/tr[%d]/td[%d]/text()'
        return [html.xpath(cell % (row, col))[0] for col in range(1, 8)]

    # labels: scenario, status, transactions per second, CPU (%), memory (%),
    # upstream bandwidth (Mb/s), downstream bandwidth (Mb/s)
    labels = ['场景:', '状态:', '每秒事务数:', '处理器(%):', '内存(%):',
              '带宽上行(Mb/s):', '带宽下行(Mb/s):']
    cbft_cells = row_cells(1)
    wasm_cells = row_cells(3)
    cbft = '<br>'.join(label + value for label, value in zip(labels, cbft_cells))
    wasm = '<br>'.join(label + value for label, value in zip(labels, wasm_cells))

    if cbft_cells[1] == 'success' and wasm_cells[1] == 'success':
        TestResult = 'PASS'
    else:
        TestResult = 'FAIL'
    return TestResult, cbft + '<br><br>' + wasm
def check_waf(self, resp):
    self._xmlstr_dom = etree.parse(cwd + '/fingerprinting.xml')
    waf_doms = self._xmlstr_dom.xpath("waf")
    detect = 0
    for waf_dom in waf_doms:
        finger_dom = waf_dom.xpath("finger")
        rule_dom = finger_dom[0].xpath("rule")
        head_type = rule_dom[0].get("header").lower()
        if head_type in resp.headers:
            regx = self.regexp_header(rule_dom, waf_dom, head_type, resp)
            if regx > 0:
                detect += 1
    return detect
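# A sketch of the fingerprinting.xml layout check_waf() implies: the
# waf/finger/rule element names and the "header" attribute come from the xpaths
# above, while the WAF name, the <regexp> child, and its pattern are purely
# illustrative (regexp_header is not shown, so its rule format is an assumption).
#
# <root>
#     <waf name="ExampleWAF">
#         <finger>
#             <rule header="Server">
#                 <regexp>example-waf</regexp>
#             </rule>
#         </finger>
#     </waf>
# </root>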
def find_text_by_id(xml_file, node_id: str):
    if isinstance(xml_file, str):
        xml_file = open(xml_file)
    tree = et.parse(xml_file)
    root = tree.getroot()
    ns = {"default": root.nsmap[None]}
    # climb two levels above the matching node, then take the text of all
    # of its following siblings
    dest_elements = root.xpath(".//*[contains(@id, '%s')]//parent::*"
                               "//parent::*//following-sibling::*//text()"
                               % (node_id, ),
                               namespaces=ns)
    res = ""
    for t in dest_elements:
        res += t if t is not None else ""
    return res
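# A minimal usage sketch; "chapter.xhtml" and "chapter-3" are hypothetical.
# The input document needs a default namespace, since root.nsmap[None] would
# raise a KeyError without one.
body = find_text_by_id("chapter.xhtml", "chapter-3")
print(body)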
def crawl(path, pid=None):
    body = download("http://petitions.whitehouse.gov" + path,
                    path.split("/")[2] + ".html")
    page = etree.parse(StringIO(body), parser)

    # catch the page text whether or not the petition is still active
    # http://stackoverflow.com/questions/5662404/how-can-i-select-an-element-with-multiple-classes-with-xpath
    text = "\n".join(page.xpath(
        "//div[contains(concat(' ',@class,' '),' petition-detail')]/p/text()"))

    # check if expired
    if "The petition you are trying to access has expired" in text:
        return {"status": "expired"}

    # if raw_date is not found, this is probably a bad link (or a change in
    # the HTML, so we should be careful)
    try:
        raw_date = page.xpath("//div[@class='date']/text()")[0].strip()
    except IndexError:
        return {"status": "error", "reason": "no date"}
    created = datetime.strptime(raw_date, "%b %d, %Y").strftime("%Y-%m-%d")

    signatures = page.xpath("//div[@class='num-block num-block2']/text()")
    # indicates a possible response
    if len(signatures) == 0:
        signatures = page.xpath("//div[@class='num-block']/text()")
        response = page.xpath(
            "//div[contains(concat(' ',@class,' '),' petition-response')]")
        if response:
            status = "answered"
        else:
            return {"status": "error", "reason": "no signatures"}
    else:
        status = "active"
    signatures = int(signatures[0].replace(",", ""))

    if not pid:
        # no pid if fewer than 20 signatures
        try:
            pid = page.xpath("//a[@class='load-next no-follow active']/@rel")[0]
        except IndexError:
            pid = "N/A"

    return {
        "pid": pid,
        "status": status,
        "title": page.xpath("//h1[@class='title']/text()")[0].strip(),
        "text": text,
        "tags": page.xpath("//div[@class='issues']/a/text()"),
        "created": created,
        "visited": datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
        "signatures": signatures,
        "url": "http://petitions.whitehouse.gov" + path,
    }
def gain_config(filepath):
    """
    Parses the config file; this works the same way as parsing any XML file.
    :param filepath: path to the config file
    :return: the default configuration values
    """
    config_tree = etree.parse(filepath)
    datas = config_tree.xpath('/spiders/spider[@id="job_spider"]')
    print(len(datas))
    condition = datas[0].xpath('./condition')[0].text
    page_count = datas[0].xpath('./pageAccount')[0].text
    return condition, page_count
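# A sketch of the config layout gain_config() implies; the element names come
# straight from the xpaths above, and the values are illustrative:
#
# <spiders>
#     <spider id="job_spider">
#         <condition>python</condition>
#         <pageAccount>10</pageAccount>
#     </spider>
# </spiders>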
def smart_read(url):
    resp = urllib2.urlopen(url)
    # resolve the url
    url = resp.url
    domain = urlparse(url).netloc
    path = urlparse(url).path
    html = resp.read()
    tree = etree.parse(StringIO.StringIO(html), parser)
    links = tree.xpath("//body//@href")
    # pick the same-domain link whose path is most similar to (but not
    # identical with) this page's path
    nmax = 0
    mirror = None
    for link in links:
        if urlparse(link).netloc == domain:
            ng = NGram.compare(urlparse(link).path, path)
            #print link, ng
            if ng > nmax and ng < 1:
                nmax = ng
                mirror = link
    if mirror is None:
        return
    diffh = htmldiff(visit_page(url)["body"], visit_page(mirror)["body"])
    tree = etree.parse(StringIO.StringIO(diffh), parser)
    diff = tree.xpath("//ins//text()")
    for d in diff:
        print d
def diy_parse_content_list(content_list_file):
    if isinstance(content_list_file, str):
        content_list_file = open(content_list_file)
    tree = et.parse(content_list_file)
    root = tree.getroot()
    ns = {"default": root.nsmap[None]}
    nav_points = root.xpath(".//*[contains(@class, 'MsoHyperlink')]/default:a",
                            namespaces=ns)
    nav_points_tuples = []
    for node in nav_points:
        chapter_name = node.text
        chapter_loc = "content/" + node.attrib["href"]
        nav_points_tuples.append((chapter_name, chapter_loc))
    return nav_points_tuples
def parse_toc(toc_file):
    if isinstance(toc_file, str):
        toc_file = open(toc_file)
    tree = et.parse(toc_file)
    root = tree.getroot()
    ns = {"default": root.nsmap[None]}
    nav_points = root.xpath(".//default:navPoint", namespaces=ns)
    nav_points_tuples = []
    for node in nav_points:
        chapter_name = node.find("./default:navLabel/default:text",
                                 namespaces=ns).text
        chapter_loc = node.find("./default:content", namespaces=ns).attrib["src"]
        nav_points_tuples.append((chapter_name, chapter_loc))
    return nav_points_tuples
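# A minimal usage sketch; "toc.ncx" is a hypothetical path. The navPoint,
# navLabel/text, and content@src names queried above match the standard
# EPUB 2 NCX table-of-contents schema.
for name, loc in parse_toc("toc.ncx"):
    print(name, "->", loc)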
def check_regexp(self, content):
    if not content:
        return
    self._xmlstr_dom = etree.parse('regexp.xml')
    javaid_doms = self._xmlstr_dom.xpath("javaid")
    for javaid_dom in javaid_doms:
        self._vultype = javaid_dom.get("vultype")
        #print "vul_type " + self._vultype
        function_doms = javaid_dom.xpath("function")
        for function_dom in function_doms:
            rule_dom = function_dom.xpath("rule")
            self._function = rule_dom[0].get("name")
            self.regexp_search(rule_dom, content)
            #print "check_regexp search ..."
    return True
def get_metadata(opf_file):
    if isinstance(opf_file, str):
        opf_file = open(opf_file)
    tree = et.parse(opf_file)
    ns = {
        "dc": "http://purl.org/dc/elements/1.1/",
        "opf": "http://www.idpf.org/2007/opf",
        "calibre": "http://calibre.kovidgoyal.net/2009/metadata"
    }
    # select every child of <metadata> that sits in the Dublin Core namespace
    namespace_dc = tree.xpath(".//opf:metadata/*[namespace-uri()='%s']"
                              % (ns["dc"], ),
                              namespaces=ns)
    dc_tuples = []
    for node in namespace_dc:
        dc_tuples.append((strip_namespace(node.tag), node.text))
    return dc_tuples
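# A minimal usage sketch; "metadata.opf" is a hypothetical Calibre metadata
# file (the same dc/opf/calibre namespaces appear in the __main__ snippet
# further down), and strip_namespace is the helper defined alongside it.
for tag, value in get_metadata("metadata.opf"):
    print("%s: %s" % (tag, value))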
def petitions(start=1, mx=None):
    if mx is None:
        mx = -1
    # log objects for tracking signatures over time
    hits = 0
    # scan the WH site and add any new petitions to the DB; there is surely a
    # better way to get an indefinite number of results than a functionally
    # infinite loop that we break out of, but drawing a blank
    for pg in range(start, 1000):
        log("Loading page %d" % pg)
        # the WH site loads petitions from an external HTML doc in a JSON shell
        url = "https://petitions.whitehouse.gov/petitions/more/all/%d/2/0/" % pg
        try:
            raw = s.urlopen(url).encode('utf-8')
        except scrapelib.HTTPError:
            log("Error downloading %s" % url)
            return hits
        resp = json.loads(raw)
        if "markup" not in resp or len(resp["markup"]) == 0:
            log("No results at page %i" % pg)
            return hits
        page = etree.parse(StringIO(resp['markup']), parser)
        # there are two links to each petition in the results, but they can be
        # reduced to uniques with "nofollow"
        petitions = page.xpath("body/div[@class]")
        if len(petitions) == 0:
            return hits
        for petition in petitions:
            # get the uid for each petition from the main div id
            pid = petition.xpath("@id")[0].split('-')[1]
            path = petition.xpath("div/div/a/@href")[0]
            data = crawl(path, pid)
            # if the petition is dead (unlikely when scanned from the WH site
            # directly, but you never know):
            if data["status"] == "expired":
                scrapelog["signatures"][path.split("/")[2]] = -1
            elif data["status"] == "active":
                scrapelog["signatures"][path.split("/")[2]] = data["signature_count"]
            write(json.dumps(data, indent=2, sort_keys=True),
                  "scrape/petitions/" + data['id'] + ".json")
            hits += 1
            if mx != -1 and hits >= mx:
                return hits
def check_waf(self, resp):
    try:
        self._xmlstr_dom = etree.parse(cwd + '/dic/fingerprinting.xml')
        waf_doms = self._xmlstr_dom.xpath("waf")
        detect = 0
        for waf_dom in waf_doms:
            finger_dom = waf_dom.xpath("finger")
            rule_dom = finger_dom[0].xpath("rule")
            head_type = str(rule_dom[0].get("header").lower())
            if head_type in resp.headers:
                regx = self.regexp_header(rule_dom, waf_dom, head_type, resp)
                if regx > 0:
                    detect += 1
        return detect
    except:
        print "[+] Error getting headers from the server: " + self._url
def from_kingsoft(query: str):
    try:
        xml = urlopen(
            f"http://dict-co.iciba.com/api/dictionary.php?w={query}&key=1F287830F78CD6CFEB5E4279236CBEBB"
        )
        root = et.parse(xml)
        xpath_pos = ".//pos//text()"
        xpath_acc = ".//acceptation//text()"
        acceptations = root.xpath(xpath_acc)
        pos = root.xpath(xpath_pos)
        # pair each part of speech with its cleaned definition
        meaning = [
            p + utils_text_preprocess.clean_text(a)
            for p, a in zip(pos, acceptations)
        ]
    except BaseException as e:
        print(e)
        meaning = []
    return meaning
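# A minimal usage sketch; the iciba API key is the one embedded in the snippet
# above, and utils_text_preprocess is assumed to be a project-local module.
for entry in from_kingsoft("hello"):
    print(entry)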
def get_congress(cong):
    params = urllib.urlencode({'congress': cong})
    results = urllib.urlopen(
        'http://bioguide.congress.gov/biosearch/biosearch1.asp', params)
    page = etree.parse(StringIO.StringIO(results.read()), etree.HTMLParser())
    nas = 1
    for member in page.xpath("//table")[1].xpath("tr")[1:]:
        name = member.xpath("td/a/text()")
        print name
        if len(name) == 0:
            name = ""
            print nas
            nas += 1
            continue
        else:
            name = name[0]
        pid = member.xpath("td/a/@href")[0].split("=")[1]
        stats = member.xpath("td/text()")
        c.execute('''INSERT OR IGNORE INTO terms
                     (pid, name, dates, position, party, state, congress)
                     VALUES (?,?,?,?,?,?,?)''',
                  (pid, name, stats[0], stats[1][0], stats[2], stats[3],
                   int(stats[4])))
        conn.commit()
def check_waf(self, resp):
    if not resp.content:
        return
    self._xmlstr_dom = etree.parse('finger.xml')
    waf_doms = self._xmlstr_dom.xpath("waf")
    for waf_dom in waf_doms:
        finger_dom = waf_dom.xpath("finger")
        rule_dom = finger_dom[0].xpath("rule")
        head_type = rule_dom[0].get("header").lower()
        if head_type in resp.headers:
            if self.regexp_header(rule_dom, waf_dom, head_type, resp):
                return True
            else:
                self._nowaf = "This website has no waf or identify false!!!"
                print "get waf finger false:" + self._nowaf
        else:
            print "head type search ..."
    if check_resp(resp):
        return True
    return False
def getXMLConfiguration(path="regexp.xml"):
    config = {}
    xml_dom = etree.parse(path)
    jsid_doms = xml_dom.xpath("jsid")
    try:
        for jsid_dom in jsid_doms:
            vultype = jsid_dom.get("vultype")
            #print vultype
            config[vultype] = []
            rule_doms = jsid_dom.xpath("rule")
            for rule_dom in rule_doms:
                regexp_dom = rule_dom.xpath("regexp")[0]
                value = [rule_dom.get("name"), re.compile(regexp_dom.text)]
                config[vultype].append(value)
    except:
        print "Error when parsing xml file. Please check the format"
    return config
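# A sketch of the regexp.xml layout getXMLConfiguration() implies; the
# jsid/rule/regexp element names and the vultype/name attributes come from the
# code above, while the sample rule itself is illustrative:
#
# <root>
#     <jsid vultype="dom-xss">
#         <rule name="document-write">
#             <regexp>document\.write\s*\(</regexp>
#         </rule>
#     </jsid>
# </root>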
def check_waf(self, resp):
    if not resp.content:
        return
    self._xmlstr_dom = etree.parse('finger.xml')
    waf_doms = self._xmlstr_dom.xpath("waf")
    for waf_dom in waf_doms:
        finger_dom = waf_dom.xpath("finger")
        rule_dom = finger_dom[0].xpath("rule")
        head_type = rule_dom[0].get("header").lower()
        if head_type in resp.headers:
            if self.regexp_header(rule_dom, waf_dom, head_type, resp):
                return True
            else:
                self._nowaf = "This website has no waf or identify false!!!"
                #print "[+]【Wafid】get waf finger false: " + self._nowaf
        else:
            continue
            #print "head type search ..."
    if self.check_resp(resp):
        return True
    return False
def diff_rss(url, name, limit=-1):
    rss = feedparser.parse(url)
    links = {}
    #print rss
    if limit == -1 or limit > len(rss.entries):
        limit = len(rss.entries)
    first_index = get_first_index(rss)
    for i in range(first_index, limit + 1):
        links[rss.entries[i].link] = []
        post1 = rss.entries[i - 1].link
        if i == limit:
            post2 = rss.entries[first_index - 1].link
        else:
            post2 = rss.entries[i].link
        print post2
        # diff consecutive posts and collect the links added in the newer one
        diffh = htmldiff(get_content(post1)["body"], get_content(post2)["body"])
        tree = etree.parse(StringIO.StringIO(diffh), parser)
        diff = tree.xpath("//ins//@href")
        for d in diff:
            if (urlparse(d).netloc != urlparse(rss.feed.link).netloc
                    and urlparse(d).path != '/'):
                links[rss.entries[i].link].append(d)
    return links
def rss_search():
    r = c.execute('''select * from tips where rss=""''').fetchall()
    for line in r:
        source = line["domain"]
        #print source
        try:
            response = urllib2.urlopen(source).read()
            tree = etree.parse(StringIO(response), etree.HTMLParser())
            root = tree.getroot()
            # try progressively looser ways of spotting a feed link
            feed = tree.xpath("//link[@type='application/rss+xml']")
            if feed == []:
                feed = tree.xpath("//a[text()='RSS']")
            if feed == []:
                feed = tree.xpath("//a[@href='/feed/']")
            if feed == []:
                feed = tree.xpath("//a[@href='" + source + "/feed/']")
            if feed == []:
                feed = tree.xpath("//a[@href='/feeds/']")
            if feed == []:
                feed = tree.xpath("//a[@href='" + source + "/feeds/']")
            if feed != []:
                print feed[0]
                rss = feed[0].get("href")
                if urlparse(rss).netloc != "" and urlparse(rss).netloc != "/":
                    print rss
                    c.execute("""update tips set rss = ? where domain = ?""",
                              (rss, source))
                    conn.commit()
                else:
                    rss_feed = source + rss
                    print rss_feed
                    c.execute("""update tips set rss = ? where domain = ?""",
                              (rss_feed, source))
                    conn.commit()
        except IOError:
            print "bad call", source
def auth(user, passwd):
    os = 'Linux'
    useragent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:14.0) Gecko/20100101 Firefox/14.0.1'
    AuthCheckURL = 'http://google.com/'
    parser = etree.HTMLParser()
    config = ConfigParser.SafeConfigParser({'os': os,
                                            'useragent': useragent,
                                            'debug': 'False'})
    config.read('unCleanAccess.cfg')
    if not config.has_section('login'):
        config.add_section('login')
    debug = config.get('login', 'debug') in ('True', 'true')
    print 'Checking if Authenticated'
    if not debug:
        responseAuthCheck = urllib2.urlopen(AuthCheckURL)
        AuthCheckhtml = responseAuthCheck.read()
    else:
        f = open('unCleanAuthCheckunauthed.html', 'r')
        AuthCheckhtml = f.read()
        f.close()
    if AuthCheckhtml.find('/auth/perfigo_weblogin.jsp') != -1:
        print 'Not Authenticated Yet'
        urlSplit = AuthCheckhtml.split('URL=')
        if len(urlSplit) != 2:
            print 'Error extracting redirect URL (1)'
        else:
            urlSplit = re.split("'>|;", urlSplit[1])
            if len(urlSplit) < 2:
                print 'Error extracting redirect URL (2)'
            else:
                print 'Fetching Login Page'
                if not debug:
                    responseAuthPage = urllib2.urlopen(urlSplit[0])
                    AuthPagehtml = etree.parse(responseAuthPage, parser)
                else:
                    f = open('authPage.html', 'r')
                    AuthPagehtml = etree.parse(f, parser)
                    f.close()
                print 'Parsing Login Page'
                POSTDataItems = dict()
                for formInput in AuthPagehtml.xpath(".//form[@name='loginform']//input"):
                    if formInput.get('name'):
                        POSTDataItems[formInput.get('name')] = formInput.get('value')
                POSTDataItems['pm'] = config.get('login', 'os')
                POSTDataItems['username'] = user
                POSTDataItems['password'] = passwd
                authData = urllib.urlencode(POSTDataItems)
                authHeaders = {'Referer': urlSplit[0].split('perfigo_weblogin.jsp', 1)[0],
                               'User-Agent': config.get('login', 'useragent')}
                print 'Logging in'
                authReq = urllib2.Request(
                    urlSplit[0].split('auth/perfigo_weblogin.jsp', 1)[0]
                    + AuthPagehtml.xpath(".//form[@name='loginform']")[0]
                        .get('action').split('/', 1)[1],
                    authData, authHeaders)
                responseAuthReq = urllib2.urlopen(authReq)
                authReqhtml = responseAuthReq.read()
                if authReqhtml.find('You have been successfully logged on the network') != -1:
                    print 'Successfully Authenticated!'
                else:
                    print 'Invalid credentials'
                    # setCreds and regKeyVal are defined elsewhere in the module
                    (userName, password) = setCreds(regKeyVal)
                    auth(userName, password)
    else:
        print 'Already Authenticated'
def parse(url):
    return etree.parse(StringIO.StringIO(requests.get(url).text))
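# A minimal usage sketch; the URL is illustrative, and the fetched document
# must be well-formed XML, since no HTML parser is passed to etree.parse.
tree = parse("http://example.com/feed.xml")
print tree.xpath("//title/text()")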
def getMoreRequests(url):
    # driver is assumed to be a Selenium webdriver created elsewhere; the
    # function name and argument are taken from the call site below
    driver.get(url)
    html = driver.page_source.encode('utf-8')
    page_num = 0
    while driver.find_element_by_xpath('//*[@class="load_more mt3"]/a'):
        driver.find_element_by_xpath('//*[@class="load_more mt3"]/a').click()
        page_num += 1
        print("getting page number " + str(page_num))
        time.sleep(1)
        if page_num == 1:
            break
    return driver.page_source.encode('utf-8')


response = getMoreRequests(
    'https://www.kickstarter.com/discover/advanced?category_id=16')
htmlparser = etree.HTMLParser()
# etree.parse needs a file-like object, not raw bytes (BytesIO from io)
tree = etree.parse(BytesIO(response), htmlparser)
soup = BeautifulSoup(response, 'html.parser')
# projects_grid = soup.find('div', id="projects")
projects = soup.find_all(
    'div',
    {"class": "js-react-proj-card grid-col-12 grid-col-6-sm grid-col-4-lg"})
for project in projects:
    print(project['data-projects'])
    ## '//div[contains(@data-project)]/@data-project').getall()
outfile = '/tmp/pret.pkl'
soupurl = 'http://www.pret.com/our_food/soup.htm'


def fix_text(astr):
    """Remove undesirable characters and strings"""
    astr = astr.strip()
    return astr


html = urllib2.urlopen(soupurl).read()
html = html.replace('<br />', '')
parser = etree.HTMLParser()
doc = etree.parse(StringIO(html), parser)
soupnames = doc.xpath('//div[@class="soup_details_hidden"]'
                      '//img[contains(@src, \'souptitle\')]//@alt')
#pprint(soupnames)
allsoups = map(fix_text, soupnames)
#pprint(allsoups)
souplist = [
    [allsoups[0], allsoups[1], allsoups[2]],
    [allsoups[3], allsoups[4], allsoups[5]],
    [allsoups[6], allsoups[7], allsoups[8]],
    [allsoups[9], allsoups[10], allsoups[11]],
    [allsoups[12], allsoups[13], allsoups[14]],
]
URL_source = u'http://unicode.org/repos/cldr/trunk/common/supplemental/supplementalData.xml'
encoding_source = "utf-8"

import os

URL_path, downloaded_source = os.path.split(URL_source)
file_output = downloaded_source.split(os.extsep)[0] + ".tsv"

## Definitions of variables and their xpaths ##

## Using requests to download and lxml to parse
from lxml.html import fromstring, tostring, parse, etree
from io import StringIO, BytesIO

try:
    tree = etree.parse(downloaded_source)  # etree.parse is used to parse the xml
except:
    XML_encoding = encoding_source
    XML_src_url = URL_source
    import requests
    r = requests.get(XML_src_url, stream=True)
    r.raw.decode_content = True
    if not (r.status_code == 200):
        print("Downloading the data from {} failed. Please check the Internet connection."
              .format(XML_src_url))
        exit()
    XML_src = r.content  # r.raw.read() # r.raw # r.text
try:
    import configparser
except ImportError:
    import ConfigParser as configparser

Config = configparser.ConfigParser()
Config.read("config.ini")
dir_source = Config.get("Directory", 'source')
dir_outcome = Config.get("Directory", 'outcome')
fn_suffix = Config.get("Filename", 'suffix')
fn_output = [x.strip() for x in Config.get("Filename", 'CLDR_suppl').split(",")]
data_src = Config.get("Source", 'CLDR_suppl')
fn_operating = os.path.join(dir_source, data_src.split('/')[-1])

try:
    tree = etree.parse(fn_operating)
except:
    XML_src_url = data_src
    import requests
    r = requests.get(XML_src_url, stream=True)
    r.raw.decode_content = True
    if not (r.status_code == 200):
        logging.warning("Downloading the data from {0} failed. Please check the Internet connection.".format(XML_src_url))
        exit()
    # requests automatically decodes content from the server (as r.text);
    # the response body is also available as bytes (as r.content)
    XML_src = r.content  # r.raw.read() # r.raw # r.text
    XML_encoding = r.encoding  # 'ISO-8859-1'
import requests
import codecs
import os

path_data = u'../data'
data_src_url = u'http://unicode.org/repos/cldr/trunk/common/supplemental/supplementalData.xml'
data_src_path, data_src_local = os.path.split(data_src_url)
encoding_source = "utf-8"
fn_output1 = os.path.join(path_data, 'CLDR_web.tsv')
fn_output3 = os.path.join(path_data, 'CLDR_web_regin_country_no.tsv')

## Parsing data from remote or local sources
try:
    tree = etree.parse(data_src_local)  # etree.parse is used to parse the xml
except:
    r = requests.get(data_src_url, stream=True)
    if not (r.status_code == 200):
        print("Downloading the data from {} failed. Please check the Internet connection.".format(data_src_url))
        exit()
    r.encoding = 'utf-8'
    XML_src = r.text  # decoded unicode content
    # cache the download locally, then parse the cached copy
    with codecs.open(data_src_local, mode="w", encoding="utf-8") as file:
        file.write(XML_src)
    tree = etree.parse(data_src_local)
def __init__(self, xml_file_name):
    self._tree = etree.parse(xml_file_name)
    self.root = self._tree.getroot()
    self.content = []
# import xml.etree.ElementTree as ET
from lxml.html import etree as ET
import re


def strip_namespace(xml_tag: str):
    if "}" in xml_tag:
        return xml_tag.split("}")[-1]
    return xml_tag


if __name__ == "__main__":
    tree = ET.parse(
        '../datafield/A Clash of Kings - George R.R. Martin/metadata.opf')
    NS = {
        "dc": "http://purl.org/dc/elements/1.1/",
        "opf": "http://www.idpf.org/2007/opf",
        "calibre": "http://calibre.kovidgoyal.net/2009/metadata"
    }
    NamespaceDC = tree.xpath(".//opf:metadata/*[namespace-uri()='%s']"
                             % (NS["dc"], ),
                             namespaces=NS)
    DCTuples = []
    for node in NamespaceDC:
        DCTuples.append((strip_namespace(node.tag), node.text))
def __init__(self, page_source, elements=None):
    parser = etree.HTMLParser()
    self.tree = etree.parse(StringIO(page_source), parser)
    self.elements = elements or tuple(
        imap(self._map_lxml_element, self.tree.iter(tag=etree.Element)))
    self.root_xpath = self._get_xpath(self.tree.getroot())
from lxml.html import etree as ET

if __name__ == "__main__":
    tree = ET.parse(
        '../datafield/A Clash of Kings - George R.R. Martin/content/'
        'George R.R. Martin - Fire and Ice 02 - A Clash of Kings v4.0 (BD)_split_6.html'
    )
    root = tree.getroot()
    ns = {"default": root.nsmap[None]}
    # NavPoints = root.xpath(".//*[contains(@class, 'MsoHyperlink')]/default:a", namespaces=ns)
    # tmpNode = NavPoints[0]
    # NavPointsTuples = []
    # for node in NavPoints:
    #     chapter_name = node.xpath("./default:navLabel/default:text/text()", namespaces=ns)
    #     chapter_loc = node.xpath("./default:content/@src", namespaces=ns)
    #     NavPointsTuples.append((chapter_name, chapter_loc))
__author__ = 'mike'

import lxml.html
from lxml.html import etree
# from lxml.html.soupparser import etree

print("hello world")
parser = lxml.html.HTMLParser()
tree = etree.parse('e:\\users\\mike\\documents\\bookmarks_12_5_14_small.html',
                   parser)
root = tree.getroot()
clean_links = {}
pre_text = "<DT><A "
post_text = "</A>"
for element in root.iter("a"):
    # print("%s - %s - %s" % (element.tag, element.attrib, element.text))
    clean_links[element.text] = element.attrib
    # if (len(clean_links) > 10):
    #     break
for link in sorted(clean_links.keys()):
    # rebuild a Netscape-bookmark-style line for each link
    line = pre_text
    for attrib_key in clean_links[link].keys():
        line += " "
        line += str(attrib_key).upper() + "=" + '"'
        line += clean_links[link][attrib_key]
        line += '"'
    line += ">" + link + post_text