def htmldiff(path1, path2):
    tree1 = parse(path1).getroot()
    tree2 = parse(path2).getroot()

    elementsA_hash = {}
    elementsB_hash = {}
    isLeafNodeA = {}
    isLeafNodeB = {}
    outputDict = {}
    numberOfChanges = 0

    hashNodes(tree1, elementsA_hash, isLeafNodeA, False, None)
    hashNodes(tree2, elementsB_hash, isLeafNodeB, True, elementsA_hash)
    sameKeys = findSimilarNodes(elementsA_hash, elementsB_hash)

    for key, value in elementsA_hash.iteritems():
        output = {}
        try:
            if key in sameKeys:
                continue

            isSameNode(elementsA_hash[key], elementsB_hash[key],
                       isLeafNodeA[key], outputDict)
        except KeyError as e:
            node = elementsA_hash[key]
            # print 'I got a KeyError - reason "%s"' % str(e)
            output['afterText'] = ""
            output['afterAttribute'] = ""
            output['rawTextChange'] = node.text
            output['rawAttributeChange'] = node.attrib
            output['elementType'] = node.tag
            output['op'] = ""
            output['fullText'] = ""
            output['otherInfo'] = "DELETED NODE"
            outputDict[getDiffOutputNumber(outputDict)] = output

    for key, value in elementsB_hash.iteritems():
        output = {}
        try:
            # Check to see if this node exists in the original file
            tempNode = elementsA_hash[key]
        except KeyError as e:
            node = elementsB_hash[key]
            # print 'I got a KeyError - reason "%s"' % str(e)
            output['afterText'] = node.text
            output['afterAttribute'] = node.attrib
            output['rawTextChange'] = node.text
            output['rawAttributeChange'] = node.attrib
            output['elementType'] = node.tag
            output['op'] = ""
            output['fullText'] = ""
            output['otherInfo'] = "ADDED NODE"
            outputDict[getDiffOutputNumber(outputDict)] = output

    print getDiffOutputNumber(outputDict)
    return outputDict
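
A minimal usage sketch for the function above; it assumes the helpers it relies on (hashNodes, findSimilarNodes, isSameNode, getDiffOutputNumber) are defined in the same module, and the two input paths are hypothetical placeholders, not files from the original project:

# Hedged usage sketch: diff two hypothetical local files and print each recorded change.
if __name__ == '__main__':
    changes = htmldiff('old.html', 'new.html')   # placeholder paths
    for idx in sorted(changes):
        change = changes[idx]
        # each entry describes one added, deleted or otherwise changed node
        print idx, change.get('elementType', ''), change.get('otherInfo', '')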
Example no. 2
def getfields(keys):
    for page in ['system','signal','status']:
        root=parse(fetch("%s/%s.asp" % (host, page)))
        for x in root.xpath(".//tr"):
            fields=[totext(t) for t in x.xpath('./td')]
            if "%s/%s" % (page, fields[0]) in keys:
                print fields[0], fields[1]
Example no. 3
    def from_str(self, html_str, partial=False) -> HtmlNode:
        '''
            Create an `HtmlNode` from a string.

            Keyword Arguments:
                partial: If True, the provided string is treated as a fragment of HTML rather than a full document.
        '''
        if partial:
            html_str = f"<html><body>{html_str}</body></html>"
            lxml_tree = soupparser.parse(StringIO(html_str))
            body = lxml_tree.getroot().find('body')
            return HtmlNode(list(body)[0])
        else:
            lxml_tree = soupparser.parse(StringIO(html_str))
            body = lxml_tree.getroot()
            return HtmlNode(body)
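
A brief, hedged usage sketch for this method; `loader` below stands for a hypothetical instance of whatever class defines from_str, not a name from the original project:

# Hypothetical usage of from_str (the enclosing class is not shown in this excerpt).
fragment_node = loader.from_str("<p>Hello <b>world</b></p>", partial=True)   # fragment is wrapped in <html><body> first
document_node = loader.from_str("<html><body><p>Hi</p></body></html>")       # full document parsed as-is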
Example no. 4
    def _getElementTreeRoot(self, url):
        import lxml.html.soupparser as soupparser
        import urllib.request

        _url = urllib.request.urlopen(url)
        tree = soupparser.parse(_url)
        return tree.getroot()
Example no. 5
def readXMLlxml(filename):
	from lxml.html import soupparser

	tree = soupparser.parse(filename)
	root = tree.getroot()
	for child in root:
		yield child.tag, child.attrib
Example no. 6
def fetch(url):
    # url to etree
    try:
        f = urlopen(url)
    except:
        return '[!] unable to open %s' % url
    return parse(f)
Example no. 7
def fetch(url):
    # url to etree
    try:
        f=urlopen(url)
    except:
        return '[!] unable to open %s' % url
    return parse(f)
Example no. 8
def unmeta(url, res):
    """
    Finds any meta redirects in an httplib.response object that has
    text/html as its content-type.

    Args:

       url (str):  The url to follow one redirect

       res (httplib.response):  a http.response object

    Returns: (str).  The resolved url

    """
    if res and (res.getheader('Content-type') or "").startswith('text/html'):
        size=65535
        if res.getheader('Content-Length'):
           try:
              tmp=int(res.getheader('Content-length'))
              if tmp<65535:
                 size=tmp
           except:
              print "wrong content-length:",res.getheader('Content-length')

        root=parse(StringIO(res.read(size)))
        for x in root.xpath('//meta[@http-equiv="refresh"]'):
            newurl=x.get('content').split(';')
            if len(newurl)>1:
                newurl=newurl[1].strip()[4:]
                parts=httplib.urlsplit(urllib.unquote_plus(newurl))
                if parts.scheme and parts.netloc:
                    url=newurl
    return weedparams(url)
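
As a hedged, standalone illustration of the meta-refresh handling above: the content attribute is split on ';' and the leading 'url=' is stripped. The sample markup and imports here are assumptions, not code from the original project:

# Minimal illustration of the meta-refresh extraction performed by unmeta().
from StringIO import StringIO
from lxml.html import parse   # assumption: the snippet's parse() comes from lxml.html

html = '<html><head><meta http-equiv="refresh" content="5;url=http://example.com/next"></head></html>'
root = parse(StringIO(html))
for x in root.xpath('//meta[@http-equiv="refresh"]'):
    newurl = x.get('content').split(';')
    if len(newurl) > 1:
        print newurl[1].strip()[4:]   # -> http://example.com/next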
Example no. 9
def fetch(url):
    # url to etree
    print >> sys.stderr, url
    f=urllib2.urlopen(url)
    raw=parse(f)
    f.close()
    return raw
Example no. 10
def interfaces():
    print "Interface Name\tProvisioned\tState\tSpeed\tMAC Address"
    root=parse(fetch("%s/status.asp" % host))
    for x in root.xpath(".//tr"):
        fields=[totext(t) for t in x.xpath('./td')]
        if "status/%s" % fields[0] in ifs:
            print fields[0], '\t', '\t'.join(fields[1:])
Example no. 11
    def __parseto_xtree(self, xhtml_s):

        if isinstance(xhtml_s, dict):
            base_url = xhtml_s.pop("base_url", None)
            # print "IN"
            print base_url
            resolve_base = xhtml_s.pop("resolve_base", True)
            clean_xhtml = xhtml_s.pop("clean_xhtml", False)
            xhtml_s = xhtml_s.pop("xhtml_s", None)
            assert xhtml_s,\
                "LinkExtractor.__parseto_xtree() Error: Dictionary with <None> xhtml source"

        elif isinstance(xhtml_s, str):
            clean_xhtml = False
            base_url = None

        else:
            raise Exception(
                "LinkExtractor.__parseto_xtree() Error: string or dictionary instance expected"
            )

        if clean_xhtml:
            xhtml_clr = html_clr(
                scripts=True, javascript=True, comments=True, style=True,
                links=True, meta=True, page_structure=False, processing_instructions=True,
                embedded=True, annoying_tags=True, remove_unknown_tags=True
            )
            # meta=False because we need MetaInfo

            xhtml_s = xhtml_clr.clean_html(xhtml_s)

        # The HTMLParser(s) should be defined in the thread (or process) to which lxml.html.parse is dispatched
        htmlparser = lxml.html.HTMLParser(recover=True, no_network=False)
        # recover mode and download DTD enabled

        # Now parse the XHTML source
        try:
            etree = lxml.html.parse(StringIO(xhtml_s), parser=htmlparser)
        except Exception as e:

            print("LinkExtractor Error: %s" % e)
            print("LinkExtractor: Now Trying with the SOUP parser")

            try:
                etree = soup.parse(xhtml_s)
            except Exception as e:
                raise Exception("LinkExtractor Error: %s" % e)

        if base_url:
            eroot = etree.getroot()
            try:
                eroot.make_links_absolute(base_url, resolve_base_href=resolve_base)
            except Exception as e:
                raise Exception(
                    "LinkExtractor.__parseto_xtree() while making links absolute Error: %s" % e
                )

        # Return the etree just created
        return etree
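
The parsing strategy above, stripped of the class plumbing, is: try lxml.html.parse with a recovering HTMLParser, fall back to the BeautifulSoup-based soupparser, and optionally make links absolute. A minimal hedged sketch of that pattern (not the class's actual API):

# Standalone sketch of the lxml-then-soupparser fallback used by __parseto_xtree.
from io import StringIO
import lxml.html
from lxml.html import soupparser

def parse_with_fallback(html_source, base_url=None):
    htmlparser = lxml.html.HTMLParser(recover=True)
    try:
        etree = lxml.html.parse(StringIO(html_source), parser=htmlparser)
    except Exception:
        etree = soupparser.parse(StringIO(html_source))   # most forgiving option
    if base_url:
        etree.getroot().make_links_absolute(base_url, resolve_base_href=True)
    return etree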
Example no. 12
def dump(pages=['system','signal','status','log','emta']):
    for page in pages:
        root=parse(fetch("%s/%s.asp" % (host, page)))
        for x in root.xpath(".//tr"):
            fields=[totext(t) for t in x.xpath('./td')]
            if filter(None,fields) and fields!=['']:
                print ':'.join(fields)
        print
Example no. 13
 def _initmap(self):
     pos=0
     i=0
     offset=0
     paths={}
     tree = parse(StringIO(self.doc.body.encode('utf8')))
     textnodes=tree.xpath('//div[@id="TexteOnly"]//text()')
     cut=5
     if not textnodes:
         textnodes=tree.xpath('//text()')
         cut=10
     texts=[unescape(x) for x in textnodes]
     #tmp = [token for frag in texts if frag for token in nltk.tokenize.wordpunct_tokenize(frag)]
     #for line in difflib.context_diff(tmp, self.doc.tokens):
     #    print repr(line)
     #print texts
     #print self.doc.tokens
     lastgood=(i,offset)
     while pos<len(self.doc.tokens):
         if i>=len(texts):
             print "guessing frag: %s, reset to %s, %s" % (self.doc.tokens[pos].encode('utf8'), lastgood[0], lastgood[1])
             (i, offset)=lastgood
             path=tree.getpath(textnodes[i].getparent())[cut:]
             paths[pos]=(path, offset)
             offset+=len(self.doc.tokens[pos])
             if offset>=len(texts[i]):
                 i+=1
                 offset=0
             pos+=1
             continue
         offset=texts[i].find(self.doc.tokens[pos],offset)
         if offset==-1:
             i+=1
             offset=0
             continue
         if textnodes[i].is_tail:
             path=tree.getpath(textnodes[i].getparent().getparent())[cut:]
             siblings=textnodes[i].getparent().getparent().xpath('.//text()')
             adjust=len(''.join(siblings[:siblings.index(textnodes[i])]))
             paths[pos]=(path, adjust+offset)
             #print 'asdf', self.doc.tokens[pos:pos+l], ''.join(siblings)[adjust+offset:adjust+offset+len(self.doc.tokens[pos])], adjust+offset, offset
         else:
             path=tree.getpath(textnodes[i].getparent())[cut:]
             paths[pos]=(path, offset)
             #print 'qwer', self.doc.tokens[pos], texts[i][offset:offset+len(self.doc.tokens[pos])], paths[pos], path, offset
         #print "frag: %s(%s) @%s" % (i,len(texts), paths[pos][1]),"token: %s(%s)" % (pos, len(self.doc.tokens)), self.doc.tokens[pos].encode('utf8')
         #print paths[pos]
         offset+=len(self.doc.tokens[pos])
         if offset>=len(texts[i]):
             i+=1
             offset=0
         lastgood=(i,offset)
         pos+=1
     #for pos, (path, offset) in sorted(paths.items()):
     #    print self.doc.tokens[pos], pos, path, offset
     #print len(paths), len(self.doc.tokens)
     #print
     return paths
Example no. 14
def fetch(url, retries=5, ignore=[], params=None):
    try:
        return parse(fetch_raw(url, retries, ignore, params))
    except:
        if retries > 0:
            time.sleep(4 * (6 - retries))
            return fetch(url, retries - 1, ignore=ignore)
        else:
            raise
Example no. 15
def main(basedir='', outdir='', infile='', outfile='', sexpr='.SearchResults'):
    """basic setup from directory"""
    os.chdir(basedir)  
    if not outfile:
        outfile = os.path.splitext(infile)[0] + '.csv'
    doc = soupparser.parse(infile)
    table = selecttable(doc, sexpr)
    rows = table2csv(table)
    write_csv(outfile,rows)
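
The selecttable, table2csv and write_csv helpers are not shown in this excerpt; as a rough, hypothetical sketch of what a table2csv along these lines could look like for an lxml table element (name and behaviour are assumptions, not the project's code):

# Hypothetical helper: flatten an lxml <table> element into rows of cell text.
def table2csv(table):
    rows = []
    for tr in table.xpath('.//tr'):
        rows.append([cell.text_content().strip() for cell in tr.xpath('./td|./th')])
    return rows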
Example no. 16
def munin():
    muninfields=["system/Receive Power Level", "system/Transmit Power Level", "signal/Signal to Noise Ratio"]
    for page in ['system','signal','status']:
        root=parse(fetch("%s/%s.asp" % (host, page)))
        for x in root.xpath(".//tr"):
            fields=[totext(t) for t in x.xpath('./td')]
            key="%s/%s" % (page, fields[0])
            if key in muninfields:
                print "%s.value %s" % (fields[0].lower().replace(' ','_'), split(fields[1]))
Example no. 17
def fetch(url, retries=5, ignore=[], params=None):
    try:
        return parse(fetch_raw(url, retries, ignore, params))
    except:
        if retries>0:
            time.sleep(4*(6-retries))
            return fetch(url,retries-1, ignore=ignore)
        else:
            raise
Example no. 18
def main():
    (options, args) = parseOpts()

    print "Fetch EPG data from '%s'." % get_url(options)
    raw     = urllib2.urlopen(get_url(options), 'utf-8')
    content = parse(raw)

    exporter = XmltvExporter(options.output)
    parser   = OtrParser(exporter)

    parser.parse(content)
    exporter.write()
Example no. 19
def nextPage(req):
    response = opener.open(req)
    tree = parse(response)
    map(
        scrape, ["http://www.europarl.europa.eu/oeil/" + x.get("href") for x in tree.xpath('//a[@class="com_acronym"]')]
    )

    img = tree.xpath('//a/img[@src="img/cont/activities/navigation/navi_next_activities.gif"]')
    if len(img):
        next = "http://www.europarl.europa.eu/" + img[0].xpath("..")[0].get("href")
        print >>sys.stderr, ("retrieving next page")
        nextPage(next)
Example no. 20
def title(request):
    
    url = request.GET.get('url', None)
    if url is None:
        return HttpResponseBadRequest()
    else:
        soup = parse(urlopen(url))
        title = soup.find('.//title').text

        return HttpResponse(dumps({
            'url': url,
            'title': title,
        }))
Example no. 21
def fetch(url):
    # url to etree
    try:
        f=urllib2.urlopen(url)
    except urllib2.HTTPError:
        try:
            f=urllib2.urlopen(url)
        except urllib2.HTTPError:
            try:
                f=urllib2.urlopen(url)
            except urllib2.HTTPError:
                return ''
    return parse(f)
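
The nested try/except retry above can also be written as a loop; a hedged, behaviour-preserving sketch (it assumes the same urllib2 and lxml parse the snippet relies on):

# Loop-based equivalent of the nested retries above: three attempts, then ''.
import urllib2
from lxml.html import parse   # assumption: the snippet's parse() comes from lxml.html

def fetch_with_retries(url, attempts=3):
    for _ in range(attempts):
        try:
            return parse(urllib2.urlopen(url))
        except urllib2.HTTPError:
            continue
    return ''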
Example no. 22
def fetchVotes(d):
    url="%s%s%s" % ("http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+PV+",
                    d,
                    "+RES-RCV+DOC+WORD+V0//EN&language=EN")
    print >>sys.stderr, url
    f=urllib2.urlopen(url)
    tmp=mkstemp()
    fd=os.fdopen(tmp[0],'w')
    fd.write(f.read())
    fd.close()
    f.close()
    res=subprocess.Popen(['/usr/bin/wvHtml', tmp[1], '-'],
                     stdout=subprocess.PIPE).communicate()[0]
    os.unlink(tmp[1])
    return parse(StringIO(res))
Example no. 23
File: crawler.py Project: h/eureka
    def fetch_broken_html(self, *args, **kwargs):
        '''
        like ``fetch_html`` with even more relaxed parsing by using
        ``BeautifulSoup`` as our parser

        '''

        from lxml.html import soupparser
        from eureka.xml import HTMLParser

        with self.fetch(*args, **kwargs) as fp:
            result = soupparser.parse(fp,
                     makeelement=HTMLParser().makeelement).getroot()
            result.make_links_absolute(fp.geturl(), handle_failures='ignore')
            return result
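
Stripped of eureka's custom element factory, the core pattern here is soupparser.parse plus make_links_absolute on the fetched URL; a hedged sketch using only the standard library and lxml:

# Minimal version of the pattern above, without the project-specific HTMLParser factory.
from urllib.request import urlopen
from lxml.html import soupparser

def fetch_broken_html_simple(url):
    with urlopen(url) as fp:
        root = soupparser.parse(fp).getroot()
        root.make_links_absolute(fp.geturl(), handle_failures='ignore')
        return root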
Example no. 24
def munin_speed():
    root=parse(fetch("%s/signal.asp" % host))
    modmap={'BPSK': 1, 'QPSK': 2, '8PSK': 3, '16QAM': 4, '32QAM': 5, '64QAM': 6, '256QAM': 8, }
    c=0
    for x in root.xpath(".//tr"):
        fields=[totext(t) for t in x.xpath('./td')]
        if fields[0]=="Modulation":
            c=modmap[fields[1]]
            continue
        if fields[0]=="Bit Rate":
            print "downstream_bitrate.value %.3f" % (int(split(fields[1]))/8000000.0)
            continue
        if fields[0]=="Symbol Rate":
            print "upstream_bitrate.value %.3f" % (int(split(fields[1]))*c/8000.0)
            continue
Example no. 25
def fetch(url):
    # url to etree
    print >> sys.stderr, url
    try:
        f=urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError):
        try:
            f=urllib2.urlopen(url)
        except (urllib2.HTTPError, urllib2.URLError):
            try:
                f=urllib2.urlopen(url)
            except (urllib2.HTTPError, urllib2.URLError):
                return ''
    raw=parse(f)
    f.close()
    return raw
Example no. 26
def fetch(url):
    # url to etree
    print >> sys.stderr, url
    try:
        f = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError):
        try:
            f = urllib2.urlopen(url)
        except (urllib2.HTTPError, urllib2.URLError):
            try:
                f = urllib2.urlopen(url)
            except (urllib2.HTTPError, urllib2.URLError):
                return ''
    raw = parse(f)
    f.close()
    return raw
Example no. 27
 def parsetoXtree(self, xhtml_s, clean_xhtml=False):
     if clean_xhtml:
         cleaner = Cleaner( scripts=True, javascript=True, comments=True, style=True,\
                            links=True, meta=True, page_structure=False, processing_instructions=True,\
                            embedded=True, annoying_tags=True, remove_unknown_tags=True )#meta=False because we need MetaInfo
         try:
             xhtml_s = cleaner.clean_html(xhtml_s)
         except:
             pass
     #HTML parsers with and without recover mode; downloading the proper DTD is always enabled.
     #If lxml.html.parse is dispatched to sub-processes or threads, then
     #the HTMLParser(s) should be defined within those sub-processes or threads.
     htmlparser = lxml.html.HTMLParser(recover=False, no_network=False) 
     htmlparser_rcv = lxml.html.HTMLParser(recover=True, no_network=False)    
     #Parse the XHTML Source 
     parsing_errors = list()    
     try:           
         xhtml_t = lxml.html.parse(StringIO(xhtml_s), parser=htmlparser, base_url=self.due.base_url['url'])
     except:
     #except ValueError, error:
     #except lxml.etree.XMLSyntaxError, error:
         #print(xhtml_s)
         pass
         #print("PARSE ERROR (no recovery mode): %s" % error)
         #parsing_errors.append(error)
         try:
             xhtml_t = lxml.html.parse(StringIO(xhtml_s), parser=htmlparser_rcv, base_url=self.due.base_url['url']) #StringIO(xhtml_s)
          except Exception as error:
              print("PARSE ERROR (recovery mode): %s" % error)
             parsing_errors.append(error)
             try:
                 print('DA ZOUP')
                 xhtml_t = soup.parse(xhtml_s) #StringIO(xhtml_s)
             except:
                 print("F****D-UP PAGE")
                  parsing_errors.append("BeautifulSoup Failed")
                 return {'xtree' : None, 'parsing_errors' : parsing_errors}
             #Get the root Element and make the links absolute
             xhtml_troot = xhtml_t.getroot()
             try:
                 xhtml_troot.make_links_absolute(self.due.base_url['url'], resolve_base_href=True)
             except:
                 return {'xtree' : None, 'parsing_errors' : parsing_errors}
             for i in xhtml_t.iterlinks():
                 pass
     return {'xtree' : xhtml_t, 'parsing_errors' : parsing_errors}
Example no. 28
def parse_html(file):
    html = parser.parse(file)

    # extract the thread ID
    thread_id = html.xpath('//form/a[@name][1]/@name')
    thread_posts = html.xpath('count(//form/a[@name])')

    posts = []
    post_ids = html.xpath('//td[@class="reply"]/@id')

    # first post is special, unfortunately.
    post_id = post_ids.pop(0)
    author = html.xpath('//form/span[@class="postername"][1]/text()')[0]
    content = ElementTree.tostring(
        html.xpath('//form/blockquote')[0]).decode('UTF-8')
    date = html.xpath('//form/span[@class="posttime"][1]/text()')[0]
    attach = html.xpath('//form/span[@class="filesize"]/a[1]/@href')
    attach = attach[0] if len(attach) > 0 else None
    posts.append(Post(post_id, author, content, date, attach))

    # <a class="quotelink unkfunc" href="http://suptg.thisisnotatrueending.com/archive/17738107/#17745349" onclick="replyhl('17745349');">&gt;&gt;17745349</a>
    magic = re.compile(r'<a class=".*?" href=".*?" onclick=".*?">(.*?)</a>')

    # extract the other posts
    for post in post_ids:
        author = html.xpath(
            '//td[@id={}]/span[@class="commentpostername"]/text()'.format(
                post))[0]
        content = ElementTree.tostring(
            html.xpath(
                '//td[@id={}]/blockquote'.format(post))[0]).decode('UTF-8')
        date = html.xpath(
            '//td[@id={}][span[@class="commentpostername"]]/text()[string-length()>1][1]'
            .format(post))[0]
        attach = html.xpath(
            '//td[@id={}][span[@class="filesize"]]/a/@href'.format(post))
        attach = attach[0] if len(attach) > 0 else None

        content = magic.sub(r'\1', content)

        posts.append(Post(post, author, content, date, attach))

    return posts
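
For reference, a small hedged illustration of what the quote-link rewriting does once the replacement is the first capture group; the sample anchor is the one quoted in the comment inside parse_html:

# Illustration only: the regex keeps just the text of quote links.
import re
magic = re.compile(r'<a class=".*?" href=".*?" onclick=".*?">(.*?)</a>')
sample = ('<a class="quotelink unkfunc" '
          'href="http://suptg.thisisnotatrueending.com/archive/17738107/#17745349" '
          'onclick="replyhl(\'17745349\');">&gt;&gt;17745349</a>')
print(magic.sub(r'\1', sample))   # -> &gt;&gt;17745349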
Example no. 29
def parse_product(id, cid, name):
    root = parse(os.path.join(base_dir, 'htmls', '%s.html' % id))
    xml_product_content = root.xpath("//div[@class='productContent']")[0]
    product_img = norm_image_url(xml_product_content.xpath("//img[@class='productPic']")[0].attrib['src'])
    product_name = xml_product_content.xpath("//p[@class='d_title']/span")[0].text
    product_for = xml_product_content.xpath("//p[@class='d_title']/label")[0].text.rstrip(u'。').strip()
    product_spec_texts = re.compile(u'/|/').split(xml_product_content.xpath("//p[@class='d_title']/text()")[1].strip()[3:])
    product_specs = [norm_spec(i) for i in product_spec_texts]
    xml_product_detail = xml_product_content.xpath("//div[@class='pl_detail']")[0]
    product_detail = parse_detail(id, cid, name, xml_product_content)
    product = {}
    product['id'] = id
    product['cid'] = cid
    product['name'] = product_name
    product['image'] = product_img
    product['for'] = product_for
    product['specs'] = product_specs
    product['detail'] = product_detail
    return product
Example no. 30
 def _initmap(self):
     pos=0
     i=0
     offset=0
     paths={}
     tree = parse(StringIO(self.doc.body.encode('utf8')))
     textnodes=tree.xpath('//div[@id="TexteOnly"]//text()')
     cut=5
     if not textnodes:
         textnodes=tree.xpath('//text()')
         cut=10
     texts=[unescape(x) for x in textnodes]
     #print texts
     #print self.doc.tokens
     while i<len(texts) and pos<len(self.doc.tokens):
         #print i,len(texts),len(self.doc.tokens),pos, self.doc.tokens[pos].encode('utf8')
         offset=texts[i].find(self.doc.tokens[pos],offset)
         if offset==-1:
             i+=1
             offset=0
             continue
         if textnodes[i].is_tail:
             path=tree.getpath(textnodes[i].getparent().getparent())[cut:]
             siblings=textnodes[i].getparent().getparent().xpath('.//text()')
             adjust=len(''.join(siblings[:siblings.index(textnodes[i])]))
             paths[pos]=(path, adjust+offset)
             #print 'asdf', self.doc.tokens[pos], ''.join(siblings)[adjust+offset:adjust+offset+len(self.doc.tokens[pos])], adjust+offset, offset
         else:
             path=tree.getpath(textnodes[i].getparent())[cut:]
             paths[pos]=(path, offset)
             #print 'qwer', self.doc.tokens[pos], texts[i][offset:offset+len(self.doc.tokens[pos])], paths[pos], path, offset
         #print paths[pos]
         offset+=len(self.doc.tokens[pos])
         if offset>=len(texts[i]):
             i+=1
             offset=0
         pos+=1
     #for pos, (path, offset) in sorted(paths.items()):
     #    print self.doc.tokens[pos], pos, path, offset
     #print len(paths), len(self.doc.tokens)
     #print
     return paths
Example no. 31
def readSemcor3File(filename):
    '''
    Reads an XML semcore3.0 file and returns a corresponding MLN database.
    '''
    if not java.isJvmRunning():
            java.startJvm()
    tree = p.parse(filename)
    parser = StanfordParser(grammarPath)
    for e in tree.iter():
        if e.tag == 's':
            s, atoms = reconstruct(e)
            print('//', s)
            for a in atoms:
                print(a)
            deps = parser.get_dependencies(s)
            depstr = list(map(str, deps))
            # do some sanity check
            
            for d in depstr:
                print(d) 
            print('---')
Example no. 32
def fetchVotes(d):
    url = "%s%s%s" % (
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+PV+",
        d, "+RES-RCV+DOC+WORD+V0//EN&language=EN")
    print >> sys.stderr, url
    try:
        f = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError):
        try:
            f = urllib2.urlopen(url)
        except (urllib2.HTTPError, urllib2.URLError):
            try:
                f = urllib2.urlopen(url)
            except (urllib2.HTTPError, urllib2.URLError):
                return ''
    tmp = mkstemp()
    fd = os.fdopen(tmp[0], 'w')
    fd.write(f.read())
    fd.close()
    f.close()
    res = subprocess.Popen(['/usr/bin/wvHtml', tmp[1], '-'],
                           stdout=subprocess.PIPE).communicate()[0]
    os.unlink(tmp[1])
    return parse(StringIO(res))
Example no. 33
 def from_str(self, html_str):
     return HtmlNode(soupparser.parse(StringIO(html_str)))
Example no. 34
    v = etree.SubElement(pm, wp+'meta_value')
    v.text = etree.CDATA(val)
    return pm
    
#get wp ids from file
wpidsfl = open('wpids.mrs')
wpids = marshal.load(wpidsfl)
wpidsfl.close()

#base is a boilerplate from wordpress export
doc = etree.parse(bdir + '/boilerplate_import.xml')
root = doc.getroot()
channel = root.find('channel')

#todo: parsing in commandline values
bld = soupparser.parse("/home/rik/Dropbox/jos compendium/jos/out/biografieen2.html")
broot = bld.getroot()

fl = open("/home/rik/Dropbox/jos compendium/jos/nin/2beeldmateriaal.html")
txt = fl.read()
ch = copyhelper.choptext(txt, ['Algemeen', '2a', '2b', '2c', '2d'])

def convert_imgs(broot=broot,
                hfdst=[],
                categories=[],
                rectype='biografie',
                startnr=10,
                log=''):
    s = CSSSelector('img')
    imgs = s(broot)
    #imgs.reverse()
Example no. 35
import lxml.html.soupparser as soupparser
from lxml.etree import tostring
import lxml.html
import io
import sys
import re
import nltk

from django.utils.encoding import smart_str

#file = open('./webpages/romeo_juliet.html')
file = open(sys.argv[1])
html = file.read()
file.close()

tree = soupparser.parse(io.BytesIO(html))

fulltext = ""

for a in tree.xpath('//*[name()="a"]'):
    if a.text is not None:
        if 'name' in a.attrib:
            fulltext += " " + a.text
            #print a.attrib['name'] + a.text

oSentences = nltk.sent_tokenize(fulltext)
for s in oSentences:
    s = smart_str(re.sub(r'\s+', ' ', s))
    s = re.sub(r'^\s', '', s)
    print s
Example no. 36
def getIpexData():
    page = parse(fetch('http://www.ipex.eu/IPEXL-WEB/epdoc.do'))
    title = None
    for url in page.xpath('//div[@id="widgetContent_LU_WID"]//a'):
        title = u''.join(url.xpath('text()'))
        if title == u'a. Legislative procedures (currently ongoing or ended during the 7th Parliamentary term)':
            a = url
            break
    assert title == u'a. Legislative procedures (currently ongoing or ended during the 7th Parliamentary term)', "title changed on ipex: %s" % title
    url = "http://www.ipex.eu%s" % a.get('href')
    items = list(csv.DictReader(fetch(url), dialect="hash"))
    ipexmap = {}
    for item in items:
        date = None
        for k in cdates[::-1]:
            if item[k]:
                date = item[k]
                break
        item['Rapporteur'] = [[
            x['_id'], getMEPGroup(x, date), x['Name']['full']
        ] for x in filter(None, [
            getMEPRef(mep) for mep in item['Rapporteur'].decode(
                'raw_unicode_escape').split(', ')
        ])]
        item['Shadows'] = [[
            x['_id'], getMEPGroup(x, date), x['Name']['full']
        ] for x in filter(None, [
            getMEPRef(mep)
            for mep in item['Shadows'].decode('raw_unicode_escape').split(', ')
        ])]
        item['Dates'] = []
        for k in dates.keys():
            tmp = item[k].split(' ')
            body = dates[k]['body']
            if len(tmp) == 1:
                try:
                    tmp1 = toDate(tmp[0])
                    if tmp1:
                        item['Dates'].append({
                            'type': 'Event',
                            'body': body,
                            'date': tmp1,
                            'type': k
                        })
                except:
                    print k, tmp[0]
                    raise
            elif len(tmp) > 1:
                tmp1 = toDate(tmp[-1])
                if tmp1:
                    item['Dates'].append({
                        'type': 'Event',
                        'body': body,
                        'date': tmp1,
                        'type': k
                    })
            else:
                print >> sys.stderr, "[!]", k, item[k]
            del item[k]
        item['Dates'] = sorted(item['Dates'])
        tmp = basre.match(item['Bas Doc'])
        if tmp:
            item['Base Doc'] = u"%s/%s/%s" % tmp.groups()
            del item['Bas Doc']
        item['Com Opinion'] = filter(None, item['Com Avis'].split(';'))
        item['title'] = item['Titre EN'].decode('raw_unicode_escape')
        item['subject'] = item['Theme'].decode('raw_unicode_escape')
        item['Com Responible'] = item['ComFond'].decode('raw_unicode_escape')
        for k in ['ComFond', 'Theme', ' ', 'Titre EN', 'Com Avis']:
            del item[k]
        for k in item.keys():
            if not item[k]:
                del item[k]
        ipexmap[item['ProcRef']] = item

        # other fields
        # 'ComFond': 'BUDG',
        # 'Phase': '8.10 Ended',
        # 'Pol Group': 'PPE',
        # 'Type': 'DBA',
        # 'url OEIL': 'http://www.europarl.europa.eu/oeil/FindByProcnum.do?lang=en&procnum=BUD/2009/2048'
        # 'Scrutiny': 'http://www.ipex.eu/ipex/cms/home/Documents/dossier_CNS20110817'
    return ipexmap
Example no. 37
opener.addheaders = [('User-agent', 'weurstchen/0.5')]


def fetch(url, retries=5):
    # url to etree
    try:
        f = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError), e:
        if hasattr(e, 'code') and e.code >= 400 and e.code not in [504]:
            print >> sys.stderr, "[!] %d %s" % (e.code, url)
            raise
        if retries > 0:
            f = fetch(url, retries - 1)
        else:
            raise
    return parse(f)


def getNewItems(root):
    for d in root.xpath('//td[@class="listlevelthree"]/../td/a'):
        dossier = fetch((URL + d.attrib['href']).encode('utf8'))
        for e in dossier.xpath('//a[@class="com_acronym"]'):
            d_url = e.attrib['href']
            if not db.dossiers.find_one({'meta.source': URL + d_url}):
                oeil_scrape(URL + d_url)
                # print '[!] NEW ITEM: %s%s scraped!!' % (URL, d_url)


def scrape(url):
    root = fetch(url)
    # TODO optimize this!! (reduce steps)
Example no. 38
# is actually the html of a google maps page. We use the lxml library to load it as a python object,
# and the html soup parser to fix any issues understanding the html and still retrieve acceptable,
# well structured xml

def urlToET(url, tryAgain = True):
    # This just attempts to load the URL twice to account for the possibility of network trouble.
    # This is not a good example of error-proof code, but it still greatly reduces the probability
    # of errors in retrieval affecting your application.
    try:
        fURL = urllib2.urlopen(url)
    except Exception, e:
        if tryAgain:
            return urlToET(url, tryAgain = False)
        else:
            return None
    return soup.parse(fURL)

def parseMap(tree, multiple=False):
    # This function parses the inputted xml data for either one or more routes, finding the relevant
    # information about each. It uses xpath, a way of searching for a particular node within a
    # tree of xml data.
    routes = []
    # These next two lines find all nodes that look like <div class='dir-altroute-inner'>....</div> (inside
    # certain parent nodes), which correspond to the route information we're looking for. You can look at
    # ./map.html if you want to see the raw info, although it's pretty ugly.
    route_div = '//ol[@id="dir_altroutes_body"]/li[@class="dir-altroute"]/div[@class="dir-altroute-inner"]'
    div_list = tree.xpath(route_div)
    x = 0
    for div in div_list:
        # This bit of arguably ugly xml parsing is just pulling names and project times from the route
        # info, and remembering if they include projected traffic or not.
Example no. 39
 def _initmap(self):
     pos = 0
     i = 0
     offset = 0
     paths = {}
     tree = parse(StringIO(self.doc.body.encode('utf8')))
     textnodes = tree.xpath('//div[@id="TexteOnly"]//text()')
     cut = 5
     if not textnodes:
         textnodes = tree.xpath('//text()')
         cut = 10
     texts = [unescape(x) for x in textnodes]
     #tmp = [token for frag in texts if frag for token in nltk.tokenize.wordpunct_tokenize(frag)]
     #for line in difflib.context_diff(tmp, self.doc.tokens):
     #    print repr(line)
     #print texts
     #print self.doc.tokens
     lastgood = (i, offset)
     while pos < len(self.doc.tokens):
         if i >= len(texts):
             print "guessing frag: %s, reset to %s, %s" % (
                 self.doc.tokens[pos].encode('utf8'), lastgood[0],
                 lastgood[1])
             (i, offset) = lastgood
             path = tree.getpath(textnodes[i].getparent())[cut:]
             paths[pos] = (path, offset)
             offset += len(self.doc.tokens[pos])
             if offset >= len(texts[i]):
                 i += 1
                 offset = 0
             pos += 1
             continue
         offset = texts[i].find(self.doc.tokens[pos], offset)
         if offset == -1:
             i += 1
             offset = 0
             continue
         if textnodes[i].is_tail:
             path = tree.getpath(textnodes[i].getparent().getparent())[cut:]
             siblings = textnodes[i].getparent().getparent().xpath(
                 './/text()')
             adjust = len(''.join(siblings[:siblings.index(textnodes[i])]))
             paths[pos] = (path, adjust + offset)
             #print 'asdf', self.doc.tokens[pos:pos+l], ''.join(siblings)[adjust+offset:adjust+offset+len(self.doc.tokens[pos])], adjust+offset, offset
         else:
             path = tree.getpath(textnodes[i].getparent())[cut:]
             paths[pos] = (path, offset)
             #print 'qwer', self.doc.tokens[pos], texts[i][offset:offset+len(self.doc.tokens[pos])], paths[pos], path, offset
         #print "frag: %s(%s) @%s" % (i,len(texts), paths[pos][1]),"token: %s(%s)" % (pos, len(self.doc.tokens)), self.doc.tokens[pos].encode('utf8')
         #print paths[pos]
         offset += len(self.doc.tokens[pos])
         if offset >= len(texts[i]):
             i += 1
             offset = 0
         lastgood = (i, offset)
         pos += 1
     #for pos, (path, offset) in sorted(paths.items()):
     #    print self.doc.tokens[pos], pos, path, offset
     #print len(paths), len(self.doc.tokens)
     #print
     return paths
Example no. 40
# and the html soup parser to fix any issues understanding the html and still retrieve acceptable,
# well structured xml


def urlToET(url, tryAgain=True):
    # This just attempts to load the URL twice to account for the possibility of network trouble.
    # This is not a good example of error-proof code, but it still greatly reduces the probability
    # of errors in retrieval affecting your application.
    try:
        fURL = urllib2.urlopen(url)
    except Exception, e:
        if tryAgain:
            return urlToET(url, tryAgain=False)
        else:
            return None
    return soup.parse(fURL)


def parseMap(tree, multiple=False):
    # This function parses the inputted xml data for either one or more routes, finding the relevant
    # information about each. It uses xpath, a way of searching for a particular node within a
    # tree of xml data.
    routes = []
    # These next two lines find all nodes that look like <div class='dir-altroute-inner'>....</div> (inside
    # certain parent nodes), which correspond to the route information we're looking for. You can look at
    # ./map.html if you want to see the raw info, although it's pretty ugly.
    route_div = '//ol[@id="dir_altroutes_body"]/li[@class="dir-altroute"]/div[@class="dir-altroute-inner"]'
    div_list = tree.xpath(route_div)
    x = 0
    for div in div_list:
        # This bit of arguably ugly xml parsing is just pulling names and project times from the route
Example no. 41
def listings():
    for page in rawpages():
        tree = soupparser.parse(page)
        _listings = tree.xpath('//*[@id="search-results"]/li[*]/div')
        for listing in _listings:
            yield listing
Example no. 42
    #return False
    #if a.tail != b.tail:
    #return False
    #if a.values()!=b.values(): #redundant to the attrib matching
    #return False
    # if sorted(a.keys()) != sorted(b.keys()): #may also be redundant to the attrib matching, #See if any attributes were added/removed
    #    str1 = ''.join(sorted(a.keys()))
    #    str2 = ''.join(sorted(b.keys()))
    #    reportStringChange(str1, str2, a.tag, "ATTRIBUTE CHANGE")
    return True


path1 = sys.argv[1]
path2 = sys.argv[2]

tree1 = parse(path1).getroot()
tree2 = parse(path2).getroot()

elementsA_hash = {}
elementsB_hash = {}
isLeafNodeA = {}
isLeafNodeB = {}

hashNodes(tree1, elementsA_hash, isLeafNodeA)
hashNodes(tree2, elementsB_hash, isLeafNodeB)

noofchanges = 0

for key, value in elementsA_hash.iteritems():
    try:
        isSameNode(elementsA_hash[key], elementsB_hash[key], isLeafNodeA[key])
Example no. 43
 def get_revisions(self):
     root = parse(self.page).getroot()
     list_rev = root.xpath("//ul[@id='pagehistory']/li")
     for rev in list_rev:
         yield Revision(rev)
Example no. 44
def transformHTML(i, o, root_dir='.', prefix=None, exclude=None):
    """
    @param root_dir: Path to look for resources from.
    @param prefix: If provided, don't inline stuff.  Instead, prepend
        the prefix to relative paths.
    """
    exclude = exclude or []
    root = soupparser.parse(i)
    html = root.getroot()

    # links (css)
    if 'link' not in exclude:
        for link in html.xpath('//link'):
            href = link.attrib.get('href', '')
            if prefix:
                # prefix
                link.attrib['href'] = prefix + href
            else:
                # inline
                loaded = loadThing(href, root_dir)
                style_tag = etree.Element('style')
                style_tag.text = loaded['content']
                link.getparent().replace(link, style_tag)

    # css
    if 'css' not in exclude:
        r_import = re.compile(r'(@import\s+url\((.*?)\)\s*;)')
        r_url = re.compile(r'(url\((.*?)\))', re.S | re.M)
        for style in html.xpath('//style'):
            # imports
            while True:
                imports = r_import.findall(style.text)
                if not imports:
                    break
                for rule, url in imports:
                    # inline
                    loaded = loadThing(url, root_dir)
                    style.text = style.text.replace(rule, loaded['content'])

            # other urls
            urls = r_url.findall(style.text)
            for match, url in urls:
                if prefix:
                    # prefix
                    pass
                else:
                    # inline
                    loaded = loadThing(url, root_dir)
                    style.text = style.text.replace(
                        match, 'url(' + toDataURL(**loaded) + ')')

    # images
    if 'img' not in exclude:
        for image in html.xpath('//img'):
            src = image.attrib.get('src', '')
            if src.startswith('data:'):
                # already a data url
                continue
            if prefix:
                # prefix
                if src.startswith('//') or src.startswith(
                        'http:') or src.startswith('https:'):
                    pass
                else:
                    image.attrib['src'] = prefix + src
            else:
                # inline
                loaded = loadThing(src, root_dir)
                image.attrib['src'] = toDataURL(**loaded)
    o.write(etree.tostring(html, method='html'))
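
The loadThing and toDataURL helpers used above are not part of this excerpt; as a rough, hypothetical sketch of the data-URL step (name and signature are assumptions, not the project's actual helpers):

# Hypothetical helper illustrating the kind of data URL transformHTML relies on.
import base64

def to_data_url(content, content_type='application/octet-stream'):
    # content is the raw bytes of the loaded resource
    encoded = base64.b64encode(content).decode('ascii')
    return 'data:%s;base64,%s' % (content_type, encoded)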
Example no. 45
import lxml.html.soupparser as soupparser
import lxml.html
import io
import sys
import re
import nltk

from django.utils.encoding import smart_str

file = open(sys.argv[1])
html = file.read()
file.close()

tree = soupparser.parse(io.BytesIO(html))

original = ""
modern = ""
for t in tree.xpath('//*[name()="div"]'):    
    if t.text is not None:
        if 'class' in t.attrib :
            if t.attrib['class'] == 'original-line' :
                oline = t.text.replace('\n', ' ')
                oline = smart_str(re.sub(r'\s+', ' ', oline))
                original += " " + oline

                
            elif t.attrib['class'] == 'modern-line' :
                mline = t.text.replace('\n', ' ')
                mline = smart_str(re.sub(r'\s+', ' ', mline))
                modern += " " + mline
                
Example no. 46
def munin_freq():
    root=parse(fetch("%s/signal.asp" % host))
    for x in root.xpath(".//tr"):
        fields=[totext(t) for t in x.xpath('./td')]
        if fields[0].endswith("stream Frequency"):
            print "%s.value %s" % (fields[0].lower().replace(' ','_'), split(fields[1]))
Example no. 47
#                              urllib2.ProxyHandler({'http': 'http://*****:*****
def getNewItems(root):
    for d in root.xpath('//td[@class="listlevelthree"]/../td/a'):
        dossier = fetch((URL+d.attrib['href']).encode('utf8'))
        for e in  dossier.xpath('//a[@class="com_acronym"]'):
            d_url = e.attrib['href']
            if not db.dossiers.find_one({'meta.source': URL+d_url}):
                oeil_scrape(URL+d_url)
                # print '[!] NEW ITEM: %s%s scraped!!' % (URL, d_url)

def scrape(url):
    root = fetch(url)
    # TODO optimize this!! (reduce steps)
    if not exists(LAST_UPDATED_CACHE) or open(LAST_UPDATED_CACHE).read() != strip(root.xpath('//div[text()="Data updated on :"]/span/text()')[0]):
        print >>sys.stderr, '[!] Site modification found, scraping unfinished dossiers....'