def parseAndImport(self, import_from_source=False):
    """
    Parse the XLIFF file at ``self.filename`` and import its translations
    into ``self.package``.

    Each <trans-unit> is matched to a package field via its ``id``
    attribute.  When ``import_from_source`` is True the <source> text is
    imported; otherwise the <target> (translated) text is used.  Marks the
    package as changed when done.
    """
    from exe.engine.beautifulsoup import BeautifulSoup
    fp = open(self.filename)
    try:
        # Strip the CDATA wrappers so BeautifulSoup sees plain markup.
        bs = BeautifulSoup(fp.read().replace(CDATA_BEGIN, "").replace(CDATA_END, ""))
    finally:
        # BUG FIX: close the file even if reading/parsing raises.
        fp.close()
    for transunit in bs.findAll("trans-unit"):
        item_id = transunit.get("id", None)
        if item_id is None:
            log.info("Item id not found: %s" % item_id)
            continue
        field = self.getFieldFromPackage(self.package, item_id)
        if field is None:
            log.info("Field not found: %s" % item_id)
            continue
        if import_from_source:
            tar = transunit.find("source")
        else:
            tar = transunit.find("target")
        # BUG FIX: a trans-unit may lack the requested source/target
        # element; the original crashed on tar.contents (AttributeError).
        if tar is None:
            log.info("No translation text for: %s" % item_id)
            continue
        text = u" ".join([unicode(u) for u in tar.contents])
        if item_id.endswith("title"):
            # It's an idevice: set the title
            field.set_title(text)
            log.debug("Title set for: %s" % item_id)
        elif item_id.endswith("nodename"):
            # It's a node: set the title
            field.setTitle(text)
            log.debug("Title set for: %s" % item_id)
        else:
            # It's a field: replace its content
            field.content_w_resourcePaths = text
            field.TwistedRePersist()
            log.debug("Content set for: %s" % item_id)
    self.package.isChanged = True
def parseAndImport(self, import_from_source=False):
    """
    Parse the XLIFF file at ``self.filename`` and import its translations
    into ``self.package``.

    Each <trans-unit> is matched to a package field via its ``id``
    attribute.  When ``import_from_source`` is True the <source> text is
    imported; otherwise the <target> (translated) text is used.  Marks the
    package as changed when done.
    """
    from exe.engine.beautifulsoup import BeautifulSoup
    fp = open(self.filename)
    try:
        # Strip the CDATA wrappers so BeautifulSoup sees plain markup.
        bs = BeautifulSoup(fp.read().replace(CDATA_BEGIN, "").replace(CDATA_END, ""))
    finally:
        # BUG FIX: close the file even if reading/parsing raises.
        fp.close()
    for transunit in bs.findAll('trans-unit'):
        item_id = transunit.get('id', None)
        if item_id is None:
            log.info('Item id not found: %s' % item_id)
            continue
        field = self.getFieldFromPackage(self.package, item_id)
        if field is None:
            log.info('Field not found: %s' % item_id)
            continue
        if import_from_source:
            tar = transunit.find('source')
        else:
            tar = transunit.find('target')
        # BUG FIX: a trans-unit may lack the requested source/target
        # element; the original crashed on tar.contents (AttributeError).
        if tar is None:
            log.info('No translation text for: %s' % item_id)
            continue
        text = u' '.join([unicode(u) for u in tar.contents])
        if item_id.endswith('title'):
            # It's an idevice: set the title
            field.set_title(text)
            log.debug('Title set for: %s' % item_id)
        elif item_id.endswith('nodename'):
            # It's a node: set the title
            field.setTitle(text)
            log.debug('Title set for: %s' % item_id)
        else:
            # It's a field: replace its content
            field.content_w_resourcePaths = text
            field.TwistedRePersist()
            log.debug('Content set for: %s' % item_id)
    self.package.isChanged = True
def parseAndImport(self, import_from_source=False):
    """
    Parse the XLIFF file at ``self.filename`` and import its translations
    into ``self.package``.

    Variant that keeps the raw file intact and strips the CDATA wrappers
    only from imported field content (titles get the text verbatim).
    When ``import_from_source`` is True the <source> text is imported;
    otherwise the <target> (translated) text is used.
    """
    from exe.engine.beautifulsoup import BeautifulSoup
    fp = open(self.filename)
    try:
        bs = BeautifulSoup(fp.read())
    finally:
        # BUG FIX: close the file even if reading/parsing raises.
        fp.close()
    for transunit in bs.findAll('trans-unit'):
        item_id = transunit.get('id', None)
        if item_id is None:
            log.info('Item id not found: %s' % item_id)
            continue
        field = self.getFieldFromPackage(self.package, item_id)
        if field is None:
            log.info('Field not found: %s' % item_id)
            continue
        if import_from_source:
            tar = transunit.find('source')
        else:
            tar = transunit.find('target')
        # BUG FIX: a trans-unit may lack the requested source/target
        # element; the original crashed on tar.contents (AttributeError).
        if tar is None:
            log.info('No translation text for: %s' % item_id)
            continue
        text = u' '.join([unicode(u) for u in tar.contents])
        if item_id.endswith('title'):
            # It's an idevice: set the title
            field.set_title(text)
            log.debug('Title set for: %s' % item_id)
        elif item_id.endswith('nodename'):
            # It's a node: set the title
            field.setTitle(text)
            log.debug('Title set for: %s' % item_id)
        else:
            # It's a field: strip CDATA markers from the content only
            field.content_w_resourcePaths = text.replace(CDATA_BEGIN, "").replace(CDATA_END, "")
            field.TwistedRePersist()
            log.debug('Content set for: %s' % item_id)
def getAppletcodeDescartes(self, filename):
    """
    xhtml string for DescartesApplet.

    ``filename`` is a URL (optionally prefixed with "<n>," to pick a
    scene).  Only .html/.htm URLs are processed: the page is fetched, the
    Descartes <applet> tags are collected, the one matching SCENE_NUM is
    localized via downloadFiles(), and its rewritten code is returned.
    Returns "" when nothing usable is found.
    """
    global SCENE_NUM
    global DESC_PLUGIN
    html = ""
    if not filename.endswith(".jar"):
        if filename.endswith(".html") or filename.endswith(".htm"):
            from exe.engine.beautifulsoup import BeautifulSoup, BeautifulStoneSoup
            import urllib2
            if filename.find(",") == -1:
                # firstly verify the URL is reachable, or come back:
                if self.verifyConn(filename) == False:
                    assert self.parentNode.package, _('Sorry, this URL is unreachable')
                    return
                # filename is reachable, go on:
                htmlbytes = urllib2.urlopen(filename)
            else:
                if self.verifyConn(filename[2:]) == False:
                    # BUG FIX: the original `return html == ''` returned the
                    # boolean False instead of the empty string.
                    return html
                htmlbytes = urllib2.urlopen(filename[2:])
            content = htmlbytes.read()
            # content = content.replace('""','"') Galo swears it won't be necessary
            soup = BeautifulSoup(content)
            i = 0
            appletslist = []
            # Classic Descartes applets: keep only each applet's own archive.
            for ap_old in soup.findAll("applet", {"code": "Descartes.class"}):
                for resource in reversed(self.userResources):
                    if resource._storageName != ap_old["archive"]:
                        resource.delete()
                DESC_PLUGIN = 0
                ap_old["codebase"] = "./"
                appletslist.append(ap_old)
            # Plugin-based Descartes applets: keep only descinst.jar.
            for ap_new in soup.findAll("applet", {"code": "descinst.Descartes.class"}):
                DESC_PLUGIN = 1
                for resource in reversed(self.userResources):
                    if resource._storageName != 'descinst.jar':
                        resource.delete()
                ap_new["codebase"] = "./"
                appletslist.append(ap_new)
            for ap_supernew in soup.findAll("applet", {"code": "descinst.DescartesWeb2_0.class"}):
                DESC_PLUGIN = 1
                for resource in reversed(self.userResources):
                    if resource._storageName != 'descinst.jar':
                        resource.delete()
                ap_supernew["codebase"] = "./"
                appletslist.append(ap_supernew)
            # TO_DO: sometimes applets are included in frame labels (no
            # <applet> found in the url); frames would need the same
            # codebase rewrite + descinst.jar handling as above.
            # if none applet was found:
            if appletslist == []:
                return html
            # finally, pick the applet for the requested scene:
            # BUG FIX: umod was unbound (NameError) when no entry matched
            # SCENE_NUM; initialise it so we fall back to "".
            umod = ''
            for x in appletslist:
                if i == SCENE_NUM - 1:
                    umod = self.downloadFiles(unicode(x))
                    break
                i = i + 1
            htmlbytes.close()
            html = umod
    # now html has the code of the applet for eXe:
    return html
def downloadFiles(self, stringapplet): """ only for DescartesApplet initially; three jobs: 1 look for image and macros files in the URL indicated by the user, 2 modify applet code for a correct exe detection of them after this, 3 download and store them into the exe project (absolute urls are required). Return the code modified. """ from exe.engine.beautifulsoup import BeautifulSoup, BeautifulStoneSoup import re import urllib import urllib2 import string import os # import urllib.request stringappletmod = stringapplet soup = BeautifulSoup(stringapplet) # ONE: image files: key_image = ['archivo=', 'imagem_de_fundo=', 'imagem=', 'imagen=', 'file=', 'fitxer=', 'artxibo=', 'image=', 'bg_image=', 'imatge=', 'immagine=', 'irudia=', 'irundia=', 'fichier=', 'imaxe=', 'arquivo=', 'immagine_fondo='] # paths to the images indicated in the applet code: imageslist = [] for x in key_image: if string.find(stringapplet, x) != -1: expression = r"%s'([\w\./]+)'" % x patron = re.compile(expression) for tag in soup.findAll('param'): result = patron.search(tag['value']) if result: if result.group(1) not in imageslist: imageslist.append(result.group(1)) # modify applet code: urlimageslist = [] for im in imageslist: # put as locals the images' path inside exe editor... stringappletmod = stringappletmod.replace(im,im[im.rfind("/")+1:]) # from imageslist, it's neccesary to create the list of absolute paths to the image # files because we want to download this images and load them in the project: # first quit scene number urlnoesc = url[url.find(",")+1:] # cut the right side of the last /: urlcut = urlnoesc[: urlnoesc.rfind("/")] # and extend with the image from the applet code: urlimageslist.append(urlcut+"/"+im) # repeated no thanks: urlimageslist = list(set(urlimageslist)) # do not forget that it could be image_down and image_over versions # of the file in the same place, so... 
a new extended list: urlimgslistextended = [] for pathimg in urlimageslist: # we trick to urlimageslist adding files that haven't been detected really if pathimg not in urlimgslistextended: urlimgslistextended.append(pathimg) if string.find(pathimg, '.png') != -1: urlimgslistextended.append(pathimg.replace('.png', '_down.png')) urlimgslistextended.append(pathimg.replace('.png', '_over.png')) if string.find(pathimg, '.jpg') != -1: urlimgslistextended.append(pathimg.replace('.jpg', '_down.jpg')) urlimgslistextended.append(pathimg.replace('.jpg', '_over.jpg')) if string.find(pathimg, '.gif') != -1: urlimgslistextended.append(pathimg.replace('.gif', '_down.gif')) urlimgslistextended.append(pathimg.replace('.gif', '_over.gif')) urlimgslistextended = list(set(urlimgslistextended)) # now we can: download all you can find: for pathimgext in urlimgslistextended: # the clean name of the image file img = pathimgext[pathimgext.rfind("/")+1:] # firstly to test the existence of the file: try: resp = urllib2.urlopen(pathimgext) except urllib2.URLError, e: if not hasattr(e, "code"): raise resp = e try: # download whith its original name: img_down = urllib.urlretrieve(pathimgext, img) except: print 'Unable to download file' # be sure the file was found: if img_down[1].maintype == 'image': self.uploadFile(img_down[0]) os.remove(img_down[0])
def getAppletcodeDescartes(self, filename):
    """
    xhtml string for DescartesApplet.

    ``filename`` is a URL (optionally prefixed with "<n>," to pick a
    scene).  Only .html/.htm URLs are processed: the page is fetched, the
    Descartes <applet> tags are collected, the one matching SCENE_NUM is
    localized via downloadFiles(), and its rewritten code is returned.
    Returns "" when nothing usable is found.
    """
    global SCENE_NUM
    global DESC_PLUGIN
    html = ""
    if not filename.endswith(".jar"):
        if filename.endswith(".html") or filename.endswith(".htm"):
            from exe.engine.beautifulsoup import BeautifulSoup, BeautifulStoneSoup
            import urllib2
            if filename.find(",") == -1:
                # firstly verify the URL is reachable, or come back:
                if self.verifyConn(filename) == False:
                    assert self.parentNode.package, _('Sorry, this URL is unreachable')
                    return
                # filename is reachable, go on:
                htmlbytes = urllib2.urlopen(filename)
            else:
                if self.verifyConn(filename[2:]) == False:
                    # BUG FIX: the original `return html == ''` returned the
                    # boolean False instead of the empty string.
                    return html
                htmlbytes = urllib2.urlopen(filename[2:])
            content = htmlbytes.read()
            # content = content.replace('""','"') Galo swears it won't be necessary
            soup = BeautifulSoup(content)
            i = 0
            appletslist = []
            # Classic Descartes applets: keep only each applet's own archive.
            for ap_old in soup.findAll("applet", {"code": "Descartes.class"}):
                for resource in reversed(self.userResources):
                    if resource._storageName != ap_old["archive"]:
                        resource.delete()
                DESC_PLUGIN = 0
                ap_old["codebase"] = "./"
                appletslist.append(ap_old)
            # Plugin-based Descartes applets: keep only descinst.jar.
            for ap_new in soup.findAll("applet", {"code": "descinst.Descartes.class"}):
                DESC_PLUGIN = 1
                for resource in reversed(self.userResources):
                    if resource._storageName != 'descinst.jar':
                        resource.delete()
                ap_new["codebase"] = "./"
                appletslist.append(ap_new)
            for ap_supernew in soup.findAll("applet", {"code": "descinst.DescartesWeb2_0.class"}):
                DESC_PLUGIN = 1
                for resource in reversed(self.userResources):
                    if resource._storageName != 'descinst.jar':
                        resource.delete()
                ap_supernew["codebase"] = "./"
                appletslist.append(ap_supernew)
            # TO_DO: sometimes applets are included in frame labels (no
            # <applet> found in the url); frames would need the same
            # codebase rewrite + descinst.jar handling as above.
            # if none applet was found:
            if appletslist == []:
                return html
            # finally, pick the applet for the requested scene:
            # BUG FIX: umod was unbound (NameError) when no entry matched
            # SCENE_NUM; initialise it so we fall back to "".
            umod = ''
            for x in appletslist:
                if i == SCENE_NUM - 1:
                    umod = self.downloadFiles(unicode(x))
                    break
                i = i + 1
            htmlbytes.close()
            html = umod
    # now html has the code of the applet for eXe:
    return html
def downloadFiles(self, stringapplet): """ only for DescartesApplet initially; three jobs: 1 look for image and macros files in the URL indicated by the user, 2 modify applet code for a correct exe detection of them after this, 3 download and store them into the exe project (absolutes urls are required). Return the code modified. """ from exe.engine.beautifulsoup import BeautifulSoup, BeautifulStoneSoup import re import urllib import urllib2 import string import os # import urllib.request stringappletmod = stringapplet soup = BeautifulSoup(stringapplet) # ONE: image files: key_image = ['archivo=', 'imagem_de_fundo=', 'imagem=', 'imagen=', 'file=', 'fitxer=', 'artxibo=', 'image=', 'bg_image=', 'imatge=', 'immagine=', 'irudia=', 'irundia=', 'fichier=', 'imaxe=', 'arquivo=', 'immagine_fondo='] # paths to the images indicated in the applet code: imageslist = [] for x in key_image: if string.find(stringapplet, x) != -1: expression = r"%s'([\w\./]+)'" % x patron = re.compile(expression) for tag in soup.findAll('param'): result = patron.search(tag['value']) if result: if result.group(1) not in imageslist: imageslist.append(result.group(1)) # modify applet code: urlimageslist = [] for im in imageslist: # put as locals the images' path inside exe editor... stringappletmod = stringappletmod.replace(im,im[im.rfind("/")+1:]) # from imageslist, it's neccesary to create the list of absolute paths to the image # files because we want to download this images and load them in the project: # first quit scene number urlnoesc = url[url.find(",")+1:] # cut the right side of the last /: urlcut = urlnoesc[: urlnoesc.rfind("/")] # and extend with the image from the applet code: urlimageslist.append(urlcut+"/"+im) # repeated no thanks: urlimageslist = list(set(urlimageslist)) # do not forget that it could be image_down and image_over versions # of the file in the same place, so... 
a new extended list: urlimgslistextended = [] for pathimg in urlimageslist: # we trick to urlimageslist adding files that haven't been detected really if pathimg not in urlimgslistextended: urlimgslistextended.append(pathimg) if string.find(pathimg, '.png') != -1: urlimgslistextended.append(pathimg.replace('.png', '_down.png')) urlimgslistextended.append(pathimg.replace('.png', '_over.png')) if string.find(pathimg, '.jpg') != -1: urlimgslistextended.append(pathimg.replace('.jpg', '_down.jpg')) urlimgslistextended.append(pathimg.replace('.jpg', '_over.jpg')) if string.find(pathimg, '.gif') != -1: urlimgslistextended.append(pathimg.replace('.gif', '_down.gif')) urlimgslistextended.append(pathimg.replace('.gif', '_over.gif')) urlimgslistextended = list(set(urlimgslistextended)) # now we can: download all you can find: for pathimgext in urlimgslistextended: # the clean name of the image file img = pathimgext[pathimgext.rfind("/")+1:] # firstly to test the existence of the file: try: resp = urllib2.urlopen(pathimgext) except urllib2.URLError, e: if not hasattr(e, "code"): raise resp = e try: # download whith its original name: img_down = urllib.urlretrieve(pathimgext, img) except: print 'Unable to download file' # be sure the file was found: if img_down[1].maintype == 'image': self.uploadFile(img_down[0]) os.remove(img_down[0])
def _computeLinks(self):
    """
    Scan every imported HTML resource (then CSS + HTML again, textually)
    for references to other imported resources, recording each reference
    as a Link on its url object.  Progress is reported to the client UI.
    Aborts early (plain return) whenever self.cancel is set.
    """
    self._computeRelpaths()
    htmls = self.resources['mimes']['text/html']
    total = len(htmls)
    i = 1
    # First pass: tag/attribute-level analysis of the HTML files.
    for url in htmls:
        if self.cancel:
            return
        if self.client:
            self.client.call('eXe.app.getController("Toolbar").updateImportProgressWindow',_(u'Analyzing HTML file labels %d of %d: %s') % (i, total, str(url)))
        content = open(url.path).read()
        encoding = detect(content)['encoding']
        ucontent = unicode(content,encoding)
        soup = BeautifulSoup(ucontent,fromEncoding=encoding)
        # Prefer the encoding declared inside the HTML document itself.
        declaredHTMLEncoding = getattr(soup, 'declaredHTMLEncoding')
        if declaredHTMLEncoding:
            ucontent = UnicodeDammit(content,[declaredHTMLEncoding]).unicode
            encoding = declaredHTMLEncoding
        else:
            pass
        url.setContent(ucontent,encoding)
        url.setSoup(soup)
        for tag in soup.findAll():
            if self.cancel:
                return
            if not tag.attrs:
                continue
            matches = []
            # Match each attribute value against every known relative path
            # of sibling resources: exact, lowercased, and full-path forms.
            for key, value in tag.attrs:
                if value == "":
                    continue
                unq_value = unquote(value)
                unq_low_value = unquote(value.lower())
                for l, rl in self.resources['urls'][url.parentpath].relpaths:
                    low_rl = rl.lower()
                    if rl in unq_value:
                        L = Link(self.resources['urls'][l],rl,url,tag,key,rl)
                        matches.append(L)
                    elif low_rl in unq_value:
                        L = Link(self.resources['urls'][l],rl,url,tag,key,low_rl)
                        matches.append(L)
                    elif l in unq_value:
                        L = Link(self.resources['urls'][l],rl,url,tag,key,l)
                        matches.append(L)
            # Keep only maximal matches: drop any match whose relative path
            # occurs inside another match's relative path.
            matches_final = []
            for l1 in matches:
                matches_ = [ m for m in matches if m != l1 ]
                found = False
                for l2 in matches_:
                    if re.search(re.escape(l1.relative),l2.relative):
                        found = True
                if not found:
                    matches_final.append(l1)
            if matches_final:
                for match in matches_final:
                    url.addLink( match )
                    url.addRLink( str(match.url) )
        i += 1
    # Second pass: raw-text analysis of CSS files plus the HTML files
    # (catches references that are not expressed as tag attributes).
    csss = self.resources['mimes']['text/css'] if 'text/css' in self.resources['mimes'].keys() else None
    csss_and_htmls = csss + htmls if csss else htmls
    total = len(csss_and_htmls)
    i = 1
    for url in csss_and_htmls:
        if self.cancel:
            return
        if url.mime == 'text/css':
            tipo = 'CSS'
        else:
            tipo = 'HTML'
        content = url.getContent()
        if not content:
            # Not analyzed in the first pass (e.g. CSS): load it now.
            content = open(url.path).read()
            encoding = detect(content)['encoding']
            content = unicode(content,encoding)
            url.setContent(content,encoding)
        if self.client:
            self.client.call('eXe.app.getController("Toolbar").updateImportProgressWindow',_(u'Exhaustively analyzed file %s %d of %d: %s') % (tipo, i, total, str(url)))
        matches = []
        for l, rl in self.resources['urls'][url.parentpath].relpaths:
            low_rl = rl.lower()
            if rl in content:
                L = Link(self.resources['urls'][l],rl,url,match=rl)
                matches.append(L)
            elif low_rl in content:
                L = Link(self.resources['urls'][l],rl,url,match=low_rl)
                matches.append(L)
        # Same maximal-match filtering as the first pass.
        matches_final = []
        for l1 in matches:
            matches_ = [ m for m in matches if m != l1 ]
            found = False
            for l2 in matches_:
                if re.search(re.escape(l1.relative),l2.relative):
                    found = True
            if not found:
                matches_final.append(l1)
        if matches_final:
            for match in matches_final:
                # Skip links already registered in the first pass.
                if not [ link for link in url.links if link.relative == match.relative ]:
                    url.addLink( match )
                    url.addRLink( str(match.url) )
        i += 1