def get_story_url_from_zip_html(inputio, _is_good_url=None):
    """Search the HTML files inside a zip for the first http(s) link.

    inputio may be a path or a file-like blob.  _is_good_url, if given,
    is a predicate applied to each candidate href.  Returns the first
    acceptable href, or None if none is found.
    """
    zipf = ZipFile(inputio, 'r')  # works equally well with inputio as a path or a blob
    # calibre's convert tends to put FFF's title_page towards the end,
    # shift it to the front to avoid internal links.
    filelist = zipf.namelist()
    tpl = [x for x in filelist if 'title_page' in x]
    for x in tpl:
        filelist.remove(x)
        filelist.insert(0, x)
    for item in filelist:
        # only .htm, .html and .xhtml (or .xhtm for that matter)
        if re.match(r".*\.x?html?$", item):
            try:
                soup = make_soup(zipf.read(item).decode("utf-8"))
            except UnicodeDecodeError:
                # calibre converted to html zip fails with decode.
                soup = make_soup(zipf.read(item))
            for link in soup.findAll('a', href=re.compile(r'^http.*')):
                ahref = link['href']
                # Fixed: identity comparison with None (was '== None').
                if _is_good_url is None or _is_good_url(ahref):
                    return ahref
    return None
def _extract_content(self):
    """Extract all document files from the PDXF (zip) package into the
    presenter's document directory.

    Raises IOError if the archive cannot be listed, lacks the mimetype
    marker, or carries the wrong mimetype.  Progress and errors are
    reported through the events channel.
    """
    pdxf_file = ZipFile(self.path, 'r')
    try:
        fl = pdxf_file.namelist()
    except Exception:
        # Narrowed from a bare except; still broad because any failure
        # here means the archive is unreadable.
        errtype, value, traceback = sys.exc_info()
        # str(value): concatenating the raw exception object raised
        # TypeError and masked the real error message.
        msg = _('It seems the PDXF file is corrupted') + '\n' + str(value)
        events.emit(events.MESSAGES, msgconst.ERROR, msg)
        raise IOError(errtype, msg, traceback)
    # A valid PDXF package must declare the expected mimetype entry.
    if 'mimetype' not in fl or not pdxf_file.read('mimetype') == const.DOC_MIME:
        msg = _('The file is corrupted or not PDXF file')
        events.emit(events.MESSAGES, msgconst.ERROR, msg)
        raise IOError(2, msg)
    # Skip the mimetype marker and directory entries.
    filelist = [item for item in fl if item != 'mimetype' and item[-1] != '/']
    for item in filelist:
        source = pdxf_file.read(item)
        # 'with' guarantees the destination file is closed even on error.
        with open(os.path.join(self.presenter.doc_dir, item), 'wb') as dest:
            dest.write(source)
    msg = _('The file content is extracted successfully')
    events.emit(events.MESSAGES, msgconst.OK, msg)
def readZip(f, **kwargs):
    """Read a MIEN zip archive: parse the 'xml' member into a document
    tree and, if a 'data' member exists, attach its contents to the
    document's Data elements.

    f may be a path or a file-like object.  Returns the parsed document.
    """
    from mien.parsers.nmpml import elements as dialect
    f = ZipFile(f, 'r')
    xml = StringIO(f.read('xml'))
    doc = xm.readTree(xml)
    xml.close()
    doc = xm.assignClasses(doc, dialect)
    try:
        dat = f.read('data')
    except KeyError:
        # ZipFile.read raises KeyError for a missing member.
        print("No data archive in zip file")
        f.close()  # was leaked on this early-return path
        return doc
    from mien.parsers.datahash import readMD
    dat = readMD(StringIO(dat), return_raw_hash=True)
    for de in doc.getElements('Data'):
        try:
            d, h = dat[de.upath()]
        except KeyError:
            print("can't find data for element %s" % (de.upath(),))
            # Fall back to an empty array and empty header.
            d, h = (zeros(0), {})
        de.datinit(d, h)
    f.close()
    return doc
def get_package_metadata(package):
    """Get the metadata of a plugin in a package.  Pass it a filepointer
    or filename.  Raises a `ValueError` if the package is not valid.
    """
    from zipfile import ZipFile, error as BadZipFile
    try:
        f = ZipFile(package)
    except (IOError, BadZipFile):
        raise ValueError('not a valid package')
    try:
        # get the package version and name
        try:
            package_version = int(f.read('ZINE_PACKAGE'))
            plugin_name = f.read('ZINE_PLUGIN')
        except (KeyError, ValueError):
            # KeyError: member missing; ValueError: version not an int.
            raise ValueError('not a valid package')
        if package_version > PACKAGE_VERSION:
            raise ValueError('incompatible package version')
        try:
            metadata = parse_metadata(f.read('pdata/metadata.txt'))
        except KeyError:
            # The metadata file is optional.
            metadata = {}
    finally:
        f.close()  # the archive was previously left open on every path
    metadata['uid'] = plugin_name
    return metadata
def get_story_url_from_epub_html(inputio, _is_good_url=None):
    """Search the XHTML content files of an EPUB for the first http(s)
    link, optionally filtered through the _is_good_url predicate.

    inputio may be a path or a file-like blob.  Returns the first
    acceptable href, or None if none is found.
    """
    epub = ZipFile(inputio, 'r')  # works equally well with inputio as a path or a blob
    ## Find the .opf file.
    container = epub.read("META-INF/container.xml")
    containerdom = parseString(container)
    rootfilenodelist = containerdom.getElementsByTagName("rootfile")
    rootfilename = rootfilenodelist[0].getAttribute("full-path")
    contentdom = parseString(epub.read(rootfilename))
    ## Save the path to the .opf file--hrefs inside it are relative to it.
    relpath = get_path_part(rootfilename)
    # spin through the manifest--only place there are item tags.
    for item in contentdom.getElementsByTagName("item"):
        if item.getAttribute("media-type") == "application/xhtml+xml":
            filehref = relpath + item.getAttribute("href")
            soup = make_soup(epub.read(filehref).decode("utf-8"))
            for link in soup.findAll('a', href=re.compile(r'^http.*')):
                ahref = link['href']
                # hack for bad ficsaver ffnet URLs.
                m = re.match(r"^http://www.fanfiction.net/s(?P<id>\d+)//$", ahref)
                # Fixed: identity comparisons with None (were ==/!= None).
                if m is not None:
                    ahref = "http://www.fanfiction.net/s/%s/1/" % m.group('id')
                if _is_good_url is None or _is_good_url(ahref):
                    return ahref
    return None
def _get_carparks_xml_from_zip(url=CARPARKS_ZIP_URL, index_xml=INDEX_XML_FILE_NAME):
    """Download a zip of carpark XML files and return {filename: bytes}
    for every XML file listed in the archive's index file.

    Raises Exception on an HTTP failure, an invalid or corrupt zip, or
    an empty index.
    """
    res = request.urlopen(url=url)
    # Validate a successful HTTP call with status 200.
    if res.status != 200:
        raise Exception('Call to \'{0!s}\' failed with status code {1!s}.'.format(url, res.status))
    # Convert the downloaded byte stream to a file-like in-memory object.
    zip_file = BytesIO(res.read())
    # Validate the file-like object contains a valid zip file.
    if not is_zipfile(zip_file):
        raise Exception('The URL \'{0!s}\' did not return a valid zip file.'.format(url))
    # Convert to an actual ZipFile object ('archive' no longer shadows
    # the builtin 'zip').
    archive = ZipFile(zip_file, 'r')
    # testzip() returns the name of the first bad file, or None when all
    # CRCs check out -- any truthy result means corruption.
    if archive.testzip():
        raise Exception('Zip file from \'{0!s}\' was corrupt.'.format(url))
    # Create a list of filenames to process.
    xml_filenames = _get_filenames_from_index_xml(index=archive.read(index_xml))
    # Validate there is at least 1 file to process.
    if not xml_filenames:
        raise Exception('No XML files listed in {0!s}!'.format(index_xml))
    # Map each listed filename to its raw XML content.
    return {filename: archive.read(filename) for filename in xml_filenames}
def find_plugin_yaml(dataobj):
    """Recursively search a zip/jar (given as a file-like object) for a
    plugin.yml and return its parsed contents, or False when none is
    found or parsing fails.
    """
    yml = False
    try:
        # The first thing we are going to try to do is create a ZipFile
        # object with the StringIO data that we have.
        zfile = ZipFile(dataobj)
    except Exception:
        # Narrowed from a bare except: any failure means "not a zip".
        print("[DEBUG] ZipFile Library Failed to Parse DataObject")
    else:
        # Before we start recursively jumping through hoops, lets first
        # check to see if the plugin.yml exists at this level. If so, then
        # just set the yaml variable. Otherwise we are gonna look for more
        # zip and jar files and dig into them.
        if "plugin.yml" in zfile.namelist():
            try:
                # SECURITY: yaml.load on untrusted archive content can
                # execute arbitrary code; prefer yaml.safe_load if the
                # plugin format needs no custom tags.
                yml = yaml.load(zfile.read("plugin.yml"))
            except Exception:
                return False
        else:
            for filename in zfile.namelist():
                if not yml and filename[-3:].lower() in ["zip", "jar"]:
                    print("[DEBUG] Found Zip/Jar file " + filename)
                    data = StringIO()
                    data.write(zfile.read(filename))
                    yml = find_plugin_yaml(data)
                    data.close()
        zfile.close()
    return yml
def test_multiple_files_same_comment_same_name(self):
    # Regression test: two CommentFiles on the same comment with the same
    # filename must both appear in the bulk-download zip, the duplicate
    # renamed with a "-1" suffix.
    assignmentgroup1 = mommy.make('core.AssignmentGroup',
                                  parentnode__parentnode__parentnode__short_name="test2100",
                                  parentnode__parentnode__short_name="spring2015",
                                  parentnode__short_name="oblig1")
    mommy.make('core.Candidate', assignment_group=assignmentgroup1,
               relatedstudent__user__shortname="testuser1")
    tomorrow = datetime.datetime.now() + datetime.timedelta(days=1)
    feedbackset1 = mommy.make('devilry_group.FeedbackSet', group=assignmentgroup1,
                              is_last_in_group=True,
                              feedbackset_type=groupmodels.FeedbackSet.FEEDBACKSET_TYPE_FIRST_ATTEMPT,
                              deadline_datetime=tomorrow)
    comment_fbs1_2 = mommy.make('devilry_group.GroupComment', feedback_set=feedbackset1,
                                user_role=Comment.USER_ROLE_STUDENT)
    # Two files with identical filenames attached to the same comment.
    commentfile_fbs1_2 = mommy.make('devilry_comment.CommentFile', comment=comment_fbs1_2,
                                    filename='testfile1.txt')
    commentfile_fbs1_2.file.save('testfile1.txt', ContentFile('test2'))
    commentfile_fbs1_2_2 = mommy.make('devilry_comment.CommentFile', comment=comment_fbs1_2,
                                      filename='testfile1.txt')
    commentfile_fbs1_2_2.file.save('testfile1.txt', ContentFile('test3'))
    testclass = BulkDownloadTestClass()
    response = testclass.get(None)
    zipfileobject = ZipFile(StringIO(response.content))
    # First file keeps its name; the duplicate gets a numeric suffix.
    filecontents = zipfileobject.read('test2100.spring2015.oblig1.testuser1/attempt1/testfile1.txt')
    self.assertEquals(filecontents, "test2")
    filecontents = zipfileobject.read('test2100.spring2015.oblig1.testuser1/attempt1/testfile1-1.txt')
    self.assertEquals(filecontents, "test3")
def test_export_mixed_encodings(self):
    # Import a Mac-produced zip (mixed encodings), add an HTML document,
    # then verify the re-export contains every expected entry and that
    # the picture bytes round-trip unchanged.
    self.test_folder.zip_import.do_import(data=mac_zip)
    addNyDocument(self.test_folder, id='html_document')
    self.test_folder['html_document'].body = u'<p>Html document</p>'
    self.test_folder['html_document'].approved = 1
    export_value = self.test_folder.zip_export.do_export()
    # do_export returns a list of errors on failure.
    self.assertFalse(isinstance(export_value, list),
                     ('Errors are raised: ', export_value))
    zip = ZipFile(export_value, 'r')
    expected_namelist = ['index.txt', 'zip_export_folder/picture-1.png',
                         'zip_export_folder/picture-2.png',
                         'zip_export_folder/html_document.html']
    self.assertEqual(sorted(zip.namelist()), sorted(expected_namelist))
    self.assertTrue('<p>Html document</p>' in \
                    zip.read('zip_export_folder/html_document.html'))
    # Compare exported picture bytes against the export adapter's output.
    picture1_data = IZipExportObject(self.test_folder['picture-1'])()[0]
    picture2_data = IZipExportObject(self.test_folder['picture-2'])()[0]
    self.assertEqual(zip.read('zip_export_folder/picture-1.png'), picture1_data)
    self.assertEqual(zip.read('zip_export_folder/picture-2.png'), picture2_data)
def test_three_groups_after_deadline(self):
    # Three groups on one assignment, one shared examiner, one candidate
    # each; every upload happens after the deadline, so the compressed
    # archive must file each testfile.txt under
    # .../after_deadline_not_part_of_delivery/.
    with self.settings(DEVILRY_COMPRESSED_ARCHIVES_DIRECTORY=self.backend_path):
        testassignment = mommy.make_recipe('devilry.apps.core.assignment_activeperiod_start',
                                           short_name='learn-python-basics',
                                           first_deadline=timezone.now() - timezone.timedelta(hours=1))
        testgroup1 = mommy.make('core.AssignmentGroup', parentnode=testassignment)
        testgroup2 = mommy.make('core.AssignmentGroup', parentnode=testassignment)
        testgroup3 = mommy.make('core.AssignmentGroup', parentnode=testassignment)
        # Create user as examiner on all groups.
        testuser = mommy.make(settings.AUTH_USER_MODEL, shortname='thor', fullname='Thor')
        related_examiner = mommy.make('core.RelatedExaminer', user=testuser,
                                      period=testassignment.parentnode)
        mommy.make('core.Examiner', relatedexaminer=related_examiner, assignmentgroup=testgroup1)
        mommy.make('core.Examiner', relatedexaminer=related_examiner, assignmentgroup=testgroup2)
        mommy.make('core.Examiner', relatedexaminer=related_examiner, assignmentgroup=testgroup3)
        # Create feedbackset for testgroup1 with commentfiles
        testfeedbackset_group1 = group_mommy.feedbackset_first_attempt_unpublished(group=testgroup1)
        self.__make_comment_file(feedback_set=testfeedbackset_group1, file_name='testfile.txt',
                                 file_content='testcontent group 1')
        mommy.make('core.Candidate', assignment_group=testgroup1,
                   relatedstudent__user__shortname='april')
        # Create feedbackset for testgroup2 with commentfiles
        testfeedbackset_group2 = group_mommy.feedbackset_first_attempt_unpublished(group=testgroup2)
        self.__make_comment_file(feedback_set=testfeedbackset_group2, file_name='testfile.txt',
                                 file_content='testcontent group 2')
        mommy.make('core.Candidate', assignment_group=testgroup2,
                   relatedstudent__user__shortname='dewey')
        # Create feedbackset for testgroup3 with commentfiles
        testfeedbackset_group3 = group_mommy.feedbackset_first_attempt_unpublished(group=testgroup3)
        self.__make_comment_file(feedback_set=testfeedbackset_group3, file_name='testfile.txt',
                                 file_content='testcontent group 3')
        mommy.make('core.Candidate', assignment_group=testgroup3,
                   relatedstudent__user__shortname='huey')
        # run actiongroup
        self._run_actiongroup(name='batchframework_assignment',
                              task=tasks.AssignmentCompressAction,
                              context_object=testassignment,
                              started_by=testuser)
        archive_meta = archivemodels.CompressedArchiveMeta.objects.get(content_object_id=testassignment.id)
        zipfileobject = ZipFile(archive_meta.archive_path)
        # Expected in-archive layout:
        #   <candidate>/deadline-<formatted>/after_deadline_not_part_of_delivery/testfile.txt
        path_to_file_group1 = os.path.join('april', 'deadline-{}'.format(defaultfilters.date(
            testfeedbackset_group1.deadline_datetime, 'b.j.Y-H:i')),
            'after_deadline_not_part_of_delivery', 'testfile.txt')
        path_to_file_group2 = os.path.join('dewey', 'deadline-{}'.format(defaultfilters.date(
            testfeedbackset_group2.deadline_datetime, 'b.j.Y-H:i')),
            'after_deadline_not_part_of_delivery', 'testfile.txt')
        path_to_file_group3 = os.path.join('huey', 'deadline-{}'.format(defaultfilters.date(
            testfeedbackset_group3.deadline_datetime, 'b.j.Y-H:i')),
            'after_deadline_not_part_of_delivery', 'testfile.txt')
        self.assertEqual(b'testcontent group 1', zipfileobject.read(path_to_file_group1))
        self.assertEqual(b'testcontent group 2', zipfileobject.read(path_to_file_group2))
        self.assertEqual(b'testcontent group 3', zipfileobject.read(path_to_file_group3))
def verifyZipSignature(outerZipFilePath):
    # Verify the detached ECDSA signature of a module bundle zip.
    # Returns MODULE_ZIP_STATUS.Valid/Unsigned for a readable bundle,
    # MODULE_ZIP_STATUS.Invalid if anything at all goes wrong.
    result = MODULE_ZIP_STATUS.Invalid
    try:
        dataToSign = None
        signature = None
        outerZipFile = ZipFile(outerZipFilePath)
        # There should be exactly 3 files in this zip:
        # the inner zip, the properties file, and the sig file.
        # (The original comment said "only 2 files", but the check below
        # requires 3 -- the comment was stale, not the code.)
        if len(outerZipFile.namelist()) == 3:
            dataToSign = sha256(sha256(outerZipFile.read(INNER_ZIP_FILENAME)) +
                                sha256(outerZipFile.read(PROPERTIES_FILENAME)))
            signature = outerZipFile.read(SIGNATURE_FILENAME)
        if dataToSign and signature:
            """
            Signature file contains multiple lines, of the form "key=value\n"
            The last line is the hex-encoded signature, which is over the
            source code + everything in the sig file up to the last line.
            The key-value lines may contain properties such as signature
            validity times/expiration, contact info of author, etc.
            """
            dataToSignSBD = SecureBinaryData(dataToSign)
            sigSBD = SecureBinaryData(hex_to_binary(signature.strip()))
            publicKeySBD = SecureBinaryData(hex_to_binary(ARMORY_INFO_SIGN_PUBLICKEY))
            result = MODULE_ZIP_STATUS.Valid if CryptoECDSA().VerifyData(dataToSignSBD, sigSBD, publicKeySBD) else \
                MODULE_ZIP_STATUS.Unsigned
    except:
        # Deliberately broad: any failure yields the Invalid indicator.
        pass
    return result
def open(zipname):
    """Read a resource-bundle zip and return a populated Bundle.

    NOTE: this function shadows the builtin open() at module scope.
    """
    zf = ZipFile(zipname, 'r')
    m = zf.read('META-INF/manifest.xml')
    manifest = Manifest.parse(m)

    def warn(resource):
        # Manifest references a resource the archive does not contain.
        print(u"Warning: bundle {} does not contain resource {}, which is referred in its manifest.".format(zipname, resource).encode('utf-8'))

    result = Bundle()
    # NOTE(review): only presets_data is (re)initialized here; presets,
    # brushes and patterns are presumably lists created by Bundle() --
    # confirm against the Bundle definition.
    result.presets_data = []
    for preset in manifest.get_resources('paintoppresets'):
        if preset in zf.namelist():
            result.presets.append(preset)
            data = zf.read(preset)
            kpp = KPP(preset, data)
            result.presets_data.append(kpp)
        else:
            warn(preset)
    result.meta_string = zf.read("meta.xml")
    result.preview_data = zf.read("preview.png")
    for brush in manifest.get_resources('brushes'):
        if brush in zf.namelist():
            result.brushes.append(brush)
        else:
            warn(brush)
    for pattern in manifest.get_resources('patterns'):
        if pattern in zf.namelist():
            result.patterns.append(pattern)
        else:
            warn(pattern)
    zf.close()
    return result
def _get_score(filename):
    """Given a MusicXML file, return the score as an xml.etree.ElementTree.

    If the file is compressed (ends in .mxl), uncompress it in memory first.

    Args:
      filename: The path of a MusicXML (.xml) or compressed (.mxl) file.

    Returns:
      The score as an xml.etree.ElementTree.

    Raises:
      MusicXMLParseException: if any XML fails to parse, or a compressed
        archive contains more than one MusicXML file.
    """
    score = None
    if filename.endswith('.mxl'):
        # Compressed MXL file. Uncompress in memory.
        # (A distinct name -- the original rebound 'filename' to a ZipFile.)
        mxl_file = ZipFile(filename)
        # A compressed MXL file may contain multiple files, but only one
        # MusicXML file. Read the META-INF/container.xml file inside of the
        # MXL file to locate the MusicXML file within the MXL file
        # http://www.musicxml.com/tutorial/compressed-mxl-files/zip-archive-structure/
        # Raise a MusicXMLParseException if multiple MusicXML files found
        namelist = mxl_file.namelist()
        container_file = [x for x in namelist if x == 'META-INF/container.xml']
        compressed_file_name = ''
        try:
            container = ET.fromstring(mxl_file.read(container_file[0]))
            for rootfile_tag in container.findall('./rootfiles/rootfile'):
                # A rootfile counts as the MusicXML file when its
                # media-type is the MusicXML MIME type, or when it has no
                # media-type at all.  (The original duplicated the same
                # body in both branches; collapsed via attrib.get.)
                if rootfile_tag.attrib.get('media-type', MUSICXML_MIME_TYPE) == MUSICXML_MIME_TYPE:
                    if not compressed_file_name:
                        compressed_file_name = rootfile_tag.attrib['full-path']
                    else:
                        raise MusicXMLParseException(
                            'Multiple MusicXML files found in compressed archive')
        except ET.ParseError as exception:
            raise MusicXMLParseException(exception)
        try:
            score = ET.fromstring(mxl_file.read(compressed_file_name))
        except ET.ParseError as exception:
            raise MusicXMLParseException(exception)
    else:
        # Uncompressed XML file.
        try:
            tree = ET.parse(filename)
            score = tree.getroot()
        except ET.ParseError as exception:
            raise MusicXMLParseException(exception)
    return score
def unpack_zipdata(self, zipdata):
    """Split a conversion-result zip into its pdf plus up to 20
    thumbnail and 20 preview images, each list ordered by page number.

    Raises ServerError when the archive contains no pdf at all.
    """
    def page_number(name):
        # e.g. 'small/page_12.png' -> 12
        return int(name.split('.')[0].split('_')[-1])

    buffer = BytesIO(zipdata)
    archive = ZipFile(buffer)
    names = [entry.filename for entry in archive.filelist]
    pdf_names = [name for name in names if name.endswith('.pdf')]
    if not pdf_names:
        raise ServerError(
            'Conversion returned zip containing no pdf files')
    small = sorted((name for name in names
                    if name.startswith('small/') and name != 'small/'),
                   key=page_number)
    large = sorted((name for name in names
                    if name.startswith('large/') and name != 'large/'),
                   key=page_number)
    result = {
        'pdfs': [archive.read(pdf_names[0])],
        'thumbnails': [archive.read(name) for name in small[:20]],
        'previews': [archive.read(name) for name in large[:20]],
    }
    archive.close()
    buffer.close()
    return result
class ZipfileReader:
    """Reads files from an imported zip package."""

    def __init__(self, files):
        self.files = ZipFile(files)
        self.fullpath = ''

    def readManifest(self):
        """Return the manifest data if present, remembering its base path."""
        for name in self.files.namelist():
            position = name.find('imsmanifest.xml')
            if position == -1:
                continue
            # Everything before the manifest name is the package prefix
            # used to resolve later readFile() calls.
            self.fullpath = name[:position]
            return self.files.read(name)
        return None

    def readFile(self, path):
        """Return the data of *path* within the package, or None."""
        candidate = '%s%s' % (self.fullpath, str(path))
        contents = self.files.namelist()
        if candidate not in contents:
            # Some packages were built with Windows separators.
            candidate = candidate.replace('/', '\\')
        if candidate not in contents:
            return None
        return self.files.read(candidate)

    def listFiles(self):
        """Return the names of every file in the package."""
        return self.files.namelist()
def test_graph_export_csv(self):
    # End-to-end: build a graph via the UI, request the CSV export over
    # HTTP (reusing the browser session cookies), then verify that each
    # CSV inside the zip matches the copy written to disk.
    create_graph(self)
    create_schema(self)
    create_type(self)
    create_data(self)
    self.browser.find_by_id('toolsMenu').first.click()
    # Forward the two session cookies from the browser to requests.
    cookies = {self.browser.cookies.all()[0]["name"]: self.browser.cookies.all()[0]["value"],
               self.browser.cookies.all()[1]["name"]: self.browser.cookies.all()[1]["value"]}
    result = requests.get(self.live_server_url + '/tools/bobs-graph/export/csv/',
                          cookies=cookies)
    spin_assert(lambda: self.assertEqual(
        result.headers['content-type'], 'application/zip'))
    spin_assert(lambda: self.assertEqual(
        self.browser.status_code.is_success(), True))
    test_file = StringIO(result.content)
    csv_zip = ZipFile(test_file)
    # Write each member out to the test files directory...
    for name in csv_zip.namelist():
        fw = open('sylva/sylva/tests/files/' + name, 'w')
        fw.write(csv_zip.read(name))
        fw.close()
    # ...then read each one back and compare with the archive content.
    for name in csv_zip.namelist():
        f = open('sylva/sylva/tests/files/' + name)
        csvFile = ""
        for line in f:
            csvFile += line
        f.close()
        spin_assert(lambda: self.assertEqual(csv_zip.read(name), csvFile))
    Graph.objects.get(name="Bob's graph").destroy()
def testTopZip(self):
    """Zipping 'top' must include every file, with full paths and contents."""
    zipped_bytes = self._directory_zipper.Zip('top').Get()
    archive = ZipFile(StringIO(zipped_bytes))
    self.assertEqual(['top/one.txt', 'top/two/four.txt', 'top/two/three.txt'],
                     sorted(archive.namelist()))
    self.assertEqual('one.txt contents', archive.read('top/one.txt'))
    self.assertEqual('three.txt contents', archive.read('top/two/three.txt'))
    self.assertEqual('four.txt contents', archive.read('top/two/four.txt'))
def _load(self, stream):
    """Load an OPC-style package from a zip stream: content types, the
    package-level relationships, then every part reachable through the
    relationship graph.
    """
    zf = ZipFile(stream)
    self._load_content_types(zf.read("[Content_Types].xml"))
    rels_path = posixpath.join("_rels", ".rels")
    self._load_rels(zf.read(rels_path))

    def ropen(item):
        "read item and recursively open its children"
        if isinstance(item, Relationships):
            return
        if isinstance(item, Part):
            # A part's relationships live at <dir>/_rels/<name>.rels
            # next to the part itself.
            base, rname = posixpath.split(to_zip_name(item.name))
            relname = posixpath.join(base, "_rels", "%s.rels" % rname)
            if relname in zf.namelist():
                item._load_rels(zf.read(relname))
        for rel in item.relationships:
            pname = posixpath.join(item.base, rel.target)
            if pname in self:
                # This item is already in self.
                continue
            target_path = to_zip_name(pname)
            # Multi-segment (interleaved) parts are reassembled here.
            data = b"".join(self._get_matching_segments(zf, target_path))
            new_part = self._load_part(rel.type, pname, data)
            if new_part:
                ropen(new_part)
    ropen(self)
    zf.close()
def test_csxconvert(self):
    # This zip contains an entry on frogs in txt (latin1)
    # html (utf-8) and odt (utf-8)
    ff = (sc.config.test_samples_dir / 'ff.zip').open('rb')
    csxp = webtools.CSXProcessor()
    result = csxp.process_zip(ff, ff.name)
    # Affirm result is a valid ZipFile
    z = ZipFile(result.result.fileobj)
    # Affirm content is now utf8
    text = z.read('ff-latin1.txt').decode(encoding='UTF-8')
    # 'Maṇḍūka', 'Nīlamaṇḍūka', 'Uddhumāyikā'
    # And it has been properly transcoded.
    self.assertIn('Maṇḍūka', text)
    html = z.read('ff-utf8.html').decode(encoding='UTF-8')
    # NOTE(review): the assertion below checks `text`, not `html`, which
    # leaves `html` unused -- looks like a typo; confirm intent.
    self.assertIn('Nīlamaṇḍūka', text)
    # This doesn't completely test odt but confirms that
    # it basically worked.
    odt = z.open('ff.odt')
    odt = io.BytesIO(odt.read())  # Needs to be seekable
    odtz = ZipFile(odt)
    content = odtz.read('content.xml').decode(encoding='UTF-8')
    self.assertIn('Uddhumāyikā', content)
def _unbundle(path, target):
    # Recursively extract a zip/xpi/jar archive into *target*, descending
    # into nested archives (their contents land under a "_"-prefixed
    # directory next to the archive entry).
    zf = ZipFile(path, 'r')
    contents = zf.namelist()
    for item in contents:
        sp = item.split("/")
        if not sp[-1]:
            # Directory entry (trailing slash) -- nothing to write.
            continue
        print item, ">", target + item
        # Ensure the parent directory exists before writing.
        cpath = target + "/".join(sp[:-1])
        if not os.path.exists(cpath):
            os.makedirs(cpath)
        if item.endswith((".jar", ".xpi", ".zip")):
            now = target + item
            path_item = item.split("/")
            path_item[-1] = "_" + path_item[-1]
            # NOTE(review): this rebinds the 'path' parameter.
            path = target + "/".join(path_item)
            buff = StringIO(zf.read(item))
            _unbundle(buff, path + "/")
        else:
            # NOTE(review): text mode 'w' for possibly-binary data works
            # on POSIX Python 2 only; 'wb' would be safer. Confirm.
            f = open(target + item, 'w')
            f.write(zf.read(item))
            f.close()
    zf.close()
def test_groupcomment_files_download_two_users(self):
    # Both candidates in the same group must be able to download the
    # zipped comment file posted by the first student.
    with self.settings(DEVILRY_COMPRESSED_ARCHIVES_DIRECTORY=self.backend_path):
        testgroup = mommy.make('core.AssignmentGroup')
        testuser1 = mommy.make(settings.AUTH_USER_MODEL, shortname='*****@*****.**',
                               fullname='Dewey Duck')
        testuser2 = mommy.make(settings.AUTH_USER_MODEL, shortname='*****@*****.**',
                               fullname='April Duck')
        candidate1 = mommy.make('core.Candidate', assignment_group=testgroup,
                                relatedstudent__user=testuser1)
        candidate2 = mommy.make('core.Candidate', assignment_group=testgroup,
                                relatedstudent__user=testuser2)
        testcomment = mommy.make('devilry_group.GroupComment', feedback_set__group=testgroup,
                                 user=testuser1, user_role='student')
        commentfile = mommy.make('devilry_comment.CommentFile', comment=testcomment,
                                 filename='testfile.txt')
        commentfile.file.save('testfile.txt', ContentFile('testcontent'))
        testdownloader = feedbackfeed_download_files.CompressedGroupCommentFileDownload()
        # First user download
        mockrequest = mock.MagicMock()
        mockrequest.cradmin_role = candidate1.assignment_group
        mockrequest.user = testuser1
        response = testdownloader.get(mockrequest, testcomment.id)
        zipfileobject = ZipFile(StringIO(response.content))
        filecontents = zipfileobject.read('testfile.txt')
        self.assertEquals(filecontents, 'testcontent')
        # Second user download
        mockrequest.cradmin_role = candidate2.assignment_group
        mockrequest.user = testuser2
        response = testdownloader.get(mockrequest, testcomment.id)
        zipfileobject = ZipFile(StringIO(response.content))
        filecontents = zipfileobject.read('testfile.txt')
        self.assertEquals(filecontents, 'testcontent')
def _epub_parser(epub):
    """Handle EPUB specific parsing
    Return dict of ebook metadata

    An EPUB must contain META-INF/container.xml, which contains the path
    to the EPUB metadata file.
    """
    sha256 = file_hash(epub)
    zf = ZipFile(epub)
    try:
        xml = xmltodict.parse(zf.read('META-INF/container.xml'))
        metadata_path = xml['container']['rootfiles']['rootfile']['@full-path']
        # TODO: validate this is true for all EPUBs
        raw_metadata = xmltodict.parse(zf.read(metadata_path))
    finally:
        zf.close()  # the archive was previously left open
    # Pre-seed 'identifiers' so the sha256 append below cannot raise
    # KeyError when the package declares no dc:identifier.
    metadata = {'format': 'epub', 'identifiers': []}
    for k, v in raw_metadata['package']['metadata'].items():
        if 'dc:' in k:
            if 'creator' in k:
                # Required element, needs additional parsing
                k = 'author'
                v = v['#text']
            if 'identifier' in k:
                # Required element, needs additional parsing
                k = 'identifiers'
                if not isinstance(v, list):
                    v = [v]  # Just in case we get a single element
                identifiers = []
                for i in v:
                    identifiers.append({'identifier': i['@opf:scheme'],
                                        'value': i['#text']})
                # Support multiple identifiers
                v = identifiers
            metadata[k.split('dc:')[-1]] = v
    metadata['identifiers'].append({'identifier': 'sha256', 'value': sha256})
    return metadata
class ApkParser:
    """Parses an Android APK (zip archive): manifest attributes via AXML
    and the signing certificate from META-INF.
    """

    def __init__(self, file):
        self._file = ZipFile(file)

    def getManifest(self):
        # AndroidManifest.xml is stored in binary XML (AXML) form.
        return AXML(self._file.read('AndroidManifest.xml')).get_xml_obj()

    def getPackageName(self):
        return self.getManifest().documentElement.getAttribute('package')

    def getVersionCode(self):
        return int(self.getManifest().documentElement.getAttribute('android:versionCode'))

    def getVersionName(self):
        return self.getManifest().documentElement.getAttribute('android:versionName')

    def getMinSdkVersion(self):
        return int(self.getManifest().documentElement.getElementsByTagName('uses-sdk')[0].getAttribute('android:minSdkVersion'))

    def _getCerts(self):
        # Signature blocks live under META-INF/*.RSA; yield each
        # certificate's DER bytes.
        for info in self._file.infolist():
            if info.filename.startswith('META-INF/') and info.filename.endswith('.RSA'):
                for cert in ContentInfo.load(self._file.read(info))['content']['certificates']:
                    yield cert.dump()

    def getCert(self):
        # Exactly one certificate is expected; anything else is an error.
        certs = list(self._getCerts())
        if len(certs) != 1:
            raise Exception('Cannot read certificate')
        return certs[0]
class Stick(object):
    """An on-screen controller ('stickface') loaded from a zip bundling
    the images plus a YAML ini that describes the layout.
    """

    def __init__(self, stickfaceFileLoc, iniName, pos=(0, 0)):
        self.iniName = iniName
        self.stickfaceFileLoc = stickfaceFileLoc
        self.loadStickfaceFile(self.stickfaceFileLoc)
        # One pressure/state value per configured button.
        self.buttons = [0] * len(self.stickfaceIni['buttons'])
        self.hat = (0, 0)
        self.pos = pos
        # Minimum button value considered "pressed" by drawController.
        self.cutoff = 1

    def loadStickfaceFile(self, stickfaceFileLoc):
        # Load images and layout description straight out of the zip.
        load = pygame.image.load
        self.stickfaceZip = ZipFile(stickfaceFileLoc, 'r')
        self.stickfaceIni = yamlLoad(self.stickfaceZip.read(self.iniName))
        self.controllerSize = tuple(self.stickfaceIni['controllerSize'])
        self.buttonImages = [load(StringIO(self.stickfaceZip.read(i)))
                             for i in self.stickfaceIni['buttonImages']]
        self.controllerImg = load(StringIO(self.stickfaceZip.read(self.stickfaceIni['controllerImage'])))
        self.buttonLoc = [tuple(i) for i in self.stickfaceIni['buttonLocs']]
        self.buttonSize = [tuple(i) for i in self.stickfaceIni['buttonSizes']]

    def drawController(self):
        # Compose the base image plus an overlay for each pressed button.
        finalImg = pygame.Surface(self.controllerSize).convert_alpha()
        finalImg.blit(self.controllerImg, (0, 0))
        # NOTE(review): hard-coded 12 -- presumably the button count;
        # range(len(self.buttons)) would track the ini. Confirm.
        for b in range(12):
            if self.buttons[b] >= self.cutoff:
                finalImg.blit(self.buttonImages[b], self.buttonLoc[b])
        return finalImg
class PebbleSystemResources(object):
    """Reads a Pebble firmware bundle: its manifest plus the packed
    system resources, and maps RESOURCE_ID_* names to pack file ids.
    """

    def __init__(self, firmware_path):
        self._firmware_path = firmware_path
        self._zipfile = ZipFile(firmware_path)
        self._manifest = json.loads(self._zipfile.read("manifest.json"))
        self._resource_data = self._zipfile.read("system_resources.pbpack")
        self.resources = PebbleResources(self._resource_data)
        self.resource_id_mapping = self.get_resource_id_mapping()

    def get_resource_id_mapping(self):
        """Build {RESOURCE_ID_<defName>[_WHITE|_BLACK]: file_id}.

        File ids are handed out sequentially starting at 1; a
        'png-trans' entry consumes two ids (white, then black variant).
        """
        mapping = {}
        next_id = 0
        for entry in self._manifest["debug"]["resourceMap"]["media"]:
            next_id += 1
            name = "RESOURCE_ID_" + entry["defName"]
            if entry["type"] == "png-trans":
                mapping[name + "_WHITE"] = next_id
                next_id += 1
                mapping[name + "_BLACK"] = next_id
            else:
                mapping[name] = next_id
        return mapping

    def verify_data(self):
        return self.resources.verify_data()

    def get_file_id(self, def_name):
        return self.resource_id_mapping[def_name]

    def get_chunk(self, file_id):
        return self.resources.get_chunk(file_id)
def _find_plugin_yaml(self, dataobj):
    '''Recursively search a zip/jar file-like object for plugin.yml and
    return its parsed contents, or False when not found/unparseable.
    '''
    yml = False
    try:
        # The first thing we are going to try to do is create a ZipFile
        # object with the StringIO data that we have.
        zfile = ZipFile(dataobj)
    except:
        # Not a readable zip: fall through and return False.
        pass
    else:
        # Before we start recursively jumping through hoops, lets first
        # check to see if the plugin.yml exists at this level. If so, then
        # just set the yaml variable. Otherwise we are gonna look for more
        # zip and jar files and dig into them.
        if 'plugin.yml' in zfile.namelist():
            try:
                # NOTE(review): yaml.load on archive content -- consider
                # yaml.safe_load if plugin.yml needs no custom tags.
                yml = yaml.load(zfile.read('plugin.yml'))
            except:
                return False
        else:
            for filename in zfile.namelist():
                if not yml and filename[-3:].lower() in ['zip', 'jar']:
                    data = StringIO()
                    data.write(zfile.read(filename))
                    yml = self._find_plugin_yaml(data)
                    data.close()
        zfile.close()
    return yml
def parseZip( fn ):
    # Parse a report/replay zip: collect the small diagnostic members,
    # rewrite the archive to contain only those (when anything else was
    # present), then hand the members to the database parser.
    date_time = ''
    members = dict()
    removemembers = False
    zipfile = ZipFile( fn )  # NOTE(review): local shadows the zipfile module name.
    cache.invalidate(recordlist.output, 'list_output', )
    files_of_interest = ['infolog.txt','ext.txt','platform.txt','script.txt','settings.txt','unitsync.log','client.txt','information.txt','demo.sdf']
    for info in zipfile.infolist():
        # Keep only known members smaller than 5 MiB.
        if info.filename in files_of_interest and info.file_size < 5 * 1024 * 1024:
            members[info.filename] = zipfile.read( info.filename )
            if info.filename == 'infolog.txt':
                date_time = info.date_time
        else:
            # Anything else triggers a rewrite of the archive below.
            removemembers = True
    if removemembers:
        # Rebuild the zip with only the members of interest, keeping the
        # original as <fn>.orig.  NOTE(review): the fixed /tmp staging
        # path is not collision- or symlink-safe.
        newzipfile = ZipFile (fn + '.new', 'w')
        tmpfilename = '/tmp/' + os.path.basename (fn) + '.tmp'
        for file in members.keys ():
            tmpfile = open (tmpfilename, 'w')
            tmpfile.write (zipfile.read (file))
            tmpfile.close ()
            newzipfile.write (tmpfilename, file)
            os.remove (tmpfilename)
        newzipfile.close ()
        zipfile.close ()
        os.rename (fn, fn + '.orig')
        os.rename (fn + '.new', fn)
    else:
        zipfile.close ()
    return db.parseZipMembers( fn, members, date_time )
def handle(self, *args, **options):
    # Dev-only management command: replace local Intervention and Issue
    # content with the prod snapshots bundled in data/intervention.zip.
    if not settings.DEBUG:
        print "this should never be run on production"
        return
    zipfile = ZipFile(os.path.join("data", "intervention.zip"), "r")
    # Load Intervention objects
    json = loads(zipfile.read("interventions.json"))
    print "clearing intervention prod database content..."
    Intervention.objects.all().delete()
    print "importing prod database content..."
    for i in json["interventions"]:
        # Create a placeholder row, then populate it from the dict.
        intervention = Intervention.objects.create(name="tmp")
        intervention.from_dict(i)
    # Load Problem Solving objects
    json = loads(zipfile.read("issues.json"))
    print "clearing problemsolving database content..."
    Issue.objects.all().delete()
    print "importing problemsolving prod database content..."
    for i in json["issues"]:
        issue = Issue.objects.create(name="tmp", ordinality=0)
        issue.from_dict(i)
class OdfDocument(object):
    """Manipulates odf documents in memory"""

    implements(IOdfDocument)

    def __init__(self, data, source_format):
        """Open the file in memory.

        Keyword arguments:
        data -- Content of the document
        source_format -- Document Extension
        """
        self._zipfile = ZipFile(StringIO(data))
        self.source_format = source_format
        # XXX - Maybe parsed_content should not be here, but on OOGranulate
        self.parsed_content = etree.fromstring(self.getContentXml())

    def getContentXml(self):
        """Returns the content.xml file as string"""
        return self._zipfile.read('content.xml')

    def getFile(self, path):
        """If exists, returns file as string, else return an empty string"""
        try:
            return self._zipfile.read(path)
        except KeyError:
            # ZipFile.read raises KeyError for a missing member.
            return ''

    def trash(self):
        """Remove the file in memory (closes the underlying zip)."""
        self._zipfile.close()
def load_and_save_scopes(self):
    # Collect every scope name declared by installed .tmLanguage grammars
    # (loose files and those inside .sublime-package zips), then persist
    # a {scope: shortened-scope} map into the plugin settings.
    scopes = set()
    # Loose .tmLanguage files under the packages directory.
    for x in os.walk(sublime.packages_path() + '/..'):
        for f in glob.glob(os.path.join(x[0], '*.tmLanguage')):
            for s in self.get_scopes_from(plistlib.readPlist(f)):
                scopes.add(s.strip())
    # Grammars bundled with the application binary.
    for x in os.walk(os.path.dirname(sublime.executable_path())):
        for f in glob.glob(os.path.join(x[0], '*.sublime-package')):
            input_zip = ZipFile(f)
            for name in input_zip.namelist():
                if name.endswith('.tmLanguage'):
                    for s in self.get_scopes_from(plistlib.readPlistFromBytes(input_zip.read(name))):
                        scopes.add(s.strip())
    # Grammars in user-installed .sublime-package archives.
    for x in os.walk(sublime.packages_path() + '/..'):
        for f in glob.glob(os.path.join(x[0], '*.sublime-package')):
            input_zip = ZipFile(f)
            for name in input_zip.namelist():
                if name.endswith('.tmLanguage'):
                    for s in self.get_scopes_from(plistlib.readPlistFromBytes(input_zip.read(name))):
                        scopes.add(s.strip())
    names = list(scopes)
    scopes = dict()
    for name in names:
        # Short form drops the common "source."/"text." prefix.
        value = name
        if value.startswith('source.'):
            value = value[7:]
        elif value.startswith('text.'):
            value = value[5:]
        scopes[name] = value
    self.settings.set('scopes', scopes)
    sublime.save_settings('smart-pieces.sublime-settings')
def get_update_data(inputio, getfilecount=True, getsoups=True):
    """Read an existing FFF-generated epub and collect everything needed
    to update it in place.

    Args:
        inputio: path or file-like blob of the epub (anything ZipFile accepts).
        getfilecount: when True, walk the OPF manifest counting chapter files.
        getsoups: when True (and getfilecount), also parse each chapter into
            a BeautifulSoup body and harvest embedded images/metadata.

    Returns:
        Tuple (source, filecount, soups, images, oldcover, calibrebookmark,
        logfile, urlsoups, datamaps) — see the inline comments for each.
    """
    epub = ZipFile(inputio, 'r')  # works equally well with inputio as a path or a blob

    ## Find the .opf file.
    container = epub.read("META-INF/container.xml")
    containerdom = parseString(container)
    rootfilenodelist = containerdom.getElementsByTagName("rootfile")
    rootfilename = rootfilenodelist[0].getAttribute("full-path")

    contentdom = parseString(epub.read(rootfilename))
    firstmetadom = contentdom.getElementsByTagName("metadata")[0]
    try:
        # dc:source holds the story URL the epub was downloaded from.
        source = ensure_text(
            firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data)
    except:
        # No dc:source element (or no text child) — caller handles None.
        source = None

    ## Save the path to the .opf file--hrefs inside it are relative to it.
    relpath = get_path_part(rootfilename)

    oldcover = None
    calibrebookmark = None
    logfile = None
    # Looking for pre-existing cover.
    for item in contentdom.getElementsByTagName("reference"):
        if item.getAttribute("type") == "cover":
            # there is a cover (x)html file, save the soup for it.
            href = relpath + item.getAttribute("href")
            src = None
            try:
                oldcoverhtmlhref = href
                oldcoverhtmldata = epub.read(href)
                oldcoverhtmltype = "application/xhtml+xml"
                # Look up the real media-type of the cover page in the
                # manifest; default above is used if not found.
                for item in contentdom.getElementsByTagName("item"):
                    if (relpath + item.getAttribute("href") == oldcoverhtmlhref):
                        oldcoverhtmltype = item.getAttribute("media-type")
                        break
                soup = make_soup(oldcoverhtmldata.decode("utf-8"))
                # first img or image tag.
                imgs = soup.findAll('img')
                if imgs:
                    src = get_path_part(href) + imgs[0]['src']
                else:
                    imgs = soup.findAll('image')
                    if imgs:
                        src = get_path_part(href) + imgs[0]['xlink:href']
                if not src:
                    continue
            except Exception as e:
                ## Calibre's Polish Book corrupts sub-book covers.
                logger.warning("Cover (x)html file %s not found" % href)
                logger.warning("Exception: %s" % (unicode(e)))
            try:
                # remove all .. and the path part above it, if present.
                # Mostly for epubs edited by Sigil.
                src = re.sub(r"([^/]+/\.\./)", "", src)
                #print("epubutils: found pre-existing cover image:%s"%src)
                oldcoverimghref = src
                oldcoverimgdata = epub.read(src)
                for item in contentdom.getElementsByTagName("item"):
                    if (relpath + item.getAttribute("href") == oldcoverimghref):
                        oldcoverimgtype = item.getAttribute("media-type")
                        break
                # oldcover bundles both the cover page and the cover image
                # so they can be written back into the updated epub.
                oldcover = (oldcoverhtmlhref, oldcoverhtmltype, oldcoverhtmldata,
                            oldcoverimghref, oldcoverimgtype, oldcoverimgdata)
            except Exception as e:
                logger.warning("Cover Image %s not found" % src)
                logger.warning("Exception: %s" % (unicode(e)))

    filecount = 0
    soups = []  # list of xhmtl blocks
    urlsoups = {}  # map of xhtml blocks by url
    images = {}  # dict() longdesc->data
    datamaps = defaultdict(dict)  # map of data maps by url
    if getfilecount:
        # spin through the manifest--only place there are item tags.
        for item in contentdom.getElementsByTagName("item"):
            # First, count the 'chapter' files.  FFF uses file0000.xhtml,
            # but can also update epubs downloaded from Twisting the
            # Hellmouth, which uses chapter0.html.
            if (item.getAttribute("media-type") == "application/xhtml+xml"):
                href = relpath + item.getAttribute("href")
                #print("---- item href:%s path part: %s"%(href,get_path_part(href)))
                if re.match(r'.*/log_page(_u\d+)?\.x?html', href):
                    try:
                        logfile = epub.read(href).decode("utf-8")
                    except:
                        pass  # corner case I bumped into while testing.
                if re.match(r'.*/(file|chapter)\d+(_u\d+)?\.x?html', href):
                    # (_u\d+)? is from calibre convert naming files
                    # 3/OEBPS/file0005_u3.xhtml etc.
                    if getsoups:
                        soup = make_soup(epub.read(href).decode("utf-8"))
                        # Harvest embedded images, keyed by their longdesc
                        # (FFF stores the original image URL there).
                        for img in soup.findAll('img'):
                            newsrc = ''
                            longdesc = ''
                            ## skip <img src="data:image..."
                            if not img['src'].startswith('data:image'):
                                try:
                                    newsrc = get_path_part(href) + img['src']
                                    # remove all .. and the path part above it, if present.
                                    # Mostly for epubs edited by Sigil.
                                    newsrc = re.sub(r"([^/]+/\.\./)", "", newsrc)
                                    longdesc = img['longdesc']
                                    data = epub.read(newsrc)
                                    images[longdesc] = data
                                    img['src'] = img['longdesc']
                                except Exception as e:
                                    # don't report u'OEBPS/failedtoload',
                                    # it indicates a failed download
                                    # originally.
                                    if newsrc != u'OEBPS/failedtoload':
                                        logger.warning(
                                            "Image %s not found!\n(originally:%s)" % (newsrc, longdesc))
                                    logger.warning("Exception: %s" % (unicode(e)), exc_info=True)
                        bodysoup = soup.find('body')
                        # ffdl epubs have chapter title h3
                        h3 = bodysoup.find('h3')
                        if h3:
                            h3.extract()
                        # TtH epubs have chapter title h2
                        h2 = bodysoup.find('h2')
                        if h2:
                            h2.extract()
                        for skip in bodysoup.findAll(
                                attrs={'class': 'skip_on_ffdl_update'}):
                            skip.extract()
                        ## <meta name="chapterurl" content="${url}"></meta>
                        #print("look for meta chapurl")
                        currenturl = None
                        chapurl = soup.find('meta', {'name': 'chapterurl'})
                        if chapurl:
                            if chapurl['content'] not in urlsoups:  # keep first found if more than one.
                                # print("Found chapurl['content']:%s"%chapurl['content'])
                                currenturl = chapurl['content']
                                urlsoups[chapurl['content']] = bodysoup
                        else:
                            # for older pre-meta. Only temp.
                            chapa = bodysoup.find('a', {'class': 'chapterurl'})
                            if chapa and chapa['href'] not in urlsoups:  # keep first found if more than one.
                                urlsoups[chapa['href']] = bodysoup
                                currenturl = chapa['href']
                                chapa.extract()
                        # Per-chapter title metadata, keyed by the chapter url.
                        chapterorigtitle = soup.find(
                            'meta', {'name': 'chapterorigtitle'})
                        if chapterorigtitle:
                            datamaps[currenturl][
                                'chapterorigtitle'] = chapterorigtitle[
                                    'content']
                        chaptertitle = soup.find('meta', {'name': 'chaptertitle'})
                        if chaptertitle:
                            datamaps[currenturl][
                                'chaptertitle'] = chaptertitle['content']
                        soups.append(bodysoup)

                    filecount += 1

    try:
        calibrebookmark = epub.read("META-INF/calibre_bookmarks.txt")
    except:
        pass

    #for k in images.keys():
        #print("\tlongdesc:%s\n\tData len:%s\n"%(k,len(images[k])))
    #print("datamaps:%s"%datamaps)
    return (source, filecount, soups, images, oldcover, calibrebookmark,
            logfile, urlsoups, datamaps)
class WsFile(attrib.Container):
    """Workspace File.

    Workspace files are Zip-Archives, that contain a INI-formatted
    configuration file 'workspace.ini' in the archives root, and arbitrary
    resource files within subfolders.

    Args:
        filepath: String or :term:`path-like object`, that points to a valid
            workspace file or None. If the filepath points to a valid
            workspace file, then the class instance is initialized with a
            memory copy of the file. If the given file, however, does not
            exist, isn't a valid ZipFile, or does not contain a workspace
            configuration, respectively one of the errors FileNotFoundError,
            BadZipFile or BadWsFile is raised. The default behaviour, if the
            filepath is None, is to create an empty workspace in the memory,
            that uses the default folders layout. In this case the attribute
            maintainer is initialized with the current username.
        pwd: Bytes representing password of workspace file.

    """

    #
    # Protected Class Variables
    #

    _config_file: ClassVar[Path] = Path('workspace.ini')
    _default_config: ClassVar[ConfigDict] = {
        'dc': {
            'creator': env.get_username(),
            'date': datetime.datetime.now()}}
    _default_dir_layout: ClassVar[StrList] = [
        'dataset', 'network', 'system', 'model', 'script']
    _default_encoding = env.get_encoding()

    #
    # Public Attributes and Attribute Groups
    #

    dc: attrib.Group = attrib.create_group(attrib.DCGroup)

    startup: property = attrib.MetaData(classinfo=Path, category='hooks')
    startup.__doc__ = """
    The startup script is a path, that points to a python script inside the
    workspace, which is executed after loading the workspace.
    """

    path: property = attrib.Virtual(fget='_get_path')
    path.__doc__ = """Filepath of the workspace."""

    name: property = attrib.Virtual(fget='_get_name')
    name.__doc__ = """Filename of the workspace without file extension."""

    files: property = attrib.Virtual(fget='search')
    files.__doc__ = """List of all files within the workspace."""

    folders: property = attrib.Virtual(fget='_get_folders')
    folders.__doc__ = """List of all folders within the workspace."""

    changed: property = attrib.Virtual(fget='_get_changed')
    changed.__doc__ = """Tells whether the workspace file has been changed."""

    #
    # Protected Attributes
    #

    _file: property = attrib.Content(classinfo=ZipFile)
    _buffer: property = attrib.Content(classinfo=BytesIOBaseClass)
    _path: property = attrib.Temporary(classinfo=Path)
    _pwd: property = attrib.Temporary(classinfo=bytes)
    _changed: property = attrib.Temporary(classinfo=bool, default=False)

    #
    # Events
    #

    def __init__(
            self, filepath: OptPathLike = None, pwd: OptBytes = None,
            parent: Optional[attrib.Container] = None) -> None:
        """Load Workspace from file."""
        super().__init__()
        if filepath:
            self.load(filepath, pwd=pwd)
        else:
            self._create_new()

    def __enter__(self) -> 'WsFile':
        """Enter with statement."""
        return self

    def __exit__(self, cls: ExcType, obj: Exc, tb: Traceback) -> None:
        """Close workspace file and buffer."""
        self.close()

    #
    # Public Methods
    #

    def load(self, filepath: PathLike, pwd: OptBytes = None) -> None:
        """Load Workspace from file.

        Args:
            filepath: String or :term:`path-like object`, that points to a
                valid workspace file. If the filepath points to a valid
                workspace file, then the class instance is initialized with a
                memory copy of the file. If the given file, however, does not
                exist, isn't a valid ZipFile, or does not contain a workspace
                configuration, respectively one of the errors
                FileNotFoundError, BadZipFile or BadWsFile is raised.
            pwd: Bytes representing password of workspace file.

        """
        # Initialize instance Variables, Buffer and buffered ZipFile
        self._changed = False
        self._path = env.expand(filepath)
        self._pwd = pwd
        self._buffer = BytesIO()
        self._file = ZipFile(self._buffer, mode='w')

        # Copy contents from ZipFile to buffered ZipFile
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            try:
                with ZipFile(self.path, mode='r') as fh:
                    for zinfo in fh.infolist():
                        data = fh.read(zinfo, pwd=pwd)
                        # TODO ([email protected]): The zipfile standard
                        # module currently does not support encryption in write
                        # mode of new ZipFiles. See:
                        # https://docs.python.org/3/library/zipfile.html
                        # When support is provided, the below line for writing
                        # files shall be replaced by:
                        # self._file.writestr(zinfo, data, pwd=pwd)
                        self._file.writestr(zinfo, data)
            except FileNotFoundError as err:
                raise FileNotFoundError(
                    f"file '{self.path}' does not exist") from err
            except BadZipFile as err:
                raise BadZipFile(
                    f"file '{self.path}' is not a valid ZIP file") from err

        # Try to open and load workspace configuration from buffer
        structure = {
            'dc': self._get_attr_types(group='dc'),
            'hooks': self._get_attr_types(category='hooks')}
        try:
            with self.open(self._config_file) as file:
                cfg = inifile.load(file, structure=structure)
        except KeyError as err:
            raise BadWsFile(
                f"workspace '{self.path}' is not valid: "
                f"file '{self._config_file}' could not be loaded") from err

        # Link configuration
        self._set_attr_values(cfg.get('dc', {}), group='dc')  # type: ignore

    def save(self) -> None:
        """Save the workspace to it's filepath."""
        if isinstance(self.path, Path):
            self.saveas(self.path)
        else:
            raise FileNotGivenError(
                "use saveas() to save the workspace to a file")

    def saveas(self, filepath: PathLike) -> None:
        """Save the workspace to a file.

        Args:
            filepath: String or :term:`path-like object`, that represents the
                name of a workspace file.

        """
        path = env.expand(filepath)

        # Update datetime
        self.date = datetime.datetime.now()

        # Update 'workspace.ini'
        with self.open(self._config_file, mode='w') as file:
            inifile.save({
                'dc': self._get_attr_values(group='dc'),
                'hooks': self._get_attr_values(category='hooks')}, file)

        # Remove duplicates from workspace
        self._remove_duplicates()

        # Mark platform, which created the files as Windows
        # to avoid inference of wrong Unix permissions
        for zinfo in self._file.infolist():
            zinfo.create_system = 0

        # Close ZipArchive (to allow to read the buffer)
        self._file.close()

        # Read buffer and write workspace file
        if not isinstance(self._buffer, BytesIO):
            raise TypeError("buffer has not been initialized")
        with open(path, 'wb') as file:
            file.write(self._buffer.getvalue())

        # Close buffer
        self._buffer.close()

        # Reload saved workspace from file
        self.load(path, pwd=self._pwd)

    def get_file_accessor(self, path: PathLike) -> FileAccessorBase:
        """Get file accessor to workspace member.

        Args:
            path: String or :term:`path-like object`, that represents a
                workspace member. In reading mode the path has to point to a
                valid workspace file, or a FileNotFoundError is raised. In
                writing mode the path by default is treated as a file path.
                New directories can be written by setting the argument is_dir
                to True.

        Returns:
            :class:`File accessor <nemoa.types.FileAccessorBase>` to workspace
            member.

        """
        def wrap_open(path: PathLike) -> AnyFunc:
            def wrapped_open(
                    obj: FileAccessorBase, *args: Any,
                    **kwds: Any) -> FileLike:
                return self.open(path, *args, **kwds)
            return wrapped_open

        return type(  # pylint: disable=E0110
            'FileAccessor', (FileAccessorBase,), {
                'name': str(path),
                'open': wrap_open(path)})()

    def open(
            self, path: PathLike, mode: str = 'r', encoding: OptStr = None,
            is_dir: bool = False) -> FileLike:
        """Open file within the workspace.

        Args:
            path: String or :term:`path-like object`, that represents a
                workspace member. In reading mode the path has to point to a
                valid workspace file, or a FileNotFoundError is raised. In
                writing mode the path by default is treated as a file path.
                New directories can be written by setting the argument is_dir
                to True.
            mode: String, which characters specify the mode in which the file
                is to be opened. The default mode is reading in text mode.
                Supported characters are:
                'r': Reading mode (default)
                'w': Writing mode
                'b': Binary mode
                't': Text mode (default)
            encoding: In binary mode encoding has not effect. In text mode
                encoding specifies the name of the encoding, which in reading
                and writing mode respectively is used to decode the stream's
                bytes into strings, and to encode strings into bytes. By
                default the preferred encoding of the operating system is
                used.
            is_dir: Boolean value which determines, if the path is to be
                treated as a directory or not. This information is required
                for writing directories to the workspace. The default
                behaviour is not to treat paths as directories.

        Returns:
            :term:`File object` in reading or writing mode.

        Examples:
            >>> with self.open('workspace.ini') as file:
            >>>     print(file.read())

        """
        # Open file handler to workspace member
        if 'w' in mode:
            if 'r' in mode:
                raise ValueError(
                    "'mode' is not allowed to contain the "
                    "characters 'r' AND 'w'")
            file = self._open_write(path, is_dir=is_dir)
        else:
            file = self._open_read(path)

        # Wrap binary files to text files if required
        if 'b' in mode:
            if 't' in mode:
                raise ValueError(
                    "'mode' is not allowed to contain the "
                    "characters 'b' AND 't'")
            return file
        return TextIOWrapper(
            file, encoding=encoding or self._default_encoding,
            write_through=True)

    def close(self) -> None:
        """Close current workspace and buffer."""
        if hasattr(self._file, 'close'):
            self._file.close()
        if hasattr(self._buffer, 'close'):
            self._buffer.close()

    def copy(self, source: PathLike, target: PathLike) -> bool:
        """Copy file within workspace.

        Args:
            source: String or :term:`path-like object`, that points to a file
                in the directory structure of the workspace. If the file does
                not exist, a FileNotFoundError is raised. If the filepath
                points to a directory, an IsADirectoryError is raised.
            target: String or :term:`path-like object`, that points to a new
                filename or an existing directory in the directory structure
                of the workspace. If the target is a directory the target file
                consists of the directory and the basename of the source file.
                If the target file already exists a FileExistsError is raised.

        Returns:
            Boolean value which is True if the file was copied.

        """
        # Check if source file exists and is not a directory
        src_file = PurePath(source).as_posix()
        src_infos = self._locate(source)
        if not src_infos:
            raise FileNotFoundError(
                f"workspace file '{src_file}' does not exist")
        src_info = src_infos[-1]
        if getattr(src_info, 'is_dir')():
            raise IsADirectoryError(
                f"'{src_file}/' is a directory not a file")

        # If target is a directory get name of target file from
        # source filename
        tgt_file = PurePath(target).as_posix()
        if tgt_file == '.':
            tgt_file = Path(src_file).name
        else:
            tgt_infos = self._locate(target)
            if tgt_infos:
                if getattr(tgt_infos[-1], 'is_dir')():
                    tgt_path = PurePath(tgt_file, Path(src_file).name)
                    tgt_file = tgt_path.as_posix()

        # Check if target file already exists
        if self._locate(tgt_file):
            raise FileExistsError(
                f"workspace file '{tgt_file}' already exist.")

        # Read binary data from source file
        data = self._file.read(src_info, pwd=self._pwd)

        # Create ZipInfo for target file from source file info
        tgt_time = getattr(src_info, 'date_time')
        tgt_info = ZipInfo(filename=tgt_file, date_time=tgt_time)  # type: ignore

        # Write binary data to target file
        # TODO ([email protected]): The zipfile standard module currently
        # does not support encryption in write mode. See:
        # https://docs.python.org/3/library/zipfile.html
        # When support is provided, the below line shall be replaced by:
        # self._file.writestr(tgt_info, data, pwd=self._pwd)
        self._file.writestr(tgt_info, data)
        self._changed = True

        # Check if new file exists
        return bool(self._locate(tgt_file))

    def move(self, source: PathLike, target: PathLike) -> bool:
        """Move file within workspace.

        Args:
            source: String or :term:`path-like object`, that points to a file
                in the directory structure of the workspace. If the file does
                not exist, a FileNotFoundError is raised. If the filepath
                points to a directory, an IsADirectoryError is raised.
            target: String or :term:`path-like object`, that points to a new
                filename or an existing directory in the directory structure
                of the workspace. If the target is a directory the target file
                consists of the directory and the basename of the source file.
                If the target file already exists a FileExistsError is raised.

        Returns:
            Boolean value which is True if the file has been moved.

        """
        # Copy source file to target file or directory
        # and on success remove source file
        return self.copy(source, target) and self.unlink(source)

    def append(self, source: PathLike, target: OptPathLike = None) -> bool:
        """Append file to the workspace.

        Args:
            source: String or :term:`path-like object`, that points to a valid
                file in the directory structure if the system. If the file
                does not exist, a FileNotFoundError is raised. If the filepath
                points to a directory, a IsADirectoryError is raised.
            target: String or :term:`path-like object`, that points to a valid
                directory in the directory structure of the workspace. By
                default the root directory is used. If the directory does not
                exist, a FileNotFoundError is raised. If the target directory
                already contains a file, which name equals the filename of the
                source, a FileExistsError is raised.

        Returns:
            Boolean value which is True if the file has been appended.

        """
        # Check source file
        src_file = env.expand(source)
        if not src_file.exists():
            raise FileNotFoundError(f"file '{src_file}' does not exist")
        if src_file.is_dir():
            raise IsADirectoryError(f"'{src_file}' is a directory not a file")

        # Check target directory
        if target:
            tgt_dir = PurePath(target).as_posix() + '/'
            if not self._locate(tgt_dir):
                raise FileNotFoundError(
                    f"workspace directory '{tgt_dir}' does not exist")
        else:
            tgt_dir = '.'
        tgt_file = Path(tgt_dir, src_file.name)
        if self._locate(tgt_file):
            raise FileExistsError(
                f"workspace directory '{tgt_dir}' already contains a file "
                f"with name '{src_file.name}'")

        # Create ZipInfo entry from source file
        filename = PurePath(tgt_file).as_posix()
        date_time = time.localtime(src_file.stat().st_mtime)[:6]
        zinfo = ZipInfo(filename=filename, date_time=date_time)  # type: ignore

        # Copy file to archive
        with src_file.open('rb') as src:
            data = src.read()
        # TODO ([email protected]): The zipfile standard module currently
        # does not support encryption in write mode. See:
        # https://docs.python.org/3/library/zipfile.html
        # When support is provided, the below line shall be replaced by:
        # self._file.writestr(zinfo, data, pwd=pwd)
        self._file.writestr(zinfo, data)

        return True

    def read_text(self, filepath: PathLike, encoding: OptStr = None) -> str:
        """Read text from file.

        Args:
            filepath: String or :term:`path-like object`, that points to a
                valid file in the directory structure of the workspace. If the
                file does not exist a FileNotFoundError is raised.
            encoding: Specifies the name of the encoding, which is used to
                decode the stream's bytes into strings. By default the
                preferred encoding of the operating system is used.

        Returns:
            Contents of the given filepath encoded as string.

        """
        with self.open(filepath, mode='r', encoding=encoding) as file:
            text = file.read()
        if not isinstance(text, str):
            return ''
        return text

    def read_bytes(self, filepath: PathLike) -> bytes:
        """Read bytes from file.

        Args:
            filepath: String or :term:`path-like object`, that points to a
                valid file in the dirctory structure of the workspace. If the
                file does not exist a FileNotFoundError is raised.

        Returns:
            Contents of the given filepath as bytes.

        """
        with self.open(filepath, mode='rb') as file:
            blob = file.read()
        if not isinstance(blob, bytes):
            return b''
        return blob

    def write_text(
            self, text: str, filepath: PathLike,
            encoding: OptStr = None) -> int:
        """Write text to file.

        Args:
            text: String, which has to be written to the given file.
            filepath: String or :term:`path-like object`, that represents a
                valid filename in the dirctory structure of the workspace.
            encoding: Specifies the name of the encoding, which is used to
                encode strings into bytes. By default the preferred encoding
                of the operating system is used.

        Returns:
            Number of characters, that are written to the file.

        """
        with self.open(filepath, mode='w', encoding=encoding) as file:
            if isinstance(file, TextIOBaseClass):
                return file.write(text)
        return 0

    def write_bytes(self, blob: BytesLike, filepath: PathLike) -> int:
        """Write bytes to file.

        Args:
            blob: Bytes, which are to be written to the given file.
            filepath: String or :term:`path-like object`, that represents a
                valid filename in the dirctory structure of the workspace.

        Returns:
            Number of bytes, that are written to the file.

        """
        with self.open(filepath, mode='wb') as file:
            if isinstance(file, BytesIOBaseClass):
                return file.write(blob)
        return 0

    def unlink(self, filepath: PathLike, ignore_missing: bool = True) -> bool:
        """Remove file from workspace.

        Args:
            filepath: String or :term:`path-like object`, that points to a
                file in the directory structure of the workspace. If the
                filepath points to a directory, an IsADirectoryError is
                raised. For the case, that the file does not exist, the
                argument ignore_missing determines, if a FileNotFoundError is
                raised. ignore_missing: Boolean value which determines, if
                FileNotFoundError is raised, if the target file does not
                exist. The default behaviour, is to ignore missing files.

        Returns:
            Boolean value, which is True if the given file was removed.

        """
        matches = self._locate(filepath)
        if not matches:
            if ignore_missing:
                return True
            filename = PurePath(filepath).as_posix()
            # Bugfix: the message previously contained the literal text
            # '(unknown)' instead of interpolating the missing filename.
            raise FileNotFoundError(f"file '{filename}' does not exist")
        if getattr(matches[-1], 'is_dir')():
            dirname = PurePath(filepath).as_posix() + '/'
            raise IsADirectoryError(f"'{dirname}' is a directory not a file")
        return self._remove_members(matches)

    def mkdir(self, dirpath: PathLike, ignore_exists: bool = False) -> bool:
        """Create a new directory at the given path.

        Args:
            dirpath: String or :term:`path-like object`, that represents a
                valid directory name in the directory structure of the
                workspace. If the directory already exists, the argument
                ignore_exists determines, if a FileExistsError is raised.
            ignore_exists: Boolean value which determines, if FileExistsError
                is raised, if the target directory already exists. The default
                behaviour is to raise an error, if the file already exists.

        Returns:
            Boolean value, which is True if the given directory was created.

        """
        matches = self._locate(dirpath)
        if not matches:
            with self.open(dirpath, mode='w', is_dir=True):
                pass
        elif not ignore_exists:
            dirname = PurePath(dirpath).as_posix() + '/'
            raise FileExistsError(f"directory '{dirname}' already exists")
        return True

    def rmdir(
            self, dirpath: PathLike, recursive: bool = False,
            ignore_missing: bool = False) -> bool:
        """Remove directory from workspace.

        Args:
            dirpath: String or :term:`path-like object`, that points to a
                directory in the directory structure of the workspace. If the
                directory does not exist, the argument ignore_missing
                determines, if a FileNotFoundError is raised.
            ignore_missing: Boolean value which determines, if
                FileNotFoundError is raised, if the target directory does not
                exist. The default behaviour, is to raise an error if the
                directory is missing.
            recursive: Boolean value which determines, if directories are
                removed recursively. If recursive is False, then only empty
                directories can be removed. If recursive, however, is True,
                then all files and subdirectories are alse removed. By default
                recursive is False.

        Returns:
            Boolean value, which is True if the given directory was removed.

        """
        matches = self._locate(dirpath)
        dirname = PurePath(dirpath).as_posix() + '/'
        if not matches:
            if ignore_missing:
                return True
            raise FileNotFoundError(f"directory '{dirname}' does not exist")
        files = self.search(dirname + '*')
        if not files:
            return self._remove_members(matches)
        if not recursive:
            raise DirNotEmptyError(f"directory '{dirname}' is not empty")
        allmatches = matches
        for file in files:
            allmatches += self._locate(file)
        return self._remove_members(allmatches)

    def search(self, pattern: OptStr = None) -> StrList:
        """Search for files in the workspace.

        Args:
            pattern: Search pattern that contains Unix shell-style wildcards:
                '*': Matches arbitrary strings
                '?': Matches single characters
                [seq]: Matches any character in seq
                [!seq]: Matches any character not in seq
                By default a list of all files and directories is returned.

        Returns:
            List of files and directories in the directory structure of the
            workspace, that match the search pattern.

        """
        # Get list of normalized unique paths of workspace members
        paths: PathLikeList = []
        for zinfo in self._file.infolist():
            path = PurePath(zinfo.filename).as_posix()
            if getattr(zinfo, 'is_dir')():
                path += '/'
            if path not in paths:
                paths.append(path)

        # Match path list with given pattern
        if pattern:
            paths = env.match_paths(paths, pattern)

        # Sort paths
        return sorted([str(path) for path in paths])

    #
    # Protected Methods
    #

    def _create_new(self) -> None:
        # Initialize instance Variables, Buffer and buffered ZipFile
        self._set_attr_values(self._default_config['dc'], group='dc')
        self._path = None
        self._changed = False
        self._pwd = None
        self._buffer = BytesIO()
        self._file = ZipFile(self._buffer, mode='w')

        # Create folders
        for folder in self._default_dir_layout:
            self.mkdir(folder)

    def _open_read(self, path: PathLike) -> BytesIOLike:
        # Locate workspace member by it's path
        # and open file handler for reading the file
        matches = self._locate(path)
        if not matches:
            fname = PurePath(path).as_posix()
            raise FileNotFoundError(
                f"workspace member with filename '{fname}' does not exist")
        # Select latest version of file
        zinfo = matches[-1]
        return self._file.open(zinfo, pwd=self._pwd, mode='r')

    def _open_write(self, path: PathLike, is_dir: bool = False) -> BytesIOLike:
        # Determine workspace member name from path
        # and get ZipInfo with local time as date_time
        filename = PurePath(path).as_posix()
        if is_dir:
            filename += '/'
        zinfo = ZipInfo(  # type: ignore
            filename=filename,
            date_time=time.localtime()[:6])
        # Catch Warning for duplicate files
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            # TODO ([email protected]): The zipfile standard
            # module currently does not support encryption in write
            # mode of new ZipFiles. See:
            # https://docs.python.org/3/library/zipfile.html
            # When support is provided, the below line for writing
            # files shall be replaced by:
            # file = self._file.open(zinfo, mode='w', pwd=self._pwd)
            file = self._file.open(zinfo, mode='w')
        self._changed = True
        return file

    def _locate(self, path: PathLike, sort: bool = True) -> ZipInfoList:
        # Get list of member zipinfos
        zinfos = self._file.infolist()
        # Match members by path-like filenames
        matches = [i for i in zinfos if Path(i.filename) == Path(path)]
        if sort:
            # Sort matches by datetime
            matches = sorted(matches, key=lambda i: i.date_time)
        # Return sorted matches
        return matches

    def _get_name(self) -> OptStr:
        return getattr(self._path, 'stem', None)

    def _get_path(self) -> OptPath:
        return self._path

    def _get_changed(self) -> bool:
        return self._changed

    def _get_folders(self) -> StrList:
        names: StrList = []
        for zinfo in self._file.infolist():
            if getattr(zinfo, 'is_dir')():
                name = PurePath(zinfo.filename).as_posix() + '/'
                names.append(name)
        return sorted(names)

    def _remove_members(self, zinfos: ZipInfoList) -> bool:
        # Return True if list of members is empty
        if not zinfos:
            return True

        # Remove entries in the list of members from workspace
        new_zinfos = []
        zids = [(zinfo.filename, zinfo.date_time) for zinfo in zinfos]
        for zinfo in self._file.infolist():
            zid = (zinfo.filename, zinfo.date_time)
            if zid in zids:
                zids.remove(zid)
            else:
                new_zinfos.append(zinfo)

        # If any entry on the list could not be found raise an error
        if zids:
            names = [zid[0] for zid in zids]
            raise FileNotFoundError(
                f"could not locate workspace members: {names}")

        # Create new ZipArchive in Memory
        new_buffer = BytesIO()
        new_file = ZipFile(new_buffer, mode='w')

        # Copy all workspace members on the new list from current
        # to new workspace
        for zinfo in new_zinfos:
            data = self._file.read(zinfo, pwd=self._pwd)
            new_file.writestr(zinfo, data)

        # Close current workspace and buffer and link new workspace and buffer
        self._file.close()
        self._buffer.close()
        self._buffer = new_buffer
        self._file = new_file
        self._changed = True

        return True

    def _remove_duplicates(self) -> bool:
        # Get list of duplicates
        zinfos: ZipInfoList = []
        for filename in self.files:
            zinfos += self._locate(filename, sort=True)[:-1]

        # Remove duplicates
        return self._remove_members(zinfos)
# Download (or reload from cache) the UCI SMS Spam Collection dataset.
import io
import os

import requests
import numpy as np
import tensorflow as tf
from zipfile import ZipFile

# NOTE: 'os' and 'io' were used below but never imported — added above.
data_dir = 'data/'
data_file = 'spam.txt'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

if not os.path.isfile(os.path.join(data_dir, data_file)):
    # First run: fetch the zipped dataset and cache it as plain text.
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
    r = requests.get(zip_url)
    z = ZipFile(io.BytesIO(r.content))
    # Renamed from 'file' to avoid shadowing the builtin.
    raw = z.read('SMSSpamCollection')
    text_data = raw.decode()
    # Strip non-ASCII characters before splitting into lines.
    text_data = text_data.encode('ascii', errors='ignore')
    text_data = text_data.decode().split('\n')
    with open(os.path.join(data_dir, data_file), 'w') as file_conn:
        for text in text_data:
            file_conn.write("{}\n".format(text))
else:
    # Cached: read the lines back; drop the trailing empty line.
    text_data = []
    with open(os.path.join(data_dir, data_file), 'r') as file_conn:
        for row in file_conn:
            text_data.append(row)
    text_data = text_data[:-1]
def get_lines_from_zip(zip_str):
    """Yield every text line from every member of a zipped blob.

    Args:
        zip_str: Bytes of a ZIP archive.

    Yields:
        Each line (str) of each archive member, decoded as UTF-8.
    """
    archive = ZipFile(BytesIO(zip_str))
    for member in archive.namelist():
        text = archive.read(member).decode('utf-8')
        for row in text.splitlines():
            yield row
    def import_from_json(cls, campaign, batch_user, batch_data, max_count):
        """
        Creates new DataAssessmentTask instances based on JSON input.

        Accepts either a ZIP archive containing .json batch files or a raw
        JSON file. Each batch task must carry exactly 100 items; tasks with
        a different item count are skipped. Processing stops early once
        *max_count* tasks have been created (0 means unlimited).
        """
        batch_meta = batch_data.metadata
        batch_name = batch_data.dataFile.name
        batch_file = batch_data.dataFile
        batch_json = None
        if batch_name.endswith('.zip'):
            if not is_zipfile(batch_file):
                _msg = 'Batch {0} not a valid ZIP archive'.format(batch_name)
                LOGGER.warn(_msg)
                return

            batch_zip = ZipFile(batch_file)
            batch_json_files = [
                x for x in batch_zip.namelist() if x.endswith('.json')
            ]
            # TODO: implement proper support for multiple json files in archive.
            # Currently only the last .json parsed survives in batch_json.
            for batch_json_file in batch_json_files:
                batch_content = batch_zip.read(batch_json_file).decode('utf-8')
                # NOTE(review): json.loads() dropped its `encoding` keyword in
                # Python 3.9 — confirm the runtime version this targets.
                batch_json = loads(batch_content, encoding='utf-8')

        else:
            batch_json = loads(str(batch_file.read(), encoding="utf-8"))

        from datetime import datetime
        t1 = datetime.now()

        current_count = 0
        max_length_id = 0
        max_length_text = 0
        for batch_task in batch_json:
            if max_count > 0 and current_count >= max_count:
                _msg = 'Stopping after max_count={0} iterations'.format(
                    max_count)
                LOGGER.info(_msg)
                print(_msg)

                t2 = datetime.now()
                print(t2 - t1)
                return

            print('Batch name/no:', batch_name, batch_task['task']['batchNo'])

            # Build all TextPairWithDomain rows for this task, tracking the
            # longest IDs/texts seen (diagnostics for column sizing).
            new_items = []
            for item in batch_task['items']:
                current_length_id = len(item['targetID'])
                current_length_text = len(item['targetText'])

                if current_length_id > max_length_id:
                    print('Longest target ID', current_length_id,
                          item['targetID'])
                    max_length_id = current_length_id

                if current_length_text > max_length_text:
                    print('Longest targetText', current_length_text,
                          item['targetText'].encode('utf-8'))
                    max_length_text = current_length_text

                new_item = TextPairWithDomain(
                    sourceID=item['sourceID'],
                    sourceText=item['sourceText'],
                    targetID=item['targetID'],
                    targetText=item['targetText'],
                    createdBy=batch_user,
                    itemID=item['itemID'],
                    itemType=item['itemType'],
                    documentDomain=item['documentDomain'],
                    sourceURL=item['sourceURL'],
                    targetURL=item['targetURL'])
                new_items.append(new_item)

            # Tasks are expected to contain exactly 100 items; skip otherwise.
            if not len(new_items) == 100:
                _msg = 'Expected 100 items for task but found {0}'.format(
                    len(new_items))
                LOGGER.warn(_msg)
                print(_msg)
                continue

            current_count += 1

            #for new_item in new_items:
            #    new_item.metadata = batch_meta
            #    new_item.save()
            batch_meta.textpair_set.add(*new_items, bulk=False)
            batch_meta.save()

            new_task = DataAssessmentTask(
                campaign=campaign,
                requiredAnnotations=batch_task['task']['requiredAnnotations'],
                batchNo=batch_task['task']['batchNo'],
                batchData=batch_data,
                createdBy=batch_user,
            )
            new_task.save()

            #for new_item in new_items:
            #    new_task.items.add(new_item)
            new_task.items.add(*new_items)
            new_task.save()

            _msg = 'Success processing batch {0}, task {1}'.format(
                str(batch_data), batch_task['task']['batchNo'])
            LOGGER.info(_msg)
            print(_msg)

        _msg = 'Max length ID={0}, text={1}'.format(max_length_id,
                                                    max_length_text)
        LOGGER.info(_msg)
        print(_msg)

        t2 = datetime.now()
        print(t2 - t1)
xlsm_file = sys.argv[1] else: print("\nUtility to extract a vbaProject.bin binary from an Excel 2007+ " "xlsm macro file for insertion into an XlsxWriter file." "\n" "See: https://xlsxwriter.readthedocs.io/working_with_macros.html\n" "\n" "Usage: vba_extract file.xlsm\n") exit() try: # Open the Excel xlsm file as a zip file. xlsm_zip = ZipFile(xlsm_file, 'r') # Read the xl/vbaProject.bin file. vba_data = xlsm_zip.read('xl/' + vba_filename) # Write the vba data to a local file. vba_file = open(vba_filename, "wb") vba_file.write(vba_data) vba_file.close() except IOError: # Use exc_info() for Python 2.5+ compatibility. e = sys.exc_info()[1] print("File error: %s" % str(e)) exit() except KeyError: # Usually when there isn't a xl/vbaProject.bin member in the file. e = sys.exc_info()[1]
    def perform(self):  # The function that will be executed must have this name
        """Fetch the DWD MOSMIX_L forecast KMZ for the configured station,
        parse the KML inside it, and push the parsed series into the
        RainMachine database via self.addValues(). (Python 2 code:
        urllib2/StringIO and `except Exception, e` syntax.)
        """
        # Accessing system location settings
        #lat = self.settings.location.latitude
        log.info("Hello KMZ")
        # Other location settings
        #self.zip
        #self.name
        #self.state
        #self.latitude
        #self.longitude
        #self.address
        #self.elevation
        #self.gmtOffset
        #self.dstOffset
        #self.stationID
        #self.stationName
        #self.et0Average
        station = self.params.get("station", None)
        if station is None or station == "":
            # Fall back to a hard-coded default station id.
            station = "K4086"
            log.debug("No station set, using (%s)" % station)
        url = "https://opendata.dwd.de/weather/local_forecasts/mos/MOSMIX_L/single_stations/" + str(
            station) + "/kml/MOSMIX_L_LATEST_" + str(station) + ".kmz"
        URLParams = [("User-Agent", "RainMachine v2")]

        try:
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            raw = response.read()
            # The .kmz is a zip archive; the first member is the KML document.
            zipFile = ZipFile(StringIO.StringIO(raw))
            kml = zipFile.read(zipFile.filelist[0])
            rootNode = ET.fromstring(kml)
            nameSpaces = {
                'dwd': "https://opendata.dwd.de/weather/lib/pointforecast_dwd_extension_V1_0.xsd",
                'gx': "http://www.google.com/kml/ext/2.2",
                'xal': "urn:oasis:names:tc:ciq:xsdschema:xAL:2.0",
                'kml': "http://www.opengis.net/kml/2.2",
                'atom': "http://www.w3.org/2005/Atom"
            }
            timeStampsNode = rootNode.findall(
                "./kml:Document/kml:ExtendedData/dwd:ProductDefinition/dwd:ForecastTimeSteps/",
                nameSpaces)
            extendedDataNode = rootNode.findall(
                "./kml:Document/kml:Placemark/kml:ExtendedData/", nameSpaces)
            nowTimeStamp = rmCurrentTimestamp()
            # Number of leading forecast columns that lie in the past and
            # must be skipped in every value series below.
            skipColumens = 0
            # Parse Timestamps
            timeStampList = []
            for ts in timeStampsNode:
                # Strip fractional seconds + 'Z' suffix for strptime parsing.
                compatibleString = re.sub(r"\.\d+Z$", '', ts.text)
                unix = rmTimestampFromDateAsString(compatibleString,
                                                   "%Y-%m-%dT%H:%M:%S")
                #ts = datetime.datetime.strptime(compatibleString, "%Y-%m-%dT%H:%M:%S")
                if (unix < nowTimeStamp):
                    skipColumens += 1
                    continue
                timeStampList.append(unix)

            dwdData = []
            parsedData = DWDData()
            for data in extendedDataNode:
                currentCol = 0
                for k, v in data.attrib.items():
                    # Only attributes naming a forecast element are of interest.
                    if k.endswith("elementName"):
                        valueNode = data.find("./dwd:value", nameSpaces)
                        if valueNode == None:
                            continue
                        allValues = valueNode.text.split()
                        # Drop columns belonging to past timestamps.
                        if skipColumens > 0:
                            rawValues = allValues[skipColumens:]
                        else:
                            rawValues = allValues
                        if len(rawValues) != len(timeStampList):
                            continue
                        # Temperature
                        if v.lower() == "TTT".lower():
                            parsedData.Temperature = parseFloats(
                                rawValues, timeStampList,
                                temperatureTransformation)
                            continue
                        # Min Temperature
                        if v.lower() == "TN".lower():
                            parsedData.MinTemp = parseFloats(
                                rawValues, timeStampList,
                                temperatureTransformation)
                            continue
                        # Max Temperature
                        if v.lower() == "TX".lower():
                            parsedData.MaxTemp = parseFloats(
                                rawValues, timeStampList,
                                temperatureTransformation)
                            continue
                        # Probability of precipitation > 0.0mm during the last hour
                        if v.lower() == "wwP".lower():
                            parsedData.POP = parseFloats(
                                rawValues, timeStampList)
                            continue
                        # Wind
                        if v.lower() == "FF".lower():
                            parsedData.Wind = parseFloats(
                                rawValues, timeStampList)
                            continue
                        # Solar Radiation
                        # NOTE(review): this applies pressureTransformation to
                        # a radiation series — looks suspicious; confirm.
                        if v.lower() == "Rad1h".lower():
                            parsedData.SolarRadiation = parseFloats(
                                rawValues, timeStampList,
                                pressureTransformation)
                            continue
                        # Cloud
                        if v.lower() == "Neff".lower():
                            parsedData.SkyCover = parseFloats(
                                rawValues, timeStampList, skyCoverTransform)
                            continue
                        # QPF
                        if v.lower() == "RRdc".lower():
                            parsedData.QPF = parseFloats(
                                rawValues, timeStampList, None, yesterday)
                            continue
                        # evapotranspiration
                        if v.lower() == "PEvap".lower():
                            parsedData.ET0 = parseFloats(
                                rawValues, timeStampList, None, yesterday)
                            continue
                        # Pressure
                        if v.lower() == "PPPP".lower():
                            parsedData.Pressure = parseFloats(
                                rawValues, timeStampList,
                                pressureTransformation)
                            continue
                        # Dewpoint
                        if v.lower() == "Td".lower():
                            parsedData.DewPoint = parseFloats(
                                rawValues, timeStampList,
                                temperatureTransformation)
                            continue
                        # Condition
                        if v.lower() == "WPcd1".lower():
                            parsedData.Condition = parseFloats(
                                rawValues, timeStampList, conditionParser,
                                yesterday)
                            continue

            # Push every series that was actually parsed into the database.
            log.info("Adding parsed values to database")
            if parsedData.Temperature != None:
                log.debug("Adding Temparatures values")
                self.addValues(RMParser.dataType.TEMPERATURE,
                               parsedData.Temperature)
            if parsedData.MinTemp != None:
                log.debug("Adding Min-Temparatures values")
                self.addValues(RMParser.dataType.MINTEMP, parsedData.MinTemp)
            if parsedData.MaxTemp != None:
                log.debug("Adding Max-Temparatures values")
                self.addValues(RMParser.dataType.MAXTEMP, parsedData.MaxTemp)
            # NOTE(review): parsedData.RH is never assigned in the parse loop
            # above, so this branch cannot trigger from this code path.
            if parsedData.RH != None:
                log.debug("Adding RH values")
                self.addValues(RMParser.dataType.RH, parsedData.RH)
            if parsedData.Wind != None:
                log.debug("Adding Wind values")
                self.addValues(RMParser.dataType.WIND, parsedData.Wind)
            if parsedData.SolarRadiation != None:
                log.debug("Adding Solar Radiation values")
                self.addValues(RMParser.dataType.SOLARRADIATION,
                               parsedData.SolarRadiation)
            if parsedData.SkyCover != None:
                log.debug("Adding SkyCover values")
                self.addValues(RMParser.dataType.SKYCOVER, parsedData.SkyCover)
            if parsedData.QPF != None:
                log.debug("Adding QPF values")
                self.addValues(RMParser.dataType.QPF, parsedData.QPF)
            if parsedData.ET0 != None:
                log.debug("Adding ET0 values")
                # ET0 upload intentionally disabled (left commented out).
                #self.addValues(RMParser.dataType.ET0, parsedData.ET0)
            if parsedData.POP != None:
                log.debug("Adding POP values")
                self.addValues(RMParser.dataType.POP, parsedData.POP)
            if parsedData.Pressure != None:
                log.debug("Adding Pressure values")
                self.addValues(RMParser.dataType.PRESSURE, parsedData.Pressure)
            if parsedData.DewPoint != None:
                log.debug("Adding DewPoint values")
                self.addValues(RMParser.dataType.DEWPOINT, parsedData.DewPoint)
            if parsedData.Condition != None:
                self.addValues(RMParser.dataType.CONDITION,
                               parsedData.Condition)

        except Exception, e:
            log.error("*** Error running DWD parser")
            log.exception(e)
class Epub(object):
    """Read-only view of an ePub archive with support for writing a
    'preview' copy that keeps only a chosen subset of the spine."""

    def __init__(self, filename=None):
        self.filename = filename

    def zip_get_name(self, name_searched):
        """Return *name_searched* if it is a member of the archive, else None."""
        for name in self._zipfile.namelist():
            if name == name_searched:
                return name
        return None

    @property
    def filename(self):
        return self._filename

    @filename.setter
    def filename(self, filename):
        # Setting the filename (re)opens the archive and parses container,
        # content (.opf) and ncx documents.
        self._filename = filename
        self.ncx = self.content = self._zipfile = None
        if self._filename:
            self._zipfile = ZipFile(self._filename)
            self.container = Container(self._zipfile.read(CONTAINER_NAME))
            self.content = Content(self._zipfile.read(self.content_filename),
                                   file_url=self.content_filename)
            self.ncx = Ncx(self._zipfile.read(self.content.ncx_item.url))
            # Harvest URLs from parsable files
            # Maps manifest item id -> set of URLs referenced by that item
            # (empty set for non-parsable items such as images).
            self.urls_used_into_id = {}
            for name in self._zipfile.infolist():
                data = self._zipfile.read(name)
                url = str(name.filename)
                if url in self.content.urls_by_id:
                    item = self.content.manifest[self.content.urls_by_id[url]]
                    if item:
                        urls = set()
                        if item.parsable():
                            parser = URLLister(parent_path=url)
                            try:
                                parser.feed(data)
                            except UnicodeDecodeError:
                                # Skip members that are not valid text.
                                continue
                            for url_found in parser.urls:
                                urls.add(url_found)
                        self.urls_used_into_id[item.id] = urls

    @property
    def zipfile(self):
        return self._zipfile

    @property
    def container_filename(self):
        name = self.zip_get_name(CONTAINER_NAME)
        if name:
            return name
        raise epexc.ContainerFileNotFound(name)

    @property
    def content_filename(self):
        name = self.zip_get_name(self.container.rootfile)
        if name:
            return name
        raise epexc.ContentFileNotFound(name)

    def create_preview(self, preview_filename, spine_preview,
                       missing_page=None, overwrite=False):
        """
        Create a preview, writing filename, with only spine elements,
        with an optional missing_page for missing links and return an ePub
        """
        for item in spine_preview:
            if not item in self.content.spine:
                raise epexc.ElementNotInSpine(item)
        # Check preview_filename
        if os.path.exists(preview_filename) and not overwrite:
            raise epexc.PreviewAlreadyExists(preview_filename)
        # Spine IDs and URLs removed from preview
        spine_ids_removed = set(id for id in self.content.spine
                                if id not in spine_preview)
        urls_to_be_removed = set(self.content.manifest[id].url
                                 for id in spine_ids_removed)
        # Recursively check URLs used (content urls are always included)
        used_urls = set(self.content.metadata_content_urls)
        exploring_ids = deque(spine_preview)
        explored_ids = set()
        while exploring_ids:
            id = exploring_ids.popleft()
            # Explored id only if is internal, not explored and not removed
            if id in self.content.manifest and not id in explored_ids and not id in spine_ids_removed:
                used_urls.add(self.content.manifest[id].url)
                for url in self.urls_used_into_id[id]:
                    if url in self.content.urls_by_id and not url in urls_to_be_removed:
                        used_urls.add(url)
                        exploring_ids.append(self.content.urls_by_id[url])
                explored_ids.add(id)
        # Check every url in manifest if not used
        for id, item in self.content.manifest.items():
            if not item.url in used_urls:
                urls_to_be_removed.add(item.url)
        # Write preview epub
        zip_out = ZipFile(preview_filename, mode='w')
        for name in self._zipfile.infolist():
            url = name.filename
            parent_path_parts = url.split("/")[:-1]
            insert_file = False
            if (not url in urls_to_be_removed):
                # Core ePub files are always kept; other members only when
                # they are referenced from the manifest.
                if url in (self.content_filename, CONTAINER_NAME,
                           MIMETYPE_NAME):
                    insert_file = True
                elif url in self.content.urls_by_id:
                    insert_file = True
            if insert_file:
                data = self._zipfile.read(name)
                if url == self.content_filename:
                    dom = xml.dom.minidom.parseString(data)
                    # Process content manifest
                    for item in dom.getElementsByTagNameNS("*", "manifest")[0].getElementsByTagNameNS("*", "item"):
                        url = absolutize_url(item.getAttribute("href"),
                                             parent_path_parts)
                        if url in urls_to_be_removed:
                            try:
                                item.parentNode.removeChild(item)
                            except xml.dom.NotFoundErr:
                                pass
                    # Process content spine
                    for itemref in dom.getElementsByTagNameNS("*", "spine")[0].getElementsByTagNameNS("*", "itemref"):
                        url = self.content.manifest[itemref.getAttribute("idref")].url
                        if url in urls_to_be_removed:
                            try:
                                itemref.parentNode.removeChild(itemref)
                            except xml.dom.NotFoundErr:
                                pass
                    data = dom.toxml(dom.encoding)
                elif url in self.content.urls_by_id:
                    item = self.content.manifest[self.content.urls_by_id[url]]
                    if item == self.content.ncx_item:
                        # Process toc
                        dom = xml.dom.minidom.parseString(data)
                        for navPoint in dom.getElementsByTagNameNS("*", "navPoint"):
                            for node in navPoint.childNodes:
                                if node.nodeType == node.ELEMENT_NODE and node.nodeName == "content":
                                    url = absolutize_url(node.getAttribute("src"),
                                                         parent_path_parts)
                                    if url in urls_to_be_removed:
                                        try:
                                            navPoint.parentNode.removeChild(navPoint)
                                        except xml.dom.NotFoundErr:
                                            pass
                        # Renumber surviving navPoints so playOrder stays
                        # consecutive after removals.
                        for cont, navPoint in enumerate(dom.getElementsByTagNameNS("*", "navPoint")):
                            playOrder = navPoint.getAttribute("playOrder")
                            if playOrder != str(cont + 1):
                                playOrder = navPoint.setAttribute("playOrder", str(cont + 1))
                        data = dom.toxml(dom.encoding)
                    elif item.parsable():
                        # Process generic html/xml
                        dom = xml.dom.minidom.parseString(data)
                        parent_path_parts = url.split("/")[:-1]
                        for tagname, attr in (("content", "src"),
                                              ("img", "src"),
                                              ("link", "href"),
                                              ("a", "href")):
                            for node in dom.getElementsByTagNameNS("*", tagname):
                                if node.nodeType == node.ELEMENT_NODE:
                                    url = absolutize_url(node.getAttribute(attr),
                                                         parent_path_parts)
                                    if url in urls_to_be_removed:
                                        try:
                                            node.parentNode.removeChild(node)
                                        except xml.dom.NotFoundErr:
                                            pass
                        data = dom.toxml(dom.encoding)
                zip_out.writestr(copy.deepcopy(name), data)
        zip_out.close()
def load_csv_files(zip_file: ZipFile, files: list) -> pd.DataFrame: df = pd.DataFrame() for file in files: csv = io.BytesIO(zip_file.read(file)) df = df.append(pd.read_csv(csv)) return df
import io from zipfile import ZipFile import pandas as pd import mysql.connector from datetime import date,datetime,timedelta #Download the zip file dataset_url = "http://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/datos_abiertos_covid19.zip" response = urllib.request.urlopen(dataset_url) data_zip = response.read() data_zip = io.BytesIO(data_zip) data_zip = ZipFile(file = data_zip) #Extract the csv file csv_filename = data_zip.namelist()[0] data_csv = data_zip.read(csv_filename) data_csv = io.BytesIO(data_csv) #Read the csv (in chunks because the csv is large) chunks = pd.read_csv(data_csv, encoding='ANSI', chunksize=100000, low_memory=False) #Clases for data class StatsPerAges: cases = 0 deaths = 0 recovered = 0 mortality = 0 c_diabetes = 0 d_diabetes = 0 m_diabetes = 0
class DarFile:
    """
    Provides access to the contents of a .dar file.

    A DAR is a zip archive whose .dalf members are DAML-LF packages and
    whose META-INF/MANIFEST.MF (when present) carries build metadata.
    """
    def __init__(self, dar_path: 'Union[str, Path, BinaryIO]'):
        # Accept either a filesystem path or an already-open binary stream.
        if isinstance(dar_path, (str, Path)):
            self.dar_path = pathify(dar_path)
            self.dar_contents = ZipFile(str(self.dar_path))
        else:
            self.dar_path = None
            self.dar_contents = ZipFile(dar_path)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        self.dar_contents.close()

    def read_metadata(self) -> 'PackageStore':
        """Parse every .dalf member into a fresh PackageStore."""
        from ..model.types_store import PackageStore
        store = PackageStore.empty()
        dalf_names = self.get_dalf_names()
        for dalf_name in dalf_names:
            contents = self.dar_contents.read(dalf_name)
            store.register_all(parse_dalf(contents))
        return store

    def get_archives(self) -> 'Mapping[str, bytes]':
        """
        Return a mapping from package ID to byte contents.
        """
        from ..protocols.v1.pb_parse_metadata import parse_archive_payload
        archives = {}  # type: Dict[str, bytes]
        for dalf_name in self.get_dalf_names():
            contents = self.dar_contents.read(dalf_name)
            payload = parse_archive_payload(contents)
            archives[payload.hash] = contents
        return archives

    def get_dalf_names(self) -> 'Sequence[str]':
        """Return names of all archive members with a .dalf extension."""
        dalf_names = []
        for name in self.dar_contents.namelist():
            _, ext = path.splitext(name)
            if ext == '.dalf':
                dalf_names.append(name)
        return dalf_names

    def get_manifest(self) -> 'Optional[Mapping[str, str]]':
        """
        Return the contents of the manifest of this DAR as a
        name -> value mapping, or None when no manifest member exists.

        NOTE(review): JAR-style manifests wrap long values onto
        continuation lines (leading space); this parser treats every line
        independently, so wrapped values would be split — confirm DAR
        manifests never wrap.
        """
        names = self.dar_contents.namelist()
        if 'META-INF/MANIFEST.MF' in names:
            manifest_bytes = self.dar_contents.read('META-INF/MANIFEST.MF')
            manifest = {}
            for line in manifest_bytes.decode('utf-8').splitlines():
                # NOTE(review): stray debug print — consider removing.
                print(line)
                name, _, value = line.partition(':')
                manifest[name] = value.strip()
            return manifest
        else:
            return None

    def get_sdk_version(self) -> 'Optional[str]':
        """
        Return the SDK version used to compile this dar (if this
        information is available).
        """
        manifest = self.get_manifest()
        return manifest.get('Sdk-Version') if manifest is not None else None

    def get_package_provider(self) -> 'PackageProvider':
        """Load every .dalf payload into an in-memory package provider,
        keyed by the archive's package hash."""
        from typing import Dict
        from ..model.types_store import MemoryPackageProvider
        from .._gen.com.digitalasset.daml_lf_dev.daml_lf_pb2 import Archive
        packages = {}  # type: Dict[str, bytes]
        dalf_names = self.get_dalf_names()
        for dalf_name in dalf_names:
            contents = self.dar_contents.read(dalf_name)
            a = Archive()
            a.ParseFromString(contents)
            packages[a.hash] = a.payload
        return MemoryPackageProvider(packages)
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) print(len(mnist.train.images)) print(len(mnist.test.images)) print(len(mnist.validation.images)) print(mnist.train.labels[1, :]) # Ham/Spam Text Data import requests import io from zipfile import ZipFile # Get/read zip file zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip' r = requests.get(zip_url) z = ZipFile(io.BytesIO(r.content)) file = z.read('SMSSpamCollection') # Format Data text_data = file.decode() text_data = text_data.encode('ascii', errors='ignore') text_data = text_data.decode().split('\n') text_data = [x.split('\t') for x in text_data if len(x) >= 1] [text_data_target, text_data_train] = [list(x) for x in zip(*text_data)] print(len(text_data_train)) print(set(text_data_target)) print(text_data_train[1]) # Movie Review Data import requests import io import tarfile
#!/usr/bin/env python from zipfile import ZipFile, ZIP_DEFLATED import os.path # reading & extracting rzip = ZipFile("DATA/textfiles.zip") # <1> print(rzip.namelist()) # <2> ty = rzip.read('tyger.txt').decode() # <3> print(ty[:50]) rzip.extract('parrot.txt') # <4> # creating a zip file wzip = ZipFile("example.zip", mode="w", compression=ZIP_DEFLATED) # <5> for base in "parrot tyger knights alice poe_sonnet spam".split(): filename = os.path.join("DATA", base + '.txt') print("adding {} as {}".format(filename, base + '.txt')) wzip.write(filename, base + '.txt') # <6>
from zipfile import ZipFile from urllib.request import urlopen from io import BytesIO from bs4 import BeautifulSoup wordFile = urlopen("http://pythonscraping.com/pages/AWordDocument.docx").read() wordFile = BytesIO(wordFile) document = ZipFile(wordFile) xml_content = document.read('word/document.xml') wordObj = BeautifulSoup(xml_content.decode('utf-8')) textStrings = wordObj.findAll("w:t") for textElem in textStrings: print(textElem.text)
def absoluteFilePaths(directory): for dirpath,_,filenames in os.walk(directory): for f in filenames: yield os.path.abspath(os.path.join(dirpath, f)) async def zipper(filelist,name): with ZipFile(name, 'w') as zipMe: for file in filelist: zipMe.write(file, compress_type=ZIP_DEFLATED) async def unzipper(my_dir,my_zip) zip_file = ZipFile(my_zip, 'r') for files in zip_file.namelist(): data = zip_file.read(files, my_dir) myfile_path = os.path.join(my_dir, files.split("/")[-1]) myfile = open(myfile_path, "wb") myfile.write(data) myfile.close() zip_file.close() async def hey(event): reply_message = await event.get_reply_message() await event.reply("Starting to part the files please wait...") name = await bot.download_media(reply_message,"./") dir = str(uuid4()) os.mkdir(dir) await unzipper(dir,name) files = list(absoluteFilePaths(dir))
def get_update_data(inputio, getfilecount=True, getsoups=True):
    """Open an FFF-produced ePub and collect what an update needs.

    Returns a 7-tuple: (source URL bytes or None, chapter-file count,
    list of chapter body soups, dict longdesc->image bytes, pre-existing
    cover tuple or None, calibre bookmark bytes or None, log page text
    or None).
    """
    epub = ZipFile(inputio, 'r')  # works equally well with inputio as a path or a blob

    ## Find the .opf file.
    container = epub.read("META-INF/container.xml")
    containerdom = parseString(container)
    rootfilenodelist = containerdom.getElementsByTagName("rootfile")
    rootfilename = rootfilenodelist[0].getAttribute("full-path")

    contentdom = parseString(epub.read(rootfilename))
    firstmetadom = contentdom.getElementsByTagName("metadata")[0]
    try:
        source = firstmetadom.getElementsByTagName(
            "dc:source")[0].firstChild.data.encode("utf-8")
    except:
        # No dc:source element — the epub was not produced by FFF.
        source = None

    ## Save the path to the .opf file--hrefs inside it are relative to it.
    relpath = get_path_part(rootfilename)

    oldcover = None
    calibrebookmark = None
    logfile = None
    # Looking for pre-existing cover.
    for item in contentdom.getElementsByTagName("reference"):
        if item.getAttribute("type") == "cover":
            # there is a cover (x)html file, save the soup for it.
            href = relpath + item.getAttribute("href")
            oldcoverhtmlhref = href
            oldcoverhtmldata = epub.read(href)
            oldcoverhtmltype = "application/xhtml+xml"
            # Look up the real media-type of the cover page in the manifest.
            for item in contentdom.getElementsByTagName("item"):
                if (relpath + item.getAttribute("href") == oldcoverhtmlhref):
                    oldcoverhtmltype = item.getAttribute("media-type")
                    break
            soup = bs.BeautifulSoup(oldcoverhtmldata.decode("utf-8"))
            src = None
            # first img or image tag.
            imgs = soup.findAll('img')
            if imgs:
                src = get_path_part(href) + imgs[0]['src']
            else:
                imgs = soup.findAll('image')
                if imgs:
                    src = get_path_part(href) + imgs[0]['xlink:href']
            if not src:
                continue
            try:
                # remove all .. and the path part above it, if present.
                # Mostly for epubs edited by Sigil.
                src = re.sub(r"([^/]+/\.\./)", "", src)
                #print("epubutils: found pre-existing cover image:%s"%src)
                oldcoverimghref = src
                oldcoverimgdata = epub.read(src)
                oldcoverimgtype = "image/jpeg"
                for item in contentdom.getElementsByTagName("item"):
                    if (relpath + item.getAttribute("href") == oldcoverimghref):
                        oldcoverimgtype = item.getAttribute("media-type")
                        break
                oldcover = (oldcoverhtmlhref, oldcoverhtmltype,
                            oldcoverhtmldata, oldcoverimghref,
                            oldcoverimgtype, oldcoverimgdata)
            except Exception as e:
                logger.warn("Cover Image %s not found" % src)
                logger.warn("Exception: %s" % (unicode(e)))
                traceback.print_exc()

    filecount = 0
    soups = []  # list of xhmtl blocks
    images = {}  # dict() longdesc->data
    if getfilecount:
        # spin through the manifest--only place there are item tags.
        for item in contentdom.getElementsByTagName("item"):
            # First, count the 'chapter' files. FFF uses file0000.xhtml,
            # but can also update epubs downloaded from Twisting the
            # Hellmouth, which uses chapter0.html.
            if (item.getAttribute("media-type") == "application/xhtml+xml"):
                href = relpath + item.getAttribute("href")
                #print("---- item href:%s path part: %s"%(href,get_path_part(href)))
                if re.match(r'.*/log_page\.x?html', href):
                    try:
                        logfile = epub.read(href).decode("utf-8")
                    except:
                        pass
                        # corner case I bumped into while testing.
                if re.match(r'.*/(file|chapter)\d+\.x?html', href):
                    if getsoups:
                        soup = bs.BeautifulSoup(
                            epub.read(href).decode("utf-8"), "html5lib")
                        # Swap every image's src to its longdesc key and
                        # stash the raw bytes keyed by longdesc.
                        for img in soup.findAll('img'):
                            newsrc = ''
                            longdesc = ''
                            try:
                                newsrc = get_path_part(href) + img['src']
                                # remove all .. and the path part above it, if present.
                                # Mostly for epubs edited by Sigil.
                                newsrc = re.sub(r"([^/]+/\.\./)", "", newsrc)
                                longdesc = img['longdesc']
                                data = epub.read(newsrc)
                                images[longdesc] = data
                                img['src'] = img['longdesc']
                            except Exception as e:
                                logger.warn(
                                    "Image %s not found!\n(originally:%s)" %
                                    (newsrc, longdesc))
                                logger.warn("Exception: %s" % (unicode(e)))
                                traceback.print_exc()
                        soup = soup.find('body')
                        # ffdl epubs have chapter title h3
                        h3 = soup.find('h3')
                        if h3:
                            h3.extract()
                        # TtH epubs have chapter title h2
                        h2 = soup.find('h2')
                        if h2:
                            h2.extract()
                        for skip in soup.findAll(
                                attrs={'class': 'skip_on_ffdl_update'}):
                            skip.extract()
                        soups.append(soup)

                    filecount += 1

    try:
        calibrebookmark = epub.read("META-INF/calibre_bookmarks.txt")
    except:
        # No calibre bookmarks member — optional.
        pass

    #for k in images.keys():
    #    print("\tlongdesc:%s\n\tData len:%s\n"%(k,len(images[k])))
    return (source, filecount, soups, images, oldcover, calibrebookmark,
            logfile)
    def test_exported_meeting_json_has_correct_file_names(self, browser):
        """The export-meeting-zip view must produce a zip whose meeting.json
        references document files by their normalized (umlaut-transliterated)
        names, and those files must actually exist in the archive."""
        self.login(self.committee_responsible, browser)
        self.schedule_paragraph(self.meeting, u'A Gesch\xfcfte')
        with freeze(localized_datetime(2017, 12, 13)):
            self.schedule_ad_hoc(self.meeting, u'Ad-hoc Traktand\xfem').decide()
            self.schedule_proposal(self.meeting,
                                   self.submitted_word_proposal).decide()
        with freeze(localized_datetime(2017, 12, 14)):
            self.meeting.model.close()

        browser.open(self.meeting, view='export-meeting-zip')
        self.assertEquals('application/zip', browser.contenttype)

        zip_file = ZipFile(StringIO(browser.contents), 'r')
        meeting_json = json.loads(zip_file.read('meeting.json'))

        # the protocol is generated during the tests and its checksum cannot
        # be predicted
        meeting_json['meetings'][0]['protocol']['checksum'] = 'unpredictable'

        self.assert_json_structure_equal(
            {
                'meetings': [{
                    'agenda_items': [{
                        'title': u'A Gesch\xfcfte'
                    }, {
                        'number': '1.',
                        'proposal': {
                            'checksum': 'e00d6c8fb32c30d3ca3a3f8e5d873565482567561023016d9ca18243ff1cfa14',
                            'file': '1. Ad-hoc Traktandthm/Ad hoc agenda item Ad-hoc Traktandthm.docx',
                            'modified': '2017-12-12T23:00:00+01:00'
                        },
                        'title': u'Ad-hoc Traktand\xfem'
                    }, {
                        'attachments': [{
                            'checksum': '51d6317494eccc4a73154625a6820cb6b50dc1455eb4cf26399299d4f9ce77b2',
                            'file': '2. Anderungen am Personalreglement/Vertragsentwurf.docx',
                            'modified': '2016-08-31T15:21:46+02:00',
                            'title': u'Vertr\xe4gsentwurf'
                        }],
                        'number': '2.',
                        'proposal': {
                            'checksum': 'e00d6c8fb32c30d3ca3a3f8e5d873565482567561023016d9ca18243ff1cfa14',
                            'file': '2. Anderungen am Personalreglement/Anderungen am Personalreglement.docx',
                            'modified': '2016-08-31T15:21:44+02:00'
                        },
                        'title': u'\xc4nderungen am Personalreglement'
                    }],
                    'committee': {
                        'oguid': 'plone:1009233300',
                        'title': u'Rechnungspr\xfcfungskommission'
                    },
                    'end': '2016-09-12T17:00:00+00:00',
                    'location': u'B\xfcren an der Aare',
                    'protocol': {
                        'checksum': 'unpredictable',
                        'file': 'Protocol-9. Sitzung der Rechnungsprufungskommission.docx',
                        'modified': '2017-12-13T23:00:00+01:00'
                    },
                    'start': '2016-09-12T15:30:00+00:00',
                    'title': u'9. Sitzung der Rechnungspr\xfcfungskommission'
                }],
                'version': '1.0.0'
            }, meeting_json)

        # Every file referenced above must be present in the archive.
        file_names = zip_file.namelist()
        for file_name in [
                '1. Ad-hoc Traktandthm/Ad hoc agenda item Ad-hoc Traktandthm.docx',
                '2. Anderungen am Personalreglement/Vertragsentwurf.docx',
                '2. Anderungen am Personalreglement/Anderungen am Personalreglement.docx',
                'Protocol-9. Sitzung der Rechnungsprufungskommission.docx'
        ]:
            self.assertIn(file_name, file_names)
def verify(certificate, jar_file, sf_name=None):
    """
    Verifies signature of a JAR file.

    Limitations:
    - diagnostic is less verbose than of jarsigner

    :return None if verification succeeds.
    :exception SignatureBlockFileVerificationError, ManifestChecksumError,
        JarChecksumError, JarSignatureMissingError

    Reference:
    http://docs.oracle.com/javase/7/docs/technotes/guides/jar/jar.html#Signature_Validation
    Note that the validation is done in three steps. Failure at any step is a
    failure of the whole validation.
    """  # noqa

    # Step 0: get the "key alias", used also for naming of sig-related files.
    zip_file = ZipFile(jar_file)
    sf_files = [f for f in zip_file.namelist() if file_matches_sigfile(f)]
    if len(sf_files) == 0:
        raise JarSignatureMissingError("No .SF file in %s" % jar_file)
    elif len(sf_files) > 1:
        # Ambiguous unless the caller named the .SF file explicitly.
        if sf_name is None:
            msg = "Multiple .SF files in %s, but SF_NAME.SF not specified" \
                  % jar_file
            raise VerificationError(msg)
        elif ('META-INF/' + sf_name) in sf_files:
            sf_filename = 'META-INF/' + sf_name
        else:
            msg = "No .SF file in %s named META-INF/%s (found %d .SF files)" \
                  % (jar_file, sf_name, len(sf_files))
            raise VerificationError(msg)
    elif len(sf_files) == 1:
        if sf_name is None:
            sf_filename = sf_files[0]
        elif sf_files[0] == 'META-INF/' + sf_name:
            sf_filename = sf_files[0]
        else:
            msg = "No .SF file in %s named META-INF/%s" % (jar_file, sf_name)
            raise VerificationError(msg)

    # Strip the "META-INF/" prefix and ".SF" suffix to get the alias.
    key_alias = sf_filename[9:-3]  # "META-INF/%s.SF"
    sf_data = zip_file.read(sf_filename)

    # Step 1: check the crypto part.
    file_list = zip_file.namelist()
    sig_block_filename = None
    # JAR specification mentions only RSA and DSA; jarsigner also has EC
    # TODO: what about "SIG-*"?
    signature_extensions = ("RSA", "DSA", "EC")
    for extension in signature_extensions:
        candidate_filename = "META-INF/%s.%s" % (key_alias, extension)
        if candidate_filename in file_list:
            sig_block_filename = candidate_filename
            break

    if sig_block_filename is None:
        msg = "None of %s found in JAR" % \
              ", ".join(key_alias + "." + x for x in signature_extensions)
        raise JarSignatureMissingError(msg)

    sig_block_data = zip_file.read(sig_block_filename)
    try:
        verify_signature_block(certificate, sf_data, sig_block_data)
    except SignatureBlockVerificationError as message:
        message = "Signature block verification failed: %s" % message
        raise SignatureBlockFileVerificationError(message)

    # KEYALIAS.SF is correctly signed.
    # Step 2: Check that it contains correct checksum of the manifest.
    signature_manifest = SignatureManifest()
    signature_manifest.parse(sf_data)

    jar_manifest = Manifest()
    jar_manifest.load_from_jar(jar_file)

    errors = signature_manifest.verify_manifest(jar_manifest)
    if len(errors) > 0:
        msg = "%s: in .SF file, section checksum(s) failed for: %s" \
              % (jar_file, ",".join(errors))
        raise ManifestChecksumError(msg)

    # Checksums of MANIFEST.MF itself are correct.
    # Step 3: Check that it contains valid checksums for each file
    # from the JAR.  NOTE: the check is done for JAR entries. If some
    # JAR entries are deleted after signing, the verification still
    # succeeds.  This seems to not follow the reference specification,
    # but that's what jarsigner does.
    errors = jar_manifest.verify_jar_checksums(jar_file)
    if len(errors) > 0:
        msg = "Checksum(s) for jar entries of jar file %s failed for: %s" \
              % (jar_file, ",".join(errors))
        raise JarChecksumError(msg)

    return None
def get_recipe_from_file(self, file): ingredient_mode = False direction_mode = False description_mode = False ingredients = [] directions = [] descriptions = [] for fl in file.readlines(): line = fl.decode("utf-8") if 'title:' in line: title = line.replace('title:', '').replace('"', '').strip() if 'image:' in line: image = line.replace('image:', '').strip() if 'tags:' in line: tags = line.replace('tags:', '').strip() if ingredient_mode: if len(line) > 2 and 'directions:' not in line: ingredients.append(line[2:]) if '---' in line and direction_mode: direction_mode = False description_mode = True if direction_mode: if len(line) > 2: directions.append(line[2:]) if 'ingredients:' in line: ingredient_mode = True if 'directions:' in line: ingredient_mode = False direction_mode = True if description_mode and len(line) > 3 and '---' not in line: descriptions.append(line) recipe = Recipe.objects.create(name=title, created_by=self.request.user, internal=True, space=self.request.space) for k in tags.split(','): print(f'adding keyword {k.strip()}') keyword, created = Keyword.objects.get_or_create( name=k.strip(), space=self.request.space) recipe.keywords.add(keyword) step = Step.objects.create( instruction='\n'.join(directions) + '\n\n' + '\n'.join(descriptions), space=self.request.space, ) ingredient_parser = IngredientParser(self.request, True) for ingredient in ingredients: if len(ingredient.strip()) > 0: amount, unit, food, note = ingredient_parser.parse(ingredient) f = ingredient_parser.get_food(food) u = ingredient_parser.get_unit(unit) step.ingredients.add( Ingredient.objects.create( food=f, unit=u, amount=amount, note=note, original_text=ingredient, space=self.request.space, )) recipe.steps.add(step) for f in self.files: if '.zip' in f['name']: import_zip = ZipFile(f['file']) for z in import_zip.filelist: if re.match(f'^images/{image}$', z.filename): self.import_recipe_image( recipe, BytesIO(import_zip.read(z.filename)), filetype=get_filetype(z.filename)) return recipe
def generate_hocr_xar(
    hocr_dir,
    output_dir,
    metadata_file,
    imagexarfile,
    dictionary_file,
    ocr_engine,
    classifier,
    datetime,
    verbose,
    clobber,
):
    """Build a '-texts.xar' archive from a directory of hOCR files.

    Pipeline: rename/copy hOCR inputs -> convert to XHTML namespace ->
    (kraken-specific fixups) -> dehyphenate -> spellcheck -> generate
    meta.xml / repo.xml / expath-pkg.xml -> zip everything into the xar.

    :param metadata_file: open file object with Dublin Core metadata, or None.
    :param imagexarfile: open file object for the matching image xar, or None;
        when given, its meta.xml supplies the identifier and repo info.
    :param dictionary_file: accepted for CLI compatibility; currently unused
        (the dictionary path is hard-coded below, see TODO).
    :param datetime: run timestamp string, or None to use "now".
    :param clobber: overwrite an existing output file when True.

    Fixes vs. previous revision: the final repo/accuracy step used
    ``imagexarfile.name`` even when ``imagexarfile`` is None (AttributeError);
    the image-archive error message printed ``metadata_file``; and the
    repo.xml write could pass ``bytes`` (from ``ET.tostring``) to a
    text-mode file.
    """
    import glob, os, shutil, tempfile
    import lxml.etree as etree
    import datetime as dt
    import lxml.etree as ET
    from halo import Halo
    from os.path import basename
    from pathlib import Path
    import traceback
    from zipfile import ZipFile

    print("generating hocr xar")

    # Default the run datetime to "now" if the caller did not provide one.
    if datetime is None:
        now = dt.datetime.now()
        datetime = now.strftime("%Y-%m-%d-%H-%M-%S")
        if verbose:
            print("supplying datetime for this OCR run:", datetime)

    identifier = ""
    repo_file_string = ""
    # Use the classifier's bare name, not the path to the classifier file.
    classifier = Path(classifier).stem

    if metadata_file is not None:
        identifier = get_identifier_from_metadata_file(metadata_file.name)
        xsl_file = Path(__file__).parent / "XSLT/make_repo_texts.xsl"
        with open(xsl_file, "r") as xsl_file_handle:
            xslt = ET.parse(xsl_file_handle)
        with open(metadata_file.name, "r") as meta_handle:
            dom = ET.parse(meta_handle)
        transform = ET.XSLT(xslt)
        newdom = transform(
            dom,
            identifier=etree.XSLT.strparam(identifier),
            classifier=etree.XSLT.strparam(classifier),
            rundate=etree.XSLT.strparam(datetime),
        )
        repo_file_string = ET.tostring(newdom)

    if imagexarfile is not None:
        try:
            if verbose:
                print("archive is:", imagexarfile)
            # Read meta.xml from the image xar to reuse its identifier.
            with ZipFile(imagexarfile.name, "r") as archive:
                metadata = archive.read("meta.xml")
            root = ET.fromstring(metadata)
            identifier = get_dc_element_from_metadata("identifier", root)
            if verbose:
                print("Using identifier from image xar file:", identifier)
            repo_file_string = make_text_repo_string(root, datetime)
        except Exception as e:
            # Fix: report the archive we actually failed on.
            print("Failed to open image archive at", imagexarfile.name,
                  "Exiting ...")
            print(e)
            exit(0)

    # Get the final file name and check whether we may clobber it.
    output_file_name = identifier + "-" + datetime + "-" + classifier + "-texts.xar"
    output_file_path = os.path.join(output_dir, output_file_name)
    if os.path.exists(output_file_path) and not clobber:
        print(
            "the output file",
            output_file_path,
            "already exists, and you've set '--clobber' to false, so I'm exiting without doing anything.",
        )
        exit(0)

    # Collect and sort all the hocr files in the input dir.
    types = ["*.hocr", "*.html", "*.xhtml", "*.htm"]
    all_hocr_files = []
    for a_type in types:
        this_type_files = glob.glob(os.path.join(hocr_dir, a_type))
        all_hocr_files += this_type_files
    all_hocr_files.sort()
    if verbose:
        print("Input hocr files:")
        print(all_hocr_files)

    # Copy inputs to a temp dir under sequential names '<id>_0001.html', ...
    xhtml_temp_dir = tempfile.mkdtemp()
    output_counter = 1
    for hocr_file in all_hocr_files:
        fileout_name = identifier + "_" + str(output_counter).zfill(4) + ".html"
        fileout_path = os.path.join(xhtml_temp_dir, fileout_name)
        shutil.copyfile(hocr_file, fileout_path)
        output_counter = output_counter + 1

    # Stylesheet that rewrites every element into the XHTML namespace.
    xslt_to_xhtml = etree.XML("""\
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" xmlns:html='http://www.w3.org/1999/xhtml'>
  <xsl:template match="*">
    <xsl:element name="html:{local-name(.)}">
      <xsl:apply-templates select="@*|*|text()"/>
    </xsl:element>
  </xsl:template>
  <xsl:template match="@*">
    <xsl:attribute name="{name(.)}"><xsl:value-of select="."/></xsl:attribute>
  </xsl:template>
</xsl:stylesheet>""")
    transform_to_xhtml = etree.XSLT(xslt_to_xhtml)

    xhtml_dehyph_temp_dir = tempfile.mkdtemp()
    if verbose:
        print("dehyphenation temp dir:", xhtml_dehyph_temp_dir)
    spinner = Halo(text="dehyphenating", spinner="dots")
    spinner.start()
    all_renamed_hocr_files = os.listdir(xhtml_temp_dir)
    for file_name in all_renamed_hocr_files:
        file_path = os.path.join(xhtml_temp_dir, file_name)
        # Fix: don't shadow 'file_path' with its own handle.
        with open(file_path) as file_handle:
            try:
                tree = etree.parse(file_handle)
                xhtml = transform_to_xhtml(tree)
                if ocr_engine == "kraken":
                    fix_kraken_hocr.get_word_span_area(xhtml, verbose)
                    fix_kraken_hocr.clean_ocr_page_title(xhtml, file_name)
                    try:
                        fix_kraken_hocr.share_space_spans(xhtml, verbose)
                    except Exception:
                        print(traceback.format_exc())
                        exit()
                    fix_kraken_hocr.confidence_summary(xhtml)
                dehyphenate.convert_ocrx_to_ocr(xhtml)
                dehyphenate.remove_meta_tags(xhtml)
                dehyphenate.identify(xhtml)
                dehyphenate.dehyphenate(xhtml, file_name, verbose)
                dehyphenate.add_dublin_core_tags(xhtml)
                out_path = os.path.join(xhtml_dehyph_temp_dir, file_name)
                xhtml.write(out_path, pretty_print=True, xml_declaration=True,
                            encoding="utf-8")
            except Exception as e:
                # Best-effort per file: report and continue with the rest.
                print("This exception was thrown on file {}".format(file_name))
                print(e)
    spinner.stop()

    # Now generate a spellcheck file.
    spinner = Halo(text="spellchecking", spinner="dots")
    spinner.start()
    no_accent_dict_file_path = (Path(__file__).parent /
                                "Dictionaries/unique_no_accent_list.csv")
    # TODO: Parameterize this, so we can set the dictionary on the command
    # line (the 'dictionary_file' argument is currently ignored).
    dictionary_file_path = (Path(__file__).parent /
                            "Dictionaries/english_greek_latin.txt")
    spellcheck_file_path = tempfile.mktemp()
    if verbose:
        print("spellcheck file is:", spellcheck_file_path)
    generate_spellcheck_file.make_spellcheck_file(
        xhtml_dehyph_temp_dir,
        dictionary_file_path,
        no_accent_dict_file_path,
        spellcheck_file_path,
        verbose,
    )
    spellchecked_xhtml_temp_dir = tempfile.mkdtemp()
    if verbose:
        print(
            "temp dir for collecting xar context, including spellchecked hocr: ",
            spellchecked_xhtml_temp_dir,
        )
    spellcheck_hocr.spellcheck(
        spellcheck_file_path,
        xhtml_dehyph_temp_dir,
        spellchecked_xhtml_temp_dir,
        verbose,
    )
    spinner.stop()

    # Make meta.xml for the texts package. (The stylesheet is applied to
    # itself as input; all real content comes from the XSLT parameters.)
    xsl_file = Path(__file__).parent / "XSLT/make_meta_texts.xsl"
    with open(xsl_file, "r") as xsl_file_handle:
        dom = ET.parse(xsl_file_handle)
    with open(xsl_file, "r") as xsl_file_handle:
        xslt = ET.parse(xsl_file_handle)
    transform = ET.XSLT(xslt)
    newdom = transform(
        dom,
        identifier=etree.XSLT.strparam(identifier),
        classifier=etree.XSLT.strparam(classifier),
        rundate=etree.XSLT.strparam(datetime),
        engine=etree.XSLT.strparam(ocr_engine),
    )
    newdom.write(os.path.join(spellchecked_xhtml_temp_dir, "meta.xml"),
                 pretty_print=True)

    # Get accuracy value. NOTE(review): kept for any side effects of
    # assess(); the value itself is no longer used since repo.xml is now
    # written from 'repo_file_string'.
    assessment = str(assess_hocr_dir.assess(spellchecked_xhtml_temp_dir))

    # Write repo.xml from the string prepared earlier.
    # Fix: ET.tostring returns bytes; decode before writing in text mode.
    if isinstance(repo_file_string, bytes):
        repo_file_string = repo_file_string.decode("utf-8")
    with open(os.path.join(spellchecked_xhtml_temp_dir, "repo.xml"),
              "w") as repo_file:
        repo_file.write(repo_file_string)

    # Fix: guard on the object, not on '.name' of a possibly-None object.
    if imagexarfile is not None:
        accuracySvgAndTotals.makeAccuracySVG(spellchecked_xhtml_temp_dir,
                                             imagexarfile.name)

    # Make expath-pkg.xml for the texts package.
    xsl_file = Path(__file__).parent / "XSLT/make_expath_texts.xsl"
    with open(xsl_file, "r") as xsl_file_handle:
        dom = ET.parse(xsl_file_handle)
    with open(xsl_file, "r") as xsl_file_handle:
        xslt = ET.parse(xsl_file_handle)
    transform = ET.XSLT(xslt)
    newdom = transform(
        dom,
        identifier=etree.XSLT.strparam(identifier),
        rundate=etree.XSLT.strparam(datetime),
    )
    newdom.write(os.path.join(spellchecked_xhtml_temp_dir, "expath-pkg.xml"),
                 pretty_print=True)

    # Save static metadata files to the temp dir.
    static_files_dir = Path(__file__).parent / "static_for_text_xar"
    static_files = os.listdir(static_files_dir)
    for file_name in static_files:
        shutil.copy(os.path.join(static_files_dir, file_name),
                    spellchecked_xhtml_temp_dir)

    # Make accuracy totals report.
    accuracySvgAndTotals.makeTotalsFile(spellchecked_xhtml_temp_dir)

    # Generate the xar (zip) by compressing everything in the temp dir.
    output_zip_file_path = os.path.join(
        output_dir,
        identifier + "-" + datetime + "-" + classifier + "-texts.xar")
    with ZipFile(output_zip_file_path, "w") as zipObj:
        for filename in os.listdir(spellchecked_xhtml_temp_dir):
            filePath = os.path.join(spellchecked_xhtml_temp_dir, filename)
            zipObj.write(filePath, basename(filePath))
    print("text archive from date", datetime, "saved to", output_zip_file_path)

    # Clean up (keep temp dirs around in verbose mode for inspection).
    if not verbose:
        for temp_directory in [spellchecked_xhtml_temp_dir,
                               xhtml_dehyph_temp_dir]:
            shutil.rmtree(temp_directory)
        # delete unused spellcheck file
        os.remove(spellcheck_file_path)
class MailMerge(object):
    """Fill MERGEFIELD placeholders in a .docx file.

    On construction the document parts are parsed and every Word merge
    field (both ``fldSimple`` and the ``fldChar begin/end`` form) is
    replaced by a synthetic ``<MergeField name="...">`` element; ``merge``
    then substitutes those elements with real text/image runs, and
    ``write`` serializes the modified parts back into a new zip.
    """

    def __init__(self, file, remove_empty_tables=False):
        # 'file' is a path or file object accepted by ZipFile.
        self.zip = ZipFile(file)
        self.parts = {}               # {ZipInfo: parsed etree} for document parts
        self.settings = None          # parsed settings.xml tree (if present)
        self._settings_info = None    # ZipInfo for settings.xml
        self.media = {}  # new images to add indexed by embed id
        self.rels = None  # etree for relationships
        self._rels_info = None  # zi info block for rels
        self.RELS_NAMESPACES = {'ns': None, 'od': None}
        self.remove_empty_tables = remove_empty_tables
        try:
            content_types = etree.parse(self.zip.open('[Content_Types].xml'))
            # NOTE(review): 'ContentType' % NAMESPACES is a no-op interpolation
            # (no placeholders in the string); the key is just 'ContentType'.
            for file in content_types.findall('{%(ct)s}Override' % NAMESPACES):
                type = file.attrib['ContentType' % NAMESPACES]
                if type in CONTENT_TYPES_PARTS:
                    # 'zi' is bound by the first target before being used as
                    # the dict key — deliberate tuple-assignment trick.
                    zi, self.parts[zi] = self.__get_tree_of_file(file)
                elif type == CONTENT_TYPE_SETTINGS:
                    self._settings_info, self.settings = self.__get_tree_of_file(file)

            # get the rels for image mappings
            try:
                self._rels_info, self.rels = self.__get_tree_of_file('word/_rels/document.xml.rels')
                self.RELS_NAMESPACES['ns'] = self.rels.getroot().nsmap.get(None)
                self.RELS_NAMESPACES['od'] = self.rels.getroot().nsmap.get(None).replace('package', 'officeDocument')
            except:
                # Documents without image relationships simply skip image merging.
                pass

            to_delete = []

            # Matches ' MERGEFIELD "name" \* MERGEFORMAT ' instructions.
            r = re.compile(r' MERGEFIELD +"?([^ ]+?)"? +(|\\\* MERGEFORMAT )', re.I)
            for part in self.parts.values():
                # Simple form: a single fldSimple element carries the instruction.
                for parent in part.findall('.//{%(w)s}fldSimple/..' % NAMESPACES):
                    for idx, child in enumerate(parent):
                        if child.tag != '{%(w)s}fldSimple' % NAMESPACES:
                            continue
                        instr = child.attrib['{%(w)s}instr' % NAMESPACES]
                        m = r.match(instr)
                        if m is None:
                            continue
                        parent[idx] = Element('MergeField', name=m.group(1))

                # Complex form: instruction text spread over runs between
                # fldChar 'begin' and 'end' markers.
                for parent in part.findall('.//{%(w)s}instrText/../..' % NAMESPACES):
                    children = list(parent)
                    fields = zip(
                        [children.index(e) for e in
                         parent.findall('{%(w)s}r/{%(w)s}fldChar[@{%(w)s}fldCharType="begin"]/..' % NAMESPACES)],
                        [children.index(e) for e in
                         parent.findall('{%(w)s}r/{%(w)s}fldChar[@{%(w)s}fldCharType="end"]/..' % NAMESPACES)]
                    )

                    for idx_begin, idx_end in fields:
                        # consolidate all instrText nodes between 'begin' and 'end'
                        # into a single node
                        begin = children[idx_begin]
                        instr_elements = [
                            e for e in begin.getparent().findall('{%(w)s}r/{%(w)s}instrText' % NAMESPACES)
                            if idx_begin < children.index(e.getparent()) < idx_end
                        ]
                        if len(instr_elements) == 0:
                            continue

                        # set the text of the first instrText element to the
                        # concatenation of all the instrText element texts
                        instr_text = ''.join([e.text for e in instr_elements])
                        instr_elements[0].text = instr_text

                        # delete all instrText elements except the first
                        for instr in instr_elements[1:]:
                            instr.getparent().remove(instr)

                        m = r.match(instr_text)
                        if m is None:
                            continue

                        parent[idx_begin] = Element('MergeField', name=m.group(1))

                        # use this so we know *where* to put the replacement
                        instr_elements[0].tag = 'MergeText'
                        block = instr_elements[0].getparent()
                        # append the other tags in the w:r block too
                        parent[idx_begin].extend(list(block))

                        # queue the now-redundant siblings for removal (done
                        # after iteration to avoid mutating while traversing)
                        to_delete += [(parent, parent[i + 1])
                                      for i in range(idx_begin, idx_end)]

            for parent, child in to_delete:
                parent.remove(child)

            # Remove mail merge settings to avoid error messages when opening
            # document in Winword
            if self.settings:
                settings_root = self.settings.getroot()
                mail_merge = settings_root.find('{%(w)s}mailMerge' % NAMESPACES)
                if mail_merge is not None:
                    settings_root.remove(mail_merge)
        except:
            # Don't leak the zip handle if parsing blows up.
            self.zip.close()
            raise

    def __get_tree_of_file(self, file):
        """Return (ZipInfo, parsed tree) for a Content_Types Override element
        or a plain archive path string."""
        if isinstance(file, etree._Element):
            fn = file.get('PartName').split('/', 1)[1]
        else:
            fn = file
        zi = self.zip.getinfo(fn)
        return zi, etree.parse(self.zip.open(zi))

    def write(self, file):
        """Serialize the merged document to 'file' (path or file object)."""
        # Replace all remaining merge fields with empty values
        for field in self.get_merge_fields():
            self.merge(**{field: ''})

        with ZipFile(file, 'w', ZIP_DEFLATED) as output:
            for zi in self.zip.filelist:
                if zi in self.parts:
                    xml = etree.tostring(self.parts[zi].getroot())
                    output.writestr(zi.filename, xml)
                elif zi == self._settings_info:
                    xml = etree.tostring(self.settings.getroot())
                    output.writestr(zi.filename, xml)
                elif zi == self._rels_info:
                    xml = etree.tostring(self.rels.getroot())
                    output.writestr(zi.filename, xml)
                else:
                    # Untouched parts are copied through byte-for-byte.
                    output.writestr(zi.filename, self.zip.read(zi))
            # add new images to media folder is we have images merged
            for img_id, img_data in self.media.items():
                output.writestr('media/{}.png'.format(img_id), img_data)

    def get_merge_fields(self, parts=None):
        """Return the set of merge-field names present in 'parts'
        (default: all document parts)."""
        if not parts:
            parts = self.parts.values()
        fields = set()
        for part in parts:
            for mf in part.findall('.//MergeField'):
                fields.add(mf.attrib['name'])
        return fields

    def merge_templates(self, replacements, separator):
        """Duplicate the template once per dict in 'replacements', merging
        each copy and joining the copies with the given separator.

        separator must be:
        - page_break : Page Break.
        - column_break : Column Break. ONLY HAS EFFECT IF DOCUMENT HAS COLUMNS
        - textWrapping_break : Line Break.
        - continuous_section : Continuous section break. Begins the section on
          the next paragraph.
        - evenPage_section : evenPage section break. Section begins on the next
          even-numbered page, leaving the next odd page blank if necessary.
        - nextColumn_section : nextColumn section break. Section begins on the
          following column on the page. ONLY HAS EFFECT IF DOCUMENT HAS COLUMNS
        - nextPage_section : nextPage section break. Section begins on the
          following page.
        - oddPage_section : oddPage section break. Section begins on the next
          odd-numbered page, leaving the next even page blank if necessary.
        """
        # TYPE PARAM CONTROL AND SPLIT
        valid_separators = {'page_break', 'column_break', 'textWrapping_break',
                            'continuous_section', 'evenPage_section',
                            'nextColumn_section', 'nextPage_section',
                            'oddPage_section'}
        if not separator in valid_separators:
            raise ValueError("Invalid separator argument")
        type, sepClass = separator.split("_")

        # GET ROOT - WORK WITH DOCUMENT
        for part in self.parts.values():
            root = part.getroot()

            # Headers and footers are merged per copy elsewhere; skip them here.
            tag = root.tag
            if tag == '{%(w)s}ftr' % NAMESPACES or tag == '{%(w)s}hdr' % NAMESPACES:
                continue

            if sepClass == 'section':
                # FINDING FIRST SECTION OF THE DOCUMENT
                firstSection = root.find("w:body/w:p/w:pPr/w:sectPr", namespaces=NAMESPACES)
                if firstSection == None:
                    firstSection = root.find("w:body/w:sectPr", namespaces=NAMESPACES)

                # MODIFY TYPE ATTRIBUTE OF FIRST SECTION FOR MERGING
                nextPageSec = deepcopy(firstSection)
                for child in nextPageSec:
                    # Delete old type if exist
                    if child.tag == '{%(w)s}type' % NAMESPACES:
                        nextPageSec.remove(child)
                # Create new type (def parameter)
                newType = etree.SubElement(nextPageSec, '{%(w)s}type' % NAMESPACES)
                newType.set('{%(w)s}val' % NAMESPACES, type)

                # REPLACING FIRST SECTION
                secRoot = firstSection.getparent()
                secRoot.replace(firstSection, nextPageSec)

            # FINDING LAST SECTION OF THE DOCUMENT
            lastSection = root.find("w:body/w:sectPr", namespaces=NAMESPACES)

            # SAVING LAST SECTION
            mainSection = deepcopy(lastSection)
            lsecRoot = lastSection.getparent()
            lsecRoot.remove(lastSection)

            # COPY CHILDREN ELEMENTS OF BODY IN A LIST
            childrenList = root.findall('w:body/*', namespaces=NAMESPACES)

            # DELETE ALL CHILDREN OF BODY
            for child in root:
                if child.tag == '{%(w)s}body' % NAMESPACES:
                    child.clear()

            # REFILL BODY AND MERGE DOCS - ADD LAST SECTION ENCAPSULATED OR NOT
            lr = len(replacements)
            lc = len(childrenList)
            parts = []
            for i, repl in enumerate(replacements):
                for (j, n) in enumerate(childrenList):
                    element = deepcopy(n)
                    for child in root:
                        if child.tag == '{%(w)s}body' % NAMESPACES:
                            child.append(element)
                            parts.append(element)
                            # After the last child of a copy, append either the
                            # final section (last copy) or a separator.
                            if (j + 1) == lc:
                                if (i + 1) == lr:
                                    child.append(mainSection)
                                    parts.append(mainSection)
                                else:
                                    if sepClass == 'section':
                                        intSection = deepcopy(mainSection)
                                        p = etree.SubElement(child, '{%(w)s}p' % NAMESPACES)
                                        pPr = etree.SubElement(p, '{%(w)s}pPr' % NAMESPACES)
                                        pPr.append(intSection)
                                        parts.append(p)
                                    elif sepClass == 'break':
                                        pb = etree.SubElement(child, '{%(w)s}p' % NAMESPACES)
                                        r = etree.SubElement(pb, '{%(w)s}r' % NAMESPACES)
                                        nbreak = Element('{%(w)s}br' % NAMESPACES)
                                        nbreak.attrib['{%(w)s}type' % NAMESPACES] = type
                                        r.append(nbreak)

                # Merge only the elements belonging to this copy.
                self.merge(parts, **repl)

    def merge_pages(self, replacements):
        """Deprecated: use :meth:`merge_templates` with 'page_break'."""
        warnings.warn("merge_pages has been deprecated in favour of merge_templates",
                      category=DeprecationWarning,
                      stacklevel=2)
        self.merge_templates(replacements, "page_break")

    def merge(self, parts=None, **replacements):
        """Replace merge fields by name; list values trigger row merging."""
        if not parts:
            parts = self.parts.values()
        for field, replacement in replacements.items():
            if isinstance(replacement, list):
                self.merge_rows(field, replacement)
            else:
                for part in parts:
                    self.__merge_field(part, field, replacement)

    def __merge_field(self, part, field, text):
        # 'IMAGE:<name>' fields embed binary image data instead of text.
        if field.startswith('IMAGE:'):
            _, img_name = field.split(':')
            inline_img_el = part.find('.//wp:docPr[@title="{}"]/..'.format(img_name),
                                      namespaces=NAMESPACES)
            # NOTE(review): element truthiness is deprecated in lxml; these
            # 'if <element>' tests rely on legacy behavior — confirm intent.
            if inline_img_el:
                embed_node = inline_img_el.find('.//a:blip', namespaces=NAMESPACES)
                if embed_node:
                    # generate a random id and add tp media list for later
                    # export to media folder in zip file
                    img_id = 'MMR{}'.format(randint(10000000, 999999999))
                    self.media[img_id] = text
                    # add a relationship
                    last_img_relationship = \
                        self.rels.findall('{%(ns)s}Relationship[@Type="%(od)s/image"]' % self.RELS_NAMESPACES)[-1]
                    new_img_relationship = deepcopy(last_img_relationship)
                    new_img_relationship.set('Id', img_id)
                    new_img_relationship.set('Target', '/media/{}.png'.format(img_id))
                    self.rels.getroot().append(new_img_relationship)
                    # replace the embed attrib with the new image_id
                    embed_node = inline_img_el.find('.//a:blip',
                                                    namespaces=NAMESPACES)
                    # NOTE(review): attrib.keys()[0] is Python-2-only; under
                    # Python 3 this needs list(...) — confirm target runtime.
                    embed_attr = embed_node.attrib.keys()[0]
                    embed_node.attrib[embed_attr] = img_id
                    # mark as done
                    inline_img_el.find('wp:docPr', namespaces=NAMESPACES).attrib['title'] = 'replaced_image_{}'.format(
                        img_id)
            return

        for mf in part.findall('.//MergeField[@name="%s"]' % field):
            children = list(mf)
            mf.clear()  # clear away the attributes
            mf.tag = '{%(w)s}r' % NAMESPACES
            mf.extend(children)

            nodes = []
            # preserve new lines in replacement text
            text = text or ''  # text might be None
            text_parts = text.replace('\r', '').split('\n')
            for i, text_part in enumerate(text_parts):
                text_node = Element('{%(w)s}t' % NAMESPACES)
                text_node.text = text_part
                nodes.append(text_node)

                # if not last node add new line node
                if i < (len(text_parts) - 1):
                    nodes.append(Element('{%(w)s}br' % NAMESPACES))

            ph = mf.find('MergeText')
            if ph is not None:
                # add text nodes at the exact position where
                # MergeText was found
                index = mf.index(ph)
                for node in reversed(nodes):
                    mf.insert(index, node)
                mf.remove(ph)
            else:
                mf.extend(nodes)

    def merge_rows(self, anchor, rows):
        """Duplicate the table row containing merge field 'anchor' once per
        dict in 'rows', merging each copy."""
        table, idx, template = self.__find_row_anchor(anchor)
        if table is not None:
            if len(rows) > 0:
                del table[idx]
                for i, row_data in enumerate(rows):
                    row = deepcopy(template)
                    self.merge([row], **row_data)
                    table.insert(idx + i, row)
            else:
                # if there is no data for a given table
                # we check whether table needs to be removed
                if self.remove_empty_tables:
                    parent = table.getparent()
                    parent.remove(table)

    def __find_row_anchor(self, field, parts=None):
        """Return (table, row index, row) for the first row containing the
        given merge field, or (None, None, None)."""
        if not parts:
            parts = self.parts.values()
        for part in parts:
            for table in part.findall('.//{%(w)s}tbl' % NAMESPACES):
                for idx, row in enumerate(table):
                    if row.find('.//MergeField[@name="%s"]' % field) is not None:
                        return table, idx, row
        return None, None, None

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def close(self):
        # Idempotent: safe to call more than once.
        if self.zip is not None:
            try:
                self.zip.close()
            finally:
                self.zip = None
def _make_super_runtime(capsule, output, licfile=None, platforms=None,
                        restrict=True, suffix=''):
    """Generate the super-mode runtime extension(s) into 'output'.

    Reads 'pytransform.key' from the capsule zip, builds the license key,
    then copies each platform's pytransform extension into 'output' and
    patches it with the key material.

    :param capsule: path to the capsule zip archive.
    :param output: destination directory (created if missing).
    :param licfile: optional license file passed to _build_license_file.
    :param platforms: list of platform ids; default is the current platform.
    :param restrict: passed through to _build_platforms.
    :param suffix: optional suffix inserted after 'pytransform' in the
        extension file name (to disambiguate multiple platforms).
    :returns: list of per-file checksums (sum of patched bytes).
    :raises RuntimeError: if the capsule has no key, or two platforms would
        produce the same extension file name.
    """
    logging.info('Generating super runtime library to %s', relpath(output))
    if not os.path.exists(output):
        os.makedirs(output)

    supermode = True
    if not platforms:
        platid = _format_platid()
        filelist = _build_platforms([platid], restrict, supermode)[:1]
    elif len(platforms) == 1:
        filelist = _build_platforms(platforms, restrict, supermode)[:1]
    else:
        filelist = _build_platforms(platforms, restrict, supermode)

    # Fix: context manager guarantees the capsule handle is closed even when
    # the key is missing and RuntimeError is raised (previous code leaked it).
    with ZipFile(capsule, 'r') as myzip:
        if 'pytransform.key' not in myzip.namelist():
            raise RuntimeError('No pytransform.key found in capsule')
        logging.info('Extract pytransform.key')
        keydata = myzip.read('pytransform.key')

    lickey = _build_license_file(capsule, licfile)

    # The first 4 bytes hold two little-endian 16-bit sizes; Python 2 reads
    # bytes as str, hence the ord() branch.
    if sys.version_info.major == 2:
        size1 = ord(keydata[0]) + ord(keydata[1]) * 256
        size2 = ord(keydata[2]) + ord(keydata[3]) * 256
    else:
        size1 = keydata[0] + keydata[1] * 256
        size2 = keydata[2] + keydata[3] * 256

    k1 = 16
    k2 = k1 + size1
    keylist = keydata[k1:k2], keydata[k2:k2 + size2], lickey

    namelist = []
    checklist = []
    for filename in filelist:
        logging.info('Copying %s', filename)
        name = os.path.basename(filename)

        if suffix:
            # Insert the suffix right after the 'pytransform' stem.
            k = name.rfind('pytransform') + len('pytransform')
            name = name[:k] + suffix + name[k:]
            logging.info('Rename extension to %s', name)

        if name in namelist:
            # Fix: corrected typo 'confilt' -> 'conflict' in the message.
            raise RuntimeError('Multiple platforms conflict with '
                               'same extension name "%s"' % name)
        namelist.append(name)
        target = os.path.join(output, name)
        shutil.copy2(filename, target)

        logging.info('Patch extension %s', target)
        data = _patch_extension(target, keylist, suffix)
        with open(target, 'wb') as f:
            f.write(data)
        checklist.append(sum(data))

    logging.info('Generate runtime files OK')
    return checklist
class BundleProjectStore(ProjectStore):
    """Represents a translate project bundle (zip archive).

    Project files live inside the zip; reading a file extracts it to a
    temp file (tracked in ``self._tempfiles`` as {temp path: project
    name}), and any modification rewrites the whole archive via a
    temporary replacement zip. NOTE(review): uses Python-2 APIs
    (``basestring``, ``dict.iteritems``) — this class predates Python 3.
    """

    # INITIALIZERS #
    def __init__(self, fname):
        super(BundleProjectStore, self).__init__()
        # Maps temp-file path -> project file name for extracted files.
        self._tempfiles = {}
        if fname and os.path.isfile(fname):
            self.load(fname)
        else:
            # Create a fresh bundle, persist the settings, then reopen in
            # append mode for subsequent additions.
            self.zip = ZipFile(fname, 'w')
            self.save()
            self.zip.close()
            self.zip = ZipFile(fname, 'a')

    # CLASS METHODS #
    @classmethod
    def from_project(cls, proj, fname=None):
        """Build a bundle from an existing project, copying all of its
        source/translation/target files and settings."""
        if fname is None:
            fname = 'bundle.zip'

        bundle = BundleProjectStore(fname)
        for fn in proj.sourcefiles:
            bundle.append_sourcefile(proj.get_file(fn))
        for fn in proj.transfiles:
            bundle.append_transfile(proj.get_file(fn))
        for fn in proj.targetfiles:
            bundle.append_targetfile(proj.get_file(fn))
        bundle.settings = proj.settings.copy()
        bundle.save()
        return bundle

    # METHODS #
    def append_file(self, afile, fname, ftype='trans', delete_orig=False):
        """Append the given file to the project with the given filename,
        marked to be of type ``ftype`` ('src', 'trans', 'tgt').

        :param delete_orig: If ``True``, as set by
            :meth:`~translate.storage.Project.convert_forward`, ``afile``
            is deleted after appending, if possible.

        .. note:: For this implementation, the appended file will be
            deleted from disk if ``delete_orig`` is ``True``.
        """
        if fname and fname in self.zip.namelist():
            raise ValueError("File already in bundle archive: %s" % (fname))
        if not fname and isinstance(afile, basestring) and afile in self.zip.namelist():
            raise ValueError("File already in bundle archive: %s" % (afile))

        afile, fname = super(BundleProjectStore, self).append_file(afile, fname, ftype)
        self._zip_add(fname, afile)

        if delete_orig and hasattr(afile, 'name') and afile.name not in self._tempfiles:
            try:
                os.unlink(afile.name)
            except Exception:
                # Best-effort delete; failure to remove the original is not fatal.
                pass

        return self.get_file(fname), fname

    def remove_file(self, fname, ftype=None):
        """Remove the file with the given project name from the project."""
        super(BundleProjectStore, self).remove_file(fname, ftype)
        self._zip_delete([fname])
        # Also drop any temp file previously extracted for this project file.
        tempfiles = [tmpf for tmpf, prjf in self._tempfiles.iteritems()
                     if prjf == fname]
        if tempfiles:
            for tmpf in tempfiles:
                try:
                    os.unlink(tmpf)
                except Exception:
                    pass
                del self._tempfiles[tmpf]

    def close(self):
        super(BundleProjectStore, self).close()
        self.cleanup()
        self.zip.close()

    def cleanup(self):
        """Clean up our mess: remove temporary files."""
        for tempfname in self._tempfiles:
            if os.path.isfile(tempfname):
                os.unlink(tempfname)
        self._tempfiles = {}

    def get_file(self, fname):
        """Retrieve a project file (source, translation or target file) from
        the project archive."""
        retfile = None
        if fname in self._files or fname in self.zip.namelist():
            # Check if the file has not already been extracted to a temp file
            tempfname = [tfn for tfn in self._tempfiles
                         if self._tempfiles[tfn] == fname]
            if tempfname and os.path.isfile(tempfname[0]):
                tempfname = tempfname[0]
            else:
                tempfname = ''
            if not tempfname:
                # Extract the file to a temporary file
                zfile = self.zip.open(fname)
                tempfname = os.path.split(fname)[-1]
                tempfd, tempfname = tempfile.mkstemp(suffix='_' + tempfname)
                os.close(tempfd)
                open(tempfname, 'w').write(zfile.read())
            retfile = open(tempfname)
            self._tempfiles[tempfname] = fname

        if not retfile:
            raise FileNotInProjectError(fname)
        return retfile

    def get_proj_filename(self, realfname):
        """Try and find a project file name for the given real file name."""
        try:
            fname = super(BundleProjectStore, self).get_proj_filename(realfname)
        except ValueError as ve:
            # Fall through to the temp-file mapping below.
            fname = None
        if fname:
            return fname
        if realfname in self._tempfiles:
            return self._tempfiles[realfname]
        raise ValueError('Real file not in project store: %s' % (realfname))

    def load(self, zipname):
        """Load the bundle project from the zip file of the given name."""
        self.zip = ZipFile(zipname, mode='a')
        self._load_settings()

        append_section = {
            'sources': self._sourcefiles.append,
            'targets': self._targetfiles.append,
            'transfiles': self._transfiles.append,
        }
        for section in ('sources', 'targets', 'transfiles'):
            if section in self.settings:
                for fname in self.settings[section]:
                    append_section[section](fname)
                    # Lazily loaded: None until get_file() extracts it.
                    self._files[fname] = None

    def save(self, filename=None):
        """Save all project files to the bundle zip file."""
        self._update_from_tempfiles()

        if filename:
            newzip = ZipFile(filename, 'w')
        else:
            newzip = self._create_temp_zipfile()

        # Write project file for the new zip bundle
        newzip.writestr('project.xtp', self._generate_settings())
        # Copy project files from project to the new zip file
        project_files = self._sourcefiles + self._transfiles + self._targetfiles
        for fname in project_files:
            newzip.writestr(fname, self.get_file(fname).read())
        # Copy any extra (non-project) files from the current zip
        for fname in self.zip.namelist():
            if fname in project_files or fname == 'project.xtp':
                continue
            newzip.writestr(fname, self.zip.read(fname))

        self._replace_project_zip(newzip)

    def update_file(self, pfname, infile):
        """Updates the file with the given project file name with the
        contents of ``infile``.

        :returns: the results from :meth:`BundleProjStore.append_file`."""
        if pfname not in self._files:
            raise FileNotInProjectError(pfname)
        if pfname not in self.zip.namelist():
            # Known to the project but not yet in the archive: plain append.
            return super(BundleProjectStore, self).update_file(pfname, infile)

        self._zip_delete([pfname])
        self._zip_add(pfname, infile)

    def _load_settings(self):
        """Grab the project.xtp file from the zip file and load it."""
        if 'project.xtp' not in self.zip.namelist():
            raise InvalidBundleError('Not a translate project bundle')
        super(BundleProjectStore, self)._load_settings(self.zip.open('project.xtp').read())

    def _create_temp_zipfile(self):
        """Create a new zip file with a temporary file name (with mode 'w')."""
        newzipfd, newzipfname = tempfile.mkstemp(prefix='translate_bundle',
                                                 suffix='.zip')
        os.close(newzipfd)
        return ZipFile(newzipfname, 'w')

    def _replace_project_zip(self, zfile):
        """Replace the currently used zip file (``self.zip``) with the given
        zip file. Basically, ``os.rename(zfile.filename,
        self.zip.filename)``."""
        if not zfile.fp.closed:
            zfile.close()
        if not self.zip.fp.closed:
            self.zip.close()
        shutil.move(zfile.filename, self.zip.filename)
        # Reopen in append mode so further additions are possible.
        self.zip = ZipFile(self.zip.filename, mode='a')

    def _update_from_tempfiles(self):
        """Update project files from temporary files."""
        for tempfname in self._tempfiles:
            tmp = open(tempfname)
            self.update_file(self._tempfiles[tempfname], tmp)
            if not tmp.closed:
                tmp.close()

    def _zip_add(self, pfname, infile):
        """Add the contents of ``infile`` to the zip with file name
        ``pfname``."""
        if hasattr(infile, 'seek'):
            infile.seek(0)
        self.zip.writestr(pfname, infile.read())
        # Clear the cached file object to force the file to be read from the
        # zip file.
        self._files[pfname] = None

    def _zip_delete(self, fnames):
        """Delete the files with the given names from the zip file
        (``self.zip``)."""
        # Sanity checking
        if not isinstance(fnames, (list, tuple)):
            raise ValueError("fnames must be list or tuple: %s" % (fnames))
        if not self.zip:
            raise ValueError("No zip file to work on")
        zippedfiles = self.zip.namelist()
        for fn in fnames:
            if fn not in zippedfiles:
                raise KeyError("File not in zip archive: %s" % (fn))

        # zipfile cannot delete in place: rebuild the archive without the
        # doomed entries and swap it in.
        newzip = self._create_temp_zipfile()
        newzip.writestr('project.xtp', self._generate_settings())

        for fname in zippedfiles:
            # Copy all files from self.zip that are not project.xtp (already
            # in the new zip file) or in fnames (they are to be removed, after
            # all.
            if fname in fnames or fname == 'project.xtp':
                continue
            newzip.writestr(fname, self.zip.read(fname))

        self._replace_project_zip(newzip)
class XPCShellRemote(xpcshell.XPCShellTests, object):
    """Run xpcshell tests on a remote Android device via mozdevice/adb.

    Pushes the xpcshell binary, support binaries, libraries, modules and the
    test directory to the device, then drives test execution through small
    shell wrapper scripts (to keep adb command lines short).
    """

    def __init__(self, devmgr, options, log):
        xpcshell.XPCShellTests.__init__(self, log)
        # Add Android version (SDK level) to mozinfo so that manifest entries
        # can be conditional on android_version.
        androidVersion = devmgr.shellCheckOutput(
            ['getprop', 'ro.build.version.sdk'])
        mozinfo.info['android_version'] = androidVersion
        self.localLib = options.localLib
        self.localBin = options.localBin
        self.options = options
        self.device = devmgr
        self.pathMapping = []
        self.remoteTestRoot = "%s/xpc" % self.device.deviceRoot
        # remoteBinDir contains xpcshell and its wrapper script, both of which must
        # be executable. Since +x permissions cannot usually be set on /mnt/sdcard,
        # and the test root may be on /mnt/sdcard, remoteBinDir is set to be on
        # /data/local, always.
        self.remoteBinDir = "/data/local/xpcb"
        # Terse directory names are used here ("c" for the components directory)
        # to minimize the length of the command line used to execute
        # xpcshell on the remote device. adb has a limit to the number
        # of characters used in a shell command, and the xpcshell command
        # line can be quite complex.
        self.remoteTmpDir = remoteJoin(self.remoteTestRoot, "tmp")
        self.remoteScriptsDir = self.remoteTestRoot
        self.remoteComponentsDir = remoteJoin(self.remoteTestRoot, "c")
        self.remoteModulesDir = remoteJoin(self.remoteTestRoot, "m")
        self.remoteMinidumpDir = remoteJoin(self.remoteTestRoot, "minidumps")
        self.remoteClearDirScript = remoteJoin(self.remoteBinDir, "cleardir")
        self.profileDir = remoteJoin(self.remoteTestRoot, "p")
        self.remoteDebugger = options.debugger
        self.remoteDebuggerArgs = options.debuggerArgs
        self.testingModulesDir = options.testingModulesDir
        self.env = {}

        # Locate the local xpcshell test directory: prefer the objdir, fall
        # back to a 'tests' directory next to this script.
        if self.options.objdir:
            self.xpcDir = os.path.join(self.options.objdir, "_tests/xpcshell")
        elif os.path.isdir(os.path.join(here, 'tests')):
            self.xpcDir = os.path.join(here, 'tests')
        else:
            print >> sys.stderr, "Couldn't find local xpcshell test directory"
            sys.exit(1)

        if options.localAPK:
            self.localAPKContents = ZipFile(options.localAPK)
        if options.setup:
            self.setupUtilities()
            self.setupModules()
            self.setupTestDir()
            self.setupMinidumpDir()
        self.remoteAPK = None
        if options.localAPK:
            self.remoteAPK = remoteJoin(self.remoteBinDir,
                                        os.path.basename(options.localAPK))
            self.setAppRoot()

        # data that needs to be passed to the RemoteXPCShellTestThread
        self.mobileArgs = {
            'device': self.device,
            'remoteBinDir': self.remoteBinDir,
            'remoteScriptsDir': self.remoteScriptsDir,
            'remoteComponentsDir': self.remoteComponentsDir,
            'remoteModulesDir': self.remoteModulesDir,
            'options': self.options,
            'remoteDebugger': self.remoteDebugger,
            'pathMapping': self.pathMapping,
            'profileDir': self.profileDir,
            'remoteTmpDir': self.remoteTmpDir,
            'remoteMinidumpDir': self.remoteMinidumpDir,
            'remoteClearDirScript': self.remoteClearDirScript,
        }
        if self.remoteAPK:
            self.mobileArgs['remoteAPK'] = self.remoteAPK

    def setLD_LIBRARY_PATH(self):
        """Point the dynamic linker at the pushed libraries."""
        self.env["LD_LIBRARY_PATH"] = self.remoteBinDir

    def pushWrapper(self):
        """Create and push the 'xpcw' and 'cleardir' helper shell scripts."""
        # Rather than executing xpcshell directly, this wrapper script is
        # used. By setting environment variables and the cwd in the script,
        # the length of the per-test command line is shortened. This is
        # often important when using ADB, as there is a limit to the length
        # of the ADB command line.
        localWrapper = tempfile.mktemp()
        f = open(localWrapper, "w")
        f.write("#!/system/bin/sh\n")
        for envkey, envval in self.env.iteritems():
            f.write("export %s=%s\n" % (envkey, envval))
        f.writelines([
            "cd $1\n",
            "echo xpcw: cd $1\n",
            "shift\n",
            "echo xpcw: xpcshell \"$@\"\n",
            "%s/xpcshell \"$@\"\n" % self.remoteBinDir])
        f.close()
        remoteWrapper = remoteJoin(self.remoteBinDir, "xpcw")
        self.device.pushFile(localWrapper, remoteWrapper)
        os.remove(localWrapper)

        # Removing and re-creating a directory is a common operation which
        # can be implemented more efficiently with a shell script.
        localWrapper = tempfile.mktemp()
        f = open(localWrapper, "w")
        # The directory may not exist initially, so rm may fail. 'rm -f' is not
        # supported on some Androids. Similarly, 'test' and 'if [ -d ]' are not
        # universally available, so we just ignore errors from rm.
        f.writelines(
            ["#!/system/bin/sh\n",
             "rm -r \"$1\"\n",
             "mkdir \"$1\"\n"])
        f.close()
        self.device.pushFile(localWrapper, self.remoteClearDirScript)
        os.remove(localWrapper)

        self.device.chmodDir(self.remoteBinDir)

    def buildEnvironment(self):
        """Build the environment dict exported by the wrapper script."""
        self.buildCoreEnvironment()
        self.setLD_LIBRARY_PATH()
        self.env["MOZ_LINKER_CACHE"] = self.remoteBinDir
        if self.options.localAPK and self.appRoot:
            self.env["GRE_HOME"] = self.appRoot
        self.env["XPCSHELL_TEST_PROFILE_DIR"] = self.profileDir
        self.env["TMPDIR"] = self.remoteTmpDir
        self.env["HOME"] = self.profileDir
        self.env["XPCSHELL_TEST_TEMP_DIR"] = self.remoteTmpDir
        self.env["XPCSHELL_MINIDUMP_DIR"] = self.remoteMinidumpDir
        if self.options.setup:
            self.pushWrapper()

    def setAppRoot(self):
        # Determine the application root directory associated with the package
        # name used by the Fennec APK.
        self.appRoot = None
        packageName = None
        if self.options.localAPK:
            try:
                packageName = self.localAPKContents.read("package-name.txt")
                if packageName:
                    self.appRoot = self.device.getAppRoot(packageName.strip())
            except Exception as detail:
                # Best-effort: failure just leaves appRoot as None.
                print "unable to determine app root: " + str(detail)
                pass
        return None

    def setupUtilities(self):
        """Create remote directories and push binaries/components needed by
        all tests."""
        if (not self.device.dirExists(self.remoteBinDir)):
            # device.mkDir may fail here where shellCheckOutput may succeed -- see bug 817235
            try:
                self.device.shellCheckOutput(["mkdir", self.remoteBinDir])
            except mozdevice.DMError:
                # Might get a permission error; try again as root, if available
                self.device.shellCheckOutput(["mkdir", self.remoteBinDir],
                                             root=True)
                self.device.shellCheckOutput(
                    ["chmod", "777", self.remoteBinDir], root=True)
        remotePrefDir = remoteJoin(self.remoteBinDir, "defaults/pref")
        if (self.device.dirExists(self.remoteTmpDir)):
            self.device.removeDir(self.remoteTmpDir)
        self.device.mkDir(self.remoteTmpDir)
        if (not self.device.dirExists(remotePrefDir)):
            self.device.mkDirs(remoteJoin(remotePrefDir, "extra"))
        if (not self.device.dirExists(self.remoteScriptsDir)):
            self.device.mkDir(self.remoteScriptsDir)
        if (not self.device.dirExists(self.remoteComponentsDir)):
            self.device.mkDir(self.remoteComponentsDir)

        local = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'head.js')
        remoteFile = remoteJoin(self.remoteScriptsDir, "head.js")
        self.device.pushFile(local, remoteFile)

        # The xpcshell binary is required for all tests. Additional binaries
        # are required for some tests. This list should be similar to
        # TEST_HARNESS_BINS in testing/mochitest/Makefile.in.
        binaries = ["xpcshell",
                    "ssltunnel",
                    "certutil",
                    "pk12util",
                    "BadCertServer",
                    "OCSPStaplingServer",
                    "GenerateOCSPResponse"]
        for fname in binaries:
            local = os.path.join(self.localBin, fname)
            if os.path.isfile(local):
                print >> sys.stderr, "Pushing %s.." % fname
                remoteFile = remoteJoin(self.remoteBinDir, fname)
                self.device.pushFile(local, remoteFile)
            else:
                print >> sys.stderr, "*** Expected binary %s not found in %s!" % (
                    fname, self.localBin)

        local = os.path.join(self.localBin, "components/httpd.js")
        remoteFile = remoteJoin(self.remoteComponentsDir, "httpd.js")
        self.device.pushFile(local, remoteFile)

        local = os.path.join(self.localBin, "components/httpd.manifest")
        remoteFile = remoteJoin(self.remoteComponentsDir, "httpd.manifest")
        self.device.pushFile(local, remoteFile)

        local = os.path.join(self.localBin, "components/test_necko.xpt")
        remoteFile = remoteJoin(self.remoteComponentsDir, "test_necko.xpt")
        self.device.pushFile(local, remoteFile)

        if self.options.localAPK:
            remoteFile = remoteJoin(self.remoteBinDir,
                                    os.path.basename(self.options.localAPK))
            self.device.pushFile(self.options.localAPK, remoteFile)

        self.pushLibs()

    def pushLibs(self):
        """Push shared libraries to the device, szip-decompressing them first
        when an szip binary is available.

        :returns: number of libraries pushed."""
        if self.localBin is not None:
            szip = os.path.join(self.localBin, '..', 'host', 'bin', 'szip')
            if not os.path.exists(szip):
                # Tinderbox builds must run szip from the test package
                szip = os.path.join(self.localBin, 'host', 'szip')
            if not os.path.exists(szip):
                # If the test package doesn't contain szip, it means files
                # are not szipped in the test package.
                szip = None
        else:
            szip = None

        pushed_libs_count = 0
        if self.options.localAPK:
            try:
                dir = tempfile.mkdtemp()
                for info in self.localAPKContents.infolist():
                    if info.filename.endswith(".so"):
                        print >> sys.stderr, "Pushing %s.." % info.filename
                        remoteFile = remoteJoin(
                            self.remoteBinDir, os.path.basename(info.filename))
                        self.localAPKContents.extract(info, dir)
                        localFile = os.path.join(dir, info.filename)
                        if szip:
                            try:
                                out = subprocess.check_output(
                                    [szip, '-d', localFile],
                                    stderr=subprocess.STDOUT)
                            except CalledProcessError:
                                print >> sys.stderr, "Error calling %s on %s.." % (
                                    szip, localFile)
                            # NOTE(review): 'out' is unbound here if
                            # check_output raised -- confirm against upstream.
                            if out:
                                print >> sys.stderr, out
                        self.device.pushFile(localFile, remoteFile)
                        pushed_libs_count += 1
            finally:
                shutil.rmtree(dir)
            return pushed_libs_count

        for file in os.listdir(self.localLib):
            if (file.endswith(".so")):
                print >> sys.stderr, "Pushing %s.." % file
                if 'libxul' in file:
                    print >> sys.stderr, "This is a big file, it could take a while."
                localFile = os.path.join(self.localLib, file)
                remoteFile = remoteJoin(self.remoteBinDir, file)
                if szip:
                    try:
                        out = subprocess.check_output([szip, '-d', localFile],
                                                      stderr=subprocess.STDOUT)
                    except CalledProcessError:
                        print >> sys.stderr, "Error calling %s on %s.." % (
                            szip, localFile)
                    # NOTE(review): 'out' is unbound here if check_output
                    # raised -- confirm against upstream.
                    if out:
                        print >> sys.stderr, out
                self.device.pushFile(localFile, remoteFile)
                pushed_libs_count += 1

        # Additional libraries may be found in a sub-directory such as "lib/armeabi-v7a"
        localArmLib = os.path.join(self.localLib, "lib")
        if os.path.exists(localArmLib):
            for root, dirs, files in os.walk(localArmLib):
                for file in files:
                    if (file.endswith(".so")):
                        print >> sys.stderr, "Pushing %s.." % file
                        localFile = os.path.join(root, file)
                        remoteFile = remoteJoin(self.remoteBinDir, file)
                        if szip:
                            try:
                                out = subprocess.check_output(
                                    [szip, '-d', localFile],
                                    stderr=subprocess.STDOUT)
                            except CalledProcessError:
                                print >> sys.stderr, "Error calling %s on %s.." % (
                                    szip, localFile)
                            # NOTE(review): 'out' is unbound here if
                            # check_output raised -- confirm against upstream.
                            if out:
                                print >> sys.stderr, out
                        self.device.pushFile(localFile, remoteFile)
                        pushed_libs_count += 1

        return pushed_libs_count

    def setupModules(self):
        """Push the testing modules directory, if one was configured."""
        if self.testingModulesDir:
            self.device.pushDir(self.testingModulesDir, self.remoteModulesDir)

    def setupTestDir(self):
        """Push the local xpcshell test tree to the device."""
        print 'pushing %s' % self.xpcDir
        try:
            # The tests directory can be quite large: 5000 files and growing!
            # Sometimes - like on a low-end aws instance running an emulator - the push
            # may exceed the default 5 minute timeout, so we increase it here to 10 minutes.
            self.device.pushDir(self.xpcDir, self.remoteScriptsDir,
                                timeout=600, retryLimit=10)
        except TypeError:
            # Foopies have an older mozdevice ver without retryLimit
            self.device.pushDir(self.xpcDir, self.remoteScriptsDir)

    def setupMinidumpDir(self):
        """Start each run with an empty remote minidump directory."""
        if self.device.dirExists(self.remoteMinidumpDir):
            self.device.removeDir(self.remoteMinidumpDir)
        self.device.mkDir(self.remoteMinidumpDir)

    def buildTestList(self, test_tags=None, test_paths=None):
        """Build the test list, then record a local->remote path mapping for
        every directory that contains tests."""
        xpcshell.XPCShellTests.buildTestList(
            self, test_tags=test_tags, test_paths=test_paths)
        uniqueTestPaths = set([])
        for test in self.alltests:
            uniqueTestPaths.add(test['here'])
        for testdir in uniqueTestPaths:
            abbrevTestDir = os.path.relpath(testdir, self.xpcDir)
            remoteScriptDir = remoteJoin(self.remoteScriptsDir, abbrevTestDir)
            self.pathMapping.append(PathMapping(testdir, remoteScriptDir))
def run(file_name):
    """Extract the embedded C2 configuration from a RAT jar file.

    Tries each known packaging variant (Version A..O) in turn, keying on
    marker file names inside the jar, and decodes the payload with the
    matching hard-coded key recipe.

    :param file_name: path or file-like object accepted by ``ZipFile``.
    :returns: the decoded config (from version_a/b/c/d) for the last variant
              that matched, or ``False`` if no variant matched.
    """
    config_dict = False
    jar = ZipFile(file_name, 'r')
    names = jar.namelist()  # hoisted: reused by every variant check below

    # BUGFIX: the two-file checks originally read
    #   if 'a.txt' and 'b.txt' in jar.namelist():
    # A non-empty string literal is always truthy, so only the second
    # membership test was ever evaluated. Both files are now checked.

    # Version A
    if 'a.txt' in names and 'b.txt' in names:
        pre_key = jar.read('a.txt')
        enckey = ['{0}{1}{0}{1}a'.format('plowkmsssssPosq34r', pre_key),
                  '{0}{1}{0}{1}a'.format('kevthehermitisaGAYXD', pre_key)
                  ]
        coded_jar = jar.read('b.txt')
        config_dict = version_a(enckey, coded_jar)

    # Version B
    if 'ID' in names and 'MANIFEST.MF' in names:
        pre_key = jar.read('ID')
        enckey = ['{0}H3SUW7E82IKQK2J2J2IISIS'.format(pre_key)]
        coded_jar = jar.read('MANIFEST.MF')
        config_dict = version_b(enckey, coded_jar)

    # Version C
    if 'resource/password.txt' in names and 'resource/server.dll' in names:
        pre_key = jar.read('resource/password.txt')
        enckey = ['CJDKSIWKSJDKEIUSYEIDWE{0}'.format(pre_key)]
        coded_jar = jar.read('resource/server.dll')
        config_dict = version_c(enckey, coded_jar)

    # Version D
    if 'java/stubcito.opp' in names and 'java/textito.isn' in names:
        pre_key = jar.read('java/textito.isn')
        enckey = ['TVDKSIWKSJDKEIUSYEIDWE{0}'.format(pre_key)]
        coded_jar = jar.read('java/stubcito.opp')
        config_dict = version_c(enckey, coded_jar)

    # Version E
    if 'java/textito.text' in names and 'java/resource.xsx' in names:
        pre_key = jar.read('java/textito.text')
        enckey = ['kevthehermitGAYGAYXDXD{0}'.format(pre_key)]
        coded_jar = jar.read('java/resource.xsx')
        config_dict = version_c(enckey, coded_jar)

    if 'amarillo/asdasd.asd' in names and 'amarillo/adqwdqwd.asdwf' in names:
        pre_key = jar.read('amarillo/asdasd.asd')
        enckey = ['kevthehermitGAYGAYXDXD{0}'.format(pre_key)]
        coded_jar = jar.read('amarillo/adqwdqwd.asdwf')
        config_dict = version_c(enckey, coded_jar)

    # Version F
    if 'config/config.perl' in names:
        temp_config = xor_config(jar.read('config/config.perl'))
        # temp_config['SERVER'] carries a leading marker byte; strip it.
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['kevthehermitGAYGAYXDXD{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_c(enckey, coded_jar)

    # Version G
    if 'config/config.pl' in names:
        temp_config = xor_config(jar.read('config/config.pl'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['kevthehermitGAYGAYGAYD{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_c(enckey, coded_jar)

    # Version H
    if 'config/config.ini' in names:
        temp_config = xor_config(jar.read('config/config.ini'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['kevthehermitGAYGAYGAYD{0}'.format(temp_config["PASSWORD"]),
                  'kevthehermitGADGAYGAYD{}'.format(temp_config["PASSWORD"])]
        config_dict = version_c(enckey, coded_jar)

    # Version I
    if 'windows/windows.ini' in names:
        temp_config = xor_config(jar.read('windows/windows.ini'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['kevthehermitGADGAYGAYD{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_c(enckey, coded_jar)

    # Version J
    if 'components/linux.plsk' in names:
        temp_config = xor_config(jar.read('components/linux.plsk'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['kevthehermitGADGAYGAYD{0}'.format(temp_config["PASSWORD"]),
                  'LDLDKFJVUI39OWIS9WOQ92{}'.format(temp_config["PASSWORD"])]
        config_dict = version_c(enckey, coded_jar)
        # version_c may return None for this variant; fall back to version_d.
        if config_dict is None:
            config_dict = version_d(enckey, coded_jar)

    # Version K
    if 'components/manifest.ini' in names:
        temp_config = xor_config(jar.read('components/manifest.ini'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['LDLDKFJVUI39OWIS9WOQ93{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_d(enckey, coded_jar)

    # Version L
    if 'components/mac.hwid' in names:
        temp_config = xor_config(jar.read('components/mac.hwid'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['LDLDKFJVUI39OWIS9WOQ92{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_d(enckey, coded_jar)

    # Version M
    if 'components/logo.png' in names:
        temp_config = xor_config(jar.read('components/logo.png'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['LDLDKFJVUI39OWIS9WOQ93{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_d(enckey, coded_jar)

    # Version N
    if 'components/picture.gif' in names:
        temp_config = xor_config(jar.read('components/picture.gif'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['TDLDKFJVUI39OWIS9WOQ93{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_d(enckey, coded_jar)

    # Version O
    if 'klip/clip.mp4' in names:
        temp_config = xor_config(jar.read('klip/clip.mp4'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['TKLDKFJVUI39OWIS9WOQ93{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_d(enckey, coded_jar)

    return config_dict
def preInstallCheck(self, eggInstall=True): """Check that prerequisite zenpacks are installed. Return True if no prereqs specified or if they are present. False otherwise. """ if eggInstall: installedPacks = dict((pack.id, pack.version) \ for pack in self.dataroot.ZenPackManager.packs()) if self.options.installPackName.lower().endswith('.egg'): # standard prebuilt egg if not os.path.exists(self.options.installPackName): raise ZenPackNotFoundException("Unable to find ZenPack named '%s'" % \ self.options.installPackName) zf = ZipFile(self.options.installPackName) if 'EGG-INFO/requires.txt' in zf.namelist(): reqZenpacks = zf.read('EGG-INFO/requires.txt').split('\n') else: return True else: # source egg, no prebuilt egg-info with get_temp_dir() as tempEggDir: cmd = '%s setup.py egg_info -e %s' % \ (binPath('python'), tempEggDir) subprocess.call(cmd, shell=True, stdout=open('/dev/null', 'w'), cwd=self.options.installPackName) eggRequires = os.path.join( tempEggDir, self.options.installPackName + '.egg-info', 'requires.txt') if os.path.isfile(eggRequires): reqZenpacks = open(eggRequires, 'r').read().split('\n') else: return True prereqsMet = True for req in reqZenpacks: if not req.startswith('ZenPacks'): continue for parsed_req in parse_requirements([req]): installed_version = installedPacks.get( parsed_req.project_name, None) if installed_version is None: self.log.error( 'Zenpack %s requires %s' % (self.options.installPackName, parsed_req)) prereqsMet = False else: if not installed_version in parsed_req: self.log.error( 'Zenpack %s requires %s, found: %s' % (self.options.installPackName, parsed_req, installed_version)) prereqsMet = False return prereqsMet if os.path.isfile(self.options.installPackName): zf = ZipFile(self.options.installPackName) for name in zf.namelist(): if name.endswith == '/%s' % CONFIG_FILE: sio = StringIO(zf.read(name)) else: return True else: name = os.path.join(self.options.installPackName, CONFIG_FILE) if os.path.isfile(name): fp = open(name) 
sio = StringIO(fp.read()) fp.close() else: return True parser = ConfigParser.SafeConfigParser() parser.readfp(sio, name) if parser.has_section(CONFIG_SECTION_ABOUT) \ and parser.has_option(CONFIG_SECTION_ABOUT, 'requires'): requires = eval(parser.get(CONFIG_SECTION_ABOUT, 'requires')) if not isinstance(requires, list): requires = [zp.strip() for zp in requires.split(',')] missing = [ zp for zp in requires if zp not in self.dataroot.ZenPackManager.packs.objectIds() ] if missing: self.log.error('ZenPack %s was not installed because' % self.options.installPackName + ' it requires the following ZenPack(s): %s' % ', '.join(missing)) return False return True
def reset_orig_chapters_epub(inputio, outfile):
    """Rebuild an epub with chapter titles reset to their original values.

    Reads ``inputio`` (path or blob), and for each chapter xhtml whose
    ``chapterorigtitle`` meta differs from ``chaptertitle``, restores the
    original title in the chapter body, toc.ncx and nav.xhtml (including the
    per-book toc/nav files of an unmerged anthology). Writes the rebuilt
    epub to ``outfile`` (path or file-like) only when something changed.

    :returns: True if any chapter title was restored, else False.
    """
    inputepub = ZipFile(inputio, 'r')  # works equally well with a path or a blob

    ## build zip in memory in case updating in place(CLI).
    zipio = BytesIO()

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(zipio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    changed = False

    unmerge_tocncxdoms = {}
    ## spin through file contents, saving any unmerge toc.ncx files.
    for zf in inputepub.namelist():
        ## logger.debug("zf:%s"%zf)
        if zf.endswith('/toc.ncx'):
            ## logger.debug("toc.ncx zf:%s"%zf)
            unmerge_tocncxdoms[zf] = parseString(inputepub.read(zf))

    unmerge_navxhtmldoms = {}
    ## spin through file contents, saving any unmerge nav.xhtml files.
    for zf in inputepub.namelist():
        ## logger.debug("zf:%s"%zf)
        if zf.endswith('/nav.xhtml'):
            ## logger.debug("toc.ncx zf:%s"%zf)
            unmerge_navxhtmldoms[zf] = parseString(inputepub.read(zf))

    tocncxdom = parseString(inputepub.read('toc.ncx'))
    if 'nav.xhtml' in inputepub.namelist():
        navxhtmldom = parseString(inputepub.read('nav.xhtml'))
    else:
        navxhtmldom = None

    ## spin through file contents.
    for zf in inputepub.namelist():
        # toc/nav files are handled separately below; mimetype already written.
        if zf not in ['mimetype', 'toc.ncx', 'nav.xhtml'] \
                and not zf.endswith('/toc.ncx') \
                and not zf.endswith('/nav.xhtml'):
            entrychanged = False
            data = inputepub.read(zf)
            # if isinstance(data,unicode):
            #     logger.debug("\n\n\ndata is unicode\n\n\n")
            if re.match(r'.*/file\d+\.xhtml', zf):
                # Chapter files only; everything else is copied through as-is.
                #logger.debug("zf:%s"%zf)
                data = data.decode('utf-8')
                soup = make_soup(data)

                chapterorigtitle = None
                tag = soup.find('meta', {'name': 'chapterorigtitle'})
                if tag:
                    chapterorigtitle = tag['content']

                # toctitle is separate for add_chapter_numbers:toconly users.
                chaptertoctitle = None
                tag = soup.find('meta', {'name': 'chaptertoctitle'})
                if tag:
                    chaptertoctitle = tag['content']
                # NOTE(review): the unconditional assignment below clobbers
                # any chaptertoctitle found above -- upstream likely guards
                # this (e.g. 'elif chapterorigtitle:'); confirm against the
                # original FanFicFare source.
                chaptertoctitle = chapterorigtitle

                chaptertitle = None
                tag = soup.find('meta', {'name': 'chaptertitle'})
                if tag:
                    chaptertitle = tag['content']
                    chaptertitle_tag = tag

                #logger.debug("chaptertitle:(%s) chapterorigtitle:(%s)"%(chaptertitle, chapterorigtitle))
                if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
                    origdata = data
                    # data = data.replace(u'<meta name="chaptertitle" content="'+chaptertitle+u'"></meta>',
                    #                     u'<meta name="chaptertitle" content="'+chapterorigtitle+u'"></meta>')
                    # data = data.replace(u'<title>'+chaptertitle+u'</title>',u'<title>'+chapterorigtitle+u'</title>')
                    # data = data.replace(u'<h3>'+chaptertitle+u'</h3>',u'<h3>'+chapterorigtitle+u'</h3>')
                    chaptertitle_tag['content'] = chapterorigtitle
                    title_tag = soup.find('title')
                    if title_tag and title_tag.string == chaptertitle:
                        title_tag.string.replace_with(chapterorigtitle)
                    h3_tag = soup.find('h3')
                    if h3_tag and h3_tag.string == chaptertitle:
                        h3_tag.string.replace_with(chapterorigtitle)
                    data = unicode(soup)

                    entrychanged = (origdata != data)
                    changed = changed or entrychanged

                    if entrychanged:
                        logger.debug("\nentrychanged:%s\n" % zf)
                        _replace_tocncx(tocncxdom, zf, chaptertoctitle)
                        if navxhtmldom:
                            _replace_navxhtml(navxhtmldom, zf, chaptertoctitle)
                        ## Also look for and update individual
                        ## book toc.ncx files for anthology in case
                        ## it's unmerged.
                        zf_toc = zf[:zf.rfind('/OEBPS/')] + '/toc.ncx'
                        mergedprefix_len = len(zf[:zf.rfind('/OEBPS/')]) + 1

                        if zf_toc in unmerge_tocncxdoms:
                            _replace_tocncx(unmerge_tocncxdoms[zf_toc],
                                            zf[mergedprefix_len:],
                                            chaptertoctitle)
                        # NOTE(review): this looks up a '/toc.ncx' key in the
                        # nav.xhtml dict -- looks like it can never match;
                        # confirm whether a '/nav.xhtml' key was intended.
                        if zf_toc in unmerge_navxhtmldoms:
                            _replace_navxhtml(unmerge_navxhtmldoms[zf_toc],
                                              zf[mergedprefix_len:],
                                              chaptertoctitle)

                outputepub.writestr(zf, data.encode('utf-8'))
            else:
                # possibly binary data, thus no .encode().
                outputepub.writestr(zf, data)

    for tocnm, tocdom in unmerge_tocncxdoms.items():
        outputepub.writestr(tocnm, tocdom.toxml(encoding='utf-8'))
    for navnm, navdom in unmerge_navxhtmldoms.items():
        outputepub.writestr(navnm, navdom.toxml(encoding='utf-8'))

    outputepub.writestr('toc.ncx', tocncxdom.toxml(encoding='utf-8'))
    if navxhtmldom:
        outputepub.writestr('nav.xhtml', navxhtmldom.toxml(encoding='utf-8'))
    outputepub.close()

    # declares all the files created by Windows.  otherwise, when
    # it runs in appengine, windows unzips the files as 000 perms.
    for zf in outputepub.filelist:
        zf.create_system = 0

    # only *actually* write if changed.
    if changed:
        if isinstance(outfile, basestring):
            with open(outfile, "wb") as outputio:
                outputio.write(zipio.getvalue())
        else:
            outfile.write(zipio.getvalue())

    inputepub.close()
    zipio.close()

    return changed
class MovieLens100kDataManager: DOWNLOAD_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip' DEFAULT_PATH = os.path.expanduser('/.ml-100k.zip') @classmethod def _read_interaction(cls, byte_stream): with BytesIO(byte_stream) as ifs: data = pd.read_csv( ifs, sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp']) data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s') return data def __init__(self, zippath=None): if zippath is None: zippath = self.DEFAULT_PATH if not os.path.exists(zippath): download = input( 'Could not find {}.\nCan I download and save it there?[y/N]' .format(zippath)) urllib.request.urlretrieve(self.DOWNLOAD_URL, zippath) if zippath is not None: self.zf = ZipFile(zippath) else: self.zf = None def load_rating(self, random_state=114514, fold=None): if fold is None: df_all = self._read_interaction(self.zf.read('ml-100k/u.data')) df_train, df_test = train_test_split(df_all, random_state=random_state) else: assert fold >= 1 and fold <= 5 train_path = 'ml-100k/u{}.base'.format(fold) test_path = 'ml-100k/u{}.test'.format(fold) df_train = self._read_interaction(self.zf.read(train_path)) df_test = self._read_interaction(self.zf.read(test_path)) return df_train, df_test def load_userinfo(self): user_info_bytes = self.zf.read('ml-100k/u.user') with BytesIO(user_info_bytes) as ifs: return pd.read_csv( ifs, sep='|', header=None, names=['user_id', 'age', 'gender', 'occupation', 'zipcode']) def load_movieinfo(self): MOVIE_COLUMNS = ['movie_id', 'title', 'release_date', 'unk', 'url'] with BytesIO(self.zf.read('ml-100k/u.genre')) as ifs: genres = pd.read_csv(ifs, sep='|', header=None)[0] with BytesIO(self.zf.read('ml-100k/u.item')) as ifs: df_mov = pd.read_csv( ifs, sep='|', encoding='latin-1', header=None, ) df_mov.columns = (MOVIE_COLUMNS + list(genres)) df_mov['release_date'] = pd.to_datetime(df_mov.release_date) return df_mov, list(genres)
def extract_zip(input_zip): input_zip = ZipFile(input_zip) return {name: input_zip.read(name) for name in input_zip.namelist()}
import numpy as np import pandas as pd from scipy.stats import norm import statsmodels.api as sm import matplotlib.pyplot as plt import requests from io import BytesIO from zipfile import ZipFile # Download the dataset dk = requests.get('http://www.ssfpack.com/files/DK-data.zip').content f = BytesIO(dk) zipped = ZipFile(f) df = pd.read_table(BytesIO(zipped.read('internet.dat')), skiprows=1, header=None, sep='\s+', engine='python', names=['internet', 'dinternet']) # ### Model Selection # # As in Durbin and Koopman, we force a number of the values to be missing. # Get the basic series dta_full = df.dinternet[1:].values dta_miss = dta_full.copy() # Remove datapoints