Example #1
def get_story_url_from_zip_html(inputio,_is_good_url=None):
    # print("get_story_url_from_zip_html called")
    zipf = ZipFile(inputio, 'r') # works equally well with inputio as a path or a blob

    # calibre's convert tends to put FFF's title_page towards the end,
    # shift it to the front to avoid internal links.
    filelist = zipf.namelist()
    tpl = [ x for x in filelist if 'title_page' in x ]
    for x in tpl:
        filelist.remove(x)
        filelist.insert(0,x)

    for item in filelist:
        # print(item)
        # only .htm, .html and .xhtml (or .xhtm for that matter)
        if re.match(r".*\.x?html?$", item):
            # print("matched")
            try:
                soup = make_soup(zipf.read(item).decode("utf-8"))
            except UnicodeDecodeError:
                # calibre converted to html zip fails with decode.
                soup = make_soup(zipf.read(item))
            for link in soup.findAll('a',href=re.compile(r'^http.*')):
                ahref=link['href']
                # print("href:(%s)"%ahref)
                if _is_good_url == None or _is_good_url(ahref):
                    return ahref
    return None
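A minimal usage sketch (the 'story.zip' path and the URL predicate are hypothetical; the module-level helpers such as make_soup and the re import are assumed to be available alongside the function above):

url = get_story_url_from_zip_html('story.zip',
                                  _is_good_url=lambda u: 'fanfiction.net' in u)
print(url)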
Example #2
	def _extract_content(self):
		pdxf_file = ZipFile(self.path, 'r')
		try:
			fl = pdxf_file.namelist()
		except:
			errtype, value, traceback = sys.exc_info()
			msg = _('It seems the PDXF file is corrupted') + '\n' + str(value)
			events.emit(events.MESSAGES, msgconst.ERROR, msg)
			raise IOError(errtype, msg, traceback)
		if not 'mimetype' in fl or not pdxf_file.read('mimetype') == const.DOC_MIME:
			msg = _('The file is corrupted or not PDXF file')
			events.emit(events.MESSAGES, msgconst.ERROR, msg)
			raise IOError(2, msg)

		filelist = []
		for item in fl:
			if item == 'mimetype' or item[-1] == '/':
				continue
			filelist.append(item)

		for item in filelist:
			source = pdxf_file.read(item)
			dest = open(os.path.join(self.presenter.doc_dir, item), 'wb')
			dest.write(source)
			dest.close()
		msg = _('The file content is extracted successfully')
		events.emit(events.MESSAGES, msgconst.OK, msg)
Example #3
def readZip(f, **kwargs):
	from mien.parsers.nmpml import elements as dialect
	f=ZipFile(f, 'r')
	xml=f.read('xml')
	xml=StringIO(xml)
	doc=xm.readTree(xml)
	xml.close()
	doc=xm.assignClasses(doc, dialect)	
	try:
		dat=f.read('data')
	except:
		print "No data archive in zip file"
		return doc
	from mien.parsers.datahash import readMD	
	dat=StringIO(dat)
	dat=readMD(dat, return_raw_hash=True)
	des=doc.getElements('Data')
	for de in des:
		try:
			d, h=dat[de.upath()]
		except:
			print "can't find data for element %s" % (de.upath(),)
			d, h=(zeros(0), {})
		de.datinit(d, h)
	f.close()	
	return doc		
Example #4
def get_package_metadata(package):
    """Get the metadata of a plugin in a package. Pass it a filepointer or
    filename. Raises a `ValueError` if the package is not valid.
    """
    from zipfile import ZipFile, error as BadZipFile
    try:
        f = ZipFile(package)
    except (IOError, BadZipFile):
        raise ValueError('not a valid package')

    # get the package version and name
    try:
        package_version = int(f.read('ZINE_PACKAGE'))
        plugin_name = f.read('ZINE_PLUGIN')
    except (KeyError, ValueError):
        raise ValueError('not a valid package')
    if package_version > PACKAGE_VERSION:
        raise ValueError('incompatible package version')

    try:
        metadata = parse_metadata(f.read('pdata/metadata.txt'))
    except KeyError:
        metadata = {}
    metadata['uid'] = plugin_name
    return metadata
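A hedged usage sketch (the plugin filename is hypothetical; PACKAGE_VERSION and parse_metadata are assumed to come from the surrounding module):

try:
    metadata = get_package_metadata('my-plugin.plugin')
    print(metadata['uid'])
except ValueError as exc:
    print(exc)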
Example #5
def get_story_url_from_epub_html(inputio,_is_good_url=None):
    # print("get_story_url_from_epub_html called")
    epub = ZipFile(inputio, 'r') # works equally well with inputio as a path or a blob

    ## Find the .opf file.
    container = epub.read("META-INF/container.xml")
    containerdom = parseString(container)
    rootfilenodelist = containerdom.getElementsByTagName("rootfile")
    rootfilename = rootfilenodelist[0].getAttribute("full-path")

    contentdom = parseString(epub.read(rootfilename))
    #firstmetadom = contentdom.getElementsByTagName("metadata")[0]

    ## Save the path to the .opf file--hrefs inside it are relative to it.
    relpath = get_path_part(rootfilename)

    # spin through the manifest--only place there are item tags.
    for item in contentdom.getElementsByTagName("item"):
        if( item.getAttribute("media-type") == "application/xhtml+xml" ):
            filehref=relpath+item.getAttribute("href")
            soup = make_soup(epub.read(filehref).decode("utf-8"))
            for link in soup.findAll('a',href=re.compile(r'^http.*')):
                ahref=link['href']
                # print("href:(%s)"%ahref)
                # hack for bad ficsaver ffnet URLs.
                m = re.match(r"^http://www.fanfiction.net/s(?P<id>\d+)//$",ahref)
                if m != None:
                    ahref="http://www.fanfiction.net/s/%s/1/"%m.group('id')
                if _is_good_url == None or _is_good_url(ahref):
                    return ahref
    return None
Example #6
def _get_carparks_xml_from_zip(url=CARPARKS_ZIP_URL, index_xml=INDEX_XML_FILE_NAME):
    res = request.urlopen(url=url)

    # Validate a successful HTTP call with status 200.
    if not res.status == 200:
        raise Exception('Call to \'{0!s}\' failed with status code {1!s}.'.format(url, res.status))

    # Convert the downloaded byte stream to a file-like in-memory object.
    zip_file = BytesIO(res.read())

    # Validate the file-like object contains a valid zip file.
    if not is_zipfile(zip_file):
        raise Exception('The URL \'{0!s}\' did not return a valid zip file.'.format(url))

    # Convert to an actual ZipFile object.
    zip = ZipFile(zip_file, 'r')

    # Fail if the returned zip file is corrupt.
    if zip.testzip():
        raise Exception('Zip file from \'{0!s}\' was corrupt.'.format(url))

    # Create a list of filenames to process.
    xml_filenames = _get_filenames_from_index_xml(index=zip.read(index_xml))

    # Validate there is at least 1 file to process.
    if len(xml_filenames) < 1:
        raise Exception('No XML files listed in {0!s}!'.format(index_xml))

    # Create and populate a dictionary with filenames and contents.
    return_str_xmls = dict()
    for filename in xml_filenames:
        return_str_xmls[filename] = zip.read(filename)

    # Return the dictionary.
    return return_str_xmls
Example #7
def find_plugin_yaml(dataobj):
    """
        """
    yml = False
    try:
        # The first thing we are going to try to do is create a ZipFile
        # object with the StringIO data that we have.
        zfile = ZipFile(dataobj)
    except:
        print "[DEBUG] ZipFile Library Failed to Parse DataObject"
    else:
        # Before we start recursively jumping through hoops, lets first
        # check to see if the plugin.yml exists at this level.  If so, then
        # just set the yaml variable.  Otherwise we are gonna look for more
        # zip and jar files and dig into them.
        if "plugin.yml" in zfile.namelist():
            try:
                yml = yaml.load(zfile.read("plugin.yml"))
            except:
                return False
        else:
            for filename in zfile.namelist():
                if not yml and filename[-3:].lower() in ["zip", "jar"]:
                    print "[DEBUG] Found Zip/Jar file " + filename
                    data = StringIO()
                    data.write(zfile.read(filename))
                    yml = find_plugin_yaml(data)
                    data.close()
            zfile.close()
    return yml
Example #8
    def test_multiple_files_same_comment_same_name(self):
        assignmentgroup1 = mommy.make('core.AssignmentGroup',
                                      parentnode__parentnode__parentnode__short_name="test2100",
                                      parentnode__parentnode__short_name="spring2015",
                                      parentnode__short_name="oblig1")
        mommy.make('core.Candidate',
                   assignment_group=assignmentgroup1,
                   relatedstudent__user__shortname="testuser1")
        tomorrow = datetime.datetime.now() + datetime.timedelta(days=1)
        feedbackset1 = mommy.make('devilry_group.FeedbackSet',
                                  group=assignmentgroup1,
                                  is_last_in_group=True,
                                  feedbackset_type=groupmodels.FeedbackSet.FEEDBACKSET_TYPE_FIRST_ATTEMPT,
                                  deadline_datetime=tomorrow)

        comment_fbs1_2 = mommy.make('devilry_group.GroupComment',
                                    feedback_set=feedbackset1,
                                    user_role=Comment.USER_ROLE_STUDENT)
        commentfile_fbs1_2 = mommy.make('devilry_comment.CommentFile',
                                        comment=comment_fbs1_2, filename='testfile1.txt')
        commentfile_fbs1_2.file.save('testfile1.txt', ContentFile('test2'))
        commentfile_fbs1_2_2 = mommy.make('devilry_comment.CommentFile',
                                          comment=comment_fbs1_2, filename='testfile1.txt')
        commentfile_fbs1_2_2.file.save('testfile1.txt', ContentFile('test3'))

        testclass = BulkDownloadTestClass()
        response = testclass.get(None)
        zipfileobject = ZipFile(StringIO(response.content))
        filecontents = zipfileobject.read('test2100.spring2015.oblig1.testuser1/attempt1/testfile1.txt')
        self.assertEquals(filecontents, "test2")
        filecontents = zipfileobject.read('test2100.spring2015.oblig1.testuser1/attempt1/testfile1-1.txt')
        self.assertEquals(filecontents, "test3")
Example #9
    def test_export_mixed_encodings(self):
        self.test_folder.zip_import.do_import(data=mac_zip)
        addNyDocument(self.test_folder, id='html_document')
        self.test_folder['html_document'].body = u'<p>Html document</p>'
        self.test_folder['html_document'].approved = 1
        export_value = self.test_folder.zip_export.do_export()
        self.assertFalse(isinstance(export_value, list),
                         ('Errors are raised: ', export_value))

        zip = ZipFile(export_value, 'r')

        expected_namelist = ['index.txt',
                             'zip_export_folder/picture-1.png',
                             'zip_export_folder/picture-2.png',
                             'zip_export_folder/html_document.html']

        self.assertEqual(sorted(zip.namelist()), sorted(expected_namelist))
        self.assertTrue('<p>Html document</p>' in \
                         zip.read('zip_export_folder/html_document.html'))

        picture1_data = IZipExportObject(self.test_folder['picture-1'])()[0]
        picture2_data = IZipExportObject(self.test_folder['picture-2'])()[0]

        self.assertEqual(zip.read('zip_export_folder/picture-1.png'),
                         picture1_data)
        self.assertEqual(zip.read('zip_export_folder/picture-2.png'),
                         picture2_data)
Example #10
    def test_three_groups_after_deadline(self):
        with self.settings(DEVILRY_COMPRESSED_ARCHIVES_DIRECTORY=self.backend_path):
            testassignment = mommy.make_recipe('devilry.apps.core.assignment_activeperiod_start',
                                               short_name='learn-python-basics',
                                               first_deadline=timezone.now() - timezone.timedelta(hours=1))
            testgroup1 = mommy.make('core.AssignmentGroup', parentnode=testassignment)
            testgroup2 = mommy.make('core.AssignmentGroup', parentnode=testassignment)
            testgroup3 = mommy.make('core.AssignmentGroup', parentnode=testassignment)

            # Create user as examiner on all groups.
            testuser = mommy.make(settings.AUTH_USER_MODEL, shortname='thor', fullname='Thor')
            related_examiner = mommy.make('core.RelatedExaminer', user=testuser, period=testassignment.parentnode)
            mommy.make('core.Examiner', relatedexaminer=related_examiner, assignmentgroup=testgroup1)
            mommy.make('core.Examiner', relatedexaminer=related_examiner, assignmentgroup=testgroup2)
            mommy.make('core.Examiner', relatedexaminer=related_examiner, assignmentgroup=testgroup3)

            # Create feedbackset for testgroup1 with commentfiles
            testfeedbackset_group1 = group_mommy.feedbackset_first_attempt_unpublished(group=testgroup1)
            self.__make_comment_file(feedback_set=testfeedbackset_group1, file_name='testfile.txt',
                                     file_content='testcontent group 1')
            mommy.make('core.Candidate', assignment_group=testgroup1, relatedstudent__user__shortname='april')

            # Create feedbackset for testgroup2 with commentfiles
            testfeedbackset_group2 = group_mommy.feedbackset_first_attempt_unpublished(group=testgroup2)
            self.__make_comment_file(feedback_set=testfeedbackset_group2, file_name='testfile.txt',
                                     file_content='testcontent group 2')
            mommy.make('core.Candidate', assignment_group=testgroup2, relatedstudent__user__shortname='dewey')

            # Create feedbackset for testgroup3 with commentfiles
            testfeedbackset_group3 = group_mommy.feedbackset_first_attempt_unpublished(group=testgroup3)
            self.__make_comment_file(feedback_set=testfeedbackset_group3, file_name='testfile.txt',
                                     file_content='testcontent group 3')
            mommy.make('core.Candidate', assignment_group=testgroup3, relatedstudent__user__shortname='huey')

            # run actiongroup
            self._run_actiongroup(name='batchframework_assignment',
                                  task=tasks.AssignmentCompressAction,
                                  context_object=testassignment,
                                  started_by=testuser)

            archive_meta = archivemodels.CompressedArchiveMeta.objects.get(content_object_id=testassignment.id)
            zipfileobject = ZipFile(archive_meta.archive_path)
            path_to_file_group1 = os.path.join('april',
                                               'deadline-{}'.format(defaultfilters.date(
                                                   testfeedbackset_group1.deadline_datetime, 'b.j.Y-H:i')),
                                               'after_deadline_not_part_of_delivery',
                                               'testfile.txt')
            path_to_file_group2 = os.path.join('dewey',
                                               'deadline-{}'.format(defaultfilters.date(
                                                   testfeedbackset_group2.deadline_datetime, 'b.j.Y-H:i')),
                                               'after_deadline_not_part_of_delivery',
                                               'testfile.txt')
            path_to_file_group3 = os.path.join('huey',
                                               'deadline-{}'.format(defaultfilters.date(
                                                   testfeedbackset_group3.deadline_datetime, 'b.j.Y-H:i')),
                                               'after_deadline_not_part_of_delivery',
                                               'testfile.txt')
            self.assertEqual(b'testcontent group 1', zipfileobject.read(path_to_file_group1))
            self.assertEqual(b'testcontent group 2', zipfileobject.read(path_to_file_group2))
            self.assertEqual(b'testcontent group 3', zipfileobject.read(path_to_file_group3))
Example #11
def verifyZipSignature(outerZipFilePath):
   result = MODULE_ZIP_STATUS.Invalid
   try:
      dataToSign = None
      signature = None
      outerZipFile = ZipFile(outerZipFilePath)
      # look for a zip file in the name list.
      # There should only be 3 files in this zip:
      #    The inner zip file, the properties file and the sig file
      if len(outerZipFile.namelist()) == 3:
         dataToSign = sha256(sha256(outerZipFile.read(INNER_ZIP_FILENAME)) +
                      sha256(outerZipFile.read(PROPERTIES_FILENAME)))
         signature = outerZipFile.read(SIGNATURE_FILENAME)
               
      if dataToSign and signature:
         """
         Signature file contains multiple lines, of the form "key=value\n"
         The last line is the hex-encoded signature, which is over the 
         source code + everything in the sig file up to the last line.
         The key-value lines may contain properties such as signature 
         validity times/expiration, contact info of author, etc.
         """
         dataToSignSBD = SecureBinaryData(dataToSign)
         sigSBD = SecureBinaryData(hex_to_binary(signature.strip()))
         publicKeySBD = SecureBinaryData(hex_to_binary(ARMORY_INFO_SIGN_PUBLICKEY))
         result = MODULE_ZIP_STATUS.Valid if CryptoECDSA().VerifyData(dataToSignSBD, sigSBD, publicKeySBD) else \
                  MODULE_ZIP_STATUS.Unsigned
   except:
      # if anything goes wrong an invalid zip file indicator will get returned 
      pass
   return result
Example #12
    def open(zipname):
        zf = ZipFile(zipname, 'r')
        m = zf.read('META-INF/manifest.xml')
        manifest = Manifest.parse(m)

        def warn(resource):
            print(u"Warning: bundle {} does not contain resource {}, which is referred in its manifest.".format(zipname, resource).encode('utf-8'))

        result = Bundle()
        result.presets_data = []
        for preset in manifest.get_resources('paintoppresets'):
            if preset in zf.namelist():
                result.presets.append(preset)
                data = zf.read(preset)
                kpp = KPP(preset, data)
                result.presets_data.append(kpp)
            else:
                warn(preset)

        result.meta_string = zf.read("meta.xml")
        result.preview_data = zf.read("preview.png")

        for brush in manifest.get_resources('brushes'):
            if brush in zf.namelist():
                result.brushes.append(brush)
            else:
                warn(brush)
        for pattern in manifest.get_resources('patterns'):
            if pattern in zf.namelist():
                result.patterns.append(pattern)
            else:
                warn(pattern)
            
        zf.close()
        return result
Example #13
  def _get_score(filename):
    """Given a MusicXML file, return the score as an xml.etree.ElementTree.

    Given a MusicXML file, return the score as an xml.etree.ElementTree.
    If the file is compressed (ends in .mxl), uncompress it first.

    Args:
        filename: The path of a MusicXML file

    Returns:
      The score as an xml.etree.ElementTree.
    """
    score = None
    if filename.endswith('.mxl'):
      # Compressed MXL file. Uncompress in memory.
      filename = ZipFile(filename)

      # A compressed MXL file may contain multiple files, but only one
      # MusicXML file. Read the META-INF/container.xml file inside of the
      # MXL file to locate the MusicXML file within the MXL file
      # http://www.musicxml.com/tutorial/compressed-mxl-files/zip-archive-structure/

      # Raise a MusicXMLParseException if multiple MusicXML files found
      namelist = filename.namelist()
      container_file = [x for x in namelist if x == 'META-INF/container.xml']
      compressed_file_name = ''

      try:
        container = ET.fromstring(filename.read(container_file[0]))
        for rootfile_tag in container.findall('./rootfiles/rootfile'):
          if 'media-type' in rootfile_tag.attrib:
            if rootfile_tag.attrib['media-type'] == MUSICXML_MIME_TYPE:
              if not compressed_file_name:
                compressed_file_name = rootfile_tag.attrib['full-path']
              else:
                raise MusicXMLParseException(
                    'Multiple MusicXML files found in compressed archive')
          else:
            # No media-type attribute, so assume this is the MusicXML file
            if not compressed_file_name:
              compressed_file_name = rootfile_tag.attrib['full-path']
            else:
              raise MusicXMLParseException(
                  'Multiple MusicXML files found in compressed archive')
      except ET.ParseError as exception:
        raise MusicXMLParseException(exception)

      try:
        score = ET.fromstring(filename.read(compressed_file_name))
      except ET.ParseError as exception:
        raise MusicXMLParseException(exception)
    else:
      # Uncompressed XML file.
      try:
        tree = ET.parse(filename)
        score = tree.getroot()
      except ET.ParseError as exception:
        raise MusicXMLParseException(exception)

    return score
Example #14
    def unpack_zipdata(self, zipdata):
        stream = BytesIO(zipdata)
        fzip = ZipFile(stream)
        pdfs = [x.filename for x in fzip.filelist
                if x.filename.endswith('.pdf')]
        if not pdfs:
            raise ServerError(
                'Conversion returned zip containing no pdf files')

        thumbnails = sorted(
            [x.filename for x in fzip.filelist
                if x.filename.startswith('small/') and x.filename != 'small/'],
            key=lambda x: int(x.split('.')[0].split('_')[-1]))
        previews = sorted(
            [x.filename for x in fzip.filelist
                if x.filename.startswith('large/') and x.filename != 'large/'],
            key=lambda x: int(x.split('.')[0].split('_')[-1]))
        converted = {
            'pdfs': [fzip.read(pdfs[0])],
            'thumbnails': [fzip.read(filename)
                           for filename in thumbnails[:20]],
            'previews': [fzip.read(filename) for filename in previews[:20]],
        }
        fzip.close()
        stream.close()
        return converted
Example #15
class ZipfileReader:
    """ Reads files from an imported zip file. """

    def __init__(self, files):
        self.files = ZipFile(files)
        self.fullpath = ''


    def readManifest(self):
        """ Get the maifest file if it exists. """
        for x in self.files.namelist():
            index = x.find('imsmanifest.xml')
            if index != -1:
                self.fullpath = x[:index]
                return self.files.read(x)
        return None
    

    def readFile(self, path):
        """ Get file data from the zip file. """
        fn = '%s%s' %(self.fullpath, str(path))
        if fn not in self.files.namelist():
            fn = fn.replace('/', '\\')
            if fn not in self.files.namelist():
                return None
        return self.files.read(fn)

    def listFiles(self):
        """ List files in the package. """
        return self.files.namelist()
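A minimal usage sketch (the package path and inner file name are hypothetical); readManifest() is called first, since it also records the path prefix used by readFile():

reader = ZipfileReader('content_package.zip')
manifest = reader.readManifest()
if manifest is not None:
    page = reader.readFile('index.html')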
Example #16
 def test_graph_export_csv(self):
     create_graph(self)
     create_schema(self)
     create_type(self)
     create_data(self)
     self.browser.find_by_id('toolsMenu').first.click()
     cookies = {self.browser.cookies.all()[0]["name"]: self.browser.cookies.all()[0]["value"], self.browser.cookies.all()[1]["name"]: self.browser.cookies.all()[1]["value"]}
     result = requests.get(self.live_server_url + '/tools/bobs-graph/export/csv/', cookies=cookies)
     spin_assert(lambda: self.assertEqual(
         result.headers['content-type'], 'application/zip'))
     spin_assert(lambda: self.assertEqual(
         self.browser.status_code.is_success(), True))
     test_file = StringIO(result.content)
     csv_zip = ZipFile(test_file)
     for name in csv_zip.namelist():
         fw = open('sylva/sylva/tests/files/' + name, 'w')
         fw.write(csv_zip.read(name))
         fw.close()
     for name in csv_zip.namelist():
         f = open('sylva/sylva/tests/files/' + name)
         csvFile = ""
         for line in f:
             csvFile += line
         f.close()
         spin_assert(lambda: self.assertEqual(csv_zip.read(name), csvFile))
     Graph.objects.get(name="Bob's graph").destroy()
Example #17
 def testTopZip(self):
   top_zip = ZipFile(StringIO(self._directory_zipper.Zip('top').Get()))
   self.assertEqual(['top/one.txt', 'top/two/four.txt', 'top/two/three.txt'],
                    sorted(top_zip.namelist()))
   self.assertEqual('one.txt contents', top_zip.read('top/one.txt'))
   self.assertEqual('three.txt contents', top_zip.read('top/two/three.txt'))
   self.assertEqual('four.txt contents', top_zip.read('top/two/four.txt'))
Example #18
    def _load(self, stream):
        zf = ZipFile(stream)
        self._load_content_types(zf.read("[Content_Types].xml"))
        rels_path = posixpath.join("_rels", ".rels")
        self._load_rels(zf.read(rels_path))

        def ropen(item):
            "read item and recursively open its children"
            if isinstance(item, Relationships):
                return
            if isinstance(item, Part):
                base, rname = posixpath.split(to_zip_name(item.name))
                relname = posixpath.join(base, "_rels", "%s.rels" % rname)
                if relname in zf.namelist():
                    item._load_rels(zf.read(relname))
            for rel in item.relationships:
                pname = posixpath.join(item.base, rel.target)
                if pname in self:
                    # This item is already in self.
                    continue
                target_path = to_zip_name(pname)
                data = b"".join(self._get_matching_segments(zf, target_path))
                new_part = self._load_part(rel.type, pname, data)
                if new_part:
                    ropen(new_part)

        ropen(self)
        zf.close()
Example #19
 def test_csxconvert(self):
     # This zip contains an entry on frogs in txt (latin1)
     # html (utf-8) and odt (utf-8)
     
     ff = (sc.config.test_samples_dir / 'ff.zip').open('rb')
     csxp = webtools.CSXProcessor()
     result = csxp.process_zip(ff, ff.name)
     
     # Affirm result is a valid ZipFile
     z = ZipFile(result.result.fileobj)
     
     # Affirm content is now utf8
     text = z.read('ff-latin1.txt').decode(encoding='UTF-8')
     
     # 'Maṇḍūka', 'Nīlamaṇḍūka', 'Uddhumāyikā'
     
     # And it has been properly transcoded.
     self.assertIn('Maṇḍūka', text)
     
     html = z.read('ff-utf8.html').decode(encoding='UTF-8')
     self.assertIn('Nīlamaṇḍūka', html)
     
     # This doesn't completely test odt but confirms that
     # it basically worked.
     odt = z.open('ff.odt')
     odt = io.BytesIO(odt.read()) # Needs to be seekable
     
     odtz = ZipFile(odt)
     content = odtz.read('content.xml').decode(encoding='UTF-8')
     self.assertIn('Uddhumāyikā', content)
Example #20
def _unbundle(path, target):
    zf = ZipFile(path, 'r')
    contents = zf.namelist()
    for item in contents:
        sp = item.split("/")
        if not sp[-1]:
            continue

        print item, ">", target + item

        cpath = target + "/".join(sp[:-1])
        if not os.path.exists(cpath):
            os.makedirs(cpath)
        if item.endswith((".jar", ".xpi", ".zip")):
            now = target + item
            path_item = item.split("/")
            path_item[-1] = "_" + path_item[-1]
            path = target + "/".join(path_item)

            buff = StringIO(zf.read(item))
            _unbundle(buff, path + "/")
        else:
            f = open(target + item, 'w')
            f.write(zf.read(item))
            f.close()
    zf.close()
Example #21
    def test_groupcomment_files_download_two_users(self):
        with self.settings(DEVILRY_COMPRESSED_ARCHIVES_DIRECTORY=self.backend_path):
            testgroup = mommy.make('core.AssignmentGroup')
            testuser1 = mommy.make(settings.AUTH_USER_MODEL, shortname='*****@*****.**', fullname='Dewey Duck')
            testuser2 = mommy.make(settings.AUTH_USER_MODEL, shortname='*****@*****.**', fullname='April Duck')
            candidate1 = mommy.make('core.Candidate', assignment_group=testgroup, relatedstudent__user=testuser1)
            candidate2 = mommy.make('core.Candidate', assignment_group=testgroup, relatedstudent__user=testuser2)
            testcomment = mommy.make('devilry_group.GroupComment',
                                     feedback_set__group=testgroup,
                                     user=testuser1,
                                     user_role='student')
            commentfile = mommy.make('devilry_comment.CommentFile', comment=testcomment, filename='testfile.txt')
            commentfile.file.save('testfile.txt', ContentFile('testcontent'))

            testdownloader = feedbackfeed_download_files.CompressedGroupCommentFileDownload()

            # First user download
            mockrequest = mock.MagicMock()
            mockrequest.cradmin_role = candidate1.assignment_group
            mockrequest.user = testuser1
            response = testdownloader.get(mockrequest, testcomment.id)
            zipfileobject = ZipFile(StringIO(response.content))
            filecontents = zipfileobject.read('testfile.txt')
            self.assertEquals(filecontents, 'testcontent')

            # Second user download
            mockrequest.cradmin_role = candidate2.assignment_group
            mockrequest.user = testuser2
            response = testdownloader.get(mockrequest, testcomment.id)
            zipfileobject = ZipFile(StringIO(response.content))
            filecontents = zipfileobject.read('testfile.txt')
            self.assertEquals(filecontents, 'testcontent')
Example #22
def _epub_parser(epub):
    """
    Handle EPUB specific parsing
    Return dict of ebook metadata

    An EPUB must contain META-INF/container.xml, which contains the path to
    the EPUB metadata file.
    """
    sha256 = file_hash(epub)
    zf = ZipFile(epub)
    xml = xmltodict.parse(zf.read('META-INF/container.xml'))
    metadata_path = xml['container']['rootfiles']['rootfile']['@full-path']  # TODO: validate this is true for all EPUBs
    raw_metadata = xmltodict.parse(zf.read(metadata_path))
    metadata = {'format': 'epub'}
    for k, v in raw_metadata['package']['metadata'].items():
        if 'dc:' in k:
            if 'creator' in k:  # Required element, needs additional parsing
                k = 'author'
                v = v['#text']
            if 'identifier' in k:  # Required element, needs additional parsing
                k = 'identifiers'
                if not isinstance(v, list):
                    v = [v]  # Just in case we get a single element
                identifiers = []
                for i in v:
                    identifiers.append({'identifier': i['@opf:scheme'], 'value': i['#text']})  # Support multiple identifiers
                v = identifiers
            metadata[k.split('dc:')[-1]] = v
    metadata['identifiers'].append({'identifier': 'sha256', 'value': sha256})
    return metadata
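A hedged usage sketch (the EPUB path is hypothetical; file_hash and xmltodict are assumed to come from the example's own module):

metadata = _epub_parser('book.epub')
print(metadata['format'])
print(metadata.get('author'))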
Example #23
class ApkParser:
 def __init__(self, file):
  self._file = ZipFile(file)

 def getManifest(self):
  return AXML(self._file.read('AndroidManifest.xml')).get_xml_obj()

 def getPackageName(self):
  return self.getManifest().documentElement.getAttribute('package')

 def getVersionCode(self):
  return int(self.getManifest().documentElement.getAttribute('android:versionCode'))

 def getVersionName(self):
  return self.getManifest().documentElement.getAttribute('android:versionName')

 def getMinSdkVersion(self):
  return int(self.getManifest().documentElement.getElementsByTagName('uses-sdk')[0].getAttribute('android:minSdkVersion'))

 def _getCerts(self):
  for info in self._file.infolist():
   if info.filename.startswith('META-INF/') and info.filename.endswith('.RSA'):
    for cert in ContentInfo.load(self._file.read(info))['content']['certificates']:
     yield cert.dump()

 def getCert(self):
  certs = list(self._getCerts())
  if len(certs) != 1:
   raise Exception('Cannot read certificate')
  return certs[0]
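A minimal usage sketch (the APK path is hypothetical; AXML and ContentInfo are the example's own dependencies):

apk = ApkParser('app.apk')
print(apk.getPackageName())
print(apk.getVersionName())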
Example #24
class Stick(object):

    def __init__(self,stickfaceFileLoc,iniName,pos=(0,0)):
        self.iniName = iniName
        self.stickfaceFileLoc = stickfaceFileLoc
        self.loadStickfaceFile(self.stickfaceFileLoc)
        self.buttons = [0] * len(self.stickfaceIni['buttons'])
        self.hat = (0,0)
        self.pos = pos
        self.cutoff = 1

    def loadStickfaceFile(self,stickfaceFileLoc):
        load = pygame.image.load
        self.stickfaceZip = ZipFile(stickfaceFileLoc,'r')
        self.stickfaceIni = yamlLoad(self.stickfaceZip.read(self.iniName))
        self.controllerSize = tuple(self.stickfaceIni['controllerSize'])
        self.buttonImages = [load(StringIO(self.stickfaceZip.read(i))) for i in self.stickfaceIni['buttonImages']]
        self.controllerImg = load(StringIO(self.stickfaceZip.read(self.stickfaceIni['controllerImage'])))
        self.buttonLoc = [tuple(i) for i in self.stickfaceIni['buttonLocs']]
        self.buttonSize = [tuple(i) for i in self.stickfaceIni['buttonSizes']]

    def drawController(self):
        finalImg = pygame.Surface(self.controllerSize).convert_alpha()
        finalImg.blit(self.controllerImg,(0,0))
        for b in range(12):
            if self.buttons[b] >= self.cutoff:
                finalImg.blit(self.buttonImages[b],self.buttonLoc[b])
        return finalImg
Example #25
class PebbleSystemResources(object):
    def __init__(self, firmware_path):
        self._firmware_path = firmware_path
        self._zipfile = ZipFile(firmware_path)
        self._manifest = json.loads(self._zipfile.read("manifest.json"))
        self._resource_data = self._zipfile.read("system_resources.pbpack")
        self.resources = PebbleResources(self._resource_data)
        self.resource_id_mapping = self.get_resource_id_mapping()

    def get_resource_id_mapping(self):
        resource_id_mapping = {}
        media = self._manifest["debug"]["resourceMap"]["media"]
        file_id = 0
        for media_entry in media:
            file_id += 1
            resource_name = "RESOURCE_ID_" + media_entry["defName"]
            if media_entry["type"] == "png-trans":
                resource_id_mapping[resource_name + "_WHITE"] = file_id
                file_id += 1
                resource_id_mapping[resource_name + "_BLACK"] = file_id
            else:
                resource_id_mapping[resource_name] = file_id
        return resource_id_mapping

    def verify_data(self):
        return self.resources.verify_data()

    def get_file_id(self, def_name):
        return self.resource_id_mapping[def_name]

    def get_chunk(self, file_id):
        return self.resources.get_chunk(file_id)
Example #26
 def _find_plugin_yaml(self, dataobj):
     '''Recursively search the given zip data for a plugin.yml and return its
     parsed contents, or False if none could be read.
     '''
     yml = False
     try:
         # The first thing we are going to try to do is create a ZipFile
         # object with the StringIO data that we have.
         zfile = ZipFile(dataobj)
     except:
         pass
     else:
         # Before we start recursively jumping through hoops, lets first
         # check to see if the plugin.yml exists at this level.  If so, then
         # just set the yaml variable.  Otherwise we are gonna look for more
         # zip and jar files and dig into them.
         if 'plugin.yml' in zfile.namelist():
             try:
                 yml = yaml.load(zfile.read('plugin.yml'))
             except:
                 return False
         else:
             for filename in zfile.namelist():
                 if not yml and filename[-3:].lower() in ['zip', 'jar']:
                     data = StringIO()
                     data.write(zfile.read(filename))
                     yml = self._find_plugin_yaml(data)
                     data.close()
             zfile.close()
     return yml
Example #27
def parseZip( fn ):
	date_time = ''
	members = dict()
	removemembers = False
	zipfile = ZipFile( fn )
	cache.invalidate(recordlist.output, 'list_output', )
	files_of_interest = ['infolog.txt','ext.txt','platform.txt','script.txt','settings.txt','unitsync.log','client.txt','information.txt','demo.sdf']
	
	for info in zipfile.infolist():
		if info.filename in files_of_interest and info.file_size < 5 * 1024 * 1024:
			members[info.filename] = zipfile.read( info.filename )
			if info.filename == 'infolog.txt':
				date_time = info.date_time
		else:
			removemembers = True
	
	if removemembers:
		newzipfile = ZipFile (fn + '.new', 'w')
		tmpfilename = '/tmp/' + os.path.basename (fn) + '.tmp'
		for file in members.keys ():
			tmpfile = open (tmpfilename, 'w')
			tmpfile.write (zipfile.read (file))
			tmpfile.close ()
			newzipfile.write (tmpfilename, file)
			os.remove (tmpfilename)
		newzipfile.close ()
		zipfile.close ()
		os.rename (fn, fn + '.orig')
		os.rename (fn + '.new', fn)
	else:
		zipfile.close ()
	
	return db.parseZipMembers( fn, members, date_time )
Example #28
    def handle(self, *args, **options):
        if not settings.DEBUG:
            print "this should never be run on production"
            return

        zipfile = ZipFile(os.path.join("data", "intervention.zip"), "r")

        # Load Intervention objects
        json = loads(zipfile.read("interventions.json"))

        print "clearing intervention prod database content..."
        Intervention.objects.all().delete()

        print "importing prod database content..."
        for i in json["interventions"]:
            intervention = Intervention.objects.create(name="tmp")
            intervention.from_dict(i)

        # Load Problem Solving objects
        json = loads(zipfile.read("issues.json"))

        print "clearing problemsolving database content..."
        Issue.objects.all().delete()

        print "importing problemsolving prod database content..."
        for i in json["issues"]:
            issue = Issue.objects.create(name="tmp", ordinality=0)
            issue.from_dict(i)
Example #29
class OdfDocument(object):
  """Manipulates odf documents in memory"""

  implements(IOdfDocument)

  def __init__(self, data, source_format):
    """Open the the file in memory.

    Keyword arguments:
    data -- Content of the document
    source_format -- Document Extension
    """
    self._zipfile = ZipFile(StringIO(data))

    self.source_format = source_format
    # XXX - Maybe parsed_content should not be here, but on OOGranulate
    self.parsed_content = etree.fromstring(self.getContentXml())

  def getContentXml(self):
    """Returns the content.xml file as string"""
    return self._zipfile.read('content.xml')

  def getFile(self, path):
    """If exists, returns file as string, else return an empty string"""
    try:
      return self._zipfile.read(path)
    except KeyError:
      return ''

  def trash(self):
    """Remove the file in memory."""
    self._zipfile.close()
Example #30
	def load_and_save_scopes(self):
		scopes = set()
		for x in os.walk(sublime.packages_path() + '/..'):
			for f in glob.glob(os.path.join(x[0], '*.tmLanguage')):
				for s in self.get_scopes_from(plistlib.readPlist(f)):
					scopes.add(s.strip())

		for x in os.walk(os.path.dirname(sublime.executable_path())):
			for f in glob.glob(os.path.join(x[0], '*.sublime-package')):
				input_zip = ZipFile(f)
				for name in input_zip.namelist():
					if name.endswith('.tmLanguage'):
						for s in self.get_scopes_from(plistlib.readPlistFromBytes(input_zip.read(name))):
							scopes.add(s.strip())

		for x in os.walk(sublime.packages_path() + '/..'):
			for f in glob.glob(os.path.join(x[0], '*.sublime-package')):
				input_zip = ZipFile(f)
				for name in input_zip.namelist():
					if name.endswith('.tmLanguage'):
						for s in self.get_scopes_from(plistlib.readPlistFromBytes(input_zip.read(name))):
							scopes.add(s.strip())
		names = list(scopes)
		scopes = dict()
		for name in names:
			value = name
			if value.startswith('source.'):
				value = value[7:]
			elif value.startswith('text.'):
				value = value[5:]
			scopes[name] = value
		self.settings.set('scopes', scopes)
		sublime.save_settings('smart-pieces.sublime-settings')
Example #31
def get_update_data(inputio, getfilecount=True, getsoups=True):
    epub = ZipFile(inputio,
                   'r')  # works equally well with inputio as a path or a blob

    ## Find the .opf file.
    container = epub.read("META-INF/container.xml")
    containerdom = parseString(container)
    rootfilenodelist = containerdom.getElementsByTagName("rootfile")
    rootfilename = rootfilenodelist[0].getAttribute("full-path")

    contentdom = parseString(epub.read(rootfilename))
    firstmetadom = contentdom.getElementsByTagName("metadata")[0]
    try:
        source = ensure_text(
            firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data)
    except:
        source = None

    ## Save the path to the .opf file--hrefs inside it are relative to it.
    relpath = get_path_part(rootfilename)

    oldcover = None
    calibrebookmark = None
    logfile = None
    # Looking for pre-existing cover.
    for item in contentdom.getElementsByTagName("reference"):
        if item.getAttribute("type") == "cover":
            # there is a cover (x)html file, save the soup for it.
            href = relpath + item.getAttribute("href")
            src = None
            try:
                oldcoverhtmlhref = href
                oldcoverhtmldata = epub.read(href)
                oldcoverhtmltype = "application/xhtml+xml"
                for item in contentdom.getElementsByTagName("item"):
                    if (relpath +
                            item.getAttribute("href") == oldcoverhtmlhref):
                        oldcoverhtmltype = item.getAttribute("media-type")
                        break
                soup = make_soup(oldcoverhtmldata.decode("utf-8"))
                # first img or image tag.
                imgs = soup.findAll('img')
                if imgs:
                    src = get_path_part(href) + imgs[0]['src']
                else:
                    imgs = soup.findAll('image')
                    if imgs:
                        src = get_path_part(href) + imgs[0]['xlink:href']

                if not src:
                    continue
            except Exception as e:
                ## Calibre's Polish Book corrupts sub-book covers.
                logger.warning("Cover (x)html file %s not found" % href)
                logger.warning("Exception: %s" % (unicode(e)))

            try:
                # remove all .. and the path part above it, if present.
                # Mostly for epubs edited by Sigil.
                src = re.sub(r"([^/]+/\.\./)", "", src)
                #print("epubutils: found pre-existing cover image:%s"%src)
                oldcoverimghref = src
                oldcoverimgdata = epub.read(src)
                for item in contentdom.getElementsByTagName("item"):
                    if (relpath +
                            item.getAttribute("href") == oldcoverimghref):
                        oldcoverimgtype = item.getAttribute("media-type")
                        break
                oldcover = (oldcoverhtmlhref, oldcoverhtmltype,
                            oldcoverhtmldata, oldcoverimghref, oldcoverimgtype,
                            oldcoverimgdata)
            except Exception as e:
                logger.warning("Cover Image %s not found" % src)
                logger.warning("Exception: %s" % (unicode(e)))

    filecount = 0
    soups = []  # list of xhmtl blocks
    urlsoups = {}  # map of xhtml blocks by url
    images = {}  # dict() longdesc->data
    datamaps = defaultdict(dict)  # map of data maps by url
    if getfilecount:
        # spin through the manifest--only place there are item tags.
        for item in contentdom.getElementsByTagName("item"):
            # First, count the 'chapter' files.  FFF uses file0000.xhtml,
            # but can also update epubs downloaded from Twisting the
            # Hellmouth, which uses chapter0.html.
            if (item.getAttribute("media-type") == "application/xhtml+xml"):
                href = relpath + item.getAttribute("href")
                #print("---- item href:%s path part: %s"%(href,get_path_part(href)))
                if re.match(r'.*/log_page(_u\d+)?\.x?html', href):
                    try:
                        logfile = epub.read(href).decode("utf-8")
                    except:
                        pass  # corner case I bumped into while testing.
                if re.match(r'.*/(file|chapter)\d+(_u\d+)?\.x?html', href):
                    # (_u\d+)? is from calibre convert naming files
                    # 3/OEBPS/file0005_u3.xhtml etc.
                    if getsoups:
                        soup = make_soup(epub.read(href).decode("utf-8"))
                        for img in soup.findAll('img'):
                            newsrc = ''
                            longdesc = ''
                            ## skip <img src="data:image..."
                            if not img['src'].startswith('data:image'):
                                try:
                                    newsrc = get_path_part(href) + img['src']
                                    # remove all .. and the path part above it, if present.
                                    # Mostly for epubs edited by Sigil.
                                    newsrc = re.sub(r"([^/]+/\.\./)", "",
                                                    newsrc)
                                    longdesc = img['longdesc']
                                    data = epub.read(newsrc)
                                    images[longdesc] = data
                                    img['src'] = img['longdesc']
                                except Exception as e:
                                    # don't report u'OEBPS/failedtoload',
                                    # it indicates a failed download
                                    # originally.
                                    if newsrc != u'OEBPS/failedtoload':
                                        logger.warning(
                                            "Image %s not found!\n(originally:%s)"
                                            % (newsrc, longdesc))
                                        logger.warning("Exception: %s" %
                                                       (unicode(e)),
                                                       exc_info=True)
                        bodysoup = soup.find('body')
                        # ffdl epubs have chapter title h3
                        h3 = bodysoup.find('h3')
                        if h3:
                            h3.extract()
                        # TtH epubs have chapter title h2
                        h2 = bodysoup.find('h2')
                        if h2:
                            h2.extract()

                        for skip in bodysoup.findAll(
                                attrs={'class': 'skip_on_ffdl_update'}):
                            skip.extract()

                        ## <meta name="chapterurl" content="${url}"></meta>
                        #print("look for meta chapurl")
                        currenturl = None
                        chapurl = soup.find('meta', {'name': 'chapterurl'})
                        if chapurl:
                            if chapurl[
                                    'content'] not in urlsoups:  # keep first found if more than one.
                                # print("Found chapurl['content']:%s"%chapurl['content'])
                                currenturl = chapurl['content']
                                urlsoups[chapurl['content']] = bodysoup
                        else:
                            # for older pre-meta.  Only temp.
                            chapa = bodysoup.find('a', {'class': 'chapterurl'})
                            if chapa and chapa[
                                    'href'] not in urlsoups:  # keep first found if more than one.
                                urlsoups[chapa['href']] = bodysoup
                                currenturl = chapa['href']
                                chapa.extract()

                        chapterorigtitle = soup.find(
                            'meta', {'name': 'chapterorigtitle'})
                        if chapterorigtitle:
                            datamaps[currenturl][
                                'chapterorigtitle'] = chapterorigtitle[
                                    'content']

                        chaptertitle = soup.find('meta',
                                                 {'name': 'chaptertitle'})
                        if chaptertitle:
                            datamaps[currenturl][
                                'chaptertitle'] = chaptertitle['content']

                        soups.append(bodysoup)

                    filecount += 1

    try:
        calibrebookmark = epub.read("META-INF/calibre_bookmarks.txt")
    except:
        pass

    #for k in images.keys():
    #print("\tlongdesc:%s\n\tData len:%s\n"%(k,len(images[k])))
    #print("datamaps:%s"%datamaps)
    return (source, filecount, soups, images, oldcover, calibrebookmark,
            logfile, urlsoups, datamaps)
Example #32
class WsFile(attrib.Container):
    """Workspace File.

    Workspace files are Zip-Archives, that contain a INI-formatted
    configuration file 'workspace.ini' in the archives root, and arbitrary
    resource files within subfolders.

    Args:
        filepath: String or :term:`path-like object`, that points to a valid
            workspace file or None. If the filepath points to a valid workspace
            file, then the class instance is initialized with a memory copy of
            the file. If the given file, however, does not exist, isn't a valid
            ZipFile, or does not contain a workspace configuration, respectively
            one of the errors FileNotFoundError, BadZipFile or BadWsFile is
            raised. The default behaviour, if the filepath is None, is to create
            an empty workspace in the memory, that uses the default folders
            layout. In this case the attribute maintainer is initialized with
            the current username.
        pwd: Bytes representing password of workspace file.

    """

    #
    # Protected Class Variables
    #

    _config_file: ClassVar[Path] = Path('workspace.ini')
    _default_config: ClassVar[ConfigDict] = {
        'dc': {
            'creator': env.get_username(),
            'date': datetime.datetime.now()}}
    _default_dir_layout: ClassVar[StrList] = [
        'dataset', 'network', 'system', 'model', 'script']
    _default_encoding = env.get_encoding()

    #
    # Public Attributes and Attribute Groups
    #

    dc: attrib.Group = attrib.create_group(attrib.DCGroup)

    startup: property = attrib.MetaData(classinfo=Path, category='hooks')
    startup.__doc__ = """
    The startup script is a path, that points to a python script inside the
    workspace, which is executed after loading the workspace.
    """

    path: property = attrib.Virtual(fget='_get_path')
    path.__doc__ = """Filepath of the workspace."""

    name: property = attrib.Virtual(fget='_get_name')
    name.__doc__ = """Filename of the workspace without file extension."""

    files: property = attrib.Virtual(fget='search')
    files.__doc__ = """List of all files within the workspace."""

    folders: property = attrib.Virtual(fget='_get_folders')
    folders.__doc__ = """List of all folders within the workspace."""

    changed: property = attrib.Virtual(fget='_get_changed')
    changed.__doc__ = """Tells whether the workspace file has been changed."""

    #
    # Protected Attributes
    #

    _file: property = attrib.Content(classinfo=ZipFile)
    _buffer: property = attrib.Content(classinfo=BytesIOBaseClass)
    _path: property = attrib.Temporary(classinfo=Path)
    _pwd: property = attrib.Temporary(classinfo=bytes)
    _changed: property = attrib.Temporary(classinfo=bool, default=False)

    #
    # Events
    #

    def __init__(
            self, filepath: OptPathLike = None, pwd: OptBytes = None,
            parent: Optional[attrib.Container] = None) -> None:
        """Load Workspace from file."""
        super().__init__()
        if filepath:
            self.load(filepath, pwd=pwd)
        else:
            self._create_new()

    def __enter__(self) -> 'WsFile':
        """Enter with statement."""
        return self

    def __exit__(self, cls: ExcType, obj: Exc, tb: Traceback) -> None:
        """Close workspace file and buffer."""
        self.close()

    #
    # Public Methods
    #

    def load(self, filepath: PathLike, pwd: OptBytes = None) -> None:
        """Load Workspace from file.

        Args:
            filepath: String or :term:`path-like object`, that points to a valid
                workspace file. If the filepath points to a valid workspace
                file, then the class instance is initialized with a memory copy
                of the file. If the given file does not exist, is not a valid
                ZipFile, or does not contain a workspace configuration, then
                FileNotFoundError, BadZipFile or BadWsFile is raised,
                respectively.
            pwd: Bytes representing password of workspace file.

        """
        # Initialize instance Variables, Buffer and buffered ZipFile
        self._changed = False
        self._path = env.expand(filepath)
        self._pwd = pwd
        self._buffer = BytesIO()
        self._file = ZipFile(self._buffer, mode='w')

        # Copy contents from ZipFile to buffered ZipFile
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            try:
                with ZipFile(self.path, mode='r') as fh:
                    for zinfo in fh.infolist():
                        data = fh.read(zinfo, pwd=pwd)
                        # TODO ([email protected]): The zipfile standard
                        # module currently does not support encryption in write
                        # mode of new ZipFiles. See:
                        # https://docs.python.org/3/library/zipfile.html
                        # When support is provided, the below line for writing
                        # files shall be replaced by:
                        # self._file.writestr(zinfo, data, pwd=pwd)
                        self._file.writestr(zinfo, data)
            except FileNotFoundError as err:
                raise FileNotFoundError(
                    f"file '{self.path}' does not exist") from err
            except BadZipFile as err:
                raise BadZipFile(
                    f"file '{self.path}' is not a valid ZIP file") from err

        # Try to open and load workspace configuration from buffer
        structure = {
            'dc': self._get_attr_types(group='dc'),
            'hooks': self._get_attr_types(category='hooks')}
        try:
            with self.open(self._config_file) as file:
                cfg = inifile.load(file, structure=structure)
        except KeyError as err:
            raise BadWsFile(
                f"workspace '{self.path}' is not valid: "
                f"file '{self._config_file}' could not be loaded") from err

        # Link configuration
        self._set_attr_values(cfg.get('dc', {}), group='dc') # type: ignore

    def save(self) -> None:
        """Save the workspace to it's filepath."""
        if isinstance(self.path, Path):
            self.saveas(self.path)
        else:
            raise FileNotGivenError(
                "use saveas() to save the workspace to a file")

    def saveas(self, filepath: PathLike) -> None:
        """Save the workspace to a file.

        Args:
            filepath: String or :term:`path-like object`, that represents the
                name of a workspace file.

        """
        path = env.expand(filepath)

        # Update datetime
        self.date = datetime.datetime.now()

        # Update 'workspace.ini'
        with self.open(self._config_file, mode='w') as file:
            inifile.save({
                'dc': self._get_attr_values(group='dc'),
                'hooks': self._get_attr_values(category='hooks')}, file)

        # Remove duplicates from workspace
        self._remove_duplicates()

        # Mark the platform that created the files as Windows
        # to avoid inferring wrong Unix permissions
        for zinfo in self._file.infolist():
            zinfo.create_system = 0

        # Close ZipArchive (to allow to read the buffer)
        self._file.close()

        # Read buffer and write workspace file
        if not isinstance(self._buffer, BytesIO):
            raise TypeError("buffer has not been initialized")
        with open(path, 'wb') as file:
            file.write(self._buffer.getvalue())

        # Close buffer
        self._buffer.close()

        # Reload saved workspace from file
        self.load(path, pwd=self._pwd)

    def get_file_accessor(self, path: PathLike) -> FileAccessorBase:
        """Get file accessor to workspace member.

        Args:
            path: String or :term:`path-like object`, that represents a
                workspace member. In reading mode the path has to point to a
                valid workspace file, or a FileNotFoundError is raised. In
                writing mode the path by default is treated as a file path. New
                directories can be written by setting the argument is_dir to
                True.

        Returns:
            :class:`File accessor <nemoa.types.FileAccessorBase>` to workspace
            member.

        """
        def wrap_open(path: PathLike) -> AnyFunc:
            def wrapped_open(
                    obj: FileAccessorBase, *args: Any, **kwds: Any) -> FileLike:
                return self.open(path, *args, **kwds)
            return wrapped_open

        return type( # pylint: disable=E0110
            'FileAccessor', (FileAccessorBase,), {
            'name': str(path),
            'open': wrap_open(path)})()

    def open(
            self, path: PathLike, mode: str = 'r', encoding: OptStr = None,
            is_dir: bool = False) -> FileLike:
        """Open file within the workspace.

        Args:
            path: String or :term:`path-like object`, that represents a
                workspace member. In reading mode the path has to point to a
                valid workspace file, or a FileNotFoundError is raised. In
                writing mode the path by default is treated as a file path. New
                directories can be written by setting the argument is_dir to
                True.
            mode: String whose characters specify the mode in which the file is
                to be opened. The default mode is reading in text mode. Supported
                characters are:
                'r': Reading mode (default)
                'w': Writing mode
                'b': Binary mode
                't': Text mode (default)
            encoding: In binary mode encoding has no effect. In text mode
                encoding specifies the name of the encoding, which in reading
                and writing mode respectively is used to decode the stream’s
                bytes into strings, and to encode strings into bytes. By default
                the preferred encoding of the operating system is used.
            is_dir: Boolean value which determines whether the path is to be
                treated as a directory. This information is required for writing
                directories to the workspace. The default behaviour is not to
                treat paths as directories.

        Returns:
            :term:`File object` in reading or writing mode.

        Examples:
            >>> with self.open('workspace.ini') as file:
            >>>     print(file.read())

        """
        # Open file handler to workspace member
        if 'w' in mode:
            if 'r' in mode:
                raise ValueError(
                    "'mode' is not allowed to contain the "
                    "characters 'r' AND 'w'")
            file = self._open_write(path, is_dir=is_dir)
        else:
            file = self._open_read(path)

        # Wrap binary files to text files if required
        if 'b' in mode:
            if 't' in mode:
                raise ValueError(
                    "'mode' is not allowed to contain the "
                    "characters 'b' AND 't'")
            return file
        return TextIOWrapper(
            file, encoding=encoding or self._default_encoding,
            write_through=True)

    def close(self) -> None:
        """Close current workspace and buffer."""
        if hasattr(self._file, 'close'):
            self._file.close()
        if hasattr(self._buffer, 'close'):
            self._buffer.close()

    def copy(self, source: PathLike, target: PathLike) -> bool:
        """Copy file within workspace.

        Args:
            source: String or :term:`path-like object`, that points to a file in
                the directory structure of the workspace. If the file does not
                exist, a FileNotFoundError is raised. If the filepath points to
                a directory, an IsADirectoryError is raised.
            target: String or :term:`path-like object`, that points to a new
                filename or an existing directory in the directory structure of
                the workspace. If the target is a directory the target file
                consists of the directory and the basename of the source file.
                If the target file already exists a FileExistsError is raised.

        Returns:
            Boolean value which is True if the file was copied.

        """
        # Check if source file exists and is not a directory
        src_file = PurePath(source).as_posix()
        src_infos = self._locate(source)
        if not src_infos:
            raise FileNotFoundError(
                f"workspace file '{src_file}' does not exist")
        src_info = src_infos[-1]
        if getattr(src_info, 'is_dir')():
            raise IsADirectoryError(
                f"'{src_file}/' is a directory not a file")

        # If target is a directory get name of target file from
        # source filename
        tgt_file = PurePath(target).as_posix()
        if tgt_file == '.':
            tgt_file = Path(src_file).name
        else:
            tgt_infos = self._locate(target)
            if tgt_infos:
                if getattr(tgt_infos[-1], 'is_dir')():
                    tgt_path = PurePath(tgt_file, Path(src_file).name)
                    tgt_file = tgt_path.as_posix()

        # Check if target file already exists
        if self._locate(tgt_file):
            raise FileExistsError(
                f"workspace file '{tgt_file}' already exist.")

        # Read binary data from source file
        data = self._file.read(src_info, pwd=self._pwd)

        # Create ZipInfo for target file from source file info
        tgt_time = getattr(src_info, 'date_time')
        tgt_info = ZipInfo(filename=tgt_file, date_time=tgt_time) # type: ignore

        # Write binary data to target file
        # TODO ([email protected]): The zipfile standard module currently
        # does not support encryption in write mode. See:
        # https://docs.python.org/3/library/zipfile.html
        # When support is provided, the below line shall be replaced by:
        # self._file.writestr(tgt_info, data, pwd=self._pwd)
        self._file.writestr(tgt_info, data)
        self._changed = True

        # Check if new file exists
        return bool(self._locate(tgt_file))

    def move(self, source: PathLike, target: PathLike) -> bool:
        """Move file within workspace.

        Args:
            source: String or :term:`path-like object`, that points to a file in
                the directory structure of the workspace. If the file does not
                exist, a FileNotFoundError is raised. If the filepath points to
                a directory, an IsADirectoryError is raised.
            target: String or :term:`path-like object`, that points to a new
                filename or an existing directory in the directory structure of
                the workspace. If the target is a directory the target file
                consists of the directory and the basename of the source file.
                If the target file already exists a FileExistsError is raised.

        Returns:
            Boolean value which is True if the file has been moved.

        """
        # Copy source file to target file or directory
        # and on success remove source file
        return self.copy(source, target) and self.unlink(source)

    def append(self, source: PathLike, target: OptPathLike = None) -> bool:
        """Append file to the workspace.

        Args:
            source: String or :term:`path-like object`, that points to a valid
                file in the directory structure of the system. If the file does
                not exist, a FileNotFoundError is raised. If the filepath points
                to a directory, an IsADirectoryError is raised.
            target: String or :term:`path-like object`, that points to a valid
                directory in the directory structure of the workspace. By
                default the root directory is used. If the directory does not
                exist, a FileNotFoundError is raised. If the target directory
                already contains a file whose name equals the filename of the
                source, a FileExistsError is raised.

        Returns:
            Boolean value which is True if the file has been appended.

        """
        # Check source file
        src_file = env.expand(source)
        if not src_file.exists():
            raise FileNotFoundError(f"file '{src_file}' does not exist")
        if src_file.is_dir():
            raise IsADirectoryError(f"'{src_file}' is a directory not a file")

        # Check target directory
        if target:
            tgt_dir = PurePath(target).as_posix() + '/'
            if not self._locate(tgt_dir):
                raise FileNotFoundError(
                    f"workspace directory '{tgt_dir}' does not exist")
        else:
            tgt_dir = '.'
        tgt_file = Path(tgt_dir, src_file.name)
        if self._locate(tgt_file):
            raise FileExistsError(
                f"workspace directory '{tgt_dir}' already contains a file "
                f"with name '{src_file.name}'")

        # Create ZipInfo entry from source file
        filename = PurePath(tgt_file).as_posix()
        date_time = time.localtime(src_file.stat().st_mtime)[:6]
        zinfo = ZipInfo(filename=filename, date_time=date_time) # type: ignore

        # Copy file to archive
        with src_file.open('rb') as src:
            data = src.read()
        # TODO ([email protected]): The zipfile standard module currently
        # does not support encryption in write mode. See:
        # https://docs.python.org/3/library/zipfile.html
        # When support is provided, the below line shall be replaced by:
        # self._file.writestr(zinfo, data, pwd=pwd)
        self._file.writestr(zinfo, data)

        return True

    def read_text(self, filepath: PathLike, encoding: OptStr = None) -> str:
        """Read text from file.

        Args:
            filepath: String or :term:`path-like object`, that points to a valid
                file in the directory structure of the workspace. If the file
                does not exist a FileNotFoundError is raised.
            encoding: Specifies the name of the encoding, which is used to
                decode the stream’s bytes into strings. By default the preferred
                encoding of the operating system is used.

        Returns:
            Contents of the given filepath encoded as string.

        """
        with self.open(filepath, mode='r', encoding=encoding) as file:
            text = file.read()
        if not isinstance(text, str):
            return ''
        return text

    def read_bytes(self, filepath: PathLike) -> bytes:
        """Read bytes from file.

        Args:
            filepath: String or :term:`path-like object`, that points to a valid
                file in the directory structure of the workspace. If the file
                does not exist a FileNotFoundError is raised.

        Returns:
            Contents of the given filepath as bytes.

        """
        with self.open(filepath, mode='rb') as file:
            blob = file.read()
        if not isinstance(blob, bytes):
            return b''
        return blob

    def write_text(
            self, text: str, filepath: PathLike,
            encoding: OptStr = None) -> int:
        """Write text to file.

        Args:
            text: String, which has to be written to the given file.
            filepath: String or :term:`path-like object`, that represents a
                valid filename in the directory structure of the workspace.
            encoding: Specifies the name of the encoding, which is used to
                encode strings into bytes. By default the preferred encoding of
                the operating system is used.

        Returns:
            Number of characters, that are written to the file.

        """
        with self.open(filepath, mode='w', encoding=encoding) as file:
            if isinstance(file, TextIOBaseClass):
                return file.write(text)
        return 0

    def write_bytes(self, blob: BytesLike, filepath: PathLike) -> int:
        """Write bytes to file.

        Args:
            blob: Bytes, which are to be written to the given file.
            filepath: String or :term:`path-like object`, that represents a
                valid filename in the directory structure of the workspace.

        Returns:
            Number of bytes, that are written to the file.

        """
        with self.open(filepath, mode='wb') as file:
            if isinstance(file, BytesIOBaseClass):
                return file.write(blob)
        return 0

    def unlink(self, filepath: PathLike, ignore_missing: bool = True) -> bool:
        """Remove file from workspace.

        Args:
            filepath: String or :term:`path-like object`, that points to a file
                in the directory structure of the workspace. If the filepath
                points to a directory, an IsADirectoryError is raised. For the
                case, that the file does not exist, the argument ignore_missing
                determines, if a FileNotFoundError is raised.
            ignore_missing: Boolean value which determines whether a
                FileNotFoundError is raised if the target file does not exist.
                The default behaviour is to ignore missing files.

        Returns:
            Boolean value, which is True if the given file was removed.

        """
        matches = self._locate(filepath)
        if not matches:
            if ignore_missing:
                return True
            filename = PurePath(filepath).as_posix()
            raise FileNotFoundError(f"file '{filename}' does not exist")
        if getattr(matches[-1], 'is_dir')():
            dirname = PurePath(filepath).as_posix() + '/'
            raise IsADirectoryError(f"'{dirname}' is a directory not a file")
        return self._remove_members(matches)

    def mkdir(self, dirpath: PathLike, ignore_exists: bool = False) -> bool:
        """Create a new directory at the given path.

        Args:
            dirpath: String or :term:`path-like object`, that represents a valid
                directory name in the directory structure of the workspace. If
                the directory already exists, the argument ignore_exists
                determines, if a FileExistsError is raised.
            ignore_exists: Boolean value which determines whether a
                FileExistsError is raised if the target directory already
                exists. The default behaviour is to raise an error if the
                directory already exists.

        Returns:
            Boolean value, which is True if the given directory was created.

        """
        matches = self._locate(dirpath)
        if not matches:
            with self.open(dirpath, mode='w', is_dir=True):
                pass
        elif not ignore_exists:
            dirname = PurePath(dirpath).as_posix() + '/'
            raise FileExistsError(f"directory '{dirname}' already exists")
        return True

    def rmdir(
            self, dirpath: PathLike, recursive: bool = False,
            ignore_missing: bool = False) -> bool:
        """Remove directory from workspace.

        Args:
            dirpath: String or :term:`path-like object`, that points to a
                directory in the directory structure of the workspace. If the
                directory does not exist, the argument ignore_missing
                determines, if a FileNotFoundError is raised.
            ignore_missing: Boolean value which determines whether a
                FileNotFoundError is raised if the target directory does not
                exist. The default behaviour is to raise an error if the
                directory is missing.
            recursive: Boolean value which determines whether directories are
                removed recursively. If recursive is False, then only empty
                directories can be removed. If recursive, however, is True, then
                all files and subdirectories are also removed. By default
                recursive is False.

        Returns:
            Boolean value, which is True if the given directory was removed.

        """
        matches = self._locate(dirpath)
        dirname = PurePath(dirpath).as_posix() + '/'
        if not matches:
            if ignore_missing:
                return True
            raise FileNotFoundError(f"directory '{dirname}' does not exist")
        files = self.search(dirname + '*')
        if not files:
            return self._remove_members(matches)
        if not recursive:
            raise DirNotEmptyError(f"directory '{dirname}' is not empty")
        allmatches = matches
        for file in files:
            allmatches += self._locate(file)
        return self._remove_members(allmatches)

    def search(self, pattern: OptStr = None) -> StrList:
        """Search for files in the workspace.

        Args:
            pattern: Search pattern that contains Unix shell-style wildcards:
                '*': Matches arbitrary strings
                '?': Matches single characters
                [seq]: Matches any character in seq
                [!seq]: Matches any character not in seq
                By default a list of all files and directories is returned.

        Returns:
            List of files and directories in the directory structure of the
            workspace, that match the search pattern.

        """
        # Get list of normalized unique paths of workspace members
        paths: PathLikeList = []
        for zinfo in self._file.infolist():
            path = PurePath(zinfo.filename).as_posix()
            if getattr(zinfo, 'is_dir')():
                path += '/'
            if path not in paths:
                paths.append(path)

        # Match path list with given pattern
        if pattern:
            paths = env.match_paths(paths, pattern)

        # Sort paths
        return sorted([str(path) for path in paths])

    #
    # Protected Methods
    #

    def _create_new(self) -> None:
        # Initialize instance variables, buffer and buffered ZipFile
        self._set_attr_values(self._default_config['dc'], group='dc')
        self._path = None
        self._changed = False
        self._pwd = None
        self._buffer = BytesIO()
        self._file = ZipFile(self._buffer, mode='w')

        # Create folders
        for folder in self._default_dir_layout:
            self.mkdir(folder)

    def _open_read(self, path: PathLike) -> BytesIOLike:
        # Locate workspace member by its path
        # and open file handler for reading the file
        matches = self._locate(path)
        if not matches:
            fname = PurePath(path).as_posix()
            raise FileNotFoundError(
                f"workspace member with filename '{fname}' does not exist")
        # Select latest version of file
        zinfo = matches[-1]
        return self._file.open(zinfo, pwd=self._pwd, mode='r')

    def _open_write(self, path: PathLike, is_dir: bool = False) -> BytesIOLike:
        # Determine workspace member name from path
        # and get ZipInfo with local time as date_time
        filename = PurePath(path).as_posix()
        if is_dir:
            filename += '/'
        zinfo = ZipInfo( # type: ignore
            filename=filename,
            date_time=time.localtime()[:6])
        # Catch Warning for duplicate files
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            # TODO ([email protected]): The zipfile standard
            # module currently does not support encryption in write
            # mode of new ZipFiles. See:
            # https://docs.python.org/3/library/zipfile.html
            # When support is provided, the below line for writing
            # files shall be replaced by:
            # file = self._file.open(zinfo, mode='w', pwd=self._pwd)
            file = self._file.open(zinfo, mode='w')
        self._changed = True
        return file

    def _locate(self, path: PathLike, sort: bool = True) -> ZipInfoList:
        # Get list of member zipinfos
        zinfos = self._file.infolist()
        # Match members by path-like filenames
        matches = [i for i in zinfos if Path(i.filename) == Path(path)]
        if sort:
            # Sort matches by datetime
            matches = sorted(matches, key=lambda i: i.date_time)
        # Return sorted matches
        return matches

    def _get_name(self) -> OptStr:
        return getattr(self._path, 'stem', None)

    def _get_path(self) -> OptPath:
        return self._path

    def _get_changed(self) -> bool:
        return self._changed

    def _get_folders(self) -> StrList:
        names: StrList = []
        for zinfo in self._file.infolist():
            if getattr(zinfo, 'is_dir')():
                name = PurePath(zinfo.filename).as_posix() + '/'
                names.append(name)
        return sorted(names)

    def _remove_members(self, zinfos: ZipInfoList) -> bool:
        # Return True if list of members is empty
        if not zinfos:
            return True

        # Remove entries in the list of members from workspace
        new_zinfos = []
        zids = [(zinfo.filename, zinfo.date_time) for zinfo in zinfos]
        for zinfo in self._file.infolist():
            zid = (zinfo.filename, zinfo.date_time)
            if zid in zids:
                zids.remove(zid)
            else:
                new_zinfos.append(zinfo)

        # If any entry on the list could not be found raise an error
        if zids:
            names = [zid[0] for zid in zids]
            raise FileNotFoundError(
                f"could not locate workspace members: {names}")

        # Create new ZipArchive in Memory
        new_buffer = BytesIO()
        new_file = ZipFile(new_buffer, mode='w')

        # Copy all workspace members on the new list from current
        # to new workspace
        for zinfo in new_zinfos:
            data = self._file.read(zinfo, pwd=self._pwd)
            new_file.writestr(zinfo, data)

        # Close current workspace and buffer and link new workspace and buffer
        self._file.close()
        self._buffer.close()
        self._buffer = new_buffer
        self._file = new_file
        self._changed = True

        return True

    def _remove_duplicates(self) -> bool:
        # Get list of duplicates
        zinfos: ZipInfoList = []
        for filename in self.files:
            zinfos += self._locate(filename, sort=True)[:-1]

        # Remove duplicates
        return self._remove_members(zinfos)
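# A minimal, standalone sketch (not part of the class above) of the rewrite
# technique behind _remove_members(): the zipfile module cannot delete entries
# in place, so the surviving members are copied into a fresh in-memory archive.
# All names below are illustrative assumptions.
from io import BytesIO
from zipfile import ZipFile

def remove_members(buffer, unwanted):
    """Return a new in-memory ZIP without the members named in 'unwanted'."""
    new_buffer = BytesIO()
    with ZipFile(buffer, mode='r') as old, ZipFile(new_buffer, mode='w') as new:
        for zinfo in old.infolist():
            if zinfo.filename not in unwanted:
                # Reusing the original ZipInfo preserves date_time and flags
                new.writestr(zinfo, old.read(zinfo))
    new_buffer.seek(0)
    return new_buffer

# Usage: build a small archive, then drop one member.
buf = BytesIO()
with ZipFile(buf, mode='w') as zf:
    zf.writestr('keep.txt', b'kept')
    zf.writestr('drop.txt', b'dropped')
buf.seek(0)
cleaned = remove_members(buf, {'drop.txt'})
with ZipFile(cleaned) as zf:
    print(zf.namelist())  # ['keep.txt']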
Beispiel #33
0
import io
import os
import requests
import numpy as np
import tensorflow as tf
from zipfile import ZipFile

data_dir = 'data/'
data_file = 'spam.txt'

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

if not os.path.isfile(os.path.join(data_dir, data_file)):
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
    r = requests.get(zip_url)
    z = ZipFile(io.BytesIO(r.content))
    file = z.read('SMSSpamCollection')
    text_data = file.decode()
    text_data = text_data.encode('ascii', errors='ignore')
    text_data = text_data.decode().split('\n')

    with open(os.path.join(data_dir, data_file), 'w') as file_conn:
        for text in text_data:
            file_conn.write("{}\n".format(text))

else:
    text_data = []
    with open(os.path.join(data_dir, data_file), 'r') as file_conn:
        for row in file_conn:
            text_data.append(row)
    text_data = text_data[:-1]
Beispiel #34
0
def get_lines_from_zip(zip_str):
    zip_file = ZipFile(BytesIO(zip_str))
    for name in zip_file.namelist():
        for line in zip_file.read(name).decode('utf-8').splitlines():
            yield line
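# A small usage sketch for get_lines_from_zip() above: build a ZIP blob in
# memory and iterate every text line of all its members. Member names and
# contents are illustrative assumptions.
from io import BytesIO
from zipfile import ZipFile

_buf = BytesIO()
with ZipFile(_buf, mode='w') as _zf:
    _zf.writestr('a.txt', 'first\nsecond\n')
    _zf.writestr('b.txt', 'third\n')

for line in get_lines_from_zip(_buf.getvalue()):
    print(line)  # first, second, third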
Beispiel #35
0
    def import_from_json(cls, campaign, batch_user, batch_data, max_count):
        """
        Creates new DataAssessmentTask instances based on JSON input.
        """
        batch_meta = batch_data.metadata
        batch_name = batch_data.dataFile.name
        batch_file = batch_data.dataFile
        batch_json = None

        if batch_name.endswith('.zip'):
            if not is_zipfile(batch_file):
                _msg = 'Batch {0} not a valid ZIP archive'.format(batch_name)
                LOGGER.warn(_msg)
                return

            batch_zip = ZipFile(batch_file)
            batch_json_files = [
                x for x in batch_zip.namelist() if x.endswith('.json')
            ]
            # TODO: implement proper support for multiple json files in archive.
            for batch_json_file in batch_json_files:
                batch_content = batch_zip.read(batch_json_file).decode('utf-8')
                batch_json = loads(batch_content, encoding='utf-8')

        else:
            batch_json = loads(str(batch_file.read(), encoding="utf-8"))

        from datetime import datetime
        t1 = datetime.now()

        current_count = 0
        max_length_id = 0
        max_length_text = 0
        for batch_task in batch_json:
            if max_count > 0 and current_count >= max_count:
                _msg = 'Stopping after max_count={0} iterations'.format(
                    max_count)
                LOGGER.info(_msg)
                print(_msg)

                t2 = datetime.now()
                print(t2 - t1)
                return

            print('Batch name/no:', batch_name, batch_task['task']['batchNo'])

            new_items = []
            for item in batch_task['items']:
                current_length_id = len(item['targetID'])
                current_length_text = len(item['targetText'])

                if current_length_id > max_length_id:
                    print('Longest target ID', current_length_id,
                          item['targetID'])
                    max_length_id = current_length_id

                if current_length_text > max_length_text:
                    print('Longest targetText', current_length_text,
                          item['targetText'].encode('utf-8'))
                    max_length_text = current_length_text

                new_item = TextPairWithDomain(
                    sourceID=item['sourceID'],
                    sourceText=item['sourceText'],
                    targetID=item['targetID'],
                    targetText=item['targetText'],
                    createdBy=batch_user,
                    itemID=item['itemID'],
                    itemType=item['itemType'],
                    documentDomain=item['documentDomain'],
                    sourceURL=item['sourceURL'],
                    targetURL=item['targetURL'])
                new_items.append(new_item)

            if not len(new_items) == 100:
                _msg = 'Expected 100 items for task but found {0}'.format(
                    len(new_items))
                LOGGER.warn(_msg)
                print(_msg)
                continue

            current_count += 1

            #for new_item in new_items:
            #    new_item.metadata = batch_meta
            #    new_item.save()
            batch_meta.textpair_set.add(*new_items, bulk=False)
            batch_meta.save()

            new_task = DataAssessmentTask(
                campaign=campaign,
                requiredAnnotations=batch_task['task']['requiredAnnotations'],
                batchNo=batch_task['task']['batchNo'],
                batchData=batch_data,
                createdBy=batch_user,
            )
            new_task.save()

            #for new_item in new_items:
            #    new_task.items.add(new_item)
            new_task.items.add(*new_items)
            new_task.save()

            _msg = 'Success processing batch {0}, task {1}'.format(
                str(batch_data), batch_task['task']['batchNo'])
            LOGGER.info(_msg)
            print(_msg)

        _msg = 'Max length ID={0}, text={1}'.format(max_length_id,
                                                    max_length_text)
        LOGGER.info(_msg)
        print(_msg)

        t2 = datetime.now()
        print(t2 - t1)
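# A hedged, standalone sketch of the batch-loading branch above: read JSON
# either from a .json member of a ZIP archive or from a plain file object,
# mirroring import_from_json() with only the standard library. The sample
# payload is an illustrative assumption.
import json
from io import BytesIO
from zipfile import ZipFile, is_zipfile

def load_batch_json(fileobj):
    """Return the decoded JSON payload from a ZIP member or a plain file object."""
    if is_zipfile(fileobj):
        fileobj.seek(0)
        with ZipFile(fileobj) as zf:
            json_names = [n for n in zf.namelist() if n.endswith('.json')]
            # As in the loop above, only the last JSON member is used.
            return json.loads(zf.read(json_names[-1]).decode('utf-8'))
    fileobj.seek(0)
    return json.loads(fileobj.read().decode('utf-8'))

# Usage with an in-memory ZIP containing a single JSON member.
_buf = BytesIO()
with ZipFile(_buf, mode='w') as _zf:
    _zf.writestr('batch.json', json.dumps([{'task': {'batchNo': 1}, 'items': []}]))
_buf.seek(0)
print(load_batch_json(_buf))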
Beispiel #36
0
    xlsm_file = sys.argv[1]
else:
    print("\nUtility to extract a vbaProject.bin binary from an Excel 2007+ "
          "xlsm macro file for insertion into an XlsxWriter file."
          "\n"
          "See: https://xlsxwriter.readthedocs.io/working_with_macros.html\n"
          "\n"
          "Usage: vba_extract file.xlsm\n")
    exit()

try:
    # Open the Excel xlsm file as a zip file.
    xlsm_zip = ZipFile(xlsm_file, 'r')

    # Read the xl/vbaProject.bin file.
    vba_data = xlsm_zip.read('xl/' + vba_filename)

    # Write the vba data to a local file.
    vba_file = open(vba_filename, "wb")
    vba_file.write(vba_data)
    vba_file.close()

except IOError:
    # Use exc_info() for Python 2.5+ compatibility.
    e = sys.exc_info()[1]
    print("File error: %s" % str(e))
    exit()

except KeyError:
    # Usually when there isn't a xl/vbaProject.bin member in the file.
    e = sys.exc_info()[1]
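# A minimal alternative sketch of the same extraction using context managers so
# the archive and the output file are always closed; the file and member names
# are assumptions mirroring the script above.
from zipfile import ZipFile

def extract_vba(xlsm_path='file.xlsm', member='xl/vbaProject.bin',
                out_path='vbaProject.bin'):
    with ZipFile(xlsm_path, 'r') as xlsm_zip:
        data = xlsm_zip.read(member)
    with open(out_path, 'wb') as out:
        out.write(data)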
Beispiel #37
0
    def perform(
            self):  # The function that will be executed must have this name

        # Accessing system location settings
        #lat = self.settings.location.latitude
        log.info("Hello KMZ")
        # Other location settings
        #self.zip
        #self.name
        #self.state
        #self.latitude
        #self.longitude
        #self.address
        #self.elevation
        #self.gmtOffset
        #self.dstOffset
        #self.stationID
        #self.stationName
        #self.et0Average

        station = self.params.get("station", None)
        if station is None or station == "":
            station = "K4086"
            log.debug("No station set, using (%s)" % station)

        url = "https://opendata.dwd.de/weather/local_forecasts/mos/MOSMIX_L/single_stations/" + str(
            station) + "/kml/MOSMIX_L_LATEST_" + str(station) + ".kmz"

        URLParams = [("User-Agent", "RainMachine v2")]

        try:
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            raw = response.read()

            zipFile = ZipFile(StringIO.StringIO(raw))
            kml = zipFile.read(zipFile.filelist[0])

            rootNode = ET.fromstring(kml)

            nameSpaces = {
                'dwd':
                "https://opendata.dwd.de/weather/lib/pointforecast_dwd_extension_V1_0.xsd",
                'gx': "http://www.google.com/kml/ext/2.2",
                'xal': "urn:oasis:names:tc:ciq:xsdschema:xAL:2.0",
                'kml': "http://www.opengis.net/kml/2.2",
                'atom': "http://www.w3.org/2005/Atom"
            }

            timeStampsNode = rootNode.findall(
                "./kml:Document/kml:ExtendedData/dwd:ProductDefinition/dwd:ForecastTimeSteps/",
                nameSpaces)
            extendedDataNode = rootNode.findall(
                "./kml:Document/kml:Placemark/kml:ExtendedData/", nameSpaces)

            nowTimeStamp = rmCurrentTimestamp()
            skipColumens = 0

            # Parse Timestamps
            timeStampList = []
            for ts in timeStampsNode:
                compatibleString = re.sub(r"\.\d+Z$", '', ts.text)
                unix = rmTimestampFromDateAsString(compatibleString,
                                                   "%Y-%m-%dT%H:%M:%S")
                #ts = datetime.datetime.strptime(compatibleString, "%Y-%m-%dT%H:%M:%S")
                if (unix < nowTimeStamp):
                    skipColumens += 1
                    continue
                timeStampList.append(unix)

            dwdData = []
            parsedData = DWDData()
            for data in extendedDataNode:
                currentCol = 0
                for k, v in data.attrib.items():
                    if k.endswith("elementName"):
                        valueNode = data.find("./dwd:value", nameSpaces)
                        if valueNode == None:
                            continue
                        allValues = valueNode.text.split()
                        if skipColumens > 0:
                            rawValues = allValues[skipColumens:]
                        else:
                            rawValues = allValues
                        if len(rawValues) != len(timeStampList):
                            continue
                        # Temperature
                        if v.lower() == "TTT".lower():
                            parsedData.Temperature = parseFloats(
                                rawValues, timeStampList,
                                temperatureTransformation)
                            continue
                        # Min Temperature
                        if v.lower() == "TN".lower():
                            parsedData.MinTemp = parseFloats(
                                rawValues, timeStampList,
                                temperatureTransformation)
                            continue
                        # Max Temperature
                        if v.lower() == "TX".lower():
                            parsedData.MaxTemp = parseFloats(
                                rawValues, timeStampList,
                                temperatureTransformation)
                            continue
                        # Probability of precipitation > 0.0mm during the last hour
                        if v.lower() == "wwP".lower():
                            parsedData.POP = parseFloats(
                                rawValues, timeStampList)
                            continue
                        # Wind
                        if v.lower() == "FF".lower():
                            parsedData.Wind = parseFloats(
                                rawValues, timeStampList)
                            continue
                        # Solar Radiation
                        if v.lower() == "Rad1h".lower():
                            parsedData.SolarRadiation = parseFloats(
                                rawValues, timeStampList,
                                pressureTransformation)
                            continue
                        # Cloud
                        if v.lower() == "Neff".lower():
                            parsedData.SkyCover = parseFloats(
                                rawValues, timeStampList, skyCoverTransform)
                            continue
                        # QPF
                        if v.lower() == "RRdc".lower():
                            parsedData.QPF = parseFloats(
                                rawValues, timeStampList, None, yesterday)
                            continue
                        # evapotranspiration
                        if v.lower() == "PEvap".lower():
                            parsedData.ET0 = parseFloats(
                                rawValues, timeStampList, None, yesterday)
                            continue
                        # Pressure
                        if v.lower() == "PPPP".lower():
                            parsedData.Pressure = parseFloats(
                                rawValues, timeStampList,
                                pressureTransformation)
                            continue
                        # Dewpoint
                        if v.lower() == "Td".lower():
                            parsedData.DewPoint = parseFloats(
                                rawValues, timeStampList,
                                temperatureTransformation)
                            continue
                        # Condition
                        if v.lower() == "WPcd1".lower():
                            parsedData.Condition = parseFloats(
                                rawValues, timeStampList, conditionParser,
                                yesterday)
                            continue

            log.info("Adding parsed values to database")
            if parsedData.Temperature != None:
                log.debug("Adding Temparatures values")
                self.addValues(RMParser.dataType.TEMPERATURE,
                               parsedData.Temperature)
            if parsedData.MinTemp != None:
                log.debug("Adding Min-Temparatures values")
                self.addValues(RMParser.dataType.MINTEMP, parsedData.MinTemp)
            if parsedData.MaxTemp != None:
                log.debug("Adding Max-Temparatures values")
                self.addValues(RMParser.dataType.MAXTEMP, parsedData.MaxTemp)
            if parsedData.RH != None:
                log.debug("Adding RH values")
                self.addValues(RMParser.dataType.RH, parsedData.RH)
            if parsedData.Wind != None:
                log.debug("Adding Wind values")
                self.addValues(RMParser.dataType.WIND, parsedData.Wind)
            if parsedData.SolarRadiation != None:
                log.debug("Adding Solar Radiation values")
                self.addValues(RMParser.dataType.SOLARRADIATION,
                               parsedData.SolarRadiation)
            if parsedData.SkyCover != None:
                log.debug("Adding SkyCover values")
                self.addValues(RMParser.dataType.SKYCOVER, parsedData.SkyCover)
            if parsedData.QPF != None:
                log.debug("Adding QPF values")
                self.addValues(RMParser.dataType.QPF, parsedData.QPF)
            if parsedData.ET0 != None:
                log.debug("Adding ET0 values")
                #self.addValues(RMParser.dataType.ET0, parsedData.ET0)
            if parsedData.POP != None:
                log.debug("Adding POP values")
                self.addValues(RMParser.dataType.POP, parsedData.POP)
            if parsedData.Pressure != None:
                log.debug("Adding Pressure values")
                self.addValues(RMParser.dataType.PRESSURE, parsedData.Pressure)
            if parsedData.DewPoint != None:
                log.debug("Adding DewPoint values")
                self.addValues(RMParser.dataType.DEWPOINT, parsedData.DewPoint)
            if parsedData.Condition != None:
                self.addValues(RMParser.dataType.CONDITION,
                               parsedData.Condition)

        except Exception, e:
            log.error("*** Error running DWD parser")
            log.exception(e)
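# A self-contained sketch of the KMZ handling above: a KMZ is a ZIP whose first
# member is a KML (XML) document. Here a tiny KML is zipped in memory, read back
# via ZipFile and parsed with ElementTree using an explicit namespace map; the
# DWD extension namespaces are omitted for brevity.
import xml.etree.ElementTree as ET
from io import BytesIO
from zipfile import ZipFile

kml_text = (
    '<kml xmlns="http://www.opengis.net/kml/2.2">'
    '<Document><name>demo</name></Document></kml>'
)
buf = BytesIO()
with ZipFile(buf, mode='w') as zf:
    zf.writestr('doc.kml', kml_text)
buf.seek(0)

with ZipFile(buf) as kmz:
    kml = kmz.read(kmz.infolist()[0])

root = ET.fromstring(kml)
ns = {'kml': 'http://www.opengis.net/kml/2.2'}
print(root.find('./kml:Document/kml:name', ns).text)  # demo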
Beispiel #38
0
class Epub(object):
    def __init__(self, filename=None):
        self.filename = filename

    def zip_get_name(self, name_searched):
        for name in self._zipfile.namelist():
            if name == name_searched:
                return name
        return None

    @property
    def filename(self):
        return self._filename

    @filename.setter
    def filename(self, filename):
        self._filename = filename
        self.ncx = self.content = self._zipfile = None
        if self._filename:
            self._zipfile = ZipFile(self._filename)
            self.container = Container(self._zipfile.read(CONTAINER_NAME))
            self.content = Content(self._zipfile.read(self.content_filename), file_url=self.content_filename)
            self.ncx = Ncx(self._zipfile.read(self.content.ncx_item.url))
            # Harvest URLs from parsable files
            self.urls_used_into_id = {}
            for name in self._zipfile.infolist():
                data = self._zipfile.read(name)
                url = str(name.filename)
                if url in self.content.urls_by_id:
                    item = self.content.manifest[self.content.urls_by_id[url]]
                    if item:
                        urls = set()
                        if item.parsable():
                            parser = URLLister(parent_path=url)
                            try:
                                parser.feed(data)
                            except UnicodeDecodeError:
                                continue
                            for url_found in parser.urls:
                                urls.add(url_found)
                        self.urls_used_into_id[item.id] = urls

    @property
    def zipfile(self):
        return self._zipfile

    @property
    def container_filename(self):
        name = self.zip_get_name(CONTAINER_NAME)
        if name:
            return name
        raise epexc.ContainerFileNotFound(name)

    @property
    def content_filename(self):
        name = self.zip_get_name(self.container.rootfile)
        if name:
            return name
        raise epexc.ContentFileNotFound(name)

    def create_preview(self, preview_filename, spine_preview, missing_page=None, overwrite=False):
        """ Create a preview, writing filename, with only spine elements,
        with an optional missing_page for missing links and return an ePub """
        for item in spine_preview:
            if not item in self.content.spine:
                raise epexc.ElementNotInSpine(item)
        # Check preview_filename
        if os.path.exists(preview_filename) and not overwrite:
            raise epexc.PreviewAlreadyExists(preview_filename)

        # Spine IDs and URLs removed from preview
        spine_ids_removed = set(id for id in self.content.spine if id not in spine_preview)
        urls_to_be_removed = set(self.content.manifest[id].url for id in spine_ids_removed)

        # Recursively check URLs used (content urls are always included)
        used_urls = set(self.content.metadata_content_urls)
        exploring_ids = deque(spine_preview)
        explored_ids = set()
        while exploring_ids:
            id = exploring_ids.popleft()
            # Explored id only if is internal, not explored and not removed
            if id in self.content.manifest and not id in explored_ids and not id in spine_ids_removed:
                used_urls.add(self.content.manifest[id].url)
                for url in self.urls_used_into_id[id]:
                    if url in self.content.urls_by_id and not url in urls_to_be_removed:
                        used_urls.add(url)
                        exploring_ids.append(self.content.urls_by_id[url])
            explored_ids.add(id)

        # Check every url in manifest if not used
        for id, item in self.content.manifest.items():
            if not item.url in used_urls:
                urls_to_be_removed.add(item.url)

        # Write preview epub
        zip_out = ZipFile(preview_filename, mode='w')
        for name in self._zipfile.infolist():
            url = name.filename
            parent_path_parts = url.split("/")[:-1]
            insert_file = False
            if (not url in urls_to_be_removed):
                if url in (self.content_filename, CONTAINER_NAME, MIMETYPE_NAME):
                    insert_file = True
                elif url in self.content.urls_by_id:
                    insert_file = True
            if insert_file:
                data = self._zipfile.read(name)
                if url == self.content_filename:
                    dom = xml.dom.minidom.parseString(data)
                    # Process content manifest
                    for item in dom.getElementsByTagNameNS("*", "manifest")[0].getElementsByTagNameNS("*", "item"):
                        url = absolutize_url(item.getAttribute("href"), parent_path_parts)
                        if url in urls_to_be_removed:
                            try:
                                item.parentNode.removeChild(item)
                            except xml.dom.NotFoundErr:
                                pass
                    # Process content spine
                    for itemref in dom.getElementsByTagNameNS("*", "spine")[0].getElementsByTagNameNS("*", "itemref"):
                        url = self.content.manifest[itemref.getAttribute("idref")].url
                        if url in urls_to_be_removed:
                            try:
                                itemref.parentNode.removeChild(itemref)
                            except xml.dom.NotFoundErr:
                                pass
                    data = dom.toxml(dom.encoding)
                elif url in self.content.urls_by_id:
                    item = self.content.manifest[self.content.urls_by_id[url]]
                    if item == self.content.ncx_item:
                        # Process toc
                        dom = xml.dom.minidom.parseString(data)
                        for navPoint in dom.getElementsByTagNameNS("*", "navPoint"):
                            for node in navPoint.childNodes:
                                if node.nodeType == node.ELEMENT_NODE and node.nodeName == "content":
                                    url = absolutize_url(node.getAttribute("src"), parent_path_parts)
                                    if url in urls_to_be_removed:
                                        try:
                                            navPoint.parentNode.removeChild(navPoint)
                                        except xml.dom.NotFoundErr:
                                            pass
                        for cont, navPoint in enumerate(dom.getElementsByTagNameNS("*", "navPoint")):
                            playOrder = navPoint.getAttribute("playOrder")
                            if playOrder != str(cont + 1):
                                playOrder = navPoint.setAttribute("playOrder", str(cont + 1))
                        data = dom.toxml(dom.encoding)
                    elif item.parsable():
                        # Process generic html/xml
                        dom = xml.dom.minidom.parseString(data)
                        parent_path_parts = url.split("/")[:-1]
                        for tagname, attr in (("content", "src"), ("img", "src"), ("link", "href"), ("a", "href")):
                            for node in dom.getElementsByTagNameNS("*", tagname):
                                if node.nodeType == node.ELEMENT_NODE:
                                    url = absolutize_url(node.getAttribute(attr), parent_path_parts)
                                    if url in urls_to_be_removed:
                                        try:
                                            node.parentNode.removeChild(node)
                                        except xml.dom.NotFoundErr:
                                            pass
                        data = dom.toxml(dom.encoding)
                zip_out.writestr(copy.deepcopy(name), data)
        zip_out.close()
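# A small standalone sketch of the DOM-pruning step used in create_preview():
# parse an OPF-like manifest with minidom and drop <item> elements whose href
# points at a removed file. The sample XML and the removal set are assumptions.
import xml.dom.minidom

opf = (
    '<package xmlns="http://www.idpf.org/2007/opf">'
    '<manifest>'
    '<item id="c1" href="chapter1.xhtml"/>'
    '<item id="c2" href="chapter2.xhtml"/>'
    '</manifest></package>'
)
urls_to_be_removed = {'chapter2.xhtml'}

dom = xml.dom.minidom.parseString(opf)
manifest = dom.getElementsByTagNameNS("*", "manifest")[0]
for item in list(manifest.getElementsByTagNameNS("*", "item")):
    if item.getAttribute("href") in urls_to_be_removed:
        item.parentNode.removeChild(item)
print(dom.toxml())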
Beispiel #39
0
import io
import pandas as pd
from zipfile import ZipFile

def load_csv_files(zip_file: ZipFile, files: list) -> pd.DataFrame:
    frames = []
    for file in files:
        csv = io.BytesIO(zip_file.read(file))
        frames.append(pd.read_csv(csv))
    return pd.concat(frames) if frames else pd.DataFrame()
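# A usage sketch for load_csv_files() above, assuming pandas is installed:
# build an in-memory ZIP holding two CSV members and concatenate them into a
# single DataFrame. File names and contents are illustrative assumptions.
from io import BytesIO

_buf = BytesIO()
with ZipFile(_buf, mode='w') as _zf:
    _zf.writestr('a.csv', 'x,y\n1,2\n')
    _zf.writestr('b.csv', 'x,y\n3,4\n')
_buf.seek(0)

with ZipFile(_buf) as _zf:
    frame = load_csv_files(_zf, ['a.csv', 'b.csv'])
print(frame)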
Beispiel #40
0
import io
import urllib.request
from zipfile import ZipFile
import pandas as pd
import mysql.connector
from datetime import date,datetime,timedelta

    #Download the zip file
dataset_url = "http://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/datos_abiertos_covid19.zip"
response = urllib.request.urlopen(dataset_url)
data_zip = response.read()
data_zip = io.BytesIO(data_zip)
data_zip = ZipFile(file = data_zip)

    #Extract the csv file
csv_filename = data_zip.namelist()[0]
data_csv = data_zip.read(csv_filename)
data_csv = io.BytesIO(data_csv)

    #Read the csv (in chunks because the csv is large)
chunks = pd.read_csv(data_csv, encoding='ANSI', chunksize=100000, low_memory=False)

    #Clases for data

class StatsPerAges:
    cases = 0
    deaths = 0
    recovered = 0
    mortality = 0
    c_diabetes = 0
    d_diabetes = 0
    m_diabetes = 0
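# A self-contained sketch of the chunked-reading pattern above, assuming pandas:
# read a CSV in fixed-size chunks and aggregate per chunk instead of loading the
# whole file at once. The inline CSV data is an illustrative assumption.
import io as _io
import pandas as _pd

_csv = _io.StringIO('id,value\n' + '\n'.join('%d,%d' % (i, i * 2) for i in range(10)))
_total = 0
for _chunk in _pd.read_csv(_csv, chunksize=4):
    _total += len(_chunk)
print(_total)  # 10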
Beispiel #41
0
class DarFile:
    """
    Provides access to the contents of a .dar file.
    """
    def __init__(self, dar_path: 'Union[str, Path, BinaryIO]'):
        if isinstance(dar_path, (str, Path)):
            self.dar_path = pathify(dar_path)
            self.dar_contents = ZipFile(str(self.dar_path))
        else:
            self.dar_path = None
            self.dar_contents = ZipFile(dar_path)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        self.dar_contents.close()

    def read_metadata(self) -> 'PackageStore':
        from ..model.types_store import PackageStore
        store = PackageStore.empty()
        dalf_names = self.get_dalf_names()
        for dalf_name in dalf_names:
            contents = self.dar_contents.read(dalf_name)
            store.register_all(parse_dalf(contents))
        return store

    def get_archives(self) -> 'Mapping[str, bytes]':
        """
        Return a mapping from package ID to byte contents.
        """
        from ..protocols.v1.pb_parse_metadata import parse_archive_payload
        archives = {}  # type: Dict[str, bytes]
        for dalf_name in self.get_dalf_names():
            contents = self.dar_contents.read(dalf_name)
            payload = parse_archive_payload(contents)
            archives[payload.hash] = contents
        return archives

    def get_dalf_names(self) -> 'Sequence[str]':
        dalf_names = []
        for name in self.dar_contents.namelist():
            _, ext = path.splitext(name)
            if ext == '.dalf':
                dalf_names.append(name)
        return dalf_names

    def get_manifest(self) -> 'Optional[Mapping[str, str]]':
        """
        Return the contents of the manifest of this DAR.
        :return:
        """
        names = self.dar_contents.namelist()
        if 'META-INF/MANIFEST.MF' in names:
            manifest_bytes = self.dar_contents.read('META-INF/MANIFEST.MF')
            manifest = {}
            for line in manifest_bytes.decode('utf-8').splitlines():
                print(line)
                name, _, value = line.partition(':')
                manifest[name] = value.strip()
            return manifest
        else:
            return None

    def get_sdk_version(self) -> 'Optional[str]':
        """
        Return the SDK version used to compile this dar (if this information is available).
        """
        manifest = self.get_manifest()
        return manifest.get('Sdk-Version') if manifest is not None else None

    def get_package_provider(self) -> 'PackageProvider':
        from typing import Dict
        from ..model.types_store import MemoryPackageProvider
        from .._gen.com.digitalasset.daml_lf_dev.daml_lf_pb2 import Archive

        packages = {}  # type: Dict[str, bytes]
        dalf_names = self.get_dalf_names()
        for dalf_name in dalf_names:
            contents = self.dar_contents.read(dalf_name)

            a = Archive()
            a.ParseFromString(contents)

            packages[a.hash] = a.payload

        return MemoryPackageProvider(packages)
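# A standalone sketch of the MANIFEST.MF parsing used by get_manifest(): split
# each "Name: Value" line on the first colon with str.partition(). The sample
# manifest text is an assumption.
manifest_bytes = b"Manifest-Version: 1.0\nSdk-Version: 1.3.0\nMain-Dalf: example.dalf\n"
manifest = {}
for line in manifest_bytes.decode('utf-8').splitlines():
    name, _, value = line.partition(':')
    if name:
        manifest[name] = value.strip()
print(manifest.get('Sdk-Version'))  # 1.3.0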
Beispiel #42
0
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
print(len(mnist.train.images))
print(len(mnist.test.images))
print(len(mnist.validation.images))
print(mnist.train.labels[1, :])

# Ham/Spam Text Data
import requests
import io
from zipfile import ZipFile

# Get/read zip file
zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
r = requests.get(zip_url)
z = ZipFile(io.BytesIO(r.content))
file = z.read('SMSSpamCollection')
# Format Data
text_data = file.decode()
text_data = text_data.encode('ascii', errors='ignore')
text_data = text_data.decode().split('\n')
text_data = [x.split('\t') for x in text_data if len(x) >= 1]
[text_data_target, text_data_train] = [list(x) for x in zip(*text_data)]
print(len(text_data_train))
print(set(text_data_target))
print(text_data_train[1])

# Movie Review Data
import requests
import io
import tarfile
Beispiel #43
0
#!/usr/bin/env python

from zipfile import ZipFile, ZIP_DEFLATED
import os.path

# reading & extracting
rzip = ZipFile("DATA/textfiles.zip")  # <1>
print(rzip.namelist())  # <2>
ty = rzip.read('tyger.txt').decode()  # <3>
print(ty[:50])
rzip.extract('parrot.txt')  # <4>

# creating a zip file
wzip = ZipFile("example.zip", mode="w", compression=ZIP_DEFLATED)  # <5>
for base in "parrot tyger knights alice poe_sonnet spam".split():
    filename = os.path.join("DATA", base + '.txt')
    print("adding {} as {}".format(filename, base + '.txt'))
    wzip.write(filename, base + '.txt')  # <6>
wzip.close()
Beispiel #44
0
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup

wordFile = urlopen("http://pythonscraping.com/pages/AWordDocument.docx").read()
wordFile = BytesIO(wordFile)
document = ZipFile(wordFile)
xml_content = document.read('word/document.xml')

wordObj = BeautifulSoup(xml_content.decode('utf-8'))
textStrings = wordObj.findAll("w:t")
for textElem in textStrings:
    print(textElem.text)
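# An alternative, hedged sketch of the .docx text extraction above using only
# the standard library: word/document.xml is plain XML, so the w:t runs can also
# be collected with ElementTree instead of BeautifulSoup.
import xml.etree.ElementTree as ET
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

def docx_text(url):
    with urlopen(url) as resp:
        blob = resp.read()
    with ZipFile(BytesIO(blob)) as docx:
        xml_bytes = docx.read('word/document.xml')
    root = ET.fromstring(xml_bytes)
    return [node.text for node in root.iter('{%s}t' % W_NS) if node.text]

# print(docx_text("http://pythonscraping.com/pages/AWordDocument.docx"))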
Beispiel #45
0
import os
from uuid import uuid4
from zipfile import ZipFile, ZIP_DEFLATED

def absoluteFilePaths(directory):
   for dirpath,_,filenames in os.walk(directory):
       for f in filenames:
           yield os.path.abspath(os.path.join(dirpath, f))

async def zipper(filelist,name):
    with ZipFile(name, 'w') as zipMe:        
        for file in filelist:
            zipMe.write(file, compress_type=ZIP_DEFLATED)
    
   
async def unzipper(my_dir, my_zip):
    zip_file = ZipFile(my_zip, 'r')
    for files in zip_file.namelist():
        # ZipFile.read() takes only the member name; an optional second
        # argument would be a password, not a target directory.
        data = zip_file.read(files)
        myfile_path = os.path.join(my_dir, files.split("/")[-1])
        myfile = open(myfile_path, "wb")
        myfile.write(data)
        myfile.close()
    zip_file.close()
   
   
async def hey(event):
    reply_message = await event.get_reply_message()
    await event.reply("Starting to part the files please wait...")
    name = await bot.download_media(reply_message,"./")
    dir = str(uuid4())
    os.mkdir(dir)
    await unzipper(dir,name)
    files = list(absoluteFilePaths(dir))
Beispiel #46
0
def get_update_data(inputio, getfilecount=True, getsoups=True):
    epub = ZipFile(inputio,
                   'r')  # works equally well with inputio as a path or a blob

    ## Find the .opf file.
    container = epub.read("META-INF/container.xml")
    containerdom = parseString(container)
    rootfilenodelist = containerdom.getElementsByTagName("rootfile")
    rootfilename = rootfilenodelist[0].getAttribute("full-path")

    contentdom = parseString(epub.read(rootfilename))
    firstmetadom = contentdom.getElementsByTagName("metadata")[0]
    try:
        source = firstmetadom.getElementsByTagName(
            "dc:source")[0].firstChild.data.encode("utf-8")
    except:
        source = None

    ## Save the path to the .opf file--hrefs inside it are relative to it.
    relpath = get_path_part(rootfilename)

    oldcover = None
    calibrebookmark = None
    logfile = None
    # Looking for pre-existing cover.
    for item in contentdom.getElementsByTagName("reference"):
        if item.getAttribute("type") == "cover":
            # there is a cover (x)html file, save the soup for it.
            href = relpath + item.getAttribute("href")
            oldcoverhtmlhref = href
            oldcoverhtmldata = epub.read(href)
            oldcoverhtmltype = "application/xhtml+xml"
            for item in contentdom.getElementsByTagName("item"):
                if (relpath + item.getAttribute("href") == oldcoverhtmlhref):
                    oldcoverhtmltype = item.getAttribute("media-type")
                    break
            soup = bs.BeautifulSoup(oldcoverhtmldata.decode("utf-8"))
            src = None
            # first img or image tag.
            imgs = soup.findAll('img')
            if imgs:
                src = get_path_part(href) + imgs[0]['src']
            else:
                imgs = soup.findAll('image')
                if imgs:
                    src = get_path_part(href) + imgs[0]['xlink:href']

            if not src:
                continue
            try:
                # remove all .. and the path part above it, if present.
                # Mostly for epubs edited by Sigil.
                src = re.sub(r"([^/]+/\.\./)", "", src)
                #print("epubutils: found pre-existing cover image:%s"%src)
                oldcoverimghref = src
                oldcoverimgdata = epub.read(src)
                for item in contentdom.getElementsByTagName("item"):
                    if (relpath +
                            item.getAttribute("href") == oldcoverimghref):
                        oldcoverimgtype = item.getAttribute("media-type")
                        break
                oldcover = (oldcoverhtmlhref, oldcoverhtmltype,
                            oldcoverhtmldata, oldcoverimghref, oldcoverimgtype,
                            oldcoverimgdata)
            except Exception as e:
                logger.warn("Cover Image %s not found" % src)
                logger.warn("Exception: %s" % (unicode(e)))
                traceback.print_exc()

    filecount = 0
    soups = []  # list of xhtml blocks
    images = {}  # dict() longdesc->data
    if getfilecount:
        # spin through the manifest--only place there are item tags.
        for item in contentdom.getElementsByTagName("item"):
            # First, count the 'chapter' files.  FFF uses file0000.xhtml,
            # but can also update epubs downloaded from Twisting the
            # Hellmouth, which uses chapter0.html.
            if (item.getAttribute("media-type") == "application/xhtml+xml"):
                href = relpath + item.getAttribute("href")
                #print("---- item href:%s path part: %s"%(href,get_path_part(href)))
                if re.match(r'.*/log_page\.x?html', href):
                    try:
                        logfile = epub.read(href).decode("utf-8")
                    except:
                        pass  # corner case I bumped into while testing.
                if re.match(r'.*/(file|chapter)\d+\.x?html', href):
                    if getsoups:
                        soup = bs.BeautifulSoup(
                            epub.read(href).decode("utf-8"), "html5lib")
                        for img in soup.findAll('img'):
                            newsrc = ''
                            longdesc = ''
                            try:
                                newsrc = get_path_part(href) + img['src']
                                # remove all .. and the path part above it, if present.
                                # Mostly for epubs edited by Sigil.
                                newsrc = re.sub(r"([^/]+/\.\./)", "", newsrc)
                                longdesc = img['longdesc']
                                data = epub.read(newsrc)
                                images[longdesc] = data
                                img['src'] = img['longdesc']
                            except Exception as e:
                                logger.warn(
                                    "Image %s not found!\n(originally:%s)" %
                                    (newsrc, longdesc))
                                logger.warn("Exception: %s" % (unicode(e)))
                                traceback.print_exc()
                        soup = soup.find('body')
                        # ffdl epubs have chapter title h3
                        h3 = soup.find('h3')
                        if h3:
                            h3.extract()
                        # TtH epubs have chapter title h2
                        h2 = soup.find('h2')
                        if h2:
                            h2.extract()

                        for skip in soup.findAll(
                                attrs={'class': 'skip_on_ffdl_update'}):
                            skip.extract()

                        soups.append(soup)

                    filecount += 1

    try:
        calibrebookmark = epub.read("META-INF/calibre_bookmarks.txt")
    except:
        pass

    #for k in images.keys():
    #print("\tlongdesc:%s\n\tData len:%s\n"%(k,len(images[k])))
    return (source, filecount, soups, images, oldcover, calibrebookmark,
            logfile)
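A minimal usage sketch for get_update_data(); the epub path and the way the returned tuple is unpacked are illustrative assumptions, not part of the original snippet:

def summarize_epub(epub_path):
    # getsoups=False skips parsing chapter (x)html into BeautifulSoup objects,
    # which is enough when only the metadata and file count are needed.
    (source, filecount, soups, images, oldcover,
     calibrebookmark, logfile) = get_update_data(epub_path,
                                                 getfilecount=True,
                                                 getsoups=False)
    print("story source url: %s" % source)
    print("chapter files found: %s" % filecount)
    print("pre-existing cover found: %s" % (oldcover is not None))

# summarize_epub("My Story - ffnet_12345.epub")  # hypothetical file name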
Beispiel #47
0
    def test_exported_meeting_json_has_correct_file_names(self, browser):
        self.login(self.committee_responsible, browser)
        self.schedule_paragraph(self.meeting, u'A Gesch\xfcfte')
        with freeze(localized_datetime(2017, 12, 13)):
            self.schedule_ad_hoc(self.meeting,
                                 u'Ad-hoc Traktand\xfem').decide()
        self.schedule_proposal(self.meeting,
                               self.submitted_word_proposal).decide()
        with freeze(localized_datetime(2017, 12, 14)):
            self.meeting.model.close()

        browser.open(self.meeting, view='export-meeting-zip')
        self.assertEquals('application/zip', browser.contenttype)

        zip_file = ZipFile(StringIO(browser.contents), 'r')

        meeting_json = json.loads(zip_file.read('meeting.json'))

        # the protocol is generated during the tests and its checksum cannot
        # be predicted
        meeting_json['meetings'][0]['protocol']['checksum'] = 'unpredictable'

        self.assert_json_structure_equal(
            {
                'meetings': [{
                    'agenda_items': [{
                        'title': u'A Gesch\xfcfte'
                    }, {
                        'number': '1.',
                        'proposal': {
                            'checksum':
                            'e00d6c8fb32c30d3ca3a3f8e5d873565482567561023016d9ca18243ff1cfa14',
                            'file':
                            '1. Ad-hoc Traktandthm/Ad hoc agenda item Ad-hoc Traktandthm.docx',
                            'modified': '2017-12-12T23:00:00+01:00'
                        },
                        'title': u'Ad-hoc Traktand\xfem'
                    }, {
                        'attachments': [{
                            'checksum':
                            '51d6317494eccc4a73154625a6820cb6b50dc1455eb4cf26399299d4f9ce77b2',
                            'file':
                            '2. Anderungen am Personalreglement/Vertragsentwurf.docx',
                            'modified': '2016-08-31T15:21:46+02:00',
                            'title': u'Vertr\xe4gsentwurf'
                        }],
                        'number':
                        '2.',
                        'proposal': {
                            'checksum':
                            'e00d6c8fb32c30d3ca3a3f8e5d873565482567561023016d9ca18243ff1cfa14',
                            'file':
                            '2. Anderungen am Personalreglement/Anderungen am Personalreglement.docx',
                            'modified': '2016-08-31T15:21:44+02:00'
                        },
                        'title':
                        u'\xc4nderungen am Personalreglement'
                    }],
                    'committee': {
                        'oguid': 'plone:1009233300',
                        'title': u'Rechnungspr\xfcfungskommission'
                    },
                    'end':
                    '2016-09-12T17:00:00+00:00',
                    'location':
                    u'B\xfcren an der Aare',
                    'protocol': {
                        'checksum': 'unpredictable',
                        'file':
                        'Protocol-9. Sitzung der Rechnungsprufungskommission.docx',
                        'modified': '2017-12-13T23:00:00+01:00'
                    },
                    'start':
                    '2016-09-12T15:30:00+00:00',
                    'title':
                    u'9. Sitzung der Rechnungspr\xfcfungskommission'
                }],
                'version':
                '1.0.0'
            }, meeting_json)

        file_names = zip_file.namelist()
        for file_name in [
                '1. Ad-hoc Traktandthm/Ad hoc agenda item Ad-hoc Traktandthm.docx',
                '2. Anderungen am Personalreglement/Vertragsentwurf.docx',
                '2. Anderungen am Personalreglement/Anderungen am Personalreglement.docx',
                'Protocol-9. Sitzung der Rechnungsprufungskommission.docx'
        ]:
            self.assertIn(file_name, file_names)
Beispiel #48
0
def verify(certificate, jar_file, sf_name=None):
    """
    Verifies signature of a JAR file.

    Limitations:
    - diagnostics are less verbose than jarsigner's
    :return: None if verification succeeds.
    :exception SignatureBlockFileVerificationError, ManifestChecksumError,
        JarChecksumError, JarSignatureMissingError

    Reference:
    http://docs.oracle.com/javase/7/docs/technotes/guides/jar/jar.html#Signature_Validation
    Note that the validation is done in three steps. Failure at any step is a
    failure of the whole validation.
    """  # noqua

    # Step 0: get the "key alias", used also for naming of sig-related files.
    zip_file = ZipFile(jar_file)
    sf_files = [f for f in zip_file.namelist() if file_matches_sigfile(f)]

    if len(sf_files) == 0:
        raise JarSignatureMissingError("No .SF file in %s" % jar_file)

    elif len(sf_files) > 1:
        if sf_name is None:
            msg = "Multiple .SF files in %s, but SF_NAME.SF not specified" \
                % jar_file
            raise VerificationError(msg)

        elif ('META-INF/' + sf_name) in sf_files:
            sf_filename = 'META-INF/' + sf_name

        else:
            msg = "No .SF file in %s named META-INF/%s (found %d .SF files)" \
                % (jar_file, sf_name, len(sf_files))
            raise VerificationError(msg)

    elif len(sf_files) == 1:
        if sf_name is None:
            sf_filename = sf_files[0]

        elif sf_files[0] == 'META-INF/' + sf_name:
            sf_filename = sf_files[0]

        else:
            msg = "No .SF file in %s named META-INF/%s" % (jar_file, sf_name)
            raise VerificationError(msg)

    key_alias = sf_filename[9:-3]  # "META-INF/%s.SF"
    sf_data = zip_file.read(sf_filename)

    # Step 1: check the crypto part.
    file_list = zip_file.namelist()
    sig_block_filename = None

    # JAR specification mentions only RSA and DSA; jarsigner also has EC
    # TODO: what about "SIG-*"?
    signature_extensions = ("RSA", "DSA", "EC")
    for extension in signature_extensions:
        candidate_filename = "META-INF/%s.%s" % (key_alias, extension)
        if candidate_filename in file_list:
            sig_block_filename = candidate_filename
            break

    if sig_block_filename is None:
        msg = "None of %s found in JAR" % \
              ", ".join(key_alias + "." + x for x in signature_extensions)
        raise JarSignatureMissingError(msg)

    sig_block_data = zip_file.read(sig_block_filename)
    try:
        verify_signature_block(certificate, sf_data, sig_block_data)

    except SignatureBlockVerificationError as message:
        message = "Signature block verification failed: %s" % message
        raise SignatureBlockFileVerificationError(message)

    # KEYALIAS.SF is correctly signed.
    # Step 2: Check that it contains correct checksum of the manifest.
    signature_manifest = SignatureManifest()
    signature_manifest.parse(sf_data)

    jar_manifest = Manifest()
    jar_manifest.load_from_jar(jar_file)

    errors = signature_manifest.verify_manifest(jar_manifest)
    if len(errors) > 0:
        msg = "%s: in .SF file, section checksum(s) failed for: %s" \
              % (jar_file, ",".join(errors))
        raise ManifestChecksumError(msg)

    # Checksums of MANIFEST.MF itself are correct.

    # Step 3: Check that it contains valid checksums for each file
    # from the JAR.  NOTE: the check is done for JAR entries. If some
    # JAR entries are deleted after signing, the verification still
    # succeeds.  This seems to not follow the reference specification,
    # but that's what jarsigner does.
    errors = jar_manifest.verify_jar_checksums(jar_file)
    if len(errors) > 0:
        msg = "Checksum(s) for jar entries of jar file %s failed for: %s" \
              % (jar_file, ",".join(errors))
        raise JarChecksumError(msg)

    return None
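A hedged usage sketch for verify(); the certificate and jar paths are made up, and it assumes the certificate argument is a path to the signer's certificate file, as suggested by the jarsigner-style validation described in the docstring:

def check_jar_signature(cert_path, jar_path):
    try:
        verify(cert_path, jar_path)  # returns None when all three steps pass
        print("signature OK for %s" % jar_path)
    except JarSignatureMissingError:
        print("%s is not signed" % jar_path)
    except (SignatureBlockFileVerificationError, ManifestChecksumError,
            JarChecksumError, VerificationError) as e:
        print("verification failed: %s" % e)

# check_jar_signature("signer-cert.pem", "app.jar")  # hypothetical inputs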
Beispiel #49
0
    def get_recipe_from_file(self, file):
        ingredient_mode = False
        direction_mode = False
        description_mode = False

        ingredients = []
        directions = []
        descriptions = []
        for fl in file.readlines():
            line = fl.decode("utf-8")
            if 'title:' in line:
                title = line.replace('title:', '').replace('"', '').strip()
            if 'image:' in line:
                image = line.replace('image:', '').strip()
            if 'tags:' in line:
                tags = line.replace('tags:', '').strip()
            if ingredient_mode:
                if len(line) > 2 and 'directions:' not in line:
                    ingredients.append(line[2:])
            if '---' in line and direction_mode:
                direction_mode = False
                description_mode = True
            if direction_mode:
                if len(line) > 2:
                    directions.append(line[2:])
            if 'ingredients:' in line:
                ingredient_mode = True
            if 'directions:' in line:
                ingredient_mode = False
                direction_mode = True
            if description_mode and len(line) > 3 and '---' not in line:
                descriptions.append(line)

        recipe = Recipe.objects.create(name=title,
                                       created_by=self.request.user,
                                       internal=True,
                                       space=self.request.space)

        for k in tags.split(','):
            print(f'adding keyword {k.strip()}')
            keyword, created = Keyword.objects.get_or_create(
                name=k.strip(), space=self.request.space)
            recipe.keywords.add(keyword)

        step = Step.objects.create(
            instruction='\n'.join(directions) + '\n\n' +
            '\n'.join(descriptions),
            space=self.request.space,
        )

        ingredient_parser = IngredientParser(self.request, True)
        for ingredient in ingredients:
            if len(ingredient.strip()) > 0:
                amount, unit, food, note = ingredient_parser.parse(ingredient)
                f = ingredient_parser.get_food(food)
                u = ingredient_parser.get_unit(unit)
                step.ingredients.add(
                    Ingredient.objects.create(
                        food=f,
                        unit=u,
                        amount=amount,
                        note=note,
                        original_text=ingredient,
                        space=self.request.space,
                    ))
        recipe.steps.add(step)

        for f in self.files:
            if '.zip' in f['name']:
                import_zip = ZipFile(f['file'])
                for z in import_zip.filelist:
                    if re.match(f'^images/{image}$', z.filename):
                        self.import_recipe_image(
                            recipe,
                            BytesIO(import_zip.read(z.filename)),
                            filetype=get_filetype(z.filename))

        return recipe
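The parser above is line-oriented; the sketch below shows the kind of recipe file it expects, reconstructed from the keywords the loop looks for (title:, image:, tags:, ingredients:, directions:, ---). The sample content itself is illustrative only:

from io import BytesIO

sample_recipe = BytesIO(
    b'title: "Pancakes"\n'
    b'image: pancakes.jpg\n'
    b'tags: breakfast, quick\n'
    b'ingredients:\n'
    b'- 200 g flour\n'
    b'- 2 eggs\n'
    b'directions:\n'
    b'- Mix everything.\n'
    b'- Fry in a hot pan.\n'
    b'---\n'
    b'A simple weekend breakfast.\n'
)
# integration.get_recipe_from_file(sample_recipe) would create a Recipe named
# "Pancakes" with two Ingredients, the keywords "breakfast" and "quick", and a
# Step whose instruction combines the directions and the trailing description
# ("integration" being an instance of the importer class this method belongs to).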
Beispiel #50
0
def generate_hocr_xar(
    hocr_dir,
    output_dir,
    metadata_file,
    imagexarfile,
    dictionary_file,
    ocr_engine,
    classifier,
    datetime,
    verbose,
    clobber,
):
    import glob, os, shutil, tempfile
    import lxml.etree as etree
    import datetime as dt
    import lxml.etree as ET
    from halo import Halo
    from os.path import basename
    from pathlib import Path
    import traceback
    from zipfile import ZipFile

    print("generating hocr xar")
    # set the datetime if it isn't given
    if datetime == None:
        now = dt.datetime.now()
        datetime = now.strftime("%Y-%m-%d-%H-%M-%S")
        if verbose:
            print("supplying datetime for this OCR run:", datetime)
    identifier = ""
    repo_file_string = ""

    # change classifier variable to be the name of the classifier, not path to the actual file
    classifier = Path(classifier).stem

    if not (metadata_file == None):
        identifier = get_identifier_from_metadata_file(metadata_file.name)
        xsl_file = Path(__file__).parent / "XSLT/make_repo_texts.xsl"
        xsl_file_handle = open(xsl_file, "r")
        xslt = ET.parse(xsl_file_handle)
        dom = ET.parse(open(metadata_file.name, "r"))
        transform = ET.XSLT(xslt)
        newdom = transform(
            dom,
            identifier=etree.XSLT.strparam(identifier),
            classifier=etree.XSLT.strparam(classifier),
            rundate=etree.XSLT.strparam(datetime),
        )
        repo_file_string = ET.tostring(newdom)

    if not (imagexarfile == None):
        try:
            if verbose:
                print("archive is:", imagexarfile)
            archive = ZipFile(imagexarfile.name, "r")
            metadata = archive.read("meta.xml")
            root = ET.fromstring(metadata)
            identifier = get_dc_element_from_metadata("identifier", root)
            if verbose:
                print("Using identifier from image xar file:", identifier)
            repo_file_string = make_text_repo_string(root, datetime)

        except Exception as e:
            print("Failed to open image archive at", metadata_file,
                  "Exiting ...")
            print(e)
            exit(0)

    # get the final file name and check if we're clobbering
    output_file_name = identifier + "-" + datetime + "-" + classifier + "-texts.xar"
    output_file_path = os.path.join(output_dir, output_file_name)
    if os.path.exists(output_file_path) and not (clobber):
        print(
            "the output file",
            output_file_path,
            "already exists, and you've set '--clobber' to false, so I'm exiting without doing anything.",
        )
        exit(0)
    # collect and sort all the hocr files in the inputdir
    types = ["*.hocr", "*.html", "*.xhtml", "*.htm"]
    all_hocr_files = []
    for a_type in types:
        this_type_files = glob.glob(os.path.join(hocr_dir, a_type))
        all_hocr_files += this_type_files
    all_hocr_files.sort()
    if verbose:
        print("Input hocr files:")
        print(all_hocr_files)
    xhtml_temp_dir = tempfile.mkdtemp()
    output_counter = 1
    for hocr_file in all_hocr_files:
        fileout_name = identifier + "_" + str(output_counter).zfill(
            4) + ".html"
        # if verbose:
        #    print("fileout name: ", fileout_name)
        fileout_path = os.path.join(xhtml_temp_dir, fileout_name)
        shutil.copyfile(hocr_file, fileout_path)
        output_counter = output_counter + 1
    xslt_to_xhtml = etree.XML("""\
    <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
       xmlns:html='http://www.w3.org/1999/xhtml'>

       <xsl:template match="*">
        <xsl:element name="html:{local-name(.)}">
          <xsl:apply-templates select="@*|*|text()"/>
           </xsl:element>
           </xsl:template>

           <xsl:template match="@*">
             <xsl:attribute name="{name(.)}"><xsl:value-of
             select="."/></xsl:attribute>
             </xsl:template>

             </xsl:stylesheet>""")
    transform_to_xhtml = etree.XSLT(xslt_to_xhtml)
    xhtml_dehyph_temp_dir = tempfile.mkdtemp()
    if verbose:
        print("dehyphenation temp dir:", xhtml_dehyph_temp_dir)
    spinner = Halo(text="dehyphenating", spinner="dots")
    spinner.start()
    all_renamed_hocr_files = os.listdir(xhtml_temp_dir)
    for file_name in all_renamed_hocr_files:
        file_path = os.path.join(xhtml_temp_dir, file_name)
        with open(file_path) as file_path:
            try:
                tree = etree.parse(file_path)
                xhtml = transform_to_xhtml(tree)
                if ocr_engine == "kraken":
                    fix_kraken_hocr.get_word_span_area(xhtml, verbose)
                    fix_kraken_hocr.clean_ocr_page_title(xhtml, file_name)
                    try:
                        fix_kraken_hocr.share_space_spans(xhtml, verbose)
                    except Exception:
                        print(traceback.format_exc())
                        exit()
                    fix_kraken_hocr.confidence_summary(xhtml)
                dehyphenate.convert_ocrx_to_ocr(xhtml)
                dehyphenate.remove_meta_tags(xhtml)
                dehyphenate.identify(xhtml)
                dehyphenate.dehyphenate(xhtml, file_name, verbose)
                dehyphenate.add_dublin_core_tags(xhtml)
                out_path = os.path.join(xhtml_dehyph_temp_dir, file_name)
                xhtml.write(out_path,
                            pretty_print=True,
                            xml_declaration=True,
                            encoding="utf-8")
            except Exception as e:
                print("This exception was thrown on file {}".format(file_name))
                print(e)
    spinner.stop()
    # now generate a spellcheck file
    spinner = Halo(text="spellchecking", spinner="dots")
    spinner.start()
    no_accent_dict_file_path = (Path(__file__).parent /
                                "Dictionaries/unique_no_accent_list.csv")
    # TODO: Parameterize this, so we can set the dictionary on the command line
    dictionary_file_path = (Path(__file__).parent /
                            "Dictionaries/english_greek_latin.txt")
    spellcheck_file_path = tempfile.mktemp()
    if verbose:
        print("spellcheck file is:", spellcheck_file_path)
    generate_spellcheck_file.make_spellcheck_file(
        xhtml_dehyph_temp_dir,
        dictionary_file_path,
        no_accent_dict_file_path,
        spellcheck_file_path,
        verbose,
    )
    spellchecked_xhtml_temp_dir = tempfile.mkdtemp()
    if verbose:
        print(
            "temp dir for collecting xar context, including spellchecked hocr: ",
            spellchecked_xhtml_temp_dir,
        )
    spellcheck_hocr.spellcheck(
        spellcheck_file_path,
        xhtml_dehyph_temp_dir,
        spellchecked_xhtml_temp_dir,
        verbose,
    )
    spinner.stop()
    # todo delete temp files
    # make meta file for texts
    xsl_file = Path(__file__).parent / "XSLT/make_meta_texts.xsl"
    xsl_file_handle = open(xsl_file, "r")
    dom = ET.parse(xsl_file_handle)  # ET.parse(open(metadata_file, 'r'))
    xslt = ET.parse(open(xsl_file, "r"))
    plain_string_value = etree.XSLT.strparam(identifier)
    transform = ET.XSLT(xslt)
    newdom = transform(
        dom,
        identifier=etree.XSLT.strparam(identifier),
        classifier=etree.XSLT.strparam(classifier),
        rundate=etree.XSLT.strparam(datetime),
        engine=etree.XSLT.strparam(ocr_engine),
    )
    newdom.write(os.path.join(spellchecked_xhtml_temp_dir, "meta.xml"),
                 pretty_print=True)

    # make repo.xml for texts
    # xsl_file = Path(__file__).parent / "XSLT/make_repo_texts.xsl"
    # xsl_file_handle = open(xsl_file, "r")
    # xslt = ET.parse(xsl_file_handle)
    # dom = ET.parse(open(xsl_file, "r"))
    # get accuracy value
    assessment = str(assess_hocr_dir.assess(spellchecked_xhtml_temp_dir))
    # transform = ET.XSLT(xslt)
    # newdom = transform(dom, identifier=etree.XSLT.strparam(identifier), accuracy=assessment, rundate=etree.XSLT.strparam(datetime))
    # newdom.write(os.path.join(spellchecked_xhtml_temp_dir,'repo.xml') , pretty_print=True)
    # different approach
    with open(os.path.join(spellchecked_xhtml_temp_dir, "repo.xml"),
              "w") as repo_file:
        repo_file.write(repo_file_string)

    # guard on the file object itself; reading .name would fail when no
    # image xar file was supplied
    if imagexarfile is not None:
        accuracySvgAndTotals.makeAccuracySVG(spellchecked_xhtml_temp_dir,
                                             imagexarfile.name)

    # make expath-pkg.xml for texts
    xsl_file = Path(__file__).parent / "XSLT/make_expath_texts.xsl"
    xsl_file_handle = open(xsl_file, "r")
    dom = ET.parse(open(xsl_file, "r"))
    xslt = ET.parse(xsl_file_handle)
    transform = ET.XSLT(xslt)
    newdom = transform(
        dom,
        identifier=etree.XSLT.strparam(identifier),
        rundate=etree.XSLT.strparam(datetime),
    )
    newdom.write(os.path.join(spellchecked_xhtml_temp_dir, "expath-pkg.xml"),
                 pretty_print=True)

    # save static metadata files to the temp dir
    static_files_dir = Path(__file__).parent / "static_for_text_xar"
    static_files = os.listdir(static_files_dir)
    for file_name in static_files:
        shutil.copy(os.path.join(static_files_dir, file_name),
                    spellchecked_xhtml_temp_dir)

    # make accuracy report?
    accuracySvgAndTotals.makeTotalsFile(spellchecked_xhtml_temp_dir)
    # this requires the xar file, or at least images.
    # We could re-do all this by passing in the image xar file and using its metadata for this one, which would
    # mean we don't have to keep our metadata files sitting around.
    # generate the zip file and save to outputdir

    # Make xar file output by compressing everything in 'spellchecked_xhtml_temp_dir'
    output_zip_file_path = os.path.join(
        output_dir,
        identifier + "-" + datetime + "-" + classifier + "-texts.xar")
    with ZipFile(output_zip_file_path, "w") as zipObj:
        for filename in os.listdir(spellchecked_xhtml_temp_dir):
            filePath = os.path.join(spellchecked_xhtml_temp_dir, filename)
            zipObj.write(filePath, basename(filePath))
    print("text archive from date", datetime, "saved to", output_zip_file_path)

    # Clean up
    if not (verbose):
        for temp_directory in [
                spellchecked_xhtml_temp_dir, xhtml_dehyph_temp_dir
        ]:
            shutil.rmtree(temp_directory)
        # delete unused spellcheck file
        os.remove(spellcheck_file_path)
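A hedged invocation sketch for generate_hocr_xar(); in the original project the function may be wrapped as a CLI command, and the paths below are invented. The metadata_file and imagexarfile arguments are passed as open file handles (or None) because the body reads their .name attribute:

with open("lala1916-images.xar", "rb") as image_xar:  # hypothetical image xar
    generate_hocr_xar(
        hocr_dir="ocr_output/",
        output_dir="xar_output/",
        metadata_file=None,        # take identifier/metadata from the image xar
        imagexarfile=image_xar,
        dictionary_file=None,      # unused here; the dictionary path is hard-coded above
        ocr_engine="kraken",
        classifier="models/greek_and_latin.mlmodel",   # only the stem is used
        datetime=None,             # None -> timestamp of this run
        verbose=True,
        clobber=True,
    )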
Beispiel #51
0
class MailMerge(object):
    def __init__(self, file, remove_empty_tables=False):
        self.zip = ZipFile(file)
        self.parts = {}
        self.settings = None
        self._settings_info = None

        self.media = {}         # new images to add indexed by embed id
        self.rels = None        # etree for relationships
        self._rels_info = None  # zi info block for rels
        self.RELS_NAMESPACES = {'ns': None, 'od': None}

        self.remove_empty_tables = remove_empty_tables

        try:
            content_types = etree.parse(self.zip.open('[Content_Types].xml'))
            for file in content_types.findall('{%(ct)s}Override' % NAMESPACES):
                type = file.attrib['ContentType' % NAMESPACES]
                if type in CONTENT_TYPES_PARTS:
                    zi, self.parts[zi] = self.__get_tree_of_file(file)
                elif type == CONTENT_TYPE_SETTINGS:
                    self._settings_info, self.settings = self.__get_tree_of_file(file)

            # get the rels for image mappings
            try:
                self._rels_info, self.rels = self.__get_tree_of_file('word/_rels/document.xml.rels')
                self.RELS_NAMESPACES['ns'] = self.rels.getroot().nsmap.get(None)
                self.RELS_NAMESPACES['od'] = self.rels.getroot().nsmap.get(None).replace('package', 'officeDocument')
            except:
                pass
            to_delete = []

            r = re.compile(r' MERGEFIELD +"?([^ ]+?)"? +(|\\\* MERGEFORMAT )', re.I)
            for part in self.parts.values():

                for parent in part.findall('.//{%(w)s}fldSimple/..' % NAMESPACES):
                    for idx, child in enumerate(parent):
                        if child.tag != '{%(w)s}fldSimple' % NAMESPACES:
                            continue
                        instr = child.attrib['{%(w)s}instr' % NAMESPACES]

                        m = r.match(instr)
                        if m is None:
                            continue
                        parent[idx] = Element('MergeField', name=m.group(1))

                for parent in part.findall('.//{%(w)s}instrText/../..' % NAMESPACES):
                    children = list(parent)
                    fields = zip(
                        [children.index(e) for e in
                         parent.findall('{%(w)s}r/{%(w)s}fldChar[@{%(w)s}fldCharType="begin"]/..' % NAMESPACES)],
                        [children.index(e) for e in
                         parent.findall('{%(w)s}r/{%(w)s}fldChar[@{%(w)s}fldCharType="end"]/..' % NAMESPACES)]
                    )

                    for idx_begin, idx_end in fields:
                        # consolidate all instrText nodes between 'begin' and 'end' into a single node
                        begin = children[idx_begin]
                        instr_elements = [e for e in
                                          begin.getparent().findall('{%(w)s}r/{%(w)s}instrText' % NAMESPACES)
                                          if idx_begin < children.index(e.getparent()) < idx_end]
                        if len(instr_elements) == 0:
                            continue

                        # set the text of the first instrText element to the concatenation
                        # of all the instrText element texts
                        instr_text = ''.join([e.text for e in instr_elements])
                        instr_elements[0].text = instr_text

                        # delete all instrText elements except the first
                        for instr in instr_elements[1:]:
                            instr.getparent().remove(instr)

                        m = r.match(instr_text)
                        if m is None:
                            continue
                        parent[idx_begin] = Element('MergeField', name=m.group(1))

                        # use this so we know *where* to put the replacement
                        instr_elements[0].tag = 'MergeText'
                        block = instr_elements[0].getparent()
                        # append the other tags in the w:r block too
                        parent[idx_begin].extend(list(block))

                        to_delete += [(parent, parent[i + 1])
                                      for i in range(idx_begin, idx_end)]

            for parent, child in to_delete:
                parent.remove(child)

            # Remove mail merge settings to avoid error messages when opening document in Winword
            if self.settings:
                settings_root = self.settings.getroot()
                mail_merge = settings_root.find('{%(w)s}mailMerge' % NAMESPACES)
                if mail_merge is not None:
                    settings_root.remove(mail_merge)
        except:
            self.zip.close()
            raise

    def __get_tree_of_file(self, file):
        if isinstance(file, etree._Element):
            fn = file.get('PartName').split('/', 1)[1]
        else:
            fn = file
        zi = self.zip.getinfo(fn)
        return zi, etree.parse(self.zip.open(zi))

    def write(self, file):
        # Replace all remaining merge fields with empty values
        for field in self.get_merge_fields():
            self.merge(**{field: ''})

        with ZipFile(file, 'w', ZIP_DEFLATED) as output:
            for zi in self.zip.filelist:
                if zi in self.parts:
                    xml = etree.tostring(self.parts[zi].getroot())
                    output.writestr(zi.filename, xml)
                elif zi == self._settings_info:
                    xml = etree.tostring(self.settings.getroot())
                    output.writestr(zi.filename, xml)
                elif zi == self._rels_info:
                    xml = etree.tostring(self.rels.getroot())
                    output.writestr(zi.filename, xml)
                else:
                    output.writestr(zi.filename, self.zip.read(zi))
            # add new images to the media folder if we have merged images
            for img_id, img_data in self.media.items():
                output.writestr('media/{}.png'.format(img_id), img_data)

    def get_merge_fields(self, parts=None):
        if not parts:
            parts = self.parts.values()
        fields = set()
        for part in parts:
            for mf in part.findall('.//MergeField'):
                fields.add(mf.attrib['name'])
        return fields

    def merge_templates(self, replacements, separator):
        """
        Duplicate the template for each replacement dict. Creates a copy of the template, performs a merge, and separates the copies with a new paragraph, a new break or a new section break.
        separator must be one of:
        - page_break : Page break.
        - column_break : Column break. ONLY HAS AN EFFECT IF THE DOCUMENT HAS COLUMNS.
        - textWrapping_break : Line break.
        - continuous_section : Continuous section break. Begins the section on the next paragraph.
        - evenPage_section : evenPage section break. The section begins on the next even-numbered page, leaving the next odd page blank if necessary.
        - nextColumn_section : nextColumn section break. The section begins in the following column on the page. ONLY HAS AN EFFECT IF THE DOCUMENT HAS COLUMNS.
        - nextPage_section : nextPage section break. The section begins on the following page.
        - oddPage_section : oddPage section break. The section begins on the next odd-numbered page, leaving the next even page blank if necessary.
        """

        # TYPE PARAM CONTROL AND SPLIT
        valid_separators = {'page_break', 'column_break', 'textWrapping_break', 'continuous_section',
                            'evenPage_section', 'nextColumn_section', 'nextPage_section', 'oddPage_section'}
        if not separator in valid_separators:
            raise ValueError("Invalid separator argument")
        type, sepClass = separator.split("_")

        # GET ROOT - WORK WITH DOCUMENT
        for part in self.parts.values():
            root = part.getroot()
            tag = root.tag
            if tag == '{%(w)s}ftr' % NAMESPACES or tag == '{%(w)s}hdr' % NAMESPACES:
                continue

            if sepClass == 'section':

                # FINDING FIRST SECTION OF THE DOCUMENT
                firstSection = root.find("w:body/w:p/w:pPr/w:sectPr", namespaces=NAMESPACES)
                if firstSection == None:
                    firstSection = root.find("w:body/w:sectPr", namespaces=NAMESPACES)

                # MODIFY TYPE ATTRIBUTE OF FIRST SECTION FOR MERGING
                nextPageSec = deepcopy(firstSection)
                for child in nextPageSec:
                    # Delete old type if exist
                    if child.tag == '{%(w)s}type' % NAMESPACES:
                        nextPageSec.remove(child)
                # Create new type (def parameter)
                newType = etree.SubElement(nextPageSec, '{%(w)s}type' % NAMESPACES)
                newType.set('{%(w)s}val' % NAMESPACES, type)

                # REPLACING FIRST SECTION
                secRoot = firstSection.getparent()
                secRoot.replace(firstSection, nextPageSec)

            # FINDING LAST SECTION OF THE DOCUMENT
            lastSection = root.find("w:body/w:sectPr", namespaces=NAMESPACES)

            # SAVING LAST SECTION
            mainSection = deepcopy(lastSection)
            lsecRoot = lastSection.getparent()
            lsecRoot.remove(lastSection)

            # COPY CHILDREN ELEMENTS OF BODY IN A LIST
            childrenList = root.findall('w:body/*', namespaces=NAMESPACES)

            # DELETE ALL CHILDREN OF BODY
            for child in root:
                if child.tag == '{%(w)s}body' % NAMESPACES:
                    child.clear()

            # REFILL BODY AND MERGE DOCS - ADD LAST SECTION ENCAPSULATED OR NOT
            lr = len(replacements)
            lc = len(childrenList)
            parts = []
            for i, repl in enumerate(replacements):
                for (j, n) in enumerate(childrenList):
                    element = deepcopy(n)
                    for child in root:
                        if child.tag == '{%(w)s}body' % NAMESPACES:
                            child.append(element)
                            parts.append(element)
                            if (j + 1) == lc:
                                if (i + 1) == lr:
                                    child.append(mainSection)
                                    parts.append(mainSection)
                                else:
                                    if sepClass == 'section':
                                        intSection = deepcopy(mainSection)
                                        p = etree.SubElement(child, '{%(w)s}p' % NAMESPACES)
                                        pPr = etree.SubElement(p, '{%(w)s}pPr' % NAMESPACES)
                                        pPr.append(intSection)
                                        parts.append(p)
                                    elif sepClass == 'break':
                                        pb = etree.SubElement(child, '{%(w)s}p' % NAMESPACES)
                                        r = etree.SubElement(pb, '{%(w)s}r' % NAMESPACES)
                                        nbreak = Element('{%(w)s}br' % NAMESPACES)
                                        nbreak.attrib['{%(w)s}type' % NAMESPACES] = type
                                        r.append(nbreak)

                    self.merge(parts, **repl)

    def merge_pages(self, replacements):
        """
        Deprecated method.
        """
        warnings.warn("merge_pages has been deprecated in favour of merge_templates",
                      category=DeprecationWarning,
                      stacklevel=2)
        self.merge_templates(replacements, "page_break")

    def merge(self, parts=None, **replacements):
        if not parts:
            parts = self.parts.values()

        for field, replacement in replacements.items():
            if isinstance(replacement, list):
                self.merge_rows(field, replacement)
            else:
                for part in parts:
                    self.__merge_field(part, field, replacement)

    def __merge_field(self, part, field, text):
        if field.startswith('IMAGE:'):
            _, img_name = field.split(':')
            inline_img_el = part.find('.//wp:docPr[@title="{}"]/..'.format(img_name), namespaces=NAMESPACES)
            if inline_img_el:
                embed_node = inline_img_el.find('.//a:blip', namespaces=NAMESPACES)
                if embed_node:
                    # generate a random id and add it to the media list for later export to the media folder in the zip file
                    img_id = 'MMR{}'.format(randint(10000000, 999999999))
                    self.media[img_id] = text

                    # add a relationship
                    last_img_relationship = \
                    self.rels.findall('{%(ns)s}Relationship[@Type="%(od)s/image"]' % self.RELS_NAMESPACES)[-1]
                    new_img_relationship = deepcopy(last_img_relationship)
                    new_img_relationship.set('Id', img_id)
                    new_img_relationship.set('Target', '/media/{}.png'.format(img_id))
                    self.rels.getroot().append(new_img_relationship)

                    # replace the embed attrib with the new image_id
                    embed_node = inline_img_el.find('.//a:blip', namespaces=NAMESPACES)
                    embed_attr = embed_node.attrib.keys()[0]
                    embed_node.attrib[embed_attr] = img_id
                # mark as done
                inline_img_el.find('wp:docPr', namespaces=NAMESPACES).attrib['title'] = 'replaced_image_{}'.format(
                    img_id)
            return

        for mf in part.findall('.//MergeField[@name="%s"]' % field):
            children = list(mf)
            mf.clear()  # clear away the attributes
            mf.tag = '{%(w)s}r' % NAMESPACES
            mf.extend(children)

            nodes = []
            # preserve new lines in replacement text
            text = text or ''  # text might be None
            text_parts = text.replace('\r', '').split('\n')
            for i, text_part in enumerate(text_parts):
                text_node = Element('{%(w)s}t' % NAMESPACES)
                text_node.text = text_part
                nodes.append(text_node)

                # if not last node add new line node
                if i < (len(text_parts) - 1):
                    nodes.append(Element('{%(w)s}br' % NAMESPACES))

            ph = mf.find('MergeText')
            if ph is not None:
                # add text nodes at the exact position where
                # MergeText was found
                index = mf.index(ph)
                for node in reversed(nodes):
                    mf.insert(index, node)
                mf.remove(ph)
            else:
                mf.extend(nodes)

    def merge_rows(self, anchor, rows):
        table, idx, template = self.__find_row_anchor(anchor)
        if table is not None:
            if len(rows) > 0:
                del table[idx]
                for i, row_data in enumerate(rows):
                    row = deepcopy(template)
                    self.merge([row], **row_data)
                    table.insert(idx + i, row)
            else:
                # if there is no data for a given table
                # we check whether table needs to be removed
                if self.remove_empty_tables:
                    parent = table.getparent()
                    parent.remove(table)

    def __find_row_anchor(self, field, parts=None):
        if not parts:
            parts = self.parts.values()
        for part in parts:
            for table in part.findall('.//{%(w)s}tbl' % NAMESPACES):
                for idx, row in enumerate(table):
                    if row.find('.//MergeField[@name="%s"]' % field) is not None:
                        return table, idx, row
        return None, None, None

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def close(self):
        if self.zip is not None:
            try:
                self.zip.close()
            finally:
                self.zip = None
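A short usage sketch for the MailMerge class above; the template name and field values are invented, but the methods used (get_merge_fields, merge, merge_templates, write) are the ones defined in the class:

with MailMerge('template.docx') as document:     # hypothetical template file
    print(document.get_merge_fields())           # e.g. {'name', 'address'}
    document.merge(name='Jane Doe', address='123 Example Street')
    document.write('letter.docx')

# One copy of the template per dict, separated by page breaks:
with MailMerge('template.docx') as document:
    document.merge_templates(
        [{'name': 'Jane Doe'}, {'name': 'John Doe'}],
        separator='page_break',
    )
    document.write('letters.docx')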
Beispiel #52
0
def _make_super_runtime(capsule,
                        output,
                        licfile=None,
                        platforms=None,
                        restrict=True,
                        suffix=''):
    logging.info('Generating super runtime library to %s', relpath(output))
    if not os.path.exists(output):
        os.makedirs(output)

    supermode = True
    if not platforms:
        platid = _format_platid()
        filelist = _build_platforms([platid], restrict, supermode)[:1]
    elif len(platforms) == 1:
        filelist = _build_platforms(platforms, restrict, supermode)[:1]
    else:
        filelist = _build_platforms(platforms, restrict, supermode)

    myzip = ZipFile(capsule, 'r')
    if 'pytransform.key' not in myzip.namelist():
        raise RuntimeError('No pytransform.key found in capsule')
    logging.info('Extract pytransform.key')
    keydata = myzip.read('pytransform.key')
    myzip.close()

    lickey = _build_license_file(capsule, licfile)

    if sys.version_info.major == 2:
        size1 = ord(keydata[0]) + ord(keydata[1]) * 256
        size2 = ord(keydata[2]) + ord(keydata[3]) * 256
    else:
        size1 = keydata[0] + keydata[1] * 256
        size2 = keydata[2] + keydata[3] * 256

    k1 = 16
    k2 = k1 + size1
    keylist = keydata[k1:k2], keydata[k2:k2 + size2], lickey

    namelist = []
    checklist = []
    for filename in filelist:
        logging.info('Copying %s', filename)

        name = os.path.basename(filename)
        if suffix:
            k = name.rfind('pytransform') + len('pytransform')
            name = name[:k] + suffix + name[k:]
            logging.info('Rename extension to %s', name)
        if name in namelist:
            raise RuntimeError('Multiple platforms conflict with '
                               'the same extension name "%s"' % name)
        namelist.append(name)

        target = os.path.join(output, name)
        shutil.copy2(filename, target)

        logging.info('Patch extension %s', target)
        data = _patch_extension(target, keylist, suffix)

        with open(target, 'wb') as f:
            f.write(data)
        checklist.append(sum(data))

    logging.info('Generate runtime files OK')
    return checklist
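A hedged sketch of calling the helper above; it is an internal routine, so the capsule path, output directory, platform names and suffix are illustrative guesses rather than documented API:

import os

checklist = _make_super_runtime(
    os.path.expanduser('~/.pyarmor_capsule.zip'),  # capsule containing pytransform.key
    'dist/pyarmor_runtime',                        # output directory, created if missing
    licfile=None,                                  # default license handling
    platforms=['linux.x86_64', 'windows.x86_64'],  # hypothetical platform ids
    suffix='_vax_001',                             # hypothetical extension suffix
)
print('patched extension checksums:', checklist)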
Beispiel #53
0
class BundleProjectStore(ProjectStore):
    """Represents a translate project bundle (zip archive)."""

    # INITIALIZERS #
    def __init__(self, fname):
        super(BundleProjectStore, self).__init__()
        self._tempfiles = {}
        if fname and os.path.isfile(fname):
            self.load(fname)
        else:
            self.zip = ZipFile(fname, 'w')
            self.save()
            self.zip.close()
            self.zip = ZipFile(fname, 'a')

    # CLASS METHODS #
    @classmethod
    def from_project(cls, proj, fname=None):
        if fname is None:
            fname = 'bundle.zip'

        bundle = BundleProjectStore(fname)
        for fn in proj.sourcefiles:
            bundle.append_sourcefile(proj.get_file(fn))
        for fn in proj.transfiles:
            bundle.append_transfile(proj.get_file(fn))
        for fn in proj.targetfiles:
            bundle.append_targetfile(proj.get_file(fn))
        bundle.settings = proj.settings.copy()
        bundle.save()
        return bundle

    # METHODS #
    def append_file(self, afile, fname, ftype='trans', delete_orig=False):
        """Append the given file to the project with the given filename, marked
            to be of type ``ftype`` ('src', 'trans', 'tgt').

            :param delete_orig: If ``True``, as set by
                                :meth:`~translate.storage.Project.convert_forward`,
                                ``afile`` is deleted after appending, if
                                possible.

            .. note:: For this implementation, the appended file will be deleted
                      from disk if ``delete_orig`` is ``True``.
            """
        if fname and fname in self.zip.namelist():
            raise ValueError("File already in bundle archive: %s" % (fname))
        if not fname and isinstance(afile, basestring) and afile in self.zip.namelist():
            raise ValueError("File already in bundle archive: %s" % (afile))

        afile, fname = super(BundleProjectStore, self).append_file(afile, fname, ftype)
        self._zip_add(fname, afile)

        if delete_orig and hasattr(afile, 'name') and afile.name not in self._tempfiles:
            try:
                os.unlink(afile.name)
            except Exception:
                pass

        return self.get_file(fname), fname

    def remove_file(self, fname, ftype=None):
        """Remove the file with the given project name from the project."""
        super(BundleProjectStore, self).remove_file(fname, ftype)
        self._zip_delete([fname])
        tempfiles = [tmpf for tmpf, prjf in self._tempfiles.iteritems() if prjf == fname]
        if tempfiles:
            for tmpf in tempfiles:
                try:
                    os.unlink(tmpf)
                except Exception:
                    pass
                del self._tempfiles[tmpf]

    def close(self):
        super(BundleProjectStore, self).close()
        self.cleanup()
        self.zip.close()

    def cleanup(self):
        """Clean up our mess: remove temporary files."""
        for tempfname in self._tempfiles:
            if os.path.isfile(tempfname):
                os.unlink(tempfname)
        self._tempfiles = {}

    def get_file(self, fname):
        """Retrieve a project file (source, translation or target file) from the
            project archive."""
        retfile = None
        if fname in self._files or fname in self.zip.namelist():
            # Check if the file has not already been extracted to a temp file
            tempfname = [tfn for tfn in self._tempfiles if self._tempfiles[tfn] == fname]
            if tempfname and os.path.isfile(tempfname[0]):
                tempfname = tempfname[0]
            else:
                tempfname = ''
            if not tempfname:
                # Extract the file to a temporary file
                zfile = self.zip.open(fname)
                tempfname = os.path.split(fname)[-1]
                tempfd, tempfname = tempfile.mkstemp(suffix='_' + tempfname)
                os.close(tempfd)
                open(tempfname, 'w').write(zfile.read())
            retfile = open(tempfname)
            self._tempfiles[tempfname] = fname

        if not retfile:
            raise FileNotInProjectError(fname)
        return retfile

    def get_proj_filename(self, realfname):
        """Try and find a project file name for the given real file name."""
        try:
            fname = super(BundleProjectStore, self).get_proj_filename(realfname)
        except ValueError as ve:
            fname = None
        if fname:
            return fname
        if realfname in self._tempfiles:
            return self._tempfiles[realfname]
        raise ValueError('Real file not in project store: %s' % (realfname))

    def load(self, zipname):
        """Load the bundle project from the zip file of the given name."""
        self.zip = ZipFile(zipname, mode='a')
        self._load_settings()

        append_section = {
            'sources': self._sourcefiles.append,
            'targets': self._targetfiles.append,
            'transfiles': self._transfiles.append,
        }
        for section in ('sources', 'targets', 'transfiles'):
            if section in self.settings:
                for fname in self.settings[section]:
                    append_section[section](fname)
                    self._files[fname] = None

    def save(self, filename=None):
        """Save all project files to the bundle zip file."""
        self._update_from_tempfiles()

        if filename:
            newzip = ZipFile(filename, 'w')
        else:
            newzip = self._create_temp_zipfile()

        # Write project file for the new zip bundle
        newzip.writestr('project.xtp', self._generate_settings())
        # Copy project files from project to the new zip file
        project_files = self._sourcefiles + self._transfiles + self._targetfiles
        for fname in project_files:
            newzip.writestr(fname, self.get_file(fname).read())
        # Copy any extra (non-project) files from the current zip
        for fname in self.zip.namelist():
            if fname in project_files or fname == 'project.xtp':
                continue
            newzip.writestr(fname, self.zip.read(fname))

        self._replace_project_zip(newzip)

    def update_file(self, pfname, infile):
        """Updates the file with the given project file name with the contents
            of ``infile``.

            :returns: the results from :meth:`BundleProjectStore.append_file`."""
        if pfname not in self._files:
            raise FileNotInProjectError(pfname)

        if pfname not in self.zip.namelist():
            return super(BundleProjectStore, self).update_file(pfname, infile)

        self._zip_delete([pfname])
        self._zip_add(pfname, infile)

    def _load_settings(self):
        """Grab the project.xtp file from the zip file and load it."""
        if 'project.xtp' not in self.zip.namelist():
            raise InvalidBundleError('Not a translate project bundle')
        super(BundleProjectStore, self)._load_settings(self.zip.open('project.xtp').read())

    def _create_temp_zipfile(self):
        """Create a new zip file with a temporary file name (with mode 'w')."""
        newzipfd, newzipfname = tempfile.mkstemp(prefix='translate_bundle', suffix='.zip')
        os.close(newzipfd)
        return ZipFile(newzipfname, 'w')

    def _replace_project_zip(self, zfile):
        """Replace the currently used zip file (``self.zip``) with the given zip
            file. Basically, ``os.rename(zfile.filename, self.zip.filename)``."""
        if not zfile.fp.closed:
            zfile.close()
        if not self.zip.fp.closed:
            self.zip.close()
        shutil.move(zfile.filename, self.zip.filename)
        self.zip = ZipFile(self.zip.filename, mode='a')

    def _update_from_tempfiles(self):
        """Update project files from temporary files."""
        for tempfname in self._tempfiles:
            tmp = open(tempfname)
            self.update_file(self._tempfiles[tempfname], tmp)
            if not tmp.closed:
                tmp.close()

    def _zip_add(self, pfname, infile):
        """Add the contents of ``infile`` to the zip with file name ``pfname``."""
        if hasattr(infile, 'seek'):
            infile.seek(0)
        self.zip.writestr(pfname, infile.read())
        # Clear the cached file object to force the file to be read from the
        # zip file.
        self._files[pfname] = None

    def _zip_delete(self, fnames):
        """Delete the files with the given names from the zip file (``self.zip``)."""
        # Sanity checking
        if not isinstance(fnames, (list, tuple)):
            raise ValueError("fnames must be list or tuple: %s" % (fnames))
        if not self.zip:
            raise ValueError("No zip file to work on")
        zippedfiles = self.zip.namelist()
        for fn in fnames:
            if fn not in zippedfiles:
                raise KeyError("File not in zip archive: %s" % (fn))

        newzip = self._create_temp_zipfile()
        newzip.writestr('project.xtp', self._generate_settings())

        for fname in zippedfiles:
            # Copy all files from self.zip that are not project.xtp (already
            # in the new zip file) or in fnames (they are to be removed, after
            # all).
            if fname in fnames or fname == 'project.xtp':
                continue
            newzip.writestr(fname, self.zip.read(fname))

        self._replace_project_zip(newzip)
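A minimal usage sketch for BundleProjectStore; the bundle and file names are invented, and it only exercises methods defined (or inherited) above:

store = BundleProjectStore('bundle.zip')          # creates the zip if it does not exist yet
with open('de.po', 'rb') as po_file:              # hypothetical translation file
    afile, fname = store.append_file(po_file, 'po/de.po', ftype='trans')
print(store.zip.namelist())                       # 'project.xtp' plus the appended files
store.save()
store.close()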
Beispiel #54
0
class XPCShellRemote(xpcshell.XPCShellTests, object):
    def __init__(self, devmgr, options, log):
        xpcshell.XPCShellTests.__init__(self, log)

        # Add Android version (SDK level) to mozinfo so that manifest entries
        # can be conditional on android_version.
        androidVersion = devmgr.shellCheckOutput(
            ['getprop', 'ro.build.version.sdk'])
        mozinfo.info['android_version'] = androidVersion

        self.localLib = options.localLib
        self.localBin = options.localBin
        self.options = options
        self.device = devmgr
        self.pathMapping = []
        self.remoteTestRoot = "%s/xpc" % self.device.deviceRoot
        # remoteBinDir contains xpcshell and its wrapper script, both of which must
        # be executable. Since +x permissions cannot usually be set on /mnt/sdcard,
        # and the test root may be on /mnt/sdcard, remoteBinDir is set to be on
        # /data/local, always.
        self.remoteBinDir = "/data/local/xpcb"
        # Terse directory names are used here ("c" for the components directory)
        # to minimize the length of the command line used to execute
        # xpcshell on the remote device. adb has a limit to the number
        # of characters used in a shell command, and the xpcshell command
        # line can be quite complex.
        self.remoteTmpDir = remoteJoin(self.remoteTestRoot, "tmp")
        self.remoteScriptsDir = self.remoteTestRoot
        self.remoteComponentsDir = remoteJoin(self.remoteTestRoot, "c")
        self.remoteModulesDir = remoteJoin(self.remoteTestRoot, "m")
        self.remoteMinidumpDir = remoteJoin(self.remoteTestRoot, "minidumps")
        self.remoteClearDirScript = remoteJoin(self.remoteBinDir, "cleardir")
        self.profileDir = remoteJoin(self.remoteTestRoot, "p")
        self.remoteDebugger = options.debugger
        self.remoteDebuggerArgs = options.debuggerArgs
        self.testingModulesDir = options.testingModulesDir

        self.env = {}

        if self.options.objdir:
            self.xpcDir = os.path.join(self.options.objdir, "_tests/xpcshell")
        elif os.path.isdir(os.path.join(here, 'tests')):
            self.xpcDir = os.path.join(here, 'tests')
        else:
            print >> sys.stderr, "Couldn't find local xpcshell test directory"
            sys.exit(1)

        if options.localAPK:
            self.localAPKContents = ZipFile(options.localAPK)
        if options.setup:
            self.setupUtilities()
            self.setupModules()
            self.setupTestDir()
        self.setupMinidumpDir()
        self.remoteAPK = None
        if options.localAPK:
            self.remoteAPK = remoteJoin(self.remoteBinDir,
                                        os.path.basename(options.localAPK))
            self.setAppRoot()

        # data that needs to be passed to the RemoteXPCShellTestThread
        self.mobileArgs = {
            'device': self.device,
            'remoteBinDir': self.remoteBinDir,
            'remoteScriptsDir': self.remoteScriptsDir,
            'remoteComponentsDir': self.remoteComponentsDir,
            'remoteModulesDir': self.remoteModulesDir,
            'options': self.options,
            'remoteDebugger': self.remoteDebugger,
            'pathMapping': self.pathMapping,
            'profileDir': self.profileDir,
            'remoteTmpDir': self.remoteTmpDir,
            'remoteMinidumpDir': self.remoteMinidumpDir,
            'remoteClearDirScript': self.remoteClearDirScript,
        }
        if self.remoteAPK:
            self.mobileArgs['remoteAPK'] = self.remoteAPK

    def setLD_LIBRARY_PATH(self):
        self.env["LD_LIBRARY_PATH"] = self.remoteBinDir

    def pushWrapper(self):
        # Rather than executing xpcshell directly, this wrapper script is
        # used. By setting environment variables and the cwd in the script,
        # the length of the per-test command line is shortened. This is
        # often important when using ADB, as there is a limit to the length
        # of the ADB command line.
        localWrapper = tempfile.mktemp()
        f = open(localWrapper, "w")
        f.write("#!/system/bin/sh\n")
        for envkey, envval in self.env.iteritems():
            f.write("export %s=%s\n" % (envkey, envval))
        f.writelines([
            "cd $1\n", "echo xpcw: cd $1\n", "shift\n",
            "echo xpcw: xpcshell \"$@\"\n",
            "%s/xpcshell \"$@\"\n" % self.remoteBinDir
        ])
        f.close()
        remoteWrapper = remoteJoin(self.remoteBinDir, "xpcw")
        self.device.pushFile(localWrapper, remoteWrapper)
        os.remove(localWrapper)

        # Removing and re-creating a directory is a common operation which
        # can be implemented more efficiently with a shell script.
        localWrapper = tempfile.mktemp()
        f = open(localWrapper, "w")
        # The directory may not exist initially, so rm may fail. 'rm -f' is not
        # supported on some Androids. Similarly, 'test' and 'if [ -d ]' are not
        # universally available, so we just ignore errors from rm.
        f.writelines(
            ["#!/system/bin/sh\n", "rm -r \"$1\"\n", "mkdir \"$1\"\n"])
        f.close()
        self.device.pushFile(localWrapper, self.remoteClearDirScript)
        os.remove(localWrapper)

        self.device.chmodDir(self.remoteBinDir)

    def buildEnvironment(self):
        self.buildCoreEnvironment()
        self.setLD_LIBRARY_PATH()
        self.env["MOZ_LINKER_CACHE"] = self.remoteBinDir
        if self.options.localAPK and self.appRoot:
            self.env["GRE_HOME"] = self.appRoot
        self.env["XPCSHELL_TEST_PROFILE_DIR"] = self.profileDir
        self.env["TMPDIR"] = self.remoteTmpDir
        self.env["HOME"] = self.profileDir
        self.env["XPCSHELL_TEST_TEMP_DIR"] = self.remoteTmpDir
        self.env["XPCSHELL_MINIDUMP_DIR"] = self.remoteMinidumpDir
        if self.options.setup:
            self.pushWrapper()

    def setAppRoot(self):
        # Determine the application root directory associated with the package
        # name used by the Fennec APK.
        self.appRoot = None
        packageName = None
        if self.options.localAPK:
            try:
                packageName = self.localAPKContents.read("package-name.txt")
                if packageName:
                    self.appRoot = self.device.getAppRoot(packageName.strip())
            except Exception as detail:
                print "unable to determine app root: " + str(detail)
                pass
        return None

    def setupUtilities(self):
        if (not self.device.dirExists(self.remoteBinDir)):
            # device.mkDir may fail here where shellCheckOutput may succeed -- see bug 817235
            try:
                self.device.shellCheckOutput(["mkdir", self.remoteBinDir])
            except mozdevice.DMError:
                # Might get a permission error; try again as root, if available
                self.device.shellCheckOutput(["mkdir", self.remoteBinDir],
                                             root=True)
                self.device.shellCheckOutput(
                    ["chmod", "777", self.remoteBinDir], root=True)

        remotePrefDir = remoteJoin(self.remoteBinDir, "defaults/pref")
        if (self.device.dirExists(self.remoteTmpDir)):
            self.device.removeDir(self.remoteTmpDir)
        self.device.mkDir(self.remoteTmpDir)
        if (not self.device.dirExists(remotePrefDir)):
            self.device.mkDirs(remoteJoin(remotePrefDir, "extra"))
        if (not self.device.dirExists(self.remoteScriptsDir)):
            self.device.mkDir(self.remoteScriptsDir)
        if (not self.device.dirExists(self.remoteComponentsDir)):
            self.device.mkDir(self.remoteComponentsDir)

        local = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'head.js')
        remoteFile = remoteJoin(self.remoteScriptsDir, "head.js")
        self.device.pushFile(local, remoteFile)

        # The xpcshell binary is required for all tests. Additional binaries
        # are required for some tests. This list should be similar to
        # TEST_HARNESS_BINS in testing/mochitest/Makefile.in.
        binaries = [
            "xpcshell", "ssltunnel", "certutil", "pk12util", "BadCertServer",
            "OCSPStaplingServer", "GenerateOCSPResponse"
        ]
        for fname in binaries:
            local = os.path.join(self.localBin, fname)
            if os.path.isfile(local):
                print >> sys.stderr, "Pushing %s.." % fname
                remoteFile = remoteJoin(self.remoteBinDir, fname)
                self.device.pushFile(local, remoteFile)
            else:
                print >> sys.stderr, "*** Expected binary %s not found in %s!" % (
                    fname, self.localBin)

        local = os.path.join(self.localBin, "components/httpd.js")
        remoteFile = remoteJoin(self.remoteComponentsDir, "httpd.js")
        self.device.pushFile(local, remoteFile)

        local = os.path.join(self.localBin, "components/httpd.manifest")
        remoteFile = remoteJoin(self.remoteComponentsDir, "httpd.manifest")
        self.device.pushFile(local, remoteFile)

        local = os.path.join(self.localBin, "components/test_necko.xpt")
        remoteFile = remoteJoin(self.remoteComponentsDir, "test_necko.xpt")
        self.device.pushFile(local, remoteFile)

        if self.options.localAPK:
            remoteFile = remoteJoin(self.remoteBinDir,
                                    os.path.basename(self.options.localAPK))
            self.device.pushFile(self.options.localAPK, remoteFile)

        self.pushLibs()

    def pushLibs(self):
        if self.localBin is not None:
            szip = os.path.join(self.localBin, '..', 'host', 'bin', 'szip')
            if not os.path.exists(szip):
                # Tinderbox builds must run szip from the test package
                szip = os.path.join(self.localBin, 'host', 'szip')
            if not os.path.exists(szip):
                # If the test package doesn't contain szip, it means files
                # are not szipped in the test package.
                szip = None
        else:
            szip = None
        pushed_libs_count = 0
        if self.options.localAPK:
            try:
                dir = tempfile.mkdtemp()
                for info in self.localAPKContents.infolist():
                    if info.filename.endswith(".so"):
                        print >> sys.stderr, "Pushing %s.." % info.filename
                        remoteFile = remoteJoin(
                            self.remoteBinDir, os.path.basename(info.filename))
                        self.localAPKContents.extract(info, dir)
                        localFile = os.path.join(dir, info.filename)
                        if szip:
                            try:
                                out = subprocess.check_output(
                                    [szip, '-d', localFile],
                                    stderr=subprocess.STDOUT)
                            except CalledProcessError as e:
                                print >> sys.stderr, "Error calling %s on %s.." % (
                                    szip, localFile)
                                if e.output:
                                    print >> sys.stderr, e.output
                        self.device.pushFile(localFile, remoteFile)
                        pushed_libs_count += 1
            finally:
                shutil.rmtree(dir)
            return pushed_libs_count

        for file in os.listdir(self.localLib):
            if (file.endswith(".so")):
                print >> sys.stderr, "Pushing %s.." % file
                if 'libxul' in file:
                    print >> sys.stderr, "This is a big file, it could take a while."
                localFile = os.path.join(self.localLib, file)
                remoteFile = remoteJoin(self.remoteBinDir, file)
                if szip:
                    try:
                        out = subprocess.check_output([szip, '-d', localFile],
                                                      stderr=subprocess.STDOUT)
                    except CalledProcessError as e:
                        print >> sys.stderr, "Error calling %s on %s.." % (
                            szip, localFile)
                        if e.output:
                            print >> sys.stderr, e.output
                self.device.pushFile(localFile, remoteFile)
                pushed_libs_count += 1

        # Additional libraries may be found in a sub-directory such as "lib/armeabi-v7a"
        localArmLib = os.path.join(self.localLib, "lib")
        if os.path.exists(localArmLib):
            for root, dirs, files in os.walk(localArmLib):
                for file in files:
                    if (file.endswith(".so")):
                        print >> sys.stderr, "Pushing %s.." % file
                        localFile = os.path.join(root, file)
                        remoteFile = remoteJoin(self.remoteBinDir, file)
                        if szip:
                            try:
                                out = subprocess.check_output(
                                    [szip, '-d', localFile],
                                    stderr=subprocess.STDOUT)
                            except CalledProcessError as e:
                                print >> sys.stderr, "Error calling %s on %s.." % (
                                    szip, localFile)
                                if e.output:
                                    print >> sys.stderr, e.output
                        self.device.pushFile(localFile, remoteFile)
                        pushed_libs_count += 1

        return pushed_libs_count

    def setupModules(self):
        if self.testingModulesDir:
            self.device.pushDir(self.testingModulesDir, self.remoteModulesDir)

    def setupTestDir(self):
        print 'pushing %s' % self.xpcDir
        try:
            # The tests directory can be quite large: 5000 files and growing!
            # Sometimes - like on a low-end aws instance running an emulator - the push
            # may exceed the default 5 minute timeout, so we increase it here to 10 minutes.
            self.device.pushDir(self.xpcDir,
                                self.remoteScriptsDir,
                                timeout=600,
                                retryLimit=10)
        except TypeError:
            # Foopies have an older mozdevice ver without retryLimit
            self.device.pushDir(self.xpcDir, self.remoteScriptsDir)

    def setupMinidumpDir(self):
        if self.device.dirExists(self.remoteMinidumpDir):
            self.device.removeDir(self.remoteMinidumpDir)
        self.device.mkDir(self.remoteMinidumpDir)

    def buildTestList(self, test_tags=None, test_paths=None):
        xpcshell.XPCShellTests.buildTestList(self,
                                             test_tags=test_tags,
                                             test_paths=test_paths)
        uniqueTestPaths = set([])
        for test in self.alltests:
            uniqueTestPaths.add(test['here'])
        for testdir in uniqueTestPaths:
            abbrevTestDir = os.path.relpath(testdir, self.xpcDir)
            remoteScriptDir = remoteJoin(self.remoteScriptsDir, abbrevTestDir)
            self.pathMapping.append(PathMapping(testdir, remoteScriptDir))
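The pathMapping built above simply mirrors each local test directory under xpcDir onto the remote scripts directory. A standalone sketch of that mapping, with hypothetical paths:

import os

xpcDir = '/builds/obj/_tests/xpcshell'
testdir = os.path.join(xpcDir, 'netwerk/test/unit')
abbrevTestDir = os.path.relpath(testdir, xpcDir)            # 'netwerk/test/unit'
remoteScriptDir = '/mnt/sdcard/tests/xpc/' + abbrevTestDir
print(remoteScriptDir)                                      # /mnt/sdcard/tests/xpc/netwerk/test/unit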
Beispiel #55
0
def run(file_name):
    config_dict = False
    jar = ZipFile(file_name, 'r')
    # Version A
    if 'a.txt' in jar.namelist() and 'b.txt' in jar.namelist():
        pre_key = jar.read('a.txt')
        enckey = ['{0}{1}{0}{1}a'.format('plowkmsssssPosq34r', pre_key),
                  '{0}{1}{0}{1}a'.format('kevthehermitisaGAYXD', pre_key)
                  ]
        coded_jar = jar.read('b.txt')
        config_dict = version_a(enckey, coded_jar)

    # Version B
    if 'ID' in jar.namelist() and 'MANIFEST.MF' in jar.namelist():
        pre_key = jar.read('ID')
        enckey = ['{0}H3SUW7E82IKQK2J2J2IISIS'.format(pre_key)]
        coded_jar = jar.read('MANIFEST.MF')
        config_dict = version_b(enckey, coded_jar)

    # Version C
    if ('resource/password.txt' in jar.namelist()
            and 'resource/server.dll' in jar.namelist()):
        pre_key = jar.read('resource/password.txt')
        enckey = ['CJDKSIWKSJDKEIUSYEIDWE{0}'.format(pre_key)]
        coded_jar = jar.read('resource/server.dll')
        config_dict = version_c(enckey, coded_jar)

    # Version D
    if ('java/stubcito.opp' in jar.namelist()
            and 'java/textito.isn' in jar.namelist()):
        pre_key = jar.read('java/textito.isn')
        enckey = ['TVDKSIWKSJDKEIUSYEIDWE{0}'.format(pre_key)]
        coded_jar = jar.read('java/stubcito.opp')
        config_dict = version_c(enckey, coded_jar)

    # Version E
    if ('java/textito.text' in jar.namelist()
            and 'java/resource.xsx' in jar.namelist()):
        pre_key = jar.read('java/textito.text')
        enckey = ['kevthehermitGAYGAYXDXD{0}'.format(pre_key)]
        coded_jar = jar.read('java/resource.xsx')
        config_dict = version_c(enckey, coded_jar)

    if ('amarillo/asdasd.asd' in jar.namelist()
            and 'amarillo/adqwdqwd.asdwf' in jar.namelist()):
        pre_key = jar.read('amarillo/asdasd.asd')
        enckey = ['kevthehermitGAYGAYXDXD{0}'.format(pre_key)]
        coded_jar = jar.read('amarillo/adqwdqwd.asdwf')
        config_dict = version_c(enckey, coded_jar)

    # Version F
    if 'config/config.perl' in jar.namelist():
        temp_config = xor_config(jar.read('config/config.perl'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['kevthehermitGAYGAYXDXD{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_c(enckey, coded_jar)

    # Version G
    if 'config/config.pl' in jar.namelist():
        temp_config = xor_config(jar.read('config/config.pl'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['kevthehermitGAYGAYGAYD{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_c(enckey, coded_jar)

    # Version H
    if 'config/config.ini' in jar.namelist():
        temp_config = xor_config(jar.read('config/config.ini'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['kevthehermitGAYGAYGAYD{0}'.format(temp_config["PASSWORD"]),
                  'kevthehermitGADGAYGAYD{}'.format(temp_config["PASSWORD"])]
        config_dict = version_c(enckey, coded_jar)

    # Version I
    if 'windows/windows.ini' in jar.namelist():
        temp_config = xor_config(jar.read('windows/windows.ini'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['kevthehermitGADGAYGAYD{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_c(enckey, coded_jar)

    # Version J
    if 'components/linux.plsk' in jar.namelist():
        temp_config = xor_config(jar.read('components/linux.plsk'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['kevthehermitGADGAYGAYD{0}'.format(temp_config["PASSWORD"]),
                  'LDLDKFJVUI39OWIS9WOQ92{}'.format(temp_config["PASSWORD"])]
        config_dict = version_c(enckey, coded_jar)
        if config_dict is None:
            config_dict = version_d(enckey, coded_jar)

    # Version K
    if 'components/manifest.ini' in jar.namelist():
        temp_config = xor_config(jar.read('components/manifest.ini'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['LDLDKFJVUI39OWIS9WOQ93{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_d(enckey, coded_jar)

    # Version L
    if 'components/mac.hwid' in jar.namelist():
        temp_config = xor_config(jar.read('components/mac.hwid'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['LDLDKFJVUI39OWIS9WOQ92{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_d(enckey, coded_jar)

    # Version M
    if 'components/logo.png' in jar.namelist():
        temp_config = xor_config(jar.read('components/logo.png'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['LDLDKFJVUI39OWIS9WOQ93{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_d(enckey, coded_jar)

    # Version N
    if 'components/picture.gif' in jar.namelist():
        temp_config = xor_config(jar.read('components/picture.gif'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['TDLDKFJVUI39OWIS9WOQ93{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_d(enckey, coded_jar)

    # Version O
    if 'klip/clip.mp4' in jar.namelist():
        temp_config = xor_config(jar.read('klip/clip.mp4'))
        coded_jar = jar.read(temp_config['SERVER'][1:])
        enckey = ['TKLDKFJVUI39OWIS9WOQ93{0}'.format(temp_config["PASSWORD"])]
        config_dict = version_d(enckey, coded_jar)

    return config_dict
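A minimal usage sketch for the extractor above (the jar path is illustrative; xor_config and the version_a/version_b/version_c/version_d helpers are assumed to be defined elsewhere in the same module):

config = run('sample.jar')
if config:
    for key, value in sorted(config.items()):
        print('{0}: {1}'.format(key, value))
else:
    print('no recognised config layout found in the archive')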
Beispiel #56
0
    def preInstallCheck(self, eggInstall=True):
        """Check that prerequisite zenpacks are installed.
        Return True if no prereqs specified or if they are present.
        False otherwise.
        """
        if eggInstall:
            installedPacks = dict((pack.id, pack.version) \
                             for pack in self.dataroot.ZenPackManager.packs())

            if self.options.installPackName.lower().endswith('.egg'):
                # standard prebuilt egg
                if not os.path.exists(self.options.installPackName):
                    raise ZenPackNotFoundException("Unable to find ZenPack named '%s'" % \
                                           self.options.installPackName)
                zf = ZipFile(self.options.installPackName)
                if 'EGG-INFO/requires.txt' in zf.namelist():
                    reqZenpacks = zf.read('EGG-INFO/requires.txt').split('\n')
                else:
                    return True
            else:
                # source egg, no prebuilt egg-info
                with get_temp_dir() as tempEggDir:
                    cmd = '%s setup.py egg_info -e %s' % \
                                                (binPath('python'), tempEggDir)
                    subprocess.call(cmd,
                                    shell=True,
                                    stdout=open('/dev/null', 'w'),
                                    cwd=self.options.installPackName)

                    eggRequires = os.path.join(
                        tempEggDir, self.options.installPackName + '.egg-info',
                        'requires.txt')
                    if os.path.isfile(eggRequires):
                        reqZenpacks = open(eggRequires, 'r').read().split('\n')
                    else:
                        return True

            prereqsMet = True
            for req in reqZenpacks:
                if not req.startswith('ZenPacks'):
                    continue
                for parsed_req in parse_requirements([req]):
                    installed_version = installedPacks.get(
                        parsed_req.project_name, None)
                    if installed_version is None:
                        self.log.error(
                            'Zenpack %s requires %s' %
                            (self.options.installPackName, parsed_req))
                        prereqsMet = False
                    else:
                        if not installed_version in parsed_req:
                            self.log.error(
                                'Zenpack %s requires %s, found: %s' %
                                (self.options.installPackName, parsed_req,
                                 installed_version))
                            prereqsMet = False
            return prereqsMet

        if os.path.isfile(self.options.installPackName):
            zf = ZipFile(self.options.installPackName)
            for name in zf.namelist():
                if name.endswith('/%s' % CONFIG_FILE):
                    sio = StringIO(zf.read(name))
                    break
            else:
                return True
        else:
            name = os.path.join(self.options.installPackName, CONFIG_FILE)
            if os.path.isfile(name):
                fp = open(name)
                sio = StringIO(fp.read())
                fp.close()
            else:
                return True

        parser = ConfigParser.SafeConfigParser()
        parser.readfp(sio, name)
        if parser.has_section(CONFIG_SECTION_ABOUT) \
            and parser.has_option(CONFIG_SECTION_ABOUT, 'requires'):
            requires = eval(parser.get(CONFIG_SECTION_ABOUT, 'requires'))
            if not isinstance(requires, list):
                requires = [zp.strip() for zp in requires.split(',')]
            missing = [
                zp for zp in requires
                if zp not in self.dataroot.ZenPackManager.packs.objectIds()
            ]
            if missing:
                self.log.error('ZenPack %s was not installed because' %
                               self.options.installPackName +
                               ' it requires the following ZenPack(s): %s' %
                               ', '.join(missing))
                return False
        return True
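The version comparison above leans on pkg_resources requirement objects: "installed_version in parsed_req" is true only when the installed version satisfies the requirement's specifier. A standalone sketch, assuming parse_requirements comes from pkg_resources and using a made-up package name:

from pkg_resources import parse_requirements

req = next(iter(parse_requirements(['ZenPacks.example.Pack>=1.2'])))
print('1.3' in req)   # True: 1.3 satisfies >=1.2
print('1.0' in req)   # False: prerequisite would be reported as unmet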
Beispiel #57
0
def reset_orig_chapters_epub(inputio, outfile):
    inputepub = ZipFile(inputio,
                        'r')  # works equally well with a path or a blob

    ## build zip in memory in case updating in place(CLI).
    zipio = BytesIO()

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(zipio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    changed = False

    unmerge_tocncxdoms = {}
    ## spin through file contents, saving any unmerge toc.ncx files.
    for zf in inputepub.namelist():
        ## logger.debug("zf:%s"%zf)
        if zf.endswith('/toc.ncx'):
            ## logger.debug("toc.ncx zf:%s"%zf)
            unmerge_tocncxdoms[zf] = parseString(inputepub.read(zf))

    unmerge_navxhtmldoms = {}
    ## spin through file contents, saving any unmerge nav.xhtml files.
    for zf in inputepub.namelist():
        ## logger.debug("zf:%s"%zf)
        if zf.endswith('/nav.xhtml'):
            ## logger.debug("toc.ncx zf:%s"%zf)
            unmerge_navxhtmldoms[zf] = parseString(inputepub.read(zf))

    tocncxdom = parseString(inputepub.read('toc.ncx'))
    if 'nav.xhtml' in inputepub.namelist():
        navxhtmldom = parseString(inputepub.read('nav.xhtml'))
    else:
        navxhtmldom = None
    ## spin through file contents.
    for zf in inputepub.namelist():
        if zf not in [
                'mimetype', 'toc.ncx', 'nav.xhtml'
        ] and not zf.endswith('/toc.ncx') and not zf.endswith('/nav.xhtml'):
            entrychanged = False
            data = inputepub.read(zf)
            # if isinstance(data,unicode):
            #     logger.debug("\n\n\ndata is unicode\n\n\n")
            if re.match(r'.*/file\d+\.xhtml', zf):
                #logger.debug("zf:%s"%zf)
                data = data.decode('utf-8')
                soup = make_soup(data)

                chapterorigtitle = None
                tag = soup.find('meta', {'name': 'chapterorigtitle'})
                if tag:
                    chapterorigtitle = tag['content']

                # toctitle is separate for add_chapter_numbers:toconly users.
                chaptertoctitle = chapterorigtitle
                tag = soup.find('meta', {'name': 'chaptertoctitle'})
                if tag:
                    chaptertoctitle = tag['content']

                chaptertitle = None
                tag = soup.find('meta', {'name': 'chaptertitle'})
                if tag:
                    chaptertitle = tag['content']
                    chaptertitle_tag = tag

                #logger.debug("chaptertitle:(%s) chapterorigtitle:(%s)"%(chaptertitle, chapterorigtitle))
                if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
                    origdata = data
                    # data = data.replace(u'<meta name="chaptertitle" content="'+chaptertitle+u'"></meta>',
                    #                     u'<meta name="chaptertitle" content="'+chapterorigtitle+u'"></meta>')
                    # data = data.replace(u'<title>'+chaptertitle+u'</title>',u'<title>'+chapterorigtitle+u'</title>')
                    # data = data.replace(u'<h3>'+chaptertitle+u'</h3>',u'<h3>'+chapterorigtitle+u'</h3>')
                    chaptertitle_tag['content'] = chapterorigtitle
                    title_tag = soup.find('title')
                    if title_tag and title_tag.string == chaptertitle:
                        title_tag.string.replace_with(chapterorigtitle)

                    h3_tag = soup.find('h3')
                    if h3_tag and h3_tag.string == chaptertitle:
                        h3_tag.string.replace_with(chapterorigtitle)

                    data = unicode(soup)

                    entrychanged = (origdata != data)
                    changed = changed or entrychanged

                    if entrychanged:
                        logger.debug("\nentrychanged:%s\n" % zf)
                        _replace_tocncx(tocncxdom, zf, chaptertoctitle)
                        if navxhtmldom:
                            _replace_navxhtml(navxhtmldom, zf, chaptertoctitle)
                        ## Also look for and update individual
                        ## book toc.ncx files for anthology in case
                        ## it's unmerged.
                        zf_toc = zf[:zf.rfind('/OEBPS/')] + '/toc.ncx'
                        # unmerged nav files live beside the per-book toc.ncx.
                        zf_nav = zf[:zf.rfind('/OEBPS/')] + '/nav.xhtml'
                        mergedprefix_len = len(zf[:zf.rfind('/OEBPS/')]) + 1

                        if zf_toc in unmerge_tocncxdoms:
                            _replace_tocncx(unmerge_tocncxdoms[zf_toc],
                                            zf[mergedprefix_len:],
                                            chaptertoctitle)
                        if zf_nav in unmerge_navxhtmldoms:
                            _replace_navxhtml(unmerge_navxhtmldoms[zf_nav],
                                              zf[mergedprefix_len:],
                                              chaptertoctitle)

                outputepub.writestr(zf, data.encode('utf-8'))
            else:
                # possibly binary data, thus no .encode().
                outputepub.writestr(zf, data)

    for tocnm, tocdom in unmerge_tocncxdoms.items():
        outputepub.writestr(tocnm, tocdom.toxml(encoding='utf-8'))
    for navnm, navdom in unmerge_navxhtmldoms.items():
        outputepub.writestr(navnm, navdom.toxml(encoding='utf-8'))

    outputepub.writestr('toc.ncx', tocncxdom.toxml(encoding='utf-8'))
    if navxhtmldom:
        outputepub.writestr('nav.xhtml', navxhtmldom.toxml(encoding='utf-8'))
    outputepub.close()
    # declare all the files as created by Windows; otherwise, when this
    # runs in appengine, windows unzips the files with 000 perms.
    for zf in outputepub.filelist:
        zf.create_system = 0

    # only *actually* write if changed.
    if changed:
        if isinstance(outfile, basestring):
            with open(outfile, "wb") as outputio:
                outputio.write(zipio.getvalue())
        else:
            outfile.write(zipio.getvalue())

    inputepub.close()
    zipio.close()

    return changed
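A note on the mimetype handling above: writing the mimetype entry with ZIP_STORED, closing, and re-opening with ZIP_DEFLATED works around old Python versions (2.4/2.5) that could not choose compression per file. On Python 2.7+/3.x the same layout can be produced in one pass by passing compress_type to writestr; a minimal sketch, not how the function above does it:

from io import BytesIO
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED

buf = BytesIO()
with ZipFile(buf, 'w', compression=ZIP_DEFLATED) as z:
    # mimetype must be the first entry and must be stored uncompressed.
    z.writestr('mimetype', 'application/epub+zip', compress_type=ZIP_STORED)
    z.writestr('OEBPS/content.opf', '<package/>')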
Beispiel #58
0
class MovieLens100kDataManager:
    DOWNLOAD_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
    DEFAULT_PATH = os.path.expanduser('~/.ml-100k.zip')

    @classmethod
    def _read_interaction(cls, byte_stream):
        with BytesIO(byte_stream) as ifs:
            data = pd.read_csv(
                ifs,
                sep='\t',
                header=None,
                names=['user_id', 'movie_id', 'rating', 'timestamp'])
            data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
            return data

    def __init__(self, zippath=None):
        if zippath is None:
            zippath = self.DEFAULT_PATH
            if not os.path.exists(zippath):
                download = input(
                    'Could not find {}.\nCan I download and save it there? [y/N]'
                    .format(zippath))
                if download.lower() == 'y':
                    urllib.request.urlretrieve(self.DOWNLOAD_URL, zippath)

        if os.path.exists(zippath):
            self.zf = ZipFile(zippath)
        else:
            self.zf = None

    def load_rating(self, random_state=114514, fold=None):
        if fold is None:
            df_all = self._read_interaction(self.zf.read('ml-100k/u.data'))
            df_train, df_test = train_test_split(df_all,
                                                 random_state=random_state)
        else:
            assert fold >= 1 and fold <= 5
            train_path = 'ml-100k/u{}.base'.format(fold)
            test_path = 'ml-100k/u{}.test'.format(fold)
            df_train = self._read_interaction(self.zf.read(train_path))
            df_test = self._read_interaction(self.zf.read(test_path))
        return df_train, df_test

    def load_userinfo(self):
        user_info_bytes = self.zf.read('ml-100k/u.user')
        with BytesIO(user_info_bytes) as ifs:
            return pd.read_csv(
                ifs,
                sep='|',
                header=None,
                names=['user_id', 'age', 'gender', 'occupation', 'zipcode'])

    def load_movieinfo(self):
        MOVIE_COLUMNS = ['movie_id', 'title', 'release_date', 'unk', 'url']
        with BytesIO(self.zf.read('ml-100k/u.genre')) as ifs:
            genres = pd.read_csv(ifs, sep='|', header=None)[0]
        with BytesIO(self.zf.read('ml-100k/u.item')) as ifs:
            df_mov = pd.read_csv(
                ifs,
                sep='|',
                encoding='latin-1',
                header=None,
            )
            df_mov.columns = (MOVIE_COLUMNS + list(genres))
        df_mov['release_date'] = pd.to_datetime(df_mov.release_date)
        return df_mov, list(genres)
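A minimal usage sketch for the data manager above (the local path is illustrative and assumes the archive has already been downloaded):

dm = MovieLens100kDataManager('ml-100k.zip')
df_train, df_test = dm.load_rating(fold=1)   # one of the predefined 5-fold splits
df_users = dm.load_userinfo()
df_movies, genres = dm.load_movieinfo()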
Beispiel #59
0
def extract_zip(input_zip):
    input_zip = ZipFile(input_zip)
    return {name: input_zip.read(name) for name in input_zip.namelist()}
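A quick usage sketch for extract_zip (the archive name is illustrative):

files = extract_zip('archive.zip')
for name, data in files.items():
    print(name, len(data))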
import numpy as np
import pandas as pd
from scipy.stats import norm
import statsmodels.api as sm
import matplotlib.pyplot as plt

import requests
from io import BytesIO
from zipfile import ZipFile

# Download the dataset
dk = requests.get('http://www.ssfpack.com/files/DK-data.zip').content
f = BytesIO(dk)
zipped = ZipFile(f)
df = pd.read_table(BytesIO(zipped.read('internet.dat')),
                   skiprows=1,
                   header=None,
                   sep=r'\s+',
                   engine='python',
                   names=['internet', 'dinternet'])

# ### Model Selection
#
# As in Durbin and Koopman, we force a number of the values to be missing.

# Get the basic series
dta_full = df.dinternet[1:].values
dta_miss = dta_full.copy()

# Remove datapoints