Example #1
def main():
    parser = argparse.ArgumentParser('Update the icomoon icon font from the provided archive')
    parser.add_argument('archive', help='Path to .zip file generated by icomoon')
    args = parser.parse_args()

    script_dir = os.path.dirname(os.path.abspath(__file__))
    vendor_style_dir = script_dir + '/../h/static/styles/vendor'

    icon_font_archive = ZipFile(args.archive)
    icon_font_archive.extract('selection.json', vendor_style_dir + '/fonts')
    icon_font_archive.extract('fonts/h.woff', vendor_style_dir)
    css_input_file = icon_font_archive.open('style.css')

    css_output_file = open(vendor_style_dir + '/icomoon.css', 'w')

    for line in css_input_file:
        if "format('woff')" in line:
            # inline the WOFF format file
            woff_content = icon_font_archive.open('fonts/h.woff').read()
            woff_src_line = """
    /* WARNING - the URL below is inlined
     * because the CSS asset pipeline is not correctly rebasing
     * URLs when concatenating files together.
     *
     * See issue #2571
     */
    src:url('data:application/font-woff;base64,%s') format('woff');
"""
            css_output_file.write(woff_src_line % b64encode(woff_content))
        elif "url(" in line:
            # skip non-WOFF format fonts
            pass
        else:
            css_output_file.write(line)
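One caveat about the example above: ZipFile.open() always returns a binary file object, so under Python 3 the substring checks would compare str against bytes. A minimal sketch, assuming a hypothetical archive name ('icons.zip'), that wraps the member in io.TextIOWrapper so the loop sees decoded text:

import io
from zipfile import ZipFile

with ZipFile('icons.zip') as archive:                      # hypothetical archive
    with io.TextIOWrapper(archive.open('style.css'),       # binary member -> text stream
                          encoding='utf-8') as css:
        for line in css:
            if "format('woff')" in line:                   # plain str comparison now works
                print(line.strip())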
Example #2
 def parse(self, file):
     epub = ZipFile(file)
     if epub.read('mimetype') != 'application/epub+zip'.encode('ascii'):
         raise BadEPub
     with epub.open('META-INF/container.xml') as container_file:
         container = etree.parse(container_file).getroot()
     rootfiles = container.find('./cnt:rootfiles', NS_MAP)
     for rootfile in rootfiles.findall('./cnt:rootfile', NS_MAP):
         if rootfile.get('media-type') != 'application/oebps-package+xml':
             raise BadEPub
         content_path = rootfile.get('full-path')
         break   # only try the first rootfile
     content_dir = os.path.dirname(content_path)
     flowables = []
     with epub.open(content_path) as content_file:
         package = etree.parse(content_file).getroot()
         metadata = package.find('./opf:metadata', NS_MAP)
         print(metadata.find('./dc:title', NS_MAP).text)
         print(metadata.find('./dc:creator', NS_MAP).text)
         manifest = package.find('./opf:manifest', NS_MAP)
         items = {item.get('id'): item
                  for item in manifest.findall('./opf:item', NS_MAP)}
         spine = package.find('./opf:spine', NS_MAP)
         for itemref in spine.findall('./opf:itemref', NS_MAP):
             item = items[itemref.get('idref')]
             filename = os.path.join(content_dir, item.get('href'))
             if filename.endswith('pt04.html'):
                 break
             print(filename)
             with epub.open(filename) as xhtml_file:
                 xhtml_parser = elementtree.Parser(CustomElement)
                 xhtml_tree = xhtml_parser.parse(xhtml_file)
             for flowable in self.from_doctree(xhtml_tree.getroot()):
                 flowables.append(flowable)
     return flowables
Example #3
def populate(show):
    if show in populated:
        return
    else:
        populated.append(show)
    showId = getTvdbId(show)
    endpoint = 'http://thetvdb.com/api/{apikey}/series/{showId}/all/en.zip'.format(apikey=api_key, showId=showId)
    r = requests.get(endpoint)
    z = ZipFile(BytesIO(r.content))
    dataFile = z.open('en.xml')
    data = BeautifulSoup(dataFile.read(),'lxml')
    dataFile.close()
    bannerFile = z.open('banners.xml')
    banners = BeautifulSoup(bannerFile.read(),'lxml')
    bannerFile.close()

    box_url = ''
    for b in banners.find_all('banner'):
        if b.bannertype.string == 'fanart':
            box_url = 'http://thetvdb.com/banners/'+b.bannerpath.string
            break
    localCache[show]['box_url'] = str(box_url)[:]
    seasons = set([int(s.string) for s in data.find_all('seasonnumber') if s.string != '0'])
    for season in seasons:
        localCache[show]['seasons'][int(season)+0] = {'episodes':{}}
        episodes = [e for e in data.find_all('episode') if int(e.seasonnumber.string) == season]
        for episode in episodes:
            number = int(episode.episodenumber.string)+0
            epName = str(episode.episodename.string)[:]
            epThumb = 'http://thetvdb.com/banners/'+str(episode.filename.string)[:]
            localCache[show]['seasons'][season]['episodes'][int(number)+0] = {
                'name':str(epName)[:],
                'thumb_url':str(epThumb)[:]
            }
    z.close()
Example #4
    def handle_label(self, label, **options):
        zip = ZipFile(label)

        map = {}

        map['users'] = self.import_users(zip.open("Users.xml"))
        map['questions'], map['answers'] = self.import_posts(zip.open("Posts.xml"))
Example #5
File: db.py Project: ajm/glutton
    def _read(self) :
        global MANIFEST_FNAME

        z = ZipFile(self.fname, 'r', compression=self.compression)
    
        def _err(msg) :
            z.close()
            raise GluttonImportantFileNotFoundError(msg)
    
        # without the manifest all is lost
        # we need this to get the names of the other
        # XML files
        if MANIFEST_FNAME not in z.namelist() :
            _err('manifest not found in %s' % self.fname)

        self.metadata = json.load(z.open(MANIFEST_FNAME))
        
        self.log.info("read manifest - created on %s using glutton version %.1f" % \
            (time.strftime('%d/%m/%y at %H:%M:%S', time.localtime(self.download_time)), \
             self.version))

        # the data file is the raw data grouped into gene families
        # when we do a local alignment we need to get the gene id
        # of the best hit and find out which gene family it belongs to 
        if self.metadata['data-file'] not in z.namelist() :
            _err('data file (%s) not found in %s' % (self.metadata['data-file'], self.fname))

        self.data = json_to_glutton(json.load(z.open(self.metadata['data-file'])))
        self.seq2famid = self._create_lookup_table(self.data)

        self.log.info("read %d gene families (%d genes)" % (len(self.data), len(self.seq2famid)))

        z.close()
Example #6
def unzip(filename):
    z = ZipFile(filename)
    names = z.namelist()
    for path in names:
        if path.startswith('__MACOSX/'):
            continue

        base, name = os.path.split(path)

        if name.startswith('._') and\
            '%s/' % name.replace('._', '', 1) in names:
            continue

        double = os.path.join('__MACOSX', base, '._' + name)
        if double in names:
            print '=> %s.bin' % path

            info = z.getinfo(path)

            bin = MacBinary(name)
            bin.data = z.open(path, 'r').read()
            bin.res = z.open(double, 'r').read()

            modified = datetime.datetime(*info.date_time)
            bin.modified = time.mktime(modified.timetuple())
            bin.created = time.time()

            if not os.path.exists(base):
                os.makedirs(base)

            with open('%s.bin' % path.rstrip('\r'), 'wb') as f:
                f.write(bin.encode())
        else:
            print '-> %s' % path
            z.extract(path)
Example #7
def read_jar(jar):
    zf = symname = version = lastmod = cmtid = cmttime = origin = cmtdesc = branch = None
    try:
        zf = ZipFile(jar)
        manifest_f = None
        try:
            manifest_f = zf.open('META-INF/MANIFEST.MF')
            manifest = manifest_f.read()
            symname = re_search(r'Bundle-SymbolicName: (.*?)(?:\s|;)', manifest)
            version = re_search(r'Bundle-Version: (.*?)(?:\s|;)', manifest)
            lastmod = re_search(r'Bnd-LastModified: (.*?)(?:\s|;)', manifest)
        except:
            traceback.print_exc()
        finally: 
            if manifest_f: manifest_f.close()
        gitprops_f = None
        try:
            gitprops_f = zf.open('git.properties')
            gitprops = gitprops_f.read()
            cmtid = re_search(r'git.commit.id.abbrev=(.*?)\n', gitprops)
            cmttime = re_search(r'git.commit.time=(.*?)\n', gitprops)
            origin = re_search(r'git.remote.origin.url=(.*?)\n', gitprops)
            cmtdesc = re_search(r'git.commit.id.describe=(.*?)\n', gitprops)
            branch = re_search(r'git.branch=(.*?)\n', gitprops)
        except KeyError:
            pass
        except:
            traceback.print_exc()
        finally:
            if gitprops_f: gitprops_f.close()
    finally:
        if zf: zf.close()
    return (symname, version, lastmod, cmtid, cmttime, origin, cmtdesc, branch)
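read_jar() depends on a re_search() helper that is not shown here. Presumably it is a thin wrapper around re.search() that returns the first capture group or None; a possible sketch, offered only as an assumption:

import re

def re_search(pattern, text):
    # assumed shape of the helper: first capture group of re.search(), or None
    match = re.search(pattern, text)
    return match.group(1) if match else None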
Example #8
    def handle_noargs(self, **options):

        is_verbose = options['verbosity'] > 0

        if is_verbose:
            print "Syncing into", self._static_root
            print "Getting zip from ", self.ZIP_URL

        zip_url = urllib2.urlopen(self.ZIP_URL)
        zip_file = ZipFile(StringIO(zip_url.read()))

        for member in zip_file.namelist():
            # we take only dist, css, and img directories
            dir_name, file_name = os.path.split(member)

            # skip directories
            if not file_name:
                continue

            _, base_dir = os.path.split(dir_name)

            if file_name == self._html_file:
                if is_verbose:
                    print "Adopting ", self._html_file

                # adopt the html to template
                source = zip_file.open(member)
                content = "{% load static from staticfiles %}\n" + source.read()

                for orig, replacement in self._replacements:
                    content = content.replace(orig, replacement)

                source.close()

                target = os.path.join(self._templates_root, 'agendas',
                                      self._html_file)

                with open(target, 'w') as f:
                    f.write(content)

            elif base_dir in self._DIRS:
                target_dir = self._DIRS[base_dir]
                if is_verbose:
                    print "Copying {0} to {1}".format(member, target_dir)

                # make sure we have the target_dir dir
                try:
                    os.makedirs(target_dir)
                except OSError:
                    pass

                source = zip_file.open(member)
                target = file(os.path.join(target_dir, file_name), "wb")

                shutil.copyfileobj(source, target)

                source.close()
                target.close()
Example #9
class Feed(object):
    """A collection of CSV files with headers, either zipped into an archive
    or loose in a folder."""

    def __init__(self, filename, strip_fields=True):
        self.filename = filename
        self.feed_name = derive_feed_name(filename)
        self.zf = None
        self.strip_fields = strip_fields
        self.empty_to_none = True
        if not os.path.isdir(filename):
            self.zf = ZipFile(filename)
        if six.PY2:
            self.reader = self.python2_reader
        else:
            self.reader = self.python3_reader

    def __repr__(self):
        return '<Feed %s>' % self.filename

    def python2_reader(self, filename):
        if self.zf:
            try:
                binary_file_handle = self.zf.open(filename, 'rU')
            except IOError:
                raise IOError('%s is not present in feed' % filename)
        else:
            binary_file_handle = open(os.path.join(self.filename, filename),
                                      "rb")
        reader = csv.reader(binary_file_handle)
        for row in reader:
            yield [six.text_type(x, 'utf-8') for x in row]

    def python3_reader(self, filename):
        if self.zf:
            try:
                text_file_handle = io.TextIOWrapper(
                    self.zf.open(filename, "r"), encoding="utf-8")
            except IOError:
                raise IOError('%s is not present in feed' % filename)
        else:
            text_file_handle = open(os.path.join(self.filename, filename), "r",
                                    encoding="utf-8")
        return csv.reader(text_file_handle)

    def read_table(self, filename, columns):
        if self.strip_fields:
            rows = (_row_stripper(row) for row in self.reader(filename))
        else:
            rows = self.reader(filename)
        if self.empty_to_none:
            # Set empty strings to None, let nullable handle missing values.
            rows = ((x if x else None for x in row) for row in rows)
        feedtype = filename.rsplit('/')[-1].rsplit('.')[0].title().replace('_',
                                                                           '')
        return CSV(feedtype=feedtype, rows=rows, columns=columns)
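The Feed class above dispatches between a zip archive and a loose directory and between Python 2 and 3 readers. A minimal standalone sketch of the same dual-source idea, assuming Python 3 and hypothetical paths:

import csv
import io
import os
from zipfile import ZipFile

def iter_rows(source, filename):
    """Yield decoded CSV rows from either a zip archive or a directory."""
    if os.path.isdir(source):
        with open(os.path.join(source, filename), encoding='utf-8') as handle:
            yield from csv.reader(handle)
    else:
        with ZipFile(source) as archive:
            with io.TextIOWrapper(archive.open(filename),
                                  encoding='utf-8') as handle:
                yield from csv.reader(handle)

# e.g. for row in iter_rows('feed.zip', 'stops.txt'): ...   (hypothetical names)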
Example #10
def read_single_sheet(path, name=None):
    """ Read an xlsx, csv or tsv from a zipfile or directory
    """
    from zipfile import ZipFile
    import xlreader

    if name is None:
        root, ext = os.path.splitext(path)
        stream = open(path, 'rb')

        if ext == '.xlsx':
            return read_xl(stream)

        if ext == '.tsv':
            return read_csv(stream, dialect='excel-tab')

        if ext == '.csv':
            return read_csv(stream)

        raise ValueError('Unknown file extension for %r' % path)

    if path.endswith('.xlsx'):
        return xlreader.DictReader(open(path, 'rb'), sheetname=name)

    if path.endswith('.zip'):
        zf = ZipFile(path)
        names = zf.namelist()

        if (name + '.xlsx') in names:
            stream = zf.open(name + '.xlsx', 'r')
            return read_xl(stream)

        if (name + '.tsv') in names:
            stream = zf.open(name + '.tsv', 'rU')
            return read_csv(stream, dialect='excel-tab')

        if (name + '.csv') in names:
            stream = zf.open(name + '.csv', 'rU')
            return read_csv(stream)

    if os.path.isdir(path):
        root = os.path.join(path, name)

        if os.path.exists(root + '.xlsx'):
            stream = open(root + '.xlsx', 'rb')
            return read_xl(stream)

        if os.path.exists(root + '.tsv'):
            stream = open(root + '.tsv', 'rbU')
            return read_csv(stream, dialect='excel-tab')

        if os.path.exists(root + '.csv'):
            stream = open(root + '.csv', 'rbU')
            return read_csv(stream)

    return []
Example #11
  def testConvertDocy(self):
    """Test conversion of docy to docx and back"""
    x_data = Handler(self.tmp_url, open("data/test_with_image.docy").read(), "docy", **self.kw).convert("docx")
    self.assertIn("word/", x_data[:2000])

    y_data = Handler(self.tmp_url, x_data, "docx", **self.kw).convert("docy")
    y_zip = ZipFile(StringIO(y_data))
    y_body_data = y_zip.open("body.txt").read()
    self.assertTrue(y_body_data.startswith("DOCY;v10;0;"), "%r... does not start with 'DOCY;v10;0;'" % (y_body_data[:20],))
    y_zip.open("media/image1.png")
Example #12
 def __call__(self, zipfile):
     zipfile = ZipFile(zipfile)
     filenames = zipfile.namelist()
     xls_files = [x for x in filenames if x.endswith('xls')]
     doc_files = [x for x in filenames if x.endswith('doc')]
     if len(xls_files) > 1:
         raise Exception(_("Zip file contains too many excel files"))
     if not xls_files:
         raise Exception(_("Zip file contains no excel files"))
     return StringIO(zipfile.open(xls_files[0]).read()), [StringIO(zipfile.open(x).read()) for x in doc_files]
Example #13
  def testConvertDocx(self):
    """Test conversion of docx to docy and back"""
    y_data = Handler(self.tmp_url, open("data/test_with_image.docx").read(), "docx", **self.kw).convert("docy")
    y_zip = ZipFile(StringIO(y_data))
    y_body_data = y_zip.open("body.txt").read()
    self.assertTrue(y_body_data.startswith("DOCY;v10;0;"), "%r... does not start with 'DOCY;v10;0;'" % (y_body_data[:20],))
    y_zip.open("media/image1.png")

    x_data = Handler(self.tmp_url, y_data, "docy", **self.kw).convert("docx")
    # magic inspired by https://github.com/minad/mimemagic/pull/19/files
    self.assertIn("word/", x_data[:2000])
Example #14
    def import_gtfs(self, gtfs_file, verbose=False):
        """Import a GTFS file as feed

        Keyword arguments:
        gtfs_file - A path or file-like object for the GTFS feed

        Returns a list of the imported objects
        """
        z = ZipFile(gtfs_file, 'r')
        files = z.namelist()

        gtfs_order = (
            ('agency.txt', Agency),
            ('stops.txt', Stop),
            ('routes.txt', Route),
            ('calendar.txt', Service),
            ('calendar_dates.txt', ServiceDate),
            ('shapes.txt', ShapePoint),
            ('trips.txt', Trip),
            ('stop_times.txt', StopTime),
            ('frequencies.txt', Frequency),
            ('fare_attributes.txt', Fare),
            ('fare_rules.txt', FareRule),
            ('transfers.txt', Transfer),
            ('feed_info.txt', FeedInfo),
        )

        post_save.disconnect(dispatch_uid='post_save_shapepoint')
        post_save.disconnect(dispatch_uid='post_save_stop')
        try:
            for table_name, klass in gtfs_order:
                for f in files:
                    if f.endswith(table_name):
                        table = z.open(f, 'rU')
                        if verbose:
                            rows = len(list(csv.reader(table)))
                            print("importing {x} rows of {table}".format(x=rows, table=table_name))

                        table = z.open(f, 'rU')
                        klass.import_txt(table, self, verbose=verbose)
        finally:
            post_save.connect(post_save_shapepoint, sender=ShapePoint)
            post_save.connect(post_save_stop, sender=Stop)

        # Update geometries
        print("updating geometries...")
        # TODO: Add test feed that includes shapes (issue #20)
        for shape in self.shape_set.all():  # pragma: no cover
            shape.update_geometry(update_parent=False)
        for trip in Trip.objects.in_feed(self):
            trip.update_geometry(update_parent=False)
        for route in self.route_set.all():
            route.update_geometry()
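The importer above opens each table with z.open(f, 'rU'); the 'U' (universal newlines) flag is legacy, and recent Python 3 releases accept only 'r' or 'w'. A hedged sketch, with a hypothetical feed path, of the usual replacement: open the member in binary and wrap it for csv:

import csv
import io
from zipfile import ZipFile

with ZipFile('gtfs_feed.zip') as z:                        # hypothetical feed
    with io.TextIOWrapper(z.open('stops.txt'),             # 'r' is the only read mode
                          encoding='utf-8-sig') as table:  # tolerate a UTF-8 BOM
        for row in csv.reader(table):
            print(row)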
Example #15
def configure_search_replace(request):
    if request.method == 'GET':
        zf_in = ZipFile(request.session['stored_archive_filename'], mode='r')
        all_filenames_lst = zf_in.namelist()
        all_filenames = set(all_filenames_lst)
        assert len(all_filenames) == len(all_filenames_lst), "Duplicate filenames in the input file?!"
        zf_in.close()
        return render_to_response('docx_search_replace/configure_search_replace.html',
                                  {'filenames': sorted(all_filenames)})
    elif request.method == 'POST':
        replacements = []
        for i in range(1, 6):  # We have input fields "from1", "to1"... "from5", "to5"
            if request.POST['from%d' % i]:
                replacements.append((request.POST['from%d' % i], request.POST['to%d' % i]))
        logging.info('replacements: %s' % replacements)

        selected_filenames = [k for k in request.POST if request.POST[k] == 'on']
        logging.info('selected_filenames: %s' % selected_filenames)

        zf_in = ZipFile(request.session['stored_archive_filename'], mode='r')
        all_filenames = zf_in.namelist()
        stored_output_file = tempfile.NamedTemporaryFile(delete=False)
        zf_out = ZipFile(stored_output_file.name, mode='w', compression=zf_in.compression)

        for fname in selected_filenames:
            file_contents = zf_in.open(fname).read().decode('utf-8')
            for r in replacements:
                file_contents = file_contents.replace(*r)
            zf_out.writestr(fname, file_contents.encode('utf-8'))

        filenames_to_copy_unchanged = set(all_filenames) - set(selected_filenames)
        for fname in filenames_to_copy_unchanged:
            zf_out.writestr(fname, zf_in.open(fname).read(), compress_type=ZIP_DEFLATED)

        zf_in.close()
        zf_out.close()

        orig_uploaded_filename = request.session['uploaded_filename']
        if orig_uploaded_filename.endswith('.docx'):
            downloading_filename = re.sub('.docx$', '_EDITED.docx', orig_uploaded_filename)
        else:
            downloading_filename = orig_uploaded_filename + '_EDITED'

        ret_file = open(stored_output_file.name, 'rb')
        resp = HttpResponse(status=200,
                content=ret_file.read(),
                mimetype='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
        resp['Content-Disposition'] = 'attachment; filename="%s"' % downloading_filename
        return resp

    else:
        return HttpResponseBadRequest('Invalid method: %s' % request.method)
Example #16
def main(args):
	if len(args) < 2:
		print 'Usage: analyze_olp.py filename'
		exit(1)
	olpfile = ZipFile(args[1], "r")
	channelfile = olpfile.open('channel.labels')
	channels = [line.strip() for line in channelfile.readlines()]
	channelfile.close()
	chanmap = {}
	for i in xrange(0, len(channels)):
		chanmap[channels[i]] = i
	datafile = olpfile.open('data.ols')
	analyze_delays(chanmap, datafile)
Example #17
def main():
    pattern = re.compile(r'\d{2,}')
    zf = ZipFile('channel.zip')
    fp = zf.open('readme.txt')
    chain = open('chain.txt', 'w')
    text = fp.read()
    number = pattern.search(text).group(0)
    while True:
        finfo = zf.getinfo(number + '.txt')
        print finfo.comment
        print number
        text = zf.open(finfo).read()
        chain.write(finfo.comment)
        number = pattern.search(text).group(0)
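Note that the loop above ends only when pattern.search() finds no further number and raises AttributeError, and chain.txt is never closed. A hedged variant of the same chain walk over the same channel.zip layout, written to stop cleanly:

import re
from zipfile import ZipFile

def follow_chain(archive_path='channel.zip', start_member='readme.txt'):
    pattern = re.compile(r'\d{2,}')
    comments = []
    with ZipFile(archive_path) as zf:
        match = pattern.search(zf.open(start_member).read().decode('ascii'))
        while match:
            info = zf.getinfo(match.group(0) + '.txt')     # next file in the chain
            comments.append(info.comment.decode('ascii'))  # the comments spell out the message
            match = pattern.search(zf.open(info).read().decode('ascii'))
    return ''.join(comments)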
Example #18
def scan(pk3dir, basedir):
    """
    Scan the pk3 files in a folder

    Check for shader conflicts and build a report of texture usages.

    Args:
        pk3dir - Path to a directory containing pk3 files to scan
        basedir - Path to directory of a clean game installation (basewsw)
    """
    logger = logging.getLogger('scan')
    pk3path = Path(pk3dir)
    basepath = Path(basedir)

    if not pk3path.is_dir():
        logger.error('{} is not a valid directory'.format(pk3path))
        sys.exit(1)

    if not basepath.is_dir():
        logger.error('{} is not a valid directory'.format(basepath))
        sys.exit(1)

    # Build an index of base game files to check against
    basefiles = set()
    for pk3file in basepath.glob('*.pk3'):
        pk3zip = ZipFile(str(pk3file))
        for name in pk3zip.namelist():
            if name.endswith('/'):
                continue
            elif name.endswith('.shader'):
                basefiles.update(parse_shader(pk3zip.open(name)))
            else:
                basefiles.add(name)

    # Check if pk3s include same files
    for pk3file in pk3path.glob('*.pk3'):
        try:
            pk3zip = ZipFile(str(pk3file))
        except BadZipfile:
            logging.error('error: {} is not a zipfile!'.format(pk3file))
            continue
        for name in pk3zip.namelist():
            if name in basefiles:
                logging.error('{} overwrites file {}'.format(pk3file, name))

            if name.endswith('.shader'):
                for texture in basefiles & parse_shader(pk3zip.open(name)):
                    logging.error('{} overwrites file {}' \
                                    .format(pk3file, texture))
Example #19
def fetch_wilt(data_home=None, download_if_missing=True,
               random_state=None, shuffle=False):
    """Load the wilt dataset, downloading it if necessary.
    """
    URL = ('http://archive.ics.uci.edu/ml/'
           'machine-learning-databases/00285/wilt.zip')

    data_home = get_data_home(data_home=data_home)
    wilt_dir = join(data_home, "wilt")
    samples_path = _pkl_filepath(wilt_dir, "samples")
    targets_path = _pkl_filepath(wilt_dir, "targets")
    available = exists(samples_path)

    if download_if_missing and not available:
        makedirs(wilt_dir, exist_ok=True)
        logger.warning("Downloading %s" % URL)
        f = BytesIO(urlopen(URL).read())
        # or X = np.load(f)

        ff = ZipFile(f, mode='r')
        file1 = ff.open('training.csv')
        Xy1 = np.genfromtxt(file1, delimiter=',', dtype=object)
        file2 = ff.open('testing.csv')
        Xy2 = np.genfromtxt(file2, delimiter=',', dtype=object)
        # the first row is nan:
        Xy1 = Xy1[1:, :]
        Xy2 = Xy2[1:, :]

        Xy = np.r_[Xy1, Xy2]
        X = Xy[:, 1:].astype(float)
        y = Xy[:, 0]

        joblib.dump(X, samples_path, compress=9)
        joblib.dump(y, targets_path, compress=9)

    try:
        X, y
    except NameError:
        X = joblib.load(samples_path)
        y = joblib.load(targets_path)

    if shuffle:
        ind = np.arange(X.shape[0])
        rng = check_random_state(random_state)
        rng.shuffle(ind)
        X = X[ind]
        y = y[ind]

    return Bunch(data=X, target=y, DESCR=__doc__)
Example #20
def examine_zip(filepath):
    zipper = ZipFile(filepath)
    files = zipper.infolist()

    config_files = [f for f in files if f.filename.lower().endswith(".ini")]
    if config_files:
        config_file = zipper.open(config_files[0])
        config = ConfigReader(file=config_file)
    else:
        config = None

    xml_files = [zipper.open(f) for f in files
                 if f.filename.lower().endswith(".xml")]

    return (config, xml_files)
Example #21
class Dataset(object):
    loaders = {
        'adjustment': AdjustmentLoader,
        'fee': FeeLoader,
        'product': ProductLoader,
        'rate': RateLoader,
        'region': RegionLoader,
    }

    def __init__(self, f):
        self.zf = ZipFile(f)

    @cached_property
    def cover_sheet(self):
        with self.zf.open('CoverSheet.xml') as f:
            return CoverSheet(f)

    @cached_property
    def timestamp(self):
        ts = datetime.combine(self.cover_sheet.date, time.min)
        return timezone.make_aware(ts, timezone.get_current_timezone())

    @cached_property
    def filename_prefix(self):
        return self.cover_sheet.date.strftime('%Y%m%d')

    def load(self):
        # Sort the list of loaders so that Region loads last, as a bellwether
        for key, loader_cls in sorted(self.loaders.items()):
            try:
                f = self.datafile(key)
            except KeyError:
                # The fees data is expected to be temporarily unavailable,
                # so if the fees file is not found, we skip it and
                # continue loading the other data types.
                if key == 'fee':
                    continue
                raise

            # The zip file may be opened as binary, but we want to process the
            # files that it contains as text.
            f_text = io.TextIOWrapper(f)
            loader = loader_cls(f_text, data_timestamp=self.timestamp)
            loader.load()

    def datafile(self, name):
        filename = '{}_{}.txt'.format(self.filename_prefix, name)
        return self.zf.open(filename)
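The except KeyError in Dataset.load() relies on ZipFile.open() raising KeyError when a member name is not present in the archive. A tiny illustration, with a hypothetical archive and member name:

from zipfile import ZipFile

with ZipFile('dataset.zip') as zf:        # hypothetical archive
    try:
        f = zf.open('20200101_fee.txt')   # hypothetical member name
    except KeyError:
        f = None                          # missing member -> skip, as Dataset.load() does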
Example #22
    def aqcuire_all_resources(self, format_dict):
        import cStringIO as StringIO
        from zipfile import ZipFile

        # Download archive.
        url = self.url(format_dict)
        shapefile_online = self._urlopen(url)
        zfh = ZipFile(StringIO.StringIO(shapefile_online.read()), 'r')
        shapefile_online.close()

        # Iterate through all scales and levels and extract relevant files.
        modified_format_dict = dict(format_dict)
        scales = ('c', 'l', 'i', 'h', 'f')
        levels = (1, 2, 3, 4)
        for scale, level in itertools.product(scales, levels):
            modified_format_dict.update({'scale': scale, 'level': level})
            target_path = self.target_path(modified_format_dict)
            target_dir = os.path.dirname(target_path)
            if not os.path.isdir(target_dir):
                os.makedirs(target_dir)

            for member_path in self.zip_file_contents(modified_format_dict):
                ext = os.path.splitext(member_path)[1]
                target = os.path.splitext(target_path)[0] + ext
                member = zfh.getinfo(member_path)
                with open(target, 'wb') as fh:
                    fh.write(zfh.open(member).read())

        zfh.close()
Example #23
def load_otto_group():
    """
    Loads and returns several variables for the data set from Kaggle's Otto Group Product Classification competition.
    Link: https://www.kaggle.com/c/otto-group-product-classification-challenge

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.

    X : array-like
        Training input samples.

    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'otto_group.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('id')

    # move the label to the first position
    cols = data.columns.tolist()
    cols = cols[-1:] + cols[0:-1]
    data = data[cols]

    X = data.iloc[:, 1:].values

    y = data.iloc[:, 0].values

    # transform the labels from strings to integers
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    return data, X, y
Example #24
def get_data_famafrench(name, start=None, end=None):
    start, end = _sanitize_dates(start, end)

    # path of zip files
    zipFileURL = "http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"

    url = urllib.urlopen(zipFileURL + name + ".zip")
    zipfile = ZipFile(StringIO(url.read()))
    data = zipfile.open(name + ".txt").readlines()

    file_edges = np.where(np.array([len(d) for d in data]) == 2)[0]

    datasets = {}
    for i in range(len(file_edges) - 1):
        dataset = [d.split() for d in data[(file_edges[i] + 1):
                                           file_edges[i + 1]]]
        if(len(dataset) > 10):
            ncol = np.median(np.array([len(d) for d in dataset]))
            header_index = np.where(
                np.array([len(d) for d in dataset]) == (ncol - 1))[0][-1]
            header = dataset[header_index]
            # to ensure the header is unique
            header = [str(j + 1) + " " + header[j] for j in range(len(header))]
            index = np.array(
                [d[0] for d in dataset[(header_index + 1):]], dtype=int)
            dataset = np.array(
                [d[1:] for d in dataset[(header_index + 1):]], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
Example #25
def getTranslations(type, localesDir, defaultLocale, projectName, key):
  result = urllib2.urlopen('http://api.crowdin.net/api/project/%s/export?key=%s' % (projectName, key)).read()
  if result.find('<success') < 0:
    raise Exception('Server indicated that the operation was not successful\n' + result)

  result = urllib2.urlopen('http://api.crowdin.net/api/project/%s/download/all.zip?key=%s' % (projectName, key)).read()
  zip = ZipFile(StringIO(result))
  dirs = {}
  for info in zip.infolist():
    if not info.filename.endswith('.json'):
      continue

    dir, file = os.path.split(info.filename)
    if not re.match(r'^[\w\-]+$', dir) or dir == defaultLocale:
      continue
    if type == 'chrome' and file.count('.') == 1:
      origFile = file
    else:
      origFile = re.sub(r'\.json$', '', file)
    if type == 'gecko' and not origFile.endswith('.dtd') and not origFile.endswith('.properties'):
      continue

    mapping = langMappingChrome if type == 'chrome' else langMappingGecko
    for key, value in mapping.iteritems():
      if value == dir:
        dir = key
    if type == 'chrome':
      dir = dir.replace('-', '_')

    data = zip.open(info.filename).read()
    if data == '[]':
      continue

    if not dir in dirs:
      dirs[dir] = set()
    dirs[dir].add(origFile)

    path = os.path.join(localesDir, dir, origFile)
    if not os.path.exists(os.path.dirname(path)):
      os.makedirs(os.path.dirname(path))
    if type == 'chrome' and origFile.endswith('.json'):
      postprocessChromeLocale(path, data)
    elif type == 'chrome':
      data = json.loads(data)
      if origFile in data:
        fileHandle = codecs.open(path, 'wb', encoding='utf-8')
        fileHandle.write(data[origFile]['message'])
        fileHandle.close()
    else:
      fromJSON(path, data)

  # Remove any extra files
  for dir, files in dirs.iteritems():
    baseDir = os.path.join(localesDir, dir)
    if not os.path.exists(baseDir):
      continue
    for file in os.listdir(baseDir):
      path = os.path.join(baseDir, file)
      if os.path.isfile(path) and (file.endswith('.json') or file.endswith('.properties') or file.endswith('.dtd')) and not file in files:
        os.remove(path)
Example #26
def load_property_inspection():
    """
    Loads and returns several variables for the data set from Kaggle's Property Inspection Prediction competition.
    Link: https://www.kaggle.com/c/liberty-mutual-group-property-inspection-prediction

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.

    X : array-like
        Training input samples.

    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'property_inspection.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('Id')

    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values

    # transform the categorical variables from strings to integers
    encoder = CategoryEncoder()
    X = encoder.fit_transform(X)

    return data, X, y
Example #27
def load_forest_cover():
    """
    Loads and returns several variables for the data set from Kaggle's Forest Cover Type Prediction competition.
    Link: https://www.kaggle.com/c/forest-cover-type-prediction

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.

    X : array-like
        Training input samples.

    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'forest_cover.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('Id')

    # move the label to the first position
    cols = data.columns.tolist()
    cols = cols[-1:] + cols[0:-1]
    data = data[cols]

    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values

    return data, X, y
Example #28
def get_sightings_from_atlas(uri, species_ids):
    # Create a dict of sightings
    # Each species ID will have a list of sightings with [lat, long]
    sightings = dict()
    for species_id in species_ids:
        sightings[species_id] = []

    # The CSV headers
    LONG = 0
    LAT = 1
    LSID = 2
        
    # Download API call and unzip
    url = urlopen(uri)
    zipfile = ZipFile(StringIO(url.read()))

    # Skip the header row using [1:]
    for line in zipfile.open("data.csv").readlines()[1:]:
        sighting_record = line.split(",")
        sightings[sighting_record[LSID][1:-2]].append([sighting_record[LAT][1:-1],sighting_record[LONG][1:-1]])
        
    for species_id in species_ids:
        # Don't return too many sightings for a single species
        sightings[species_id] = sightings[species_id][0:species_sighting_limit]
        # Prune any empty entries
        if sightings[species_id] == []: del sightings[species_id]
        
    return sightings
Example #29
def download_unzip(input_zip):
    url = urllib.urlopen(input_zip)
    unzipped_string = ''
    zipfile = ZipFile(StringIO(url.read()))
    for name in zipfile.namelist():
        unzipped_string += zipfile.open(name).read()
    return unzipped_string
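Under Python 3, zipfile.open(name).read() returns bytes, so appending it to the empty str above would raise TypeError. A hedged Python 3 variant of the same idea, using urllib.request and a bytes accumulator:

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

def download_unzip(input_zip):
    with urlopen(input_zip) as response:                   # input_zip is a URL
        archive = ZipFile(BytesIO(response.read()))
    unzipped = b''
    for name in archive.namelist():
        unzipped += archive.open(name).read()              # members are read as bytes
    return unzipped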
Example #30
    def acquire_resource(self, target_path, format_dict):
        """
        Downloads the zip file and extracts the files listed in
        :meth:`zip_file_contents` to the target path.

        """
        import cStringIO as StringIO
        from zipfile import ZipFile

        target_dir = os.path.dirname(target_path)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        url = self.url(format_dict)

        shapefile_online = self._urlopen(url)

        zfh = ZipFile(StringIO.StringIO(shapefile_online.read()), 'r')

        for member_path in self.zip_file_contents(format_dict):
            ext = os.path.splitext(member_path)[1]
            target = os.path.splitext(target_path)[0] + ext
            member = zfh.getinfo(member_path)
            with open(target, 'wb') as fh:
                fh.write(zfh.open(member).read())

        shapefile_online.close()
        zfh.close()

        return target_path
Example #31
def _zip_filehandle(filename):
    zipfile = ZipFile(filename)
    _filename = zipfile.namelist()[0]
    filehandle = zipfile.open(_filename)
    return filehandle
Example #32
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import gc
#import lightgbm as lgb
import time
from sklearn.metrics import r2_score
import os 
os.chdir('C:/Users/andre/Documents/Github/Walmart-ACC')

#from https://stackoverflow.com/questions/44575251/reading-multiple-files-contained-in-a-zip-file-with-pandas
from zipfile import ZipFile
zip_file = ZipFile('m5-forecasting-accuracy.zip')
calendar_df = pd.read_csv(zip_file.open('calendar.csv'))
sell_prices_df = pd.read_csv(zip_file.open('sell_prices.csv'))
sales_train_validation_df = pd.read_csv(zip_file.open('sales_train_validation.csv'))
sample_submission_df = pd.read_csv(zip_file.open('sample_submission.csv'))

#sell_prices_df.info(memory_usage='deep') #957.5MB this is around 3x bigger than Mnist
#calendar_df.info(memory_usage='deep')
#sales_train_validation_df.info(memory_usage = 'deep')

###############################################################################
#Memory Reduction
###############################################################################
# Calendar data type cast -> Memory Usage Reduction
calendar_df[["month", "snap_CA", "snap_TX", "snap_WI", "wday"]] = calendar_df[["month", "snap_CA", "snap_TX", "snap_WI", "wday"]].astype("int8")
calendar_df[["wm_yr_wk", "year"]] = calendar_df[["wm_yr_wk", "year"]].astype("int16") 
calendar_df["date"] = calendar_df["date"].astype("datetime64")
Example #33
def main():
    global output_encoding, datfilecomment

    parser = argparse.ArgumentParser()
    parser.add_argument('-i',
                        '--input-file',
                        required=True,
                        help='input zip file containings csv databases')
    parser.add_argument('-o', '--output-file', help='output GeoIP dat file')
    parser.add_argument('-f',
                        '--fips-file',
                        help='geonameid to fips code mappings')
    parser.add_argument(
        '-e',
        '--encoding',
        help='encoding to use for the output rather than utf-8')
    parser.add_argument('-d',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='debug mode')
    parser.add_argument('-6',
                        '--ipv6',
                        action='store_const',
                        default='IPv4',
                        const='IPv6',
                        help='use ipv6 database')
    opts = parser.parse_args()

    if opts.encoding:
        try:
            codecs.lookup(opts.encoding)
        except LookupError as e:
            print(e)
            sys.exit(1)
        output_encoding = opts.encoding

    re_entry = re.compile(
        r'.*?/Geo(?:Lite|IP)2-(?P<database>.*?)-(?P<filetype>.*?)-(?P<arg>.*)\.csv'
    )

    entries = defaultdict(lambda: defaultdict(dict))

    ziparchive = ZipFile(opts.input_file)
    for entry in ziparchive.filelist:
        match = re_entry.match(entry.filename)
        if match is None:
            continue

        db, filetype, arg = match.groups()
        entries[db][filetype][arg] = entry

    if len(entries) != 1:
        print('More than one kind of database found, please check the archive')
        sys.exit(1)

    # noinspection PyUnboundLocalVariable
    datfilecomment = '{} converted to legacy MaxMind DB with geolite2legacy'.format(
        os.path.dirname(entry.filename))
    dbtype, entries = entries.popitem()

    if dbtype == 'ASN':
        locs = None
    else:
        if not {'Locations', 'Blocks'} <= set(entries.keys()):
            print('Missing Locations or Block files, please check the archive')
            sys.exit(1)

        locs = entries['Locations'].get('en')
        if locs is None:
            print('Selected locale not found in archive')
            sys.exit(1)

        locs = TextIOWrapper(ziparchive.open(locs, 'r'), encoding='utf-8')

    if dbtype not in RTree:
        print('{} not supported'.format(dbtype))
        sys.exit(1)

    r = RTree[dbtype][opts.ipv6](debug=opts.debug)
    blocks = entries['Blocks'].get(opts.ipv6)

    if blocks is None:
        print('The selected block file not found in archive')
        sys.exit(1)

    if dbtype != 'ASN':
        fips_file = opts.fips_file or os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'geoname2fips.csv')
        parse_fips(fips_file)

    tstart = time()
    print('Database type {} - Blocks {} - Encoding: {}'.format(
        dbtype, opts.ipv6, output_encoding))

    r.load(locs, TextIOWrapper(ziparchive.open(blocks, 'r'), encoding='utf-8'))

    if not opts.output_file:
        opts.output_file = Filenames[dbtype][opts.ipv6]
        print('Output file {}'.format(opts.output_file))

    with open(opts.output_file, 'wb') as output:
        r.serialize(output)

    tstop = time()

    print(
        'wrote %d-node trie with %d networks (%d distinct labels) in %d seconds'
        % (len(r.segments), r.netcount, len(r.data_offsets), tstop - tstart))
Example #34
class XMLparser(object):
    """docx解析类,用于解析docx文件"""
    def __init__(self):
        self.uploadDir = os.path.join(basedir,
                                      "static/upload/htmlcoder")  # upload文件夹路径
        self.docxPath = self._get_docx_path()  # docx文件路径
        self.docxName = self._get_docx_name()  # docx文件名
        self.docx = ZipFile(self.docxPath, "r")  # docx文件的ZipFile对象
        self.documentXml = self.docx.read("word/document.xml").decode(
            "utf-8")  # document.xml 定义了 docx 的文件结构
        self.imgRelsXml = self.docx.read(
            "word/_rels/document.xml.rels").decode(
                "utf-8")  # document.xml.rels 定义了 img-id-path 的映射关系
        self.stylesRelsXml = self.docx.read("word/styles.xml").decode(
            "utf-8")  # styles.xml 定义了 style-id-name 的映射关系
        self.imgRels = self._get_imgRels()  # 解析 img-id-path
        self.styleRels = self._get_styleRels()  # 解析 style-id-name
        self.imgNames = dict()  # 保存图片 MD5 文件名的字典,以 imgID 为key

    def _get_docx_path(self):
        """获得工作路径下指定目录内的docx文件路径,多个文件时按创建时间先后顺序排序"""
        docxDir = self.uploadDir
        docxDirList = [
            dirName for dirName in os.listdir(docxDir)
            if dirName.endswith(".docx")
        ]  # keep only docx files
        docxDirList.sort(key=lambda dirName: os.path.getctime(
            os.path.join(docxDir, dirName)),
                         reverse=True)
        return os.path.join(docxDir, docxDirList[0])  # return the newest docx file

    def _get_docx_name(self):
        """获取从文件名中获得docx名称"""
        return self.docxPath[self.docxPath.rindex("/") +
                             1:self.docxPath.rindex(".docx")]

    def _get_imgRels(self):
        """根据xml解析img-id-path"""
        relDict = dict()
        for soup in BeautifulSoup(self.imgRelsXml,
                                  "lxml-xml").find_all("Relationship"):
            relId = soup.get("Id")
            relTarget = soup.get("Target")  # 相应图片对应的target
            if relTarget[:5] == "media":  # 保存以media开头的映射关系
                relDict[relId] = relTarget
        return relDict

    def _get_styleRels(self):
        """根据xml解析style-id-name"""
        relDict = dict()
        for soup in BeautifulSoup(self.stylesRelsXml,
                                  "lxml-xml").find_all("w:style"):
            styleId = soup.get("w:styleId")
            styleName = soup.find("w:name").get(
                "w:val")  # 保存style的id与其对应名称的映射关系
            relDict[styleId] = styleName
        return relDict

    def get_img_path(self, imgId, local=False):
        """利用已生成的img-id映射关系找到相应图片的路径"""
        imgPath = self.imgRels.get(imgId)
        if local:  # for local testing: return the raw path, pointing at a manually extracted docx folder
            return imgPath
        else:
            imgFmt = imgPath[imgPath.rindex(".") + 1:]  # image format
            imgName = "{}.{}".format(self._get_img_name(imgPath), imgFmt)
            self.imgNames[imgId] = imgName  # remember the file name
            return "/static/upload/htmlcoder/%s" % imgName  # return the path the browser will request

    def _get_img_name(self, imgPath):
        """计算图片MD5并以之作为文件名返回"""
        imgBytes = self.docx.open(os.path.join("word", imgPath)).read()
        return get_MD5(imgBytes)

    def get_style(self, styleId):
        """利用已生成的style-id映射关系找到相应的style名称"""
        return self.styleRels.get(styleId)

    def write_to_file(self, filePath, xml, format=True):
        """测试用,将xml写入本地"""
        soup = BeautifulSoup(xml, "lxml")
        with open(filePath, "w", encoding="utf-8") as fp:
            if format:
                fp.write(soup.prettify())
            else:
                fp.write(str(soup))

    def extract_imgs(self):
        """解压并重命名docx中的图片"""
        uploadDirSet = {
            dirName[:dirName.rindex(".")]
            for dirName in os.listdir(self.uploadDir)
        }  # file names in the upload folder with extensions stripped
        for imgId, imgName in self.imgNames.items():
            if imgName in uploadDirSet:  # duplicate image, skip it
                continue
            else:
                imgPath = self.imgRels[imgId]  # image path inside the docx archive
                with open(os.path.join(self.uploadDir, imgName),
                          "wb") as fp:  # 将docx的图片保存到本地
                    fp.write(
                        self.docx.open(os.path.join("word", imgPath)).read())
Example #35
    def _get_interactions(self, limit):
        LOG.info("getting interactions")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['interactions']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        matchcounter = 0

        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip comment lines
                if re.match(r'^#', line.decode()):
                    LOG.debug("Skipping header line")
                    continue
                line_counter += 1
                line = line.decode().strip()
                # print(line)
                (interactor_a, interactor_b, alt_ids_a, alt_ids_b, aliases_a,
                 aliases_b, detection_method, pub_author, pub_id, taxid_a,
                 taxid_b, interaction_type, source_db, interaction_id,
                 confidence_val) = line.split('\t')
                taxid_a = taxid_a.rstrip()
                taxid_b = taxid_b.rstrip()

                # get the actual gene ids,
                # typically formated like: gene/locuslink:351|BIOGRID:106848
                gene_a_num = re.search(r'locuslink\:(\d+)\|?',
                                       interactor_a).groups()[0]
                gene_b_num = re.search(r'locuslink\:(\d+)\|?',
                                       interactor_b).groups()[0]

                if self.test_mode:
                    graph = self.testgraph
                    # skip any genes that don't match our test set
                    if (int(gene_a_num) not in self.test_ids) or\
                            (int(gene_b_num) not in self.test_ids):
                        continue
                else:
                    graph = self.graph
                    # when not in test mode, filter by taxon
                    if int(taxid_a.split(':')[-1]) not in self.tax_ids or \
                            int(taxid_b.split(':')[-1]) not in self.tax_ids:
                        continue
                    else:
                        matchcounter += 1

                gene_a = 'NCBIGene:' + gene_a_num
                gene_b = 'NCBIGene:' + gene_b_num

                # get the interaction type
                # psi-mi:"MI:0407"(direct interaction)
                int_type = re.search(r'MI:\d+', interaction_type).group()
                rel = self.resolve(int_type, False)
                if rel == int_type:
                    rel = self.globaltt['interacts with']

                # scrub pubmed-->PMID prefix
                pub_id = re.sub(r'pubmed', 'PMID', pub_id)
                # remove bogus whitespace
                pub_id = pub_id.strip()

                # get the method, and convert to evidence code
                det_code = re.search(r'MI:\d+', detection_method).group()
                evidence = self.resolve(det_code, False)
                if evidence == det_code:
                    evidence = self.globaltt["experimental evidence"]

                # note that the interaction_id is some kind of internal biogrid
                # identifier that does not map to a public URI.
                # we will construct a monarch identifier from this

                assoc = InteractionAssoc(graph, self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence)
                assoc.add_source(pub_id)
                assoc.add_association_to_graph()

                if not self.test_mode and (limit is not None
                                           and line_counter > limit):
                    break

        myzip.close()

        return
Example #36
class TestGckWithDGVC(unittest.TestCase):

    sample_data = {
        "pACW": {
            "file":
            "Drosophila Gateway Vectors GCK/pACW",
            "name":
            "Construct:",
            "id":
            "Construct:",
            "description":
            "Construct:  pACTIN-RW-SV",
            "length":
            7957,
            "topology":
            "circular",
            "features": [
                {
                    "type": "CDS",
                    "start": 6155,
                    "end": 7013,
                    "strand": 1,
                    "label": "ampR",
                },
                {
                    "type": "misc_feature",
                    "start": 5216,
                    "end": 6071,
                    "strand": 1,
                    "label": "SV40 sti/polyA",
                },
                {
                    "type": "misc_feature",
                    "start": 89,
                    "end": 2662,
                    "strand": 1,
                    "label": "actin5C promoter",
                },
                {
                    "type": "CDS",
                    "start": 3722,
                    "end": 4400,
                    "strand": 1,
                    "label": "chlR",
                },
                {
                    "type": "CDS",
                    "start": 4722,
                    "end": 5025,
                    "strand": 1,
                    "label": "ccdB",
                },
                {
                    "type": "misc_feature",
                    "start": 3489,
                    "end": 3507,
                    "strand": 1,
                    "label": "attR1",
                },
                {
                    "type": "misc_feature",
                    "start": 5175,
                    "end": 5193,
                    "strand": -1,
                    "label": "attR2",
                },
                {
                    "type": "misc_feature",
                    "start": 3489,
                    "end": 5193,
                    "strand": 1,
                    "label": "Gateway cassette",
                },
                {
                    "type": "misc_feature",
                    "start": 5192,
                    "end": 5205,
                    "strand": 1,
                    "label": "triple STOP",
                },
                {
                    "type": "CDS",
                    "start": 2763,
                    "end": 3480,
                    "strand": 1,
                    "label": "ECFP",
                },
                {
                    "type": "misc_feature",
                    "start": 2755,
                    "end": 3482,
                    "strand": 1,
                    "label": "pACTIN-SV",
                },
                {
                    "type": "misc_feature",
                    "start": 2755,
                    "end": 3482,
                    "strand": 1,
                    "label": "Construct:  pACTIN-RW-SV",
                },
            ],
        },
        "pPWF": {
            "file":
            "Drosophila Gateway Vectors GCK/pPWG",
            "name":
            "Construct:",
            "id":
            "Construct:",
            "description":
            "Construct:  pPWF",
            "length":
            12320,
            "topology":
            "circular",
            "features": [
                {
                    "type": "misc_feature",
                    "start": 0,
                    "end": 587,
                    "strand": 1,
                    "label": "P 5' end",
                },
                {
                    "type": "misc_feature",
                    "start": 9327,
                    "end": 9560,
                    "strand": -1,
                    "label": "P 3' end",
                },
                {
                    "type": "misc_feature",
                    "start": 1363,
                    "end": 4244,
                    "strand": -1,
                    "label": "mini-white",
                },
                {
                    "type": "CDS",
                    "start": 10466,
                    "end": 11324,
                    "strand": 1,
                    "label": "ampR",
                },
                {
                    "type": "misc_feature",
                    "start": 7930,
                    "end": 9314,
                    "strand": 1,
                    "label": "K10 terminator",
                },
                {
                    "type": "misc_feature",
                    "start": 4762,
                    "end": 4829,
                    "strand": 1,
                    "label": "GAGA repeats",
                },
                {
                    "type": "misc_feature",
                    "start": 4855,
                    "end": 5177,
                    "strand": 1,
                    "label": "GAL4 sites",
                },
                {
                    "type": "misc_feature",
                    "start": 5279,
                    "end": 5415,
                    "strand": 1,
                    "label": "P intron",
                },
                {
                    "type": "misc_feature",
                    "start": 5184,
                    "end": 5279,
                    "strand": 1,
                    "label": "P promoter",
                },
                {
                    "type": "misc_feature",
                    "start": 4762,
                    "end": 5416,
                    "strand": 1,
                    "label": "UASp promoter",
                },
                {
                    "type": "misc_feature",
                    "start": 10060,
                    "end": 12092,
                    "strand": 1,
                    "label": "pUC8",
                },
                {
                    "type": "misc_feature",
                    "start": 7106,
                    "end": 7124,
                    "strand": -1,
                    "label": "attR2",
                },
                {
                    "type": "CDS",
                    "start": 6653,
                    "end": 6956,
                    "strand": 1,
                    "label": "ccdB",
                },
                {
                    "type": "CDS",
                    "start": 5653,
                    "end": 6331,
                    "strand": 1,
                    "label": "chlR",
                },
                {
                    "type": "misc_feature",
                    "start": 5420,
                    "end": 7124,
                    "strand": 1,
                    "label": "Gateway Cassette",
                },
                {
                    "type": "misc_feature",
                    "start": 5420,
                    "end": 5438,
                    "strand": 1,
                    "label": "attR1",
                },
                {
                    "type": "CDS",
                    "start": 7137,
                    "end": 7854,
                    "strand": 1,
                    "label": "EGFP",
                },
                {
                    "type": "misc_feature",
                    "start": 7129,
                    "end": 7856,
                    "strand": 1,
                    "label": "pACTIN-SV",
                },
                {
                    "type": "misc_feature",
                    "start": 7129,
                    "end": 7856,
                    "strand": 1,
                    "label": "Construct:  pACTIN-WC-SV",
                },
                {
                    "type": "misc_feature",
                    "start": 5416,
                    "end": 7875,
                    "strand": 1,
                    "label": "Construct:  pPWF",
                },
            ],
        },
    }

    def setUp(self):
        # We are using the files of the Drosophila Gateway Vector Collection
        # (<https://emb.carnegiescience.edu/drosophila-gateway-vector-collection>)
        # as sample Gck files. We cannot redistribute those files along with
        # Biopython, so we need to download them now for the tests to run.
        if not os.path.exists("Gck/DGVC_GCK.zip"):
            try:
                requires_internet.check()
            except MissingExternalDependencyError:
                self.skipTest("Sample files missing and no Internet access")
                return

            try:
                with urlopen(
                        "https://emb.carnegiescience.edu/sites/default/files/DGVC_GCK.zip"
                ) as src, open("Gck/DGVC_GCK.zip", "wb") as dst:
                    shutil.copyfileobj(src, dst)
            except HTTPError:
                self.skipTest("Cannot download the sample files")
                return

        self.zipdata = ZipFile("Gck/DGVC_GCK.zip")

    def tearDown(self):
        self.zipdata.close()

    def test_read(self):
        """Read sample files."""
        for sample in self.sample_data.values():
            with self.zipdata.open(sample["file"]) as f:
                record = SeqIO.read(f, "gck")
            self.assertEqual(sample["name"], record.name)
            self.assertEqual(sample["id"], record.id)
            self.assertEqual(sample["description"], record.description)
            self.assertEqual(sample["length"], len(record))
            self.assertEqual(sample["topology"],
                             record.annotations["topology"])

            self.assertEqual(len(sample["features"]), len(record.features))
            for i, exp_feat in enumerate(sample["features"]):
                read_feat = record.features[i]
                self.assertEqual(exp_feat["type"], read_feat.type)
                self.assertEqual(exp_feat["start"], read_feat.location.start)
                self.assertEqual(exp_feat["end"], read_feat.location.end)
                self.assertEqual(exp_feat["strand"], read_feat.location.strand)
                self.assertEqual(exp_feat["label"],
                                 read_feat.qualifiers["label"][0])
Beispiel #37
0
class Wheel:
    def __init__(self, path):
        self.path = path
        self.parsed_filename = parse_wheel_filename(os.path.basename(path))
        self.dist_info = '{0.project}-{0.version}.dist-info'\
                            .format(self.parsed_filename)

    def __enter__(self):
        self.fp = open(self.path, 'rb')
        self.zipfile = ZipFile(self.fp)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.zipfile.close()
        self.fp.close()
        return False

    @cached_property
    def record(self):
        rec = self._get_dist_info('RECORD')
        if rec is None:
            raise errors.MissingRecordError()
        with self.zipfile.open(rec) as fp:
            # The csv module requires this file to be opened with `newline=''`
            return Record.load(io.TextIOWrapper(fp, 'utf-8', newline=''))

    def verify_record(self):
        # Check everything in RECORD against actual values:
        for entry in self.record:
            if entry:
                entry.verify(self.zipfile)
            elif entry.path != self.dist_info + '/RECORD':
                raise errors.NullEntryError(entry.path)
        # Check everything in zipfile appears in RECORD (except signatures and
        # directories):
        for path in self.zipfile.namelist():
            if path not in self.record and path not in (
                    self.dist_info + '/RECORD.jws',
                    self.dist_info + '/RECORD.p7s',
            ) and not path.endswith('/'):
                raise errors.ExtraFileError(path)

    @cached_property
    def metadata(self):
        rec = self._get_dist_info('METADATA')
        if rec is None:
            ### TODO: This should be an error
            return None
        with self.zipfile.open(rec) as fp:
            return parse_metadata(io.TextIOWrapper(fp, 'utf-8'))

    @cached_property
    def wheel_info(self):
        rec = self._get_dist_info('WHEEL')
        if rec is None:
            ### TODO: This should be an error
            return None
        with self.zipfile.open(rec) as fp:
            return parse_wheel_info(io.TextIOWrapper(fp, 'utf-8'))

    def _get_dist_info(self, filename):
        try:
            return self.zipfile.getinfo(self.dist_info + '/' + filename)
        except KeyError:
            return None

    def inspect(self):
        namebits = self.parsed_filename
        about = {
            "filename": os.path.basename(self.path),
            "project": namebits.project,
            "version": namebits.version,
            "buildver": namebits.build,
            "pyver": namebits.python_tags,
            "abi": namebits.abi_tags,
            "arch": namebits.platform_tags,
        }
        try:
            record = self.record
        except WheelValidationError as e:
            record = None
            about["valid"] = False
            about["validation_error"] = {
                "type": type(e).__name__,
                "str": str(e),
            }
        else:
            try:
                self.verify_record()
            except WheelValidationError as e:
                about["valid"] = False
                about["validation_error"] = {
                    "type": type(e).__name__,
                    "str": str(e),
                }
            else:
                about["valid"] = True

        about["file"] = {"size": os.path.getsize(self.path)}
        self.fp.seek(0)
        about["file"]["digests"] = digest_file(self.fp, ["md5", "sha256"])

        about["dist_info"] = {}
        if self.metadata is not None:
            about["dist_info"]["metadata"] = self.metadata
        if record is not None:
            about["dist_info"]["record"] = record.for_json()
        if self.wheel_info is not None:
            about["dist_info"]["wheel"] = self.wheel_info

        for fname, parser, key in EXTRA_DIST_INFO_FILES:
            info = self._get_dist_info(fname)
            if info is not None:
                with self.zipfile.open(info) as fp:
                    about["dist_info"][key] = parser(
                        io.TextIOWrapper(fp, 'utf-8'))

        if self._get_dist_info('zip-safe') is not None:
            about["dist_info"]["zip_safe"] = True
        elif self._get_dist_info('not-zip-safe') is not None:
            about["dist_info"]["zip_safe"] = False

        md = about["dist_info"].get("metadata", {})
        about["derived"] = {
            "description_in_body": "BODY" in md,
            "description_in_headers": "description" in md,
        }

        if "BODY" in md and "description" not in md:
            md["description"] = md["BODY"]
        md.pop("BODY", None)
        readme = md.get("description")
        if readme is not None:
            md["description"] = {"length": len(md["description"])}
            dct = md.get("description_content_type")
            if dct is None or parse_header(dct)[0] == 'text/x-rst':
                about["derived"]["readme_renders"] = render(readme) is not None
            else:
                about["derived"]["readme_renders"] = True
        else:
            about["derived"]["readme_renders"] = None

        if md.get("keywords") is not None:
            about["derived"]["keywords"], about["derived"]["keyword_separator"] \
                = split_keywords(md["keywords"])
        else:
            about["derived"]["keywords"], about["derived"]["keyword_separator"] \
                = [], None
        about["derived"]["keywords"] = sorted(set(
            about["derived"]["keywords"]))

        about["derived"]["dependencies"] = sorted(
            unique_projects(req["name"]
                            for req in md.get("requires_dist", [])))

        about["derived"]["modules"] = extract_modules(
            [rec["path"] for rec in about["dist_info"].get("record", [])])

        return about
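
# A brief usage sketch for the Wheel context manager above; the wheel filename
# is a placeholder, not a file shipped with this snippet.
with Wheel("example_pkg-1.0-py3-none-any.whl") as whl:
    whl.verify_record()              # raises a WheelValidationError subclass on mismatch
    report = whl.inspect()
    print(report["project"], report["version"], report["valid"])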
Beispiel #38
0
def build_check_requires_timestamp(t):
    from zipfile import ZipFile
    unused_count = 0
    all_provides = set()
    zf = ZipFile(PLOVR_JAR)
    for zi in zf.infolist():
        if zi.filename.endswith('.js'):
            if not zi.filename.startswith('closure/goog/'):
                continue
            # Skip goog.i18n because it contains so many modules that it causes
            # the generated regular expression to exceed Python's limits
            if zi.filename.startswith('closure/goog/i18n/'):
                continue
            for line in zf.open(zi, 'rU'):
                m = re.match(r'goog.provide\(\'(.*)\'\);', line)
                if m:
                    all_provides.add(m.group(1))
    for filename in sorted(t.dependencies):
        if filename == 'build/src/internal/src/requireall.js':
            continue
        require_linenos = {}
        uses = set()
        lines = open(filename, 'rU').readlines()
        for lineno, line in _strip_comments(lines):
            m = re.match(r'goog.provide\(\'(.*)\'\);', line)
            if m:
                all_provides.add(m.group(1))
                continue
            m = re.match(r'goog.require\(\'(.*)\'\);', line)
            if m:
                require_linenos[m.group(1)] = lineno
                continue
        ignore_linenos = require_linenos.values()
        for lineno, line in enumerate(lines):
            if lineno in ignore_linenos:
                continue
            for require in require_linenos.iterkeys():
                if require in line:
                    uses.add(require)
        for require in sorted(set(require_linenos.keys()) - uses):
            t.info('%s:%d: unused goog.require: %r' %
                   (filename, require_linenos[require], require))
            unused_count += 1
    all_provides.discard('ol')
    all_provides.discard('ol.MapProperty')

    class Node(object):
        def __init__(self):
            self.present = False
            self.children = {}

        def _build_re(self, key):
            if key == '*':
                assert len(self.children) == 0
                # We want to match `.doIt` but not `.SomeClass` or `.more.stuff`
                return '(?=\\.[a-z]\\w*\\b(?!\\.))'
            elif len(self.children) == 1:
                child_key, child = next(self.children.iteritems())
                child_re = child._build_re(child_key)
                if child_key != '*':
                    child_re = '\\.' + child_re
                if self.present:
                    return key + '(' + child_re + ')?'
                else:
                    return key + child_re
            elif self.children:
                children_re = '(?:' + '|'.join(
                    ('\\.' if k != '*' else '') + self.children[k]._build_re(k)
                    for k in sorted(self.children.keys())) + ')'
                if self.present:
                    return key + children_re + '?'
                else:
                    return key + children_re
            else:
                assert self.present
                return key

        def build_re(self, key):
            return re.compile('\\b' + self._build_re(key) + '\\b')

    root = Node()
    for provide in all_provides:
        node = root
        for component in provide.split('.'):
            if component not in node.children:
                node.children[component] = Node()
            node = node.children[component]
        if component[0].islower():
            # We've arrived at a namespace provide like `ol.foo`.
            # In this case, we want to match uses like `ol.foo.doIt()` but
            # not match things like `new ol.foo.SomeClass()`.
            # For this purpose, we use the special wildcard key for the child.
            node.children['*'] = Node()
        else:
            node.present = True
    provide_res = [
        child.build_re(key) for key, child in root.children.iteritems()
    ]
    missing_count = 0
    for filename in sorted(t.dependencies):
        if filename in INTERNAL_SRC or filename in EXTERNAL_SRC:
            continue
        provides = set()
        requires = set()
        uses = set()
        uses_linenos = {}
        for lineno, line in _strip_comments(open(filename, 'rU')):
            m = re.match(r'goog.provide\(\'(.*)\'\);', line)
            if m:
                provides.add(m.group(1))
                continue
            m = re.match(r'goog.require\(\'(.*)\'\);', line)
            if m:
                requires.add(m.group(1))
                continue
            while True:
                for provide_re in provide_res:
                    m = provide_re.search(line)
                    if m:
                        uses.add(m.group())
                        uses_linenos[m.group()] = lineno
                        line = line[:m.start()] + line[m.end():]
                        break
                else:
                    break
        if filename == 'src/ol/renderer/layerrenderer.js':
            uses.discard('ol.renderer.Map')
        m = re.match(r'src/ol/renderer/(\w+)/\1(\w*)layerrenderer\.js\Z',
                     filename)
        if m:
            uses.discard('ol.renderer.Map')
            uses.discard('ol.renderer.%s.Map' % (m.group(1), ))
        missing_requires = uses - requires - provides
        if missing_requires:
            for missing_require in sorted(missing_requires):
                t.info(
                    "%s:%d missing goog.require('%s')" %
                    (filename, uses_linenos[missing_require], missing_require))
                missing_count += 1
    if unused_count or missing_count:
        t.error('%d unused goog.requires, %d missing goog.requires' %
                (unused_count, missing_count))
    t.touch()
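
# A self-contained illustration of the "namespace trie -> regex" trick used by
# Node above: a provide like `ol.foo` should match member uses (`ol.foo.doIt`)
# but not class instantiations (`ol.foo.SomeClass`). The pattern below is what
# build_re() emits when `ol.foo` is the only lower-case leaf namespace provided.
import re

provide_re = re.compile(r'\bol\.foo(?=\.[a-z]\w*\b(?!\.))\b')
assert provide_re.search("ol.foo.doIt();")
assert not provide_re.search("new ol.foo.SomeClass();")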
Beispiel #39
0
def proj4js(t):
    from zipfile import ZipFile
    zf = ZipFile(PROJ4JS_ZIP)
    contents = zf.open('proj4js/lib/proj4js-combined.js').read()
    with open(t.name, 'wb') as f:
        f.write(contents)
Beispiel #40
0
def fat_aar(distdir,
            aars_paths,
            no_process=False,
            no_compatibility_check=False):
    if no_process:
        print("Not processing architecture-specific artifact Maven AARs.")
        return 0

    # Map {filename: {fingerprint: [arch1, arch2, ...]}}.
    diffs = defaultdict(lambda: defaultdict(list))
    missing_arch_prefs = set()
    # Collect multi-architecture inputs to the fat AAR.
    copier = FileCopier()

    for arch, aar_path in aars_paths.items():
        # Map old non-architecture-specific path to new architecture-specific path.
        old_rewrite_map = {
            "greprefs.js":
            "{}/greprefs.js".format(arch),
            "defaults/pref/geckoview-prefs.js":
            "defaults/pref/{}/geckoview-prefs.js".format(arch),
        }

        # Architecture-specific preferences files.
        arch_prefs = set(old_rewrite_map.values())
        missing_arch_prefs |= set(arch_prefs)

        jar_finder = JarFinder(aar_path, JarReader(aar_path))
        for path, fileobj in UnpackFinder(jar_finder):
            # Native libraries go straight through.
            if mozpath.match(path, "jni/**"):
                copier.add(path, fileobj)

            elif path in arch_prefs:
                copier.add(path, fileobj)

            elif path in ("classes.jar", "annotations.zip"):
                # annotations.zip differs due to timestamps, but the contents should not.

                # `JarReader` fails on the non-standard `classes.jar` produced by Gradle/aapt,
                # and it's not worth working around, so we use Python's zip functionality
                # instead.
                z = ZipFile(BytesIO(fileobj.open().read()))
                for r in z.namelist():
                    fingerprint = sha1(z.open(r).read()).hexdigest()
                    diffs["{}!/{}".format(path, r)][fingerprint].append(arch)

            else:
                fingerprint = sha1(six.ensure_binary(
                    fileobj.open().read())).hexdigest()
                # There's no need to distinguish `target.maven.zip` from `assets/omni.ja` here,
                # since in practice they will never overlap.
                diffs[path][fingerprint].append(arch)

            missing_arch_prefs.discard(path)

    # Some differences are allowed across the architecture-specific AARs.  We could allow-list
    # the actual content, but it's not necessary right now.
    allow_pattern_list = {
        "AndroidManifest.xml",  # Min SDK version is different for 32- and 64-bit builds.
        "classes.jar!/org/mozilla/gecko/util/HardwareUtils.class",  # Min SDK as well.
        "classes.jar!/org/mozilla/geckoview/BuildConfig.class",
        # Each input captures its CPU architecture.
        "chrome/toolkit/content/global/buildconfig.html",
        # Bug 1556162: localized resources are not deterministic across
        # per-architecture builds triggered from the same push.
        "**/*.ftl",
        "**/*.dtd",
        "**/*.properties",
    }

    not_allowed = OrderedDict()

    def format_diffs(ds):
        # Like '  armeabi-v7a, arm64-v8a -> XXX\n  x86, x86_64 -> YYY'.
        return "\n".join(
            sorted("  {archs} -> {fingerprint}".format(
                archs=", ".join(sorted(archs)), fingerprint=fingerprint)
                   for fingerprint, archs in ds.items()))

    for p, ds in sorted(diffs.items()):
        if len(ds) <= 1:
            # Only one hash across all inputs: roll on.
            continue

        if any(mozpath.match(p, pat) for pat in allow_pattern_list):
            print(
                'Allowed: Path "{path}" has architecture-specific versions:\n{ds_repr}'
                .format(path=p, ds_repr=format_diffs(ds)))
            continue

        not_allowed[p] = ds

    for p, ds in not_allowed.items():
        print(
            'Disallowed: Path "{path}" has architecture-specific versions:\n{ds_repr}'
            .format(path=p, ds_repr=format_diffs(ds)))

    for missing in sorted(missing_arch_prefs):
        print(
            "Disallowed: Inputs missing expected architecture-specific input: {missing}"
            .format(missing=missing))

    if not no_compatibility_check and (missing_arch_prefs or not_allowed):
        return 1

    output_dir = mozpath.join(distdir, "output")
    copier.copy(output_dir)

    return 0
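
# A standalone illustration (not mozbuild-specific) of the fingerprint
# bookkeeping used above: hash each per-architecture copy of a path and flag
# any path whose hashes disagree across architectures. Inputs are made up.
from collections import defaultdict
from hashlib import sha1

example_inputs = {
    "arm64-v8a": {"classes.jar!/A.class": b"same bytes", "AndroidManifest.xml": b"minSdk=21"},
    "x86_64": {"classes.jar!/A.class": b"same bytes", "AndroidManifest.xml": b"minSdk=23"},
}
example_diffs = defaultdict(lambda: defaultdict(list))
for arch, files in example_inputs.items():
    for path, content in files.items():
        example_diffs[path][sha1(content).hexdigest()].append(arch)

for path, by_fingerprint in example_diffs.items():
    if len(by_fingerprint) > 1:
        print("architecture-specific:", path)  # -> AndroidManifest.xml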
Beispiel #41
0
def zip_extract(zip_file: zipfile.ZipFile, file_name: str,
                target_file_obj: IO):
    with zip_file.open(file_name) as fp:
        shutil.copyfileobj(fp, target_file_obj)
        target_file_obj.flush()
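
# A short usage sketch for zip_extract(); the archive and member names are
# placeholders.
import zipfile

with zipfile.ZipFile("archive.zip") as zf, open("member.bin", "wb") as out:
    zip_extract(zf, "member.bin", out)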
Beispiel #42
0
    def open(self, *args, **kwargs):
        base = BaseZipFile.open(self, *args, **kwargs)
        return ZipExtFile(base)
os.chdir('/home/llq205/clean_feature1')
zipfiles = glob('*zip')

for zfname in zipfiles:
    print(zfname)
    zfile = ZipFile(zfname)
    year = zfname.split('/')[-1][:-4]

    members = zfile.namelist()
    year_count = Counter()

    for fname in members:
        #         print(fname)
        if fname.endswith('-maj.p'):
            each = pickle.load(zfile.open(fname, 'r'))
            if each != [] and each[0] != Counter():
                d = Counter({k: 1 for k, v in each[0].items()})
                year_count += d

        elif not fname.endswith('.p'):
            continue
        elif fname.endswith('dis/.p'):
            continue
        else:
            optype = fname.split('-')[-1][:-2]
            docid = fname.split('/')[-1][:-2]

            if len(optype) == 7:
                each = pickle.load(zfile.open(fname, 'r'))
                if each != [] and each[0] != Counter():
def run(args, tmp_dir):
    tmp_dir = Path(tmp_dir)

    base_dir = Path(args.dest)
    if not base_dir.exists():
        base_dir.mkdir(parents=True)
    base_dir = base_dir.resolve()

    if Path(args.pack_path).exists():
        pack_path = args.pack_path
        logger.info('Found local modpack {}'.format(pack_path))
    else:
        pack_path = tmp_dir / 'modpack.zip'
        logger.info('Downloading the modpack to {} ...'.format(pack_path))
        pack_path, _ = urlretrieve(args.pack_path, str(pack_path))
        logger.info('  Done')

    modpack = ZipFile(pack_path)

    manifest = json.loads(modpack.open('manifest.json').read().decode('utf-8'))
    logger.info('Modpack: {name} (Version {version})'.format(**manifest))

    if args.exclude:
        mod_blacklist = list(line.rstrip() for line in args.exclude)
        logger.debug('Mod blacklist: {}'.format(mod_blacklist))
        args.exclude.close()
    else:
        mod_blacklist = None

    # Download the mod files
    mod_store = tmp_dir / 'mod_store'
    mod_store.mkdir()
    logger.info('Starting mod downloads, this may take a while')
    with ThreadPoolExecutor(args.threads) as executor:
        futures = []

        for mod in manifest['files']:
            futures.append(executor.submit(download_mod, mod, mod_store, blacklist=mod_blacklist))

        bonus = manifest.get('directDownload', [])
        for entry in bonus:
            url, filename = (entry.get(x) for x in ('url', 'filename'))
            if url is None or filename is None:
                logger.warning('Error while handling entry {}'.format(entry))  # logging handlers pick the stream; 'file=' is not a logger kwarg
                continue
            futures.append(executor.submit(download, url, mod_store, filename))

        # Re-raise the exceptions which might have happened
        for f in as_completed(futures):
            e = f.exception()
            if e:
                for g in futures:
                    g.cancel()
            if isinstance(e, urllib.error.HTTPError):
                logger.error('Error while fetching {}'.format(e.url))
            f.result()
    logger.info('  Done')

    # Backup some config
    subdirs = {d: base_dir / d for d in ('mods', 'config')}
    backups = {k: d.with_suffix('.bak') for k, d in subdirs.items()}
    for k, d in subdirs.items():
        if d.exists():
            b = backups[k]
            if b.exists():
                shutil.rmtree(str(b))
            d.replace(b)
        d.mkdir()

    # Update Forge
    if not args.keep_forge:
        mc_spec = manifest.get('minecraft', {})
        mc_version = mc_spec.get('version')
        forge_ids = [x['id'].replace('forge-', '') for x in mc_spec.get('modLoaders', []) if x.get('id', '').startswith('forge-')]
        if mc_version and forge_ids:
            update_forge(mc_version, forge_ids[0], tmp_dir, base_dir)
        else:
            logger.warning('Could not extract Forge information from the manifest')
            logger.debug('minecraft : {}\nmodLoaders : {}'.format(manifest.get('minecraft'), mc_spec.get('modLoaders')))

    # Install mod files
    logger.info('Installing mods...')
    copytree(str(mod_store), str(subdirs['mods']))

    # Apply overrides
    logger.info('Applying custom config...')
    overrides = manifest.get('overrides')
    if overrides is not None:
        overrides = Path(overrides)
        todo = [entry for entry in modpack.namelist() if Path(entry) > overrides]
        modpack.extractall(str(tmp_dir), todo)
        copytree(str(tmp_dir / overrides), str(base_dir))

    if args.keep_config and backups['config'].exists():
        copytree(str(backups['config']), str(subdirs['config']))

    logger.info('Modpack {name} successfully installed'.format(**manifest))
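
# For orientation, a minimal manifest.json of the shape run() reads above. The
# top-level keys mirror the accesses in the code; the per-file keys inside
# "files" are an assumption (standard CurseForge manifests use projectID/fileID).
example_manifest = {
    "name": "Example Pack",
    "version": "1.0.0",
    "minecraft": {
        "version": "1.12.2",
        "modLoaders": [{"id": "forge-14.23.5.2859", "primary": True}],
    },
    "files": [{"projectID": 1234, "fileID": 5678, "required": True}],  # assumed field names
    "overrides": "overrides",
}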
Beispiel #45
0
import requests  # used below to download the mobility report archive
from io import BytesIO  # wraps the downloaded bytes for ZipFile
from zipfile import ZipFile  # opens the downloaded archive in memory
import pandas as pd
import plotly.graph_objects as go  # plotly 4.14.1
from plotly.subplots import make_subplots
import dash  # print(dash.__version__) (version 1.18.0) pip install dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_bootstrap_components as dbc  # version 0.11.1

#----------------------------------------------
# Importing Google Mobility data
#----------------------------------------------
url = "https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip"
filename = requests.get(url).content
zf = ZipFile(BytesIO(filename), 'r')
df = pd.read_csv(zf.open('2020_DE_Region_Mobility_Report.csv'),
                 usecols=[
                     'sub_region_1', 'date',
                     'retail_and_recreation_percent_change_from_baseline',
                     'grocery_and_pharmacy_percent_change_from_baseline',
                     'parks_percent_change_from_baseline',
                     'transit_stations_percent_change_from_baseline',
                     'workplaces_percent_change_from_baseline',
                     'residential_percent_change_from_baseline'
                 ])

# Modifying the name of the columns
df.rename(columns={
    "sub_region_1": "state",
    "retail_and_recreation_percent_change_from_baseline": "retil_creat",
    "grocery_and_pharmacy_percent_change_from_baseline": "groce_pharma",
Beispiel #46
0
    def form_valid(self, form):
        # This method is called when valid form data has been POSTed.
        # It should return an HttpResponse.

        # TODO parse the kml file more smartly to locate the first placemark and work from there.
        kml_file_upload = self.request.FILES[
            'kmlfileUpload']  # get a handle on the file

        kml_file_upload_name = self.request.FILES[
            'kmlfileUpload'].name  # get the file name
        # kml_file_name = kml_file_upload_name[:kml_file_upload_name.rfind('.')]  # get the file name no extension
        kml_file_extension = kml_file_upload_name[
            kml_file_upload_name.rfind('.') + 1:]  # get the file extension

        kml_file_path = os.path.join(settings.MEDIA_ROOT)

        # Define a routine for importing Placemarks from a list of placemark elements
        def import_placemarks(kml_placemark_list):
            """
            A procedure that reads a KML placemark list and saves the data into the django database
            :param kml_placemark_list:
            :return:
            """
            occurrence_count, archaeology_count, biology_count, geology_count = [
                0, 0, 0, 0
            ]
            Occurrence.objects.all().update(
                last_import=False)  # Toggle off all last imports
            for o in kml_placemark_list:

                # Check to make sure that the object is a Placemark, filter out folder objects
                if type(o) is Placemark:
                    # Step 1 - parse the xml and copy placemark attributes to a dictionary
                    table = etree.fromstring(
                        o.description
                    )  # get the table element with all the data from the xml.
                    attributes = table.xpath(
                        "//text()|//img"
                    )  # get all text values and image tags from xml string
                    # TODO test attributes is even length
                    # Create a dictionary from the attribute list. The list has key-value pairs as alternating
                    # elements in the list, the line below takes the first and every other elements and adds them
                    # as keys, then the second and every other element and adds them as values.
                    # e.g.
                    # attributes[0::2] = ["Basis of Record", "Time", "Item Type" ...]
                    # attributes[1::2] = ["Collection", "May 27, 2017, 10:12 AM", "Faunal" ...]
                    # zip creates a list of tuples = [("Basis of Record", "Collection"), ...]
                    # which is converted to a dictionary.
                    attributes_dict = dict(
                        zip(attributes[0::2], attributes[1::2]))

                    # Step 2 - Create a new Occurrence object (or subtype)
                    lgrp_occ = None
                    # Determine the appropriate subtype and initialize
                    item_type = attributes_dict.get("Item Type")
                    occurrence_count += 1
                    if item_type in ("Artifact", "Artifactual", "Archeology",
                                     "Archaeological"):
                        lgrp_occ = Archaeology()
                        archaeology_count += 1
                    elif item_type in ("Faunal", "Fauna", "Floral", "Flora"):
                        lgrp_occ = Biology()
                        biology_count += 1
                    elif item_type in ("Geological", "Geology"):
                        lgrp_occ = Geology()
                        geology_count += 1

                    # Step 3 - Copy attributes from dictionary to Occurrence object, validate as we go.
                    # Improve by checking each field to see if it has a choice list. If so validate against choice
                    # list.

                    # Verbatim Data - save a verbatim copy of the original kml placemark attributes.
                    lgrp_occ.verbatim_kml_data = attributes

                    # Validate Basis of Record
                    if attributes_dict.get("Basis Of Record") in (
                            "Fossil", "FossilSpecimen", "Collection"):
                        lgrp_occ.basis_of_record = "Collection"
                    elif attributes_dict.get("Basis Of Record") in (
                            "Observation", "HumanObservation"):
                        lgrp_occ.basis_of_record = "Observation"

                    # Validate Item Type
                    item_type = attributes_dict.get("Item Type")
                    if item_type in ("Artifact", "Artifactual", "Archeology",
                                     "Archaeological"):
                        lgrp_occ.item_type = "Artifactual"
                    elif item_type in ("Faunal", "Fauna"):
                        lgrp_occ.item_type = "Faunal"
                    elif item_type in ("Floral", "Flora"):
                        lgrp_occ.item_type = "Floral"
                    elif item_type in ("Geological", "Geology"):
                        lgrp_occ.item_type = "Geological"

                    # Date Recorded
                    try:
                        # parse the time
                        lgrp_occ.date_recorded = parse(
                            attributes_dict.get("Time"))
                        # set the year collected from the field number
                        lgrp_occ.year_collected = lgrp_occ.date_recorded.year
                    except ValueError:
                        # If there's a problem parsing the field number, use the current date and time and set the
                        # problem flag to True.
                        lgrp_occ.date_recorded = datetime.now()
                        lgrp_occ.problem = True
                        try:
                            error_string = "Upload error, missing field number, using current date and time instead."
                            lgrp_occ.problem_comment = lgrp_occ.problem_comment + " " + error_string
                        except TypeError:
                            lgrp_occ.problem_comment = error_string

                    # Process point, comes in as well known text string
                    # Assuming point is in GCS WGS84 datum = SRID 4326
                    pnt = GEOSGeometry("POINT (" + str(o.geometry.x) + " " +
                                       str(o.geometry.y) + ")", 4326)  # WKT
                    lgrp_occ.geom = pnt

                    scientific_name_string = attributes_dict.get(
                        "Scientific Name")
                    lgrp_occ.item_scientific_name = scientific_name_string
                    if lgrp_occ.item_scientific_name:
                        match, match_count, match_list = match_taxon(lgrp_occ)
                        if match and match_count == 1:
                            lgrp_occ.taxon = match_list[0]

                    lgrp_occ.item_description = attributes_dict.get(
                        "Description")
                    if lgrp_occ.item_description:
                        match, match_count, match_list = match_element(
                            lgrp_occ)
                        if match and match_count == 1:
                            lgrp_occ.element = lgrp_occ.item_description.lower(
                            )

                    #######################
                    # NON-REQUIRED FIELDS #
                    #######################
                    lgrp_occ.barcode = attributes_dict.get("Barcode")
                    lgrp_occ.item_number = lgrp_occ.barcode
                    lgrp_occ.collection_remarks = attributes_dict.get(
                        "Collecting Remarks")
                    lgrp_occ.geology_remarks = attributes_dict.get(
                        "Geology Remarks")

                    lgrp_occ.collecting_method = attributes_dict.get(
                        "Collection Method")
                    finder_string = attributes_dict.get("Finder")
                    lgrp_occ.finder = finder_string
                    # import person object, validated against look up data in Person table
                    lgrp_occ.finder_person, created = Person.objects.get_or_create(
                        name=finder_string)

                    collector_string = attributes_dict.get("Collector")
                    lgrp_occ.collector = collector_string
                    # import person object, validated against look up data in Person table
                    lgrp_occ.collector_person, created = Person.objects.get_or_create(
                        name=collector_string)

                    lgrp_occ.individual_count = attributes_dict.get("Count")

                    if attributes_dict.get("In Situ") in ('No', "NO", 'no'):
                        lgrp_occ.in_situ = False
                    elif attributes_dict.get("In Situ") in ('Yes', "YES",
                                                            'yes'):
                        lgrp_occ.in_situ = True

                    if attributes_dict.get("Ranked Unit") in ('No', "NO",
                                                              'no'):
                        lgrp_occ.ranked = False
                    elif attributes_dict.get("Ranked Unit") in ('Yes', "YES",
                                                                'yes'):
                        lgrp_occ.ranked = True

                    unit_found_string = attributes_dict.get("Unit Found")
                    unit_likely_string = attributes_dict.get("Unit Likely")
                    lgrp_occ.analytical_unit_found = unit_found_string
                    lgrp_occ.analytical_unit_likely = unit_likely_string
                    lgrp_occ.analytical_unit_1 = attributes_dict.get("Unit 1")
                    lgrp_occ.analytical_unit_2 = attributes_dict.get("Unit 2")
                    lgrp_occ.analytical_unit_3 = attributes_dict.get("Unit 3")

                    # import statigraphy object, validate against look up data in Stratigraphy table
                    lgrp_occ.unit_found, created = StratigraphicUnit.objects.get_or_create(
                        name=unit_found_string)
                    lgrp_occ.unit_likly, created = StratigraphicUnit.objects.get_or_create(
                        name=unit_likely_string)

                    # Save Occurrence before saving media. Need id to rename media files
                    lgrp_occ.last_import = True
                    lgrp_occ.save()

                    # Save image
                    if kml_file_extension.lower() == "kmz":
                        # grab image names from XML
                        image_names = table.xpath("//img/@src")
                        # grab the name of the first image
                        # Future: add functionality to import multiple images
                        if image_names and len(
                                image_names
                        ) == 1:  # This will break if image_names is None
                            image_name = image_names[0]
                            # Check that the image name is in the kmz file list
                            kmz_file.filenames = [
                                f.orig_filename for f in kmz_file.filelist
                            ]
                            if image_name in kmz_file.filenames:
                                # fetch the kmz image file object; this is a ZipInfo object, not a File object
                                image_file_obj = next(
                                    f for f in kmz_file.filelist
                                    if f.orig_filename == image_name)
                                # fetch the upload directory from the model definition
                                upload_dir = Biology._meta.get_field(
                                    'image').upload_to
                                # update image name to include upload path and occurrence id
                                # e.g. /uploads/images/lgrp/14775_188.jpg
                                new_image_name = os.path.join(
                                    upload_dir,
                                    str(lgrp_occ.id) + '_' + image_name)
                                # Save the image
                                lgrp_occ.image.save(
                                    new_image_name,
                                    ContentFile(kmz_file.read(image_file_obj)))

                elif type(o) is not Placemark:
                    raise IOError("KML File is badly formatted")
            if occurrence_count == 1:
                message_string = '1 occurrence'
            else:
                message_string = '{} occurrences'.format(occurrence_count)
            messages.add_message(
                self.request, messages.INFO,
                'Successfully imported {}'.format(message_string))

        kml_file = kml.KML()
        if kml_file_extension == "kmz":
            kmz_file = ZipFile(kml_file_upload, 'r')
            kml_document = kmz_file.open('doc.kml', 'r').read()
        else:
            # read() loads entire file as one string
            kml_document = open(kml_file_path + "/" + kml_file_upload_name,
                                'r').read()

        kml_file.from_string(
            kml_document
        )  # pass contents of kml string to kml document instance for parsing

        # get the top level features object (this is essentially the layers list)
        level1_elements = list(kml_file.features())

        # Check that the kml file is well-formed with a single document element.
        if len(level1_elements) == 1 and type(level1_elements[0]) == Document:
            document = level1_elements[0]

            #  If well-formed document, check if the file has folders, which correspond to layers
            level2_elements = list(document.features())
            if len(level2_elements) == 1 and type(
                    level2_elements[0]) == Folder:
                folder = level2_elements[0]

                #  If a single folder is present import placemarks from that folder
                #  Get features from the folder
                level3_elements = list(folder.features())
                #  Check that the features are Placemarks. If they are, import them
                if len(level3_elements) >= 1 and type(
                        level3_elements[0]) == Placemark:
                    placemark_list = level3_elements
                    import_placemarks(placemark_list)

            elif len(level2_elements) >= 1 and type(
                    level2_elements[0]) == Placemark:
                placemark_list = level2_elements
                import_placemarks(placemark_list)

        return super(ImportKMZ, self).form_valid(form)
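
# A tiny standalone illustration of the alternating key/value list -> dict
# idiom used in import_placemarks() above (the sample values are made up):
attributes = ["Basis Of Record", "Collection", "Item Type", "Faunal"]
attributes_dict = dict(zip(attributes[0::2], attributes[1::2]))
assert attributes_dict == {"Basis Of Record": "Collection", "Item Type": "Faunal"}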
    def output(self):
        '''
        Generate SRTM data wrapper

        @return SRTM Image Wrapper
        '''

        lat_tile_array = np.arange(self.lat_tile_start, self.lat_tile_end + 1)
        lon_tile_array = np.arange(self.lon_tile_start, self.lon_tile_end + 1)

        lat_grid, lon_grid = np.meshgrid(lat_tile_array, lon_tile_array)

        lat_grid = lat_grid.ravel()
        lon_grid = lon_grid.ravel()

        filename_root = '.SRTMGL1.'
        base_url = 'https://e4ftl01.cr.usgs.gov/MEASURES/'
        folder_root = 'SRTMGL1.003/2000.02.11/'
        if self.arcsecond_sampling == 3:
            filename_root = '.SRTMGL3.'
            folder_root = 'SRTMGL3.003/2000.02.11/'
        base_url += folder_root

        filename_list = []
        for lat, lon in zip(lat_grid, lon_grid):

            if lat < 0:
                lat_label = 'S'
                lat = np.abs(lat)
            else:
                lat_label = 'N'

            if lon < 0:
                lon_label = 'W'
                lon = np.abs(lon)
            else:
                lon_label = 'E'

            filename_list.append(lat_label + convertToStr(lat, 2) + lon_label +
                                 convertToStr(lon, 3) + filename_root +
                                 'hgt.zip')
            if self.mask_water:
                filename_list.append(lat_label + convertToStr(lat, 2) +
                                     lon_label + convertToStr(lon, 3) +
                                     filename_root + 'num.zip')

        # Read in list of available data
        srtm_list_filename = 'srtm_gl1.txt'
        if self.arcsecond_sampling == 3:
            srtm_list_filename = 'srtm_gl3.txt'
        srtm_support_filename = resource_filename(
            'skdaccess', os.path.join('support', srtm_list_filename))
        available_file_list = open(srtm_support_filename).readlines()
        available_file_list = [
            filename.strip() for filename in available_file_list
        ]

        requested_files = pd.DataFrame({'Filename': filename_list})
        requested_files['Valid'] = [
            '.'.join(filename.split('.')[0:-2]) in available_file_list
            for filename in filename_list
        ]
        valid_filename_list = requested_files.loc[requested_files['Valid'] ==
                                                  True, 'Filename'].tolist()
        url_list = [base_url + filename for filename in valid_filename_list]
        downloaded_file_list = self.cacheData(
            'srtm', url_list, self.username, self.password,
            'https://urs.earthdata.nasa.gov')
        requested_files.loc[requested_files['Valid'] == True,
                            'Full Path'] = downloaded_file_list

        def getCoordinates(filename):
            '''
            Determine the latitude and longitude of the lower-left (southwest) corner of the input filename's tile

            @param filename: Input SRTM filename
            @return Latitude of southwest corner, Longitude of southwest corner
            '''

            lat_start = int(filename[1:3])

            if filename[0] == 'S':
                lat_start *= -1

            lon_start = int(filename[4:7])

            if filename[3] == 'W':
                lon_start *= -1

            return lat_start, lon_start

        data_dict = OrderedDict()
        metadata_dict = OrderedDict()

        array_shape = (3601, 3601)
        if self.arcsecond_sampling == 3:
            array_shape = (1201, 1201)

        file_slice = slice(None)
        water_value = 0
        if self.mask_water:
            file_slice = slice(0, -1, 2)
            water_value = np.nan

        for i in requested_files.index[file_slice]:

            hgt_full_path = requested_files.at[i, 'Full Path']
            hgt_filename = requested_files.at[i, 'Filename']

            label = hgt_filename[:7]
            lat_start, lon_start = getCoordinates(hgt_filename)

            metadata_dict[label] = OrderedDict()

            x_res = 1.0 / (array_shape[0] - 1)
            y_res = 1.0 / (array_shape[1] - 1)
            extents = [
                lon_start - x_res / 2, lon_start + 1 + x_res / 2,
                lat_start - y_res / 2, lat_start + 1 + y_res / 2
            ]

            if requested_files.at[i, 'Valid']:

                masked_dem_data = np.ones(array_shape)
                if self.mask_water and requested_files.at[i + 1, 'Valid']:

                    num_full_path = requested_files.at[i + 1, 'Full Path']
                    num_filename = requested_files.at[i + 1, 'Full Path']

                    zipped_num_data = ZipFile(num_full_path)
                    zipped_num_full_path = zipped_num_data.infolist(
                    )[0].filename

                    num_data = np.frombuffer(
                        zipped_num_data.open(zipped_num_full_path).read(),
                        np.dtype('uint8')).reshape(array_shape)

                    masked_dem_data[(num_data == 1) |
                                    (num_data == 2)] = water_value

                    i += 1

                zipped_hgt_data = ZipFile(hgt_full_path)

                dem_dataset = gdal.Open(hgt_full_path, gdal.GA_ReadOnly)

                dem_data = dem_dataset.ReadAsArray()

                masked_dem_data *= dem_data

                metadata_dict[label]['WKT'] = dem_dataset.GetProjection()
                metadata_dict[label][
                    'GeoTransform'] = dem_dataset.GetGeoTransform()

            else:

                geo_transform = []
                geo_transform.append(extents[0])
                geo_transform.append(x_res)
                geo_transform.append(0)
                geo_transform.append(extents[-1])
                geo_transform.append(0)
                geo_transform.append(-y_res)

                metadata_dict[label]['WKT'] = self._missing_data_projection
                metadata_dict[label]['GeoTransform'] = geo_transform
                masked_dem_data = np.full(shape=array_shape,
                                          fill_value=water_value)

                i += 1

            data_dict[label] = masked_dem_data
            metadata_dict[label]['Geolocation'] = AffineGlobalCoords(
                metadata_dict[label]['GeoTransform'], center_pixels=True)
            metadata_dict[label]['extents'] = extents

            if self.store_geolocation_grids:
                lat_coords, lon_coords = np.meshgrid(
                    np.linspace(lat_start + 1, lat_start, array_shape[0]),
                    np.linspace(lon_start, lon_start + 1, array_shape[1]),
                    indexing='ij')

                metadata_dict[label]['Latitude'] = lat_coords
                metadata_dict[label]['Longitude'] = lon_coords

        return ImageWrapper(obj_wrap=data_dict, meta_data=metadata_dict)
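
# A quick check of the tile-name parsing performed by getCoordinates() above
# (shown assuming the helper were lifted to module level); SRTM names follow
# N/S + 2-digit latitude, E/W + 3-digit longitude.
assert getCoordinates("N34W119.SRTMGL1.hgt.zip") == (34, -119)
assert getCoordinates("S01E036.SRTMGL1.hgt.zip") == (-1, 36)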
Beispiel #48
0
        #sys.exit(1)

# dxf file input
#---------------

if args.dxf is not None:
    print("DIPPY Tool: DXF input file: " + args.dxf)

    dxfile = []

    if args.dxf.endswith(".zip"):
        myzip = ZipFile(args.dxf, 'r')

        for sl in myzip.namelist():
            if sl.find("negative") < 0:
                dxfile.append(myzip.open(sl, 'r'))
    else:
        dxfile.append(open(args.dxf, 'r'))

    if len(dxfile) > 0:
        for s in dxfile:
            dxf15parse = ET.XMLParser(remove_blank_text=True)
            doc = ET.parse(s, dxf15parse)
            if opcode == 'addexpt':

                if args.acf in ['DEFAULT', 'True']:
                    print("Auto-create mode")
                    dpa = DP.ExptSubmitter()
                    dpa.addExpt(doc)

                else:
Beispiel #49
0
        def import_tree(id):
            def import_region(feature):
                def extract_data(properties):
                    result = {'level': properties['admin_level']}
                    fields = ['boundary', 'ISO3166-1:alpha3', 'timezone']
                    for field in fields:
                        result[field] = properties['tags'].get(field, None)
                    return result

                print(feature['properties']['name'])
                parent = None
                if len(feature['rpath']) > 2:
                    # sometimes they are swapped
                    parent_id = (feature['rpath'][1]
                                 if int(feature['rpath'][0]) == feature['id']
                                 else feature['rpath'][0])
                    parent = Region.objects.get(osm_id=parent_id)
                region = Region.objects.create(
                    title=feature['properties']['name'],
                    polygon=GEOSGeometry(json.dumps(feature['geometry'])),
                    parent=parent,
                    wikidata_id=feature['properties']['tags'].get('wikidata'),
                    osm_id=feature['id'],
                    osm_data=extract_data(feature['properties'])
                )
                for lang in ('en', 'ru'):
                    trans = region.load_translation(lang)
                    trans.master = region
                    trans.name = region.title
                    trans.save()

            zip_file = os.path.join(settings.GEOJSON_DIR, '{}.zip'.format(id))
            if not os.path.exists(zip_file):
                url = settings.OSM_URL.format(id=id, key=settings.OSM_KEY)
                print(url)
                response = requests.get(url, stream=True)
                if response.status_code != 200:
                    raise Exception('Bad request')
                with open(zip_file, 'wb') as out_file:
                    response.raw.decode_content = True
                    shutil.copyfileobj(response.raw, out_file)
            zipfile = ZipFile(zip_file)
            zip_names = zipfile.namelist()
            for zip_name in zip_names:
                print(zip_name)
                # if zip_name.endswith('AL2.GeoJson') or zip_name.endswith('AL3.GeoJson') or zip_name.endswith('AL4.GeoJson'):
                # if not zip_name.endswith('AL6.GeoJson'):
                #     continue
                level = json.loads(zipfile.open(zip_name).read().decode())
                not_passed = []
                for feature in level['features']:
                    try:
                        if not Region.objects.filter(osm_id=feature['id']).exists():
                            import_region(feature)
                    except Region.DoesNotExist:
                        not_passed.append(feature)
                        continue
                while len(not_passed) > 0:
                    bad_passed = []
                    for feature in not_passed:
                        try:
                            import_region(feature)
                        except Region.DoesNotExist:
                            bad_passed.append(feature)
                            continue
                    if not_passed == bad_passed:
                        print('Circular references')
                        break
                    not_passed = bad_passed
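
# A condensed, generic version of the retry loop above: keep re-attempting
# items whose prerequisites (here, parent regions) have not been imported yet,
# and stop once a full pass makes no progress, which signals a circular
# reference. Names are illustrative only.
def import_with_retries(features, import_one):
    pending = list(features)
    while pending:
        failed = []
        for feature in pending:
            try:
                import_one(feature)
            except Exception:
                failed.append(feature)
        if failed == pending:   # no progress in this pass
            print('Circular references')
            break
        pending = failed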
# This dataset comes from the [UCI Machine Learning Data Repository](https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data). It includes data on air pollutants and weather from 12 sites. To simplify the example, we'll focus on weekly averages for two measures: PM10 and SO2. Since these measures are strictly positive, we log-transform them.

# + {"tags": ["remove_cell"]}
# Read in data
try:
    df_aq = pd.read_csv("./PRSA2017_Data_20130301-20170228.csv")
except FileNotFoundError:
    import requests
    from zipfile import ZipFile
    from io import BytesIO
    response =\
        requests.get('http://archive.ics.uci.edu/ml/machine-learning-databases/00501/PRSA2017_Data_20130301-20170228.zip')
    zip_file = ZipFile(BytesIO(response.content))
    files = zip_file.namelist()
    df_aq = pd.concat(
        [pd.read_csv(zip_file.open(f)) for f in files if f.endswith('csv')])
    df_aq.to_csv("./PRSA2017_Data_20130301-20170228.csv", index=False)

df_aq['time'] = pd.to_datetime(df_aq.loc[:, ['year', 'month', 'day', 'hour']])
df_aq = df_aq.rename(columns={'PM2.5': 'PM2p5'})

df_aq_weekly = df_aq.\
    assign(date= lambda df: df['time'].astype('datetime64[D]') - pd.to_timedelta(df['time'].dt.dayofweek, unit='d')).\
    drop(columns= ['year','month','day','hour']).\
    groupby(['date','station']).\
    agg('mean').\
    reset_index().\
    sort_values(['station','date']).\
    reset_index()

# for training/validation split
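
# Aside (not the training/validation split): the week-start computation above
# in isolation, here using .normalize() instead of the astype('datetime64[D]')
# trick to drop the time of day before snapping each timestamp to its Monday.
import pandas as pd

t = pd.to_datetime(["2013-03-06 14:00", "2013-03-10 23:00"])
week_start = t.normalize() - pd.to_timedelta(t.dayofweek, unit="d")
# both timestamps map to Monday 2013-03-04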
Beispiel #51
0
def read_single_sheet(path, name=None):
    """ Read an xlsx, csv or tsv from a zipfile or directory
    """
    from zipfile import ZipFile
    from . import xlreader

    if name is None:
        root, ext = os.path.splitext(path)
        stream = open(path, 'r')

        if ext == '.xlsx':
            return read_xl(stream)

        if ext == '.tsv':
            return read_csv(stream, dialect='excel-tab')

        if ext == '.csv':
            return read_csv(stream)

        if ext == '.json':
            return read_json(stream)

        raise ValueError('Unknown file extension for %r' % path)

    if path.endswith('.xlsx'):
        return cast_row_values(
            xlreader.DictReader(open(path, 'rb'), sheetname=name))

    if path.endswith('.zip'):
        zf = ZipFile(path)
        names = zf.namelist()

        if (name + '.xlsx') in names:
            stream = zf.open(name + '.xlsx', 'r')
            return read_xl(stream)

        if (name + '.tsv') in names:
            stream = zf.open(name + '.tsv', 'rU')
            return read_csv(stream, dialect='excel-tab')

        if (name + '.csv') in names:
            stream = zf.open(name + '.csv', 'rU')
            return read_csv(stream)

        if (name + '.json') in names:
            stream = zf.open(name + '.json', 'r')
            return read_json(stream)

    if os.path.isdir(path):
        root = os.path.join(path, name)

        if os.path.exists(root + '.xlsx'):
            stream = open(root + '.xlsx', 'rb')
            return read_xl(stream)

        if os.path.exists(root + '.tsv'):
            stream = open(root + '.tsv', 'rU')
            return read_csv(stream, dialect='excel-tab')

        if os.path.exists(root + '.csv'):
            stream = open(root + '.csv', 'rU')
            return read_csv(stream)

        if os.path.exists(root + '.json'):
            stream = open(root + '.json', 'r')
            return read_json(stream)

    return []
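
# A brief usage sketch for read_single_sheet(); the paths and sheet name are
# placeholders.
rows = read_single_sheet("metadata.zip", name="experiments")  # experiments.xlsx/.tsv/.csv/.json inside the zip
rows = read_single_sheet("experiments.csv")                   # or a standalone file, dispatched on its extension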
"""
Convert the scraped data from Hoboken into the same format as the Yelp Open Dataset data
BIA660D - Group 1: Alec Kulakowski
"""
# Navigate into the correct project directory
import os # os.listdir()
os.chdir('../BIA660D_Group_1_Project')
# Extract the review data from its .zip form
from zipfile import ZipFile
import pandas as pd
zf = ZipFile("Hoboken_restaurants_reviews.csv.zip")
raw = pd.read_csv(zf.open('Hoboken_restaurants_reviews.csv'))
validation = raw.copy()
validation = validation.drop(columns=validation.columns.values[0:2]) # Drop index columns
# Convert user_ratings from string to integer and restaurant ratings from string to float
validation['user_rating'] = validation['user_rating'].apply(lambda x: int(x[0]))
validation['restaurant_rating'] = validation['restaurant_rating'].apply(lambda x: float(x[0:3]))
# Display price distribution
print(validation['restaurant_price'].value_counts())
# Display number of absent prices
print('Missing prices: '+str(validation['restaurant_price'].isnull().sum()))
# Replace missing values with mean and convert to integer
def try_convert(x, y=0):
    # Count '$' characters as the price level; fall back to y for missing (non-string) prices.
    try:
        return x.count('$')
    except:
        return y
average_price = sum(validation['restaurant_price'].apply(lambda x: try_convert(x))) / validation['restaurant_price'].value_counts().sum()
validation['restaurant_price'] = validation['restaurant_price'].apply(lambda x: try_convert(x, y=average_price))
# Separate Restaurant Type
# from sklearn.feature_extraction import DictVectorizer
# dv = DictVectorizer(sparse=False)
# dv.fit_transform()
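
# A hedged sketch of how the commented-out DictVectorizer lines above might be
# completed; the 'restaurant_type' column name is an assumption, as this snippet
# only shows the rating and price columns.
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
type_dicts = [{t.strip(): 1 for t in str(types).split(',')}
              for types in validation['restaurant_type']]
type_features = dv.fit_transform(type_dicts)  # one column per distinct restaurant type
print(type_features.shape)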
Beispiel #53
0
def find_ole(filename, data):
    """ try to open somehow as zip/ole/rtf/... ; yield None if fail

    If data is given, filename is (mostly) ignored.

    yields embedded ole streams in form of OleFileIO.
    """

    if data is not None:
        # isOleFile and is_ppt can work on data directly but zip need file
        # --> wrap data in a file-like object without copying data
        log.debug('working on data, file is not touched below')
        arg_for_ole = data
        arg_for_zip = FakeFile(data)
    else:
        # we only have a file name
        log.debug('working on file by name')
        arg_for_ole = filename
        arg_for_zip = filename

    ole = None
    try:
        if olefile.isOleFile(arg_for_ole):
            if is_ppt(arg_for_ole):
                log.info('is ppt file: ' + filename)
                for ole in find_ole_in_ppt(arg_for_ole):
                    yield ole
                    ole = None   # is closed in find_ole_in_ppt
            # in any case: check for embedded stuff in non-sectored streams
            log.info('is ole file: ' + filename)
            ole = olefile.OleFileIO(arg_for_ole)
            yield ole
        elif is_zipfile(arg_for_zip):
            log.info('is zip file: ' + filename)
            zipper = ZipFile(arg_for_zip, 'r')
            for subfile in zipper.namelist():
                head = b''
                try:
                    with zipper.open(subfile) as file_handle:
                        head = file_handle.read(len(olefile.MAGIC))
                except RuntimeError:
                    log.error('zip is encrypted: ' + filename)
                    yield None
                    continue

                if head == olefile.MAGIC:
                    log.info('  unzipping ole: ' + subfile)
                    with ZipSubFile(zipper, subfile) as file_handle:
                        try:
                            ole = olefile.OleFileIO(file_handle)
                            yield ole
                        except IOError:
                            log.warning('Error reading data from {0}/{1} or '
                                        'interpreting it as OLE object'
                                        .format(filename, subfile))
                            log.debug('', exc_info=True)
                        finally:
                            if ole is not None:
                                ole.close()
                                ole = None
                else:
                    log.debug('unzip skip: ' + subfile)
        else:
            log.warning('open failed: {0} (or its data) is neither zip nor OLE'
                        .format(filename))
            yield None
    except Exception:
        log.error('Caught exception opening {0}'.format(filename),
                  exc_info=True)
        yield None
    finally:
        if ole is not None:
            ole.close()
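
# A hedged usage sketch for find_ole above: list the streams of every OLE object
# found in a file. Each yielded item is an olefile.OleFileIO or None; the
# generator closes the OLE objects itself once the caller advances past them.
def print_ole_streams(filename):
    for ole in find_ole(filename, None):
        if ole is None:
            continue
        for entry in ole.listdir():  # listdir() returns each stream path as a list of names
            print('%s -> %s' % (filename, '/'.join(entry)))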
Beispiel #54
0
class MailMerge(object):
    def __init__(self, file, remove_empty_tables=False):
        self.zip = ZipFile(file)
        self.parts = {}
        self.settings = None
        self._settings_info = None
        self.remove_empty_tables = remove_empty_tables

        try:
            content_types = etree.parse(self.zip.open('[Content_Types].xml'))
            for file in content_types.findall('{%(ct)s}Override' % NAMESPACES):
                type = file.attrib['ContentType']
                if type in CONTENT_TYPES_PARTS:
                    zi, self.parts[zi] = self.__get_tree_of_file(file)
                elif type == CONTENT_TYPE_SETTINGS:
                    self._settings_info, self.settings = self.__get_tree_of_file(file)

            to_delete = []

            for part in self.parts.values():

                for parent in part.findall('.//{%(w)s}fldSimple/..' % NAMESPACES):
                    for idx, child in enumerate(parent):
                        if child.tag != '{%(w)s}fldSimple' % NAMESPACES:
                            continue
                        instr = child.attrib['{%(w)s}instr' % NAMESPACES]

                        name = self.__parse_instr(instr)
                        if name is None:
                            continue
                        parent[idx] = Element('MergeField', name=name)

                for parent in part.findall('.//{%(w)s}instrText/../..' % NAMESPACES):
                    children = list(parent)
                    fields = zip(
                        [children.index(e) for e in
                         parent.findall('{%(w)s}r/{%(w)s}fldChar[@{%(w)s}fldCharType="begin"]/..' % NAMESPACES)],
                        [children.index(e) for e in
                         parent.findall('{%(w)s}r/{%(w)s}fldChar[@{%(w)s}fldCharType="end"]/..' % NAMESPACES)]
                    )

                    for idx_begin, idx_end in fields:
                        # consolidate all instrText nodes between 'begin' and 'end' into a single node
                        begin = children[idx_begin]
                        instr_elements = [e for e in
                                          begin.getparent().findall('{%(w)s}r/{%(w)s}instrText' % NAMESPACES)
                                          if idx_begin < children.index(e.getparent()) < idx_end]
                        if len(instr_elements) == 0:
                            continue

                        # set the text of the first instrText element to the concatenation
                        # of all the instrText element texts
                        instr_text = ''.join([e.text for e in instr_elements])
                        instr_elements[0].text = instr_text

                        # delete all instrText elements except the first
                        for instr in instr_elements[1:]:
                            instr.getparent().remove(instr)

                        name = self.__parse_instr(instr_text)
                        if name is None:
                            continue

                        parent[idx_begin] = Element('MergeField', name=name)

                        # use this so we know *where* to put the replacement
                        instr_elements[0].tag = 'MergeText'
                        block = instr_elements[0].getparent()
                        # append the other tags in the w:r block too
                        parent[idx_begin].extend(list(block))

                        to_delete += [(parent, parent[i + 1])
                                      for i in range(idx_begin, idx_end)]

            for parent, child in to_delete:
                parent.remove(child)

            # Remove mail merge settings to avoid error messages when opening document in Winword
            if self.settings:
                settings_root = self.settings.getroot()
                mail_merge = settings_root.find('{%(w)s}mailMerge' % NAMESPACES)
                if mail_merge is not None:
                    settings_root.remove(mail_merge)
        except:
            self.zip.close()
            raise

    @classmethod
    def __parse_instr(cls, instr):
        args = shlex.split(instr, posix=False)
        if args[0] != 'MERGEFIELD':
            return None
        name = args[1]
        if name[0] == '"' and name[-1] == '"':
            name = name[1:-1]
        return name

    def __get_tree_of_file(self, file):
        fn = file.attrib['PartName'].split('/', 1)[1]
        zi = self.zip.getinfo(fn)
        return zi, etree.parse(self.zip.open(zi))

    def write(self, file):
        # Replace all remaining merge fields with empty values
        for field in self.get_merge_fields():
            self.merge(**{field: ''})

        with ZipFile(file, 'w', ZIP_DEFLATED) as output:
            for zi in self.zip.filelist:
                if zi in self.parts:
                    xml = etree.tostring(self.parts[zi].getroot())
                    output.writestr(zi.filename, xml)
                elif zi == self._settings_info:
                    xml = etree.tostring(self.settings.getroot())
                    output.writestr(zi.filename, xml)
                else:
                    output.writestr(zi.filename, self.zip.read(zi))

    def get_merge_fields(self, parts=None):
        if not parts:
            parts = self.parts.values()
        fields = set()
        for part in parts:
            for mf in part.findall('.//MergeField'):
                fields.add(mf.attrib['name'])
        return fields

    def merge_templates(self, replacements, separator):
        """
        Duplicate template. Creates a copy of the template, does a merge, and separates them by a new paragraph, a new break or a new section break.
        separator must be :
        - page_break : Page Break. 
        - column_break : Column Break. ONLY HAVE EFFECT IF DOCUMENT HAVE COLUMNS
        - textWrapping_break : Line Break.
        - continuous_section : Continuous section break. Begins the section on the next paragraph.
        - evenPage_section : evenPage section break. section begins on the next even-numbered page, leaving the next odd page blank if necessary.
        - nextColumn_section : nextColumn section break. section begins on the following column on the page. ONLY HAVE EFFECT IF DOCUMENT HAVE COLUMNS
        - nextPage_section : nextPage section break. section begins on the following page.
        - oddPage_section : oddPage section break. section begins on the next odd-numbered page, leaving the next even page blank if necessary.
        """

        #TYPE PARAM CONTROL AND SPLIT
        valid_separators = {'page_break', 'column_break', 'textWrapping_break', 'continuous_section', 'evenPage_section', 'nextColumn_section', 'nextPage_section', 'oddPage_section'}
        if separator not in valid_separators:
            raise ValueError("Invalid separator argument")
        type, sepClass = separator.split("_")
  

        #GET ROOT - WORK WITH DOCUMENT
        for part in self.parts.values():
            root = part.getroot()
            tag = root.tag
            if tag == '{%(w)s}ftr' % NAMESPACES or tag == '{%(w)s}hdr' % NAMESPACES:
                continue
		
            if sepClass == 'section':

                #FINDING FIRST SECTION OF THE DOCUMENT
                firstSection = root.find("w:body/w:p/w:pPr/w:sectPr", namespaces=NAMESPACES)
                if firstSection is None:
                    firstSection = root.find("w:body/w:sectPr", namespaces=NAMESPACES)
			
                #MODIFY TYPE ATTRIBUTE OF FIRST SECTION FOR MERGING
                nextPageSec = deepcopy(firstSection)
                for child in nextPageSec:
                    # delete the old type element if it exists
                    if child.tag == '{%(w)s}type' % NAMESPACES:
                        nextPageSec.remove(child)
                # create a new type element set to the requested separator type
                newType = etree.SubElement(nextPageSec, '{%(w)s}type'  % NAMESPACES)
                newType.set('{%(w)s}val'  % NAMESPACES, type)

                #REPLACING FIRST SECTION
                secRoot = firstSection.getparent()
                secRoot.replace(firstSection, nextPageSec)

            #FINDING LAST SECTION OF THE DOCUMENT
            lastSection = root.find("w:body/w:sectPr", namespaces=NAMESPACES)

            #SAVING LAST SECTION
            mainSection = deepcopy(lastSection)
            lsecRoot = lastSection.getparent()
            lsecRoot.remove(lastSection)

            #COPY CHILDREN ELEMENTS OF BODY IN A LIST
            childrenList = root.findall('w:body/*', namespaces=NAMESPACES)

            #DELETE ALL CHILDREN OF BODY
            for child in root:
                if child.tag == '{%(w)s}body' % NAMESPACES:
                    child.clear()

            #REFILL BODY AND MERGE DOCS - ADD LAST SECTION ENCAPSULATED OR NOT
            lr = len(replacements)
            lc = len(childrenList)

            for i, repl in enumerate(replacements):
                parts = []
                for (j, n) in enumerate(childrenList):
                    element = deepcopy(n)
                    for child in root:
                        if child.tag == '{%(w)s}body' % NAMESPACES:
                            child.append(element)
                            parts.append(element)
                            if (j + 1) == lc:
                                if (i + 1) == lr:
                                    child.append(mainSection)
                                    parts.append(mainSection)
                                else:
                                    if sepClass == 'section':
                                        intSection = deepcopy(mainSection)
                                        p   = etree.SubElement(child, '{%(w)s}p'  % NAMESPACES)
                                        pPr = etree.SubElement(p, '{%(w)s}pPr'  % NAMESPACES)
                                        pPr.append(intSection)
                                        parts.append(p)
                                    elif sepClass == 'break':
                                        pb   = etree.SubElement(child, '{%(w)s}p'  % NAMESPACES)
                                        r = etree.SubElement(pb, '{%(w)s}r'  % NAMESPACES)
                                        nbreak = Element('{%(w)s}br' % NAMESPACES)
                                        nbreak.attrib['{%(w)s}type' % NAMESPACES] = type
                                        r.append(nbreak)

                    self.merge(parts, **repl)

    def merge_pages(self, replacements):
         """
         Deprecated method.
         """
         warnings.warn("merge_pages has been deprecated in favour of merge_templates",
                      category=DeprecationWarning,
                      stacklevel=2)         
         self.merge_templates(replacements, "page_break")

    def merge(self, parts=None, **replacements):
        if not parts:
            parts = self.parts.values()

        for field, replacement in replacements.items():
            if isinstance(replacement, list):
                self.merge_rows(field, replacement)
            else:
                for part in parts:
                    self.__merge_field(part, field, replacement)

    def __merge_field(self, part, field, text):
        for mf in part.findall('.//MergeField[@name="%s"]' % field):
            children = list(mf)
            mf.clear()  # clear away the attributes
            mf.tag = '{%(w)s}r' % NAMESPACES
            mf.extend(children)

            nodes = []
            # preserve new lines in replacement text
            text = text or ''  # text might be None
            text_parts = str(text).replace('\r', '').split('\n')
            for i, text_part in enumerate(text_parts):
                text_node = Element('{%(w)s}t' % NAMESPACES)
                text_node.text = text_part
                nodes.append(text_node)

                # if not last node add new line node
                if i < (len(text_parts) - 1):
                    nodes.append(Element('{%(w)s}br' % NAMESPACES))

            ph = mf.find('MergeText')
            if ph is not None:
                # add text nodes at the exact position where
                # MergeText was found
                index = mf.index(ph)
                for node in reversed(nodes):
                    mf.insert(index, node)
                mf.remove(ph)
            else:
                mf.extend(nodes)

    def merge_rows(self, anchor, rows):
        table, idx, template = self.__find_row_anchor(anchor)
        if table is not None:
            if len(rows) > 0:
                del table[idx]
                for i, row_data in enumerate(rows):
                    row = deepcopy(template)
                    self.merge([row], **row_data)
                    table.insert(idx + i, row)
            else:
                # if there is no data for a given table
                # we check whether table needs to be removed
                if self.remove_empty_tables:
                    parent = table.getparent()
                    parent.remove(table)

    def __find_row_anchor(self, field, parts=None):
        if not parts:
            parts = self.parts.values()
        for part in parts:
            for table in part.findall('.//{%(w)s}tbl' % NAMESPACES):
                for idx, row in enumerate(table):
                    if row.find('.//MergeField[@name="%s"]' % field) is not None:
                        return table, idx, row
        return None, None, None

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def close(self):
        if self.zip is not None:
            try:
                self.zip.close()
            finally:
                self.zip = None
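
# A hedged usage sketch for the MailMerge class above; the template path, output
# path and field names are illustrative and must match the MERGEFIELDs that the
# template actually contains.
def fill_template(template_path, output_path, **fields):
    with MailMerge(template_path) as document:
        missing = document.get_merge_fields() - set(fields)
        if missing:
            raise ValueError('unfilled merge fields: %s' % ', '.join(sorted(missing)))
        document.merge(**fields)
        document.write(output_path)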
Beispiel #55
0
 def __init__(
         self,
         fn,
         only_load_visible_shapes = True,
         visible_if_ViewObject_missing = True,
         printLevel=0
 ):
     z = ZipFile( fn )
     if printLevel > 0:
         print( z.namelist() )
         print( xml_prettify( z.open('Document.xml').read().decode('utf-8') ) )
         if 'GuiDocument.xml' in z.namelist():
             print( xml_prettify( z.open('GuiDocument.xml').read().decode('utf-8') ) )
     tree_doc = XML_Tree.fromstring(  z.open('Document.xml').read().decode('utf-8') )
     if 'GuiDocument.xml' in z.namelist():
         tree_gui = XML_Tree.fromstring(  z.open('GuiDocument.xml').read().decode('utf-8') )
     else:
         tree_gui = None
     #tree_shapes =  ElementTree.fromstring(  z.open('PartShape.brp').read() )
     doc = Fcstd_Property_List( tree_doc.find('Properties') )
     self.__dict__.update( doc.__dict__ )
     self.Name = os.path.split( fn )[1][:-6]
     self.Objects = []
     self.Objects_dict = {}
     #objectData
     for o in tree_doc.find('ObjectData').findall('Object'):
         k = o.attrib['name']
         assert k not in self.Objects_dict
         obj = Fcstd_Property_List( o.find('Properties') )
         obj.Name = k
         obj.Content = str(XML_Tree.tostring( o ))
         self.Objects_dict[k] = obj
         self.Objects.append( self.Objects_dict[k] )
     #viewObjects
     if tree_gui is not None:
         for o in tree_gui.find('ViewProviderData').findall('ViewProvider'):
             k = o.attrib['name']
             if k in self.Objects_dict:
                 ViewObject =  Fcstd_Property_List( o.find('Properties') )
                 ViewObject.isVisible = isVisible_Bound_Method( ViewObject )
                 self.Objects_dict[k].ViewObject = ViewObject
     else:
         for obj in self.Objects:
             xml = '<Properties> <Property name="Visibility" type="App::PropertyBool"> <Bool value="%s"/> </Property> </Properties>' % ( 'true' if visible_if_ViewObject_missing else 'false' )
             obj.ViewObject =  Fcstd_Property_List(  XML_Tree.fromstring(xml) )
             obj.ViewObject.isVisible = isVisible_Bound_Method( obj.ViewObject )
     #shapes
     for obj in self.Objects:
         if hasattr( obj, 'Shape'):
             shape_zip_name = obj.Shape
             delattr( obj, 'Shape' )
             if not only_load_visible_shapes or obj.ViewObject.Visibility:
                 obj.Shape = Part.Shape()
                 obj.Shape.importBrepFromString( z.open( shape_zip_name ).read().decode('utf-8') )
     #colour lists
     for obj in self.Objects:
         if hasattr( obj, 'ViewObject' ):
             v = obj.ViewObject
             if not only_load_visible_shapes or obj.ViewObject.Visibility:
                 for p_name, p_type in zip( v.PropertiesList, v.PropertiesTypes ):
                     if p_type == 'App::PropertyColorList':
                         #print( p_name, getattr(v,p_name) )
                         fn = getattr(v,p_name)
                         C = parse_Clr_Array(  z.open( fn ).read() )
                         setattr(  v, p_name, C )
Beispiel #56
0
def getTranslations(localeConfig, projectName, key):
    """Download all available translations from crowdin.

    Trigger crowdin to build the available export, wait for crowdin to
    finish the job and download the generated zip afterwards.
    """
    crowdin_request(projectName, 'export', key)

    result = crowdin_request(projectName, 'download/all.zip', key, raw=True)
    zip = ZipFile(StringIO(result))
    dirs = {}

    normalizedDefaultLocale = localeConfig['default_locale'].replace('_', '-')
    normalizedDefaultLocale = CROWDIN_LANG_MAPPING.get(normalizedDefaultLocale,
                                                       normalizedDefaultLocale)

    for info in zip.infolist():
        if not info.filename.endswith('.json'):
            continue

        dir, file = os.path.split(info.filename)
        if not re.match(r'^[\w\-]+$', dir) or dir == normalizedDefaultLocale:
            continue
        if file.count('.') == 1:
            origFile = file
        else:
            origFile = os.path.splitext(file)[0]

        for key, value in CROWDIN_LANG_MAPPING.iteritems():
            if value == dir:
                dir = key
        dir = dir.replace('-', '_')

        data = zip.open(info.filename).read()
        if data == '[]':
            continue

        if dir not in dirs:
            dirs[dir] = set()
        dirs[dir].add(origFile)

        path = os.path.join(localeConfig['base_path'], dir, origFile)
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        if file.endswith('.json'):
            postprocessChromeLocale(path, data)
        else:
            data = json.loads(data)
            if origFile in data:
                fileHandle = codecs.open(path, 'wb', encoding='utf-8')
                fileHandle.write(data[origFile]['message'])
                fileHandle.close()

    # Remove any extra files
    for dir, files in dirs.iteritems():
        baseDir = os.path.join(localeConfig['base_path'], dir)
        if not os.path.exists(baseDir):
            continue
        for file in os.listdir(baseDir):
            path = os.path.join(baseDir, file)
            valid_extension = file.endswith('.json')
            if os.path.isfile(path) and valid_extension and file not in files:
                os.remove(path)
Beispiel #57
0
"""
This script is responsible for updating the source data of PyCollatinus.

.. author:: Thibault Clérice (@ponteineptique)
"""
from io import BytesIO
from zipfile import ZipFile
from urllib3 import PoolManager
import glob

# Setting up the list of files to update
files = [(file, file.replace("pycollatinus/data",
                             "collatinus-master/bin/data"))
         for file in glob.glob("pycollatinus/data/*.*")
         if not file.endswith(".pickle")]

print("Contacting Github")
http = PoolManager()
url = http.request(
    "GET", "https://github.com/biblissima/collatinus/archive/master.zip")
print("Reading zip")
zipfile = ZipFile(BytesIO(url.data))
for target, source in files:
    print("\tUpdating {}".format(target))
    with zipfile.open(source) as source_io:
        with open(target, "w") as target_io:
            target_io.write(source_io.read().decode().replace(
                "ho!|inv|||interj.|1", "ho|inv|||interj.|1"
            )  # Known line that creates bug in PyCollatinus
                            )

print("Done")
Beispiel #58
0
    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past,
        then pull the identifiers, and make equivalence axioms

        :param limit:
        :return:

        """

        LOG.info("getting identifier mapping")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['identifiers']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry in the archive is the mapping file
        fname = myzip.namelist()[0]
        foundheader = False

        # TODO align this species filter with the one above
        # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
        # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')

        speciesfilters = 'Homo sapiens,Mus musculus'.split(',')
        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip header lines
                if not foundheader:
                    if re.match(r'BIOGRID_ID', line.decode()):
                        foundheader = True
                    continue

                line = line.decode().strip()
                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type,
                 organism_label) = line.split('\t')

                if self.test_mode:
                    graph = self.testgraph
                    # skip any genes that don't match our test set
                    if int(biogrid_num) not in self.biogrid_ids:
                        continue
                else:
                    graph = self.graph

                model = Model(graph)

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:' + biogrid_num
                prefix = self.localtt[id_type]

                # TODO make these filters available as commandline options
                # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
                #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
                geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
                # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
                if (speciesfilters is not None) \
                        and (organism_label.strip() in speciesfilters):
                    line_counter += 1
                    if (geneidtypefilters is not None) \
                            and (prefix in geneidtypefilters):
                        mapped_id = ':'.join((prefix, id_num))
                        model.addEquivalentClass(biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        model.addClassToGraph(biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        myzip.close()

        return
Beispiel #59
0
    def __init__(self, path="."):
        if os.path.isfile(path):
            zip_file = ZipFile(path)
            open_file = lambda filename: io.TextIOWrapper(
                zip_file.open(filename), encoding="utf-8")
        else:
            assert os.path.isdir(path)
            open_file = lambda filename: open(os.path.join(path, filename))
        self.stops = {
            stop.stop_id: stop
            for stop in parse_csv(open_file("stops.txt"), "Stop")
        }
        self.routes = {
            route.route_id: route
            for route in parse_csv(open_file("routes.txt"), "Route")
        }
        self.trips = {
            trip.trip_id: trip
            for trip in parse_csv(open_file("trips.txt"), "Trip")
        }
        self.stop_times = parse_csv(open_file("stop_times.txt"), "StopTime")
        try:
            self.services = {
                service.service_id: service
                for service in parse_csv(open_file("calendar.txt"), "Calendar")
            }
        except Exception as e:
            print(e)
            self.services = {}
        self.agency = {
            agency.agency_id: agency
            for agency in parse_csv(open_file("agency.txt"), "Agency")
        }
        self.shapes = {}
        try:
            for point in parse_csv(open_file("shapes.txt"), "Shape"):
                if point.shape_id not in self.shapes:
                    self.shapes[point.shape_id] = []
                self.shapes[point.shape_id].append(point)
        except Exception:  # shapes.txt is optional in GTFS feeds
            pass
        for shape in self.shapes.values():
            shape.sort(key=lambda point: int(point.shape_pt_sequence))

        self.stop_times_by_trip_id = {}
        for stop_time in self.stop_times:
            if stop_time.trip_id not in self.stop_times_by_trip_id:
                self.stop_times_by_trip_id[stop_time.trip_id] = []
            self.stop_times_by_trip_id[stop_time.trip_id].append(stop_time)
        for stop_time_list in self.stop_times_by_trip_id.values():
            stop_time_list.sort(
                key=lambda stop_time: int(stop_time.stop_sequence))

        self.trips_by_list_of_stops = {}
        for trip in self.trips.values():
            trip_stops = self.trip_stops_ids(trip.trip_id)
            if trip_stops not in self.trips_by_list_of_stops:
                self.trips_by_list_of_stops[trip_stops] = set()
            self.trips_by_list_of_stops[trip_stops].add(trip.trip_id)

        self.all_lists_of_stops = sorted(self.trips_by_list_of_stops.keys())
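
    # A hedged sketch of the trip_stops_ids helper used above; it is not part of
    # the original snippet, but it must return something hashable (here a tuple
    # of stop_ids in stop_sequence order) so it can key trips_by_list_of_stops.
    def trip_stops_ids(self, trip_id):
        return tuple(stop_time.stop_id
                     for stop_time in self.stop_times_by_trip_id.get(trip_id, []))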
Beispiel #60
0
    try:
        from StringIO import StringIO as ReaderIO
        from urllib import urlopen
    except ImportError:
        from io import BytesIO as ReaderIO
        from urllib.request import urlopen

    print('Downloading large collection of URDF from Drake project...')
    print('This might take a few minutes...')
    resp = urlopen(
        'https://github.com/RobotLocomotion/drake/archive/master.zip')
    zipfile = ZipFile(ReaderIO(resp.read()))
    errors = []
    all_files = []

    for f in zipfile.namelist():
        if f.endswith('.urdf') or f.endswith('.xacro'):
            with zipfile.open(f) as urdf_file:
                try:
                    all_files.append(f)
                    r = Robot.from_urdf_file(urdf_file)
                except Exception as e:
                    errors.append((f, e))

    print('Found %d files and parsed successfully %d of them' %
          (len(all_files), len(all_files) - len(errors)))

    if len(errors):
        print('\nErrors found during parsing:')
        for error in errors:
            print(' * File=%s, Error=%s' % error)