Example #1
0
 def _check_md5(self):
     """Warn if the file's MD5 checksum differs from ``self.md5``.

     When ``sys.platform`` is ``'win32'`` no verification is attempted: a
     "not yet supported" message is logged and issued as a ``UserWarning``
     and the method returns. When ``self.md5`` is falsy nothing is checked.
     A mismatch is reported through the logger and as a ``UserWarning``;
     no exception is raised.
     """
     import sys
     if sys.platform == 'win32':
         msg = "Verifying md5 checksums is not yet supported for your OS."
         # Logger.warn is a deprecated alias; Logger.warning is the
         # supported spelling.
         logger.warning(msg)
         warnings.warn(msg, UserWarning)
         return
     if self.md5:
         if self.md5 != get_md5_checksum(self.filepath):
             msg = \
                 """The MD5 checksum of the file {} does not match the one
                  specified in the schema. This may not be the file you are
                  looking for."""
             # Format once and reuse the same text for both channels.
             msg = msg.format(self.filepath)
             logger.warning(msg)
             warnings.warn(msg, UserWarning)
Example #2
0
 def _check_md5(self):
     """Compare ``self.md5`` with the checksum of ``self.filepath``.

     On ``sys.platform == 'win32'`` the comparison is skipped; the method
     only logs and issues a ``UserWarning`` saying verification is not yet
     supported. If ``self.md5`` is falsy the method does nothing. A
     mismatch is logged and issued as a ``UserWarning`` rather than raised.
     """
     import sys
     if sys.platform == 'win32':
         msg = "Verifying md5 checksums is not yet supported for your OS."
         # Use Logger.warning: Logger.warn is a deprecated alias.
         logger.warning(msg)
         warnings.warn(msg, UserWarning)
         return
     if self.md5:
         if self.md5 != get_md5_checksum(self.filepath):
             msg = \
                 """The MD5 checksum of the file {} does not match the one
                  specified in the schema. This may not be the file you are
                  looking for."""
             # Build the final text a single time for logger and warning.
             msg = msg.format(self.filepath)
             logger.warning(msg)
             warnings.warn(msg, UserWarning)
 def test_md5(self):
     """Check if the md5 checksum validation works properly."""
     schema = deepcopy(self.basespecs["iris"])
     # Record the checksum of the unmodified file in the schema.
     schema['md5'] = get_md5_checksum(schema['path'])
     SchemaValidator(specification=schema)
     tempdir = tempfile.mkdtemp()
     # All work that follows mkdtemp lives inside try/finally so the
     # directory is removed even when preparing the altered CSV fails.
     try:
         outpath = op.join(tempdir, "bad_iris.csv")
         iris = pd.read_csv(schema['path'])
         # Dropping a column changes the file content, hence its checksum.
         del iris['Species']
         iris.to_csv(outpath, index=False)
         schema['path'] = outpath
         with warnings.catch_warnings(record=True) as catcher:
             SchemaValidator(specification=schema).get_parser_args()
             assert len(catcher) == 1
             assert issubclass(catcher[-1].category, UserWarning)
     finally:
         shutil.rmtree(tempdir)
Example #4
0
 def test_md5(self):
     """Check if the md5 checksum validation works properly."""
     schema = deepcopy(self.basespecs["iris"])
     # Store the checksum of the original file as the expected md5.
     schema['md5'] = get_md5_checksum(schema['path'])
     SchemaValidator(specification=schema)
     tempdir = tempfile.mkdtemp()
     # Keep every step after mkdtemp under try/finally; otherwise an error
     # while writing the modified CSV would leave the directory behind.
     try:
         outpath = op.join(tempdir, "bad_iris.csv")
         iris = pd.read_csv(schema['path'])
         # Remove a column so the written file no longer matches the md5.
         del iris['Species']
         iris.to_csv(outpath, index=False)
         schema['path'] = outpath
         with warnings.catch_warnings(record=True) as catcher:
             SchemaValidator(specification=schema).get_parser_args()
             assert len(catcher) == 1
             assert issubclass(catcher[-1].category, UserWarning)
     finally:
         shutil.rmtree(tempdir)
Example #5
0
    def _get_parser_args(self):
        if self.md5:
            if self.md5 != get_md5_checksum(self.filepath):
                msg = \
                    """The MD5 checksum of the file {} does not match the one
                     specified in the schema. This may not be the file you are
                     looking for."""
                logger.warn(msg.format(self.filepath))
                warnings.warn(msg.format(self.filepath), UserWarning)
        args = {}
        if self._delimiter:
            args['sep'] = self._delimiter

        # Columns to use
        if len(self.colnames) > 0:
            args['usecols'] = self.colnames

        # Columns to exclude
        if len(self.exclude_columns) > 0:
            usecols = colnames(self._filepath, sep=args.get('sep', ','))
            for colname in self.exclude_columns:
                usecols.remove(colname)
            args['usecols'] = usecols

        # NA values
        if len(self.na_values) > 0:
            args['na_values'] = self.na_values

        # Date/Time arguments
        # FIXME: Allow for a mix of datetime column groupings and individual
        # columns
        if len(self.datetime_cols) > 0:
            if isinstance(self.datetime_cols, dict):
                args['parse_dates'] = self.datetime_cols
            elif isinstance(self.datetime_cols, list):
                args['parse_dates'] = [self.datetime_cols]
        else:
            parse_dates = []
            for k, v in self._dtypes.iteritems():
                if v is datetime.date:
                    parse_dates.append(k)
            for k in parse_dates:
                del self._dtypes[k]
            args['dtype'] = self.dtypes
            if len(parse_dates) > 0:
                args['parse_dates'] = parse_dates

        if len(self.converters) > 0:
            args['converters'] = self.converters

        if self.header != 0:
            args['header'] = self.header
        if self.column_names is not None:
            if isinstance(self.column_names, list):
                args['names'] = self.column_names
                # Force include the header argument
                args['header'] = self.header
            elif isinstance(self.column_names, dict) or callable(self.column_names):
                self.df_rules['column_names'] = self.column_names

        if self.is_multifile:
            arglist = []
            for i in range(len(self._filepath)):
                argset = copy.deepcopy(args)
                argset.update({'filepath_or_buffer': self._filepath[i]})
                argset.update({'nrows': self._nrows[i]})
                arglist.append(argset)
            return arglist
        else:
            if self._filepath:
                args.update({'filepath_or_buffer': self._filepath})
            if "nrows" in self.specification:
                args.update({'nrows': self._nrows})
            self.pickled_args.update(args)
            return self.pickled_args
Example #6
0
 def test_md5(self):
     """Compare the checksum computed for ``self.filepath`` with a fixed digest."""
     expected_digest = "9b3ecf3031979169c0ecc5e03cfe20a6"
     self.assertEqual(expected_digest, get_md5_checksum(self.filepath))
Example #7
0
 def test_md5(self):
     """Verify the md5 checksum calculator against a known digest."""
     computed = get_md5_checksum(self.filepath)
     # The expected value stays the first argument to assertEqual.
     self.assertEqual("9b3ecf3031979169c0ecc5e03cfe20a6", computed)