コード例 #1
0
ファイル: schemas.py プロジェクト: lucassimon/upload-datasets
 def __init__(self):
     """Create the validation schema and store it on ``self.schemas``.

     The schema lists five columns in this order: "Given Name",
     "Family Name", "Age", "Sex" and "Customer ID".
     """
     # Both name columns get the same pair of whitespace checks; a fresh
     # pair of validator objects is created for each column.
     name_columns = [
         Column(
             label,
             [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
         )
         for label in ("Given Name", "Family Name")
     ]
     remaining_columns = [
         Column("Age", [InRangeValidation(0, 120)]),
         Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
         Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
     ]
     self.schemas = Schema(name_columns + remaining_columns)
コード例 #2
0
class UnorderedSchema(unittest.TestCase):
    """Checks a two-column schema created with ``ordered=False``.

    Column ``a`` has no validations; column ``b`` has a leading-whitespace
    validation.
    """

    schema = Schema(
        [
            Column('a'),
            Column('b', [LeadingWhitespaceValidation()]),
        ],
        ordered=False,
    )

    def test_fields(self):
        """The schema keeps both columns and the ``ordered`` flag it was given."""
        stored_columns = self.schema.columns
        self.assertEqual(
            len(stored_columns), 2,
            'The schema is not storing all of its columns')
        self.assertEqual(
            self.schema.ordered, False,
            'The schema is not storing the correct value of ordered')

    def test_validate_valid(self):
        """A frame without leading whitespace yields no validation results."""
        clean = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['1', '2', '3']})
        found = self.schema.validate(clean)
        self.assertEqual(
            len(found), 0, 'A correct data frame should have no errors')

    def test_validate_invalid(self):
        """Leading whitespace in both columns yields exactly one result."""
        padded = pd.DataFrame({'a': [' 1', '2', '3'], 'b': [' 1', '2', '3']})
        found = self.schema.validate(padded)
        self.assertEqual(
            len(found), 1, 'An incorrect data frame should report errors')

    def test_mixed_columns(self):
        """
        With ordered=False, schema columns are matched to data frame columns
        by name rather than by position.

        The schema lists its columns as [a, b]; the data frame lists them as
        [b, a]. Data frame column b contains a value with leading whitespace,
        and the schema's only validation is on column b.

        Schema         a                b (validation)
        Data Frame     b (error)        a

        Exactly one error on column b is therefore expected when the columns
        are matched by name.
        """
        # Header order is b,a; the first data row of column b starts with a space.
        csv_text = '\nb,a\n 1,1\n2,3\n3,3\n        '
        frame = pd.read_csv(StringIO(csv_text), sep=',', header=0, dtype=str)

        found = self.schema.validate(frame)

        self.assertEqual(len(found), 1, 'There should be 1 error')
        first = found[0]
        self.assertEqual(first.row, 0)
        self.assertEqual(
            first.column, 'b',
            'The Schema object is not associating columns and column schemas by name'
        )
        )
コード例 #3
0
ファイル: schemas.py プロジェクト: lucassimon/upload-datasets
 def __init__(self):
     """Create the validation schema and store it on ``self.schemas``.

     The schema lists twenty columns, from "id" through "extra". Most have
     no validations attached; the free-text ones are checked for leading
     and trailing whitespace, "due_date" for the ``%m/%d/%y`` format, and
     "status" against a fixed list of values.
     """

     def no_padding(name):
         # Column checked for leading and trailing whitespace; every call
         # creates its own validator objects.
         return Column(
             name,
             [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
         )

     columns = [
         Column("id"),
         no_padding("payer_name"),
         Column("document_amount"),
         Column("payed_amount"),
         Column("payer_id_number"),
         no_padding("payer_address"),
         Column("barcode"),
         Column("typable_line"),
         Column("number"),
         no_padding("document_number"),
         Column("due_date", [DateFormatValidation("%m/%d/%y")]),
         no_padding("city"),
         no_padding("state"),
         Column("zip_code"),
         Column("bank_answer_date"),
         Column("pdf_upload_date"),
         Column(
             "status",
             [InListValidation(["pending", "paid", "due", "error"])],
         ),
         Column("callback"),
         Column("object_id"),
         Column("extra"),
     ]
     self.schemas = Schema(columns)
コード例 #4
0
    def create_schema(self) -> Schema:
        """Build the ``Schema`` for every column in the spreadsheet definition.

        Each column starts with leading/trailing whitespace checks. Two
        columns get a list check built from the known institutions
        ('submitting_institution' by name, 'country' by country). Every
        column except 'country' then gets either a regex check or, failing
        that, an allowed-values check, plus a maximum-length check when a
        positive maximum length is configured. A column may be empty only
        when the definition does not mark it as mandatory.

        Returns:
            A ``Schema`` whose columns follow the key order of the
            spreadsheet definition.
        """
        definition = self.__spreadsheet_def
        schema_columns = []

        for key in definition.keys():
            rules = [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
            ]
            is_mandatory = definition.is_mandatory(key)

            # Institution name must be one of the known institutions.
            if key == 'submitting_institution':
                known_names = [inst.name for inst in self.__institutions]
                rules.append(InListValidation(known_names))

            # NOTE(review): this is a separate `if`, not an `elif`, so
            # 'submitting_institution' also runs through the `else` branch
            # below -- confirm that this is intended.
            if key == 'country':
                known_countries = [
                    inst.country for inst in self.__institutions
                ]
                rules.append(InListValidation(known_countries))
            else:
                # A configured regex takes precedence over allowed values.
                if definition.get_regex(key):
                    pattern_rule = MatchesPatternValidation(
                        definition.get_regex(key),
                        message=definition.get_regex_validation_message(key))
                    rules.append(pattern_rule)
                elif definition.get_allowed_values(key):
                    allowed_rule = InListValidation(
                        definition.get_allowed_values(key),
                        case_sensitive=False)
                    rules.append(allowed_rule)

                # Length check only when a positive limit is configured.
                limit = definition.get_max_length(key)
                if limit and limit > 0:
                    too_long = (
                        'field length is greater than {} characters'
                        .format(str(limit)))
                    rules.append(_StringLengthValidation(too_long, limit))

            # Mandatory columns must not be empty.
            schema_columns.append(
                Column(
                    definition.get_column_name(key),
                    rules,
                    allow_empty=not is_mandatory,
                ))

        return Schema(schema_columns)
コード例 #5
0
ファイル: utils.py プロジェクト: alexwohletz/pandas-utils
    def check_join_cols(df1, df2, on):
        """Validate the join-key columns of two data frames and print any issues.

        Every column named in ``on`` is checked, in both frames, for leading
        whitespace, trailing whitespace and duplicate values.

        Args:
            df1: First data frame; must contain every column in ``on``.
            df2: Second data frame; must contain every column in ``on``.
            on: A column name, or an iterable of column names, to check.

        Returns:
            None. Findings are printed; nothing is printed when both frames
            pass every check.
        """
        # Accept a single column name as well as a list of names; iterating
        # a bare string would otherwise yield its characters.
        join_cols = [on] if isinstance(on, str) else list(on)

        schema = Schema([
            Column(
                col,
                [
                    LeadingWhitespaceValidation(),
                    TrailingWhitespaceValidation(),
                    IsDistinctValidation()
                ],
            ) for col in join_cols
        ])

        # Flatten the per-frame results into one list. The previous code
        # tested the length of the outer list, which always has one entry
        # per frame, so the header was printed even without any errors.
        errors = list(
            itertools.chain.from_iterable(
                schema.validate(frame)
                for frame in (df1[join_cols], df2[join_cols])))

        if errors:
            print("The following issues exist in the index:")
            for error in errors:
                print(error)
コード例 #6
0
ファイル: test_column.py プロジェクト: santoshdata/PandasRef
class DoubleValidationColumn(unittest.TestCase):
    """
    Test a column with two different validations
    """
    NAME = 'col1'

    # Every value in `ser` has both leading and trailing whitespace, so each
    # of the two validations is expected to flag every row.
    col = Column(
        NAME, [TrailingWhitespaceValidation(),
               LeadingWhitespaceValidation()],
        allow_empty=False)
    ser = pd.Series([' a ', ' b ', ' c '])

    def test_outputs(self):
        """Both validations must report an error for every row of the series."""
        results = self.col.validate(self.ser)

        # There should be 6 errors, 2 for each row
        self.assertEqual(len(results), 2 * len(self.ser),
                         'A Column produces the wrong number of errors')
        # Check every row of the series; the loop previously stopped after
        # the first two rows and never inspected the last one.
        for i in range(len(self.ser)):
            in_row = [r for r in results if r.row == i]
            self.assertEqual(
                len(in_row), 2,
                'A Column does not report both errors for every row')
コード例 #7
0
ファイル: test_schema.py プロジェクト: twschum/PandasSchema
class OrderedSchema(unittest.TestCase):
    """Checks a two-column schema created with ``ordered=True``.

    Column ``a`` has a leading-whitespace validation; column ``b`` has no
    validations.
    """

    schema = Schema(
        [
            Column('a', [LeadingWhitespaceValidation()]),
            Column('b'),
        ],
        ordered=True,
    )

    def test_mixed_columns(self):
        """
        With ordered=True, schema columns are matched to data frame columns
        by position rather than by name.

        The schema lists its columns as [a, b]; the data frame lists them as
        [b, a]. Data frame column b contains a value with leading whitespace,
        and the schema's only validation is on its first column, a.

        Schema         a (validation)   b
        Data Frame     b (error)        a

        One error is therefore expected only if the schema's first column (a)
        is paired with the data frame's first column (b). The reported column
        name is that of the data frame column, 'b'.
        """
        # Header order is b,a; the first data row of column b starts with a space.
        csv_text = '\nb,a\n 1,1\n2,3\n3,3\n        '
        frame = pd.read_csv(StringIO(csv_text), sep=',', header=0, dtype=str)

        found = self.schema.validate(frame)

        self.assertEqual(len(found), 1, 'There should be 1 error')
        first = found[0]
        self.assertEqual(first.row, 0)
        self.assertEqual(
            first.column, 'b',
            'The Schema object is not associating columns and column schemas by position'
        )
コード例 #8
0
ファイル: test_schema.py プロジェクト: twschum/PandasSchema
class UnorderedSchema(unittest.TestCase):
    """Checks a two-column schema created with ``ordered=False``.

    Column ``a`` has no validations; column ``b`` has a leading-whitespace
    validation. Several tests also exercise the ``columns=`` argument of
    ``Schema.validate``.
    """

    schema = Schema(
        [
            Column('a'),
            Column('b', [LeadingWhitespaceValidation()]),
        ],
        ordered=False,
    )

    # CSV fixture shared by the column-matching tests: the header order is
    # b,a and the first data row of column b starts with a space.
    _SWAPPED_CSV = '\nb,a\n 1,1\n2,3\n3,3\n        '

    def _swapped_frame(self):
        """Parse the shared CSV fixture into a fresh data frame of strings."""
        return pd.read_csv(
            StringIO(self._SWAPPED_CSV), sep=',', header=0, dtype=str)

    def test_fields(self):
        """The schema keeps both columns and the ``ordered`` flag it was given."""
        stored_columns = self.schema.columns
        self.assertEqual(
            len(stored_columns), 2,
            'The schema is not storing all of its columns')
        self.assertEqual(
            self.schema.ordered, False,
            'The schema is not storing the correct value of ordered')

    def test_validate_valid(self):
        """A frame without leading whitespace yields no validation results."""
        clean = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['1', '2', '3']})
        found = self.schema.validate(clean)
        self.assertEqual(
            len(found), 0, 'A correct data frame should have no errors')

    def test_validate_invalid(self):
        """Leading whitespace in both columns yields exactly one result."""
        padded = pd.DataFrame({'a': [' 1', '2', '3'], 'b': [' 1', '2', '3']})
        found = self.schema.validate(padded)
        self.assertEqual(
            len(found), 1, 'An incorrect data frame should report errors')

    def test_mixed_columns(self):
        """
        With ordered=False, schema columns are matched to data frame columns
        by name rather than by position.

        The schema lists its columns as [a, b]; the data frame lists them as
        [b, a]. Data frame column b contains a value with leading whitespace,
        and the schema's only validation is on column b.

        Schema         a                b (validation)
        Data Frame     b (error)        a

        Exactly one error on column b is therefore expected when the columns
        are matched by name.
        """
        found = self.schema.validate(self._swapped_frame())

        self.assertEqual(len(found), 1, 'There should be 1 error')
        first = found[0]
        self.assertEqual(first.row, 0)
        self.assertEqual(
            first.column, 'b',
            'The Schema object is not associating columns and column schemas by name'
        )

    def test_column_subset_detect(self):
        """
        With ordered=False, validation can be limited to a subset of the
        schema's columns through the ``columns`` argument.

        Schema         a*               b (validation)
        Data Frame     b (error)        a not passed

        A column marked * is not passed to ``validate``.

        Only column b is requested, so the single leading-whitespace error in
        data frame column b is expected.
        """
        found = self.schema.validate(self._swapped_frame(), columns=['b'])

        self.assertEqual(len(found), 1, 'There should be 1 error')
        first = found[0]
        self.assertEqual(first.row, 0)
        self.assertEqual(
            first.column, 'b',
            'The Schema object is not associating columns and column schemas by name'
        )

    def test_column_subset_detect_empty(self):
        """
        With ordered=False, validation can be limited to a subset of the
        schema's columns through the ``columns`` argument.

        Schema         a                b* (validation)
        Data Frame     b (error)        a

        A column marked * is not passed to ``validate``.

        Only column a is requested, and it has no validations, so no errors
        are expected.
        """
        # should detect no errors
        found = self.schema.validate(self._swapped_frame(), columns=['a'])

        self.assertEqual(len(found), 0, 'There should be no errors')

    def test_column_subset_error(self):
        """
        With ordered=False, requesting a column that the schema does not
        contain is rejected.

        Schema         a                b (validation)
        Data Frame     b (error)        a

        Passing a column other than 'a' or 'b' must raise
        ``PanSchArgumentError``.
        """
        frame = self._swapped_frame()

        # should raise a PanSchArgumentError
        with self.assertRaises(PanSchArgumentError):
            self.schema.validate(frame, columns=['c'])
コード例 #9
0
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, InRangeValidation, \
    DateFormatValidation, InListValidation

# Validation rules for the five fields of the fixed-width file.
schema = Schema([
    Column(
        'name',
        [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
    ),
    Column(
        'title',
        [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
    ),
    Column('salary', [InRangeValidation(0, 33000)]),
    Column('sex', [InListValidation(['F', 'M'])]),
    Column('date', [DateFormatValidation('%Y-%m-%d')]),
])

# Character width of each field, in order: name, title, salary, sex, date.
widths = [9, 19, 6, 4, 11]

# read source data
test_data = pd.read_fwf("data/fixed_width.txt", widths=widths)
print('orig dataset', test_data, sep='\n')

# data verification
コード例 #10
0
def main():
    """Check a dataset folder against its ``participants.tsv`` and print problems.

    Steps, in order:

    1. Compare the ``sub*`` sub-directories of the input folder with the
       ``participant_id`` column of ``participants.tsv`` and report
       mismatches in either direction.
    2. Walk the folder and report every ``.nii.gz`` file that has no
       ``.json`` file with the same base name next to it.
    3. Validate the contents of ``participants.tsv`` against a column
       schema and print each validation error.

    The input folder is read from the ``path_in`` attribute of the parsed
    command-line arguments. Nothing is returned; all findings are printed.
    """
    # Parse input arguments
    parser = get_parser()
    args = parser.parse_args()

    data_path = args.path_in

    path_tsv = os.path.join(data_path, 'participants.tsv')
    tsv_file = pd.read_csv(path_tsv, sep='\t')

    # Subject folders present on disk vs. subjects listed in the table.
    list_subj = [
        name for name in os.listdir(data_path) if
        os.path.isdir(os.path.join(data_path, name)) and name.startswith('sub')
    ]
    list_tsv_participants = tsv_file['participant_id'].tolist()
    missing_subjects_tsv = sorted(set(list_subj) - set(list_tsv_participants))
    missing_subjects_folder = sorted(
        set(list_tsv_participants) - set(list_subj))

    if missing_subjects_tsv:
        print('\nWarning missing following subjects from participants.tsv: ')
        pprint(missing_subjects_tsv)
    if missing_subjects_folder:
        print(
            '\nWarning missing data for subjects listed in participants.tsv: ')
        pprint(missing_subjects_folder)

    # Every image file is expected to have a JSON sidecar beside it.
    image_suffix = '.nii.gz'
    for dir_name, _subdirs, file_names in os.walk(data_path):
        for file_name in file_names:
            if not file_name.endswith(image_suffix):
                continue
            # Strip exactly the image suffix. Splitting on the first "."
            # would cut names that contain another dot (e.g. "acq-0.5mm")
            # and report a missing sidecar under the wrong name.
            base_name = file_name[:-len(image_suffix)]
            json_sidecar_path = os.path.join(dir_name, base_name + '.json')
            if not os.path.exists(json_sidecar_path):
                print("Missing jsonSidecar: " + json_sidecar_path)

    # Checking participants.tsv contents
    schema = Schema([
        Column('participant_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        # Either a date in YYYY-MM-DD form or a "-" placeholder.
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        Column('institution_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('institution',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturer',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturers_model_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('receive_coil_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('software_versions',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('researcher',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
    ])

    errors = schema.validate(tsv_file)
    print('\nChecking the contents of participants.tsv')
    if not errors:
        print("--> all good 👍")
    else:
        for error in errors:
            print(error)
コード例 #11
0
        # Iterate over each pair of schema columns and data frame series and run validations
        column_pairs, errors = self._get_column_pairs(panda_sdrf)
        for series, column in column_pairs:
            errors += column.validate(series)
        return sorted(errors, key=lambda e: e.row)

    def check_recommendations(self, panda_sdrf):
        """Collect the optional-validation results for every matched column.

        Args:
            panda_sdrf: The object passed on to ``self._get_column_pairs``.

        Returns:
            A new list holding everything returned by each column's
            ``validate_optional``, sorted by the ``row`` attribute of the
            items. Items with equal rows keep the order in which they were
            collected.
        """
        # The second value returned by _get_column_pairs is not used here.
        pairs, _unused_errors = self._get_column_pairs(panda_sdrf)

        collected = []
        for series, column in pairs:
            collected.extend(column.validate_optional(series))

        collected.sort(key=lambda item: item.row)
        return collected


default_schema = SDRFSchema([
    SDRFColumn('source name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
               allow_empty=True,
               optional_type=False),
    SDRFColumn('characteristics[organism part]', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
               allow_empty=True,
               optional_type=False),
    SDRFColumn('characteristics[disease]', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
               allow_empty=True,
               optional_type=False),
    SDRFColumn('characteristics[organism]',
               [LeadingWhitespaceValidation(), TrailingWhitespaceValidation(),
                OntologyTerm("ncbitaxon", not_applicable=True)],
               allow_empty=False,
               optional_type=False),
    SDRFColumn('characteristics[cell type]', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
               allow_empty=False,
コード例 #12
0
    def validate_csv(cls, data, registration_type):
        """Read a CSV and validate it against the schema for a registration type.

        Both registration types require the columns ``ipaddr``, ``username``,
        ``password``, ``sa_name``, ``va_name`` and ``domain``. Type ``'slr'``
        additionally requires ``license``, ``license_count``,
        ``tftp_server_ip`` and ``tftp_server_path``. Every column is checked
        for leading and trailing whitespace; most must also be non-empty, and
        the two address columns must look like a dotted IPv4 address.

        Args:
            data: Anything accepted by ``pd.read_csv``.
            registration_type: ``'slr'`` or ``'sl'``.

        Returns:
            ``(True, None, df)`` when validation finds no errors, otherwise
            ``(False, errors_list, df)``, where ``df`` is the parsed frame.

        Raises:
            ValueError: If ``registration_type`` is neither ``'slr'`` nor
                ``'sl'`` (previously this surfaced as an UnboundLocalError).
        """
        df = pd.read_csv(data)
        logger.info("Printing dataframe before CSV validation...")
        logger.info(df)

        def trimmed(name, *extra):
            # Column without leading/trailing whitespace, plus extra checks.
            return Column(name, [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
                *extra
            ])

        def not_empty():
            return CustomSeriesValidation(lambda x: x.str.len() > 0,
                                          'Column is empty!')

        def ipv4_shape():
            # The dots are escaped so only a literal "." separates the
            # octets; an unescaped "." would match any character.
            return MatchesPatternValidation(
                r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$')

        def common_columns():
            # Columns shared by both registration types, in file order.
            return [
                trimmed('ipaddr', ipv4_shape(), not_empty()),
                trimmed('username', not_empty()),
                trimmed('password', not_empty()),
                trimmed('sa_name', not_empty()),
                trimmed('va_name', not_empty()),
                trimmed('domain', not_empty()),
            ]

        if registration_type == 'slr':
            csv_schema = Schema(common_columns() + [
                trimmed('license'),
                trimmed('license_count'),
                trimmed('tftp_server_ip', ipv4_shape(), not_empty()),
                trimmed('tftp_server_path', not_empty()),
            ])
        elif registration_type == 'sl':
            csv_schema = Schema(common_columns())
        else:
            raise ValueError(
                "Unsupported registration_type: {!r} (expected 'slr' or 'sl')"
                .format(registration_type))

        errors = csv_schema.validate(df)

        if errors:
            errors_list = list(errors)
            for error in errors_list:
                print(error)
            return False, errors_list, df
        return True, None, df
コード例 #13
0
             '38': 'GRCh38'}

# File-name endings listed as valid for input files.
VALID_FILE_EXTENSIONS = [
    ".txt",
    ".tsv",
    ".csv",
    ".tsv.gz",
    ".csv.gz",
    "gz",
    "gzip",
    ".tsv.gzip",
    ".csv.gzip",
]

# Baseline column validators, keyed by column name. Every column may be
# empty here except the effect allele column.
GENERIC_VALIDATORS = {
    SNP_DSET: Column(
        SNP_DSET,
        [
            CanConvertValidation(DSET_TYPES[SNP_DSET]),
            MatchesPatternValidation(r'^rs[0-9]+$'),
        ],
        allow_empty=True,
    ),
    CHR_DSET: Column(
        CHR_DSET,
        [InListValidation(VALID_CHROMOSOMES)],
        allow_empty=True,
    ),
    BP_DSET: Column(
        BP_DSET,
        [
            CanConvertValidation(DSET_TYPES[BP_DSET]),
            InInclusiveRangeValidation(1, 999999999),
        ],
        allow_empty=True,
    ),
    EFFECT_WEIGHT_DSET: Column(
        EFFECT_WEIGHT_DSET,
        [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])],
        allow_empty=True,
    ),
    OR_DSET: Column(
        OR_DSET,
        [CanConvertValidation(DSET_TYPES[OR_DSET])],
        allow_empty=True,
    ),
    HR_DSET: Column(
        HR_DSET,
        [CanConvertValidation(DSET_TYPES[HR_DSET])],
        allow_empty=True,
    ),
    BETA_DSET: Column(
        BETA_DSET,
        [CanConvertValidation(DSET_TYPES[BETA_DSET])],
        allow_empty=True,
    ),
    EFFECT_DSET: Column(
        EFFECT_DSET,
        [MatchesPatternValidation(r'^[ACTGN]+$')],
        allow_empty=False,
    ),
    REF_DSET: Column(
        REF_DSET,
        [MatchesPatternValidation(r'^[ACTGN]+$')],
        allow_empty=True,
    ),
    FREQ_DSET: Column(
        FREQ_DSET,
        [CanConvertValidation(DSET_TYPES[FREQ_DSET])],
        allow_empty=True,
    ),
    LOCUS_DSET: Column(
        LOCUS_DSET,
        [
            CanConvertValidation(DSET_TYPES[LOCUS_DSET]),
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
        ],
        allow_empty=True,
    ),
}

# Copy of the baseline in which the SNP column must not be empty.
SNP_VALIDATORS = dict(GENERIC_VALIDATORS)
SNP_VALIDATORS[SNP_DSET] = Column(
    SNP_DSET,
    [
        CanConvertValidation(DSET_TYPES[SNP_DSET]),
        MatchesPatternValidation(r'^rs[0-9]+$'),
    ],
    allow_empty=False,
)

# Copy of the baseline in which the SNP column also accepts the text "nan",
# and the chromosome and position columns must not be empty.
SNP_EMPTY_VALIDATORS = dict(GENERIC_VALIDATORS)
SNP_EMPTY_VALIDATORS[SNP_DSET] = Column(
    SNP_DSET,
    [
        CanConvertValidation(DSET_TYPES[SNP_DSET]),
        MatchesPatternValidation(r'^(rs[0-9]+|nan)$'),
    ],
    allow_empty=False,
)
SNP_EMPTY_VALIDATORS[CHR_DSET] = Column(
    CHR_DSET,
    [InListValidation(VALID_CHROMOSOMES)],
    allow_empty=False,
)
SNP_EMPTY_VALIDATORS[BP_DSET] = Column(
    BP_DSET,
    [
        CanConvertValidation(DSET_TYPES[BP_DSET]),
        InInclusiveRangeValidation(1, 999999999),
    ],
    allow_empty=False,
)

# Copy of the baseline in which the chromosome and position columns must
# not be empty.
POS_VALIDATORS = dict(GENERIC_VALIDATORS)
POS_VALIDATORS[CHR_DSET] = Column(
    CHR_DSET,
    [InListValidation(VALID_CHROMOSOMES)],
    allow_empty=False,
)
POS_VALIDATORS[BP_DSET] = Column(
    BP_DSET,
    [
        CanConvertValidation(DSET_TYPES[BP_DSET]),
        InInclusiveRangeValidation(1, 999999999),
    ],
    allow_empty=False,
)

# Starts as a plain copy of the baseline.
EFFECT_WEIGHT_VALIDATOR = dict(GENERIC_VALIDATORS)
コード例 #14
0
ファイル: views_clients.py プロジェクト: ccpic/SalesExpense
def validate(df):
    """Validate an uploaded client table and collect the problems found.

    The columns listed in ``COL`` are validated against a column schema.
    Each schema error is converted to text, passed through the
    ``D_TRANSLATE`` replacements and the ``row_refined`` substitution, and
    stored as a key of the result. The results of the ``check_inconsist``
    and ``check_hplevel_with_dept`` checks are then merged in.

    Args:
        df: The data frame to validate.

    Returns:
        A dict keyed by message text. Schema errors map to ``"<br>"``; the
        values for the other checks come from the helper functions.
    """
    d_error = {}
    list_bu = [x[0] for x in BU_CHOICES]
    list_rd = [x[0] for x in RD_CHOICES]
    list_dept = [x[0] for x in DEPT_CHOICES]
    list_hplevel = [x[0] for x in HPLEVEL_CHOICES]
    list_province = [x[0] for x in PROVINCE_CHOICES]
    list_title = [x[0] for x in TITLE_CHOICES]

    def _not_nan(d):
        # True for any value that is not a floating-point NaN. An identity
        # test against np.nan only recognises that one singleton and lets
        # other NaN objects (e.g. numpy float scalars) through as filled.
        return not (isinstance(d, (float, np.floating)) and np.isnan(d))

    # The message text means "this field must not be empty".
    NullValidation = CustomElementValidation(_not_nan, "该字段不能为空")
    schema = Schema([
        Column("南北中国", [InListValidation(list_bu)]),
        Column("区域", [InListValidation(list_rd)]),
        Column("大区", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("地区经理", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("负责代表", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column(
            "医院编码",
            [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
                NullValidation,
                MatchesPatternValidation(r"^[H]{1}(\d){9}$"),
            ],
        ),
        Column("医院全称", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("省/自治区/直辖市", [InListValidation(list_province)]),
        Column("是否双call", [InListValidation(["是", "否"])]),
        Column("医院级别", [InListValidation(list_hplevel)]),
        Column("开户进展", [InListValidation(["已开户", "未开户"])]),
        Column("客户姓名", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            IsDistinctValidation()
        ]),
        Column("所在科室", [InListValidation(list_dept)]),
        Column("职称", [InListValidation(list_title)]),
        Column("月出诊次数(半天计)",
               [CanConvertValidation(int),
                InRangeValidation(0, 63)]),
        Column("每半天\n门诊量", [CanConvertValidation(int),
                            InRangeValidation(0, )]),
        Column("相关病人\n比例(%)\n建议比例:40%-80%",
               [CanConvertValidation(int),
                InRangeValidation(0, 101)]),
        Column("备注"),
    ])
    errors = schema.validate(df.loc[:, COL])

    findword = r": [0-9]\d*"
    for error in errors:
        str_warning = str(error)
        for term in D_TRANSLATE:
            str_warning = str_warning.replace(term, D_TRANSLATE[term])
            # NOTE(review): this substitution runs once per D_TRANSLATE
            # entry rather than once per error -- confirm that repeated
            # application of row_refined is intended.
            str_warning = re.sub(findword, row_refined, str_warning)
        d_error[str_warning] = "<br>"

    # Pairs of columns passed to check_inconsist, with its third argument.
    # The order is kept because later results overwrite earlier keys.
    consistency_checks = [
        ("医院编码", "医院全称", "both"),
        ("区域", "大区", "right"),
        ("大区", "地区经理", "right"),
        ("地区经理", "负责代表", "right"),
        ("医院编码", "省/自治区/直辖市", "left"),
        ("医院编码", "是否双call", "left"),
        ("医院编码", "医院级别", "left"),
        ("医院编码", "开户进展", "left"),
        ("医院全称", "省/自治区/直辖市", "left"),
        ("医院全称", "是否双call", "left"),
        ("医院全称", "医院级别", "left"),
        ("医院全称", "开户进展", "left"),
    ]
    for left_col, right_col, how in consistency_checks:
        d_error = {**d_error, **check_inconsist(df, left_col, right_col, how)}

    # Check whether hospital level and department contradict each other.
    d_error = {**d_error, **check_hplevel_with_dept(df)}
    return d_error
コード例 #15
0
ファイル: metadata_schema.py プロジェクト: biobakery/jdrf1
from pandas_schema import Column, Schema
from pandas_schema.validation import (LeadingWhitespaceValidation, TrailingWhitespaceValidation, 
                                      CanConvertValidation, MatchesPatternValidation, CustomSeriesValidation,
                                      InRangeValidation, InListValidation, DateFormatValidation)


def _required_value(column_name):
    """Return the "value is required" rule for *column_name*.

    The rule combines a not-null check, whose message names the column,
    with a negated ``InListValidation([''])``.
    """
    not_null = CustomSeriesValidation(
        lambda x: ~x.isnull(),
        'A value is required for the {} column.'.format(column_name),
    )
    return not_null & ~InListValidation([''])


# Column rules for the study metadata sheet. In the rules joined with "|",
# one side is InListValidation(['']) and the other is the format check.
study_schema = Schema([
    Column('study_id', [_required_value('study_id')]),
    Column('pi_name', [_required_value('pi_name')]),
    Column(
        'sample_type',
        [InListValidation(['wmgx', 'wmtx', '16S', 'other'])],
    ),
    Column(
        'bioproject_accession',
        [InListValidation(['']) | MatchesPatternValidation(r'PRJ\w+\d+')],
    ),
    Column(
        'geo_loc_name',
        [InListValidation(['']) | MatchesPatternValidation(r'\w+:\w+:\w+')],
    ),
    Column(
        'analysis_desc',
        [InListValidation(['']) | CanConvertValidation(str)],
    ),
    Column('sequencing_facility', [LeadingWhitespaceValidation()]),
    Column(
        'env_biom',
        [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])],
    ),
    Column(
        'env_feature',
        [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])],
    ),
    Column(
        'env_material',
        [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])],
    ),
    Column(
        'host_tissue_sampled',
        [InListValidation(['']) | MatchesPatternValidation(r'BTO:\d+')],
    ),
    Column('animal_vendor', [LeadingWhitespaceValidation()]),
    Column('paired', [InListValidation(['true', 'false'])]),
    Column(
        'paired_id',
        [InListValidation(['']) | MatchesPatternValidation(r'[a-zA-Z0-9_.]+')],
    ),
    Column('pi_email', [_required_value('pi_email')]),
])

sample_schema = Schema([
    Column('host_subject_id', [MatchesPatternValidation(r'\w+', message='Host Subject ID may only contain alphanumeric characters.')]),
    Column('host_diet', [LeadingWhitespaceValidation()]),