def generate_template_from_df(input_df: pd.DataFrame, dataset_qnode: str, dataset_id: str) -> dict: """ Function used for datamart annotation batch mode, return a dict of dataFrame instead of output a xlsx file. """ # Assumes cell [0,0] is the start of the annotation if not input_df.iloc[0, 0] == 'dataset': raise Exception( 'The first column of the dataframe must bet the annotations (not the index of the dataframe)' ) utility = Utility() # updated 2020.7.22: it is possible that header is not at row 7, so we need to search header row if exist header_row, data_row = utility.find_data_start_row(input_df) input_df = input_df.set_index(0) if 'tag' in input_df.iloc[:7, 0]: annotation_rows = list(range(1, 7)) + [header_row] else: annotation_rows = list(range(1, 6)) + [header_row] content_rows = list(range(data_row, len(input_df))) annotation_part = input_df.iloc[annotation_rows].fillna("") content_part = input_df.iloc[content_rows] # start generate dataframe for templates dataset_df = _generate_dataset_tab(input_df, dataset_qnode, dataset_id) attribute_df = _generate_attributes_tab(dataset_qnode, annotation_part) unit_df = _generate_unit_tab(dataset_qnode, content_part, annotation_part) extra_df, wikifier_df1 = _process_main_subject(dataset_qnode, content_part, annotation_part, data_row) wikifier_df2 = _generate_wikifier_part(content_part, annotation_part, data_row) wikifier_df = pd.concat([wikifier_df1, wikifier_df2]) output_df_dict = { 'dataset_file': dataset_df, 'attributes_file': attribute_df, 'units_file': unit_df, "extra_edges": extra_df, "Wikifier_t2wml": wikifier_df, "wikifier": None, "qualifiers": None, } return output_df_dict
def _get_admins(self) -> list: # country and admins qualifiers = [] items = [] for col_type in [Type.ADMIN3, Type.ADMIN2, Type.ADMIN1, Type.COUNTRY]: if get_indices(self.sheet.iloc[self.type_index, :], col_type.value).shape[0]: col_index = Utility.get_index(self.sheet.iloc[self.type_index, :], col_type.value) context = location_context[col_type] if self.main_subject_index == col_index: context = "main subject" item = f'item[{to_letter_column(col_index)}, $row, "{context}"]' value = f'={item}' entry = { 'property': f'{property_node[col_type]}', 'value': value } # 2021-05-16: Adding location edges causes Postgres to generate O(row^2) query plans. if not context == 'main subject': qualifiers.append(entry) items.append(item) if items: qualifier = { 'property': 'P131', 'value': '=' + ' or '.join(items) } qualifiers.append(qualifier) return qualifiers
def _get_region(self) -> dict: try: top = self.data_index bottom = self.sheet.shape[0] left = Utility.get_index(self.sheet.iloc[1, :], Role.VARIABLE.value) right = Utility.get_index(self.sheet.iloc[1, :], Role.VARIABLE.value, pos=-1) region = { 'left': to_letter_column(left), 'right': to_letter_column(right), 'top': top + 1, 'bottom': bottom } except IndexError: print('WARNING: No columns with "variable" role annotation.') region = { 'left': 'A # FIX ME', 'right': 'A # FIX ME', 'top': top + 1, 'bottom': bottom } return region
def __init__(self): self.error_report = [] self.valid_roles = { 'main subject': 1, 'time': 1, 'location': 1, 'variable': 1, 'qualifier': 1, 'unit': 1 } self.valid_types = { 'main subject': ['string', 'entity', 'admin1', 'admin2', 'admin3', 'country'], 'time': ['date', 'year', 'month', 'day'], 'location': ['admin1', 'admin2', 'admin3', 'longitude', 'latitude', 'country'], 'variable': ['number'], 'qualifier': ['string', 'number', 'date'], 'unit': ['string'] } self.reserved_qualifier_columns = { 'location': 1, 'time': 1, 'dataset': 1, 'variable': 1, 'main subject': 1, 'value': 1, 'time_precision': 1, 'country': 1, 'admin1': 1, 'admin2': 1, 'admin3': 1, 'longitude': 1, 'latitude': 1 } self.inhouse_utilty = Utility() self.renamed_columns = {}
def __init__(self, annotated_spreadsheet: pd.DataFrame, dataset_qnode: str): self.sheet = annotated_spreadsheet self.dataset_qnode = dataset_qnode # category rows # self.dataset_index = get_index(self.sheet.iloc[:, 0], Category.DATASET.value) Not needed self.role_index = Utility.get_index(self.sheet.iloc[:, 0], Category.ROLE.value) self.type_index = Utility.get_index(self.sheet.iloc[:, 0], Category.TYPE.value) self.unit_index = Utility.get_index(self.sheet.iloc[:, 0], Category.UNIT.value) self.header_index = Utility.get_index(self.sheet.iloc[:, 0], Category.HEADER.value) self.data_index = Utility.get_index(self.sheet.iloc[:, 0], Category.DATA.value) # role self.time_indcies = get_indices(self.sheet.iloc[self.role_index, :], Role.TIME.value) self.location_indices = get_indices( self.sheet.iloc[self.role_index, :], Role.LOCATION.value) self.variable_indices = get_indices( self.sheet.iloc[self.role_index, :], Role.VARIABLE.value) self.qualifier_indices = get_indices( self.sheet.iloc[self.role_index, :], Role.QUALIFIER.value) self.units_indices = get_indices(self.sheet.iloc[self.role_index, :], Role.UNIT.value, startswith=True) self.variable_columns = self.sheet.iloc[1, :] == Role.VARIABLE.value self.failures = [] ## main subject role try: self.main_subject_index = Utility.get_index( self.sheet.iloc[self.role_index, :], Role.MAIN_SUBJECT.value) except: # use a location column for main subject self.main_subject_index = 0 if self.location_indices.shape[0]: for col_type in [ Type.ADMIN3, Type.ADMIN2, Type.ADMIN1, Type.COUNTRY ]: admin_indices = get_indices( self.sheet.iloc[self.type_index, :], col_type.value) if admin_indices.shape[0]: self.main_subject_index = admin_indices[0] break if self.main_subject_index == 0: print( 'WARNING: No columns with "main subject" role annotation, and no "location" roles.' )
def validate_annotation_column_one(self, df, dataset_id): valid_first_column = True dataset = df.iloc[0, 1].strip() if dataset == '': valid_first_column = False self.error_report.append( self.error_row('Please specify the dataset', 1, utility.xl_col_to_name(1), 'dataset can not be blank')) if dataset != dataset_id: valid_first_column = False self.error_report.append( self.error_row( 'Dataset ID in the file is not same as the Dataset ID in the URL', 1, utility.xl_col_to_name(1), 'Expected: {} but got:{}'.format(dataset_id, dataset))) if df.iloc[0, 0].strip().lower() != 'dataset': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 1, utility.xl_col_to_name(0), 'First row in column 1 should be "dataset"')) if df.iloc[1, 0].strip().lower() != 'role': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 2, utility.xl_col_to_name(0), 'Second row in column 1 should be "role"')) if df.iloc[2, 0].strip().lower() != 'type': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 3, utility.xl_col_to_name(0), 'Third row in column 1 should be "type"')) if df.iloc[3, 0].strip().lower() != 'description': valid_first_column = False self.error_report.append( self.error_row( 'Incorrect annotation: First Column', 4, utility.xl_col_to_name(0), 'Fourth row in column 1 should be "description"')) if df.iloc[4, 0].strip().lower() != 'name': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 5, utility.xl_col_to_name(0), 'Fifth row in column 1 should be "name"')) if df.iloc[5, 0].strip().lower() != 'unit': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 6, utility.xl_col_to_name(0), 'Sixth row in column 1 should be "unit"')) if df.iloc[6, 0].strip().lower() != 'tag': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 7, utility.xl_col_to_name(0), 'Seventh row in column 1 should be "tag"')) try: header_index = Utility.get_index(df.iloc[:, 0], Category.HEADER.value) except: valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', -1, utility.xl_col_to_name(0), 'Missing annotation "header" in column A')) try: data_index = Utility.get_index(df.iloc[:, 0], Category.DATA.value) except: valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', -1, utility.xl_col_to_name(0), 'Missing annotation "data" in column A')) return valid_first_column
class ValidateAnnotation(object): def __init__(self): self.error_report = [] self.valid_roles = { 'main subject': 1, 'time': 1, 'location': 1, 'variable': 1, 'qualifier': 1, 'unit': 1 } self.valid_types = { 'main subject': ['string', 'entity', 'admin1', 'admin2', 'admin3', 'country'], 'time': ['date', 'year', 'month', 'day'], 'location': ['admin1', 'admin2', 'admin3', 'longitude', 'latitude', 'country'], 'variable': ['number'], 'qualifier': ['string', 'number', 'date'], 'unit': ['string'] } self.reserved_qualifier_columns = { 'location': 1, 'time': 1, 'dataset': 1, 'variable': 1, 'main subject': 1, 'value': 1, 'time_precision': 1, 'country': 1, 'admin1': 1, 'admin2': 1, 'admin3': 1, 'longitude': 1, 'latitude': 1 } self.inhouse_utilty = Utility() self.renamed_columns = {} def validate(self, dataset_id, file_path=None, df=None): if file_path is None and df is None: raise Exception('Please specify a file path or a pandas DataFrame') if file_path is not None: df = pd.read_excel(file_path, header=None).fillna('') valid_column_one = self.validate_annotation_column_one(df, dataset_id) valid_roles, qualifier_cols, variable_col_ids = self.validate_roles(df) valid_role_and_type = self.validate_roles_types(df) try: rename_columns = self.validate_qualifier_headers( df, qualifier_cols) rename_columns.extend( self.validate_variable_headers(df, variable_col_ids)) except IndexError: # Missing "data" in column one causes exception rename_columns = [] if not valid_role_and_type or not valid_column_one or not valid_roles: return json.dumps(self.error_report, indent=4), False, rename_columns return "", True, rename_columns def validate_roles(self, df): # 1. one main subject # 2. at least one time # 3. location annotation is optional # 4. at least one variable annotation must be present roles = list(df.iloc[1]) valid_roles = True main_subject_cols = [] variable_cols = [] variable_col_ids = [] time_cols = [] location_cols = [] qualifier_cols = [] for i, x in enumerate(roles): if x == 'main subject': main_subject_cols.append(utility.xl_col_to_name(i)) if x == 'variable': variable_cols.append(utility.xl_col_to_name(i)) variable_col_ids.append(i) if x == 'time': time_cols.append(utility.xl_col_to_name(i)) if x == 'location': location_cols.append(utility.xl_col_to_name(i)) if x == 'qualifier': qualifier_cols.append(utility.xl_col_to_name(i)) if len(variable_cols) == 0: valid_roles = False self.error_report.append( self.error_row( 'Annotation missing: variable', ROLE_ROW, -1, 'Annotation spreadsheet should have at least one column annotated as "variable"' )) if len(time_cols) == 0: valid_roles = False self.error_report.append( self.error_row( 'Annotation missing: time', ROLE_ROW, -1, 'Annotation spreadsheet should have at least one column annotated as "time"' )) if len(main_subject_cols) > 1: valid_roles = False self.error_report.append( self.error_row( 'Annotation invalid: main subject', ROLE_ROW, ','.join(main_subject_cols), 'Annotation spreadsheet can have at maximum one column annotated as "main subject". ' 'The following columns are annotated as main subject: {}'. format(','.join(main_subject_cols)))) if len(main_subject_cols) < 1: valid_roles = False self.error_report.append( self.error_row( 'Annotation invalid: main subject', ROLE_ROW, ','.join(main_subject_cols), 'Annotation spreadsheet should have at least one column annotated as "main subject".' )) return valid_roles, qualifier_cols, variable_col_ids def validate_roles_types(self, df): roles = list(df.iloc[1]) types = list(df.iloc[2]) invalid_roles = [] for i in range(1, len(roles)): if roles[i].strip().split( ';')[0] not in self.valid_roles and roles[i] != '': invalid_roles.append(utility.xl_col_to_name(i)) if invalid_roles: self.error_report.append( self.error_row( 'Roles in following column(s) are invalid: {}'.format( ','.join(invalid_roles)), ROLE_ROW, ','.join(invalid_roles), 'Valid roles are one of the following: {}'.format(','.join( self.valid_roles)))) return False valid_types = True for i in range(1, len(types)): r = roles[i].strip().split(';')[0] t = types[i].strip() if t == '' and r != '': self.error_report.append( self.error_row( 'Missing TYPE for role: {}'.format(r), TYPE_ROW, utility.xl_col_to_name(i), 'Please specify a valid type. Valid type for role: {} is one of the following: [{}]' .format(r, ','.join(self.valid_types[r])))) valid_types = False elif t != '' and r == '': self.error_report.append( self.error_row( 'Missing ROLE for type: {}'.format(t), ROLE_ROW, utility.xl_col_to_name(i), 'Specifying TYPE without a ROLE is invalid')) valid_types = False elif t != '': if r == 'time': if t not in self.valid_types[r] and not t.startswith('%'): self.error_report.append( self.error_row( 'Invalid type: {}, for the role: {}'.format( t, r), TYPE_ROW, utility.xl_col_to_name(i), 'Valid type for role: {}, is either one of [{}], OR a python date format ' 'regex(https://docs.python.org/3.7/library/datetime.html#strftime-and-strptime-behavior)' .format(r, ','.join(self.valid_types[r])))) valid_types = False else: if t not in self.valid_types[r]: self.error_report.append( self.error_row( 'Invalid type: {}, for the role: {}'.format( t, r), TYPE_ROW, utility.xl_col_to_name(i), 'Valid type for role: {}, is [{}]'.format( r, ','.join(self.valid_types[r])))) valid_types = False return valid_types def validate_annotation_column_one(self, df, dataset_id): valid_first_column = True dataset = df.iloc[0, 1].strip() if dataset == '': valid_first_column = False self.error_report.append( self.error_row('Please specify the dataset', 1, utility.xl_col_to_name(1), 'dataset can not be blank')) if dataset != dataset_id: valid_first_column = False self.error_report.append( self.error_row( 'Dataset ID in the file is not same as the Dataset ID in the URL', 1, utility.xl_col_to_name(1), 'Expected: {} but got:{}'.format(dataset_id, dataset))) if df.iloc[0, 0].strip().lower() != 'dataset': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 1, utility.xl_col_to_name(0), 'First row in column 1 should be "dataset"')) if df.iloc[1, 0].strip().lower() != 'role': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 2, utility.xl_col_to_name(0), 'Second row in column 1 should be "role"')) if df.iloc[2, 0].strip().lower() != 'type': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 3, utility.xl_col_to_name(0), 'Third row in column 1 should be "type"')) if df.iloc[3, 0].strip().lower() != 'description': valid_first_column = False self.error_report.append( self.error_row( 'Incorrect annotation: First Column', 4, utility.xl_col_to_name(0), 'Fourth row in column 1 should be "description"')) if df.iloc[4, 0].strip().lower() != 'name': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 5, utility.xl_col_to_name(0), 'Fifth row in column 1 should be "name"')) if df.iloc[5, 0].strip().lower() != 'unit': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 6, utility.xl_col_to_name(0), 'Sixth row in column 1 should be "unit"')) if df.iloc[6, 0].strip().lower() != 'tag': valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', 7, utility.xl_col_to_name(0), 'Seventh row in column 1 should be "tag"')) try: header_index = Utility.get_index(df.iloc[:, 0], Category.HEADER.value) except: valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', -1, utility.xl_col_to_name(0), 'Missing annotation "header" in column A')) try: data_index = Utility.get_index(df.iloc[:, 0], Category.DATA.value) except: valid_first_column = False self.error_report.append( self.error_row('Incorrect annotation: First Column', -1, utility.xl_col_to_name(0), 'Missing annotation "data" in column A')) return valid_first_column def validate_qualifier_headers(self, df, qualifier_cols): header_row, data_row = self.inhouse_utilty.find_data_start_row(df) rename_columns = [] for i, col in enumerate(df.iloc[header_row]): if utility.xl_col_to_name(i) in qualifier_cols: if col in self.reserved_qualifier_columns: # record the cell and renamed rename_columns.append( (header_row, i, self.rename_column(col))) return rename_columns def validate_variable_headers(self, df, variable_col_ids): header_row, data_row = self.inhouse_utilty.find_data_start_row(df) variable_dict = {} rename_columns = [] for vi in variable_col_ids: if df.iloc[header_row, vi] not in variable_dict: variable_dict[df.iloc[header_row, vi]] = [] variable_dict[df.iloc[header_row, vi]].append((df.iloc[4, vi], vi)) # For each unique header name for v in variable_dict: # if more than one header name with the same name if len(variable_dict[v]) > 1: # list of pairs, where each pair is annotated name and column index values = variable_dict[v] # maps annotated name to list columns that should have this name v_dict = {} for val in values: val_0 = val[0].strip() if val_0 not in v_dict: v_dict[val_0] = [] v_dict[val_0].append(val[1]) for k in v_dict: # List column indices that should have the same name vids = v_dict[k] # Rename column if needed # Should use k (the annotaed name) if k is not empty, right??? renamed_col = self.rename_column(df.iloc[header_row, vids[0]]) for vid in vids: rename_columns.append((header_row, vid, renamed_col)) return rename_columns def rename_column(self, col_name): # If the name has already been used rename it, otherwise return the same name if col_name not in self.renamed_columns: self.renamed_columns[col_name] = 0 else: self.renamed_columns[col_name] += 1 if self.renamed_columns[col_name] == 0: return '{}'.format(col_name) else: return '{}_{}'.format(col_name, self.renamed_columns[col_name]) @staticmethod def validate_number(num): try: float(num) except ValueError: return False return True @staticmethod def error_row(error, row, column, description): return { 'Error': error, 'Line Number': row, 'Column': column, 'Description': description }