def convert(matchobj):
    data_oid = int(matchobj.group(1))
    if matchobj.group(2) == "article":
        if data_oid in od:
            new_data_oid = od[data_oid]
        else:
            new_data_oid = 0
            log_me("No match for link: {}".format(matchobj.group(0)))
        return "<a data-oid=\"{0}\" data-otype=\"article\" href=\"/article/{0}/\"></a>".format(
            new_data_oid)
    elif matchobj.group(2) == "query":
        query_title = matchobj.group(4)
        if not queries.empty:
            q_match = queries.title == query_title
            if q_match.any():
                matching_queries = queries[q_match]
                # the anchor still points at the source instance; re-point it to the matching query here
                oid = (matching_queries.iloc[-1]).id  # -1 means last
                return "<a data-oid=\"{0}\" data-otype=\"query\" href=\"/query/{0}/\"></a>".format(
                    oid)
    elif matchobj.group(2) == "table":
        qual_name = matchobj.group(4).split()[0]
        tb = target.get_tables_by_name(qual_name)
        if not tb.empty:
            oid = tb.index[-1]
            return "<a data-oid=\"{0}\" data-otype=\"table\" href=\"/table/{0}/\"></a>".format(
                oid)
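# A hedged usage sketch (not from the original): convert() is shaped like a re.sub()
# replacement callback where group(1) is the old data-oid, group(2) the data-otype and
# group(4) a title or qualified name. The pattern below is hypothetical; the real one
# lives in the calling code. Because re.sub() needs a string back for every match, the
# wrapper falls back to the unmodified anchor whenever convert() returns None.
anchor_pattern = re.compile(
    r'<a data-oid="(\d+)" data-otype="(article|query|table)"[^>]*>(\s*)([^<]*)</a>')

def convert_or_keep(matchobj):
    return convert(matchobj) or matchobj.group(0)  # keep the original anchor if no mapping was found

# new_body = anchor_pattern.sub(convert_or_keep, old_body)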
def send_statement(self, statement, final=False):
    try:
        # log_me(statement)
        cursor = self.connection.cursor()
        if not cursor:
            raise DatabaseError("Connection error")
        cursor.execute(statement)
        self.connection.commit()
        cursor.close()
        return True
    except (Exception, DatabaseError) as error:
        log_me(f"Error while executing {statement}:\n{error}")
        # Remember all the types that are not supported by Postgres
        # by saving the type name in the dict `missing`
        pgerror = getattr(error, 'pgerror', None) or str(error)
        match = re.search(r'type "(\w+)" does not exist', pgerror)
        if match:
            if match.group(1) in missing:
                missing[match.group(1)] += 1
            else:
                log_me(error)
                missing[match.group(1)] = 1
        cursor.close()
        self.connection.rollback()
        return False
    finally:
        # closing database connection.
        if final:
            cursor.close()
            self.connection.close()
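# A hedged usage sketch: send_statement() is assumed to live on a small wrapper class that
# owns a psycopg2 connection and shares the module-level `missing` dict; the class name
# PgTarget and the statements below are made up to show the call pattern only.
# pg = PgTarget(...)  # wraps psycopg2.connect(...)
# statements = [
#     'COMMENT ON TABLE "public"."orders" IS \'Loaded nightly\'',
#     'COMMENT ON COLUMN "public"."orders"."id" IS \'Surrogate key\'',
# ]
# for i, stmt in enumerate(statements):
#     pg.send_statement(stmt, final=(i == len(statements) - 1))  # close the connection on the last one
# log_me(f"Types unknown to Postgres: {missing}")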
def __init__(self, name, parent=None):
    name = name.strip()
    self.name = name      # this is the short name
    self.parent = parent  # this is the long name of the parent
    reg = f'{self}'  # this is the long name, taking advantage of recursion (calls __repr__)
    # The external ID includes the BI Server ID and a random component, too;
    # that will be the key to the registry
    external_id = create_external_id(reg)
    if external_id not in Folder.registry:
        # ---- create BI folder -----------
        bif = [
            create_folder_object(name, external_id, parent_folder=parent)
        ]
        api = f'{bi_server_url}{bi_server}/folder/'
        # if parent:
        #     print(f'Creating "{parent}/{name}"')
        # else:
        #     print(f'Creating "{name}"')
        r = alation.generic_api_post(api, body=bif)
        if 'status' in r:
            if r['status'] == 'successful':
                log_me(f'{external_id}:{r["result"]}')
                # ---- keep track of what was created ----
                Folder.registry[external_id] = self
def check_sequence(self, first):
    # first is the ID of the first article (top of the hierarchy)
    # it will be the last to be created on the target...
    # We need to put the articles in a logical order.
    # We put the first in front, but we expect it to be pushed all the way
    # to the back by the time we are done.
    order = deque([first])
    # the to-do list is all articles except the first
    to_do_list = deque(self.article.index)
    to_do_list.remove(first)  # we have taken care of the first already
    while to_do_list:
        # get the right-most item
        last = order[-1]
        # we either move a child to the front or take the next item on the to-do list
        # do we have children?
        current_children = deque(self.article.children[last])
        while current_children:
            c = current_children.pop()
            try:
                # move to the top of the to-do list
                to_do_list.remove(c['id'])
                to_do_list.appendleft(c['id'])
            except ValueError:
                log_me(
                    f"WARNING --- Article {c['id']}/{c['title']} does not appear to be loaded."
                )
        order.append(to_do_list.popleft())  # next one
    return order
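# A self-contained illustration (invented, not from the original) of the ordering that
# check_sequence() produces: children are pulled to the front of the to-do list, so the
# result is a depth-first order in which every article is immediately followed by its
# descendants. SimpleNamespace stands in for the object that owns the `article` DataFrame.
from types import SimpleNamespace
import pandas as pd

demo_articles = pd.DataFrame(
    {
        'title': ['root', 'chapter', 'section', 'appendix'],
        'children': [
            [{'id': 20, 'title': 'chapter'}, {'id': 40, 'title': 'appendix'}],
            [{'id': 30, 'title': 'section'}],
            [],
            [],
        ],
    },
    index=[10, 20, 30, 40],
)
demo_self = SimpleNamespace(article=demo_articles)
# check_sequence(demo_self, first=10) -> deque([10, 20, 30, 40]) under these assumptions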
def convert_references(self):
    # First pass: create a DataFrame of target articles with
    # new articles that are being migrated or referenced.
    # All references to articles are "zero-ed out" - they will be re-calculated in the second pass.
    # The title gets saved in the title attribute of the anchor (safer)
    for a in self.article.itertuples():
        soup = BeautifulSoup(a.body, "html5lib")
        # Find all anchors
        match = soup.findAll('a')
        for m in match:
            # We only care about Alation anchors, identified by the attr data-oid
            if 'data-oid' in m.attrs:
                oid = m['data-oid']
                otype = m['data-otype']
                if otype == 'article':
                    try:
                        actual_title = self.article.at[int(oid), 'title']
                    except (KeyError, ValueError):
                        log_me(u"Warning! Ref to article not found {}->{}".format(
                            a.title, m.get_text()))
                        actual_title = m.get_text()
                    m.string = actual_title
                    m['data-oid'] = 0
                    del m['href']
                    m['title'] = actual_title
                    self.article.at[a.Index, 'body'] = soup.prettify()  # update the article body
                else:
                    # log_me(m)
                    m['data-oid'] = 0
                    del m['href']
                    m['title'] = m.get_text()
                    self.article.at[a.Index, 'body'] = soup.prettify()  # update the article body
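# A small before/after illustration (assumptions: html5lib is installed and the markup
# mirrors what Alation emits for article anchors). It shows the effect of this pass on a
# single anchor: the oid is zeroed, the href dropped, and the title kept as an attribute.
from bs4 import BeautifulSoup

snippet = '<a data-oid="42" data-otype="article" href="/article/42/">Getting Started</a>'
anchor = BeautifulSoup(snippet, "html5lib").find('a')
anchor['data-oid'] = 0
del anchor['href']
anchor['title'] = anchor.get_text()
# str(anchor) now renders with data-oid="0", a title="Getting Started" attribute and no href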
def update_datasource(alation_instance, ds_id, schemas, warnings=None):
    if warnings:
        # get all flags to see if we need to append our warnings
        flags_raw = alation_instance.generic_api_get(
            api=f"/integration/flag/?oid={ds_id}&otype=data", official=True)
        if flags_raw:
            existing_warning_text = ""
            existing_warning_id = None
            for flag in flags_raw:
                # there can be at most one warning
                if flag.get('flag_type') == 'WARNING':
                    existing_warning_text = flag.get('flag_reason')
                    existing_warning_id = flag.get('id')
            new_warning_text = existing_warning_text + "Missing tables: " + ", ".join(warnings)
            # There is a warning already -- just append and hope the admin will take action
            # before the warning gets too long to display in Alation
            if existing_warning_id:
                update_flag = alation_instance.generic_api_put(
                    api=f"/integration/flag/{existing_warning_id}/",
                    body=dict(flag_reason=new_warning_text),
                    official=True)
            else:
                # create a new warning flag
                new_flag = alation_instance.generic_api_post(
                    api=f"/integration/flag/",
                    body=dict(flag_type="WARNING",
                              subject=dict(id=int(ds_id), otype="data"),
                              flag_reason=new_warning_text),
                    official=True)
        else:
            # create the very first flag, namely the warning
            new_flag = alation_instance.generic_api_post(
                api=f"/integration/flag/",
                body=dict(flag_type="WARNING",
                          subject=dict(id=int(ds_id), otype="data"),
                          flag_reason="Missing tables: " + ", ".join(warnings)),
                official=True)
    """
    cron_extraction          Yes  The extraction schedule in crontab format
                                  (minute, hour, day of month, month of year, day of week)
    disable_auto_extraction  No   True if the extraction schedule should not be executed,
                                  false to run extraction according to cron_extraction
    limit_schemas            Yes  Schemas to include.
    exclude_schemas          Yes  Schemas to exclude.
    remove_filtered_schemas  Yes  Whether to remove filtered schemas.
    """
    params = dict(force_refresh=True)
    log_me("Running MDE")
    mde = alation_instance.generic_api_post(api=f'/data/{ds_id}/list_schemas/')
    mde = alation_instance.generic_api_get(
        api=f'/integration/v1/datasource/{ds_id}/available_schemas/',
        params=params, official=True)
    body = dict(cron_extraction="{r} 0 * * *".format(r=random.randint(0, 59)),
                disable_auto_extraction=False,
                limit_schemas=[],
                exclude_schemas=['pg_temp_1', 'pg_toast', 'pg_toast_temp_1', 'public'],
                remove_filtered_schemas=True)
    sync = alation_instance.generic_api_put(
        api=f'/integration/v1/datasource/{ds_id}/sync_configuration/metadata_extraction/',
        body=body, official=True)
    mde = alation_instance.generic_api_post(api=f'/data/{ds_id}/extract_now/',
                                            params=params, official=False)
    return mde
def modify_attribute(attr):
    for pattern, replacement in substitutions.items():
        match = re.search(pattern, attr, flags=re.IGNORECASE)
        if match:
            # Remember the substitution and print it for debugging
            if match.group(0) not in seen:
                log_me(f'{match.group(0)} -> {replacement}')
                seen[match.group(0)] = True
            attr = replacement
            break
    return attr
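# A hedged usage sketch: `substitutions` (defined with the main program further down) maps
# source type names to Postgres-friendly types via regexes, and `seen` is a plain dict used
# only to de-duplicate the debug log. The column types below are invented examples.
# seen = {}
# modify_attribute('number(38,0)')   # -> 'numeric'
# modify_attribute('timestamp_ntz')  # -> 'timestamp'
# Types that match no pattern are returned unchanged.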
def bulk_api_body(self):
    log_me("Creating Body for Bulk API")
    body = ""
    # Iterate through all the articles
    for id, article in self.article.iterrows():
        new_row = dict(description=article['body'], key=article['title'])
        # Iterate through the custom fields (caller could have sent fewer)
        for field in article['custom_fields']:
            if field['value_type'] in ['picker', 'multi_picker', 'date', 'rich_text']:
                new_row[field['field_name']] = field['value']
            else:
                # In the case of Object Sets and People Sets, this may not be any good
                log_me(f"Warning: {field['field_name']}/{field['value_type']}/{field['value']}")
                new_row[field['field_name']] = {field['value_type']: field['value']}
        body = body + json.dumps(new_row) + '\n'
    return body
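# What one line of the returned body looks like: the bulk API expects newline-delimited
# JSON, one object per article keyed by its title. The article and the "Status" picker
# field below are invented for illustration; non-picker field types get wrapped in a
# {value_type: value} object as a best guess (see the warning in the code above).
example_row = dict(description='<p>Sales conformed dimension</p>',
                   key='Dim Customer',
                   Status='Approved')  # a picker custom field passes its value through
# json.dumps(example_row) ->
# {"description": "<p>Sales conformed dimension</p>", "key": "Dim Customer", "Status": "Approved"}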
# dimension_articles_ = dimension_articles_.apply(target.postArticle, axis=1)

# For one of the dimensions, "View", we will create an article as the parent for all
# the articles that use that "View"
c_fields = target.put_custom_fields(custom_fields_pd)
print(c_fields)  # returns a list of field IDs (existing or new)
target.put_custom_template(file_key, c_fields)

n = 100
s = dtv.shape[0]
j = math.floor(s / n) + 1  # how many blocks of 100?
for b in range(j):
    log_me("Starting block {} of {} - total {}".format(b, n, s))
    body = ""
    for i in range(n):
        if i + b * n >= s:
            break
        art = dtv.iloc[i + b * n]
        art_not_na = art[art.notna()]
        art_as_dict = dict(art_not_na)
        new = {}
        table_in_body = []
        for k, v in art_as_dict.items():
            # Let's create a row in a table for these values
            table_in_body.append(add_table_row(k, v))
            # If the field is a picker, let's populate the field value
            if k in pickers and pickers[k] > 1:
dd = pickle_cont['dd']
allArticles = pickle_cont['article']
queries = pickle_cont['queries']
allTemplates = pickle_cont['template']
custom_fields = pickle_cont['custom_fields']

# --- Log into the target instance
url_2 = args['host']
user_2 = args['username']
passwd_2 = args['password']
delete_flag = args['delete']
target = AlationInstance(url_2, user_2, passwd_2)

if delete_flag:
    a = target.get_articles(template=desired_template)
    log_me('Deleting existing articles: {}'.format(a.id))
    a.id.apply(target.del_article)

Art = Article(allArticles)  # convert to Article class
templates = target.get_templates()
template_id = int(templates[templates.title == desired_template]['id'])
target.put_queries(queries=queries)
queries = target.get_queries()  # this is so we can figure out the IDs on the target
order = check_sequence(allArticles, first=1889)
dummy = target.post_article(
    dict(title="dummy {}".format(
        time.strftime(u"%Y-%b-%d %H:%M:%S", time.localtime())),
         body='Delete this afterwards'))
# Rename certain columns
df.rename(columns=mapper, inplace=True)
# Only keep the columns mentioned in the mapper
cols = list(mapper.values())
df = df.loc[:, cols]

# convert dataframe into JSON rows format
# (a sketch of a possible json_row helper follows this block)
jsr = "\n".join(list(df.apply(json_row, axis=1)))
desired_template = config.desired_template
alation_1.put_articles_2(jsr, desired_template)

log_me("Getting desired articles")
articles = alation_1.get_articles(template=desired_template)  # download all articles
Art = Article(articles)  # convert to Article class
# First pass of fixing references
# Art.convert_references()
templates = alation_1.get_templates()  # download all templates (with their custom fields)
custom_fields = alation_1.get_custom_fields_from_template(
    desired_template)  # this way we also get the template info
# Next, we put the objects we want. We need to start with the custom fields, then the
# template, then the articles, and finally the glossaries.
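# json_row is not shown in this section; below is a minimal sketch of what such a helper
# might look like, assuming it turns one DataFrame row into one line of the newline-
# delimited JSON payload that put_articles_2 expects (json and pandas as pd are imported
# elsewhere in this script). Treat it as illustrative, not as the actual implementation.
def json_row_sketch(row):
    # drop NaNs so optional custom fields are simply omitted
    return json.dumps({k: v for k, v in row.items() if pd.notna(v)})

# jsr = "\n".join(df.apply(json_row_sketch, axis=1))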
random_users_2 = [dict(otype='user', oid=u) for u in list(users.id)]

# Create a BI Server, by passing a list of 1 URI
bi_server_details = [{
    "uri": "https://alation.looker.com/browse",
    "title": f"My BI Server {file_key}"
}]
bi_server_url = '/integration/v2/bi/server/'
# bi_server will be populated properly by this...
r = alation.generic_api_post(api=bi_server_url, body=bi_server_details)
# {'Status': 'Success: Created 1 servers.', 'Count': 1, 'Errors': [None], 'Server IDs': [48]}
if r['Count'] == 1:
    bi_server = r['Server IDs'][0]
    # alation.update_custom_field(o_type='bi_server', o_id=bi_server, field_id=3, update=file_key)
    log_me(f'Created server {file_key}: {base_url}/bi/v2/server/{bi_server}/')
else:
    log_me(f"Expected one BI Server to be created: {r}")
    # bi_server = 2

# =========== Handling of the input file from Customer, containing reports in folders ===============
report_df = pd.read_csv('~/Downloads/mysql-analytics_run_1_stmt_1_0 (3).csv', sep=',')
report_df.index = report_df.id

# using global variables now, not clean
def create_external_id(folder):
    if folder:
        return f'{file_key}+{bi_server}+{folder}'
def create_pdf(self, first, additional_html=''):
    now = datetime.datetime.now()
    # Use pdfkit to create the final ABOK pdf file
    # Options for PDFKit (wkhtmltopdf really) to generate the pdf -
    # https://wkhtmltopdf.org/usage/wkhtmltopdf.txt
    bodyoptions = {
        'page-size': 'Letter',
        'footer-line': '',
        'footer-center': 'For use only by Alation customers. No duplication or transmission without permission.',
        'footer-font-size': '9',
        # 'disable-internal-links': True,
        # 'disable-external-links': True,
        'dpi': '300',
        'minimum-font-size': '12',
        'disable-smart-shrinking': '',
        'header-left': 'Alation Book of Knowledge' + now.strftime(u" %Y-%m-%d %H:%M "),
        'header-line': '',
        'header-font-size': '9',
        'header-spacing': '4',
        'margin-bottom': '15',
        'margin-top': '15',
        'footer-spacing': '4',
        'margin-left': '10',
        'margin-right': '10',
        'footer-right': '[page]/[toPage]',
        'enable-toc-back-links': '',
        'outline': '',
        'quiet': ''
    }
    # Define the location of the created ABOK pdf file
    ABOKpdffilename = 'ABOK' + now.strftime(u" %Y-%b-%d %H_%M ") + '.pdf'
    seq = self.check_sequence(first)
    html = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' + \
           '<link rel="stylesheet" href="https://use.typekit.net/pno7yrt.css">' + \
           '<link href="alation.css" rel="stylesheet" type="text/css">'
    for i in seq:
        html = html + '<h1>' + self.article.title[i] + '</h1></p>'
        html = html + self.article.body[i] + '</p>'
    html2 = abok.clean_up(html)
    html2 = html2 + additional_html
    pdfkit.from_string(html2,
                       ABOKpdffilename,
                       options=bodyoptions,
                       css="alation.css",
                       cover='cover.html',
                       cover_first=True)
    log_me('pdfkit finished processing')
if part2:
    statement = f'COMMENT ON {otype} "{part1}"."{part2}"' + f" IS '{comment_text}'"
else:
    statement = f'COMMENT ON {otype} "{part1}"' + f" IS '{comment_text}'"
self.send_statement(statement)

def get_existing_schemas(self):
    return pd.read_sql(
        "SELECT catalog_name, schema_name, schema_owner FROM information_schema.schemata",
        con=self.connection)

# Main program
if __name__ == "__main__":
    desc = "Copies physical metadata from rosemeta to another pgSQL"
    log_me("Reading data from pickle file")
    data = pd.read_pickle("rosemeta.gzip")
    # Replace non-postgres types with equivalent postgres types
    substitutions = {
        r'^(small)?datetime': 'timestamp',
        r'^timestamp_ltz': 'timestamp with time zone',
        r'^(timestamp_ntz|smalltimestamp)': 'timestamp',
        r'^string': 'text',
        r'^text\(max\)': 'text',
        r'^(long|medium|short)?text(\(\d+\))?': 'text',
        r'^(number|double|float|numeric)(\(\d+\))?': 'numeric',
        r'^integer(\d)?': 'integer',
        r'^(big|small)?integer(\d)?': 'integer',
        r'^(big|small)?integer(\(\d+\))?': 'integer',
        r'^(big|small)?_integer(\d)?': 'integer[]',  # int with (digit)
random_users_2 = [dict(otype='user', oid=u) for u in list(users.id)] # Create a BI Server, by passing a list of 1 URI bi_server_details = [{"uri": "https://alation.looker.com/browse"}] bi_server_url = '/integration/v2/bi/server/' # bi_server will be populated properly by this... r = alation.generic_api_post(api=bi_server_url, body=bi_server_details) # {'Status': 'Success: Created 1 servers.', 'Count': 1, 'Errors': [None], 'Server IDs': [48]} if r['Count'] == 1: bi_server = r['Server IDs'][0] alation.update_custom_field(o_type='bi_server', o_id=bi_server, field_id=3, update=file_key) log_me(f'Created server {file_key}: {base_url}/bi/v2/server/{bi_server}/') else: log_me(f"Expected one BI Server to be created: {r}") #bi_server = 123 # =========== Handling of the input file from Customer, containing reports in folders =============== report_df = pd.read_csv('reports_full.csv', sep=';') report_df.index = report_df.ID # using global variables now, not clean def create_external_id(folder): if folder: return f'{file_key}+{bi_server}+{folder}' # else it the root folder which does not need an external ID
        official=True)
    tables = tables.decode().split('\n')
    elems = []
    for t in tables:
        if len(t) > 1:
            elems.append(json.loads(t))
    tables_pd = pd.DataFrame(elems)
    tables_pd.index = tables_pd.key
    del tables_pd['key']
    return tables_pd

df1 = get_values('steward')
df2 = get_values('some multi')
df3 = df1.merge(df2, left_index=True, right_index=True)

log_me("Getting desired articles")
articles = alation_1.get_articles(template=desired_template)  # download all articles
Art = Article(articles)  # convert to Article class
# queries = alation_1.get_queries()
# First pass of fixing references
# target.put_queries(queries=queries)
# Art.convert_references()
log_me("Getting media files via download")
list_of_files = list(Art.get_files())
alation_1.get_media_file(list_of_files, config.base_path)
extract_files(config.base_path)
log_me("Creating PDF")
p = pickle.Unpickler(mypickle)
pickle_cont = p.load()
# extract data dictionary, articles, queries, templates, custom fields from the pickle
dd = pickle_cont['dd']
articles = pickle_cont['article']
queries = pickle_cont['queries']

# --- Log into the target instance
target = AlationInstance(config.args['host'], config.args['username'], config.args['password'])

# -- Make sure the ABOK Article template is created
template_id = target.put_custom_template('ABOK Article')

# If desired, delete all pre-existing ABOK articles.
if config.args['delete']:
    a = target.get_articles(template=config.desired_template)
    log_me('Deleting existing articles: {}'.format(a.id))
    a.id.apply(target.del_article)

# Upload all queries to the instance. Note we implicitly assume here that the only
# references are to existing objects, e.g. AA tables
target.put_queries(queries=queries)
queries = target.get_queries()  # this is so we can figure out the ID

# to-do: check sequence before pickling! Then we can simplify this code even more
order = check_sequence(articles, first=config.first_abok_article)  # order is a list of IDs
n = len(order)
offset = use_dummy_to_get_highest_id() + 1
# if the order is set beforehand, the mapping dict is no longer required; we would just add the offset
mapping_dict = {}