def __init__(self, base_path, generate_engine_json=True):
    if base_path:
        # binds the context to the nodes in use to avoid a context singleton
        class node_class(Node.Node):
            pass

        self.node_class = node_class
        self.node_class.__module__ = "waflib.Node"
        self.node_class.__name__ = "Nod3"
        self.node_class.ctx = self

        self.root = self.node_class('', None)
        self.base_node = self.root.make_node(base_path)
        self.srcnode = self.base_node.make_node('dev')
        self.bintemp_node = self.srcnode.make_node('BinTemp')
        self.path = self.srcnode.make_node('Code')
        self.bldnode = self.bintemp_node

        if generate_engine_json:
            default_engine_json = {
                "FileVersion": 1,
                "LumberyardVersion": "0.0.0.0",
                "LumberyardCopyrightYear": 2019
            }
            utils.write_json_file(default_engine_json, os.path.join(base_path, 'engine.json'))

    self.env = {}
    self.project_overrides = {}
    self.file_overrides = {}
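# Illustrative sketch (not from the source): the snippet above calls
# utils.write_json_file(obj, path); a minimal helper with that behaviour might look
# like the function below. Note that other snippets in this collection pass the path
# first (write_json_file(path, obj)), so the argument order is codebase-specific and
# this sketch only matches the (obj, path) variant.
import json

def write_json_file_sketch(obj, path, indent=4):
    # Serialize obj to JSON and write it to path (assumed behaviour of the helper)
    with open(path, 'w') as out_file:
        json.dump(obj, out_file, indent=indent)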
def sort_module_name_list():
    global auto_bump
    global ballerina_version_regex

    try:
        name_list = utils.read_json_file(constants.MODULE_LIST_FILE)
    except Exception as e:
        print('Failed to read module_list.json', e)
        sys.exit()

    name_list['standard_library'].sort(key=lambda x: x['name'].split('-')[-1])

    try:
        utils.write_json_file(constants.MODULE_LIST_FILE, name_list)
    except Exception as e:
        print('Failed to write to file module_list.json', e)
        sys.exit()

    name_list['standard_library'].append({'name': 'ballerina-distribution'})

    auto_bump = name_list['auto_bump']
    ballerina_version_regex = name_list['lang_version_substring']

    return name_list['standard_library']
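# Hypothetical example (derived only from the keys read above) of the module_list.json
# shape that sort_module_name_list() expects; the module names and values are
# placeholders, not taken from the real file.
example_module_list = {
    "standard_library": [
        {"name": "module-ballerina-io"},      # sorted by the segment after the last '-'
        {"name": "module-ballerina-http"},
    ],
    "auto_bump": True,
    "lang_version_substring": "<version-substring>",
}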
def write_updated_objects_to_file(self, objects_retrieved, objects_removed):
    # Default json
    updated_objects = {
        'retrieved': [],
        'removed': [],
    }

    # objects_retrieved is a dict. Loop through it, build the full path and append
    for obj in objects_retrieved:
        obj_abs_path = os.path.join(self.paths['dest_sync'], obj)
        updated_objects['retrieved'].append(obj_abs_path)

    # objects_removed is an array. Extend array
    updated_objects['removed'].extend(objects_removed)

    # Write updated_objects file to disk
    filename = '%s_updated_objects.json' % self.shelf
    if self.shelf == 'default':
        filename = 'all_updated_objects.json'
    updated_objects_path = os.path.join(self.paths['scripts'], filename)
    utils.write_json_file(updated_objects_path, updated_objects)
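# Hypothetical example of the <shelf>_updated_objects.json payload written above,
# inferred from how updated_objects is built; the paths shown are placeholders.
example_updated_objects = {
    "retrieved": ["/path/to/dest_sync/object_a", "/path/to/dest_sync/object_b"],
    "removed": ["/path/to/dest_sync/object_c"],
}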
def set_geo_metadata_to_dataframe(dataframe):
    """
    Iterate over all cities in the dataframe, then add geo metadata to all of them
    """
    geolocator = get_goog_geolocator(GEOCODE_API_KEY)
    cached_json = read_json_file(GEOCODE_CACHED_JSON_FILENAME)
    add_empty_columns(dataframe, ['latitude', 'longitude', 'reverse_address'])
    api_count = 0
    # pylint: disable=W0612
    for index, row in dataframe.iterrows():
        search_query = '{}, {}, USA'.format(row['city'], row['state'])
        # make sure all 3 values are in the cache...
        if search_query in cached_json:
            # add to dataframe from cache
            city_geo_dict = cached_json[search_query]
            row[list(city_geo_dict.keys())] = list(city_geo_dict.values())
            continue
        location = geolocator.geocode(search_query)
        reverse_address = get_reverse_address(geolocator, location)
        set_geo_metadata_to_dataframe_row(row, location, reverse_address)
        set_geo_metadata_to_dict(cached_json, location, reverse_address, search_query)
        api_count += 2  # two api hits per loop, 1 for geocode, 1 for reverse address
        time.sleep(1)
        if api_count % 50 == 0:
            print('API count: ', str(api_count))
            write_json_file(GEOCODE_CACHED_JSON_FILENAME, cached_json)
    write_json_file(GEOCODE_CACHED_JSON_FILENAME, cached_json)
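# Hypothetical example of one geocode cache entry keyed by the search query,
# inferred from the three columns added above ('latitude', 'longitude',
# 'reverse_address'); the coordinates and address are placeholders.
example_geocode_cache = {
    "Austin, TX, USA": {
        "latitude": 30.2672,
        "longitude": -97.7431,
        "reverse_address": "123 Example St, Austin, TX 78701, USA",
    }
}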
def write_json_frequency_products():
    result = list()
    for prod in products:
        total = frequency_product(categories, prod[ID], key=PRODUCTOS)
        total += frequency_product(products, prod[ID], key=SUBPRODUCTOS)
        result.append({prod['name']: total})
    write_json_file('datos/frecuencia_productos.json', result)
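# Hypothetical example of the datos/frecuencia_productos.json output written above:
# a list with one {product name: combined frequency} entry per product; the names
# and counts are placeholders.
example_frecuencia_productos = [
    {"Producto A": 3},
    {"Producto B": 0},
]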
def generate_json_ordered():
    homeria_sorted = {
        CATEGORIAS: sorted_array(categories, ORDEN, default=NINETYNINE),
        PRODUCTOS: sorted_array(products, ORDEN, default=NINETYNINE)
    }
    write_json_file('datos/homeria_sorted.json', homeria_sorted)
    write_json_file(
        'datos/homeria_categories_sorted.json',
        sorted_array(categories, 'nombre', default='')
    )
def write_diff_file(self, object_metadata):
    """
    write_diff_file(self, object_metadata)

    Write a json file containing metadata about an object.
    """
    # Write diff file to git repo
    path, filename = os.path.split(
        os.path.join(self.paths['shelves'], list(object_metadata.keys())[0]))
    diff_file = '%s/%s.pitem' % (path, filename)
    utils.write_json_file(diff_file, object_metadata)
def start_crawl(cnt=28, write_file=False, show_output=True):
    if cnt > 28:
        cnt = 28
    try:
        if write_file:
            dict_list = []
        landing_page = utils.crawl_page(URL)
        data_rows = landing_page.findAll('tr', {"class": ["gr", "gr_even"]})
        print('PTWC (Pacific Tsunami Warning Center) (Past 30 days)')
        print('URL:', URL)
        for idx, row in enumerate(data_rows):
            if idx >= cnt:
                break
            datum_dict = {
                "time": row.findAll('td')[0].text,
                "region": row.findAll('td')[1].text,
                "type": row.findAll('td')[2].text,
                "details_link": URL + row.findAll('td')[4].findAll('a')[1]['href']
            }
            details_page = utils.crawl_page(datum_dict['details_link']).find('body').text
            evaluation_re = r'EVALUATION(\r\n|\r|\n){2}([ \w.]+(\r\n|\r|\n))+(\r\n|\r|\n)'
            evaluation_match = re.search(evaluation_re, details_page)
            if evaluation_match:
                replace_dict = {"EVALUATION": '', "\r": '', "\n": '', "\t": ''}
                evaluation_match = utils.replace_all(evaluation_match.group(0), replace_dict)
                datum_dict['evaluation'] = evaluation_match
            else:
                print('NO EVALUATION FOUND')
            if show_output:
                utils.print_dict(datum_dict)
            if write_file:
                dict_list.append(datum_dict)
        if write_file:
            utils.write_json_file(WEBSITE, dict_list)
    except Exception as e:
        print('err:', str(e))
def main():
    module_name_list = sort_module_name_list()
    print('Fetched module name list')

    module_details_json = initialize_module_details(module_name_list)
    print('Initialized module details')

    module_details_json = get_immediate_dependents(module_name_list, module_details_json)
    print('Fetched immediate dependents of each module')

    module_details_json = calculate_levels(module_name_list, module_details_json)
    print('Generated module dependency graph and updated module levels')

    module_details_json['modules'].sort(key=lambda s: s['level'])
    module_details_json = remove_modules_not_included_in_distribution(module_details_json)
    print('Removed central only modules and updated the list')

    try:
        utils.write_json_file(constants.EXTENSIONS_FILE, module_details_json)
    except Exception as e:
        print('Failed to write to extensions.json', e)
        sys.exit()
    print('Updated module details successfully')

    try:
        updated_file_content = open(constants.EXTENSIONS_FILE, 'r').read()
        update = utils.commit_file(
            'ballerina-release', constants.EXTENSIONS_FILE, updated_file_content,
            constants.EXTENSIONS_UPDATE_BRANCH,
            '[Automated] Update Extensions Dependencies')[0]
        if update:
            utils.open_pr_and_merge(
                'ballerina-release',
                '[Automated] Update Extensions Dependencies',
                'Update dependencies in extensions.json',
                constants.EXTENSIONS_UPDATE_BRANCH)
        else:
            print('No changes to ' + constants.EXTENSIONS_FILE + ' file')
    except GithubException as e:
        print('Error occurred while committing extensions.json', e)
        sys.exit(1)

    print("Updated module details in 'ballerina-release' successfully")
def store_vs_version_to_cache(conf, vs_version, windows_kit, fingerprint, versions):
    """
    Store the version tuples for a visual studio environment to the environment.json file

    :param conf:        Configuration Context
    :param vs_version:  The visual studio version the cache entry is being stored for
    :param windows_kit: The windows kit value to store to the cache
    :param fingerprint: The current input fingerprint to compare against any cached fingerprint, if any
    :param versions:    The result array of version tuples to store in the cache
    """
    try:
        cache_path = os.path.join(conf.bldnode.abspath(), BINTEMP_CACHE_TOOLS)
        if not os.path.isdir(cache_path):
            os.makedirs(cache_path)
        environment_json_path = os.path.join(cache_path, CACHED_TOOL_ENVIRONMENT_FILE)

        if os.path.exists(environment_json_path):
            environment_json = parse_json_file(environment_json_path)
        else:
            environment_json = {}

        if 'vs_compilers' not in environment_json:
            vs_compilers = {}
            environment_json['vs_compilers'] = vs_compilers
        else:
            vs_compilers = environment_json.get('vs_compilers')

        ver_winkit_key = _make_vsversion_winkit_key(vs_version, windows_kit)
        if ver_winkit_key not in vs_compilers:
            vs_compiler_setting = {}
            vs_compilers[ver_winkit_key] = vs_compiler_setting
        else:
            vs_compiler_setting = vs_compilers.get(ver_winkit_key)

        vs_compiler_setting['fingerprint'] = fingerprint
        vs_compiler_setting['versions'] = versions

        write_json_file(environment_json, environment_json_path)
    except Exception as err:
        conf.warn_once('Unable to use visual studio environment cache. Will run msvc tool detection scripts. ({})'.format(err.message or err.msg))
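# Hypothetical example of the environment.json layout produced by
# store_vs_version_to_cache(), inferred from the code above; the key format and the
# fingerprint/version values are placeholders.
example_environment_json = {
    "vs_compilers": {
        "<vs_version>_<windows_kit>": {
            "fingerprint": "<input fingerprint>",
            "versions": ["<version tuple>", "..."],
        }
    }
}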
def start_crawl(cnt=10000, write_file=False, show_output=True):
    try:
        news_end_points = crawl_news_end_point(cnt, write_file, show_output)
        # news_end_points = ['/echo/news/sahel-crisis-eu-gives-%E2%82%AC142-million-humanitarian-aid-2014_en']
        if write_file:
            dict_list = []
        cnt_down = len(news_end_points)
        for end_point in news_end_points:
            page = utils.crawl_page(URL + end_point)

            date_re = r'(\d){2}/(\d){2}/(\d){4}'
            publication_date = page.find('div', {
                "class": "row c_left field field-field_news_publication_date last"
            }).text
            publication_date = re.search(date_re, publication_date).group(0)

            image_url = page.find('div', {"class": "field-item even"}).img
            if image_url:
                image_url = image_url['src'].strip()
            else:
                image_url = ''

            datum_dict = {
                "title": page.find('h1', {"class": "title"}).text.strip(),
                "image": image_url,
                "content": page.find('div', {"class": "row c_left field field-body"}).text.strip(),
                "publication_date": publication_date.strip()
            }

            if show_output:
                utils.print_dict(datum_dict)
            print('cnt left:', cnt_down)
            cnt_down = cnt_down - 1
            if write_file:
                dict_list.append(datum_dict)
        if write_file:
            utils.write_json_file(WEBSITE, dict_list)
    except Exception as e:
        print('err:', e)
def main():
    global lang_version
    global extensions_file
    global all_modules
    global current_level_modules

    try:
        extensions_file = utils.read_json_file(constants.EXTENSIONS_FILE)
    except Exception as e:
        print('[Error] Error while loading modules list ', e)
        sys.exit(1)

    print("Workflow invoked of type '" + event_type + "'")
    if event_type == 'schedule' and not extensions_file['auto_bump']:
        print("Schedule workflow invoked, exiting script as 'auto_bump' flag in modules_list.json is false.")
        return

    if override_ballerina_version != '':
        lang_version = override_ballerina_version
    else:
        lang_version = utils.get_latest_lang_version()

    bal_version = {'version': lang_version}
    try:
        utils.write_json_file(constants.LANG_VERSION_FILE, bal_version)
    except Exception as e:
        print('Failed to write to file latest_ballerina_lang_version.json', e)
        sys.exit()

    try:
        updated_file_content = open(constants.LANG_VERSION_FILE, 'r').read()
        update = utils.commit_file(
            'ballerina-release', constants.LANG_VERSION_FILE, updated_file_content,
            constants.EXTENSIONS_UPDATE_BRANCH,
            '[Automated] Update Workflow Lang Version')[0]
        if update:
            utils.open_pr_and_merge(
                'ballerina-release',
                '[Automated] Update Dependency Bump Workflow Triggered Version',
                'Update bumped ballerina lang version',
                constants.EXTENSIONS_UPDATE_BRANCH)
        else:
            print('No changes to ' + constants.LANG_VERSION_FILE + ' file')
    except GithubException as e:
        print('Error occurred while committing latest_ballerinalang_version.md', e)
        sys.exit(1)

    print('Workflow started with Ballerina Lang version : ' + lang_version)

    all_modules = extensions_file['modules']
    last_level = all_modules[-1]['level']

    print('Start dependency bump to extensions packed in ballerina-distribution')
    for i in range(last_level):
        current_level = i + 1
        current_level_modules = list(filter(lambda s: s['level'] == current_level, all_modules))
        for idx, module in enumerate(current_level_modules):
            print("[Info] Check lang dependency in module '" + module['name'] + "'")
            update_module(idx, current_level)
        if auto_merge_pull_requests.lower() == 'true':
            wait_for_current_level_build(current_level)
    print('Successfully bumped dependencies in extensions packed in ballerina-distribution')

    central_module_level = extensions_file['central_modules'][-1]['level']

    print('Start dependency bump to extensions available only in central')
    for j in range(last_level, central_module_level):
        current_level = j + 1
        current_level_modules = list(filter(lambda s: s['level'] == current_level, extensions_file['central_modules']))
        for idx, module in enumerate(current_level_modules):
            print("[Info] Check lang dependency in module '" + module['name'] + "'")
            update_module(idx, current_level)
        if auto_merge_pull_requests.lower() == 'true':
            wait_for_current_level_build(current_level)
    print('Successfully bumped dependencies in extensions available in central')
def read_file_list(bld, file):
    """
    Read and process a file list file (.waf_file) and manage duplicate files and possible globbing patterns
    to prepare the list for ingestion by the project

    :param bld:     The build context
    :param file:    The .waf_file file list to process
    :return:        The processed list file
    """
    if not os.path.isfile(os.path.join(bld.path.abspath(), file)):
        raise Errors.WafError("Invalid waf file list file: {}. File not found.".format(file))

    # Manage duplicate files and glob hits
    dup_set = set()
    glob_hits = 0

    waf_file_node = bld.path.make_node(file)
    waf_file_node_abs = waf_file_node.abspath()
    base_path_abs = waf_file_node.parent.abspath()

    if not os.path.exists(waf_file_node_abs):
        raise Errors.WafError('Invalid WAF file list: {}'.format(waf_file_node_abs))

    def _invalid_alias_callback(alias_key):
        error_message = "Invalid alias '{}' specified in {}".format(alias_key, file)
        raise Errors.WafError(error_message)

    def _alias_not_enabled_callback(alias_key, roles):
        required_checks = utils.convert_roles_to_setup_assistant_description(roles)
        error_message = "3rd Party alias '{}' specified in {} is not enabled. Make sure that at least one of the " \
                        "following items is checked in SetupAssistant: [{}]".format(alias_key, file, ', '.join(required_checks))
        raise Errors.WafError(error_message)

    def _determine_vs_filter(input_rel_folder_path, input_filter_name, input_filter_pattern):
        """
        Calculate the vs filter based on the resulting relative path, the input filter name,
        and the pattern used to derive the input relative path
        """
        vs_filter = input_filter_name

        if len(input_rel_folder_path) > 0:
            # If the resulting relative path has a subfolder, then base the filter on the following conditions
            if input_filter_name.lower() == 'root':
                # This is the root folder, use the relative folder subpath as the filter
                vs_filter = input_rel_folder_path
            else:
                # This is a named filter, the filter will place all results under this filter
                pattern_dirname = os.path.dirname(input_filter_pattern)
                if len(pattern_dirname) > 0:
                    if input_rel_folder_path != pattern_dirname:
                        # Strip out the base of the filter name
                        vs_filter = input_filter_name + '/' + input_rel_folder_path.replace(pattern_dirname, '')
                    else:
                        vs_filter = input_filter_name
                else:
                    vs_filter = input_filter_name + '/' + input_rel_folder_path

        return vs_filter

    def _process_glob_entry(glob_content, filter_name, current_uber_dict):
        """
        Process a glob content from the input file list
        """
        if 'pattern' not in glob_content:
            raise Errors.WafError('Missing keyword "pattern" from the glob entry')

        original_pattern = glob_content.pop('pattern').replace('\\', '/')
        if original_pattern.startswith('@'):
            ALIAS_PATTERN = re.compile('@.*@')
            alias_match = ALIAS_PATTERN.search(original_pattern)
            if alias_match:
                alias = alias_match.group(0)[1:-1]
                pattern = original_pattern[len(alias) + 2:]
                if alias == 'ENGINE':
                    search_node = bld.path
                else:
                    search_node = bld.root.make_node(bld.ThirdPartyPath(alias))
            else:
                pattern = original_pattern
                search_node = waf_file_node.parent
        else:
            pattern = original_pattern
            search_node = waf_file_node.parent

        while pattern.startswith('../'):
            pattern = pattern[3:]
            search_node = search_node.parent

        glob_results = search_node.ant_glob(pattern, **glob_content)

        for globbed_file in glob_results:
            rel_path = globbed_file.path_from(waf_file_node.parent).replace('\\', '/')
            abs_path = globbed_file.abspath().replace('\\', '/')
            rel_folder_path = os.path.dirname(rel_path)

            vs_filter = _determine_vs_filter(rel_folder_path, filter_name, original_pattern)
            if vs_filter not in current_uber_dict:
                current_uber_dict[vs_filter] = []

            if abs_path in dup_set:
                Logs.warn("[WARN] File '{}' specified by the pattern '{}' in waf file '{}' is a duplicate. It will be ignored"
                          .format(abs_path, original_pattern, waf_file_node_abs))
            else:
                current_uber_dict[vs_filter].append(rel_path)
                dup_set.add(abs_path)

    def _clear_empty_uber_dict(current_uber_dict):
        """
        Perform house cleaning in case glob pattern overrides move all files out of a 'root' group.
        """
        empty_filters = []
        for filter_name, filter_contents in current_uber_dict.items():
            if len(filter_contents) == 0:
                empty_filters.append(filter_name)
        for empty_filter in empty_filters:
            current_uber_dict.pop(empty_filter)
        return current_uber_dict

    def _process_uber_dict(uber_section, uber_dict):
        """
        Process each uber dictionary value
        """
        processed_uber_dict = {}

        for filter_name, filter_contents in uber_dict.items():
            for filter_content in filter_contents:

                if isinstance(filter_content, str):
                    if '*' in filter_content or '?' in filter_content:
                        # If this is a raw glob pattern, stuff it into the expected glob dictionary
                        _process_glob_entry(dict(pattern=filter_content), filter_name, processed_uber_dict)
                    else:
                        # This is a straight up file reference.
                        # Do any processing on an aliased reference
                        if filter_content.startswith('@'):
                            processed_path = bld.PreprocessFilePath(filter_content, _invalid_alias_callback,
                                                                    _alias_not_enabled_callback)
                        else:
                            processed_path = os.path.normpath(os.path.join(base_path_abs, filter_content))

                        if not os.path.exists(processed_path):
                            Logs.warn("[WARN] File '{}' specified in '{}' does not exist. It will be ignored"
                                      .format(processed_path, waf_file_node_abs))
                        elif not os.path.isfile(processed_path):
                            Logs.warn("[WARN] Path '{}' specified in '{}' is a folder, only files or glob patterns are "
                                      "allowed. It will be ignored".format(processed_path, waf_file_node_abs))
                        elif processed_path in dup_set:
                            Logs.warn("[WARN] File '{}' specified in '{}' is a duplicate. It will be ignored"
                                      .format(processed_path, waf_file_node_abs))
                        else:
                            if filter_name not in processed_uber_dict:
                                processed_uber_dict[filter_name] = []
                            processed_uber_dict[filter_name].append(processed_path)
                            dup_set.add(processed_path)

                elif isinstance(filter_content, dict):
                    # Dictionaries automatically go through the glob pattern processing
                    _process_glob_entry(filter_content, filter_name, processed_uber_dict)

                else:
                    raise Errors.WafError("Invalid entry '{}' in file '{}', section '{}/{}'"
                                          .format(filter_content, file, uber_section, filter_name))

        return _clear_empty_uber_dict(processed_uber_dict)

    def _get_cached_file_list():
        """
        Calculate the location of the cached waf_files path
        """
        bld_node = file_node.get_bld()
        return bld_node.abspath()

    file_node = bld.path.make_node(file)

    if not bld.is_option_true('enable_dynamic_file_globbing'):
        # Unless this is a configuration context (where we want to always calculate any potential glob patterns in the
        # waf_file list) check if the file list exists from any previous waf configure. If the waf_files had changed
        # in between builds, auto-configure will pick up that change and force a re-write of the waf_files list
        processed_waf_files_path = _get_cached_file_list()
        if os.path.exists(processed_waf_files_path) and not isinstance(bld, Configure.ConfigurationContext):
            processed_file_list = utils.parse_json_file(processed_waf_files_path)
            return processed_file_list

    # Read the source waf_file list
    source_file_list = bld.parse_json_file(file_node)

    # Prepare a processed waf_file list
    processed_file_list = {}

    for uber_file_entry, uber_file_dict in source_file_list.items():
        processed_file_list[uber_file_entry] = _process_uber_dict(uber_file_entry, uber_file_dict)

    if glob_hits > WAF_FILE_GLOB_WARNING_THRESHOLD:
        Logs.warn('[WARN] Source file globbing for waf file {} resulted in over {} files. If this is expected, '
                  'consider increasing the warning limit value WAF_FILE_GLOB_WARNING_THRESHOLD in waf_branch_spec.py'
                  .format(file_node.abspath(), WAF_FILE_GLOB_WARNING_THRESHOLD))

    if not bld.is_option_true('enable_dynamic_file_globbing') and isinstance(bld, Configure.ConfigurationContext):
        # If dynamic file globbing is off, then store the cached file list during every configure command
        processed_waf_files_path = _get_cached_file_list()
        processed_waf_files_dir = os.path.dirname(processed_waf_files_path)
        if not os.path.exists(processed_waf_files_dir):
            os.makedirs(processed_waf_files_dir)
        utils.write_json_file(processed_file_list, processed_waf_files_path)

    return processed_file_list
def cache_auth_token(self, auth_token):
    """
    cache auth_token to .pantri_auth_token
    """
    # Cache auth token
    auth_token_cache = os.path.join(self.git_path, '.pantri_auth_token')
    utils.write_json_file(auth_token_cache, {'auth_token': auth_token})
    self.logger.info('Auth token stored in %s' % auth_token_cache)
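# Illustrative usage sketch (not from the source): after cache_auth_token() runs, the
# cached token can be read back with the standard library; the pantri_instance name
# below is hypothetical.
# import json, os
# pantri_instance.cache_auth_token('example-token')
# token_path = os.path.join(pantri_instance.git_path, '.pantri_auth_token')
# with open(token_path) as token_file:
#     auth_token = json.load(token_file)['auth_token']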
def _dependency(self, snm, sver, ars):
    spth = self.get_path_tmp() + os.sep + snm
    smsg = snm + " " + sver
    utils.info("BEGIN " + snm)
    try:
        conf = utils.read_json_file(spth + os.sep + snm + ".json")
        bupd = True
        if conf is not None:
            if "version" in conf:
                if conf["version"] == sver:
                    bupd = False
                else:
                    utils.info("incorrect version.")
            else:
                utils.info("version not found.")
        else:
            utils.info("version not found.")
        if bupd:
            sfx = detectinfo.get_native_suffix()
            if sfx is None or "generic" in sfx:
                utils.info("os not detected.")
                raise Exception("You have to compile it manually.")
            if self._b32bit:
                sfx = sfx.replace("64", "32")
            utils.init_path(spth)
            utils.info("download headers and library ...")
            nurl = utils.get_node_url()
            if snm != "lib_gcc" and snm != "lib_stdcpp":
                appnm = "headers_" + snm + ".zip"
                utils.download_file(nurl + "getAgentFile.dw?name=" + appnm, spth + os.sep + appnm)
                utils.unzip_file(spth + os.sep + appnm, spth + os.sep)
                utils.remove_file(spth + os.sep + appnm)
            appnm = snm + "_" + sfx + ".zip"
            utils.download_file(nurl + "getAgentFile.dw?name=" + appnm, spth + os.sep + appnm)
            utils.unzip_file(spth + os.sep + appnm, spth + os.sep, "native/")
            utils.remove_file(spth + os.sep + appnm)
            # FIX VERSION
            conf = utils.read_json_file(spth + os.sep + snm + ".json")
            if conf is not None:
                if "version" not in conf:
                    conf["version"] = sver
                    utils.write_json_file(conf, spth + os.sep + snm + ".json")
            # COPY LIB TO NATIVE
            for f in os.listdir(spth):
                if f.endswith('.dll') or f.endswith('.so') or f.endswith('.dylib'):
                    shutil.copy2(spth + os.sep + f, self.get_path_native() + os.sep + f)
            # POST FIX
            self._dependency_post_fix(snm, sver)
        smsg += " - OK!"
        ars.append(smsg)
        utils.info("END " + snm)
    except Exception as e:
        smsg += " - ERROR: " + utils.exception_to_string(e)
        ars.append(smsg)
        raise e
def save(self):
    '''Save wallet file to disk'''
    filename = self.filename()
    data = self.to_dict()
    write_json_file(data, filename)
    logger.info(f"Saved wallet to {filename}")
def main():
    logging = create_logging()
    logging.info("Starting the requests. ESTIMATED TIME: 10s.")

    ## Request APIs to extract information
    entries = utils.request_api(args.input_url_tools, logging)
    metrics = utils.request_api(args.input_url_metrics, logging)
    logging.info(f"%-40s\t{len(entries):,}" % ("Total entries in /tool:"))
    logging.info(f"%-40s\t{len(metrics):,}" % ("Total metrics in /metrics:"))

    entries_ultimate, number_tools = get_ultimate_entries_and_len_tools(entries)
    logging.info(f"%-40s\t{len(entries_ultimate):,}" % ("Total Entries Ultimate in /tool:"))
    logging.info(f"%-40s\t{number_tools:,}" % ("Number of tools:"))

    logging.info("Extracting entries from APIs. ESTIMATED TIME: 12s.")

    ## Get the corresponding metrics for each unique website
    metrics_unique_homepage = match_entries_tools_metrics_by_unique_homepage(metrics, entries)
    # metrics_unique_homepage = utils.open_json("metrics_unique_homepage.json")
    utils.write_json_file("metrics_unique_homepage.json", metrics_unique_homepage)
    logging.info(f"%-40s\t{len(metrics_unique_homepage):,}" % ("Unique websites:"))

    # Instantiate the object that calculates the different metrics:
    api_extractor_obj = api_extractors.MetricsExtractor(metrics_unique_homepage, CLASSIFICATION_DOMAINS, logging)

    logging.info("Calculating the statistics. ESTIMATED TIME: 1s.")
    logging.info("Statistics successfully extracted.")

    write_json_file_given_path(
        "total_counter",
        counter_total=api_extractor_obj.total_dict_domains_counter)

    # Create Dataframe of the counter of domains
    df_domains = utils.create_df_from_dict(
        api_extractor_obj.total_dict_domains_counter, "Domain", "Count", args.number_domains)

    # Extract the columns of the dataframe as a list
    count_of_most_popular_domains = utils.extract_columns_df(df_domains)

    # Access Tab dataframe:
    df_tab_access = utils.create_dataframe_access(api_extractor_obj)
    df_final = df_tab_access.to_dict(orient="records")

    # Write the extracted metrics with the JSON writer helper
    write_json_file_given_path(
        f"{args.output_directory}/{args.output_file_name_metrics}",
        time_of_execution=str(datetime.now()),
        bioschemas_ssl_https_license=api_extractor_obj.values_bioschemas_ssl_liscense_https,
        http_codes_by_classification=api_extractor_obj.values_codes,
        domains_classification=CLASSIFICATION_DOMAINS,
        domains_count=count_of_most_popular_domains,
        df_acces=df_final,
        dict_http_codes_count=change_keys_of_dictionary(
            dict(collections.Counter(
                df_tab_access['HTTP_Code'].to_list() +
                [code
                 for list_codes in df_tab_access['Redirections'].dropna().to_list()
                 for code in list_codes])),
            DICT_CODES_DESCRIPTION),
        dict_uptimes_days=dict(
            collections.Counter(df_tab_access['Days_Up'].dropna().astype(int).to_list())),
        total_entries_ultimate=len(entries_ultimate),
        total_len_tools=number_tools)

    write_json_file_given_path(
        f"/home/andreu/BSC/dashboard_openebench/new_input_data/extracted_metrics",
        time_of_execution=str(datetime.now()),
        bioschemas_ssl_https_license=api_extractor_obj.values_bioschemas_ssl_liscense_https,
        http_codes_by_classification=api_extractor_obj.values_codes,
        domains_classification=CLASSIFICATION_DOMAINS,
        domains_count=count_of_most_popular_domains,
        df_acces=df_final,
        dict_http_codes_count=change_keys_of_dictionary(
            dict(collections.Counter(
                df_tab_access['HTTP_Code'].to_list() +
                [code
                 for list_codes in df_tab_access['Redirections'].dropna().to_list()
                 for code in list_codes])),
            DICT_CODES_DESCRIPTION),
        dict_uptimes_days=dict(
            collections.Counter(df_tab_access['Days_Up'].dropna().astype(int).to_list())),
        total_entries_ultimate=len(entries_ultimate),
        total_len_tools=number_tools)

    logging.info(
        f"Saved the statistics in {args.output_directory}/{args.output_file_name_metrics}.json")
SENTENCE_HIDDEN_DIM = 256
HIERARCHIAL_HIDDEN_DIM = 512
DA_HIDDEN_DIM = 64
FF_HIDDDEN_DIM = 256

param_grid = {
    'learning_rate': [.0001, .001, .01, .1, 1],
    'model_prop': [1, 1 / 2, 1 / 4, 1 / 8, 1 / 16, 1 / 32],
    'batch_size': [4, 8, 16, 32, 64, 128],
    'ff_dropout_prob': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    'random_seed': None
}

# create best_config.json file to keep track of which parameters lead to best results
best_dict = {'experiment_name': None, 'best_val_loss': 1000000000}
utils.write_json_file(best_dict, os.path.join(args.output_dir, 'best_config.json'))

# param_combs = list(ParameterGrid(param_grid))
json_path = os.path.join(args.output_dir, 'params.json')

for t in range(num_of_trials):
    model_scale = random.choice(param_grid['model_prop'])
    param_config = {
        'learning_rate': random.choice(param_grid['learning_rate']),
        'sentence_hidden_dim': int(model_scale * SENTENCE_HIDDEN_DIM),
        'hierarchial_hidden_dim': int(model_scale * HIERARCHIAL_HIDDEN_DIM),  # 'hierarchial_hidden_dim'
        'da_hidden_dim': int(model_scale * DA_HIDDEN_DIM),  # 'da_hidden_dim'
        'ff_hidden_dim': int(model_scale * FF_HIDDDEN_DIM),