def read_global_site_json(self):
    """Load previously saved crawl results into the global_site_* attributes.

    Reads the five json files named by the CRAWLING_OUTPUT_* attributes
    (urls, pages, redirects, error urls, ignored urls) via adm.read_json.

    Best effort: if any file is missing or unreadable, the attributes that
    were not yet assigned keep their prior values and no error is raised.
    """
    try:
        self.global_site_urls = adm.read_json(self.CRAWLING_OUTPUT_URLS)
        self.global_site_pages = adm.read_json(self.CRAWLING_OUTPUT_PAGES)
        self.global_site_redirects = adm.read_json(self.CRAWLING_OUTPUT_REDIRECTS)
        self.global_site_error_urls = adm.read_json(self.CRAWLING_OUTPUT_ERROR_URLS)
        self.global_site_ignored_urls = adm.read_json(self.CRAWLING_OUTPUT_IGNORED_URLS)
    except Exception:
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt are not
        # swallowed; the deliberate best-effort behavior is preserved.
        pass
def do_content(this_preset_dir, noscan):
    """Build and write the preset's content.json.

    Populates the module-level ``content`` dict from either the preset's
    menu (``noscan`` true) or a filesystem scan, appends installed map file
    names, carries the ``kalite`` section forward from any existing
    content.json, refreshes kalite language/topics when the kalite role is
    active, then writes content.json into ``this_preset_dir``.

    Args:
        this_preset_dir: preset directory path (must end with '/';
            filenames are appended directly — TODO confirm against callers).
        noscan: when true, derive content from menu.json instead of
            scanning files.
    """
    global content
    if role_stats['kalite']['active']:
        content["kalite"] = {'lang_code': 'en', 'topics': []}  # defaults
    content_file = this_preset_dir + 'content.json'
    if os.path.exists(content_file):
        old_content = adm.read_json(content_file)
    else:
        old_content = {}
    if noscan:
        content_from_menu(this_preset_dir)
    else:
        content_from_files()
    # read list of maps
    if os.path.exists(map_path):
        # Unused local 'excl_maps' removed; every file name in map_path is
        # recorded as an installed map.
        for fname in os.listdir(map_path):
            content["maps"].append(fname)
    # preserve any kalite for now
    # NOTE(review): this overwrites the defaults set above; when the kalite
    # role is active the keys are re-filled just below, so nothing is lost.
    content["kalite"] = old_content.get("kalite", {})
    if role_stats['kalite']['active']:
        lang = get_kalite_lang()
        content["kalite"]["lang_code"] = lang
        get_kalite_complete('khan/', lang)
        content["kalite"]["topics"] = kalite_topics
    adm.write_json_file(content, content_file)
def main():
    """Validate a map download id against the catalog and write viewer init data.

    Loads the map catalog, exits with status 1 if the requested id is in
    neither the 'maps' nor the 'base' section, writes init.json (region,
    zoom, center) for the viewer when the id is a map, then regenerates
    the installed-map index.
    """
    global map_catalog
    global base_catalog
    args = parse_args()
    map_id = args.map_id
    catalog = adm.read_json(catalog_path)
    map_catalog = catalog['maps']
    base_catalog = catalog['base']
    is_map = map_id in map_catalog
    is_base = map_id in base_catalog
    if not is_base and not is_map:
        print('Download URL not found in Map Catalog: %s' % args.map_id)
        sys.exit(1)
    # create init.json which sets initial coords and zoom
    if is_map:
        map_info = map_catalog[map_id]  # renamed from 'map' to avoid shadowing the builtin
        init = {
            'region': map_info['region'],
            'zoom': map_info['zoom'],
            'center_lon': map_info['center_lon'],
            'center_lat': map_info['center_lat'],
        }
        init_fn = viewer_path + '/init.json'
        adm.write_json_file(init, init_fn)
    installed_maps = get_installed_tiles()
    print('installed_maps')
    print(repr(installed_maps))
    write_vector_map_idx_v2(installed_maps)
def main(argv):
    """Analyze a crawled site's json dumps and print content-type statistics.

    Takes the site name from sys.argv[1], loads the five per-site json
    files (urls, pages, redirects, ignored urls, error urls) into the
    module globals, cross-checks pages against urls, then prints a
    per-content-type count/size table and the total site size.

    Exits 1 on missing argument or unreadable input files.
    """
    global site_urls
    global site_pages
    global site_redirects
    global site_ignored_urls
    global site_error_urls
    # Pass in json file name
    if len(sys.argv) > 1:
        site = sys.argv[1]
    else:
        print('usage: site-analzyer.py <site>')
        sys.exit(1)
    url_json_file = site + '_urls.json'
    page_json_file = site + '_pages.json'
    redirects_json_file = site + '_redirects.json'
    ignored_urls_json_file = site + '_ignored_urls.json'
    error_urls_json_file = site + '_error_urls.json'
    try:
        site_urls = adm.read_json(url_json_file)
        site_pages = adm.read_json(page_json_file)
        site_redirects = adm.read_json(redirects_json_file)
        # BUG FIX: the error and ignored url files were swapped — error urls
        # were being loaded from *_ignored_urls.json and vice versa.
        site_error_urls = adm.read_json(error_urls_json_file)
        site_ignored_urls = adm.read_json(ignored_urls_json_file)
    except Exception:
        print('Unable to read one or more site files')
        sys.exit(1)
    calc_page_children()
    compare_urls()  # look for page/url mismatches
    sum_content_types()
    json_formatted_str = json.dumps(content_types, indent=2)
    print(json_formatted_str)
    for content_type in content_types:
        print(content_type, content_types[content_type]['count'],
              human_readable(content_types[content_type]['bytes']))
    print('Total Site Size: ' + human_readable(total_bytes))
def content_from_menu(this_preset_dir):
    """Fill the global content dict from the preset's menu.json.

    Each entry of menu_items_1 that has a known menu definition is routed
    by its intended_use: 'html' entries contribute their moddir to
    content['modules'], 'zim' entries contribute their zim_name to
    content['zims']. Unknown entries are skipped.
    """
    global content
    menu = adm.read_json(this_preset_dir + 'menu.json')
    all_menu_defs = adm.get_all_menu_defs()
    for item_name in menu["menu_items_1"]:
        if item_name not in all_menu_defs:
            continue  # no definition for this menu entry — skip it
        item_def = all_menu_defs[item_name]
        use = item_def["intended_use"]
        if use == "html":
            content["modules"].append(item_def["moddir"])
        elif use == "zim":
            content["zims"].append(item_def["zim_name"])
def main():
    """Print oer2go v2 catalog entries missing from the local catalog.

    Loads the local oer2go catalog, fetches the v2 catalog, and prints the
    module id and moddir of every v2 item that is neither already in the
    local catalog nor listed in dup_list.
    """
    global verbose
    global download_flag
    local_oer2go_catalog = adm.read_json(adm.CONST.oer2go_catalog_file)
    local_oer2go_catalog = local_oer2go_catalog['modules']
    err_num, err_str, oer2go_catalog_v2 = get_oer2go_cat_v2()
    for item in oer2go_catalog_v2:
        if item in local_oer2go_catalog:
            continue
        module_id = oer2go_catalog_v2[item]['module_id']  # renamed from 'id' (builtin shadow)
        moddir = oer2go_catalog_v2[item]['moddir']
        if module_id in dup_list:
            continue
        print(module_id, moddir)
import iiab.iiab_lib as iiab
import iiab.adm_lib as adm

# Contact the server first; failure is fatal for this script.
try:
    adm.pcgvtd9()
except:
    print("Unable to contact Server")
    sys.exit(1)

# load lang codes
iiab.read_lang_codes()
local_menu_item_defs = adm.get_local_menu_item_defs()  # returns dict
menu_def_repo_data = adm.get_menu_def_repo_data()  # returns dict
repo_menu_item_defs = menu_def_repo_data['defs']
# Names of menu defs that must never be (re)downloaded.
obsolete_menu_item_defs = adm.read_json(adm.CONST.obsolete_menu_defs)
changes_made = False
# download menu item defs from repo that are not present
for menu_item_def_name in repo_menu_item_defs:
    if menu_item_def_name not in local_menu_item_defs:
        if menu_item_def_name in obsolete_menu_item_defs:
            print('Skipping obsolete menu definition ' + menu_item_def_name)
            continue  # don't download obsolete
        menu_item_def = adm.get_menu_item_def_from_repo_by_name(menu_item_def_name)
        adm.write_other_menu_item_def_files(menu_item_def)
        adm.write_menu_item_def(menu_item_def_name, menu_item_def)
        print ('Downloading new remote menu item definition ' + menu_item_def_name)
        changes_made = True
# upload new and changed local menu item defs to repo if upload_flag set
# NOTE(review): the body of this loop continues beyond this chunk.
for menu_item_def_name in local_menu_item_defs:
def main():
    """Fetch/merge the oer2go and iiab module catalogs and refresh menu defs.

    Unless --no-download is given, downloads both the oer2go and iiab
    module catalogs over HTTP and merges them; otherwise reuses the local
    cached catalog. Optionally (--menu) generates menu definitions for
    downloaded modules. Writes the merged, dated catalog back to disk only
    when it was freshly downloaded.

    Exit codes: 99 menu dir missing, 1/2 download failure, 3 parse
    failure, 0 success.
    """
    global verbose
    oer2go_catalog = {}
    args = parse_args()
    if args.verbose:
        verbose = True
    # make sure we have menu js_menu_dir if args.menu true
    if args.menu:
        if not os.path.isdir(adm.CONST.js_menu_dir):
            sys.stdout.write("GET-OER2GO-CAT ERROR - iiab-menu not installed and --menu option given\n")
            sys.stdout.flush()
            sys.exit(99)
    # for now we will assume that old modules are still in the current catalog
    # get new oer2go catalog unless told not to
    if not args.no_download:
        try:
            url_handle = urllib.request.urlopen(adm.CONST.oer2go_cat_url)
            oer2go_catalog_json = url_handle.read()
            url_handle.close()
        except (urllib.error.URLError) as exc:
            sys.stdout.write("GET-OER2GO-CAT ERROR - " + str(exc.reason) + '\n')
            sys.stdout.flush()
            sys.exit(1)
        try:
            url_handle = urllib.request.urlopen(adm.CONST.iiab_module_cat_url)
            iiab_catalog_json = url_handle.read()
            url_handle.close()
        except (urllib.error.URLError) as exc:
            sys.stdout.write("GET-OER2GO-CAT ERROR - " + str(exc.reason) + '\n')
            sys.stdout.flush()
            sys.exit(2)
        # now try to parse
        try:
            oer2go_catalog = json.loads(oer2go_catalog_json)
            iiab_catalog = json.loads(iiab_catalog_json)
        except:
            sys.stdout.write("GET-OER2GO-CAT ERROR - " + str(sys.exc_info()[0]) + "," + str(sys.exc_info()[1]) + '\n')
            sys.stdout.flush()
            sys.exit(3)
        # merge iiab_catalog.json if was downloaded otherwise assume was previously merged
        for item in iiab_catalog:
            moddir = item['moddir']
            id = item['module_id']
            module = item
            iiab_oer2go_catalog[moddir] = module
    else:
        # --no-download: fall back to the locally cached catalog.
        local_oer2go_catalog = adm.read_json(adm.CONST.oer2go_catalog_file)
        oer2go_catalog = local_oer2go_catalog['modules']
    working_dir = adm.CONST.rachel_working_dir + str(uuid.uuid4()) + "/"
    os.mkdir(working_dir)
    #os.mkdir(iiab_menu_download_dir)
    for item in oer2go_catalog:
        # structure of local and remote catalogs is different:
        # local is a dict keyed by moddir, remote is a list of module dicts.
        if args.no_download:  # local
            moddir = item
            module = oer2go_catalog[moddir]
            module_id = module['module_id']
        else:  # remote
            moddir = item['moddir']
            module_id = item['module_id']
            module = item
        if moddir is None:  # skip items with no moddir
            continue
        menu_item_name = moddir
        if module_id not in dup_list:
            is_downloaded, has_menu_def = adm.get_module_status(module)
            if args.menu and is_downloaded:
                if not has_menu_def:
                    menu_item_name = adm.create_module_menu_def(
                        module, working_dir, incl_extra_html=False)
                msg = "Generating menu files"
                if verbose:
                    print("%s %s %s" % (msg, module_id, moddir))
                adm.update_menu_json(
                    menu_item_name)  # only adds if not already in menu
        else:
            msg = "Skipping module not needed by Internet in a Box"
            if verbose:
                print("%s %s %s" % (msg, module_id, moddir))
            continue
        iiab_oer2go_catalog[moddir] = module
    # no need to write catalog if not downloaded as we don't need wip and other extra menu def fields
    if not args.no_download:
        dated_oer2go_cat = {}
        dated_oer2go_cat['download_date'] = time.strftime("%Y-%m-%d.%H:%M:%S")
        dated_oer2go_cat['modules'] = iiab_oer2go_catalog
        with open(adm.CONST.oer2go_catalog_file, 'w') as outfile:
            json.dump(dated_oer2go_cat, outfile, indent=2)
    shutil.rmtree(working_dir)
    sys.stdout.write("SUCCESS")
    sys.stdout.flush()
    sys.exit(0)
import re
import string
from urllib.parse import urljoin, urldefrag, urlparse
from bs4 import BeautifulSoup, Comment, SoupStrainer
import iiab.adm_lib as adm

# Source site and local conversion paths for the NIH rare-diseases mirror.
site = 'rarediseases.info.nih.gov'
orig_dir = '/articlelibrary/viewarticle/'
base_url = 'https://' + site + orig_dir
src_dir = 'raw/html'
dst_dir = '/library/www/html/modules/en-nih_rarediseases'

# read urls
# Loaded at import time; presumably keyed by url — verify against do_page.
url_json_file = site + '_urls.json'
site_urls = adm.read_json(url_json_file)

# NOTE(review): this function is cut off in this chunk — the write after
# the final 'with open' continues beyond view.
def main(argv):
    # need site_urls for type of image - see below
    file_list = os.listdir(src_dir)
    #file_list = ['article-17922.html','article-41380.html','article-788.html', 'article-99590.html', 'article-29120.html', 'article-16989.html']
    for filename in file_list:
        print('Converting ' + filename)
        if not filename.endswith(".html"):
            print('Skippinging ' + filename)
            continue
        page = do_page(os.path.join(src_dir, filename))
        html_output = page.encode_contents(formatter='html')
        with open(dst_dir + filename, 'wb') as f:
def main ():
    """Refresh the merged oer2go/iiab module catalog and menu definitions.

    Always downloads the iiab module catalog (failure is fatal, exit 2).
    Downloads the oer2go catalog unless --no-download was given or the
    download fails, in which case the locally cached copy is used. Only
    'html'-type modules not in dup_list are kept; with --menu, missing
    menu definitions are generated and downloaded modules are added to
    the menu. The merged, dated catalog is always written back to disk.
    Exits with the status/string from the oer2go download attempt.
    """
    global verbose
    global download_flag
    oer2go_catalog = {}
    err_num = 0
    err_str = "SUCCESS"
    args = parse_args()
    if args.verbose:
        verbose = True
    if args.no_download:
        download_flag = False
    # make sure we have menu js_menu_dir if args.menu true
    if args.menu:
        if not os.path.isdir(adm.CONST.js_menu_dir):
            sys.stdout.write("GET-OER2GO-CAT ERROR - iiab-menu not installed and --menu option given\n")
            sys.stdout.flush()
            sys.exit(99)
    # always get our catalog
    # failure is fatal
    try:
        url_handle = urllib.request.urlopen(adm.CONST.iiab_module_cat_url)
        iiab_catalog_json = url_handle.read()
        url_handle.close()
        iiab_catalog = json.loads(iiab_catalog_json)
    except (urllib.error.URLError) as exc:
        sys.stdout.write("GET-OER2GO-CAT ERROR - " + str(exc.reason) +'\n')
        sys.stdout.flush()
        sys.exit(2)
    # for now we will assume that old modules are still in the current catalog
    # get new oer2go catalog unless told not to
    if download_flag:
        err_num, err_str, oer2go_catalog = get_oer2go_cat()
        if err_num != 0:
            # download failed — fall through to the local copy below
            download_flag = False
    if not download_flag:
        # get local copy
        local_oer2go_catalog = adm.read_json(adm.CONST.oer2go_catalog_file)
        oer2go_catalog = local_oer2go_catalog['modules']
    # start with iiab_catalog.json
    for item in iiab_catalog:
        moddir = item['moddir']
        id = item['module_id']
        module = item
        iiab_oer2go_catalog[moddir] = module
    working_dir = adm.CONST.rachel_working_dir + str(uuid.uuid4()) + "/"
    os.mkdir(working_dir)
    #os.mkdir(iiab_menu_download_dir)
    for item in oer2go_catalog:
        # structure of local and remote catalogs is different:
        # local is a dict keyed by moddir, remote is a list of module dicts.
        if not download_flag:  # local
            moddir = item
            module = oer2go_catalog[moddir]
            module_id = module['module_id']
        else:  # remote
            moddir = item['moddir']
            module_id = item['module_id']
            module = item
        if moddir is None:  # skip items with no moddir
            continue
        menu_item_name = moddir
        if str(module_id) in dup_list:
            msg = "Skipping module not needed by Internet in a Box"
            if verbose:
                print("%s %s %s" % (msg, module_id, moddir))
            continue
        if module.get('type') != 'html':
            continue
        is_downloaded, has_menu_def = adm.get_module_status (module)
        #if args.menu and is_downloaded:
        if args.menu:
            if not has_menu_def:
                menu_item_name = adm.create_module_menu_def(module, working_dir, incl_extra_html = False)
                msg = "Generating menu files"
                if verbose:
                    print("%s %s %s" % (msg, module_id, moddir))
            if is_downloaded:
                adm.update_menu_json(menu_item_name)  # only adds if not already in menu
        iiab_oer2go_catalog[moddir] = module
    # write catalog even if not downloaded as ours could have changed
    dated_oer2go_cat = {}
    dated_oer2go_cat['download_date'] = time.strftime("%Y-%m-%d.%H:%M:%S")
    dated_oer2go_cat['modules'] = iiab_oer2go_catalog
    adm.write_json_file(dated_oer2go_cat, adm.CONST.oer2go_catalog_file)
    shutil.rmtree(working_dir)
    sys.stdout.write(err_str)
    sys.stdout.flush()
    sys.exit(err_num)
################################################################################ site = 'www.ncbi.nlm.nih.gov' MAIN_SOURCE_DOMAIN = 'https://' + site START_PAGE = 'https://' + site SOURCE_DOMAINS = [] IGNORE_URLS = [] crawler = BasicSpider(main_source_domain=MAIN_SOURCE_DOMAIN) crawler.IGNORE_URLS.extend(IGNORE_URLS) crawler.set_output_file_names(site) crawler.pre_crawl_setup() crawler.read_global_site_json() crawler.SHORTEN_CRAWL = True stat_pearl_catalog = {} stat_pearl_catalog = adm.read_json('stat-pearl-catalog.json') test_cnt = 5 for article_id in stat_pearl_catalog: original_url = MAIN_SOURCE_DOMAIN + stat_pearl_catalog[article_id]['url'] local_file = original_url.split('://')[1] if local_file[-1] == '/': local_file = local_file[:-1] + '.html' else: local_file = local_file + '.html' local_dir = os.path.dirname(local_file) if not os.path.exists(local_dir): os.makedirs(local_dir) if os.path.isfile(local_file):
################################################################################ site = 'rarediseases.info.nih.gov' MAIN_SOURCE_DOMAIN = 'https://' + site START_PAGE = 'https://' + site SOURCE_DOMAINS = [] IGNORE_URLS = [] crawler = BasicSpider(main_source_domain=MAIN_SOURCE_DOMAIN) crawler.IGNORE_URLS.extend(IGNORE_URLS) crawler.set_output_file_names(site) crawler.pre_crawl_setup() crawler.read_global_site_json() crawler.SHORTEN_CRAWL = True disease_catalog = {} disease_catalog = adm.read_json('disease-catalog.json') test_cnt = 5 for disease_url in disease_catalog: original_url = MAIN_SOURCE_DOMAIN + disease_url local_name = disease_url[1:].replace('/', '.') local_file = 'raw/html/' + local_name + '.html' local_dir = os.path.dirname(local_file) if not os.path.exists(local_dir): os.makedirs(local_dir) if os.path.isfile(local_file): continue # print(original_url) url, html = crawler.download_page(original_url)