def add_position_to_node():
    nodes = common.read_json(OUTPUT_DIR, NODE_FILE_NAME)
    positions = common.read_json(OUTPUT_DIR, POSITION_FILE_NAME)
    for node in nodes:
        position = positions[str(node['id'])]
        node['x'] = position['x']
        node['y'] = position['y']
    common.write_json(nodes, OUTPUT_DIR, NODE_FILE_NAME)
def add_value_to_node():
    nodes = common.read_json(OUTPUT_DIR, NODE_FILE_NAME)
    edges = common.read_json(OUTPUT_DIR, EDGE_FILE_NAME)
    for node in nodes:
        count = len(list(filter(lambda x: x['from'] == node['id'], edges)))
        count += len(list(filter(lambda x: x['to'] == node['id'], edges)))
        node['value'] = count
    common.write_json(nodes, OUTPUT_DIR, NODE_FILE_NAME)
def convert_to_rdf():
    """ Converts the read data to triples """
    print ""
    print "Convert to RDF..."
    movies = common.read_json(JSON_OUT_FILE)
    g = Graph()
    g.bind("", NS_OMDB)
    g.bind("dbpedia-owl", NS_DBPEDIA_OWL)
    g.bind("dbpprop", NS_DBPPROP)
    for m in movies:
        movie = URIRef(BASE_URI % common.encodeString(m["title"]))
        g.add((movie, RDF.type, NS_DBPEDIA_OWL.Film))
        g.add((movie, RDFS.label, Literal(m["title"])))
        g.add((movie, NS_DBPPROP.title, Literal(m["title"])))
        if "imdbID" in m:
            g.add((movie, NS_DBPEDIA_OWL.imdbId, Literal(m["imdbID"])))
    common.write_rdf(RDF_OUT_FILE, g)
def analyze_experiment(info, experiments_dir, tmp_data_dir, date_str, exp_name):
    exp_dir = os.path.join(experiments_dir, exp_name)
    exp_data_dir = os.path.join(tmp_data_dir, exp_name)
    tmp_analysis_dir = os.path.join(exp_data_dir, 'analysis')
    idemp_mkdir(tmp_analysis_dir)

    analyzed_data_dir = info.exp_data_dir(exp_name)
    if not os.path.exists(analyzed_data_dir):
        idemp_mkdir(analyzed_data_dir)

    subprocess.call([os.path.join(exp_dir, 'analyze.sh'),
                     info.exp_config_dir(exp_name), exp_data_dir, tmp_analysis_dir],
                    cwd=exp_dir)

    status = validate_status(tmp_analysis_dir)

    # read the analyzed data, append a timestamp field, and copy over to the permanent data dir
    if status['success']:
        data_exists = check_file_exists(tmp_analysis_dir, 'data.json')
        if not data_exists:
            status = {'success': False,
                      'message': 'No data.json file produced by {}'.format(exp_name)}
        else:
            # collect data to dump to data_*.json
            dump_data = {
                'timestamp': date_str,
            }
            dump_data.update(read_json(tmp_analysis_dir, 'data.json'))
            # fetch time spent on the experiment
            dump_data.update(get_timing_info(info, exp_name))
            write_json(analyzed_data_dir, 'data_{}.json'.format(date_str), dump_data)

    info.report_exp_status(exp_name, 'analysis', status)
    return status['success']
def convert_to_rdf():
    """ Converts the read data to triples """
    print ""
    print "Convert to RDF..."
    songs = common.read_json(JSON_OUT_FILE)
    g = Graph()
    g.bind("", NS_LASTFM)
    g.bind("dbpedia-owl", NS_DBPEDIA_OWL)
    g.bind("dbpprop", NS_DBPPROP)
    for s in songs:
        if "tags" not in s or len(s["tags"]) < 1:
            continue
        artist = URIRef(BASE_URI % common.encodeString(s["artist"]))
        g.add((artist, RDF.type, NS_DBPEDIA_OWL.MusicalArtist))
        g.add((artist, RDFS.label, Literal(s["artist"])))
        g.add((artist, NS_DBPPROP.name, Literal(s["artist"])))
        song = URIRef(BASE_URI % common.encodeString(u"{0:s} - {1:s}".format(s['artist'], s["title"])))
        g.add((song, RDF.type, NS_DBPEDIA_OWL.Song))
        g.add((song, RDFS.label, Literal(u"{0:s} - {1:s}".format(s['artist'], s["title"]))))
        g.add((song, NS_DBPPROP.title, Literal(s["title"])))
        g.add((song, NS_DBPEDIA_OWL.artist, artist))
        for t in s["tags"]:
            g.add((song, NS_LASTFM.tagged, Literal(t)))
    common.write_rdf(RDF_OUT_FILE, g)
def check_config():
    try:
        config = common.read_json("doc/config.json")
        config_error = 0
    except:
        config_error = 1
    return config_error
def run_baseline(model, exp_config, config, config_dir, output_dir):
    '''
    Run a baseline trial and obtain memory usage.
    This is used for getting a reference memory usage for DTR `ratio` commands
    '''
    baseline_config = {'batch_size': exp_config['batch_size']}
    if 'extra_params' in exp_config:
        baseline_config['extra_params'] = exp_config['extra_params']

    filename = str(time.time()) + '.json'
    temp_file = prepare_out_file(os.getcwd(), filename)
    success, msg = run_trials(config_dir, python_command('baseline', config),
                              'baseline', model, baseline_config,
                              config['n_inputs'], config['n_reps'],
                              output_dir, report_errors=config['report_errors'],
                              append_to_csv=False,
                              trial_run=True, trial_run_outfile=temp_file)
    if not success:
        return False, 'Error while running baseline trial: \n{}'.format(msg)

    mem_usage = read_json(output_dir, temp_file)
    os.remove(temp_file)
    if 'mem' not in mem_usage:
        return False, 'failed to get baseline memory usage'
    return True, mem_usage['mem']
def main(argv):
    args = parse_arguments(argv)

    if args.save_as is None:
        out_dir = os.path.join(args.out_dir, base64_encode(args.keyword))
    else:
        out_dir = os.path.join(args.out_dir, base64_encode(args.save_as))
    downloader = NiconicoDownloader(out_dir)

    auth = read_json(args.auth_json)
    auth = dict() if auth is None else auth
    while True:
        if 'niconico' not in auth:
            auth['niconico'] = {
                'username': input('Username >> '),
                'password': getpass('Password >> '),
            }
        try:
            downloader.authenticate(**auth['niconico'])
            write_json(args.auth_json, auth)
            break
        except NoSuchElementException:
            logger.error('Failed to login.')
            del auth['niconico']

    downloader(args.keyword)
def init_vta_env(target):
    """Read the VTA config and set the target to `target`."""
    config_dir = os.path.join(os.environ['TVM_HOME'], 'vta', 'config')
    config_filename = 'vta_config.json'
    vta_config = read_json(config_dir, config_filename)
    vta_config["TARGET"] = target
    return vta.Environment(vta_config)
def run_baseline(model, exp_config, config, config_dir, output_dir):
    '''
    Run a baseline trial and obtain memory usage.
    This is used for getting a reference memory usage for DTR `ratio` commands
    '''
    baseline_config = {
        'batch_size': exp_config['batch_size'],
        'timeout': exp_config.get('timeout', 60),
        # only doing a minimal number of runs because we are only getting the memory usage,
        # which should be identical between runs
        'n_reps': 10,
        'extra_params': exp_config.get('extra_params', {})
    }
    if 'input_params' in exp_config:
        baseline_config['input_params'] = exp_config['input_params']

    filename = str(time.time()) + '.json'
    temp_file = prepare_out_file(os.getcwd(), filename)
    success, msg = run_trials(config_dir, python_command('baseline', config),
                              'baseline', model, baseline_config,
                              exp_config.get('n_inputs', config['n_inputs']),
                              output_dir, report_errors=config['report_errors'],
                              append_to_csv=False,
                              trial_run=True, trial_run_outfile=temp_file,
                              sync_gpu=config['sync_gpu'])
    if not success:
        return False, 'Error while running baseline trial: \n{}'.format(msg)

    mem_usage = read_json(output_dir, temp_file)
    os.remove(temp_file)
    if 'mem' not in mem_usage:
        return False, 'failed to get baseline memory usage'
    return True, mem_usage['mem']
def check_confidentials():
    try:
        confidentials = common.read_json("confidentials.json")
        confidentials_error = 0
    except:
        confidentials_error = 1
    return confidentials_error
def read_first_existing(*files):
    for f in files:
        if os.path.exists(f):
            try:
                return c.read_json(f)
            except ValueError:
                pass
    # join the filenames; concatenating the tuple directly would raise a TypeError
    raise IOError("Cannot find any of " + ", ".join(files))
def main(home_dir, experiments_dir, subsystem_dir, telemetry_script_dir):
    """
    Home directory: Where config info for experiments, etc., is
    Experiments directory: Where experiment implementations are
    Both should be given as absolute directories
    """
    time_str = get_timestamp()

    if not check_file_exists(home_dir, 'config.json'):
        print('Dashboard config (config.json) is missing in {}'.format(home_dir))
        return 1
    dash_config = read_json(home_dir, 'config.json')

    # must expand all tildes in the config to avoid future errors
    for path_field in ['tmp_data_dir', 'setup_dir', 'backup_dir']:
        dash_config[path_field] = os.path.expanduser(dash_config[path_field])

    tmp_data_dir = os.path.join(dash_config['tmp_data_dir'], 'benchmarks_' + time_str)
    data_archive = os.path.join(dash_config['tmp_data_dir'],
                                'benchmarks_' + time_str + '_data.tar.gz')
    setup_dir = dash_config['setup_dir']
    backup_archive = os.path.join(dash_config['backup_dir'],
                                  'dashboard_' + time_str + '.tar.gz')

    idemp_mkdir(tmp_data_dir)
    idemp_mkdir(os.path.dirname(backup_archive))
    idemp_mkdir(setup_dir)

    info = DashboardInfo(home_dir)

    # make a backup of the previous dashboard files if they exist
    if os.path.exists(home_dir):
        subprocess.call(['tar', '-zcf', backup_archive, home_dir])

    # directories whose contents should not change between runs of the dashboard
    persistent_dirs = {info.exp_data, info.exp_configs,
                       info.subsys_configs, info.subsys_output}
    all_dashboard_dirs = info.all_experiment_dirs() + info.all_subsystem_dirs()

    # instantiate necessary dashboard dirs and clean any that should be empty
    for dashboard_dir in all_dashboard_dirs:
        if dashboard_dir not in persistent_dirs:
            subprocess.call(['rm', '-rf', dashboard_dir])
            idemp_mkdir(dashboard_dir)

    randomize_exps = True
    if 'randomize' in dash_config:
        randomize_exps = dash_config['randomize']

    telemetry_rate = dash_config.get('telemetry_rate', 15)
    run_cpu_telemetry = dash_config.get('run_cpu_telemetry', False)
    run_gpu_telemetry = dash_config.get('run_gpu_telemetry', False)

    run_all_experiments(info, experiments_dir, setup_dir,
                        tmp_data_dir, data_archive, time_str,
                        telemetry_script_dir,
                        run_cpu_telemetry=run_cpu_telemetry,
                        run_gpu_telemetry=run_gpu_telemetry,
                        telemetry_interval=telemetry_rate,
                        randomize=randomize_exps)

    run_all_subsystems(info, subsystem_dir, time_str)
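# Illustrative only: a minimal dashboard config.json shape implied by the fields
# read in main() above. The keys come from the code; the paths and values are
# placeholders, not from the original project.
example_dash_config = {
    'tmp_data_dir': '~/dashboard/tmp',       # expanded with os.path.expanduser
    'setup_dir': '~/dashboard/setup',
    'backup_dir': '~/dashboard/backups',
    'randomize': True,                       # optional; defaults to True
    'telemetry_rate': 15,                    # optional; defaults to 15
    'run_cpu_telemetry': False,              # optional; defaults to False
    'run_gpu_telemetry': False               # optional; defaults to False
}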
def _eval(self, gt_fname, rec_fname):
    gt_playlists = read_json(gt_fname)
    gt_dict = {g["id"]: g for g in gt_playlists}
    rec_playlists = read_json(rec_fname)

    gt_ids = set([g["id"] for g in gt_playlists])
    rec_ids = set([r["id"] for r in rec_playlists])
    if gt_ids != rec_ids:
        raise Exception("The playlists in the result do not match the ground truth.")

    rec_song_counts = [len(p["songs"]) for p in rec_playlists]
    rec_tag_counts = [len(p["tags"]) for p in rec_playlists]
    if set(rec_song_counts) != set([100]):
        raise Exception("The number of recommended songs is incorrect.")
    if set(rec_tag_counts) != set([10]):
        raise Exception("The number of recommended tags is incorrect.")

    rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
    rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]
    if set(rec_unique_song_counts) != set([100]):
        raise Exception("Duplicate song recommendations within a playlist are not allowed.")
    if set(rec_unique_tag_counts) != set([10]):
        raise Exception("Duplicate tag recommendations within a playlist are not allowed.")

    music_ndcg = 0.0
    tag_ndcg = 0.0
    for rec in rec_playlists:
        gt = gt_dict[rec["id"]]
        music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
        tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

    music_ndcg = music_ndcg / len(rec_playlists)
    tag_ndcg = tag_ndcg / len(rec_playlists)
    score = music_ndcg * 0.85 + tag_ndcg * 0.15

    return music_ndcg, tag_ndcg, score
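# The _ndcg helper is not shown in this snippet. A minimal sketch of the standard
# nDCG formula such an evaluator typically uses (an assumption, not necessarily the
# original implementation): each ground-truth item found in the recommendation
# contributes 1 / log2(rank + 2) for 0-based rank, normalized by the ideal DCG.
import math

def ndcg_sketch(gt, rec):
    dcg = sum(1.0 / math.log2(i + 2) for i, item in enumerate(rec) if item in gt)
    idcg = sum(1.0 / math.log2(i + 2) for i in range(min(len(gt), len(rec))))
    return dcg / idcg if idcg > 0 else 0.0

# e.g. ndcg_sketch([1, 2, 3], [3, 9, 1]) ~= 0.70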
def summary_valid(exp_summary_dir):
    """
    Checks that the experiment summary directory contains a summary.json file
    and that the summary.json file contains the required fields, title and value.
    """
    exists = check_file_exists(exp_summary_dir, 'summary.json')
    if not exists:
        return False
    summary = read_json(exp_summary_dir, 'summary.json')
    return 'title' in summary and 'value' in summary
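# Illustrative only: the smallest summary.json that summary_valid() accepts.
# It just has to parse as JSON and carry 'title' and 'value'; the values here
# are made up.
import json

with open('summary.json', 'w') as f:
    json.dump({'title': 'Example experiment', 'value': 'mean runtime: 42 ms'}, f)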
async def message(session: CommandSession):
    user_id = session.event['user_id']
    if user_id in config.SUPERUSERS:
        info = common.read_json('animal_crossing/data/pid.json', False)
        if info is not False:
            fo = open('animal_crossing/data/pid.json', "w")
            info['rebot'] = True
            fo.write(json.dumps(info))
            fo.flush()
            fo.close()
def main(data_dir, config_dir, output_dir):
    config, msg = validate(config_dir)
    if config is None:
        write_status(output_dir, False, msg)
        return 1

    # No further analysis is required beyond the raw stats reported by the VTA
    # simulator, so we just propagate the data to the next stage of the pipeline.
    data = read_json(data_dir, 'data.json')
    write_json(output_dir, 'data.json', data)
    write_status(output_dir, True, 'success')
def make_topology():
    entries = common.read_json(OUTPUT_DIR, ENTRIES_FILE_NAME)
    nodes = []
    edges = []
    for entry in entries:
        nodes.append(make_node(entry))
        edges += make_edges(entry, entries)
    common.write_json(nodes, OUTPUT_DIR, NODE_FILE_NAME)
    common.write_json(edges, OUTPUT_DIR, EDGE_FILE_NAME)
def run(args):
    domain = args.domain
    if not domain:
        print('usage: wydomain.py -d aliyun.com')
        sys.exit(1)

    outfile = '{0}.log'.format(domain)
    script_path = os.path.dirname(os.path.abspath(__file__))
    _cache_path = os.path.join(script_path, 'result/{0}'.format(domain))
    if not os.path.exists(_cache_path):
        os.makedirs(_cache_path, 0777)

    # start crt
    print '[*]Starting Crt fetch ...'
    result = Crt(domain=domain).run()
    _cache_file = os.path.join(_cache_path, 'crt.json')
    save_result(_cache_file, result)
    print '\t[-]Fetch complete | Found {}'.format(len(result))

    # start ilink
    print '[*]Starting iLink fetch ...'
    result = Ilink(domain=domain).run()
    _cache_file = os.path.join(_cache_path, 'ilink.json')
    save_result(_cache_file, result)
    print '\t[-]Fetch complete | Found {}'.format(len(result))

    # new start brute
    print '[*]Starting Brute sub ...'
    result = BruteDns(domain=domain).run()
    _cache_file = os.path.join(_cache_path, 'brute.json')
    save_result(_cache_file, result)
    print '\n\t[-]Bruteforce complete | Found {}'.format(len(result))

    # merge all cached results
    _cache_files = ['crt.json', 'ilink.json', 'brute.json']
    subdomains = []
    for file in _cache_files:
        _cache_file = os.path.join(_cache_path, file)
        json_data = read_json(_cache_file)
        if json_data:
            subdomains.extend(json_data)

    subdomains = list(set(subdomains))
    _result_file = os.path.join(script_path, outfile)
    save_result(_result_file, subdomains)
    print '[*]{0} {1} subdomains saved to {2}'.format(domain, len(subdomains), _result_file)
def main(argv):
    args = parse_arguments(argv)

    dirname = os.path.basename(args.input_dir)
    valid_dir = os.path.join(args.output_dir, dirname, 'valid')
    invalid_dir = os.path.join(args.output_dir, dirname, 'invalid')
    os.makedirs(valid_dir, exist_ok=True)
    os.makedirs(invalid_dir, exist_ok=True)
    removed_json = os.path.join(args.output_dir, dirname, '.cache.json')

    names = get_filenames(args.input_dir)
    valid_names = get_filenames(valid_dir)
    invalid_names = get_filenames(invalid_dir)
    removed_names = read_json(removed_json)
    removed_names = [] if removed_names is None else removed_names
    names = sorted(
        set(names) - set(valid_names) - set(invalid_names) - set(removed_names))

    # Instruction
    sys.stdout.write('Key input instructions:\n'
                     'j: Accept current image\n'
                     'k: Reject current image\n'
                     'u: Undo recent validation\n'
                     'd: Exclude image\n'
                     'q: Quit validation\n')

    i = 0
    while i < len(names):
        path = os.path.join(args.input_dir, names[i])
        key = show_image(path, args.size)
        if key == KeyStatus.UNDO and i > 1:
            i -= 1
            if os.path.exists(os.path.join(valid_dir, names[i])):
                os.remove(os.path.join(valid_dir, names[i]))
            elif os.path.exists(os.path.join(invalid_dir, names[i])):
                os.remove(os.path.join(invalid_dir, names[i]))
            else:
                removed_names.pop()
        elif key == KeyStatus.OK:
            shutil.copyfile(path, os.path.join(valid_dir, names[i]))
            i += 1
        elif key == KeyStatus.FAIL:
            shutil.copyfile(path, os.path.join(invalid_dir, names[i]))
            i += 1
        elif key == KeyStatus.REMOVE:
            removed_names.append(names[i])
            write_json(removed_json, removed_names)
            i += 1
        else:
            exit()
def main(config_dir, experiment_mode, model_name, input_idx, params_file,
         out_file, trial_run=False, trial_run_outfile=None):
    if 'DTR_MODEL_NAME' in os.environ:
        model_name = os.environ['DTR_MODEL_NAME']

    config, msg = validate_trials_config(config_dir)
    if config is None:
        print(msg)
        return 1

    use_dtr = (experiment_mode == 'dtr')

    i = int(input_idx)
    is_trial = trial_run == 'True'

    if config['set_seed']:
        torch.manual_seed(config['seed'] + i)
        random.seed(config['seed'] + i)

    cwd = os.getcwd()

    # handle specific params, esp. for DTR
    specific_params = read_json(cwd, params_file)
    if 'DTR_MEMORY_BUDGET' in os.environ:
        specific_params['memory_budget'] = float(os.environ['DTR_MEMORY_BUDGET'])

    assert 'batch_size' in specific_params
    if use_dtr:
        assert 'memory_budget' in specific_params
        if specific_params['memory_budget'] > 0:
            print(f'Setting budget to {int(specific_params["memory_budget"])}')
            torch.set_memory_budget(int(specific_params['memory_budget']))

    if is_trial:
        timing_loop(model_name, i, config, use_dtr, specific_params,
                    None, True, trial_run_outfile)
        return

    with open(out_file, 'a', newline='') as csvfile:
        writer = create_csv_writer(csvfile, specific_params)
        timing_loop(model_name, i, config, use_dtr, specific_params, writer,
                    memory_budget=specific_params.get('memory_budget', -1))
def search(args):
    hits = common.read_json(common.read_url(
        "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&safe=off&q=",
        args))['responseData']['results']
    if hits:
        striphtml = lambda s: re.sub(r'<.+?>', '', re.sub(r'( +|\n)', '', s))
        url = striphtml(hits[0]['unescapedUrl'])
        title = striphtml(hits[0]['titleNoFormatting'])
        content = striphtml(hits[0]['content'])
        result = "{1}: {2} -- {0}".format(url, title, content)
    else:
        result = "No hits."
    return result
def attempt_parse_config(config_dir, target):
    """
    Returns the parsed config for the target (experiment or subsystem)
    if it exists. Returns None if the config is missing or could not be parsed.
    """
    conf_subdir = os.path.join(config_dir, target)
    if not check_file_exists(conf_subdir, 'config.json'):
        return None
    try:
        return read_json(conf_subdir, 'config.json')
    except Exception as e:
        return None
def pp_search(args, url_re):
    """
    Search google for a Profound Programmer page matching the args.
    Return the url.
    """
    searchterms = '{} site:theprofoundprogrammer.com/post/'.format(args)
    hits = common.read_json(common.read_url(
        "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&safe=off&q=",
        searchterms))['responseData']['results']
    if not hits:
        return None
    striphtml = lambda s: re.sub(r'<.+?>', '', re.sub(r' +', '', s))
    return striphtml(hits[0]['url'])
def main():
    print "Processing"
    movies = common.read_json("tunefind.json")

    pool = Pool(5)
    results = [pool.apply_async(process_movie, [m]) for m in movies]

    updated_movies = []
    for w in results:
        w.wait()
        updated_movies.append(w.get())

    common.write_json("musicbrainz.json", updated_movies)
def __init__(self, cascade_path, dst_dir, image_size=200, margin=0.2, max_scale=1.2):
    min_size = int(image_size / (1.0 + margin) / max_scale)
    self._detector = AnimeFaceDetector(cascade_path, min_size=min_size)
    self.dst_dir = dst_dir
    os.makedirs(self.dst_dir, exist_ok=True)
    self._image_size = (image_size, image_size)
    self._margin = margin
    rects = read_json(os.path.join(dst_dir, self._CACHE_FILENAME))
    self._rects = dict() if rects is None else rects
def check_error(experiment_name, model_name, specific_params, path_prefix):
    if not check_file_exists(path_prefix, 'errors.json'):
        return False
    logged_errors = read_json(path_prefix, 'errors.json')
    if experiment_name not in logged_errors:
        return False
    if model_name not in logged_errors[experiment_name]:
        return False
    errors = logged_errors[experiment_name][model_name]
    check_func = lambda err: lambda kv: err.get(kv[0]) == kv[1]
    if specific_params.get('kind') == 'ratio':
        check_func = lambda err: lambda kv: (err.get(kv[0]) == kv[1]
                                             if kv[0] != 'memory_budget' else True)
    return any(map(lambda err: all(map(check_func(err), specific_params.items())), errors))
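# Standalone illustration of the matching rule in check_error, using hypothetical
# data: for 'ratio' experiments every field of specific_params must equal the
# corresponding field of a logged error, except 'memory_budget', which is ignored.
logged = {'kind': 'ratio', 'batch_size': 32, 'memory_budget': 4e9}
params = {'kind': 'ratio', 'batch_size': 32, 'memory_budget': 8e9}
matches = lambda kv: logged.get(kv[0]) == kv[1] if kv[0] != 'memory_budget' else True
assert all(map(matches, params.items()))  # matches despite different budgets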
def _check_stage_status(target_status_dir, stage_name):
    filename = '{}.json'.format(stage_name)
    if not check_file_exists(target_status_dir, filename):
        return {
            'success': False,
            'message': '{} stage status missing'.format(stage_name)
        }
    try:
        return read_json(target_status_dir, filename)
    except:
        return {
            'success': False,
            'message': 'Failed to parse {} stage status'.format(stage_name)
        }
def delete_duplication():
    entries = common.read_json(OUTPUT_DIR, ENTRIES_FILE_NAME)
    fixed_entries = []
    titles = []
    index = 1
    for entry in entries:
        title = entry['title']
        if title in titles:
            continue
        fixed = entry
        fixed['id'] = index
        fixed_entries.append(fixed)
        titles.append(title)
        index += 1
    common.write_json(fixed_entries, OUTPUT_DIR, 'entries_fixed.json')
def log_error(experiment_name, model_name, specific_params, inp, err_msg, path_prefix):
    err_info = {'input': inp, 'msg': err_msg}

    logged_errors = {}
    if check_file_exists(path_prefix, 'errors.json'):
        logged_errors = read_json(path_prefix, 'errors.json')
    if experiment_name not in logged_errors:
        logged_errors[experiment_name] = {}
    if model_name not in logged_errors[experiment_name]:
        logged_errors[experiment_name][model_name] = []
    logged_errors[experiment_name][model_name].append({
        'err_info': err_info,
        **specific_params
    })

    write_json(path_prefix, 'errors.json', logged_errors)
def load_from_web():
    print "Loading from Web"
    movies = common.read_json(JSON_IN_FILE)

    pool = Pool(5)
    worker = [pool.apply_async(process_movie, [m]) for m in movies]

    imdb_movies = []
    for w in worker:
        w.wait()
        result = w.get()
        if result is not None:
            imdb_movies.append(result)

    common.write_json(JSON_OUT_FILE, imdb_movies)
def convert_to_rdf():
    print ""
    print "Convert to RDF..."
    charts = common.read_json(JSON_OUT_FILE)
    g = Graph()
    g.bind("", NS_CHARTS)
    g.bind("dbpedia-owl", NS_DBPEDIA_OWL)
    g.bind("dbpprop", NS_DBPPROP)
    for c in charts:
        if c["date"] < "2005-01-01T00:00:00":
            continue
        chart = URIRef(BASE_URI % common.encodeString(
            datetime.strptime(c["date"], "%Y-%m-%dT%H:%M:%S").strftime("%Y-%m-%d")))
        g.add((chart, RDF.type, NS_CHARTS.Chart))
        g.add((chart, NS_DBPEDIA_OWL.publicationDate, Literal(c["date"] + "Z", datatype=XSD.dateTime)))
        for t in c["tracks"]:
            artist = URIRef(BASE_URI % common.encodeString(t["artist"]))
            g.add((artist, RDF.type, NS_DBPEDIA_OWL.MusicalArtist))
            g.add((artist, RDFS.label, Literal(t["artist"])))
            g.add((artist, NS_DBPPROP.name, Literal(t["artist"])))
            song = URIRef(BASE_URI % common.encodeString(
                u"{0:s} - {1:s}".format(t['artist'], t["title"])))
            g.add((song, RDF.type, NS_DBPEDIA_OWL.Song))
            g.add((song, RDFS.label, Literal(u"{0:s} - {1:s}".format(t['artist'], t["title"]))))
            g.add((song, NS_DBPPROP.title, Literal(t["title"])))
            g.add((song, NS_DBPEDIA_OWL.artist, artist))
            ranked = BNode()
            g.add((ranked, RDF.type, NS_CHARTS.RankedSong))
            g.add((ranked, NS_CHARTS.song, song))
            g.add((ranked, NS_CHARTS.position, Literal(t["pos"], datatype=XSD.integer)))
            g.add((ranked, RDFS.label, Literal(u"{0:s}: {1:s} - {2:s}".format(
                t["pos"], t['artist'], t["title"]))))
            g.add((chart, NS_CHARTS.rankedSong, ranked))
    common.write_rdf(RDF_OUT_FILE, g)
def convert_to_rdf():
    """ Converts the read data to triples """
    print ""
    print "Convert to RDF..."
    charts = common.read_json(JSON_OUT_FILE)
    g = Graph()
    g.bind("", NS_CHARTS)
    g.bind("dbpedia-owl", NS_DBPEDIA_OWL)
    g.bind("dbpprop", NS_DBPPROP)
    for c in charts:
        if c["date"] < CONVERT_FROM_DATE:
            continue
        chart = URIRef(BASE_URI % common.encodeString(
            datetime.strptime(c["date"], "%Y-%m-%dT%H:%M:%S").strftime("%Y-%m-%d")))
        g.add((chart, RDF.type, NS_CHARTS.Chart))
        g.add((chart, NS_DBPEDIA_OWL.publicationDate, Literal(c["date"] + "Z", datatype=XSD.dateTime)))
        for t in c["tracks"]:
            artist = URIRef(BASE_URI % common.encodeString(t["artist"]))
            g.add((artist, RDF.type, NS_DBPEDIA_OWL.MusicalArtist))
            g.add((artist, RDFS.label, Literal(t["artist"])))
            g.add((artist, NS_DBPPROP.name, Literal(t["artist"])))
            song = URIRef(BASE_URI % common.encodeString(u"{0:s} - {1:s}".format(t['artist'], t["title"])))
            g.add((song, RDF.type, NS_DBPEDIA_OWL.Song))
            g.add((song, RDFS.label, Literal(u"{0:s} - {1:s}".format(t['artist'], t["title"]))))
            g.add((song, NS_DBPPROP.title, Literal(t["title"])))
            g.add((song, NS_DBPEDIA_OWL.artist, artist))
            ranked = BNode()
            g.add((ranked, RDF.type, NS_CHARTS.RankedSong))
            g.add((ranked, NS_CHARTS.song, song))
            g.add((ranked, NS_CHARTS.position, Literal(t["pos"], datatype=XSD.integer)))
            g.add((ranked, RDFS.label, Literal(u"{0:s}: {1:s} - {2:s}".format(t["pos"], t['artist'], t["title"]))))
            g.add((chart, NS_CHARTS.rankedSong, ranked))
    common.write_rdf(RDF_OUT_FILE, g)
def read(self, var):
    if var == "room":
        self.room = common.read_json("animal_crossing/data/room.json", {})
    elif var == "member":
        self.member = common.read_json("animal_crossing/data/member.json", {})
    elif var == "ban":
        self.ban = common.read_json("animal_crossing/data/ban.json", {})
    elif var == "count":
        self.count = common.read_json("animal_crossing/data/count.json", {"count": 0})
    elif var == "queue":
        self.queue = common.read_json("animal_crossing/data/queue.json", {})
    elif var == "group_member":
        self.group_member = common.read_json("animal_crossing/data/group_member.json", {})
def load_from_web():
    print "Loading from Web..."
    network = pylast.LastFMNetwork(api_key=API_KEY, api_secret=API_SECRET)
    movies = common.read_json(JSON_IN_FILE)

    song_chunks = []
    for m in movies:
        if len(m["soundtrack"]) > 0:
            song_chunks.append(m["soundtrack"])

    pool = Pool(5)
    worker = [pool.apply_async(process_songs, [chunk, network]) for chunk in song_chunks]

    lastfm_songs = []
    for w in worker:
        w.wait()
        for s in w.get():
            lastfm_songs.append(s)

    common.write_json(JSON_OUT_FILE, lastfm_songs)
def convert_to_rdf():
    """ Converts the read data to triples """
    print ""
    print "Convert to RDF..."
    movies = common.read_json(JSON_OUT_FILE)
    g = Graph()
    g.bind("", NS_TUNEFIND)
    g.bind("dbpedia-owl", NS_DBPEDIA_OWL)
    g.bind("dbpprop", NS_DBPPROP)
    for m in movies:
        movie = URIRef(BASE_URI % common.encodeString(m["title"]))
        g.add((movie, RDF.type, NS_DBPEDIA_OWL.Film))
        g.add((movie, RDFS.label, Literal(m["title"])))
        g.add((movie, NS_DBPPROP.title, Literal(m["title"])))
        for s in m["soundtrack"]:
            artist = URIRef(BASE_URI % common.encodeString(s["artist"]))
            g.add((artist, RDF.type, NS_DBPEDIA_OWL.MusicalArtist))
            g.add((artist, RDFS.label, Literal(s["artist"])))
            g.add((artist, NS_DBPPROP.name, Literal(s["artist"])))
            song = URIRef(BASE_URI % common.encodeString(u"{0:s} - {1:s}".format(s['artist'], s["title"])))
            g.add((song, RDF.type, NS_DBPEDIA_OWL.Song))
            g.add((song, RDFS.label, Literal(u"{0:s} - {1:s}".format(s['artist'], s["title"]))))
            g.add((song, NS_DBPPROP.title, Literal(s["title"])))
            g.add((song, NS_DBPEDIA_OWL.artist, artist))
            g.add((movie, NS_TUNEFIND.contains, song))
    common.write_rdf(RDF_OUT_FILE, g)
def read_tasklist(path):
    data = read_json(path)
    if data is None:
        return set(), [], 1
    num = max([t['num'] for t in data['tasks']]) + 1
    return set(data['tags']), data['tasks'], num
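# Illustrative only: the tasklist file shape implied by read_tasklist. The
# top-level 'tags' and 'tasks' keys and the per-task 'num' field come from the
# code; the other per-task fields are invented for the example.
example_tasklist = {
    'tags': ['home', 'work'],
    'tasks': [
        {'num': 1, 'text': 'first task', 'tags': ['home']},
        {'num': 2, 'text': 'second task', 'tags': ['work']}
    ]
}
# read_tasklist would return ({'home', 'work'}, example_tasklist['tasks'], 3)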
def convert_to_rdf():
    """ Converts the read data to triples """
    print ""
    print "Convert to RDF..."
    movies = common.read_json(JSON_OUT_FILE)
    g = Graph()
    g.bind("", NS_IMDB)
    g.bind("dbpedia-owl", NS_DBPEDIA_OWL)
    g.bind("dbpprop", NS_DBPPROP)
    for m in movies:
        if not release_filter(m):
            continue
        movie = URIRef(BASE_URI % common.encodeString(m["title"]))
        g.add((movie, RDF.type, NS_DBPEDIA_OWL.Film))
        g.add((movie, RDFS.label, Literal(m["title"])))
        g.add((movie, NS_DBPPROP.title, Literal(m["title"])))
        g.add((movie, NS_DBPEDIA_OWL.imdbId, Literal(m["imdbID"])))
        if "directors" in m:
            for name in m["directors"]:
                director = URIRef(BASE_URI % common.encodeString(name))
                g.add((director, RDF.type, NS_DBPEDIA_OWL.Person))
                g.add((director, RDFS.label, Literal(name)))
                g.add((director, NS_DBPPROP.name, Literal(name)))
                g.add((movie, NS_DBPEDIA_OWL.director, director))
        if "cast" in m:
            for cast in m["cast"][:CONVERT_MAX_CAST]:
                if cast["screen_name"] == "":
                    continue
                actor = URIRef(BASE_URI % common.encodeString(cast["name"]))
                g.add((actor, RDF.type, NS_DBPEDIA_OWL.Actor))
                g.add((actor, RDFS.label, Literal(cast["name"])))
                g.add((actor, NS_DBPPROP.name, Literal(cast["name"])))
                character = BNode()
                g.add((character, RDF.type, NS_IMDB.Character))
                g.add((character, RDFS.label, Literal(cast["screen_name"])))
                g.add((character, NS_IMDB.actedBy, actor))
                g.add((character, NS_IMDB.screenName, Literal(cast["screen_name"])))
                g.add((movie, NS_IMDB.cast, character))
        if "release_info" in m:
            for info in m["release_info"]:
                if "date" not in info:
                    continue
                if info["country"] not in CONVERT_RELEASE_COUNTRY:
                    continue
                release = BNode()
                g.add((release, RDF.type, NS_IMDB.ReleaseCountry))
                g.add((release, RDFS.label, Literal(
                    info["country"] if info["event"] == ""
                    else info["country"] + " - " + info["event"])))
                g.add((release, NS_DBPEDIA_OWL.publicationDate,
                       Literal(info["date"] + "Z", datatype=XSD.dateTime)))
                g.add((release, NS_DBPEDIA_OWL.comment, Literal(info["event"])))
                g.add((release, NS_DBPEDIA_OWL.country,
                       URIRef("http://dbpedia.org/resource/%s" % common.encodeString(info["country"]))))
                g.add((movie, NS_IMDB.releasedIn, release))
    common.write_rdf(RDF_OUT_FILE, g)
def load_data(self, dfile):
    try:
        self.data = common.read_json(dfile)
    except Exception as e:
        raise e
def run(args):
    domain = args.domain
    outfile = args.domain + '_wy.txt'
    if not domain:
        print('usage: wydomain.py -d aliyun.com')
        sys.exit(1)

    # init _cache_path
    script_path = os.path.dirname(os.path.abspath(__file__))
    _cache_path = os.path.join(script_path, 'result/{0}'.format(domain))
    if not os.path.exists(_cache_path):
        os.makedirs(_cache_path, 0777)

    # alexa result json file
    logging.info("starting alexa fetcher...")
    _cache_file = os.path.join(_cache_path, 'alexa.json')
    result = Alexa(domain=domain).run()
    save_result(_cache_file, result)
    logging.info("alexa fetcher subdomains({0}) successfully...".format(len(result)))

    # threatminer result json file
    logging.info("starting threatminer fetcher...")
    _cache_file = os.path.join(_cache_path, 'threatminer.json')
    result = Threatminer(domain=domain).run()
    save_result(_cache_file, result)
    logging.info("threatminer fetcher subdomains({0}) successfully...".format(len(result)))

    # threatcrowd result json file
    logging.info("starting threatcrowd fetcher...")
    _cache_file = os.path.join(_cache_path, 'threatcrowd.json')
    result = Threatcrowd(domain=domain).run()
    save_result(_cache_file, result)
    logging.info("threatcrowd fetcher subdomains({0}) successfully...".format(len(result)))

    # sitedossier result json file
    logging.info("starting sitedossier fetcher...")
    _cache_file = os.path.join(_cache_path, 'sitedossier.json')
    result = Sitedossier(domain=domain).run()
    save_result(_cache_file, result)
    logging.info("sitedossier fetcher subdomains({0}) successfully...".format(len(result)))

    # netcraft result json file
    logging.info("starting netcraft fetcher...")
    _cache_file = os.path.join(_cache_path, 'netcraft.json')
    result = Netcraft(domain=domain).run()
    save_result(_cache_file, result)
    logging.info("netcraft fetcher subdomains({0}) successfully...".format(len(result)))

    # ilinks result json file
    logging.info("starting ilinks fetcher...")
    _cache_file = os.path.join(_cache_path, 'ilinks.json')
    result = Ilinks(domain=domain).run()
    save_result(_cache_file, result)
    logging.info("ilinks fetcher subdomains({0}) successfully...".format(len(result)))

    # chaxunla result json file
    logging.info("starting chaxunla fetcher...")
    _cache_file = os.path.join(_cache_path, 'chaxunla.json')
    result = Chaxunla(domain=domain).run()
    save_result(_cache_file, result)
    logging.info("chaxunla fetcher subdomains({0}) successfully...".format(len(result)))

    # google TransparencyReport result json file
    logging.info("starting google TransparencyReport fetcher...")
    result = TransparencyReport(domain=domain).run()
    _cache_file = os.path.join(_cache_path, 'googlect_subject.json')
    save_result(_cache_file, result.get('subjects'))
    _cache_file = os.path.join(_cache_path, 'googlect_dnsnames.json')
    save_result(_cache_file, result.get('dns_names'))
    logging.info("google TransparencyReport fetcher subdomains({0}) successfully...".format(
        len(result.get('dns_names'))))

    # Collection API Subdomains
    sub_files = [
        'alexa.json',
        'chaxunla.json',
        'ilinks.json',
        'netcraft.json',
        'sitedossier.json',
        'threatcrowd.json',
        'threatminer.json']

    # process all cache files
    subdomains = []
    for file in sub_files:
        _cache_file = os.path.join(_cache_path, file)
        json_data = read_json(_cache_file)
        if json_data:
            subdomains.extend(json_data)

    # process openssl x509 dns_names
    _cache_file = os.path.join(_cache_path, 'googlect_dnsnames.json')
    json_data = read_json(_cache_file)
    for sub in json_data:
        if sub.endswith(domain):
            subdomains.append(sub)

    # collection brute force subdomains
    _burte_file = os.path.join(_cache_path, 'dnsburte.json')
    if FileUtils.exists(_burte_file):
        json_data = read_json(_burte_file)
        if json_data:
            subdomains.extend(json_data)

    # save all subdomains to outfile
    subdomains = list(set(subdomains))
    _result_file = os.path.join(script_path, outfile)
    save_result(_result_file, subdomains)
    logging.info("{0} {1} subdomains saved to {2}".format(
        domain, len(subdomains), _result_file))
def initialize(self):
    self.wait_for_message = False
    self.active_threads = []
    self.settings = read_json(read_file_or_die('config/general.json'))
# encoding: utf-8
import os

from tools.skynet import SkynetDomain
from utils.fileutils import FileUtils
from common import read_json

# init path
script_path = os.path.dirname(os.path.abspath(__file__))
result_file = os.path.join(script_path, 'domains.log')

# upload subdomains dict to skynet.
_subdomains = read_json(result_file)
skynet = SkynetDomain()
skynet.bulk_sync(_subdomains)