def load_details(conf, proc_details):
    '''
    Loads the JSON specification for the configuration and proc_details
    parameters.
    '''
    DB_CONFIG.update(load_json_file(conf))
    PROC_DETAILS.extend(load_json_file(proc_details))
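# Every snippet in this collection assumes a small load_json_file helper.
# A minimal sketch of such a helper follows; it is an illustration, not the
# actual implementation from any of these projects. Some snippets use
# project-specific variants, e.g. a (directory, filename) pair or an extra
# default-value argument for missing files.
import json

def load_json_file(path):
    """Read a JSON file and return the parsed object."""
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)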
def get(city=None, taxi_id=None):
    if city and taxi_id:
        filedata = utils.load_json_file('taxis.json')["data"]
        results = [taxi for taxi in filedata
                   if taxi['city'] == city and taxi['name'] == taxi_id]
        number_results = len(results)
        response = {
            "meta": {
                "count": number_results,
                "links": {
                    "self": "https://mock-travel-apis.herokuapp.com/taxis/" + city + "/" + taxi_id
                },
            },
            "data": results
        }
        return json.dumps(response)
    elif city:
        filedata = utils.load_json_file('taxis.json')["data"]
        results = [taxi for taxi in filedata if taxi['city'] == city]
        number_results = len(results)
        response = {
            "meta": {
                "count": number_results,
                "links": {
                    "self": "https://mock-travel-apis.herokuapp.com/taxis/" + city
                },
            },
            "data": results
        }
        return json.dumps(response)
    else:
        return utils.load_json_file('taxis.json')
def kegg_map_coloring(name_prefix, color_rev=False):
    deg_json = {}
    map2ko = {}
    ko2color = {}
    # make sub folder for map images
    if not os.path.exists(name_prefix + '_kegg_maps'):
        os.makedirs(name_prefix + '_kegg_maps')
    try:
        deg_json = utils.load_json_file(name_prefix + '_represent_gene.cache')
    except FileNotFoundError:
        try:
            deg_json = utils.load_json_file(name_prefix + '_represent_isoform.cache')
        except FileNotFoundError:
            logger.warning('represent_gene or _isoform cache not found.')
    if deg_json:
        # build map2ko dict
        for deg in deg_json.values():
            if deg.get('keggko'):
                # workaround: normalize a single entry to a list
                if type(deg['keggko']) is not list:
                    deg['keggko'] = [deg['keggko']]
                for keggko in deg['keggko']:
                    ko = keggko.split(':')[1]
                    for kmap in deg['keggmap']:
                        if kmap not in map2ko:
                            map2ko[kmap] = [ko]
                        elif ko not in map2ko[kmap]:
                            map2ko[kmap].append(ko)
        # build ko2color dict, keeping the entry with the most hits per KO
        for deg in deg_json.values():
            if deg.get('keggko'):
                for keggko in deg['keggko']:
                    ko = keggko.split(':')[1]
                    if ko not in ko2color:
                        ko2color[ko] = [deg['hits'], deg['logFC']]
                    elif deg['hits'] > ko2color[ko][0]:
                        ko2color[ko] = [deg['hits'], deg['logFC']]
        for ko, fcs in ko2color.items():
            fc = -fcs[1] if color_rev else fcs[1]
            ko2color[ko] = de_color_mapping(fc)
        # build kegg map request string
        for kmap, kos in map2ko.items():
            req = '' + kmap
            for ko in kos:
                req += '/' + ko + '%09' + ko2color[ko]
            kegg_weblink_pathway(name_prefix + '_kegg_maps', req)
    return True
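# de_color_mapping and kegg_weblink_pathway above are project helpers not
# shown here. A plausible sketch of the color mapping, assuming up-regulated
# KOs are drawn red and down-regulated ones blue on the KEGG pathway maps:
def de_color_mapping(fc):
    """Map a log fold-change to a background color for KEGG map coloring."""
    if fc >= 1:
        return 'red'
    if fc <= -1:
        return 'blue'
    return 'yellow'  # weakly regulated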
def test_handler():
    lex_handler = LexBotHandler()
    test_context = None
    test_event = load_json_file(os.path.join(TEST_DATA_DIR, 'test_med_time.json'))

    # test registering new intent handlers
    expected_response = '1234'
    lex_handler.register_intent('MedicationTime', lambda event: expected_response)
    assert lex_handler.handle_lambda(test_event, test_context) == expected_response

    # test when intent is not supported, gracefully respond to user
    not_supported_test_event = load_json_file(os.path.join(TEST_DATA_DIR, 'not_supported_intent.json'))
    response = lex_handler.handle_lambda(not_supported_test_event, test_context)
    assert 'Sorry' in response['dialogAction']['message']['content']
def run(load=False):
    """
    Entry point. This function could be called by external services.

    Connects to veganistan.se, fetches all important data and dumps it into a
    date-named JSON file. Returns the filename of the created file.
    """
    created_file = None
    if load:
        data = load_json_file("json", "20140725_0940.json")
        entry_manager = EntryManager(data_dict=data)
    else:
        # start scraping the base data for all entries.
        entry_manager = scrape_base_info()
        created_file = serialize_and_save(
            entries=entry_manager.get_entries(),
            filename='json/%s.json' % datetime.now().strftime("%Y%m%d_%H%M")
        )
    for entry in entry_manager.get_entries():
        scrape_detail(entry)
    if created_file:
        return serialize_and_save(entry_manager.get_entries(), created_file)
    return None
def fastp_parser(task):
    fastp_json_path = task.path.joinpath(task.id, 'reads', 'fastp.json')
    fastp_dict = utils.load_json_file(fastp_json_path)
    before = fastp_dict['summary']['before_filtering']
    after = fastp_dict['summary']['after_filtering']
    fastp_abs = {
        'before_total_reads': before['total_reads'],
        'before_total_bases': before['total_bases'],
        'before_total_q30': before['q30_rate'],
        'before_r1_length': before['read1_mean_length'],
        'before_r2_length': before['read2_mean_length'],
        'after_total_reads': after['total_reads'],
        'after_total_bases': after['total_bases'],
        'after_total_q30': after['q30_rate'],
        'after_r1_length': after['read1_mean_length'],
        'after_r2_length': after['read2_mean_length'],
        'duplication_rate': fastp_dict['duplication']['rate'],
    }
    return fastp_abs
def main():
    utils.create_csv_file_if_necessary(OUTPUT_FILE_PATH,
                                       BREAKING_LIBRARY_VERSIONS_FIELD_NAMES)
    print("Finding files to parse that match {} in {}".format(
        FILE_NAME_SEARCH_STRING, SEARCH_DIR_PATH))
    files_to_parse = utils.get_list_of_unread_files(SEARCH_DIR_PATH,
                                                    FILE_NAME_SEARCH_STRING)
    print("Found {} files".format(len(files_to_parse)))
    count = 0
    for ftp in files_to_parse:
        try:
            count += 1
            print("{}: Parsing + writing {}".format(count, ftp))
            file_contents = utils.load_json_file(ftp)
            lines_to_write = list()
            package_name = file_contents['name']
            for v in file_contents['versions']:
                lines_to_write.append({
                    'package_name': package_name,
                    'version': v['number'],
                    'version_published_at': v['published_at'],
                })
            utils.write_lines_to_existing_csv(
                OUTPUT_FILE_PATH, BREAKING_LIBRARY_VERSIONS_FIELD_NAMES,
                lines_to_write)
            utils.mark_file_as_read(ftp)
        except Exception as e:
            print("[ERROR] {} on file {}. Continuing from next file.".format(e, ftp))
    print("DONE")
def main():
    utils.create_csv_file_if_necessary(OUTPUT_FILE_PATH, COMMENTS_FIELD_NAMES)
    print("Finding files to parse that match {} in {}".format(
        FILE_NAME_SEARCH_STRING, SEARCH_DIR_PATH))
    files_to_parse = utils.get_list_of_unread_files(SEARCH_DIR_PATH,
                                                    FILE_NAME_SEARCH_STRING)
    total = len(files_to_parse)
    print("Found {} files".format(total))
    count = 0
    for ftp in files_to_parse:
        try:
            count += 1
            print("{}/{}: Parsing + writing {}".format(count, total, ftp))
            comments = utils.load_json_file(ftp)
            issue_id, repo_name = parse_issue_id_and_repo_name_from_file_name(ftp)
            lines_to_write = list()
            for c in comments:
                lines_to_write.append({
                    'id': c['id'],
                    'issue_id': issue_id,
                    'repo_name': repo_name,
                    'url': c['url'],
                    'issue_url': c['issue_url'],
                    'user_id': c['user']['id'],
                    'user_login': c['user']['login'],
                    'user_type': c['user']['type'],
                    'created_at': c['created_at'],
                    'updated_at': c['updated_at'],
                    'body': c['body'],
                })
            utils.write_lines_to_existing_csv(OUTPUT_FILE_PATH,
                                              COMMENTS_FIELD_NAMES,
                                              lines_to_write)
            utils.mark_file_as_read(ftp)
        except Exception as e:
            print("[ERROR] {} on file {}. Continuing from next file.".format(e, ftp))
    print("DONE")
def verify_and_test_db(main_file, host_id, run_params):
    # Parse main query file.
    parse_data = load_json_file(main_file)
    query_data = load_json_file('fi-framework/' + parse_data['query_file'])
    queries = query_data['queries']
    db_type = parse_data['db_type']
    db_init = parse_data['db_meta']
    localhost = '127.0.0.1'
    run_type = run_params['type']
    db_session = None

    if db_type == 'cassandra' and run_type in ['verify', 'test', 'query']:
        # Do not always delete the keyspace when the reuse_keyspace param is
        # set to False. For example, when querying you probably do not want
        # to remove the data each time.
        force_reuse_keyspace = False
        if parse_data.get('insert_data') and run_type != 'verify':
            force_reuse_keyspace = True
        db_session = create_dbsession_from_type(
            db_type, db_init, host=localhost,
            force_reuse_keyspace=force_reuse_keyspace)

    if run_type == 'verify':
        # Initialize and fill the verification database.
        _insert_and_verify_cmd(parse_data, db_session, queries)
        db_session.shutdown()
    elif run_type == 'test':
        # Run the test queries and verification queries.
        _test_cmd(db_session, queries, run_params, db_type)
        db_session.shutdown()
    elif run_type == 'retrieve_targets':
        # Retrieve DBMS target files.
        password = parse_data['server_meta'].get('password', '')
        if not isinstance(password, str):
            # a per-host list of passwords was given
            password = password[host_id]
        _retrieve_cmd(run_params, parse_data, password, db_type)
    elif run_type == 'clear_verification_db':
        # Clear the verification DBMS.
        print("Deleting verification db.")
        verification_db = SQLiteDB()
        verification_db.drop_table()
    elif run_type == 'restore':
        # Restore the current db_data directory with the backup tar.
        _restore_cmd(run_params)
    elif run_type == 'query':
        # Query the DBSession.
        _query_cmd(db_session, run_params)
        db_session.shutdown()
    else:
        print("Unknown command given: {}".format(run_type))
def main():
    utils.create_csv_file_if_necessary(OUTPUT_FILE_PATH, OUTPUT_FIELD_NAMES)
    print("Finding files to parse that match {} in {}".format(
        FILE_NAME_SEARCH_STRING, SEARCH_DIR_PATH))
    files_to_parse = utils.get_list_of_unread_files(SEARCH_DIR_PATH,
                                                    FILE_NAME_SEARCH_STRING)
    total = len(files_to_parse)
    print("Found {} files".format(total))
    count = 0
    for ftp in files_to_parse:
        try:
            count += 1
            print("{}/{}: Parsing + writing {}".format(count, total, ftp))
            commit = utils.load_json_file(ftp)
            if 'message' in commit and 'No commit found for SHA' in commit['message']:
                print('No commit found...continuing')
                continue
            commit_sha, issue_id, repo_name = parse_artifacts_from_file_name(ftp)
            message = commit['commit']['message']
            url = commit['url']
            html_url = commit['html_url']
            author_login = commit['author']['login'] if commit['author'] is not None else ''
            author_type = commit['author']['type'] if commit['author'] is not None else ''
            committer_login = commit['committer']['login'] if commit['committer'] is not None else ''
            committer_type = commit['committer']['type'] if commit['committer'] is not None else ''
            stats_total = commit['stats']['total']
            stats_additions = commit['stats']['additions']
            stats_deletions = commit['stats']['deletions']

            def make_new_commit_line(f):
                return {
                    'commit_sha': commit_sha,
                    'issue_id': issue_id,
                    'repo_name': repo_name,
                    'url': url,
                    'html_url': html_url,
                    'message': message,
                    'author_login': author_login,
                    'author_type': author_type,
                    'committer_login': committer_login,
                    'committer_type': committer_type,
                    'stats_total': stats_total,
                    'stats_additions': stats_additions,
                    'stats_deletions': stats_deletions,
                    'file_name': f['filename'],
                    'file_status': f['status'],
                    'file_additions': f['additions'],
                    'file_deletions': f['deletions'],
                    'file_changes': f['changes'],
                    'file_patch': f.get('patch'),
                }

            lines_to_write = list()
            for changed_file in commit['files']:
                lines_to_write.append(make_new_commit_line(changed_file))
            utils.write_lines_to_existing_csv(OUTPUT_FILE_PATH,
                                              OUTPUT_FIELD_NAMES,
                                              lines_to_write)
            utils.mark_file_as_read(ftp)
        except Exception as e:
            print("[ERROR] {} on file {}. Continuing from next file.".format(e, ftp))
    print("DONE")
def inspect_data(path):
    sample = utils.load_json_file(path)
    key_fq = Counter([i["label"] for i in sample])
    sorted_key_fq = sorted(key_fq.items(), key=lambda x: x[1], reverse=True)
    log_obj.info("total samples: " + str(len(sample)))
    log_obj.info("total classes: " + str(len(key_fq)))
    log_obj.info(key_fq)
    log_obj.info(sorted_key_fq)
    return sorted_key_fq
def process(path, train_to_path, dev_to_path, extend_sample_map):
    """
    :param extend_sample_map: oversampling ratio per label
    """
    # split into training and validation sets
    model0_sample = utils.load_json_file(path)
    sample_dic = {}
    for i in model0_sample:
        if i["label"] in sample_dic:
            sample_dic[i["label"]].append(i)
        else:
            sample_dic[i["label"]] = [i]
    train_examples = []
    test_examples = []
    for l, lis in sample_dic.items():
        a, b = train_test_split(lis, test_size=properties.test_dev_size)
        train_examples.extend(a)
        test_examples.extend(b)
    print("train vs dev =", len(train_examples), len(test_examples))

    # resample the training set
    train_dic = {}
    for i in train_examples:
        if i["label"] in train_dic:
            train_dic[i["label"]].append(i)
        else:
            train_dic[i["label"]] = [i]
    for k, v in train_dic.items():
        print(k, len(v))
    for label, ratio in extend_sample_map.items():
        if ratio <= 1:
            # downsample: keep a random fraction of the examples
            tmp_lis = copy.deepcopy(train_dic[label])
            tmp_lis = shuffle_list(tmp_lis)
            train_dic[label] = tmp_lis[:int(len(tmp_lis) * ratio)]
        else:
            # oversample: duplicate the examples up to the requested ratio
            tmp_lis = copy.deepcopy(train_dic[label])
            tmp_lis = shuffle_list(tmp_lis)
            for j in range(math.ceil(ratio)):
                train_dic[label].extend(tmp_lis)
            train_dic[label] = train_dic[label][:int(len(tmp_lis) * ratio)]
    print("after resampling")
    for k, v in train_dic.items():
        print(k, len(v))

    train_examples = []
    for l, lis in train_dic.items():
        train_examples.extend(lis)
    train_examples = shuffle_list(train_examples)
    test_examples = shuffle_list(test_examples)
    log_obj.info("split train : dev = %s : %s" % (len(train_examples), len(test_examples)))
    utils.dump_json_file(train_to_path, train_examples)
    utils.dump_json_file(dev_to_path, test_examples)
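# shuffle_list above is assumed to be a tiny helper along these lines
# (hypothetical; the project may implement it differently). It must return
# the list, since callers reassign the result.
import random

def shuffle_list(items):
    """Shuffle the list in place and return it."""
    random.shuffle(items)
    return items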
def test_authors_list(self):
    """
    Test if the list of top authors is returned correctly or not.
    """
    authors = git.Authors(self.git_index, self.start, self.end)
    authors_list = authors.aggregations()
    authors_test = load_json_file(TOP_AUTHORS)
    assert_array_equal(authors_list['keys'], authors_test['keys'])
    assert_array_equal(authors_list['values'], authors_test['values'])
def test_get_list(self):
    """
    Test multi-valued aggregations.
    """
    self.Query_test_object.until(end=self.end)
    self.Query_test_object.get_terms("author_name")
    authors = self.Query_test_object.get_list()
    authors_test = load_json_file(AUTHORS_LIST)
    self.assertDictEqual(authors, authors_test)
def test_organization_list(self):
    """
    Test if the list of top organizations is returned correctly or not.
    """
    orgs = git.Organizations(self.git_index, self.start, self.end)
    orgs_list = orgs.aggregations()
    orgs_test = load_json_file(TOP_ORGANIZATIONS)
    assert_array_equal(orgs_list['keys'], orgs_test['keys'])
    assert_array_equal(orgs_list['values'], orgs_test['values'])
def test_update_dataset(self):
    data = load_json_file('basic_dataset.json')
    status, dataset = metax.create_dataset(data)
    self.assertIn(status, self.OK, "could not create dataset")
    # data = load_json_file('metax_dataset.json')
    dataset['research_dataset']['title']['en'] = 'title updated'
    status, updated_data = metax.update_dataset(dataset['id'], dataset)
    self.assertIn(status, self.OK, "Metax update failure")
    urn = updated_data["identifier"]
    etsin_status, etsin_data = etsin.view_dataset(urn)
    self.assertIn(etsin_status, self.OK, "Etsin failure")
def test_create_dataset(self):
    # loading the example dataset
    data = load_json_file('basic_dataset.json')
    status, cdata = metax.create_dataset(data)
    self.assertIn(status, self.OK, "could not create dataset")
    urn = cdata["identifier"]
    time.sleep(10)
    etsin_status, etsin_data = etsin.view_dataset(urn)
    self.assertIn(etsin_status, self.OK, "Etsin could not find the dataset")
def test_delete_dataset(self):
    data = load_json_file('basic_dataset.json')
    status, cdata = metax.create_dataset(data)
    self.assertIn(status, self.OK, "could not create dataset")
    urn = cdata["identifier"]
    time.sleep(2)
    status = metax.delete_dataset(cdata['id'])
    self.assertIn(status, self.OK, "Metax dataset delete failure")
    etsin_status, etsin_data = etsin.view_dataset(urn)
    self.assertIn(etsin_status, self.FAIL, "Etsin found the deleted dataset")
def test_fetch_aggregation_results(self):
    """
    Test the fetched aggregation data
    """
    self.Query_test_object.until(end=self.end)\
                          .get_cardinality(self.field1)\
                          .by_authors(field=self.field2)
    response = self.Query_test_object.fetch_aggregation_results()
    aggregations = {"aggregations": response['aggregations']}
    actual_response = load_json_file(FETCH_AGGREGATION_RESULTS_DATA1)
    self.assertDictEqual(aggregations, actual_response)
def test_reject_dataset(self):
    # Create a dataset in Metax and reject the dataset for preservation
    # loading the example dataset
    data = load_json_file('basic_dataset.json')
    # creating a dataset
    status, cdata = metax.create_dataset(data)
    self.assertIn(status, self.OK, "Metax create dataset fails")
    dataset_id = cdata['id']
    # rejecting the dataset
    status = pas.reject_dataset(dataset_id)
    self.assertIn(status, self.OK, "PAS dataset rejection fails")
def fresh(self, config_file=None, namespace=None):
    """
    Sets the environment with a fresh config or namespace (instead of the
    defaults) if a config_file or namespace parameter is given.
    """
    if not config_file:
        config_file = self.config_file()
    self.__config_file = config_file
    self.__config = load_json_file(config_file)
    # '__cmds__' takes precedence over the '__scripts__' key (assumed intent;
    # the original assigned both in sequence, so only '__scripts__' took effect)
    self._commander = self.__config.get('__cmds__') or self.__config.get('__scripts__')
    self.namespace = namespace
    self.wrappers = {}
def test_fetch_results_from_source(self):
    """
    Test if specific fields can be fetched from the index.
    """
    self.Query_test_object.until(end=self.end)
    self.Query_test_object.search = self.Query_test_object.search.extra(
        sort=[{"commit_date": {"order": "asc"}}])
    response = self.Query_test_object.fetch_results_from_source(self.field2)
    actual_response = load_json_file(FETCH_SOURCE_RESULTS_DATA1)
    self.assertEqual(response, actual_response['hits'])
def _config(self):
    """ Lazy-load the config so that any errors happen then """
    if not self.__config_file:
        # If there is no config file, raise an error.
        # TODO: Refactor the config code, it's overly confusing
        raise Exception(
            """No config found. Set the environment variable LNK_DIR to point
            to your link configuration directory, or create a
            .link/link.config file in your HOME directory""")
    if not self.__config:
        self.__config = load_json_file(self.__config_file)
    return self.__config
def test_by_authors(self):
    """
    Test nested aggregation wrt authors
    """
    self.Query_test_object.get_sum(self.field3)\
                          .by_authors(self.field2)\
                          .since(start=self.start)\
                          .until(end=self.end)
    response = self.Query_test_object.fetch_aggregation_results()['aggregations']
    buckets = {"buckets": response['0']['buckets']}
    sum_lines_added = load_json_file(SUM_LINES_ADDED_BY_AUTHORS)
    self.assertEqual(sum_lines_added, buckets)
def _load_confs(self):
    """
    Load the configurations.

    :return: config dict {key: domain, value: config}
    """
    prefix = os.path.split(os.path.abspath(__file__))[0]
    path = os.sep.join([prefix, "configs"])
    files = get_path_files(path)
    config = dict()
    for f in files:
        obj = load_json_file(f)
        domain = obj["domain"]
        config[domain] = obj["conf"]
    return config
def main():
    if PROJECT_PATH is None:
        raise Exception("No PROJECT_ROOT_PATH")
    if GITHUB_ACCESS_TOKEN is None:
        raise Exception("No GITHUB_ACCESS_TOKEN")
    repos = utils.read_csv_ignore_headers(INPUT_CSV_FILE_PATH,
                                          INPUT_CSV_FILE_FIELD_NAMES)
    total = len(repos)
    count = 0
    list_to_write = list()
    for repo in repos:
        repo_name = repo['repo_name']
        try:
            count += 1
            print("\t{}/{} repo={}".format(count, total, repo_name))
            project_git_folder = f"{REPOS_PATH}/repos/{repo_name.replace('/', '#')}"
            # package_name stays None when the project could not be cloned
            # or has no package.json
            package_name = None
            if os.path.isdir(project_git_folder):
                package_json_path = f"{project_git_folder}/package.json"
                if utils.file_or_read_file_already_exists(package_json_path):
                    package_json_contents = utils.load_json_file(package_json_path)
                    # Will throw if 'name' is not there
                    package_name = package_json_contents['name']
            list_to_write.append({
                'repo_name': repo_name,
                'package_name': package_name
            })
        except Exception as e:
            list_to_write.append({
                'repo_name': repo_name,
                'package_name': None
            })
    utils.write_lines_to_new_csv(OUTPUT_FILE_PATH, OUTPUT_FIELD_NAMES, list_to_write)
    print("Done")
def __load_all_jsons(self, targetPath):
    """ Load all files from a directory """
    currentDir = os.path.join(os.getcwd(), targetPath)
    files = [x for x in os.listdir(currentDir)
             if os.path.isfile(os.path.join(currentDir, x))]
    for file in files:
        if file.endswith('.json'):
            jsonObject = load_json_file(os.path.join(currentDir, file))
            self.add_dialogue_file(jsonObject=jsonObject, fileName=file)
def test_preserve_dataset(self):
    # Create a dataset in Metax and preserve the dataset
    # loading the example dataset
    data = load_json_file('basic_dataset.json')
    # creating a dataset
    status, cdata = metax.create_dataset(data)
    self.assertIn(status, self.OK, "Metax create dataset fails")
    dataset_id = cdata['id']
    # preserving the dataset
    status = pas.preserve_dataset(dataset_id)
    self.assertIn(status, self.OK, "PAS preserve fails")
def test_by_period_with_params(self):
    """
    Test the date_histogram aggregation with all the parameters
    """
    self.Query_test_object.since(start=self.start)\
                          .until(end=self.end)\
                          .get_cardinality(self.field1)\
                          .by_period(field=self.date_field2,
                                     period="quarter",
                                     timezone=self.timezone)
    response = self.Query_test_object.fetch_aggregation_results()['aggregations']
    hash_by_period = load_json_file(NUM_HASHES_BY_QUARTER)
    buckets = {"buckets": response['0']['buckets']}
    self.assertEqual(hash_by_period, buckets)
def main():
    utils.create_csv_file_if_necessary(OUTPUT_FILE_PATH, ISSUE_FIELD_NAMES)
    print("Finding files to parse that match {} in {}".format(
        FILE_NAME_SEARCH_STRING, SEARCH_DIR_PATH))
    files_to_parse = utils.get_list_of_unread_files(SEARCH_DIR_PATH,
                                                    FILE_NAME_SEARCH_STRING)
    print("Found {} files".format(len(files_to_parse)))
    count = 0
    for ftp in files_to_parse:
        try:
            count += 1
            print("{}: Parsing + writing {}".format(count, ftp))
            issues = utils.load_json_file(ftp)
            repo_name = parse_repo_name_form_file_name(ftp)
            lines_to_write = list()
            for i in issues:
                lines_to_write.append({
                    'id': i['id'],
                    'repo_name': repo_name,
                    'url': i['url'],
                    'repository_url': i['repository_url'],
                    'comments_url': i['comments_url'],
                    'events_url': i['events_url'],
                    'html_url': i['html_url'],
                    'number': i['number'],
                    'title': i['title'],
                    'user_id': i['user']['id'],
                    'user_login': i['user']['login'],
                    'user_type': i['user']['type'],
                    'state': i['state'],
                    'locked': i['locked'],
                    'comments': i['comments'],
                    'created_at': i['created_at'],
                    'updated_at': i['updated_at'],
                    'closed_at': i['closed_at'],
                    'body': i['body'],
                    'is_pull_request': 'pull_request' in i
                })
            utils.write_lines_to_existing_csv(OUTPUT_FILE_PATH,
                                              ISSUE_FIELD_NAMES,
                                              lines_to_write)
            utils.mark_file_as_read(ftp)
        except Exception as e:
            print("[ERROR] {} on file {}. Continuing from next file.".format(e, ftp))
    print("DONE")
def test_get_terms(self):
    """
    Test the terms aggregation
    """
    field = self.field2
    # without the field param
    with self.assertRaises(AttributeError):
        self.Query_test_object.get_terms()
    # with the field param
    self.Query_test_object.get_terms(field)\
                          .since(start=self.start)\
                          .until(end=self.end)
    response = self.Query_test_object.fetch_aggregation_results()['aggregations']
    buckets = {"buckets": response['0']['buckets']}
    authors = load_json_file(TERMS_AGGREGATION_DATA)
    self.assertEqual(authors, buckets)
def set_file(self, filePath, fileName=None):
    """ Sets the file and tries to load it for use """
    self.__filePath = filePath
    if fileName:
        self.__fileName = fileName
        try:
            self.__dialogues = load_json_file(
                os.path.join(self.__filePath, self.__fileName))
        except FileNotFoundError:
            save_json_file(obj=self.__dialogues,
                           path=os.path.join(self.__filePath, self.__fileName))
    else:
        self.__fileName = DialogueAnnotator.__DEFAULT_FILENAME
def build_argument_data_batch(file_name, FV, clf):
    gold_list = []
    matrix_list = []
    f_json = utils.load_json_file(file_name)
    for sentence in f_json['sentences']:
        event_candidates_list = sentence['eventCandidates']
        for event in event_candidates_list:
            argumentslist = event['arguments']
            for argument in argumentslist:
                arg_index = argument['begin']
                token_index = event['begin']
                matrix_list.append(
                    FV.get_feature_matrix_argument_prediction(
                        token_index, arg_index, sentence, clf))
                gold_list.append(argument['gold'])
    if len(matrix_list) == 0:
        return None, None
    if clf == 'perc':
        return matrix_list, gold_list
    elif clf == 'nb':
        return vstack(matrix_list), gold_list
def build_trigger_data_batch(file_name, FV, clf):
    trigger_list = []
    token_index_list = []
    sentence_list = []
    f_json = utils.load_json_file(file_name)
    for sentence in f_json['sentences']:
        event_candidates_list = sentence['eventCandidates']
        for event in event_candidates_list:
            token_index_list.append(event['begin'])
            sentence_list.append(sentence)
            trigger_list += [event['gold']]
    matrix_list = []
    for token_index, sentence in zip(token_index_list, sentence_list):
        matrix_list.append(FV.get_feature_matrix(token_index, sentence, clf))
    if len(matrix_list) == 0:
        return None, None
    if clf == 'perc':
        return matrix_list, trigger_list
    elif clf == 'nb':
        return vstack(matrix_list), trigger_list
def serialize_and_save(entries, filename):
    json_data = serialize_items(entries)
    return save_json_file(data=json_data, filename=filename)


if __name__ == "__main__":
    # TODO: Accept sys args
    load = False
    created_file = None
    if load:
        data = load_json_file("json", "20140725_0940.json")
        entry_manager = EntryManager(data_dict=data)
    else:
        # start scraping the base data for all entries.
        entry_manager = scrape_base_info()
        created_file = serialize_and_save(
            entries=entry_manager.get_entries(),
            filename='json/%s.json' % datetime.now().strftime("%Y%m%d_%H%M")
        )
    # json_data = serialize_items(entry_manager.get_entries())
    # save_json_file(
    #     json_data,
    #     "json",
    #     '%s.json' % datetime.now().strftime("%Y%m%d_%H%M"))
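# Counterpart sketch for the save_json_file helper used above. This is an
# assumption matching the keyword call in serialize_and_save; other snippets
# call project-specific variants with different signatures (e.g. obj/path
# keywords or a separate directory argument).
def save_json_file(data, filename):
    """Serialize data to filename as JSON and return the filename."""
    with open(filename, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)
    return filename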
def main():
    ################### EXPLORATORY DATA ANALYSIS #############################
    # Just testing my functions a bit
    list_of_files = utils.list_files()
    print(list_of_files[0])
    f1 = utils.load_json_file(list_of_files[0])
    pprint(len(f1['sentences']))

    # Finding and counting all event triggers
    t = utils.get_all_triggers(list_of_files)
    print("Number of distinct event triggers: {0}".format(len(t.keys())))
    pprint(t)

    # Finding and counting all possible arguments (= relationship labels)
    arg = utils.get_all_arguments(list_of_files)
    print("Number of relation arguments: {0}".format(len(arg.keys())))
    pprint(arg)

    ########################## NAIVE BAYES ####################################
    # Cross-validation
    rates = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
    # x = crossvalidation_experiment(rates, list_of_files, load=True, mode='trig', k=3)
    # pprint(x)
    # x2 = crossvalidation_experiment(rates, list_of_files, load=True, mode='arg', k=3)
    # pprint(x2)

    ## Naive Bayes on triggers
    print("Experiment 1: Naive Bayes predicting triggers")
    FV_trig = feature_vector.FeatureVector('trigger')
    train_list, valid_list = utils.create_training_and_validation_file_lists(list_of_files)
    X_train, y_train = build_dataset(train_list, FV_trig, ind=1, kind='train',
                                     mode='trig', clf='nb', load=True)
    X_train, y_train = subsample(X_train, y_train, clf='nb', subsampling_rate=0.50)
    X_valid, y_valid = build_dataset(valid_list, FV_trig, ind=1, kind='valid',
                                     mode='trig', clf='nb', load=True)
    NB_trig = nb.NaiveBayes()
    NB_trig.train(np.asarray(X_train.todense()), np.asarray(y_train))
    # print("Evaluate Naive Bayes classifier predicting triggers on the train set...")
    # CM, prec, rec, F1 = NB_trig.evaluate(np.asarray(X_train.todense()), np.asarray(y_train))
    print("Evaluate Naive Bayes classifier predicting triggers on the validation set...")
    CM, prec, rec, F1 = NB_trig.evaluate(np.asarray(X_valid.todense()), np.asarray(y_valid))
    print("Precision: {0}".format(prec))
    print("Recall: {0}".format(rec))
    print("F1-measure: {0}".format(F1))
    print("Confusion matrix:\n", np.int64(CM))

    ## Naive Bayes on arguments
    print("Experiment 2: Naive Bayes predicting arguments")
    FV_arg = feature_vector.FeatureVector('argument')
    X_train, y_train = build_dataset(train_list, FV_arg, ind=1, kind='train',
                                     mode='arg', clf='nb', load=True)
    X_train, y_train = subsample(X_train, y_train, clf='nb', subsampling_rate=0.50)
    X_valid, y_valid = build_dataset(valid_list, FV_arg, ind=1, kind='valid',
                                     mode='arg', clf='nb', load=True)
    NB_arg = nb.NaiveBayes()
    NB_arg.train(np.asarray(X_train.todense()), np.asarray(y_train))
    # print("Evaluate Naive Bayes classifier predicting arguments on the train set...")
    # CM, prec, rec, F1 = NB_arg.evaluate(np.asarray(X_train.todense()), np.asarray(y_train))
    print("Evaluate Naive Bayes classifier predicting arguments on the validation set...")
    CM, prec, rec, F1 = NB_arg.evaluate(np.asarray(X_valid.todense()), np.asarray(y_valid))
    print("Precision: {0}".format(prec))
    print("Recall: {0}".format(rec))
    print("F1-measure: {0}".format(F1))
    print("Confusion matrix:\n", np.int64(CM))
# load weights of the pretrained perceptrons.
with open('Perceptron_trigger.data', 'rb') as f:
    Lambda_e, misc_e = pickle.load(f)
with open('Perceptron_argument.data', 'rb') as f:
    Lambda_a, misc_a = pickle.load(f)

for i_f, test_file in enumerate(evaluate_test_list):
    print('Test file', i_f, 'of', len(evaluate_test_list))
    # generate predictions for the current file; p_e and p_a are the predicted values.
    (p_e, g_e) = perc.test_perceptron(FV_trig, Lambda_e, [test_file], mode='Trigger')
    (p_a, g_a) = perc.test_perceptron(FV_arg, Lambda_a, [test_file], mode='Argument')
    f_fill_this = utils.load_json_file(test_file)
    counter_e = 0
    counter_a = 0
    for sentence in f_fill_this['sentences']:
        event_candidates = sentence['eventCandidates']
        for ec in event_candidates:
            ec['predicted'] = FV_trig.trigger_list[p_e[counter_e]]
            counter_e += 1
            for arg in ec['arguments']:
                arg['predicted'] = FV_arg.arguments_list[p_a[counter_a]]
                counter_a += 1
    if counter_e != len(p_e):
        print("PROBLEM: LENGTH OF PREDICTION VECTOR (trigger) DOESN'T FIT!")
    if counter_a != len(p_a):
        print("PROBLEM: LENGTH OF PREDICTION VECTOR (argument) DOESN'T FIT!")
boss_list[str(boss_id)] = {
    'name': boss_name,
    'loot_table': [],
}
boss_list[str(boss_id)]['loot_table'].append({
    'id': item_id,
    'name': item_name,
    'slot': slot.lower(),
    'level': item_level,
    'difficulties': difficulties,
    'specs': item_specs,
})

# Save the raid information in a JSON file
json_raids = load_json_file(os.path.join(dest, 'raids.json'), [])
found = False
for raid_entry in json_raids:
    if raid_entry['name'] == raid_name:
        raid_entry['boss'] = boss_list
        raid_entry['wings'] = wings_list
        found = True
        break
if not found:
    json_raids.append({
        'name': raid_name,
        'wings': wings_list,
        'boss': boss_list,
    })