def extract(metrics, model_dir, params_filename, metrics_filename):
    """Collect metrics/params records from ``model_dir`` and every directory below it.

    For each visited directory that contains ``metrics_filename``, the metrics
    JSON is loaded, updated with the contents of ``params_filename`` from the
    same directory, tagged with the directory under the ``'path'`` key, and
    appended to ``metrics``.

    Args:
        metrics: List that receives one dict per matching directory (mutated
            in place).
        model_dir: Directory at which the recursive scan starts.
        params_filename: Name of the parameters JSON file.
        metrics_filename: Name of the metrics JSON file.
    """
    metrics_path = os.path.join(model_dir, metrics_filename)
    if os.path.isfile(metrics_path):
        record = load_from_json(metrics_path)
        # Parameter values are merged on top of the metrics values.
        record.update(load_from_json(os.path.join(model_dir, params_filename)))
        record['path'] = model_dir
        metrics.append(record)

    children = (os.path.join(model_dir, entry) for entry in os.listdir(model_dir))
    for child in children:
        if os.path.isdir(child):
            extract(metrics, child, params_filename, metrics_filename)
def _make_geocoder():
    """Build a Nominatim geocode callable limited to one request per second."""
    geolocator = Nominatim(user_agent="specify_your_app_name_here")
    return RateLimiter(geolocator.geocode, min_delay_seconds=1)


def cityDic(places, geocode=None):
    """Geocode each place name and return one dict per place.

    Args:
        places: Iterable of place-name strings.
        geocode: Optional callable mapping a place name to a location object
            (with ``address`` and ``point``) or a falsy value when nothing is
            found. When omitted, a fresh rate-limited Nominatim geocoder is
            created, as before. Pass a shared callable when calling this
            function repeatedly so the rate limit also holds across calls.

    Returns:
        A list of dicts with keys ``"text"``, ``"address"``, ``"latitude"``
        and ``"longtitude"``. The last three stay empty strings when the
        place could not be geocoded.
    """
    if geocode is None:
        geocode = _make_geocoder()

    place_dicts = []
    for place in places:
        # NOTE: the misspelt "longtitude" key is kept on purpose; it is part
        # of the data this function emits.
        place_dict = {"text": place, "address": "", "latitude": "", "longtitude": ""}
        location = geocode(place)
        if location:
            point = tuple(location.point)
            place_dict["address"] = location.address
            place_dict["latitude"] = point[0]
            place_dict["longtitude"] = point[1]
        place_dicts.append(place_dict)
    return place_dicts


if __name__ == '__main__':
    args = get_args()
    data = load_from_json(args.data)

    # One limiter for the whole run: a limiter created per sentence would let
    # the first request of each sentence go out without any delay.
    geocode = _make_geocoder()

    place_tags = []
    # TODO : Process only sentences with label 1
    for sentence in data["sentences"]:
        places = geograpy.get_place_context(text=sentence)
        place_dicts = cityDic(places.cities, geocode=geocode)  # Only cities ???
        place_tags.append(place_dicts)

    data["place_tags"] = place_tags
    write_to_json(data, data["id"], extension="json", out_dir=args.out_dir)
def ui_load(expenses):
    """Ask for an input path, load its JSON content and hand it to ``expenses``.

    The loaded data is passed to ``expenses.do`` with the ``'set_serialized'``
    command.
    """
    serialized = load_from_json(ui_input_path())
    expenses.do('set_serialized', serialized)
def _accumulate_topn(bucket, size, hit):
    """Add one ``[count, hit]`` observation to ``bucket[size]``, creating it on first use."""
    observation = np.array([1, deepcopy(hit)])
    if size in bucket:
        bucket[size] += observation
    else:
        bucket[size] = observation


def make_supplementary_table_three(
    *, args, topn_results_obs, topn_results_counter_diss, topn_results_counter_suff
):
    """Build and pickle the per-doctor accuracy table (supplementary table 3).

    For every case card, ``doctor_top_ns(card, true_id)`` yields entries that
    are read here as ``entry[0]`` (key the results are grouped by, presumably
    a doctor id), ``entry[1]`` (1-based position used to index the top-n
    arrays; ``0`` means "skip") and ``entry[2]`` (the doctor's own score).
    For each such key the function accumulates the doctor's score next to the
    scores of the three algorithms at the same position.

    Keys with fewer than 50 accumulated entries are left out of the table.
    For the rest, each score column holds ``total / count`` and each error
    column holds ``sqrt(p * (1 - p) / count)``.

    Changes from the previous version:
      * ``disablement_score`` now holds the disablement proportion; it used
        to be filled with the sufficiency proportion by mistake.
      * Intermediate accumulators that never reached the return value or the
        pickle were removed.

    Args:
        args: Namespace providing ``datapath``, ``results`` and ``first``
            (optional cap on the number of cards processed).
        topn_results_obs: Per-card top-n hit sequences of the observational
            ranking, indexed by card position.
        topn_results_counter_diss: Same, for the disablement ranking.
        topn_results_counter_suff: Same, for the sufficiency ranking.

    Returns:
        ``(df_results, doc_topn)``: the results DataFrame (also written to
        ``args.results / "supp_table_3_df.p"``) and the raw per-key
        accumulator dict.
    """
    min_cases = 50  # keys with fewer accumulated entries are not reported

    casecards = load_from_json(args.datapath / VIGNETTES_FILE)

    doc_topn = {}
    for num, card in enumerate(casecards.values()):
        if args.first is not None and num >= args.first:
            continue
        true_id = card["card"]["diseases"][0]["id"]
        pred_suff = topn_results_counter_suff[num]
        pred_diss = topn_results_counter_diss[num]
        pred_obs = topn_results_obs[num]

        for val in doctor_top_ns(card, true_id):
            doctor, size, doctor_hit = val[0], val[1], val[2]
            if size == 0:
                continue

            # Look everything up before touching doc_topn so that a bad index
            # cannot leave a half-updated entry behind.
            hits = (
                ("sufficiency", pred_suff[size - 1]),
                ("disablement", pred_diss[size - 1]),
                ("obs", pred_obs[size - 1]),
                ("doctor", doctor_hit),
            )

            stats = doc_topn.get(doctor)
            if stats is None:
                stats = doc_topn[doctor] = {
                    "count": 0,
                    "sufficiency": {},
                    "disablement": {},
                    "obs": {},
                    "doctor": {},
                }
            stats["count"] += 1
            for name, hit in hits:
                _accumulate_topn(stats[name], size, hit)

    # (column prefix, key inside each doc_topn entry), in output column order.
    column_sources = (
        ("doc", "doctor"),
        ("obs", "obs"),
        ("sufficiency", "sufficiency"),
        ("disablement", "disablement"),
    )
    column_names = []
    raw_data = {}
    for prefix, _ in column_sources:
        for suffix in ("score", "error"):
            column_names.append(f"{prefix}_{suffix}")
            raw_data[f"{prefix}_{suffix}"] = []

    for stats in doc_topn.values():
        n = stats["count"]
        if n < min_cases:
            continue
        for prefix, key in column_sources:
            # Each bucket value is a [count, total] array; index 1 is the total.
            proportion = sum(stats[key].values())[1] / n
            raw_data[f"{prefix}_score"].append(proportion)
            raw_data[f"{prefix}_error"].append(
                np.sqrt(proportion * (1 - proportion) / n)
            )

    df_results = pd.DataFrame(raw_data, columns=column_names)
    df_results.to_pickle(args.results / "supp_table_3_df.p")
    return df_results, doc_topn
import utils
import random
import collections
from sklearn import svm, tree

# Set parameters and load dataset
DATASET = 'soundscapesDescriptor'
# Maximum number of dimensions for the feature vector. Only the N most common
# tags will be used. Use a big number to "omit" this parameter.
NUMBER_OF_DIMENSIONS_OF_FEATURE_VECTOR = 7
CLASSIFIER_TYPE = 'tree'  # Use 'svm' or 'tree'
# Percentage of sounds that will be used for training (others are for testing)
PERCENTAGE_OF_TRAINING_DATA = 0.5
# Use a big number to "omit" this parameter and use as many tags as originally
# are in the sound
MAX_INPUT_TAGS_FOR_TESTING = 5

dataset = utils.load_from_json(DATASET + '.json')
# list(...) keeps CLASS_NAMES indexable on Python 3, where keys() is a view.
CLASS_NAMES = list(dataset.keys())
N = len(dataset[CLASS_NAMES[0]])  # Number of sounds per class

# 3) Define vector space
# **********************

# Get all tags in the dataset (the vocabulary)
all_tags = list()
for class_name in CLASS_NAMES:
    class_tags = utils.get_all_tags_from_class(class_name, dataset)
    all_tags += class_tags

# Filter out tags with less frequency (get only top N tags)
most_common_tags = [
    tag
    for tag, _count in collections.Counter(all_tags).most_common(
        NUMBER_OF_DIMENSIONS_OF_FEATURE_VECTOR)
]
# Every most-common tag comes from all_tags, so no membership filter is needed.
filtered_tags = list(most_common_tags)

# Build our prototype feature vector (unique list of tags) and report its size
prototype_feature_vector = list(set(filtered_tags))
# Parenthesised single-argument form works as a print statement on Python 2
# and as a function call on Python 3.
print('Created prototype feature vector with %i dimensions (originally %i dimensions)' % (
    len(prototype_feature_vector), len(set(all_tags))))
def make_table_one_and_supplementary_table_two(
    *, args, topn_results_obs, topn_results_counter_diss, topn_results_counter_suff
):
    """Print per-rareness summaries for the observational and sufficiency rankings.

    For every case card the top-n hit sequences of both rankings are summed,
    and ``min(21 - total, 20)`` is recorded under the rareness of the card's
    first disease (presumably the position of the true disease, capped at 20).
    The larger hit total wins the card; equal totals count as a draw.
    Mean and standard deviation are computed per rareness level and over all
    cards, then pretty-printed.

    Args:
        args: Namespace providing ``datapath`` and ``first`` (optional cap on
            the number of cards processed).
        topn_results_obs: Per-card hit sequences of the observational ranking.
        topn_results_counter_diss: Accepted but not used by this function.
        topn_results_counter_suff: Per-card hit sequences of the sufficiency
            ranking.
    """
    rareness_levels = (
        "common",
        "rare",
        "very_rare",
        "almost_impossible",
        "uncommon",
        "very_common",
    )
    casecards = load_from_json(args.datapath / VIGNETTES_FILE)

    results_obs = {level: [] for level in rareness_levels}
    results_counter = {level: [] for level in rareness_levels}
    wins_obs = dict.fromkeys(rareness_levels, 0)
    wins_counter = dict.fromkeys(rareness_levels, 0)
    draws = dict.fromkeys(rareness_levels, 0)

    for num, card in enumerate(casecards.values()):
        if args.first is not None and num >= args.first:
            continue
        rareness = card["card"]["diseases"][0]["rareness"]
        r_obs = sum(topn_results_obs[num])
        r_suff = sum(topn_results_counter_suff[num])

        results_obs[rareness].append(min(21 - r_obs, 20))
        results_counter[rareness].append(min(21 - r_suff, 20))

        if r_obs > r_suff:
            tally = wins_obs
        elif r_obs < r_suff:
            tally = wins_counter
        else:
            tally = draws
        tally[rareness] += 1

    # Replace each list of values by its summary, pooling the values for "all".
    pooled_obs = []
    for level in rareness_levels:
        values = results_obs[level]
        pooled_obs.extend(values)
        results_obs[level] = {"mean": np.mean(values), "std": np.std(values)}
    results_obs["all"] = {"mean": np.mean(pooled_obs), "std": np.std(pooled_obs)}

    pooled_counter = []
    for level in rareness_levels:
        values = results_counter[level]
        pooled_counter.extend(values)
        results_counter[level] = {"mean": np.mean(values), "std": np.std(values)}
    results_counter["all"] = {
        "mean": np.mean(pooled_counter),
        "std": np.std(pooled_counter),
    }

    draws["all"] = sum(draws[level] for level in rareness_levels)
    wins_obs["all"] = sum(wins_obs[level] for level in rareness_levels)
    wins_counter["all"] = sum(wins_counter[level] for level in rareness_levels)

    print("> Observational Results")
    pprint(results_obs)
    print("")
    print("> Counterfactual Results")
    pprint(results_counter)
    print("")
def load_networks(datapath, filename=NETWORKS_FILE):
    """Load and return the JSON content of ``datapath / filename``.

    ``filename`` defaults to ``NETWORKS_FILE``.
    """
    networks_path = datapath / filename
    return load_from_json(networks_path)
"""

# NOTE(review): this fragment starts right after a triple-quote whose opening
# is not visible here and stops inside the inner loop; the comments below only
# describe what is visible.

# Network Parameters
num_epochs = 10
hidden_layers = [25, 25]  # presumably the width of each hidden layer -- TODO confirm
learning_rate = .01

# Iterate through different time periods
for response in ['daily', 'weekly', 'bi_weekly', 'monthly']:
    print('Beginning {} Models'.format(response))

    # Load which predictors will be used
    # (read from saved_models/<response>/predictors.json)
    with open('saved_models/' + response + '/' + 'predictors.json', 'r') as fd:
        predictors = json.loads(fd.read())

    # Load the predictor values from files
    variables_dict = load_from_json(predictors, response, verbose=True)

    # Split the data into training and test sets
    xtrain, xtest, ytrain, ytest, ytrain_hot, ytest_hot = create_model_data(
        variables_dict, predictors, response, model_type='Both')

    # Iterate through regression and classification models
    for model_type in ['Regression', 'Classification']:
        print('Beginning {} Model for {} Predictions'.format(
            model_type, response))
        print('Setting up TensorBoard')

        # Create the model structure, define the cost optimization functions
        # The input placeholder has one column per predictor.
        x = tf.placeholder(tf.float32, [None, len(predictors)], name='x')
        if model_type == 'Classification':
            # Two-column target; presumably the one-hot labels -- TODO confirm
            y = tf.placeholder(tf.float32, [None, 2], name='y')
            output_layer = create_network(x, hidden_layers, num_classes=2)
def main(args):
    """Clean the extracted text of one news article and return the dumped record.

    Loads the JSON record at ``args.data``, reads the file
    ``args.input_dir/<record id>`` as bytes, splits ``data["text"]`` into
    lines, removes boilerplate using stop lists chosen by ``args.source``,
    lets ``addnewstime`` update both the text and the record, and finally
    stores the cleaned text back under ``data["text"]``.

    Args:
        args: Namespace providing ``data``, ``input_dir`` and ``source``
            (an integer from the table below).

    Returns:
        The result of ``dump_to_json(data)``.
    """
    # Source 1 times
    # Source 2 newind
    # Source 3 ind
    # Source 4 thehin
    # Source 5 scm
    # Source 6 people
    data = load_from_json(args.data)
    filename = args.input_dir + "/" + data["id"]
    # Opened in binary mode, so html_string is bytes.
    with open(filename, "rb") as g:
        html_string = g.read()

    text = data["text"].splitlines()

    # Stop lists stay None unless the selected source defines them.
    # stoplist1-3 go to deletecertainstr, stoplist4 goes to addnewstime.
    stoplist1 = None
    stoplist2 = None
    stoplist3 = None
    stoplist4 = None
    if args.source == 1:
        text = deletesamesubstr(text)
        stoplist1 = [
            "RELATED", "From around the web", "More from The Times of India",
            "Recommended By Colombia", "more from times of india Cities",
            "You might also", "You might also like",
            "more from times of india", "All Comments ()+^ Back to Top",
            "more from times of india News", "more from times of india TV",
            "more from times of india Sports",
            "more from times of india Entertainment",
            "more from times of india Life & Style",
            "more from times of india Business"
        ]
        stoplist2 = ["FOLLOW US", "FOLLOW PHOTOS", "FOLLOW LIFE & STYLE"]
    elif args.source == 3:
        stoplist1 = [
            "Tags:", "ALSO READ", "Please read our before posting comments",
            "TERMS OF USE: The views expressed in comments published on indianexpress.com are those of the comment writer's alone. They do not represent the views or opinions of The Indian Express Group or its staff. Comments are automatically posted live; however, indianexpress.com reserves the right to take it down at any time. We also reserve the right not to publish comments that are abusive, obscene, inflammatory, derogatory or defamatory."
        ]
    elif args.source == 4:
        stoplist3 = [
            "ShareArticle", "Updated:", "MoreIn", "SpecialCorrespondent",
            "METRO PLUS", "EDUCATION PLUS", "PROPERTY PLUS", "CINEMA PLUS",
            "DISTRICT PLUS"
        ]
        stoplist4 = [
            "METRO PLUS", "EDUCATION PLUS", "PROPERTY PLUS", "CINEMA PLUS",
            "DISTRICT PLUS"
        ]
    elif args.source == 5:
        stoplist1 = ["Print Email", "Video"]
        stoplist2 = [
            "Viewed", "Associated Press", "Get updates direct to your inbox",
            "Opinion"
        ]
    elif args.source == 6:
        stoplist2 = [
            'Email | Print', '+', 'stumbleupon', 'More Pictures',
            'Save Article',
            'Click the "PLAY" button and listen. Do you like the online audio service here?',
            'Good, I like it', 'Do you have anything to say?', 'Name'
        ]
        # NOTE(review): the whitespace-collapsed source does not show whether
        # this filter belongs to the source-6 branch or runs for every source;
        # confirm against the original layout.
        text = [line for line in text if not line.startswith("Source")]

    # Each step is skipped once the text has become empty.
    if text:
        text = deletecertainstr(text,
                                stoplist1=stoplist1,
                                stoplist2=stoplist2,
                                stoplist3=stoplist3)

    if text:
        text, data = addnewstime(text,
                                 html_string,
                                 data,
                                 args.source,
                                 stoplist=stoplist4)

    # Source 1 gets a second de-duplication pass after the other cleaning.
    if args.source == 1:
        text = deletesamesubstr(text)

    if text:
        # Strip each line, drop blank ones, join with newlines; the [:-1]
        # removes the final trailing newline.
        text = "".join([
            line.strip() + "\n" if line.strip() != "" else "" for line in text
        ])[:-1]

    # When text is empty at this point it is stored unchanged (not a string).
    data["text"] = text
    data = dump_to_json(data)
    return data
def _topn_hits(scores, true_id, max_n=20):
    """Return a 0/1 array whose entry ``i`` tells whether ``true_id`` is among
    the ``i + 1`` highest-scoring keys of ``scores``."""
    # Sort once and reuse the ranking for every cutoff.
    ranked = sorted(scores, key=scores.get, reverse=True)[:max_n]
    return np.array(
        [1 if true_id in ranked[:i] else 0 for i in range(1, max_n + 1)]
    )


def run_vignettes_experiment(*, args):
    """Compute top-n hit arrays for every case card and pickle them.

    With ``args.reproduce is False`` the networks file is loaded and
    ``run_single_vignette`` is run for each card whose network is available;
    otherwise pre-computed inference output is read from ``RESULTS_FILE``.
    For each processed card and each of the three rankings (sufficiency,
    disablement, observational posterior) a length-20 0/1 array is built that
    marks whether the card's true disease is within the top 1..20 entries.

    The three lists of arrays are written to ``args.results`` under
    ``RESULTS_OBS_FILE``, ``RESULTS_CF_DISSABLEMENT_FILE`` and
    ``RESULTS_CF_SUFFICIENCY_FILE``.

    Args:
        args: Namespace providing ``reproduce``, ``datapath``, ``results``,
            ``first`` (optional cap on the number of processed cards) and
            ``verbose``.
    """
    if args.reproduce is False:
        # run over the test_networks.json file and perform inference calculation
        networks = load_from_json(args.datapath / NETWORKS_FILE)
        inference_output = None
    else:
        # use pre-calcd inference output
        networks = None
        inference_output = load_from_json(args.datapath / RESULTS_FILE)
    casecards = load_from_json(args.datapath / VIGNETTES_FILE)

    topn_results_obs = []
    topn_results_counter_suff = []
    topn_results_counter_diss = []
    count_all = 0

    total_to_run = len(casecards) if args.first is None else args.first

    # The context manager closes the progress bar even if a card fails.
    with tqdm(total=total_to_run, desc="Casecards", unit="cards") as pbar:
        for card in casecards.values():
            if args.first is not None and count_all >= args.first:
                # count_all never decreases, so no later card could be processed.
                break

            if args.reproduce is False:
                if card["card"]["network_name"] not in networks:
                    continue

            if inference_output is None and networks is not None:
                counter_suff, counter_diss, obs, true_id = run_single_vignette(
                    card=card, networks=networks, datapath=args.datapath,
                )
            else:
                output = inference_output[str(card["card"]["id"])]
                counter_suff = output["sufficiency"]
                counter_diss = output["disablement"]
                obs = output["posterior"]
                true_id = card["card"]["diseases"][0]["id"]

            topn_results_obs += [_topn_hits(obs, true_id)]
            topn_results_counter_suff += [_topn_hits(counter_suff, true_id)]
            topn_results_counter_diss += [_topn_hits(counter_diss, true_id)]

            count_all += 1
            pbar.update(1)

            if args.verbose and (
                (count_all % 10 == 0) or (count_all == len(casecards) - 1)
            ):
                # Running top-n accuracy: element-wise mean of the hit arrays.
                pbar.write(f"N_processed: {count_all}")
                pbar.write(f"TopN CFSuff: {sum(topn_results_counter_suff) / count_all}")
                pbar.write(f"TopN CFDiss: {sum(topn_results_counter_diss) / count_all}")
                pbar.write(f"TopN Obs: {sum(topn_results_obs) / count_all}\n")

    write_to_pickle(topn_results_obs, args.results / RESULTS_OBS_FILE)
    write_to_pickle(
        topn_results_counter_diss, args.results / RESULTS_CF_DISSABLEMENT_FILE
    )
    write_to_pickle(
        topn_results_counter_suff, args.results / RESULTS_CF_SUFFICIENCY_FILE
    )