def test_get_counts(self):
    """get_counts: should work with all parameters"""
    seq = RnaSequence('UCAG-NAUGU')
    p = BasePairs([(1, 8), (2, 7)])
    p2 = BasePairs([(1, 8), (2, 6), (3, 6), (4, 9)])
    exp = {'TP': 1, 'TN': 0, 'FN': 1, 'FP': 3,
           'FP_INCONS': 0, 'FP_CONTRA': 0, 'FP_COMP': 0}
    self.assertEqual(get_counts(p, p2, False), exp)
    exp = {'TP': 1, 'TN': 0, 'FN': 1, 'FP': 3,
           'FP_INCONS': 1, 'FP_CONTRA': 1, 'FP_COMP': 1}
    self.assertEqual(get_counts(p, p2, split_fp=True), exp)
    seq = RnaSequence('UCAG-NACGU')
    exp = {'TP': 1, 'TN': 7, 'FN': 1, 'FP': 3,
           'FP_INCONS': 1, 'FP_CONTRA': 1, 'FP_COMP': 1}
    self.assertEqual(get_counts(p, p2, split_fp=True,
                                sequences=[seq], min_dist=2), exp)
    # check against compare_ct.pm
    exp = {'TP': 4, 'TN': 266, 'FN': 6, 'FP': 6,
           'FP_INCONS': 2, 'FP_CONTRA': 2, 'FP_COMP': 2}
    seq = 'agguugaaggggauccgauccacuccccggcuggucaaccu'.upper()
    self.assertEqual(get_counts(self.true, self.predicted, split_fp=True,
                                sequences=[seq], min_dist=4), exp)
def calc_gain_ratio(parent_data, attribute):
    """
    Pseudocode:
    INFO GAIN: entropy of the parent data set minus the weighted average of
        child entropies -> sum(child proportion of parent set * child entropy).
        Measures the reduction in disorder / increase in organization.
    SPLIT INFO: -sum(child proportion * log2(child proportion))
    """
    current_split = Split(attribute)
    parent_entropy = calc_entropy(parent_data)
    parent_data_count = sum(get_counts(parent_data, LABEL_LOCATION).values())
    children_data_sets = partition_data(parent_data, current_split)

    # Initialize metrics
    weighted_child_avg = 0
    split_info = 0

    for child in children_data_sets:
        child_data = children_data_sets[child]
        child_entropy = calc_entropy(child_data)
        child_data_count = sum(get_counts(child_data, LABEL_LOCATION).values())
        child_proportion = child_data_count / parent_data_count
        split_info -= child_proportion * math.log(child_proportion, 2)
        weighted_child_avg += child_proportion * child_entropy

    info_gain = parent_entropy - weighted_child_avg

    # Store info gain, split info, and gain ratio all in one object;
    # initialize with info gain and split info.
    data = {'info_gain': info_gain, 'split_info': split_info}

    # Gain ratio follows directly from the other two metrics;
    # guard against division by zero when the split yields a single child.
    if data['split_info'] == 0:
        data['gain_ratio'] = 0
    else:
        data['gain_ratio'] = data['info_gain'] / data['split_info']

    print("Calculating gain ratio for {}: {}".format(attribute, data['gain_ratio']))
    return data
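For reference, a self-contained worked example (toy counts, not project data) of the same arithmetic: a 10-example parent set split into two pure children of sizes 6 and 4 gives an info gain equal to the parent entropy and a gain ratio of 1.0.

import math

def entropy(label_counts):
    # Shannon entropy of a label distribution given raw counts.
    total = sum(label_counts)
    return -sum((c / total) * math.log(c / total, 2) for c in label_counts if c)

parent_entropy = entropy([6, 4])                                        # ~0.971 bits
weighted_child_avg = (6 / 10) * entropy([6]) + (4 / 10) * entropy([4])  # 0.0
info_gain = parent_entropy - weighted_child_avg                         # ~0.971
split_info = -((6 / 10) * math.log(6 / 10, 2) + (4 / 10) * math.log(4 / 10, 2))
gain_ratio = info_gain / split_info                                     # 1.0
print(info_gain, split_info, gain_ratio)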
def sample_cv(num_clusters, random_seed=None, out_dir='experiments/trained_models'):
    data = get_counts()
    random_seed = int(time.time()) if random_seed is None else random_seed
    np.random.seed(random_seed)
    model_name = 'Mix_' + str(num_clusters).zfill(3)
    dataset = 'ICGC-BRCA'
    out_dir_for_file = os.path.join(out_dir, dataset, model_name)
    try:
        os.makedirs(out_dir_for_file)
    except OSError:
        pass

    out_file = out_dir_for_file + "/" + str(random_seed)
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} already exists'.format(
            model_name, dataset, random_seed))
        # return

    scores_dict, parameters = train(num_clusters, data)
    dict_to_save = {'scores': scores_dict, 'parameters': parameters}
    save_json(out_file, dict_to_save)
def get_new_label_set(new_image_metadata, curr_labels):
    label_counts = utils.get_counts(new_image_metadata)
    for label_name, _ in sorted(label_counts.items(), key=itemgetter(1), reverse=True):
        if label_name in curr_labels:
            continue
        new_labels = curr_labels.union([label_name])
        if len('+'.join(new_labels)) > 100:  # pixabay api rule
            continue
        dup_set_found = False
        for used_label_group, image_ids in iter(USED_LABELS.items()):
            if len(used_label_group.symmetric_difference(new_labels)) == 0:
                dup_set_found = True
                break
        if not dup_set_found:
            break
    else:
        # no acceptable new label set was found
        new_labels = set()
    return new_labels
def update_files(metadata):
    write_pixabay_metadata_file(metadata)
    remove_orphaned_metadata()
    metadata = read_pixabay_metadata_file()
    print(f'metadata file saved. {len(metadata)} total records')

    label_counts = utils.get_counts(metadata)
    write_pixabay_tally_file(label_counts)
    print(f'tally file saved. {len(label_counts)} unique labels.')
def calc_entropy(data):
    entropy = 0
    predictions = get_predictions(get_counts(data, LABEL_LOCATION))
    for label in predictions:
        entropy -= predictions[label] * math.log(predictions[label], 2)
    return entropy
def index():
    cols, rows = insert_stats(hours=hours, window=12)
    total_counts = sorted(list(map(list, get_counts(hours=24).items())))
    total_counts.insert(0, ['Website', 'Counts'])
    return render_template('charts.html', cols=cols, rows=rows,
                           counts=total_counts, h=str(hours))
def test_get_counts_pseudo(self):
    """get_counts: should work when pseudo in ref -> classification off"""
    # pairs that would normally be compatible are now contradicting
    ref = BasePairs([(0, 8), (1, 7), (4, 10)])
    pred = BasePairs([(0, 8), (3, 6), (4, 10)])
    seq = 'GACUGUGUCAU'
    exp = {'TP': 2, 'TN': 13 - 2 - 1, 'FN': 1, 'FP': 1,
           'FP_INCONS': 0, 'FP_CONTRA': 1, 'FP_COMP': 0}
    self.assertEqual(get_counts(ref, pred, split_fp=True,
                                sequences=[seq], min_dist=4), exp)
def update_files(metadata, totalsdata, top3=False):
    write_pixabay_metadata_file(metadata)
    remove_orphaned_metadata()
    metadata = read_pixabay_metadata_file()
    print(f'metadata file saved. {len(metadata)} total records.', end=' ')

    label_counts = utils.get_counts(metadata)
    write_pixabay_tally_file(label_counts, top3=top3)
    print(f'tally file saved. {len(label_counts)} unique labels.', end=' ')

    write_pixabay_totals_file(totalsdata)
    print(f'totals file saved.')
def sample_cv(num_clusters, num_folds, fold, out_dir='experiments/sampleCV'):
    data = get_counts()
    if not 0 <= fold < num_folds:
        raise ValueError('num_folds is {} but fold is {}'.format(num_folds, fold))

    model_name = 'Mix_' + str(num_clusters).zfill(3)
    dataset = 'ICGC-BRCA'
    out_dir_for_file = os.path.join(out_dir, dataset, model_name)
    try:
        os.makedirs(out_dir_for_file)
    except OSError:
        pass

    out_file = out_dir_for_file + "/" + str(fold + 1) + '_' + str(num_folds)
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} already exists'.format(
            model_name, dataset, num_folds, fold))
        return

    # splitting the data
    sample_names = np.arange(len(data))
    splits = np.array_split(sample_names, num_folds)
    train_data = []
    test_data = []
    for chunk in range(num_folds):
        if chunk == fold:
            test_data.extend(splits[chunk])
        else:
            train_data.extend(splits[chunk])

    train_data = data[train_data]
    test_data = data[test_data]
    scores_dict, parameters = train_and_test(num_clusters, train_data, test_data)
    dict_to_save = {'scores': scores_dict, 'parameters': parameters}
    save_json(out_file, dict_to_save)
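For reference, a minimal sketch (toy indices, not the ICGC-BRCA data) of how np.array_split produces the folds consumed by the loop above, holding out fold 1 of 3 as the test set.

import numpy as np

num_folds, fold = 3, 1
sample_names = np.arange(10)
splits = np.array_split(sample_names, num_folds)
print([s.tolist() for s in splits])   # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]

test_idx = splits[fold].tolist()
train_idx = [int(i) for chunk in range(num_folds) if chunk != fold
             for i in splits[chunk]]
print(test_idx)    # [4, 5, 6]
print(train_idx)   # [0, 1, 2, 3, 7, 8, 9]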
from elasticsearch import Elasticsearch
import pandas as pd
import sys

from utils import get_counts

print('Connecting to elasticsearch...')
es = Elasticsearch(hosts='localhost:9200')

print('pulling state-measure counts...')
counts = get_counts(es)

print('Building plot...')
counts_df = pd.DataFrame([{'state': item['key']['state'],
                           'measure_id': item['key']['measure_id'],
                           'counts': item['doc_count']}
                          for item in counts])
plot_df = counts_df.pivot(index='state', columns='measure_id', values='counts')
plot = plot_df.plot.bar(stacked=True, figsize=(20, 20))

print('Saving plot to {}'.format(sys.argv[1]))
plot.get_figure().savefig(sys.argv[1])
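For reference, a toy illustration of the pivot step: long-format rows of state/measure counts become a state-by-measure_id matrix that the stacked bar plot consumes. The state and measure values here are invented.

import pandas as pd

# Invented long-format counts; pivot reshapes them into one column per measure_id.
counts_df = pd.DataFrame([
    {'state': 'CA', 'measure_id': 'M1', 'counts': 10},
    {'state': 'CA', 'measure_id': 'M2', 'counts': 5},
    {'state': 'NY', 'measure_id': 'M1', 'counts': 7},
])
plot_df = counts_df.pivot(index='state', columns='measure_id', values='counts')
print(plot_df)
# measure_id    M1   M2
# state
# CA          10.0  5.0
# NY           7.0  NaN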
def trace(root, i, j):
    if len(backpointers[i][j][root]) == 1:
        return [root, backpointers[i][j][root]]
    else:
        s = backpointers[i][j][root][0]
        Y = backpointers[i][j][root][1]
        Z = backpointers[i][j][root][2]
        return [root, trace(Y, i, s), trace(Z, s, j)]


# To get script runtime (optional)
# start = timeit.default_timer()

# Obtain the count(X->YZ), count(X->w), count(X), a.k.a. the binary, unary and
# non-terminal counts (see utils.py)
nonterminal_count, unary_count, binary_count = get_counts(sys.argv[1])
nonterminal_simple, unary_simple, binary_simple = get_counts(sys.argv[3])

# Read dev file line by line
dev_data = open(sys.argv[2], "r")
line = dev_data.readline().strip()
while line:
    words = line.split(" ")
    chart, backpointers = cyk(words, binary_count, unary_count, nonterminal_count)
    n = len(words)
    # Check if at least one valid tree is returned using the vertical markovization model
    if len(backpointers[0][n]) > 0:
        # Get parse tree of max probability starting with 'S'
        if 'S' in backpointers[0][n]:
            tree = trace('S', 0, n)
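For reference, a hand-built backpointer table (hypothetical, not output of the cyk call above) showing the layout trace() walks and the nested-list tree it returns for a two-word sentence.

# backpointers[i][j][X] holds [word] for a unary rule X -> word, or
# [s, Y, Z] for a binary rule X -> Y Z with split point s.
backpointers = {
    0: {1: {'NP': ['she']}, 2: {'S': [1, 'NP', 'VP']}},
    1: {2: {'VP': ['runs']}},
}

def trace(root, i, j):  # same shape as the trace() above
    if len(backpointers[i][j][root]) == 1:
        return [root, backpointers[i][j][root]]
    s, Y, Z = backpointers[i][j][root]
    return [root, trace(Y, i, s), trace(Z, s, j)]

print(trace('S', 0, 2))  # ['S', ['NP', ['she']], ['VP', ['runs']]]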
def __init__(self, rows):
    self.counts = get_counts(rows, LABEL_LOCATION)
    self.predictions = get_predictions(self.counts)
    self.data_class = None
def trace(root, i, j):
    if len(backpointers[i][j][root]) == 1:
        return [root, backpointers[i][j][root]]
    else:
        s = backpointers[i][j][root][0]
        Y = backpointers[i][j][root][1]
        Z = backpointers[i][j][root][2]
        return [root, trace(Y, i, s), trace(Z, s, j)]


# To get script runtime (optional)
# start = timeit.default_timer()

# Obtain the count(X->YZ), count(X->w), count(X), a.k.a. the binary, unary and
# non-terminal counts (see questions5_utils.py)
nonterminal_count, unary_count, binary_count = get_counts(sys.argv[1])
nonterminal_simple, unary_simple, binary_simple = get_counts(sys.argv[3])

# Read dev file line by line
dev_data = open(sys.argv[2], "r")
line = dev_data.readline().strip()
while line:
    words = line.split(" ")
    chart, backpointers = cyk(words, binary_count, unary_count, nonterminal_count)
    n = len(words)
    # Check if at least one valid tree is returned using the vertical markovization model
    if len(backpointers[0][n]) > 0:
        # Get parse tree of max probability starting with 'S'
        if "S" in backpointers[0][n]:
            tree = trace("S", 0, n)
        # If there are no valid parse trees starting with 'S', get arg max starting with any nonterminal