Example #1
def test_get_counts(self):
    """get_counts: should work with all parameters"""
    seq = RnaSequence('UCAG-NAUGU')
    p = BasePairs([(1, 8), (2, 7)])
    p2 = BasePairs([(1, 8), (2, 6), (3, 6), (4, 9)])
    exp = {'TP': 1, 'TN': 0, 'FN': 1, 'FP': 3,
           'FP_INCONS': 0, 'FP_CONTRA': 0, 'FP_COMP': 0}
    self.assertEqual(get_counts(p, p2, False), exp)
    exp = {'TP': 1, 'TN': 0, 'FN': 1, 'FP': 3,
           'FP_INCONS': 1, 'FP_CONTRA': 1, 'FP_COMP': 1}
    self.assertEqual(get_counts(p, p2, split_fp=True), exp)
    seq = RnaSequence('UCAG-NACGU')
    exp = {'TP': 1, 'TN': 7, 'FN': 1, 'FP': 3,
           'FP_INCONS': 1, 'FP_CONTRA': 1, 'FP_COMP': 1}
    self.assertEqual(get_counts(p, p2, split_fp=True,
                                sequences=[seq], min_dist=2), exp)
    # check against compare_ct.pm
    exp = {'TP': 4, 'TN': 266, 'FN': 6, 'FP': 6,
           'FP_INCONS': 2, 'FP_CONTRA': 2, 'FP_COMP': 2}
    seq = 'agguugaaggggauccgauccacuccccggcuggucaaccu'.upper()
    self.assertEqual(get_counts(self.true, self.predicted, split_fp=True,
                                sequences=[seq], min_dist=4), exp)
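The expected counts in the first assertion follow directly from set arithmetic on the two pair lists. A minimal sketch, with plain Python sets standing in for BasePairs (an assumption about its comparison semantics); the finer FP split into inconsistent/contradicting/compatible depends on get_counts internals not shown here:

ref = {(1, 8), (2, 7)}
pred = {(1, 8), (2, 6), (3, 6), (4, 9)}

tp = len(ref & pred)   # predicted pairs present in the reference  -> 1
fn = len(ref - pred)   # reference pairs that were missed          -> 1
fp = len(pred - ref)   # predicted pairs absent from the reference -> 3

assert (tp, fn, fp) == (1, 1, 3)  # matches exp in the first assertEqual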
Example #2
File: c45.py Project: tangert/CS378
def calc_gain_ratio(parent_data, attribute):
    """
     Pseudocode:

     INFO GAIN:
     Entropy of parent data set - weight average of child entropies
                                -> sum ( child proportion of parent set * child entropy )

     Calculates reduction in disorder / increase in organization

     SPLIT INFO:
     sum of child proportions * log(child proportions)

    """

    current_split = Split(attribute)
    parent_entropy = calc_entropy(parent_data)
    parent_data_count = sum(get_counts(parent_data, LABEL_LOCATION).values())
    children_data_sets = partition_data(parent_data, current_split)

    # Initialize metrics
    weighted_child_avg = 0
    split_info = 0

    for child in children_data_sets:

        child_data = children_data_sets[child]
        child_entropy = calc_entropy(child_data)
        child_data_count = sum(get_counts(child_data, LABEL_LOCATION).values())
        child_proportion = child_data_count / parent_data_count

        split_info -= child_proportion * math.log(child_proportion, 2)
        weighted_child_avg += child_proportion * child_entropy

    info_gain = parent_entropy - weighted_child_avg

    # Store info gain, split info, and gain ratio all in one object
    # initialize with info gain and split info
    data = {'info_gain': info_gain, 'split_info': split_info}

    # calculate gain ratio simply from the other two metrics
    if data['split_info'] == 0:
        data['gain_ratio'] = 0
    else:
        data['gain_ratio'] = data['info_gain'] / \
                             data['split_info']

    print "Calculating gain ratio for {}: {}".format(attribute,
                                                     data['gain_ratio'])

    return data
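To make the docstring's arithmetic concrete, here is a self-contained sketch of the same computation on a toy 4/2 split, using only label lists (the entropy helper below is illustrative, not the project's calc_entropy):

import math

def entropy(labels):
    # H = -sum(p * log2(p)) over the label distribution
    total = len(labels)
    return -sum((labels.count(l) / total) * math.log(labels.count(l) / total, 2)
                for l in set(labels))

parent = ['yes', 'yes', 'yes', 'no', 'no', 'no']
children = [['yes', 'yes', 'yes', 'no'], ['no', 'no']]  # hypothetical split

weighted_child_avg = sum(len(c) / len(parent) * entropy(c) for c in children)
split_info = -sum((len(c) / len(parent)) * math.log(len(c) / len(parent), 2)
                  for c in children)
info_gain = entropy(parent) - weighted_child_avg          # ~0.459 bits
gain_ratio = info_gain / split_info if split_info else 0  # ~0.5

print(info_gain, split_info, gain_ratio)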
Example #3
def sample_cv(num_clusters,
              random_seed=None,
              out_dir='experiments/trained_models'):
    data = get_counts()

    random_seed = int(time.time()) if random_seed is None else random_seed
    np.random.seed(random_seed)

    model_name = 'Mix_' + str(num_clusters).zfill(3)
    dataset = 'ICGC-BRCA'

    out_dir_for_file = os.path.join(out_dir, dataset, model_name)

    try:
        os.makedirs(out_dir_for_file)
    except OSError:
        pass

    out_file = out_dir_for_file + "/" + str(random_seed)
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} already exists'.format(
            model_name, dataset, random_seed))
        # return

    scores_dict, parameters = train(num_clusters, data)
    dict_to_save = {'scores': scores_dict, 'parameters': parameters}
    save_json(out_file, dict_to_save)
Example #4
def get_new_label_set(new_image_metadata, curr_labels):

    label_counts = utils.get_counts(new_image_metadata)

    for label_name, _ in sorted(label_counts.items(), key=itemgetter(1), reverse=True):

        if label_name in curr_labels:
            continue

        new_labels = curr_labels.union([label_name])

        if len('+'.join(new_labels)) > 100:
            # pixabay api rule: search queries are limited to 100 characters
            continue

        # skip any candidate set that has already been used (order-insensitive)
        dup_set_found = False
        for used_label_group, _ in USED_LABELS.items():
            if len(used_label_group.symmetric_difference(new_labels)) == 0:
                dup_set_found = True
                break

        if not dup_set_found:
            break
    else:
        # for/else: every candidate was a duplicate, so fall back to an empty set
        new_labels = set()

    return new_labels
Example #5
def update_files(metadata):
    write_pixabay_metadata_file(metadata)
    remove_orphaned_metadata()
    metadata = read_pixabay_metadata_file()
    print(f'metadata file saved. {len(metadata)} total records')
    label_counts = utils.get_counts(metadata)
    write_pixabay_tally_file(label_counts)
    print(f'tally file saved. {len(label_counts)} unique labels.')
Example #6
File: c45.py Project: tangert/CS378
def calc_entropy(data):
    entropy = 0
    predictions = get_predictions(get_counts(data, LABEL_LOCATION))

    for label in predictions:
        entropy -= predictions[label] * math.log(predictions[label], 2)

    return entropy
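A quick sanity check of the loop above, assuming get_predictions yields a label -> probability mapping (which the indexing implies): a uniform two-label distribution gives exactly one bit of entropy.

import math

predictions = {'yes': 0.5, 'no': 0.5}  # hypothetical get_predictions output
entropy = -sum(p * math.log(p, 2) for p in predictions.values())
assert entropy == 1.0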
Example #7
def index():
    cols, rows = insert_stats(hours=hours, window=12)
    total_counts = sorted(list(map(list, get_counts(hours=24).items())))
    total_counts.insert(0, ['Website', 'Counts'])
    return render_template('charts.html',
                           cols=cols,
                           rows=rows,
                           counts=total_counts,
                           h=str(hours))
Example #8
def test_get_counts_pseudo(self):
    """get_counts: should work when pseudo in ref -> classification off"""
    # pairs that would normally be compatible are now contradicting
    ref = BasePairs([(0, 8), (1, 7), (4, 10)])
    pred = BasePairs([(0, 8), (3, 6), (4, 10)])
    seq = 'GACUGUGUCAU'
    exp = {'TP': 2, 'TN': 13 - 2 - 1, 'FN': 1, 'FP': 1,
           'FP_INCONS': 0, 'FP_CONTRA': 1, 'FP_COMP': 0}
    self.assertEqual(get_counts(ref, pred, split_fp=True,
                                sequences=[seq], min_dist=4), exp)
Example #9
def update_files(metadata, totalsdata, top3=False):
    write_pixabay_metadata_file(metadata)
    remove_orphaned_metadata()
    metadata = read_pixabay_metadata_file()
    print(f'metadata file saved. {len(metadata)} total records.', end=' ')
    label_counts = utils.get_counts(metadata)
    write_pixabay_tally_file(label_counts, top3=top3)
    print(f'tally file saved. {len(label_counts)} unique labels.', end=' ')
    write_pixabay_totals_file(totalsdata)
    print('totals file saved.')
Example #10
def sample_cv(num_clusters, num_folds, fold, out_dir='experiments/sampleCV'):
    data = get_counts()

    if not 0 <= fold < num_folds:
        raise ValueError('num_folds is {} but fold is {}'.format(
            num_folds, fold))

    model_name = 'Mix_' + str(num_clusters).zfill(3)
    dataset = 'ICGC-BRCA'

    out_dir_for_file = os.path.join(out_dir, dataset, model_name)

    try:
        os.makedirs(out_dir_for_file)
    except OSError:
        pass

    out_file = out_dir_for_file + "/" + str(fold + 1) + '_' + str(num_folds)
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} already exists'.format(
            model_name, dataset, num_folds, fold))
        return

    # splitting the data
    sample_names = np.arange(len(data))
    splits = np.array_split(sample_names, num_folds)
    train_data = []
    test_data = []
    for chunk in range(num_folds):
        if chunk == fold:
            test_data.extend(splits[chunk])
        else:
            train_data.extend(splits[chunk])
    train_data = data[train_data]
    test_data = data[test_data]
    scores_dict, parameters = train_and_test(num_clusters, train_data,
                                             test_data)
    dict_to_save = {'scores': scores_dict, 'parameters': parameters}
    save_json(out_file, dict_to_save)
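The fold construction above is an index partition over np.array_split; a minimal toy run showing what lands in train and test (10 samples, 3 folds, fold=1):

import numpy as np

data = np.arange(10, 20)  # 10 toy samples
num_folds, fold = 3, 1

splits = np.array_split(np.arange(len(data)), num_folds)
test_idx = splits[fold]
train_idx = np.concatenate([s for i, s in enumerate(splits) if i != fold])

print(data[test_idx])   # [14 15 16]             -> the held-out chunk
print(data[train_idx])  # [10 11 12 13 17 18 19] -> the remaining chunks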
Example #11
from elasticsearch import Elasticsearch
import pandas as pd
import sys

from utils import get_counts

print('Connecting to elasticsearch...')
es = Elasticsearch(hosts='localhost:9200')

print('pulling state-measure counts...')
counts = get_counts(es)

print('Building plot...')
counts_df = pd.DataFrame([{'state': item['key']['state'],
                           'measure_id': item['key']['measure_id'],
                           'counts': item['doc_count']}
                          for item in counts])
plot_df = counts_df.pivot(index='state', columns='measure_id', values='counts')

plot = plot_df.plot.bar(stacked=True, figsize=(20, 20))

print('Saving plot to {}'.format(sys.argv[1]))
plot.get_figure().savefig(sys.argv[1])
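The shape of counts is implied by the comprehension above (aggregation buckets with a composite key dict and a doc_count); a toy run of the same pivot, with hypothetical bucket values:

import pandas as pd

counts = [
    {'key': {'state': 'TX', 'measure_id': 'M1'}, 'doc_count': 5},
    {'key': {'state': 'TX', 'measure_id': 'M2'}, 'doc_count': 3},
    {'key': {'state': 'CA', 'measure_id': 'M1'}, 'doc_count': 7},
]

counts_df = pd.DataFrame([{'state': item['key']['state'],
                           'measure_id': item['key']['measure_id'],
                           'counts': item['doc_count']}
                          for item in counts])
plot_df = counts_df.pivot(index='state', columns='measure_id', values='counts')
print(plot_df)  # one row per state, one column per measure_id, NaN where absent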
Example #12
def trace(root, i, j):
    if len(backpointers[i][j][root]) == 1:
        return [root, backpointers[i][j][root]]
    else:
        s = backpointers[i][j][root][0]
        Y = backpointers[i][j][root][1]
        Z = backpointers[i][j][root][2]
        return [root, trace(Y, i, s), trace(Z, s, j)]


# To get script runtime (optional)
# start = timeit.default_timer()

# Obtain count(X->YZ), count(X->w), and count(X), i.e. the binary, unary, and non-terminal counts
# (see utils.py)
nonterminal_count, unary_count, binary_count = get_counts(sys.argv[1])
nonterminal_simple, unary_simple, binary_simple = get_counts(sys.argv[3])

# Read dev file line by line
dev_data = open(sys.argv[2], "r")
line = dev_data.readline().strip()
while line:
    words = line.split(" ")
    chart, backpointers = cyk(words, binary_count, unary_count,
                              nonterminal_count)
    n = len(words)
    # Check if at least one valid tree is returned using the vertical markovization model
    if len(backpointers[0][n]) > 0:
        # Get parse tree of max probability starting with 'S'
        if 'S' in backpointers[0][n]:
            tree = trace('S', 0, n)
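trace recursively unwinds the backpointer chart into a bracketed tree. A self-contained toy illustration of the structure it expects, restated with backpointers as an explicit parameter instead of a global (the chart layout is inferred from the indexing above):

def trace(backpointers, root, i, j):
    entry = backpointers[i][j][root]
    if len(entry) == 1:   # unary rule: entry is [word]
        return [root, entry]
    s, Y, Z = entry       # binary rule: split point plus the two child labels
    return [root, trace(backpointers, Y, i, s), trace(backpointers, Z, s, j)]

# backpointers for the two-word sentence "the dog"
bp = {
    0: {1: {'DT': ['the']},
        2: {'S': (1, 'DT', 'NN')}},
    1: {2: {'NN': ['dog']}},
}
print(trace(bp, 'S', 0, 2))  # ['S', ['DT', ['the']], ['NN', ['dog']]]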
Example #13
File: c45.py Project: tangert/CS378
def __init__(self, rows):
    self.counts = get_counts(rows, LABEL_LOCATION)
    self.predictions = get_predictions(self.counts)
    self.data_class = None
Example #14
def trace(root, i, j):
    if len(backpointers[i][j][root]) == 1:
        return [root, backpointers[i][j][root]]
    else:
        s = backpointers[i][j][root][0]
        Y = backpointers[i][j][root][1]
        Z = backpointers[i][j][root][2]
        return [root, trace(Y, i, s), trace(Z, s, j)]


# To get script runtime (optional)
# start = timeit.default_timer()

# Obtain count(X->YZ), count(X->w), and count(X), i.e. the binary, unary, and non-terminal counts
# (see questions5_utils.py)
nonterminal_count, unary_count, binary_count = get_counts(sys.argv[1])
nonterminal_simple, unary_simple, binary_simple = get_counts(sys.argv[3])

# Read dev file line by line
dev_data = open(sys.argv[2], "r")
line = dev_data.readline().strip()
while line:
    words = line.split(" ")
    chart, backpointers = cyk(words, binary_count, unary_count, nonterminal_count)
    n = len(words)
    # Check if at least one valid tree is returned using the vertical markovization model
    if len(backpointers[0][n]) > 0:
        # Get parse tree of max probability starting with 'S'
        if "S" in backpointers[0][n]:
            tree = trace("S", 0, n)
        # If there are no valid parse trees starting with 'S', get arg max starting with any nonterminal