def test_co_occurrences():
    doc1 = data.read_file('../data/tasa/TASATest/Science/Agatha09.07.03.txt')
    doc2 = data.read_file('../data/tasa/TASATest_preprocessed/Science/Agatha09.07.03.txt')
    g0 = construct_cooccurrence_network(doc1, context='window', already_preprocessed=False)
    g1 = construct_cooccurrence_network(doc2, context='window', already_preprocessed=True)
    g2 = construct_cooccurrence_network(doc1, context='sentence', already_preprocessed=False)
    graphs = data.pickle_from_file('output/testdata/co-occurrence-graphs.pkl')
    assert graph.equal(g0, graphs[0])
    assert graph.equal(g1, graphs[1])
    assert graph.equal(g2, graphs[2])

    doc = data.read_file('output/testdata/higher.order.testdoc.preprocessed.txt')
    g1 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1])
    g12 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1, 2])
    g123 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1, 2, 3])
    g13 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1, 3])
    # Order 1 links adjacent terms only.
    assert ('foo', 'bar') in g1.edges()
    assert ('foo', 'baz') not in g1.edges()
    assert ('foo', 'cake') not in g1.edges()
    # Adding order 2 also links terms two positions apart.
    assert ('foo', 'bar') in g12.edges()
    assert ('foo', 'baz') in g12.edges()
    assert ('foo', 'cake') not in g12.edges()
    # Orders 1-3 link terms up to three positions apart.
    assert ('foo', 'bar') in g123.edges()
    assert ('foo', 'baz') in g123.edges()
    assert ('foo', 'cake') in g123.edges()
    # Orders [1, 3] skip the second-order pairs.
    assert ('foo', 'baz') not in g13.edges()
    print('ok')
def test():
    model_path = "./models/voc2012-64-0.55.h5"
    model = create_resnet50(NCLASS)
    model.load_weights(model_path)

    test_data_root = os.path.join(voc2012_root_path, "JPEGImages")
    test_data_label = "test.txt"
    all_labels = read_file(test_data_label)
    count = len(all_labels)
    correct = 0
    top_k = 3
    for index, image_and_label in enumerate(all_labels):
        image_name, ground_truth = image_and_label.split(" ")
        image_path = os.path.join(test_data_root, image_name)
        # Load the image and normalize pixel values to [-0.5, 0.5].
        image = cv2.imread(image_path)
        image = cv2.resize(image, (WIDTH, HEIGHT))
        image = image.astype(float) / 255.0 - 0.5
        image = np.expand_dims(image, axis=0)
        out = model.predict(image)
        out = out[0].tolist()
        # Indices of the top-k highest-scoring classes.
        labels = list(map(out.index, heapq.nlargest(top_k, out)))
        if int(ground_truth) in labels:
            correct += 1
        predict = " ".join([classes[x] for x in labels])
        logger.info("{}/{} {} ground_truth = {:<12} predict = {:<12}".format(
            index + 1, count, image_name, classes[int(ground_truth)], predict))
    logger.info("accuracy = {}/{} = {}".format(correct, count, correct * 1.0 / count))
def term_centrality_study(doc='air/reports_text/2005/a05a0059.html', num=20):
    import operator
    import dependency_experiments
    import co_occurrence_experiments

    def _print_terms(cents, rep, num):
        # Print the top terms as a LaTeX table row.
        ts = _top_cents(cents, num)
        terms = [t[0] for t in ts]
        print(rep + ' & ' + ', '.join(terms) + ' \\\\')

    def _top_cents(cents, num):
        return sorted(cents.items(), key=operator.itemgetter(1), reverse=True)[0:num]

    def _calc_cents(g, metric, gcents=None):
        if gcents:
            icc = graph_representation.calculate_icc_dict(gcents)
        else:
            icc = None
        return graph_representation.graph_to_dict(g, metric, icc)

    dataset = 'air/reports'
    path = '../data/' + doc
    doc = data.read_file(path)

    # Co-occurrence network: term centrality with and without ICC.
    metric = graph.GraphMetrics.DEGREE
    context = 'window'
    g = graph_representation.construct_cooccurrence_network(doc, context=context)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Co-occurrence TC', num)
    gcents = co_occurrence_experiments.retrieve_centralities(dataset, context, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Co-occurrence TC-ICC', num)

    # Dependency network: term centrality with and without ICC.
    metric = graph.GraphMetrics.EIGENVECTOR
    deps = data._text_to_dependencies(doc)
    g = graph_representation.construct_dependency_network(deps)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Dependency TC', num)
    gcents = dependency_experiments.retrieve_centralities(dataset, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Dependency TC-ICC', num)

    # Frequency-based baselines for comparison.
    fdict = freq_representation.text_to_dict([doc], freq_representation.FrequencyMetrics.TF_IDF)[0]
    _print_terms(fdict, 'TF-IDF', num)
    fdict = freq_representation.text_to_dict([doc], freq_representation.FrequencyMetrics.TF)[0]
    _print_terms(fdict, 'TF', num)
def load_test_data(self, filename):
    self.test_set = data.read_file(filename)
    self.embedding = np.array(data.load('word_emb.pkl'))
import rule
import data

dataSet = data.read_file()


def evaluate_fitness(population):
    for individual in population:
        individual.fitness = 0
        fitness_function(individual, dataSet)


def fitness_function(individual, data_set):
    # Decode the individual's genes into a rule base: for each rule,
    # numberOfConditions condition genes followed by one output gene.
    gene_count = 0
    rule_base = []
    for new_rule in range(rule.numberOfRules):
        rule_base.append(rule.Rule())
        for condition in range(rule_base[new_rule].numberOfConditions):
            rule_base[new_rule].conditions.append(individual.genes[gene_count])
            gene_count += 1
        rule_base[new_rule].output = individual.genes[gene_count]
        gene_count += 1
    # Fitness is the number of data items correctly classified by a matching rule.
    for current_data in data_set:
        for current_rule in rule_base:
            if rule.check_rules_match(current_data.bytes, current_rule.conditions):
                if current_data.output == current_rule.output:
                    individual.fitness += 1
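# --- Illustrative driver sketch ---
# A minimal way this module could be exercised. The Individual class, binary
# genes, and population size below are assumptions for illustration; only
# rule.numberOfRules, rule.Rule().numberOfConditions, and evaluate_fitness
# come from the code above.
import random


class Individual:
    def __init__(self, genes):
        self.genes = genes
        self.fitness = 0


# Each rule consumes numberOfConditions condition genes plus one output gene.
genes_per_rule = rule.Rule().numberOfConditions + 1
genome_length = rule.numberOfRules * genes_per_rule
population = [Individual([random.randint(0, 1) for _ in range(genome_length)])
              for _ in range(50)]
evaluate_fitness(population)
best = max(population, key=lambda ind: ind.fitness)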
import logging as log
import os
import sys

import spacy

from conf import conf, experiments
from data import preprocess, read_file

log.basicConfig(format='%(asctime)s %(message)s', level=log.INFO)

try:
    experiment = experiments[sys.argv[1]]
except KeyError:
    log.error("experiment \"{0}\" does not exist".format(sys.argv[1]))
    sys.exit(1)

nlp = spacy.load(experiment["language"], disable=["ner", "pos", "parser"])

data_dir = os.path.join(conf["data_dir"], experiment["data"])
data_file = os.path.join(data_dir, "{0}.tsv".format(sys.argv[2]))
sentences, labels = read_file(data_file)

sentences_preprocessed = [
    preprocess(sentence, experiment, nlp) for sentence in sentences
]

log.info("writing preprocessed file")
# Emit one token per line for sentences whose label matches sys.argv[3].
for sentence_preprocessed, label in zip(sentences_preprocessed, labels):
    if label == sys.argv[3]:
        for token in sentence_preprocessed:
            sys.stdout.write("{0}\n".format(token))