def test_co_occurrences():
    doc1 = data.read_file('../data/tasa/TASATest/Science/Agatha09.07.03.txt')
    doc2 = data.read_file('../data/tasa/TASATest_preprocessed/Science/Agatha09.07.03.txt')
    g0 = construct_cooccurrence_network(doc1, context='window', already_preprocessed=False)
    g1 = construct_cooccurrence_network(doc2, context='window', already_preprocessed=True)
    g2 = construct_cooccurrence_network(doc1, context='sentence', already_preprocessed=False)
    graphs = data.pickle_from_file('output/testdata/co-occurrence-graphs.pkl')
    assert(graph.equal(g0,graphs[0]))
    assert(graph.equal(g1,graphs[1]))
    assert(graph.equal(g2,graphs[2]))

    doc = data.read_file('output/testdata/higher.order.testdoc.preprocessed.txt')
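    # Higher-order networks also link terms that are two or three co-occurrence
    # steps apart; the asserts below check which extra edges each order adds.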
    g1 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1])
    g12 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1,2])
    g123 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1,2,3])
    g13 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1,3])
    assert(('foo','bar') in g1.edges())
    assert(('foo','baz') not in g1.edges())
    assert(('foo','cake') not in g1.edges())
    assert(('foo','bar') in g12.edges())
    assert(('foo','baz') in g12.edges())
    assert(('foo','cake') not in g12.edges())
    assert(('foo','bar') in g123.edges())
    assert(('foo','baz') in g123.edges())
    assert(('foo','cake') in g123.edges())
    assert(('foo','baz') not in g13.edges())
    print('ok')
Example #2
def test():
    model_path = "./models/voc2012-64-0.55.h5"
    model = create_resnet50(NCLASS)
    model.load_weights(model_path)

    test_data_root = os.path.join(voc2012_root_path, "JPEGImages")
    test_data_label = "test.txt"
    all_labels = read_file(test_data_label)
    count = len(all_labels)
    correct = 0
    top_k = 3
    for index, image_and_label in enumerate(all_labels):
        image_name, ground_truth = image_and_label.split(" ")
        image_path = os.path.join(test_data_root, image_name)
        image = cv2.imread(image_path)
        image = cv2.resize(image, (WIDTH, HEIGHT))
        # scale pixels to [-0.5, 0.5]; np.float was removed from NumPy, use float32
        image = image.astype(np.float32) / 255.0 - 0.5
        image = np.expand_dims(image, axis=0)
        out = model.predict(image)
        out = out[0].tolist()
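        # indices of the top_k highest-scoring classes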
        labels = list(map(out.index, heapq.nlargest(top_k, out)))
        if int(ground_truth) in labels:
            correct += 1
        predict = " ".join([classes[x] for x in labels])
        logger.info("{}/{} {} ground_truth = {:<12} predict = {:<12}".format(
            index + 1, count, image_name, classes[int(ground_truth)], predict))

    logger.info("accuracy = {}/{} = {}".format(correct, count,
                                               correct * 1.0 / count))
Example #3
def term_centrality_study(doc='air/reports_text/2005/a05a0059.html', num=20):
    def _print_terms(cents, rep, num):
        ts = _top_cents(cents, num)
        terms = [t[0] for t in ts]  # term names only, ordered by centrality
        print(rep + ' & ' + ', '.join(terms) + ' \\\\')
    def _top_cents(cents, num):
        return sorted(cents.items(), key=operator.itemgetter(1), reverse=True)[:num]
    def _calc_cents(g, metric, gcents=None):
        if gcents: icc = graph_representation.calculate_icc_dict(gcents)
        else: icc = None
        return graph_representation.graph_to_dict(g, metric, icc)

    import operator
    import dependency_experiments
    import co_occurrence_experiments

    dataset = 'air/reports'
    path = '../data/'+doc
    doc = data.read_file(path)

    metric = graph.GraphMetrics.DEGREE
    context = 'window'
    g = graph_representation.construct_cooccurrence_network(doc, context=context)
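    # Plain term centrality (TC) versus TC-ICC, where dataset-level centralities
    # are folded in via graph_representation.calculate_icc_dict.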
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Co-occurrence TC', num)
    gcents = co_occurrence_experiments.retrieve_centralities(dataset, context, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Co-occurrence TC-ICC', num)

    metric = graph.GraphMetrics.EIGENVECTOR
    deps = data._text_to_dependencies(doc)
    g = graph_representation.construct_dependency_network(deps)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Dependency TC', num)
    gcents = dependency_experiments.retrieve_centralities(dataset, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Dependency TC-ICC', num)

    fdict = freq_representation.text_to_dict([doc], freq_representation.FrequencyMetrics.TF_IDF)[0]
    _print_terms(fdict, 'TF-IDF', num)

    fdict = freq_representation.text_to_dict([doc], freq_representation.FrequencyMetrics.TF)[0]
    _print_terms(fdict, 'TF', num)
Example #4
    def load_test_data(self, filename):
        self.test_set = data.read_file(filename)
        self.embedding = np.array(data.load('word_emb.pkl'))
        return
import rule
import data

dataSet = data.read_file()


def evaluate_fitness(population):
    for individual in population:
        individual.fitness = 0
        fitness_function(individual, dataSet)


def fitness_function(individual, data_set):
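    # Decode the flat gene list into a rule base: each rule consumes
    # numberOfConditions genes for its conditions plus one gene for its output.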
    gene_count = 0
    rule_base = []

    for new_rule in range(0, rule.numberOfRules):
        rule_base.append(rule.Rule())

        for condition in range(0, rule_base[new_rule].numberOfConditions):
            rule_base[new_rule].conditions.append(individual.genes[gene_count])
            gene_count += 1

        rule_base[new_rule].output = individual.genes[gene_count]
        gene_count += 1

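    # Score one fitness point for every data item a rule matches with the correct output.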
    for current_data in data_set:
        for current_rule in rule_base:
            if rule.check_rules_match(current_data.bytes, current_rule.conditions):
                if current_data.output == current_rule.output:
                    individual.fitness += 1
import sys
import logging as log
from conf import experiments, conf
from data import preprocess, read_file
import spacy
import os
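# CLI arguments (as used below): sys.argv[1] = experiment name from conf.experiments,
# sys.argv[2] = basename of the .tsv data file, sys.argv[3] = label whose sentences are written out.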

log.basicConfig(format='%(asctime)s %(message)s', level=log.INFO)

try:
    experiment = experiments[sys.argv[1]]
except IndexError:
    log.error("missing experiment name argument")
    sys.exit(1)
except KeyError:
    log.error("experiment \"{0}\" does not exist".format(sys.argv[1]))
    sys.exit(1)

nlp = spacy.load(experiment["language"], disable=["ner", "pos", "parser"])

data_dir = os.path.join(conf["data_dir"], experiment["data"])
data_file = os.path.join(data_dir, "{0}.tsv".format(sys.argv[2]))
sentences, labels = read_file(data_file)
sentences_preprocessed = [
    preprocess(sentence, experiment, nlp) for sentence in sentences
]

log.info("writing preprocessed file")
for sentence_preprocessed, label in zip(sentences_preprocessed, labels):
    if label == sys.argv[3]:
        for token in sentence_preprocessed:
            sys.stdout.write("{0}\n".format(token))