Code Example #1
import networkx as nx

# Project-local modules (u and c match the imports in the companion scripts;
# lib.expansion is an assumed module path for the ex helpers used below)
import lib.utility as u
import config as c
import lib.expansion as ex


if __name__ == "__main__":
    # Method variables
    #predict_threshold = 0.9 # prediction threshold
    
    input_file = "test-full.txt"
    output_file = "test-expanded-all-neighb.txt"
    
    # Generate graph from edges file
    G = ex.read_graph(c.output_dir + "edges-directed-6.txt")
    
    # Get feature space and target words
    features_index = u.read_features_file(c.output_dir + "feat-space.txt")
    target_words = u.get_target_words()
    
    # Get sentences/vectors of data to expand
    sentences, data = ex.get_expansion_data(input_file, features_index)
    
    # Get matrix of weight vectors
    print "generating weight matrix..."
    W, b_arr = u.get_weight_matrix(target_words)
    
    
    print "expanding feature vectors..."
    i = 0
    for vect in data:
        #if i == 5:
        #    break
        
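The loop body is cut off here in the original. Below is a minimal sketch of one plausible expansion step, assuming (not confirmed by the source) that W is an (n_words, n_features) matrix of per-word logistic-regression weights, b_arr holds the matching biases, and scores are thresholded as the commented-out predict_threshold suggests; expand_vector is a hypothetical helper, not a function from the codebase.

import numpy as np

def expand_vector(vect, W, b_arr, target_words, threshold=0.9):
    # Hypothetical helper: score every target word's classifier on one
    # feature vector and keep the words whose sigmoid score clears the threshold.
    scores = 1.0 / (1.0 + np.exp(-(W.dot(vect) + b_arr)))  # sigmoid activations
    return [w for w, s in zip(target_words, scores) if s >= threshold]

Each returned word list would then presumably be written alongside the corresponding sentence to output_file.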
Code Example #2
# Get training data (sentences) to learn agreement with.
# We select a different dataset for every pair of words to optimise the agreement measure.
# Dataset for pair x and y: N sents containing x, N sents containing y, and
# N sents containing neither (a sampling sketch follows this example).

import lib.utility as u
import config as c
import random
import itertools
import os
from time import time

if __name__ == "__main__":
    # Load target words into memory
    target_words = u.get_target_words()

    # Get highly related pairs of words to get agreement data for
    #pairs = u.get_lines(c.output_dir + "target-word-high-pairs.txt")

    # Store all sentences in memory
    sentences_with = set(u.get_lines(c.output_dir + "sentences-with.txt"))
    sentences_without = set(u.get_lines(c.output_dir +
                                        "sentences-without.txt"))

    # Get all sentences used already - using a dict (so we only care about sentences used for specific word)
    used_sentences = {}
    for word in target_words:
        with open(c.train_data_dir + word + ".txt") as f:
            sents = set()
            for line in f:
                # Drop the 3-character line prefix (presumably a label) and whitespace
                sents.add(line[3:].strip())
            used_sentences[word] = sents
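
A minimal sketch of the per-pair sampling described in the header comment, under the assumption that sentences-without.txt holds sentences containing no target words; sample_pair_dataset and the whitespace-token membership test are illustrative, not from the source.

import random

def sample_pair_dataset(x, y, n, sentences_with, sentences_without,
                        used_sentences):
    # Hypothetical helper: build the agreement dataset for the pair (x, y).
    used = used_sentences.get(x, set()) | used_sentences.get(y, set())

    # Candidate sentences containing x (resp. y), excluding already-used ones;
    # crude whitespace tokenisation is assumed here
    with_x = [s for s in sentences_with if x in s.split() and s not in used]
    with_y = [s for s in sentences_with if y in s.split() and s not in used]
    with_neither = [s for s in sentences_without if s not in used]

    # N sents containing x, N containing y, and N containing neither
    # (assumes each candidate pool has at least n sentences left)
    return (random.sample(with_x, n) + random.sample(with_y, n)
            + random.sample(with_neither, n))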
Code Example #3
def predict_word(word):  # assumed name and signature; the original definition line is truncated
    """
    Predicts presence of word in each instance in dataset, and returns results array.
    """
    file_name = c.rbf_data + word + '.txt'

    # Get predicted classes for word; drop the first line (presumably a header)
    results = u.get_lines(file_name)
    results = results[1:]

    return results


if __name__ == "__main__":
    # Read features and target words
    features_index = u.read_features_file(c.output_dir + "feat-space.txt")
    feat_size = len(features_index)  #5000
    target_words = u.get_target_words()  # len = 700

    # Set up two matrices - one for directed (asymmetric matrix) and one for undirected (symmetric matrix) agreements
    #dimensions = len(target_words)
    #SM = [[0.0000 for j in xrange(dimensions)] for i in xrange(dimensions)]
    #AM = [[0.0000 for j in xrange(dimensions)] for i in xrange(dimensions)]

    # Get pairs
    #pairs = u.get_lines(c.output_dir + "target-word-high-pairs.txt")

    # Arrays to hold agreements
    pos_agr_a = []
    cond_prob_a = []

    # Iterate through each pair, and compute two types of agreement
    # (see the sketch of the two measures after this example)
    # counter
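
The pair loop itself is truncated here. Below is a minimal sketch of the two measures the arrays suggest, assuming (not confirmed by the source) that predictions are '0'/'1' strings as returned by the function above, that positive agreement is the standard 2a/(2a + b + c) statistic, and that the directed measure is the conditional probability P(y fires | x fires); agreement is a hypothetical helper.

def agreement(preds_x, preds_y):
    # Hypothetical measures for one ordered pair (x, y) of target words
    both = sum(1 for a, b in zip(preds_x, preds_y) if a == '1' and b == '1')
    x_only = sum(1 for a, b in zip(preds_x, preds_y) if a == '1' and b == '0')
    y_only = sum(1 for a, b in zip(preds_x, preds_y) if a == '0' and b == '1')

    # Positive agreement: symmetric in x and y, would feed pos_agr_a (and SM)
    denom = 2 * both + x_only + y_only
    pos_agr = 2.0 * both / denom if denom else 0.0

    # Conditional probability P(y = 1 | x = 1): asymmetric, would feed
    # cond_prob_a (and AM); computing it for both (x, y) and (y, x)
    # yields the directed matrix
    cond_prob = float(both) / (both + x_only) if (both + x_only) else 0.0

    return pos_agr, cond_prob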