コード例 #1
0
def main():
    """
    Compute the k nearest neighbors for a set of target words in a pickled
    semantic space and write them, as cosine distances, to a tab-separated
    CSV file (one line per target).
    """

    # Get the arguments
    args = docopt("""Compute  k nearest neighbors for targets.

    Usage:
        knn.py <spacePrefix1> <k> <outPath> [<testset> <co>]

        <spacePrefix1> = path to pickled space without suffix
        <testset> = path to file with tab-separated word pairs
        <co> = column index for targets
        <k> = parameter k (k nearest neighbors)
        <outPath> = output path for result file

    Note:
        ...
        
    """)

    spacePrefix1 = args['<spacePrefix1>']
    testset = args['<testset>']
    outPath = args['<outPath>']
    k = int(args['<k>'])

    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': True,
    })
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space1 = load_pkl_files(spacePrefix1)

    if testset is not None:
        # BUGFIX: parse <co> only when a test set is given. Previously
        # `co = int(args['<co>'])` ran unconditionally and raised
        # TypeError (int(None)) whenever the optional <testset>/<co>
        # arguments were omitted, so the else-branch below was unreachable.
        co = int(args['<co>'])
        with codecs.open(testset, 'r', 'utf8') as f_in:
            targets = [line.strip().split('\t')[co] for line in f_in]
    else:
        # If no test set is provided, compute values for all targets occurring in both spaces
        targets = [target.decode('utf8') for target in space1.get_row2id()]

    target2neighbors = {}
    for i, t1 in enumerate(targets):

        try:
            neighbors1 = space1.get_neighbours(t1.encode('utf8'), k,
                                               CosSimilarity())
            # The nearest neighbor of a word is the word itself; drop it.
            del neighbors1[0]
        except KeyError:
            # Target is not in the space: emit a NaN placeholder row.
            neighbors1 = [('nan', float('nan'))]

        target2neighbors[t1] = neighbors1

    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for t1 in targets:
            # Convert cosine similarity to cosine distance, export nearest neighbors
            print >> f_out, t1 + '\t' + ' '.join(
                [str((n, 1 - v)) for (n, v) in target2neighbors[t1]])

    logging.info("--- %s seconds ---" % (time.time() - start_time))
コード例 #2
0
            verbs_for_mean_bytes.append(verb_b)

    print str(len(verbs_bytes)) + ' verbs in main list\n'
    print str(len(verbs_for_mean_bytes)) + ' verbs in mean list\n'

    f_list = io.open(FLAGS.filebase + '/neighbours_list.csv',
                     'w',
                     encoding='utf8')
    f_mean = io.open(FLAGS.filebase + '/distance_to_mean.csv',
                     'w',
                     encoding='utf8')

    for verb in verbs_bytes:
        neighbour_list = this_space.get_neighbours(verb,
                                                   FLAGS.number_neighbours,
                                                   CosSimilarity())
        neighbour_words = [item[0] for item in neighbour_list]
        neighbour_similarities = [item[1] for item in neighbour_list]
        f_list.write(verb.decode('utf8') + ',')
        f_list.write(','.join(neighbour_words).decode('utf8') + '\n')
        floats_line = 'Sim.,'
        for sim in neighbour_similarities:
            floats_line += '%.7f,' % sim

        verbs_line = 'Verb,'
        for word in neighbour_words:
            if word in verbs_bytes:
                verbs_line += '1,'
            else:
                verbs_line += ','
コード例 #3
0
ファイル: dissect.py プロジェクト: DariaRyzhova/phd
        els_for_comp.append(element)
    return els_for_comp


# Build two spaces from .dm matrix / row files (create_space is a helper
# presumably defined earlier in this file); typ_space supplies the gold
# similarities for the correlation at the bottom.
typ_space = create_space(TypDmFile, TypRowsFile)
distr_space = create_space(DistrDmFile, DistrRowsFile)

#load a space from a pickle file
#my_space = io_utils.load("./sharp/lexfunc/lexfunc_Ridge_pract.pkl")

#distributional vectors processing
# Reweight counts with PPMI, then reduce the space to 300 dimensions via SVD.
distr_space = distr_space.apply(PpmiWeighting())
distr_space = distr_space.apply(Svd(300))
#io_utils.save(distr_space, "./spaces/smooth_phrases_ppmi.pkl")

items = items_from_file(itemsFile)
els_for_comp = elements_for_composition(items)

# Compose the selected elements additively (alpha=1, beta=1, i.e. unweighted).
my_comp = WeightedAdditive(alpha=1, beta=1)
distr_space = my_comp.compose(els_for_comp, distr_space)

# NOTE(review): this rebinds the module-level name `pairs` from the helper
# function to its own return value, so pairs() cannot be called again later.
pairs = pairs(items)

# Predicted similarities come from the composed distributional space,
# gold similarities from the Typ space, over the same word pairs.
predicted = distr_space.get_sims(pairs, CosSimilarity())
gold = typ_space.get_sims(pairs, CosSimilarity())

#compute correlations
print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")
コード例 #4
0
#ex08.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load a space
my_space = io_utils.load("./data/out/ex01.pkl")

#get the top 2 neighbours of "car"
print my_space.get_neighbours("car", 2, CosSimilarity())
コード例 #5
0
# Train a Lexical Function composition model using a ridge-regression
# learner (regularisation parameter 2) on the training data and the two
# spaces (argument space plus peripheral space).
print("Training Lexical Function composition model...")
comp_model = LexicalFunction(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)

# Compose the test phrases (columns 0-2 of the phrase file) in `space`.
print("Composing phrases...")
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)

# Test pairs come from columns 0-1; gold scores from column 2 of the same file.
print("Reading similarity test data...")
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0, 1])
gold = io_utils.read_list(test_similarity_file, field=2)

print("Computing similarity with lexical function...")
pred = composed_space.get_sims(test_pairs, CosSimilarity())

#use this composed space to assign similarities
print("Scoring lexical function...")
print(scoring_utils.score(gold, pred, "spearman"))

# Repeat the train / compose / score pipeline with a Full Additive model.
print("Training Full Additive composition model...")
comp_model = FullAdditive(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print(scoring_utils.score(gold, pred, "spearman"))

# Same pipeline again with a Weighted Additive model (weights learned in train).
print("Training Weighted Additive composition model...")
comp_model = WeightedAdditive()
comp_model.train(train_data, space, per_space)
コード例 #6
0
ファイル: ex07.py プロジェクト: tttthomasssss/dissect-py3
#ex07.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

# Load the core word space and the peripheral (phrase) space.
word_space = io_utils.load("./data/out/ex01.pkl")
phrase_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

# Show which rows (items) each space contains.
print(word_space.id2row)
print(phrase_space.id2row)

# Cross-space similarity: "car" lives in the first space,
# "sports_car" in the second (passed via space2).
print(
    word_space.get_sim("car", "sports_car", CosSimilarity(),
                       space2=phrase_space))
コード例 #7
0
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])

lengths = []
found = True
for wp in word_pairs:
    try:
        v1 = my_space.get_row(wp[0])
        v2 = my_space.get_row(wp[1])
    except KeyError:
        #print wp[0],"or",wp[1],"not found"
        found = False
    if found:
        composed_space = add.compose([(wp[0], wp[1], "_composed_")], my_space)
        neighbours = composed_space.get_neighbours("_composed_",
                                                   10,
                                                   CosSimilarity(),
                                                   space2=my_space)
        print wp[0], wp[1]
        print neighbours
        density = 0
        for n in neighbours:
            density += n[1]
        density = density / 10
        print "Density", density
        c = composed_space.get_row("_composed_")
        print "Norm ", c.norm()
        cos = composed_space.get_sim("_composed_",
                                     wp[1],
                                     CosSimilarity(),
                                     space2=my_space)
        print "Cos ", cos
コード例 #8
0
ファイル: ex20.py プロジェクト: tttthomasssss/dissect-py3
#ex20.py
#-------
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity

# Load a previously saved semantic space.
semantic_space = io_utils.load("data/out/ex01.pkl")

# Score every word pair in the test file with cosine similarity.
fname = "data/in/word_sims.txt"
pair_list = io_utils.read_tuple_list(fname, fields=[0, 1])
predicted = semantic_space.get_sims(pair_list, CosSimilarity())

# Correlate the predictions with the gold scores (third column of the file).
gold = io_utils.read_list(fname, field=2)
print("Spearman")
print(scoring_utils.score(gold, predicted, "spearman"))
print("Pearson")
print(scoring_utils.score(gold, predicted, "pearson"))
コード例 #9
0
ファイル: ex06.py プロジェクト: totonac/dissect
#ex06.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load a space
my_space = io_utils.load("./data/out/ex01.pkl")

print my_space.cooccurrence_matrix
print my_space.id2row

#compute similarity between two words in the space
print my_space.get_sim("car", "car", CosSimilarity())
print my_space.get_sim("car", "book", CosSimilarity())
コード例 #10
0
def computeAnalogy(w1, w2, w3):
    """Solve a word analogy by vector offset.

    Composes w1 and w2 into a temporary "step1" vector with the
    module-level `sub` model, then combines "step1" with w3 into
    "step2" with the `add` model, and returns the single nearest
    neighbour of "step2" in `space` under cosine similarity.
    """
    offset_space = sub.compose([(w1, w2, "step1")], space)
    analogy_space = add.compose([("step1", w3, "step2")],
                                (offset_space, space))
    return analogy_space.get_neighbours("step2", 1, CosSimilarity(), space)
コード例 #11
0
#kneighbours.py
#USAGE: python kneighbours [space file] [word] [k]
#EXAMPLE: python2.7 kneighbours.py ~/UkWac/dissect-data/ANs/out/CORE_SS.ans.ppmi.row.pkl car-n 30
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity
import sys

#load a space
my_space = io_utils.load(sys.argv[1])

#get the top 2 neighbours of "car"
print my_space.get_neighbours(sys.argv[2], int(sys.argv[3]), CosSimilarity())
コード例 #12
0
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity
import sys

#read in a space
my_space = io_utils.load(sys.argv[1])

#compute similarities of a list of word pairs
fname = sys.argv[2]
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1, 2])

predicted = []
gold = []
cos = 0
for wp in word_pairs:
    try:
        cos = my_space.get_sim(wp[0], wp[1], CosSimilarity())
        if cos > 0:
            #print wp[0],wp[1],cos
            predicted.append(cos)
            gold.append(wp[2])
    except:
        print "Couldn't measure cosine..."

#compute correlations
print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")