Example #1
def eval_and_log_w2v(t_train, w2v_space_file):
    """
    Evaluates the trained Word2Vec model. The evaluation metric is the Spearman correlation.
    Evaluation is designed to use the MEN dataset described in Bruni et al. (2014).
    :param t_train: float -- time taken to train the Word2Vec model
    :param w2v_space_file: str -- file path of the w2v model that was trained
    """
    global performance_summary
    if verbose: print("evaluating and logging the Word2Vec model ...")
    # evaluate the w2v model
    try:
        w2v_space = utils.readDM(w2v_space_file)
        spcorr, pairs = MEN.compute_men_spearman(w2v_space, testset_file)
        with open(w2v_results_file, "a+") as f:
            f.write("RUN " + str(run) + ":" + \
                    "\tSP_CORR: " + str(spcorr) + \
                    "\tTEST_PAIRS: " + str(pairs) + \
                    "\tTRAIN_TIME: " + str(t_train) + "\n")
        # keep internal log
        performance_summary[run].extend([spcorr, pairs, t_train])
    except Exception as e:
        with open(errorlog, "a") as f:
            f.write(str(e)[:500] + "\n")
        print("An error occured while evaluating Word2Vec. Check", errorlog,
              "for further information.")
Example #2
    def read_incremental_parts(self,
                               outspace,
                               outcols,
                               flyfile,
                               verbose=False):
        """
        Returns a co-occurrence matrix, a corresponding vocabulary and its index, and a Fruitfly object.
        The matrix and the vocabulary can be newly instantiated or taken from existing files.
        The Fruitfly object can be optionally created alongside, also either new or from an
        existing file. All these options are handled by attributes of the Incrementor object
        from which this method is called.
        :param outspace: str -- file path to a co-occurrence count
        :param outcols: str -- file path to the corresponding vocabulary
        :param flyfile: str -- file path to a Fruitfly config (parameters and connections)
        :param verbose: bool -- comment on the workings via print statements
        :return: ndarray [[]] -- co-occurrence matrix (two axes, each of length n)
        :return: {str:int} -- mapping of vocabulary to matrix positions (length: n)
        :return: {int:str} -- mapping of matrix indices to vocabulary (length: n)
        :return: Fruitfly -- Fruitfly object (or None if not wanted)
        """
        if self.is_incremental:
            if verbose:
                print("\nLoading existing co-occurrence count from", outspace,
                      "...")
            # returns dict of word : vector
            unhashed_space = utils.readDM(outspace)
            i_to_words, words_to_i = utils.readCols(outcols)
            dimensions = sorted(words_to_i, key=words_to_i.get)
            cooc = np.stack(tuple([unhashed_space[w] for w in dimensions]))
        else:
            cooc = np.array([[]])
            words_to_i = {}
            i_to_words = {}

        if self.is_grow_fly:
            if self.is_new_fly:
                if verbose: print("creating new fruitfly...")
                # default config: (50,40000,6,5,log)
                fruitfly = Fruitfly.from_scratch(max_pn_size=self.fly_max_pn)
            else:
                if verbose: print("loading fruitfly from", flyfile, "...")
                fruitfly = Fruitfly.from_config(flyfile)
                self.fly_max_pn = fruitfly.max_pn_size
        else:
            fruitfly = None

        return cooc, words_to_i, i_to_words, fruitfly
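
A hedged usage sketch of the return values (not part of the original class): incrementor stands in for an already configured Incrementor object, and the file paths are assumptions.

cooc, words_to_i, i_to_words, fruitfly = incrementor.read_incremental_parts(
    outspace="spaces/cooc.dm",      # assumed co-occurrence count file
    outcols="spaces/cooc.cols",     # assumed vocabulary file
    flyfile="configs/fly.cfg",      # assumed Fruitfly config
    verbose=True)

if words_to_i:                      # empty if no existing space was loaded
    first_word = i_to_words[0]      # the word mapped to matrix row 0
    print(first_word, cooc[words_to_i[first_word]][:10])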
Example #3
def prepare_flight():
    """
    read in the count vectors etc. and choose which ones to fly
    based on a list of required words.
    :return fly_these: {str:[float]} -- possibly a subset of unhashed_space
    :return unhashed_space: {str:[float]} -- words and their corresponding co-occurrence counts
    :return words_to_i: {str:int} -- mapping of context words to their position in the count
    """
    if verbose: print("Preparing hashing ...")
    unhashed_space = utils.readDM(breeder.outspace)
    i_to_words, words_to_i = utils.readCols(breeder.outcols)
    # only select words that will be needed for evaluation:
    if overlap_file is None:
        # in this case, fly() is applied to the whole of unhashed_space
        fly_these = unhashed_space
    else:
        words_for_flight = breeder.read_checklist(overlap_file)
        fly_these = {
            w: unhashed_space[w]
            for w in words_for_flight if w in unhashed_space
        }
    return fly_these, unhashed_space, words_to_i
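
A toy illustration of the overlap filter above, using plain dicts instead of a real .dm space; all data here is made up.

unhashed_space = {"cat": [1.0, 0.0], "dog": [0.5, 0.5], "tree": [0.0, 1.0]}
words_for_flight = ["cat", "tree", "unicorn"]    # "unicorn" is not in the space

fly_these = {w: unhashed_space[w] for w in words_for_flight if w in unhashed_space}
print(sorted(fly_these))                         # ['cat', 'tree']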
Example #4
                print("improvement:", round(internal_log[run[0]][4], 5),
                      "with configuration:", all_ff_specs[run[0]])


""" Parameter Input """
data, column_labels = get_text_resources_from_argv()
goldstandard = get_testset_from_argv()
log_dest = get_logging_from_argv()

flattening = get_flattening_from_argv()
kc_factor_min, kc_factor_max, kc_steps = get_ranges_from_argv("-kc")
projections_min, projections_max, proj_steps = get_ranges_from_argv("-proj")
hash_perc_min, hash_perc_max, hash_steps = get_ranges_from_argv("-hash")

# returns {str:[float]}
in_space = utils.readDM(data)
# returns {int:str} and {str:int}
i_to_cols, cols_to_i = utils.readCols(column_labels)
# length of word vector (= input dimension)
pn_size = len(i_to_cols)

# for reporting purposes
verbose = "-v" in sys.argv
no_overall_summary_wanted = "-no-summary" in sys.argv
# {run:ff_specs}
all_ff_specs = {}
# {run:results}
internal_log = {}
sp_vals = {}
""" Grid Search"""
run = 0
Example #5
num_dims = 4000  #Num dims in BNC dm file
target = sys.argv[1]  #A kind (e.g. toad_N)
chars_file = sys.argv[2]
num_chars = int(sys.argv[3])
context_weight = int(sys.argv[4])
'''Get character name'''
character = ""
m = re.search(".*/(.*).chars", chars_file)
if m:
    character = m.group(1)
else:
    character = chars_file[:-6]
character = character.lower() + "_char_N"
'''Load files'''
background_space = utils.readDM("BNC.w10.4000c.5000r.ppmi.rownorm.dm")
background_cols = utils.readDims("BNC.w10.4000c.5000r.ppmi.rownorm.cols")
chars = utils.readChars(chars_file)
'''Compute contextualisation'''
c = 1
reweighted_vectors = []
for context in sorted(chars, key=chars.get, reverse=True):
    ppmi = chars[context]

    i = 0
    context_vector = np.zeros(num_dims)
    #print("Reweighting vector with context",context)
    for col in background_cols:
        if context in background_space and col in background_space:  #in case core space does not include context (e.g. bnc.2000 does not include 'rat')
            context_vector[i] = pow(
                utils.cosine_similarity(background_space[context],
Example #6
import numpy as np
import utils
import sys

def mk_training_matrices(pairs, en_dimension, cat_dimension, semanticspace, catalan_space):
    en_mat = np.zeros((len(pairs),en_dimension)) 
    cat_mat = np.zeros((len(pairs),cat_dimension))
    c = 0
    for p in pairs:
        en_word,cat_word = p.split()
        en_mat[c] = semanticspace[en_word]   
        cat_mat[c] = catalan_space[cat_word]   
        c+=1
    return en_mat,cat_mat



if len(sys.argv) == 4:
    space=sys.argv[1]
    if space=='reducedcolors':
        semanticspace=utils.readDM("data/reducedcolors.dm")s
    if space =='fullcolors':
        semanticspace=utils.readDM("data/full.dm")
    word = sys.argv[2]
    num_neighbours = int(sys.argv[3])
    print(utils.neighbours(semanticspace, semanticspace[word],num_neighbours))
    english_neighbours = utils.neighbours(semanticspace,semanticspace[word],num_neighbours)
    utils.run_PCAneighbours(semanticspace,[word]+english_neighbours,"english_neighbours"+word+".png")
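
A toy check of mk_training_matrices (not part of the original script): made-up 3-dimensional English and 2-dimensional Catalan vectors stand in for spaces that would normally come from utils.readDM.

toy_en = {"dog": np.array([0.1, 0.2, 0.3]), "cat": np.array([0.4, 0.5, 0.6])}
toy_ca = {"gos": np.array([1.0, 0.0]), "gat": np.array([0.0, 1.0])}
toy_pairs = ["dog gos", "cat gat"]

en_mat, cat_mat = mk_training_matrices(toy_pairs, 3, 2, toy_en, toy_ca)
print(en_mat.shape, cat_mat.shape)    # (2, 3) (2, 2)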
Example #7
    print("\nUSAGE: python3 projection.py bnc|wiki [num-kc] [size-proj] [percent-hash]\n\
    - num-kc: the number of Kenyon cells\n\
    - size-proj: how many projection neurons are used for each projection\n\
    - percent-hash: how much of the Kenyon layer to keep in the final hash.\n")
    sys.exit() 

if sys.argv[1] == "bnc":
    data = "data/BNC-MEN.dm"
    column_labels = "data/BNC-MEN.cols"
    MEN_annot = "data/MEN_dataset_lemma_form_full"
else:
    data = "data/wiki_all.dm"
    column_labels = "data/wiki_all.cols"
    MEN_annot = "data/MEN_dataset_natural_form_full"

english_space = utils.readDM(data)
i_to_cols, cols_to_i = utils.readCols(column_labels)

PN_size = len(next(iter(english_space.values())))  # length of one word vector
KC_size = int(sys.argv[2])
proj_size = int(sys.argv[3])
percent_hash = int(sys.argv[4])
print("SIZES PN LAYER:",PN_size,"KC LAYER:",KC_size)
print("SIZE OF PROJECTIONS:",proj_size)
print("SIZE OF FINAL HASH:",percent_hash,"%")

projection_layer = np.zeros(PN_size)
kenyon_layer = np.zeros(KC_size)
projection_functions = []
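
The snippet stops right after the empty layers are created. One plausible continuation (an assumption, not the original file) is to connect each Kenyon cell to proj_size randomly sampled projection neurons:

import random

for _ in range(KC_size):
    # each Kenyon cell reads from proj_size distinct, randomly chosen PNs
    projection_functions.append(random.sample(range(PN_size), proj_size))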

Example #8
from flask import render_template, request, jsonify
from sklearn.decomposition import PCA
import logging

from utils import sim_to_matrix, sim_to_matrix_url, readDM, make_figure, readUrls
from htmlparser import extract_from_url
import mk_page_vector
from openviz import app

pca = PCA(n_components=2)
target_word = "meaning"
dm_dict_en = readDM("./openviz/spaces/english.dm")
dm_dict_ca = readDM("./openviz/spaces/catalan.dm")
url_dict_en = readUrls("./openviz/spaces/url_english.csv")
url_dict_ca = readUrls("./openviz/spaces/url_english.csv")

language_codes = {}
language_codes["English"] = [dm_dict_en, url_dict_en, "en"]
language_codes["Catalan"] = [dm_dict_ca, url_dict_ca, "ca"]


def compute(target_word, language):
    error = ""
    if language != "":
        dm_dict = language_codes[language][0]
        dictionary = language_codes[language][2]
    else:
        dm_dict = language_codes["English"][0]
        dictionary = language_codes["English"][2]
    logging.exception(language)
    if target_word not in dm_dict:
Example #9
    import sys
    import MEN
    import utils

    # parameter input
    while True:
        spacefiles = utils.loop_input(
            rtype=str,
            default=None,
            msg="Space to be used (without file extension): ")
        try:
            data = spacefiles + ".dm"
            column_labels = spacefiles + ".cols"
            # returns {word:word_vector}
            unhashed_space = utils.readDM(data)
            # returns both-ways dicts of the vocabulary (word:index_in_vector)
            i_to_cols, cols_to_i = utils.readCols(column_labels)
        except FileNotFoundError as e:
            print("Unable to find files for input space and/or vocabulary.\n\
                   - correct file path?\n\
                   - are the file extensions '.dm' and '.cols'?\n\
                   - don't specify the file extension.")
            continue
        else:
            break
    MEN_annot = utils.loop_input(rtype=str,
                                 default=None,
                                 msg="Testset to be used: ")
    evaluate_mode = True if input(
        "Only evaluate the space (without flying)? [y/n] ").upper(
Example #10
        en_word, cat_word = p.split()
        en_mat[c] = english_space[en_word]
        cat_mat[c] = catalan_space[cat_word]
        c += 1
    return en_mat, cat_mat


def linalg(mat_english, mat_catalan):
    w = np.linalg.lstsq(mat_english,
                        mat_catalan)[0]  # obtaining the parameters
    print(mat_english.shape, mat_catalan.shape, w.shape)
    return w


'''Read semantic spaces'''
english_space = utils.readDM("data/english.subset.dm")
catalan_space = utils.readDM("data/catalan.subset.dm")
utils.run_PCA(english_space, english_space.keys(), "english_space.png")
utils.run_PCA(catalan_space, catalan_space.keys(), "catalan_space.png")
'''Read all word pairs'''
all_pairs = []
f = open("data/pairs.txt")
for l in f:
    l = l.rstrip('\n')
    all_pairs.append(l)
f.close()
'''Make training/test fold'''
training_pairs = all_pairs[:120]
test_pairs = all_pairs[120:]
'''Make training/test matrices'''
en_mat, cat_mat = mk_training_matrices(training_pairs, 400, 300, english_space,
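
The snippet above is cut off mid-call. Independently of that, a toy sanity check of linalg with invented matrices (all data below is made up) could look like this:

rng = np.random.default_rng(0)
true_w = rng.random((3, 2))                  # known mapping to recover
X = rng.random((10, 3))
Y = X @ true_w
print(np.allclose(linalg(X, Y), true_w))     # True, up to numerical precision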
Example #11
import numpy as np
from utils import sim_to_matrix, readDM, run_PCA
import sys

def visualise(words, space):
    run_PCA(space,words,words[0]+"_space.png")


'''Read semantic space'''
space = readDM(sys.argv[1])
neighbours = sim_to_matrix(space, space[sys.argv[2]], int(sys.argv[3]))
#print(neighbours)
visualise(neighbours,space)
Example #12
def main():
    if len(sys.argv) == 3:
       # listtop500 = open('data/listtop500.txt','r')
        #listlast500=open('data/listlast500.txt','r')


        #['reykjanes','danielli','underdrawing','halichoerus','hepler','change-']


        #['widgeon','colourpoint','water-lilies','kingbirds','gallinules','pebbledash']

        #['flowers.','nard','hearing-aid','filsham','trumpet-shaped','crecca' ]

       #['kerchief', 'kingbirds','cerise','biretta','pale-blue','v-necked','pebbledash']
         
        #['crecca','flowers.','corollas','shovelers','supercilium','crocuses']
        listtop500=['village','ponk','catspaw','lycaenid','orangey-pink','saponaria']
        if sys.argv[2]=='full':
            semanticspace1=utils.readDM("data/colorswithoutremovedtargets.dm")
        if sys.argv[2]=='nonzero':
            semanticspace1=utils.readDM("data/reducedcolors.dm")
        dicttop=defaultdict(list)
        dictlast=defaultdict(list)
        for line in listtop500:
            #word = line.strip()
            word=line
            num_neighbours = int(sys.argv[1])
            neighbours1=[]
            for i in utils.neighbours(semanticspace1,semanticspace1[word],num_neighbours):
                neighbours1.append(i.strip("."))
            neighbours2=[]
            cosinefull=[]
            for i in functionneighbours(word,num_neighbours):
                neighbours2.append(i[0])
                cosinefull.append(i[1])
            densityfull=(sum(cosinefull))/(len(cosinefull))


            ## compare neighbours of 2 different spaces
            intersection = set(neighbours1) & set(neighbours2)

       
            print(word, intersection,len(intersection))
            
            #dicttop[word]=len(intersection)
           

        # for line in listlast500:
        #     word = line.strip()
        #     num_neighbours = int(sys.argv[1])
        #     neighbours1=[]
        #     for i in utils.neighbours(semanticspace1,semanticspace1[word],num_neighbours):
        #         neighbours1.append(i.strip("."))
        #     neighbours2=[]
        #     cosinefull=[]
        #     for i in functionneighbours(word,num_neighbours):
        #         neighbours2.append(i[0])
        #         cosinefull.append(i[1])
        #     densityfull=(sum(cosinefull))/(len(cosinefull))


        #     ##compare neigbours of 2 different spaces 
        #     intersection = set(neighbours1) & set(neighbours2)
        #     print(intersection,len(intersection))
            
        #     dictlast[word]=len(intersection)
           


        #density color space 
            listdensity=[]
            listcoherence=[]
            neighbours=utils.neighbours(semanticspace1,semanticspace1[word],num_neighbours)
            for i in neighbours:
                cosine=utils.cosine_similarity(semanticspace1[word],semanticspace1[i])
               
                # if np.isnan(cosine):
                #     pass
                # else: 
                listdensity.append(cosine)
            density=sum(listdensity)/(len(listdensity))
            print('density color space: ',density)
            print('density full space: ', densityfull)
            dicttop[word]=[len(intersection),density]
Example #13
import sys
sys.path.append('..')
import utils
from scipy.stats import spearmanr

dm_dict = utils.readDM(sys.argv[1])
eval_dataset = sys.argv[2]
system = []
gold = []
if eval_dataset == 'men':
    with open("MEN_dataset_lemma_form_full", 'r') as f:
        lines = f.read().splitlines()
    sep = ' '
elif eval_dataset == 'simlex':
    with open("SimLex-999.txt", 'r') as f:
        lines = f.read().splitlines()[1:]
    sep = '\t'

for l in lines:
    fields = l.rstrip('\n').split(sep)
    w1 = fields[0][:-2]
    w2 = fields[1][:-2]
    score = float(fields[2])
    if w1 in dm_dict and w2 in dm_dict:
        try:
            cos = utils.cosine_similarity(dm_dict[w1], dm_dict[w2])
            system.append(cos)
            gold.append(score)
            print(w1, w2, cos, score)
        except:
            continue
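
The snippet ends before the imported spearmanr is actually applied; a likely continuation (an assumption, not shown in the original) is:

rho, p_value = spearmanr(system, gold)
print("Spearman correlation:", rho, "over", len(gold), "pairs")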
Example #14
# This tests the definitional nonces on the sum baseline.
#python3 test_def_nonces.py spaces/ukwac_reduced.txt /definitions/nonce.definitions.300.test

import sys
import re
import utils

background = sys.argv[1]
dataset = sys.argv[2]
mrr = 0.0

human_responses = []
system_responses = []

dm_dict = utils.readDM(background)

c = 0
f=open(dataset)
for l in f:
  if c < 1:
    c+=1
    continue
  else:
    fields=l.rstrip('\n').split('\t')
    nonce = fields[0]
    sentence = fields[1].replace("___","").split()
    print("--")
    print(nonce)
    print("SENTENCE:",sentence)

  if nonce in dm_dict:
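    # Hedged continuation (an assumption, not the original file): the "sum"
    # baseline adds up the background vectors of the definition words, ranks
    # the vocabulary by cosine similarity to that sum, and adds the nonce's
    # reciprocal rank to mrr.
    vecs = [dm_dict[w] for w in sentence if w in dm_dict]
    context_vec = [sum(dims) for dims in zip(*vecs)]
    sims = {w: utils.cosine_similarity(context_vec, v) for w, v in dm_dict.items()}
    ranking = sorted(sims, key=sims.get, reverse=True)
    rank = ranking.index(nonce) + 1
    mrr += 1.0 / rank
    print("RANK:", rank)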