# Set-up section of the evaluation script.  taskFilename and vectorsFilename
# are used below but are bound before this point.

# Positional command-line arguments 3 to 7.
pathToSVMFile = sys.argv[3]
clusterFile = sys.argv[4]
relFile = sys.argv[5]
pathToExpansionCache = sys.argv[6]
normalVectorsFile = sys.argv[7]

# Fixed parameters; they are baked into the file-name suffixes built below.
expansion = 5
window = 5
svmFileInfo = "_SVM_%s_expansionParam%s_window%s" % (
    clusterFile.rsplit('/', 1)[-1], expansion, window)
expansionCacheInfo = "_expansionParam_%s" % expansion

print("Loading rel, task, vector, words that have been disambiguated")
rel = shelve.open(relFile)
# load_task returns a pair; only the first element is given a meaningful name.
task, tralala = load_task(taskFilename)
vectors = load_vectors(vectorsFilename)
normalVectors = load_vectors(normalVectorsFile)
# The part of every directory entry name before its first underscore.
disambiguatedWords = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]

print("Reading agglomerative cluster centers")
# One getAverageWordRep result per set returned by read_sets(clusterFile).
clusterCenters = [getAverageWordRep(x, vectors)
                  for x in read_sets(clusterFile)]

print("Starting...")

# Ratings start out empty.
methodsRating = []
humanRating = []
questions = task.values()

jointVocCache = {}
# Positional command-line arguments: all input and output locations.
textfile = sys.argv[1]
relFile = sys.argv[2]
clusterFile = sys.argv[3]
vecFile = sys.argv[4]
pathToSVMFile = sys.argv[5]
pathToExpansionCache = sys.argv[6]
pathToOutput = sys.argv[7]

# Open the rel shelve.
rel = shelve.open(relFile)

# Load the word vectors.
print("Loading vectors")
vecs = load_vectors(vecFile)

# Read the clusters; each center is the getAverageWordRep of one cluster set.
print("Reading agglomerative cluster centers")
clusterCenters = [getAverageWordRep(x, vecs)
                  for x in read_sets(clusterFile)]
# Open question from the original author: it might happen that some cluster
# centers are () -- how is this possible?

# Remaining fixed parameters and the file-name suffixes derived from them.
expansion = 5
window = 5
svmFileInfo = "_SVM_%s_expansionParam%s_window%s" % (
    clusterFile.rsplit('/', 1)[-1], expansion, window)
expansionCacheInfo = "_expansionParam_%s" % expansion
# Set-up section of the evaluation script.  taskFilename is used below but is
# bound before this point.

# Positional command-line arguments 2 to 7.
vectorsFilename = sys.argv[2]
pathToSVMFile = sys.argv[3]
clusterFile = sys.argv[4]
relFile = sys.argv[5]
pathToExpansionCache = sys.argv[6]
normalVectorsFile = sys.argv[7]

# Fixed parameters; they are baked into the file-name suffixes built below.
expansion = 5
window = 5
svmFileInfo = "_SVM_%s_expansionParam%s_window%s" % (
    clusterFile.rsplit('/', 1)[-1], expansion, window)
expansionCacheInfo = "_expansionParam_%s" % expansion

print("Loading rel, task, vector, words that have been disambiguated")
rel = shelve.open(relFile)
# load_task returns a pair; only the first element is given a meaningful name.
task, tralala = load_task(taskFilename)
vectors = load_vectors(vectorsFilename)
normalVectors = load_vectors(normalVectorsFile)
# The part of every directory entry name before its first underscore.
disambiguatedWords = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]

print("Reading agglomerative cluster centers")
# One getAverageWordRep result per set returned by read_sets(clusterFile).
clusterCenters = [getAverageWordRep(x, vectors)
                  for x in read_sets(clusterFile)]

print("Starting...")

# Ratings start out empty.
methodsRating = []
humanRating = []
questions = task.values()

jointVocCache = {}
# Every key of the loaded vectors, as a set.
partVoc = set(vectors.keys())
# --- Tail of a function whose header lies above this chunk. ---
# It scans expandedContext and returns the candidate whose vector has the
# highest cosine similarity to wordVector (None when no candidate has a vector).
    bestSim = None
    bestWord = None
    for candidate in expandedContext:
        # Candidates without a vector are ignored.
        if candidate in vectors:
            sim = cosine_similarity(vectors[candidate], wordVector)
            # On the first hit bestSim is still None; this comparison relies on
            # Python 2 ordering, where None compares smaller than any number.
            if sim > bestSim:
                bestSim = sim
                bestWord = candidate
    return bestWord


# Two earlier label sets, kept commented out by the original author.
#labels = ['concept', 'discord', 'physiol', 'newton', 'steve', 'keyboard', 'disk', 'compani', 'itun', 'cut', 'grape', 'desktop', 'late', 'window', 'busi', 'board', 'introduc', 'digit', 'firewir', 'ceo', 'big', 'powerbook', 'mous', 'game', 'cyanid', 'bit', 'name', 'reveal', 'motorola', 'intel', 'gui', 'cider', 'clown', 'popular', 'x', 'pausania', 'candi', 'respond', 'nut', 'timelin', 'continu', 'see', 'video', 'logo', 'gross', 'alcohol', 'profit', 'new', 'sold', 'red', 'machin', 'ipod', 'job', 'corp', 'found', 'releas', 'news', 'advertis', 'lawsuit', 'card', 'g', 'sacr', 'technolog', 's', 'plus', 'retail', 'block', 'macintosh', 'softwar', 'sweet', 'modul', 'system', 'linux', 'macworld', 'market', 'use', 'type', 'pictur', 'wozniak', 'licens', 'hesperid', 'connector', 'interfac', 'basic', 'basin', 'was', 'threw', 'parc', 'brand', 'gravit', 'beatl', 'appel', 'iic', 'line', 'iii', 'ibm', 'syrup', 'properti', 'tree', 'matter', 'nine', 'danc', 'display', 'employe', 'hypercard', 'comput', 'share', 'pie', 'ii', 'replica', 'tabl', 'infring', 'phrase', 'quicktim', 'sell', 'develop', 'datura', 'rather', 'media', 'deni', 'inscrib', 'descend', 'eventu', 'smell', 'bundl', 'juic', 'pineappl', 'product', 'clone', 'pear', 'evil', 'mac', 'fruit', 'stereo', 'potato', 'home', 'su', 'patent', 'switch', 'adam', 'dos', 'audio', 'microsoft', 'wine']
#labels = ['inning', 'mexican', 'sequest', 'just', 'fox', 'guano', 'rodent', 'handcuff', 'sleep', 'endem', 'ear', 'ty', 'harmless', 'aaron', 'score', 'rabi', 'vulner', 'plate', 'anim', 'hit', 'megabat', 'ya', 'safe', 'roost', 'dark', 'game', 'leagu', 'batter', 'shadow', 'cage', 'earn', 'costum', 'masterson', 'entangl', 'sacrific', 'lament', 'superfamili', 'sanctuari', 'nose', 'night', 'mammal', 'batsmen', 'respond', 'vampir', 'noctilionida', 'speci', 'matur', 'percentag', 'burger', 'championship', 'greenhal', 'home', 'fli', 'ha', 'abl', 'cinemat', 'estim', 'fossil', 'get', 'thriae', 'hopikin', 'goth', 'team', 'pontoon', 'mississippi', 'usual', 'pup', 'molossida', 'trickster', 'comoro', 'career', 'joke', 'askariyya', 'livingston', 'boe', 'vespertilionida', 'logo', 'myoti', 'walk', 'sox', 'pp', 'caught', 'frolic', 'wicket', 'swarm', 'pollin', 'strike', 'prey', 'funnel', 'tenerif', 'season', 'spider', 'casey', 'conserv', 'nobodi', 'microchiroptera', 'eat', 'greenberg', 'sound', 'exposur', 'ben', 'hitter', 'hous', 'tree', 'rope', 'record', 'retir', 'shark', 'teammat', 'scanner', 'toss', 'cricket', 'player', 'basebal', 'alphabet', 'cave', 'bowl', 'batman', 'hibern', 'ruth', 'wildlif', 'ball', 'ye', 'autocod', 'insect', 'fruit', 'blood', 'averag', 'ghost', 'noun', 'batwoman', 'lineup', 'wing', 'subgroup']
# The label set actually in use.
labels = ['foul', 'rodent', 'abil', 'sky', 'solitari', 'aaron', 'hockey', 'rabi', 'annoy', 'anim', 'hit', 'genera', 'bear', 'batter', 'yellow', 'cage', 'gray', 'hunt', 'glove', 'sang', 'nose', 'team', 'claw', 'vision', 'batsmen', 'sneaker', 'vampir', 'speci', 'fish', 'matur', 'home', 'girl', 'bee', 'blue', 'fli', 'flower', 'appear', 'pet', 'hoof', 'fox', 'lara', 'score', 'finger', 'bird', 'disney', 'statist', 'night', 'mammal', 'errat', 'pup', 'slug', 'box', 'wolv', 'wolf', 'smoki', 'genus', 'nicknam', 'walk', 'duck', 'limb', 'scent', 'breed', 'tail', 'gehrig', 'prey', 'cub', 'babe', 'season', 'tiger', 'rabbit', 'catch', 'ear', 'eat', 'bud', 'shoot', 'cacti', 'ben', 'hitter', 'tree', 'cat', 'rope', 'rbi', 'wild', 'boomerang', 'cetacean', 'robin', 'heart', 'shark', 'cane', 'sad', 'nippl', 'pit', 'comfort', 'microbat', 'chest', 'batman', 'hibern', 'wildlif', 'ball', 'terri', 'bumblebe', 'nest', 'insect', 'whale', 'averag', 'kid', 'rbis', 'gather', 'dog', 'tooth', 'walker', 'lineup', 'snake']

vectors = load_vectors('../data/wordvectors/enwiki8.relevant.vectors')

# Read a single entry from the rel shelve, then close it again right away.
# NOTE(review): the entry read is 'appl' and wordRel is not used in the visible
# code -- confirm the key matches the label set selected above.
rel = shelve.open('../../corponut/enwiki8_rel')
wordRel = rel['appl']
rel.close()

# Collect the labels that have no vector ...
toBeShifted = []
for label in labels:
    if not label in vectors:
        toBeShifted.append(label)

# ... and drop them.  Going through sets also removes duplicates and does not
# keep the original list order.
labels = list(set(labels) - set(toBeShifted))

# Loop runs while more than five labels remain; each pass starts by resetting
# the best-pair bookkeeping.  The rest of the loop body lies below this chunk.
while len(labels) > 5:
    best1 = None
    best2 = None
    bestSim = None
# Positional command-line arguments: all input and output locations.
textfile = sys.argv[1]
relFile = sys.argv[2]
clusterFile = sys.argv[3]
vecFile = sys.argv[4]
pathToSVMFile = sys.argv[5]
pathToExpansionCache = sys.argv[6]
pathToOutput = sys.argv[7]

# Open the rel shelve.
rel = shelve.open(relFile)

# Load the word vectors.
print("Loading vectors")
vecs = load_vectors(vecFile)

# Read the clusters; each center is the getAverageWordRep of one cluster set.
print("Reading agglomerative cluster centers")
clusterCenters = [getAverageWordRep(x, vecs)
                  for x in read_sets(clusterFile)]
# Open question from the original author: it might happen that some cluster
# centers are () -- how is this possible?

# Remaining fixed parameters and the file-name suffixes derived from them.
expansion = 5
window = 5
svmFileInfo = "_SVM_%s_expansionParam%s_window%s" % (
    clusterFile.rsplit('/', 1)[-1], expansion, window)
expansionCacheInfo = "_expansionParam_%s" % expansion

# The part of every entry name in pathToSVMFile before its first underscore.
wordsOfInterest = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]
#print wordsOfInterest

# NOTE(review): the output path is opened in read mode ('r') -- confirm that
# this is intended; the code using f lies below this chunk.
f = open(pathToOutput, 'r')
# Set-up section of the evaluation script.  taskFilename is used below but is
# bound before this point.

# Positional command-line arguments 2 to 7.
vectorsFilename = sys.argv[2]
pathToSVMFile = sys.argv[3]
clusterFile = sys.argv[4]
relFile = sys.argv[5]
pathToExpansionCache = sys.argv[6]
pathToNormalVectors = sys.argv[7]

# Fixed parameters; they are baked into the file-name suffixes built below.
expansion = 5
window = 5
svmFileInfo = "_SVM_%s_expansionParam%s_window%s" % (
    clusterFile.rsplit('/', 1)[-1], expansion, window)
expansionCacheInfo = "_expansionParam_%s" % expansion

print("Loading rel, task, vector, words that have been disambiguated")
rel = shelve.open(relFile)
# load_task returns a pair; only the first element is given a meaningful name.
task, tralala = load_task(taskFilename)
vectors = load_vectors(vectorsFilename)
normalVectors = load_vectors(pathToNormalVectors)
# The part of every directory entry name before its first underscore.
disambiguatedWords = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]

print("Reading agglomerative cluster centers")
# One getAverageWordRep result per set returned by read_sets(clusterFile).
clusterCenters = [getAverageWordRep(x, vectors)
                  for x in read_sets(clusterFile)]

print("Starting...")

# Ratings start out empty.
methodsRating = []
humanRating = []
questions = task.values()

jointVocCache = {}
# Every key of the loaded vectors, as a set.
partVoc = set(vectors.keys())
import sys

from utils import load_task, Word
from fast_utils import cosine_similarity, load_vectors, spearman

if __name__ == "__main__":
    # Baseline: rate every task question by the cosine similarity between the
    # vectors of its two words (each looked up via Word(...).lemma()).
    print("Baseline with wordvectors")

    # Both the task file and the word-vector file are required.
    if len(sys.argv) < 3:
        print("USAGE: python baselline_word2vec.py <PATH TO TASK> <PATH TO WORDVECTORS>")
        sys.exit()

    taskFilename = sys.argv[1]
    vectorsFilename = sys.argv[2]

    # Only the first element of load_task's result is used.
    task, _ = load_task(taskFilename)
    vectors = load_vectors(vectorsFilename)

    methodsRating = []
    humanRating = []
    questions = task.values()
    coverage = 0

    for i, question in enumerate(questions):
        word1 = Word(question['word1']).lemma()
        word2 = Word(question['word2']).lemma()

        # A question is rated only when both words have a vector.
        if word1 in vectors and word2 in vectors:
            vec1 = vectors[word1]
            vec2 = vectors[word2]
            methodsRating.append(cosine_similarity(vec1, vec2))
print "\tClustering took", stop - start, "seconds" return nodes def read_args(): assert len(sys.argv) == 5 vecs = sys.argv[1] limit = int(sys.argv[2]) clusternumber = int(sys.argv[3]) minimum = int(sys.argv[4]) return (vecs, limit, clusternumber, minimum) if __name__ == '__main__': # pypy agglomerative.py ../data/wordvectors/enwiki8.relevant.vectors 2000 500 10 (vecs, limit, clusternumber, minimum) = read_args() print "Loading vectors" data = load_vectors(vecs, limit).items() print "Clustering" nodes = fag_clustering(data, 0.03, clusternumber, minimum) print "Saving clusters" clusterName = "../data/agg_wordclusters/" clusterName += vecs.split("/")[-1] clusterName += "_" + str(limit) + "x" + str(clusternumber) + "x" + str( minimum) clusterName += ".clusters" save_clusters_to_file(nodes, clusterName) print "Done!"
def vector_similarity(cs, w1, w2, vectors):
    """Return the cosine similarity between the vectors of two words.

    Args:
        cs: fallback value, returned unchanged when a vector is missing.
        w1: first word (a key of vectors).
        w2: second word (a key of vectors).
        vectors: container mapping words to their vectors.

    Returns:
        cosine_similarity(vectors[w1], vectors[w2]) when both words are
        present in vectors, otherwise cs.
    """
    if w1 in vectors and w2 in vectors:
        return cosine_similarity(vectors[w1], vectors[w2])
    return cs


if __name__ == '__main__':
    # Exactly three arguments are required.
    if len(sys.argv) != 4:
        print("USAGE: python agg_answering.py <PATH TO TASK> <PATH TO cluster_descriptors> <PATH TO wordvectors>")
        sys.exit()

    print("Loading stuf...")
    taskFilename = sys.argv[1]
    filename = sys.argv[2]  # "../../../cluster_descriptors/enwiki8.clust-desc.shelve"
    vectorsFilename = sys.argv[3]

    vectors = load_vectors(vectorsFilename)

    # Copy the shelve into memory: newD maps the integer index to the entry,
    # key_sets[i] holds the key set of entry i.  The shelve keys are the
    # string forms of 0 .. vec_size - 1.
    d = shelve.open(filename)
    key_sets = []
    newD = dict()
    vec_size = len(d.keys())
    for i in xrange(vec_size):
        # Fetch each entry once; every shelve lookup reads and unpickles the
        # stored value again.
        descriptor = d[str(i)]
        key_sets.append(set(descriptor.keys()))
        newD[i] = descriptor

    # Only the first element of load_task's result is used.
    task, _ = load_task(taskFilename)
    questions = task.values()

    methodsRating = []
    humanRating = []