Example #1
0
    pathToSVMFile = sys.argv[3]
    clusterFile = sys.argv[4]
    relFile = sys.argv[5]
    pathToExpansionCache = sys.argv[6]
    normalVectorsFile = sys.argv[7]

    expansion = 5
    window = 5
    svmFileInfo = '_SVM_' + clusterFile.split(
        '/')[-1] + "_expansionParam" + str(expansion) + "_window" + str(window)
    expansionCacheInfo = "_expansionParam_" + str(expansion)

    print "Loading rel, task, vector, words that have been disambiguated"
    rel = shelve.open(relFile)
    task, tralala = load_task(taskFilename)
    vectors = load_vectors(vectorsFilename)
    normalVectors = load_vectors(normalVectorsFile)
    disambiguatedWords = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]

    print "Reading agglomerative cluster centers"
    clusterCenters = [
        getAverageWordRep(x, vectors) for x in read_sets(clusterFile)
    ]

    print "Starting..."
    # initiate empty ratings
    methodsRating = []
    humanRating = []
    questions = task.values()

    jointVocCache = dict()
Example #2
0
    # read all files
    textfile = sys.argv[1]
    relFile = sys.argv[2]
    clusterFile = sys.argv[3]
    vecFile = sys.argv[4]
    pathToSVMFile = sys.argv[5]
    pathToExpansionCache = sys.argv[6]
    pathToOutput = sys.argv[7]

    # open the rel
    rel = shelve.open(relFile)

    # open the vectors
    print "Loading vectors"
    vecs = load_vectors(vecFile)

    # read clusters and get their cluster centers by taking the average...
    print "Reading agglomerative cluster centers"
    clusterCenters = [
        getAverageWordRep(x, vecs) for x in read_sets(clusterFile)
    ]
    # IT MIGHT HAPPEN THAT SOME CLUSTER CENTERS ARE ()? HOW IS THIS POSSIBLE?

    # set some remaining parameters
    expansion = 5
    window = 5
    svmFileInfo = '_SVM_' + clusterFile.split(
        '/')[-1] + "_expansionParam" + str(expansion) + "_window" + str(window)
    expansionCacheInfo = "_expansionParam_" + str(expansion)
Example #3
0
	vectorsFilename = sys.argv[2]
	pathToSVMFile = sys.argv[3]
	clusterFile = sys.argv[4]
	relFile = sys.argv[5]
	pathToExpansionCache = sys.argv[6]
	normalVectorsFile = sys.argv[7]

	expansion = 5
	window = 5
	svmFileInfo = '_SVM_' + clusterFile.split('/')[-1] + "_expansionParam" + str(expansion) + "_window" + str(window)
	expansionCacheInfo = "_expansionParam_"  + str(expansion)
	
	print "Loading rel, task, vector, words that have been disambiguated"
	rel = shelve.open(relFile)
	task, tralala = load_task(taskFilename)
	vectors = load_vectors(vectorsFilename)
	normalVectors = load_vectors(normalVectorsFile)
	disambiguatedWords = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]

	print "Reading agglomerative cluster centers"
	clusterCenters = [getAverageWordRep(x, vectors) for x in read_sets(clusterFile)]

	print "Starting..."
	# initiate empty ratings
	methodsRating = []
	humanRating = []
	questions = task.values()

	jointVocCache = dict()
	partVoc = set(vectors.keys())
Example #4
0
	bestSim = None
	bestWord = None

	for candidate in expandedContext:
		if candidate in vectors:
			sim = cosine_similarity(vectors[candidate], wordVector)
			if sim > bestSim:
				bestSim = sim
				bestWord = candidate
	return bestWord

#labels = ['concept', 'discord', 'physiol', 'newton', 'steve', 'keyboard', 'disk', 'compani', 'itun', 'cut', 'grape', 'desktop', 'late', 'window', 'busi', 'board', 'introduc', 'digit', 'firewir', 'ceo', 'big', 'powerbook', 'mous', 'game', 'cyanid', 'bit', 'name', 'reveal', 'motorola', 'intel', 'gui', 'cider', 'clown', 'popular', 'x', 'pausania', 'candi', 'respond', 'nut', 'timelin', 'continu', 'see', 'video', 'logo', 'gross', 'alcohol', 'profit', 'new', 'sold', 'red', 'machin', 'ipod', 'job', 'corp', 'found', 'releas', 'news', 'advertis', 'lawsuit', 'card', 'g', 'sacr', 'technolog', 's', 'plus', 'retail', 'block', 'macintosh', 'softwar', 'sweet', 'modul', 'system', 'linux', 'macworld', 'market', 'use', 'type', 'pictur', 'wozniak', 'licens', 'hesperid', 'connector', 'interfac', 'basic', 'basin', 'was', 'threw', 'parc', 'brand', 'gravit', 'beatl', 'appel', 'iic', 'line', 'iii', 'ibm', 'syrup', 'properti', 'tree', 'matter', 'nine', 'danc', 'display', 'employe', 'hypercard', 'comput', 'share', 'pie', 'ii', 'replica', 'tabl', 'infring', 'phrase', 'quicktim', 'sell', 'develop', 'datura', 'rather', 'media', 'deni', 'inscrib', 'descend', 'eventu', 'smell', 'bundl', 'juic', 'pineappl', 'product', 'clone', 'pear', 'evil', 'mac', 'fruit', 'stereo', 'potato', 'home', 'su', 'patent', 'switch', 'adam', 'dos', 'audio', 'microsoft', 'wine']
#labels = ['inning', 'mexican', 'sequest', 'just', 'fox', 'guano', 'rodent', 'handcuff', 'sleep', 'endem', 'ear', 'ty', 'harmless', 'aaron', 'score', 'rabi', 'vulner', 'plate', 'anim', 'hit', 'megabat', 'ya', 'safe', 'roost', 'dark', 'game', 'leagu', 'batter', 'shadow', 'cage', 'earn', 'costum', 'masterson', 'entangl', 'sacrific', 'lament', 'superfamili', 'sanctuari', 'nose', 'night', 'mammal', 'batsmen', 'respond', 'vampir', 'noctilionida', 'speci', 'matur', 'percentag', 'burger', 'championship', 'greenhal', 'home', 'fli', 'ha', 'abl', 'cinemat', 'estim', 'fossil', 'get', 'thriae', 'hopikin', 'goth', 'team', 'pontoon', 'mississippi', 'usual', 'pup', 'molossida', 'trickster', 'comoro', 'career', 'joke', 'askariyya', 'livingston', 'boe', 'vespertilionida', 'logo', 'myoti', 'walk', 'sox', 'pp', 'caught', 'frolic', 'wicket', 'swarm', 'pollin', 'strike', 'prey', 'funnel', 'tenerif', 'season', 'spider', 'casey', 'conserv', 'nobodi', 'microchiroptera', 'eat', 'greenberg', 'sound', 'exposur', 'ben', 'hitter', 'hous', 'tree', 'rope', 'record', 'retir', 'shark', 'teammat', 'scanner', 'toss', 'cricket', 'player', 'basebal', 'alphabet', 'cave', 'bowl', 'batman', 'hibern', 'ruth', 'wildlif', 'ball', 'ye', 'autocod', 'insect', 'fruit', 'blood', 'averag', 'ghost', 'noun', 'batwoman', 'lineup', 'wing', 'subgroup']
labels = ['foul', 'rodent', 'abil', 'sky', 'solitari', 'aaron', 'hockey', 'rabi', 'annoy', 'anim', 'hit', 'genera', 'bear', 'batter', 'yellow', 'cage', 'gray', 'hunt', 'glove', 'sang', 'nose', 'team', 'claw', 'vision', 'batsmen', 'sneaker', 'vampir', 'speci', 'fish', 'matur', 'home', 'girl', 'bee', 'blue', 'fli', 'flower', 'appear', 'pet', 'hoof', 'fox', 'lara', 'score', 'finger', 'bird', 'disney', 'statist', 'night', 'mammal', 'errat', 'pup', 'slug', 'box', 'wolv', 'wolf', 'smoki', 'genus', 'nicknam', 'walk', 'duck', 'limb', 'scent', 'breed', 'tail', 'gehrig', 'prey', 'cub', 'babe', 'season', 'tiger', 'rabbit', 'catch', 'ear', 'eat', 'bud', 'shoot', 'cacti', 'ben', 'hitter', 'tree', 'cat', 'rope', 'rbi', 'wild', 'boomerang', 'cetacean', 'robin', 'heart', 'shark', 'cane', 'sad', 'nippl', 'pit', 'comfort', 'microbat', 'chest', 'batman', 'hibern', 'wildlif', 'ball', 'terri', 'bumblebe', 'nest', 'insect', 'whale', 'averag', 'kid', 'rbis', 'gather', 'dog', 'tooth', 'walker', 'lineup', 'snake']

# Load the word vectors; they decide below which labels can be kept.
vectors = load_vectors('../data/wordvectors/enwiki8.relevant.vectors')

# Only one entry is needed from the shelf. The try/finally guarantees the
# shelf is closed even when the 'appl' lookup raises (e.g. a missing key).
rel = shelve.open('../../corponut/enwiki8_rel')
try:
	wordRel = rel['appl']
finally:
	rel.close()

# Collect the labels that have no entry in the loaded vectors ...
toBeShifted = [label for label in labels if label not in vectors]

# ... and remove them. Going through sets also drops duplicate labels and
# does not preserve the original list order.
labels = list(set(labels) - set(toBeShifted))

while len(labels) > 5:
	best1 = None
	best2 = None
	bestSim = None
Example #5
0
	# read all files
	textfile = sys.argv[1]
	relFile = sys.argv[2]
	clusterFile = sys.argv[3]
	vecFile = sys.argv[4]
	pathToSVMFile = sys.argv[5]
	pathToExpansionCache = sys.argv[6]
	pathToOutput = sys.argv[7]
	
	# open the rel
	rel = shelve.open(relFile)
	
	# open the vectors
	print "Loading vectors"
	vecs = load_vectors(vecFile)
	
	# read clusters and get their cluster centers by taking the average...
	print "Reading agglomerative cluster centers"
	clusterCenters = [getAverageWordRep(x, vecs) for x in read_sets(clusterFile)]
	# IT MIGHT HAPPEN THAT SOME CLUSTER CENTERS ARE ()? HOW IS THIS POSSIBLE?
	
	# set some remaining parameters
	expansion = 5
	window = 5
	svmFileInfo = '_SVM_' + clusterFile.split('/')[-1] + "_expansionParam" + str(expansion) + "_window" + str(window)
	expansionCacheInfo = "_expansionParam_"  + str(expansion)

	wordsOfInterest = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]
	#print wordsOfInterest
	f = open(pathToOutput, 'r')
Example #6
0
	vectorsFilename = sys.argv[2]
	pathToSVMFile = sys.argv[3]
	clusterFile = sys.argv[4]
	relFile = sys.argv[5]
	pathToExpansionCache = sys.argv[6]
	pathToNormalVectors = sys.argv[7]

	expansion = 5
	window = 5
	svmFileInfo = '_SVM_' + clusterFile.split('/')[-1] + "_expansionParam" + str(expansion) + "_window" + str(window)
	expansionCacheInfo = "_expansionParam_"  + str(expansion)
	
	print "Loading rel, task, vector, words that have been disambiguated"
	rel = shelve.open(relFile)
	task, tralala = load_task(taskFilename)
	vectors = load_vectors(vectorsFilename)
	normalVectors = load_vectors(pathToNormalVectors)
	disambiguatedWords = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]

	print "Reading agglomerative cluster centers"
	clusterCenters = [getAverageWordRep(x, vectors) for x in read_sets(clusterFile)]

	print "Starting..."
	# initiate empty ratings
	methodsRating = []
	humanRating = []
	questions = task.values()

	jointVocCache = dict()
	partVoc = set(vectors.keys())
import sys
from utils import load_task, Word
from fast_utils import cosine_similarity, load_vectors, spearman

if __name__ == "__main__":
	print "Baseline with wordvectors"
	if len(sys.argv) < 3:
		print "USAGE: python baselline_word2vec.py <PATH TO TASK> <PATH TO WORDVECTORS>"
		sys.exit()
	taskFilename = sys.argv[1]
	vectorsFilename = sys.argv[2]

	task, _ = load_task(taskFilename)
	vectors = load_vectors(vectorsFilename)

	methodsRating = []
	humanRating = []

	questions = task.values()
	coverage = 0

	for i in xrange(len(questions)):
		question = questions[i]

		word1 = Word(question['word1']).lemma()
		word2 = Word(question['word2']).lemma()

		if word1 in vectors and word2 in vectors:
			vec1 = vectors[word1]
			vec2 = vectors[word2]
			methodsRating.append(cosine_similarity(vec1, vec2))
Example #8
0
    print "\tClustering took", stop - start, "seconds"
    return nodes


def read_args():
    """Read the four positional command-line arguments from ``sys.argv``.

    Returns:
        A tuple ``(vecs, limit, clusternumber, minimum)``: ``vecs`` is
        ``sys.argv[1]`` unchanged (a string), the other three are
        ``sys.argv[2]`` to ``sys.argv[4]`` converted with ``int``.

    Raises:
        SystemExit: if ``sys.argv`` does not hold exactly the script name
            plus four arguments; the usage text is the exit message.
        ValueError: if one of the three numeric arguments is not a valid
            integer literal.
    """
    # Explicit check instead of ``assert``: assertions are removed when
    # Python runs with -O, which would let a malformed command line through.
    if len(sys.argv) != 5:
        sys.exit("USAGE: python agglomerative.py <PATH TO WORDVECTORS> "
                 "<LIMIT> <CLUSTERNUMBER> <MINIMUM>")
    vecs = sys.argv[1]
    limit, clusternumber, minimum = [int(arg) for arg in sys.argv[2:5]]
    return (vecs, limit, clusternumber, minimum)


if __name__ == '__main__':
    # Entry point: read the arguments, load the vectors, cluster them and
    # save the resulting nodes to a file named after the inputs.
    # Example invocation:
    # pypy agglomerative.py ../data/wordvectors/enwiki8.relevant.vectors 2000 500 10
    vecs, limit, clusternumber, minimum = read_args()

    print("Loading vectors")
    data = load_vectors(vecs, limit).items()

    print("Clustering")
    nodes = fag_clustering(data, 0.03, clusternumber, minimum)

    print("Saving clusters")
    # Output path: <vector file basename>_<limit>x<clusternumber>x<minimum>.clusters
    # inside ../data/agg_wordclusters/
    clusterName = "".join([
        "../data/agg_wordclusters/",
        vecs.split("/")[-1],
        "_", str(limit), "x", str(clusternumber), "x", str(minimum),
        ".clusters",
    ])
    save_clusters_to_file(nodes, clusterName)
    print("Done!")
Example #9
0
def vector_similarity(cs, w1, w2, vectors):
	"""Return the cosine similarity of the vectors stored for w1 and w2.

	If either word has no entry in ``vectors``, the supplied value ``cs`` is
	returned unchanged instead.
	"""
	# Guard clause: without both vectors there is nothing to compare.
	if w1 not in vectors or w2 not in vectors:
		return cs
	return cosine_similarity(vectors[w1], vectors[w2])

if __name__ == '__main__':
	if not len(sys.argv) == 4:
		print "USAGE: python agg_answering.py <PATH TO TASK> <PATH TO cluster_descriptors> <PATH TO wordvectors"
		sys.exit()

	print "Loading stuf..."
	taskFilename = sys.argv[1]
	filename = sys.argv[2] # "../../../cluster_descriptors/enwiki8.clust-desc.shelve"
	vectorsFilename = sys.argv[3]
	vectors = load_vectors(vectorsFilename)

	d = shelve.open(filename)
	key_sets = []
	newD = dict()
	vec_size = len(d.keys())

	for i in xrange(vec_size):
		key_sets.append(set(d[str(i)].keys()))
		newD[i] = d[str(i)]

	task, _ = load_task(taskFilename)
	questions = task.values()

	methodsRating = []
	humanRating = []