Example #1
# Relies on module-level imports from the surrounding project: os, dataset,
# vectorsim, WordVec, FeatureSet, Shingler, printd, and conf.
def processData(args):
	data = dataset.Dataset.load(args.input_data, args.dataset)
	wvout = args.wvfile
	# Prefer the cached, trimmed vector file when it already exists.
	if os.path.exists(wvout):
		wordvecf = wvout
	else:
		wordvecf = args.wvsource

	features = {x for x in args.basefeatures.split(',') if x != ''}
	matchers = {x for x in args.matchers.split(',') if x != ''}
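	# e.g. args.matchers == "Pair,Shingle" yields {"Pair", "Shingle"}; an empty
	# string yields the empty set, so every matcher block below is skipped.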

	printd("Loading Word Vectors")
	wordvec = WordVec(wordvecf)
	printd("Vectorizing")
	data.vectorize(wordvec)
	maxwords = data.maxShortSentence()

	# No cached file existed, so derive vectors restricted to this dataset's
	# sentences and write them to wvout for reuse.
	if wvout != wordvecf:
		printd("Rereading word vectors to optimize...")
		wv_toks = data.wv_sentences()
		wordvec = WordVec(wordvecf, sentences=wv_toks, wvout=wvout, size=wordvec.originalsize)
		data.vectorize(wordvec)

	conf.wvsize = wordvec.size

	# Base word-vector features over the training data
	printd("Computing basic WV Features")
	fs = FeatureSet(data, features)

	if "Pair" in matchers:
		printd("Computing Pair Features")
		matcher = vectorsim.PairFeatures(dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher)

	if "Shingle" in matchers:
		printd("Computing Shingle Features")
		matcher = Shingler(slop=12, lmbda=0.95)
		fs.addMatcher(matcher)

	vocab = None
	if "MinDistSim" in matchers:
		printd("Computing MinDist")
		vocab = fs.data.wv_vocab()
		data.weight()
		comparator = 'cosine'
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'cos')
		printd("Computing MinDist-Euclidean")
		comparator = 'euclidean'
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'euc')

	if "NGram" in matchers:
		printd("Computing MinDist-Ngram")
		# Reuse the vocabulary computed for MinDistSim when available.
		if vocab is None:
			vocab = fs.data.wv_vocab()
		comparator = 'cosine'
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, ngram=2, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'cos-bigram')
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, ngram=3, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'cos-trigram')

	if "WWSim" in matchers:
		printd("Computing WWSim")
		matcher = vectorsim.WWSim(wordvec=wordvec, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher)

	if "InfRankSim" in matchers:
		printd("Computing InfRankSim")
		matcher = vectorsim.InfRankSim(data=data, wordvec=wordvec, dimfeatures=args.dimfeatures)
		printd("InfRankSim Matching")
		fs.addMatcher(matcher)

	if "InfSim" in matchers:
		# Normalize last so the features computed above use the raw word vectors.
		printd("Computing InfSim")
		wordvec.normalize()
		data.vectorize(wordvec)
		matcher = vectorsim.InfSim(data=data, wordvec=wordvec, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher)

	return fs
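
A minimal invocation sketch for processData. The attribute names mirror what the function reads from args; the paths and values are illustrative assumptions, not the project's defaults.

from argparse import Namespace

# Hypothetical wiring; every value below is for illustration only.
args = Namespace(
	input_data="pairs.tsv",          # dataset file to load
	dataset="semeval",               # dataset format name
	wvfile="vectors.trimmed.bin",    # cached vectors, written when absent
	wvsource="vectors.bin",          # full pretrained vectors, the fallback
	basefeatures="",                 # comma-separated base feature names
	matchers="Pair,MinDistSim",      # comma-separated matcher names
	dimfeatures=True,                # emit per-dimension features
)
fs = processData(args)
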
Example #2
def main(args):
    # wordvec and wordvecf are module-level globals with defaults set elsewhere.
    global wordvec, wordvecf
    conf.debug = args.debug or args.verbose
    conf.verbose = args.verbose
    conf.args = args
    sf = args.shingles
    vf = args.wordvec
    wvout = args.wvfile
    sim_thr = args.sim_thr
    dset = args.dataset
    limit = args.limit

    if os.path.exists(wvout) and not args.force:
        wordvecf = wvout

    if vf:
        printd("Reading word vector...")
        wordvec = WordVec(wordvecf)

    if args.sim == "minsim":
        matcher = MinDistSim
    elif args.sim == "infsim":
        matcher = InfSim
    else:
        matcher = VecSim

    if args.sim == "infsim" or args.comparator == "infsim":
        wordvec.normalize()

    data = Dataset.load(args.input_data, dset)
    if vf is not None:
        data.vectorize(wordvec)

    if vf and wvout is not None and wvout != wordvecf:
        printd("Rereading word vectors to optimize...")
        wv_toks = data.wv_sentences()
        wordvec = WordVec(wordvecf,
                          sentences=wv_toks,
                          wvout=wvout,
                          size=wordvec.originalsize)
        if args.sim == "infsim" or args.comparator == "infsim":
            wordvec.normalize()
        data.vectorize(wordvec)
        # Persist the trimmed vocabulary and tokenized sentences alongside wvout.
        with open(wvout + ".vocab", 'w') as wh:
            wh.write("\n".join(wordvec.vocab().keys()))
        with open(wvout + ".toks", 'w') as wh:
            wh.write("\n".join(" ".join(x) for x in wv_toks))

    vocab = None
    if args.frequencies:
        try:
            with open(args.frequencies) as fh:
                vocab = json.load(fh)
            # For Term Frequencies instead of Document Frequencies
            # Could also do len(vocab[word]) if wanted to mimic DF
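            # Illustrative file shapes (values made up):
            #   {"the": 120431, "cat": 512}            -> term frequencies, used as-is
            #   {"the": {"doc1": 3, "doc2": 7}, ...}   -> per-document counts, summed below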
            if isinstance(next(iter(vocab.values())), dict):
                for word in vocab:
                    vocab[word] = sum(vocab[word].values())
        except Exception:
            # Unreadable or malformed file: fall back to the dataset vocabulary.
            vocab = None
    if vocab is None:
        vocab = data.wv_vocab()
    logdf = wordvec.logdf(vocab)
    logdffile = wordvecf + ".logdf"
    np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")

    if args.comparator == "infsim" and args.sim != "infsim":
        comparator = InfSim(logdf).pairwisedist
    else:
        comparator = args.comparator

    # Instantiate the selected matcher class with the log-df weights.
    matcher = matcher(df=logdf, metric=comparator)
    data.normalize(matcher, logdf)

    printd("Finding matches...")
    matches = []
    with data.writer(sf) as sw, data.writer(vf) as vw:
        mcnt = 0
        timer = Timer()
        for pair in data.test():
            if sf:
                match = shingle(pair.s1["tokens"], pair.s2["tokens"])
                # min_score is assumed to be a module-level threshold.
                if match.score >= min_score:
                    sw.write(pair, match)

            if vf:
                printd("Matching pair %s" % (pair.pid), level=1)
                try:
                    sim = matcher.match(pair)
                    matches.append((matcher.tsim, str(matcher)))
                except ValueError as err:
                    printd(err)
                    sim = sim_thr
                printd("Match %0.4f for %s, %s" % (sim, pair.sid1, pair.sid2))
                if sim < sim_thr:
                    sim = sim_thr
                    start = matcher.start
                    end = matcher.end - matcher.start
                else:
                    start = -1
                    end = len(pair.s2["tokens"]) - 1
                match = Match(sim, start, end)
                vw.write(pair, match)

            mcnt += 1
            if (mcnt % 100000) == 0:
                # Throughput in thousands of matches per second (100k pairs per mark).
                print("%g tmps" % (100 / timer.mark()), file=sys.stderr)
            if limit and mcnt >= limit:
                return

        if conf.verbose:
            for tsim, match in sorted(matches):
                print(match)
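
A sketch of the command-line wiring this main() expects. The flag names are inferred from the attributes read above; they are assumptions, not the project's actual CLI, and the defaults are illustrative.

import argparse

def parse_args():
    # Hypothetical CLI; each flag corresponds to an args.* attribute read in main().
    p = argparse.ArgumentParser(description="Word-vector sentence-pair matching")
    p.add_argument("input_data")
    p.add_argument("--dataset", default="semeval")
    p.add_argument("--wordvec", default=None)         # vf: enables vector matching
    p.add_argument("--wvfile", default="wv.bin")      # wvout: cached vector file
    p.add_argument("--shingles", default=None)        # sf: shingle-match output
    p.add_argument("--sim", default="vecsim", choices=["minsim", "infsim", "vecsim"])
    p.add_argument("--comparator", default="cosine")
    p.add_argument("--sim-thr", dest="sim_thr", type=float, default=0.5)
    p.add_argument("--frequencies", default=None)     # optional JSON term counts
    p.add_argument("--limit", type=int, default=0)
    p.add_argument("--force", action="store_true")
    p.add_argument("--debug", action="store_true")
    p.add_argument("--verbose", action="store_true")
    return p.parse_args()

if __name__ == "__main__":
    main(parse_args())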