Example #1
def featurize(p, featurefns):
    a = []
    for accumulator in accumulators:
        a.append(accumulator(p))

    x = []
    for featurefn in featurefns:
        try:
            fs = featurefn(a)
            for f in fs:
                # Features must be scalar; if len() succeeds, f is a
                # sequence and the feature function is misbehaving.
                try:
                    len(f)
                    printd("Bad feature from %s" % featurefn.__name__, -1)
                    printd(f, -1)
                    sys.exit(-1)
                except TypeError:
                    pass
            x.extend(fs)
        except Exception:
            print >> sys.stderr, "Pair:", p.pid
            print >> sys.stderr, "%d" % len(p.s1["vector"]), " ".join(
                p.s1["wv_tokens"])
            print >> sys.stderr, "%d" % len(p.s2["vector"]), " ".join(
                p.s2["wv_tokens"])
            traceback.print_exc()
            sys.exit(-1)

    if len(x) == 0 and len(featurefns) != 0:
        print >> sys.stderr, "Pair:", p.pid
        print >> sys.stderr, "%d" % len(p.s1["vector"]), " ".join(
            p.s1["wv_tokens"])
        print >> sys.stderr, "%d" % len(p.s2["vector"]), " ".join(
            p.s2["wv_tokens"])
        sys.exit(-1)
    return x
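
A minimal usage sketch of the convention featurize() assumes: every feature function receives the accumulator outputs and must return an iterable of scalars. The helper functions and token lists below are hypothetical stand-ins, not the module's real accumulators.

def length_features(acc):
    s1_toks, s2_toks = acc
    return [len(s1_toks), len(s2_toks), abs(len(s1_toks) - len(s2_toks))]

def overlap_features(acc):
    s1_toks, s2_toks = acc
    common = set(s1_toks) & set(s2_toks)
    return [len(common) / float(max(len(s1_toks), 1))]

acc = (["a", "cat", "sat"], ["a", "dog", "sat", "down"])
x = []
for fn in (length_features, overlap_features):
    x.extend(fn(acc))
print(x)  # [3, 4, 1, 0.666...]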
Example #2
	def __init__(self, path, neg_samples=5):
		random.seed(42)
		pfiles = os.listdir(path)
		printd("Loading dataset")
		alldata = []
		size = 0
		if "train" in pfiles:
			for d in ["train", "test", "valid"]:
				if d not in pfiles:
					continue
				dpath = os.path.join(path, d)
				nset = NuggetSet(dpath, neg_samples)
				size += nset.size
				alldata.extend([nset.nuggets, nset.updates])
				setattr(self, d, nset.pairs)
		else:
			nset = NuggetSet(path, neg_samples)
			alldata.extend([nset.nuggets, nset.updates])
			self.test = nset.pairs
			size += nset.size

		self.size = size
		self.data = Superset(*alldata)
		self.writer = nset.writer
		if len(self.valid()) != 0:
			self._train = SuperList(self.train(), self.valid())
			self.train = lambda: self._train
Example #3
	def normalize(self):
		wv = self.wordvec
		printd("Normalizing: wv is %s:" % (str(wv.syn0.shape)))
		# Shift to Mean = 0
		#means = np.mean(wv.syn0, axis=0)
		#scales = 1 / np.maximum(abs(means), abs(1 - means))
		#wv.syn0 = ((wv.syn0 - means) / scales).astype(np.float32)
		wv.syn0 -= np.mean(wv.syn0, axis=0)
		# Cut off negative and add new vector as positive
		wv.syn0 = np.concatenate((np.maximum(0, wv.syn0), np.maximum(0, -wv.syn0)), axis=1)
		# Unit norm
		wv.syn0 = sklearn.preprocessing.normalize(wv.syn0, axis=0)
		self.size *= 2
		wv.vector_size = self.size
		printd("Done normalizing: wv is %s:" % (str(wv.syn0.shape)))
Example #4
    def maxShortSentence(self):
        ls = Cycle([0, 0])

        try:
            for dset in self.data:
                l = ls.nextitem()
                for s in dset:
                    cl = len(s["wv_tokens"])
                    if cl > l:
                        l = cl
                ls.setitem(l)
        except KeyError, e:
            printd(e, -1)
            printd(s, -1)
            traceback.print_stack()
            sys.exit(-1)
Example #5
	def logdf(self, tf=None):
		if tf is None:
			# Explicit None check: truth-testing a cached numpy array would raise.
			if self.__logdf is not None:
				return self.__logdf
			return np.zeros(self.size)

		logdf = np.zeros(self.size)
		nterms = 0

		for t, tc in tf.items():
			if t in self.wordvec:
				logdf += self[t] * tc
				nterms += 1

		self.__logdf = np.nan_to_num(np.log2(logdf))
		printd("Found %d (%0.2f%% toks, %0.2f%% wv) terms from vocab in wordvec" % (nterms, 100 * nterms / len(tf), 100 * nterms / len(self.wordvec.vocab)))
		return self.__logdf
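
A toy run of the aggregation above, with made-up word vectors and term counts standing in for the real wordvec and tf arguments:

import numpy as np

word_vectors = {"cat": np.array([0.2, 0.8]), "dog": np.array([0.5, 0.5])}
term_counts = {"cat": 10, "dog": 3, "unseen": 7}   # "unseen" has no vector

logdf = np.zeros(2)
nterms = 0
for term, count in term_counts.items():
    if term in word_vectors:
        logdf += word_vectors[term] * count
        nterms += 1
logdf = np.nan_to_num(np.log2(logdf))
print(nterms, logdf)  # 2 of 3 vocabulary terms contributed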
Example #6
def munkres_handler(signum, frame):
	printd("Can't keep waiting...")
	print frame
	raise Exception("ran out of time...")
Example #7
def main(args):
    global wordvec, wordvecf
    conf.debug = args.debug or args.verbose
    conf.verbose = args.verbose
    conf.args = args
    #nf = args.nuggets
    #uf = args.updates
    #mf = args.matches
    sf = args.shingles
    vf = args.wordvec
    #ef = args.evalfile
    wvout = args.wvfile
    sim_thr = args.sim_thr
    dset = args.dataset
    limit = args.limit

    #if args.dataset == "auto":
    #	if ef is not None:
    #		dset = "semeval"
    #	else:
    #		with open(glob.glob(nf)[0]) as nh:
    #			nfhead = nh.readline()
    #			if nfhead.startswith("query_id\tnugget_id"):
    #				dset = "ts"
    #			elif nfhead.startswith("query_id\tvs_id"):
    #				dset = "mclick"
    #			else:
    #				dset = "1click"

    if os.path.exists(wvout) and not args.force:
        wordvecf = wvout

    if vf:
        printd("Reading word vector...")
        #wordvec = load_wordvec()
        wordvec = WordVec(wordvecf)

    if args.sim == "minsim":
        matcher = MinDistSim
    elif args.sim == "infsim":
        matcher = InfSim
    else:
        matcher = VecSim

    if args.sim == "infsim" or args.comparator == "infsim":
        wordvec.normalize()

    #if dset == "ts":
    #	nuggfn = Nuggets
    #	updfn = Updates
    #	outfn = MatchWriter
    #elif dset == "1click":
    #	nuggfn = CLNuggets
    #	updfn = CLUpdates
    #	outfn = CLMatchWriter
    #elif dset == "mclick":
    #	nuggfn = MCNuggets
    #	updfn = Updates
    #	outfn = MCMatchWriter
    #elif dset == "semeval":
    #	data = SemEvalDataset(args.input_data, args.evalfile)
    #	outfn = data.writer
    #	if vf is not None:
    #		data.vectorize(wordvec)
    #else:
    #	nuggfn = MCNuggets
    #	updfn = Updates
    #	outfn = MCMatchWriter

    data = Dataset.load(args.input_data, dset)
    if vf is not None:
        data.vectorize(wordvec)
    #if dset == "semeval":
    #	data = SemEvalDataset(args.input_data, args.evalfile)
    #	#outfn = data.writer
    #	if vf is not None:
    #		data.vectorize(wordvec)
    #else:
    #	printd("Processing Nuggets...")
    #	#nuggets = nuggfn(nf, vectorize=vf is not None)

    #	printd("Processing Updates...")
    #	#updates = updfn(uf, vectorize=vf is not None)
    #	#data = NuggetDataset(nuggets, updates, mf)
    #	data = NuggetDataset(nf, uf, mf, dset=dset, vectorize=vf is not None)

    if vf and wvout is not None and wvout != wordvecf:
        printd("Rereading word vectors to optimize...")
        wv_toks = data.wv_sentences()
        #if dset == "semeval":
        #	wv_toks = data.wv_sentences()
        #else:
        #	wv_toks = nuggets.wv_text() + updates.wv_text()
        wordvec = WordVec(wordvecf,
                          sentences=wv_toks,
                          wvout=wvout,
                          size=wordvec.originalsize)
        if args.sim == "infsim" or args.comparator == "infsim":
            wordvec.normalize()
        data.vectorize(wordvec)
        with open(wvout + ".vocab", 'w') as wh:
            wh.write("\n".join(wordvec.vocab().keys()))
        with open(wvout + ".toks", 'w') as wh:
            wh.write("\n".join([" ".join(x) for x in wv_toks]))
        #vocab = nuggets.wv_vocab().union(updates.wv_vocab())
        #wordvec.trim(lambda word, count, min_count: gensim.utils.RULE_KEEP if word in vocab else gensim.utils.RULE_DISCARD)
        #wordvec.save(wvout)

    vocab = None
    if args.frequencies:
        try:
            with open(args.frequencies) as fh:
                vocab = json.load(fh)
            # For Term Frequencies instead of Document Frequencies
            # Could also do len(vocab[word]) if wanted to mimic DF
            if type(vocab.itervalues().next()) == dict:
                for word in vocab:
                    vocab[word] = sum(vocab[word].itervalues())
        except Exception:
            pass
    if vocab is None:
        vocab = data.wv_vocab()
    logdf = wordvec.logdf(vocab)
    logdffile = wordvecf + ".logdf"
    #if not os.path.exists(logdffile) or (os.path.getmtime(logdffile) < os.path.getmtime(wordvecf)):
    #	np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")
    np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")

    if args.comparator == "infsim" and args.sim != "infsim":
        comparator = InfSim(logdf).pairwisedist
    else:
        comparator = args.comparator

    matcher = matcher(df=logdf, metric=comparator)
    data.normalize(matcher, logdf)

    printd("Finding matches...")
    matches = []
    with data.writer(sf) as sw, data.writer(vf) as vw:
        mcnt = 0
        timer = Timer()
        for pair in data.test():
            if sf:
                match = shingle(pair.s1["tokens"], pair.s2["tokens"])
                if match.score >= min_score:
                    sw.write(pair, match)

            if vf:
                printd("Matching pair %s" % (pair.pid), level=1)
                try:
                    sim = matcher.match(pair)
                    matches.append((matcher.tsim, unicode(matcher)))
                except ValueError, err:
                    printd(err)
                    sim = sim_thr
                printd("Match %0.4f for %s, %s" % (sim, pair.sid1, pair.sid2))
                if sim < sim_thr:
                    sim = sim_thr
                    start = matcher.start
                    end = matcher.end - matcher.start
                else:
                    start = -1
                    end = len(pair.s2["tokens"]) - 1
                match = Match(sim, start, end)
                vw.write(pair, match)

            mcnt += 1
            if (mcnt % 100000) == 0:
                print >> sys.stderr, "%g tmps" % (100 / timer.mark())
            if limit and mcnt >= limit:
                return

        if conf.verbose:
            for tsim, match in sorted(matches):
                print match
Example #8
	def normalize(self, matcher, df):
		printd("Normalizing dset")
		for rid, rec in self.data.iteritems():
			rec["vector"], rec["vector_sum"] = matcher.normalize(rec["vector"], df)
Example #9
def processData(args):
	data = dataset.Dataset.load(args.input_data, args.dataset)
	wvout = args.wvfile
	if os.path.exists(wvout):
		wordvecf = wvout
	else:
		wordvecf = args.wvsource

	features = {x for x in args.basefeatures.split(',') if x != ''}
	matchers = {x for x in args.matchers.split(',') if x != ''}

	printd("Loading Word Vectors")
	wordvec = WordVec(wordvecf)
	printd("Vectorizing")
	data.vectorize(wordvec)
	maxwords = data.maxShortSentence()

	if wvout != wordvecf:
		printd("Rereading word vectors to optimize...")
		wv_toks = data.wv_sentences()
		wordvec = WordVec(wordvecf, sentences=wv_toks, wvout=wvout, size=wordvec.originalsize)
		data.vectorize(wordvec)

	conf.wvsize = wordvec.size

	# Train data
	printd("Computing basic WV Features")
	fs = FeatureSet(data, features)

	if "Pair" in matchers:
		printd("Computing Pair Features")
		matcher = vectorsim.PairFeatures(dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher)

	if "Shingle" in matchers:
		printd("Computing Shingle Features")
		matcher = Shingler(slop=12, lmbda=0.95)
		fs.addMatcher(matcher)

	vocab = None
	if "MinDistSim" in matchers:
		printd("Computing MinDist")
		vocab = fs.data.wv_vocab()
		data.weight()
		comparator = 'cosine'
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'cos')
		printd("Computing MinDist-Euclidean")
		comparator = 'euclidean'
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'euc')

	if "NGram" in matchers:
		printd("Computing MinDist-Ngram")
		# Reuse the vocab computed for MinDistSim when available.
		if vocab is None:
			vocab = fs.data.wv_vocab()
		comparator = 'cosine'
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, ngram=2, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'cos-bigram')
		comparator = 'cosine'
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, ngram=3, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'cos-trigram')

	if "WWSim" in matchers:
		printd("Computing WWSim")
		matcher = vectorsim.WWSim(wordvec=wordvec, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher)

	if "InfRankSim" in matchers:
		printd("Computing InfRankSim")
		matcher = vectorsim.InfRankSim(data=data, wordvec=wordvec, dimfeatures=args.dimfeatures)
		printd("InfRankSim Matching")
		fs.addMatcher(matcher)

	if "InfSim" in matchers:
		# We normalize after so primary features are raw word vectors
		# InfSim
		printd("Computing InfSim")
		wordvec.normalize()
		data.vectorize(wordvec)
		matcher = vectorsim.InfSim(data=data, wordvec=wordvec, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher)

	return fs
Example #10
    def addMatcher(self, matcher, namebase=""):
        names = [namebase + x for x in matcher.names()]
        self.names.extend(names)
        for datum, pair in chain(izip(self.train, self.data.train()),
                                 izip(self.test, self.data.test())):
            try:
                matcher.match(pair)
            except ValueError, err:
                printd(err)
            fs = matcher.features()
            if len(names) != len(fs):
                printd(
                    "Incorrect names for features for %s: %d vs %d" %
                    (matcher.__class__.__name__, len(names), len(fs)), -1)
                printd(names, -1)
                sys.exit(-1)

            for f in fs:
                # Checking
                if np.isnan(f) or np.isinf(f):
                    printd("Bad feature from %s" % matcher.__class__.__name__,
                           -1)
                    printd(f, -1)
                    printd(pair, -1)
                    sys.exit(-1)
            datum.extend(fs)
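
addMatcher() only relies on three methods of a matcher: names() lists one label per feature, match(pair) runs the comparison (and may raise ValueError), and features() returns the same number of scalar values. A toy matcher satisfying that interface, with a hypothetical pair layout:

class TokenOverlapMatcher(object):
    def names(self):
        return ["overlap", "lendiff"]

    def match(self, pair):
        t1 = set(pair.s1["tokens"])
        t2 = set(pair.s2["tokens"])
        self._features = [len(t1 & t2), abs(len(t1) - len(t2))]

    def features(self):
        return self._features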
Example #11
def main(args):
    conf.debug = args.debug
    conf.verbose = args.verbose
    conf.args = args
    if args.alphas == "auto":
        pass  # TODO
    else:
        conf.alphas = [float(x) for x in args.alphas.split(",")]

    try:
        args.max_features = int(args.max_features)
    except (ValueError, TypeError):
        try:
            args.max_features = float(args.max_features)
        except (ValueError, TypeError):
            pass

    fitargs = {}
    predictargs = {}

    if args.featurefile and not args.force and os.path.exists(
            args.featurefile + ".train"):
        printd("Loading Saved Features")
        fs = FeatureSet.read(args.featurefile)
    else:
        fs = processData(args)
        if args.featurefile:
            printd("Writing Features")
            fs.write(args.featurefile)

    #kpca = KernelPCA(kernel="rbf", gamma=10)
    if args.model == "randomforests":
        if args.classify:
            model = ensemble.RandomForestClassifier
        else:
            model = ensemble.RandomForestRegressor
    elif args.model == "extratrees":
        if args.classify:
            model = ensemble.ExtraTreesClassifier
        else:
            model = ensemble.ExtraTreesRegressor
    elif args.model == "gradientboosting":
        if args.classify:
            model = ensemble.GradientBoostingClassifier
        else:
            model = ensemble.GradientBoostingRegressor
    elif args.model == "decisiontree":
        model = DecisionTreeRegressor
    elif args.model == "adaboost":
        model = ensemble.AdaBoostRegressor
    elif args.model == "linreg":
        model = LinearRegression
    elif args.model == "autolearn":
        printd("AutoLearn disabled as it does not work properly")
        sys.exit(-1)
        #model = AutoSklearnClassifier
        fitargs["dataset_name"] = "semeval"
    elif args.model == "nn":
        model = NNModel
        fitargs["nb_epoch"] = 10
        fitargs["batch_size"] = 32
        fitargs["verbose"] = 2
        predictargs["verbose"] = 0
    elif args.model == "None":
        printd("No Model specified, exiting")
        sys.exit(-1)
    else:
        printd("Invalid model %s" % args.model)
        sys.exit(-1)

    if args.classify:
        # Forest Classifiers do not allow non-binary labels, so we do it by sample weight instead
        byweight = issubclass(model, ensemble.forest.ForestClassifier)
        lintrainlabels = np.copy(fs.trainlabels)
        fs.discretizeLabels(byweight=byweight)
        if byweight:
            fitargs["sample_weight"] = fs.trainweights
    else:
        lintrainlabels = np.array(fs.trainlabels)
    fs.freeze()
    printd("Train labels:" + str(fs.trainlabels.shape))

    if (not args.force) and args.modelfile and os.path.exists(args.modelfile):
        if args.model == "nn":
            import keras
            model = keras.models.load_model(args.modelfile)
        else:
            model = joblib.load(args.modelfile)
    else:
        params = default_params[args.model]
        for param_name, param_value in params.items():
            try:
                pval = getattr(args, param_name)
                if pval is not None:
                    params[param_name] = pval
            except AttributeError:
                pass

        if "input_dim" in params:
            # -1 for label
            params["input_dim"] = len(fs.names) - 1
        model = model(**params)

        if args.gridsearch:
            model = GridSearchCV(model,
                                 scoring=evalModel,
                                 cv=5,
                                 error_score=0,
                                 param_grid=param_grids[args.model],
                                 n_jobs=16,
                                 pre_dispatch="2*n_jobs",
                                 verbose=10)
        #model = Pipeline(steps=[('pca', kpca), ('dtree', dtree)])
        printd("Training")
        model.fit(fs.train, fs.trainlabels, **fitargs)
        #X_kpca = kpca.fit_transform(X)
        #dtree.fit(traindata, trainlabels)
        if args.modelfile:
            try:
                if args.model == "nn":
                    model.save(args.modelfile)
                else:
                    joblib.dump(model, args.modelfile)
            except Exception:
                printd(
                    "Could not save model, autolearn does not support saving")

    printd("Evaluating")
    print "Using Features: %s" % args.basefeatures
    print "Using Matchers: %s" % args.matchers
    print "Train Accuracy"
    evalData(model=model,
             data=fs.train,
             labels=lintrainlabels,
             classify=args.classify,
             obs=model.oob_prediction_,
             **predictargs)
    # trainobs = _

    print "Test Accuracy"
    testobs = evalData(model=model,
                       data=fs.test,
                       labels=fs.testlabels,
                       classify=args.classify,
                       **predictargs)

    if args.writematches:
        try:
            fs.data.writer
        except AttributeError:
            fs.data = dataset.Dataset.load(args.input_data, args.dataset)
        trainwriter = fs.data.writer(args.writematches + ".train")
        testwriter = fs.data.writer(args.writematches + ".test")
        for pair in fs.data.train():
            trainwriter.write(pair, Match(score=pair.label, autop=0))
        for pair, obs in izip(fs.data.test(), testobs):
            testwriter.write(pair, Match(score=obs))
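
A condensed sketch of the grid-search branch above with a concrete parameter grid and random data; param_grids, evalModel, and the argument plumbing in the real script are project-specific, and the import uses the current sklearn.model_selection location.

import numpy as np
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

X = np.random.rand(100, 5)
y = np.random.rand(100)

search = GridSearchCV(
    ensemble.RandomForestRegressor(),
    param_grid={"n_estimators": [50, 100], "max_features": [0.5, 1.0]},
    cv=5,
    error_score=0,
    n_jobs=1,
)
search.fit(X, y)
print(search.best_params_)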
Example #12
	def normalize(cls, s, df):
		if len(s) == 0:
			return s, 0
		if np.any(np.isnan(df)):
			printd("Hmm, nan for df %0.4f")
			printd("df:\n" + str(df))
			sys.exit(1)
		# TODO: This should be a weighted sum with IDF
		# As a result of this sum, different length sentences naturally receive a
		# penalty, as the sum is naturally larger than the min.
		# Also, we aren't looking at euclidean distance, so we may be losing out on scale information
		# But if we did, longer sentences would be harder to match together (as distances would compound).
		# Maybe should divide by min sentence length or something of the sort...
		# This is avg, not sum, which probably causes all sorts of weirdness.
		ps = np.sum(s, axis=0) / np.sum(s)
		if np.any(np.isnan(ps)):
			printd("Hmm, nan for ps %0.4f" % np.sum(s))
			printd("ps:\n" + str(ps))
			printd("s:\n" + str(s))
			printd("df:\n" + str(df))
			sys.exit(1)
		ts = np.sum(np.multiply(ps, df))
		if ts == 0:
			printd("Hmm, 0 for ts")
			printd("ps:\n" + str(ps))
			printd("df:\n" + str(df))
			sys.exit(1)
		return ps, ts
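
A toy run of the normalization above, with a two-token sentence matrix s and a made-up per-dimension weight vector df:

import numpy as np

s = np.array([[0.1, 0.9], [0.6, 0.4]])   # one row of weights per token
df = np.array([1.0, 2.0])

ps = np.sum(s, axis=0) / np.sum(s)       # averaged distribution over dimensions
ts = np.sum(np.multiply(ps, df))         # its total mass under df
print(ps, ts)  # [0.35 0.65] 1.65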
Example #13
class MinDistSim(Matcher):

	def __init__(self, df=None, metric='cosine', maxsent=20, ngram=1, recurse=False, dimfeatures=True):
		#self.dist = ndist(s1, s2)
		#s1 = s1["vector"]
		#s2 = s2["vector"]
		self.metric = getMetric(metric)
		self._names = ["MDS_" + x for x in ["tsim", "lsim", "kdist", "kldist", "ldist", "kt", "tmax", "tmin", "tsum", "tstd", "tmaxidf", "tsumidf"]]
		maxsent = maxsent - ngram + 1
		if dimfeatures:
			self._names.extend(["MDS_w%03d" % x for x in range(maxsent)])
		self.maxsent = maxsent
		self.ngram = ngram
		self.recurse = recurse
		self.vocab = df
		self.wordcount = df.total
		self.dimfeatures = dimfeatures

	def match(self, pair):
		s1l = len(pair.s1["vector"])
		s2l = len(pair.s2["vector"])
		self.tsim = float('-9999')
		self.lsim = float('-9999')
		self.minlen = min(s1l, s2l)
		self.maxlen = max(s1l, s2l)
		self.nmatches = 0
		self.start = -1
		self.end = -1
		if (self.minlen == 0 or
				self.maxlen >= 100):
			return self.tsim

		# For simplicity in later code, make the shorter one first
		if s1l < s2l:
			self.s1 = pair.s1
			self.s2 = pair.s2
			s1l = len(pair.s1["vector"])
			s2l = len(pair.s2["vector"])
		else:
			self.s1 = pair.s2
			self.s2 = pair.s1

		wc = self.wordcount
		if "wv_idfs" not in self.s1:
			self.s1["wv_idfs"] = [math.log(wc / self.vocab[x], 2) for x in self.s1["wv_tokens"]]
		if "wv_idfs" not in self.s2:
			self.s2["wv_idfs"] = [math.log(wc / self.vocab[x], 2) for x in self.s2["wv_tokens"]]

		if self.ngram > 1:
			ng = self.ngram
			v1 = self.s1["vector"]
			v2 = self.s2["vector"]
			t1 = self.s1["wv_tokens"]
			t2 = self.s2["wv_tokens"]
			#idf1 = self.s1["wv_idfs"]
			#idf2 = self.s2["wv_idfs"]
			weights1 = self.s1["weights"]
			weights2 = self.s2["weights"]
			nv1 = [sum(v1[i:i + ng]) for i in range(max(1, len(v1) - ng + 1))]
			nv2 = [sum(v2[i:i + ng]) for i in range(max(1, len(v2) - ng + 1))]
			nt1 = ["_".join(t1[i:i + ng]) for i in range(max(1, len(t1) - ng + 1))]
			nt2 = ["_".join(t2[i:i + ng]) for i in range(max(1, len(t2) - ng + 1))]
			#nidf1 = [max(idf1[i:i + ng]) for i in range(max(1, len(idf1) - ng + 1))]
			#nidf2 = [max(idf2[i:i + ng]) for i in range(max(1, len(idf2) - ng + 1))]
			nweights1 = [max(weights1[i:i + ng]) for i in range(max(1, len(weights1) - ng + 1))]
			nweights2 = [max(weights2[i:i + ng]) for i in range(max(1, len(weights2) - ng + 1))]
			#self.s1 = {"vector": nv1, "wv_tokens": nt1, "wv_idfs": nidf1}
			#self.s2 = {"vector": nv2, "wv_tokens": nt2, "wv_idfs": nidf2}
			self.s1 = {"vector": nv1, "wv_tokens": nt1, "weights": nweights1}
			self.s2 = {"vector": nv2, "wv_tokens": nt2, "weights": nweights2}

			self.minlen = max(self.minlen - ng + 1, 1)
			self.maxlen = max(self.maxlen - ng + 1, 1)

		self.dists = [1] * self.minlen

		self.pair = pair
		#self.dist = pairdist(self.s1["vector"], self.s2["vector"], fn=self.metric)
		#self.dist = pairdist(self.s1, self.s2, fn=self.metric)
		dist = self.metric(self.s1, self.s2)

		# scale by max of idf
		#for i in range(dist.shape[0]):
		#	for j in range(dist.shape[1]):
		#		dist[i][j] *= max(self.s1["wv_idfs"][i], self.s2["wv_idfs"][j])

		self.matchv = np.zeros(dist.shape, int)
		np.fill_diagonal(self.matchv, 1)
		if np.sum(dist) == 0:
			self.tsim = 1
			self.nmatches = min(dist.shape)
			self.start = 0
			self.end = dist.shape[1] - 1
			return self.tsim
		if (dist == dist[0]).all():
			self.tsim = 1 - sum(dist[0])
			self.nmatches = min(dist.shape)
			self.start = 0
			self.end = dist.shape[1] - 1
			return self.tsim
		if (dist.T == dist[:, 0]).all():
			self.tsim = 1 - sum(dist[:, 0])
			self.nmatches = min(dist.shape)
			self.start = 0
			self.end = dist.shape[1] - 1
			return self.tsim

		signal.signal(signal.SIGALRM, munkres_handler)
		signal.alarm(10)
		try:
			matches = munkres(dist)
		except Exception, e:
			printd(e)
			printd("dist: " + dist.shape)
			printd(dist)
			return self.tsim
		signal.alarm(0)
		self.matchv = matches

		tdist = 0
		tmaxidf = 0
		tsumidf = 0
		nmatches = 0
		mstart = dist.shape[1]
		mend = 0
		#print self.s1["text"]
		#print self.s2["text"]
		#print " ".join(self.s1["wv_tokens"])
		#print " ".join(self.s2["wv_tokens"])
		s1tok = self.s1["wv_tokens"]
		s2tok = self.s2["wv_tokens"]
		matcharr = [0] * matches.shape[0]
		dists = [0] * matches.shape[0]
		matchedy = [0] * matches.shape[1]
		for i in range(matches.shape[0]):
			for j in range(matches.shape[1]):
				if matches[i, j]:
					matchedy[j] = 1
					tdist += dist[i, j]
					#tmaxidf += dist[i, j] * max(self.s1["wv_idfs"][i], self.s2["wv_idfs"][j])
					#tsumidf += dist[i, j] * sum((self.s1["wv_idfs"][i], self.s2["wv_idfs"][j]))
					wi = self.s1["weights"][i]
					wj = self.s2["weights"][j]
					tmaxidf += dist[i, j] * max(wi, wj)
					tsumidf += dist[i, j] * sum((wi, wj))
					printd("%s\t%s\t%0.4f\t%0.4f\t%0.4f" % (s1tok[i], s2tok[j], dist[i, j], wi, wj), level=1, sock=sys.stdout)
					nmatches += 1
					matcharr[i] = j
					dists[i] = dist[i, j]
					if j < mstart:
						mstart = j
					if j > mend:
						mend = j
		ldist = tdist
		tdist = tdist * max(dist.shape) / pow(min(dist.shape), 2)
		tmaxidf = tmaxidf * max(dist.shape) / pow(min(dist.shape), 2)
		tsumidf = tsumidf * max(dist.shape) / pow(min(dist.shape), 2)
		kt, ktp = kendalltau(range(len(matcharr)), matcharr)
		printd("Score: %0.4f\t%0.4f\t%0.4f\tLabel: %g\n" % (tdist, tmaxidf, tsumidf, pair.label), level=1, sock=sys.stdout)
		if self.recurse:
			# Remove matches from dist array, and rerun munkres
			# Repeat until dist array is empty
			pass
		else:
			for i in range(matches.shape[1]):
				if not matchedy[i]:
					# Penalize unmatched s2 tokens by their best available distance.
					ldist += min(dist[:, i])
		ldist /= max(dist.shape)
		# TODO:
		# Dist penalty is at most beta
		# The problem with this is that there may be a better pairing between the two sentences
		# if you optimize for mindist with dist penalty.
		# Also could add a weight to each pairing like IDF, most important for the
		# summing, but a different sum would again potentially affect the optimal
		# match.
		beta = 1
		self.kdist = tdist * (1 + beta * (kt + 1) / 2)
		self.kldist = ldist * (1 + beta * (kt + 1) / 2)
		self.ldist = ldist
		#print "Score: %g" % tsim
		#print "Label: %g" % self.pair.label
		self.tsim = 1 - tdist
		self.tmaxidf = tmaxidf
		self.tsumidf = tsumidf
		self.nmatches = nmatches
		self.start = mstart
		self.end = mend
		self.kt = kt
		self.dists = sorted(dists, reverse=True)
		self.lsim = tdist + (max(dists) * (self.maxlen - self.minlen))
		self.tmax = max(dists)
		self.tmin = min(dists)
		self.tsum = sum(dists)
		self.tstd = np.std(dists)
		return self.tsim
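
The core of MinDistSim is a minimum-cost one-to-one matching between the two sentences' token vectors; a few-line sketch of that idea using SciPy's Hungarian-algorithm implementation instead of the module's munkres()/SIGALRM machinery (random vectors stand in for real word embeddings):

import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist

s1 = np.random.rand(4, 50)    # 4 tokens in the shorter sentence
s2 = np.random.rand(7, 50)    # 7 tokens in the longer one

dist = cdist(s1, s2, metric="cosine")      # pairwise token distances
rows, cols = linear_sum_assignment(dist)   # min-cost one-to-one matching
tdist = dist[rows, cols].sum() * max(dist.shape) / min(dist.shape) ** 2
print("tsim =", 1 - tdist)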