Example #1
    def extendAS(self,ext_vocab=[]):

        ext_vocab=[x.lower() for x in ext_vocab]

        #this is just a restart, so the existing labels are still valid
        labels={key: value for key, value in enumerate(self.activeSearch.labels.tolist()) if value > -1}
        self.extendedVocabulary.update(set(ext_vocab))

        #attach columns for the new vocabulary terms only
        if len(ext_vocab)==0:
            return
        ngram_range=(500,0)#sentinel (min,max); tightened by the loop below
        for x in ext_vocab:
            l=len(x.split())
            ngram_range=(min((l,ngram_range[0])),max((l,ngram_range[1])))
        tempvectorizer=CountVectorizer(analyzer='word',vocabulary=ext_vocab,binary=True,ngram_range=ngram_range,decode_error=u'ignore')
        addX=tempvectorizer.fit_transform(self.text)
        #scale by mean distance and some factor
        #some_factor=2
        #addX.multiply(self.scalefactor*float(some_factor))

        #add the new columns
        self.Xsparse = sparse.hstack((self.Xsparse,addX))

        if self.dimred:
            print(self.Xsparse.shape)
            svd=TruncatedSVD(n_components=self.n_components)
            X=svd.fit_transform(self.Xsparse)
            print("dimensionality reduction leads to explained variance ratio sum of "+str(svd.explained_variance_ratio_.sum()))
            self.sparse=False
        else:
            X=self.Xsparse
        params=asI.Parameters(pi=self.prevalence,verbose=False,sparse=self.sparse,eta=self.eta)
        self.activeSearch = asI.kernelAS(params=params) ##fast
        self.activeSearch.initialize(X.transpose(),init_labels = labels)
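
The heart of extendAS is a CountVectorizer pinned to the new phrases, so each phrase becomes one binary column. A self-contained sketch of that step (toy texts and phrases, illustrative names only):

from sklearn.feature_extraction.text import CountVectorizer

texts = ["wire transfer to escrow", "meet at the corner", "escrow account opened"]
ext_vocab = ["wire transfer", "escrow"]
lengths = [len(p.split()) for p in ext_vocab]
# restrict the vectorizer to the supplied phrases; ngram_range must span their word lengths
vec = CountVectorizer(analyzer='word', vocabulary=ext_vocab, binary=True,
                      ngram_range=(min(lengths), max(lengths)))
addX = vec.fit_transform(texts)  # sparse, one binary column per phrase
print(addX.toarray())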
Example #2
def test_nan ():
	import cPickle as pickle
	import os, os.path as osp

	with open(osp.join(os.getenv('HOME'), 'Research/Data/ActiveSearch/ben/forumthreadsSparseMatrix.pkl'),'rb') as fl:
		X = pickle.load(fl)
		X = X.T

	# import IPython 
	# IPython.embed()
	# drop all-zero rows, then all-zero columns
	X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])),:]
	X = X[:,np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]

	X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])),:]
	X = X[:,np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]
	print(X.shape)
	r,n = X.shape

	nt = int(0.05*n)
	num_eval = 50
	Y = np.array([1]*nt + [0]*(n-nt), dtype=int)
	nr.shuffle(Y)

	pi = float(sum(Y))/len(Y)  # float division; under Python 2, sum(Y)/len(Y) floors to 0
	init_pt = 537

	# import IPython 
	# IPython.embed()

	# A = np.array((X.T.dot(X)).todense())
	t1 = time.time()

	verbose = True
	prms = ASI.Parameters(pi=pi,sparse=True, verbose=verbose)	
	kAS = ASI.kernelAS(prms)
	kAS.initialize(X)


	init_lbls = {init_pt:1}

	kAS.firstMessage(init_pt)
	# fs2 = [kAS.f]

	import IPython
	IPython.embed()

	for i in range(num_eval):
		idx1 = kAS.getNextMessage()
		kAS.setLabelCurrent(Y[idx1])
		init_lbls[idx1] = Y[idx1]
		import IPython
		IPython.embed()
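
The repeated slicing above is the snippets' idiom for dropping all-zero rows and then all-zero columns of a sparse matrix. A minimal, self-contained demonstration:

import numpy as np
import scipy.sparse as sparse

X = sparse.csr_matrix(np.array([[1, 0, 2],
                                [0, 0, 0],
                                [0, 0, 3]]))
X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])), :]  # keep rows with a nonzero sum
X = X[:, np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]  # then columns
print(X.toarray())  # [[1 2], [0 3]]: row 1 and column 1 were all zeros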
Example #3
def test_CC ():

	nac = np.allclose

	n = 1000
	r = 100
	nt = 200
	rcross = 0
	X,Y = createFakeData3(n, r, nt, rcross)

	num_eval = 50
	pi = float(sum(Y))/len(Y)
	init_pt = 5

	# import IPython 
	# IPython.embed()

	A = X.T.dot(X)
	t1 = time.time()

	verbose = True
	prms = ASI.Parameters(pi=pi,sparse=False, verbose=verbose)	
	kAS = ASI.kernelAS(prms)
	kAS.initialize(X)
	
	sAS = ASI.shariAS(prms)
	sAS.initialize(A)
	#sAS2 = ASI.naiveShariAS(prms)

	kAS.firstMessage(init_pt)
	sAS.firstMessage(init_pt)
	# fs2 = [kAS.f]

	for i in range(num_eval):
		idx1 = kAS.getNextMessage()
		kAS.setLabelCurrent(Y[idx1])
		# init_lbls[idx1] = Y[idx1]
		idx2 = sAS.getNextMessage()
		sAS.setLabelCurrent(Y[idx2])

		print('NEXT')
		print(idx1==idx2)
		print(nac(kAS.f, sAS.f))
		# fs2.append(kAS.f)
		# fs3.append(sAS.f)

	import IPython 
	IPython.embed()
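
createFakeData3 is not defined in these snippets; judging from the call, it returns an r x n feature matrix X and a binary label vector Y with nt positives, with rcross features shared between the classes. A plausible stand-in (an assumption, not the original helper):

import numpy as np
import numpy.random as nr

def createFakeData3(n, r, nt, rcross):
    Y = np.array([1]*nt + [0]*(n - nt), dtype=int)
    nr.shuffle(Y)
    X = np.zeros((r, n))
    half = r // 2
    for j in range(n):
        # positives load on the first half of the features, negatives on the second
        rows = np.arange(half) if Y[j] == 1 else np.arange(half, r)
        X[rows, j] = nr.rand(len(rows))
        if rcross > 0:
            X[:rcross, j] += 0.1 * nr.rand(rcross)  # shared 'cross-talk' features
    return X, Y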
Example #4
def test_interface ():
	verbose = False
	#ts_data = ef.load_timestamps (tsfile)
	Xfull = load_sparse_csr('Xfull1.npz')

	r,n = Xfull.shape

	nt = int(0.05*n)
	num_eval = 1000
	# num_eval = nt*2
	Y = np.array([1]*nt + [0]*(n-nt), dtype=int)

	pi = float(sum(Y))/len(Y)
	init_pt = 100

	t1 = time.time()

	prms = ASI.Parameters(pi=pi,sparse=True, verbose=verbose)	
	kAS = ASI.kernelAS(prms)
	kAS.initialize(Xfull)
	kAS.firstMessage(init_pt)
	fs2 = [kAS.f]	

	for i in range(num_eval):
		idx = kAS.getNextMessage()
		kAS.setLabelCurrent(Y[idx])
		fs2.append(kAS.f)

	t2 = time.time()

	f1,h1,s1,fs1,dtinv1 = AS.kernel_AS (Xfull, Y, pi=pi, num_eval=num_eval, init_pt=init_pt, verbose=verbose,all_fs=True,tinv=True,sparse=True)

	t3 = time.time()

	checks = [np.allclose(fs1[i],fs2[i]) for i in range(len(fs1))]

	import IPython
	IPython.embed()
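
load_sparse_csr is used by this and several later tests but is not defined in them. The usual companion to an np.savez-based save_sparse_csr looks like this (an assumption about this codebase, though the pattern is standard):

import numpy as np
import scipy.sparse as sparse

def load_sparse_csr(filename):
    loader = np.load(filename)
    return sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                             shape=loader['shape'])

Example #5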
    dataConn = mysql_conn.flatfileDataConnect()
    message_count = dataConn.connect(args.JSON_path)
else:
    dataConn = mysql_conn.mysqlDataConnect()
    message_count = dataConn.connect(args.database, args.database_hostname, args.database_user, args.database_pass)

activeSearch = None

# when firstMessage is called we reinitialize the kernel algorithm. However, calling
# initialize again requires us to invert C, so we could be smarter and cache that
# inverse; for now the inversion only takes a couple of seconds, so we leave caching as future work
restart_save = None
first_run = True
if (args.method == "kernel"):
    print "Using kernelAS"
    activeSearch = asI.kernelAS()
    wMat = dataConn.getFinalFeatureMatrix(args.wordlimit,args.skip_stemmer, args.num_cpus, message_count, args.out_to_database, args.in_from_database, 0,0)
    restart_save = wMat.copy()
    activeSearch.initialize(wMat)
elif (args.method == "shari"):
    print "Using shariAS"
    activeSearch = asI.shariAS()   
    A = dataConn.getAffinityMatrix(args.wordlimit,args.skip_stemmer,args.num_cpus, message_count, args.out_to_database, args.in_from_database, 0,0)
    # Feeding in the dense version to shari's code because the sparse version is not implemented 
    activeSearch.initialize(np.array(A.todense())) 
elif (args.method == "naiveshari"):
    print "Using naieveShariAS"
    activeSearch = asI.naiveShariAS()   
    A = dataConn.getAffinityMatrix(args.wordlimit,args.skip_stemmer,args.num_cpus, message_count, args.out_to_database, args.in_from_database, 0,0)
    # Feeding in the dense version to shari's code because the sparse version is not implemented 
    activeSearch.initialize(np.array(A.todense())) 
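
The restart_save copy above hints at how a restart would be wired: reinitialize from the saved matrix instead of rebuilding the features. A hedged sketch of that control flow (hypothetical helper, not code from this script):

def restart_search(activeSearch, restart_save, start_idx):
    # initialize() repeats the C inversion mentioned in the comment above;
    # caching that inverse is the suggested future work
    activeSearch.initialize(restart_save.copy())
    activeSearch.firstMessage(start_idx)
    return activeSearch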
Example #6
def test_warm_start ():

	verbose = True
	nac = np.allclose
	#ts_data = ef.load_timestamps (tsfile)
	Xfull = load_sparse_csr('Xfull1.npz')
	# print Xfull.shape
	# Xfull = Xfull[np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=1))[0])),:]
	# Xfull = Xfull[:,np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=0))[1]))]
	# # r,n = Xfull.shape

	# print Xfull.shape
	# Xfull = Xfull[np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=1))[0])),:]
	# Xfull = Xfull[:,np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=0))[1]))]

	# getting rid of features which are zero for all these elements
	n = 300
	r = 600
	X = Xfull[:,:n]

	X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])),:]
	X = X[:,np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]

	X = X[:r,:]
	X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])),:]
	X = X[:,np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]
	print(X.shape)
	#X = np.load('X11.npy')
	r,n = X.shape
	
	nt = int(0.05*n)
	num_eval = 50
	Y = np.array([1]*nt + [0]*(n-nt), dtype=int)
	nr.shuffle(Y)

	pi = float(sum(Y))/len(Y)
	init_pt = 5

	# import IPython 
	# IPython.embed()

	A = np.array((X.T.dot(X)).todense())
	t1 = time.time()

	prms = ASI.Parameters(pi=pi,sparse=True, verbose=verbose)	
	kAS = ASI.kernelAS(prms)
	kAS.initialize(X)
	
	kAS2 = ASI.kernelAS(prms)
	sAS = ASI.shariAS(prms)
	sAS2 = ASI.naiveShariAS(prms)

	# import IPython
	# IPython.embed()

	init_lbls = {init_pt:1}

	kAS.firstMessage(init_pt)
	fs2 = [kAS.f]

	for i in range(num_eval):
		idx1 = kAS.getNextMessage()
		kAS.setLabelCurrent(Y[idx1])
		init_lbls[idx1] = Y[idx1]
		# sAS.setLabelCurrent(Y[idx2])
		# fs2.append(kAS.f)
		# fs3.append(sAS.f)

	print("Batch initializing:")
	print("Kernel AS:")
	kAS2.initialize(X, init_lbls)
	print("Shari AS:")
	sAS.initialize(A, init_lbls)
	print("Naive Shari AS:")
	sAS2.initialize(A, init_lbls)


	import IPython
	IPython.embed()
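
test_warm_start ends in an interactive embed, and nac is defined but never used; the checks it appears to be building toward would compare the batch-initialized solvers against the sequentially trained one, e.g.:

print(nac(kAS.f, kAS2.f))  # warm start should reproduce the sequential state
print(nac(kAS2.f, sAS.f))  # kernel vs. Shari formulation on the same labels
print(nac(sAS.f, sAS2.f))  # exact vs. naive implementation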
Example #7
def test_interface3 ():

	verbose = True
	nac = np.allclose
	#ts_data = ef.load_timestamps (tsfile)
	Xfull = load_sparse_csr('Xfull1.npz')
	print(Xfull.shape)
	Xfull = Xfull[np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=1))[0])),:]
	Xfull = Xfull[:,np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=0))[1]))]
	# r,n = Xfull.shape

	print(Xfull.shape)
	Xfull = Xfull[np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=1))[0])),:]
	Xfull = Xfull[:,np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=0))[1]))]

	# getting rid of features which are zero for all these elements
	# n = 300
	# r = 600
	X = Xfull#[:,:n]

	# X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])),:]
	# X = X[:,np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]

	# X = X[:r,:]
	# X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])),:]
	# X = X[:,np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]
	# print X.shape
	# #X = np.load('X11.npy')
	r,n = X.shape
	
	nt = int(0.05*n)
	num_eval = 50
	Y = np.array([1]*nt + [0]*(n-nt), dtype=int)

	pi = float(sum(Y))/len(Y)
	init_pt = 5

	# import IPython 
	# IPython.embed()

	A = np.array((X.T.dot(X)).todense())
	t1 = time.time()

	prms = ASI.Parameters(pi=pi,sparse=True, verbose=verbose)	
	kAS = ASI.kernelAS(prms)
	kAS.initialize(X)
	sAS = ASI.shariAS(prms)
	sAS.initialize(A)

	# ofk = kAS.f
	# ofs = sAS.f

	# import IPython
	# IPython.embed()


	kAS.firstMessage(init_pt)
	fs2 = [kAS.f]
	sAS.firstMessage(init_pt)
	fs3 = [sAS.f]

	# #
	# lbl = 1
	# idx = 5
	
	# B = np.ones(n)/(1+prms.w0)
	# D = A.sum(axis=1)
	# BDinv = np.diag(np.squeeze(B*1./D))
	# IA = np.eye(n) - BDinv.dot(A)
	# IAi = np.matrix(nlg.inv(IA))
	# IAk = nlg.inv(np.eye(n) + kAS.BDinv.dot(X.T.dot(nlg.inv(np.eye(r) - X.dot(kAS.BDinv.dot(X.T))))).dot(X.todense()))
	# IAki = nlg.inv(IAk)
	
	# t = (1+prms.w0)*(1-prms.eta)
	# e = np.zeros((n,1))
	# e[idx] = 1
	# IA2 = IA + (1-t)*e.dot(e.T).dot(BDinv.dot(A))
	# ai = (1./D)[idx]/(1+ prms.w0)*A[idx,:]
	# Ad = (1-t)*IAi[:,idx].dot(ai.dot(IAi))/(1 + (1-t)*ai.dot(IAi[:,idx]))
	# IA2i = IAi - Ad
	#


	# import IPython
	# IPython.embed()

	for i in range(num_eval):
		idx1 = kAS.getNextMessage()
		idx2 = sAS.getNextMessage()
		print('NEXT')
		print(idx1==idx2)
		print(nac(kAS.f, sAS.f))
		# import IPython
		# IPython.embed()

		kAS.setLabelCurrent(Y[idx1])
		sAS.setLabelCurrent(Y[idx2])
		fs2.append(kAS.f)
		fs3.append(sAS.f)

	t2 = time.time()

	# f1,h1,s1,fs1,dtinv1 = AS.kernel_AS (Xfull, Y, pi=pi, num_eval=num_eval, init_pt=init_pt, verbose=verbose,all_fs=True,tinv=True,sparse=True)

	t3 = time.time()

	# checks = [np.allclose(fs1[i],fs2[i]) for i in range(len(fs1))]

	import IPython
	IPython.embed()
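
The large commented-out block above appears to be testing a rank-one (Sherman-Morrison) update of the inverse of I - BD^-1 A after a single label changes. The identity itself, as a self-contained check:

import numpy as np
import numpy.linalg as nlg

n = 6
A = np.eye(n) + 0.1 * np.random.rand(n, n)
u = np.random.rand(n, 1)
v = np.random.rand(n, 1)
Ai = nlg.inv(A)
# Sherman-Morrison: (A + u v^T)^-1 = A^-1 - (A^-1 u v^T A^-1) / (1 + v^T A^-1 u)
updated = Ai - Ai.dot(u).dot(v.T).dot(Ai) / (1.0 + v.T.dot(Ai).dot(u))
print(np.allclose(updated, nlg.inv(A + u.dot(v.T))))  # True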
Example #8
def test_interface2 ():
	verbose = True
	nac = np.allclose
	#ts_data = ef.load_timestamps (tsfile)
	Xfull = load_sparse_csr('Xfull1.npz')
	print(Xfull.shape)
	# Xfull = Xfull[np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=1))[0])),:]
	# Xfull = Xfull[:,np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=0))[1]))]
	# r,n = Xfull.shape

	# print Xfull.shape
	# Xfull = Xfull[np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=1))[0])),:]
	# Xfull = Xfull[:,np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=0))[1]))]

	# getting rid of features which are zero for all these elements
	n = 300
	r = 600
	X = Xfull[:,:n]

	X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])),:]
	X = X[:,np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]

	X = X[:r,:]
	X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])),:]
	X = X[:,np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]
	print(X.shape)
	#X = np.load('X11.npy')
	r,n = X.shape
	
	nt = int(0.05*n)
	num_eval = 50
	Y = np.array([1]*nt + [0]*(n-nt), dtype=int)

	pi = float(sum(Y))/len(Y)
	init_pt = 5

	# import IPython 
	# IPython.embed()

	A = np.array((X.T.dot(X)).todense())
	t1 = time.time()

	prms = ASI.Parameters(pi=pi,sparse=True, verbose=verbose)	
	kAS = ASI.kernelAS(prms)
	kAS.initialize(X)
	sAS = ASI.naiveShariAS(prms)
	sAS.initialize(A)
	
	import IPython
	IPython.embed()

	kAS.firstMessage(init_pt)
	fs2 = [kAS.f]
	sAS.firstMessage(init_pt)
	fs3 = [sAS.f]

	import IPython
	IPython.embed()

	for i in range(num_eval):
		idx1 = kAS.getNextMessage()
		idx2 = sAS.getNextMessage()
		print('NEXT')
		print(idx1==idx2)
		print(nac(kAS.f, sAS.f))
		# import IPython
		# IPython.embed()

		kAS.setLabelCurrent(Y[idx1])
		sAS.setLabelCurrent(Y[idx2])
		fs2.append(kAS.f)
		fs3.append(sAS.f)

	t2 = time.time()

	# f1,h1,s1,fs1,dtinv1 = AS.kernel_AS (Xfull, Y, pi=pi, num_eval=num_eval, init_pt=init_pt, verbose=verbose,all_fs=True,tinv=True,sparse=True)

	t3 = time.time()

	# checks = [np.allclose(fs1[i],fs2[i]) for i in range(len(fs1))]

	import IPython
	IPython.embed()
Example #9
    message_count = dataConn.connect(args.JSON_path)
else:
    dataConn = mysql_conn.mysqlDataConnect()
    message_count = dataConn.connect(args.database, args.database_hostname,
                                     args.database_user, args.database_pass)

activeSearch = None

# when firstMessage is called we reinitialize the kernel algorithm. However, calling
# initialize again requires us to invert C, so we could be smarter and cache that
# inverse; for now the inversion only takes a couple of seconds, so we leave caching as future work
restart_save = None
first_run = True
if (args.method == "kernel"):
    print "Using kernelAS"
    activeSearch = asI.kernelAS()
    wMat = dataConn.getFinalFeatureMatrix(args.wordlimit, args.skip_stemmer,
                                          args.num_cpus, message_count,
                                          args.out_to_database,
                                          args.in_from_database, 0, 0)
    restart_save = wMat.copy()
    activeSearch.initialize(wMat)
elif (args.method == "shari"):
    print "Using shariAS"
    activeSearch = asI.shariAS()
    A = dataConn.getAffinityMatrix(args.wordlimit, args.skip_stemmer,
                                   args.num_cpus, message_count,
                                   args.out_to_database, args.in_from_database,
                                   0, 0)
    # Feeding in the dense version to shari's code because the sparse version is not implemented
    activeSearch.initialize(np.array(A.todense()))
Example #10
    def startAS(self,corpus,labeled_corpus=[],labels=[],starting_points=[]):
        """
        corpus --> list of tuples (id,text) where id is an external id
        """
        num_labels = len(labels)
        if num_labels != len(labeled_corpus):
            raise Exception ("Number of labels and number of previously labeled objects do not match")
        if num_labels > 0:
            self.prev_corpus.extend(labeled_corpus)
            self.prev_labels.extend(labels)
            
        #initialise with previous information
        self.start_idx=len(self.prev_labels)    

        #get map from external id to internal index for the new corpus 
        self.id_to_idx={}#maps external id (e.g. AdId) to internal index
        for i,el in enumerate(corpus):
            self.id_to_idx[el[0]]=i+self.start_idx #do not include indices pointing to already labeled objects from previous AS
        self.curr_corpus=corpus
        self.num_messages=len(corpus)
        self.unlabeled_idxs=set(xrange(self.start_idx,self.num_messages))
        self.hashlookup={}
        if self.dedupe:
            #calculate all minhash values
            self.hashed=[self.hashing(tup[1].lower()) for i,tup in enumerate(corpus)]#minhash
            #for now, save collisions in a dictionary. Replace with locality sensitive hashing later
            for i,h in enumerate(self.hashed):
                if h in self.hashlookup:
                    self.hashlookup[h].append(i)
                else:
                    self.hashlookup[h]=[i]
        text = [x[1] for x in self.prev_corpus] + [y[1] for y in corpus]
        
        #save text so that restart is possible
        self.text=text
        #featurize
        ngram_range=(500,0)#sentinel (min,max); tightened by the loop below
        if len(self.extendedVocabulary)==0:
            ngram_range=(1,1)
        for x in self.extendedVocabulary:
            l=len(x.split())
            ngram_range=(min((l,ngram_range[0])),max((l,ngram_range[1])))
        if self.vocab is None:
            vocabulary = self.getVocabulary(text,extendedVoc=list(self.extendedVocabulary))
        else:
            vocabulary = self.vocab+list(self.extendedVocabulary)
        if self.tfidf:
            self.setTfidf(vocab=vocabulary,ngram_range=ngram_range)
        else:
            self.setCountVectorizer(vocab=vocabulary,ngram_range=ngram_range)
        self.Xsparse=self.vectorizer.fit_transform(text)
        
        
        #add column with ones for empty rows 
        a = self.Xsparse.dot(np.ones(self.Xsparse.shape[1]))#this works because features are non-negative
        anonz=a.nonzero()[0]
        if anonz.shape[0] != self.Xsparse.shape[0]:#matrix contains empty rows
            b=np.ones(self.Xsparse.shape[0])
            b[anonz]=0
            self.Xsparse=sparse.hstack((self.Xsparse,sparse.csr_matrix(b).T))
        
        if self.dimred:
            print(self.Xsparse.shape)
            svd=TruncatedSVD(n_components=self.n_components)
            X=svd.fit_transform(self.Xsparse)
            print("dimensionalty reduction leads to explained variance ratio sum of "+str(svd.explained_variance_ratio_.sum()))
            self.sparse=False
            #b=np.array([len(x) for x in text,ndmin=2).transpose()
            #X=np.hstack((X,b))
        else:
            #b=np.array([len(x) for x in text,ndmin=2).transpose()
            #self.Xsparse=sparse.hstack((X,b))
            X=self.Xsparse

        #get scale
        #estimate pairwise distances through random sampling
        #pairwise_dists = squareform(pdist(X[np.random.choice(X.shape[0], 1000, replace=False),:], 'euclidean'))
        #self.scalefactor = np.mean(pairwise_dists)
        
        params=asI.Parameters(pi=self.prevalence,verbose=False,sparse=self.sparse,eta=self.eta)
        self.activeSearch = asI.kernelAS(params=params) ##fast
        
        if len(starting_points)==0:
            if len(self.prev_labels)==0:
                raise Exception ("No start point and no labels provided")
        init_labels = {key: value for key, value in enumerate(self.prev_labels)}
        for x in starting_points:
            idx=self.id_to_idx[x]
            self.unlabeled_idxs.remove(idx)
            init_labels[idx]=1
        self.activeSearch.initialize(X.transpose(),init_labels = init_labels)
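
A minimal usage sketch for startAS, assuming a configured instance named searcher (hypothetical; the constructor and its options are not shown in these snippets). It illustrates the (id, text) corpus format from the docstring:

corpus = [("ad-101", "wire transfer to escrow"),
          ("ad-102", "meet at the corner"),
          ("ad-103", "escrow account opened")]
searcher.startAS(corpus, starting_points=["ad-101"])  # seed with one known positive id
# later, fold new analyst phrases into the features and restart (see Example #1):
searcher.extendAS(ext_vocab=["wire transfer"])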