-
Notifications
You must be signed in to change notification settings - Fork 0
/
spectralFeatureAlignmentSparse.py
149 lines (134 loc) · 8.67 KB
/
spectralFeatureAlignmentSparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding:utf-8 -*-
from sqlite3 import dbapi2 as sqlite
from os import path
from sys import argv
import numpy as np
from scipy.sparse.linalg import eigsh
from scipy import sparse
from sklearn.svm.sparse import LinearSVC
from bisect import bisect_left
class SpectralFeatureAlignment():
    """Cross-domain sentiment classification via Spectral Feature Alignment.

    Pipeline (driven by :meth:`go`): split vocabulary into domain-independent
    and domain-dependent features, build their bipartite co-occurrence graph,
    spectrally embed it (top-K eigenvectors of the symmetrically normalized
    affinity matrix), then augment sparse bag-of-words document vectors with
    the embedded ("aligned") domain-dependent features before training a
    linear SVM on the source domain and scoring it on the target domain.

    NOTE(review): this is Python 2 code (print statements, ``str.decode`` on
    raw file lines) and imports ``sklearn.svm.sparse``, a module removed from
    scikit-learn in 0.12 -- a port is required for modern environments.
    """

    def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
        # dbDir: directory containing a SQLite database file named after the
        #   source domain, with tables "<source>to<target>" and "mostinformatives".
        # rawDataFolder: root of per-domain "positive.review"/"negative.review"
        #   files ("feature:count ... #label#:polarity" lines, one review each).
        self._dbDir = dbDir
        self._sourceDomain = sourceDomain
        self._rawDataFolder = rawDataFolder
        self._targetDomain = targetDomain
        # Per-domain-pair term-frequency table, e.g. "bookstodvd".
        self._tableName = sourceDomain + "to" + targetDomain
        self._connection = sqlite.connect(path.join(dbDir,sourceDomain))
        self._cursor = self._connection.cursor()
        # Large C: effectively hard-margin linear SVM on the sparse vectors.
        self._lsvc = LinearSVC(C=10000)

    def _getFeatures(self, maxDIFeatures=500, minFrequency=5):
        """Split the vocabulary into (domain-independent, domain-dependent) features.

        Terms occurring at least ``minFrequency`` times across both domains are
        kept; terms in the middle band of the "mostinformatives" ranking are
        excluded. The first ``maxDIFeatures`` survivors are treated as
        domain-independent, the rest as domain-dependent. Both lists are
        returned sorted so ``bisect_left`` can be used as a feature index later.
        """
        features = []
        self._cursor.execute("SELECT term FROM " +self._tableName+ " WHERE freqSource + freqTarget >= ?", [minFrequency])
        features = [a[0] for a in self._cursor.fetchall()]
        self._cursor.execute("SELECT term FROM mostinformatives")
        # Drop the 30000 head and 30000 tail entries of the ranking; the
        # remaining middle band is filtered OUT of the feature set below.
        mostInformatives = set([a[0] for a in self._cursor.fetchall()][30000:-30000])
        features = [feature for feature in features if feature not in mostInformatives]
        return sorted(features[:maxDIFeatures]), sorted(features[maxDIFeatures:])

    def _createCooccurrenceMatrix(self, domainIndependentFeatures, domainDependentFeatures):
        """Count, over all four review files (both domains, both polarities),
        how often each domain-dependent feature co-occurs in a review with
        each domain-independent feature.

        Both feature lists must be sorted (``bisect_left`` is used as the
        row/column index lookup). Returns a sparse COO matrix of shape
        (len(domainDependentFeatures), len(domainIndependentFeatures)).
        """
        domainIndependentFeaturesSet = set(domainIndependentFeatures)
        domainDependentFeaturesSet = set(domainDependentFeatures)
        def __parseFile(filePath):
            # Accumulates counts into the enclosing `matrix` (closure).
            with open(filePath, "r") as f:
                for review in f:
                    # Tokens are "feature:count"; only the feature name matters here.
                    reviewFeatures = set([tupel.split(":")[0].decode("utf-8") for tupel in review.split()])
                    independentFeatures = reviewFeatures & domainIndependentFeaturesSet
                    dependentFeatures = reviewFeatures & domainDependentFeaturesSet
                    for dependentFeature in dependentFeatures:
                        rowIndex = bisect_left(domainDependentFeatures,dependentFeature)
                        for independentFeature in independentFeatures:
                            matrix[rowIndex, bisect_left(domainIndependentFeatures,independentFeature)] += 1
        # Dense accumulator; converted to sparse COO on return.
        matrix = np.zeros((len(domainDependentFeatures), len(domainIndependentFeatures)))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "negative.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "negative.review"))
        return sparse.coo_matrix(matrix)

    def _createSquareAffinityMatrix(self, cooccurrenceMatrix):
        """Embed the (dependent x independent) co-occurrence matrix M into the
        symmetric bipartite affinity matrix  A = [[0, M], [M^T, 0]].
        """
        height = np.size(cooccurrenceMatrix, 0)
        width = np.size(cooccurrenceMatrix, 1)
        topMatrix = sparse.coo_matrix((height,height))
        topMatrix = sparse.hstack((topMatrix,cooccurrenceMatrix))
        bottomMatrix = sparse.coo_matrix((width,width))
        bottomMatrix = sparse.hstack((cooccurrenceMatrix.transpose(), bottomMatrix))
        matrix = sparse.vstack((topMatrix, bottomMatrix))
        return matrix

    def _createDiagonalMatrix(self, squareAffinityMatrix):
        """Return the diagonal matrix D^(-1/2), where D holds the row sums of
        the affinity matrix (rows summing to zero map to 0, avoiding a
        division by zero). Used to form the normalized affinity D^-1/2 A D^-1/2.
        """
        rows = range(squareAffinityMatrix.get_shape()[0])
        data = [0. if rowSum == 0 else np.sqrt(1.0 / rowSum) for rowSum in np.array(squareAffinityMatrix.sum(1)).reshape(-1,)]
        return sparse.coo_matrix((data,(rows,rows)),shape=(squareAffinityMatrix.get_shape()[0],squareAffinityMatrix.get_shape()[1]))

    def _createDocumentVectors(self,domainDependentFeatures, domainIndependentFeatures, domain):
        """Build one (independent, dependent) pair of sparse binary 1-row
        vectors per review in the given domain, plus its +-1 label.

        Feature presence is recorded as 1 regardless of the stored count
        (the commented-out lines show the count-weighted alternative).
        Returns (documentVectors, classifications).
        """
        numDomainDep = len(domainDependentFeatures)
        numDomainIndep = len(domainIndependentFeatures)
        domainDepSet = set(domainDependentFeatures)
        domainIndepSet = set(domainIndependentFeatures)
        documentVectors = []
        classifications = []
        def __parseFile(filePath):
            # Appends to the enclosing documentVectors/classifications lists.
            with open(filePath,"r") as f:
                for review in f:
                    # The "#label#:positive|negative" token encodes the gold label.
                    classification = 1 if "#label#:positive" in review else -1
                    reviewList = [tupel.split(":") for tupel in review.split() if "#label#" not in tupel]
                    reviewDict = {x[0].decode("utf-8"):int(x[1]) for x in reviewList}
                    reviewFeatures = set(reviewDict.keys())
                    domainDepReviewFeatures = domainDepSet & reviewFeatures
                    domainIndepReviewFeatures = domainIndepSet & reviewFeatures
                    domainDepValues,domainDepIndizes = [],[]
                    domainIndepValues, domainIndepIndizes = [],[]
                    for feature in domainIndepReviewFeatures:
                        #domainIndepValues.append(reviewDict[feature])
                        domainIndepValues.append(1)
                        domainIndepIndizes.append(bisect_left(domainIndependentFeatures,feature))
                    for feature in domainDepReviewFeatures:
                        #domainDepValues.append(reviewDict[feature])
                        domainDepValues.append(1)
                        domainDepIndizes.append(bisect_left(domainDependentFeatures,feature))
                    domainIndepVector = sparse.csr_matrix((domainIndepValues,(np.zeros(len(domainIndepIndizes)),domainIndepIndizes)),shape=(1,numDomainIndep))
                    domainDepVector = sparse.csr_matrix((domainDepValues,(np.zeros(len(domainDepIndizes)),domainDepIndizes)),shape=(1,numDomainDep))
                    documentVectors.append((domainIndepVector,domainDepVector))
                    classifications.append(classification)
        __parseFile(path.join(self._rawDataFolder, domain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, domain, "negative.review"))
        return documentVectors,classifications

    def _trainClassifier(self, trainingVectors, classifications):
        # Fit the linear SVM on the vertically stacked sparse document vectors.
        self._lsvc.fit(sparse.vstack(trainingVectors),classifications)

    def _testClassifier(self,testVectors,classifications):
        # Mean accuracy of the fitted SVM on the stacked test vectors.
        return self._lsvc.score(sparse.vstack(testVectors),classifications)

    def go(self,K=100, Y=6, DI=500, minFreq=5):
        """Run the full SFA pipeline and print target-domain accuracy.

        K: number of eigenvectors (aligned clusters) kept.
        Y: scaling factor applied to the aligned-cluster features.
        DI: number of domain-independent features.
        minFreq: minimum combined term frequency for a feature.
        """
        print self._sourceDomain + " -> " + self._targetDomain
        domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI,minFreq)
        numDomainIndep = len(domainIndependentFeatures)
        numDomainDep = len(domainDependentFeatures)
        #print "number of independent features %i, number of dependent features %i" % (numDomainIndep, numDomainDep)
        #print "creating cooccurrenceMatrix..."
        a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
        #print "creating SquareAffinityMatrix..."
        a = self._createSquareAffinityMatrix(a)
        #print "creating DiagonalMatrix..."
        b = self._createDiagonalMatrix(a)
        #print "multiplying..."
        # Normalized affinity: c = D^-1/2 * A * D^-1/2; intermediates freed
        # eagerly (del) to keep peak memory down on large vocabularies.
        c = b.dot(a)
        del a
        c = c.dot(b)
        del b
        #print "calculating eigenvalues and eigenvectors"
        # Top-K eigenpairs by largest algebraic eigenvalue ("LA").
        eigenValues,eigenVectors = eigsh(c, k=K, which="LA")
        del c
        #print "building document vectors..."
        documentVectorsTraining,classificationsTraining = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._sourceDomain)
        documentVectorsTesting,classificationsTesting = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._targetDomain)
        #print "training and testing..."
        # Columns of U = eigenvectors sorted by descending eigenvalue, then
        # truncated to the rows belonging to domain-dependent features (they
        # occupy the first numDomainDep rows of the bipartite affinity matrix).
        U = [eigenVectors[:,x].reshape(np.size(eigenVectors,0),1) for x in eigenValues.argsort()[::-1]]
        U = np.concatenate(U,axis=1)[:numDomainDep]
        U = sparse.csr_matrix(U)
        # Project each review's domain-dependent vector into cluster space and
        # scale by Y (NOTE(review): .dot(Y) with scalar Y relies on legacy
        # numpy/scipy behaviour equivalent to elementwise multiplication).
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTraining]
        # Final feature vector = [independent | dependent | aligned clusters].
        trainingVectors = [sparse.hstack((documentVectorsTraining[x][0],documentVectorsTraining[x][1],clustering[x])) for x in range(np.size(documentVectorsTraining,axis=0))]
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTesting]
        testVectors = [sparse.hstack((documentVectorsTesting[x][0],documentVectorsTesting[x][1],clustering[x])) for x in range(np.size(documentVectorsTesting,axis=0))]
        self._trainClassifier(trainingVectors, classificationsTraining)
        print "accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i" % (self._testClassifier(testVectors,classificationsTesting)*100,K,DI,Y,minFreq)
if __name__ == "__main__":
    # CLI entry point: spectralFeatureAlignmentSparse.py <sourceDomain> <targetDomain>
    # Fail with a usage message instead of a raw IndexError when arguments
    # are missing.
    if len(argv) < 3:
        raise SystemExit("usage: %s <sourceDomain> <targetDomain>" % argv[0])
    source = argv[1]
    target = argv[2]
    # NOTE(review): data locations are hard-coded and machine-specific;
    # consider taking them as optional CLI arguments or environment variables.
    sfa = SpectralFeatureAlignment("/home/raphael/BachelorThesis/DataBig",
                                   "/home/raphael/BachelorThesis/Data/processed_acl",
                                   source, target)
    sfa.go()