Ejemplo n.º 1
0
	def __init__(self):
		"""
		Create the helper objects and read the bag-of-words switches from
		the configuration file.
		"""
		config = ConfigFile()

		# Data locations. Only the writing-data path is consumed below; the
		# other two entries are still looked up, as before.
		vocabPath = config.GetConfig("VOCAB")
		senDataPath = config.GetConfig("SENSDATA")
		wrtDataPath = config.GetConfig("WRITINGDATA")

		# Writing data is read right away; sentence data and vocabulary
		# start out unset.
		self.__wd = WritingData()
		self.__sd = None
		self.__vocab = None
		self.__wd.Read(wrtDataPath)

		self.__ptStm = PorterStemmer()

		# A BOG_* switch is on only when its config value is exactly 'TRUE'
		self.__isUseStemmer = (config.GetConfig("BOG_STEMMER") == 'TRUE')
		self.__isToLower = (config.GetConfig("BOG_LOWER") == 'TRUE')
		self.__isAlphaNumOnly = (config.GetConfig("BOG_ALPHANUMONLY") == 'TRUE')

		# The stop list is read only when stop-word removal is switched on
		self.__stopWordsVocab = None
		self.__rmStopWords = (config.GetConfig("BOG_RMSTOPWORDS") == 'TRUE')
		if self.__rmStopWords:
			self.__stopWordsVocab = Vocab()
			stopListPath = config.GetConfig("STOPLIST")
			self.__stopWordsVocab.Read(stopListPath)

		self.__minFreq = int(config.GetConfig("BOG_MINFREQ"))
Ejemplo n.º 2
0
class BogBuilder:
	
	def __init__(self):
		"""
		Create the helper objects and read the bag-of-words switches from
		the configuration file.
		"""
		config = ConfigFile()

		# Data locations. Only the writing-data path is consumed below; the
		# other two entries are still looked up, as before.
		vocabPath = config.GetConfig("VOCAB")
		senDataPath = config.GetConfig("SENSDATA")
		wrtDataPath = config.GetConfig("WRITINGDATA")

		# Writing data is read right away; sentence data and vocabulary
		# start out unset.
		self.__wd = WritingData()
		self.__sd = None
		self.__vocab = None
		self.__wd.Read(wrtDataPath)

		self.__ptStm = PorterStemmer()

		# A BOG_* switch is on only when its config value is exactly 'TRUE'
		self.__isUseStemmer = (config.GetConfig("BOG_STEMMER") == 'TRUE')
		self.__isToLower = (config.GetConfig("BOG_LOWER") == 'TRUE')
		self.__isAlphaNumOnly = (config.GetConfig("BOG_ALPHANUMONLY") == 'TRUE')

		# The stop list is read only when stop-word removal is switched on
		self.__stopWordsVocab = None
		self.__rmStopWords = (config.GetConfig("BOG_RMSTOPWORDS") == 'TRUE')
		if self.__rmStopWords:
			self.__stopWordsVocab = Vocab()
			stopListPath = config.GetConfig("STOPLIST")
			self.__stopWordsVocab.Read(stopListPath)

		self.__minFreq = int(config.GetConfig("BOG_MINFREQ"))

	def __getConfigStr(self):
		"""
		Get the config string such as M10_L_STM
		"""

		cfgStr = "M%d" % self.__minFreq

		if self.__isToLower:
			cfgStr += "_L"

		if self.__isUseStemmer:
			cfgStr += "_STM"

		if self.__isAlphaNumOnly:
			cfgStr += "_ALPHANUM"

		if self.__rmStopWords:
			cfgStr += "_RMSTP"

		return cfgStr

	def GetConfigStr(self):
		"""
		Public method to get the config string according to current setting
		"""
		return self.__getConfigStr()

	def SetConfigByStr(self, cfgStr):
		"""
		Set the configuration by string, such as 7nat_5K_M10_L_STM.arff etc
		"""
		import re
		
		# build a set of rex
		pat = r'(_|^)M(?P<MINFREQ>\d{1,2})[_\.]'
		rexMF = re.compile(pat)

		pat = r'_L([_\.]|$)'
		rexLower = re.compile(pat)

		pat = r'_STM([_\.]|$)'
		rexStm = re.compile(pat)
		
		pat = r'_ALPHANUM([_\.]|$)'
		rexAlphaNum = re.compile(pat)

		pat = r'_RMSTP([_\.]|$)'
		rexRmstp = re.compile(pat)

		
		# Start set config values
		m = rexMF.search(cfgStr)
		if m is None:
			print cfgStr	
		self.__minFreq = int(m.group('MINFREQ'))

		m = rexStm.search(cfgStr)
		if not m is None:
			self.__isUseStemmer = True
		else:
			self.__isUseStemmer = False

		m = rexLower.search(cfgStr)
		if not m is None:
			self.__isToLower = True
		else:
			self.__isToLower = False

		m = rexAlphaNum.search(cfgStr)
		if not m is None:
			self.__isAlphaNumOnly = True
		else:
			self.__isAlphaNumOnly = False

		m = rexRmstp.search(cfgStr)
		self.__stopWordsVocab = None
		if not m is None:
			self.__rmStopWords = True
			self.__stopWordsVocab = Vocab()
			cf = ConfigFile()
			stopList = cf.GetConfig("STOPLIST")
			self.__stopWordsVocab.Read(stopList)
		else:
			self.__rmStopWords = False


	def _getCvSplitRange(self, nFold, nData):
		# TODO
		assert(nFold <= nData)
		
		idxArray = []

		step = int(nData / nFold)
		startIdx = 0
		endIdx = 1

		for iFold in range(nFold - 1):
			endIdx = startIdx + step
			idxArray.append((startIdx, endIdx))
			startIdx = endIdx

		# for last fold
		idxArray.append((startIdx, nData))

		assert(len(idxArray) == nData)

		return idxArray
			
	
	def SplitCrossValidationData(self, oriArffFile):
		"""
		Split an ARFF file into train / test files for n-fold cross validation.
		Simple version: folds are consecutive ranges of the data lines, in
		file order.

		The number of folds is the CVFOLD config entry.  For every fold i
		two files are written:

		  <oriArffFile>_fold_<i>_trn.arff  header + data lines of all other folds
		  <oriArffFile>_fold_<i>_tst.arff  header + data lines of fold i
		"""

		cf = ConfigFile()
		nFold = int(cf.GetConfig("CVFOLD"))

		# analyse Arff file
		ap = ArffParser()
		ap.Parse(oriArffFile)
		headerStr = ap.GetHeader()
		dataLines = ap.GetDataLines()
		nData = len(dataLines)

		# one (startIdx, endIdx) pair per fold
		cvRange = self._getCvSplitRange(nFold, nData)

		# NOTE(review): data lines are written exactly as the parser returned
		# them; this assumes each one keeps its line terminator -- confirm.
		for iFold in range(nFold):
			print("Generate fold %d" % iFold)
			fnTrn = oriArffFile + "_fold_%d_trn.arff" % iFold
			fnTst = oriArffFile + "_fold_%d_tst.arff" % iFold

			print("Start writing traning file: " + fnTrn)
			with open(fnTrn, 'w') as fwTrn:
				fwTrn.write(headerStr)
				for iPart in range(nFold):
					if iPart == iFold:
						# the held-out fold goes to the testing file only
						continue
					(sIdx, eIdx) = cvRange[iPart]
					for idx in range(sIdx, eIdx):
						fwTrn.write(dataLines[idx])

			print("Start writing testing file: " + fnTst)
			with open(fnTst, 'w') as fwTst:
				fwTst.write(headerStr)
				(sIdx, eIdx) = cvRange[iFold]
				for idx in range(sIdx, eIdx):
					fwTst.write(dataLines[idx])

	def isAllNonASCII(self, string):
		return all(ord(c) >= 128 for c in string)

	def ProcessToken(self, tok):
		"""
		Process token, according to the configuration setting
		i.e. lower? stemmer? etc.
		"""

		tok = tok.strip()
		if tok == '':
			return None

		if self.isAllNonASCII(tok):
			#print tok
			return None

		# substitution some char
		
		if tok.find("'") != -1:
			tok = "'%s'" % tok.replace("'", r"\'")

		if tok.find('%') != -1:
			tok = "'%s'" % tok.replace("%", r"\%")

		if self.__rmStopWords:
			if self.__stopWordsVocab.IsVocabWord(tok):
				#print("REMOVED: " + tok)
				return None

		if self.__isToLower:
			tok = tok.lower()

		if self.__isUseStemmer:
			tok = self.__ptStm.stem(tok, 0, len(tok) - 1)

		if self.__isAlphaNumOnly:
			if not tok.isalnum():
				return None

		return tok


	def GetBog(self, sensDataPath, outBogPath):
		"""
		Generate weka format arff (sparse data rows)
		Will scan the sentence on the run!

		sensDataPath -- sentence data file; read here when no sentence data
		                is already attached to this object
		outBogPath   -- path of the ARFF file to write

		Works in two passes over the sentences:

		  1. count every processed token (see ProcessToken) to build the
		     vocabulary, then drop words rarer than the minimum frequency
		  2. write one sparse instance per sentence: attribute 0 is the
		     class (lower-cased Nationality of the writing), followed by a
		     1 for each vocabulary word found in the sentence

		Writings whose nationality is missing or empty are skipped in both
		passes, so every class value written in a data row is also declared
		in the header.  Sentences holding no vocabulary word are not
		written.
		"""
		import operator

		#Some bog setting
		lg = Log()
		msg = "BogBuilder.GetBog(\n%s ->\n %s" % (sensDataPath, outBogPath)
		print(msg)
		lg.WriteLog(msg)

		wd = self.__wd
		if wd is None:
			# read the writing data from the same config entry the
			# constructor uses
			wd = WritingData()
			wd.Read(ConfigFile().GetConfig("WRITINGDATA"))

		sd = self.__sd
		if sd is None:
			sd = SensData()
			sd.Read(sensDataPath)

		sensDict = sd.GetSensDict()
		classColStr = "Nationality"

		# 1st pass
		# Building vocabulary
		# Resolve the class of every writing once and count the tokens of
		# all its sentences.

		vocab = {}
		classOfWrt = {}	# wrtId -> lower-cased class value
		for wrtId in sensDict.keys():
			classVal = wd.GetValueByWid(wrtId, classColStr) # classId, i.e. nationality
			if classVal is None:
				continue
			classVal = classVal.lower()
			if classVal == "":
				continue
			classOfWrt[wrtId] = classVal

			for sen in sensDict[wrtId]:
				for tok in sen.split(' '):
					tok = self.ProcessToken(tok)
					if tok is None or tok == '':
						continue
					vocab[tok] = vocab.get(tok, 0) + 1

		relString = "BogBuilder -stmmer:%r -lower:%r -minFreq:%d -alphanumonly:%r" % \
							  (self.__isUseStemmer, self.__isToLower, \
								self.__minFreq, self.__isAlphaNumOnly)

		msg = "[BogBuilder] " + "First pass vocab scan, #vocab = %d" % len(vocab)
		print(msg)
		print(relString)
		lg.WriteLog("BOG setting: " + relString)
		lg.WriteLog(msg)

		# impose min word frequency cut-off; a new dict is built instead of
		# deleting entries from the one being iterated
		if self.__minFreq > 1:
			vocab = dict((word, freq) for (word, freq) in vocab.items()
						 if freq >= self.__minFreq)

		msg = "[BogBuilder] " + "Applied minimum frequency cut-off (#attr) #vocab = %d" % len(vocab)
		print(msg)
		lg.WriteLog(msg)

		# most frequent word first; its position is the attribute index
		sortedVocab = sorted(vocab.items(), key = operator.itemgetter(1), reverse = True)
		bogAttrIdx = dict((word, idx) for (idx, (word, freq)) in enumerate(sortedVocab))

		# 2nd Pass
		# Start building BOG!

		# Start writing ARFF file
		ab = ArffBuilder()
		ab.StartWriting(outBogPath)
		ab.WriteRelation("text_file") # TODO!

		# Class label attribute: nominal set of all classes seen in pass 1,
		# sorted so the header is reproducible
		strNatClasses = "{" + ",".join(sorted(set(classOfWrt.values()))) + "}"
		ab.AddAttr("CLS::" + classColStr, strNatClasses)

		# add other bog attributes
		feType = '{0,1}' # should be nominal to save time
		for (word, freq) in sortedVocab:
			ab.AddAttr(word, feType)

		ab.WriteAttr()

		# OK Finally, start writing data!

		bogAttrIdxOffset = 1	# attribute 0 is the class label
		nSen = 0
		for wrtId in sensDict.keys():
			classVal = classOfWrt.get(wrtId)
			if classVal is None:
				continue

			for sen in sensDict[wrtId]:
				# collect the distinct attribute indices present in the sentence
				attrUnsort = set()
				for tok in sen.split(' '):
					tok = self.ProcessToken(tok)
					if tok in bogAttrIdx:
						attrUnsort.add(bogAttrIdx[tok] + bogAttrIdxOffset)

				if not attrUnsort:
					continue

				# sparse row: class first, then the indices in ascending order
				attrList = ['%d %s' % (0, classVal)]
				for attrIdx in sorted(attrUnsort):
					attrList.append('%d %d' % (attrIdx, 1))

				ab.AddDataSparse(attrList)
				nSen += 1

		#Done! write summary
		msg = "Done! Number of instance wrote: %d" % nSen
		print(msg)
		lg.WriteLog(msg)
			
	
	def GenerateBog_Bef(self,arffOut):
		"""
		Generate bog arff file, before filter
		like ...

		"some text ...", classid
		"some text ...", classid
		"some text ...", classid
		"some text ...", classid

		arffOut -- path of the ARFF file to write

		Each data row holds one sentence in double quotes and the
		lower-cased, stripped nationality of the writing it belongs to.
		Sentences whose writing has no nationality, or a blank one, are
		skipped.
		"""

		wd = self.__wd

		sd = self.__sd
		if sd is None:
			# The constructor leaves the sentence data unset, so read it
			# here from the SENSDATA config entry.
			sd = SensData()
			sd.Read(ConfigFile().GetConfig("SENSDATA"))
		sd.ResetWritingIter()

		# Write arff header part
		ab = ArffBuilder()
		ab.StartWriting(arffOut)
		ab.AddAttr("text", "String")
		# TODO this is hack! The class values are hard-coded instead of
		# being derived from the writing data.
		ab.AddAttr("nationality", "{fr,cn,mx,it,ru,de,br}")

		ab.WriteRelation("text_files")
		ab.WriteAttr()

		# Write arff data part
		sens = sd.GetAllSentencesWrtId()

		for (wrtId, sen) in sens:
			nat = wd.GetValueByWid(wrtId, "Nationality")
			if nat is None:
				continue

			nat = nat.lower().strip()
			if nat == "":
				continue

			# NOTE(review): the sentence is quoted without escaping; a
			# sentence that itself contains a double quote may need
			# escaping unless AddData takes care of it -- confirm.
			attrList = ['"%s"' % sen, nat]
			ab.AddData(attrList)