Example no. 1
0
def load_instance(filepath):
	"""Load one labeled Instance per file found under *filepath*.

	The label is inferred from the directory name: *filepath* is expected
	to end with 'neg/' (negative reviews) or 'pos/' (positive reviews),
	i.e. the three characters at filepath[-4:-1] decide the label.

	Parameters:
		filepath: directory path, including the trailing path separator,
			containing one document per file.

	Returns:
		A list of Instance(filename, label, data, tokens) objects, where
		`tokens` are the \\w+ tokens of the file and `data` is the
		duplicate-free token list (Bernoulli-style features).

	Raises:
		Exception: if *filepath* does not end in 'neg/' or 'pos/'.
	"""
	# Local import: portable directory listing replaces the original
	# IPython-only `get_ipython().getoutput('ls ...')` shell hack.
	import os

	# Decide the label once, up front (the original re-checked per file).
	suffix = filepath[-4:-1]
	if suffix == 'neg':
		label = 'negative'
	elif suffix == 'pos':
		label = 'positive'
	else:
		# Python 3-compatible raise (original used `raise Exception, "..."`).
		raise Exception("Wrong path!")

	ins_list = []
	for filename in os.listdir(filepath):
		# `with` guarantees every file is closed; the original closed only
		# the last handle, leaking all the others.
		with open(filepath + filename, 'r') as f:
			tokens = []
			for line in f:
				tokens += nltk.regexp_tokenize(line, pattern=r"\w+")
		# Remove duplicate tokens so `data` holds binary-presence features.
		data = util.del_dup(tokens)
		ins_list.append(Instance(filename, label, data, tokens))
	return ins_list
Example no. 2
0
	def _collect_counts(self, instance_list):
		"""Collect feature/label co-occurrence and label counts from the dataset.

		Populates two count tables used to estimate the model parameters:
		- self.count_table: array of shape
		  (feature_codebook.size(), label_codebook.size()) holding, for each
		  (feature, label) pair, the number of instances of that label in
		  which the feature occurs. Initialized to ones for add-one
		  (Laplace) smoothing.
		- self.count_y_table: array of shape (label_codebook.size(),)
		  holding how many instances carry each label.

		Args:
			instance_list: iterable of instances, each exposing .raw_data
				(a token list) and .label.
				NOTE(review): .raw_data is mutated in place (deduplicated)
				— confirm no caller relies on the original duplicates.
		"""
		# Start every (feature, label) cell at 1 → add-one smoothing.
		self.count_table = numpy.ones((self.feature_codebook.size(),self.label_codebook.size()))		
		self.count_y_table = numpy.zeros(self.label_codebook.size())
		# Accumulate counts over every instance in the dataset.
		for i in instance_list:
			# Deduplicate tokens so each document contributes at most one
			# count per feature — the Bernoulli event model.
			i.raw_data = util.del_dup(i.raw_data)
			self.count_y_table[self.label_codebook.get_index(i.label)] +=1
			for token in i.raw_data:
				# Only tokens known to the feature codebook are counted.
				if self.feature_codebook.has_label(token):							
					self.count_table[self.feature_codebook.get_index(token),self.label_codebook.get_index(i.label)]+=1