def index(self):
		"""Build the word index from the pickled lemmatiser output.

		Every file named in ``self.inputfiles`` is unpickled from
		``<cwd>/data/lemmatiser_output`` and merged into one mapping with
		``dict.update`` (on duplicate article ids the file loaded last wins).
		Each article is then walked line by line, word by word; empty lines
		and empty words are skipped.

		Results stored on the instance:
			self.wordIndex    -- {word: {articleid: occurrences}}
			self.articleIndex -- {articleid: number of words counted}

		Progress and statistics are printed, and the same figures are handed
		to ``indexLog``. The average document length is reported as 0 when no
		article was loaded, instead of raising ZeroDivisionError.

		The print calls use the single-argument parenthesised form, which
		produces the same output as the print statement under Python 2.
		"""
		starttime = time.time()
		print(">>INDEX: Word indexing started.")
		index = {}
		articlecounts = {}
		inputpath = os.getcwd() + "/data/lemmatiser_output"

		totalwordcount = 0
		inputdata = {}

		pickleTime = time.time()
		for inputfilename in self.inputfiles:
			print(">>INDEX: Unpickling: \t '%s'." % inputfilename)
			path = os.path.join(inputpath, inputfilename)
			# NOTE(review): text mode is only safe for protocol-0 pickles and
			# must match the mode used by the writer -- confirm, and switch to
			# "rb" if the files are written in binary mode. Only unpickle
			# files produced by our own pipeline.
			with open(path, "r") as pickledData:
				# presumably each pickle holds {articleid: [line, ...]} with
				# every line a sequence of words -- verify against the writer
				inputdata.update(pickle.load(pickledData))
		pickleTime = round((time.time() - pickleTime), 3)

		indexTime = time.time()
		doccount = len(inputdata)
		print(">>INDEX: Indexing %s articles." % doccount)
		for articleid in inputdata:
			wordcount = 0

			for line in inputdata[articleid]:
				if not line:
					continue

				for word in line:
					if not word:
						continue
					wordcount += 1
					postings = index.setdefault(word, {})
					postings[articleid] = postings.get(articleid, 0) + 1

			totalwordcount += wordcount
			articlecounts[articleid] = wordcount
		indexTime = round((time.time() - indexTime), 3)

		# Floor division keeps the Python 2 integer result; the guard avoids
		# a crash when there was nothing to index.
		if doccount:
			averagelength = totalwordcount // doccount
		else:
			averagelength = 0

		print(">>INDEX: %s words in total." % totalwordcount)
		print(">>INDEX: %s unique words indexed." % len(index))
		print(">>INDEX: Document average length is %s." % averagelength)

		self.wordIndex = index
		self.articleIndex = articlecounts

		totalTime = round((time.time() - starttime), 3)

		print(">>INDEX: Word indexing completed in %s seconds." % totalTime)
		indexLog(self.inputfiles, len(inputdata), len(index), averagelength, pickleTime, indexTime, totalTime)
Beispiel #2
0
	def index(self):
		"""Index the articles held in ``self.articleDict``.

		Each line of an article is split on single spaces. For every token
		the text before the first "/" is kept, then the text before the
		first "|"; tokens that end up empty are skipped. If the word is a
		key of ``self.sentimentDict`` its value (converted with ``int``) is
		added to the article's sentiment count. If the word is a key of
		``self.ngramterms``, ``self._checkNgram`` is asked for the following
		words of an n-gram; when it returns any, they are joined onto the
		word with "_" and the cursor is moved past them.

		Results stored on the instance:
			self.wordIndex[word][articleid] -- occurrences of word in article
			self.articleIndex[articleid]    -- (wordcount, sentimentcount)

		Statistics are printed and handed to ``indexLog``. The average
		document length is reported as 0 for an empty ``articleDict``
		instead of raising ZeroDivisionError.

		The print calls use the single-argument parenthesised form, which
		produces the same output as the print statement under Python 2.
		"""
		starttime = time.time()  # logging purposes
		print(">>INDEX: Word indexing started.")
		doccount = 0  # logging purposes
		totalwordcount = 0  # logging purposes

		indexTime = time.time()  # logging purposes
		print(">>INDEX: Indexing %s articles." % len(self.articleDict))
		for articleid in self.articleDict:
			doccount += 1
			wordcount = 0
			sentimentcount = 0

			for line in self.articleDict[articleid]:
				if len(line) < 1:
					continue
				sentence = line.split(" ")

				# An explicit cursor is needed here: reassigning the loop
				# variable of ``for i in range(...)`` has no effect on the
				# next iteration, so the n-gram skip would be lost and the
				# merged words would be counted and indexed a second time.
				position = 0
				while position < len(sentence):
					current = position
					position += 1

					word = sentence[current]
					# NOTE(review): find() returns -1 for a token without
					# "/", so this slice then drops the token's last
					# character -- confirm every token carries a "/..."
					# suffix.
					word = word[:word.find("/")]
					if "|" in word:
						word = word[:word.find("|")]
					if len(word) < 1:
						continue
					if word in self.sentimentDict:
						sentimentcount += int(self.sentimentDict[word])
					if word in self.ngramterms:
						comingwords = self._checkNgram(word, self.ngramterms[word], sentence, current, False)[0]

						if len(comingwords) > 0:
							word += "_" + "_".join(comingwords)
							# presumably comingwords are exactly the tokens
							# directly after ``current`` -- verify against
							# _checkNgram
							position += len(comingwords)
							wordcount += len(comingwords)

					wordcount += 1
					if word not in self.wordIndex:
						self.wordIndex[word] = {articleid: 1}
					elif articleid in self.wordIndex[word]:
						self.wordIndex[word][articleid] += 1
					else:
						self.wordIndex[word][articleid] = 1

			totalwordcount += wordcount  # logging purposes
			self.articleIndex[articleid] = (wordcount, sentimentcount)

		# Indexing done; everything below is printing and logging.
		indexTime = round((time.time() - indexTime), 3)

		# Floor division keeps the Python 2 integer result; the guard avoids
		# a crash when there was nothing to index.
		if doccount:
			averagelength = totalwordcount // doccount
		else:
			averagelength = 0

		print(">>INDEX: %s words in total." % totalwordcount)
		print(">>INDEX: %s unique words indexed." % len(self.wordIndex))
		print(">>INDEX: Document average length is %s." % averagelength)

		totalTime = round((time.time() - starttime), 3)

		print(">>INDEX: Word indexing completed in %s seconds. \n" % totalTime)
		# Two spaces after the colon reproduce the output of the original
		# two-argument print statement.
		print("indextime:  %s" % indexTime)
		print("doccount:  %s" % doccount)
		# No unpickling happens in this variant, hence 0 for the pickle time.
		indexLog(self.inputfiles, len(self.articleDict), len(self.wordIndex), averagelength, 0, indexTime, totalTime)
Beispiel #3
0
	def build_indices(self):
		"""Build the word and article indices from the pickled input files.

		Every file named in ``self.inputfiles`` is unpickled from
		``<cwd>/data/monster_output`` and merged into one mapping with
		``dict.update`` (on duplicate article ids the file loaded last wins).
		Each line of an article is split on single spaces and, per token,
		the text before the first "/" is kept; tokens that end up empty are
		skipped. If the word is a key of ``self.sentimentdict`` its value
		(converted with ``int``) is added to the article's sentiment count.
		If the word is a key of ``self.ngramterms``, the words returned by
		``self._checkNgram`` are appended to it, joined with "_". The
		following tokens are still processed individually afterwards.

		Results stored on the instance:
			self.wordIndex[word][articleid] -- occurrences of word in article
			self.articleIndex[articleid]    -- (wordcount, sentimentcount)

		Statistics are printed and handed to ``indexLog``. The average
		document length is reported as 0 when no article was loaded instead
		of raising ZeroDivisionError.

		The print calls use the single-argument parenthesised form, which
		produces the same output as the print statement under Python 2.
		"""
		starttime = time.time()
		print(">>INDEX: Word indexing started.")
		inputpath = os.getcwd() + "/data/monster_output"

		doccount = 0
		totalwordcount = 0
		inputdata = {}

		pickleTime = time.time()
		for inputfilename in self.inputfiles:
			print(">>INDEX: Unpickling: \t '%s'." % inputfilename)
			path = os.path.join(inputpath, inputfilename)
			# NOTE(review): text mode is only safe for protocol-0 pickles and
			# must match the mode used by the writer -- confirm, and switch to
			# "rb" if the files are written in binary mode. Only unpickle
			# files produced by our own pipeline.
			with open(path, "r") as pickledData:
				inputdata.update(pickle.load(pickledData))
		pickleTime = round((time.time() - pickleTime), 3)

		indexTime = time.time()
		print(">>INDEX: Indexing %s articles." % len(inputdata))
		for articleid in inputdata:
			doccount += 1
			wordcount = 0
			sentimentcount = 0

			for line in inputdata[articleid]:
				if len(line) < 1:
					continue
				words = line.split(" ")

				for i, token in enumerate(words):
					# NOTE(review): find() returns -1 for a token without
					# "/", so this slice then drops the token's last
					# character -- confirm every token carries a "/..."
					# suffix.
					word = token[:token.find("/")]
					if len(word) < 1:
						continue
					if word in self.sentimentdict:
						sentimentcount += int(self.sentimentdict[word])
					if word in self.ngramterms:
						comingwords = self._checkNgram(word, words, i)
						if len(comingwords) > 0:
							word = "_".join([word] + list(comingwords))

					wordcount += 1
					if word not in self.wordIndex:
						self.wordIndex[word] = {articleid: 1}
					elif articleid in self.wordIndex[word]:
						self.wordIndex[word][articleid] += 1
					else:
						self.wordIndex[word][articleid] = 1

			totalwordcount += wordcount
			self.articleIndex[articleid] = (wordcount, sentimentcount)

		indexTime = round((time.time() - indexTime), 3)

		# Floor division keeps the Python 2 integer result; the guard avoids
		# a crash when there was nothing to index.
		if doccount:
			averagelength = totalwordcount // doccount
		else:
			averagelength = 0

		print(">>INDEX: %s words in total." % totalwordcount)
		print(">>INDEX: %s unique words indexed." % len(self.wordIndex))
		print(">>INDEX: Document average length is %s." % averagelength)

		totalTime = round((time.time() - starttime), 3)

		print(">>INDEX: Word indexing completed in %s seconds. \n" % totalTime)
		# Two spaces after the colon reproduce the output of the original
		# two-argument print statements.
		print("pickletime:  %s" % pickleTime)
		print("indextime:  %s" % indexTime)
		print("doccount:  %s" % doccount)
		indexLog(self.inputfiles, len(inputdata), len(self.wordIndex), averagelength, pickleTime, indexTime, totalTime)