def index(self):
    """Build a word index from the pickled files named in ``self.inputfiles``.

    Every file is unpickled from ``<cwd>/data/lemmatiser_output`` and merged
    into a single dict with ``dict.update`` (a key appearing in several files
    keeps the value from the last file read). Each key is used as the
    article id; its value is iterated as lines, and each line is iterated
    item by item (presumably a list of lemmatised words -- if a line were a
    plain string, single characters would be indexed; confirm against the
    producer of these files). Empty items are skipped.

    Results are stored on the instance:
      * ``self.wordIndex``    -- {word: {articleid: occurrences}}
      * ``self.articleIndex`` -- {articleid: number of words counted}

    Progress is printed and a summary is passed to ``indexLog``. With no
    input documents the reported average length is 0.
    """
    starttime = time.time()
    print(">>INDEX: Word indexing started.")
    inputpath = os.path.join(os.getcwd(), "data", "lemmatiser_output")

    # --- load and merge all pickled inputs ---------------------------------
    inputdata = {}
    pickleTime = time.time()
    for inputfilename in self.inputfiles:
        print(">>INDEX: Unpickling: \t '%s'." % inputfilename)
        path = os.path.join(inputpath, inputfilename)
        # NOTE(review): pickle.load can execute arbitrary code; only feed
        # this files from a trusted source.
        # NOTE(review): text mode "r" is kept from the original. Binary
        # pickle protocols need "rb"; confirm how these files are written.
        # The context manager closes the file even if unpickling fails.
        with open(path, "r") as pickledData:
            inputdata.update(pickle.load(pickledData))
    pickleTime = round((time.time() - pickleTime), 3)

    # --- count word occurrences per article --------------------------------
    index = {}
    articlecounts = {}
    totalwordcount = 0
    indexTime = time.time()
    print(">>INDEX: Indexing %s articles." % len(inputdata))
    for articleid in inputdata:
        wordcount = 0
        for line in inputdata[articleid]:
            for word in line:
                if len(word) < 1:
                    continue
                wordcount += 1
                postings = index.setdefault(word, {})
                postings[articleid] = postings.get(articleid, 0) + 1
        totalwordcount += wordcount
        articlecounts[articleid] = wordcount
    indexTime = round((time.time() - indexTime), 3)

    # Guard the average: an empty input set used to raise ZeroDivisionError.
    doccount = len(inputdata)
    averagelength = (totalwordcount / doccount) if doccount else 0

    print(">>INDEX: %s words in total." % totalwordcount)
    print(">>INDEX: %s unique words indexed." % len(index))
    print(">>INDEX: Document average length is %s." % averagelength)

    self.wordIndex = index
    self.articleIndex = articlecounts

    totalTime = round((time.time() - starttime), 3)
    print(">>INDEX: Word indexing completed in %s seconds." % totalTime)
    indexLog(self.inputfiles, doccount, len(index), averagelength,
             pickleTime, indexTime, totalTime)
def index(self):
    """Index every article in ``self.articleDict`` by word.

    Each article is iterated line by line. A line is split on single spaces
    and every token is reduced to the text before its first "/" and then
    before its first "|" (presumably stripping a tag suffix such as
    ``word/TAG`` -- confirm against the producer of ``articleDict``). A
    token without a "/" is kept whole. Tokens that end up empty are skipped.

    For every remaining word:
      * if it is a key of ``self.sentimentDict``, that value is added (as an
        int) to the article's sentiment count;
      * if it is a key of ``self.ngramterms``, ``self._checkNgram`` is
        called and the first element of its result is treated as the list
        of following words belonging to the same n-gram (the helper is not
        visible here). When that list is non-empty the words are joined
        onto the current word with "_", counted towards the article length,
        and skipped so they are not indexed a second time;
      * ``self.wordIndex[word][articleid]`` is incremented.

    ``self.articleIndex[articleid]`` is set to ``(wordcount, sentimentcount)``.
    Progress is printed and a summary is passed to ``indexLog`` (with 0 as
    the pickle time). With no articles the reported average length is 0.
    """
    starttime = time.time()  # logging purposes
    print(">>INDEX: Word indexing started.")
    totalwordcount = 0  # logging purposes
    indexTime = time.time()  # logging purposes
    print(">>INDEX: Indexing %s articles." % len(self.articleDict))

    for articleid in self.articleDict:
        wordcount = 0
        sentimentcount = 0
        for line in self.articleDict[articleid]:
            if len(line) < 1:
                continue
            sentence = line.split(" ")
            # Number of upcoming tokens already consumed by an n-gram.
            # Reassigning the loop variable of a ``for`` loop does not skip
            # iterations, so the skip is tracked explicitly.
            skip = 0
            for i, token in enumerate(sentence):
                if skip:
                    skip -= 1
                    continue
                # split() instead of slicing at find(): find() returns -1
                # when the separator is missing, which would silently chop
                # the last character off the token.
                word = token.split("/", 1)[0].split("|", 1)[0]
                if len(word) < 1:
                    continue
                if word in self.sentimentDict:
                    sentimentcount += int(self.sentimentDict[word])
                if word in self.ngramterms:
                    comingwords = self._checkNgram(
                        word, self.ngramterms[word], sentence, i, False)[0]
                    if len(comingwords) > 0:
                        word += "_" + "_".join(comingwords)
                        skip = len(comingwords)
                        wordcount += len(comingwords)
                wordcount += 1
                postings = self.wordIndex.setdefault(word, {})
                postings[articleid] = postings.get(articleid, 0) + 1
        totalwordcount += wordcount  # logging purposes
        self.articleIndex[articleid] = (wordcount, sentimentcount)

    # Function done. Now printing and logging!
    indexTime = round((time.time() - indexTime), 3)  # logging purposes
    doccount = len(self.articleDict)  # logging purposes
    # Guard the average: an empty articleDict used to raise ZeroDivisionError.
    averagelength = (totalwordcount / doccount) if doccount else 0

    print(">>INDEX: %s words in total." % totalwordcount)
    print(">>INDEX: %s unique words indexed." % len(self.wordIndex))
    print(">>INDEX: Document average length is %s." % averagelength)
    totalTime = round((time.time() - starttime), 3)  # logging purposes
    print(">>INDEX: Word indexing completed in %s seconds. \n" % totalTime)
    # Two spaces reproduce the output of the former ``print "x: ", value``.
    print("indextime:  %s" % indexTime)
    print("doccount:  %s" % doccount)
    indexLog(self.inputfiles, doccount, len(self.wordIndex), averagelength,
             0, indexTime, totalTime)
def build_indices(self):
    """Unpickle the input files and fill the word and article indices.

    Every file named in ``self.inputfiles`` is unpickled from
    ``<cwd>/data/monster_output`` and merged into one dict with
    ``dict.update`` (a key appearing in several files keeps the value from
    the last file read). Each key is used as the article id and its value is
    iterated as lines. A line is split on single spaces and every token is
    reduced to the text before its first "/" (presumably stripping a
    ``word/TAG`` suffix -- confirm against the producer of these files). A
    token without a "/" is kept whole. Empty tokens are skipped.

    For every remaining word:
      * if it is a key of ``self.sentimentdict``, that value is added (as an
        int) to the article's sentiment count;
      * if it is a key of ``self.ngramterms``, the words returned by
        ``self._checkNgram(word, words, i)`` are appended to it with "_";
      * ``self.wordIndex[word][articleid]`` is incremented.

    ``self.articleIndex[articleid]`` is set to ``(wordcount, sentimentcount)``.
    Progress is printed and a summary is passed to ``indexLog``. With no
    input documents the reported average length is 0.
    """
    starttime = time.time()
    print(">>INDEX: Word indexing started.")
    inputpath = os.path.join(os.getcwd(), "data", "monster_output")

    # --- load and merge all pickled inputs ---------------------------------
    inputdata = {}
    pickleTime = time.time()
    for inputfilename in self.inputfiles:
        print(">>INDEX: Unpickling: \t '%s'." % inputfilename)
        path = os.path.join(inputpath, inputfilename)
        # NOTE(review): pickle.load can execute arbitrary code; only feed
        # this files from a trusted source.
        # NOTE(review): text mode "r" is kept from the original. Binary
        # pickle protocols need "rb"; confirm how these files are written.
        with open(path, "r") as pickledData:
            inputdata.update(pickle.load(pickledData))
    pickleTime = round((time.time() - pickleTime), 3)

    # --- count word occurrences per article --------------------------------
    totalwordcount = 0
    indexTime = time.time()
    print(">>INDEX: Indexing %s articles." % len(inputdata))
    for articleid in inputdata:
        wordcount = 0
        sentimentcount = 0
        for line in inputdata[articleid]:
            if len(line) < 1:
                continue
            words = line.split(" ")
            for i, token in enumerate(words):
                # split() instead of slicing at find(): find() returns -1
                # when "/" is missing, which would silently chop the last
                # character off the token.
                word = token.split("/", 1)[0]
                if len(word) < 1:
                    continue
                if word in self.sentimentdict:
                    sentimentcount += int(self.sentimentdict[word])
                if word in self.ngramterms:
                    comingwords = self._checkNgram(word, words, i)
                    if len(comingwords) > 0:
                        # NOTE(review): the words merged in here are still
                        # visited (and indexed on their own) by later
                        # iterations -- confirm that this is intended.
                        word = "_".join([word] + list(comingwords))
                wordcount += 1
                postings = self.wordIndex.setdefault(word, {})
                postings[articleid] = postings.get(articleid, 0) + 1
        totalwordcount += wordcount
        self.articleIndex[articleid] = (wordcount, sentimentcount)
    indexTime = round((time.time() - indexTime), 3)

    # Guard the average: an empty input set used to raise ZeroDivisionError.
    doccount = len(inputdata)
    averagelength = (totalwordcount / doccount) if doccount else 0

    print(">>INDEX: %s words in total." % totalwordcount)
    print(">>INDEX: %s unique words indexed." % len(self.wordIndex))
    print(">>INDEX: Document average length is %s." % averagelength)
    totalTime = round((time.time() - starttime), 3)
    print(">>INDEX: Word indexing completed in %s seconds. \n" % totalTime)
    # Two spaces reproduce the output of the former ``print "x: ", value``.
    print("pickletime:  %s" % pickleTime)
    print("indextime:  %s" % indexTime)
    print("doccount:  %s" % doccount)
    indexLog(self.inputfiles, doccount, len(self.wordIndex), averagelength,
             pickleTime, indexTime, totalTime)