def save_stems(self): if self.draft or not HAS_LIBSTEMMER: return try: stemmer = Stemmer(self.language.iso639_1) except RuntimeError: return self.stems.all().delete() r = re.compile('\w{4,32}', re.UNICODE) words = r.findall(self.body) num_words = float(len(words)) stemmed_words = {} for word in words: stem_value = stemmer.stem(word) stemmed_words.setdefault(stem_value, 0) stemmed_words[stem_value] += 1 for stem, value in stemmed_words.iteritems(): stem, created = Stem.objects.get_or_create(value=stem) self.stems.create(stem=stem, value=value/num_words) self.find_related()
class TP: """Text Processing, TP class ***NOTE: This is a simple, beta tool and low-performance now*** HOW-TO: 1. instantiate 2. use .addDoc(filename) first 3. analyze() 4. showStat() This class has several main functions: - removing stop words - stemming/lemmatizing - calculating TF-IDF """ def __init__(self): # establish stopword set self.stopword_set = set() # for sotring added file data self.doc_wordlist = {} # doc_topic_dict = {docname: topic_ist, ...} self.doc_topic_dict = {} # for TF-IDF (term frequency - inverse document frequency) # tfidf_dict/tf_dict = {filename : {feature : TF, ...}, ...} # idf_dict = {word : IDF, ...} self.tf_dict = {} self.idf_dict = {} self.tfidf_dict = {} self.doc_with_word_counter = {} self.docs_total = 0 # for cosine similarity # it looks like: # { # 'docA': {'docA': xxx, 'docB': xxx, ...}, # 'docB': {'docA': xxx, 'docB': xxx, ...}, # ... # } self.cos_dict = {} # for normalize word (regex) self.enableRegex = True # ask if user want to enable regex filter self.restr = r'[a-zA-Z\-][a-zA-Z\-]*' req = raw_input("Enable regex filter ? (default: r'[a-zA-Z\-][a-zA-Z\-]*'. enter 'no' to disable it): ") if req == 'no': print('Disable regex filter') self.enableRegex = False else: self.enableRegex = True # set stopword list self.enableStopword = True # ask if enable stopword filter spq = raw_input("Enable stopword filter ? ( 'no' to disable ): ") if spq == 'no': print('Disable stopword filter') self.enableStopword = False else: self.enableStopword = True if self.enableStopword: stopwdf = raw_input("Please input stopword list path (keep empty for default 'stopwords.txt'): ") if stopwdf: self.setStopwdList(stopwdf) else: self.setStopwdList('stopwords.txt') # ask if user want to use stemmer self.enableStemmer = True smq = raw_input('Enable stemmer ? ( "no" to disable ): ') if smq == 'no': print('Disable stemmer') self.enableStemmer = False else: self.enableStemmer = True # setup stemmer if self.enableStemmer: self.stemmer = Stemmer() def setStopwdList(self, filename): try: with open(filename, 'r') as f_swlist: self.stopword_set = set(f_swlist.read().splitlines()) print "[INFO] reading stopwords list done." except IOError: sys.stderr.write("[Error] failed to open stopwords list\n") sys.stderr.write("[Warning] Assign stopword list failed. Please try setStopwdList(filename) again.\n") ### output methods ### def showStat(self, docname=None): """ Show TF-IDF info about docs. If docname is now given, show all docs. """ if len(self.tfidf_dict) <= 0: print "[Error] There is no data in corpus. Please use addDoc() and autoTfidf() to conduct an analysis." return 1 if docname: self.printTFIDF(docname) else: for dname in self.tfidf_dict: self.printTFIDF(dname) def printTFIDF(self, docname): """ print formatted TF and TFIDF info """ d_tfidf = self.tfidf_dict[docname] d_tf = self.tf_dict[docname] print 'TFIDF info. of %s\n' % docname print 'Word --> TF\tTF-IDF' print '-------------------\n' for word in d_tfidf: if word in d_tf: print '{wd:<16s} --> {tf:e}\t{idf:e}'.format(wd=word, tf=d_tf[word], idf=d_tfidf[word]) else: print '{wd:<16s} --> {tf:e}\t{idf:e}'.format(wd=word, tf='KEY ERROR', idf=d_tfidf[word]) print '-------------------\n' def printIDF(self): """ print IDF information """ print 'Word --> IDF' for word in self.idf_dict: print '{wd:<16s} --> {idf}'.format(wd=word, idf=self.idf_dict[word]) def exportTfidf(self, limit=0, filename=None): """ Now this function can only output tfidf_dict as JSON. limit is the number of data you want to output. """ # if filename is not specified, use date-time instead. if not filename: filename = time.strftime('%h-%d-%Y-%I-%M-%p')+'_TFIDF_dict.json' # export whole tfidf_dict with open(filename, 'w') as fout: if (limit == 0): out_dict = self.tfidf_dict else: out_dict = {} # counter cnter = 0 for (key, val) in self.tfidf_dict.items(): out_dict[key] = val cnter+=1 if cnter >= limit: break # translate python dict into json outjson = json.dumps(out_dict, sort_keys=True, indent=4) fout.write(outjson) print('[Info] TFIDF_dict has been exported as file {fn:s}'.format(fn=filename)) ### word processors ### def testStopword(self, word): """ Evaluate one word and figure out whether it is a stopword from self.stopwordlist. If so, return true. Otherwise return false. """ if word in self.stopword_set: return True return False def stemming(self, word, stway=None): """ With given word and specified stemming algorithm, return it's stemming result. If no algorithm is specified, use XXX as default. """ return self.stemmer.stemming(word) ### TF-IDF calculator ### def analyze(self): """ Automatically run tf(), idf() and tfidf() """ print "[INFO] starting auto TF-IDF calculator" # record start time start_time = time.time() self.tf() self.idf() self.tfidf() # record end time end_time = time.time() print "[INFO] Finished" print "[INFO] Use showStat() to show the result" print "[Info] Total time of execution: {timestr:s}".format(timestr=str(end_time-start_time)) def tfidf(self): """ calculate TF-IDF """ print "[INFO] calculating TF-IDF..." for doc_name in self.tf_dict: tmp_dict = {} doc_wl = self.tf_dict[doc_name] for word in doc_wl: # TF-IDF = TF * IDF tmp_dict[word] = doc_wl[word] * self.idf_dict[word] # store result to tfidf_dict # tfidf_dict looks like {filename : {word1 : IDF, word2 : IDF, ...}, ...} self.tfidf_dict[doc_name] = tmp_dict def tf(self): """ calculate TF of added docs and store them to self.tf_dict """ print "[INFO] calculating TF..." # iterate whole doc_wordlist if len(self.doc_wordlist) > 0: for doc_name in self.doc_wordlist: print "[Info] tf: " + str(doc_name) # make a container word_tf_dict = {} # retrieve word list of from doc_wordlist and remove the key in dict wl = self.doc_wordlist[doc_name] word_num = len(wl) # calculate term frequency (within a file) for word in wl: # count the number of word within a file and store it to word_tf_dict temporarily # get previous record and pile up be = word_tf_dict.get(word, 0.0) word_tf_dict[word] = be + 1.0 # reserve words for IDF if not word in self.idf_dict: self.idf_dict[word] = 0.0 #print "[Debug] tf: TF result: " + doc_name + " " + str(word_tf_dict) # normalize word_tf_dict = dict(map(lambda (k,v) : (k,v/word_num), word_tf_dict.iteritems())) # save result self.tf_dict[doc_name] = word_tf_dict #print "[Debug] result: " + doc_name + " " + str(self.tf_dict[doc_name]) else: sys.stderr.write("[Error] tf: there is no file in queue\n") sys.stderr.write("[Error] tf: please use addDoc first\n") def idf(self): """ calculate current idf with all docs have been added """ print "[INFO] calculating IDF..." # log(total number of documents) log_doc_total = math.log(self.docs_total, 10) # use self.idf_dict # retrieve word from idf_dict for word in self.idf_dict: # initialize self.idf_dict[word] = 0.0 ### Attention !!! The code below make this program too inefficient. ### I have used self.word_counter instead to record the number of word when addDoc() was called ### ============================================================ # count the number of word through all docs #for doc_name in self.tf_dict: # if self.tf_dict[doc_name].has_key(word): # self.idf_dict[word] += 1.0 ### ============================================================ # calculate IDF # log(word number) ### the line below is depreciated. ### log_tf = math.log(self.idf_dict[word], 10) # log( the number of doc contains the word ) log_ndcw = math.log(self.doc_with_word_counter[word], 10) # log(docs number) - log(word number) self.idf_dict[word] = log_doc_total - log_ndcw ### file operating method ### def addDoc(self, doc_name): """ Read data from a doc, store it and return a word list of the content. This function will also remove stopwords and do stemming. """ print('[Info] addDoc '+doc_name) # for storing words wd_list = [] # file reading mechanism has been adjusted # specified input file format: # Line 1 = question name # Line 2 = topics # Line 3... = answer content try: # open docs and read data with open(doc_name, 'r') as f_in: for i,line in enumerate(f_in): # We ignore first line (question name) if (i == 0): pass # How to deal with topics at second line ? # Read topics into self.doc_topic_dict if (i == 1): self.doc_topic_dict[doc_name] = line.split(', ') for word in line.split(): # if the length of word is zero, pass if (len(word) == 0): pass # normalize the word. Get rid of punctuation which is not needed if self.enableRegex: matchedGrp = re.match(self.restr, word, re.M|re.I) if matchedGrp: word = matchedGrp.group(0) else: word = '' # END if # lowercast word word = word.lower() # remove stopwords here (if enabled) if self.enableStopword and self.testStopword(word): pass else: # stemming here if self.enableStemmer: # word = self.stemming(word) pass ### These code below is used to calculate IDF later ### ========================================= # count the number of doc contains the word # if current word has been in wd_list then ignore it to prevent counting the same word twice if not word in wd_list: pre_rc = self.doc_with_word_counter.get(word, 0.0) self.doc_with_word_counter[word] = pre_rc + 1.0 ### ========================================= # add word to word list wd_list.append(word) # END if # END for words in line # END for lines in file # END with open file # increase docs number self.docs_total += 1 # store word list self.doc_wordlist[doc_name] = wd_list except IOError, e: sys.stderr.write("[Error] addDoc failed.\n") sys.stderr.write("[Error] error dump: {er:s}".format(er=str(e))) return wd_list
def __init__(self): # establish stopword set self.stopword_set = set() # for sotring added file data self.doc_wordlist = {} # doc_topic_dict = {docname: topic_ist, ...} self.doc_topic_dict = {} # for TF-IDF (term frequency - inverse document frequency) # tfidf_dict/tf_dict = {filename : {feature : TF, ...}, ...} # idf_dict = {word : IDF, ...} self.tf_dict = {} self.idf_dict = {} self.tfidf_dict = {} self.doc_with_word_counter = {} self.docs_total = 0 # for cosine similarity # it looks like: # { # 'docA': {'docA': xxx, 'docB': xxx, ...}, # 'docB': {'docA': xxx, 'docB': xxx, ...}, # ... # } self.cos_dict = {} # for normalize word (regex) self.enableRegex = True # ask if user want to enable regex filter self.restr = r'[a-zA-Z\-][a-zA-Z\-]*' req = raw_input("Enable regex filter ? (default: r'[a-zA-Z\-][a-zA-Z\-]*'. enter 'no' to disable it): ") if req == 'no': print('Disable regex filter') self.enableRegex = False else: self.enableRegex = True # set stopword list self.enableStopword = True # ask if enable stopword filter spq = raw_input("Enable stopword filter ? ( 'no' to disable ): ") if spq == 'no': print('Disable stopword filter') self.enableStopword = False else: self.enableStopword = True if self.enableStopword: stopwdf = raw_input("Please input stopword list path (keep empty for default 'stopwords.txt'): ") if stopwdf: self.setStopwdList(stopwdf) else: self.setStopwdList('stopwords.txt') # ask if user want to use stemmer self.enableStemmer = True smq = raw_input('Enable stemmer ? ( "no" to disable ): ') if smq == 'no': print('Disable stemmer') self.enableStemmer = False else: self.enableStemmer = True # setup stemmer if self.enableStemmer: self.stemmer = Stemmer()