Example #1
    # assumes module-level dependencies not shown in this snippet:
    # re, a Stemmer class, the HAS_LIBSTEMMER flag, and the Stem model
    def save_stems(self):
        if self.draft or not HAS_LIBSTEMMER:
            return
        try:
            stemmer = Stemmer(self.language.iso639_1)
        except RuntimeError:
            return

        self.stems.all().delete()
        r = re.compile(r'\w{4,32}', re.UNICODE)
        words = r.findall(self.body)
        num_words = float(len(words))
        stemmed_words = {}

        for word in words:
            stem_value = stemmer.stem(word)
            stemmed_words.setdefault(stem_value, 0)
            stemmed_words[stem_value] += 1

        for stem_value, value in stemmed_words.iteritems():
            stem, created = Stem.objects.get_or_create(value=stem_value)
            self.stems.create(stem=stem, value=value / num_words)

        self.find_related()
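
A standalone sketch of the counting step above. It uses the PyStemmer package (Stemmer.Stemmer / stemWord) as an assumed stand-in; the snippet's own Stemmer wrapper, its .stem() method, and the Django Stem model are project-specific and not shown here.

import re
import Stemmer  # PyStemmer; an assumed stand-in for the wrapper above

body = "Stemming reduces inflected words to their word stems"
stemmer = Stemmer.Stemmer('english')

# same extraction rule as save_stems(): words of 4 to 32 characters
words = re.findall(r'\w{4,32}', body, re.UNICODE)
num_words = float(len(words))

counts = {}
for word in words:
    stem_value = stemmer.stemWord(word)
    counts[stem_value] = counts.get(stem_value, 0) + 1

# normalized frequency, as stored on each stem relation above
for stem_value, count in counts.iteritems():
    print '%s: %f' % (stem_value, count / num_words)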
Example #2
import json
import math
import re
import sys
import time
# NOTE: this class also relies on a Stemmer class from a module
# not shown in this snippet

class TP:

    """Text Processing, TP class

        ***NOTE: This is a simple, beta tool and low-performance now***
        
        HOW-TO:
            1. instantiate
            2. use .addDoc(filename) first
            3. analyze()
            4. showStat()

        This class has several main functions:
            - removing stop words
            - stemming/lemmatizing
            - calculating TF-IDF

    """

    def __init__(self):

        # establish stopword set
        self.stopword_set = set()

        # for storing added file data
        self.doc_wordlist = {}

        # doc_topic_dict = {docname: topic_list, ...}
        self.doc_topic_dict = {}

        # for TF-IDF (term frequency - inverse document frequency)
        # tfidf_dict/tf_dict = {filename : {feature : TF, ...}, ...}
        # idf_dict = {word : IDF, ...}
        self.tf_dict = {}
        self.idf_dict = {}
        self.tfidf_dict = {}
        self.doc_with_word_counter = {}
        self.docs_total = 0

        # for cosine similarity
        # it looks like: 
        # { 
        #   'docA': {'docA': xxx, 'docB': xxx, ...}, 
        #   'docB': {'docA': xxx, 'docB': xxx, ...}, 
        #   ... 
        # }
        self.cos_dict = {}

        # for normalizing words (regex)
        self.enableRegex = True
        # ask whether the user wants to enable the regex filter
        self.restr = r'[a-zA-Z\-][a-zA-Z\-]*'
        req = raw_input("Enable regex filter ? (default: r'[a-zA-Z\-][a-zA-Z\-]*'; enter 'no' to disable): ")
        if req == 'no':
            print('Disable regex filter')
            self.enableRegex = False
        else:
            self.enableRegex = True

        # set stopword list
        self.enableStopword = True
        # ask whether to enable the stopword filter
        spq = raw_input("Enable stopword filter ? ( 'no' to disable ): ")
        if spq == 'no':
            print('Disable stopword filter')
            self.enableStopword = False
        else:
            self.enableStopword = True

        if self.enableStopword:
            stopwdf = raw_input("Please input stopword list path (leave empty for default 'stopwords.txt'): ")
            if stopwdf:
                self.setStopwdList(stopwdf)
            else:
                self.setStopwdList('stopwords.txt')

        # ask whether the user wants to use the stemmer
        self.enableStemmer = True
        smq = raw_input('Enable stemmer ? ( "no" to disable ): ')

        if smq == 'no':
            print('Disable stemmer')
            self.enableStemmer = False
        else:
            self.enableStemmer = True
        # setup stemmer
        if self.enableStemmer:
            self.stemmer = Stemmer()

    def setStopwdList(self, filename):
        try:
            with open(filename, 'r') as f_swlist:
                self.stopword_set = set(f_swlist.read().splitlines())
                print "[INFO] reading stopwords list done."
        except IOError:
            sys.stderr.write("[Error] failed to open stopwords list\n")
            sys.stderr.write("[Warning] Assigning the stopword list failed. Please call setStopwdList(filename) again.\n")

    ### output methods ###
    
    def showStat(self, docname=None):
        """ Show TF-IDF info about docs. If docname is now given, show all docs.
        """
        if not self.tfidf_dict:
            print "[Error] There is no data in the corpus. Please use addDoc() and analyze() first."
            return 1

        if docname:
            self.printTFIDF(docname)
        else:
            for dname in self.tfidf_dict:
                self.printTFIDF(dname)

    def printTFIDF(self, docname):
        """ print formatted TF and TFIDF info
        """
        d_tfidf = self.tfidf_dict[docname]
        d_tf = self.tf_dict[docname]

        print 'TFIDF info. of %s\n' % docname
        print 'Word    -->  TF\tTF-IDF'
        print '-------------------\n'
        for word in d_tfidf:
            if word in d_tf:
                print '{wd:<16s}  -->  {tf:e}\t{idf:e}'.format(wd=word, tf=d_tf[word], idf=d_tfidf[word])
            else:
                print '{wd:<16s}  -->  {tf:s}\t{idf:e}'.format(wd=word, tf='KEY ERROR', idf=d_tfidf[word])
        print '-------------------\n'

    def printIDF(self):
        """ print IDF information
        """
        print 'Word  -->  IDF'
        for word in self.idf_dict:
            print '{wd:<16s}  -->  {idf}'.format(wd=word, idf=self.idf_dict[word])

    def exportTfidf(self, limit=0, filename=None):
        """
            Now this function can only output tfidf_dict as JSON.
            limit is the number of data you want to output.
        """
        # if filename is not specified, use date-time instead.
        if not filename:
            filename = time.strftime('%b-%d-%Y-%I-%M-%p') + '_TFIDF_dict.json'

        # export whole tfidf_dict
        with open(filename, 'w') as fout:

            if (limit == 0):
                out_dict = self.tfidf_dict
            else:
                out_dict = {}
                # counter
                cnter = 0
                for (key, val) in self.tfidf_dict.items():
                    out_dict[key] = val
                    cnter += 1
                    if cnter >= limit:
                        break
            # translate python dict into json
            outjson = json.dumps(out_dict, sort_keys=True, indent=4)
            fout.write(outjson)

        print('[Info] TFIDF_dict has been exported as file {fn:s}'.format(fn=filename))

    ### word processors ###

    def testStopword(self, word):
        """ Check whether the given word is a stopword in
            self.stopword_set.
            Return True if so, False otherwise.
        """
        return word in self.stopword_set

    def stemming(self, word, stway=None):
        """ Stem the given word with the specified stemming algorithm and
            return its stem.
            If no algorithm is specified, use XXX as default.
        """
        return self.stemmer.stemming(word)

    
    ### TF-IDF calculator ###

    def analyze(self):
        """ Automatically run tf(), idf() and tfidf()
        """
        print "[INFO] starting auto TF-IDF calculator"

        # record start time
        start_time = time.time()

        self.tf()
        self.idf()
        self.tfidf()

        # record end time
        end_time = time.time()

        print "[INFO] Finished"
        print "[INFO] Use showStat() to show the result"
        print "[Info] Total time of execution: {timestr:s}".format(timestr=str(end_time-start_time))

    def tfidf(self):
        """ calculate TF-IDF
        """
        print "[INFO] calculating TF-IDF..."
        for doc_name in self.tf_dict:

            tmp_dict = {}
            doc_wl = self.tf_dict[doc_name]

            for word in doc_wl:
                # TF-IDF = TF * IDF
                tmp_dict[word] = doc_wl[word] * self.idf_dict[word]

            # store result to tfidf_dict
            # tfidf_dict looks like {filename : {word : TF-IDF, ...}, ...}
            self.tfidf_dict[doc_name] = tmp_dict

    def tf(self):
        """ calculate TF of added docs and store them to self.tf_dict
        """
        print "[INFO] calculating TF..."
        # iterate whole doc_wordlist
        if len(self.doc_wordlist) > 0:
            for doc_name in self.doc_wordlist:
               
                print "[Info] tf: " + str(doc_name)

                # make a container
                word_tf_dict = {}

                # retrieve the word list for this doc from doc_wordlist
                wl = self.doc_wordlist[doc_name]
                word_num = len(wl)

                # calculate term frequency (within a file)
                for word in wl:
                    # count each word's occurrences within the file and
                    # store them in word_tf_dict temporarily: take the
                    # previous count and add one
                    be = word_tf_dict.get(word, 0.0)
                    word_tf_dict[word] = be + 1.0
                    # reserve the word for IDF
                    if word not in self.idf_dict:
                        self.idf_dict[word] = 0.0

                #print "[Debug] tf: TF result: " + doc_name + " " + str(word_tf_dict)

                # normalize
                word_tf_dict = {k: v / word_num for k, v in word_tf_dict.iteritems()}

                # save result
                self.tf_dict[doc_name] = word_tf_dict
                #print "[Debug] result: " + doc_name + " " + str(self.tf_dict[doc_name])
            
        else:
            sys.stderr.write("[Error] tf: there is no file in queue\n")
            sys.stderr.write("[Error] tf: please use addDoc first\n")

    def idf(self):
        """ calculate current idf with all docs have been added
        """
        print "[INFO] calculating IDF..."
        # log(total number of documents)
        log_doc_total = math.log(self.docs_total, 10)

        # use self.idf_dict
        # retrieve word from idf_dict
        for word in self.idf_dict:
            # initialize
            self.idf_dict[word] = 0.0

            ### NOTE: the block below made this program too inefficient;
            ### self.doc_with_word_counter, filled in while addDoc() runs,
            ### is used instead to record how many docs contain each word.
            ### ============================================================
            # count the number of docs containing the word, across all docs
            #for doc_name in self.tf_dict:
            #    if self.tf_dict[doc_name].has_key(word):
            #        self.idf_dict[word] += 1.0
            ### ============================================================

            # calculate IDF
            ### the line below is deprecated:
            ### log_tf = math.log(self.idf_dict[word], 10)

            # log( the number of docs containing the word )
            log_ndcw = math.log(self.doc_with_word_counter[word], 10)
            # IDF = log(total docs) - log(docs containing the word)
            self.idf_dict[word] = log_doc_total - log_ndcw

    ### file operating method ###
        
    def addDoc(self, doc_name):
        """ Read data from a doc, store it and return a word list of the content.
            This function will also remove stopwords and do stemming.
        """

        print('[Info] addDoc '+doc_name)

        # for storing words
        wd_list = []

        # file reading mechanism has been adjusted
        # specified input file format:
        # Line 1 = question name
        # Line 2 = topics
        # Line 3... = answer content

        try:
            # open docs and read data
            with open(doc_name, 'r') as f_in:
                for i, line in enumerate(f_in):

                    # skip the first line (question name)
                    if i == 0:
                        continue

                    # read the topics on the second line into
                    # self.doc_topic_dict; topics are not indexed as content
                    if i == 1:
                        self.doc_topic_dict[doc_name] = line.split(', ')
                        continue

                    for word in line.split():

                        # normalize the word: strip punctuation we don't need
                        if self.enableRegex:
                            matchedGrp = re.match(self.restr, word, re.M|re.I)
                            if matchedGrp:
                                word = matchedGrp.group(0)
                            else:
                                word = ''
                        # END if

                        # lowercase the word
                        word = word.lower()

                        # skip words that are empty after normalization
                        if len(word) == 0:
                            continue

                        # remove stopwords here (if enabled)
                        if self.enableStopword and self.testStopword(word):
                            continue

                        # stemming here (currently a no-op placeholder)
                        if self.enableStemmer:
                            # word = self.stemming(word)
                            pass

                        ### The code below is used to calculate IDF later
                        ### =========================================
                        # count the number of docs containing the word;
                        # a word already in wd_list is ignored so the same
                        # doc is not counted twice for one word
                        if word not in wd_list:
                            pre_rc = self.doc_with_word_counter.get(word, 0.0)
                            self.doc_with_word_counter[word] = pre_rc + 1.0
                        ### =========================================

                        # add word to word list
                        wd_list.append(word)


                    # END for words in line
                # END for lines in file
            # END with open file

            # increase docs number
            self.docs_total += 1
            # store word list
            self.doc_wordlist[doc_name] = wd_list

        except IOError as e:
            sys.stderr.write("[Error] addDoc failed.\n")
            sys.stderr.write("[Error] error dump: {er:s}\n".format(er=str(e)))

        return wd_list
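
A minimal usage sketch of the HOW-TO above. The file names are hypothetical; each input file is assumed to follow the format addDoc() expects (line 1 = question name, line 2 = topics, line 3 onward = answer content), and TP() itself prompts interactively via raw_input().

tp = TP()                  # answer the raw_input() prompts
tp.addDoc('doc_a.txt')     # hypothetical input files
tp.addDoc('doc_b.txt')
tp.analyze()               # runs tf(), idf() and tfidf()
tp.showStat()              # prints the TF / TF-IDF table for every doc
tp.exportTfidf(filename='tfidf.json')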
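
For reference, idf() computes IDF(w) = log10(docs_total) - log10(doc_with_word_counter[w]), which equals log10(docs_total / doc_with_word_counter[w]). A quick hand check with assumed counts:

import math

docs_total = 10.0      # assumed corpus size
docs_with_word = 2.0   # assumed number of docs containing the word
idf = math.log(docs_total, 10) - math.log(docs_with_word, 10)
# equivalent: math.log(docs_total / docs_with_word, 10)
print idf              # ~0.69897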