import os
import sys
import math

import numpy as np

# PorterStemmer (the classic Porter stemmer exposing stem(word, start, end))
# and WordFilter are provided elsewhere in this project; their import lines
# are not shown in this file, e.g. (hypothetical module names):
#   from porterstemmer import PorterStemmer
#   from wordfilter import WordFilter
# POS, the candidate POS-tag set used by FeatureGenerator.mkdocfeatures, is
# likewise assumed to be defined at module level,
# e.g. POS = set(['NN', 'NNS', 'JJ']).


class FeatureGenerator(object):
    '''This class supports the supervised keyphrase extraction
       method: logistic regression extracts keywords, which are
       then merged into keyphrases.
       Features: because POS tags are used to merge keywords into
       keyphrases, POS tags are not used as features here. The
       features are:
            1.TF;
            2.DF;
            3.POSITION.
       Note: all features should be normalized.
    '''
    def __init__(self, dir_text_file, text_suffix,
                 dir_feature_file, feature_suffix,
                 dir_manualkp_file, manualkp_suffix,
                 words_map_file):
        self.dir_text_file = dir_text_file
        self.text_suffix = text_suffix
        self.dir_feature_file = dir_feature_file
        self.feature_suffix = feature_suffix
        self.dir_manualkp_file = dir_manualkp_file
        self.manualkp_suffix = manualkp_suffix
        self.words_map_file = words_map_file

        # number of features per word: tf, df, position
        self.featurenum = 3

        self.stemmer = PorterStemmer()
        self.wordmap = self.loadmap()
        self.doclist = self.getdoclist(self.substr_text, dir_text_file)
        self.wordFilter = WordFilter()
        self.doctext, self.doctags, self.worddf = self.loaddoctext()
        self.manualkeywords = self.getmanuallabels()
        corpfeature = self.mkdocfeatures()
        self.corpfeature = self.normalization(corpfeature, method='minmax')

    def loaddoctext(self):
        doctext = {}
        doctags = {}
        worddf = {}
        for doc in self.doclist:
            docwordlist = []
            doctaglist = []
            docwordset = set()
            for line in open(doc):
                clean_words, tags = self.wordFilter.filterwords(
                        line.strip('\r\n '))
                docwordlist += clean_words
                doctaglist += tags
            doctext[doc] = docwordlist
            doctags[doc] = doctaglist
            # accumulate document frequency, counting each word id
            # at most once per document
            for word in set(docwordlist):
                if word in self.wordmap:
                    wordid = self.wordmap[word]
                    if wordid not in docwordset:
                        if wordid in worddf:
                            worddf[wordid] += 1
                        else:
                            worddf[wordid] = 1
                        docwordset.add(wordid)
        return doctext, doctags, worddf
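
    # The structures returned by loaddoctext look like (illustrative
    # values only):
    #   doctext = {'data/Train/doc1.txt': ['comput', 'scienc', ...]}
    #   doctags = {'data/Train/doc1.txt': ['NN', 'NN', ...]}
    #   worddf  = {'17': 4, '42': 1, ...}   # word id -> document frequency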

    def getmanuallabels(self):
        '''segment each keyphrase into keywords
        '''
        manualkeywords = {}
        doclist = self.getdoclist(self.substr_manualkp,
                self.dir_manualkp_file)
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            keywordset = set()
            for line in open(doc):
                for word in line.strip('\r\n ').split(' '):
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, len(word) - 1)
                    keywordset.add(word)
            # map stems to word ids; assumes every manual keyword
            # appears in the words map
            keywordset = map(lambda x: self.wordmap[x], keywordset)
            manualkeywords[docname] = set(keywordset)
        return manualkeywords

    def getdoclist(self, substr_func, dir_file):
        doclist = []
        for subdir in dir_file:
            candi_files = os.listdir(subdir)
            # keep only files whose names contain the expected suffix
            candi_files = filter(substr_func, candi_files)
            # note: each subdir is expected to end with '/'
            candi_files = map(lambda x: subdir + x, candi_files)
            doclist = doclist + candi_files
        return doclist
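
    # For example (hypothetical layout): with dir_file = ['data/Train/'] and
    # text_suffix = 'txt', getdoclist returns paths such as
    # ['data/Train/doc1.txt', 'data/Train/doc2.txt', ...].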

    def substr_text(self, candi_file):
        return self.text_suffix in candi_file

    def substr_manualkp(self, candi_file):
        return self.manualkp_suffix in candi_file

    def loadmap(self):
        wordsmap = {}
        for line in open(self.words_map_file):
            biparts = line.strip('\n').split(' ')
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap
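
    # The words map file is assumed to hold one mapping per line in the
    # form "<word> <word id>", e.g.:
    #   comput 17
    #   scienc 42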

    def mkdocfeatures(self):
        '''Feature format:
            1.tf; 2.df; 3.position
        '''
        corpfeature = {}
        for key in self.doctext.keys():
            docfeature = {}
            doctext = self.doctext[key]
            doctags = self.doctags[key]
            for i, word in enumerate(doctext):
                # POS is the module-level candidate tag set (see the note
                # at the top of this file)
                if word in self.wordmap and doctags[i] in POS:
                    if self.wordmap[word] not in docfeature:
                        # [tf, df, position of first occurrence]
                        docfeature[self.wordmap[word]] = \
                            [1, self.worddf[self.wordmap[word]], i]
                    else:
                        docfeature[self.wordmap[word]][0] += 1
            corpfeature[key] = docfeature
        return corpfeature
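
    # mkdocfeatures returns raw (unnormalized) features, e.g.
    # (illustrative values):
    #   corpfeature = {'data/Train/doc1.txt': {'17': [3, 4, 0], ...}}
    # where '17' is a word id and [3, 4, 0] is [tf, df, first position].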

    def outputdocfeatures(self):
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            manuallabelkey = dockey.split('/')[-1].split('.')[0]
            # pick the output directory that matches the data split
            if dockey.find('Train') != -1:
                dir_feature_file = self.dir_feature_file[0]
            elif dockey.find('Validation') != -1:
                dir_feature_file = self.dir_feature_file[1]
            elif dockey.find('Test') != -1:
                dir_feature_file = self.dir_feature_file[2]
            else:
                dir_feature_file = self.dir_feature_file[0]
            output_feature_file = dir_feature_file \
                    + manuallabelkey + '.' + self.feature_suffix
            wfd = open(output_feature_file, 'w')
            for word in docfeature.keys():
                label = 1 if word in self.manualkeywords[manuallabelkey] else 0
                wfd.write('%s %d %f %f %f\n' % (word, label,
                    docfeature[word][0], docfeature[word][1],
                    docfeature[word][2]))
            wfd.close()
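
    # Each output line has the format:
    #   <word id> <label> <tf> <df> <position>
    # where label is 1 if the word is a manual keyword and 0 otherwise,
    # e.g. "17 1 0.500000 0.333333 0.000000" (illustrative values).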

    def normalization(self, features, method):
        ''' feature normalization:
            1.document frequency features are normalized
            over the whole corpus;
            2.word frequency and position are normalized
            within their corresponding document.
        '''
        if method == 'minmax':
            features = self.minmax(features)
        elif method == 'norm':
            features = self.norm(features)
        elif method == 'original':
            pass
        else:
            print 'Invalid method choice'
            sys.exit(1)
        return features

    def minmax(self, features):
        std_feature = {}
        mindf = min(map(lambda x: x[1], self.worddf.items()))
        maxdf = max(map(lambda x: x[1], self.worddf.items()))
        for dockey in features.keys():
            docfeature = features[dockey]
            mintf = min(map(lambda x: x[1][0], docfeature.items()))
            maxtf = max(map(lambda x: x[1][0], docfeature.items()))
            minpos = min(map(lambda x: x[1][2], docfeature.items()))
            maxpos = max(map(lambda x: x[1][2], docfeature.items()))
            for word in docfeature.keys():
                # guard every denominator so constant features do not
                # cause a division by zero
                docfeature[word][0] = 1.0 * (docfeature[word][0] - mintf) \
                        / max(1, maxtf - mintf)
                docfeature[word][1] = 1.0 * (docfeature[word][1] - mindf) \
                        / max(1, maxdf - mindf)
                docfeature[word][2] = 1.0 * (docfeature[word][2] - minpos) \
                        / max(1, maxpos - minpos)
            std_feature[dockey] = docfeature
        return std_feature
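
    # Worked example: if the tf values in one document are 2, 5 and 8,
    # min-max scaling maps them to (2-2)/6 = 0.0, (5-2)/6 = 0.5 and
    # (8-2)/6 = 1.0.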

    def norm(self, features):
        dfvalues = np.array(map(lambda x: x[1], self.worddf.items()))
        meandf = np.mean(dfvalues)
        # guard against zero standard deviation (constant features)
        stddf = np.std(dfvalues) or 1.0
        for dockey in features.keys():
            docfeature = features[dockey]
            tfvalues = np.array(map(lambda x: x[1][0], docfeature.items()))
            meantf = np.mean(tfvalues)
            stdtf = np.std(tfvalues) or 1.0
            posvalues = np.array(map(lambda x: x[1][2], docfeature.items()))
            meanpos = np.mean(posvalues)
            stdpos = np.std(posvalues) or 1.0
            for word in docfeature.keys():
                docfeature[word][0] = (docfeature[word][0] - meantf) / stdtf
                docfeature[word][1] = (docfeature[word][1] - meandf) / stddf
                docfeature[word][2] = (docfeature[word][2] - meanpos) / stdpos
            features[dockey] = docfeature
        return features
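

# A minimal, hypothetical usage sketch for FeatureGenerator (all paths and
# suffixes below are illustrative; the real driver script is not part of
# this file):
#
#   gen = FeatureGenerator(['data/Train/', 'data/Test/'], 'txt',
#                          ['feature/Train/', 'feature/Validation/',
#                           'feature/Test/'], 'feature',
#                          ['label/Train/', 'label/Test/'], 'kp',
#                          'wordsmap.txt')
#   gen.outputdocfeatures()
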
class NodeFeatureGenerator(object):
    """This class provides a framework that makes it easy to add and
       remove features. The features listed below are common features
       used in keyword/keyphrase extraction tasks.

       Features: the first three features listed below are always generated.
       1.TF;
       2.DF;
       3.POSITION;
       4.TF-IDF;
       5.lenText;
       6.POS-Tagging;

       Note: all features should be normalized to 0-1.
    """

    def __init__(
        self,
        posset,
        feature_bittag,
        dir_text,
        text_suffix,
        dir_feature,
        feature_suffix,
        dir_manualkp,
        manualkp_suffix,
        wordsmap_file,
        featurenum,
        nonposfeature,
    ):
        # store the constructor arguments
        self.posset = posset
        self.feature_bittag = feature_bittag
        self.dir_text = dir_text
        self.text_suffix = text_suffix
        self.dir_feature = dir_feature
        self.feature_suffix = feature_suffix
        self.dir_manualkp = dir_manualkp
        self.manualkp_suffix = manualkp_suffix
        self.wordsmap_file = wordsmap_file
        self.featurenum = featurenum
        self.nonposfeature = nonposfeature

        # internal state derived from the inputs
        self.poslist = list(posset)
        self.featuretype = sum(feature_bittag)
        self.stemmer = PorterStemmer()
        self.wordsmap = self.loadmap()
        self.wordfilter = WordFilter()
        self.doctext, self.doctags, self.worddf = self.loaddoctext()
        self.manualkeywords = self.getmanualkeywords()

    def generatefeature(self, norm_method):
        # build raw features, normalize them, then write them to disk
        self.corpfeature = self.mkfeatures()
        self.normalization(norm_method)
        self.outputdocfeatures()
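
    # feature_bittag is indexed by feature number: judging from the checks
    # in mkfeatures and outputdocfeatures, positions 3, 4 and 5 toggle the
    # TF-IDF, word-length and POS one-hot features respectively, while TF,
    # DF and POSITION are always generated.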

    def loadmap(self):
        wordsmap = {}
        for line in open(self.wordsmap_file):
            biparts = line.strip("\n").split(" ")
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def loaddoctext(self):
        doctext = {}
        doctags = {}
        worddf = {}
        doclist = self.getdoclist(self.substr_text, self.dir_text)
        for doc in doclist:
            docwordlist = []
            doctaglist = []
            for line in open(doc):
                clean_words, tags = self.wordfilter.filterwords(line.strip("\r\n "))
                docwordlist += clean_words
                doctaglist += tags
            doctext[doc] = docwordlist
            doctags[doc] = doctaglist

            # compute document frequency for words; mirror the per-document
            # guard used in FeatureGenerator.loaddoctext in case several
            # words map to the same word id
            docwordset = set()
            for word in set(docwordlist):
                if word in self.wordsmap:
                    wordid = self.wordsmap[word]
                    if wordid not in docwordset:
                        if wordid in worddf:
                            worddf[wordid] += 1
                        else:
                            worddf[wordid] = 1
                        docwordset.add(wordid)
        return doctext, doctags, worddf

    def getdoclist(self, substr_func, dir_file):
        doclist = []
        for subdir in dir_file:
            candi_files = os.listdir(subdir)
            # keep only files whose names contain the expected suffix
            candi_files = filter(substr_func, candi_files)
            # note: each subdir is expected to end with '/'
            candi_files = map(lambda x: subdir + x, candi_files)
            doclist = doclist + candi_files
        return doclist

    def substr_text(self, candi_file):
        return self.text_suffix in candi_file

    def substr_manualkp(self, candi_file):
        return self.manualkp_suffix in candi_file

    def getmanualkeywords(self):
        """segment each keyphrase into keywords
        """
        manualkeywords = {}
        doclist = self.getdoclist(self.substr_manualkp, self.dir_manualkp)
        for doc in doclist:
            docname = doc.split("/")[-1].split(".")[0]
            keywordset = set([])
            for line in open(doc):
                for word in line.strip("\r\n ").split(" "):
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, len(word) - 1)
                    keywordset.add(word)
            # map stems to word ids; assumes every manual keyword
            # appears in the words map
            keywordset = map(lambda x: self.wordsmap[x], keywordset)
            manualkeywords[docname] = set(keywordset)
        return manualkeywords

    def mkfeatures(self):
        corpfeature = {}
        for dockey in self.doctext.keys():
            docfeature = {}
            doctext = self.doctext[dockey]
            doctags = self.doctags[dockey]
            for i, word in enumerate(doctext):
                if word in self.wordsmap and doctags[i] in self.posset:
                    if self.wordsmap[word] not in docfeature:
                        # feature vector layout:
                        # [tf, df, position, tf-idf, length, POS one-hots]
                        wordfeature = [0 for j in range(self.featurenum)]
                        wordfeature[0] = 1
                        wordfeature[1] = self.worddf[self.wordsmap[word]]
                        wordfeature[2] = i
                        # word's length feature
                        if self.feature_bittag[4] == 1:
                            wordfeature[4] = len(word)
                        # word's POS feature as a one-hot indicator;
                        # doctags[i] passed the posset membership check
                        # above, so list.index cannot fail here
                        if self.feature_bittag[5] == 1:
                            posidx = self.poslist.index(doctags[i])
                            wordfeature[self.nonposfeature + posidx] = 1
                        docfeature[self.wordsmap[word]] = wordfeature
                    else:
                        docfeature[self.wordsmap[word]][0] += 1
            # word's tf-idf feature
            if self.feature_bittag[3] == 1:
                for wordkey in docfeature.keys():
                    docfeature[wordkey][3] = self.comptfidf(
                        docfeature[wordkey][0], docfeature[wordkey][1],
                        len(self.doctext)
                    )
            corpfeature[dockey] = docfeature
        return corpfeature

    def comptfidf(self, tf, df, docnum):
        # tf * idf with a natural-log idf; df >= 1 for every observed word
        return tf * math.log((docnum * 1.0) / df)
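
    # Worked example: with tf = 3, df = 2 and docnum = 20, comptfidf
    # returns 3 * log(20 / 2) = 3 * ln(10), which is about 6.91.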

    def normalization(self, method):
        """ feature normalization:
            1.document frequency features are normalized
            over the whole corpus;
            2.word frequency and position are normalized
            within their corresponding document.
        """
        if method == "minmax":
            self.minmax()
        elif method == "norm":
            self.norm()
        elif method == "original":
            pass
        else:
            print "Invalid method choice"
            sys.exit(1)

    def minmax(self):
        std_feature = {}
        # words' df feature
        mindf = min(map(lambda x: x[1], self.worddf.items()))
        maxdf = max(map(lambda x: x[1], self.worddf.items()))
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            mintf = min(map(lambda x: x[1][0], docfeature.items()))
            maxtf = max(map(lambda x: x[1][0], docfeature.items()))
            minpos = min(map(lambda x: x[1][2], docfeature.items()))
            maxpos = max(map(lambda x: x[1][2], docfeature.items()))
            if self.feature_bittag[3] == 1:
                mintfidf = min(map(lambda x: x[1][3], docfeature.items()))
                maxtfidf = max(map(lambda x: x[1][3], docfeature.items()))
            if self.feature_bittag[4] == 1:
                minlength = min(map(lambda x: x[1][4], docfeature.items()))
                maxlength = max(map(lambda x: x[1][4], docfeature.items()))
            for word in docfeature.keys():
                # guard denominators so constant features do not divide by zero
                docfeature[word][0] = 1.0 * (docfeature[word][0] - mintf) / max(1, maxtf - mintf)
                docfeature[word][1] = 1.0 * (docfeature[word][1] - mindf) / max(1, maxdf - mindf)
                docfeature[word][2] = 1.0 * (docfeature[word][2] - minpos) / max(1, maxpos - minpos)
                if self.feature_bittag[3] == 1:
                    docfeature[word][3] = (docfeature[word][3] - mintfidf) / ((maxtfidf - mintfidf) or 1.0)
                if self.feature_bittag[4] == 1:
                    docfeature[word][4] = 1.0 * (docfeature[word][4] - minlength) / max(1, maxlength - minlength)

            std_feature[dockey] = docfeature
        self.corpfeature = std_feature

    def norm(self):
        # z-score normalization is not implemented for this class;
        # FeatureGenerator.norm above shows the corresponding logic
        pass

    def outputdocfeatures(self):
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            manuallabelkey = dockey.split("/")[-1].split(".")[0]
            if dockey.find("Train") != -1:
                dir_feature = self.dir_feature[0]
            elif dockey.find("Validation") != -1:
                dir_feature = self.dir_feature[1]
            elif dockey.find("Test") != -1:
                dir_feature = self.dir_feature[2]
            else:
                dir_feature = self.dir_feature[0]
            output_feature_file = dir_feature + manuallabelkey + "." + self.feature_suffix
            wfd = open(output_feature_file, "w")
            for word in docfeature.keys():
                label = 1 if word in self.manualkeywords[manuallabelkey] else 0
                wfd.write("%s %d %f %f %f" % (word, label, docfeature[word][0], docfeature[word][1], docfeature[word][2]))
                if self.feature_bittag[3] == 1:
                    wfd.write(" %f" % docfeature[word][3])
                if self.feature_bittag[4] == 1:
                    wfd.write(" %f" % docfeature[word][4])
                if self.feature_bittag[5] == 1:
                    # one-hot POS indicators occupy the trailing slots,
                    # starting at nonposfeature (see mkfeatures)
                    for i in range(self.nonposfeature, self.featurenum):
                        wfd.write(" %d" % docfeature[word][i])
                wfd.write("\n")
            wfd.close()
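

if __name__ == "__main__":
    # A minimal, hypothetical driver for NodeFeatureGenerator. All paths,
    # suffixes and tag sets below are illustrative assumptions; the real
    # configuration is not part of this file.
    posset = set(["NN", "NNS", "JJ"])
    generator = NodeFeatureGenerator(
        posset,
        [1, 1, 1, 1, 1, 1],             # enable all six feature groups
        ["data/Train/", "data/Test/"],  # text directories (trailing '/')
        "txt",
        ["feature/Train/", "feature/Validation/", "feature/Test/"],
        "feature",
        ["label/Train/", "label/Test/"],
        "kp",
        "wordsmap.txt",
        5 + len(posset),                # 5 non-POS slots + one per POS tag
        5,                              # nonposfeature: offset of POS slots
    )
    generator.generatefeature("minmax")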