import os
import sys
import math

import numpy as np

# PorterStemmer (the classic Porter stemmer exposing stem(word, start, end))
# and WordFilter are provided elsewhere in this project; their import lines
# are not shown in this file, e.g. (hypothetical module names):
#   from porterstemmer import PorterStemmer
#   from wordfilter import WordFilter
# POS, the candidate POS-tag set used by FeatureGenerator.mkdocfeatures, is
# likewise assumed to be defined at module level,
# e.g. POS = set(['NN', 'NNS', 'JJ']).


class FeatureGenerator(object):
    '''This class supports the supervised keyphrase extraction
       method: logistic regression extracts keywords, which are
       then merged into keyphrases.
       Features: because POS tags are used to merge keywords into
       keyphrases, POS tags are not used as features here. The
       features are:
            1.TF;
            2.DF;
            3.POSITION.
       Note: all features should be normalized.
    '''
    def __init__(self, dir_text_file, text_suffix,
                 dir_feature_file, feature_suffix,
                 dir_manualkp_file, manualkp_suffix,
                 words_map_file):
        self.dir_text_file = dir_text_file
        self.text_suffix = text_suffix
        self.dir_feature_file = dir_feature_file
        self.feature_suffix = feature_suffix
        self.dir_manualkp_file = dir_manualkp_file
        self.manualkp_suffix = manualkp_suffix
        self.words_map_file = words_map_file

        # number of features per word: tf, df, position
        self.featurenum = 3

        self.stemmer = PorterStemmer()
        self.wordmap = self.loadmap()
        self.doclist = self.getdoclist(self.substr_text, dir_text_file)
        self.wordFilter = WordFilter()
        self.doctext, self.doctags, self.worddf = self.loaddoctext()
        self.manualkeywords = self.getmanuallabels()
        corpfeature = self.mkdocfeatures()
        self.corpfeature = self.normalization(corpfeature, method='minmax')

    def loaddoctext(self):
        doctext = {}
        doctags = {}
        worddf = {}
        for doc in self.doclist:
            docwordlist = []
            doctaglist = []
            docwordset = set()
            for line in open(doc):
                clean_words, tags = self.wordFilter.filterwords(
                        line.strip('\r\n '))
                docwordlist += clean_words
                doctaglist += tags
            doctext[doc] = docwordlist
            doctags[doc] = doctaglist
            # accumulate document frequency, counting each word id
            # at most once per document
            for word in set(docwordlist):
                if word in self.wordmap:
                    wordid = self.wordmap[word]
                    if wordid not in docwordset:
                        if wordid in worddf:
                            worddf[wordid] += 1
                        else:
                            worddf[wordid] = 1
                        docwordset.add(wordid)
        return doctext, doctags, worddf
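
    # The structures returned by loaddoctext look like (illustrative
    # values only):
    #   doctext = {'data/Train/doc1.txt': ['comput', 'scienc', ...]}
    #   doctags = {'data/Train/doc1.txt': ['NN', 'NN', ...]}
    #   worddf  = {'17': 4, '42': 1, ...}   # word id -> document frequency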

    def getmanuallabels(self):
        '''segment each keyphrase into keywords
        '''
        manualkeywords = {}
        doclist = self.getdoclist(self.substr_manualkp,
                self.dir_manualkp_file)
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            keywordset = set()
            for line in open(doc):
                for word in line.strip('\r\n ').split(' '):
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, len(word) - 1)
                    keywordset.add(word)
            # map stems to word ids; assumes every manual keyword
            # appears in the words map
            keywordset = map(lambda x: self.wordmap[x], keywordset)
            manualkeywords[docname] = set(keywordset)
        return manualkeywords

    def getdoclist(self, substr_func, dir_file):
        doclist = []
        for subdir in dir_file:
            candi_files = os.listdir(subdir)
            # keep only files whose names contain the expected suffix
            candi_files = filter(substr_func, candi_files)
            # note: each subdir is expected to end with '/'
            candi_files = map(lambda x: subdir + x, candi_files)
            doclist = doclist + candi_files
        return doclist
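
    # For example (hypothetical layout): with dir_file = ['data/Train/'] and
    # text_suffix = 'txt', getdoclist returns paths such as
    # ['data/Train/doc1.txt', 'data/Train/doc2.txt', ...].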

    def substr_text(self, candi_file):
        return self.text_suffix in candi_file

    def substr_manualkp(self, candi_file):
        return self.manualkp_suffix in candi_file

    def loadmap(self):
        wordsmap = {}
        for line in open(self.words_map_file):
            biparts = line.strip('\n').split(' ')
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap
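
    # The words map file is assumed to hold one mapping per line in the
    # form "<word> <word id>", e.g.:
    #   comput 17
    #   scienc 42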

    def mkdocfeatures(self):
        '''Feature format:
            1.tf; 2.df; 3.position
        '''
        corpfeature = {}
        for key in self.doctext.keys():
            docfeature = {}
            doctext = self.doctext[key]
            doctags = self.doctags[key]
            for i, word in enumerate(doctext):
                # POS is the module-level candidate tag set (see the note
                # at the top of this file)
                if word in self.wordmap and doctags[i] in POS:
                    if self.wordmap[word] not in docfeature:
                        # [tf, df, position of first occurrence]
                        docfeature[self.wordmap[word]] = \
                            [1, self.worddf[self.wordmap[word]], i]
                    else:
                        docfeature[self.wordmap[word]][0] += 1
            corpfeature[key] = docfeature
        return corpfeature
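
    # mkdocfeatures returns raw (unnormalized) features, e.g.
    # (illustrative values):
    #   corpfeature = {'data/Train/doc1.txt': {'17': [3, 4, 0], ...}}
    # where '17' is a word id and [3, 4, 0] is [tf, df, first position].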

    def outputdocfeatures(self):
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            manuallabelkey = dockey.split('/')[-1].split('.')[0]
            # pick the output directory that matches the data split
            if dockey.find('Train') != -1:
                dir_feature_file = self.dir_feature_file[0]
            elif dockey.find('Validation') != -1:
                dir_feature_file = self.dir_feature_file[1]
            elif dockey.find('Test') != -1:
                dir_feature_file = self.dir_feature_file[2]
            else:
                dir_feature_file = self.dir_feature_file[0]
            output_feature_file = dir_feature_file \
                    + manuallabelkey + '.' + self.feature_suffix
            wfd = open(output_feature_file, 'w')
            for word in docfeature.keys():
                label = 1 if word in self.manualkeywords[manuallabelkey] else 0
                wfd.write('%s %d %f %f %f\n' % (word, label,
                    docfeature[word][0], docfeature[word][1],
                    docfeature[word][2]))
            wfd.close()
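
    # Each output line has the format:
    #   <word id> <label> <tf> <df> <position>
    # where label is 1 if the word is a manual keyword and 0 otherwise,
    # e.g. "17 1 0.500000 0.333333 0.000000" (illustrative values).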

    def normalization(self, features, method):
        ''' feature normalization:
            1.document frequency features are normalized
            over the whole corpus;
            2.word frequency and position are normalized
            within their corresponding document.
        '''
        if method == 'minmax':
            features = self.minmax(features)
        elif method == 'norm':
            features = self.norm(features)
        elif method == 'original':
            pass
        else:
            print 'Invalid method choice'
            sys.exit(1)
        return features

    def minmax(self, features):
        std_feature = {}
        mindf = min(map(lambda x: x[1], self.worddf.items()))
        maxdf = max(map(lambda x: x[1], self.worddf.items()))
        for dockey in features.keys():
            docfeature = features[dockey]
            mintf = min(map(lambda x: x[1][0], docfeature.items()))
            maxtf = max(map(lambda x: x[1][0], docfeature.items()))
            minpos = min(map(lambda x: x[1][2], docfeature.items()))
            maxpos = max(map(lambda x: x[1][2], docfeature.items()))
            for word in docfeature.keys():
                # guard every denominator so constant features do not
                # cause a division by zero
                docfeature[word][0] = 1.0 * (docfeature[word][0] - mintf) \
                        / max(1, maxtf - mintf)
                docfeature[word][1] = 1.0 * (docfeature[word][1] - mindf) \
                        / max(1, maxdf - mindf)
                docfeature[word][2] = 1.0 * (docfeature[word][2] - minpos) \
                        / max(1, maxpos - minpos)
            std_feature[dockey] = docfeature
        return std_feature
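
    # Worked example: if the tf values in one document are 2, 5 and 8,
    # min-max scaling maps them to (2-2)/6 = 0.0, (5-2)/6 = 0.5 and
    # (8-2)/6 = 1.0.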

    def norm(self, features):
        dfvalues = np.array(map(lambda x: x[1], self.worddf.items()))
        meandf = np.mean(dfvalues)
        # guard against zero standard deviation (constant features)
        stddf = np.std(dfvalues) or 1.0
        for dockey in features.keys():
            docfeature = features[dockey]
            tfvalues = np.array(map(lambda x: x[1][0], docfeature.items()))
            meantf = np.mean(tfvalues)
            stdtf = np.std(tfvalues) or 1.0
            posvalues = np.array(map(lambda x: x[1][2], docfeature.items()))
            meanpos = np.mean(posvalues)
            stdpos = np.std(posvalues) or 1.0
            for word in docfeature.keys():
                docfeature[word][0] = (docfeature[word][0] - meantf) / stdtf
                docfeature[word][1] = (docfeature[word][1] - meandf) / stddf
                docfeature[word][2] = (docfeature[word][2] - meanpos) / stdpos
            features[dockey] = docfeature
        return features
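

# A minimal, hypothetical usage sketch for FeatureGenerator (all paths and
# suffixes below are illustrative; the real driver script is not part of
# this file):
#
#   gen = FeatureGenerator(['data/Train/', 'data/Test/'], 'txt',
#                          ['feature/Train/', 'feature/Validation/',
#                           'feature/Test/'], 'feature',
#                          ['label/Train/', 'label/Test/'], 'kp',
#                          'wordsmap.txt')
#   gen.outputdocfeatures()
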
class NodeFeatureGenerator(object):
    """This class provides a framework that makes it easy to add and
       remove features. The features listed below are common features
       used in keyword/keyphrase extraction tasks.

       Features: the first three features listed below are always generated.
       1.TF;
       2.DF;
       3.POSITION;
       4.TF-IDF;
       5.lenText;
       6.POS-Tagging;

       Note: all features should be normalized to 0-1.
    """

    def __init__(
        self,
        posset,
        feature_bittag,
        dir_text,
        text_suffix,
        dir_feature,
        feature_suffix,
        dir_manualkp,
        manualkp_suffix,
        wordsmap_file,
        featurenum,
        nonposfeature,
    ):
        # store the constructor arguments
        self.posset = posset
        self.feature_bittag = feature_bittag
        self.dir_text = dir_text
        self.text_suffix = text_suffix
        self.dir_feature = dir_feature
        self.feature_suffix = feature_suffix
        self.dir_manualkp = dir_manualkp
        self.manualkp_suffix = manualkp_suffix
        self.wordsmap_file = wordsmap_file
        self.featurenum = featurenum
        self.nonposfeature = nonposfeature

        # internal state derived from the inputs
        self.poslist = list(posset)
        self.featuretype = sum(feature_bittag)
        self.stemmer = PorterStemmer()
        self.wordsmap = self.loadmap()
        self.wordfilter = WordFilter()
        self.doctext, self.doctags, self.worddf = self.loaddoctext()
        self.manualkeywords = self.getmanualkeywords()

    def generatefeature(self, norm_method):
        # build raw features, normalize them, then write them to disk
        self.corpfeature = self.mkfeatures()
        self.normalization(norm_method)
        self.outputdocfeatures()
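
    # feature_bittag is indexed by feature number: judging from the checks
    # in mkfeatures and outputdocfeatures, positions 3, 4 and 5 toggle the
    # TF-IDF, word-length and POS one-hot features respectively, while TF,
    # DF and POSITION are always generated.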

    def loadmap(self):
        wordsmap = {}
        for line in open(self.wordsmap_file):
            biparts = line.strip("\n").split(" ")
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def loaddoctext(self):
        doctext = {}
        doctags = {}
        worddf = {}
        doclist = self.getdoclist(self.substr_text, self.dir_text)
        for doc in doclist:
            docwordlist = []
            doctaglist = []
            for line in open(doc):
                clean_words, tags = self.wordfilter.filterwords(line.strip("\r\n "))
                docwordlist += clean_words
                doctaglist += tags
            doctext[doc] = docwordlist
            doctags[doc] = doctaglist

            # compute document frequency for words; mirror the per-document
            # guard used in FeatureGenerator.loaddoctext in case several
            # words map to the same word id
            docwordset = set()
            for word in set(docwordlist):
                if word in self.wordsmap:
                    wordid = self.wordsmap[word]
                    if wordid not in docwordset:
                        if wordid in worddf:
                            worddf[wordid] += 1
                        else:
                            worddf[wordid] = 1
                        docwordset.add(wordid)
        return doctext, doctags, worddf

    def getdoclist(self, substr_func, dir_file):
        doclist = []
        for subdir in dir_file:
            candi_files = os.listdir(subdir)
            # keep only files whose names contain the expected suffix
            candi_files = filter(substr_func, candi_files)
            # note: each subdir is expected to end with '/'
            candi_files = map(lambda x: subdir + x, candi_files)
            doclist = doclist + candi_files
        return doclist

    def substr_text(self, candi_file):
        return self.text_suffix in candi_file

    def substr_manualkp(self, candi_file):
        return self.manualkp_suffix in candi_file

    def getmanualkeywords(self):
        """segment each keyphrase into keywords
        """
        manualkeywords = {}
        doclist = self.getdoclist(self.substr_manualkp, self.dir_manualkp)
        for doc in doclist:
            docname = doc.split("/")[-1].split(".")[0]
            keywordset = set([])
            for line in open(doc):
                for word in line.strip("\r\n ").split(" "):
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, len(word) - 1)
                    keywordset.add(word)
            # map stems to word ids; assumes every manual keyword
            # appears in the words map
            keywordset = map(lambda x: self.wordsmap[x], keywordset)
            manualkeywords[docname] = set(keywordset)
        return manualkeywords

    def mkfeatures(self):
        corpfeature = {}
        for dockey in self.doctext.keys():
            docfeature = {}
            doctext = self.doctext[dockey]
            doctags = self.doctags[dockey]
            for i, word in enumerate(doctext):
                if word in self.wordsmap and doctags[i] in self.posset:
                    if self.wordsmap[word] not in docfeature:
                        # feature vector layout:
                        # [tf, df, position, tf-idf, length, POS one-hots]
                        wordfeature = [0 for j in range(self.featurenum)]
                        wordfeature[0] = 1
                        wordfeature[1] = self.worddf[self.wordsmap[word]]
                        wordfeature[2] = i
                        # word's length feature
                        if self.feature_bittag[4] == 1:
                            wordfeature[4] = len(word)
                        # word's POS feature as a one-hot indicator;
                        # doctags[i] passed the posset membership check
                        # above, so list.index cannot fail here
                        if self.feature_bittag[5] == 1:
                            posidx = self.poslist.index(doctags[i])
                            wordfeature[self.nonposfeature + posidx] = 1
                        docfeature[self.wordsmap[word]] = wordfeature
                    else:
                        docfeature[self.wordsmap[word]][0] += 1
            # word's tf-idf feature
            if self.feature_bittag[3] == 1:
                for wordkey in docfeature.keys():
                    docfeature[wordkey][3] = self.comptfidf(
                        docfeature[wordkey][0], docfeature[wordkey][1],
                        len(self.doctext)
                    )
            corpfeature[dockey] = docfeature
        return corpfeature

    def comptfidf(self, tf, df, docnum):
        # tf * idf with a natural-log idf; df >= 1 for every observed word
        return tf * math.log((docnum * 1.0) / df)
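
    # Worked example: with tf = 3, df = 2 and docnum = 20, comptfidf
    # returns 3 * log(20 / 2) = 3 * ln(10), which is about 6.91.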

    def normalization(self, method):
        """ feature normalization:
            1.document frequency features are normalized
            over the whole corpus;
            2.word frequency and position are normalized
            within their corresponding document.
        """
        if method == "minmax":
            self.minmax()
        elif method == "norm":
            self.norm()
        elif method == "original":
            pass
        else:
            print "Invalid method choice"
            sys.exit(1)

    def minmax(self):
        std_feature = {}
        # words' df feature
        mindf = min(map(lambda x: x[1], self.worddf.items()))
        maxdf = max(map(lambda x: x[1], self.worddf.items()))
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            mintf = min(map(lambda x: x[1][0], docfeature.items()))
            maxtf = max(map(lambda x: x[1][0], docfeature.items()))
            minpos = min(map(lambda x: x[1][2], docfeature.items()))
            maxpos = max(map(lambda x: x[1][2], docfeature.items()))
            if self.feature_bittag[3] == 1:
                mintfidf = min(map(lambda x: x[1][3], docfeature.items()))
                maxtfidf = max(map(lambda x: x[1][3], docfeature.items()))
            if self.feature_bittag[4] == 1:
                minlength = min(map(lambda x: x[1][4], docfeature.items()))
                maxlength = max(map(lambda x: x[1][4], docfeature.items()))
            for word in docfeature.keys():
                # guard denominators so constant features do not divide by zero
                docfeature[word][0] = 1.0 * (docfeature[word][0] - mintf) / max(1, maxtf - mintf)
                docfeature[word][1] = 1.0 * (docfeature[word][1] - mindf) / max(1, maxdf - mindf)
                docfeature[word][2] = 1.0 * (docfeature[word][2] - minpos) / max(1, maxpos - minpos)
                if self.feature_bittag[3] == 1:
                    docfeature[word][3] = (docfeature[word][3] - mintfidf) / ((maxtfidf - mintfidf) or 1.0)
                if self.feature_bittag[4] == 1:
                    docfeature[word][4] = 1.0 * (docfeature[word][4] - minlength) / max(1, maxlength - minlength)

            std_feature[dockey] = docfeature
        self.corpfeature = std_feature

    def norm(self):
        # z-score normalization is not implemented for this class;
        # FeatureGenerator.norm above shows the corresponding logic
        pass

    def outputdocfeatures(self):
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            manuallabelkey = dockey.split("/")[-1].split(".")[0]
            if dockey.find("Train") != -1:
                dir_feature = self.dir_feature[0]
            elif dockey.find("Validation") != -1:
                dir_feature = self.dir_feature[1]
            elif dockey.find("Test") != -1:
                dir_feature = self.dir_feature[2]
            else:
                dir_feature = self.dir_feature[0]
            output_feature_file = dir_feature + manuallabelkey + "." + self.feature_suffix
            wfd = open(output_feature_file, "w")
            for word in docfeature.keys():
                label = 1 if word in self.manualkeywords[manuallabelkey] else 0
                wfd.write("%s %d %f %f %f" % (word, label, docfeature[word][0], docfeature[word][1], docfeature[word][2]))
                if self.feature_bittag[3] == 1:
                    wfd.write(" %f" % docfeature[word][3])
                if self.feature_bittag[4] == 1:
                    wfd.write(" %f" % docfeature[word][4])
                if self.feature_bittag[5] == 1:
                    # one-hot POS indicators occupy the trailing slots,
                    # starting at nonposfeature (see mkfeatures)
                    for i in range(self.nonposfeature, self.featurenum):
                        wfd.write(" %d" % docfeature[word][i])
                wfd.write("\n")
            wfd.close()
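

if __name__ == "__main__":
    # A minimal, hypothetical driver for NodeFeatureGenerator. All paths,
    # suffixes and tag sets below are illustrative assumptions; the real
    # configuration is not part of this file.
    posset = set(["NN", "NNS", "JJ"])
    generator = NodeFeatureGenerator(
        posset,
        [1, 1, 1, 1, 1, 1],             # enable all six feature groups
        ["data/Train/", "data/Test/"],  # text directories (trailing '/')
        "txt",
        ["feature/Train/", "feature/Validation/", "feature/Test/"],
        "feature",
        ["label/Train/", "label/Test/"],
        "kp",
        "wordsmap.txt",
        5 + len(posset),                # 5 non-POS slots + one per POS tag
        5,                              # nonposfeature: offset of POS slots
    )
    generator.generatefeature("minmax")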