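# ---------------------------------------------------------------------------
# A minimal sketch of the module-level setup the snippets below appear to rely
# on.  Only the names come from the code itself; the import paths and constant
# values are illustrative assumptions, not the project's actual definitions.
# ---------------------------------------------------------------------------
import os
import re
import sys
import math
import numpy as np
from porter import PorterStemmer      # assumed: the classic porter.py with stem(p, i, j)
# from stopwords import Stopwords     # assumed project module exposing is_stopword()
# from wordfilter import WordFilter   # assumed project module exposing filterwords()
REG_EXP = r"^[a-zA-Z][a-zA-Z-]*$"                    # token pattern (assumption)
POS = set(["NN", "NNS", "NNP", "NNPS", "JJ"])        # kept POS tags (assumption)
TOPIC_POS = POS                                      # POS tags kept for LDA input (assumption)
CMD = "./src/lda -est "                              # GibbsLDA++ command prefix (assumption)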
def main(test=None):
    # files setting
    semwiki_file = "../cleanData/Hulth2003/words_pair.simvalue.dict"
    clean_semwiki_file = "../cleanData/Hulth2003/words_pair.wiki.clean.dict"
    if test:
        semwiki_file = "../unitTest/dataPreprocess/cleanWikiminerResult/input.txt"
        clean_semwiki_file = "../unitTest/dataPreprocess/cleanWikiminerResult/output.txt"
    # stemmer
    stemmer = PorterStemmer()

    raw_numitems = 0
    new_numitems = 0
    tri_dict = {}
    for line in open(semwiki_file):
        raw_numitems += 1
        triparts = line.strip('\n').split(' ')

        # to lower case
        triparts[0] = triparts[0].lower()
        triparts[1] = triparts[1].lower()
        # stemming
        triparts[0] = stemmer.stem(triparts[0], 0, len(triparts[0])-1)
        triparts[1] = stemmer.stem(triparts[1], 0, len(triparts[1])-1)

        synth_key = triparts[0] + '_' + triparts[1]
        if synth_key not in tri_dict:
            tri_dict[synth_key] = triparts[2]
            new_numitems += 1

    wfd = open(clean_semwiki_file, 'w')
    for key in tri_dict.keys():
        triparts = key.split('_')
        triparts.append(tri_dict[key])
        wfd.write("%s %s %s\n" % (triparts[0], triparts[1], triparts[2]))
    wfd.close()

    print "Raw number of items: %d. After cleaning, number of items: %d.\n"\
            % (raw_numitems, new_numitems)
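# Entry-point guard, assuming this cleaning script is run directly; pass any
# truthy argument to main() to switch to the unit-test input/output paths.
if __name__ == "__main__":
    main()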
class GetTopicDis:
    ''' Call ldaGibbs++, the open-source software, to get the topic
        distributions for words and documents. Before calling it, we
        first convert the corpus into the input format it requires.
    '''
    def __init__(self, src_corp_dir, lda_doc, docsuffix):
        self.src_corp_dir = src_corp_dir
        self.lda_doc = lda_doc
        self.docsuffix = docsuffix
        self.stopWords = Stopwords()
        self.stemmer = PorterStemmer()
        self.pattern = re.compile(REG_EXP)

        self.doclist = []
        self.getdoclist()
        self.genmodel_inputfile()

    def getdoclist(self):
        for subdir in self.src_corp_dir:
            candi_files = os.listdir(subdir)
            # filter files by suffix if one is given
            if self.docsuffix:
                candi_files = filter(self.substr, candi_files)
            candi_files = map(lambda x: subdir+x, candi_files)
            self.doclist = self.doclist + candi_files

    # generate the input file in the format required by ldaGibbs++
    def genmodel_inputfile(self):
        wfd = open(self.lda_doc, "w")
        wfd.write("%d\n" % len(self.doclist))
        for doc in self.doclist:
            docwordlist = []
            for line in open(doc):
                line = line.strip("\n\r ")
                docwordlist = docwordlist + self.filterwords(line)
            docwordlist = sorted(docwordlist, reverse=False)
            wfd.write("%s\n" % " ".join(docwordlist))
        wfd.close()

    # call Gibbs LDA
    def call_lda(self, topicnum, maxiter):
        # lda model parameter setting
        alpha = 1.0*topicnum / 50
        cmd = CMD + "-alpha " + str(alpha) + " -ntopics " + \
                str(topicnum) + " -niters " + str(maxiter) + \
                " -dfile " + self.lda_doc
        print "Calling Gibbs LDA"
        #os.popen(cmd)
        os.system(cmd)
        print "Finishing calling"

    # filter words based on stopwords list and character rule
    def filterwords(self, textline):
        save_words = []
        words = textline.split(" ")
        for word in words:
            if word == " ":
                continue
            biparts = word.split("_")
            # words processing (stopword, stemming, lower)
            # ============================================
            biparts[0] = biparts[0].lower()
            biparts[0] = self.stemmer.stem(biparts[0], 0, \
                    len(biparts[0])-1)
            if len(biparts) == 2 and biparts[1] in TOPIC_POS:
                if not self.stopWords.is_stopword(biparts[0])\
                        and self.pattern.match(biparts[0]):
                    save_words.append(biparts[0])
            # ============================================
        return save_words

    def substr(self, candi_file):
        if candi_file.find(self.docsuffix) != -1:
            return True
        return False
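# A minimal usage sketch for GetTopicDis.  The corpus directory, output file
# and document suffix below are illustrative assumptions, not project paths;
# call_lda() additionally relies on the module-level CMD command prefix.
topic_dis = GetTopicDis(["../cleanData/Hulth2003/Training/"],
        "../cleanData/Hulth2003/lda_input.dat", "abstr")
topic_dis.call_lda(50, 1000)    # 50 topics, 1000 Gibbs sampling iterations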
class FeatureGenerator():
    '''This class serves the supervised keyphrase
       extraction method, i.e., logistic regression
       for keyword extraction, after which the keywords
       are merged into keyphrases.
       Features: since we use POS tags to merge keywords
       when generating keyphrases, we do not use POS tags as features,
            1.TF;
            2.DF;
            3.POSITION.
       Note: all features should be normalized.
    '''
    def __init__(self, dir_text_file, text_suffix,\
            dir_feature_file, feature_suffix,\
            dir_manualkp_file, manualkp_suffix,\
            words_map_file):
        self.dir_text_file = dir_text_file
        self.text_suffix = text_suffix
        self.dir_feature_file = dir_feature_file
        self.feature_suffix = feature_suffix
        self.dir_manualkp_file = dir_manualkp_file
        self.manualkp_suffix = manualkp_suffix
        self.words_map_file = words_map_file

        self.featurenum = 3

        self.stemmer = PorterStemmer()
        self.wordmap = self.loadmap()
        self.doclist = self.getdoclist(self.substr_text,\
                dir_text_file)
        self.wordFilter = WordFilter()
        self.doctext,self.doctags,self.worddf=self.loaddoctext()
        self.manualkeywords = self.getmanuallabels()
        corpfeature = self.mkdocfeatures()
        self.corpfeature = self.normalization(corpfeature,\
                method='minmax')

    def loaddoctext(self):
        doctext = {}
        doctags = {}
        worddf = {}
        for doc in self.doclist:
            docwordlist = []
            doctaglist = []
            docwordset = set([])
            for line in open(doc):
                clean_words,tags=self.wordFilter.filterwords(\
                        line.strip('\r\n '))
                docwordlist += clean_words
                doctaglist += tags
            #docname = doc.split('/')[-1].split('.')[0]
            doctext[doc] = docwordlist
            doctags[doc] = doctaglist
            for word in set(docwordlist):
                if word in self.wordmap:
                    wordid = self.wordmap[word]
                    if wordid not in docwordset:
                        if wordid in worddf:
                            worddf[wordid] += 1
                        else:
                            worddf[wordid] = 1
                        docwordset.add(wordid)
        return doctext, doctags, worddf

    def getmanuallabels(self):
        '''segment each keyphrase into keywords
        '''
        manualkeywords = {}
        doclist = self.getdoclist(self.substr_manualkp,\
                self.dir_manualkp_file)
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            keywordset = set([])
            for line in open(doc):
                for word in line.strip('\r\n ').split(' '):
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, \
                        len(word)-1)
                    if word not in keywordset:
                        keywordset.add(word)
            keywordset = map(lambda x:self.wordmap[x],\
                    keywordset)
            manualkeywords[docname] = set(keywordset)
        return manualkeywords

    def getdoclist(self, substr_func, dir_file):
        doclist = []
        for subdir in dir_file:
            candi_files = os.listdir(subdir)
            # filter files by suffix
            candi_files = filter(substr_func, candi_files)
            candi_files = map(lambda x: subdir+x, candi_files)
            doclist = doclist + candi_files
        return doclist

    def substr_text(self, candi_file):
        if candi_file.find(self.text_suffix) != -1:
            return True
        return False

    def substr_manualkp(self, candi_file):
        if candi_file.find(self.manualkp_suffix) != -1:
            return True
        return False

    def loadmap(self):
        wordsmap = {}
        for line in open(self.words_map_file):
            biparts = line.strip('\n').split(' ')
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def mkdocfeatures(self):
        '''Feature format:
            1.tf; 2.df; 3.position
        '''
        corpfeature = {}
        for key in self.doctext.keys():
            docfeature = {}
            doctext = self.doctext[key]
            doctags = self.doctags[key]
            for i, word in enumerate(doctext):
                if word in self.wordmap and doctags[i] in POS:
                    if self.wordmap[word] not in docfeature:
                        docfeature[self.wordmap[word]] = \
                            [1, self.worddf[self.wordmap[word]], i]
                    else:
                        docfeature[self.wordmap[word]][0] += 1
            corpfeature[key] = docfeature
        return corpfeature

    def outputdocfeatures(self):
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            manuallabelkey = dockey.split('/')[-1].split('.')[0]
            if dockey.find('Train') != -1:
                dir_feature_file = self.dir_feature_file[0]
            elif dockey.find('Validation') != -1:
                dir_feature_file = self.dir_feature_file[1]
            elif dockey.find('Test') != -1:
                dir_feature_file = self.dir_feature_file[2]
            else:
                dir_feature_file = self.dir_feature_file[0]
            output_feature_file = dir_feature_file\
                    + manuallabelkey + '.' + self.feature_suffix
            wfd = open(output_feature_file, 'w')
            for word in docfeature.keys():
                #print word
                #print self.manualkeywords[manuallabelkey]
                #raw_input()
                if word in self.manualkeywords[manuallabelkey]:
                    wfd.write('%s 1 %f %f %f\n' % (word,\
                        docfeature[word][0], docfeature[word][1],\
                        docfeature[word][2]))
                else:
                    wfd.write('%s 0 %f %f %f\n' % (word,\
                        docfeature[word][0], docfeature[word][1],\
                        docfeature[word][2]))
            wfd.close()

    def normalization(self, features, method):
        ''' feature normalization:
            1.document frequency features are normalized
            in the whole corpus;
            2.word frequency and position are normalized
            in their corresponding document.
        '''
        if method == 'minmax':
            features = self.minmax(features)
        elif method == 'norm':
            features = self.norm(features)
        elif method == 'original':
            pass
        else:
            print 'Invalid method choice'
            sys.exit(0)
        return features

    def minmax(self, features):
        std_feature = {}
        mindf = min(map(lambda x:x[1], self.worddf.items()))
        maxdf = max(map(lambda x:x[1], self.worddf.items()))
        #maxdf = 3
        #mindf = 1
        for dockey in features.keys():
            docfeature = features[dockey]
            mintf = min(map(lambda x:x[1][0],\
                    docfeature.items()))
            maxtf = max(map(lambda x:x[1][0],\
                    docfeature.items()))
            minpos = min(map(lambda x:x[1][2],\
                    docfeature.items()))
            maxpos = max(map(lambda x:x[1][2],\
                    docfeature.items()))
            for word in docfeature.keys():
                docfeature[word][0] = 1.0*(docfeature[word][0]-mintf)\
                        /max(1, (maxtf-mintf))
                docfeature[word][1] = 1.0*(docfeature[word][1]-mindf)\
                        /(maxdf-mindf)
                docfeature[word][2] = 1.0*(docfeature[word][2]-minpos)\
                        /(maxpos-minpos)
            std_feature[dockey] = docfeature
        return std_feature

    def norm(self, features):
        meandf = np.mean(np.array(map(lambda x:x[1],\
                self.worddf.items())))
        stddf = np.std(np.array(map(lambda x:x[1],\
                self.worddf.items())))
        for dockey in features.keys():
            docfeature = features[dockey]
            meantf = np.mean(np.array(map(lambda x:x[1][0],\
                    docfeature.items())))
            stdtf = np.std(np.array(map(lambda x:x[1][0],\
                    docfeature.items())))
            meanpos = np.mean(np.array(map(lambda x:x[1][2],\
                    docfeature.items())))
            stdpos = np.std(np.array(map(lambda x:x[1][2],\
                    docfeature.items())))
            for word in docfeature.keys():
                docfeature[word][0] = (docfeature[word][0]-meantf)\
                        / stdtf
                docfeature[word][1] = (docfeature[word][1]-meandf)\
                        / stddf
                docfeature[word][2] = (docfeature[word][2]-meanpos)\
                        / stdpos
            features[dockey] = docfeature
        return features
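# A minimal usage sketch for FeatureGenerator.  All paths and suffixes are
# illustrative assumptions; dir_feature_file expects the Train/Validation/Test
# output directories in that order (see outputdocfeatures).
fg = FeatureGenerator(
        ["../cleanData/Hulth2003/Training/"],          # tagged-text directories (assumed)
        "abstr",                                       # text-file suffix (assumed)
        ["../features/Train/", "../features/Validation/", "../features/Test/"],
        "fea",                                         # feature-file suffix (assumed)
        ["../cleanData/Hulth2003/Training/"],          # manual-keyphrase directories (assumed)
        "uncontr",                                     # manual-keyphrase suffix (assumed)
        "../cleanData/Hulth2003/wordsmap.dict")        # word map file (assumed)
fg.outputdocfeatures()    # features were built and min-max normalized in __init__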
class EvalResult:
    '''This class is used to evaluate the results of keyword extraction.
       Currently, I adopt Precision@K and F-score for evaluation.
       In the keyword extraction task, we only consider words that occur
       in the text.
    '''
    def __init__(self, wordsmap_file, dir_keywords, dir_result, topk, \
            keywords_suffix, result_suffix, completekwnum_suffix):
        self.wordsmap_file   = wordsmap_file
        self.dir_keywords    = dir_keywords
        self.dir_result      = dir_result
        self.keywords_suffix = keywords_suffix
        self.result_suffix   = result_suffix
        self.topk            = topk
        self.completekwnum_suffix = completekwnum_suffix

        self.stemmer  = PorterStemmer()
        self.wordsmap = self.loadmap()
        self.manualkeywords  = self.getmanualkeywords()
        self.extractkeywords = self.getextractkeywords()

    def getdoclist(self, dir_file, filter_func):
        doclist = []
        candi_files = os.listdir(dir_file)
        # filter files by suffix
        candi_files = filter(filter_func, candi_files)
        candi_files = map(lambda x: dir_file+x, candi_files)
        doclist = doclist + candi_files
        return doclist

    def loadmap(self):
        wordsmap = {}
        for line in open(self.wordsmap_file):
            biparts = line.strip('\n').split(' ')
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def substr_keywords(self, candi_file):
        if candi_file.find(self.keywords_suffix) != -1:
            return True
        return False
    def substr_results(self, candi_file):
        if candi_file.find(self.result_suffix) != -1:
            return True
        return False
    def substr_kwnum(self, candi_file):
        if candi_file.find(self.completekwnum_suffix) != -1:
            return True
        return False

    def getmanualkeywords(self):
        '''Get the manual keywords that occur in the text.
        '''
        doclist = self.getdoclist(self.dir_keywords, self.substr_keywords)
        manuallabels = {}
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            keywords_set = set([])
            for line in open(doc):
                words = line.strip('\r\n ').split(' ')
                for word in words:
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, len(word)-1)
                    if word not in self.wordsmap:
                        print 'Invalid keyword'
                        sys.exit(0)
                    word_id = self.wordsmap[word]
                    keywords_set.add(word_id)
            manuallabels[docname] = keywords_set
        return manuallabels

    def getextractkeywords(self):
        '''Get the extracted keywords according to the ranking value
           of candidate keywords.
        '''
        doclist = self.getdoclist(self.dir_result, self.substr_results)
        extractkeywords = {}
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            tempwords = []
            for line in open(doc):
                biparts = line.strip('\n\r ').split(' ')
                tempwords.append([biparts[0], float(biparts[1])])
            tempwords = sorted(tempwords, key=lambda x: x[1],reverse=True)
            sortedwords = map(lambda x: x[0], tempwords)
            dockeywords = set(sortedwords[0:self.topk])
            extractkeywords[docname] = dockeywords
        return extractkeywords

    def evaluation(self, eval_choice=None):
        if eval_choice == 'F-score':
            precision, recall, f_score = self.eval_fscore()
            print 'Precision: %f, Recall: %f, F-score: %f\n'\
                    % (precision, recall, f_score)
        elif eval_choice == 'Bpref':
            bpref = self.eval_bpref()
            print 'Bpref: %f\n' % bpref
        elif eval_choice == 'MRR':
            mrr = self.eval_mrr()
            print 'Mrr: %f\n' % mrr
        else:
            precision, recall, f_score = self.eval_fscore()
            bpref = self.eval_bpref()
            mrr = self.eval_mrr()
            print 'Precision: %f, Recall: %f, F-score: %f\n'\
                    % (precision, recall, f_score)
            print 'Bpref: %f\n' % bpref
            print 'Mrr: %f\n' % mrr

    # Using F-score to evaluate
    def eval_fscore(self):
        total_accnum = 0
        ext_accnum = 0
        ext_num = 0
        for doc in self.manualkeywords.keys():
            manual_kw  = self.manualkeywords[doc]
            ext_kw = self.extractkeywords[doc]
            total_accnum += len(manual_kw)
            ext_num += len(ext_kw)
            ext_accnum += len(manual_kw&ext_kw)
        print 'Manually annotated keyphrases: %d' % total_accnum
        print 'Extracted total keyphrases: %d' % ext_num
        print 'Extracted accurate keyphrases: %d' % ext_accnum
        precision = ext_accnum*1.0/ext_num
        recall = ext_accnum*1.0/total_accnum
        fscore = 2*precision*recall/(precision+recall)
        return precision, recall, fscore

    # Using MRR to evaluate
    def eval_mrr(self):
        pass

    # Using Bpref to evaluate
    def eval_bpref(self):
        pass
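# A minimal usage sketch for this keyword-level EvalResult.  The word map,
# directories, suffixes and topk value are illustrative assumptions.
ev = EvalResult("../cleanData/Hulth2003/wordsmap.dict",   # word map (assumed path)
        "../cleanData/Hulth2003/Test/",                   # manual keyword directory (assumed)
        "../results/Test/",                               # ranking result directory (assumed)
        10,                                               # evaluate the top-10 ranked words
        "uncontr", "rank", "kwnum")                       # file suffixes (assumed)
ev.evaluation("F-score")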
class GraphGenerator:
    '''This class mainly constructs the graph for the basic random
       walk method.
       Method: graph construction based on a sliding-window
               strategy.
       Output: two files --> 1.a graph file for each document;
                             2.a map from in-document word ids to the corpus word map.
       Currently, we implement two graph representations, i.e., a dense
       representation and a sparse representation.
    '''
    def __init__(self, dir_text_file, dir_output_file,\
            windowsize, docsuffix, wordmap_file):
        self.mapfile_suffix = ".idmap"
        self.graphfile_suffix = ".graph"
        self.stopWords = Stopwords()
        self.stemmer = PorterStemmer()
        self.pattern = re.compile(REG_EXP)

        self.dir_text_file = dir_text_file
        self.dir_output_file = dir_output_file
        self.windowsize = windowsize
        self.docsuffix = docsuffix
        self.wordmap_file = wordmap_file

        self.doclist = []
        self.getdoclist()
        self.corp_wordmap = {}
        self.readwordmap()

    def getdoclist(self):
        for rootdir in self.dir_text_file:
            candi_files = os.listdir(rootdir)
            # filter files by suffix if one is given
            if self.docsuffix:
                candi_files = filter(self.substr, candi_files)
            candi_files = map(lambda x: rootdir+x, candi_files)
            self.doclist = self.doclist + candi_files

    def substr(self, candi_file):
        if candi_file.find(self.docsuffix) != -1:
            return True
        return False

    def readwordmap(self):
        for line in open(self.wordmap_file):
            biparts = line.strip("\r\n ").split(" ")
            self.corp_wordmap[biparts[0]] = int(biparts[1])

    # filter words based on stopwords list and character rule
    def filterwords(self, textline):
        stemmed_words = []
        saved_words = []
        words = textline.split(" ")
        for word in words:
            if word == " ":
                continue
            biparts = word.split("_")
            # words processing (stopword, stemming, lower)
            # ============================================
            biparts[0] = biparts[0].lower()
            biparts[0] = self.stemmer.stem(biparts[0], 0, \
                    len(biparts[0])-1)
            stemmed_words.append(biparts[0])
            if len(biparts) == 2 and biparts[1] in POS:
                if not self.stopWords.is_stopword(biparts[0])\
                        and self.pattern.match(biparts[0]):
                    saved_words.append(biparts[0])
            # ============================================
        return stemmed_words, saved_words

    # graph construction
    # strategy: 1.filter words according to POS tags;
    #           2.construct graph based on sliding window.
    def construct(self):
        for doc in self.doclist:
            doc_prefix = doc.split('/')[-1].split('.')[0]
            output_graphfile = self.dir_output_file + doc_prefix \
                    + self.graphfile_suffix
            output_mapfile = self.dir_output_file + doc_prefix \
                    + self.mapfile_suffix

            cleaned_wordslist = []
            stemmed_wordslist = []
            for line in open(doc):
                line = line.strip('\n\r ')
                stemmed_words, cleaned_words = self.filterwords(line)
                cleaned_wordslist = cleaned_wordslist + cleaned_words
                stemmed_wordslist = stemmed_wordslist + stemmed_words
            wordsmap_indoc = self.numword_indoc(cleaned_wordslist)
            #print wordsmap_indoc
            pairids = self.mapwordspair(wordsmap_indoc)
            pairids = sorted(pairids, key=lambda x: x[0])
            dense_graph = self.slidingwindow(stemmed_wordslist, wordsmap_indoc)
            self.output_graph('dense', dense_graph, output_graphfile)
            self.output_graph('sparse', dense_graph, output_graphfile)
            self.output_map(pairids, output_mapfile)

    def mapwordspair(self, ids_indoc):
        pairids = []
        for key in ids_indoc.keys():
            pairids.append([ids_indoc[key], self.corp_wordmap[key]])
        return pairids

    def slidingwindow(self, stemmed_wordslist, wordsmap_indoc):
        dense_graph = np.array([0.0 for i in range(len(wordsmap_indoc)\
                *len(wordsmap_indoc))])
        dense_graph = dense_graph.reshape(len(wordsmap_indoc), len(wordsmap_indoc))

        for i, word in enumerate(stemmed_wordslist):
            if stemmed_wordslist[i] in wordsmap_indoc:
                sliding_text = stemmed_wordslist[max(0, i-self.windowsize):\
                        min(len(stemmed_wordslist), i+self.windowsize+1)]
                for j in range(len(sliding_text)):
                    if stemmed_wordslist[i] == sliding_text[j]:
                        continue
                    if sliding_text[j] in wordsmap_indoc:
                        dense_graph[wordsmap_indoc[stemmed_wordslist[i]]-1,\
                                wordsmap_indoc[sliding_text[j]]-1] += 1
        return dense_graph

    def numword_indoc(self, wordslist_indoc):
        wordsmap_indoc = {}
        word_id = 1
        for word in wordslist_indoc:
            if word not in wordsmap_indoc:
                wordsmap_indoc[word] = word_id
                word_id += 1
        return wordsmap_indoc

    def output_graph(self, choice, graphdata, graphfile):
        #print graphfile
        if choice == "dense":
            wfd = open(graphfile+'.dense', 'w')
            for i in range(len(graphdata)):
                wfd.write("%s\n"%' '.join(map(lambda x: str(x), graphdata[i])))
        elif choice == "sparse":
            wfd = open(graphfile+'.sparse', 'w')
            for i in range(len(graphdata)):
                for j in range(len(graphdata)):
                    if graphdata[i,j] != 0:
                        wfd.write("%d %d %d\n" % (i, j, graphdata[i, j]))
        wfd.close()

    def output_map(self, mapdata, mapfile):
        #print mapfile
        wfd = open(mapfile, 'w')
        for i in range(len(mapdata)):
            wfd.write("%d\n" % mapdata[i][1])
        wfd.close()
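# A minimal usage sketch for GraphGenerator: build a co-occurrence graph per
# document with a sliding window of 5 words.  The paths, suffix and word map
# below are illustrative assumptions.
gg = GraphGenerator(["../cleanData/Hulth2003/Training/"],  # tagged-text directories (assumed)
        "../graphs/Training/",                             # output directory (assumed)
        5,                                                 # sliding-window size
        "abstr",                                           # document suffix (assumed)
        "../cleanData/Hulth2003/wordsmap.dict")            # corpus word map (assumed)
gg.construct()    # writes <doc>.graph.dense, <doc>.graph.sparse and <doc>.idmap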
class NodeFeatureGenerator:
    """This class provides a framework which is easy for feature addition
       and deletion. The following features are common features that have
       been used in keywords/keyphrase extraction task.

       Features: the first three features listed below must be generated.
       1.TF;
       2.DF;
       3.POSITION;
       4.TF-IDF;
       5.lenText;
       6.POS-Tagging;

       Note: all features should be normalized to 0-1.
    """

    def __init__(
        self,
        posset,
        feature_bittag,
        dir_text,
        text_suffix,
        dir_feature,
        feature_suffix,
        dir_manualkp,
        manualkp_suffix,
        wordsmap_file,
        featurenum,
        nonposfeature,
    ):
        # receiving input
        self.posset = posset
        self.feature_bittag = feature_bittag
        self.dir_text = dir_text
        self.text_suffix = text_suffix
        self.dir_feature = dir_feature
        self.feature_suffix = feature_suffix
        self.dir_manualkp = dir_manualkp
        self.manualkp_suffix = manualkp_suffix
        self.wordsmap_file = wordsmap_file
        self.featurenum = featurenum
        self.nonposfeature = nonposfeature

        # inner variable setting
        self.poslist = list(posset)
        self.featuretype = sum(feature_bittag)
        self.stemmer = PorterStemmer()
        self.wordsmap = self.loadmap()
        self.wordfilter = WordFilter()
        self.doctext, self.doctags, self.worddf = self.loaddoctext()
        self.manualkeywords = self.getmanualkeywords()

    def generatefeature(self, norm_method):
        # running
        self.corpfeature = self.mkfeatures()
        self.normalization(norm_method)
        self.outputdocfeatures()

    def loadmap(self):
        wordsmap = {}
        for line in open(self.wordsmap_file):
            biparts = line.strip("\n").split(" ")
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def loaddoctext(self):
        doctext = {}
        doctags = {}
        worddf = {}
        doclist = self.getdoclist(self.substr_text, self.dir_text)
        for doc in doclist:
            docwordlist = []
            doctaglist = []
            for line in open(doc):
                clean_words, tags = self.wordfilter.filterwords(line.strip("\r\n "))
                docwordlist += clean_words
                doctaglist += tags
            doctext[doc] = docwordlist
            doctags[doc] = doctaglist

            # compute document frequency for words
            for word in set(docwordlist):
                if word in self.wordsmap:
                    wordid = self.wordsmap[word]
                    if wordid in worddf:
                        worddf[wordid] += 1
                    else:
                        worddf[wordid] = 1
        return doctext, doctags, worddf

    def getdoclist(self, substr_func, dir_file):
        doclist = []
        for subdir in dir_file:
            candi_files = os.listdir(subdir)
            # filter files by suffix
            candi_files = filter(substr_func, candi_files)
            candi_files = map(lambda x: subdir + x, candi_files)
            doclist = doclist + candi_files
        return doclist

    def substr_text(self, candi_file):
        if candi_file.find(self.text_suffix) != -1:
            return True
        return False

    def substr_manualkp(self, candi_file):
        if candi_file.find(self.manualkp_suffix) != -1:
            return True
        return False

    def getmanualkeywords(self):
        """segment each keyphrase into keywords
        """
        manualkeywords = {}
        doclist = self.getdoclist(self.substr_manualkp, self.dir_manualkp)
        for doc in doclist:
            docname = doc.split("/")[-1].split(".")[0]
            keywordset = set([])
            for line in open(doc):
                for word in line.strip("\r\n ").split(" "):
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, len(word) - 1)
                    if word not in keywordset:
                        keywordset.add(word)
            keywordset = map(lambda x: self.wordsmap[x], keywordset)
            manualkeywords[docname] = set(keywordset)
        return manualkeywords

    def mkfeatures(self):
        corpfeature = {}
        for dockey in self.doctext.keys():
            docfeature = {}
            doctext = self.doctext[dockey]
            doctags = self.doctags[dockey]
            for i, word in enumerate(doctext):
                if word in self.wordsmap and doctags[i] in self.posset:
                    if self.wordsmap[word] not in docfeature:
                        # class
                        # wordfeature = Feature()
                        # wordfeature.tf  = 1
                        # wordfeature.df = self.worddf[word]
                        # wordfeature.position = i
                        # list
                        wordfeature = [0 for j in range(self.featurenum)]
                        wordfeature[0] = 1
                        wordfeature[1] = self.worddf[self.wordsmap[word]]
                        wordfeature[2] = i
                        # word's length feature
                        if self.feature_bittag[4] == 1:
                            # wordfeature.lentext = len(word)
                            wordfeature[4] = len(word)
                        # word's pos feature
                        if self.feature_bittag[5] == 1:
                            posidx = self.poslist.index(doctags[i])
                            if posidx < 0:
                                print "Invalid pos tags"
                                sys.exit(1)
                            wordfeature[self.nonposfeature + posidx] = 1
                        docfeature[self.wordsmap[word]] = wordfeature
                    else:
                        docfeature[self.wordsmap[word]][0] += 1
            # word's tfidf feature
            if self.feature_bittag[3] == 1:
                for wordkey in docfeature.keys():
                    docfeature[wordkey][3] = self.comptfidf(
                        docfeature[wordkey][0], docfeature[wordkey][1], len(self.doctext.keys())
                    )
            corpfeature[dockey] = docfeature
        return corpfeature

    def comptfidf(self, tf, df, docnum):
        return tf * math.log((docnum * 1.0) / df)

    def normalization(self, method):
        """ feature normalization:
            1.document frequency features are normalized
            in the whole corpus;
            2.word frequency and position are normalized
            in their corresponding document.
        """
        if method == "minmax":
            self.minmax()
        elif method == "norm":
            self.norm()
        elif method == "original":
            pass
        else:
            print "Invalid method choice"
            sys.exit(0)

    def minmax(self):
        std_feature = {}
        # words' df feature
        mindf = min(map(lambda x: x[1], self.worddf.items()))
        maxdf = max(map(lambda x: x[1], self.worddf.items()))
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            mintf = min(map(lambda x: x[1][0], docfeature.items()))
            maxtf = max(map(lambda x: x[1][0], docfeature.items()))
            minpos = min(map(lambda x: x[1][2], docfeature.items()))
            maxpos = max(map(lambda x: x[1][2], docfeature.items()))
            if self.feature_bittag[3] == 1:
                mintfidf = min(map(lambda x: x[1][3], docfeature.items()))
                maxtfidf = max(map(lambda x: x[1][3], docfeature.items()))
            if self.feature_bittag[4] == 1:
                minlength = min(map(lambda x: x[1][4], docfeature.items()))
                maxlength = max(map(lambda x: x[1][4], docfeature.items()))
            for word in docfeature.keys():
                docfeature[word][0] = 1.0 * (docfeature[word][0] - mintf) / max(1, (maxtf - mintf))
                docfeature[word][1] = 1.0 * (docfeature[word][1] - mindf) / (maxdf - mindf)
                docfeature[word][2] = 1.0 * (docfeature[word][2] - minpos) / (maxpos - minpos)
                if self.feature_bittag[3] == 1:
                    docfeature[word][3] = 1.0 * (docfeature[word][3] - mintfidf) / (maxtfidf - mintfidf)
                if self.feature_bittag[4] == 1:
                    docfeature[word][4] = 1.0 * (docfeature[word][4] - minlength) / (maxlength - minlength)

            std_feature[dockey] = docfeature
        self.corpfeature = std_feature

    def norm(self):
        pass

    def outputdocfeatures(self):
        for dockey in self.corpfeature.keys():
            docfeature = self.corpfeature[dockey]
            manuallabelkey = dockey.split("/")[-1].split(".")[0]
            if dockey.find("Train") != -1:
                dir_feature = self.dir_feature[0]
            elif dockey.find("Validation") != -1:
                dir_feature = self.dir_feature[1]
            elif dockey.find("Test") != -1:
                dir_feature = self.dir_feature[2]
            else:
                dir_feature = self.dir_feature[0]
            output_feature_file = dir_feature + manuallabelkey + "." + self.feature_suffix
            wfd = open(output_feature_file, "w")
            for word in docfeature.keys():
                # print word
                # print self.manualkeywords[manuallabelkey]
                # raw_input()
                if word in self.manualkeywords[manuallabelkey]:
                    wfd.write("%s 1 %f %f %f" % (word, docfeature[word][0], docfeature[word][1], docfeature[word][2]))
                else:
                    wfd.write("%s 0 %f %f %f" % (word, docfeature[word][0], docfeature[word][1], docfeature[word][2]))
                if self.feature_bittag[3] == 1:
                    wfd.write(" %f" % docfeature[word][3])
                if self.feature_bittag[4] == 1:
                    wfd.write(" %f" % docfeature[word][4])
                if self.feature_bittag[5] == 1:
                    for i in range(5, self.featurenum):
                        wfd.write(" %d" % docfeature[word][i])
                wfd.write("\n")
            wfd.close()
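# A minimal usage sketch for NodeFeatureGenerator.  With the 6-bit feature tag
# below, the first five features are TF, DF, POSITION, TF-IDF and word length,
# and the POS tag is one-hot encoded after them, so featurenum = 5 + len(posset)
# and nonposfeature = 5.  All paths and suffixes are illustrative assumptions.
posset = ["NN", "NNS", "NNP", "NNPS", "JJ"]                # kept POS tags (assumed)
nfg = NodeFeatureGenerator(posset,
        [1, 1, 1, 1, 1, 1],                                # enable all six feature groups
        ["../cleanData/Hulth2003/Training/"], "abstr",     # text directories and suffix (assumed)
        ["../features/Train/", "../features/Validation/", "../features/Test/"], "fea",
        ["../cleanData/Hulth2003/Training/"], "uncontr",   # manual keyphrases (assumed)
        "../cleanData/Hulth2003/wordsmap.dict",            # word map file (assumed)
        5 + len(posset),                                   # featurenum
        5)                                                 # nonposfeature
nfg.generatefeature("minmax")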
class EvalResult:
    '''This class is responsible for evaluating
       the extraction results.
    '''
    def __init__(self, words_map_file, dir_keywords_file,\
            dir_results_file, dir_wholetext_file,\
            keywords_suffix, results_suffix, wholetext_suffix,\
            kwnum_suffix, topk):
        self.words_map_file = words_map_file
        self.dir_keywords_file = dir_keywords_file
        self.dir_results_file = dir_results_file
        self.dir_wholetext_file = dir_wholetext_file
        self.keywords_suffix = keywords_suffix
        self.results_suffix = results_suffix
        self.wholetext_suffix = wholetext_suffix
        self.kwnum_suffix = kwnum_suffix
        self.topk = topk

        self.stemmer = PorterStemmer()
        self.wordsmap = self.loadmap()
        self.doclist = self.getdoclist(self.dir_wholetext_file,\
                self.substr_wholetext)

        self.corpkeyphrase = {}
        self.getkeyphrase()
        self.manualkwnum = {}
        self.manuallabels = {}
        self.getmanuallabels()

    def getdoclist(self, dir_file, filter_func):
        doclist = []
        candi_files = os.listdir(dir_file)
        # filter files by suffix
        candi_files = filter(filter_func, candi_files)
        candi_files = map(lambda x: dir_file+x, candi_files)
        doclist = doclist + candi_files
        return doclist

    def substr_keywords(self, candi_file):
        if candi_file.find(self.keywords_suffix) != -1:
            return True
        return False
    def substr_results(self, candi_file):
        if candi_file.find(self.results_suffix) != -1:
            return True
        return False
    def substr_wholetext(self, candi_file):
        if candi_file.find(self.wholetext_suffix) != -1:
            return True
        return False
    def substr_kwnum(self, candi_file):
        if candi_file.find(self.kwnum_suffix) != -1:
            return True
        return False

    def loadmap(self):
        wordsmap = {}
        for line in open(self.words_map_file):
            biparts = line.strip('\n').split(' ')
            wordsmap[biparts[0]] = biparts[1]
        return wordsmap

    def getkeyphrase(self):
        for doc in self.doclist:
            results_file = self.dir_results_file \
                    + doc.split('/')[-1].split('.')[0]\
                    + '.' + self.results_suffix
            #print results_file
            #print doc
            docwordlist = []
            docwordtags = []
            for line in open(doc):
                line = line.strip("\n\r ")
                save_words, save_postags = self.filterwords(line)
                docwordlist = docwordlist + save_words
                docwordtags = docwordtags + save_postags

            dockeywords, sortedwords, wordsvalue = \
                    self.getkeywords(results_file)
            #print sortedwords
            kp_num, dockeyphrase = self.mergekeywords(dockeywords,\
                    docwordlist, wordsvalue, docwordtags)
            #print dockeyphrase
            #print dockeyphrase
            #raw_input()
            dockeyphrase = sorted(dockeyphrase.items(), \
                    key=lambda x:x[1], reverse=True)
            dockeyphrase = map(lambda x:x[0], dockeyphrase)
            self.corpkeyphrase[doc.split('/')[-1].split('.')[0]] = \
                    set(dockeyphrase[0:int(10.0*len(dockeyphrase)/10+1)+1])

    def getmanuallabels(self):
        doclist = self.getdoclist(self.dir_keywords_file,\
                self.substr_kwnum)
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            manual_kwnum = int(open(doc).readline().strip('\n'))
            self.manualkwnum[docname] = manual_kwnum

        doclist = self.getdoclist(self.dir_keywords_file,\
                self.substr_keywords)
        for doc in doclist:
            docname = doc.split('/')[-1].split('.')[0]
            keyphrase_set = set([])
            for line in open(doc):
                words_id =[]
                words = line.strip('\r\n ').split(' ')
                for word in words:
                    word = word.lower()
                    word = self.stemmer.stem(word, 0, \
                        len(word)-1)
                    if word not in self.wordsmap:
                        print 'Invalid keyword'
                        sys.exit(0)
                    word_id = self.wordsmap[word]
                    words_id.append(word_id)
                keyphrase = '_'.join(words_id)
                keyphrase_set.add(keyphrase)
            self.manuallabels[docname] = keyphrase_set

    def mergekeywords(self, dockeywords, docwordlist, wordsvalue,\
            docwordtags):
        dockeyphrase = {}
        kp_num = 0
        kp_tag = False
        kp_start = 0
        for i,word in enumerate(docwordlist):
            if not kp_tag:
                if word in dockeywords:
                    kp_start = i
                    kp_tag = True
            else:
                if word not in dockeywords or i == len(docwordlist)-1:
                    kp_end = i if word not in dockeywords else i+1
                    kp_tag = False
                    keywords_segment = docwordlist[kp_start:kp_end]
                    keywords_postags = docwordtags[kp_start:kp_end]
                    keywords_segment = self.postag_verify(keywords_segment,\
                            keywords_postags)
                    if keywords_segment:
                        keyphrase = '_'.join(keywords_segment)
                        keyphrase_val = self.getkpvalue(keywords_segment,\
                                wordsvalue)
                        if keyphrase not in dockeyphrase:
                            dockeyphrase[keyphrase] = keyphrase_val
                            kp_num += 1
        return kp_num, dockeyphrase

    def getkpvalue(self, keywords_segment, wordsvalue):
        kpvalue = 0.0
        for keyword in keywords_segment:
            kpvalue += wordsvalue[keyword]
        return kpvalue

    def getkeywords(self, keywords_file):
        tempwords = []
        for line in open(keywords_file):
            biparts = line.strip('\n\r ').split(' ')
            tempwords.append([biparts[0], float(biparts[1])])
        tempwords = sorted(tempwords, key=lambda x: x[1],\
                reverse=True)
        #print tempwords
        sortedwords = map(lambda x: x[0], tempwords)
        #print sortedwords
        #raw_input()
        wordsvalue = dict(tempwords)
        wordsnum = len(sortedwords)
        keywordsnum = int(1.0*wordsnum)
        dockeywords = set(sortedwords[0:keywordsnum])
        return dockeywords, sortedwords, wordsvalue

    def filterwords(self, textline):
        save_words = []
        save_postags = []
        words = textline.split(" ")
        for word in words:
            biparts = word.split("_")
            if len(biparts) != 2:
                print 'Invalid word occurrence.'
                sys.exit(0)
            # words processing (stemming, lower)
            # ============================================
            biparts[0] = biparts[0].lower()
            biparts[0] = self.stemmer.stem(biparts[0], 0, \
                    len(biparts[0])-1)
            # ============================================
            if biparts[0] not in self.wordsmap:
                save_words.append('-1')
                save_postags.append(biparts[1])
            else:
                save_words.append(self.wordsmap[biparts[0]])
                save_postags.append(biparts[1])
        return save_words, save_postags

    # filter candidate keyphrases with an invalid POS-tag sequence
    def postag_verify(self, save_words, save_postags):
        #if len(save_words) == 1 and save_postags[0] in ADJ_POS:
        #    return None
        return save_words

    '''def postag_verify(self, save_words, save_postags):
        state = 0
        end_idx = -1
        for i, postag in enumerate(save_postags):
            if state == 0:
                if postag in ADJ_POS:
                    continue
                elif postag in NOUN_POS:
                    state = 1
            elif state == 1:
                if postag in ADJ_POS:
                    end_idx = i-1
                elif postag in NOUN_POS:
                    end_idx = i
                    continue
        if end_idx == -1:
            return None
        else:
            return save_words[0:end_idx+1]
'''
    def evaluation(self, eval_choice=None):
        if eval_choice == 'F-score':
            precision, recall, f_score = self.eval_fscore()
            print 'Precision: %f, Recall: %f, F-score: %f\n'\
                    % (precision, recall, f_score)
        elif eval_choice == 'Bpref':
            bpref = self.eval_bpref()
            print 'Bpref: %f\n' % bpref
        elif eval_choice == 'MRR':
            mrr = self.eval_mrr()
            print 'Mrr: %f\n' % mrr
        else:
            precision, recall, f_score = self.eval_fscore()
            bpref = self.eval_bpref()
            mrr = self.eval_mrr()
            print 'Precision: %f, Recall: %f, F-score: %f\n'\
                    % (precision, recall, f_score)
            print 'Bpref: %f\n' % bpref
            print 'Mrr: %f\n' % mrr

    # Using F-score to evaluate
    def eval_fscore(self):
        total_accnum = 0
        ext_accnum = 0
        ext_num = 0
        for doc in self.corpkeyphrase.keys():
            ext_kp = self.corpkeyphrase[doc]
            manual_kp = self.manuallabels[doc]
            #print ext_kp
            #print manual_kp
            #raw_input()
            #total_accnum += len(manual_kp)
            total_accnum += self.manualkwnum[doc]
            ext_num += len(ext_kp)
            ext_accnum += len(manual_kp&ext_kp)
        print 'Manually annotated keyphrases: %d' % total_accnum
        print 'Extracted total keyphrases: %d' % ext_num
        print 'Extracted accurate keyphrases: %d' % ext_accnum
        precision = ext_accnum*1.0/ext_num
        recall = ext_accnum*1.0/total_accnum
        fscore = 2*precision*recall/(precision+recall)
        return precision, recall, fscore

    # Using MRR to evaluate
    def eval_mrr(self):
        pass

    # Using Bpref to evaluate
    def eval_bpref(self):
        pass
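# A minimal usage sketch for this keyphrase-level EvalResult.  The paths,
# suffixes and topk value are illustrative assumptions.
ev = EvalResult("../cleanData/Hulth2003/wordsmap.dict",   # word map (assumed path)
        "../cleanData/Hulth2003/Test/",                   # manual keyphrase directory (assumed)
        "../results/Test/",                               # keyword-ranking directory (assumed)
        "../cleanData/Hulth2003/Test/",                   # tagged whole-text directory (assumed)
        "uncontr", "rank", "abstr", "kwnum",              # file suffixes (assumed)
        10)                                               # topk
ev.evaluation()    # reports F-score, Bpref and MRR (the latter two are stubs)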
class WordmapGenerator:
    '''This class generates the word map for the specified
       corpus. Because the task is keyphrase extraction,
       we need to specify the POS set whose words
       will be saved.
       It also needs to be general so that all keyphrase
       extraction methods can utilize it.
       As we will use POS tags, we construct the word map from
       the cleaned text.
       Note that some keyphrases do not occur in the abstract,
       so we also need to index them in the dictionary.
    '''
    def __init__(self, dir_text_file, output_wordmap_file, \
            pos_sets, reg_exp, docsuffix, kpdocsuffix):
        self.dir_text_file = dir_text_file
        self.output_wordmap_file = output_wordmap_file
        self.pos_sets = pos_sets
        self.docsuffix = docsuffix
        self.kpdocsuffix = kpdocsuffix

        self.stemmer = PorterStemmer()
        self.dict_words = []
        self.pattern = re.compile(reg_exp)
        self.doclist = []
        self.doclist = self.getdoclist(1)

    def getdoclist(self, choice):
        temp_doclist = []
        for rootdir in self.dir_text_file:
            candi_files = os.listdir(rootdir)
            # filter files by suffix if one is given
            if choice == 1:
                candi_files = filter(self.doc_substr, candi_files)
            elif choice == 2:
                candi_files = filter(self.kpdoc_substr, candi_files)
            candi_files = map(lambda x: rootdir+x, candi_files)
            temp_doclist = temp_doclist + candi_files
        return temp_doclist

    def doc_substr(self, candi_file):
        if candi_file.find(self.docsuffix) != -1:
            return True
        return False

    def kpdoc_substr(self, candi_file):
        if candi_file.find(self.kpdocsuffix) != -1:
            return True
        return False

    def genwordmap(self):
        temp_words = set([])
        for doc in self.doclist:
            for line in open(doc):
                words = line.strip("\n\r ").split(' ')
                for word in words:
                    biparts = word.split('_')
                    # words processing
                    # ================
                    biparts[0] = biparts[0].lower()
                    biparts[0] = self.stemmer.stem(biparts[0], 0,\
                            len(biparts[0])-1)
                    if len(biparts) == 2 and biparts[1] in self.pos_sets\
                            and self.pattern.match(biparts[0]):
                        temp_words.add(biparts[0])
                    # ================
        self.doclist = self.getdoclist(2)
        temp_keywords = set([])
        miss_keywords = 0
        for doc in self.doclist:
            for line in open(doc):
                textunits = line.strip('\n\r ').split(" ")
                for textunit in textunits:
                    # words processing
                    # ================
                    textunit = textunit.lower()
                    textunit = self.stemmer.stem(textunit, 0, len(textunit)-1)
                    # ================
                    temp_keywords.add(textunit)
                    if not textunit in temp_words:
                        temp_words.add(textunit)
                        miss_keywords += 1
        self.dict_words = sorted(temp_words)
        print "Number of unique keywords: %d, " % len(temp_keywords)
        print "number of left missing words: %d.\n" % miss_keywords

    def output_wordmap(self):
        wfd = open(self.output_wordmap_file, 'w')
        for i,word in enumerate(self.dict_words):
            wfd.write("%s %d\n" % (word, i+1))
        wfd.close()
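# A minimal usage sketch for WordmapGenerator: build the word map from the
# tagged abstracts plus the manual keyphrase files, then write it out.  The
# directories, suffixes and token pattern are illustrative assumptions.
wg = WordmapGenerator(
        ["../cleanData/Hulth2003/Training/", "../cleanData/Hulth2003/Test/"],
        "../cleanData/Hulth2003/wordsmap.dict",
        set(["NN", "NNS", "NNP", "NNPS", "JJ"]),   # POS tags to keep (assumed)
        r"^[a-zA-Z][a-zA-Z-]*$",                   # token pattern (assumed)
        "abstr",                                   # tagged-text suffix (assumed)
        "uncontr")                                 # keyphrase-file suffix (assumed)
wg.genwordmap()
wg.output_wordmap()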