# NOTE: besides the standard library, this module assumes the SMARTeR project
# helpers are importable from the surrounding package (e.g.
# eval_keywordlda_topiclda_lucene_ranking, eval_ranking_methods, regex_tokenizer,
# lemmatize_tokens, stem_tokens, read_config, load_lda_parameters, load_tm,
# search_li, search_whoosh_index, search_tm, search_tm2, search_tm_topics,
# get_lda_query_td, get_lda_query_td2, get_query_top_topic_idx,
# get_dominant_query_topics, lu_append_nonresp, convert_to_roc_format, ROCData,
# plot_multiple_roc, RELEVANT_DIR_NAME, TOP_K_TOPICS).
import os
import sys


def gen_ranking_rocs(data_dir, output_dir, query_id, num_topics, keywords):
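    """Runs eval_keywordlda_topiclda_lucene_ranking() for one query with three
    keyword normalizations: unnormalized, lemmatized, and stemmed+lemmatized.
    """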
    
    # the directory that contains the training set
    # (TRUE negatives and TRUE positives)
    truth_dir = os.path.join(data_dir, str(query_id))
    
    # Using unnormalized keywords
    project_name = "Q%d-UNW-%dT" % (query_id, num_topics)
    config_file = os.path.join(output_dir, "%s.cfg" % project_name)
    norm_tokens = keywords 
    try: 
        eval_keywordlda_topiclda_lucene_ranking(project_name, 
                                                config_file, 
                                                truth_dir, 
                                                norm_tokens, 
                                                output_dir=output_dir)
    except Exception as e:
        print 'Exception in processing %s: %s' % (project_name, e)
    
    # Using lemmatized keywords  
    project_name = "Q%d-LW-%dT" % (query_id, num_topics)
    config_file = "%s\\%s.cfg" % (output_dir, project_name)
    norm_tokens = ' '.join( lemmatize_tokens( regex_tokenizer(keywords) ) ) 
    try: 
        eval_keywordlda_topiclda_lucene_ranking(project_name, 
                                                config_file, 
                                                truth_dir, 
                                                norm_tokens, 
                                                output_dir=output_dir)
    except Exception as e:
        print 'Exception in processing %s: %s' % (project_name, e)
 
         
    # Using stemmed and lemmatized keywords
    project_name = "Q%d-LSW-%dT" % (query_id, num_topics)
    config_file = os.path.join(output_dir, "%s.cfg" % project_name)
    norm_tokens = ' '.join( stem_tokens( lemmatize_tokens( regex_tokenizer(keywords) ) ) )
    try: 
        eval_keywordlda_topiclda_lucene_ranking(project_name, 
                                                config_file, 
                                                truth_dir, 
                                                norm_tokens, 
                                                output_dir=output_dir)
    except Exception as e:
        print 'Exception in processing %s: %s' % (project_name, e)
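
# Example usage of gen_ranking_rocs (a sketch; the output directory below is
# hypothetical and not taken from the experiments in this file):
# gen_ranking_rocs('F:\\Research\\datasets\\trec2010', 'F:\\Research\\output',
#                  query_id=201, num_topics=30, keywords='pre-pay swap')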


def run_q201_ranking_examples():
    # Stand-alone driver for the TREC 2010 query 201 ranking experiments,
    # with hardcoded local paths.
    query_id = 201
    keywords = 'pre-pay swap'
    # the directory where we keep the training set (TRUE negatives and TRUE positives)
    test_directory = "F:\\Research\\datasets\\trec2010\\201"

    # Unnormalized keywords
    file_prefix = '%d-UNT-ALL' % query_id
    config_file = "F:\\Research\\datasets\\trec2010\\Q201-UNT-30T.cfg"  # configuration file, created using the SMARTeR GUI
    norm_tokens = keywords
#    eval_keywordlda_topiclda_lucene(file_prefix, config_file, test_directory, norm_tokens)
    eval_ranking_methods(file_prefix, config_file, test_directory, norm_tokens)

    # Lemmatized keywords
    file_prefix = '%d-LW-ALL' % query_id
    config_file = "F:\\Research\\datasets\\trec2010\\Q201-LW-30T.cfg"  # configuration file, created using the SMARTeR GUI
    norm_tokens = ' '.join( lemmatize_tokens( regex_tokenizer(keywords) ) )
#    eval_keywordlda_topiclda_lucene(file_prefix, config_file, test_directory, norm_tokens)
    eval_ranking_methods(file_prefix, config_file, test_directory, norm_tokens)

    # Earlier variants, kept for reference:
#    file_prefix = '%d-LST' % query_id
#    config_file = "F:\\Research\\datasets\\trec2010\\Q201-LST-30T.cfg"
#    norm_tokens = ' '.join( stem_tokens( lemmatize_tokens( regex_tokenizer(keywords) ) ) )  # Stemming and Lemmatization
#    eval_keywordlda_topiclda_lucene(file_prefix, config_file, test_directory, norm_tokens)

    # Query 207 setup (incomplete; no evaluation call):
#    file_prefix = '%d-LT' % 207
#    config_file = "F:\\Research\\datasets\\trec2010\\Q207-LT-5T.cfg"
#    test_directory = "F:\\Research\\datasets\\trec2010\\207"


def eval_ranking_varying_topics(query_id, data_dir,
                                keywords,
                                limit=1000,
                                img_extension='.eps'):
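    """Evaluates Lucene, Keyword-LDA, and Topic-LDA rankings for one query
    (lemmatized keywords) while varying the number of LDA topics, and plots
    the resulting ROC curves."""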
    
    tokens = ' '.join( lemmatize_tokens( regex_tokenizer(keywords) ) ) # Lemmatization 
    lucene_query = 'all:(%s)' % tokens # search in all fields 
    print 'Lucene query:', lucene_query
    print 'TM query:', tokens

    truth_dir = "%s%d" % (data_dir, query_id)
    positive_dir = os.path.join(truth_dir, RELEVANT_DIR_NAME) # TRUE positive documents 

    topiclda_rocs_file_name = '%d-LW-Topic-LDA-VaryingTopics-ROCs' % query_id + img_extension
    topiclda_rocs_img_title = 'Q%d (Topic-LDA): Varying # of LDA Topics and Lemmas' % query_id  
    keywordlda_rocs_file_name = '%d-LW-Keyword-LDA-VaryingTopics-ROCs' % query_id + img_extension
    keywordlda_rocs_img_title = 'Q%d (Keyword-LDA): Varying # of LDA Topics and Lemmas' % query_id  
    topics = [5, 10, 15, 20, 30, 40, 50, 60, 70]
    roc_labels = []
    roc_topiclda_list = []
    roc_keywordlda_list = []

    for idx, num_topics in enumerate(topics): 

        print '------------------------------------------------------------------------------------------'
        #---------------------------------------------- Reads the configuration file
        
        config_file = "%sQ%d-LW-%dT.cfg" % (data_dir, query_id, num_topics)  # configuration file, created using the SMARTeR GUI 
        mdl_cfg = read_config(config_file)
        
        # Loads the LDA model 
        (lda_dictionary, lda_mdl, lda_index, 
         lda_file_path_index, lda_theta, 
         lda_beta) = load_lda_parameters(mdl_cfg)
        
        
        #------------ Checks whether the keywords are there in the corpus dictionary
    
        vocabulary = set(lda_dictionary.values())  # precompute for fast membership tests
        valid_tokens = 0
        for token in tokens.split():
            if token.strip() not in vocabulary:
                print token, "is not in the corpus vocabulary."
            else:
                valid_tokens += 1

        if valid_tokens == 0:
            print 'None of the tokens exist in the dictionary. Exiting topic search!'
            sys.exit()
            
        # Gets the query topic distribution from the LDA beta  
        print 'Estimated topic dist. from the LDA beta:'
        query_td2 = get_lda_query_td2(tokens, lda_dictionary, lda_beta)
        dominant_topics_idx2 = get_query_top_topic_idx(query_td2, lda_mdl, TOP_K_TOPICS)
        
        # Gets the query topic distribution from the LDA model 
        print 'Estimated topic dist. from the LDA model:'
        query_td = get_lda_query_td(tokens, lda_dictionary, lda_mdl) 
        dominant_topics_idx = get_query_top_topic_idx(query_td, lda_mdl, TOP_K_TOPICS)
    
        #------------------------------------------------------------- Lucene search
    
        if idx == 0: # the first Lucene ranking is added as a reference 
            print 'Lucene ranking'
            # lu_docs = search_li(lucene_query, limit, mdl_cfg)
            lu_docs = search_whoosh_index(lucene_query, mdl_cfg)
            _, lu_docs_list = lu_append_nonresp(lu_docs, truth_dir)
            lu_res = convert_to_roc_format(lu_docs_list, positive_dir)
            roc_topiclda_list.append(ROCData(lu_res))
            roc_keywordlda_list.append(ROCData(lu_res))
            roc_labels.append('Lucene')
        
        #---------------------------------------------------------------- LDA search
        
#        # Gets the dominant topics from the LDA model 
#        dominant_topics = get_dominant_query_topics(tokens, lda_dictionary, lda_mdl, TOP_K_TOPICS)
#        dominant_topics_idx = [idx for (idx, _) in dominant_topics] # get the topic indices 
        
        
        print 'LDA (w/ keywords) ranking'
        lda_docs = search_tm2(query_td, lda_index, lda_file_path_index, limit)
        lda_res = convert_to_roc_format(lda_docs, positive_dir)
    
        print 'LDA (w/ keywords) method-2 ranking'
        lda_docs2 = search_tm2(query_td2, lda_index, lda_file_path_index, limit)
        lda_res2 = convert_to_roc_format(lda_docs2, positive_dir)
            
        
        print 'LDA (w/ query topics) ranking'
        lda_tts_docs = search_tm_topics(dominant_topics_idx, limit, lda_file_path_index, lda_theta) 
        lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)
        
        print 'LDA (w/ query topics) method-2 ranking'
        lda_tts_docs2 = search_tm_topics(dominant_topics_idx2, limit, lda_file_path_index, lda_theta) 
        lda_tts_res2 = convert_to_roc_format(lda_tts_docs2, positive_dir)
    
        
        roc_topiclda_list.append(ROCData(lda_tts_res))
        roc_keywordlda_list.append(ROCData(lda_res))
        roc_labels.append('%d topics' % num_topics)

        roc_topiclda_list.append(ROCData(lda_tts_res2))
        roc_keywordlda_list.append(ROCData(lda_res2))
        roc_labels.append('%d topics (method-2)' % num_topics)    
        
        print '------------------------------------------------------------------------------------------'    
    
    ## Plot ROC curves  
    
    plot_multiple_roc(roc_topiclda_list, title=topiclda_rocs_img_title, 
                      labels=roc_labels, include_baseline=True, 
                      file_name=topiclda_rocs_file_name)
     
    plot_multiple_roc(roc_keywordlda_list, title=keywordlda_rocs_img_title, 
                      labels=roc_labels, include_baseline=True, 
                      file_name=keywordlda_rocs_file_name)
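
# Example usage (hypothetical path; `data_dir` is the folder holding the
# per-query training sets and the SMARTeR .cfg files):
# eval_ranking_varying_topics(201, 'F:\\Research\\datasets\\trec2010',
#                             'pre-pay swap')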

# Earlier variant of eval_ranking_varying_topics, renamed so it no longer
# silently shadows the newer definition above.
def eval_ranking_varying_topics_old(query_id, dir_path,
                                    keywords,
                                    limit=1000,
                                    img_extension='.eps'):
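    """Evaluates Lucene, Keyword-LDA, and Topic-LDA rankings for one query
    while varying the number of LDA topics, and plots the resulting ROC
    curves."""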
    
    tm_query = ' '.join( lemmatize_tokens( regex_tokenizer(keywords)  ) ) # Lemmatization 
    lucene_query = 'all:(%s)' % tm_query # search in all fields 
    print 'Lucene query:', lucene_query
    print 'TM query:', tm_query

    test_directory = "%s%d" % (dir_path, query_id)
    positive_dir = os.path.join(test_directory, "1") # TRUE positive documents 

    TOP_K_TOPICS = 5  # the number of topics used for Topic-LDA
    topiclda_rocs_file_name = '%d-LT-Topic-LDA-VaryingTopics-ROCs' % query_id + img_extension
    topiclda_rocs_img_title = 'Q%d Topic-LDA with Varying Number of Topics' % query_id  
    keywordlda_rocs_file_name = '%d-LT-Keyword-LDA-VaryingTopics-ROCs' % query_id + img_extension
    keywordlda_rocs_img_title = 'Q%d Keyword-LDA with Varying Number of Topics' % query_id  
    topics = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80]
    roc_labels = []
    roc_topiclda_list = []
    roc_keywordlda_list = []

    for idx, num_topics in enumerate(topics): 

        #---------------------------------------------- Reads the configuration file
        
        config_file = "%sQ%d-LT-%dT.cfg" % (dir_path, query_id, num_topics)  # configuration file, created using the SMARTeR GUI 
        mdl_cfg = read_config(config_file)
        
        # Loads the LDA model 
        lda_dictionary, lda_mdl, _, _ = load_tm(mdl_cfg)
        
        
        #------------ Checks whether the keywords are there in the corpus dictionary
    
        vocabulary = set(lda_dictionary.values())  # precompute for fast membership tests
        valid_tokens = 0
        for token in tm_query.split():
            if token.strip() not in vocabulary:
                print token, "is not in the corpus vocabulary. Hence, this word will be ignored in the topic search."
            else:
                valid_tokens += 1

        if valid_tokens == 0:
            print 'None of the tokens exist in the dictionary. Exiting topic search!'
            sys.exit()
            
            
        #------------------------------------------------------------- Lucene search
    
        if idx == 0: # the first Lucene ranking is added as a reference 
            print 'Lucene ranking'
            lu_docs = search_li(lucene_query, limit, mdl_cfg)
            _, lu_docs_list = lu_append_nonresp(lu_docs, test_directory)
            lu_res = convert_to_roc_format(lu_docs_list, positive_dir)
            roc_topiclda_list.append(ROCData(lu_res))
            roc_keywordlda_list.append(ROCData(lu_res))
            roc_labels.append('Lucene')
        
        #---------------------------------------------------------------- LDA search
        
        # Gets the dominant topics from the LDA model 
        dominant_topics = get_dominant_query_topics(tm_query, lda_dictionary, lda_mdl, TOP_K_TOPICS)
        dominant_topics_idx = [idx for (idx, _) in dominant_topics] # get the topic indices 
        
        
        print 'LDA (w/ keywords) ranking'
        lda_docs = search_tm(tm_query, limit, mdl_cfg)
        lda_res = convert_to_roc_format(lda_docs, positive_dir)
        
        
        print 'LDA (w/ query topics) ranking'
        lda_tts_docs = search_tm_topics(dominant_topics_idx, limit, mdl_cfg) 
        lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)
    
        
        roc_topiclda_list.append(ROCData(lda_tts_res))
        roc_keywordlda_list.append(ROCData(lda_res))
        roc_labels.append('%d topics' % num_topics)
        
    
    ## Plot ROC curves  
    
    plot_multiple_roc(roc_topiclda_list, title=topiclda_rocs_img_title, 
                      labels=roc_labels, include_baseline=True, 
                      file_name=topiclda_rocs_file_name)
     
    plot_multiple_roc(roc_keywordlda_list, title=keywordlda_rocs_img_title, 
                      labels=roc_labels, include_baseline=True, 
                      file_name=keywordlda_rocs_file_name)
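

if __name__ == '__main__':
    # Minimal driver sketch (hypothetical paths, assuming the TREC 2010 layout
    # used throughout this module).
    eval_ranking_varying_topics(201, 'F:\\Research\\datasets\\trec2010',
                                'pre-pay swap')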