def train_kmeans_model(instances, labels, n_clusters=3):
    n_max_features = None
    n_gram_range = (1, 1)

    preprocessor = prep.Preprocessor(lang="tr", stopword=True, more_stopwords=None,
                                     stemming=True, remove_numbers=True,
                                     deasciify=False, remove_punkt=True)
    tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=prep.identity,
                                                  preprocessor=None,
                                                  lowercase=False,
                                                  ngram_range=n_gram_range,
                                                  max_features=n_max_features)
    kmeans = skcluster.KMeans(n_clusters=n_clusters, init='random',
                              max_iter=1000, n_init=10, random_state=42)

    pipeline = skpipeline.Pipeline([('preprocessor', preprocessor),
                                    ('vect', tfidf_vectorizer),
                                    ('normalizer', skprep.Normalizer()),
                                    ('clusterer', kmeans)])

    # fit_transform on a pipeline ending in KMeans yields the
    # (n_instances, n_clusters) matrix of distances to the cluster centers
    data_distances = pipeline.fit_transform(instances)

    return pipeline, data_distances, preprocessor, tfidf_vectorizer, kmeans, instances, labels
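# Illustration only: a minimal, self-contained sketch of the same pipeline shape as
# train_kmeans_model(), built from stock scikit-learn components and a toy corpus
# instead of the project's Preprocessor. The helper name _demo_kmeans_distances and
# the example documents are assumptions made for this sketch.
def _demo_kmeans_distances():
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Normalizer

    docs = ["ucuz bilet", "bilet fiyati", "bagaj hakki", "bagaj kayip", "online check in"]
    toy_pipeline = Pipeline([('vect', TfidfVectorizer()),
                             ('normalizer', Normalizer()),
                             ('clusterer', KMeans(n_clusters=2, init='random',
                                                  n_init=10, random_state=42))])
    # as in train_kmeans_model: the transform of the final KMeans step is the
    # document-to-centroid distance matrix, here of shape (5, 2)
    distances = toy_pipeline.fit_transform(docs)
    return distances.shape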
def _ar_txt_clf_features_pipeline2(
        feature_params_config_dict  # {feature_params: {lang: .., weights: .., prep: {}, keywords: []}} see EMAIL_CONF for an example.
):
    lang = feature_params_config_dict[conf.lang_key]
    feature_weights = feature_params_config_dict[conf.weights_key]
    prep_params = feature_params_config_dict[conf.prep_key]

    # features found in the processed tokens
    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=prep_params[conf.stopword_key],
                                     more_stopwords=prep_params[conf.more_stopwords_key],
                                     spellcheck=prep_params[conf.spellcheck_key],
                                     stemming=prep_params[conf.stemming_key],
                                     remove_numbers=prep_params[conf.remove_numbers_key],
                                     deasciify=prep_params[conf.deasciify_key],
                                     remove_punkt=prep_params[conf.remove_punkt_key],
                                     lowercase=prep_params[conf.lowercase_key])

    tfidfvect = TfidfVectorizer(tokenizer=prep.identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=prep_params[conf.use_idf_key],
                                ngram_range=prep_params[conf.wordngramrange_key],
                                max_features=prep_params[conf.nmaxfeature_key])

    token_weights = dict(tfidfvect=feature_weights["word_tfidf"])
    # dict keys mirror the variable names above so the two stay consistent if renamed
    token_transformers_dict = dict(tfidfvect=tfidfvect)
    token_transformers = list(token_transformers_dict.items())

    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1', skpipeline.FeatureUnion(transformer_list=token_transformers,
                                           transformer_weights=token_weights)),
    ])

    # stylistic features found in the whole raw text
    charngramvect = TfidfVectorizer(analyzer='char_wb',
                                    ngram_range=prep_params[conf.charngramrange_key],
                                    lowercase=False)

    '''
    # BUG: the polyglot and named-entity features are disabled for now
    named_entity_pipe = tbt.get_named_entity_weight_pipeline(lang)
    text_weights = dict(charngramvect=feature_weights["char_tfidf"],   # @TODO hardcoded
                        polpipe1=feature_weights["polyglot_count"],
                        polpipe2=feature_weights["polyglot_value"],
                        named_entity_pipe=feature_weights["named_entity_rate"])
    text_transformers_dict = dict(charngramvect=charngramvect,
                                  polpipe1=polpipe1,
                                  polpipe2=polpipe2,
                                  named_entity_pipe=named_entity_pipe)
    '''
    text_weights = dict(charngramvect=feature_weights["char_tfidf"])   # @TODO hardcoded
    text_transformers_dict = dict(charngramvect=charngramvect)
    text_transformers = list(text_transformers_dict.items())

    textbasedpipe = skpipeline.Pipeline([
        ('union2', skpipeline.FeatureUnion(transformer_list=text_transformers,
                                           transformer_weights=text_weights)),
    ])

    final_transformers_dict = dict(tokenbasedpipe=tokenbasedpipe,
                                   textbasedpipe=textbasedpipe)
    final_transformers = list(final_transformers_dict.items())

    features = skpipeline.FeatureUnion(
        transformer_list=final_transformers,
        # transformer_weights are not necessary as the number of feature groups is small
    )
    return features
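# Illustration only: the essence of the two-branch feature union built above
# (word-level TF-IDF plus char_wb n-grams, combined with transformer_weights),
# reduced to stock scikit-learn calls. The corpus, the weights, and the helper name
# _demo_token_plus_char_features are assumptions, not values from EMAIL_CONF.
def _demo_token_plus_char_features():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import FeatureUnion

    word_vect = TfidfVectorizer(ngram_range=(1, 1))
    char_vect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 3), lowercase=False)
    features = FeatureUnion(
        transformer_list=[('word_tfidf', word_vect), ('char_tfidf', char_vect)],
        # each branch's output block is multiplied by its weight before stacking
        transformer_weights={'word_tfidf': 1.0, 'char_tfidf': 0.5})
    X = features.fit_transform(["kargo gecikti", "fatura hatali geldi", "internet kesildi"])
    return X.shape   # word columns followed by char n-gram columns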
def _tr_sentiment_features_pipeline(lang="tr",
                                    feature_weights={"word_tfidf": 1,
                                                     "polyglot_value": 0,
                                                     "polyglot_count": 0,
                                                     "lexicon_count": 0,
                                                     "char_tfidf": 1},
                                    stopword_choice=True,
                                    more_stopwords_list=None,
                                    spellcheck_choice=False,
                                    stemming_choice=False,
                                    number_choice=False,
                                    deasc_choice=True,
                                    punct_choice=True,
                                    case_choice=True,
                                    word_ngramrange=(1, 2),   # tuple
                                    char_ngramrange=(2, 2),
                                    nmaxfeature=10000,        # int or None
                                    norm="l2",
                                    use_idf=True):

    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=stopword_choice,
                                     more_stopwords=more_stopwords_list,
                                     spellcheck=spellcheck_choice,
                                     stemming=stemming_choice,
                                     remove_numbers=number_choice,
                                     deasciify=deasc_choice,
                                     remove_punkt=punct_choice,
                                     lowercase=case_choice)

    tfidfvect = TfidfVectorizer(tokenizer=prep.identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=use_idf,
                                ngram_range=word_ngramrange,
                                max_features=nmaxfeature)

    polpipe3 = obt.get_lexicon_count_pipeline(tokenizer=prep.identity)

    token_weights = dict(tfidfvect=feature_weights["word_tfidf"],
                         polpipe3=feature_weights["lexicon_count"])
    # dict keys mirror the variable names above so the two stay consistent if renamed
    token_transformers_dict = dict(tfidfvect=tfidfvect,
                                   polpipe3=polpipe3)
    token_transformers = list(token_transformers_dict.items())

    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1', skpipeline.FeatureUnion(transformer_list=token_transformers,
                                           transformer_weights=token_weights)),
    ])

    # features extracted from the whole raw text
    charngramvect = TfidfVectorizer(analyzer='char_wb',
                                    ngram_range=char_ngramrange,
                                    lowercase=False)
    polpipe1 = tbt.get_polylglot_polarity_count_pipe(lang)
    polpipe2 = tbt.get_polylglot_polarity_value_pipe(lang)

    text_weights = dict(charngramvect=feature_weights["char_tfidf"],
                        polpipe1=feature_weights["polyglot_count"],
                        polpipe2=feature_weights["polyglot_value"])
    text_transformers_dict = dict(charngramvect=charngramvect,
                                  polpipe1=polpipe1,
                                  polpipe2=polpipe2)
    text_transformers = list(text_transformers_dict.items())

    textbasedpipe = skpipeline.Pipeline([
        ('union2', skpipeline.FeatureUnion(transformer_list=text_transformers,
                                           transformer_weights=text_weights)),
    ])

    final_transformers_dict = dict(tokenbasedpipe=tokenbasedpipe,
                                   textbasedpipe=textbasedpipe)
    final_transformers = list(final_transformers_dict.items())

    features = skpipeline.FeatureUnion(
        transformer_list=final_transformers,
        # transformer_weights are not necessary as the number of feature groups is small
    )
    return features
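# Illustration only: how a feature union of this shape is typically attached to a
# classifier. The LogisticRegression choice, the toy sentiment data, and the helper
# name _demo_sentiment_features_with_classifier are assumptions for this sketch, not
# defaults used elsewhere in this module.
def _demo_sentiment_features_with_classifier():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import FeatureUnion, Pipeline

    features = FeatureUnion([
        ('word_tfidf', TfidfVectorizer(ngram_range=(1, 2))),
        ('char_tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False)),
    ])
    clf = Pipeline([('features', features),
                    ('classifier', LogisticRegression(max_iter=1000))])
    X = ["harika bir urun", "cok kotu bir deneyim", "gayet memnun kaldim", "berbat bir hizmet"]
    y = ["pos", "neg", "pos", "neg"]
    clf.fit(X, y)
    return clf.predict(["kotu bir urun"])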
def _email_features_pipeline(lang,
                             stopword_choice=True,
                             more_stopwords_list=None,
                             spellcheck_choice=False,
                             stemming_choice=False,
                             number_choice=False,
                             deasc_choice=True,
                             punct_choice=True,
                             case_choice=True,
                             ngramrange=(1, 2),    # tuple
                             nmaxfeature=10000,    # int or None
                             norm="l2",
                             use_idf=True,
                             keywords=[],          # e.g. ["arıza", "pstn"]
                             final_weights=dict(text_based=1, token_based=1)):
    # use a list of (pipeline_name, pipeline) pairs plus a weight per pipeline
    final_weights = dict(final_weights)   # work on a copy so the (default) argument dict is not mutated across calls

    # features found in the processed tokens
    token_features = []
    token_weights = {}

    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=stopword_choice,
                                     more_stopwords=more_stopwords_list,
                                     spellcheck=spellcheck_choice,
                                     stemming=stemming_choice,
                                     remove_numbers=number_choice,
                                     deasciify=deasc_choice,
                                     remove_punkt=punct_choice,
                                     lowercase=case_choice)

    tfidfvect = TfidfVectorizer(tokenizer=prep.identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=use_idf,
                                ngram_range=ngramrange,
                                max_features=nmaxfeature)
    tfidfvect_name = 'word_tfidfvect'
    token_features.append((tfidfvect_name, tfidfvect))
    token_weights[tfidfvect_name] = 1

    # features found in the whole raw text
    text_features = []
    text_weights = {}
    # charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False)

    # keyword presence features
    if keywords:
        for keyword in keywords:
            keywordpipe = txbt.get_keyword_pipeline(keyword)
            feature_name = "has_" + keyword
            text_features.append((feature_name, keywordpipe))
            text_weights[feature_name] = 1

    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1', skpipeline.FeatureUnion(transformer_list=token_features,
                                           transformer_weights=token_weights)),
    ])

    textbasedpipe = skpipeline.Pipeline([
        ('union2', skpipeline.FeatureUnion(transformer_list=text_features,
                                           transformer_weights=text_weights)),
    ])

    # add a feature pipe to final_features only if at least one of its component weights is non-zero
    check_zero_list = lambda x: 1 if sum(x) > 0 else 0   # [0, 0, 0] -> 0 ; [0, 0, 1] -> 1

    final_features_dict = {}

    tkweights = list(token_weights.values())
    if check_zero_list(tkweights) != 0:
        final_features_dict["token_based"] = tokenbasedpipe
    else:
        final_weights["token_based"] = 0

    txweights = list(text_weights.values())
    if check_zero_list(txweights) != 0:
        final_features_dict["text_based"] = textbasedpipe
    else:
        final_weights["text_based"] = 0

    final_features = list(final_features_dict.items())
    fweights = list(final_weights.values())
    if check_zero_list(fweights) == 0 or len(final_features) == 0:
        return None

    features = skpipeline.FeatureUnion(transformer_list=final_features,
                                       transformer_weights=final_weights)
    return features
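# Illustration only: a self-contained stand-in for the keyword presence features wired
# in above. keyword_flag below is a hypothetical substitute for txbt.get_keyword_pipeline
# (whose implementation is not shown here), built with FunctionTransformer; the corpus
# and the keyword are made up for the example.
def _demo_keyword_presence_feature():
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import FeatureUnion
    from sklearn.preprocessing import FunctionTransformer

    def keyword_flag(keyword):
        # one column per keyword: 1.0 if it occurs in the raw text, else 0.0
        return FunctionTransformer(
            lambda docs: np.array([[1.0 if keyword in doc.lower() else 0.0] for doc in docs]),
            validate=False)

    features = FeatureUnion([('word_tfidfvect', TfidfVectorizer()),
                             ('has_ariza', keyword_flag('ariza'))])
    X = features.fit_transform(["hatta ariza var", "fatura itirazi", "ariza kaydi acildi"])
    return X.shape   # the last column is the keyword indicator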
def _email_features_pipeline2(feature_params_config_dict  # {feature_params: {lang: .., weights: .., prep: {}, keywords: []}} see EMAIL_CONF for an example.
                              ):
    lang = feature_params_config_dict[conf.lang_key]
    final_weights = dict(feature_params_config_dict[conf.weights_key])   # copy so the caller's config dict is not modified
    prep_params = feature_params_config_dict[conf.prep_key]
    keywords = feature_params_config_dict[conf.keyword_key]

    # features found in the processed tokens
    token_features = []
    token_weights = {}

    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=prep_params[conf.stopword_key],
                                     more_stopwords=prep_params[conf.more_stopwords_key],
                                     spellcheck=prep_params[conf.spellcheck_key],
                                     stemming=prep_params[conf.stemming_key],
                                     remove_numbers=prep_params[conf.remove_numbers_key],
                                     deasciify=prep_params[conf.deasciify_key],
                                     remove_punkt=prep_params[conf.remove_punkt_key],
                                     lowercase=prep_params[conf.lowercase_key])

    tfidfvect = TfidfVectorizer(tokenizer=prep.identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=prep_params[conf.use_idf_key],
                                ngram_range=prep_params[conf.ngramrange_key],
                                max_features=prep_params[conf.nmaxfeature_key])
    tfidfvect_name = 'word_tfidfvect'
    token_features.append((tfidfvect_name, tfidfvect))
    token_weights[tfidfvect_name] = 1

    # features found in the whole raw text
    text_features = []
    text_weights = {}
    # charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False)

    # keyword presence features
    if keywords:
        for keyword in keywords:
            keywordpipe = txbt.get_keyword_pipeline(keyword)
            feature_name = "has_" + keyword
            text_features.append((feature_name, keywordpipe))
            text_weights[feature_name] = 1

    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1', skpipeline.FeatureUnion(transformer_list=token_features,
                                           transformer_weights=token_weights)),
    ])

    textbasedpipe = skpipeline.Pipeline([
        ('union2', skpipeline.FeatureUnion(transformer_list=text_features,
                                           transformer_weights=text_weights)),
    ])

    # add a feature pipe to final_features only if at least one of its component weights is non-zero
    check_zero_list = lambda x: 1 if sum(x) > 0 else 0   # [0, 0, 0] -> 0 ; [0, 0, 1] -> 1

    final_features_dict = {}

    tkweights = list(token_weights.values())
    if check_zero_list(tkweights) != 0:
        final_features_dict["token_based"] = tokenbasedpipe
    else:
        final_weights["token_based"] = 0

    txweights = list(text_weights.values())
    if check_zero_list(txweights) != 0:
        final_features_dict["text_based"] = textbasedpipe
    else:
        final_weights["text_based"] = 0

    final_features = list(final_features_dict.items())
    fweights = list(final_weights.values())
    if check_zero_list(fweights) == 0 or len(final_features) == 0:
        return None

    features = skpipeline.FeatureUnion(transformer_list=final_features,
                                       transformer_weights=final_weights)
    return features
def topics_lsi(instances, labels, sentence, ndim=5, n_gram_range=(1, 1), n_max_features=None):

    print(instances[:5])
    print(labels[:5])

    highlight_word = ""

    preprocessor = prep.Preprocessor(lang="tr", stopword=True, more_stopwords=None,
                                     stemming=True, remove_numbers=True,
                                     deasciify=False, remove_punkt=True)
    tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=prep.identity,
                                                  preprocessor=None,
                                                  lowercase=False,
                                                  ngram_range=n_gram_range,
                                                  max_features=n_max_features)
    svd_model = TruncatedSVD(n_components=ndim, algorithm='randomized',
                             n_iter=10, random_state=42)

    svd_transformer = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        ('vectorizer', tfidf_vectorizer),
        # ('normalizer', skprep.Normalizer()),
        ('scaler', skprep.StandardScaler(with_mean=False)),
        ('svd', svd_model)
    ])

    docmatrix = svd_transformer.fit_transform(instances)

    input_ = preprocessor.tokenize(sentence)
    if len(input_) < 1 or len("".join(input_)) < 1:
        highlight_word = ""
        return highlight_word

    inputmatrix = svd_transformer.transform(input_)
    termmatrix = svd_model.components_.T

    print(termmatrix.shape)
    print(inputmatrix.shape)
    print(docmatrix.shape)

    # closest docs
    # @TODO different similarity metrics
    docsim, docindices = list_utils.matrix_similarity(inputmatrix, docmatrix, top_N=10)
    for i, w in enumerate(input_):
        print(w)
        sim_docs = [labels[j] for j in docindices[i]]
        print("most similar docs: ", sim_docs)
        sim_vals = docsim[i]
        print(sim_vals)
        print()

    # closest terms -> the input word with the largest similarity value
    termsim, termindices = list_utils.matrix_similarity(inputmatrix, termmatrix, top_N=10)
    allterms = tfidf_vectorizer.get_feature_names()   # note: newer scikit-learn uses get_feature_names_out()
    print(len(allterms))
    for i, w in enumerate(input_):
        print(w)
        sim_terms = [allterms[j] for j in termindices[i]]
        print("most similar terms: ", ", ".join(sim_terms))
        sim_vals = termsim[i]
        print(sim_vals)
        print(sum(sim_vals))

    # the heaviest term
    similarity_threshold = 0.0   # @TODO should be inferred from the data matrix
    total_termsim_per_instance = np.sum(termsim, axis=1)
    max_sim = total_termsim_per_instance.max()
    max_index = total_termsim_per_instance.argmax()

    if max_sim <= similarity_threshold:
        highlight_word = ""
        return highlight_word

    highlight_word = input_[max_index]
    return highlight_word
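# Illustration only: the LSI query flow used by topics_lsi(), assuming that
# list_utils.matrix_similarity computes cosine similarities and returns the top-N
# indices (that helper's implementation is not shown here). The corpus, the query,
# and the helper name _demo_lsi_similarity are made up for this sketch.
def _demo_lsi_similarity():
    import numpy as np
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.pipeline import Pipeline

    docs = ["bilet iadesi", "bilet degisikligi", "bagaj limiti", "bagaj kaybi", "ucus saatleri"]
    lsi = Pipeline([('vect', TfidfVectorizer()),
                    ('svd', TruncatedSVD(n_components=2, random_state=42))])

    docmatrix = lsi.fit_transform(docs)        # (n_docs, n_components)
    querymatrix = lsi.transform(["bagaj"])     # (1, n_components)

    sims = cosine_similarity(querymatrix, docmatrix)   # (1, n_docs)
    top = np.argsort(sims[0])[::-1][:3]                # indices of the 3 closest docs
    return [(docs[i], float(sims[0, i])) for i in top]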
def topics_kmeans(instances, labels, sentence):

    output = {"word": "", "nearest_docs": [], "nearest_terms": []}

    n_clusters = 3
    n_max_features = None
    top_N_words = 30
    n_gram_range = (1, 1)

    preprocessor = prep.Preprocessor(lang="tr", stopword=True, more_stopwords=None,
                                     stemming=True, remove_numbers=True,
                                     deasciify=False, remove_punkt=True)
    tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=prep.identity,
                                                  preprocessor=None,
                                                  lowercase=False,
                                                  ngram_range=n_gram_range,
                                                  max_features=n_max_features)
    kmeans = skcluster.KMeans(n_clusters=n_clusters, init='random',
                              max_iter=1000, n_init=10, random_state=42)

    pipeline = skpipeline.Pipeline([('preprocessor', preprocessor),
                                    ('vect', tfidf_vectorizer),
                                    ('normalizer', skprep.Normalizer()),
                                    ('clusterer', kmeans)])

    '''
    Prediction on input words
    - assign each word to one of the clusters formed by the database
    - print the closest terms and docs for each input word
    - find the word closest to its own cluster center; it becomes the highlight candidate
    - if the total distance of the input words exceeds some threshold, the input is
      irrelevant and nothing is highlighted
    + map the preprocessed word back to its original form
    + handle input that is empty, or becomes empty after preprocessing
    '''
    data_distances = pipeline.fit_transform(instances)   # (n_instances, n_clusters) distances
    data_clusters = kmeans.labels_
    clnames = list(set(data_clusters))

    words = preprocessor.tokenize(sentence)
    words = list(set(words))   # remove repeating words
    if len(words) < 1 or len("".join(words)) < 1:
        return None

    input_clusters = pipeline.predict(words)
    input_distances = pipeline.transform(words)

    # store the ids and distances of the members of each cluster: {clNo: [(member_id, member_distance)]}
    cluster_members = {clname: [] for clname in clnames}
    for memberid, memberclNo in enumerate(data_clusters):
        distance = data_distances[memberid, memberclNo]
        cluster_members[memberclNo].append((memberid, distance))

    # store (input_id, input_clNo, input_distance) for each input word
    input_cluster_members = []
    for inputid, inputclNo in enumerate(input_clusters):
        distance = input_distances[inputid, inputclNo]
        input_cluster_members.append((inputid, inputclNo, distance))

    print(words)
    print(input_distances)

    # the most important word -> the one closest to its cluster center
    l = sorted(input_cluster_members, key=lambda x: x[2])
    hwordid, hclusterNo, hdist = l[0]
    highlight_word = words[hwordid]

    # validity of the most important word: compare its distance with the farthest doc in its cluster
    highlight_cluster_members = cluster_members[hclusterNo]
    l = sorted(highlight_cluster_members, key=lambda x: x[1], reverse=True)
    _, farthestDocDist = l[0]
    print(highlight_word, " dist: ", hdist, " farthest dist: ", farthestDocDist)

    # per-term weights of the cluster centers, used as a proxy for word closeness
    word_distances = kmeans.cluster_centers_   # n_clusters X n_database_words
    cl_word_distances = word_distances[hclusterNo, :]
    print(cl_word_distances.shape)
    cl_word_distances = np.sort(cl_word_distances)[::-1]   # descending, so [0] is the max and [-1] the min
    word_max_dist = cl_word_distances[0]
    print("word_max_dist: ", word_max_dist)
    print("word_min_dist: ", cl_word_distances[-1])
    word_avg_dist = cl_word_distances.mean()
    print("word_avg_dist: ", word_avg_dist)

    cluster_terms = clustering.get_top_N_words(kmeans, tfidf_vectorizer,
                                               nclusters=n_clusters,
                                               top_N_words=top_N_words)
    hclusterterms = cluster_terms[hclusterNo]
    if highlight_word not in hclusterterms:   # @TODO find a real threshold!
        return None

    cluster_membernames = clustering.get_cluster_members(kmeans, labels)
    hclustermembers = cluster_membernames[hclusterNo]

    # find the original word
    preprocessed_word_map = prep.original_to_preprocessed_map(preprocessor, sentence)
    original_words = preprocessed_word_map[highlight_word]

    output["word"] = original_words
    output["nearest_terms"] = hclusterterms
    output["nearest_docs"] = hclustermembers
    return output
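# Illustration only: one plausible way to recover the top-N terms per cluster, in the
# spirit of clustering.get_top_N_words used above (that helper's implementation is not
# shown here). The toy corpus and the helper name _demo_cluster_top_terms are
# assumptions for this sketch.
def _demo_cluster_top_terms():
    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = ["bilet iadesi", "bilet fiyati", "bagaj limiti", "bagaj kaybi", "kayip bagaj bildirimi"]
    vect = TfidfVectorizer()
    X = vect.fit_transform(docs)
    km = KMeans(n_clusters=2, init='random', n_init=10, random_state=42).fit(X)

    terms = vect.get_feature_names_out()   # older scikit-learn: get_feature_names()
    top_terms = {}
    for cl in range(km.n_clusters):
        # the highest-weighted vocabulary entries of this cluster's centroid
        order = np.argsort(km.cluster_centers_[cl])[::-1][:3]
        top_terms[cl] = [terms[i] for i in order]
    return top_terms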