def train_kmeans_model(instances, labels, n_clusters=3):
    n_max_features = None
    n_gram_range = (1, 1)

    preprocessor = prep.Preprocessor(lang="tr", stopword=True, more_stopwords=None,
                                     stemming=True, remove_numbers=True,
                                     deasciify=False, remove_punkt=True)
    tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=prep.identity,
                                                  preprocessor=None,
                                                  lowercase=False,
                                                  ngram_range=n_gram_range,
                                                  max_features=n_max_features)
    kmeans = skcluster.KMeans(n_clusters=n_clusters, init='random',
                              max_iter=1000, n_init=10, random_state=42)

    pipeline = skpipeline.Pipeline([('preprocessor', preprocessor),
                                    ('vect', tfidf_vectorizer),
                                    ('normalizer', skprep.Normalizer()),
                                    ('clusterer', kmeans)])

    # fit_transform on a pipeline ending in KMeans yields the
    # (n_instances, n_clusters) matrix of distances to the cluster centers
    data_distances = pipeline.fit_transform(instances)

    return pipeline, data_distances, preprocessor, tfidf_vectorizer, kmeans, instances, labels
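# Illustration only: a minimal, self-contained sketch of the same pipeline shape as
# train_kmeans_model(), built from stock scikit-learn components and a toy corpus
# instead of the project's Preprocessor. The helper name _demo_kmeans_distances and
# the example documents are assumptions made for this sketch.
def _demo_kmeans_distances():
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Normalizer

    docs = ["ucuz bilet", "bilet fiyati", "bagaj hakki", "bagaj kayip", "online check in"]
    toy_pipeline = Pipeline([('vect', TfidfVectorizer()),
                             ('normalizer', Normalizer()),
                             ('clusterer', KMeans(n_clusters=2, init='random',
                                                  n_init=10, random_state=42))])
    # as in train_kmeans_model: the transform of the final KMeans step is the
    # document-to-centroid distance matrix, here of shape (5, 2)
    distances = toy_pipeline.fit_transform(docs)
    return distances.shape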
def _ar_txt_clf_features_pipeline2(
        feature_params_config_dict  # {feature_params: {lang: .., weights: .., prep: {}, keywords: []}} see EMAIL_CONF for an example.
):
    lang = feature_params_config_dict[conf.lang_key]
    feature_weights = feature_params_config_dict[conf.weights_key]
    prep_params = feature_params_config_dict[conf.prep_key]

    # features found in the processed tokens
    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=prep_params[conf.stopword_key],
                                     more_stopwords=prep_params[conf.more_stopwords_key],
                                     spellcheck=prep_params[conf.spellcheck_key],
                                     stemming=prep_params[conf.stemming_key],
                                     remove_numbers=prep_params[conf.remove_numbers_key],
                                     deasciify=prep_params[conf.deasciify_key],
                                     remove_punkt=prep_params[conf.remove_punkt_key],
                                     lowercase=prep_params[conf.lowercase_key])

    tfidfvect = TfidfVectorizer(tokenizer=prep.identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=prep_params[conf.use_idf_key],
                                ngram_range=prep_params[conf.wordngramrange_key],
                                max_features=prep_params[conf.nmaxfeature_key])

    token_weights = dict(tfidfvect=feature_weights["word_tfidf"])
    # dict keys mirror the variable names above so the two stay consistent if renamed
    token_transformers_dict = dict(tfidfvect=tfidfvect)
    token_transformers = list(token_transformers_dict.items())

    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1', skpipeline.FeatureUnion(transformer_list=token_transformers,
                                           transformer_weights=token_weights)),
    ])

    # stylistic features found in the whole raw text
    charngramvect = TfidfVectorizer(analyzer='char_wb',
                                    ngram_range=prep_params[conf.charngramrange_key],
                                    lowercase=False)

    '''
    # BUG: the polyglot and named-entity features are disabled for now
    named_entity_pipe = tbt.get_named_entity_weight_pipeline(lang)
    text_weights = dict(charngramvect=feature_weights["char_tfidf"],   # @TODO hardcoded
                        polpipe1=feature_weights["polyglot_count"],
                        polpipe2=feature_weights["polyglot_value"],
                        named_entity_pipe=feature_weights["named_entity_rate"])
    text_transformers_dict = dict(charngramvect=charngramvect,
                                  polpipe1=polpipe1,
                                  polpipe2=polpipe2,
                                  named_entity_pipe=named_entity_pipe)
    '''
    text_weights = dict(charngramvect=feature_weights["char_tfidf"])   # @TODO hardcoded
    text_transformers_dict = dict(charngramvect=charngramvect)
    text_transformers = list(text_transformers_dict.items())

    textbasedpipe = skpipeline.Pipeline([
        ('union2', skpipeline.FeatureUnion(transformer_list=text_transformers,
                                           transformer_weights=text_weights)),
    ])

    final_transformers_dict = dict(tokenbasedpipe=tokenbasedpipe,
                                   textbasedpipe=textbasedpipe)
    final_transformers = list(final_transformers_dict.items())

    features = skpipeline.FeatureUnion(
        transformer_list=final_transformers,
        # transformer_weights are not necessary as the number of feature groups is small
    )
    return features
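# Illustration only: the essence of the two-branch feature union built above
# (word-level TF-IDF plus char_wb n-grams, combined with transformer_weights),
# reduced to stock scikit-learn calls. The corpus, the weights, and the helper name
# _demo_token_plus_char_features are assumptions, not values from EMAIL_CONF.
def _demo_token_plus_char_features():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import FeatureUnion

    word_vect = TfidfVectorizer(ngram_range=(1, 1))
    char_vect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 3), lowercase=False)
    features = FeatureUnion(
        transformer_list=[('word_tfidf', word_vect), ('char_tfidf', char_vect)],
        # each branch's output block is multiplied by its weight before stacking
        transformer_weights={'word_tfidf': 1.0, 'char_tfidf': 0.5})
    X = features.fit_transform(["kargo gecikti", "fatura hatali geldi", "internet kesildi"])
    return X.shape   # word columns followed by char n-gram columns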
def _tr_sentiment_features_pipeline(lang="tr",
                                    feature_weights={"word_tfidf": 1,
                                                     "polyglot_value": 0,
                                                     "polyglot_count": 0,
                                                     "lexicon_count": 0,
                                                     "char_tfidf": 1},
                                    stopword_choice=True,
                                    more_stopwords_list=None,
                                    spellcheck_choice=False,
                                    stemming_choice=False,
                                    number_choice=False,
                                    deasc_choice=True,
                                    punct_choice=True,
                                    case_choice=True,
                                    word_ngramrange=(1, 2),   # tuple
                                    char_ngramrange=(2, 2),
                                    nmaxfeature=10000,        # int or None
                                    norm="l2",
                                    use_idf=True):

    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=stopword_choice,
                                     more_stopwords=more_stopwords_list,
                                     spellcheck=spellcheck_choice,
                                     stemming=stemming_choice,
                                     remove_numbers=number_choice,
                                     deasciify=deasc_choice,
                                     remove_punkt=punct_choice,
                                     lowercase=case_choice)

    tfidfvect = TfidfVectorizer(tokenizer=prep.identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=use_idf,
                                ngram_range=word_ngramrange,
                                max_features=nmaxfeature)

    polpipe3 = obt.get_lexicon_count_pipeline(tokenizer=prep.identity)

    token_weights = dict(tfidfvect=feature_weights["word_tfidf"],
                         polpipe3=feature_weights["lexicon_count"])
    # dict keys mirror the variable names above so the two stay consistent if renamed
    token_transformers_dict = dict(tfidfvect=tfidfvect,
                                   polpipe3=polpipe3)
    token_transformers = list(token_transformers_dict.items())

    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1', skpipeline.FeatureUnion(transformer_list=token_transformers,
                                           transformer_weights=token_weights)),
    ])

    # features extracted from the whole raw text
    charngramvect = TfidfVectorizer(analyzer='char_wb',
                                    ngram_range=char_ngramrange,
                                    lowercase=False)
    polpipe1 = tbt.get_polylglot_polarity_count_pipe(lang)
    polpipe2 = tbt.get_polylglot_polarity_value_pipe(lang)

    text_weights = dict(charngramvect=feature_weights["char_tfidf"],
                        polpipe1=feature_weights["polyglot_count"],
                        polpipe2=feature_weights["polyglot_value"])
    text_transformers_dict = dict(charngramvect=charngramvect,
                                  polpipe1=polpipe1,
                                  polpipe2=polpipe2)
    text_transformers = list(text_transformers_dict.items())

    textbasedpipe = skpipeline.Pipeline([
        ('union2', skpipeline.FeatureUnion(transformer_list=text_transformers,
                                           transformer_weights=text_weights)),
    ])

    final_transformers_dict = dict(tokenbasedpipe=tokenbasedpipe,
                                   textbasedpipe=textbasedpipe)
    final_transformers = list(final_transformers_dict.items())

    features = skpipeline.FeatureUnion(
        transformer_list=final_transformers,
        # transformer_weights are not necessary as the number of feature groups is small
    )
    return features
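# Illustration only: how a feature union of this shape is typically attached to a
# classifier. The LogisticRegression choice, the toy sentiment data, and the helper
# name _demo_sentiment_features_with_classifier are assumptions for this sketch, not
# defaults used elsewhere in this module.
def _demo_sentiment_features_with_classifier():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import FeatureUnion, Pipeline

    features = FeatureUnion([
        ('word_tfidf', TfidfVectorizer(ngram_range=(1, 2))),
        ('char_tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False)),
    ])
    clf = Pipeline([('features', features),
                    ('classifier', LogisticRegression(max_iter=1000))])
    X = ["harika bir urun", "cok kotu bir deneyim", "gayet memnun kaldim", "berbat bir hizmet"]
    y = ["pos", "neg", "pos", "neg"]
    clf.fit(X, y)
    return clf.predict(["kotu bir urun"])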
def _email_features_pipeline(lang,
                             stopword_choice=True,
                             more_stopwords_list=None,
                             spellcheck_choice=False,
                             stemming_choice=False,
                             number_choice=False,
                             deasc_choice=True,
                             punct_choice=True,
                             case_choice=True,
                             ngramrange=(1, 2),    # tuple
                             nmaxfeature=10000,    # int or None
                             norm="l2",
                             use_idf=True,
                             keywords=[],          # e.g. ["arıza", "pstn"]
                             final_weights=dict(text_based=1, token_based=1)):
    # use a list of (pipeline_name, pipeline) pairs plus a weight per pipeline
    final_weights = dict(final_weights)   # work on a copy so the (default) argument dict is not mutated across calls

    # features found in the processed tokens
    token_features = []
    token_weights = {}

    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=stopword_choice,
                                     more_stopwords=more_stopwords_list,
                                     spellcheck=spellcheck_choice,
                                     stemming=stemming_choice,
                                     remove_numbers=number_choice,
                                     deasciify=deasc_choice,
                                     remove_punkt=punct_choice,
                                     lowercase=case_choice)

    tfidfvect = TfidfVectorizer(tokenizer=prep.identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=use_idf,
                                ngram_range=ngramrange,
                                max_features=nmaxfeature)
    tfidfvect_name = 'word_tfidfvect'
    token_features.append((tfidfvect_name, tfidfvect))
    token_weights[tfidfvect_name] = 1

    # features found in the whole raw text
    text_features = []
    text_weights = {}
    # charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False)

    # keyword presence features
    if keywords:
        for keyword in keywords:
            keywordpipe = txbt.get_keyword_pipeline(keyword)
            feature_name = "has_" + keyword
            text_features.append((feature_name, keywordpipe))
            text_weights[feature_name] = 1

    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1', skpipeline.FeatureUnion(transformer_list=token_features,
                                           transformer_weights=token_weights)),
    ])

    textbasedpipe = skpipeline.Pipeline([
        ('union2', skpipeline.FeatureUnion(transformer_list=text_features,
                                           transformer_weights=text_weights)),
    ])

    # add a feature pipe to final_features only if at least one of its component weights is non-zero
    check_zero_list = lambda x: 1 if sum(x) > 0 else 0   # [0, 0, 0] -> 0 ; [0, 0, 1] -> 1

    final_features_dict = {}

    tkweights = list(token_weights.values())
    if check_zero_list(tkweights) != 0:
        final_features_dict["token_based"] = tokenbasedpipe
    else:
        final_weights["token_based"] = 0

    txweights = list(text_weights.values())
    if check_zero_list(txweights) != 0:
        final_features_dict["text_based"] = textbasedpipe
    else:
        final_weights["text_based"] = 0

    final_features = list(final_features_dict.items())
    fweights = list(final_weights.values())
    if check_zero_list(fweights) == 0 or len(final_features) == 0:
        return None

    features = skpipeline.FeatureUnion(transformer_list=final_features,
                                       transformer_weights=final_weights)
    return features
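# Illustration only: a self-contained stand-in for the keyword presence features wired
# in above. keyword_flag below is a hypothetical substitute for txbt.get_keyword_pipeline
# (whose implementation is not shown here), built with FunctionTransformer; the corpus
# and the keyword are made up for the example.
def _demo_keyword_presence_feature():
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import FeatureUnion
    from sklearn.preprocessing import FunctionTransformer

    def keyword_flag(keyword):
        # one column per keyword: 1.0 if it occurs in the raw text, else 0.0
        return FunctionTransformer(
            lambda docs: np.array([[1.0 if keyword in doc.lower() else 0.0] for doc in docs]),
            validate=False)

    features = FeatureUnion([('word_tfidfvect', TfidfVectorizer()),
                             ('has_ariza', keyword_flag('ariza'))])
    X = features.fit_transform(["hatta ariza var", "fatura itirazi", "ariza kaydi acildi"])
    return X.shape   # the last column is the keyword indicator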
def _email_features_pipeline2(feature_params_config_dict  # {feature_params: {lang: .., weights: .., prep: {}, keywords: []}} see EMAIL_CONF for an example.
                              ):
    lang = feature_params_config_dict[conf.lang_key]
    final_weights = dict(feature_params_config_dict[conf.weights_key])   # copy so the caller's config dict is not modified
    prep_params = feature_params_config_dict[conf.prep_key]
    keywords = feature_params_config_dict[conf.keyword_key]

    # features found in the processed tokens
    token_features = []
    token_weights = {}

    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=prep_params[conf.stopword_key],
                                     more_stopwords=prep_params[conf.more_stopwords_key],
                                     spellcheck=prep_params[conf.spellcheck_key],
                                     stemming=prep_params[conf.stemming_key],
                                     remove_numbers=prep_params[conf.remove_numbers_key],
                                     deasciify=prep_params[conf.deasciify_key],
                                     remove_punkt=prep_params[conf.remove_punkt_key],
                                     lowercase=prep_params[conf.lowercase_key])

    tfidfvect = TfidfVectorizer(tokenizer=prep.identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=prep_params[conf.use_idf_key],
                                ngram_range=prep_params[conf.ngramrange_key],
                                max_features=prep_params[conf.nmaxfeature_key])
    tfidfvect_name = 'word_tfidfvect'
    token_features.append((tfidfvect_name, tfidfvect))
    token_weights[tfidfvect_name] = 1

    # features found in the whole raw text
    text_features = []
    text_weights = {}
    # charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False)

    # keyword presence features
    if keywords:
        for keyword in keywords:
            keywordpipe = txbt.get_keyword_pipeline(keyword)
            feature_name = "has_" + keyword
            text_features.append((feature_name, keywordpipe))
            text_weights[feature_name] = 1

    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1', skpipeline.FeatureUnion(transformer_list=token_features,
                                           transformer_weights=token_weights)),
    ])

    textbasedpipe = skpipeline.Pipeline([
        ('union2', skpipeline.FeatureUnion(transformer_list=text_features,
                                           transformer_weights=text_weights)),
    ])

    # add a feature pipe to final_features only if at least one of its component weights is non-zero
    check_zero_list = lambda x: 1 if sum(x) > 0 else 0   # [0, 0, 0] -> 0 ; [0, 0, 1] -> 1

    final_features_dict = {}

    tkweights = list(token_weights.values())
    if check_zero_list(tkweights) != 0:
        final_features_dict["token_based"] = tokenbasedpipe
    else:
        final_weights["token_based"] = 0

    txweights = list(text_weights.values())
    if check_zero_list(txweights) != 0:
        final_features_dict["text_based"] = textbasedpipe
    else:
        final_weights["text_based"] = 0

    final_features = list(final_features_dict.items())
    fweights = list(final_weights.values())
    if check_zero_list(fweights) == 0 or len(final_features) == 0:
        return None

    features = skpipeline.FeatureUnion(transformer_list=final_features,
                                       transformer_weights=final_weights)
    return features
def topics_lsi(instances, labels, sentence, ndim=5, n_gram_range=(1, 1), n_max_features=None):

    print(instances[:5])
    print(labels[:5])

    highlight_word = ""

    preprocessor = prep.Preprocessor(lang="tr", stopword=True, more_stopwords=None,
                                     stemming=True, remove_numbers=True,
                                     deasciify=False, remove_punkt=True)
    tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=prep.identity,
                                                  preprocessor=None,
                                                  lowercase=False,
                                                  ngram_range=n_gram_range,
                                                  max_features=n_max_features)
    svd_model = TruncatedSVD(n_components=ndim, algorithm='randomized',
                             n_iter=10, random_state=42)

    svd_transformer = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        ('vectorizer', tfidf_vectorizer),
        # ('normalizer', skprep.Normalizer()),
        ('scaler', skprep.StandardScaler(with_mean=False)),
        ('svd', svd_model)
    ])

    docmatrix = svd_transformer.fit_transform(instances)

    input_ = preprocessor.tokenize(sentence)
    if len(input_) < 1 or len("".join(input_)) < 1:
        highlight_word = ""
        return highlight_word

    inputmatrix = svd_transformer.transform(input_)
    termmatrix = svd_model.components_.T

    print(termmatrix.shape)
    print(inputmatrix.shape)
    print(docmatrix.shape)

    # closest docs
    # @TODO different similarity metrics
    docsim, docindices = list_utils.matrix_similarity(inputmatrix, docmatrix, top_N=10)
    for i, w in enumerate(input_):
        print(w)
        sim_docs = [labels[j] for j in docindices[i]]
        print("most similar docs: ", sim_docs)
        sim_vals = docsim[i]
        print(sim_vals)
        print()

    # closest terms -> the input word with the largest similarity value
    termsim, termindices = list_utils.matrix_similarity(inputmatrix, termmatrix, top_N=10)
    allterms = tfidf_vectorizer.get_feature_names()   # note: newer scikit-learn uses get_feature_names_out()
    print(len(allterms))
    for i, w in enumerate(input_):
        print(w)
        sim_terms = [allterms[j] for j in termindices[i]]
        print("most similar terms: ", ", ".join(sim_terms))
        sim_vals = termsim[i]
        print(sim_vals)
        print(sum(sim_vals))

    # the heaviest term
    similarity_threshold = 0.0   # @TODO should be inferred from the data matrix
    total_termsim_per_instance = np.sum(termsim, axis=1)
    max_sim = total_termsim_per_instance.max()
    max_index = total_termsim_per_instance.argmax()

    if max_sim <= similarity_threshold:
        highlight_word = ""
        return highlight_word

    highlight_word = input_[max_index]
    return highlight_word
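# Illustration only: the LSI query flow used by topics_lsi(), assuming that
# list_utils.matrix_similarity computes cosine similarities and returns the top-N
# indices (that helper's implementation is not shown here). The corpus, the query,
# and the helper name _demo_lsi_similarity are made up for this sketch.
def _demo_lsi_similarity():
    import numpy as np
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.pipeline import Pipeline

    docs = ["bilet iadesi", "bilet degisikligi", "bagaj limiti", "bagaj kaybi", "ucus saatleri"]
    lsi = Pipeline([('vect', TfidfVectorizer()),
                    ('svd', TruncatedSVD(n_components=2, random_state=42))])

    docmatrix = lsi.fit_transform(docs)        # (n_docs, n_components)
    querymatrix = lsi.transform(["bagaj"])     # (1, n_components)

    sims = cosine_similarity(querymatrix, docmatrix)   # (1, n_docs)
    top = np.argsort(sims[0])[::-1][:3]                # indices of the 3 closest docs
    return [(docs[i], float(sims[0, i])) for i in top]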
def topics_kmeans(instances, labels, sentence):

    output = {"word": "", "nearest_docs": [], "nearest_terms": []}

    n_clusters = 3
    n_max_features = None
    top_N_words = 30
    n_gram_range = (1, 1)

    preprocessor = prep.Preprocessor(lang="tr", stopword=True, more_stopwords=None,
                                     stemming=True, remove_numbers=True,
                                     deasciify=False, remove_punkt=True)
    tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=prep.identity,
                                                  preprocessor=None,
                                                  lowercase=False,
                                                  ngram_range=n_gram_range,
                                                  max_features=n_max_features)
    kmeans = skcluster.KMeans(n_clusters=n_clusters, init='random',
                              max_iter=1000, n_init=10, random_state=42)

    pipeline = skpipeline.Pipeline([('preprocessor', preprocessor),
                                    ('vect', tfidf_vectorizer),
                                    ('normalizer', skprep.Normalizer()),
                                    ('clusterer', kmeans)])

    '''
    Prediction on input words
    - assign each word to one of the clusters formed by the database
    - print the closest terms and docs for each input word
    - find the word closest to its own cluster center; it becomes the highlight candidate
    - if the total distance of the input words exceeds some threshold, the input is
      irrelevant and nothing is highlighted
    + map the preprocessed word back to its original form
    + handle input that is empty, or becomes empty after preprocessing
    '''
    data_distances = pipeline.fit_transform(instances)   # (n_instances, n_clusters) distances
    data_clusters = kmeans.labels_
    clnames = list(set(data_clusters))

    words = preprocessor.tokenize(sentence)
    words = list(set(words))   # remove repeating words
    if len(words) < 1 or len("".join(words)) < 1:
        return None

    input_clusters = pipeline.predict(words)
    input_distances = pipeline.transform(words)

    # store the ids and distances of the members of each cluster: {clNo: [(member_id, member_distance)]}
    cluster_members = {clname: [] for clname in clnames}
    for memberid, memberclNo in enumerate(data_clusters):
        distance = data_distances[memberid, memberclNo]
        cluster_members[memberclNo].append((memberid, distance))

    # store (input_id, input_clNo, input_distance) for each input word
    input_cluster_members = []
    for inputid, inputclNo in enumerate(input_clusters):
        distance = input_distances[inputid, inputclNo]
        input_cluster_members.append((inputid, inputclNo, distance))

    print(words)
    print(input_distances)

    # the most important word -> the one closest to its cluster center
    l = sorted(input_cluster_members, key=lambda x: x[2])
    hwordid, hclusterNo, hdist = l[0]
    highlight_word = words[hwordid]

    # validity of the most important word: compare its distance with the farthest doc in its cluster
    highlight_cluster_members = cluster_members[hclusterNo]
    l = sorted(highlight_cluster_members, key=lambda x: x[1], reverse=True)
    _, farthestDocDist = l[0]
    print(highlight_word, " dist: ", hdist, " farthest dist: ", farthestDocDist)

    # per-term weights of the cluster centers, used as a proxy for word closeness
    word_distances = kmeans.cluster_centers_   # n_clusters X n_database_words
    cl_word_distances = word_distances[hclusterNo, :]
    print(cl_word_distances.shape)
    cl_word_distances = np.sort(cl_word_distances)[::-1]   # descending, so [0] is the max and [-1] the min
    word_max_dist = cl_word_distances[0]
    print("word_max_dist: ", word_max_dist)
    print("word_min_dist: ", cl_word_distances[-1])
    word_avg_dist = cl_word_distances.mean()
    print("word_avg_dist: ", word_avg_dist)

    cluster_terms = clustering.get_top_N_words(kmeans, tfidf_vectorizer,
                                               nclusters=n_clusters,
                                               top_N_words=top_N_words)
    hclusterterms = cluster_terms[hclusterNo]
    if highlight_word not in hclusterterms:   # @TODO find a real threshold!
        return None

    cluster_membernames = clustering.get_cluster_members(kmeans, labels)
    hclustermembers = cluster_membernames[hclusterNo]

    # find the original word
    preprocessed_word_map = prep.original_to_preprocessed_map(preprocessor, sentence)
    original_words = preprocessed_word_map[highlight_word]

    output["word"] = original_words
    output["nearest_terms"] = hclusterterms
    output["nearest_docs"] = hclustermembers
    return output
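# Illustration only: one plausible way to recover the top-N terms per cluster, in the
# spirit of clustering.get_top_N_words used above (that helper's implementation is not
# shown here). The toy corpus and the helper name _demo_cluster_top_terms are
# assumptions for this sketch.
def _demo_cluster_top_terms():
    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = ["bilet iadesi", "bilet fiyati", "bagaj limiti", "bagaj kaybi", "kayip bagaj bildirimi"]
    vect = TfidfVectorizer()
    X = vect.fit_transform(docs)
    km = KMeans(n_clusters=2, init='random', n_init=10, random_state=42).fit(X)

    terms = vect.get_feature_names_out()   # older scikit-learn: get_feature_names()
    top_terms = {}
    for cl in range(km.n_clusters):
        # the highest-weighted vocabulary entries of this cluster's centroid
        order = np.argsort(km.cluster_centers_[cl])[::-1][:3]
        top_terms[cl] = [terms[i] for i in order]
    return top_terms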