def single_diffbotapi_call(request, token, list_of_urls):
    features = {}
    list_of_titles = []
    url_title = {}
    i = 0
    for url in list_of_urls:
        print i
        try:
            ti, txt, sent, num_of_links = TE.diffbot_api(request, token, url)
            cw_article = count_words_string(txt)
            sentiment = grab_sentiment_articles(sent)
            features[url] = [cw_article, sentiment, num_of_links]
            list_of_titles.append(ti)
            url_title[url] = ti
        except KeyError as e:
            print e
        i = i + 1
    update_urls = [url for url in features]
    wf = website_Freq(update_urls)
    bp = basicParse(update_urls, list_of_titles)
    d2v = doc2vec(bp)
    tfidf_r = tfidf(d2v, bp)
    for url, data in features.iteritems():
        avg_tf = take_avg(tfidf_r[url])
        features[url].append(avg_tf)
        features[url].append(wf[url])
    return features
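# A minimal sketch of the take_avg helper assumed above. It treats tfidf_r[url] as a
# list of per-term TF-IDF scores for that URL and collapses it into a single mean
# value used as one feature. The name and input shape are assumptions for
# illustration, not the original implementation.
def take_avg(scores):
    """Return the arithmetic mean of a list of TF-IDF scores (0.0 for an empty list)."""
    if not scores:
        return 0.0
    return float(sum(scores)) / len(scores)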
def compare(list1, list2, list3):
    g = []
    for rows in list1:
        for rows in list2:
            g.extend(tfidf(list1.rows[8], list2.rows[8], list3[rownumber]))
    g = sorted(g, key=getKey)
    g = normalize(g)
    return g
def test_Non_UTF_Characters(self):
    docs = (
        """GMU Machine Learning and Inference Laboratory
        ... 2002 Copyright 2002-2003 Machine Learning and Inference Laboratory
        Front page created by Guido Cervone and Janejira Kalsmith. ...
        Description: Research on Theories of Learning, Inference, and Discovery
        Data Mining and Knowledge Discovery, User... """,
        """Yahoo! Groups : machine-learning machine-learning Machine Learning,
        [ Join This Group! ]. Home, Messages, Links, Members Only, Chat, ...
        Machine Learning mailing list: [email protected]. ...
        Description: An unmoderated mailing list intended for people in computer
        sciences, statistics, mathematics, and... """,
    )
    x = [tfidf("", doc, docs) for doc in docs]
    print x
def get_vector_for(self, text):
    """
    Returns the TF-IDF vector for the given text.
    @param Text text : the text for which the TF-IDF vector is computed
    @return tuple :
    @author Andrzej Skupień
    """
    vector = []
    keywords = self.keywords
    for word in keywords:
        try:
            tfidf_value = tfidf(str(word), str(text), self.documents)
        except ZeroDivisionError:
            tfidf_value = 0
        vector.append(tfidf_value)
    return Vector(vector)
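# A minimal sketch of the per-term tfidf(term, document, documentlist) helper that the
# two snippets above assume: raw term frequency within the document times inverse
# document frequency over the corpus. This is an illustrative assumption, not the
# original implementation; the term-frequency division can raise ZeroDivisionError for
# an empty document, which get_vector_for above guards against.
import math

def tfidf(term, document, documentlist):
    words = document.split()
    # term frequency; raises ZeroDivisionError for an empty document
    tf = words.count(term) / float(len(words))
    # document frequency over the corpus
    df = sum(1 for doc in documentlist if term in doc.split())
    idf = math.log(float(len(documentlist)) / df) if df else 0.0
    return tf * idf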
def main(merged_filename, real_file, boted_file):
    documents = []
    print merged_filename
    users_info = getUserIMDMessages(merged_filename)
    print users_info.keys()
    print "total users:" + str(len(users_info.keys()))
    considerd_users_index = []
    index = 0
    for user in users_info.keys():
        if users_info[user]['m'] > 1:
            considerd_users_index.append(index)
        index += 1
    real_users_index, bot_users_index, real_users, bot_users = labeling_data(
        merged_filename, real_file, boted_file)
    similar_users = OrderedDict()
    real_users_cnt = 0
    bot_users_cnt = 0
    per_user_mentions = OrderedDict()
    spell = SpellChecker()
    slang = SlangNormalization()
    slang.readfile('slang.txt')
    rr = RepeatReplacer()
    lemmatiser = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    chats_info = []
    users_list = []
    with open(merged_filename, 'r') as f:
        lines = f.readlines()
        for line in lines:
            chat_info = {}
            user = str(
                line.split(',"u":')[1].split(',"e":')[0].replace('"', '')).lower()
            message = format_line(
                str(
                    line.split(',"m":')[1].split(',"nm":')[0].replace('"', '')))
            chat_info['user'] = user
            if user not in users_list:
                users_list.append(user)
                if user in real_users:
                    real_users_cnt += 1
                else:
                    bot_users_cnt += 1
            message_tokens = tokenize(message)
            if user not in per_user_mentions.keys():
                per_user_mentions[user] = [
                    token for token in message_tokens if token in users_list
                ]
            message = ' '.join(token.lower() for token in tokenize(message))
            normalized_msg = slang.translator(message)
            normalized_msg = ' '.join([
                lemmatiser.lemmatize(
                    spell.replace(rr.replace(token.lower())).decode('utf-8'),
                    pos="v") for token in normalized_msg.split(' ')
                if token not in stop_words and token != ""
            ])
            chat_info['msgs'] = normalized_msg
            chats_info.append(chat_info)
    f.close()
    #print len(chats_info)
    user_msgs = OrderedDict()
    for i in range(len(chats_info)):
        if chats_info[i]['user'] not in user_msgs.keys():
            user_msgs[chats_info[i]['user']] = ''
            user_msgs[chats_info[i]['user']] += chats_info[i]['msgs'] + ' '
        else:
            user_msgs[chats_info[i]['user']] += chats_info[i]['msgs'] + ' '
    labels = []
    for user in user_msgs.keys():
        documents.append(user_msgs[user])
    users = user_msgs.keys()
    print users
    users_dict = getUserIMDMessages(merged_filename)
    #print len(users.keys())
    user_chats_ft = get_chats_features(users_dict)
    #print users_list
    user_chats_ft = pd.DataFrame(user_chats_ft)
    user_imd_bins = pd.DataFrame(get_IMD_features(users_dict))
    user_features = pd.concat([user_chats_ft, user_imd_bins], axis=1)
    tfidf_representation = tfidf(documents)
    tfidf_features = pd.DataFrame(np.array(tfidf_representation))
    #user_mentions_features = pd.DataFrame(user_mentions_features)
    #print get_no_user_msgs(merged_filename).values
    no_user_msgs_features = pd.DataFrame(
        get_no_user_msgs(merged_filename).values())
    user_entropy = pd.DataFrame(np.array(
        get_entropy_features(merged_filename)))
    #CCE_features = pd.DataFrame(get_user_cce_features(merged_filename))
    #print "user Entropy features"
    #print per_user_mentions
    #print "user mention features"
    user_mentions_features = pd.DataFrame(
        get_user_mentions_features(per_user_mentions))
    #print user_mentions_features.values.tolist()
    #metadata_features = pd.DataFrame(users_metadata(users_list))
    conversation_features = pd.DataFrame(ConversationalFeatures(chats_info))
    #print "conversational features"
    feature_vectors = pd.concat([
        user_entropy, user_mentions_features, no_user_msgs_features,
        conversation_features
    ], axis=1)
    feature_vectors = get_final_features(feature_vectors, considerd_users_index)
    final_features = pd.concat([user_features, feature_vectors], axis=1)
    #print "shape of features:" + str(np.array(feature_vectors).shape)
    channel_followers = get_channel_followers(
        '../followers_cnt/', real_file.split('#')[1].split('database')[0])
    #real_users_index = set(real_users_index)
    users = set(users)
    for user in channel_followers:
        if user in users:
            real_users_index.append(list(users).index(user))
    print real_users_index
    print "# real users:" + str(
        real_users_cnt) + ' ' + "labelled real users:" + str(
            len(real_users_index))
    print '# bot users:' + str(
        bot_users_cnt) + ' ' + "labelled bot users:" + str(
            len(bot_users_index))
    #x_train,x_test,y_train = label_data(feature_vectors,real_users_index,bot_users_index)
    #print len(x_train[0])
    #print type(metadata_features)
    #print "shape of train features:" + str(np.array(x_train).shape)
    print "#considerd users:" + str(len(considerd_users_index))
    label_X, label_Y = data_labelprop(final_features, real_users_index,
                                      bot_users_index)
    print len(label_X), len(label_Y)
    # Learn with LabelSpreading
    label_spread = label_propagation.LabelSpreading(kernel='rbf', alpha=0.8)
    label_spread.fit(label_X, label_Y)
    output_labels = label_spread.transduction_
    print output_labels
    print accuracy_score(np.array(label_Y), output_labels) * 100
    #print feature_vectors.isnull().any()
    #print metadata_features.values
    #print feature_vectors.columns[feature_vectors.isnull().any()].tolist()
    #print feature_vectors.isnull().sum()
    '''
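# A minimal sketch of the corpus-level tfidf(documents) call used in main() above,
# assuming it returns one dense TF-IDF row per document so the result can be wrapped
# in np.array / pd.DataFrame as done there. This version is built on scikit-learn's
# TfidfVectorizer; the original helper may differ.
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(documents):
    vectorizer = TfidfVectorizer()
    # fit on the per-user message strings and return a dense matrix:
    # one row per document, one column per vocabulary term
    return vectorizer.fit_transform(documents).toarray()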
#WHERE dataset_id < 20"
c.execute(qry)
documentList = []
documentNumber = 0
docMap = []
for id, title, description in c.fetchall():
    documentList.append(title + description)
    docMap.append(id)
c.close()
vectors = []
print "gotDocs"
for x in range(len(documentList)):
    words = {}
    for word in documentList[documentNumber].split(None):
        words[word] = tfidf(word, documentList[documentNumber], documentList)
    #for item in sorted(words.items(), key=itemgetter(1), reverse=True):
    #    print "%f <= %s" % (item[1], item[0])
    vectors.append(words)
    documentNumber = x + 1
print "got vectors"
sim = []
for i in range(len(vectors[:-1])):
    for j in range(i + 1, len(vectors)):
        sim = cos_sim(vectors[i], vectors[j])
        db_id1 = docMap[i]
        db_id2 = docMap[j]
        qry = "INSERT into cosine_similarity(id1, id2, score) VALUES (%s, %s, %s)"
        c = conn.cursor()
        c.execute(qry, (db_id1, db_id2, sim))
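# A minimal sketch of the cos_sim helper assumed by the snippet above, treating each
# document vector as a dict mapping word -> TF-IDF weight and computing the cosine
# similarity of the two sparse vectors. Assumed behaviour, not the original code.
import math

def cos_sim(vec_a, vec_b):
    # dot product over the words the two vectors share
    dot = sum(vec_a[w] * vec_b[w] for w in vec_a if w in vec_b)
    norm_a = math.sqrt(sum(v * v for v in vec_a.values()))
    norm_b = math.sqrt(sum(v * v for v in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)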
def textmining():
    # return textmining_request(request)
    #
    # @copy_current_request_context
    # def textmining_request(req):
    #21.08.11 app=Flask(__name__)
    #21.08.11 app.config['JSONIFY_PRETTYPRINT_REGULAR']=True
    print("************************Textmining************************")
    ### Angular post data
    try:
        # current_app.preprocess_request()
        if request.method == 'POST':
            data = request.json
            print(data)
            email = data['userEmail']
            keyword = data['keyword']
            savedDate = data['savedDate']
            optionList = data['optionList']
            analysisName = data['analysisName']
        else:
            return 'GET result'
    except Exception as e:
        resultDic = {"result": e}
        return json.dumps(resultDic, default=json_util.default, ensure_ascii=False)
    # email = '*****@*****.**'
    # keyword = "통일"
    # savedDate = "2021-08-06T11:52:05.706Z"
    if analysisName == 'count':
        print("Starting word-count analysis\n")
        result = word_count(email, keyword, savedDate, optionList, analysisName)
        print("\nWord-count analysis result\n", result)
        resultDic = {  #'returnDate' : datetime.datetime.now(),
            'activity': analysisName,
            'email': email,
            'keyword': keyword,
            'savedDate': savedDate,
            'optionList': optionList,
            'result': result
        }
    elif analysisName == 'tfidf':
        print("Starting tfidf analysis\n")
        result = tfidf(email, keyword, savedDate, optionList, analysisName)
        print("\ntfidf analysis result\n", result)
        resultDic = {  #'returnDate' : datetime.datetime.now(),
            'activity': analysisName,
            'email': email,
            'keyword': keyword,
            'savedDate': savedDate,
            'optionList': optionList,
            'result': result
        }
    # for semanticNetworkAnalysis
    elif analysisName == 'network':
        print("Starting semantic network analysis\n")
        result1, result2 = semanticNetworkAnalysis(email, keyword, savedDate,
                                                   optionList, analysisName)
        print("\nSemantic network analysis result\n")
        print("\n network json(dict)", result1, "\n")
        print("\n centrality json(dict)", result2, "\n")
        resultDic = {  #'returnDate' : datetime.datetime.now(),
            'activity': analysisName,
            'email': email,
            'keyword': keyword,
            'savedDate': savedDate,
            'optionList': optionList,
            'result1': result1,
            'result2': result2
        }
    # for kmeans
    elif analysisName == 'kmeans':
        print("Starting kmeans analysis\n")
        result1, result2 = kmeans(email, keyword, savedDate, optionList,
                                  analysisName)
        print("\n kmeans analysis result\n")
        print("\n plot json(dict)", result1, "\n")
        print("\n cluster json(dict)", result2, "\n")
        resultDic = {  #'returnDate' : datetime.datetime.now(),
            'activity': analysisName,
            'email': email,
            'keyword': keyword,
            'savedDate': savedDate,
            'optionList': optionList,
            'result1': result1,
            'result2': result2
        }
    else:
        return 'result'
    return json.dumps(resultDic, default=json_util.default, ensure_ascii=False)
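# A hedged usage example for the textmining() handler above, showing the JSON fields it
# reads from request.json. The endpoint path '/textmining' and the field values are
# assumptions for illustration; only the field names and the savedDate sample come from
# the handler itself.
import requests

payload = {
    "userEmail": "user@example.com",           # hypothetical value
    "keyword": "machine learning",             # hypothetical value
    "savedDate": "2021-08-06T11:52:05.706Z",   # sample date from the handler's comments
    "optionList": [],                          # contents depend on the analysis backend
    "analysisName": "tfidf",                   # one of: count, tfidf, network, kmeans
}
response = requests.post("http://localhost:5000/textmining", json=payload)
print(response.json())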
[TF, IDF, dictionary] = dictionary_sort(TF, IDF, dictionary)
"""
Downsize features
Discard the least important
"""
# discard that many elements from the end of TF, IDF and dictionary
if discard > 0:
    TF = TF[:, :-discard]
    IDF = IDF[:-discard]
    dictionary = dictionary[:-discard]
'''
Create the TF-IDF MATRIX from the training data to be used with svd and gmms
'''
TFIDF = tfidf(TF, IDF)
'''
Singular Value Decomposition
'''
svd = TruncatedSVD(n_components=svd_components)
TFIDFsvd = svd.fit_transform(TFIDF)
'''
extract data for each class separately
GMMs will be trained separately on each class's TFIDF samples
'''
TFIDF_class = []
for class_num in range(1, 16):
    TFIDF_class.append(samples_from_class(TFIDFsvd, class_num, labels))
'''
GMM training
We train #classes = 15 GMMs to estimate the distribution of the features