Example No. 1
def single_diffbotapi_call(request, token, list_of_urls):
    # Fetch each URL through the Diffbot API and collect per-URL feature lists.
    features = {}
    list_of_titles=[]
    url_title = {}

    i=0
    for url in list_of_urls:
        print(i)
        try:
            ti,txt,sent, num_of_links = TE.diffbot_api(request, token, url)
            cw_article = count_words_string(txt)
            sentiment = grab_sentiment_articles(sent)
            features[url] = [cw_article, sentiment, num_of_links]
            list_of_titles.append(ti)
            url_title[url] = ti

        except KeyError as e:
            print(e)
        i=i+1

    update_urls = [url for url in features]

    wf = website_Freq(update_urls)
    bp = basicParse(update_urls,list_of_titles)
    d2v = doc2vec(bp)
    tfidf_r = tfidf(d2v,bp)
    for url, data in features.items():
        avg_tf = take_avg(tfidf_r[url])
        features[url].append(avg_tf)   
        features[url].append(wf[url])
    return features
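Example No. 1 accumulates, per URL, a feature list of [word_count, sentiment, num_of_links] and later appends the average TF-IDF score and a website-frequency value. The helpers it calls (count_words_string, take_avg, grab_sentiment_articles, and the rest) are not shown on this page; below is a minimal sketch of the two simplest ones, under the assumption that they just count whitespace-separated tokens and average a list of scores.

def count_words_string(text):
    # Assumed helper: naive word count over whitespace-separated tokens.
    return len(text.split())

def take_avg(scores):
    # Assumed helper: mean of a list of TF-IDF scores; 0.0 for an empty list.
    return sum(scores) / len(scores) if scores else 0.0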
Example No. 2
def compare(list1, list2, list3):
    # Compare the 9th field of each row in list1 against each row in list2,
    # scoring the pairs with tfidf, then sort and normalize the combined results.
    g = []
    for i, row1 in enumerate(list1):
        for row2 in list2:
            g.extend(tfidf(row1[8], row2[8], list3[i]))
    g = sorted(g, key=getKey)
    g = normalize(g)
    return g
Example No. 3
def compare(list1, list2, list3):
    # Compare the 9th field of each row in list1 against each row in list2,
    # scoring the pairs with tfidf, then sort and normalize the combined results.
    g = []
    for i, row1 in enumerate(list1):
        for row2 in list2:
            g.extend(tfidf(row1[8], row2[8], list3[i]))
    g = sorted(g, key=getKey)
    g = normalize(g)
    return g
Example No. 4
    def test_Non_UTF_Characters(self):
        docs = (
            """GMU Machine Learning and Inference Laboratory
... 2002 Copyright 2002-2003 Machine Learning and Inference Laboratory
Front page created by Guido Cervone and Janejira Kalsmith. ... 
Description: Research on Theories of Learning, Inference, and Discovery Data Mining and Knowledge Discovery, User...
""",
            """Yahoo! Groups : machine-learning
machine-learning Machine Learning, [ Join This Group! ]. Home, Messages, Links,
Members Only, Chat, ... Machine Learning mailing list: [email protected]. ... 
Description: An unmoderated mailing list intended for people in computer sciences, statistics, mathematics, and...
        """,
        )

        x = [tfidf("", doc, docs) for doc in docs]
        print(x)
Example No. 5
    def get_vector_for(self, text):
        """
         zwraca wekorTF-IDF dla podanego tekstu

        @param Text text : tekst dla którego trzeba wyliczyć wektor TF-IDF
        @return tuple :
        @author Andrzej Skupień
        """
        vector = []
        keywords = self.keywords
        for word in keywords:
            try:
                tfidf_value = tfidf(str(word), str(text), self.documents)
            except ZeroDivisionError:
                tfidf_value = 0
            vector.append(tfidf_value)
        return Vector(vector)
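Examples No. 4, No. 5, and No. 7 all call a three-argument tfidf(term, document, corpus) helper that is not defined anywhere in these excerpts. Purely as an assumption about its shape, here is a minimal sketch of what a function with that signature conventionally computes (raw term frequency times smoothed inverse document frequency); the real helper may tokenize and weight differently.

import math

def tfidf(term, document, corpus):
    # Hypothetical sketch only, not the helper these examples actually import.
    tokens = document.split()
    tf = tokens.count(term) / len(tokens)          # ZeroDivisionError on empty text,
                                                   # matching the guard in get_vector_for above
    doc_freq = sum(1 for doc in corpus if term in doc.split())
    idf = math.log(len(corpus) / (1 + doc_freq))   # +1 keeps the denominator non-zero
    return tf * idf

Note that the final example on this page calls a different, two-argument tfidf(TF, IDF), which appears to operate on whole term-frequency and IDF matrices rather than on a single term.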
Example No. 6
def main(merged_filename, real_file, boted_file):

    documents = []
    print(merged_filename)
    users_info = getUserIMDMessages(merged_filename)
    print(users_info.keys())
    print("total users:" + str(len(users_info.keys())))
    considerd_users_index = []
    index = 0
    for user in users_info.keys():
        if users_info[user]['m'] > 1:
            considerd_users_index.append(index)
        index += 1
    real_users_index, bot_users_index, real_users, bot_users = labeling_data(
        merged_filename, real_file, boted_file)
    similar_users = OrderedDict()
    real_users_cnt = 0
    bot_users_cnt = 0
    per_user_mentions = OrderedDict()
    spell = SpellChecker()
    slang = SlangNormalization()
    slang.readfile('slang.txt')
    rr = RepeatReplacer()
    lemmatiser = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    chats_info = []
    users_list = []
    with open(merged_filename, 'r') as f:
        lines = f.readlines()
        for line in lines:
            chat_info = {}
            user = str(
                line.split(',"u":')[1].split(',"e":')[0].replace('"',
                                                                 '')).lower()
            message = format_line(
                str(
                    line.split(',"m":')[1].split(',"nm":')[0].replace('"',
                                                                      '')))
            chat_info['user'] = user
            if user not in users_list:
                users_list.append(user)
                if user in real_users:
                    real_users_cnt += 1
                else:
                    bot_users_cnt += 1
            message_tokens = tokenize(message)
            if user not in per_user_mentions.keys():
                per_user_mentions[user] = [
                    token for token in message_tokens if token in users_list
                ]
            message = ' '.join(token.lower() for token in tokenize(message))
            normalized_msg = slang.translator(message)
            normalized_msg = ' '.join([
                lemmatiser.lemmatize(
                    spell.replace(rr.replace(token.lower())).decode('utf-8'),
                    pos="v") for token in normalized_msg.split(' ')
                if token not in stop_words and token != ""
            ])
            chat_info['msgs'] = normalized_msg
            chats_info.append(chat_info)
    f.close()
    #print len(chats_info)

    user_msgs = OrderedDict()
    for i in range(len(chats_info)):
        if chats_info[i]['user'] not in user_msgs.keys():
            user_msgs[chats_info[i]['user']] = ''
            user_msgs[chats_info[i]['user']] += chats_info[i]['msgs'] + ' '
        else:
            user_msgs[chats_info[i]['user']] += chats_info[i]['msgs'] + ' '
    labels = []
    for user in user_msgs.keys():
        documents.append(user_msgs[user])

    users = user_msgs.keys()
    print(users)
    users_dict = getUserIMDMessages(merged_filename)
    #print len(users.keys())
    user_chats_ft = get_chats_features(users_dict)
    #print users_list
    user_chats_ft = pd.DataFrame(user_chats_ft)
    user_imd_bins = pd.DataFrame(get_IMD_features(users_dict))
    user_features = pd.concat([user_chats_ft, user_imd_bins], axis=1)
    tfidf_representation = tfidf(documents)
    tfidf_features = pd.DataFrame(np.array(tfidf_representation))
    #user_mentions_features = pd.DataFrame(user_mentions_features)
    #print get_no_user_msgs(merged_filename).values
    no_user_msgs_features = pd.DataFrame(
        get_no_user_msgs(merged_filename).values())
    user_entropy = pd.DataFrame(np.array(
        get_entropy_features(merged_filename)))
    #CCE_features = pd.DataFrame(get_user_cce_features(merged_filename))
    #print "user Entropy features"
    #print per_user_mentions
    #print "user mention features"
    user_mentions_features = pd.DataFrame(
        get_user_mentions_features(per_user_mentions))
    #print user_mentions_features.values.tolist()

    #metadata_features = pd.DataFrame(users_metadata(users_list))
    conversation_features = pd.DataFrame(ConversationalFeatures(chats_info))
    #print "conversational features"
    feature_vectors = pd.concat([
        user_entropy, user_mentions_features, no_user_msgs_features,
        conversation_features
    ],
                                axis=1)
    feature_vectors = get_final_features(feature_vectors,
                                         considerd_users_index)
    final_features = pd.concat([user_features, feature_vectors], axis=1)
    #print "shape of features:" + str(np.array(feature_vectors).shape)
    channel_followers = get_channel_followers(
        '../followers_cnt/',
        real_file.split('#')[1].split('database')[0])
    #real_users_index = set(real_users_index)
    users = set(users)
    for user in channel_followers:
        if user in users:
            real_users_index.append(list(users).index(user))
    print(real_users_index)

    print "# real users:" + str(
        real_users_cnt) + ' ' + "labelled real users:" + str(
            len(real_users_index))
    print '# bot users:' + str(
        bot_users_cnt) + ' ' + "labelled bot users:" + str(
            len(bot_users_index))
    #x_train,x_test,y_train = label_data(feature_vectors,real_users_index,bot_users_index)
    #print len(x_train[0])
    #print type(metadata_features)
    #print "shape of train features:" + str(np.array(x_train).shape)
    print "#considerd users:" + str(len(considerd_users_index))
    label_X, label_Y = data_labelprop(final_features, real_users_index,
                                      bot_users_index)
    print(len(label_X), len(label_Y))
    # Learn with LabelSpreading
    label_spread = label_propagation.LabelSpreading(kernel='rbf', alpha=0.8)
    label_spread.fit(label_X, label_Y)
    output_labels = label_spread.transduction_
    print(output_labels)
    print(accuracy_score(np.array(label_Y), output_labels) * 100)
    #print feature_vectors.isnull().any()
    #print metadata_features.values
    #print feature_vectors.columns[feature_vectors.isnull().any()].tolist()
    #print feature_vectors.isnull().sum()
Example No. 7
#WHERE dataset_id < 20"
c.execute(qry)
documentList = []
documentNumber = 0
docMap = []
for id,title, description in c.fetchall():
    documentList.append(title + description)
    docMap.append(id)
c.close()
vectors = []
print "gotDocs"
for x in range(len(documentList)):
    words = {}
    for word in documentList[documentNumber].split(None):
        words[word] = tfidf(word,documentList[documentNumber],documentList)

    #for item in sorted(words.items(), key=itemgetter(1), reverse=True):
    #    print "%f <= %s" % (item[1], item[0])
    vectors.append(words)
    documentNumber = x+1
print "got vectors"
sim = []
for i in range(len(vectors[:-1])):
    for j in range(i+1, len(vectors)):
        sim = cos_sim(vectors[i], vectors[j])
        db_id1 = docMap[i]
        db_id2 = docMap[j]
        qry = "INSERT into cosine_similarity(id1, id2, score) VALUES (%s, %s, %s)"
        c = conn.cursor()
        c.execute(qry, (db_id1, db_id2, sim))
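Example No. 7 stores each document's weights as a plain {word: tfidf} dict and compares document pairs with cos_sim, which is not shown in the excerpt. A minimal sketch of a cosine similarity over such sparse dict vectors, assuming that signature:

import math

def cos_sim(vec_a, vec_b):
    # Hypothetical sketch: cosine similarity of two sparse {term: weight} dicts.
    dot = sum(vec_a[t] * vec_b[t] for t in set(vec_a) & set(vec_b))
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)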
Example No. 8
def textmining():
    
#     return textmining_request(request)
    
# # @copy_current_request_context 
# def textmining_request(req):

    #21.08.11 app=Flask(__name__)
    #21.08.11 app.config['JSONIFY_PRETTYPRINT_REGULAR']=True
    print("************************Textmining************************")
    ### Angular post data

    try: 
        # current_app.preprocess_request()
        if request.method == 'POST':
            data = request.json 
            print(data)
            email = data['userEmail']
            keyword = data['keyword']
            savedDate = data['savedDate']
            optionList = data['optionList']
            analysisName = data['analysisName']
        else: return 'GET result'
    except Exception as e:
        resultDic = {
            "result": str(e)  # the exception object itself is not JSON-serializable
        }
        return json.dumps(resultDic, default=json_util.default, ensure_ascii=False)


    # email = '*****@*****.**'
    # keyword = "통일"
    # savedDate = "2021-08-06T11:52:05.706Z"


    if analysisName == 'count':
        print("Starting frequency analysis\n")
        result = word_count(email, keyword, savedDate, optionList, analysisName)
        print("\nFrequency analysis result\n", result)
        resultDic = {  # 'returnDate': datetime.datetime.now(),
            'activity': analysisName, 'email': email,
            'keyword': keyword, 'savedDate': savedDate,
            'optionList': optionList, 'result': result}
    
    elif analysisName == 'tfidf':
        print("Starting tfidf analysis\n")
        result = tfidf(email, keyword, savedDate, optionList, analysisName)
        print("\ntfidf analysis result\n", result)
        resultDic = {  # 'returnDate': datetime.datetime.now(),
            'activity': analysisName, 'email': email,
            'keyword': keyword, 'savedDate': savedDate,
            'optionList': optionList, 'result': result}

    # for semanticNetworkAnalysis
    elif analysisName == 'network':
        print("Starting semantic network analysis\n")
        result1, result2 = semanticNetworkAnalysis(email, keyword, savedDate, optionList, analysisName)
        print("\nSemantic network analysis result\n")
        print("\n network json(dict)", result1, "\n")
        print("\n centrality json(dict)", result2, "\n")

        resultDic = {  # 'returnDate': datetime.datetime.now(),
            'activity': analysisName, 'email': email,
            'keyword': keyword, 'savedDate': savedDate,
            'optionList': optionList, 'result1': result1, 'result2': result2}
    # for kmeans
    elif analysisName == 'kmeans':
        print("kmeans 분석을 시작합니다\n")
        result1, result2 = kmeans(email, keyword, savedDate, optionList, analysisName)
        print("\n kmeans 분석 결과\n")
        print("\n plot json(dict)", result1, "\n")
        print("\n cluster json(dict)", result2, "\n")
        
        resultDic = {#'returnDate' : datetime.datetime.now(), 
        'activity' : analysisName, 'email' : email,
        'keyword' : keyword, 'savedDate' : savedDate, 'optionList' : optionList, 'result1' : result1, 'result2': result2 }

    else: return 'result'

    return json.dumps(resultDic, default=json_util.default, ensure_ascii=False)
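The textmining() handler above dispatches on the analysisName field of a JSON POST body. A sketch of the payload shape it expects follows; the field names are taken from the handler, but the URL, port, and all values are placeholders, since the route decorator is not part of the excerpt.

import requests  # client-side only; not used by the handler itself

payload = {
    "userEmail": "user@example.com",        # placeholder value
    "keyword": "machine learning",          # placeholder value
    "savedDate": "2021-08-06T11:52:05.706Z",
    "optionList": [],
    "analysisName": "tfidf",                # one of: count, tfidf, network, kmeans
}
# Placeholder URL: the actual route bound to textmining() is not shown in the excerpt.
response = requests.post("http://localhost:5000/textmining", json=payload)
print(response.json())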
Example No. 9
[TF, IDF, dictionary] = dictionary_sort(TF, IDF, dictionary)
"""
Downsize features
Discart the least important 
"""
# discard that many elements from the end of TF, IDF, and dictionary
if discard > 0:
    TF = TF[:, :-discard]
    IDF = IDF[:-discard]
    dictionary = dictionary[:-discard]
'''
Create the TF-IDF matrix from the training data
to be used with SVD and GMMs
'''
TFIDF = tfidf(TF, IDF)
'''
Singular Value Decomposition
'''
svd = TruncatedSVD(n_components=svd_components)
TFIDFsvd = svd.fit_transform(TFIDF)
'''
Extract data for each class separately.
GMMs will be trained separately on each class's TF-IDF samples.
'''
TFIDF_class = []
for class_num in range(1, 16):
    TFIDF_class.append(samples_from_class(TFIDFsvd, class_num, labels))
'''
GMM training
We train #classes = 15 GMMs to estimate the distribution of the features