Example #1
 def qbe(self, s, v, o, a):
     vectorizer = TfidfVectorizer(min_df=1)
     res = {}
     qst = remove_keys_of_empty_value(self.documents[self.q].get_subj_syn())
     len_sq = len(qst.keys())
     qvt = remove_keys_of_empty_value(self.documents[self.q].get_verb_syn())
     len_vq = len(qvt.keys())
     qot = remove_keys_of_empty_value(self.documents[self.q].get_obj_syn())
     len_oq = len(qot.keys())
     qat = remove_keys_of_empty_value(self.documents[self.q].get_adv_syn())
     len_aq = len(qat.keys())
     
     for d in self.documents:
         # subj
         dst = remove_keys_of_empty_value(self.documents[d].get_subj_syn())
         Xs = vconcat(qst, dst)
         if len_sq < 1 or len(dst.keys()) < 1:
             subj_sim = 0.0
         else:
             Xs_vec = vectorizer.fit_transform(Xs)
             subj_sim = np.average(csim(Xs_vec[0:len_sq],Xs_vec[len_sq:]))
         # verb
         dvt = remove_keys_of_empty_value(self.documents[d].get_verb_syn())
         Xv = vconcat(qvt, dvt)
         if len_vq < 1 or len(dvt.keys()) < 1:
             verb_sim = 0.0
         else:
             Xv_vec = vectorizer.fit_transform(Xv)
             verb_sim = np.average(csim(Xv_vec[0:len_vq],Xv_vec[len_vq:]))
         # obj
         dot = remove_keys_of_empty_value(self.documents[d].get_obj_syn())
         Xo = vconcat(qot, dot)
         if len_oq < 1 or len(dot.keys()) < 1:
             obj_sim = 0.0
         else:
             Xo_vec = vectorizer.fit_transform(Xo)
             obj_sim = np.average(csim(Xo_vec[0:len_oq],Xo_vec[len_oq:]))
         # adv
         dat = remove_keys_of_empty_value(self.documents[d].get_adv_syn())
         Xa = vconcat(qat, dat)
         if len_aq < 1 or len(dat.keys()) < 1:
             adv_sim = 0.0
         else:
             Xa_vec = vectorizer.fit_transform(Xa)
             adv_sim = np.average(csim(Xa_vec[0:len_aq],Xa_vec[len_aq:]))
         res[d] = s * subj_sim + v * verb_sim + o * obj_sim + a * adv_sim
     if self.norm:
         answer = norm_dic(res)
     else:
         answer = sort_dic_desc(res)
     return answer
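The method above leans on several project-specific helpers that are not shown (remove_keys_of_empty_value, vconcat, csim, norm_dic, sort_dic_desc). Below is only a sketch of plausible implementations, assuming csim is scikit-learn's cosine_similarity and that the get_*_syn() methods return dicts mapping a pattern key to a synonym string; none of these definitions come from the original code.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as csim

def remove_keys_of_empty_value(d):
    # Drop entries whose value is empty so they do not dilute the TF-IDF average.
    return {k: v for k, v in d.items() if v}

def vconcat(q_dict, d_dict):
    # Stack the query strings on top of the document strings; the caller later
    # splits the vectorized matrix at len(q_dict) to compare the two halves.
    return list(q_dict.values()) + list(d_dict.values())

def sort_dic_desc(dic):
    # Sort a {doc_id: score} dict by score, highest first.
    return dict(sorted(dic.items(), key=lambda kv: kv[1], reverse=True))

def norm_dic(dic):
    # Scale scores into [0, 1] by the maximum value, then sort descending.
    max_val = max(dic.values()) if dic else 1.0
    max_val = max_val or 1.0
    return sort_dic_desc({k: v / max_val for k, v in dic.items()})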
Example #2
    def tfidf(self, path="./bench/"):  # This function is used for comparison with qbe
        vectorizer = TfidfVectorizer(stop_words='english')
        lst_files = []
        doc_dict = {}
        inv_doc_dict = {}
        i = 0
        for f in os.listdir(path):
            if f.endswith(".txt"):
                # Read each benchmark document, skipping undecodable bytes.
                with open(path + f, errors='ignore') as d:
                    cont = d.read()
                lst_files.append(cont)
                doc_dict[f[:-4]] = i  # [:-4] trims '.txt' from the filename
                inv_doc_dict[i] = f[:-4]
                i += 1

        tfidf_bow = vectorizer.fit_transform(lst_files)
        search = csim(tfidf_bow[doc_dict[self.q]], tfidf_bow)
        search = list(search[0])

        ans = {}
        for i, score in enumerate(search):
            ans[inv_doc_dict[i]] = score
        return norm_dic(ans)
Example #3
def pos_patterns_sim(query_id, documents, vectorizer, pos='subj'):
    if pos=='subj':
        qt = remove_keys_of_empty_value(documents[query_id].get_subj_syn())
    elif pos =='verb':
        qt = remove_keys_of_empty_value(documents[query_id].get_verb_syn())
    elif pos =='obj':
        qt = remove_keys_of_empty_value(documents[query_id].get_obj_syn())
    elif pos =='adv':
        qt = remove_keys_of_empty_value(documents[query_id].get_adv_syn())
    else:
        raise ValueError('The given pos value is not within (subj, verb, obj, adv)')

    len_q = len(qt.keys())
    for d in documents:
        if pos=='subj':
            dt = remove_keys_of_empty_value(documents[d].get_subj_syn())
        elif pos =='verb':
            dt = remove_keys_of_empty_value(documents[d].get_verb_syn())
        elif pos =='obj':
            dt = remove_keys_of_empty_value(documents[d].get_obj_syn())
        elif pos =='adv':
            dt = remove_keys_of_empty_value(documents[d].get_adv_syn())
        else:
            raise ValueError('The given pos value is not within (subj, verb, obj, adv)')
        X = vconcat(qt, dt)
        # Guard against an all-blank input, which the vectorizer cannot fit;
        # anything else is vectorized and compared as usual.
        if len(np.unique(X)) == 1 and np.unique(X)[0] == ' ':
            pos_sim = 0.0
        else:
            X_vec = vectorizer.fit_transform(X)
            pos_sim = np.average(csim(X_vec[0:len_q], X_vec[len_q:]))
    return pos_sim
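A minimal usage sketch for pos_patterns_sim, assuming documents maps ids to parsed documents and 'doc_03' is a hypothetical query id; the vectorizer is the same kind used in the other examples.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=1)
# Average cosine similarity of the query's subject patterns against the
# collection ('doc_03' is an illustrative id, not from the original code).
subj_score = pos_patterns_sim('doc_03', documents, vectorizer, pos='subj')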
Example #4
def qbe(q,documents, s, v, o, a, vectorizer):
    res = {}
    len_sq = len(q.get_subj_syn().keys())
    len_vq = len(q.get_verb_syn().keys())
    len_oq = len(q.get_obj_syn().keys())
    len_aq = len(q.get_adv_syn().keys())
    for d in documents:
        Xs = vconcat(q.get_subj_syn(), documents[d].get_subj_syn())
        Xs_vec = vectorizer.fit_transform(Xs)
        subj_sim = np.average(csim(Xs_vec[0:len_sq],Xs_vec[len_sq:]))
        Xv = vconcat(q.get_verb_syn(), documents[d].get_verb_syn())
        Xv_vec = vectorizer.fit_transform(Xv)
        verb_sim = np.average(csim(Xv_vec[0:len_vq],Xv_vec[len_vq:]))
        Xo = vconcat(q.get_obj_syn(), documents[d].get_obj_syn())
        Xo_vec = vectorizer.fit_transform(Xo)
        obj_sim = np.average(csim(Xo_vec[0:len_oq],Xo_vec[len_oq:]))
        Xa = vconcat(q.get_adv_syn(), documents[d].get_adv_syn())
        Xa_vec = vectorizer.fit_transform(Xa)
        adv_sim = np.average(csim(Xa_vec[0:len_aq],Xa_vec[len_aq:]))

        # Weighted combination of the per-role similarities.
        res[d] = s * subj_sim + v * verb_sim + o * obj_sim + a * adv_sim
    return norm_dic(res)
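A call sketch for this stand-alone variant, assuming documents is a dict of parsed documents keyed by id and q is the parsed query document; the id and the weights are illustrative only.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=1)
q = documents['query_01']  # hypothetical query id
# Weight subjects and objects more heavily than verbs and adverbials.
ranking = qbe(q, documents, s=0.4, v=0.1, o=0.4, a=0.1, vectorizer=vectorizer)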
Example #5
    '''
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    '''
    
print()
print()

# Specifying random_state so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

# Compute distance matrix:
distm = 1 - csim(nmatrix)

pos = mds.fit_transform(distm)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=names1)) 

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
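The snippet ends right after the axes are created; a plausible continuation draws each cluster as its own scatter group and annotates the points with their titles (the marker and text styling here are assumptions, not part of the original).

# Plot each cluster with its own marker series, labelled by cluster id.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)

ax.legend(numpoints=1)  # one marker per legend entry
# Annotate every point with its document title.
for i in range(len(df)):
    ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['title'], size=8)

plt.show()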