Esempio n. 1
0
def words_2gram_adj_matrix(df, voc, col='text', window=2,
                           normalize=True, min_abs=0, min_perc=0.0, binary=False):
    """Build per-offset co-occurrence matrices for `voc` and an adjacency matrix.

    For every sentence obtained from ``df[col]`` and every occurrence of a
    word of `voc` at position ``k``, the word found at position ``k + off``
    (for each non-zero offset in ``[-window, window]`` that stays inside the
    sentence) is counted in ``D[off]``.

    Parameters
    ----------
    df : indexable by `col`; ``df[col]`` is handed to ``sm.get_all_words`` and
        ``sm.get_all_sentences`` (presumably a pandas DataFrame of texts --
        confirm against `sm`).
    voc : list or tuple of words that form the rows of the matrices. If a
        word is repeated, its first position is used.
    col : name of the text column.
    window : maximal absolute offset of a neighbour.
    normalize : if true, each row is divided by the number of occurrences
        of its word.
    min_abs : raw counts strictly below this value are set to 0.
    min_perc : normalized values strictly below this value are set to 0
        (only applied when `normalize` is true).
    binary : if true, every remaining positive entry is set to 1.

    Returns
    -------
    (A, D, full_voc)
        ``full_voc`` is the sorted list of unique words that pass the letter
        filter below; ``D`` maps each offset to a
        ``len(voc) x len(full_voc)`` array (after thresholding and
        normalization); ``A`` is a ``len(voc) x len(voc)`` array with
        ``A[i, j] = sum over offsets and words w of
        sqrt(D[off][i, w] * D[off][j, w])`` and a zero diagonal.

    Raises
    ------
    ValueError
        If a neighbouring word is missing from ``full_voc``.
    """
    def in_letter_range(word):
        # keep words with at least one character between 'א' and 'ת'
        # (the Hebrew letters alef..tav)
        return any('א' <= ch <= 'ת' for ch in word)

    full_voc = list(np.unique(
        sm.get_all_words(df[col], filter_fun=in_letter_range)))
    # get list of sentences
    sentences = sm.get_all_sentences(df[col])

    # Position lookups. setdefault keeps the FIRST position of a repeated
    # word, which is what list.index() returns; full_voc is already unique.
    voc_pos = {}
    for pos, word in enumerate(voc):
        voc_pos.setdefault(word, pos)
    full_pos = {word: pos for pos, word in enumerate(full_voc)}

    # fill incidence matrices
    counts = np.zeros(len(voc))
    offsets = list(range(-window, 0)) + list(range(1, window + 1))
    D = {off: np.zeros((len(voc), len(full_voc))) for off in offsets}
    for txt in sentences:
        sent = sm.get_all_words(txt, filter_fun=in_letter_range)
        n_words = len(sent)
        for k, w in enumerate(sent):
            i = voc_pos.get(w)
            if i is None:
                continue
            counts[i] += 1
            for off in offsets:
                if 0 <= k + off < n_words:
                    neighbour = sent[k + off]
                    j = full_pos.get(neighbour)
                    if j is None:
                        # keeps the ValueError that list.index would raise
                        # for an unknown word
                        raise ValueError(
                            '{!r} is not in list'.format(neighbour))
                    D[off][i, j] += 1

    # normalize
    if normalize:
        # Words that never occur have an all-zero row, so a factor of 0 gives
        # the same result as 1/0 would, without the divide-by-zero warning.
        inv_counts = np.divide(1.0, counts,
                               out=np.zeros_like(counts), where=counts > 0)
    for off in offsets:
        D[off][D[off] < min_abs] = 0
        if normalize:
            D[off] = D[off] * inv_counts[:, np.newaxis]
            D[off][D[off] < min_perc] = 0
        if binary:
            D[off][D[off] > 0] = 1

    # adj matrix: two words are close when they share neighbours at the
    # same offset
    A = np.zeros((len(voc), len(voc)))
    for off in offsets:
        d = np.sqrt(D[off])
        A += np.matmul(d, d.transpose())
    np.fill_diagonal(A, 0)
    return (A, D, full_voc)
Esempio n. 2
0
def words_local_incidence_matrix(df, voc, col='text', window=3,
                                 normalize=True, min_abs=0, min_perc=0.1, binary=False):
    """Count, for each word of `voc`, the words found within `window` positions.

    For every sentence obtained from ``df[col]`` and every occurrence of a
    word of `voc` at position ``k``, all words at positions
    ``k-window .. k-1`` and ``k+1 .. k+window`` that exist in the sentence
    are counted in the row of that word.

    Parameters
    ----------
    df : indexable by `col`; ``df[col]`` is handed to ``sm.get_all_words`` and
        ``sm.get_all_sentences`` (presumably a pandas DataFrame of texts --
        confirm against `sm`).
    voc : list or tuple of words that form the rows of the matrix. If a word
        is repeated, its first position is used.
    col : name of the text column.
    window : number of positions looked at on each side of a word.
    normalize : if true, each row is divided by the number of occurrences
        of its word.
    min_abs : raw counts strictly below this value are set to 0.
    min_perc : normalized values strictly below this value are set to 0
        (only applied when `normalize` is true).
    binary : if true, every remaining positive entry is set to 1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(voc), n)`` where ``n`` is the number of unique
        words returned by ``sm.get_all_words(df[col])``; columns follow the
        sorted order of those unique words.

    Raises
    ------
    ValueError
        If a neighbouring word is missing from the full vocabulary.
    """
    full_voc = list(np.unique(sm.get_all_words(df[col])))
    # get list of sentences
    sentences = sm.get_all_sentences(df[col])

    def in_letter_range(word):
        # keep words with at least one character between 'א' and 'ת'
        # (the Hebrew letters alef..tav)
        return any('א' <= ch <= 'ת' for ch in word)

    # Position lookups. setdefault keeps the FIRST position of a repeated
    # word, which is what list.index() returns; full_voc is already unique.
    voc_pos = {}
    for pos, word in enumerate(voc):
        voc_pos.setdefault(word, pos)
    full_pos = {word: pos for pos, word in enumerate(full_voc)}

    # fill incidence matrix
    counts = np.zeros(len(voc))
    D = np.zeros((len(voc), len(full_voc)))
    for txt in sentences:
        sent = sm.get_all_words(txt, filter_fun=in_letter_range)
        for k, w in enumerate(sent):
            i = voc_pos.get(w)
            if i is None:
                continue
            counts[i] += 1
            # Clamp the left edge at 0: a negative slice start would wrap
            # around to the end of the sentence and drop the left neighbours
            # of words located less than `window` positions from the start.
            neighbours = sent[max(0, k - window):k] + sent[k + 1:k + window + 1]
            for w2 in neighbours:
                j = full_pos.get(w2)
                if j is None:
                    # keeps the ValueError that list.index would raise for
                    # an unknown word
                    raise ValueError('{!r} is not in list'.format(w2))
                D[i, j] += 1

    # normalize
    D[D < min_abs] = 0
    if normalize:
        # Words that never occur have an all-zero row, so a factor of 0 gives
        # the same result as 1/0 would, without the divide-by-zero warning.
        inv_counts = np.divide(1.0, counts,
                               out=np.zeros_like(counts), where=counts > 0)
        D = D * inv_counts[:, np.newaxis]
        D[D < min_perc] = 0
    if binary:
        D[D > 0] = 1
    return D
Esempio n. 3
0
def word2vec(df, col='text', size=100, window=3,
             min_count=1, workers=4, save_to=None, **kwargs):
    """Train a Word2Vec model on the tokenized sentences of ``df[col]``.

    Parameters
    ----------
    df : indexable by `col`; ``df[col]`` is handed to
        ``sm.get_all_sentences``.
    col : name of the text column.
    size, window, min_count, workers : forwarded unchanged to ``Word2Vec``.
        NOTE(review): if ``Word2Vec`` is gensim's, ``size=`` is the keyword
        of gensim < 4.0 (renamed ``vector_size`` in 4.0) -- confirm the
        installed version.
    save_to : optional path; when truthy the trained model is pickled there.
    **kwargs : extra keyword arguments forwarded to ``Word2Vec``.

    Returns
    -------
    The trained ``Word2Vec`` model.
    """
    sents = sm.get_all_sentences(df[col])
    # tokenize each sentence with an empty stop-word collection
    sents = [sm.get_all_words(s, stopwords=()) for s in sents]
    model = Word2Vec(sents, size=size, window=window,
                     min_count=min_count, workers=workers, **kwargs)
    if save_to:
        # context manager so the file is flushed and closed even when
        # pickling fails
        with open(save_to, 'wb') as fh:
            pickle.dump(model, fh)
    return model
Esempio n. 4
0
def common_context(df, words, col='text', window=2):
    """Print the context words shared by each pair of words, with examples.

    `words` is either one group of words (its first element is a string) or a
    sequence of such groups. For every group, the per-offset matrices of
    ``words_2gram_adj_matrix`` are computed with the group as vocabulary; a
    context word is kept when its entry is non-zero for both the first and
    the second word of the group at the same offset (only rows 0 and 1 are
    read, so each group needs at least two words). A word shared at several
    offsets is listed once per offset.

    The function then prints every sentence of ``df[col]``, with its index,
    for which ``w in sentence`` holds for at least one word of the group and
    for at least one context word.
    NOTE(review): whether that is a token or a substring test depends on the
    type returned by ``sm.get_all_sentences`` -- confirm.

    Returns None; all output goes through ``print``.
    """
    # a single group is wrapped so the loop below always sees groups
    groups = (words,) if isinstance(words[0], str) else words
    sentences = sm.get_all_sentences(df[col])

    for pair in groups:
        print("Words:\t", pair)
        _, per_offset, voc = words_2gram_adj_matrix(
            df, pair, col=col, window=window)

        context = []
        for matrix in per_offset.values():
            # non-zero product <=> both words have this neighbour at this offset
            shared = matrix[0, :] * matrix[1, :]
            for hit in np.argwhere(shared):
                context.append(voc[hit[0]])
        print("context:\t", context)

        for number, sentence in enumerate(sentences):
            if not np.any([w in sentence for w in pair]):
                continue
            if not np.any([w in sentence for w in context]):
                continue
            print(number, sentence)