def words_2gram_adj_matrix(df, voc, col='text', window=2, normalize=True,
                           min_abs=0, min_perc=0.0, binary=False):
    """Build per-offset co-occurrence matrices for `voc` and derive an adjacency matrix.

    For every relative position ``off`` in ``[-window, ..., -1, 1, ..., window]``
    a matrix ``D[off]`` of shape ``(len(voc), len(full_voc))`` counts how often
    each word of the full vocabulary appears exactly ``off`` positions away
    from each word of `voc` within a sentence.

    Args:
        df: Object indexed as ``df[col]`` (presumably a pandas DataFrame --
            confirm against callers); the result is handed to
            ``sm.get_all_words`` and ``sm.get_all_sentences``.
        voc: Sized iterable of the words of interest (matrix rows). If a word
            is repeated, its first position is used, as ``list.index`` would.
        col: Key of the text column in `df`.
        window: Maximum distance (in words) between a word and its context.
        normalize: If true, divide each row by the number of occurrences of
            the corresponding `voc` word.
        min_abs: Raw counts below this value are zeroed before normalization.
        min_perc: Values below this threshold are zeroed after normalization.
        binary: If true, every remaining positive entry is set to 1.

    Returns:
        Tuple ``(A, D, full_voc)``: ``A`` is a ``(len(voc), len(voc))`` matrix
        with ``A[i, j] = sum_off sum_w sqrt(D[off][i, w] * D[off][j, w])`` and
        a zeroed diagonal; ``D`` is the dict of per-offset matrices;
        ``full_voc`` is the list of column words.

    Raises:
        ValueError: If a context word found in a sentence is missing from the
            full vocabulary.
    """
    def has_hebrew_letter(w):
        # Keep only words containing at least one character in the א..ת range.
        return any('א' <= c <= 'ת' for c in w)

    full_voc = list(np.unique(
        sm.get_all_words(df[col], filter_fun=has_hebrew_letter)))

    # get list of sentences
    data = sm.get_all_sentences(df[col])

    # Word -> index maps: constant-time lookups instead of scanning the
    # vocabularies with `in` / `.index` for every token and every neighbour.
    row_of = {}
    for i, w in enumerate(voc):
        row_of.setdefault(w, i)  # first occurrence wins, like list.index
    col_of = {w: j for j, w in enumerate(full_voc)}

    # fill incidence matrices
    c = np.zeros(len(voc))  # occurrences of each voc word
    offsets = list(range(-window, 0)) + list(range(1, window + 1))
    D = {off: np.zeros((len(voc), len(full_voc))) for off in offsets}
    for txt in data:
        sent = sm.get_all_words(txt, filter_fun=has_hebrew_letter)
        n_words = len(sent)
        for k, w in enumerate(sent):
            i = row_of.get(w)
            if i is None:
                continue
            c[i] += 1
            for off in offsets:
                if 0 <= k + off < n_words:
                    neighbour = sent[k + off]
                    j = col_of.get(neighbour)
                    if j is None:
                        raise ValueError(
                            f"{neighbour!r} is not in the full vocabulary")
                    D[off][i, j] += 1

    # normalize
    # 1/count per row, with 0 for words that never occurred. Their rows are
    # all zero anyway, so this matches the former nan_to_num(1/c) result
    # without triggering a divide-by-zero warning.
    inv_count = np.divide(1.0, c, out=np.zeros_like(c), where=c > 0)
    for off in offsets:
        D[off][D[off] < min_abs] = 0
        if normalize:
            D[off] = D[off] * inv_count[:, np.newaxis]
        D[off][D[off] < min_perc] = 0
        if binary:
            D[off][D[off] > 0] = 1

    # adj matrix
    A = np.zeros((len(voc), len(voc)))
    for off in offsets:
        d = np.sqrt(D[off])
        A += np.matmul(d, d.transpose())
    np.fill_diagonal(A, 0)  # a word is not considered adjacent to itself

    return (A, D, full_voc)
def words_local_incidence_matrix(df, voc, col='text', window=3, normalize=True,
                                 min_abs=0, min_perc=0.1, binary=False):
    """Count, for each word of `voc`, the words appearing within `window` positions of it.

    Args:
        df: Object indexed as ``df[col]`` (presumably a pandas DataFrame --
            confirm against callers); the result is handed to
            ``sm.get_all_words`` and ``sm.get_all_sentences``.
        voc: Sized iterable of the words of interest (matrix rows). If a word
            is repeated, its first position is used, as ``list.index`` would.
        col: Key of the text column in `df`.
        window: Number of words taken on each side of an occurrence.
        normalize: If true, divide each row by the number of occurrences of
            the corresponding `voc` word.
        min_abs: Raw counts below this value are zeroed before normalization.
        min_perc: Values below this threshold are zeroed after normalization.
        binary: If true, every remaining positive entry is set to 1.

    Returns:
        Array of shape ``(len(voc), n_full_voc)`` whose columns follow
        ``np.unique(sm.get_all_words(df[col]))``.

    Raises:
        ValueError: If a neighbouring word found in a sentence is missing from
            the full vocabulary.
    """
    # NOTE(review): the full vocabulary uses sm.get_all_words' default filter
    # while sentences below are filtered for Hebrew letters; a neighbour only
    # admitted by the latter raises ValueError -- confirm the default filter.
    full_voc = list(np.unique(sm.get_all_words(df[col])))

    # get list of sentences
    data = sm.get_all_sentences(df[col])

    def has_hebrew_letter(w):
        # Keep only words containing at least one character in the א..ת range.
        return any('א' <= c <= 'ת' for c in w)

    # Word -> index maps: constant-time lookups instead of scanning the
    # vocabularies with `in` / `.index` for every token and every neighbour.
    row_of = {}
    for i, w in enumerate(voc):
        row_of.setdefault(w, i)  # first occurrence wins, like list.index
    col_of = {w: j for j, w in enumerate(full_voc)}

    # fill incidence matrix
    c = np.zeros(len(voc))  # occurrences of each voc word
    D = np.zeros((len(voc), len(full_voc)))
    for txt in data:
        sent = sm.get_all_words(txt, filter_fun=has_hebrew_letter)
        for k, w in enumerate(sent):
            i = row_of.get(w)
            if i is None:
                continue
            c[i] += 1
            # Clamp the left edge at 0: a negative slice start would wrap to
            # the end of the sentence and drop the left neighbours of words
            # located fewer than `window` positions from the start.
            neighbours = sent[max(0, k - window):k] + sent[k + 1:k + window + 1]
            for w2 in neighbours:
                j = col_of.get(w2)
                if j is None:
                    raise ValueError(
                        f"{w2!r} is not in the full vocabulary")
                D[i, j] += 1

    # normalize
    D[D < min_abs] = 0
    if normalize:
        # 1/count per row, with 0 for words that never occurred. Their rows
        # are all zero anyway, so this matches the former nan_to_num(1/c)
        # result without triggering a divide-by-zero warning.
        inv_count = np.divide(1.0, c, out=np.zeros_like(c), where=c > 0)
        D = D * inv_count[:, np.newaxis]
    D[D < min_perc] = 0
    if binary:
        D[D > 0] = 1

    return D
def word2vec(df, col='text', size=100, window=3, min_count=1, workers=4,
             save_to=None, **kwargs):
    """Train a Word2Vec model on the tokenized sentences of ``df[col]``.

    Args:
        df: Object indexed as ``df[col]``; the result is handed to
            ``sm.get_all_sentences``.
        col: Key of the text column in `df`.
        size: Forwarded to ``Word2Vec`` as ``size``.
        window: Forwarded to ``Word2Vec`` as ``window``.
        min_count: Forwarded to ``Word2Vec`` as ``min_count``.
        workers: Forwarded to ``Word2Vec`` as ``workers``.
        save_to: Optional path; when truthy the trained model is pickled there.
        **kwargs: Extra keyword arguments forwarded to ``Word2Vec``.

    Returns:
        The trained model object.
    """
    # NOTE(review): presumably gensim's Word2Vec. gensim >= 4.0 renamed `size`
    # to `vector_size`; this call matches the pre-4.0 API -- confirm the
    # installed version.
    sents = sm.get_all_sentences(df[col])
    # Tokenize each sentence with an empty stop-word collection.
    sents = [sm.get_all_words(s, stopwords=()) for s in sents]
    model = Word2Vec(sents, size=size, window=window, min_count=min_count,
                     workers=workers, **kwargs)
    if save_to:
        # Context manager guarantees the file is flushed and closed even if
        # pickling fails (the bare open() used previously leaked the handle).
        with open(save_to, 'wb') as out_file:
            pickle.dump(model, out_file)
    return model
def common_context(df, words, col='text', window=2):
    """Print the context words shared by each pair of words, with matching sentences.

    For every pair, the per-offset matrices returned by
    ``words_2gram_adj_matrix`` are used to find the words that occur at the
    same relative offset from both members of the pair. The pair, its shared
    context words, and every sentence containing at least one word of the pair
    and at least one context word are printed.

    Args:
        df: Object indexed as ``df[col]``; passed on to
            ``sm.get_all_sentences`` and ``words_2gram_adj_matrix``.
        words: Either a single pair of words or a sequence of pairs. A single
            pair is recognised by its first element being a ``str``.
        col: Key of the text column in `df`.
        window: Maximum word distance forwarded to ``words_2gram_adj_matrix``.

    Returns:
        None. All results are printed.
    """
    # A lone pair of strings is wrapped so the loop below always sees pairs.
    if isinstance(words[0], str):
        words = (words,)

    sents = sm.get_all_sentences(df[col])

    for pair in words:
        print("Words:\t", pair)
        _, per_offset, full_voc = words_2gram_adj_matrix(
            df, pair, col=col, window=window)

        # Collect, offset by offset, the columns that are non-zero for both
        # words of the pair (rows 0 and 1).
        context = []
        for matrix in per_offset.values():
            shared = matrix[0, :] * matrix[1, :]
            context += [full_voc[hit[0]] for hit in np.argwhere(shared)]
        print("context:\t", context)

        for idx, sentence in enumerate(sents):
            if not np.any([w in sentence for w in pair]):
                continue
            if np.any([w in sentence for w in context]):
                print(idx, sentence)