import re
import pickle
from time import time

import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

# NOTE: 'sm' (tokenization / vocabulary utilities) and 'ba' (data loading and
# basic analysis) are project-local modules used throughout this file, along
# with the helpers get_labeled_raw_data, extract_features, test_models,
# models_diagnosis, plot_results and graph_of_words; import them according to
# the repository layout.


def words_2gram_adj_matrix(df, voc, col='text', window=2, normalize=True,
                           min_abs=0, min_perc=0.0, binary=False):
    # full vocabulary: every corpus word containing a Hebrew character
    full_voc = list(np.unique(sm.get_all_words(
        df[col], filter_fun=lambda w: any('א' <= ch <= 'ת' for ch in w))))
    # get list of sentences
    data = sm.get_all_sentences(df[col])
    # fill incidence matrices: one (voc x full_voc) matrix per offset
    c = np.zeros(len(voc))  # occurrence count per vocabulary word
    offsets = list(range(-window, 0)) + list(range(1, window + 1))
    D = {off: np.zeros((len(voc), len(full_voc))) for off in offsets}
    for txt in data:
        sent = sm.get_all_words(
            txt, filter_fun=lambda w: any('א' <= ch <= 'ת' for ch in w))
        for k, w in enumerate(sent):
            if w in voc:
                i = voc.index(w)
                c[i] += 1
                for off in offsets:
                    if 0 <= k + off < len(sent):
                        D[off][i, full_voc.index(sent[k + off])] += 1
    # normalize: absolute threshold, counts -> rates, relative threshold
    # (safe inverse: zero-count rows stay zero instead of dividing by zero)
    inv_c = np.divide(1.0, c, out=np.zeros_like(c), where=c > 0)
    for off in offsets:
        D[off][D[off] < min_abs] = 0
        if normalize:
            D[off] = D[off] * inv_c[:, np.newaxis]
            D[off][D[off] < min_perc] = 0
        if binary:
            D[off][D[off] > 0] = 1
    # adjacency matrix: words are linked by the skip-gram neighbors they share
    A = np.zeros((len(voc), len(voc)))
    for off in offsets:
        d = np.sqrt(D[off])
        A += np.matmul(d, d.transpose())
    np.fill_diagonal(A, 0)
    return (A, D, full_voc)
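# A minimal usage sketch for words_2gram_adj_matrix (assuming a DataFrame df
# with a 'text' column, loaded as in the __main__ block below; the query
# index 0 is arbitrary):
#
#     voc = sm.get_vocabulary(df, required_freq=70,
#                             filter_fun=lambda w: any('א' <= ch <= 'ת' for ch in w))
#     A, D, full_voc = words_2gram_adj_matrix(df, voc, window=2,
#                                             min_abs=3, binary=True)
#     # with binary=True, A[i, j] counts the skip-gram neighbors (over all
#     # offsets) shared by voc[i] and voc[j]
#     print(voc[0], '<->', voc[int(np.argmax(A[0]))])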
def words_local_incidence_matrix(df, voc, col='text', window=3, normalize=True,
                                 min_abs=0, min_perc=0.1, binary=False):
    full_voc = list(np.unique(sm.get_all_words(df[col])))
    # get list of sentences
    data = sm.get_all_sentences(df[col])
    # fill incidence matrix
    c = np.zeros(len(voc))  # occurrence count per vocabulary word
    D = np.zeros((len(voc), len(full_voc)))
    for txt in data:
        sent = sm.get_all_words(
            txt, filter_fun=lambda w: any('א' <= ch <= 'ת' for ch in w))
        for k, w in enumerate(sent):
            if w in voc:
                i = voc.index(w)
                c[i] += 1
                # neighbors within the window; clamp the left bound at 0 so a
                # negative index does not wrap around to the sentence end
                neighb = sent[max(0, k - window):k] + sent[k + 1:k + window + 1]
                for w2 in neighb:
                    D[i, full_voc.index(w2)] += 1
    # normalize
    D[D < min_abs] = 0
    if normalize:
        inv_c = np.divide(1.0, c, out=np.zeros_like(c), where=c > 0)
        D = D * inv_c[:, np.newaxis]
        D[D < min_perc] = 0
    if binary:
        D[D > 0] = 1
    return D
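# Usage sketch (hedged; df and voc as above). With normalize=True and
# min_perc=0.1, row i of D holds the rate at which each corpus word appears
# within +/-window of voc[i], zeroed below 10%; the columns are indexed by
# the full corpus vocabulary, which is recomputed inside the function:
#
#     D = words_local_incidence_matrix(df, voc, window=3, min_perc=0.1)
#     print(f'{int((D > 0).sum())} word-context links kept')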
def word2vec(df, col='text', size=100, window=3, min_count=1, workers=4,
             save_to=None, **kwargs):
    # tokenize every sentence in the corpus (no stopword filtering)
    sents = sm.get_all_sentences(df[col])
    sents = [sm.get_all_words(s, stopwords=()) for s in sents]
    # note: gensim>=4.0 renamed the 'size' argument to 'vector_size'
    model = Word2Vec(sents, size=size, window=window,
                     min_count=min_count, workers=workers, **kwargs)
    if save_to:
        with open(save_to, 'wb') as f:
            pickle.dump(model, f)
    return model
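# Usage sketch for word2vec (assumes gensim 3.x to match the 'size' argument;
# the query word is a hypothetical example):
#
#     model = word2vec(df, size=100, window=3)
#     print(model.wv.most_similar('ישראל', topn=5))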
def prepare_data_and_test(df, classifiers, x='article', y='source', fig=None,
                          t0=None, add_heuristics=True, force_balance=True,
                          diagnosis=False):
    # note: the original default t0=time() is evaluated once, at definition
    # time; defaulting to None and reading the clock here fixes that gotcha
    if t0 is None:
        t0 = time()
    # convert to pairs (x, y) and split to train & test
    data = get_labeled_raw_data(df, verbose=1, force_balance=force_balance,
                                x_resolution=x, y_col=y)
    X_train_raw, X_test_raw, y_train, y_test = \
        train_test_split(data[0], data[1], test_size=0.2, random_state=0)
    print(f'Train & test groups defined ({time()-t0:.0f} [s]).')
    # extract features
    voc = sm.get_vocabulary(texts=X_train_raw, required_freq=20,
                            filter_fun=lambda w: any('א' <= ch <= 'ת' for ch in w))
    print(f'Vocabulary of {len(voc):d} words is set ({time()-t0:.0f} [s]).')
    X_train = extract_features(X_train_raw, voc, add_heuristics=add_heuristics)
    X_test = extract_features(X_test_raw, voc, add_heuristics=add_heuristics)
    print(f'Features extracted ({time()-t0:.0f} [s]).')
    # train & test
    res, models = test_models(X_train, X_test, y_train, y_test, classifiers,
                              t0=t0, verbose=3)
    print(f'Test finished ({time()-t0:.0f} [s]).')
    # results analysis
    if diagnosis:
        models_diagnosis(models.values(), list(X_train.columns),
                         x + ' -> ' + y, max_features=30)
    if fig is None:
        fig = plt.subplots(1, 2)
    plt.figure(fig[0].number)
    plot_results(res, fig[1], x + ' -> ' + y, 100 / len(np.unique(y_train)))
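# Usage sketch for prepare_data_and_test (hedged: the classifier dict below is
# illustrative, and the exact form expected by test_models follows its
# definition elsewhere in the repo; sklearn estimator instances are assumed):
#
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.ensemble import RandomForestClassifier
#     classifiers = {'LogisticRegression': LogisticRegression(),
#                    'RandomForest': RandomForestClassifier()}
#     prepare_data_and_test(df, classifiers, x='article', y='source')
#     plt.show()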
def texts_heuristics(texts):
    # simple per-text statistics: word count and character-class rates
    n_words = np.array(ba.count_words(texts))
    return {
        'n_words': ba.count_words(texts),
        # guard against division by zero for texts with no words
        'chars_per_word':
            np.array([len(s) for s in texts]) / np.maximum(n_words, 1),
        'heb_chars_rate': np.nan_to_num(
            [np.mean(['א' <= c <= 'ת' for w in sm.get_all_words(s) for c in w])
             for s in texts]),
        'eng_chars_rate': np.nan_to_num(
            [np.mean(['a' <= c <= 'z' or 'A' <= c <= 'Z'
                      for w in sm.get_all_words(s) for c in w])
             for s in texts]),
        'num_chars_rate': np.nan_to_num(
            [np.mean(['0' <= c <= '9' for w in sm.get_all_words(s) for c in w])
             for s in texts])
    }
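# Usage sketch for texts_heuristics (hedged; assumes sm.get_all_words performs
# plain whitespace-style tokenization on these strings):
#
#     feats = texts_heuristics(['שלום עולם', 'hello world 123'])
#     print(feats['heb_chars_rate'])  # roughly [1.0, 0.0]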
def common_context(df, words, col='text', window=2):
    # accept either a single pair of words or a sequence of pairs
    if isinstance(words[0], str):
        words = (words,)
    sents = sm.get_all_sentences(df[col])
    for pair in words:
        print("Words:\t", pair)
        A, D, voc = words_2gram_adj_matrix(df, pair, col=col, window=window)
        # context = words that appear near both members of the pair
        # at the same offset
        context = []
        for o in D:
            ii = [i[0] for i in np.argwhere(D[o][0, :] * D[o][1, :])]
            context.extend([voc[i] for i in ii])
        print("context:\t", context)
        # print every sentence containing both a pair member and a context word
        for i, s in enumerate(sents):
            if np.any([w in s for w in pair]) and \
                    np.any([w in s for w in context]):
                print(i, s)
def words_vs_texts_incidence_matrix(df, voc, col='text', per='article',
                                    normalize=False, min_abs=0, min_perc=0,
                                    binary=False):
    # split the corpus into texts at the requested resolution
    # (e.g. per article or per paragraph, via sm.SEPARATOR)
    data = list()
    sep = sm.SEPARATOR[per]
    for txt in df[col]:
        data.extend([s.strip().strip(sm.word_chars_filter)
                     for s in filter(None, re.split(sep, txt))])
    # fill incidence matrix
    c = np.zeros(len(voc))
    D = np.zeros((len(voc), len(data)))
    for j, txt in enumerate(data):
        for w in sm.get_all_words(
                txt, filter_fun=lambda w: any('א' <= ch <= 'ת' for ch in w)):
            if w in voc:
                i = voc.index(w)  # look the word up once
                c[i] += 1
                D[i, j] += 1
    # normalize
    D[D < min_abs] = 0
    if normalize:
        inv_c = np.divide(1.0, c, out=np.zeros_like(c), where=c > 0)
        D = D * inv_c[:, np.newaxis]
        D[D < min_perc] = 0
    if binary:
        D[D > 0] = 1
    return D
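# Usage sketch (hedged; df and voc as above). With binary=True this is a plain
# bag-of-words incidence matrix over the chosen text resolution:
#
#     D = words_vs_texts_incidence_matrix(df, voc, per='article', binary=True)
#     # D[i, j] == 1 iff voc[i] appears in the j-th article
#     print(D.shape, int(D.sum()))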
if __name__ == "__main__":
    ## configuration
    build_graphs = False
    save_graphs = False
    voc_samples = 0
    detailed_cliques = False
    build_word2vec = False

    # load data
    t0 = time()
    df = ba.load_data(r'..\Data\articles')
    print(f'Data loaded ({time()-t0:.0f} [s]).')

    # get vocabulary
    print("\n\n____________________")
    voc = sm.get_vocabulary(df, required_freq=70,
                            filter_fun=lambda w: any('א' <= ch <= 'ת' for ch in w))
    print(f"Vocabulary loaded ({len(voc):d} words) ({time()-t0:.0f} [s]).")
    if voc_samples > 0:
        voc = list(np.random.choice(voc, voc_samples, replace=False))
        print(f"Vocabulary shrunk ({len(voc):d} words) ({time()-t0:.0f} [s]).")

    # graph of shared skip-gram neighbors: build from scratch, or load a
    # previously saved graph
    if build_graphs:
        A, D, full_voc = words_2gram_adj_matrix(
            df, voc, window=3, normalize=True,
            min_abs=3, min_perc=0.0, binary=True)
        G = graph_of_words(voc, A=A, filter_singletons=True, A_thresh=2)
        if save_graphs:
            with open(r'..\Output\Context based embedding\2gram_based_graph.pkl',
                      'wb') as f:
                pickle.dump(G, f)
    else:
        with open(r'..\Output\Context based embedding\2gram_based_graph.pkl',
                  'rb') as f:
            G = pickle.load(f)
    print(f'Graph of 2-grams generated ({time()-t0:.0f} [s]).')