def plot_views_cits_corr():
    """Plot a histogram of per-sample Pearson correlations.

    For every sample returned by ``read_file`` the monthly increments of its
    ``ys`` series are correlated with the monthly increments of the citation
    series stored under the same identifier in ``data/series_cits.json``,
    restricted to the window spanned by the sample's ``xs`` values.

    Samples are skipped when either increment series is constant, when at
    most half of the increments of either series are non-zero, or when the
    citation data for the sample cannot be located/aligned.

    The histogram is saved to ``corr_views_cits.pdf``.
    """
    # Context manager so the handle is released even if parsing fails.
    with open('data/series_cits.json', 'r') as input_cits:
        series_cits = json.load(input_cits)

    # Each month label is split on '-' and turned into a fractional value
    # (first field + second field / 12) so that it can be matched against
    # the xs values of the samples.
    # NOTE(review): presumably labels look like 'YYYY-MM' -- confirm.
    months_cits = []
    for m in series_cits['months']:
        parts = m.split('-')
        months_cits.append(int(parts[0]) + int(parts[1]) / 12)

    data = read_file.load_data()
    data = read_file.filter_outliers(data)

    corrs = []
    for i, _, _, xs, ys, _ in data:
        try:
            idx_begin = months_cits.index(xs[0])
            idx_end = months_cits.index(xs[-1])
            cits = series_cits['data'][i]['citations'][idx_begin:idx_end + 1]
            x = np.diff(cits)
            y = np.diff(ys)

            # A constant series has no defined correlation.
            if np.all(x == x[0]) or np.all(y == y[0]):
                continue

            # Only keep samples where more than half of the increments of
            # both series are non-zero.
            if (np.count_nonzero(x) > len(x) / 2
                    and np.count_nonzero(y) > len(y) / 2):
                corrs.append(pearsonr(x, y)[0])
        except (ValueError, KeyError, IndexError, TypeError):
            # Best effort: the month is missing from the citation calendar,
            # the sample has no citation record, a series is empty, or the
            # two series cannot be aligned. Skip the sample, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare except would.
            continue

    plt.hist(corrs, bins=100)
    plt.title('correlação entre número de visualizações e número de citações')
    plt.savefig('corr_views_cits.pdf')
def get_dois(n, filename):
    """Return the identifiers of the samples whose second field has length n.

    The samples are loaded from ``filename`` through ``read_file.load_data``
    and passed through ``read_file.filter_outliers`` before selection.
    NOTE(review): the first field is presumably a DOI and the second the
    list of segments -- confirm against ``read_file``.
    """
    samples = read_file.filter_outliers(read_file.load_data(filename))
    # Every sample is unpacked into exactly six fields; only the first two
    # are used here.
    return [
        identifier
        for identifier, segments, _, _, _, _ in samples
        if len(segments) == n
    ]
def get_data_by_number_segm():
    """Group time spans and final values of the samples by ``len(sample[1])``.

    Returns:
        A pair ``(freq_delta_t, freq_views)`` of ``defaultdict(list)``
        objects, both keyed by the length of the sample's second field
        (the number of segments, judging by the function name):

        * ``freq_delta_t[N]`` holds ``sample[3][-1] - sample[3][0]`` for
          every sample in the group;
        * ``freq_views[N]`` holds ``sample[-2][-1]`` for every sample in the
          group.
    """
    data = read_file.load_data()
    data = read_file.filter_outliers(data)

    # ``list`` as the factory (instead of ``lambda: []``) keeps the returned
    # mappings picklable.
    freq_delta_t = defaultdict(list)
    freq_views = defaultdict(list)

    for sample in data:
        n_segments = len(sample[1])
        delta_t = sample[3][-1] - sample[3][0]
        views = sample[-2][-1]
        freq_delta_t[n_segments].append(delta_t)
        freq_views[n_segments].append(views)

    return freq_delta_t, freq_views
def get_all_data():
    """Return ``(delta_t, views)`` pairs for all samples with non-zero views.

    ``delta_t`` is ``sample[3][-1] - sample[3][0]`` and ``views`` is
    ``sample[-2][-1]``. Samples whose ``views`` value equals 0 are left out.

    Returns:
        A list of ``(delta_t, views)`` tuples, in the order the samples are
        produced by ``read_file``.
    """
    data = read_file.load_data()
    data = read_file.filter_outliers(data)

    freq = []
    for sample in data:
        delta_t = sample[3][-1] - sample[3][0]
        views = sample[-2][-1]
        if views == 0:
            continue
        freq.append((delta_t, views))
    return freq
def plt_views(dois, labels):
    """Plot one histogram of final ``ys`` values per cluster label.

    Args:
        dois: identifiers of the samples to include. Only membership is
            tested, so any iterable of hashable identifiers works.
        labels: cluster label for each selected sample, aligned with the
            order in which the selected samples appear in the loaded data.
            May be a list or an array. The label ``-1`` is skipped; every
            other label is used as an index into the three subplots, so it
            must be 0, 1 or 2.

    The figure is saved to ``views_dist_labels.pdf``.
    """
    data = read_file.load_data()
    data = read_file.filter_outliers(data)

    # Set lookup avoids rescanning the identifier list for every sample.
    selected = set(dois)
    views = np.asarray([ys[-1] for i, _, _, _, ys, _ in data if i in selected])

    # Coerce to an array: with a plain list ``labels == u`` is the scalar
    # False and the boolean mask below would select nothing.
    labels = np.asarray(labels)

    fig, axes = plt.subplots(1, 3, sharex=True, sharey=True, figsize=(9, 3))
    for u in np.unique(labels):
        if u == -1:
            continue
        axes[u].hist(views[labels == u], label=u, bins=30)
        axes[u].legend()
    plt.savefig('views_dist_labels.pdf')
import json

import numpy as np

from read_file import load_data, filter_outliers

# Script: for each group size N, map sample identifiers to one of (at most)
# three cluster indices and dump the mapping to 'doi2cluster_<N>_3.json'.

# One cluster-label file per group size N = 2, 3, 4, 5 (paired by position
# in the loop below).
sources = ['clusters\\clusters\\clusters_ind_single_0.50_2.txt',
           'clusters\\clusters\\clusters_ind_single_0.35_3.txt',
           'clusters\\clusters\\clusters_ind_single_0.47_4.txt',
           'clusters\\clusters\\clusters_ind_single_0.56_5.txt']

data = load_data()
data = filter_outliers(data)

for N, source in zip([2, 3, 4, 5], sources):
    # Builtin ``int`` replaces the ``np.int`` alias, which was removed in
    # NumPy 1.24 and now raises AttributeError.
    labels = np.loadtxt(source, dtype=int).tolist()

    # Keep only clusters with at least 10 members, then the (up to) three
    # largest of those, ordered by ascending size.
    unique, counts = np.unique(labels, return_counts=True)
    big_enough = counts >= 10
    unique = unique[big_enough]
    counts = counts[big_enough]
    unique_idxs = np.argsort(counts)[-3:]
    unique = unique[unique_idxs].tolist()

    # Renumber: position within the kept clusters, or -1 for any other label.
    labels = [unique.index(lab) if lab in unique else -1 for lab in labels]

    # Identifiers of the samples whose second field has length N, in data
    # order.
    dois = []
    for i, s, b, xs, ys, p in data:
        if len(s) == N:
            dois.append(i)

    # Manual sanity check: the two counts are expected to match, because
    # zip() below silently truncates to the shorter sequence.
    print(len(dois), len(labels))

    doi2cluster = dict(zip(dois, labels))
    str_json = json.dumps(doi2cluster)
    # Context manager guarantees the file is flushed and closed.
    with open('doi2cluster_%d_3.json' % N, 'w') as out:
        out.write(str_json)
# Cluster-label files; the trailing number in each file name matches the
# position-based use below (index 1 -> the '_3' file).
sources = [
    'clusters\\clusters\\clusters_ind_single_0.50_2.txt',
    'clusters\\clusters\\clusters_ind_single_0.35_3.txt',
    'clusters\\clusters\\clusters_ind_single_0.47_4.txt',
    'clusters\\clusters\\clusters_ind_single_0.56_5.txt'
]

# Builtin ``int`` replaces the ``np.int`` alias, which was removed in
# NumPy 1.24 and now raises AttributeError.
labels3 = np.loadtxt(sources[1], dtype=int)

# Keep only clusters with at least 10 members, then the (up to) three largest
# of those, ordered by ascending size.
unique, count = np.unique(labels3, return_counts=True)
big_enough = count >= 10
unique = unique[big_enough]
count = count[big_enough]
unique_idxs = np.argsort(count)[-3:]
unique = unique[unique_idxs].tolist()

# Renumber: position within the kept clusters, or -1 for any other label.
labels3 = [unique.index(lab) if lab in unique else -1 for lab in labels3]

data = read_file.load_data()
data = read_file.filter_outliers(data)

# dois = {2: [], 3: [], 4: [], 5: []}
# slopes = []
# intervals = []
# for i, s, b, xs, ys, p in data:
#     dois[len(s)].append(i)
#
# print('1y')
# plt_cits(dois[3], labels3, 1)
# print('2y')
# plt_cits(dois[3], labels3, 2)
# print('3y')
# plt_cits(dois[3], labels3, 5)

# TODO: get the labels
# TODO: look at the distribution of the groups (views, cits, tweets)