import json
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

import read_file


def plot_views_cits_corr():
    with open('data/series_cits.json', 'r') as input_cits:
        series_cits = json.load(input_cits)
    # Convert 'YYYY-MM' month keys to fractional years.
    months_cits = [
        int(m.split('-')[0]) + int(m.split('-')[1]) / 12
        for m in series_cits['months']
    ]

    data = read_file.load_data()
    data = read_file.filter_outliers(data)

    corrs = []
    for i, _, _, xs, ys, _ in data:
        try:
            idx_begin = months_cits.index(xs[0])
            idx_end = months_cits.index(xs[-1])
            x = series_cits['data'][i]['citations'][idx_begin:idx_end + 1]
            x = np.diff(x)
            y = np.diff(ys)
            # Pearson correlation is undefined for constant series; skip them.
            if np.count_nonzero(x[0] == x) == len(x) or \
                    np.count_nonzero(y[0] == y) == len(y):
                continue
            # Require that more than half of the monthly increments be nonzero.
            if np.count_nonzero(x) > len(x) / 2 and \
                    np.count_nonzero(y) > len(y) / 2:
                corrs.append(pearsonr(x, y)[0])
        except (KeyError, ValueError, IndexError):
            # DOI absent from the citation series, or timestamps not found.
            continue

    plt.hist(corrs, bins=100)
    plt.title('correlation between number of views and number of citations')
    plt.savefig('corr_views_cits.pdf')
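# Worked example of the fractional-year conversion above (the sample month
# is illustrative):
#
#     >>> m = '2015-03'
#     >>> int(m.split('-')[0]) + int(m.split('-')[1]) / 12
#     2015.25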
def read_preprocessed_file(N, source):
    dois = open('k' + str(N) + '/k' + str(N) + '_dois.txt',
                'r').read().split()
    # Each row of the source file holds N slopes, the interval columns,
    # and a trailing integer cluster label.
    original = np.loadtxt(source)
    slopes = original[:, :N]
    intervals = original[:, N:-1]
    labels = original[:, -1].astype(int)
    total_labels = len(set(labels))

    data = read_file.load_data()
    # Keep only curves whose lifetime is between 5 and 7 years.
    valid_dois = set()
    for i, s, b, xs, ys, p in data:
        delta_x = xs[-1] - xs[0]
        if 5 <= delta_x <= 7:
            valid_dois.add(i)

    labels_slopes = [[] for _ in range(total_labels)]
    labels_intervals = [[] for _ in range(total_labels)]
    for doi, label, s, l in zip(dois, labels, slopes, intervals):
        if doi in valid_dois:
            labels_slopes[label].append(s)
            labels_intervals[label].append(l)

    labels_slopes = [np.asarray(values) for values in labels_slopes]
    labels_intervals = [np.asarray(values) for values in labels_intervals]
    return labels_slopes, labels_intervals
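# Minimal usage sketch for read_preprocessed_file; the source file name here
# is hypothetical and only assumes the column layout described above:
#
#     labels_slopes, labels_intervals = read_preprocessed_file(
#         4, 'k4/k4_clusters.txt')
#     print([s.shape for s in labels_slopes])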
def plot_views_cits():
    with open('data/series_cits.json', 'r') as input_cits:
        series_cits = json.load(input_cits)
    months_cits = [
        int(m.split('-')[0]) + int(m.split('-')[1]) / 12
        for m in series_cits['months']
    ]

    data = read_file.load_data()
    X = []
    Y = []
    for i, s, b, xs, ys, p in data:
        try:
            x = ys[-1]  # total views at the end of the series
            idx_begin = months_cits.index(xs[0])
            idx_end = months_cits.index(xs[-1])
            # Citations accumulated over the same time window.
            y = sum(series_cits['data'][i]['citations'][idx_begin:idx_end + 1])
        except (KeyError, ValueError):
            continue
        X.append(x)
        Y.append(y)

    # pearsonr returns (r, p-value); keep only the correlation coefficient.
    c = pearsonr(X, Y)[0]
    plt.scatter(X, Y, alpha=0.3, s=1)
    plt.xlabel('views')
    plt.ylabel('number of citations')
    plt.title('pearson = %.2f' % c)
    plt.savefig('views_cits.pdf')
def get_dois(n, filename):
    data = read_file.load_data(filename)
    data = read_file.filter_outliers(data)
    dois = []
    for i, s, b, xs, ys, p in data:
        if len(s) == n:
            dois.append(i)
    return dois
def get_data_by_number_segm():
    data = read_file.load_data()
    data = read_file.filter_outliers(data)
    # Group lifetimes and final view counts by the number of segments N.
    freq_delta_t = defaultdict(list)
    freq_views = defaultdict(list)
    for sample in data:
        N = len(sample[1])                      # number of segments (slopes)
        delta_t = sample[3][-1] - sample[3][0]  # lifetime of the curve
        views = sample[-2][-1]                  # final cumulative view count
        freq_delta_t[N].append(delta_t)
        freq_views[N].append(views)
    return freq_delta_t, freq_views
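# Sketch of how the per-N distributions returned above could be summarised
# (purely illustrative):
#
#     freq_delta_t, freq_views = get_data_by_number_segm()
#     for N in sorted(freq_delta_t):
#         print(N, np.mean(freq_delta_t[N]), np.mean(freq_views[N]))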
def get_all_data():
    data = read_file.load_data()
    data = read_file.filter_outliers(data)
    freq = []
    for sample in data:
        delta_t = sample[3][-1] - sample[3][0]
        views = sample[-2][-1]
        if views == 0:
            continue
        freq.append((delta_t, views))
    return freq
def plt_views(dois, labels):
    data = read_file.load_data()
    data = read_file.filter_outliers(data)
    views = []
    for i, s, b, xs, ys, p in data:
        if i in dois:
            views.append(ys[-1])
    views = np.asarray(views)
    labels = np.asarray(labels)  # needed for boolean masking below

    unique = np.unique(labels)
    fig, axes = plt.subplots(1, 3, sharex=True, sharey=True, figsize=(9, 3))
    for u in unique:
        if u == -1:  # -1 marks samples outside the three largest clusters
            continue
        axes[u].hist(views[labels == u], label=u, bins=30)
        axes[u].legend()
    plt.savefig('views_dist_labels.pdf')
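# Sketch of how plt_views() could be driven from the cluster files used
# elsewhere in this repo; the file/N pairing mirrors the 'sources' lists
# below and should be treated as an assumption:
#
#     labels = np.loadtxt(
#         'clusters\\clusters\\clusters_ind_single_0.35_3.txt', dtype=int)
#     dois = set(get_dois(3, 'segm/segmented_curves_filtered.txt'))
#     plt_views(dois, labels)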
    g.vs[idx]['comm'] = i
    i += 1

xnet.igraph2xnet(g, 'data/subj_areas/nets/all_with_comm_%d_4.xnet' % year)

complete_data = open('data/plos_one_2019_subj_areas.json', 'r').read()
# complete_data = open('data/wosPlosOne2016_citations.json', 'r').read()
complete_data = json.loads(complete_data)

# new_data = open('data/papers_plos_data_time_series2_filtered.json', 'r').read()
# new_data = json.loads(new_data)

data_breaks = load_data(
    'data/plos_one_2019_breakpoints_k4_original1_data_filtered.txt')


def incr_decr(dois, x0, x1):
    incr = []
    decr = []
    for sample in data_breaks:
        if sample[0] not in dois:
            continue
        slopes = sample[1]
        breakpoints = sample[2]
        n = len(breakpoints)
        delta_time = sample[3][-1] - sample[3][0]
        begin = sample[3][0]
        for i in range(n):
            # Absolute time of the i-th breakpoint within the curve's lifetime.
            moment = begin + delta_time * breakpoints[i]
#!/usr/bin/env python
# coding: utf-8
import stasts
from read_file import load_data
from stasts import filter_outliers

if __name__ == '__main__':
    data = load_data('segm/segmented_curves_filtered.txt')
    data = filter_outliers(data)
    stasts.plot_life_time_hist(data, 'lifetime_v2')
    stasts.plot_no_of_visual(data, 'views_v2')
    # stasts.plot_no_of_intervals(data, 'segments')

# --------------------------------------------------------------------
'''
data = load_data('r_code/segmented_curves_filtered.txt')
data = filter_outliers(data)
plot_hists(data, 4, True)
plot_hists(data, 5, True)
plot_hists(data, 6, True)
'''
import json

import numpy as np

from read_file import load_data, filter_outliers

sources = [
    'clusters\\clusters\\clusters_ind_single_0.50_2.txt',
    'clusters\\clusters\\clusters_ind_single_0.35_3.txt',
    'clusters\\clusters\\clusters_ind_single_0.47_4.txt',
    'clusters\\clusters\\clusters_ind_single_0.56_5.txt',
]

data = load_data()
data = filter_outliers(data)

for N, source in zip([2, 3, 4, 5], sources):
    labels = np.loadtxt(source, dtype=int).tolist()
    # Keep only the three largest clusters with at least 10 members;
    # everything else is relabelled -1.
    unique, counts = np.unique(labels, return_counts=True)
    unique = unique[counts >= 10]
    counts = counts[counts >= 10]
    unique_idxs = np.argsort(counts)[-3:]
    unique = unique[unique_idxs].tolist()
    labels = [unique.index(l) if l in unique else -1 for l in labels]

    dois = []
    for i, s, b, xs, ys, p in data:
        if len(s) == N:
            dois.append(i)
    print(len(dois), len(labels))

    doi2cluster = dict(zip(dois, labels))
    with open('doi2cluster_%d_3.json' % N, 'w') as out:
        out.write(json.dumps(doi2cluster))
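# The exported mapping can be read back with, e.g. (illustrative):
#
#     with open('doi2cluster_3_3.json') as f:
#         doi2cluster = json.load(f)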
sources = [
    'clusters\\clusters\\clusters_ind_single_0.50_2.txt',
    'clusters\\clusters\\clusters_ind_single_0.35_3.txt',
    'clusters\\clusters\\clusters_ind_single_0.47_4.txt',
    'clusters\\clusters\\clusters_ind_single_0.56_5.txt'
]

labels3 = np.loadtxt(sources[1], dtype=int)
# Keep only the three largest clusters with at least 10 members.
unique, count = np.unique(labels3, return_counts=True)
unique = unique[count >= 10]
count = count[count >= 10]
unique_idxs = np.argsort(count)[-3:]
unique = unique[unique_idxs].tolist()
labels3 = [unique.index(l) if l in unique else -1 for l in labels3]

data = read_file.load_data()
data = read_file.filter_outliers(data)

# dois = {2: [], 3: [], 4: [], 5: []}
# slopes = []
# intervals = []
# for i, s, b, xs, ys, p in data:
#     dois[len(s)].append(i)
#
# print('1y')
# plt_cits(dois[3], labels3, 1)
# print('2y')
# plt_cits(dois[3], labels3, 2)
# print('3y')
# plt_cits(dois[3], labels3, 5)
#
# get the labels