def plot_views_cits_corr():
    """Plot a histogram of per-sample Pearson correlations (views/citations).

    Reads the monthly citation series from ``data/series_cits.json``, loads
    the outlier-filtered samples through ``read_file`` and, for every sample,
    correlates the successive differences of its citation series with the
    successive differences of its ``ys`` series.  The histogram of all
    correlations is saved to ``corr_views_cits.pdf``.

    A sample is skipped when its first/last time stamp is not present in the
    citation months, when no citation entry can be looked up for it, when
    either differenced series is constant, or when at most half of either
    differenced series is non-zero.
    """
    # The context manager closes the file as soon as the JSON is parsed.
    with open('data/series_cits.json', 'r') as input_cits:
        series_cits = json.load(input_cits)

    # 'YYYY-MM' -> fractional year, so the values can be matched against the
    # time stamps stored in each sample (assumes both use the same encoding).
    months_cits = [
        int(m.split('-')[0]) + int(m.split('-')[1]) / 12
        for m in series_cits['months']
    ]

    data = read_file.load_data()
    data = read_file.filter_outliers(data)

    corrs = []
    for i, _, _, xs, ys, _ in data:
        try:
            idx_begin = months_cits.index(xs[0])
            idx_end = months_cits.index(xs[-1])
            cits = series_cits['data'][i]['citations'][idx_begin:idx_end + 1]
            x = np.diff(cits)
            y = np.diff(ys)

            # A constant series has zero variance: correlation is undefined.
            if np.all(x == x[0]) or np.all(y == y[0]):
                continue

            # Only keep samples where both series change most of the time.
            if (np.count_nonzero(x) > len(x) / 2
                    and np.count_nonzero(y) > len(y) / 2):
                corrs.append(pearsonr(x, y)[0])
        except (ValueError, KeyError, IndexError, TypeError):
            # Best effort: missing months, missing citation entries, empty
            # series or mismatched lengths simply exclude the sample.
            continue

    plt.hist(corrs, bins=100)
    plt.title('correlação entre número de visualizações e número de citações')
    plt.savefig('corr_views_cits.pdf')
def read_preprocessed_file(N, source):
    """Group slopes and intervals by cluster label for selected DOIs.

    Args:
        N: Number of slope columns in ``source``; it also selects the DOI
            list file ``k<N>/k<N>_dois.txt``.
        source: Anything ``np.loadtxt`` accepts.  Each row is read as ``N``
            slope columns, then the interval columns, then the integer label
            in the last column.

    Returns:
        ``(labels_slopes, labels_intervals)``: two lists holding one array
        per distinct label; entry ``k`` stacks the slope (resp. interval)
        rows of the selected DOIs whose label is ``k``.
    """
    # Whitespace-separated DOIs, matched positionally with the rows of
    # ``source``; ``with`` makes sure the handle is closed after reading.
    with open('k{0}/k{0}_dois.txt'.format(N), 'r') as dois_file:
        dois = dois_file.read().split()

    original = np.loadtxt(source)
    slopes = original[:, :N]
    intervals = original[:, N:-1]
    labels = original[:, -1].astype(int)
    total_labels = len(set(labels))

    # Keep only samples whose time span (last minus first time stamp) lies
    # in [5, 7] -- presumably years; confirm against ``read_file``.
    valid_dois = {
        i for i, _, _, xs, _, _ in read_file.load_data()
        if 5 <= xs[-1] - xs[0] <= 7
    }

    # NOTE(review): labels are used directly as list indices, which assumes
    # they are exactly 0..total_labels-1 -- confirm against the producer.
    labels_slopes = [[] for _ in range(total_labels)]
    labels_intervals = [[] for _ in range(total_labels)]
    for doi, label, s, l in zip(dois, labels, slopes, intervals):
        if doi in valid_dois:
            labels_slopes[label].append(s)
            labels_intervals[label].append(l)

    labels_slopes = [np.asarray(values) for values in labels_slopes]
    labels_intervals = [np.asarray(values) for values in labels_intervals]

    return labels_slopes, labels_intervals
def plot_views_cits():
    """Scatter-plot each sample's final ``ys`` value against its citations.

    For every sample returned by ``read_file.load_data()`` the last value of
    its ``ys`` series (x axis, labelled "views") is paired with the sum of
    its monthly citations between its first and last time stamp (y axis).
    The Pearson correlation of the two is shown in the title and the figure
    is saved to ``views_cits.pdf``.

    Samples whose time stamps are not found in the citation months, or that
    have no usable citation entry, are left out.
    """
    # The context manager closes the file as soon as the JSON is parsed.
    with open('data/series_cits.json', 'r') as input_cits:
        series_cits = json.load(input_cits)

    # 'YYYY-MM' -> fractional year, so the values can be matched against the
    # time stamps stored in each sample (assumes both use the same encoding).
    months_cits = [
        int(m.split('-')[0]) + int(m.split('-')[1]) / 12
        for m in series_cits['months']
    ]
    data = read_file.load_data()

    X = []
    Y = []
    for i, _, _, xs, ys, _ in data:
        try:
            x = ys[-1]
            idx_begin = months_cits.index(xs[0])
            idx_end = months_cits.index(xs[-1])
            cits = series_cits['data'][i]['citations']
            y = sum(cits[idx_begin:idx_end + 1])
        except (ValueError, KeyError, IndexError, TypeError):
            # Best effort: samples that cannot be matched are skipped.
            continue
        X.append(x)
        Y.append(y)

    # pearsonr returns (statistic, p-value); only the statistic can be fed
    # to the single "%.2f" placeholder below.
    c = pearsonr(X, Y)[0]
    plt.scatter(X, Y, alpha=0.3, s=1)
    plt.xlabel('views')
    plt.ylabel('number of citations')
    plt.title("pearson = %.2f" % c)
    plt.savefig('views_cits.pdf')
def get_dois(n, filename):
    """Return the identifiers of the samples whose second field has length n.

    Args:
        n: Required length of each sample's second field (the slopes).
        filename: Passed straight to ``read_file.load_data``.

    Returns:
        A list with the first field of every matching sample, in the order
        produced by ``read_file.filter_outliers``.
    """
    samples = read_file.filter_outliers(read_file.load_data(filename))
    return [
        sample_id
        for sample_id, slopes, _, _, _, _ in samples
        if len(slopes) == n
    ]
def get_data_by_number_segm():
    """Group time spans and final view values by ``len(sample[1])``.

    Returns:
        A pair of ``defaultdict`` objects keyed by the length of each
        sample's second field.  The first maps to the list of time spans
        (last minus first entry of ``sample[3]``); the second maps to the
        list of last entries of ``sample[-2]``.
    """
    samples = read_file.filter_outliers(read_file.load_data())

    spans_by_n = defaultdict(list)
    views_by_n = defaultdict(list)
    for sample in samples:
        n_segments = len(sample[1])
        times = sample[3]
        span = times[-1] - times[0]
        last_views = sample[-2][-1]
        spans_by_n[n_segments].append(span)
        views_by_n[n_segments].append(last_views)

    return spans_by_n, views_by_n
def get_all_data():
    """Collect ``(time span, final views)`` pairs of the filtered samples.

    Returns:
        A list of ``(delta_t, views)`` tuples, where ``delta_t`` is the last
        minus the first entry of ``sample[3]`` and ``views`` is the last
        entry of ``sample[-2]``.  Samples whose ``views`` equals 0 are
        omitted.
    """
    data = read_file.load_data()
    data = read_file.filter_outliers(data)

    freq = []
    for sample in data:
        delta_t = sample[3][-1] - sample[3][0]
        views = sample[-2][-1]
        if views == 0:
            # Samples with no views are left out of the result.
            continue
        freq.append((delta_t, views))
    return freq
def plt_views(dois, labels):
    """Plot one histogram of final view values per label.

    Args:
        dois: Identifiers of the samples to include.
        labels: One integer label per selected sample (list or array).
            NOTE(review): assumed to be aligned with the order in which the
            selected samples appear in the loaded data -- confirm at the
            call site.  Label ``-1`` is skipped.

    The figure has exactly three panels, so every non-negative label is used
    directly as a panel index and must be 0, 1 or 2.  The result is saved to
    ``views_dist_labels.pdf``.
    """
    data = read_file.load_data()
    data = read_file.filter_outliers(data)

    # Set membership keeps the selection linear in the number of samples.
    wanted = set(dois)
    views = np.asarray([ys[-1] for i, _, _, _, ys, _ in data if i in wanted])

    # A plain list would make ``labels == u`` a single bool instead of an
    # element-wise mask, so coerce to an array first.
    labels = np.asarray(labels)

    fig, axes = plt.subplots(1, 3, sharex=True, sharey=True, figsize=(9, 3))

    for u in np.unique(labels):
        if u == -1:
            continue
        axes[u].hist(views[labels == u], label=u, bins=30)
        axes[u].legend()

    plt.savefig('views_dist_labels.pdf')
                g.vs[idx]['comm'] = i

            i += 1

        xnet.igraph2xnet(g,
                         'data/subj_areas/nets/all_with_comm_%d_4.xnet' % year)


# Parse the JSON dataset; the context manager closes the file once loaded.
with open('data/plos_one_2019_subj_areas.json', 'r') as complete_data_file:
    complete_data = json.load(complete_data_file)
# complete_data = open('data/wosPlosOne2016_citations.json','r').read()

# new_data = open('data/papers_plos_data_time_series2_filtered.json','r').read()
# new_data = json.loads(new_data)

# Samples read by ``load_data``; iterated by ``incr_decr`` below.
data_breaks = load_data(
    'data/plos_one_2019_breakpoints_k4_original1_data_filtered.txt')


def incr_decr(dois, x0, x1):
    incr = []
    decr = []
    for sample in data_breaks:
        if not sample[0] in dois:
            continue
        slopes = sample[1]
        breakpoints = sample[2]
        n = len(breakpoints)
        delta_time = sample[3][-1] - sample[3][0]
        begin = sample[3][0]
        for i in range(n):
            moment = begin + delta_time * breakpoints[i]
# Example no. 9
# 0
#!/usr/bin/env python
# coding: utf-8

import stasts

from read_file import load_data
from stasts import filter_outliers

if __name__ == '__main__':
    # Load the segmented curves and drop the outliers in one step, then
    # produce the two histograms.
    data = filter_outliers(load_data('segm/segmented_curves_filtered.txt'))

    for plot, out_name in ((stasts.plot_life_time_hist, 'lifetime_v2'),
                           (stasts.plot_no_of_visual, 'views_v2')):
        plot(data, out_name)
    # stasts.plot_no_of_intervals(data,'segments')

# --------------------------------------------------------------------
'''
data = load_data('r_code/segmented_curves_filtered.txt')

data = filter_outliers(data)

plot_hists(data,4,True)
plot_hists(data,5,True)
plot_hists(data,6,True)
'''
import json
import numpy as np
from read_file import load_data, filter_outliers
# Cluster-label files, paired below with the segment counts 2, 3, 4 and 5.
sources = ['clusters\\clusters\\clusters_ind_single_0.50_2.txt',
                   'clusters\\clusters\\clusters_ind_single_0.35_3.txt',
                   'clusters\\clusters\\clusters_ind_single_0.47_4.txt',
                   'clusters\\clusters\\clusters_ind_single_0.56_5.txt']

data = load_data()
data = filter_outliers(data)
for N, source in zip([2, 3, 4, 5], sources):
    # The ``np.int`` alias no longer exists in NumPy; the builtin ``int`` is
    # the exact type it used to refer to.
    labels = np.loadtxt(source, dtype=int).tolist()

    # Keep clusters with at least 10 members, then the (up to) three largest;
    # they are renumbered by ascending size and everything else becomes -1.
    unique, counts = np.unique(labels, return_counts=True)
    unique = unique[counts >= 10]
    counts = counts[counts >= 10]
    unique_idxs = np.argsort(counts)[-3:]
    unique = unique[unique_idxs].tolist()
    labels = [unique.index(l) if l in unique else -1 for l in labels]

    # Samples with exactly N slopes, in data order.
    dois = []
    for i, s, b, xs, ys, p in data:
        if len(s) == N:
            dois.append(i)

    # The two lengths should match; ``zip`` silently truncates otherwise.
    print(len(dois), len(labels))
    doi2cluster = dict(zip(dois, labels))
    str_json = json.dumps(doi2cluster)
    # ``with`` closes the output file even if the write fails.
    with open('doi2cluster_%d_3.json' % N, 'w') as out:
        out.write(str_json)
    sources = [
        'clusters\\clusters\\clusters_ind_single_0.50_2.txt',
        'clusters\\clusters\\clusters_ind_single_0.35_3.txt',
        'clusters\\clusters\\clusters_ind_single_0.47_4.txt',
        'clusters\\clusters\\clusters_ind_single_0.56_5.txt'
    ]
    labels3 = np.loadtxt(sources[1], dtype=np.int)
    unique, count = np.unique(labels3, return_counts=True)
    unique = unique[count >= 10]
    count = count[count >= 10]
    unique_idxs = np.argsort(count)[-3:]
    unique = unique[unique_idxs].tolist()
    labels3 = [unique.index(l) if l in unique else -1 for l in labels3]

    data = read_file.load_data()
    data = read_file.filter_outliers(data)
    # dois = {2: [], 3: [], 4: [], 5: []}
    # slopes = []
    # intervals = []
    # for i, s, b, xs, ys, p in data:
    #     dois[len(s)].append(i)
    #
    # print('1y')
    # plt_cits(dois[3], labels3, 1)
    # print('2y')
    # plt_cits(dois[3], labels3, 2)
    # print('3y')
    # plt_cits(dois[3], labels3, 5)

    # # pegar os labels