def get_citation_histograms(identifiers, data=None):
    """Build per-year citation histograms, split by refereed status.

    Histogram labels read "<citing> to <cited>": for example
    'refereed to nonrefereed' counts refereed citations received by
    non-refereed papers.

    :param identifiers: passed to ``get_citations`` when ``data`` is
        empty or not supplied.
    :param data: optional pre-fetched records; each record must expose
        ``bibcode``, ``citations``, ``refereed_citations``, ``refereed``
        and ``author_num``.
    :return: dict with eight entries: the four plain histograms
        ('refereed to refereed', 'refereed to nonrefereed',
        'nonrefereed to refereed', 'nonrefereed to nonrefereed') and
        the same four labels suffixed with ' normalized', where every
        citation is weighted by 1 / author_num of the cited paper.
    :raises ValueError: if no records are available at all, because
        no start year can be determined.
    """
    ch = {}
    current_year = datetime.now().year
    # Get necessary data if nothing was provided
    if not data:
        data = get_citations(identifiers)
    if len(data) == 0:
        data = get_citations(identifiers, no_zero=False)
    years = [int(p.bibcode[:4]) for p in data]

    # Gather, per paper, the years of its citations together with the
    # paper's author-normalization weight.
    rr_data = []  # refereed citations -> refereed papers
    rn_data = []  # refereed citations -> non-refereed papers
    nr_data = []  # non-refereed citations -> refereed papers
    nn_data = []  # non-refereed citations -> non-refereed papers
    for p in data:
        weight = 1.0 / float(p.author_num)
        # Set lookup instead of scanning the list for every citation
        refereed_citers = set(p.refereed_citations)
        if p.refereed:
            rr_data.append(
                ([int(c[:4]) for c in p.refereed_citations], weight))
            # As before, citations are de-duplicated in this category
            nr_data.append(
                ([int(c[:4])
                  for c in set(p.citations).difference(refereed_citers)],
                 weight))
        else:
            rn_data.append(
                ([int(c[:4]) for c in p.citations if c in refereed_citers],
                 weight))
            nn_data.append(
                ([int(c[:4]) for c in p.citations
                  if c not in refereed_citers],
                 weight))

    # First construct the regular histograms
    rr_hist = cy.frequencies(
        list(itertools.chain.from_iterable(d[0] for d in rr_data)))
    rn_hist = cy.frequencies(
        list(itertools.chain.from_iterable(d[0] for d in rn_data)))
    nr_hist = cy.frequencies(
        list(itertools.chain.from_iterable(d[0] for d in nr_data)))
    nn_hist = cy.frequencies(
        list(itertools.chain.from_iterable(d[0] for d in nn_data)))

    # Get the earliest citation year. The key collections are chained
    # rather than added, because dict.keys() cannot be concatenated
    # with "+" on Python 3.
    cited_years = list(itertools.chain(
        rr_hist.keys(), rn_hist.keys(), nr_hist.keys(), nn_hist.keys()))
    if cited_years:
        first_year = min(cited_years)
    else:
        # No citations at all: start at the earliest publication year
        first_year = min(years)
    nullhist = [(y, 0) for y in range(first_year, current_year + 1)]

    # Now create the histograms with zeroes for years without values
    ch['refereed to refereed'] = merge_dictionaries(dict(nullhist), rr_hist)
    ch['refereed to nonrefereed'] = merge_dictionaries(dict(nullhist), rn_hist)
    ch['nonrefereed to refereed'] = merge_dictionaries(dict(nullhist), nr_hist)
    ch['nonrefereed to nonrefereed'] = merge_dictionaries(
        dict(nullhist), nn_hist)
    min_year = min(itertools.chain(
        ch['refereed to refereed'].keys(),
        ch['refereed to nonrefereed'].keys(),
        ch['nonrefereed to refereed'].keys(),
        ch['nonrefereed to nonrefereed'].keys()))
    nullhist = [(y, 0) for y in range(min_year, current_year + 1)]

    # Normalized histograms need a different approach: every citation
    # year is paired with the weight of the paper it cites, and the
    # zero-valued entries make sure every year is represented.
    normalized = (
        ('refereed to refereed normalized', rr_data),
        ('refereed to nonrefereed normalized', rn_data),
        ('nonrefereed to refereed normalized', nr_data),
        ('nonrefereed to nonrefereed normalized', nn_data),
    )
    for label, entries in normalized:
        weighted = [(yr, w) for yrs, w in entries for yr in yrs]
        ch[label] = get_norm_histo(nullhist + weighted)
    return ch
 def test_get_citations(self):
     '''Test that get_citations returns a list of MetricsModel instances'''
     from models import get_citations
     data = get_citations(testset)
     # The most important thing here is to test that it is a list
     # of MetricsModel instances. The dedicated assertions report the
     # offending type/name on failure instead of "False is not true".
     self.assertIsInstance(data, list)
     for entry in data:
         self.assertEqual(entry.__class__.__name__, 'MetricsModel')
 def test_get_citations(self):
     '''Test that get_citations returns a list of MetricsModel instances'''
     from models import get_citations
     data = get_citations(testset)
     # The most important thing here is to test that it is a list
     # of MetricsModel instances. The dedicated assertions report the
     # offending type/name on failure instead of "False is not true".
     self.assertIsInstance(data, list)
     for entry in data:
         self.assertEqual(entry.__class__.__name__, 'MetricsModel')
Exemple #4
0
def get_selfcitations(identifiers, bibcodes):
    """Collect self-citation statistics for a set of papers.

    A self-citation is a citation of a paper whose bibcode is itself
    in ``bibcodes``.

    :param identifiers: passed to ``get_citations`` to fetch records.
    :param bibcodes: bibcodes of the papers under consideration.
    :return: tuple ``(data, selfcits, Nself, Nself_refereed, Nciting,
        Nciting_ref)`` where

        * ``data`` is whatever ``get_citations`` returned,
        * ``selfcits`` holds one ``(self-citing bibcodes, refereed)``
          pair per record,
        * ``Nself`` is the total number of self-citations,
        * ``Nself_refereed`` counts those received by refereed records,
        * ``Nciting`` is the number of distinct citing bibcodes,
        * ``Nciting_ref`` is the same, for refereed records only.
    """
    data = get_citations(identifiers)
    # record the actual self-citations so that we can use that
    # information later on in the calculation of the Tori
    try:
        selfcits = [(set(p.citations).intersection(set(bibcodes)), p.refereed)
                    for p in data]
    except Exception:
        # Best-effort fallback for malformed records. "Exception"
        # rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        selfcits = [([], False)]
    Nself = sum(len(c[0]) for c in selfcits)
    # The refereed flag acts as a 0/1 multiplier
    Nself_refereed = sum(len(c[0]) * c[1] for c in selfcits)
    Nciting = len(
        set(itertools.chain.from_iterable(p.citations for p in data)))
    Nciting_ref = len(
        set(itertools.chain.from_iterable(
            p.citations for p in data if p.refereed)))
    return data, selfcits, Nself, Nself_refereed, Nciting, Nciting_ref
def get_selfcitations(identifiers, bibcodes):
    """Collect self-citation statistics for a set of papers.

    A self-citation is a citation of a paper whose bibcode is itself
    in ``bibcodes``.

    :param identifiers: passed to ``get_citations`` to fetch records.
    :param bibcodes: bibcodes of the papers under consideration.
    :return: tuple ``(data, selfcits, Nself, Nself_refereed, Nciting,
        Nciting_ref)`` where

        * ``data`` is whatever ``get_citations`` returned,
        * ``selfcits`` holds one ``(self-citing bibcodes, refereed)``
          pair per record,
        * ``Nself`` is the total number of self-citations,
        * ``Nself_refereed`` counts those received by refereed records,
        * ``Nciting`` is the number of distinct citing bibcodes,
        * ``Nciting_ref`` is the same, for refereed records only.
    """
    data = get_citations(identifiers)
    # record the actual self-citations so that we can use that
    # information later on in the calculation of the Tori
    try:
        selfcits = [
            (set(p.citations).intersection(set(bibcodes)), p.refereed)
            for p in data]
    except Exception:
        # Best-effort fallback for malformed records. "Exception"
        # rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        selfcits = [([], False)]
    Nself = sum(len(c[0]) for c in selfcits)
    # The refereed flag acts as a 0/1 multiplier
    Nself_refereed = sum(len(c[0]) * c[1] for c in selfcits)
    Nciting = len(
        set(itertools.chain.from_iterable(p.citations for p in data)))
    Nciting_ref = len(
        set(itertools.chain.from_iterable(
            p.citations for p in data if p.refereed)))
    return data, selfcits, Nself, Nself_refereed, Nciting, Nciting_ref
Exemple #6
0
def get_time_series(identifiers,
                    bibcodes,
                    data=None,
                    usagedata=None,
                    tori_data=None,
                    include_tori=True,
                    self_cits=None):
    """Compute per-year time series of bibliometric indicators.

    The series run from the earliest publication year found in
    ``bibcodes`` up to and including the current year.

    :param identifiers: passed to the data-fetching helpers for any
        input that was not supplied.
    :param bibcodes: bibcodes of the papers; the first four characters
        are read as the publication year.
    :param data: optional citation records exposing ``bibcode`` and
        ``citations``.
    :param usagedata: optional usage records exposing ``bibcode``,
        ``reads`` (one entry per year, starting at 1996) and
        ``author_num``.
    :param tori_data: optional list of dicts with the keys
        ``auth_norm``, ``ref_norm``, ``pubyear`` and ``cityear``.
    :param include_tori: when False, no Tori series is produced and no
        Tori or self-citation data is fetched.
    :param self_cits: optional ``(citing bibcodes, refereed)`` pairs,
        used to drop self-citations from fetched Tori data.
    :return: dict with the keys 'i10', 'i100', 'h', 'g', 'read10' and,
        if ``include_tori``, 'tori'; every value maps year -> number.
    :raises ValueError: if ``bibcodes`` is empty.
    """
    series = {}
    i10 = {}
    i100 = {}
    h = {}
    g = {}
    r10 = {}
    tori = {}
    # Read the clock once, so all derived values stay consistent even
    # if the call straddles midnight or New Year.
    now = datetime.now()
    current_year = now.year
    # Get data if nothing was supplied
    if not data:
        data = get_citations(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    if not self_cits and include_tori:
        self_cits = get_selfcitations(identifiers, bibcodes)[1]
    if not tori_data and include_tori:
        # Self-citations are only needed to filter freshly fetched
        # Tori data. Building the set here, and not unconditionally,
        # avoids iterating over None when include_tori is False.
        self_citations = set(
            itertools.chain.from_iterable(x[0] for x in self_cits))
        tdata = get_tori_data(identifiers)
        tori_data = [
            entry for entry in itertools.chain.from_iterable(
                p.rn_citation_data for p in tdata if p.rn_citation_data)
            if entry['bibcode'] not in self_citations and 'pubyear' in entry
        ]
    # Determine the year range
    Nentries = current_year - 1996 + 1
    first_year = min(int(b[:4]) for b in bibcodes)
    yrange = range(first_year, current_year + 1)
    # The current year is incomplete: scale its reads up to a full
    # year. "delta" is always >= 1, so the division cannot fail.
    d0 = date(current_year, 1, 1)
    d1 = date(now.year, now.month, now.day)
    d2 = date(current_year, 12, 31)
    delta = (d1 - d0).days + 1
    ndays = (d2 - d0).days + 1
    r10_corr = float(ndays) / float(delta)
    for year in yrange:
        # Papers published up to and including this year
        biblist = set(b for b in bibcodes if int(b[:4]) <= year)
        # Citation counts up to this year, highest first
        citations = sorted(
            (sum(1 for c in p.citations if int(c[:4]) <= year)
             for p in data if p.bibcode in biblist),
            reverse=True)
        if year < 1996:
            r10[year] = 0.0
        else:
            idx = year - 1996
            # Author-normalized reads in this year of papers from the
            # last ten years; records without a complete reads list
            # are skipped.
            r10[year] = sum(
                float(p.reads[idx]) / float(p.author_num)
                for p in usagedata
                if p.bibcode in biblist
                and int(p.bibcode[:4]) > year - 10
                and p.reads and len(p.reads) == Nentries)
        # h-index: largest rank such that the paper at that rank has at
        # least "rank" citations. Ranks start at 1, not 0.
        h_ranks = [rank for rank, n in enumerate(citations, 1) if rank <= n]
        h[year] = max(h_ranks) if h_ranks else 0
        # g-index: largest rank such that the top "rank" papers have
        # together at least rank**2 citations.
        g_ranks = [
            rank for rank, n in enumerate(np.cumsum(citations, axis=0), 1)
            if rank**2 <= n
        ]
        g[year] = max(g_ranks) if g_ranks else 0
        i10[year] = len([c for c in citations if c >= 10])
        i100[year] = len([c for c in citations if c >= 100])
        if include_tori:
            tori[year] = np.sum(
                np.array([
                    r['auth_norm'] * r['ref_norm'] for r in tori_data
                    if r['pubyear'] <= year and r['cityear'] <= year
                ]))

    # The key is missing only if every bibcode lies in the future
    if current_year in r10:
        r10[current_year] = r10[current_year] * r10_corr
    series['i10'] = i10
    series['i100'] = i100
    series['h'] = h
    series['g'] = g
    series['read10'] = r10
    if include_tori:
        series['tori'] = tori

    return series
Exemple #7
0
def get_citation_histograms(identifiers, data=None):
    """Build per-year citation histograms, split by refereed status.

    Histogram labels read "<citing> to <cited>": for example
    'refereed to nonrefereed' counts refereed citations received by
    non-refereed papers.

    :param identifiers: passed to ``get_citations`` when ``data`` is
        empty or not supplied.
    :param data: optional pre-fetched records; each record must expose
        ``bibcode``, ``citations``, ``refereed_citations``, ``refereed``
        and ``author_num``.
    :return: dict with eight entries: the four plain histograms
        ('refereed to refereed', 'refereed to nonrefereed',
        'nonrefereed to refereed', 'nonrefereed to nonrefereed') and
        the same four labels suffixed with ' normalized', where every
        citation is weighted by 1 / author_num of the cited paper.
    :raises ValueError: if no records are available at all, because
        no start year can be determined.
    """
    ch = {}
    current_year = datetime.now().year
    # Get necessary data if nothing was provided
    if not data:
        data = get_citations(identifiers)
    if len(data) == 0:
        data = get_citations(identifiers, no_zero=False)
    years = [int(p.bibcode[:4]) for p in data]

    # Gather, per paper, the years of its citations together with the
    # paper's author-normalization weight.
    rr_data = []  # refereed citations -> refereed papers
    rn_data = []  # refereed citations -> non-refereed papers
    nr_data = []  # non-refereed citations -> refereed papers
    nn_data = []  # non-refereed citations -> non-refereed papers
    for p in data:
        weight = 1.0 / float(p.author_num)
        # Set lookup instead of scanning the list for every citation
        refereed_citers = set(p.refereed_citations)
        if p.refereed:
            rr_data.append(
                ([int(c[:4]) for c in p.refereed_citations], weight))
            # As before, citations are de-duplicated in this category
            nr_data.append(
                ([int(c[:4])
                  for c in set(p.citations).difference(refereed_citers)],
                 weight))
        else:
            rn_data.append(
                ([int(c[:4]) for c in p.citations if c in refereed_citers],
                 weight))
            nn_data.append(
                ([int(c[:4]) for c in p.citations
                  if c not in refereed_citers],
                 weight))

    # First construct the regular histograms
    rr_hist = cy.frequencies(
        list(itertools.chain.from_iterable(d[0] for d in rr_data)))
    rn_hist = cy.frequencies(
        list(itertools.chain.from_iterable(d[0] for d in rn_data)))
    nr_hist = cy.frequencies(
        list(itertools.chain.from_iterable(d[0] for d in nr_data)))
    nn_hist = cy.frequencies(
        list(itertools.chain.from_iterable(d[0] for d in nn_data)))

    # Get the earliest citation year. The key collections are chained
    # rather than added, because dict.keys() cannot be concatenated
    # with "+" on Python 3.
    cited_years = list(itertools.chain(
        rr_hist.keys(), rn_hist.keys(), nr_hist.keys(), nn_hist.keys()))
    if cited_years:
        first_year = min(cited_years)
    else:
        # No citations at all: start at the earliest publication year
        first_year = min(years)
    nullhist = [(y, 0) for y in range(first_year, current_year + 1)]

    # Now create the histograms with zeroes for years without values
    ch['refereed to refereed'] = merge_dictionaries(dict(nullhist), rr_hist)
    ch['refereed to nonrefereed'] = merge_dictionaries(dict(nullhist), rn_hist)
    ch['nonrefereed to refereed'] = merge_dictionaries(dict(nullhist), nr_hist)
    ch['nonrefereed to nonrefereed'] = merge_dictionaries(
        dict(nullhist), nn_hist)
    min_year = min(itertools.chain(
        ch['refereed to refereed'].keys(),
        ch['refereed to nonrefereed'].keys(),
        ch['nonrefereed to refereed'].keys(),
        ch['nonrefereed to nonrefereed'].keys()))
    nullhist = [(y, 0) for y in range(min_year, current_year + 1)]

    # Normalized histograms need a different approach: every citation
    # year is paired with the weight of the paper it cites, and the
    # zero-valued entries make sure every year is represented.
    normalized = (
        ('refereed to refereed normalized', rr_data),
        ('refereed to nonrefereed normalized', rn_data),
        ('nonrefereed to refereed normalized', nr_data),
        ('nonrefereed to nonrefereed normalized', nn_data),
    )
    for label, entries in normalized:
        weighted = [(yr, w) for yrs, w in entries for yr in yrs]
        ch[label] = get_norm_histo(nullhist + weighted)
    return ch
def get_time_series(identifiers, bibcodes, data=None, usagedata=None,
                    tori_data=None, include_tori=True, self_cits=None):
    """Compute per-year time series of bibliometric indicators.

    The series run from the earliest publication year found in
    ``bibcodes`` up to and including the current year.

    :param identifiers: passed to the data-fetching helpers for any
        input that was not supplied.
    :param bibcodes: bibcodes of the papers; the first four characters
        are read as the publication year.
    :param data: optional citation records exposing ``bibcode`` and
        ``citations``.
    :param usagedata: optional usage records exposing ``bibcode``,
        ``reads`` (one entry per year, starting at 1996) and
        ``author_num``.
    :param tori_data: optional list of dicts with the keys
        ``auth_norm``, ``ref_norm``, ``pubyear`` and ``cityear``.
    :param include_tori: when False, no Tori series is produced and no
        Tori or self-citation data is fetched.
    :param self_cits: optional ``(citing bibcodes, refereed)`` pairs,
        used to drop self-citations from fetched Tori data.
    :return: dict with the keys 'i10', 'i100', 'h', 'g', 'read10' and,
        if ``include_tori``, 'tori'; every value maps year -> number.
    :raises ValueError: if ``bibcodes`` is empty.
    """
    series = {}
    i10 = {}
    i100 = {}
    h = {}
    g = {}
    r10 = {}
    tori = {}
    # Read the clock once, so all derived values stay consistent even
    # if the call straddles midnight or New Year.
    now = datetime.now()
    current_year = now.year
    # Get data if nothing was supplied
    if not data:
        data = get_citations(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    if not self_cits and include_tori:
        self_cits = get_selfcitations(identifiers, bibcodes)[1]
    if not tori_data and include_tori:
        # Self-citations are only needed to filter freshly fetched
        # Tori data. Building the set here, and not unconditionally,
        # avoids iterating over None when include_tori is False.
        self_citations = set(
            itertools.chain.from_iterable(x[0] for x in self_cits))
        tdata = get_tori_data(identifiers)
        tori_data = [
            entry for entry in itertools.chain.from_iterable(
                p.rn_citation_data for p in tdata if p.rn_citation_data)
            if entry['bibcode'] not in self_citations and 'pubyear' in entry
        ]
    # Determine the year range
    Nentries = current_year - 1996 + 1
    first_year = min(int(b[:4]) for b in bibcodes)
    yrange = range(first_year, current_year + 1)
    # The current year is incomplete: scale its reads up to a full
    # year. "delta" is always >= 1, so the division cannot fail.
    d0 = date(current_year, 1, 1)
    d1 = date(now.year, now.month, now.day)
    d2 = date(current_year, 12, 31)
    delta = (d1 - d0).days + 1
    ndays = (d2 - d0).days + 1
    r10_corr = float(ndays) / float(delta)
    for year in yrange:
        # Papers published up to and including this year
        biblist = set(b for b in bibcodes if int(b[:4]) <= year)
        # Citation counts up to this year, highest first
        citations = sorted(
            (sum(1 for c in p.citations if int(c[:4]) <= year)
             for p in data if p.bibcode in biblist),
            reverse=True)
        if year < 1996:
            r10[year] = 0.0
        else:
            idx = year - 1996
            # Author-normalized reads in this year of papers from the
            # last ten years; records without a complete reads list
            # are skipped.
            r10[year] = sum(
                float(p.reads[idx]) / float(p.author_num)
                for p in usagedata
                if p.bibcode in biblist
                and int(p.bibcode[:4]) > year - 10
                and p.reads and len(p.reads) == Nentries)
        # h-index: largest rank such that the paper at that rank has at
        # least "rank" citations. Ranks start at 1, not 0.
        h_ranks = [rank for rank, n in enumerate(citations, 1) if rank <= n]
        h[year] = max(h_ranks) if h_ranks else 0
        # g-index: largest rank such that the top "rank" papers have
        # together at least rank**2 citations.
        g_ranks = [
            rank for rank, n in enumerate(np.cumsum(citations, axis=0), 1)
            if rank**2 <= n
        ]
        g[year] = max(g_ranks) if g_ranks else 0
        i10[year] = len([c for c in citations if c >= 10])
        i100[year] = len([c for c in citations if c >= 100])
        if include_tori:
            tori[year] = np.sum(
                np.array([
                    r['auth_norm'] * r['ref_norm'] for r in tori_data
                    if r['pubyear'] <= year and r['cityear'] <= year
                ]))

    # The key is missing only if every bibcode lies in the future
    if current_year in r10:
        r10[current_year] = r10[current_year] * r10_corr
    series['i10'] = i10
    series['i100'] = i100
    series['h'] = h
    series['g'] = g
    series['read10'] = r10
    if include_tori:
        series['tori'] = tori

    return series