def get_citation_histograms(identifiers, data=None):
    """Build per-year citation histograms, split by refereed status.

    Args:
        identifiers: Passed to ``get_citations`` when ``data`` is empty.
        data: Optional records exposing ``bibcode``, ``citations``,
            ``refereed_citations``, ``refereed`` and ``author_num``.

    Returns:
        A dict with eight entries. The four keys
        ``'refereed to refereed'``, ``'refereed to nonrefereed'``,
        ``'nonrefereed to refereed'`` and ``'nonrefereed to nonrefereed'``
        hold the zero-filled year histograms (zero template merged with
        the counted frequencies through ``merge_dictionaries``). The same
        four keys with a ``' normalized'`` suffix hold the result of
        ``get_norm_histo`` applied to ``(year, 1 / author_num)`` pairs.

    Raises:
        ValueError: If there are no citations and no records from which
            a starting year can be derived.
    """
    ch = {}
    current_year = datetime.now().year
    # Get necessary data if nothing was provided
    if not data:
        data = get_citations(identifiers)
    if not data:
        data = get_citations(identifiers, no_zero=False)
    years = [int(p.bibcode[:4]) for p in data]

    # First gather all necessary data: for every paper a tuple of
    # (list of citing years, author-normalized weight).
    rr_data = []  # refereed citations of refereed papers
    rn_data = []  # refereed citations of non-refereed papers
    nr_data = []  # non-refereed citations of refereed papers
    nn_data = []  # non-refereed citations of non-refereed papers
    for p in data:
        weight = 1.0 / float(p.author_num)
        # A set gives O(1) membership tests instead of scanning the list
        # once per citation.
        refereed_cits = set(p.refereed_citations)
        if p.refereed:
            rr_data.append(
                ([int(c[:4]) for c in p.refereed_citations], weight))
            # The set difference also de-duplicates the citations.
            nr_data.append(
                ([int(c[:4])
                  for c in set(p.citations).difference(refereed_cits)],
                 weight))
        else:
            rn_data.append(
                ([int(c[:4]) for c in p.citations if c in refereed_cits],
                 weight))
            nn_data.append(
                ([int(c[:4]) for c in p.citations
                  if c not in refereed_cits],
                 weight))

    groups = [
        ('refereed to refereed', rr_data),
        ('refereed to nonrefereed', rn_data),
        ('nonrefereed to refereed', nr_data),
        ('nonrefereed to nonrefereed', nn_data),
    ]

    # First construct the regular histograms
    hists = [
        (label, cy.frequencies(
            list(itertools.chain.from_iterable(yrs for yrs, _ in rows))))
        for label, rows in groups
    ]

    # Get the earliest citation. Iterating the dicts directly (instead of
    # adding the results of .keys()) works on both Python 2 and Python 3.
    cited_years = [year for _, hist in hists for year in hist]
    if cited_years:
        min_year = min(cited_years)
    else:
        # No citations at all: start at the earliest publication year.
        min_year = min(years)
    nullhist = [(y, 0) for y in range(min_year, current_year + 1)]

    # Now create the histograms with zeroes for year without values
    for label, hist in hists:
        ch[label] = merge_dictionaries(dict(nullhist), hist)

    min_year = min(itertools.chain.from_iterable(
        ch[label].keys() for label, _ in groups))
    nullhist = [(y, 0) for y in range(min_year, current_year + 1)]

    # Normalized histograms need a different approach: every citation
    # contributes its paper's weight rather than a count of one.
    for label, rows in groups:
        weighted = [(year, weight) for yrs, weight in rows for year in yrs]
        ch[label + ' normalized'] = get_norm_histo(nullhist + weighted)
    return ch
def test_get_citations(self):
    '''Test getting citations'''
    from models import get_citations
    data = get_citations(testset)
    # The most important thing here is to test that it is a list
    # of MetricsModel instances
    self.assertIsInstance(data, list)
    # Compare class names one record at a time so that a failure reports
    # the offending class instead of a bare "False is not true".
    for record in data:
        self.assertEqual(record.__class__.__name__, 'MetricsModel')
def test_get_citations(self):
    '''Test getting citations'''
    from models import get_citations
    data = get_citations(testset)
    # The most important thing here is to test that it is a list
    # of MetricsModel instances
    self.assertIsInstance(data, list)
    # Compare class names one record at a time so that a failure reports
    # the offending class instead of a bare "False is not true".
    for record in data:
        self.assertEqual(record.__class__.__name__, 'MetricsModel')
def get_selfcitations(identifiers, bibcodes):
    """Collect self-citation statistics for a set of papers.

    Args:
        identifiers: Passed to ``get_citations`` to retrieve the records.
        bibcodes: Bibcodes of the papers under consideration; a citation
            that is itself in this collection counts as a self-citation.

    Returns:
        A tuple ``(data, selfcits, Nself, Nself_refereed, Nciting,
        Nciting_ref)`` where ``data`` is the result of ``get_citations``,
        ``selfcits`` holds one ``(self-citing bibcodes, refereed flag)``
        pair per record, ``Nself`` is the total number of self-citations,
        ``Nself_refereed`` the self-citations weighted by the refereed
        flag, and ``Nciting`` / ``Nciting_ref`` the number of distinct
        citing bibcodes for all records / for refereed records only.
    """
    data = get_citations(identifiers)
    # record the actual self-citations so that we can use that
    # information later on in the calculation of the Tori
    try:
        own_bibcodes = set(bibcodes)  # build once, not once per paper
        selfcits = [(set(p.citations).intersection(own_bibcodes), p.refereed)
                    for p in data]
    except Exception:
        # Best effort: fall back to "no self-citations", but let
        # KeyboardInterrupt / SystemExit propagate.
        selfcits = [([], False)]
    Nself = sum(len(c[0]) for c in selfcits)
    Nself_refereed = sum(len(c[0]) * c[1] for c in selfcits)
    Nciting = len(set(
        itertools.chain.from_iterable(p.citations for p in data)))
    Nciting_ref = len(set(
        itertools.chain.from_iterable(
            p.citations for p in data if p.refereed)))
    return data, selfcits, Nself, Nself_refereed, Nciting, Nciting_ref
def get_selfcitations(identifiers, bibcodes):
    """Collect self-citation statistics for a set of papers.

    Args:
        identifiers: Passed to ``get_citations`` to retrieve the records.
        bibcodes: Bibcodes of the papers under consideration; a citation
            that is itself in this collection counts as a self-citation.

    Returns:
        A tuple ``(data, selfcits, Nself, Nself_refereed, Nciting,
        Nciting_ref)`` where ``data`` is the result of ``get_citations``,
        ``selfcits`` holds one ``(self-citing bibcodes, refereed flag)``
        pair per record, ``Nself`` is the total number of self-citations,
        ``Nself_refereed`` the self-citations weighted by the refereed
        flag, and ``Nciting`` / ``Nciting_ref`` the number of distinct
        citing bibcodes for all records / for refereed records only.
    """
    data = get_citations(identifiers)
    # record the actual self-citations so that we can use that
    # information later on in the calculation of the Tori
    try:
        own_bibcodes = set(bibcodes)  # build once, not once per paper
        selfcits = [(set(p.citations).intersection(own_bibcodes), p.refereed)
                    for p in data]
    except Exception:
        # Best effort: fall back to "no self-citations", but let
        # KeyboardInterrupt / SystemExit propagate.
        selfcits = [([], False)]
    Nself = sum(len(c[0]) for c in selfcits)
    Nself_refereed = sum(len(c[0]) * c[1] for c in selfcits)
    Nciting = len(set(
        itertools.chain.from_iterable(p.citations for p in data)))
    Nciting_ref = len(set(
        itertools.chain.from_iterable(
            p.citations for p in data if p.refereed)))
    return data, selfcits, Nself, Nself_refereed, Nciting, Nciting_ref
def get_time_series(identifiers, bibcodes, data=None, usagedata=None,
                    tori_data=None, include_tori=True, self_cits=None):
    """Compute yearly time series of bibliometric indicators.

    Args:
        identifiers: Passed to the data-retrieval helpers for whatever
            input was not supplied by the caller.
        bibcodes: Bibcodes of the papers; the first four characters are
            read as the publication year.
        data: Optional citation records exposing ``bibcode`` and
            ``citations``.
        usagedata: Optional usage records exposing ``bibcode``, ``reads``
            (indexed by ``year - 1996``) and ``author_num``.
        tori_data: Optional list of dicts with the keys ``auth_norm``,
            ``ref_norm``, ``pubyear`` and ``cityear``. When supplied it is
            used as is (no self-citation filtering is applied).
        include_tori: Whether to compute the ``'tori'`` series.
        self_cits: Optional ``(self-citing bibcodes, refereed)`` pairs as
            produced by ``get_selfcitations``; only used to filter
            ``tori_data`` when that has to be retrieved here.

    Returns:
        A dict with the year-keyed series ``'i10'``, ``'i100'``, ``'h'``,
        ``'g'`` and ``'read10'``, plus ``'tori'`` when ``include_tori`` is
        true. The ``'read10'`` value of the current year is extrapolated
        to a full year.

    Raises:
        ValueError: If ``bibcodes`` is empty.
    """
    series = {}
    i10 = {}
    i100 = {}
    h = {}
    g = {}
    r10 = {}
    tori = {}
    # Take a single timestamp so that every date-derived value agrees,
    # even when the call straddles midnight on New Year's Eve.
    now = datetime.now()
    current_year = now.year
    # Get data if nothing was supplied
    if not data:
        data = get_citations(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    if include_tori and not tori_data:
        # Self-citations are only needed to filter freshly retrieved Tori
        # data, so they are resolved here and nowhere else. This keeps the
        # include_tori=False path from touching an unset ``self_cits``.
        if not self_cits:
            self_cits = get_selfcitations(identifiers, bibcodes)[1]
        self_citations = set(
            itertools.chain.from_iterable(x[0] for x in self_cits))
        tdata = get_tori_data(identifiers)
        tori_data = [
            p for p in itertools.chain.from_iterable(
                t.rn_citation_data for t in tdata if t.rn_citation_data)
            if p['bibcode'] not in self_citations and 'pubyear' in p
        ]
    # Determine the year range
    Nentries = current_year - 1996 + 1
    years = [int(b[:4]) for b in bibcodes]
    yrange = range(min(years), current_year + 1)
    d0 = date(current_year, 1, 1)
    d1 = now.date()
    d2 = date(current_year, 12, 31)
    delta = (d1 - d0).days + 1   # days elapsed this year, always >= 1
    ndays = (d2 - d0).days + 1   # days in this year
    r10_corr = float(ndays) / float(delta)
    for year in yrange:
        biblist = set(b for b in bibcodes if int(b[:4]) <= year)
        # Citation counts up to and including `year`, highest first
        citations = sorted(
            (sum(1 for c in p.citations if int(c[:4]) <= year)
             for p in data if p.bibcode in biblist),
            reverse=True)
        if year < 1996:
            r10[year] = 0.0
        else:
            idx = year - 1996
            r10[year] = sum(
                float(p.reads[idx]) / float(p.author_num)
                for p in usagedata
                if p.bibcode in biblist and
                int(p.bibcode[:4]) > year - 10 and
                p.reads and len(p.reads) == Nentries)
        # Ranks are 1-based: h is the largest rank whose paper has at
        # least that many citations, g the largest rank whose square does
        # not exceed the cumulative citation count. Both default to 0.
        h[year] = max(
            [rank for rank, n in enumerate(citations, 1) if rank <= n] or
            [0])
        cumulative = np.cumsum(citations, axis=0)
        g[year] = max(
            [rank for rank, n in enumerate(cumulative, 1)
             if rank ** 2 <= n] or
            [0])
        i10[year] = len([c for c in citations if c >= 10])
        i100[year] = len([c for c in citations if c >= 100])
        if include_tori:
            tori[year] = np.sum(np.array([
                r['auth_norm'] * r['ref_norm'] for r in tori_data
                if r['pubyear'] <= year and r['cityear'] <= year
            ]))
    # Scale the partial current year up to a full year. The year is absent
    # when every bibcode is dated in the future (empty year range).
    if current_year in r10:
        r10[current_year] = r10[current_year] * r10_corr
    series['i10'] = i10
    series['i100'] = i100
    series['h'] = h
    series['g'] = g
    series['read10'] = r10
    if include_tori:
        series['tori'] = tori
    return series
def get_citation_histograms(identifiers, data=None):
    """Build per-year citation histograms, split by refereed status.

    Args:
        identifiers: Passed to ``get_citations`` when ``data`` is empty.
        data: Optional records exposing ``bibcode``, ``citations``,
            ``refereed_citations``, ``refereed`` and ``author_num``.

    Returns:
        A dict with eight entries. The four keys
        ``'refereed to refereed'``, ``'refereed to nonrefereed'``,
        ``'nonrefereed to refereed'`` and ``'nonrefereed to nonrefereed'``
        hold the zero-filled year histograms (zero template merged with
        the counted frequencies through ``merge_dictionaries``). The same
        four keys with a ``' normalized'`` suffix hold the result of
        ``get_norm_histo`` applied to ``(year, 1 / author_num)`` pairs.

    Raises:
        ValueError: If there are no citations and no records from which
            a starting year can be derived.
    """
    ch = {}
    current_year = datetime.now().year
    # Get necessary data if nothing was provided
    if not data:
        data = get_citations(identifiers)
    if not data:
        data = get_citations(identifiers, no_zero=False)
    years = [int(p.bibcode[:4]) for p in data]

    # First gather all necessary data: for every paper a tuple of
    # (list of citing years, author-normalized weight).
    rr_data = []  # refereed citations of refereed papers
    rn_data = []  # refereed citations of non-refereed papers
    nr_data = []  # non-refereed citations of refereed papers
    nn_data = []  # non-refereed citations of non-refereed papers
    for p in data:
        weight = 1.0 / float(p.author_num)
        # A set gives O(1) membership tests instead of scanning the list
        # once per citation.
        refereed_cits = set(p.refereed_citations)
        if p.refereed:
            rr_data.append(
                ([int(c[:4]) for c in p.refereed_citations], weight))
            # The set difference also de-duplicates the citations.
            nr_data.append(
                ([int(c[:4])
                  for c in set(p.citations).difference(refereed_cits)],
                 weight))
        else:
            rn_data.append(
                ([int(c[:4]) for c in p.citations if c in refereed_cits],
                 weight))
            nn_data.append(
                ([int(c[:4]) for c in p.citations
                  if c not in refereed_cits],
                 weight))

    groups = [
        ('refereed to refereed', rr_data),
        ('refereed to nonrefereed', rn_data),
        ('nonrefereed to refereed', nr_data),
        ('nonrefereed to nonrefereed', nn_data),
    ]

    # First construct the regular histograms
    hists = [
        (label, cy.frequencies(
            list(itertools.chain.from_iterable(yrs for yrs, _ in rows))))
        for label, rows in groups
    ]

    # Get the earliest citation. Iterating the dicts directly (instead of
    # adding the results of .keys()) works on both Python 2 and Python 3.
    cited_years = [year for _, hist in hists for year in hist]
    if cited_years:
        min_year = min(cited_years)
    else:
        # No citations at all: start at the earliest publication year.
        min_year = min(years)
    nullhist = [(y, 0) for y in range(min_year, current_year + 1)]

    # Now create the histograms with zeroes for year without values
    for label, hist in hists:
        ch[label] = merge_dictionaries(dict(nullhist), hist)

    min_year = min(itertools.chain.from_iterable(
        ch[label].keys() for label, _ in groups))
    nullhist = [(y, 0) for y in range(min_year, current_year + 1)]

    # Normalized histograms need a different approach: every citation
    # contributes its paper's weight rather than a count of one.
    for label, rows in groups:
        weighted = [(year, weight) for yrs, weight in rows for year in yrs]
        ch[label + ' normalized'] = get_norm_histo(nullhist + weighted)
    return ch
def get_time_series(identifiers, bibcodes, data=None, usagedata=None,
                    tori_data=None, include_tori=True, self_cits=None):
    """Compute yearly time series of bibliometric indicators.

    Args:
        identifiers: Passed to the data-retrieval helpers for whatever
            input was not supplied by the caller.
        bibcodes: Bibcodes of the papers; the first four characters are
            read as the publication year.
        data: Optional citation records exposing ``bibcode`` and
            ``citations``.
        usagedata: Optional usage records exposing ``bibcode``, ``reads``
            (indexed by ``year - 1996``) and ``author_num``.
        tori_data: Optional list of dicts with the keys ``auth_norm``,
            ``ref_norm``, ``pubyear`` and ``cityear``. When supplied it is
            used as is (no self-citation filtering is applied).
        include_tori: Whether to compute the ``'tori'`` series.
        self_cits: Optional ``(self-citing bibcodes, refereed)`` pairs as
            produced by ``get_selfcitations``; only used to filter
            ``tori_data`` when that has to be retrieved here.

    Returns:
        A dict with the year-keyed series ``'i10'``, ``'i100'``, ``'h'``,
        ``'g'`` and ``'read10'``, plus ``'tori'`` when ``include_tori`` is
        true. The ``'read10'`` value of the current year is extrapolated
        to a full year.

    Raises:
        ValueError: If ``bibcodes`` is empty.
    """
    series = {}
    i10 = {}
    i100 = {}
    h = {}
    g = {}
    r10 = {}
    tori = {}
    # Take a single timestamp so that every date-derived value agrees,
    # even when the call straddles midnight on New Year's Eve.
    now = datetime.now()
    current_year = now.year
    # Get data if nothing was supplied
    if not data:
        data = get_citations(identifiers)
    if not usagedata:
        usagedata = get_usage_data(identifiers)
    if include_tori and not tori_data:
        # Self-citations are only needed to filter freshly retrieved Tori
        # data, so they are resolved here and nowhere else. This keeps the
        # include_tori=False path from touching an unset ``self_cits``.
        if not self_cits:
            self_cits = get_selfcitations(identifiers, bibcodes)[1]
        self_citations = set(
            itertools.chain.from_iterable(x[0] for x in self_cits))
        tdata = get_tori_data(identifiers)
        tori_data = [
            p for p in itertools.chain.from_iterable(
                t.rn_citation_data for t in tdata if t.rn_citation_data)
            if p['bibcode'] not in self_citations and 'pubyear' in p
        ]
    # Determine the year range
    Nentries = current_year - 1996 + 1
    years = [int(b[:4]) for b in bibcodes]
    yrange = range(min(years), current_year + 1)
    d0 = date(current_year, 1, 1)
    d1 = now.date()
    d2 = date(current_year, 12, 31)
    delta = (d1 - d0).days + 1   # days elapsed this year, always >= 1
    ndays = (d2 - d0).days + 1   # days in this year
    r10_corr = float(ndays) / float(delta)
    for year in yrange:
        biblist = set(b for b in bibcodes if int(b[:4]) <= year)
        # Citation counts up to and including `year`, highest first
        citations = sorted(
            (sum(1 for c in p.citations if int(c[:4]) <= year)
             for p in data if p.bibcode in biblist),
            reverse=True)
        if year < 1996:
            r10[year] = 0.0
        else:
            idx = year - 1996
            r10[year] = sum(
                float(p.reads[idx]) / float(p.author_num)
                for p in usagedata
                if p.bibcode in biblist and
                int(p.bibcode[:4]) > year - 10 and
                p.reads and len(p.reads) == Nentries)
        # Ranks are 1-based: h is the largest rank whose paper has at
        # least that many citations, g the largest rank whose square does
        # not exceed the cumulative citation count. Both default to 0.
        h[year] = max(
            [rank for rank, n in enumerate(citations, 1) if rank <= n] or
            [0])
        cumulative = np.cumsum(citations, axis=0)
        g[year] = max(
            [rank for rank, n in enumerate(cumulative, 1)
             if rank ** 2 <= n] or
            [0])
        i10[year] = len([c for c in citations if c >= 10])
        i100[year] = len([c for c in citations if c >= 100])
        if include_tori:
            tori[year] = np.sum(np.array([
                r['auth_norm'] * r['ref_norm'] for r in tori_data
                if r['pubyear'] <= year and r['cityear'] <= year
            ]))
    # Scale the partial current year up to a full year. The year is absent
    # when every bibcode is dated in the future (empty year range).
    if current_year in r10:
        r10[current_year] = r10[current_year] * r10_corr
    series['i10'] = i10
    series['i100'] = i100
    series['h'] = h
    series['g'] = g
    series['read10'] = r10
    if include_tori:
        series['tori'] = tori
    return series