def test_strip11s_too_shorts(self):
    """Pin ``_strip11s`` results for short ids.

    In every expected value the "11" digits are kept; the only
    difference from the input, where there is one, is that leading
    zeros are gone.
    """
    reader = HepReader()
    # (raw id, date passed alongside it, expected result)
    cases = [
        ("0011999", date(2000, 11, 1), "11999"),
        ("11999", date(2000, 11, 1), "11999"),
        ("1199", date(2000, 1, 1), "1199"),
        ("0112999", date(2001, 12, 1), "112999"),
    ]
    for raw_id, when, expected in cases:
        self.assertEqual(reader._strip11s(raw_id, when), expected)
def test_strip11s_cross_cits(self):
    """Pin ``_strip11s`` results for ids that carry a leading "11".

    In every case the expected value is the input id with its leading
    "11" removed.
    """
    reader = HepReader()
    # Months are written as plain decimals (7, not 07): a leading-zero
    # literal is octal in Python 2 and a syntax error in Python 3.
    # (raw id, date passed alongside it, expected result)
    cases = [
        ("119807999", date(1998, 7, 1), "9807999"),
        ("11107999", date(2001, 7, 1), "107999"),
        ("11207999", date(2002, 7, 1), "207999"),
        ("1112999", date(2000, 12, 1), "12999"),
        ("117999", date(2000, 7, 1), "7999"),
    ]
    for raw_id, when, expected in cases:
        self.assertEqual(reader._strip11s(raw_id, when), expected)
def plot_community_size_distribution():
    """Plot overall distribution for HEP-formatted files.

    For every year from 1992 through 2003 a ``HepReader`` is obtained via
    ``HepReader.get_for_year`` and fed to ``RoleMining``; the sizes of all
    resulting communities are pooled. Sizes strictly greater than
    ``min_size`` are drawn as one histogram and the figure is shown.

    Raises:
        ValueError: if no community was found at all (``max`` of an
            empty list).
    """
    min_size = 10
    label = "overall"

    all_sizes = []
    for year in range(1992, 2004):
        hp = HepReader.get_for_year(year)
        rm = RoleMining(hp.get_nodes(), hp.get_edges())
        all_sizes.extend(len(c) for c in rm.communities.values())
    max_size = max(all_sizes)

    P.xlabel("Size of community [members]")
    P.ylabel("Number of communities")
    # The threshold in the title is taken from min_size so the two cannot
    # drift apart; with min_size == 10 the text is unchanged.
    P.suptitle(
        "Year {}, communities bigger than {} members\n"
        "Non-overlapping community size distribution".format(label, min_size)
    )

    # Build real lists: filter() and range() only return lists on Python 2.
    shown_sizes = [size for size in all_sizes if size > min_size]
    _, _, patches = P.hist(shown_sizes)
    autolabel(patches)

    P.xticks([min_size] + list(range(min_size - 10, max_size + 1, 100)))
    P.xlim(min_size, max_size)
    P.show()
def plot_community_size_distribution_from_cfinder(year, k, prop, color):
    """Plot community size distribution from CFinder cliques output files.

    The input file is located from ``year``, ``k`` and ``prop``. Each
    non-blank line must split into exactly two whitespace-separated
    integers, read as ``<size> <count>``. One bar per size is drawn in
    ``color`` and the axis labels are set; ``P.show()`` is not called here.

    Raises:
        ValueError: if a non-blank line does not hold exactly two integer
            fields, or if the file yields no entries (``max`` of an empty
            list).
    """
    filename = "datasets/enron/communities/8-{0}/k={1}/{2}".format(year, k, prop)
    # NOTE: the HepTh equivalent of this path is
    # "datasets/hepth/communities/cit-HepTh-{0}/k={1}/{2}".

    sizes = {}
    for line in HepReader.read_lines(filename):
        # Skip blank and whitespace-only lines (e.g. a bare newline), which
        # would otherwise break the two-value unpacking below.
        if not line.strip():
            continue
        size, count = line.split()
        sizes[int(size)] = int(count)

    # Materialise as lists so the plotting call never receives dict views.
    x = list(sizes.keys())
    y = list(sizes.values())

    P.bar(x, y, label="year:{}, k={}, max={}".format(year, k, max(x)),
          align='center', alpha=0.7, color=color)
    P.xlabel("Size of community [members]")
    P.ylabel("Number of communities")
def test_date_from_id_regular(self):
    """The id "212999" is expected to map to 1 December 2002."""
    # Day written as 1, not 01: leading-zero literals are octal in
    # Python 2 and a syntax error in Python 3.
    self.assertEqual(HepReader._date_from_id("212999"), date(2002, 12, 1))
def test_date_from_idXXcent(self):
    """The id "9901999" is expected to map to 1 January 1999."""
    # Month/day written as 1, not 01: leading-zero literals are octal in
    # Python 2 and a syntax error in Python 3.
    self.assertEqual(HepReader._date_from_id("9901999"), date(1999, 1, 1))
def test_date_from_id_no_zeros(self):
    """The id "1999" is expected to map to 1 January 2000."""
    # Month/day written as 1, not 01: leading-zero literals are octal in
    # Python 2 and a syntax error in Python 3.
    self.assertEqual(HepReader._date_from_id("1999"), date(2000, 1, 1))
def _get_edges_per_slot():
    """Return a dict ``{year: edges}`` for the years 1992 through 2003.

    The edges of each year are whatever ``HepReader.read_edges`` returns
    for that year's timeslot file.
    """
    path_template = "datasets/hepth/timeslots/cit-HepTh-{0}.edges"
    return {
        year: HepReader.read_edges(path_template.format(year))
        for year in range(1992, 2004)
    }
P.xlim(min_size, max_size) P.show() def plot_data_distribution(): """Plot number of new papers and new citations per year, take Hep files as input""" minyear = 1992 maxyear = 2003 x = [datetime(year, 01, 01) for year in range(minyear, maxyear + 1)] x.extend([datetime(year, 06, 01) for year in range(minyear, maxyear + 1)]) x.extend([datetime(year, 9, 1) for year in range(minyear, maxyear + 1)]) x.extend([datetime(year, 12, 1) for year in range(minyear, maxyear + 1)]) x = sorted(x) dates = HepReader.read_dates("/home/stpk/dev/role-mining/datasets/hepth/cit-HepTh-dates-cleaned.txt") edges = HepReader.read_edges("/home/stpk/dev/role-mining/datasets/hepth/cit-HepTh.txt") slots = HepReader.split_to_timeslots(dates, edges, x) for slot in slots.values(): print len(slot) citats = [len(slots[t]) for t in x] print citats print x fig, ax = P.subplots() ax.xaxis_date() P.title("Nowe publikacje na kwartal") P.plot(x, citats, color='b', alpha=0.5, label="Nowe cytowania") ax.set_xticks(x) P.xticks(rotation=70)