Ejemplo n.º 1
0
    def test_strip11s_too_shorts(self):
        """Check ``_strip11s`` on short ids: only leading zeros are dropped.

        In every case the expected value is the input id without its leading
        zeros; the "11" digits stay in place.
        """
        reader = HepReader()
        # (raw id, reference date, expected result)
        cases = (
            ("0011999", date(2000, 11, 1), "11999"),
            ("11999", date(2000, 11, 1), "11999"),
            ("1199", date(2000, 1, 1), "1199"),
            ("0112999", date(2001, 12, 1), "112999"),
        )
        for raw_id, when, expected in cases:
            self.assertEqual(reader._strip11s(raw_id, when), expected)
Ejemplo n.º 2
0
 def test_strip11s_cross_cits(self):
     """Check that ``_strip11s`` removes the leading "11" from these ids.

     Each expected value is the input id with its first two characters
     ("11") removed, for the given reference date.
     """
     r = HepReader()
     # Months are written without a leading zero: ``07`` is an octal literal
     # on Python 2 and a SyntaxError on Python 3.
     cases = (
         ("119807999", date(1998, 7, 1), "9807999"),
         ("11107999", date(2001, 7, 1), "107999"),
         ("11207999", date(2002, 7, 1), "207999"),
         ("1112999", date(2000, 12, 1), "12999"),
         ("117999", date(2000, 7, 1), "7999"),
     )
     for raw_id, when, expected in cases:
         self.assertEqual(r._strip11s(raw_id, when), expected)
Ejemplo n.º 3
0
def plot_community_size_distribution():
    """Plot overall community size distribution for HEP-formatted files.

    For every year in 1992..2003 a reader is obtained through
    ``HepReader.get_for_year``, communities are computed by ``RoleMining``
    from the reader's nodes and edges, and the community sizes of all years
    are pooled. A histogram of the sizes strictly greater than ``min_size``
    is drawn and displayed with ``P.show()``.

    Raises:
        ValueError: if no community was found at all (``max`` of an empty
            list).
    """
    min_size = 10
    all_sizes = []
    for year in range(1992, 2004):
        hp = HepReader.get_for_year(year)
        rm = RoleMining(hp.get_nodes(), hp.get_edges())
        all_sizes.extend(len(c) for c in rm.communities.values())

    max_size = max(all_sizes)

    # The figure covers all years at once, hence the fixed label.
    label = "overall"

    P.xlabel("Size of community [members]")
    P.ylabel("Number of communities")
    P.suptitle("Year {}, communities bigger than 10 members\nNon-overlapping community size distribution".format(label))
    # A real list (not ``filter``) so the call also works where ``filter``
    # returns a lazy iterator.
    _, _, patches = P.hist([size for size in all_sizes if size > min_size])
    autolabel(patches)
    # ``list(range(...))`` keeps the concatenation valid where ``range`` is
    # not a list.
    P.xticks([min_size] + list(range(min_size - 10, max_size + 1, 100)))
    P.xlim(min_size, max_size)
    P.show()
Ejemplo n.º 4
0
def plot_community_size_distribution_from_cfinder(year, k, prop, color):
    """Plot community size distribution from CFinder cliques output files.

    Reads ``datasets/enron/communities/8-{year}/k={k}/{prop}`` through
    ``HepReader.read_lines``. Every non-blank line must hold two
    whitespace-separated integers, ``size count``. The counts are drawn as a
    bar chart on the current pyplot figure; the figure is not shown here.

    Args:
        year: value substituted into the dataset path and the bar label.
        k: value substituted into the ``k=`` path component and the label.
        prop: name of the file inside the ``k=`` directory.
        color: colour passed to ``P.bar``.

    Raises:
        ValueError: if a non-blank line does not hold exactly two integers,
            or if the file has no data lines (``max`` of an empty list).
    """
    filename = "datasets/enron/communities/8-{0}/k={1}/{2}".format(year, k, prop)
    # Alternative layout used for the HepTh dataset:
    # "datasets/hepth/communities/cit-HepTh-{0}/k={1}/{2}".format(year, k, prop)
    lines = HepReader.read_lines(filename)

    sizes = {}
    for line in lines:
        # Skip empty and whitespace-only lines; the latter would otherwise
        # fail the two-value unpacking below.
        if not line.strip():
            continue
        size, count = line.split()
        sizes[int(size)] = int(count)

    # Materialise as lists so x and y stay aligned and plotting does not
    # depend on dict views being accepted.
    x = list(sizes)
    y = [sizes[size] for size in x]

    P.bar(x, y, label="year:{}, k={}, max={}".format(year, k, max(x)), align='center', alpha=0.7, color=color)

    P.xlabel("Size of community [members]")
    P.ylabel("Number of communities")
Ejemplo n.º 5
0
 def test_date_from_id_regular(self):
     """Id "212999" is expected to map to 1 December 2002."""
     # Day written as ``1``: ``01`` is a SyntaxError on Python 3.
     self.assertEqual(HepReader._date_from_id("212999"), date(2002, 12, 1))
Ejemplo n.º 6
0
 def test_date_from_idXXcent(self):
     """Id "9901999" is expected to map to 1 January 1999."""
     # Month/day written as ``1``: ``01`` is a SyntaxError on Python 3.
     self.assertEqual(HepReader._date_from_id("9901999"), date(1999, 1, 1))
Ejemplo n.º 7
0
 def test_date_from_id_no_zeros(self):
     """Id "1999" (no leading zeros) is expected to map to 1 January 2000."""
     # Month/day written as ``1``: ``01`` is a SyntaxError on Python 3.
     self.assertEqual(HepReader._date_from_id("1999"), date(2000, 1, 1))
Ejemplo n.º 8
0
def _get_edges_per_slot():
    """Return ``{year: edges}`` for every year from 1992 through 2003.

    The edges of each year are loaded with ``HepReader.read_edges`` from
    ``datasets/hepth/timeslots/cit-HepTh-<year>.edges``.
    """
    path_template = "datasets/hepth/timeslots/cit-HepTh-{0}.edges"
    return {
        year: HepReader.read_edges(path_template.format(year))
        for year in range(1992, 2004)
    }
Ejemplo n.º 9
0
    P.xlim(min_size, max_size)
    P.show()


def plot_data_distribution():
    """Plot number of new papers and new citations per year, take Hep files as input.

    Builds slot boundaries on 1 Jan, 1 Jun, 1 Sep and 1 Dec of every year
    from 1992 through 2003, splits the edges into those slots with
    ``HepReader.split_to_timeslots`` and plots the size of each slot against
    its boundary date. Slot sizes and boundaries are also printed.
    """
    minyear = 1992
    maxyear = 2003
    # Plain ``1``/``6`` instead of ``01``/``06``: leading-zero literals are
    # octal on Python 2 and a SyntaxError on Python 3.
    # NOTE(review): the plot title says "per quarter", yet these four dates
    # do not split the year evenly - confirm the intended boundaries.
    x = sorted(
        datetime(year, month, 1)
        for year in range(minyear, maxyear + 1)
        for month in (1, 6, 9, 12)
    )
    # NOTE(review): absolute, machine-specific paths - confirm before reuse.
    dates = HepReader.read_dates("/home/stpk/dev/role-mining/datasets/hepth/cit-HepTh-dates-cleaned.txt")
    edges = HepReader.read_edges("/home/stpk/dev/role-mining/datasets/hepth/cit-HepTh.txt")
    slots = HepReader.split_to_timeslots(dates, edges, x)
    # Single-argument print() calls give the same output on Python 2 and 3.
    for slot in slots.values():
        print(len(slot))

    citats = [len(slots[t]) for t in x]
    print(citats)
    print(x)

    fig, ax = P.subplots()
    ax.xaxis_date()
    P.title("Nowe publikacje na kwartal")
    P.plot(x, citats, color='b', alpha=0.5, label="Nowe cytowania")
    ax.set_xticks(x)
    P.xticks(rotation=70)