Exemple #1
0
def process_paste(text, has_header_row=True):
    """
    Return results for a pasted table

    Every pasted line must hold two columns, written either as two
    double-quoted fields separated by a tab or as two bare fields separated
    by a single tab. The rows are written to a CSV file through
    ``filehandler.write_to_csv``, summarised with ``ctd.get_summary`` and the
    file is then handed to ``filehandler.delete_files``.

    :param text: raw pasted text, one table row per line
    :param has_header_row: when True the first pasted row supplies the
        column headers; otherwise 'source' and 'target' are used
    :return: the summary with 'has_multiple_sheets' and 'filename' filled
        in, or None when a line cannot be read as two columns or when a
        header row was requested but nothing was pasted
    """
    csv_rows = []

    for line in text.splitlines():
        # One match per quoted field or tab; a tab contributes an empty
        # capture. The group is matched once (no nested quantifier) so it
        # always holds the text between the quotes.
        tokens = re.findall(r'"(.*?)"|\t', line)
        if len(tokens) == 3:
            # Expected shape: "source" <tab> "target"
            csv_rows.append((tokens[0], tokens[2]))
        elif len(tokens) == 1:
            cells = line.split('\t')
            if len(cells) != 2:
                # A lone quoted field without any tab is not a two-column row
                return None
            csv_rows.append((cells[0], cells[1]))
        else:
            return None

    if has_header_row:
        if not csv_rows:
            # Nothing pasted, so there is no header row to take
            return None
        headers = csv_rows.pop(0)
    else:
        headers = ['source', 'target']

    file_path = filehandler.write_to_csv(headers, csv_rows)
    try:
        file_size = os.stat(file_path).st_size
        logger.debug('[CTD] File size: %d bytes', file_size)

        results = ctd.get_summary(file_path)
        results['has_multiple_sheets'] = False
        results['filename'] = 'Your Pasted Data'
    finally:
        # Clean up the generated CSV even when summarising fails
        filehandler.delete_files([file_path])
    return results
Exemple #2
0
    def test_large_file(self):
        """
        Check counts and centrality estimates for the airline-routes fixture

        The centralities of the first 40 table rows are compared, to two
        decimal places, with the values in airline-routes-centralities.csv.
        """
        test_data_path = os.path.join(self._fixtures_dir, 'airline-routes.csv')
        results = ctd.get_summary(test_data_path)

        self.assertEqual(results['nodes'], 3425)
        self.assertEqual(results['edges'], 19257)
        self.assertTrue(results['large_dataset'])

        bc_estimates = {
            row['id']: row['centrality']
            for row in results['table'][:40]
        }

        table_path = os.path.join(self._fixtures_dir,
                                  'airline-routes-centralities.csv')
        # The with-block closes the fixture handle even when an assertion
        # fails; previously the handle was never closed.
        with codecs.open(table_path, 'r') as table_file:
            bc_table = table.Table.from_csv(table_file,
                                            no_header_row=False,
                                            snifflimit=0)
            for row in bc_table.to_rows():
                if row[0] in bc_estimates:
                    self.assertAlmostEqual(
                        bc_estimates[row[0]], row[1],
                        places=2)  # accurate to two decimal places
Exemple #3
0
    def test_clustering_score(self):
        """
        Compare the reported clustering score with a hand-rolled average

        The expected value is the mean over all nodes v of the local score:

                  2 Nv        where Kv = degree
        C(v) = ----------           Nv = number of edges between
               Kv (Kv - 1)               the neighbors of v

        Nodes with degree 1 or less contribute a score of 0.
        """
        fixture = os.path.join(self._fixtures_dir, 'les-miserables.csv')
        summary = ctd.get_summary(fixture)
        network = ctd.get_graph(fixture)

        scores = []
        for node in network.nodes():
            degree = network.degree(node)

            # Distinct links among the neighbours, stored as sorted pairs
            links = set()
            for neighbor in nx.all_neighbors(network, node):
                for shared in nx.common_neighbors(network, neighbor, node):
                    links.add(tuple(sorted((neighbor, shared))))
            link_count = len(links)

            if degree > 1:
                scores.append(2 * link_count / float(degree * (degree - 1)))
            else:
                scores.append(0)

        expected = sum(scores) / float(len(scores))
        self.assertAlmostEqual(summary['clustering'], expected)
Exemple #4
0
def process_paste(text, has_header_row=True):
    """
    Return results for a pasted table

    Every pasted line must hold two columns, written either as two
    double-quoted fields separated by a tab or as two bare fields separated
    by a single tab. The rows are written to a CSV file through
    ``filehandler.write_to_csv``, summarised with ``ctd.get_summary`` and the
    file is then handed to ``filehandler.delete_files``.

    :param text: raw pasted text, one table row per line
    :param has_header_row: when True the first pasted row supplies the
        column headers; otherwise 'source' and 'target' are used
    :return: the summary with 'has_multiple_sheets' and 'filename' filled
        in, or None when a line cannot be read as two columns or when a
        header row was requested but nothing was pasted
    """
    csv_rows = []

    for line in text.splitlines():
        # One match per quoted field or tab; a tab contributes an empty
        # capture. The group is matched once (no nested quantifier) so it
        # always holds the text between the quotes.
        tokens = re.findall(r'"(.*?)"|\t', line)
        if len(tokens) == 3:
            # Expected shape: "source" <tab> "target"
            csv_rows.append((tokens[0], tokens[2]))
        elif len(tokens) == 1:
            cells = line.split('\t')
            if len(cells) != 2:
                # A lone quoted field without any tab is not a two-column row
                return None
            csv_rows.append((cells[0], cells[1]))
        else:
            return None

    if has_header_row:
        if not csv_rows:
            # Nothing pasted, so there is no header row to take
            return None
        headers = csv_rows.pop(0)
    else:
        headers = ['source', 'target']

    file_path = filehandler.write_to_csv(headers, csv_rows)
    try:
        file_size = os.stat(file_path).st_size
        logger.debug('[CTD] File size: %d bytes', file_size)

        results = ctd.get_summary(file_path)
        results['has_multiple_sheets'] = False
        results['filename'] = 'Your Pasted Data'
    finally:
        # Clean up the generated CSV even when summarising fails
        filehandler.delete_files([file_path])
    return results
Exemple #5
0
 def test_import_xls(self):
     """Convert the karate-club workbook to CSV and check the graph size."""
     workbook = os.path.join(self._fixtures_dir, 'zachary-karate-club.xlsx')
     # Only the first path returned by the conversion is summarised
     first_csv = filehandler.convert_to_csv(workbook)[0]
     summary = ctd.get_summary(first_csv)
     for key, expected in (('nodes', 34), ('edges', 78)):
         self.assertEqual(summary[key], expected)
    def test_degree_scores(self):
        """Check that the highest-degree row for les-miserables is Valjean."""
        fixture = os.path.join(self._fixtures_dir, "les-miserables.csv")
        summary = ctd.get_summary(fixture)

        # Highest degree first
        by_degree = list(summary["table"])
        by_degree.sort(key=operator.itemgetter("degree"), reverse=True)
        top = by_degree[0]

        self.assertEqual(top["id"], u"Valjean")
        self.assertEqual(top["degree"], 36)  # counted manually
Exemple #7
0
    def test_as_gexf(self):
        """Compare the generated GEXF text with the stored graph.gexf file."""
        fixture = os.path.join(self._fixtures_dir, 'les-miserables.csv')
        summary = ctd.get_summary(fixture)

        expected_path = os.path.join(self._fixtures_dir, 'graph.gexf')
        with open(expected_path, 'r') as handle:
            expected = handle.read()

        self.assertEqual(expected, summary['gexf'])
Exemple #8
0
    def test_is_not_bipartite_candidate(self):
        """Check simple-network is not bipartite and no node has a 'column'."""
        fixture = os.path.join(self._fixtures_dir, 'simple-network.csv')
        summary = ctd.get_summary(fixture)
        node_list = json.loads(summary['json'])['nodes']

        self.assertFalse(summary['bipartite'])
        for node in node_list:
            self.assertNotIn('column', node)
Exemple #9
0
    def test_degree_scores(self):
        """Check that the highest-degree row for les-miserables is Valjean."""
        fixture = os.path.join(self._fixtures_dir, 'les-miserables.csv')
        summary = ctd.get_summary(fixture)

        # Highest degree first
        by_degree = list(summary['table'])
        by_degree.sort(key=operator.itemgetter('degree'), reverse=True)
        top = by_degree[0]

        self.assertEqual(top['id'], 'Valjean')
        self.assertEqual(top['degree'], 36)  # counted manually
Exemple #10
0
    def test_is_not_bipartite_candidate(self):
        """Check simple-network is not bipartite and no node has a 'column'."""
        fixture = os.path.join(self._fixtures_dir, "simple-network.csv")
        summary = ctd.get_summary(fixture)
        node_list = json.loads(summary["json"])["nodes"]

        self.assertFalse(summary["bipartite"])
        for node in node_list:
            self.assertNotIn("column", node)
Exemple #11
0
    def test_as_gexf(self):
        """Compare the generated GEXF text with the stored graph.gexf file."""
        fixture = os.path.join(self._fixtures_dir, "les-miserables.csv")
        summary = ctd.get_summary(fixture)

        expected_path = os.path.join(self._fixtures_dir, "graph.gexf")
        with open(expected_path, "r") as handle:
            expected = handle.read()

        self.assertEqual(expected, summary["gexf"])
Exemple #12
0
    def test_as_json_nodes(self):
        """Check degree and centrality of each JSON node in simple-network."""
        fixture = os.path.join(self._fixtures_dir, "simple-network.csv")
        summary = ctd.get_summary(fixture)
        payload = json.loads(summary["json"])

        # Ordered by id: [A, B, C, D, E]
        ordered = list(payload["nodes"])
        ordered.sort(key=operator.itemgetter("id"))

        self.assertEqual(len(ordered), 5)
        for index in (0, 1, 3, 4):
            leaf = ordered[index]
            self.assertEqual(leaf["degree"], 1)
            self.assertEqual(leaf["centrality"], 0)

        hub = ordered[2]
        self.assertEqual(hub["degree"], 4)
        self.assertEqual(hub["centrality"], 1)
Exemple #13
0
def process_sample(source):
    """
    Summarise one of the bundled sample files

    The path and title are looked up through ``filehandler``; the summary
    from ``ctd.get_summary`` is returned with 'has_multiple_sheets' set to
    False and 'filename' set to the sample title.
    """
    path = filehandler.get_sample_path(source)
    title = filehandler.get_sample_title(source)
    logger.debug('[CTD] Loading from: %s', path)

    summary = ctd.get_summary(path)
    summary.update({
        'has_multiple_sheets': False,
        'filename': title,
    })
    return summary
Exemple #14
0
    def test_as_json_nodes(self):
        """Check degree and centrality of each JSON node in simple-network."""
        fixture = os.path.join(self._fixtures_dir, 'simple-network.csv')
        summary = ctd.get_summary(fixture)
        payload = json.loads(summary['json'])

        # Ordered by id: [A, B, C, D, E]
        ordered = list(payload['nodes'])
        ordered.sort(key=operator.itemgetter('id'))

        self.assertEqual(len(ordered), 5)
        for index in (0, 1, 3, 4):
            leaf = ordered[index]
            self.assertEqual(leaf['degree'], 1)
            self.assertEqual(leaf['centrality'], 0)

        hub = ordered[2]
        self.assertEqual(hub['degree'], 4)
        self.assertEqual(hub['centrality'], 1)
Exemple #15
0
    def test_as_json_edges(self):
        """Check the four JSON links of simple-network: A-C, C-B, C-D, C-E."""
        fixture = os.path.join(self._fixtures_dir, "simple-network.csv")
        summary = ctd.get_summary(fixture)
        payload = json.loads(summary["json"])
        nodes = payload["nodes"]

        def node_id(index):
            # Links refer to nodes by their position in the node list
            return nodes[index]["id"]

        def endpoint_ids(link):
            return (node_id(link["source"]), node_id(link["target"]))

        edges = sorted(payload["links"], key=endpoint_ids)

        self.assertEqual(len(edges), 4)
        self.assertEqual(node_id(edges[0]["source"]), u"A")
        self.assertEqual(node_id(edges[0]["target"]), u"C")

        expected_targets = ["B", "D", "E"]
        for position in (1, 2, 3):
            link = edges[position]
            self.assertEqual(node_id(link["source"]), u"C")
            self.assertEqual(node_id(link["target"]),
                             expected_targets[position - 1])
Exemple #16
0
def process_sample(source):
    """
    Summarise one of the bundled sample files

    The sample record comes from ``filehandler.get_sample``; its 'path' is
    summarised and its 'title' and 'biography' are copied onto the summary
    as 'filename' and 'biography', with 'has_multiple_sheets' set to False.
    """
    sample = filehandler.get_sample(source)
    path, title = sample['path'], sample['title']
    logger.debug('[CTD] Loading from: %s', path)

    summary = ctd.get_summary(path)
    summary.update({
        'has_multiple_sheets': False,
        'filename': title,
        'biography': sample['biography'],
    })
    return summary
Exemple #17
0
    def test_centrality_scores_simple(self):
        """
        Check betweenness centrality on a star that can be verified by hand

        A       D
          > C <      Every shortest path runs through C, so C scores 1
        B       E
        """
        fixture = os.path.join(self._fixtures_dir, 'simple-network.csv')
        rows = ctd.get_summary(fixture)['table']

        hub = rows[0]
        self.assertEqual(hub['id'], 'C')
        self.assertEqual(hub['centrality'], 1)

        # The remaining four rows are the leaves
        for index in (1, 2, 3, 4):
            self.assertEqual(rows[index]['centrality'], 0)
Exemple #18
0
    def test_degree_scores_simple(self):
        """
        Check degree scores on a star that can be verified by hand

        A       D
          > C <      C has degree 4, every other node has degree 1
        B       E
        """
        fixture = os.path.join(self._fixtures_dir, "simple-network.csv")
        summary = ctd.get_summary(fixture)

        # Highest degree first
        rows = list(summary["table"])
        rows.sort(key=operator.itemgetter("degree"), reverse=True)

        hub = rows[0]
        self.assertEqual(hub["id"], u"C")
        self.assertEqual(hub["degree"], 4)
        for index in (1, 2, 3, 4):
            self.assertEqual(rows[index]["degree"], 1)
Exemple #19
0
def process_upload(file, has_header_row=True):
    """
    Return results for an uploaded file

    The upload is opened through ``filehandler.open_doc`` and converted with
    ``filehandler.convert_to_csv``; only the first resulting CSV is
    summarised.

    :param file: uploaded file object; its ``filename`` attribute is copied
        onto the results
    :param has_header_row: forwarded to ``ctd.get_summary``
    :return: the summary with 'has_multiple_sheets' (True when the
        conversion produced more than one CSV) and 'filename' filled in
    """
    file_path = filehandler.open_doc(file)
    file_name = file.filename
    file_size = os.stat(file_path).st_size
    logger.debug('[CTD] File size: %d bytes', file_size)

    csv_paths = filehandler.convert_to_csv(file_path)
    try:
        results = ctd.get_summary(csv_paths[0], has_header_row) # only use first sheet
        results['has_multiple_sheets'] = len(csv_paths) > 1
        results['filename'] = file_name
    finally:
        # Hand the converted CSVs to cleanup even when summarising fails
        filehandler.delete_files(csv_paths)
    return results
Exemple #20
0
    def test_centrality_scores_simple(self):
        """
        Check betweenness centrality on a star that can be verified by hand

        A       D
          > C <      Every shortest path runs through C, so C scores 1
        B       E
        """
        fixture = os.path.join(self._fixtures_dir, "simple-network.csv")
        rows = ctd.get_summary(fixture)["table"]

        hub = rows[0]
        self.assertEqual(hub["id"], u"C")
        self.assertEqual(hub["centrality"], 1)

        # The remaining four rows are the leaves
        for index in (1, 2, 3, 4):
            self.assertEqual(rows[index]["centrality"], 0)
Exemple #21
0
    def test_as_json_edges(self):
        """Check the four JSON links of simple-network: A-C, C-B, C-D, C-E."""
        fixture = os.path.join(self._fixtures_dir, 'simple-network.csv')
        summary = ctd.get_summary(fixture)
        payload = json.loads(summary['json'])
        nodes = payload['nodes']

        def node_id(index):
            # Links refer to nodes by their position in the node list
            return nodes[index]['id']

        def endpoint_ids(link):
            return (node_id(link['source']), node_id(link['target']))

        edges = sorted(payload['links'], key=endpoint_ids)

        self.assertEqual(len(edges), 4)
        self.assertEqual(node_id(edges[0]['source']), 'A')
        self.assertEqual(node_id(edges[0]['target']), 'C')

        expected_targets = ['B', 'D', 'E']
        for position in (1, 2, 3):
            link = edges[position]
            self.assertEqual(node_id(link['source']), 'C')
            self.assertEqual(node_id(link['target']),
                             expected_targets[position - 1])
Exemple #22
0
def process_upload(file, has_header_row=True):
    """
    Return results for an uploaded file

    The upload is opened through ``filehandler.open_doc`` and converted with
    ``filehandler.convert_to_csv``; only the first resulting CSV is
    summarised.

    :param file: uploaded file object; its ``filename`` attribute is copied
        onto the results
    :param has_header_row: forwarded to ``ctd.get_summary``
    :return: the summary with 'has_multiple_sheets' (True when the
        conversion produced more than one CSV) and 'filename' filled in
    """
    file_path = filehandler.open_doc(file)
    file_name = file.filename
    file_size = os.stat(file_path).st_size
    logger.debug('[CTD] File size: %d bytes', file_size)

    csv_paths = filehandler.convert_to_csv(file_path)
    try:
        results = ctd.get_summary(csv_paths[0],
                                  has_header_row)  # only use first sheet
        results['has_multiple_sheets'] = len(csv_paths) > 1
        results['filename'] = file_name
    finally:
        # Hand the converted CSVs to cleanup even when summarising fails
        filehandler.delete_files(csv_paths)
    return results
Exemple #23
0
    def test_degree_scores_simple(self):
        """
        Check degree scores on a star that can be verified by hand

        A       D
          > C <      C has degree 4, every other node has degree 1
        B       E
        """
        fixture = os.path.join(self._fixtures_dir, 'simple-network.csv')
        summary = ctd.get_summary(fixture)

        # Highest degree first
        rows = list(summary['table'])
        rows.sort(key=operator.itemgetter('degree'), reverse=True)

        hub = rows[0]
        self.assertEqual(hub['id'], 'C')
        self.assertEqual(hub['degree'], 4)
        for index in (1, 2, 3, 4):
            self.assertEqual(rows[index]['degree'], 1)
Exemple #24
0
    def test_is_bipartite_candidate(self):
        """Check southern-women is bipartite and each node's 'column' value."""
        fixture = os.path.join(self._fixtures_dir, 'southern-women.csv')
        summary = ctd.get_summary(fixture)
        node_list = json.loads(summary['json'])['nodes']

        # Ids expected in column 0 and column 1 respectively
        first_column_ids = (
            'BRENDA', 'CHARLOTTE', 'DOROTHY', 'ELEANOR', 'EVELYN', 'FLORA',
            'FRANCES', 'HELEN', 'KATHERINE', 'LAURA', 'MYRNA', 'NORA',
            'OLIVIA', 'PEARL', 'RUTH', 'SYLVIA', 'THERESA', 'VERNE',
        )
        second_column_ids = (
            'E1', 'E10', 'E11', 'E12', 'E13', 'E14', 'E2', 'E3', 'E4', 'E5',
            'E6', 'E7', 'E8', 'E9',
        )
        expected_column = {name: 0 for name in first_column_ids}
        expected_column.update((name, 1) for name in second_column_ids)

        self.assertTrue(summary['bipartite'])
        for node in node_list:
            self.assertEqual(node['column'], expected_column[node['id']])
Exemple #25
0
    def test_is_bipartite_candidate(self):
        """Check southern-women is bipartite and each node's 'column' value."""
        fixture = os.path.join(self._fixtures_dir, "southern-women.csv")
        summary = ctd.get_summary(fixture)
        node_list = json.loads(summary["json"])["nodes"]

        # Ids expected in column 0 and column 1 respectively
        first_column_ids = (
            u"BRENDA", u"CHARLOTTE", u"DOROTHY", u"ELEANOR", u"EVELYN",
            u"FLORA", u"FRANCES", u"HELEN", u"KATHERINE", u"LAURA", u"MYRNA",
            u"NORA", u"OLIVIA", u"PEARL", u"RUTH", u"SYLVIA", u"THERESA",
            u"VERNE",
        )
        second_column_ids = (
            u"E1", u"E10", u"E11", u"E12", u"E13", u"E14", u"E2", u"E3",
            u"E4", u"E5", u"E6", u"E7", u"E8", u"E9",
        )
        expected_column = {name: 0 for name in first_column_ids}
        expected_column.update((name, 1) for name in second_column_ids)

        self.assertTrue(summary["bipartite"])
        for node in node_list:
            self.assertEqual(node["column"], expected_column[node["id"]])
Exemple #26
0
    def test_large_file(self):
        """
        Check counts and centrality estimates for the airline-routes fixture

        The centralities of the first 40 table rows are compared, to two
        decimal places, with the values in airline-routes-centralities.csv.
        """
        test_data_path = os.path.join(self._fixtures_dir, "airline-routes.csv")
        results = ctd.get_summary(test_data_path)

        self.assertEqual(results["nodes"], 3425)
        self.assertEqual(results["edges"], 19257)
        self.assertTrue(results["large_dataset"])

        bc_estimates = {row["id"]: row["centrality"] for row in results["table"][:40]}

        table_path = os.path.join(self._fixtures_dir, "airline-routes-centralities.csv")
        # The with-block closes the fixture handle even when an assertion
        # fails; previously the handle was never closed.
        with codecs.open(table_path, "r") as table_file:
            bc_table = table.Table.from_csv(table_file, no_header_row=False, snifflimit=0)
            for row in bc_table.to_rows():
                if row[0] in bc_estimates:
                    self.assertAlmostEqual(bc_estimates[row[0]], row[1], places=2)  # accurate to two decimal places
Exemple #27
0
    def test_centrality_scores(self):
        """
        Test betweenness centrality with generalized formula

        For the node Valjean and every pair (s, t) of other nodes, take the
        proportion of shortest paths s => t that pass through Valjean. The
        sum of those proportions is normalized by dividing it by
        (N - 1)(N - 2) / 2, the number of such node pairs.
        """
        test_data_path = os.path.join(self._fixtures_dir, 'les-miserables.csv')
        results = ctd.get_summary(test_data_path)
        graph = ctd.get_graph(test_data_path)

        table = results['table']
        self.assertEqual(table[0]['id'], 'Valjean')

        # Copy into a list so removal works whether graph.nodes() returns a
        # list or a read-only view, and never touches the graph itself.
        nodes = list(graph.nodes())
        nodes.remove('Valjean')

        betweenness_centrality = 0

        # Visit every unordered pair exactly once, in the same order as a
        # nested scan over all nodes would first encounter them.
        for index, u in enumerate(nodes):
            for v in nodes[index + 1:]:
                paths = list(nx.all_shortest_paths(graph, u, v))
                paths_with_valjean = sum(
                    1 for path in paths if 'Valjean' in path)
                betweenness_centrality += paths_with_valjean / float(
                    len(paths))

        node_pairs = len(nodes) * (len(nodes) - 1) / float(2)
        normalized_score = betweenness_centrality / node_pairs

        self.assertAlmostEqual(table[0]['centrality'], normalized_score)
Exemple #28
0
    def test_centrality_scores(self):
        """
        Test betweenness centrality with generalized formula

        For the node Valjean and every pair (s, t) of other nodes, take the
        proportion of shortest paths s => t that pass through Valjean. The
        sum of those proportions is normalized by dividing it by
        (N - 1)(N - 2) / 2, the number of such node pairs.
        """
        test_data_path = os.path.join(self._fixtures_dir, "les-miserables.csv")
        results = ctd.get_summary(test_data_path)
        graph = ctd.get_graph(test_data_path)

        table = results["table"]
        self.assertEqual(table[0]["id"], u"Valjean")

        # Copy into a list so removal works whether graph.nodes() returns a
        # list or a read-only view, and never touches the graph itself.
        nodes = list(graph.nodes())
        nodes.remove(u"Valjean")

        betweenness_centrality = 0

        # Visit every unordered pair exactly once, in the same order as a
        # nested scan over all nodes would first encounter them.
        for index, u in enumerate(nodes):
            for v in nodes[index + 1:]:
                paths = list(nx.all_shortest_paths(graph, u, v))
                paths_with_valjean = sum(1 for path in paths if u"Valjean" in path)
                betweenness_centrality += paths_with_valjean / float(len(paths))

        node_pairs = len(nodes) * (len(nodes) - 1) / float(2)
        normalized_score = betweenness_centrality / node_pairs

        self.assertAlmostEqual(table[0]["centrality"], normalized_score)
Exemple #29
0
    def test_clustering_score(self):
        """
        Compare the reported clustering score with a hand-rolled average

        The expected value is the mean over all nodes v of the local score:

                  2 Nv        where Kv = degree
        C(v) = ----------           Nv = number of edges between
               Kv (Kv - 1)               the neighbors of v

        Nodes with degree 1 or less contribute a score of 0.
        """
        fixture = os.path.join(self._fixtures_dir, "les-miserables.csv")
        summary = ctd.get_summary(fixture)
        network = ctd.get_graph(fixture)

        scores = []
        for node in network.nodes():
            degree = network.degree(node)

            # Distinct links among the neighbours, stored as sorted pairs
            links = set()
            for neighbor in nx.all_neighbors(network, node):
                for shared in nx.common_neighbors(network, neighbor, node):
                    links.add(tuple(sorted((neighbor, shared))))
            link_count = len(links)

            if degree > 1:
                scores.append(2 * link_count / float(degree * (degree - 1)))
            else:
                scores.append(0)

        expected = sum(scores) / float(len(scores))
        self.assertAlmostEqual(summary["clustering"], expected)
Exemple #30
0
 def test_clustering_score_clique(self):
     """Check that the handshake-problem fixture has a clustering score of 1."""
     fixture = os.path.join(self._fixtures_dir, 'handshake-problem.csv')
     summary = ctd.get_summary(fixture, False)
     # complete graph, all nodes connected
     expected = 1
     self.assertEqual(summary['clustering'], expected)
Exemple #31
0
 def test_density_score(self):
     """Check the density reported for the les-miserables fixture."""
     fixture = os.path.join(self._fixtures_dir, "les-miserables.csv")
     summary = ctd.get_summary(fixture)
     # Equals 2 * 254 / (77 * 76), i.e. 2 * edges / (nodes * (nodes - 1))
     expected = 0.08680792891319207
     self.assertEqual(summary["density"], expected)
Exemple #32
0
 def test_clustering_score_clique(self):
     """Check that the handshake-problem fixture has a clustering score of 1."""
     fixture = os.path.join(self._fixtures_dir, "handshake-problem.csv")
     summary = ctd.get_summary(fixture, False)
     # complete graph, all nodes connected
     expected = 1
     self.assertEqual(summary["clustering"], expected)
Exemple #33
0
 def test_clustering_score_star(self):
     """Check that the simple-network fixture has a clustering score of 0."""
     fixture = os.path.join(self._fixtures_dir, "simple-network.csv")
     summary = ctd.get_summary(fixture)
     # no clusters, neighbors are never connected
     expected = 0
     self.assertEqual(summary["clustering"], expected)
Exemple #34
0
 def test_import_no_header(self):
     """Summarise handshake-problem with the header flag off; check counts."""
     fixture = os.path.join(self._fixtures_dir, "handshake-problem.csv")
     summary = ctd.get_summary(fixture, False)
     for key, expected in (("nodes", 5), ("edges", 10)):
         self.assertEqual(summary[key], expected)
Exemple #35
0
 def test_invalid_import(self):
     """Check that the invalid-graph fixture yields an empty summary."""
     fixture = os.path.join(self._fixtures_dir, "invalid-graph.csv")
     summary = ctd.get_summary(fixture)
     empty_summary = {}
     self.assertEqual(summary, empty_summary)
Exemple #36
0
 def test_count_edges(self):
     """Check the edge count reported for the les-miserables fixture."""
     fixture = os.path.join(self._fixtures_dir, 'les-miserables.csv')
     summary = ctd.get_summary(fixture)
     expected_edges = 254  # self.table.count_rows()
     self.assertEqual(summary['edges'], expected_edges)
Exemple #37
0
 def test_count_nodes(self):
     """Check the node count reported for the les-miserables fixture."""
     fixture = os.path.join(self._fixtures_dir, 'les-miserables.csv')
     summary = ctd.get_summary(fixture)
     expected_nodes = 77  # len(set(self.table[0] + self.table[1]))
     self.assertEqual(summary['nodes'], expected_nodes)
Exemple #38
0
 def test_import_no_header(self):
     """Summarise handshake-problem with the header flag off; check counts."""
     fixture = os.path.join(self._fixtures_dir, 'handshake-problem.csv')
     summary = ctd.get_summary(fixture, False)
     for key, expected in (('nodes', 5), ('edges', 10)):
         self.assertEqual(summary[key], expected)
Exemple #39
0
 def test_import_xls(self):
     """Convert the karate-club workbook to CSV and check the graph size."""
     workbook = os.path.join(self._fixtures_dir, "zachary-karate-club.xlsx")
     # Only the first path returned by the conversion is summarised
     first_csv = filehandler.convert_to_csv(workbook)[0]
     summary = ctd.get_summary(first_csv)
     for key, expected in (("nodes", 34), ("edges", 78)):
         self.assertEqual(summary[key], expected)
Exemple #40
0
 def test_invalid_import(self):
     """Check that the invalid-graph fixture yields an empty summary."""
     fixture = os.path.join(self._fixtures_dir, 'invalid-graph.csv')
     summary = ctd.get_summary(fixture)
     empty_summary = {}
     self.assertEqual(summary, empty_summary)
Exemple #41
0
 def test_count_edges(self):
     """Check the edge count reported for the les-miserables fixture."""
     fixture = os.path.join(self._fixtures_dir, "les-miserables.csv")
     summary = ctd.get_summary(fixture)
     expected_edges = 254  # self.table.count_rows()
     self.assertEqual(summary["edges"], expected_edges)
Exemple #42
0
 def test_clustering_score_star(self):
     """Check that the simple-network fixture has a clustering score of 0."""
     fixture = os.path.join(self._fixtures_dir, 'simple-network.csv')
     summary = ctd.get_summary(fixture)
     # no clusters, neighbors are never connected
     expected = 0
     self.assertEqual(summary['clustering'], expected)
Exemple #43
0
 def test_density_score(self):
     """Check the density reported for the les-miserables fixture."""
     fixture = os.path.join(self._fixtures_dir, 'les-miserables.csv')
     summary = ctd.get_summary(fixture)
     # Equals 2 * 254 / (77 * 76), i.e. 2 * edges / (nodes * (nodes - 1))
     expected = 0.08680792891319207
     self.assertEqual(summary['density'], expected)
Exemple #44
0
 def test_count_nodes(self):
     """Check the node count reported for the les-miserables fixture."""
     fixture = os.path.join(self._fixtures_dir, "les-miserables.csv")
     summary = ctd.get_summary(fixture)
     expected_nodes = 77  # len(set(self.table[0] + self.table[1]))
     self.assertEqual(summary["nodes"], expected_nodes)