Example 1
File: hack3.py Project: dwanev/SEL
    def calc_v1_matrix(self, v0):

        from_list = []
        to_list = []
        value_list = []
        max_id = 0

        wds = WikipediaDataset()

        for from_entity_id in v0:
            link_to = wds.get_links_to(from_entity_id)
            for v in link_to:
                from_list.append(from_entity_id)
                to_list.append(v)
                value_list.append(1)
                if v > max_id:
                    max_id = v

            # out-bound links; track max_id here too so the matrix shape covers these ids
            link_from = set(wds.get_links_from(from_entity_id))
            for v in link_from:
                from_list.append(v)
                to_list.append(from_entity_id)
                value_list.append(1)
                if v > max_id:
                    max_id = v

        mtx = sparse.coo_matrix((value_list, (from_list, to_list)), shape=(max_id + 1, max_id + 1))

        full_set = set(to_list)
        full_set.update(from_list)

        return mtx, full_set
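calc_v1_matrix accumulates (from, to, 1) triples and hands them straight to scipy's COO constructor, so the entity ids themselves act as row and column indices. A minimal standalone sketch of that pattern (the ids below are made up purely for illustration):

    from scipy import sparse

    # three directed edges among entities 2, 5 and 7
    from_list = [2, 5, 7]
    to_list = [5, 7, 2]
    value_list = [1, 1, 1]

    max_id = max(max(from_list), max(to_list))
    mtx = sparse.coo_matrix((value_list, (from_list, to_list)),
                            shape=(max_id + 1, max_id + 1))
    print(mtx.toarray())  # 8x8 adjacency matrix with ones at (2, 5), (5, 7) and (7, 2)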
Example 2
    def calc_v1_matrix(self, entity_id_list):
        # find links among the nodes we are passed, plus all links
        # inbound to them and outbound from them
        from_list = []
        to_list = []
        value_list = []
        max_id = 0
        wds = WikipediaDataset()
        for from_entity_id in entity_id_list:
            link_to = wds.get_links_to(from_entity_id)
            for v in link_to:
                from_list.append(from_entity_id)
                to_list.append(v)
                value_list.append(1)
                if v > max_id:
                    max_id = v

            link_from = set(wds.get_links_from(from_entity_id))
            for v in link_from:
                from_list.append(v)
                to_list.append(from_entity_id)
                value_list.append(1)
                if v > max_id:
                    max_id = v
        # TODO The following line threw a ValueError (row index exceeds matrix dimensions) here on docid 579, and docid 105
        try:
            mtx = sparse.coo_matrix((value_list, (from_list, to_list)),
                                    shape=(max_id + 1, max_id + 1))
        except ValueError as e:
            self.logger.warning(
                'An error occurred; returning None rather than a V1 matrix. %s',
                e)
            return None
        return mtx
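The ValueError noted in the TODO is what scipy raises when any row or column index falls outside the declared shape, which is why max_id has to be updated in both the inbound and outbound loops. A small sketch of the failure mode (indices chosen only for illustration):

    from scipy import sparse

    try:
        # index 9 does not fit inside a 5x5 matrix, so the constructor rejects it
        sparse.coo_matrix(([1], ([9], [0])), shape=(5, 5))
    except ValueError as e:
        print('caught:', e)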
Example 3
    def check_for_wikititle_collisions(self, case_insensitive=True):
        input_file = gzip.open("E:\\tmp\\" + 'wikipedia-dump.json.gz', 'rt', encoding='utf-8')

        wd = WikipediaDataset()
        wikititle_mt = wd.get_wikititle_case_insensitive_marisa_trie()
        wikititle_id_by_id = {}
        fname_prefix = self.get_intermediate_path()+'wikititle_id_by_id.'
        if case_insensitive:
            fname_prefix = fname_prefix + 'case_insensitive.'

        count = 1
        collision_count = 1
        line = ''

        duplicate_ids_by_wikititle = {}

        while count < 25000000:  # TODO remove magic number; readline() returns '' at EOF, so the break below terminates the loop
            log_progress = count < 50000 and count % 10000 == 0
            if log_progress:
                self.logger.info('starting gc ')
                gc.collect()  # have no real reason to think this is needed or will help the memory issue
                self.logger.info('%d lines processed', count)

            save_progress = count % 1000000 == 0 or count == 10
            if save_progress:
                self.logger.info('%d lines processed', count)
                wikititle_by_id_filename = fname_prefix + str(count) + '.pickle'
                self.logger.info('about to save to %s', wikititle_by_id_filename)
                with open(wikititle_by_id_filename, 'wb') as handle:
                    pickle.dump(wikititle_id_by_id, handle, protocol=pickle.HIGHEST_PROTOCOL)
                self.logger.info('written  %s', wikititle_by_id_filename)

            line = input_file.readline()
            if line:
                data = json.loads(line)
                # pprint.pprint(data)
                if case_insensitive:
                    wikititle = data['wikiTitle'].lower()
                else:
                    wikititle = data['wikiTitle']

                wt_id = wikititle_mt[wikititle]
                wid = data['wid']
                wikititle_id_by_id[wid] = wt_id

            else:
                break

            count += 1

        self.logger.info('%d lines processed', count)
        wikititle_by_id_filename = fname_prefix + str(count) + '.pickle'
        self.logger.info('about to save to %s', wikititle_by_id_filename)
        with open(wikititle_by_id_filename, 'wb') as handle:
            pickle.dump(wikititle_id_by_id, handle, protocol=pickle.HIGHEST_PROTOCOL)
        self.logger.info('written  %s', wikititle_by_id_filename)
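check_for_wikititle_collisions streams the gzipped JSON-lines dump one record at a time rather than loading it whole. A minimal sketch of that reading pattern, assuming a hypothetical dump.json.gz whose lines carry the same 'wid' and 'wikiTitle' keys used above:

    import gzip
    import json

    with gzip.open('dump.json.gz', 'rt', encoding='utf-8') as input_file:
        for line in input_file:
            if not line.strip():
                continue
            data = json.loads(line)
            print(data['wid'], data['wikiTitle'].lower())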
Example 4
    def __init__(self):
        # Set up logging
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
        self.logger = logging.getLogger(__name__)
        self.logger.addHandler(handler)
        self.logger.propagate = False
        self.logger.setLevel(logging.INFO)

        # instance variables
        self.wiki_ds = WikipediaDataset()
Example 5
    def __init__(self):
        # set up logging
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter(
                '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
        self.logger = logging.getLogger(__name__)
        self.logger.addHandler(handler)
        self.logger.propagate = False
        self.logger.setLevel(logging.INFO)
        # set up instance variables
        wds = WikipediaDataset()
        self.intermediate_path = FileLocations.get_temp_path()
        self.spotlight_util = SpotlightUtil()
Example 6
    def get_links_totally_within(self, entity_id_list):
        from_list = []
        to_list = []
        value_list = []
        v0_vertice_set = set(entity_id_list)
        wds = WikipediaDataset()
        for entity_id in v0_vertice_set:
            links_to = wds.get_links_to(entity_id)
            for link_to in links_to:
                if link_to in v0_vertice_set:
                    to_list.append(entity_id)
                    from_list.append(link_to)
                    value_list.append(1)
        return from_list, to_list, value_list
Example 7
def unit_test_2():
    wds = WikipediaDataset()

    check_links(1563047,   7412236, wds) # steve_jobs
    check_links(16360692, 57564770, wds)
    check_links(2678997,  57564127, wds)
    check_links(37717778, 57563280, wds)
    check_links(43375967, 57563305, wds)
    check_links(46991680, 57563292, wds)
    check_links(51332113, 57564772, wds)
    check_links(52466986, 57563202, wds)
    check_links(52679129, 57563204, wds)
    check_links(57562759, 57565023, wds)
    check_links(57564483, 57564503, wds)
    check_links(57564520, 57564533, wds)
    check_links(57565377, 57565381, wds)
    check_links(57565437, 57565531, wds)
    check_links(603291,   57564623, wds)
    check_links(9422390,  57563903, wds)
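check_links itself is not shown in this listing. One plausible shape for such a helper, purely illustrative and using only the get_links_to / get_links_from calls that appear elsewhere in these examples, would inspect the edge between a pair of entities from both link indexes:

    def check_links(entity_id_a, entity_id_b, wds):
        # hypothetical sketch: the edge a -> b seen from the out-link and in-link indexes
        out_hit = entity_id_b in set(wds.get_links_from(entity_id_a))
        in_hit = entity_id_a in set(wds.get_links_to(entity_id_b))
        print(entity_id_a, '->', entity_id_b, ': out-link index', out_hit, ', in-link index', in_hit)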
Example 8
def train_model():
    X, y, docid_array, entity_id_array = load_feature_matrix(
        feature_filename=INTERMEDIATE_PATH +
        'dexter_all_heavy_catted_8_7_2018.txt',
        feature_names=feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=40,
        tmp_filename='/tmp/temp_conversion_file.txt')

    # train only on records we have a golden salience for
    fg = FilterGolden()
    logger.info('X Shape = %s', X.shape)
    logger.info('y Shape = %s', y.shape)

    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()

    X2, y2, docid2, entityid2 = fg.get_only_golden_rows(
        X, y, docid_array, entity_id_array, dexter_dataset, wikipedia_dataset)

    logger.info('X2 Shape = %s', X2.shape)
    logger.info('y2 Shape = %s', y2.shape)

    wrapper = GBRTWrapper()
    gbrt = wrapper.train_model_no_split(X2, y2, n_estimators=40)
    logger.info('trained')
    # gbrt.save_model()

    # from https://shankarmsy.github.io/stories/gbrt-sklearn.html
    # One of the benefits of growing trees is that we can understand how important each feature is
    print("Feature Importances")
    print(gbrt.feature_importances_)
    print()
    # Let's print the R-squared value for train/test. This shows how much of the variance in the data the model
    # is able to explain.
    print("R-squared for Train: %.2f" % gbrt.score(X2, y2))
    # print ("R-squared for Test: %.2f" %gbrt.score(X_test, y_test) )
    # - See more at: https://shankarmsy.github.io/stories/gbrt-sklearn.html#sthash.JNZQbnph.dpuf
    return gbrt, X2, y2, docid2, entityid2
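GBRTWrapper is project-specific, but the feature_importances_ and score calls at the end follow scikit-learn's gradient-boosting API directly. A minimal sketch of just that part on synthetic data (the wrapper and its train/no-split handling are assumptions not reproduced here):

    import numpy as np
    from sklearn.ensemble import GradientBoostingRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(200, 5)
    y = 3.0 * X[:, 0] + rng.normal(scale=0.1, size=200)  # only the first feature matters

    gbrt = GradientBoostingRegressor(n_estimators=40)
    gbrt.fit(X, y)
    print('Feature Importances', gbrt.feature_importances_)  # first entry should dominate
    print('R-squared for Train: %.2f' % gbrt.score(X, y))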
Example 9
    def go(self, filename, feature_names, filter_only_golden):
        X, y, docid_array, entity_id_array = load_feature_matrix(feature_filename=filename,
                                                                 feature_names=feature_names,
                                                                 entity_id_index=1,
                                                                 y_feature_index=2,
                                                                 first_feature_index=4,
                                                                 number_features_per_line=len(feature_names) + 4,
                                                                 tmp_filename='/tmp/temp_conversion_file.txt'
                                                                 )

        # train only on records we have a golden salience for
        self.logger.info('__________________________',)
        self.logger.info('File %s', filename)
        self.logger.info('X Shape = %s', X.shape)
        self.logger.info('y Shape = %s', y.shape)

        if filter_only_golden:
            dexterDataset = DatasetDexter()
            wikipediaDataset = WikipediaDataset()
            fg = sellibrary.filter_only_golden.FilterGolden()
            X, y, docid_array, entity_id_array = fg.get_only_golden_rows(X, y, docid_array, entity_id_array, dexterDataset, wikipediaDataset)
            self.logger.info('After filtering only golden rows:')
            self.logger.info('X Shape = %s', X.shape)
            self.logger.info('y Shape = %s', y.shape)

        self.logger.info('y[1:10] %s', y[1:10])
        self.logger.info('y[y > 0] %s', y[y > 0.0])

        y[y < 2.0] = 0
        y[y >= 2.0] = 1

        ig = self.information_gain_v2(X, y)
        self.logger.info('ig %s', ig)
        self.logger.info('ig shape %s', ig.shape)

        d = {}
        for i in range(len(feature_names)):
            d[feature_names[i]] = ig[i]

        self.sort_and_print(d)
        return d
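information_gain_v2 is not shown in this listing, but the per-feature quantity it reports is the usual entropy difference IG(feature) = H(y) - H(y | feature), computed here after y has been binarised at the salience threshold of 2.0. A small self-contained sketch of that calculation for one feature, binned into quartiles (the binning choice is an assumption for illustration, not necessarily what information_gain_v2 does):

    import numpy as np

    def entropy(labels):
        _, counts = np.unique(labels, return_counts=True)
        p = counts / counts.sum()
        return -np.sum(p * np.log2(p))

    def information_gain(feature, y, bins=4):
        # discretise the feature, then subtract the conditional entropy from H(y)
        edges = np.quantile(feature, np.linspace(0, 1, bins + 1))
        binned = np.digitize(feature, edges[1:-1])
        h_y_given_x = 0.0
        for b in np.unique(binned):
            mask = binned == b
            h_y_given_x += mask.mean() * entropy(y[mask])
        return entropy(y) - h_y_given_x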
Example 10
    def __init__(self, features_to_zero=None):

        # instance variables
        self.ds = WikipediaDataset()
        self.features_to_zero = features_to_zero if features_to_zero is not None else []
Example 11
    output_filename = dropbox_intermediate_path + 'wp_joined.txt'  #'joined_sel_sent_and_tf.txt'

    # Load File A
    X1, y1, docid_array1, entity_id_array1 = load_feature_matrix(
        feature_filename=filename_A,
        feature_names=file_A_feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(file_A_feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt')

    print(y1.shape)
    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()
    # fg = FilterGolden()
    # X1, y1, docid_array1, entity_id_array1 = fg.get_only_golden_rows(X1, y1, docid_array1, entity_id_array1, dexter_dataset,
    #                                                     wikipedia_dataset)

    document_list = dexter_dataset.get_dexter_dataset(
        path=FileLocations.get_dropbox_dexter_path())
    golden_saliency_by_entid_by_docid = dexter_dataset.get_golden_saliency_by_entid_by_docid(
        document_list, wikipedia_dataset)

    print(y1.shape)

    # Load File B
    X2, y2, docid_array2, entity_id_array2 = load_feature_matrix(
        feature_filename=filename_B,
        feature_names=file_B_feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(file_B_feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt')
Example 12
import logging

from sellibrary.wiki.wikipedia_datasets import WikipediaDataset

# set up logging
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.propagate = False
logger.setLevel(logging.INFO)

if __name__ == "__main__":
    ds = WikipediaDataset()
    # this requires extract_curid_by_wikititle_trie to have been run first
    ds.extract_graph_from_compressed()
Example 13
class GraphUtils:
    # Set up logging
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
    logger = logging.getLogger(__name__)
    logger.addHandler(handler)
    logger.propagate = False
    logger.setLevel(logging.INFO)
    wds = WikipediaDataset()
    # constants
    REALLY_BIG_NUMBER = 100
    VERY_SMALL_NUMBER = 0.001

    def __init__(self):
        pass

    def relateness(self, entity_id_a, entity_id_b):
        # return the Milne and Witten relatedness value
        link_to_a = set(self.wds.get_links_to(entity_id_a))
        link_to_b = set(self.wds.get_links_to(entity_id_b))
        intersect = link_to_a.intersection(link_to_b)
        size_a = len(link_to_a)
        size_b = len(link_to_b)
        size_int = len(intersect)
        self.logger.debug(' %d, %d %d ', size_a, size_b, size_int)
        p1 = np.log2(max(size_a, size_b))
        p2 = np.log2(max(size_int, 1))
        p3 = np.log2(5)  # this needs to be set correctly - but as we just take the median, it may not matter
        p4 = np.log2(max(1, min(size_a, size_b)))
        if p3 == p4:
            self.logger.warning(
                'Error calculating relatedness, denominator is 0. Can only crudely estimate. p1=%f, p2=%f, p3=%f, p4=%f ',
                p1, p2, p3, p4)
            relatedness = (p1 - p2) / GraphUtils.VERY_SMALL_NUMBER
        else:
            relatedness = (p1 - p2) / (p3 - p4)
        return relatedness

    def calc_v1_matrix(self, entity_id_list):
        # find links among the nodes we are passed, plus all links
        # inbound to them and outbound from them
        from_list = []
        to_list = []
        value_list = []
        max_id = 0
        wds = WikipediaDataset()
        for from_entity_id in entity_id_list:
            link_to = wds.get_links_to(from_entity_id)
            for v in link_to:
                from_list.append(from_entity_id)
                to_list.append(v)
                value_list.append(1)
                if v > max_id:
                    max_id = v

            link_from = set(wds.get_links_from(from_entity_id))
            for v in link_from:
                from_list.append(v)
                to_list.append(from_entity_id)
                value_list.append(1)
                if v > max_id:
                    max_id = v
        # TODO The following line threw a ValueError (row index exceeds matrix dimensions) here on docid 579, and docid 105
        try:
            mtx = sparse.coo_matrix((value_list, (from_list, to_list)),
                                    shape=(max_id + 1, max_id + 1))
        except ValueError as e:
            self.logger.warning(
                'An error occurred; returning None rather than a V1 matrix. %s',
                e)
            return None
        return mtx

    def get_links_totally_within(self, entity_id_list):
        from_list = []
        to_list = []
        value_list = []
        v0_vertice_set = set(entity_id_list)
        wds = WikipediaDataset()
        for entity_id in v0_vertice_set:
            links_to = wds.get_links_to(entity_id)
            for link_to in links_to:
                if link_to in v0_vertice_set:
                    to_list.append(entity_id)
                    from_list.append(link_to)
                    value_list.append(1)
        return from_list, to_list, value_list

    def calc_v0_matrix(self, entity_id_list):
        # only find links that are within the set of nodes we are passed
        from_list, to_list, value_list = self.get_links_totally_within(
            entity_id_list)
        all_ids = []
        all_ids.extend(from_list)
        all_ids.extend(to_list)
        try:
            if len(all_ids) > 0:
                max_id = max(all_ids)  # all_ids could be empty
            else:
                max_id = 1  # this occurred on docid = 214
            mtx = sparse.coo_matrix((value_list, (from_list, to_list)),
                                    shape=(max_id + 1, max_id + 1))
        except ValueError as e:
            self.logger.warning(
                'Could not calculate coo matrix. from_list = %s, to_list = %s, value_list = %s ',
                from_list, to_list, value_list)
            logging.exception('')
            mtx = None
        return mtx

    def get_diameter(self,
                     mtx,
                     entity_id_list,
                     print_names=False,
                     break_early=False,
                     optional_docId=-1):
        fairness_by_entity_id = {}
        for entity_id in entity_id_list:
            fairness_by_entity_id[entity_id] = 0

        self.logger.info(
            'docid = %s, Calculating distances for %d entities. Approx duration %d sec (%f min)',
            str(optional_docId), len(entity_id_list),
            len(entity_id_list) * 3,
            len(entity_id_list) * 3 / 60.0)

        max_dist = 0
        count = 0
        for entity_id_1 in entity_id_list:
            self.logger.info('%d/%d Calculating distances from entity_id %d ',
                             count, len(entity_id_list), entity_id_1)
            distances, predecessors = dijkstra(mtx,
                                               indices=entity_id_1,
                                               return_predecessors=True)
            for entity_id_2 in entity_id_list:
                if print_names:
                    # TODO load cache and print names
                    e1_name = str(entity_id_1)
                    e2_name = str(entity_id_2)
                    print('from ', e1_name, '(', entity_id_1, ') to', e2_name,
                          '(', entity_id_2, ') distance',
                          distances[entity_id_2])
                d = distances[entity_id_2]
                if not np.isinf(d):
                    if d > max_dist:
                        max_dist = d

                    fairness_by_entity_id[entity_id_1] += d
                    fairness_by_entity_id[entity_id_2] += d
            count += 1
            if break_early and count > 3:
                self.logger.warning(
                    'Breaking early, so we will have a smaller graph. ')
                break

        print('diameter ', max_dist)
        return max_dist, fairness_by_entity_id

    def get_mean_median_in_degree(self,
                                  mtx,
                                  full_set_entity_ids,
                                  break_early=False):
        if break_early:
            self.logger.warning('Breaking early, returning made up results')
            return 1, 2
        if mtx is None:
            return 0, 0

        csc = mtx.tocsc()
        in_degrees = []
        for entity_id in full_set_entity_ids:
            s = csc.getcol(entity_id).sum()
            in_degrees.append(s)
        mean = np.mean(in_degrees)
        median = np.median(in_degrees)
        return mean, median

    def get_mean_median_out_degree(self,
                                   mtx,
                                   full_set_entity_ids,
                                   break_early=False):
        if break_early:
            self.logger.warning('Breaking early, returning made up results')
            return 1, 2

        if mtx is None:
            return 0, 0

        csr = mtx.tocsr()
        out_degrees = []
        for entity_id in full_set_entity_ids:
            s = csr.getrow(entity_id).sum()
            out_degrees.append(s)
        mean = np.mean(out_degrees)
        median = np.median(out_degrees)
        return mean, median

    def get_mean_median_degree(self,
                               mtx,
                               full_set_entity_ids,
                               break_early=False):
        degree_by_entity_id = {}
        if break_early:
            self.logger.warning('Breaking early, returning made up results')
            for entity_id in full_set_entity_ids:
                degree_by_entity_id[entity_id] = 1
            return 1, 2, degree_by_entity_id

        if mtx is None:
            for entity_id in full_set_entity_ids:
                degree_by_entity_id[entity_id] = 0
            return 0, 0, degree_by_entity_id

        csc = mtx.tocsc()
        for entity_id in full_set_entity_ids:
            s = csc.getcol(entity_id).sum()
            if entity_id in degree_by_entity_id:
                degree_by_entity_id[entity_id] += s
            else:
                degree_by_entity_id[entity_id] = s

        csr = mtx.tocsr()
        for entity_id in full_set_entity_ids:
            s = csr.getrow(entity_id).sum()
            if entity_id in degree_by_entity_id:
                degree_by_entity_id[entity_id] += s
            else:
                degree_by_entity_id[entity_id] = s

        x = list(degree_by_entity_id.values())
        mean = np.mean(x)
        median = np.median(x)
        return mean, median, degree_by_entity_id

    def get_degree_for_entity(self, mtx, entity_id):
        csc = mtx.tocsc()
        s1 = csc.getcol(entity_id).sum()
        csr = mtx.tocsr()
        s2 = csr.getrow(entity_id).sum()
        return s1 + s2

    def get_closeness_by_entity_id(self, fairness_by_entity_id):
        closeness_by_entity_id = {}
        for entity_id in fairness_by_entity_id.keys():
            if fairness_by_entity_id[entity_id] != 0.0:
                closeness_by_entity_id[entity_id] = 1.0 / fairness_by_entity_id[entity_id]
            else:
                closeness_by_entity_id[entity_id] = GraphUtils.REALLY_BIG_NUMBER

        return closeness_by_entity_id

    def get_dense_down_sampled_adj_graph(self, mtx):
        # re-index the matrix entity ids into a compact 0..n id space
        entity_id_by_short_id = {}
        short_id_by_entity_id = {}

        t1 = 0
        t2 = 0
        if len(mtx.col) > 0:
            t1 = mtx.col.max()  # get max of this ndarray
        if len(mtx.row) > 0:
            t2 = mtx.row.max()  # get max of this ndarray
        max_id = max(t1, t2) + 1

        full_set_entity_ids = []
        full_set_entity_ids.extend(mtx.col)
        full_set_entity_ids.extend(mtx.row)
        count = 0
        for entity_id in full_set_entity_ids:
            entity_id_by_short_id[count] = entity_id
            short_id_by_entity_id[entity_id] = count
            count += 1

        # down sample the sparse matrix
        from_list = []
        to_list = []
        value_list = mtx.data
        for i in range(len(mtx.row)):
            from_list.append(short_id_by_entity_id[mtx.row[i]])
            to_list.append(short_id_by_entity_id[mtx.col[i]])

        max_id = 1
        if len(from_list) > 0:
            max_id = max(max_id, max(from_list)) + 1
        if len(to_list) > 0:
            max_id = max(max_id, max(to_list)) + 1

        mtx_small = sparse.coo_matrix((value_list, (from_list, to_list)),
                                      shape=(max_id, max_id))
        # build a networkx graph in the down-sampled id space
        dense = nx.from_scipy_sparse_matrix(mtx_small)

        return dense, entity_id_by_short_id, short_id_by_entity_id, from_list, to_list, mtx_small

    def calc_centrality(self, mtx, full_set_entity_ids):

        centrality_by_entity_id = {}
        if mtx is None:
            for entity_id in full_set_entity_ids:
                centrality_by_entity_id[entity_id] = 0.0
            return centrality_by_entity_id

        # down-sample the sparse matrix into a compact id space
        dense, entity_id_by_short_id, short_id_by_entity_id, from_list, to_list, mtx_small = self.get_dense_down_sampled_adj_graph(
            mtx)

        # calc centrality
        try:
            centrality = nx.eigenvector_centrality_numpy(dense)
            # convert centrality index back to the original space
            for k in centrality.keys():
                centrality_by_entity_id[entity_id_by_short_id[k]] = centrality[k]
            self.logger.info(centrality_by_entity_id)

        except (ValueError, TypeError, KeyError, nx.NetworkXException):
            self.logger.warning('Could not calculate centrality. Defaulting to 1.')
            for entity_id in full_set_entity_ids:
                centrality_by_entity_id[entity_id] = 1
            # self.logger.warning('mtx_small %s:', mtx_small)
            self.logger.warning("Nodes in G: %s ", dense.nodes(data=True))
            self.logger.warning("Edges in G: %s ", dense.edges(data=True))
            logging.exception('')

        return centrality_by_entity_id

    def calc_all_features(self, mtx, break_early=False, optional_docId=-1):
        full_set_entity_ids = self.get_unique_set_of_entity_ids(mtx)
        if break_early:
            self.logger.warning("Limiting the number of heavy entities to 5")
            l = list(full_set_entity_ids)
            full_set_entity_ids = set(l[0:min(5, len(l))])

        self.logger.info(
            'Calculating diameter & fairness on matrix with %d vertices',
            len(full_set_entity_ids))
        diameter, fairness_by_entity_id = self.get_diameter(
            mtx,
            full_set_entity_ids,
            break_early=break_early,
            optional_docId=optional_docId)
        feature_1_graph_size = len(full_set_entity_ids)
        self.logger.info('graph size: %d', feature_1_graph_size)
        feature_2_graph_diameter = diameter
        self.logger.info('diameter: %d', diameter)
        mean, median = self.get_mean_median_in_degree(mtx, full_set_entity_ids,
                                                      break_early)
        if median == 0.0:
            self.logger.warning('mean: %f median: %f', mean, median)
            feature_4_in_degree_mean_median = 0  # this can happen from small sets of input entities with no links between them
        else:
            feature_4_in_degree_mean_median = mean / median
        self.logger.info('in degree mean/median: %f',
                         feature_4_in_degree_mean_median)
        mean, median = self.get_mean_median_out_degree(mtx,
                                                       full_set_entity_ids,
                                                       break_early)
        if median == 0.0:
            self.logger.warning('mean: %f median: %f', mean, median)
            feature_5_out_degree_mean_median = 0  # valid for this to be 0
        else:
            feature_5_out_degree_mean_median = mean / median
        self.logger.info('out degree mean/median: %f',
                         feature_5_out_degree_mean_median)
        self.logger.info('calculating mean and median degrees.')
        mean, median, degree_by_entity_id = self.get_mean_median_degree(
            mtx, full_set_entity_ids, break_early=break_early)
        feature_3_node_degree_by_entity_id = degree_by_entity_id
        self.logger.info('node_degree_by_entity_id: %s',
                         feature_3_node_degree_by_entity_id)
        if median == 0.0:
            self.logger.warning('mean: %f median: %f', mean, median)
            feature_6_degree_mean_median = 0  # valid for this to be 0
        else:
            feature_6_degree_mean_median = mean / median
        self.logger.info('degree mean/median: %f',
                         feature_6_degree_mean_median)
        feature_7_fairness_by_entity_id = fairness_by_entity_id
        self.logger.info('fairness_by_entity_id: %s', fairness_by_entity_id)
        feature_8_closeness_by_entity_id = self.get_closeness_by_entity_id(
            fairness_by_entity_id)
        self.logger.info('closeness by entity id: %s',
                         feature_8_closeness_by_entity_id)
        feature_9_centrality_by_entity_id = self.calc_centrality(
            mtx, full_set_entity_ids)
        self.logger.info('centrality by entity id: %s',
                         feature_9_centrality_by_entity_id)
        return feature_1_graph_size, feature_2_graph_diameter, feature_3_node_degree_by_entity_id, feature_4_in_degree_mean_median, \
               feature_5_out_degree_mean_median, feature_6_degree_mean_median, feature_7_fairness_by_entity_id, feature_8_closeness_by_entity_id, feature_9_centrality_by_entity_id

    def filter_low_milne_and_witten_relatedness(self, mtx):
        if mtx is None:
            return None

        self.logger.info('Calculating milne and witten relatedness')
        col_values = []
        row_values = []
        data_values = []
        max_id = 0
        for i in range(len(mtx.data)):
            from_entity_id = mtx.row[i]
            to_entity_id = mtx.col[i]
            relatedness = self.relateness(from_entity_id, to_entity_id)
            if relatedness > 0.0:
                col_values.append(mtx.col[i])
                row_values.append(mtx.row[i])
                data_values.append(mtx.data[i])
                if mtx.col[i] > max_id:
                    max_id = mtx.col[i]
                if mtx.row[i] > max_id:
                    max_id = mtx.row[i]

        mtx = sparse.coo_matrix((data_values, (row_values, col_values)),
                                shape=(max_id + 1, max_id + 1))
        return mtx

    def get_unique_set_of_entity_ids(self, mtx):
        if mtx is None:
            return set()
        full_set = set(mtx.col)
        full_set.update(mtx.row)
        return full_set
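The relateness method above is the Milne and Witten measure computed with log2 and a placeholder (np.log2(5)) where the size of Wikipedia should appear. For reference, the same quantity written as a standalone function with the total article count passed in explicitly; the parameter names and the handling of disjoint link sets are illustrative choices, not taken from the project (the method above instead clamps the overlap to 1):

    import numpy as np

    def milne_witten_relatedness(links_to_a, links_to_b, total_articles):
        # links_to_a / links_to_b: collections of article ids that link to entities a and b
        a, b = set(links_to_a), set(links_to_b)
        overlap = len(a & b)
        if overlap == 0 or not a or not b:
            return 0.0  # convention here: treat disjoint link sets as unrelated
        numerator = np.log(max(len(a), len(b))) - np.log(overlap)
        denominator = np.log(total_articles) - np.log(min(len(a), len(b)))
        return numerator / denominator if denominator != 0 else 0.0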