# Example no. 1 (score: 0)
def compute_mi(dataset, target_name, vocab, level=None):
    """Computes mutual information between each vocab term and a 0/1 response.

    Args:
        dataset: project dataset object; must provide datafile_to_np(),
            whole_data_files, class_to_id_map, and input_varname().
        target_name: name of the response variable.
        vocab: iterable of feature strings to score.
        level: optional categorical level of the response. When given, the
            response is that level's indicator column; otherwise the
            continuous response is bucketed at its bottom/top 30%.

    Returns:
        Dict mapping each feature in vocab to its mutual information with
        the binarized response. Contingency counts use add-one smoothing,
        so every log term is defined.
    """
    if not level:
        # Bucket a continuous response: bottom 30% -> 0, top 30% -> 1.
        # Rows strictly between the thresholds keep their raw value and
        # are skipped by the label filter in the counting loop below.
        response = dataset.datafile_to_np(
            datafile=dataset.whole_data_files[target_name])
        response = response.toarray()
        low_threshold = utils.percentile(response, 0.3)
        high_threshold = utils.percentile(response, 0.7)
        response[response < low_threshold] = 0
        response[response > high_threshold] = 1
    else:
        id_map = dataset.class_to_id_map[target_name]
        level_idx = id_map[level]
        response = dataset.datafile_to_np(
            datafile=dataset.whole_data_files[target_name],
            feature_id_map=id_map)
        response = np.squeeze(response[:, level_idx].toarray())

    vocab = set(vocab)
    # Add-one smoothed 2x2 contingency counts per feature.
    feature_counts = defaultdict(
        lambda: {
            'n00': 1.,  # docs without term, 0 label
            'n01': 1.,  # docs without term, 1 label
            'n10': 1.,  # docs with term, 0 label
            'n11': 1.  # docs with term, 1 label
        })

    # `with` guarantees the input file is closed (the original leaked it).
    with open(dataset.whole_data_files[dataset.input_varname()]) as input_text:
        for line, label in zip(input_text, response):
            if label not in [0, 1]:
                continue
            tokens = set(line.strip().split())
            for feature in vocab:
                if label == 0:
                    key = 'n10' if feature in tokens else 'n00'
                else:
                    key = 'n11' if feature in tokens else 'n01'
                feature_counts[feature][key] += 1

    def mi(n00, n01, n10, n11):
        # Standard MI decomposition over the four contingency cells.
        n0_ = n00 + n01  # docs without term
        n1_ = n11 + n10  # docs with term
        n_0 = n10 + n00  # docs with 0 label
        n_1 = n11 + n01  # docs with 1 label
        n = n00 + n01 + n11 + n10  # total n

        mutual_info = (n11/n) * math.log((n * n11) / (n1_ * n_1)) + \
                      (n01/n) * math.log((n * n01) / (n0_ * n_1)) + \
                      (n10/n) * math.log((n * n10) / (n1_ * n_0)) + \
                      (n00/n) * math.log((n * n00) / (n0_ * n_0))
        return mutual_info

    # Dict comprehension replaces the Python-2-only `lambda (f, d):`
    # tuple-parameter unpacking (removed by PEP 3113).
    return {feature: mi(**counts) for feature, counts in feature_counts.items()}
def compute_ratios(dataset, target_name, vocab, level=None):
    """Computes odds ratios, returning a dict of each feature's ratio.

    Only features present in `vocab` are scored (the original docstring
    mentioned `feature_indices`, but this function takes feature strings).

    Args:
        dataset: project dataset object; must provide datafile_to_np(),
            whole_data_files, class_to_id_map, and input_varname().
        target_name: name of the response variable.
        vocab: iterable of feature strings to score.
        level: optional categorical level of the response. When given, the
            response is that level's indicator column; otherwise the
            continuous response is bucketed at its bottom/top 30%.

    Returns:
        Dict mapping feature -> odds ratio. Features with a zero cell in
        the denominator are silently omitted.
    """
    if not level:
        # Bucket a continuous response: bottom 30% -> 0, top 30% -> 1.
        # NOTE(review): unlike compute_mi, the datafile_to_np() result is
        # not densified with .toarray() here — confirm it is already dense.
        response = dataset.datafile_to_np(
            datafile=dataset.whole_data_files[target_name])
        low_threshold = utils.percentile(response, 0.3)
        high_threshold = utils.percentile(response, 0.7)
        response[response < low_threshold] = 0
        response[response > high_threshold] = 1
    else:
        id_map = dataset.class_to_id_map[target_name]
        level_idx = id_map[level]
        response = dataset.datafile_to_np(
            datafile=dataset.whole_data_files[target_name],
            feature_id_map=id_map)
        response = np.squeeze(response[:, level_idx].toarray())

    vocab = set(vocab)
    feature_counts = defaultdict(lambda: {0: 0, 1: 0})

    # `with` guarantees the input file is closed (the original leaked it).
    with open(dataset.whole_data_files[dataset.input_varname()]) as input_text:
        for line, label in zip(input_text, response):
            if label not in [0, 1]:
                continue
            for feature in line.strip().split():
                # Ignore features outside the vocab of interest.
                if feature not in vocab:
                    continue
                feature_counts[feature][label] += 1

    # Hoist the label totals out of the per-feature loop.
    num_label_0 = len(response[response == 0])
    num_label_1 = len(response[response == 1])

    ratios = {}
    for feature, counts in feature_counts.items():
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2938757/
        a = counts[0]
        b = counts[1]
        c = num_label_0 - a
        d = num_label_1 - b
        try:
            ratios[feature] = float(a * d) / (b * c)
        except ZeroDivisionError:
            # Degenerate contingency cell: skip this feature.
            pass

    return ratios
# Example no. 3 (score: 0)
    def _compute_ratios(self, dataset, target_name, feature_indices, level=None):
        """ computes odds ratios, returning a dict of each feature's ratio
            uses a subset of features, defined by feature_indices
        """
        if not level:
            # bucket based on bottom/top 30%
            response = np.copy(dataset.np_data[dataset.split][target_name].toarray())
            low_threshold = utils.percentile(response, 0.3)
            high_threshold = utils.percentile(response, 0.7)
            response[response < low_threshold] = 0
            response[response > high_threshold] = 1
        else:
            level_idx = dataset.class_to_id_map[target_name][level]
            response = dataset.np_data[dataset.split][target_name][:, level_idx].toarray()

        feature_indices = set(feature_indices)
        feature_counts = defaultdict(lambda: {0: 0, 1: 0})

        covariates = dataset.np_data[dataset.split][dataset.input_varname()]
        rows, cols = covariates.nonzero()
        for example, feature_idx in zip(rows, cols):
            if not feature_idx in feature_indices:  continue
            if not response[example][0] in [0, 1]: continue

            feature = dataset.ids_to_features[feature_idx]
            feature_counts[feature][response[example][0]] += 1

        ratios = {}
        for feature, counts in feature_counts.iteritems():
            # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2938757/
            a = feature_counts[feature][0]
            b = feature_counts[feature][1]
            c = len(response[response == 0]) - a
            d = len(response[response == 1]) - b
            try:
                ratios[feature] = float(a * d) / (b * c)
            except ZeroDivisionError:
                pass

        return ratios
  def test_percentile(self):
    """Tests percentile calculations."""
    # (input array, percentile, expected element at that percentile)
    cases = [
        (range(100), 0.7, 70),
        (range(150), 0.7, 105),
        (range(150), 0.0, 0),
        (range(150), 0.999, 149),
    ]
    for arr, pct, expected in cases:
      self.assertEqual(utils.percentile(arr, pct), expected)

    # A percentile of exactly 1.0 indexes one past the end.
    with self.assertRaises(IndexError):
      utils.percentile(range(150), 1.0)

    # Negative percentiles are rejected outright.
    with self.assertRaises(AssertionError):
      utils.percentile(range(150), -0.1)