Example #1
def _display(self):
    # relies on the module-level imports math and operator and on the
    # project's para_wn module
    if self._mode == 'basic':
        rows = self._cursor.fetchall()
        if not rows:
            return ''
        newrows = []
        for row in rows:
            src, tgt, pe2e1, rel, pivnum, piv, dist = row
            # convert the stored negative log-probability back to a probability
            pe2e1 = math.exp(-float(pe2e1))
            # general format for tiny values, zero-padded fixed-point otherwise
            pe2e1str = '{:>6.4}'.format(pe2e1) if pe2e1 < 0.0001 else '{:0<6.4f}'.format(pe2e1)
            rel_or_path = para_wn.get_relation_name(rel)
            if rel_or_path == 'undefined relation':
                rel_or_path = 'WN distance=' + str(dist) if dist >= 0 else 'not connected in WN'
            newrows.append([src.encode('utf-8'), tgt.encode('utf-8'), pe2e1str, rel_or_path, pivnum, piv.encode('utf-8'), dist])
        # width of the longest entry in each column, used for alignment
        srclens = map(len, map(operator.itemgetter(0), newrows))
        trglens = map(len, map(operator.itemgetter(1), newrows))
        probstrlens = map(len, map(operator.itemgetter(2), newrows))
        relationlens = map(len, map(operator.itemgetter(3), newrows))
        maxsrclen, maxtrglen, maxproblen, maxrellen = max(srclens), max(trglens), max(probstrlens), max(relationlens)
        return self._format_display(newrows, self._interactive, maxsrclen, maxtrglen, maxproblen, maxrellen)
    # display the count results
    else:
        rows = self._cursor.fetchall()
        if not rows:
            return ''
        res = ['']
        rows = sorted(rows)
        # map the raw group keys to human-readable labels
        if self._group_by == 'relation':
            rows = [(para_wn.get_relation_name(group), count) for (group, count) in rows]
        elif self._group_by == 'samepos':
            rows = [(self._POS_IDX_TO_VALUES[group], count) for (group, count) in rows]
        else:
            rows = [(str(group), count) for (group, count) in rows]
        grouplens = map(len, map(operator.itemgetter(0), rows))
        maxgrouplen = max(grouplens)
        for group, count in rows:
            if group == '':
                res.append(str(count))
            else:
                res.append('{}\t{}'.format(group.rjust(maxgrouplen), count))
        res.append('')
        return '\n'.join(res)
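The 'basic' branch above converts each stored score back into a probability with math.exp(-x), picks a format string by magnitude, and pads every column to the width of its longest entry before delegating to _format_display. A minimal standalone sketch of that formatting idea, with made-up data (rows, formatted and widths are illustrative names, not part of the original class):

import math

rows = [('cat', 'feline', 4.6), ('dog', 'canine', 0.2)]  # (source, target, -log probability)

formatted = []
for src, tgt, neg_log_p in rows:
    p = math.exp(-neg_log_p)
    # general format (may fall back to scientific notation) for tiny values,
    # zero-padded fixed-point otherwise
    p_str = '{:>6.4}'.format(p) if p < 0.0001 else '{:0<6.4f}'.format(p)
    formatted.append((src, tgt, p_str))

# pad every column to the width of its longest entry so the rows line up
widths = [max(len(row[i]) for row in formatted) for i in range(3)]
for row in formatted:
    print('  '.join(value.ljust(width) for value, width in zip(row, widths)))

Computing the widths once over the whole result set, as the method does, keeps the output aligned no matter how long individual sources, targets or relation names are.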
Example #2
def analyze_rules(all_rules, percentile_scores):
    # 'whole', 'parts', 'pivots', 'distances', 'per_source' and 'undefined'
    # are module-level constants defined elsewhere in the module
    db_size = len(all_rules)
    data = {}
    # per-relation counts and a random sample for the whole collection
    data[whole] = {}
    data[whole]['sample'] = []
    # pivot counts, WN distances and per-source statistics, kept both for the
    # whole collection and for each percentile part
    data[pivots] = {}
    data[pivots][whole] = []
    data[distances] = {}
    data[distances][whole] = {}
    data[distances][whole]['values'] = []
    data[per_source] = {}
    data[per_source][whole] = {}
    data[per_source][whole]['tgtnum'] = []
    for part in parts:
        data[part] = {}
        data[pivots][part] = []
        data[distances][part] = {}
        data[distances][part]['values'] = []
        data[per_source][part] = {}
        data[per_source][part]['tgtnum'] = []

    # sample 3% of the collection, clamped to between 1 and 25 rules
    random_sample_size = max(int(len(all_rules) * 0.03), 1)
    random_sample_size = min(random_sample_size, 25)

    data[whole]['sample'] = get_rules_sample(all_rules, [15, 40, 60, 85], random_sample_size)

    cnt = 0
    ten_percent_len = db_size / 10
    w1 = ""
    stats_per_source = {}
    stats_per_source[whole] = {}
    stats_per_source[whole]['tgtnum'] = 0
    for part in parts:
        stats_per_source[part] = {}
        stats_per_source[part]['tgtnum'] = 0
    while len(all_rules) > 0:
        cnt = cnt + 1
        # crude progress indicator: report whenever the number of processed
        # rules hits a 10% boundary of the database size
        for percent in range(10):
            progress = percent * ten_percent_len
            if progress == cnt:
                sys.stdout.write(str(percent * 10) + "%... ")
                sys.stdout.flush()
                break
        rule = all_rules.pop(0)
        # the rules must be sorted by source (rule[0]); when a new source is
        # reached, save and reset the per-source statistics
        if w1 != rule[0]:
            # save the current stats_per_source to data
            if w1 != "":
                # parts + whole
                for x in stats_per_source.keys():
                    # relations + tgtnum
                    for y in stats_per_source[x].keys():
                        if y not in data[per_source][x].keys():
                            data[per_source][x][y] = []
                        if stats_per_source[x][y] > 0:
                            data[per_source][x][y].append(stats_per_source[x][y])
            # reset statistics per source
            w1 = rule[0]
            stats_per_source = {}
            stats_per_source[whole] = {}
            stats_per_source[whole]['tgtnum'] = 0
            for part in parts:
                stats_per_source[part] = {}
                stats_per_source[part]['tgtnum'] = 0

        w2 = rule[1]
        prob = get_prob(rule)
        # assign the rule to a percentile part according to its probability
        part = get_part(prob, percentile_scores)
        rel = wn.get_relation_name(rule[3])
        pivotnum = int(rule[4])
        dist = get_distance(int(rule[6]))
        if rel not in data[whole].keys():
            data[whole][rel] = 0

        # count how many times each of the relations was observed in the whole collection
        data[whole][rel] = data[whole][rel] + 1
        data[pivots][whole].append(pivotnum)

        # increase the count of targets per current source by 1
        stats_per_source[whole]['tgtnum'] += 1
        if rel not in stats_per_source[whole].keys():
            stats_per_source[whole][rel] = 0
        # increase the count of the current relation for this source by 1
        stats_per_source[whole][rel] += 1

        # for undefined relations, save WN distance
        if rel == undefined:
            if dist not in data[distances][whole].keys():
                data[distances][whole][dist] = []
            # save the rule with that distance
            data[distances][whole][dist].append(rule)
            # save the distance itself
            data[distances][whole]['values'].append(int(rule[6]))

        if part in data.keys():
            if rel not in data[part].keys():
                data[part][rel] = []
            data[part][rel].append(rule)
            data[pivots][part].append(pivotnum)
            if rel == undefined:
                # for undefined relations, save WN distance
                if dist not in data[distances][part].keys():
                    data[distances][part][dist] = []
                data[distances][part][dist].append(rule)
                data[distances][part]['values'].append(int(rule[6]))

            # increase the count of targets per current source by 1
            stats_per_source[part]['tgtnum'] += 1
            if rel not in stats_per_source[part].keys():
                stats_per_source[part][rel] = 0
            # increase the count of the current relation for this source by 1
            stats_per_source[part][rel] += 1

    print('')  # newline to finish the progress output

    # save the last stats_per_source to data
    # parts + whole
    for x in stats_per_source.keys():
        # relations + tgtnum
        for y in stats_per_source[x].keys():
            if y not in data[per_source][x].keys():
                data[per_source][x][y] = []
            if stats_per_source[x][y] > 0:
                data[per_source][x][y].append(stats_per_source[x][y])

    return data
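The returned data dictionary nests everything under the module-level key constants used above: per-relation counts and the random sample under whole, pivot counts under pivots, WN distances for rules with an undefined relation under distances, and per-source statistics under per_source. A minimal sketch of how a caller might read that structure (summarize_data is a hypothetical helper, not part of the original script, and the string values assigned to the key constants are assumptions made only so the snippet runs on its own):

# illustrative only: the real key constants are defined at module level in the
# original script; plain strings are assumed here so the sketch is self-contained
whole, pivots, distances, per_source = 'whole', 'pivots', 'distances', 'per_source'

def summarize_data(data):
    # relation counts for the whole collection (every key except 'sample')
    for key, value in data[whole].items():
        if key != 'sample':
            print('{}: {}'.format(key, value))
    pivot_counts = data[pivots][whole]
    if pivot_counts:
        print('average pivots per rule: {:.2f}'.format(
            float(sum(pivot_counts)) / len(pivot_counts)))
    # WN distances are only recorded for rules with an undefined relation
    print('rules with an undefined relation: {}'.format(
        len(data[distances][whole]['values'])))
    targets_per_source = data[per_source][whole]['tgtnum']
    if targets_per_source:
        print('average targets per source: {:.2f}'.format(
            float(sum(targets_per_source)) / len(targets_per_source)))

# usage: summarize_data(analyze_rules(all_rules, percentile_scores))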