def _display(self):
    """Format the results of the last query: aligned rule rows in 'basic'
    mode, otherwise grouped counts."""
    if self._mode == 'basic':
        rows = self._cursor.fetchall()
        if not rows:
            return ''
        newrows = []
        for row in rows:
            src, tgt, pe2e1, rel, pivnum, piv, dist = row
            pe2e1 = math.exp(-float(pe2e1))
            pe2e1str = '{:>6.4}'.format(pe2e1) if pe2e1 < 0.0001 else '{:0<6.4f}'.format(pe2e1)
            rel_or_path = para_wn.get_relation_name(rel)
            if rel_or_path == 'undefined relation':
                rel_or_path = 'WN distance=' + str(dist) if dist >= 0 else 'not connected in WN'
            newrows.append([src.encode('utf-8'), tgt.encode('utf-8'), pe2e1str,
                            rel_or_path, pivnum, piv.encode('utf-8'), dist])
        # column widths for the aligned output
        srclens = map(len, map(operator.itemgetter(0), newrows))
        trglens = map(len, map(operator.itemgetter(1), newrows))
        probstrlens = map(len, map(operator.itemgetter(2), newrows))
        relationlens = map(len, map(operator.itemgetter(3), newrows))
        maxsrclen, maxtrglen, maxproblen, maxrellen = (
            max(srclens), max(trglens), max(probstrlens), max(relationlens))
        return self._format_display(newrows, self._interactive,
                                    maxsrclen, maxtrglen, maxproblen, maxrellen)
    # display the count results
    else:
        rows = self._cursor.fetchall()
        if not rows:
            return ''
        res = ['']
        rows = sorted(rows)
        if self._group_by == 'relation':
            rows = [(para_wn.get_relation_name(group), count) for (group, count) in rows]
        elif self._group_by == 'samepos':
            rows = [(self._POS_IDX_TO_VALUES[group], count) for (group, count) in rows]
        else:
            rows = [(str(group), count) for (group, count) in rows]
        grouplens = map(len, map(operator.itemgetter(0), rows))
        maxgrouplen = max(grouplens)
        for group, count in rows:
            if group == '':
                res.append(str(count))
            else:
                res.append('{}\t{}'.format(group.rjust(maxgrouplen), count))
        res.append('')
        return '\n'.join(res)
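# Illustration only (not part of the original module): a minimal, standalone
# sketch of the probability formatting used in _display above. The threshold
# 0.0001 and the two format specs are copied from that code; the helper name
# and the sample values below are hypothetical.
def _format_prob_example(neg_log_prob):
    import math
    p = math.exp(-float(neg_log_prob))
    # very small probabilities fall back to the general format (which may
    # print in scientific notation); otherwise use 4 fixed decimals,
    # zero-padded on the right to a width of 6
    return '{:>6.4}'.format(p) if p < 0.0001 else '{:0<6.4f}'.format(p)

# e.g. _format_prob_example(0.5)  -> '0.6065'
#      _format_prob_example(12.0) -> '6.144e-06'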
def analyze_rules(all_rules, percentile_scores):
    """Collect statistics over the (source-sorted) paraphrase rules: per-relation
    counts, pivot counts, WN distances for undefined relations, and per-source
    counters, both for the whole collection and per part."""
    db_size = len(all_rules)
    data = {}
    data[whole] = {}
    data[whole]['sample'] = []
    data[pivots] = {}
    data[pivots][whole] = []
    data[distances] = {}
    data[distances][whole] = {}
    data[distances][whole]['values'] = []
    data[per_source] = {}
    data[per_source][whole] = {}
    data[per_source][whole]['tgtnum'] = []
    for part in parts:
        data[part] = {}
        data[pivots][part] = []
        data[distances][part] = {}
        data[distances][part]['values'] = []
        data[per_source][part] = {}
        data[per_source][part]['tgtnum'] = []
    # sample about 3% of the rules, but at least 1 and at most 25
    random_sample_size = max(int(len(all_rules) * 0.03), 1)
    random_sample_size = min(random_sample_size, 25)
    data[whole]['sample'] = get_rules_sample(all_rules, [15, 40, 60, 85], random_sample_size)
    cnt = 0
    ten_percent_len = db_size / 10
    w1 = ""
    stats_per_source = {}
    stats_per_source[whole] = {}
    stats_per_source[whole]['tgtnum'] = 0
    for part in parts:
        stats_per_source[part] = {}
        stats_per_source[part]['tgtnum'] = 0
    while len(all_rules) > 0:
        cnt = cnt + 1
        # print a rough progress indicator every 10% of the collection
        for percent in range(10):
            progress = percent * ten_percent_len
            if progress == cnt:
                sys.stdout.write(str(percent * 10) + "%... ")
                sys.stdout.flush()
                break
        rule = all_rules.pop(0)
        # the results must be sorted by source (rule[0]); when a new source is
        # reached, save and reset the statistics per source
        if w1 != rule[0]:
            # save the current stats_per_source to data
            if w1 != "":
                for x in stats_per_source.keys():  # parts + whole
                    for y in stats_per_source[x].keys():  # relations + tgtnum
                        if y not in data[per_source][x].keys():
                            data[per_source][x][y] = []
                        if stats_per_source[x][y] > 0:
                            data[per_source][x][y].append(stats_per_source[x][y])
            # reset statistics per source
            w1 = rule[0]
            stats_per_source = {}
            stats_per_source[whole] = {}
            stats_per_source[whole]['tgtnum'] = 0
            for part in parts:
                stats_per_source[part] = {}
                stats_per_source[part]['tgtnum'] = 0
        w2 = rule[1]
        prob = get_prob(rule)
        part = get_part(prob, percentile_scores)
        rel = wn.get_relation_name(rule[3])
        pivotnum = int(rule[4])
        dist = get_distance(int(rule[6]))
        if rel not in data[whole].keys():
            data[whole][rel] = 0
        # count how many times each of the relations was observed in the whole collection
        data[whole][rel] = data[whole][rel] + 1
        data[pivots][whole].append(pivotnum)
        # increase the count of targets per current source by 1
        stats_per_source[whole]['tgtnum'] += 1
        if rel not in stats_per_source[whole].keys():
            stats_per_source[whole][rel] = 0
        # increase by 1 the count of the current relation per source
        stats_per_source[whole][rel] += 1
        # for undefined relations, save the WN distance
        if rel == undefined:
            if dist not in data[distances][whole].keys():
                data[distances][whole][dist] = []
            # save the rule with that distance
            data[distances][whole][dist].append(rule)
            # save the distance itself
            data[distances][whole]['values'].append(int(rule[6]))
        if part in data.keys():
            if rel not in data[part].keys():
                data[part][rel] = []
            data[part][rel].append(rule)
            data[pivots][part].append(pivotnum)
            if rel == undefined:
                # for undefined relations, save the WN distance
                if dist not in data[distances][part].keys():
                    data[distances][part][dist] = []
                data[distances][part][dist].append(rule)
                data[distances][part]['values'].append(int(rule[6]))
            # increase the count of targets per current source by 1
            stats_per_source[part]['tgtnum'] += 1
            if rel not in stats_per_source[part].keys():
                stats_per_source[part][rel] = 0
            # increase by 1 the count of the current relation per source
            stats_per_source[part][rel] += 1
    print  # end the progress line
    # save the last stats_per_source to data
    for x in stats_per_source.keys():  # parts + whole
        for y in stats_per_source[x].keys():  # relations + tgtnum
            if y not in data[per_source][x].keys():
                data[per_source][x][y] = []
            if stats_per_source[x][y] > 0:
                data[per_source][x][y].append(stats_per_source[x][y])
    return data
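# Illustration only (not part of the original module): a hedged sketch of the
# shape of the dictionary returned by analyze_rules, inferred from the code
# above. The string keys 'whole', 'pivots', 'distances' and 'per_source' stand
# in for the module-level constants whole / pivots / distances / per_source,
# and the relation name 'synonym', the distance bucket label and all numbers
# are purely illustrative.
example_data_shape = {
    'whole': {                           # statistics over the whole rule collection
        'sample': [],                    # random rule sample from get_rules_sample
        'synonym': 123,                  # one count per observed relation name
    },
    'pivots': {'whole': [1, 2, 3]},      # pivot counts, per part and for the whole set
    'distances': {                       # WN distances for rules with an undefined relation
        'whole': {
            'values': [2, 5],            # the raw distances (rule[6])
            '<distance bucket>': [],     # one bucket per get_distance() label, holding rules
        },
    },
    'per_source': {                      # per-source counters, flushed on every source change
        'whole': {
            'tgtnum': [4, 2],            # number of targets seen for each source word
            'synonym': [3, 1],           # per-source counts of each relation
        },
    },
    # in addition, data[p] for every p in parts maps each relation name to the
    # list of rules observed with that relation in that part
}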