def summarize_features():
    level2nodes = load_level2_nodes()

    tm = TaxonomyManager(dir=TAXONOMY_DIR,
                         levels=4,
                         verbosity='low',
                         lazy=False)
    level2classes = [c for c in tm.classes if c.level() == 2 and
                     c.id() in level2nodes]
    descendants = {}
    for parent in level2classes:
        desc = tm.descendants_of(parent.id())
        descendants[parent.id()] = set([c.id() for c in desc
                                        if c.level() == 4])

    for feature in FEATURES:
        in_dir = os.path.join(FEATURESET_ROOT, feature, 'raw')
        out_dir = os.path.join(FEATURESET_ROOT, feature, 'summarized')
        files = [f for f in os.listdir(in_dir) if f.endswith('.csv')]
        for filename in files:
            #print(filename)
            #element_id = int(filename.split('.')[0])

            # Load up the list of thesaurus nodes from this element's
            #  raw .csv file
            in_file = os.path.join(in_dir, filename)
            nodes = defaultdict(int)
            with open(in_file) as csvfile:
                csvreader = csv.reader(csvfile)
                for row in csvreader:
                    thesaurus_node = int(row[2])
                    nodes[thesaurus_node] += 1

            total = sum(nodes.values()) or 1

            level2_counts = {}
            for id, desc in descendants.items():
                level2_counts[id] = 0
                for node, count in nodes.items():
                    if node in desc:
                        level2_counts[id] += count

            out_file = os.path.join(out_dir, filename)
            with open(out_file, 'w') as csvfile:
                csvwriter = csv.writer(csvfile)
                for id in sorted(level2_counts.keys()):
                    ratio = level2_counts[id] / total
                    ratio = float('%.3g' % ratio)
                    row = [id, level2_counts[id], ratio,]
                    csvwriter.writerow(row)
Example #2
0
    def compute_rectangles(self, **kwargs):
        for key, value in kwargs.items():
            self.args[key] = value
        in_dir = self.args.get('taxonomy_dir', None)
        self.taxonomy_manager = TaxonomyManager(dir=in_dir,
                                                levels=self.args.get('depth', 4),
                                                verbosity=None,
                                                lazy=True,)

        self._set_sizes(self.taxonomy_manager.classes)
        root_classes = [c for c in self.taxonomy_manager.classes
                        if c.level() == 1]
        self.total_size = sum([c.branch_size for c in root_classes])

        self.sort_order = 0
        self._iterate_tree(classes=root_classes,
                           orientation=self.args.get('orientation', 'horizontal'),
                           x=self.args.get('x', 0),
                           y=self.args.get('y', 0),
                           width=self.args.get('width', 1),
                           height=self.args.get('height', 1),)
Example #3
0
class Treemap(object):

    """
    Build data used for displaying the Historical Thesaurus as a treemap.

    Optional named arguments to init are:
     - taxonomy_dir: path to the HTOED taxonomy directory
     - depth: number of levels to go down (defaults to 4)
     - orientation: 'horizontal' (default) or 'vertical'
     - width: defaults to 1
     - height: defaults to 1
    """

    def __init__(self, **kwargs):
        self.args = {key: value for key, value in kwargs.items()}
        self.rectangles = []
        self.total_size = None

    def write_to_csv(self, **kwargs):
        """
        Write the treemap data to a CSV file.

        Named argument:
         - out_file: path to the output CSV file
        """
        for key, value in kwargs.items():
            self.args[key] = value

        columns = OUTPUT_COLUMNS[:]
        omit_breadcrumb = self.args.get('omit_breadcrumb', False)
        omit_label = self.args.get('omit_label', True)
        if omit_breadcrumb:
            columns = [c for c in columns if c != 'breadcrumb']
        if omit_label:
            columns = [c for c in columns if c != 'label']

        if not self.rectangles:
            self.compute_rectangles()
        self._set_rounder()
        with open(self.args['out_file'], 'w') as filehandle:
            writer = csv.writer(filehandle)
            writer.writerow(columns)
            for thesaurus_class in self.rectangles:
                writer.writerow(self._data_row(thesaurus_class))

    def _convert_to_json(self, **kwargs):
        for key, value in kwargs.items():
            self.args[key] = value
        if not self.rectangles:
            self.compute_rectangles()
        self._set_rounder()
        mode = self.args.get('mode', 'array')
        outputlist = []
        for thesaurus_class in self.rectangles:
            if mode in ('array', 'list', 'tuple'):
                datablob = self._data_row(thesaurus_class)
            elif mode in ('dict', 'dictionary', 'hash', 'object'):
                datablob = {key: value for key, value in
                            zip(OUTPUT_COLUMNS,
                                self._data_row(thesaurus_class))}
            outputlist.append(datablob)
        return outputlist

    def return_json(self, **kwargs):
        outputlist = self._convert_to_json(**kwargs)
        return json.dumps(outputlist)

    def write_to_json(self, **kwargs):
        """
        Write the treemap data to a JSON file.

        Named arguments:
         - out_file: path to the output JSON file
         - mode: 'array' (default) or 'dict', which changes how the
            data for each element is stored; 'dict' is more verbose
            and explicit.
        """
        for key, value in kwargs.items():
            self.args[key] = value
        outputlist = self._convert_to_json(**kwargs)
        with open(self.args.get('out_file'), 'w') as filehandle:
            json.dump(outputlist, filehandle)

    def _data_row(self, thesaurus_class):
        omit_breadcrumb = self.args.get('omit_breadcrumb', False)
        omit_label = self.args.get('omit_label', True)
        ratio = thesaurus_class.branch_size / self.total_size
        ratio = float('%.3g' % ratio)
        row = [
            thesaurus_class.id(),
            thesaurus_class.root(),
            thesaurus_class.level(),
            thesaurus_class.branch_size,
            ratio,
            self._rounder(thesaurus_class.rectangle.x),
            self._rounder(thesaurus_class.rectangle.y),
            self._rounder(thesaurus_class.rectangle.width),
            self._rounder(thesaurus_class.rectangle.height),
            thesaurus_class.sort,
        ]
        if not omit_label:
            row.append(thesaurus_class.label())
        if not omit_breadcrumb:
            row.append(thesaurus_class.breadcrumb())
        return row

    def compute_rectangles(self, **kwargs):
        for key, value in kwargs.items():
            self.args[key] = value
        in_dir = self.args.get('taxonomy_dir', None)
        self.taxonomy_manager = TaxonomyManager(dir=in_dir,
                                                levels=self.args.get('depth', 4),
                                                verbosity=None,
                                                lazy=True,)

        self._set_sizes(self.taxonomy_manager.classes)
        root_classes = [c for c in self.taxonomy_manager.classes
                        if c.level() == 1]
        self.total_size = sum([c.branch_size for c in root_classes])

        self.sort_order = 0
        self._iterate_tree(classes=root_classes,
                           orientation=self.args.get('orientation', 'horizontal'),
                           x=self.args.get('x', 0),
                           y=self.args.get('y', 0),
                           width=self.args.get('width', 1),
                           height=self.args.get('height', 1),)

    def _iterate_tree(self, **kwargs):
        classes = kwargs.get('classes')
        orientation = kwargs.get('orientation')
        x_coord = kwargs.get('x')
        y_coord = kwargs.get('y')
        width = kwargs.get('width')
        height = kwargs.get('height')
        total_size = sum([thesaurus_class.branch_size for
                          thesaurus_class in classes])

        if total_size == 0:
            scale = 0
        elif orientation == 'horizontal':
            scale = width / total_size
        elif orientation == 'vertical':
            scale = height / total_size

        for thesaurus_class in classes:
            rect_size = thesaurus_class.branch_size * scale
            if orientation == 'horizontal':
                thesaurus_class.rectangle = Coordinates(x_coord, y_coord, rect_size, height)
                x_coord += rect_size
            else:
                thesaurus_class.rectangle = Coordinates(x_coord, y_coord, width, rect_size)
                y_coord += rect_size
            thesaurus_class.sort = self.sort_order
            self.rectangles.append(thesaurus_class)
            self.sort_order += 1

            minimum_size = thesaurus_class.size(branch=True) * 0.01
            offspring = [child for child in
                         self.taxonomy_manager.children_of(thesaurus_class.id())
                         if child.wordclass() is None and
                         child.size(branch=True) > minimum_size]
            if offspring:
                if orientation == 'horizontal':
                    new_orientation = 'vertical'
                else:
                    new_orientation = 'horizontal'
                self._iterate_tree(classes=offspring,
                                   orientation=new_orientation,
                                   x=thesaurus_class.rectangle.x,
                                   y=thesaurus_class.rectangle.y,
                                   width=thesaurus_class.rectangle.width,
                                   height=thesaurus_class.rectangle.height,)

    def _set_rounder(self):
        decimals = self.args.get('decimal_places', 4)
        self.rounder_formatter = '%.' + str(decimals) + 'f'

    def _rounder(self, value):
        return float(self.rounder_formatter % value)

    def _set_sizes(self, classes):
        try:
            self.args['class_sizes']
        except KeyError:
            for thesaurus_class in classes:
                thesaurus_class.branch_size = thesaurus_class.size(branch=True)
        else:
            for thesaurus_class in classes:
                if thesaurus_class.id() in self.args['class_sizes']:
                    thesaurus_class.branch_size = self.args['class_sizes'][thesaurus_class.id()]
                else:
                    thesaurus_class.branch_size = thesaurus_class.size(branch=True)