Example #1
def store_content():
    """Populate the ThesaurusInstance table from the thesaurus content files."""
    ThesaurusInstance.objects.all().delete()

    ci = ContentIterator(in_dir=IN_DIR, fixLigatures=True, verbosity="low")
    records = []
    for thesclass in ci.iterate():
        for instance in thesclass.instances():
            inf_node = instance.node.find("./infl")
            if inf_node is not None:
                inflections = inf_node.text or None
            else:
                inflections = None
            record = ThesaurusInstance(
                lemma=instance.lemma(),
                refentry=instance.refentry(),
                refid=instance.refid(),
                start_year=instance.start_date(),
                end_year=instance.end_date(),
                thesclass_id=thesclass.id(),
                inflections=inflections,
            )
            records.append(record)
        if len(records) > 1000:
            ThesaurusInstance.objects.bulk_create(records)
            records = []
    ThesaurusInstance.objects.bulk_create(records)
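
For reference, store_content() implies a Django model along the following lines. Only the field names are attested by the constructor call above; the field types and lengths here are assumptions, not the project's actual model.

from django.db import models

class ThesaurusInstance(models.Model):
    # Sketch only: field names taken from store_content(), types guessed
    lemma = models.CharField(max_length=100)
    refentry = models.IntegerField()
    refid = models.IntegerField()
    start_year = models.IntegerField(null=True)
    end_year = models.IntegerField(null=True)
    thesclass = models.ForeignKey('ThesaurusClass', on_delete=models.CASCADE)
    inflections = models.CharField(max_length=100, null=True)
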
Example #2
def store_taxonomy():
    """Rebuild the ThesaurusClass table from the taxonomy, level by level."""
    ThesaurusInstance.objects.all().delete()
    ThesaurusClass.objects.all().delete()

    ci = ContentIterator(in_dir=IN_DIR, fixLigatures=True, verbosity="low")
    # Debug check: print a sample class (ID 1630) and its immediate children
    for thesclass in ci.iterate():
        if thesclass.id() == 1630 or thesclass.parent() == 1630:
            print(thesclass.id(), thesclass.label(), thesclass.wordclass(penn=True))
    valid_ids = {thesclass.id(): thesclass.size() for thesclass in ci.iterate()}

    tree_manager = TaxonomyManager(lazy=True, verbosity=None)
    for level in range(1, 20):
        classes = [c for c in tree_manager.classes if c.level() == level and c.id() in valid_ids]
        stdout.write("%d\t%d\n" % (level, len(classes)))
        records = []
        for thesclass in classes:
            revised_size = valid_ids[thesclass.id()]
            if thesclass.label():
                label = thesclass.label()[0:LABEL_LENGTH]
            else:
                label = None
            record = ThesaurusClass(
                id=thesclass.id(),
                label=label,
                wordclass=thesclass.wordclass(penn=True),
                level=thesclass.level(),
                parent_id=thesclass.parent(),
                node_size=revised_size,
                branch_size=thesclass.size(branch=True),
            )
            records.append(record)
            if len(records) > 1000:
                ThesaurusClass.objects.bulk_create(records)
                records = []
        ThesaurusClass.objects.bulk_create(records)
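
Continuing the sketch above, store_taxonomy() suggests a companion ThesaurusClass model along these lines; LABEL_LENGTH is presumably the label field's max_length, and the types are again assumptions.

class ThesaurusClass(models.Model):
    # Sketch only: field names taken from store_taxonomy(), types guessed
    id = models.IntegerField(primary_key=True)
    label = models.CharField(max_length=LABEL_LENGTH, null=True)
    wordclass = models.CharField(max_length=20, null=True)
    level = models.IntegerField()
    parent = models.ForeignKey('self', null=True, on_delete=models.CASCADE)
    node_size = models.IntegerField()
    branch_size = models.IntegerField()
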
Example #3
def make_lean_ht():
    """Strip minor senses and roll minor leaf classes up into their parents."""
    iterator = ContentIterator(out_dir=OUT_DIR, yield_mode='file')
    for classes in iterator.iterate():
        # Build a map of each class indexed by ID
        classmap = {thesclass.id(): thesclass for thesclass in classes}
        # Set of IDs marking classes which will be dropped
        dropped_classes = set()

        # Drop instances that represent minor senses
        for thesclass in classes:
            if thesclass.instances():
                wordclass = thesclass.wordclass(penn=True)
                stripnodes = []
                for instance in thesclass.instances():
                    minor_sense, minor_homograph = _test_status(instance, wordclass)
                    if minor_sense or minor_homograph:
                        stripnodes.append(instance.node)
                if stripnodes:
                    container = stripnodes[0].getparent()
                    for node in stripnodes:
                        container.remove(node)
                    # Reset the listed size of the class
                    new_size = thesclass.size() - len(stripnodes)
                    if thesclass.size() == thesclass.size(branch=True):
                        thesclass.reset_size(new_size, branch=True)
                    thesclass.reset_size(new_size)
                    if thesclass.size(branch=True) == 0:
                        dropped_classes.add(thesclass.id())

        # Roll up minor leaf nodes to the parent node
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            thesclass.reload_instances()
            parentclass = classmap.get(thesclass.parent(), None)
            if _viable_for_rollup(thesclass, parentclass):
                # Move instances from this class to the parent class
                for instance in thesclass.instances():
                    parentclass.node.append(instance.node)
                # Mark this class to be dropped
                dropped_classes.add(thesclass.id())
                # Log each rollup for review
                print('-----------------------------------------')
                print(thesclass.id(), thesclass.breadcrumb())
                print('->', parentclass.id(), parentclass.breadcrumb())

        # Remove child-node pointers for nodes which are about to be deleted
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            for child_id in thesclass.child_nodes():
                if child_id in dropped_classes:
                    thesclass.remove_child(child_id)

        # Remove nodes for classes marked to be dropped
        for classid in dropped_classes:
            thesclass = classmap[classid]
            thesclass.node.getparent().remove(thesclass.node)

        # Redo counts in the remaining classes
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            thesclass.reload_instances()
            thesclass.reset_size(len(thesclass.instances()))
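
The helper _viable_for_rollup() is not shown in the source. Below is a minimal sketch of the kind of test it presumably performs, using only accessors attested above; the exact checks and the size threshold are assumptions.

def _viable_for_rollup(thesclass, parentclass, max_size=3):
    # Sketch only: roll a small, childless leaf class up into a parent
    # of the same wordclass; the threshold of 3 is a guess.
    if parentclass is None:
        return False
    if thesclass.child_nodes():
        return False
    if thesclass.wordclass(penn=True) != parentclass.wordclass(penn=True):
        return False
    return thesclass.size() <= max_size
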
Example #4
def inflect_ht():
    """Add an <infl> element with inflected forms to each eligible instance."""
    iterator = ContentIterator(in_dir=IN_DIR, out_dir=OUT_DIR, yield_mode='file')
    for classes in iterator.iterate():
        for thesclass in classes:
            wordclass = thesclass.wordclass(penn=True)
            if wordclass in MAPPINGS:
                for instance in thesclass.instances():
                    inflections = _get_inflections(instance.lemma(), wordclass)
                    if inflections:
                        inf_node = etree.SubElement(instance.node, 'infl')
                        inf_node.text = inflections
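
MAPPINGS and _get_inflections() are defined elsewhere. A minimal sketch of the intended behaviour, assuming naive suffix rules keyed by Penn tag and a pipe-separated return format (the real implementation will handle irregular forms and spelling rules):

MAPPINGS = {'NN': ('s',), 'VB': ('s', 'ed', 'ing')}

def _get_inflections(lemma, wordclass):
    # Sketch only: skip multi-word and hyphenated lemmas
    if ' ' in lemma or '-' in lemma:
        return None
    return '|'.join(lemma + suffix for suffix in MAPPINGS[wordclass])
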
Example #5
def _cache_thesaurus_lemmas(content_dir):
    """Map 'refentry_refid_classid' identifiers to (lemma, wordclass) tuples."""
    lemmas = {}
    ci = ContentIterator(path=content_dir, fixLigatures=True, verbosity='low')
    for c in ci.iterate():
        if c.instances():
            wordclass = WORDCLASS_MAP.get(c.wordclass())
            for i in c.instances():
                identifier = '%d_%d_%d' % (int(i.refentry()),
                                           int(i.refid()),
                                           int(c.id()))
                if identifier not in lemmas:
                    lemmas[identifier] = (i.lemma(), wordclass)
    return lemmas
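
A typical lookup against the returned cache, using the same '%d_%d_%d' key format; CONTENT_DIR, refentry, refid, and class_id are placeholder names, not from the source.

lemmas = _cache_thesaurus_lemmas(CONTENT_DIR)
identifier = '%d_%d_%d' % (refentry, refid, class_id)
lemma, wordclass = lemmas.get(identifier, (None, None))
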
Example #6
def store_content():
    """Drop, recreate, and repopulate the thesaurus-instance table (SQLAlchemy)."""
    ThesInstance.__table__.drop(DB_ENGINE, checkfirst=True)
    ThesInstance.__table__.create(DB_ENGINE, checkfirst=True)

    ci = ContentIterator(path=IN_DIR, fixLigatures=True, verbosity="low")
    buffer_size = 0
    for thesclass in ci.iterate():
        for instance in thesclass.instances():
            record_data = {
                "lemma": instance.lemma(),
                "refentry": instance.refentry(),
                "refid": instance.refid(),
                "start_year": instance.start_date(),
                "end_year": instance.end_date(),
                "class_id": thesclass.id(),
            }
            DB_SESSION.add(ThesInstance(record_data))
            buffer_size += 1
        if buffer_size > 1000:
            DB_SESSION.commit()
            buffer_size = 0
    DB_SESSION.commit()
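
Note that ThesInstance(record_data) passes a plain dict positionally, which implies a custom constructor (a declarative model's default constructor accepts keyword arguments only). A sketch under that assumption, with column types guessed:

from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class ThesInstance(Base):
    __tablename__ = 'thesinstance'
    id = Column(Integer, primary_key=True)
    lemma = Column(String(100))
    refentry = Column(Integer)
    refid = Column(Integer)
    start_year = Column(Integer)
    end_year = Column(Integer)
    class_id = Column(Integer)

    def __init__(self, record_data):
        # Accept the plain dict built in store_content() above
        for key, value in record_data.items():
            setattr(self, key, value)
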
Example #7
def store_taxonomy():
    """Rebuild the thesaurus-class table from the taxonomy, level by level."""
    ThesInstance.__table__.drop(DB_ENGINE, checkfirst=True)
    ThesClass.__table__.drop(DB_ENGINE, checkfirst=True)
    ThesClass.__table__.create(DB_ENGINE, checkfirst=True)

    ci = ContentIterator(path=IN_DIR, fixLigatures=True, verbosity="low")
    valid_ids = {thesclass.id(): thesclass.size() for thesclass in ci.iterate()}

    tree_manager = TaxonomyManager(lazy=True, verbosity=None)
    for level in range(1, 20):
        classes = [c for c in tree_manager.classes if c.level() == level and c.id() in valid_ids]
        print(level, len(classes))
        buffer_size = 0
        for thesaurus_class in classes:
            revised_size = valid_ids[thesaurus_class.id()]
            record = ThesClass(thesaurus_class, size=revised_size)
            DB_SESSION.add(record)
            buffer_size += 1
            if buffer_size > 1000:
                DB_SESSION.commit()
                buffer_size = 0
        DB_SESSION.commit()
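
ThesClass likewise appears to take the taxonomy class object itself, plus a size override. Continuing the sketch above; the columns and constructor body are assumptions built from the accessors attested elsewhere in these examples.

class ThesClass(Base):
    __tablename__ = 'thesclass'
    id = Column(Integer, primary_key=True)
    label = Column(String(100))
    level = Column(Integer)
    parent_id = Column(Integer)
    size = Column(Integer)

    def __init__(self, thesaurus_class, size=0):
        # Sketch only: copy values from the taxonomy class object
        self.id = thesaurus_class.id()
        self.label = thesaurus_class.label()
        self.level = thesaurus_class.level()
        self.parent_id = thesaurus_class.parent()
        self.size = size
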
Example #8
def recheck_counts():
    """Recompute node and branch sizes and write them back to both the content and taxonomy files."""
    # Figure out the node sizes of all the individual classes
    node_sizes = defaultdict(int)
    iterator = ContentIterator(in_dir=CONTENT_DIR)
    for thesclass in iterator.iterate():
        node_sizes[thesclass.id()] = len(thesclass.instances())

    branch_sizes = {}
    cumulate = defaultdict(int)
    tree_manager = TaxonomyManager(dir=TAX_DIR, lazy=True, verbosity=None)
    # Work bottom-up so each level's branch sizes accumulate into its parents
    for level in reversed(range(1, 20)):
        classes = [c for c in tree_manager.classes if c.level() == level]
        print(level, len(classes))
        for thesclass in classes:
            branch_sizes[thesclass.id()] = cumulate[thesclass.id()] + node_sizes[thesclass.id()]
        for thesclass in classes:
            cumulate[thesclass.parent()] += branch_sizes[thesclass.id()]

    iterator = ContentIterator(in_dir=CONTENT_DIR, out_dir=CONTENT_DIR_TMP)
    for thesclass in iterator.iterate():
        thesclass.node.set('numInstancesDirect', str(node_sizes[thesclass.id()]))
        thesclass.node.set('numInstancesDescendant', str(branch_sizes[thesclass.id()]))

    for in_file in os.listdir(TAX_DIR):
        lines = []
        with open(os.path.join(TAX_DIR, in_file)) as filehandle:
            for line in filehandle:
                m = re.search(r'^[ \t]+<class id="(\d+)"', line)
                if m:
                    class_id = int(m.group(1))
                    additions = ATTSTRING % (node_sizes[class_id], branch_sizes[class_id])
                    line = re.sub('>', additions, line, count=1)
                lines.append(line)

        with open(os.path.join(TAX_DIR_TMP, in_file), 'w') as filehandle:
            filehandle.writelines(lines)
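
ATTSTRING is defined elsewhere in the module. Since re.sub() replaces the first '>' of the <class ...> tag, the template must supply both count attributes and re-close the tag; the attribute names presumably match those written on the content side above. A plausible value (an assumption):

ATTSTRING = ' numInstancesDirect="%d" numInstancesDescendant="%d">'
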
Example #9
def insert_child_nodes():
    """
    Copy child nodes from the taxonomy version of the data, and insert
    into the content version
    """
    tree_manager = TaxonomyManager(dir=TAX_DIR, lazy=True, verbosity=None)
    childmap = defaultdict(list)
    for thesclass in tree_manager.classes:
        if thesclass.parent():
            childmap[thesclass.parent()].append(thesclass)

    iterator = ContentIterator(in_dir=CONTENT_DIR, out_dir=CONTENT_DIR_TMP)
    for thesclass in iterator.iterate():
        if thesclass.id() in childmap:
            cn_node = etree.Element("childNodes")
            for child in childmap[thesclass.id()]:
                n = etree.SubElement(cn_node, "node")
                n.set("idref", str(child.id()))
                n.set("numInstancesDescendant", str(child.size(branch=True)))
                if child.label():
                    n.text = child.label()
                if child.is_wordclass_level():
                    n.set("pos", child.wordclass())
            thesclass.node.append(cn_node)
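
The inserted fragment therefore looks like this (values illustrative): the pos attribute appears only on wordclass-level children, and label text only where the child has a label.

<childNodes>
  <node idref="12345" numInstancesDescendant="67">some label</node>
  <node idref="12346" numInstancesDescendant="8" pos="NN">another label</node>
</childNodes>
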
Example #10
def compile_iteration(in_dir, out_dir, **kwargs):
    """Run one compile pass: optionally drop unusable instances, roll up
    minor leaf classes, then sanitize and deduplicate what remains."""
    sanitize = kwargs.get('sanitize', False)
    drop_instances = kwargs.get('drop_instances', False)
    deduplicate = kwargs.get('deduplicate', False)

    iterator = ContentIterator(in_dir=in_dir, out_dir=out_dir, yield_mode='file')
    for classes in iterator.iterate():
        # Build a map of each class indexed by ID
        classmap = {thesclass.id(): thesclass for thesclass in classes}
        # Set of IDs marking classes which will be dropped
        dropped_classes = set()

        # Drop instances that are not usable
        if drop_instances:
            for thesclass in classes:
                if thesclass.instances():
                    stripnodes = [instance for instance in thesclass.instances()
                                  if _is_not_usable(instance)]
                    if stripnodes:
                        for instance in stripnodes:
                            instance.selfdestruct()
                        # Reset the listed size of the class
                        new_size = thesclass.size() - len(stripnodes)
                        if thesclass.size() == thesclass.size(branch=True):
                            thesclass.reset_size(new_size, branch=True)
                        thesclass.reset_size(new_size)
                        if thesclass.size(branch=True) == 0:
                            dropped_classes.add(thesclass.id())

        # Roll up minor leaf nodes to the parent node
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            thesclass.reload_instances()
            parentclass = classmap.get(thesclass.parent(), None)
            if parentclass:
                grandparentclass = classmap.get(parentclass.parent(), None)
            else:
                grandparentclass = None
            if _viable_for_rollup(thesclass, parentclass, grandparentclass):
                # Move instances from this class to the parent class
                for instance in thesclass.instances():
                    parentclass.node.append(instance.node)
                # Mark this class to be dropped
                dropped_classes.add(thesclass.id())

        # Remove child-node pointers for nodes which are about to be deleted
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            for child_id in thesclass.child_nodes():
                if child_id in dropped_classes:
                    thesclass.remove_child(child_id)

        # Remove nodes for classes marked to be dropped
        for classid in dropped_classes:
            thesclass = classmap[classid]
            thesclass.selfdestruct()

        # Redo counts in the remaining classes
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            thesclass.reload_instances()
            thesclass.reset_size(len(thesclass.instances()))
            if sanitize:
                for instance in thesclass.instances():
                    _sanitize_lemma(instance, thesclass.wordclass(penn=True))
            if deduplicate:
                _deduplicate_instances(thesclass)
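
_deduplicate_instances() is not shown. A minimal sketch, assuming that duplicates are instances linking to the same dictionary sense, i.e. sharing the same (refentry, refid) pair, and using only methods attested above:

def _deduplicate_instances(thesclass):
    # Sketch only: drop repeat instances pointing at the same sense
    seen = set()
    for instance in thesclass.instances():
        key = (instance.refentry(), instance.refid())
        if key in seen:
            instance.selfdestruct()
        else:
            seen.add(key)
    thesclass.reload_instances()
    thesclass.reset_size(len(thesclass.instances()))
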