def scan_current_node_for_terms(self, event=None):
    """Scan the current node's text for occurrences of defined terms
    (singular or plural) that are not already marked up, record them in
    ``self.terms``, and highlight them in the text widget.

    :param event: optional Tk event, so this can be bound as a callback.
    """
    node_text = self.xml_text.get('1.0', tk.END)
    label = self.current_node.get('label')
    # Debug output of the node label being scanned.
    # FIX: was a Python 2 ``print label`` statement (a syntax error on
    # Python 3); the call form behaves identically on both versions.
    print(label)
    self.terms = []
    self.unmarked_defs.delete(0, tk.END)
    for term, def_label, _ in self.gather_defined_terms():
        term_locations = set(find_all_occurrences(node_text, term))
        plural_term = self.inf.plural(term)
        plural_term_locations = set(
            find_all_occurrences(node_text, plural_term))
        # Every location where either form occurs. (The original
        # ``plural | term ^ plural`` reduces to this plain union.)
        unmarked_locs = list(term_locations | plural_term_locations)
        for start in unmarked_locs:
            # Prefer the plural form when both match at the same offset.
            if start in plural_term_locations:
                term_to_use = plural_term
            elif start in term_locations:
                term_to_use = term
            end = start + len(term_to_use)
            start_index = '1.0 + {} chars'.format(start)
            end_index = '1.0 + {} chars'.format(end)
            # Skip occurrences already inside markup that handles terms.
            if not enclosed_in_tag(node_text, 'ref', start) and \
                    not enclosed_in_tag(node_text, 'def', start) and \
                    not enclosed_in_tag(node_text, 'title', start):
                term_data = (term_to_use, start, end,
                             start_index, end_index, def_label)
                if term_data not in self.terms:
                    self.terms.append(term_data)
    # Sort by end offset (tuple element 2).
    self.terms.sort(key=itemgetter(2))
    for term in self.terms:
        self.unmarked_defs.insert(
            tk.END, '{} [{}]'.format(term[0], term[1]))
        self.xml_text.tag_add('undefined_term', term[3], term[4])
    self.xml_text.tag_configure('undefined_term', background='yellow')
def test_find_all_occurrences(self):
    """find_all_occurrences locates whole-word matches only."""
    sample = "There are many days. Sunday is a day. Saturday is a day too. Days happen.".lower()
    # 'day' must match the standalone word, not 'days'/'sunday'/'saturday'.
    singular_hits = find_all_occurrences(sample, 'day')
    for offset in (33, 52):
        self.assertTrue(offset in singular_hits)
    self.assertEqual(len(singular_hits), 2)
    # 'days' must match both plural occurrences.
    plural_hits = find_all_occurrences(sample, 'days')
    for offset in (15, 61):
        self.assertTrue(offset in plural_hits)
    self.assertEqual(len(plural_hits), 2)
def test_find_all_occurrences(self):
    """Both singular and plural forms are found at the expected offsets."""
    haystack = "There are many days. Sunday is a day. Saturday is a day too. Days happen.".lower()
    # Each (needle, expected offsets) pair exercises whole-word matching.
    cases = (('day', (33, 52)), ('days', (15, 61)))
    for needle, expected in cases:
        found = find_all_occurrences(haystack, needle)
        for offset in expected:
            self.assertTrue(offset in found)
        self.assertEqual(len(found), 2)
def scan_current_node_for_terms(self, event=None):
    """Find unmarked occurrences of defined terms in the current node's
    text, store them in ``self.terms``, list them in the unmarked-defs
    listbox, and highlight them in the editor widget.

    :param event: optional Tk event so this can be used as a handler.
    """
    node_text = self.xml_text.get('1.0', tk.END)
    label = self.current_node.get('label')
    self.terms = []
    self.unmarked_defs.delete(0, tk.END)
    # Occurrences already inside these tags are considered marked up.
    excluded_tags = ('ref', 'def', 'title', 'subject')
    for term, def_label, _ in self.gather_defined_terms():
        singular_locs = set(find_all_occurrences(node_text, term))
        plural = self.inf.plural(term)
        plural_locs = set(find_all_occurrences(node_text, plural))
        # Same set as the original expression: all locations where
        # either the singular or the plural form occurs.
        candidates = plural_locs | singular_locs ^ plural_locs
        for start in candidates:
            # The plural form wins when both match at this offset.
            chosen = plural if start in plural_locs else term
            end = start + len(chosen)
            if any(enclosed_in_tag(node_text, tag, start)
                   for tag in excluded_tags):
                continue
            entry = (chosen, start, end,
                     '1.0 + {} chars'.format(start),
                     '1.0 + {} chars'.format(end),
                     def_label)
            if entry not in self.terms:
                self.terms.append(entry)
    # Order the results by end offset.
    self.terms.sort(key=itemgetter(2))
    for entry in self.terms:
        self.unmarked_defs.insert(
            tk.END, '{} [{}]'.format(entry[0], entry[1]))
        self.xml_text.tag_add('undefined_term', entry[3], entry[4])
    self.xml_text.tag_configure('undefined_term', background='yellow')
def build_external_citations_layer(root):
    """
    Build the external citations layer from the provided root of the XML tree.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of external citations,
        suitable for direct transformation into JSON for use with the eRegs
        frontend.
    :rtype: :class:`collections.OrderedDict`

    :raises IndexError: if a ``reftype="external"`` target lacks the
        ``type:citation`` form.
    """
    paragraphs = root.findall('.//{eregs}paragraph')
    layer_dict = OrderedDict()

    for paragraph in paragraphs:
        marker = paragraph.get('marker')
        # Prepend the marker so offsets match the rendered paragraph text.
        par_text = marker + ' ' + xml_node_text(
            paragraph.find('{eregs}content'))
        par_label = paragraph.get('label')
        cites = paragraph.findall('.//{eregs}ref[@reftype="external"]')
        citation_list = []
        for cite in cites:
            target = cite.get('target').split(':')
            citation_type = target[0]
            try:
                citation_target = target[1].split('-')
            except IndexError as e:
                # FIX: the original passed two arguments to print(), which
                # prints a tuple under Python 2's print statement; emit one
                # formatted message instead.
                print("Error in external citations: '{}' is not formatted "
                      "properly. Look for an empty or malformed target in a "
                      "reftype=\"external\".".format(target))
                raise e
            text = cite.text
            positions = find_all_occurrences(par_text, text)
            cite_dict = OrderedDict()
            cite_dict['citation'] = citation_target
            cite_dict['citation_type'] = citation_type
            cite_dict['offsets'] = [
                [pos, pos + len(text)] for pos in positions
            ]
            # Only record new, non-empty citation entries.
            if cite_dict['offsets'] and cite_dict not in citation_list:
                citation_list.append(cite_dict)
        if citation_list:
            layer_dict[par_label] = citation_list

    return layer_dict
def build_external_citations_layer(root):
    """
    Build the external citations layer from the provided root of the XML tree.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict mapping paragraph labels to lists of external
        citation entries, suitable for direct transformation into JSON for
        use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`
    """
    layer_dict = OrderedDict()

    for paragraph in root.findall('.//{eregs}paragraph'):
        content = paragraph.find('{eregs}content')
        # Offsets are computed against "<marker> <content text>".
        par_text = paragraph.get('marker') + ' ' + xml_node_text(content)
        citation_list = []

        for cite in paragraph.findall('.//{eregs}ref[@reftype="external"]'):
            target = cite.get('target').split(':')
            try:
                citation_target = target[1].split('-')
            except IndexError as e:
                print("Error in external citations: '{}' is not formatted properly. ".format(target),
                      "Look for an empty or malformed target in a reftype=\"external\".")
                raise e

            text = cite.text
            cite_dict = OrderedDict()
            cite_dict['citation'] = citation_target
            cite_dict['citation_type'] = target[0]
            cite_dict['offsets'] = []
            for pos in find_all_occurrences(par_text, text):
                cite_dict['offsets'].append([pos, pos + len(text)])
            # Keep only new, non-empty entries.
            if cite_dict not in citation_list and cite_dict['offsets'] != []:
                citation_list.append(cite_dict)

        if citation_list != []:
            layer_dict[paragraph.get('label')] = citation_list

    return layer_dict
def build_terms_layer(root):
    """
    Build the terms layer from the provided root of the XML tree.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of terms, suitable
        for direct transformation into JSON for use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`
    """
    definitions_dict = OrderedDict()
    terms_dict = OrderedDict()

    inf_engine = inflect.engine()
    # 'bonus' is not pluralized correctly by inflect's defaults.
    inf_engine.defnoun('bonus', 'bonuses')
    paragraphs = root.findall('.//{eregs}paragraph') + \
        root.findall('.//{eregs}interpParagraph')
    # FIX: removed a dead ``definitions = root.findall('.//{eregs}def')``
    # assignment here; it was always overwritten per-paragraph before use.

    paragraphs_with_defs = [
        par for par in paragraphs
        if par.find('{eregs}content') is not None
        and par.find('{eregs}content').find('{eregs}def') is not None
    ]

    for paragraph in paragraphs_with_defs:
        label = paragraph.get('label')
        marker = paragraph.get('marker') or ''
        title = paragraph.find('{eregs}title')
        content = apply_formatting(paragraph.find('{eregs}content'))
        par_text = xml_node_text(content).strip()
        definitions = content.findall('{eregs}def')
        total_offset = get_offset(paragraph, marker, title)

        for defn in definitions:
            defined_term = defn.get('term')
            # Key definitions by singular form unless the term is a known
            # "special" singular noun (one that merely looks plural).
            if inf_engine.singular_noun(defined_term.lower()) and not \
                    defined_term.lower() in settings.SPECIAL_SINGULAR_NOUNS:
                key = inf_engine.singular_noun(defined_term.lower()) + \
                    ':' + label
            else:
                key = defined_term.lower() + ':' + label
            def_text = defn.text
            # NOTE(review): assumes def_text occurs in par_text;
            # positions[0] raises IndexError otherwise — confirm upstream
            # validation guarantees this.
            positions = find_all_occurrences(par_text, def_text)
            def_dict = OrderedDict()
            pos = positions[0]
            def_dict['position'] = [
                pos + total_offset, pos + len(def_text) + total_offset
            ]
            def_dict['reference'] = label
            def_dict['term'] = defined_term
            if def_dict['position'] != []:
                definitions_dict[key] = def_dict

    for paragraph in paragraphs:
        content = apply_formatting(paragraph.find('{eregs}content'))
        terms = content.findall('.//{eregs}ref[@reftype="term"]')
        title = paragraph.find('{eregs}title')
        marker = paragraph.get('marker') or ''
        label = paragraph.get('label')

        # If this is a subparagraph of a type that wants an intro paragraph
        # and this paragraph is intro text, set the paragraph's label to
        # reference the parent's.
        if wants_intro_text(
                paragraph.getparent()) and is_intro_text(paragraph):
            # This intro paragraph will get attached to its parent node by
            # build_reg_tree
            label = paragraph.getparent().get('label')

        if len(terms) > 0:
            terms_dict[label] = []

        total_offset = get_offset(paragraph, marker, title)

        term_positions = OrderedDict()
        term_targets = OrderedDict()

        for term in terms:
            # Accumulate the text preceding this <ref> to find its offset.
            running_par_text = content.text or ''
            for child in content.getchildren():
                if child != term:
                    tail = child.tail or ''
                    # FIX: guard against elements with no text
                    # (child.text is None), which raised a TypeError.
                    running_par_text += (child.text or '') + tail
                else:
                    break
            text = term.text
            target = term.get('target')
            defn_location = [
                key for key, defn in definitions_dict.items()
                if defn['reference'] == target
            ]
            if len(defn_location) > 0:
                defn_location = defn_location[0]
                term_position = len(running_par_text) + total_offset
                term_positions.setdefault(text, []).append(term_position)
                term_targets[text] = defn_location

        for term, positions in term_positions.items():
            target = term_targets[term]
            ref_dict = OrderedDict()
            ref_dict['offsets'] = []
            for pos in positions:
                ref_dict['offsets'].append([pos, pos + len(term)])
            ref_dict['ref'] = target
            if len(ref_dict['offsets']) > 0 and \
                    ref_dict not in terms_dict[label]:
                terms_dict[label].append(ref_dict)

    terms_dict['referenced'] = definitions_dict

    return terms_dict
def build_terms_layer(root):
    """
    Build the terms layer from the provided root of the XML tree.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of terms, suitable
        for direct transformation into JSON for use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`
    """
    definitions_dict = OrderedDict()
    terms_dict = OrderedDict()

    inf_engine = inflect.engine()
    # 'bonus' is not pluralized correctly by inflect's defaults.
    inf_engine.defnoun('bonus', 'bonuses')
    paragraphs = root.findall('.//{eregs}paragraph') + \
        root.findall('.//{eregs}interpParagraph')
    # FIX: removed a dead ``definitions = root.findall('.//{eregs}def')``
    # assignment here; it was always overwritten per-paragraph before use.

    paragraphs_with_defs = [
        par for par in paragraphs
        if par.find('{eregs}content') is not None
        and par.find('{eregs}content').find('{eregs}def') is not None
    ]

    for paragraph in paragraphs_with_defs:
        label = paragraph.get('label')
        marker = paragraph.get('marker') or ''
        content = apply_formatting(paragraph.find('{eregs}content'))
        # Offsets for definitions are relative to "<marker> <content>".
        par_text = (marker + ' ' + xml_node_text(content)).strip()
        definitions = content.findall('{eregs}def')

        for defn in definitions:
            defined_term = defn.get('term')
            # Key definitions by singular form unless the term is a known
            # "special" singular noun (one that merely looks plural).
            if inf_engine.singular_noun(defined_term.lower()) and not \
                    defined_term.lower() in settings.SPECIAL_SINGULAR_NOUNS:
                key = inf_engine.singular_noun(defined_term.lower()) + \
                    ':' + label
            else:
                key = defined_term.lower() + ':' + label
            def_text = defn.text
            # NOTE(review): assumes def_text occurs in par_text;
            # positions[0] raises IndexError otherwise — confirm upstream
            # validation guarantees this.
            positions = find_all_occurrences(par_text, def_text)
            def_dict = OrderedDict()
            pos = positions[0]
            def_dict['position'] = [pos, pos + len(def_text)]
            def_dict['reference'] = label
            def_dict['term'] = defined_term
            if def_dict['position'] != []:
                definitions_dict[key] = def_dict

    for paragraph in paragraphs:
        content = apply_formatting(paragraph.find('{eregs}content'))
        terms = content.findall('.//{eregs}ref[@reftype="term"]')
        title = paragraph.find('{eregs}title')
        marker = paragraph.get('marker') or ''
        label = paragraph.get('label')

        if wants_intro_text(
                paragraph.getparent()) and is_intro_text(paragraph):
            # This intro paragraph will get attached to its parent node by
            # build_reg_text
            label = paragraph.getparent().get('label')

        if len(terms) > 0:
            terms_dict[label] = []

        if marker != '' and paragraph.tag != '{eregs}interpParagraph':
            marker_offset = len(marker + ' ')
        else:
            marker_offset = 0

        # Keyterm offset.
        # Note: reg-site treats interp-paragraphs as "special" — they
        # don't get the keyterm text included, so we don't include an
        # offset here.
        if title is not None and title.get('type') == 'keyterm' and \
                paragraph.tag != '{eregs}interpParagraph':
            keyterm_offset = len(title.text)
        else:
            keyterm_offset = 0

        term_positions = OrderedDict()
        term_targets = OrderedDict()

        for term in terms:
            # Accumulate the text preceding this <ref> to find its offset.
            running_par_text = content.text or ''
            for child in content.getchildren():
                if child != term:
                    tail = child.tail or ''
                    # FIX: guard against elements with no text
                    # (child.text is None), which raised a TypeError.
                    running_par_text += (child.text or '') + tail
                else:
                    break
            text = term.text
            target = term.get('target')
            defn_location = [key for key, defn in definitions_dict.items()
                             if defn['reference'] == target]
            if len(defn_location) > 0:
                defn_location = defn_location[0]
                term_position = len(running_par_text) + \
                    marker_offset + keyterm_offset
                term_positions.setdefault(text, []).append(term_position)
                term_targets[text] = defn_location

        for term, positions in term_positions.items():
            target = term_targets[term]
            ref_dict = OrderedDict()
            ref_dict['offsets'] = []
            for pos in positions:
                ref_dict['offsets'].append([pos, pos + len(term)])
            ref_dict['ref'] = target
            if len(ref_dict['offsets']) > 0 and \
                    ref_dict not in terms_dict[label]:
                terms_dict[label].append(ref_dict)

    terms_dict['referenced'] = definitions_dict

    return terms_dict