def make_label_sortable(label, roman=False):
    """Convert a label into a tuple that can be used as a sort key.

    With ``roman`` left false the label is split by ``_component_re``
    (defined elsewhere in this module) and every all-digit piece is turned
    into an ``int``; the documented intent is that "45Ai33b" becomes
    (45, "A", "i", 33, "b") and that appendix labels such as 30(a) sort
    sensibly.

    With ``roman`` true the label is looked up among the first 50 values
    produced by ``roman_nums()`` and its 1-based position is returned as a
    one-element tuple. A label that is not among them raises ``ValueError``
    (from ``list.index``).
    """
    if not roman:
        return tuple(
            int(piece) if piece.isdigit() else piece
            for piece in _component_re.findall(label)
        )

    first_fifty = list(itertools.islice(roman_nums(), 0, 50))
    position = first_fifty.index(label)
    return (position + 1,)
def test_roman_nums(self):
    """Spot-check the first 3999 values yielded by utils.roman_nums()."""
    numerals = list(itertools.islice(utils.roman_nums(), 0, 3999))
    self.assertEqual(['i', 'ii', 'iii', 'iv', 'v'], numerals[:5])

    # (expected numeral, integer value); the arithmetic spells out how each
    # numeral decomposes. Values are 1-based, the list is 0-based.
    expectations = (
        ('xvii', 10 + 5 + 1 + 1),
        ('xlv', (50 - 10) + 5),
        ('dclv', 500 + 100 + 50 + 5),
        ('mcmxcvi', 1000 + (1000 - 100) + (100 - 10) + 5 + 1),
    )
    for numeral, value in expectations:
        self.assertEqual(numeral, numerals[value - 1])
def test_roman_nums(self):
    """Check selected entries among the first 3999 outputs of
    utils.roman_nums()."""
    generated = list(itertools.islice(utils.roman_nums(), 0, 3999))
    leading_five = generated[:5]
    self.assertEqual(['i', 'ii', 'iii', 'iv', 'v'], leading_five)

    def numeral_for(value):
        # The sequence starts at one, the list index at zero.
        return generated[value - 1]

    self.assertEqual('xvii', numeral_for(10 + 5 + 1 + 1))
    self.assertEqual('xlv', numeral_for((50 - 10) + 5))
    self.assertEqual('dclv', numeral_for(500 + 100 + 50 + 5))
    self.assertEqual(
        'mcmxcvi', numeral_for(1000 + (1000 - 100) + (100 - 10) + 5 + 1))
def make_label_sortable(label, roman=False):
    """Turn a label into a tuple usable as a sort key.

    * An all-digit label yields a one-element tuple holding its integer
      value.
    * With ``roman`` true, a label found among the first 50 values of
      ``roman_nums()`` yields its 1-based position as a one-element tuple.
    * Anything else is cut into runs of digits, upper-case letters and
      lower-case letters, e.g. 45Ai33b becomes (45, 'A', 'i', 33, 'b').
      Other characters (such as the parentheses in appendix labels like
      30(a)) only separate runs and are dropped.
    """
    if label.isdigit():
        return (int(label),)

    if roman:
        first_fifty = list(itertools.islice(roman_nums(), 0, 50))
        if label in first_fifty:
            return (first_fifty.index(label) + 1,)

    def kind_of(ch):
        # Classify one character; None marks a separator.
        if ch.isdigit():
            return 'int'
        if ch.isalpha():
            if ch == ch.upper():
                return 'upper'
            if ch == ch.lower():
                return 'lower'
        return None

    # Consecutive characters of the same kind form one piece; separator runs
    # are discarded.
    pieces = [
        "".join(run)
        for kind, run in itertools.groupby(label, key=kind_of)
        if kind is not None
    ]
    return tuple(int(piece) if piece.isdigit() else piece for piece in pieces)
def make_label_sortable(label, roman=False):
    """Build a sortable tuple from a label.

    A purely numeric label becomes ``(int(label), )``. When ``roman`` is
    true and the label is one of the first 50 values from ``roman_nums()``,
    its 1-based position is returned instead. Every other label is split
    into maximal runs of digits, upper-case letters or lower-case letters
    (e.g. 45Ai33b becomes (45, 'A', 'i', 33, 'b')); characters of any other
    sort, such as the parentheses in 30(a), end the current run and are
    themselves discarded.
    """
    if label.isdigit():
        return (int(label), )

    if roman:
        numerals = list(itertools.islice(roman_nums(), 0, 50))
        try:
            return (numerals.index(label) + 1, )
        except ValueError:
            # Not a known numeral: fall through to generic segmentation.
            pass

    DIGIT, CAPITAL, SMALL, BREAK = 'digit', 'capital', 'small', None
    parts = []
    current = []
    previous_kind = BREAK
    for char in label:
        if char.isdigit():
            kind = DIGIT
        elif not char.isalpha():
            kind = BREAK
        elif char == char.upper():
            kind = CAPITAL
        elif char == char.lower():
            kind = SMALL
        else:
            kind = BREAK

        # A change of kind closes whatever run has been collected so far.
        if current and kind != previous_kind:
            parts.append("".join(current))
            current = []

        previous_kind = kind
        if kind is not BREAK:
            current.append(char)

    if current:
        # The label did not end on a separator.
        parts.append("".join(current))

    return tuple(int(part) if part.isdigit() else part for part in parts)
import itertools
import re
import string
import sys

from regparser.tree import struct
from regparser.search import segments
from regparser.utils import roman_nums

# Candidate paragraph markers, one list per kind of marker:
#   0: lowercase letters 'a'..'z'
#   1: decimal strings '1'..'50'
#   2: the first 50 values produced by roman_nums()
#   3: uppercase letters 'A'..'Z'
#   4: lowercase letters wrapped in <E>...</E> tags
p_levels = [
    list(string.ascii_lowercase),
    [str(i) for i in range(1, 51)],
    list(itertools.islice(roman_nums(), 0, 50)),
    list(string.ascii_uppercase),
    ['<E>' + str(i) + '</E>' for i in string.ascii_lowercase]
    # Technically, there's italics (roman), but we aren't
    # handling that yet
]


class ParagraphParser():
    def __init__(self, p_regex, node_type):
        """Store the paragraph regex and node type on the instance.

        p_regex is the regular expression used when searching through
        paragraphs. It should contain a %s for the next paragraph 'part'
        (e.g. 'a', 'A', '1', 'i', etc.)

        node_type is kept as ``self.node_type``; it is not otherwise used
        in this method.

        NOTE(review): this docstring previously described an
        ``inner_label_fn`` argument ("a function which takes the current
        label, and the next paragraph 'part' and produces a new label"),
        but no such parameter exists in this signature -- confirm whether
        it was removed or lives elsewhere."""
        self.p_regex = p_regex
        self.node_type = node_type
# NOTE(review): the statement below is the tail of a definition that begins
# before this excerpt; it is left untouched.
return u'<E T="03">{}</E>'.format(marker_plain)


def deemphasize(marker):
    """Though the knowledge of emphasis is helpful for determining depth, it
    is _unhelpful_ in other scenarios, where we only care about the plain
    text. This function removes <E> tags.

    Only the exact opening tag ``<E T="03">`` and the closing tag ``</E>``
    are stripped; the rest of ``marker`` is returned unchanged."""
    return marker.replace('<E T="03">', '').replace('</E>', '')


# Single lowercase letters followed by doubled ones ('aa', 'bb', ...).
# 'ii' is deliberately left out -- presumably because it is also a roman
# numeral; TODO confirm.
lower = (tuple(string.ascii_lowercase) +
         tuple(a+a for a in string.ascii_lowercase if a != 'i'))
# Single uppercase letters followed by every doubled one ('AA'..'ZZ').
upper = (tuple(string.ascii_uppercase) +
         tuple(a+a for a in string.ascii_uppercase))
# Decimal strings '1' through '998' (the range end is exclusive).
ints = tuple(str(i) for i in range(1, 999))
# The first 50 values produced by roman_nums(), plus upper-cased copies.
roman = tuple(itertools.islice(roman_nums(), 0, 50))
upper_roman = tuple(r.upper() for r in roman)
# The same integer / roman markers passed through emphasize().
em_ints = tuple(emphasize(i) for i in ints)
em_roman = tuple(emphasize(i) for i in roman)

# Distinction between types of stars as it indicates how much space they can
# occupy
STARS_TAG = 'STARS'
INLINE_STARS = '* * *'
stars = (STARS_TAG, INLINE_STARS)

# Account for paragraphs without a marker at all
MARKERLESS = 'MARKERLESS'
markerless = (MARKERLESS,)
import itertools
import re
import string
import sys

from regparser.tree import struct
from regparser.search import segments
from regparser.utils import roman_nums

# Candidate paragraph markers, one list per level (index == level):
#   0: 'a'..'z'           1: '1'..'50'
#   2: first 50 romans    3: 'A'..'Z'
#   4: '1'..'50' wrapped in <E T="03">...</E>
#   5: first 50 romans wrapped in <E T="03">...</E>
p_levels = [
    list(string.ascii_lowercase),
    [str(i) for i in range(1, 51)],
    list(itertools.islice(roman_nums(), 0, 50)),
    list(string.ascii_uppercase),
    ['<E T="03">' + str(i) + '</E>' for i in range(1, 51)],
    ['<E T="03">' + i + '</E>'
     for i in itertools.islice(roman_nums(), 0, 50)]
]


def p_level_of(marker):
    """Return every level whose marker list contains ``marker``.

    The result is a list of indexes into ``p_levels`` in ascending order;
    it is empty when the marker appears in none of the lists and has more
    than one entry when the marker is ambiguous (it appears in several
    lists). Useful for determining the order of paragraphs."""
    return [
        depth
        for depth, candidates in enumerate(p_levels)
        if marker in candidates
    ]
# NOTE(review): the statement below is the tail of a definition that begins
# before this excerpt; it is left untouched.
return u'<E T="03">{}</E>'.format(marker_plain)


def deemphasize(marker):
    """Though the knowledge of emphasis is helpful for determining depth, it
    is _unhelpful_ in other scenarios, where we only care about the plain
    text. This function removes <E> tags.

    Only the exact opening tag ``<E T="03">`` and the closing tag ``</E>``
    are stripped; the rest of ``marker`` is returned unchanged."""
    return marker.replace('<E T="03">', '').replace('</E>', '')


# Single lowercase letters followed by doubled ones ('aa', 'bb', ...).
# 'ii' is deliberately left out -- presumably because it is also a roman
# numeral; TODO confirm.
lower = (tuple(string.ascii_lowercase) +
         tuple(a + a for a in string.ascii_lowercase if a != 'i'))
# Single uppercase letters followed by every doubled one ('AA'..'ZZ').
upper = (tuple(string.ascii_uppercase) +
         tuple(a + a for a in string.ascii_uppercase))
# Decimal strings '1' through '998' (the range end is exclusive).
ints = tuple(str(i) for i in range(1, 999))
# The first 50 values produced by roman_nums(), plus upper-cased copies.
roman = tuple(itertools.islice(roman_nums(), 0, 50))
upper_roman = tuple(r.upper() for r in roman)
# The same integer / roman markers passed through emphasize().
em_ints = tuple(emphasize(i) for i in ints)
em_roman = tuple(emphasize(i) for i in roman)

# Distinction between types of stars as it indicates how much space they can
# occupy
STARS_TAG = 'STARS'
INLINE_STARS = '* * *'
stars = (STARS_TAG, INLINE_STARS)

# Account for paragraphs without a marker at all
MARKERLESS = 'MARKERLESS'
markerless = (MARKERLESS, )

# NOTE(review): this list literal continues past the end of this excerpt.
types = [
import itertools
import re
import string

from regparser.tree import struct
from regparser.search import segments
from regparser.utils import roman_nums

# Candidate paragraph markers, one list per level (index == level):
#   0: 'a'..'z'           1: '1'..'50'
#   2: first 50 romans    3: 'A'..'Z'
#   4: '1'..'50' wrapped in <E T="03">...</E>
#   5: first 50 romans wrapped in <E T="03">...</E>
p_levels = [
    list(string.ascii_lowercase),
    [str(i) for i in range(1, 51)],
    list(itertools.islice(roman_nums(), 0, 50)),
    list(string.ascii_uppercase),
    ['<E T="03">' + str(i) + '</E>' for i in range(1, 51)],
    ['<E T="03">' + i + '</E>'
     for i in itertools.islice(roman_nums(), 0, 50)]
]


def p_level_of(marker):
    """Given a marker(string), determine the possible paragraph levels it
    could fall into. This is useful for determining the order of
    paragraphs.

    Returns a list of indexes into ``p_levels`` (ascending); the list is
    empty when the marker is in none of the level lists and holds several
    entries when the marker appears in more than one."""
    potential_levels = []
    for level, markers in enumerate(p_levels):
        if marker in markers:
            potential_levels.append(level)
    return potential_levels


# NOTE(review): the class below is cut off mid-docstring at the end of this
# excerpt; it is reproduced untouched.
class ParagraphParser():
    def __init__(self, p_regex, node_type):
        """p_regex is the regular expression used when searching through