コード例 #1
0
def file_to_ja(structure, infile, expressions, cleaner, grab_all=False):
    """
    First stage of a reusable parsing tool: read a text file line by line and store the lines
    in a JaggedArray laid out according to the section headings found in the file.

    :param structure: A nested list one level shallower than the final result. Example: for a
    depth 2 text, structure should be [[]].
    :param infile: Iterable of lines (e.g. an open text file) to read from.
    :param expressions: A list of regular expressions, one per section level (chapter, etc.),
    ordered from the outermost level inwards. Do not include an expression for splitting the
    actual text.
    :param cleaner: A function that takes the list of lines collected for one section and returns
    the array to store for it. Should also strip any unnecessary tagging data.
    :param grab_all: If true, a heading line that closes a stored section is kept as the first
    line of the section that follows it.
    :return: The populated JaggedArray.
    :raises AttributeError: if the number of expressions is not exactly depth - 1.
    """

    # build the container first; its reported depth drives the validation below
    ja = jagged_array.JaggedArray(structure)
    depth = 1 if structure == [] else ja.get_depth()

    # every level except the lowest must have its own regex
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. Need {} expressions, '
                             'received {}'.format(depth - 1, len(expressions)))

    regexes = [re.compile(ex) for ex in expressions]
    # one running index per level; -1 means "this level has not been seen yet"
    indices = [-1] * len(expressions)
    pending = []

    for line in infile:
        # outermost level whose pattern matches this line, or None for ordinary text
        level = next((pos for pos, pattern in enumerate(regexes)
                      if pattern.search(line)), None)
        # text is only collected once every level has been reached at least once
        started = -1 not in indices

        if level is None:
            if started:
                pending.append(line)
            continue

        if started:
            # a heading closes the section collected so far
            ja.set_element(indices, cleaner(pending))
            pending = [line] if grab_all else []

        # advance the level that was hit and restart every deeper level at 0
        indices[level] += 1
        indices[level + 1:] = [0] * (len(indices) - level - 1)

    # store whatever was collected after the last heading
    ja.set_element(indices, cleaner(pending))

    return ja
コード例 #2
0
 def test_ja_normalize(self):
     """
     normalize() should bring a ragged array to a uniform nesting: bare strings get wrapped in
     lists, empty strings become empty lists, and already-empty lists are left alone.
     """
     ragged = ["a", [], ["", "a", ["c"]], ["", ""], ["b"]]
     expected = [[["a"]], [], [[], ["a"], ["c"]], [[], []], [["b"]]]
     wrapped = ja.JaggedArray(ragged)
     # the result is read back from the same object after the call
     wrapped.normalize()
     assert wrapped.array() == expected
コード例 #3
0
def file_to_ja_g(depth, infile, expressions, cleaner, gimatria=False, group_name='gim', grab_all=None):
    """
    Like file_to_ja, but section indices can be taken from gematria found in the heading lines.
    Designed to be the first stage of a reusable parsing tool. Adds lines of text to the Jagged
    Array in the desired structure (Chapter, verse, etc.)

    :param depth: Depth of the JaggedArray.
    :param infile: Iterable of lines (e.g. an open text file) to read from.
    :param expressions: A list of regular expressions with which to identify section (chapter)
    levels, ordered from the outermost level inwards. Do not include an expression with which to
    break up the segment level.
    :param cleaner: A function that takes a list of strings and returns an array with the text
    parsed correctly. Should also break up and remove unnecessary tagging data.
    :param gimatria: If true, the regex group named by group_name is passed to getGematria and the
    result (minus one) becomes the index of the section. A result of 0 falls back to simply
    incrementing the index.
    :param group_name: Name of the regex group that captures the gematria letters.
    :param grab_all: One boolean per expression: if True the heading line matched by that
    expression is kept as the first line of the new section, if False it is discarded. A single
    boolean applies to every level. Defaults to False for every level.
    :return: A jagged_array with the text properly structured.
    :raises AttributeError: if the number of expressions is not exactly depth - 1.
    """

    # ensure there is a regex for every level except the lowest; checked before anything is
    # built so that bad input fails fast
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. Need {} expressions, '
                             'received {}'.format(depth - 1, len(expressions)))

    # one flag per expression, built per call rather than shared through a mutable default
    if grab_all is None:
        grab_all = [False] * len(expressions)
    elif isinstance(grab_all, bool):
        grab_all = [grab_all] * len(expressions)

    # instantiate ja: an empty list nested depth - 1 times
    structure = []
    for _ in range(depth - 1):
        structure = [structure]
    ja = jagged_array.JaggedArray(structure)

    # compile regexes, instantiate index list (-1 means the level has not been reached yet)
    regexes, indices = [re.compile(ex) for ex in expressions], [-1] * len(expressions)
    temp = []

    # loop through file
    for line in infile:

        # check for matches to the regexes
        for i, reg in enumerate(regexes):
            found = reg.search(line)
            if found:

                # a heading closes the section collected so far, provided every level was reached
                if indices.count(-1) == 0:
                    ja.set_element(indices, cleaner(temp), [])
                    temp = []
                if grab_all[i]:
                    temp.append(line)

                # move the index of the level that was hit
                if gimatria:
                    gimt = getGematria(found.group(group_name))
                    if gimt != 0:
                        # gematria is 1-based, indices are 0-based
                        indices[i] = gimt - 1
                    else:
                        indices[i] += 1
                else:
                    indices[i] += 1

                # deeper levels go back to "not reached", so text is ignored until each of
                # them has been matched again
                indices[i + 1:] = [-1 if x >= 0 else x for x in indices[i + 1:]]
                break

        else:
            # ordinary text line: keep it only once every level has been reached
            if indices.count(-1) == 0:
                temp.append(line)
    else:
        # store whatever was collected after the last heading
        ja.set_element(indices, cleaner(temp), [])

    return ja
コード例 #4
0
# Put the directory held in `p` at the front of the import path so the `sources` package below
# can be found (`p` is presumably computed above this excerpt -- TODO confirm).
sys.path.insert(0, p)

from sources.local_settings import *

# SEFARIA_PROJECT_PATH presumably comes from the star import above -- verify. The path and the
# Django settings variable are set before the Sefaria model import that follows.
sys.path.insert(0, SEFARIA_PROJECT_PATH)
os.environ['DJANGO_SETTINGS_MODULE'] = "sefaria.settings"
from sefaria.model import *

from sources.functions import numToHeb, getGematria, post_index, post_text
from sefaria.datatype import jagged_array
from data_utilities.util import ja_to_xml, traverse_ja

# Python 2 only: reload(sys) brings back sys.setdefaultencoding (removed from the module at
# interpreter start-up) so the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

# Module-level JaggedArray created from a three-level empty skeleton.
simanim_ja = jagged_array.JaggedArray([[[]]
                                       ])  #JA of Simanim[Seifim[comments]]]


def soupAndOpen(filename):
    """Read the whole file at ``filename`` and return its contents wrapped in a BeautifulSoup object."""
    with open(filename, "r") as handle:
        markup = handle.read()
    # the text has been read in full, so parsing does not need the handle to stay open
    return BeautifulSoup(markup)


def is_titled_seif(tag):
    """Return True when ``tag`` has a ``title`` attribute whose value contains the word "סעיף"."""
    if not tag.has_attr('title'):
        return False
    return u"סעיף" in tag['title']


def getSeifNumber(txt):
    assert u"סעיף" in txt
    seif_number_he = txt.split(' ')[1]