Example #1
0
 def __init__(self,
              known_metadata,
              docid,
              format=ARTFLVector,
              parallel=ARTFLParallels,
              xpaths=None,
              metadata_xpaths=None,
              token_regex=Default_Token_Regex,
              non_nesting_tags=None,
              self_closing_tags=None,
              pseudo_empty_tags=None,
              output=None):
     """Set up parser state for a single document.

     Args:
         known_metadata: metadata already known for the document; stored
             as-is on the instance.
         docid: document identifier; stored and forwarded to
             ``OHCOVector.CompoundStack``.
         format: first argument handed to ``OHCOVector.CompoundStack``.
         parallel: second argument handed to ``OHCOVector.CompoundStack``.
         xpaths: stored as ``self.map``; any falsy value (``None`` or an
             empty container) falls back to ``TEI_XPaths``.
         metadata_xpaths: stored as ``self.metadata_paths``; any falsy
             value falls back to ``TEI_MetadataXPaths``.
         token_regex: tokenizing pattern; stored unchanged.
         non_nesting_tags: list stored on the instance; a fresh empty list
             is created when omitted.
         self_closing_tags: list stored on the instance; a fresh empty
             list is created when omitted.
         pseudo_empty_tags: list stored on the instance; a fresh empty
             list is created when omitted.
         output: forwarded to ``OHCOVector.CompoundStack``.
     """
     self.known_metadata = known_metadata
     self.docid = docid
     # The ingestor is built with this object as its target.
     self.i = shlaxtree.ShlaxIngestor(target=self)
     self.tree = None  # unnecessary?
     self.root = None
     self.stack = []
     self.map = xpaths or TEI_XPaths
     self.metadata_paths = metadata_xpaths or TEI_MetadataXPaths
     self.v = OHCOVector.CompoundStack(format, parallel, docid, output)
     # OHCOVector should take an output file handle.
     self.extractors = []
     self.file_position = 0
     self.token_regex = token_regex
     # The tag lists used to default to shared ``[]`` literals, so a
     # mutation through one instance leaked into every other instance
     # built with the defaults.  Each instance now gets its own list;
     # lists supplied by the caller are still stored by reference.
     if non_nesting_tags is None:
         non_nesting_tags = []
     if self_closing_tags is None:
         self_closing_tags = []
     if pseudo_empty_tags is None:
         pseudo_empty_tags = []
     self.non_nesting_tags = non_nesting_tags
     self.self_closing_tags = self_closing_tags
     self.pseudo_empty_tags = pseudo_empty_tags
     self.pushed_tags = {}
     self.depth_pushed = {}
    def __init__(self,
                 output,
                 docid,
                 filesize,
                 token_regex=r"(\w+)|([\.\?\!])",
                 xpaths=None,
                 metadata_xpaths=None,
                 suppress_tags=None,
                 pseudo_empty_tags=None,
                 known_metadata=None):
        """Set up parser state for a single document.

        Args:
            output: stored on the instance and forwarded to
                ``OHCOVector.CompoundStack``.
            docid: document identifier; stored and forwarded to
                ``OHCOVector.CompoundStack``.
            filesize: stored on the instance; not otherwise used here.
            token_regex: tokenizing pattern; stored unchanged.
            xpaths: sequence that is shallow-copied onto the instance;
                defaults to ``[("doc", "./")]``.
            metadata_xpaths: sequence that is shallow-copied onto the
                instance; defaults to an empty list.
            suppress_tags: stored as ``self.suppress_xpaths`` (note the
                attribute name differs from the parameter name); defaults
                to a fresh empty list.
            pseudo_empty_tags: stored on the instance; defaults to a
                fresh empty list.
            known_metadata: stored on the instance by reference; defaults
                to a fresh empty dict.
        """
        self.types = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
        self.parallel_type = "page"
        self.output = output
        self.docid = docid
        ## Initialize an OHCOVector Stack. Operations on this stack produce all parser output.
        self.v = OHCOVector.CompoundStack(self.types, self.parallel_type,
                                          docid, output)

        self.filesize = filesize

        self.token_regex = token_regex
        # Caller-supplied sequences are copied, exactly as before.
        self.xpaths = [("doc", "./")] if xpaths is None else xpaths[:]
        self.metadata_xpaths = [] if metadata_xpaths is None else metadata_xpaths[:]

        # These three used to default to shared ``[]`` / ``{}`` literals
        # that were stored directly, so every instance built with the
        # defaults aliased the same objects.  A fresh container is now
        # created per instance; caller-supplied objects are still stored
        # by reference.
        self.suppress_xpaths = [] if suppress_tags is None else suppress_tags
        self.pseudo_empty_tags = [] if pseudo_empty_tags is None else pseudo_empty_tags
        self.known_metadata = {} if known_metadata is None else known_metadata

        self.buffer_position = 0
        self.buffers = []
    def __init__(self,
                 output,
                 docid,
                 filesize,
                 token_regex=r"(\w+)|([\.\?\!])",
                 xpaths=None,
                 metadata_xpaths=None,
                 suppress_tags=None,
                 pseudo_empty_tags=None,
                 words_to_index=None,
                 known_metadata=None):
        """Set up parser state for a single document.

        Args:
            output: stored on the instance and forwarded to
                ``OHCOVector.CompoundStack``.
            docid: document identifier; stored and forwarded to
                ``OHCOVector.CompoundStack``.
            filesize: stored on the instance; not otherwise used here.
            token_regex: tokenizing pattern; stored unchanged.
            xpaths: sequence that is shallow-copied onto the instance;
                defaults to ``[("doc", "./")]``.
            metadata_xpaths: sequence that is shallow-copied onto the
                instance; defaults to an empty list.
            suppress_tags: stored as ``self.suppress_xpaths`` (note the
                attribute name differs from the parameter name); defaults
                to a fresh empty list.
            pseudo_empty_tags: stored on the instance; defaults to a
                fresh empty list.
            words_to_index: accepted for interface compatibility but not
                read anywhere in this constructor.
            known_metadata: stored on the instance by reference; defaults
                to a fresh empty dict.
        """
        self.types = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
        self.parallel_type = "page"
        self.output = output
        self.docid = docid
        self.filesize = filesize
        self.v = OHCOVector.CompoundStack(self.types, self.parallel_type,
                                          docid, output)

        self.token_regex = token_regex
        # Caller-supplied sequences are copied, exactly as before.
        self.xpaths = [("doc", "./")] if xpaths is None else xpaths[:]
        self.metadata_xpaths = [] if metadata_xpaths is None else metadata_xpaths[:]

        # These three used to default to shared ``[]`` / ``{}`` literals
        # that were stored directly, so every instance built with the
        # defaults aliased the same objects.  A fresh container is now
        # created per instance; caller-supplied objects are still stored
        # by reference.
        self.suppress_xpaths = [] if suppress_tags is None else suppress_tags
        self.pseudo_empty_tags = [] if pseudo_empty_tags is None else pseudo_empty_tags
        self.known_metadata = {} if known_metadata is None else known_metadata

        self.stack = []
        self.root = None
        self.handlers = {}
        self.buffer_position = 0
        self.buffers = []