コード例 #1
0
ファイル: Parser.py プロジェクト: waltms/libphilo
 def __init__(self,
              known_metadata,
              docid,
              format=ARTFLVector,
              parallel=ARTFLParallels,
              xpaths=None,
              metadata_xpaths=None,
              token_regex=Default_Token_Regex,
              non_nesting_tags=None,
              self_closing_tags=None,
              pseudo_empty_tags=None,
              output=None):
     """Initialise the parser state for a single document.

     Arguments:
       known_metadata -- stored unchanged on the instance.
       docid -- stored on the instance and forwarded to
           OHCOVector.CompoundStack.
       format, parallel -- forwarded to OHCOVector.CompoundStack.
       xpaths -- object/element path mapping; TEI_XPaths is used when this
           is None (or otherwise falsy).
       metadata_xpaths -- metadata path mapping; TEI_MetadataXPaths is used
           when this is None (or otherwise falsy).
       token_regex -- stored unchanged on the instance.
       non_nesting_tags, self_closing_tags, pseudo_empty_tags -- tag
           collections stored on the instance. When omitted (None), each
           instance gets its own fresh empty list.
       output -- forwarded to OHCOVector.CompoundStack.
     """
     self.known_metadata = known_metadata
     self.docid = docid
     # The ingestor is created with this object as its target.
     self.i = shlaxtree.ShlaxIngestor(target=self)
     self.tree = None  # unnecessary?
     self.root = None
     self.stack = []
     self.map = xpaths or TEI_XPaths
     self.metadata_paths = metadata_xpaths or TEI_MetadataXPaths
     self.v = OHCOVector.CompoundStack(format, parallel, docid, output)
     # OHCOVector should take an output file handle.
     self.extractors = []
     self.file_position = 0
     self.token_regex = token_regex
     # None sentinels replace the former `[]` defaults: a literal list in the
     # signature is created once and would be shared (and mutated) across
     # every instance constructed without these arguments. Caller-supplied
     # collections are stored as-is, exactly as before.
     self.non_nesting_tags = (
         non_nesting_tags if non_nesting_tags is not None else [])
     self.self_closing_tags = (
         self_closing_tags if self_closing_tags is not None else [])
     self.pseudo_empty_tags = (
         pseudo_empty_tags if pseudo_empty_tags is not None else [])
     self.pushed_tags = {}
     self.depth_pushed = {}
コード例 #2
0
def parse(text):
    """Parse a markup fragment and return the ingestor's close() result.

    The text is fed through a ShlaxIngestor -> LXMLTreeDriver ->
    FragmentParser chain. If that raises ValueError, the text is re-parsed
    with lxml's HTML parser, serialised back to XML, cleaned up, and fed
    through a fresh chain.
    """

    def run_chain(markup):
        # Build a brand-new parser/driver/ingestor chain for each attempt.
        fragment_parser = FragmentParser()
        tree_driver = LXMLTreeDriver(target=fragment_parser)
        ingestor = st.ShlaxIngestor(target=tree_driver)
        ingestor.feed(markup)
        return ingestor.close()

    try:
        return run_chain(text)
    except ValueError:
        # lxml's HTML parser is more flexible, so let it parse the text first
        # and hand its XML serialisation to the fragment parser.
        html_parser = etree.HTMLParser()
        decoded = text.decode('utf8', 'ignore')
        html_tree = etree.fromstring(decoded, parser=html_parser)
        repaired = etree.tostring(html_tree, method="xml")
        # Drop the html/body wrapper strings and restore the camel-cased
        # highlight tag name.
        repaired = repaired.replace("<html><body>", '')
        repaired = repaired.replace("</body></html>", '')
        repaired = repaired.replace("philohighlight", "philoHighlight")
        return run_chain(repaired)
コード例 #3
0
def strip_tags(text):
    """Feed `text` to a ShlaxIngestor whose target is a FragmentStripper.

    Returns whatever the ingestor's close() returns.
    """
    ingestor = st.ShlaxIngestor(target=FragmentStripper())
    ingestor.feed(text)
    stripped = ingestor.close()
    return stripped
コード例 #4
0
def parse(text):
    """Feed `text` through ShlaxIngestor -> LXMLTreeDriver -> FragmentParser.

    Returns whatever the ingestor's close() returns.
    """
    tree_driver = LXMLTreeDriver(target=FragmentParser())
    ingestor = st.ShlaxIngestor(target=tree_driver)
    ingestor.feed(text)
    parsed = ingestor.close()
    return parsed
コード例 #5
0
ファイル: TagCensus.py プロジェクト: brown-ccv/PhiloLogic4
 def parse(self, text):
     """Reset the tag table, feed `text` to an ingestor, then close.

     A ShlaxIngestor is created with this object as its target; after the
     text has been fed, this object's own close() is called. Nothing is
     returned.
     """
     # Start from an empty tag table on every call.
     self.tags = dict()
     ingestor = st.ShlaxIngestor(target=self)
     ingestor.feed(text)
     # Note: close() is invoked on this object, not on the ingestor.
     self.close()
コード例 #6
0
ファイル: tagcensus.py プロジェクト: pleonard212/PhiloLogic4
        if census.tags[tag]["start"] != census.tags[tag]["end"]:
            status += "*"
        if census.tags[tag]["malformed"]:
            status += "X"
        print "%s\t%s\t%d\t%d\t%d\t%d" % (
            status, tag, census.tags[tag]["start"], census.tags[tag]["end"],
            census.tags[tag]["empty"], census.tags[tag]["malformed"])


if __name__ == "__main__":
    file_count = 0
    total = None
    for fn in sys.argv[1:]:
        file_count += 1
        census = TagCensus()
        parser = st.ShlaxIngestor(target=census)
        parser.feed(open(fn).read())
        print fn
        print_census(census)
        if total:
            for tag in census.tags.keys():
                if tag not in total.tags:
                    total.tags[tag] = {
                        "start": 0,
                        "end": 0,
                        "empty": 0,
                        "malformed": 0
                    }
                total.tags[tag]["start"] += census.tags[tag]["start"]
                total.tags[tag]["end"] += census.tags[tag]["end"]
                total.tags[tag]["empty"] += census.tags[tag]["empty"]