Example #1
def __md_parse(xml_doc, text):
    """
     Return the DOM from multiline markdown text. Headers (#..., ##..., ###...) 
     are interpreted as nodes name and the following text as the content of
     these nodes.
     /!\ plusieurs noeuds de même nom au même niveau ???
  """
    lines = text.splitlines()
    context = xml_doc
    if LIB.sanitize(lines[0]) == "---":  # Markdown with yaml meta-data block
        yaml_block = ""
        for i in range(1, len(lines)):
            if LIB.sanitize(lines[i]) == "---": break
            yaml_block += lines[i] + "\n"
        meta_dict = yaml.safe_load(yaml_block)  #safe_load: no arbitrary object construction
        XML.add_dict_as_xml(context, meta_dict)
        raw_content_text = ""
        for j in range(i + 1, len(lines)):
            raw_content_text += lines[j] + "\n"
        node_raw = ET.SubElement(context, "raw_content")
        node_raw.text = str(raw_content_text)

    else:  # Structured Markdown
        cur_level = 0
        code_fence_on = False
        for i in range(len(lines)):
            line = lines[i]
            match_code_fence = re.match(r"^ *```", line)
            if match_code_fence: code_fence_on = not code_fence_on
            match_title = re.match(r"^ *(#+)(.+)", line)
            if match_title and not code_fence_on:
                new_level = len(match_title.group(1))
                node_txt = match_title.group(2)
                node_name = __extract_node_name(node_txt)
                attributes = __extract_attributes(node_txt)
                if new_level > cur_level + 1:
                    meta = XML.xpath_node(xml_doc, ".//_meta")
                    XML.add_error(meta, "__md_parse()", "fatal",
                                  "md_level_mismatch", "level mismatch", i,
                                  line)
                else:
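                    #pop back up until context is the parent for new_level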
                    for lev in range(new_level, cur_level + 1):
                        context = context.getparent()
                    context = ET.SubElement(context, node_name, attributes)
                    cur_level = new_level
            else:
                context.text = (context.text
                                if context.text else "") + line + "\n"

        node_raw = ET.SubElement(xml_doc, "raw_content")
        node_raw.text = text

    return xml_doc
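
A minimal, self-contained sketch of the header-nesting idea above, using plain
lxml and none of the LIB/XML helpers (names and simplifications here are
assumptions, not the module's actual API):

import re
from lxml import etree as ET

def md_headers_to_xml(text):
    """Toy __md_parse(): '#' depth drives nesting; assumes well-formed levels."""
    doc = ET.Element("document")
    context, cur_level = doc, 0
    for line in text.splitlines():
        match = re.match(r"^ *(#+) *(.+)", line)
        if match:
            new_level = len(match.group(1))
            #a header at the same or a shallower level pops back up the tree
            for _ in range(new_level, cur_level + 1):
                context = context.getparent()
            name = re.sub(r"\W+", "_", match.group(2).strip().lower())
            context = ET.SubElement(context, name)
            cur_level = new_level
        else:
            context.text = (context.text or "") + line + "\n"
    return doc

print(ET.tostring(md_headers_to_xml("# Title\nbody\n## Sub\nmore"),
                  pretty_print=True).decode())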
Example #2
def doc(root_dir, name):
    """
     Parse a Markdown or CSV document file and return DOM.
  """
    full_path = os.path.join(root_dir, name)
    with codecs.open(full_path, mode="r", encoding="utf-8") as input_file:
        raw_text = input_file.read()

    dir_name, file_name = os.path.split(name)  #don't shadow the builtins
    _, ext = os.path.splitext(name)
    ext = ext[1:].lower()
    xml_doc = ET.Element('document', {
        "path": name,
        "dir": dir_name,
        "file": file_name,
        "ext": ext
    })
    meta = ET.SubElement(xml_doc, "_meta")

    if ext == LIB.CSV_EXT:
        try:
            dom = __csv_parse(xml_doc, raw_text)
        except Exception:
            dom = __bin_parse(xml_doc, None)

    elif ext == LIB.MD_EXT:
        try:
            dom = __md_parse(xml_doc, raw_text)
        except Exception:  #fall back to binary on any parse failure
            dom = __bin_parse(xml_doc, None)

    else:  #anything else is parsed as binary
        dom = __bin_parse(xml_doc, None)

    #build @ref, @title, @icon, @cat attributes
    for root_node in XML.xpath_list(dom, "*[name][not(@is_template)]"):
        ref = __extract_ref(XML.xpath_plain(root_node, "name/text()"))
        if ref:
            title = LIB.sanitize(XML.xpath_plain(root_node, "title/text()"))
            icon = __extract_image(XML.xpath_plain(root_node, "text()"))
            cat = "CAT-" + root_node.tag
            root_node.set("ref", ref)
            root_node.set("title", title)
            root_node.set("icon", icon)
            root_node.set("cat", cat)
            #find all distinct ref patterns
            text = " ".join(root_node.xpath(".//text()"))
            ext_refs = set(re.findall(r'\[`([A-Z]{2,6}-[\w_]+)`\]', text))
            for ext_ref in ext_refs:
                if (ext_ref != ref):
                    XML.add_external_ref(meta, ext_ref, ref)

        else:
            XML.add_error(meta, "doc()", "warning", "ref_error",
                          "document reference syntax error", 0, ref)

    return dom
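
Hypothetical call, for illustration only (the directory and file name are
made up):

# dom = doc("/data/corpus", "guides/intro.md")
# dom.get("path"), dom.get("ext")  -> ("guides/intro.md", "md")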
Example #3
def __extract_node_name(txt):
    """
     Return txt with only lowercase and any non alphanumerical caracters replaced
     by '_'. Stop at first '@'.
  """
    txt = re.sub(r"@.*$", "", txt)
    txt = LIB.sanitize(txt).lower()
    txt = re.sub(r"\W+", "_", txt)
    txt = "_" + txt if re.match("^\d", txt) else txt
    return txt
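
Illustrative inputs/outputs, assuming LIB.sanitize() merely trims whitespace:

# __extract_node_name("My Node @id=x")  -> "my_node"
# __extract_node_name("3rd Section")    -> "_3rd_section"  (leading digit escaped)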
Example #4
def __extract_attributes(txt):
    """
     Return a dictionary of attributes from '@key=value' patterns in txt.
  """
    txt = re.sub(r"^[^@]*", "", txt).strip("@")
    groups = txt.split("@")
    groups = [LIB.sanitize(a).lower() for a in groups]
    key_vals = [(g.split("=") + [""])[0:2] for g in groups
                if __is_valid_name(g.split("=")[0])]
    attribs = dict(key_vals)
    return attribs
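
Illustrative result, assuming LIB.sanitize() trims whitespace and
__is_valid_name() accepts plain identifiers:

# __extract_attributes("Title @id=x1 @draft")
#   -> {"id": "x1", "draft": ""}   (a bare '@key' maps to an empty value)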
Example #5
#imports this excerpt needs (its original header is cut off above)
import os, random, re, tarfile, spacy
from shutil import rmtree
from conf import Corpora
from lib import sanitize

#collect the BNC tarballs; corpusdir and tmp are defined earlier, outside
#this excerpt
tarballs = [
    corpusdir + f for f in os.listdir(corpusdir) if f.endswith('.tar.gz')
]
for t in tarballs:
    with tarfile.open(t, 'r:gz') as tar:  #don't shadow the loop variable
        tar.extractall(path=tmp)

#list relevant txt files
reg = re.compile('^[A-Z]$')
corpusfiles = [tmp + f for f in os.listdir(tmp) if reg.match(f)]

#store all sentences in one list (already sanitized and lowercased)
corpus = []
for path in corpusfiles:
    with open(path, 'r') as f:
        for line in f:
            corpus.append(sanitize(line.lower()))

#delete temp subdir
rmtree(tmp)

## RANDOM-PICK LINES AND NONCES ##

#prepare vars
srcrange = range(len(corpus))  #no. of lines in BNC
outrange = range(Corpora.size)  #no. of desired lines in output
nums = []
nonces = []
out = []

#load pos-tagger (to avoid selecting punctuation as nonces)
nlp = spacy.load('en')
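
#sketch only: how the tagger can veto punctuation when drawing a nonce from a
#random line (the real selection loop is not part of this excerpt; names assumed)
line = corpus[random.choice(srcrange)]
words = [tok.text for tok in nlp(line) if tok.pos_ != 'PUNCT']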
Example #6
#!/usr/bin/env python3
"""Takes a dump of the 1st sentences of Wikipedia pages; then random-picks a set
no. of nonce :: context pairs therefrom, purges invalid entries, and formats
them so that they can be parsed by vectorize.py.
"""

import random, spacy
from conf import Corpora
from lib import getline_inf, sanitize

## PREPARE SOURCE CORPUS ##

with open(Corpora.inf.srcdir + 'wiki_src.txt', 'r') as f:
    src = f.readlines()  #store lines in list
src = [sanitize(line.lower()) for line in src]  #sanitize and lowercase

## FILTER OUT INVALID ENTRIES ##

#lambda: lines coming from disambiguation wiki pages
disamb = lambda x: any(phrase in x for phrase in (
    'may refer to', 'may stand for', 'can refer to', 'can stand for',
    'might refer to', 'might stand for', 'also refer to', 'also stand for'))

#lambda: empty lines (nonce without definition)
empty = lambda x: len(x) <= 1 or '::' in x[-5:]

#filter out
src = [line for line in src if not (disamb(line) or empty(line))]

## RANDOM-PICK LINES THEN OUTPUT ##
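
#the random-pick and output step is cut off above; one plausible sketch only,
#with the output filename an assumption and Corpora.size borrowed from the
#BNC script
picked = random.sample(src, Corpora.size)
with open(Corpora.inf.srcdir + 'wiki_picked.txt', 'w') as out:
    out.write('\n'.join(picked))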