def test_find_referent(self):
    """referent() resolves reference attributes from a tier/item to their target."""
    words_tier = xc3[0][0]
    first_word = words_tier[0]
    # With no argument, referent() follows whatever reference attribute exists.
    assert xp.find(xc3, '//tier[@type="words"]/referent()') == words_tier
    assert xp.find(xc3, '//tier[@type="words"]/referent("alignment")') is None
    assert xp.find(xc3, '//tier[@type="words"]/referent("segmentation")') == words_tier
    assert xp.find(xc3, '//item[../@type="words"]/referent()') == first_word
    assert xp.findall(xc3, '//item[../@type="words"]/referent()') == [first_word] * 3
    assert xp.findall(xc3, '//item[../@type="words"]/referent("alignment")') == []
    assert xp.findall(xc3, '//item[../@type="words"]/referent("segmentation")') == [first_word] * 3
def test_find_referrer(self):
    """referrer() finds the tiers/items that point back at the matched node."""
    igt = xc3[0]
    # because "alignment" comes before "segmentation"
    assert xp.find(xc3, '//tier[@type="phrases"]/referrer()') == igt[5]
    assert xp.findall(xc3, '//tier[@type="phrases"]/referrer()') == [igt[5], igt[1]]
    assert xp.find(xc3, '//tier[@type="phrases"]/referrer("segmentation")') == igt[1]
    assert xp.find(xc3, '//tier[@type="phrases"]/referrer("alignment")') == igt[5]
    assert xp.find(xc3, '//item[../@type="phrases"]/referrer()') == igt[5][0]
    assert xp.findall(xc3, '//item[../@type="phrases"]/referrer()') == [
        igt[5][0], igt[1][0], igt[1][1], igt[1][2],
    ]
    assert xp.findall(xc3, '//item[../@type="phrases"]/referrer("alignment")') == [igt[5][0]]
    assert xp.findall(xc3, '//item[../@type="words"]/referrer("segmentation")') == [
        igt[2][i] for i in range(6)
    ]
def remove_language_name(items, igt):
    """Strip leading/trailing language-name tokens from ODIN line items.

    Builds a set of candidate language tokens (ISO codes, the full name,
    its uppercase form, a first-letter abbreviation for multiword names,
    and a 3-letter prefix), then removes a matching token from the start
    or end of each non-meta item's text. Each removed token is preserved
    as a new 'M+LN'-tagged Item placed after the item it came from.

    :param items: list of Items to clean (item.text is modified in place)
    :param igt: the Igt the items belong to; queried for language code/name
    :return: new list of items, with extracted 'M+LN' meta items inserted
    """
    new_items = []
    lgcode = xp.find(igt, LANG_CODE_PATH)
    lgname = xp.find(igt, LANG_NAME_PATH)
    lgtoks = []
    if lgcode and '?' not in lgcode and '*' not in lgcode:
        codes = set(lgcode.split(':'))  # split up complex codes
        codes.update(map(str.upper, list(codes)))
        codes.update(map(str.lower, list(codes)))
        lgtoks.extend(codes)
    if lgname and '?' not in lgname:
        lgtoks.append(lgname)
        lgtoks.append(lgname.upper())
        if re.search('[- ]', lgname, re.U):
            # abbreviation for multiword names
            # BUG FIX: re.U was passed positionally to re.split, where the
            # third parameter is maxsplit, not flags.
            lgtoks.append(''.join(ln[0] for ln in re.split(r'[- ]+', lgname, flags=re.U)))
        if re.search(r'^\w{3}', lgname, re.U):
            lgtoks.append(lgname[:3])
    if lgtoks:
        sig = '|'.join(re.escape(t) for t in lgtoks)
        start_lg_re = re.compile(r'^\s*[(\[]?({})[)\]]?'.format(sig), re.U)
        end_lg_re = re.compile(r'[(\[]?({})[)\]]?\s*$'.format(sig), re.U)
        for item in items:
            new_items.append(item)  # add now; might be modified later
            tags = get_tags(item)
            if tags[0] != 'M':
                orig = item.text
                m = start_lg_re.match(item.text)
                if m:
                    meta_item = Item(id=item.id,
                                     text=m.group(0).strip(),
                                     attributes=dict(item.attributes))
                    meta_item.attributes['tag'] = 'M+LN'
                    new_items.append(meta_item)
                    item.text = start_lg_re.sub(whitespace, item.text)
                m = end_lg_re.search(item.text)
                if m:
                    meta_item = Item(id=item.id,
                                     text=m.group(0).strip(),
                                     attributes=dict(item.attributes))
                    meta_item.attributes['tag'] = 'M+LN'
                    # BUG FIX: this appended to `items` (the list being
                    # iterated), mutating it mid-loop and leaving the meta
                    # item out of the returned list; the start-match branch
                    # above correctly appends to new_items.
                    new_items.append(meta_item)
                    item.text = end_lg_re.sub(whitespace, item.text).rstrip()
                # if the text changed, the line no longer carries the LN tag
                if 'LN' in tags and item.text != orig:
                    tags.remove('LN')
                    item.attributes['tag'] = '+'.join(tags)
    else:
        new_items = items
    return new_items
def get_igts(self, corpus_id, ids=None, matches=None):
    """Decode and return the IGTs of *corpus_id*, optionally filtered.

    *matches* is a disjunction: an IGT is kept when at least one of the
    given XigtPath expressions finds something in it.
    """
    decoded = [xigtjson.decode_igt(raw)
               for raw in self._read_igts(corpus_id, ids=ids)]
    if matches is None:
        return decoded
    return [igt for igt in decoded
            if any(xp.find(igt, expr) is not None for expr in matches)]
def make_sortkey(sortkeys):
    """Build a sort-key function over XigtPath expressions.

    Digit runs in each found value are compared numerically ("item2"
    sorts before "item10"); other runs compare as strings.
    """
    def _coerce(token):
        # int where possible (for numeric comparison), otherwise the string
        try:
            return int(token)
        except ValueError:
            return token

    def key(obj):
        parts = []
        for path in sortkeys:
            value = xp.find(obj, path) or ''
            parts.extend(_coerce(tok) for tok in re.split(r'(\d+)', value))
        return parts

    return key
def test_find_simple_path(self):
    """Plain element paths descend one level per step."""
    igt = xc1[0]
    assert xp.find(xc1, '/igt') == igt
    assert xp.find(xc1, '/igt/tier') == igt[0]
    assert xp.find(xc1, '/igt/tier/item') == igt[0][0]
    assert xp.find(xc1, 'igt/tier/item') == igt[0][0]
    # relative paths only match from the given node downward
    assert xp.find(xc1, 'tier/item') is None
    assert xp.find(igt, 'tier/item') == igt[0][0]
def test_find_node(self):
    """A bare element name matches only direct children of the start node."""
    igt = xc1[0]
    tier = igt[0]
    assert xp.find(xc1, 'igt') == igt
    assert xp.find(xc1, 'tier') is None
    assert xp.find(igt, 'tier') == tier
    assert xp.find(xc1, 'item') is None
    assert xp.find(igt, 'item') is None
    assert xp.find(tier, 'item') == tier[0]
def test_find_metadata(self):
    """Paths reach metadata, meta children, namespaced elements, @attrs, text()."""
    metadata = xc1m[0].metadata[0]
    meta = metadata[0]
    assert xp.find(xc1m, 'igt/metadata') == metadata
    assert xp.findall(xc1m, 'igt/metadata') == [metadata]
    assert xp.find(xc1m, 'igt/metadata/meta') == meta
    assert xp.findall(xc1m, 'igt/metadata/meta') == [meta]
    # '*' matches any child element
    assert xp.find(xc1m, 'igt/metadata/meta/*') == meta[0]
    assert xp.findall(xc1m, 'igt/metadata/meta/*') == [meta[0], meta[1]]
    # namespace-qualified names and '//' descent
    assert xp.find(xc1m, 'igt/metadata/meta/dc:subject') == meta[0]
    assert xp.find(xc1m, 'igt/metadata//dc:subject') == meta[0]
    # attribute and text selectors
    assert xp.find(xc1m, 'igt/metadata/meta/dc:subject/@olac:code') == 'jpn'
    assert xp.find(xc1m, 'igt/metadata/meta/dc:subject/text()') == 'Japanese'
    assert xp.findall(xc1m, 'igt/metadata/meta/dc:*/@olac:code') == ['jpn', 'eng']
def test_predicate(self):
    """Bracketed predicates filter on attributes, parents, and value()."""
    igt = xc1[0]
    assert xp.find(xc1, '//tier[@type="phrases"]') == igt[0]
    assert xp.find(xc1, '//tier[@type="translations"]') == igt[1]
    assert xp.find(xc1, '//tier[@type="phrases"]/item') == igt[0][0]
    # '..' inside a predicate tests the parent's attributes
    assert xp.find(xc1, '//item[../@type="translations"]') == igt[1][0]
    # predicates can be chained
    assert xp.find(xc3, '//item[../@type="glosses"][value()="NOM"]') == xc3[0][3][1]
def test_find_descendants(self):
    """'//' matches any descendant; an absolute '//' searches from the root.

    NOTE(review): this method was named find_descendants (no test_ prefix),
    so pytest never collected or ran it; renamed so it actually executes.
    """
    assert xp.find(xc1, '//item') == xc1[0][0][0]
    assert xp.find(xc1[0], './/item') == xc1[0][0][0]
    assert xp.find(xc1[0][1], './/item') == xc1[0][1][0]
    # a bare '//' ignores the start node and searches the whole corpus
    assert xp.find(xc1[0][1], '//item') == xc1[0][0][0]
    assert xp.find(xc1m, '//meta') == xc1m[0].metadata[0][0]
def test_find_relative(self):
    """'.' selects the current node and '..' its parent."""
    igt = xc1[0]
    assert xp.find(xc1, '.') == xc1
    assert xp.find(igt, '.') == igt
    assert xp.find(igt, '..') == xc1
    # steps compose: parent, then self
    assert xp.find(igt, '../.') == xc1
def test_find_root(self):
    """'/.' resolves to the corpus root regardless of the start node."""
    for start in (xc1, xc1[0], xc1[0][0]):
        assert xp.find(start, '/.') == xc1
def wordlist(filelist, gloss=None, meta=None):
    r"""
    This function takes a list of Xigt-XML ODIN files, looks for the
    'normalized' ODIN tier, and grabs the contents of all gloss and meta
    lines.

    It tokenizes simply by matching all word characters (using regex's
    `\w` escape) so as to pull out hyphenated and dotted gloss line
    tokens. The output is written as a wordlist reverse sorted by count.

    :param filelist: List of input files to process.
    :type filelist: list[str]
    :param gloss: Path to use for the output gloss wordlist.
    :type gloss: str
    :param meta: Path to use for the output meta wordlist.
    :type meta: str
    """
    gloss_words = defaultdict(int)
    meta_words = defaultdict(int)

    # Count the \w+ tokens of each line's value. Matching \w+ (rather than
    # splitting on whitespace) pulls hyphenated and dotted tokens apart.
    # FIX: the token pattern is now a raw string; '[\w]+' relied on a
    # deprecated invalid escape in a plain string literal.
    def _count_tokens(lines, words):
        for line in lines:
            value = line.value()
            if value:
                for token in re.findall(r'\w+', value):
                    words[token.lower()] += 1  # lowercase, and add

    # -------------------------------------------
    # Iterate over all the paths in the list of files.
    # -------------------------------------------
    for path in filelist:
        with open(path, 'r', encoding='utf-8') as f:
            # Load the XigtCorpus, using the transient mode (most memory
            # efficient), then iterate over each `Igt` instance.
            xc = xigtxml.load(f, mode='transient')
            for igt in xc:
                # Use a xigtpath expression to find the `tier` child with
                # state="normalized" as an attribute.
                norm_tier = xigtpath.find(igt, './tier[@state="normalized"]')
                # Since the `tag` attribute can be G+CR or M+AC etc., grab
                # all lines whose tag starts with the desired letter.
                gloss_lines = [item for item in norm_tier
                               if item.attributes['tag'].startswith("G")]
                meta_lines = [item for item in norm_tier
                              if item.attributes['tag'].startswith("M")]
                # Update the counts.
                _count_tokens(gloss_lines, gloss_words)
                _count_tokens(meta_lines, meta_words)

    # Write out a wordlist, reverse sorted by frequency of the word
    # (ties broken by the word itself), with tab-delimited columns.
    # FIX: the file is now opened via `with`, so it is closed even if a
    # write raises.
    def _write_items(words, out_path):
        if out_path:
            with open(out_path, 'w', encoding='utf-8') as out:
                items = sorted(words.items(),
                               key=lambda x: (x[1], x[0]),
                               reverse=True)
                for w, count in items:
                    out.write('{}\t{}\n'.format(w, count))

    _write_items(gloss_words, gloss)
    _write_items(meta_words, meta)
def index(fn, by, idx):
    """Record, per file, the positions of IGTs keyed by the *by* XigtPath.

    For each Igt in *fn*, the value found at *by* becomes the index key and
    the Igt's position in the corpus is added to idx[key][fn].
    """
    corpus = xigtxml.load(fn, mode='transient')
    for position, igt in enumerate(corpus):
        key = xp.find(igt, by)
        idx[key][fn].add(position)
def test_text(self):
    """text() returns the item's raw text content."""
    found = xp.find(xc1, '//item/text()')
    assert found == 'inu=ga san-biki hoe-ru'
def test_value(self):
    """value() returns the item's (possibly dereferenced) value."""
    found = xp.find(xc3, '//tier[@type="words"]/item/value()')
    assert found == 'inu=ga'
def test_disjunction(self):
    """Parenthesized '|' alternatives match either branch, in document order."""
    phrases = xc1[0][0]
    translations = xc1[0][1]
    absolute = '(/igt/tier[@type="phrases"] | /igt/tier[@type="translations"])'
    relative = 'igt/(tier[@type="phrases"] | tier[@type="translations"])'
    assert xp.find(xc1, absolute) == phrases
    assert xp.findall(xc1, absolute) == [phrases, translations]
    # a disjunction may also appear mid-path
    assert xp.find(xc1, relative) == phrases
    assert xp.findall(xc1, relative) == [phrases, translations]
    assert xp.findall(xc1, relative + '/item') == [phrases[0], translations[0]]