def loadTf(self):
    """Load the ``:clone`` checkout of the configured Text-Fabric app into ``self.A``.

    The app is addressed as ``{org}/{repo}:clone``; ``org``, ``repo``,
    ``backend`` and ``data["version"]`` are all read from ``self.C``.
    """
    config = self.C
    self.A = use(
        f"{config.org}/{config.repo}:clone",
        checkout="clone",
        backend=config.backend,
        version=config.data["version"],
    )
def __init__(self, vocab_json, tf_app='bhsa', set_data=None, session_data=None,
             resume_time=False, term_n=0, pause_times=None):
    """Prepare a study session: load the set data, load Text-Fabric, report deck stats.

    Args:
        vocab_json: path-like object of the set file; must support ``.stem``
            and be accepted by ``open``.
        tf_app: stored on the instance as ``self.tf_app``. The app that is
            actually loaded comes from ``set_data['app_data']['app']``.
        set_data: already-loaded set data; when falsy it is read from
            ``vocab_json`` as JSON.
        session_data: an existing session object; when ``None`` a new
            ``Session`` is built from the set data.
        resume_time: when truthy, printed as the time the session resumes from.
        term_n: stored on the instance as ``self.term_n``.
        pause_times: list stored as ``self.pause_times``. ``None`` (the
            default) gives every instance its own fresh empty list.

    Raises:
        Exception: when ``self.check_end_cycle`` returns a falsy value; the
            set data is saved through ``self.save_file`` first.
    """
    # set meta data for study loop (for saves)
    self.session_data = session_data
    self.set_data = set_data
    self.term_n = term_n
    # A literal `[]` default would be one list shared by every instance that
    # relies on the default, so create a new list per instance instead.
    self.pause_times = [] if pause_times is None else pause_times
    self.tf_app = tf_app
    self.fstem = vocab_json.stem  # for save names

    # load set data
    if not set_data:
        with open(vocab_json, encoding='utf8') as setfile:
            set_data = json.load(setfile)
        self.set_data = set_data

    # retrieve TF app data
    appdata = set_data['app_data']
    app = appdata['app']
    datversion = appdata['version']
    self.glossfeat = appdata['gloss_feature']
    self.freqfeat = appdata['freq_feature']
    self.wordtype = appdata['wordtype']
    self.context = appdata['context']

    # load the app
    print('preparing TF...')
    self.TF = use(app, version=datversion, silent=True)
    self.F, self.T, self.L = self.TF.api.F, self.TF.api.T, self.TF.api.L

    # prepare for run, check cycle length
    run = self.check_end_cycle(set_data)
    if not run:
        self.save_file(set_data, vocab_json)
        raise Exception('EXIT PROGRAM INITIATED; FILE SHUFFLED AND SAVED')

    # build the study set, prep data for study session
    if session_data is None:
        self.session_data = Session(set_data)  # build session data
    self.vocab_json = vocab_json

    if resume_time:
        print(f'\nSession is resumed from {resume_time}.\n')

    # preliminary session report
    deck_stats = self.session_data.deck_stats
    print(set_data['name'], 'ready for study.')
    print(f"this is session {set_data['cycle_data']['total_sessions']+1}:")
    for score, stat in deck_stats.items():
        print(f'score {score}: {stat} terms')
    print(f'total: {sum(deck_stats.values())}')
def foreground(self, app, refresh=False):
    """Make ``app`` the foreground app in the hoist namespace ``self.hoist``.

    An app whose slot in ``self.apps`` is still ``None`` is loaded with
    ``use`` (from its ``:clone`` spec unless ``app`` already contains a
    ``/``) and cached. An already-loaded app is either reloaded via
    ``reuse`` (``refresh=True``) or only has its API re-exposed through
    ``makeAvailableIn``. In every case the app ends up bound to
    ``hoist["A"]``.
    """
    loaded = self.apps[app]
    namespace = self.hoist

    if loaded is None:
        spec = app if '/' in app else f"{app}:clone"
        loaded = use(spec, checkout="clone", silent="deep", hoist=namespace)
        self.apps[app] = loaded
    elif refresh:
        loaded.reuse(hoist=namespace)
    else:
        loaded.api.makeAvailableIn(namespace)

    namespace["A"] = loaded
def executeApp(self):
    """Load the ``:clone`` app named by ``self.app``, make its sets, run its queries.

    Progress messages go through ``info``; ``indent`` sets their nesting
    level (1 for the BEGIN/END lines, 2 for the steps in between).
    """
    app = self.app

    indent(level=1, reset=True)
    nQueries = len(self.queryLists[app])
    info(f'BEGIN testing {app} with {nQueries} queries')

    indent(level=2, reset=True)
    info(f'loading {app}')
    self.A = use(f'{app}:clone', checkout='clone', silent=True)

    info(f'making sets for {app}')
    self.makeSets()

    info(f'running queries for {app}')
    self.runQueries()

    indent(level=2)
    info('all queries run')

    indent(level=1)
    info(f'END testing {app}')
from tf.app import use

# Load the `banks` app with the `clone` checkout specifier.
A = use('banks:clone', checkout='clone')
T = A.api.T
# The return value is not stored; presumably this ran as a notebook cell whose
# output was displayed — confirm.
T.headingFromNode(100)
#
# From within Text-Fabric, we can ask for this ranking, by means of
#
# * `C.levels.data`: inspecting the precomputed data
# * `F.otype.all`: the resulting node types
# * `N.otypeRank`: the resulting ranking
#
# We load the BHSA and Uruk
# ([here](https://annotation.github.io/text-fabric/tf/about/corpora.html) is more info on these corpora)
# and have a look at their node type ranking.

As = {}

for corpus in ("bhsa", "uruk"):
    print(f"Loading {corpus} ...")
    As[corpus] = use(f"{corpus}:clone", silent="deep")
    As[corpus].info("done")

# We have loaded both datasets.
#
# We want to be able to put them into the foreground, i.e. make it so that the
# global variables `A N F E L T S C TF` become bound to the
# foreground data set. We write a function for that.


def foreground(corpus, hoist):
    """Bind the loaded app for `corpus` into the namespace `hoist`.

    Stores the app under `hoist["A"]`, lets its `TF` object expose itself in
    `hoist` via `makeAvailableIn`, then shows the app's "corpus" context.
    """
    app = As[corpus]
    hoist["A"] = app
    app.TF.makeAvailableIn(hoist)
    app.showContext("corpus")
# # So, go off to a terminal and give the command # # ```text-fabric peshitta:latest --checkout=latest``` # # This fetches the latest version of the Peshitta app and data. # # After that, you can just say # # ```text-fabric peshitta``` # # until you got word that a new version of app and/or data has become available. from tf.app import use A = use("peshitta", hoist=globals()) # ## string `JBW L` in the text # Assuming `JBW` is a single word and L is a single word: query = """ word word_etcbc=JBW <: word word_etcbc=L """ results = A.search(query) # That does not help. At least one of the assumptions leads to nowhere. # At this point it might help to use the TF browser to conduct some experiments on the side line. #
from tf.app import use

# Search template; judging by the names `wFirst`, `wLast` and `wGap` it
# presumably looks for phrases that contain a gap — confirm.
# NOTE(review): the template appears on a single line here; Text-Fabric search
# templates are line- and indentation-sensitive, so confirm the intended line
# structure against the original before running this.
query = ''' p:phrase =: wFirst:word wLast:word := wGap:word wFirst < wGap wLast > wGap p || wGap v:verse v [[ wFirst v [[ wGap '''

# Load the BHSA app with the `clone` checkout specifier and run the template.
A = use('bhsa:clone', checkout='clone')
results = A.search(query)
print(passage) print(passage[0]) raise IndexError('Try using the right kind of book names bro') return book_to_index[passage[0]] * 1000000 + int(passage[1]) * 1000 + int( passage[2]) sqlFile = sys.argv[1] jsonFile = sys.argv[2] conn = sqlite3.connect(sqlFile) c = conn.cursor() # OLD: Remove checkout=local if you haven't updated the data files in a while # Remove ":latest" to fix the rate limit thing A = use('bhsa', hoist=globals(), checkout='local') # def nullifyNaAndEmptyAndUnknown(list_to_reduce): # templist = list_to_reduce # keys_to_remove = set() # for key, value in templist.items(): # if value == "NA" or value == "" or value == "unknown": # keys_to_remove.add(key) # for key in keys_to_remove: # templist[key] = None # return templist def normify(word): return normalize('NFC', word)
from utils import prs_set def do(task): result = task md = f'''commit | release | local | base | subdir --- | --- | --- | --- | --- `{task[0]}` | `{task[1]}` | `{task[2]}` | `{task[3]}` | `{task[4]}` ''' display(Markdown(md)) A = use('bhsa:latest', version='2017', mod=('cmerwich/bh-reference-system/tf'), hoist=globals(), silent=True) def compute_text(my_book_name, from_chapter, to_chapter): results = [] highlights = {} my_chapters = set(range(from_chapter, to_chapter + 1)) for book in F.otype.s('book'): book_name = T.bookName(book) for chn in L.d(book, 'chapter'):
import pandas as pd from operator import attrgetter from tf.app import use from tf.fabric import Fabric VERSION = 'c' A = use('bhsa', version=VERSION, hoist=globals(), silent=True) TF.load('g_prs', add=True) def PrintCoref(Corefs): ''' Visualises the coreference classes that MiMi has detected. `Corefs` is a list of coreference sets and singleton sets. The coreference sets contain mentions, that are stored in the class `Mention`. ''' i = 0 classes = [] print('verse', 'id', 'mention', 'txttyp', '§', 'p', 'g', 'n', 'func', 'type',
from tf.app import use

# Load the BHSA app.
# NOTE(review): `lgc` and `check` look like options from an older Text-Fabric
# `use` signature — confirm that the installed version still accepts them.
A = use('bhsa', lgc=False, check=True)
# If the data is there, it will be auto-downloaded and stored on your machine. # # Let's do it. # %load_ext autoreload # %autoreload 2 # + import collections import os from tf.app import use # - A = use("oldbabylonian:clone", checkout="clone", hoist=globals()) # A = use('oldbabylonian', hoist=globals()) # # Making data # # We illustrate the data creation part by creating a new feature, `ummama`. # The idea is that we mark every sign reading that occurs between `um-ma` and `ma` some where in the first 3 lines of a face. # We want to mark every occurrence of such signs elsewhere in the corpus with `ummama=1`. # # We only do it if the sign between the `um-ma` and `ma` (which must be on the same line) is not missing, damaged, or questionable. # # The easiest way to get started is to run a query: query = """ line ln<4 =: sign reading=um missing# damage# question#
from collections import defaultdict import pandas as pd from tf.app import use A = use( 'bhsa', version='2017', hoist=globals(), silent=True ) column_names=('ann_A','ann_B', 'L', 'M', 'R', 'D', 'd') data_types={'ann_A': str, 'ann_B': str ,'L': int, 'M': int, 'R': int, 'D': int, 'd': float} def MakeTable(iaa_file): iaa_table = pd.read_table(iaa_file, delim_whitespace=True, names=column_names, dtype=data_types ) return iaa_table def ExportToLatex(output_loc, file_name, data_frame, indx = True): with open(f'{output_loc}{file_name}.tex','w') as texf: texf.write(data_frame.to_latex(index=indx)) def CountVersesWords(): text_list = ['Psalms 138', 'Psalms 88', 'Psalms 11', 'Psalms 129', 'Psalms 70', 'Psalms 32', 'Psalms 20', 'Psalms 17', 'Psalms 101', 'Psalms 67',
# We load the BHSA and display the example in question. # - # %load_ext autoreload # %autoreload 2 # + # pip3 install beautifulsoup4 from bs4 import BeautifulSoup as bs from tf.app import use from tf.advanced.helpers import dh # - A = use("bhsa", hoist=globals()) from ipywidgets import Text, Layout, Box, HBox, VBox, Label, HTML, Button v1 = T.nodeFromSection(("Genesis", 1, 1)) A.pretty(v1, standardFeatures=True, fmt="text-phono-full") # What we want is a display like this, but with the glosses (`in` `beginning` `create` etc) editable. # Also all values after `pdp=` should be editable. And the information in the labels with clause and phrase as well # (`xQtX`, `PP`, `Time`) etc. If you hover over them, you see they are values of features `typ`, `rela` and `function`. # # The task is to rebuild this from the # [layout widgets of ipywidgets](https://ipywidgets.readthedocs.io/en/7.6.3/examples/Widget%20Styling.html), # such as Box, HBox, VBox, HTML. # # We start with something simpler, the first phrase (`in beginning`), without the passage reference.
self.report('processing conjunction pairs...') self.conj = Conjunction(tf, **base_sets) self.report('\tdone') self.report('processing construct pairs...') self.cons = Construct(tf, **base_sets) self.report('\tdone') def report(self, mssg): if not self.silent: print(mssg) # set up TF print('Setting up Text-Fabric...') A = use('bhsa', hoist=globals(), silent=True) print('\tdone...') print('\n-- RUNNING WORDSETS --\n') wsets = WordSets(A, silent=False) print('\n-- WSETS COMPLETE --') print('\npickleing word sets...') export = { 'noms': wsets.noms, 'preps': wsets.preps, 'quants': wsets.quants, 'accent_type': wsets.accents.accenttype, 'mwords': wsets.accents.mwords, 'conj_pairs': wsets.conj.pairs, 'cons_pairs': wsets.cons.pairs,
def getDataFromDir():
    """Load all TF data found in ``dataDir`` and wrap it in the app from ``appFolder``.

    Returns:
        The app object returned by ``use`` for the loaded API.
    """
    fabric = Fabric(locations=dataDir, modules=[""])
    loadedApi = fabric.loadAll()
    return use(appFolder, api=loadedApi)
import os
from sys import exit, stderr
from collections import defaultdict, Counter
from glob import glob
from pprint import pprint
from operator import itemgetter, attrgetter

import pandas as pd

from tf.app import use
from tf.fabric import Fabric

from utils import *

# Load BHSA version 2017 together with the cloned coreference module.
A = use(
    'bhsa',
    version='2017',
    mod=('cmerwich/participant-analysis/coreference/tf:clone'),
    hoist=globals(),
    silent=True,
)


class ValueData:
    """Fields unpacked from one five-element record (``quintuple``)."""

    def __init__(self, quintuple):
        """Store the record's fields.

        Positions: 0 -> ``ct``; 1 -> ``seqNum`` (converted with ``int``);
        2 -> ``size`` (``int``, or 1 when the field is the empty string);
        3 -> ``isSuffix`` (true when the field equals ``'s'``);
        4 -> ``wordPart``.
        """
        self.ct = quintuple[0]
        self.seqNum = int(quintuple[1])
        self.isSuffix = quintuple[3] == 's'
        self.wordPart = quintuple[4]
        rawSize = quintuple[2]
        self.size = 1 if rawSize == '' else int(rawSize)
# ### In notebooks
#
# This notebook is an example of how you can work with the new data.
#
# ## Using sets in queries
#
# You can use the names of sets in all places where you currently use `word`, `sign`, `face`, etc.
# More info in the [docs](https://annotation.github.io/text-fabric/tf/about/searchusage.html).

from tf.app import use
from tf.lib import readSets

A = use(
    "oldbabylonian:clone",
    version="1.0.4",
    checkout="clone",
    hoist=globals(),
    mod="annotation/tutorials/oldbabylonian/cookbook/pos/tf:clone",
)
# A = use('oldbabylonian', hoist=globals(), mod='annotation/tutorials/oldbabylonian/cookbook/pos/tf')

# Note that the features `pos` and `subpos` and friends are loaded now.
#
# Let's print the frequency lists of their values.
# First a convenience function to print the frequency list of an arbitrary feature.


def freqList(feat):
    """Print every value of feature `feat` with its frequency, one per line."""
    for value, count in Fs(feat).freqList():
        print(f"{value:<12}: {count:>5} x")
# # Cluster display in Old Babylonian # # We show some details of the display logic by following an example: cluster nodes in the Old Babylonian corpus. # # Clusters are difficult, because # # * they do not necessarily respect proper embedding # * material can be part of several clusters # # We show how we deal with the second part and prevent multiple display of members of multiple clusters. # As an illustration, we'll show the effect of an earlier bug and indicate the fix. # # We start with loading the corpus. A = use("oldbabylonian:clone", checkout="clone", hoist=globals()) A.reuse() # # An example line # # Here is a line with some nested clusters. # In fact, it is the first line of the corpus. # # The node number is stored in the variable `ln`. # # We show the raw ATF source of the line, and the text according to several text formats. ln = F.otype.s("line")[0] ln
def main():
    """Writes features to corpora folder.

    Loads the BHSA app (hoisting its API into this module's globals), walks
    book > chapter > verse > clause > phrase > word, collects one row per
    word, and writes the table to ``<MAIN_DIR>/corpora/main_corpus.csv``.

    The ``chapter``, ``verse`` and ``clause`` columns hold the 0-based
    position of the node within its parent (the ``enumerate`` index).
    The ``domain`` column is the last character of each row's ``type``.
    """
    use('bhsa', hoist=globals())

    column_names = (
        'book_idx', 'book', 'chapter', 'verse', 'clause', 'word', 'lexeme',
        'word_pos', 'verbal_stem', 'word_number', 'verbal_tense',
        'clause_type', 'phrase_function', 'language', 'type',
    )
    data = {name: [] for name in column_names}

    book_nodes = F.otype.s('book')
    all_books = {
        T.bookName(node).lower(): position
        for position, node in enumerate(book_nodes)
    }

    for book_name, book_idx in all_books.items():
        logger.info(f"Extracting {book_name}...")
        book_node = book_nodes[book_idx]
        for chapter_pos, chapter in enumerate(L.d(book_node, 'chapter')):
            for verse_pos, verse in enumerate(L.d(chapter, 'verse')):
                for clause_pos, clause in enumerate(L.d(verse, 'clause')):
                    for phrase in L.d(clause, 'phrase'):
                        for word in L.d(phrase, 'word'):
                            row_dict = {
                                'book_idx': book_idx,
                                'book': book_name,
                                'chapter': chapter_pos,
                                'verse': verse_pos,
                                'clause': clause_pos,
                                'word': T.text(word).strip(),
                                'lexeme': F.lex_utf8.v(word),
                                'word_pos': F.sp.v(word),
                                'verbal_stem': F.vs.v(word),
                                'word_number': F.nu.v(word),
                                'verbal_tense': F.vt.v(word),
                                'clause_type': F.typ.v(clause),
                                'phrase_function': F.function.v(phrase),
                                'language': F.language.v(word),
                                'type': F.txt.v(clause),
                            }
                            data = _append_to_main_dict(data, row_dict)

    data_df = pd.DataFrame(data)
    data_df['domain'] = [typ[-1] for typ in data_df['type']]
    out_path = os.path.join(MAIN_DIR, 'corpora', 'main_corpus.csv')
    data_df.to_csv(out_path, index=False)
import re, collections, csv from collections import defaultdict import pandas as pd from anytree import Node, RenderTree, findall, findall_by_attr, find_by_attr from tf.app import use A = use('bhsa', hoist=globals(), mod='ch-jensen/participants/actor/tf', silent=True) class GenerateNodes: def __init__(self, book, chapter): self.book = book self.chapter = chapter def nodeList(self): ''' Generates a node list consisting of all phrase atom-,suphrase- and word-nodes of given book and chapter. ''' chapter_node = T.nodeFromSection((self.book, self.chapter)) phrase_atom_list = L.d(chapter_node, 'phrase_atom') node_list = [] for n in phrase_atom_list: node_list.append(n) for subph in L.d(n, 'subphrase'): node_list.append(subph) for w in L.d(n, 'word'): node_list.append(w) return node_list def actorLabel(self, n, t='string'):
from tf.app import use # We do not only load the main corpus data, but also the additional *sim* (similarity) feature that is in a # module. # # For the very last version, use `hot`. # # For the latest release, use `latest`. # # If you have cloned the repos (TF app and data), use `clone`. # # If you do not want/need to upgrade, leave out the checkout specifiers. A = use( "annotation/banks:hot", mod="annotation/banks/sim/tf", hoist=globals(), ) # # Use the similarity edge feature # # We print all similar pairs of words that are at least 50% similar but not 100%. query = """ word <sim>50> word """ results = A.search(query) A.table(results, end=10, withPassage="1 2")
'JOB': 'Iob', 'PRO': 'Proverbia', 'RUT': 'Ruth', 'SNG': 'Canticum', 'ECC': 'Ecclesiastes', 'LAM': 'Threni', 'EST': 'Esther', 'DAN': 'Daniel', 'EZR': 'Esra', 'NEH': 'Nehemia', '1CH': 'Chronica I', '2CH': 'Chronica II' } #num_book_dict = {'01': 'GEN', '02': 'EXO', '03': 'LEV', '04': 'NUM', '05': 'DEU', '06': 'JOS', '07': 'JDG', '08': 'RUT', '09': '1SA', '10': '2SA', '11': '1KI', '12': '2KI', '13': '1CH', '14': '2CH', '15': 'EZR', '16': 'NEH', '17': 'EST', '18': 'JOB', '19': 'PSA', '20': 'PRO', '21': 'ECC', '22': 'SNG', '23': 'ISA', '24': 'JER', '25': 'LAM', '26': 'EZK', '27': 'DAN', '28': 'HOS', '29': 'JOL', '30': 'AMO', '31': 'OBA', '32': 'JON', '33': 'MIC', '34': 'NAM', '35': 'HAB', '36': 'ZEP', '37': 'HAG', '38': 'ZEC', '39': 'MAL'} A = use('bhsa', hoist=globals()) for i, b in enumerate(F.otype.s('book')): book = T.bookName(b) if "_" in book: book = book.replace("_", " ").replace("1", "I").replace("2", "II") book_ptx_abrev = bhsa_book_list_with_ptx_ids[i] book_num = book_num_dict[book_ptx_abrev] book_heb = ptx_ids_bhs_names[book_ptx_abrev] book_latin = ptx_ids_latin_names[book_ptx_abrev] #if "Chron" in book: # print(book) # Write file filename = "SFMs\\" + book_num + book_ptx_abrev + "BHSA" + ".sfm" full_filename = path + filename with codecs.open(full_filename, "w", "utf-8") as file:
import pprint as pp from tf.app import use import numpy as np import random A = use('bhsa:hot', hoist=globals()) # NAIVE BAYES FUNCTIONS START def gen_verse(book, chapter, verse, verbose=0, lex_format="Hebrew"): """ returns a list of lexemes of a selected verse :param lex_format: format of lexemes. "Hebrew" for Hebrew :param book: Book name as string :param chapter: number of the chapter :param verse: number of the verse :param verbose: 1 for printing 0 for not printing the verse :return: returns a list of lexemes from the selected verse """ indices = L.d(T.nodeFromSection((book, chapter, verse)), 'word') if lex_format == "Hebrew": verse_by_lexemes = [F.lex_utf8.v(word_idx) for word_idx in indices] else: verse_by_lexemes = [F.lex.v(word_idx) for word_idx in indices] if verbose == 1: print(verse_by_lexemes) return verse_by_lexemes, book def gen_book_vocab(books, chapters=None, verses=None): """
import os
from sys import exit, stderr
from collections import defaultdict, Counter
from glob import glob
from pprint import pprint
from operator import itemgetter, attrgetter

import pandas as pd

from tf.app import use
from tf.fabric import Fabric

from utils import converse_pgn, suffix_dict

# Load BHSA version 2017 together with two extra modules; the two adjacent
# string literals concatenate into one comma-separated `mod` value.
A = use(
    'bhsa',
    version='2017',
    mod=('cmerwich/participant-analysis/coreference/tf,'
         'cmerwich/bh-reference-system/tf'),
    hoist=globals(),
    silent=True,
)


class ValueData:
    """Fields unpacked from one five-element record (``quintuple``)."""

    def __init__(self, quintuple):
        """Store the record's fields.

        Positions: 0 -> ``ct``; 1 -> ``seqNum`` (converted with ``int``);
        2 -> ``size`` (``int``, or 1 when the field is the empty string);
        3 -> ``isSuffix`` (true when the field equals ``'s'``);
        4 -> ``wordPart``.
        """
        self.ct = quintuple[0]
        self.seqNum = int(quintuple[1])
        self.isSuffix = quintuple[3] == 's'
        self.wordPart = quintuple[4]
        rawSize = quintuple[2]
        self.size = 1 if rawSize == '' else int(rawSize)
def loadCorpus():
    """Load the BHSA app with ``silent="deep"`` and return it."""
    return use("bhsa", silent="deep")