def jsonify():
    """ Controller for the file. Match entries to properties and write files. """
    # Check for prerequisite files
    if not os.path.exists('thesaurus'):
        print('Thesaurus not found. Generating now.')
        os.system('python thesaurus.py')

    # Check for folders to write to
    for folder in ['jsons', 'properties']:
        if not os.path.exists(folder):
            os.mkdir(folder)

    manuscript = BnF(apply_corrections=False)
    df_dict = read_csvs()  # read in thesaurus
    df_dict = read_manuscript(manuscript, df_dict)  # match entries to terms
    write_files(df_dict)  # write jsons and csvs
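# Minimal usage sketch (not in the original file), assuming jsonify() is the
# script's entry point and that read_csvs, read_manuscript, and write_files
# are defined elsewhere in this module.
if __name__ == '__main__':
    jsonify()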
def create_thesaurus():
    """
    Creates directory 'thesaurus' containing a .csv file for each property.
    Each .csv has three columns: count, verbatim_term, and prefLabel_en.
    count is the number of occurrences of the verbatim term in the manuscript.
    verbatim_term is a term of the given property. prefLabel_en is the
    normalized form of the term.

    Normalization entails the following steps:
    1. Remove white space, punctuation, or other undesired marks
    2. Lowercase all terms
    3. Singularize all terms
    4. If the term consists of multiple words, find its semantic head. If the
       head is also a term of the same property, the preferred label becomes
       the semantic head.
    """
    manuscript = BnF(apply_corrections=False)

    # Create directory 'thesaurus' if one does not exist
    if not os.path.exists(m_k_data_to_thesaurus):
        os.mkdir(m_k_data_to_thesaurus)

    for prop in tqdm(properties):
        # get dataframes of count, verbatim terms
        simple_df, complex_df = get_prop_dfs(manuscript, prop)

        # create the prefLabel_en column by lemmatizing terms to lower case,
        # singular, and stripped of white space
        simple_df['prefLabel_en'] = simple_df.verbatim_term.apply(
            lambda x: singularize(re.sub(r"’|'", '', x)))
        complex_df['prefLabel_en'] = complex_df.verbatim_term.apply(
            lambda x: x.replace('\'', '').lower().strip())

        # reduce complex terms to their semantic heads
        complex_df = simplify_terms(simple_df, complex_df)
        complex_df['prefLabel_en'] = complex_df.prefLabel_en.apply(
            lambda x: inflection.singularize(x))

        df = pd.concat([simple_df, complex_df])  # merge dataframes
        df.to_csv(f'{m_k_data_to_thesaurus}/{prop}.csv', index=False)  # write dataframe to a file
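# Illustrative sketch (not part of the original file) of the normalization
# steps 1-3 described in the docstring above, using the same `re` and
# `inflection` calls as create_thesaurus. The helper name and sample terms
# are hypothetical.
def _normalize_demo(term):
    stripped = re.sub(r"’|'", '', term).lower().strip()  # steps 1-2: strip marks, lowercase
    return inflection.singularize(stripped)              # step 3: singularize

# e.g. _normalize_demo("Colors ") -> 'color'
# e.g. _normalize_demo("brushes") -> 'brush'
# Step 4 (reducing a multi-word term to its semantic head) is handled by
# simplify_terms above and is not reproduced here.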
import os

from digital_manuscript import BnF

manuscript = BnF(apply_corrections=False)
versions = ['tc', 'tcn', 'tl']


def test_balanced():
    perfect = True
    for entry in manuscript.entries:
        if not all(entry.balanced[v] for v in versions):
            print(f'{entry.identity} is not balanced.')
            perfect = False
    if perfect:
        print('All entries are balanced')


def test_properties():
    for entry in manuscript.entries:
        for prop in entry.properties:
            tc_len = len(entry.properties[prop]['tc'])
            tcn_len = len(entry.properties[prop]['tcn'])
            tl_len = len(entry.properties[prop]['tl'])
            if not tc_len == tcn_len == tl_len:
                print(f'{entry.identity}, {prop} -- tc: {tc_len}, tcn: {tcn_len}, tl: {tl_len}')
                print(entry.properties[prop]['tc'])
                print(entry.properties[prop]['tcn'])
                print(entry.properties[prop]['tl'])
                print()


def test_matched():
    perfect = True
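# Minimal runner sketch (not in the original file): these checks print
# diagnostics rather than assert, so they can simply be called in sequence
# when the module is executed directly. test_matched is truncated above and
# is omitted here.
if __name__ == '__main__':
    test_balanced()
    test_properties()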
# Standard Modules
import os

# Third-Party Modules
import pandas as pd

# Local Modules
from digital_manuscript import BnF  # TODO: make sure you can import digital_manuscript

# set constants
versions = ['tc', 'tcn', 'tl']

# TODO: Adjust path to whatever it is on your machine
MANUSCRIPT_PATH = os.getcwd() + '/../m-k-manuscript-data'
CATEGORIES_PATH = os.getcwd()

# initialize variables
manuscript = BnF()
df = pd.read_csv(f'{CATEGORIES_PATH}/sizes.csv').fillna('')
size_dict = {}  # identity: category

# store contents of sizes.csv in size_dict
for i, row in df.iterrows():
    identity, XPath_Location, folio, link, size_suggestion, notes = row
    if size_suggestion:
        size_dict[identity] = size_suggestion

# read and write files
for version in versions:
    dir_path = f'{MANUSCRIPT_PATH}/ms-xml/{version}/'
    for r, d, f in os.walk(dir_path):
        for filename in f:  # iterate through /ms-xml/ folder
            data = None
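            # Hedged sketch (not in the original file): the loop body is
            # truncated above, so only the generic read step implied by the
            # "read and write files" comment is shown here. The downstream
            # transform and write logic is elided in the source and is not
            # reconstructed.
            with open(os.path.join(r, filename), 'r', encoding='utf-8') as fp:
                data = fp.read()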