Example #1
def jsonify():
    """ Controller for the file. Match entries to properties and write files. """

    # Check for prerequisite files
    if not os.path.exists('thesaurus'):
        print('Thesaurus not found. Generating now.')
        os.system('python thesaurus.py')

    # Check for folders to write to
    for folder in ['jsons', 'properties']:
        if not os.path.exists(folder):
            os.mkdir(folder)

    manuscript = BnF(apply_corrections=False)
    df_dict = read_csvs()  # read in thesaurus
    df_dict = read_manuscript(manuscript, df_dict)  # match entries to terms
    write_files(df_dict)  # write jsons and csvs
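
The helpers called above (read_csvs, read_manuscript, write_files) are not shown in this excerpt. As a rough illustration only, a read_csvs-style helper might load each per-property .csv produced by create_thesaurus() (Example #2) into a dict of DataFrames; the project's actual implementation may differ.

# Hypothetical sketch of a read_csvs-style helper, assuming the 'thesaurus'
# folder holds one .csv per property as written in Example #2.
import os
import pandas as pd

def read_csvs_sketch(thesaurus_dir='thesaurus'):
    df_dict = {}
    for filename in os.listdir(thesaurus_dir):
        if filename.endswith('.csv'):
            prop = filename[:-len('.csv')]  # property name without the extension
            df_dict[prop] = pd.read_csv(os.path.join(thesaurus_dir, filename))
    return df_dict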
Example #2
def create_thesaurus():
    """ 
  Creates directory 'thesaurus' containing a .csv file for each property. Each .csv has three columns, count,
  verbatim_term, and prefLabel_en. Count is the number of occurrences of the verbatim term in the manuscript.
  verbatim_term is an term of the given property. prefLabel_en is the normalized form of the term.

  Normalization entails the following steps:
  1. Remove white space, punctuation, or other undesired marks
  2. Lowercase all terms
  3. Singularize all terms
  4. If the term consists of multiple words, find its semantic head. If the head is also a term of the same property,
  the preferred label becomes the semantic head.
  """
    manuscript = BnF(apply_corrections=False)

    # Create directory 'thesaurus' if one does not exist
    if not os.path.exists(m_k_data_to_thesaurus):
        os.mkdir(m_k_data_to_thesaurus)

    for prop in tqdm(properties):
        simple_df, complex_df = get_prop_dfs(
            manuscript, prop)  # get dataframe of count, verbatim terms

        # create the prefLabel_en column by lemmatizing terms to lower case, singular, and stripped of white space
        simple_df['prefLabel_en'] = simple_df.verbatim_term.apply(
            lambda x: singularize(re.sub(r"’|'", '', x)))
        complex_df['prefLabel_en'] = complex_df.verbatim_term.apply(
            lambda x: x.replace('\'', '').lower().strip())

        complex_df = simplify_terms(
            simple_df,
            complex_df)  # reduce complex terms to their semantic heads
        complex_df['prefLabel_en'] = complex_df.prefLabel_en.apply(
            lambda x: inflection.singularize(x))

        df = pd.concat([simple_df, complex_df])  # merge dataframes
        df.to_csv(f'{m_k_data_to_thesaurus}/{prop}.csv',
                  index=False)  # write dataframe to a file
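
As a quick illustration of the normalization above, the snippet below applies the same expressions to two sample terms; it assumes the bare singularize used for simple_df is inflection.singularize, which this excerpt does not confirm.

# Standalone demonstration of the term normalization used above
# (assumption: `singularize` is inflection.singularize).
import re
import inflection

simple_term = "birds'"
complex_term = "Copper filings"

print(inflection.singularize(re.sub(r"’|'", '', simple_term)))   # -> bird
print(complex_term.replace('\'', '').lower().strip())            # -> copper filings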
Example #3
import os
from digital_manuscript import BnF

manuscript = BnF(apply_corrections=False)
versions = ['tc', 'tcn', 'tl']

def test_balanced():
  perfect = True
  for entry in manuscript.entries:
    if not all(entry.balanced[v] for v in versions):
      print(f'{entry.identity} is not balanced.')
      perfect = False
  if perfect:
    print('All entries are balanced')

def test_properties():
  for entry in manuscript.entries:
    for prop in entry.properties:
      tc_len = len(entry.properties[prop]['tc'])
      tcn_len = len(entry.properties[prop]['tcn'])
      tl_len = len(entry.properties[prop]['tl'])
      
      if not tc_len == tcn_len == tl_len:
        print(f'{entry.identity}, {prop} -- tc: {tc_len}, tcn: {tcn_len}, tl: {tl_len}')
        print(entry.properties[prop]['tc'])
        print(entry.properties[prop]['tcn'])
        print(entry.properties[prop]['tl'])
        print()

def test_matched():
  perfect = True
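
A minimal way to run these checks as a standalone script (the project may instead collect them with a test runner such as pytest) would be:

# Minimal manual runner for the checks above; assumes this file is executed
# directly rather than collected by a test framework.
if __name__ == '__main__':
  test_balanced()
  test_properties()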
Example #4
# Standard Library Modules
import os

# Third-Party Modules
import pandas as pd

# Local Modules
from digital_manuscript import BnF
# TODO: make sure you can import digital_manuscript.

# set constants
versions = ['tc', 'tcn', 'tl']
# TODO: Adjust path to whatever it is on your machine
MANUSCRIPT_PATH = os.getcwd() + '/../m-k-manuscript-data'
CATEGORIES_PATH = os.getcwd()

# initialize variables
manuscript = BnF()
df = pd.read_csv(f'{CATEGORIES_PATH}/sizes.csv').fillna('')
size_dict = {}  # identity: category

# store contents of sizes.csv in size_dict
for i, row in df.iterrows():
    identity, XPath_Location, folio, link, size_suggestion, notes = row
    if size_suggestion:
        size_dict[identity] = size_suggestion

# read and write files
for version in versions:
    dir_path = f'{MANUSCRIPT_PATH}/ms-xml/{version}/'
    for r, d, f in os.walk(dir_path):
        for filename in f:  # iterate through /ms-xml/ folder
            data = None