# Normalization pipeline for the Bahá'í-works corpus.
# (Notebook cell collapsed onto one physical line in this view; formatting
# reconstructed — code tokens unchanged.)
import glob, operator, os, re
from collections import Counter
from helper import build_corpus
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import numpy as np
import pandas as pd
import seaborn as sns

# Gather all file names
corpus_dir = '/Users/jtim/Dropbox/Academic/sources/corpora/bahai-works/data/'
authors = ['bahaullah', 'bab', 'abdulbaha', 'shoghi-effendi']
languages = ['ar', 'fa']
# build_corpus is a project helper; presumably returns the list of corpus
# file names for these authors/languages — verify against helper.py.
file_names = build_corpus(corpus_dir, authors, languages)


# Functions containing the various steps of normalization
def process_one(text):
    # Step one: identity pass-through — no normalization applied (baseline).
    return text


def process_two(text):
    # Step two: flatten layout whitespace and strip punctuation marks.
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    text = text.replace("(", "")
    text = text.replace(")", "")
    text = text.replace("﴾", "")  # U+FD3E Arabic ornate left parenthesis
    text = text.replace("﴿", "")  # U+FD3F Arabic ornate right parenthesis
    text = text.replace('"', '')
    text = text.replace(":", "")
    # NOTE(review): the visible source ends here with no `return text`;
    # the function appears truncated — confirm its remainder in the full file.
# Dump the 'mmha1' texts from 'Abdu'l-Bahá's Arabic corpus into a single
# output file and accumulate normalized word frequencies over them.
import glob, os
from collections import Counter
from helper import process, build_corpus

# Corpus root. Renamed from `dir` to avoid shadowing the builtin.
data_dir = "/Users/jtim/Dropbox/Academic/sources/corpora/bahai-works/data/"
authors = ['abdulbaha']
languages = ['ar']
corpus = build_corpus(data_dir, authors, languages)

# Select only the mmha1 volume out of the corpus listing.
mmha1 = [name for name in corpus if 'mmha1' in name]

arabic_counter = Counter()

# BUG FIX: the original opened "{}{}".format(dir, name) inside both loops,
# where `name` was the stale variable left over from the selection loop, so
# every iteration re-read the same file. It also reopened out.txt with mode
# 'w' on every iteration (and again in a second loop), so all but the last
# write were clobbered. Open the output once and iterate over `file`.
with open('/Users/jtim/Desktop/out.txt', 'w') as out:
    for file in mmha1:
        with open("{}{}".format(data_dir, file), 'r') as f:
            text = f.read()
        # File-name header followed by a separator rule, then the text.
        out.write(file)
        out.write('\n')
        out.write("--------------------------------------")
        out.write('\n')
        out.write(text)
        # BUG FIX: the original computed `words` and discarded it, leaving
        # arabic_counter permanently empty; fold the counts in here.
        arabic_counter.update(process(text).split())
# Assemble Arabic corpora for comparison: Bahá'u'lláh, 'Abdu'l-Bahá and the
# Báb from the Bahá'í-works corpus, plus Murtada al-Ansari (d. 1281 AH) as
# an external Islamicate control corpus.
# (Cell collapsed onto one physical line in this view; formatting
# reconstructed — code tokens unchanged.)
from helper import build_corpus, process
import glob, nltk, os
import matplotlib.pyplot as plt

# Gather corpora
base_dir = '/Users/jtim/Dropbox/Academic/sources/corpora/'
bahaullah = build_corpus('{}bahai-works/data/'.format(base_dir), ['bahaullah'], ['ar'])
abdulbaha = build_corpus('{}bahai-works/data/'.format(base_dir), ['abdulbaha'], ['ar'])
bab = build_corpus('{}bahai-works/data/'.format(base_dir), ['bab'], ['ar'])
murtada_ansari = glob.glob(
    '/Users/jtim/Dropbox/Academic/sources/corpora/open-arabic-1300AH/data/1281MurtadaAnsari/*/arc/*.txt'
)

# Texts from Bahá'u'lláh's Baghdad period (file name -> work title).
bahaullah_baghdad = [
    'bahaullah-anbka-15-ar.txt',   # Surat adh-Dhikr (Surih of Remembrance)
    'bahaullah-aqa2-67-ar.txt',    # Jawahir al-Asrar (Gems of Divine Mysteries)
    'bahaullah-aqa2-93-ar.txt',    # Surat al-Qadir
    'bahaullah-aqa2-76-ar.txt',    # Surat Allah
    'bahaullah-aqa2-101-ar.txt',   # Lawh al-Huriyyih (Tablet of the Maiden)
    'bahaullah-km-1-ar.txt',       # The Hidden Words (Arabic)
    'bahaullah-st-010-1-ar.txt',   # al-Hurufat al-'Aliyat (The Exalted Letters)
    'bahaullah-st-029-ar.txt',     # Tablet of the Verse of Light (commentary on the disconnected letters)
    'bahaullah-st-037-ar.txt',     # Lawh al-Fitnih (Tablet of the Test)
    'bahaullah-st-041-ar.txt',     # Lawh al-Haqq (Tablet of Truth)
    'bahaullah-st-052-ar.txt',     # Lawh Kull at-Ta'am (Tablet of All Food)
    'bahaullah-st-087-ar.txt',     # Tablet of the City of Radiant Acquiescence (Madinat ar-Rida)
    'bahaullah-st-088-ar.txt',     # Tablet of the City of Unity (Madinat at-Tawhid)
    'bahaullah-st-100-ar.txt',     # Lawh Subhana Rabbiya al-A'la
    'bahaullah-st-133-ar.txt',     # Surat an-Nush (Surih of Counsel)
    'bahaullah-st-138-ar.txt',     # Surat as-Sabr (Tablet of Job)
    # NOTE(review): the visible source ends here — the list's closing bracket
    # lies outside this chunk; do not assume the list is complete.
# Build file listings for the Islamicate (open-arabic / persian-dh) and
# Bahá'í corpora, and prepare the output directory for comparison counts.
from collections import Counter
import glob, os, random

import matplotlib.pyplot as plt
import numpy as np

# BUG FIX: build_corpus is called at module level below but was never
# imported in this cell, which raises NameError at load time.
from helper import build_corpus

arabic_islamicate_files = glob.glob(
    '/Users/jtim/Dropbox/Academic/sources/corpora/cleaned-combined-open-arabic/*.txt'
)
print(len(arabic_islamicate_files))
persian_islamicate_files = glob.glob(
    '/Users/jtim/Dropbox/Academic/sources/corpora/cleaned-persian-dh/*.txt')
print(len(persian_islamicate_files))

authors = ['abdulbaha', 'bab', 'bahaullah', 'shoghi-effendi']
arabic_bahai_files = build_corpus(
    '/Users/jtim/Dropbox/Academic/sources/corpora/bahai-corpus/data/',
    authors, ['ar'])
persian_bahai_files = build_corpus(
    '/Users/jtim/Dropbox/Academic/sources/corpora/bahai-corpus/data/',
    authors, ['fa'])


def main():
    # Ensure the output directory exists before any results are written.
    directory = '/Users/jtim/Dropbox/Academic/sources/corpora/bahai-corpus/output/islamicate-texts/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Islamicate variables
    arabic_islamicate_counter = Counter()
    persian_islamicate_counter = Counter()
    arabic_islamicate_vocabulary = set()
    # NOTE(review): main() continues past the visible chunk — the remainder
    # of its body is outside this view.