Python build_corpus Examples

Programming Language: Python

Namespace/Package Name: helper

Method/Function: build_corpus

Examples at hotexamples.com: 4

Python build_corpus - 4 examples found. These are the top-rated real-world Python examples of helper.build_corpus, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: normalize.py Project: jacobthill/dissertation

import glob, operator, os, re
from collections import Counter
from helper import build_corpus
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import numpy as np
import pandas as pd
import seaborn as sns

# Gather all file names: build_corpus is called with the corpus directory,
# the list of authors and the list of languages defined below.
corpus_dir = '/Users/jtim/Dropbox/Academic/sources/corpora/bahai-works/data/'
authors = ['bahaullah', 'bab', 'abdulbaha', 'shoghi-effendi']
languages = ['ar', 'fa']

file_names = build_corpus(corpus_dir, authors, languages)


# Functions containing the various steps of normalization
def process_one(text):
    """Return ``text`` unchanged; this step applies no normalization."""
    return text


def process_two(text):
    """Remove layout whitespace and selected punctuation from ``text``.

    Each newline and tab is replaced by a space; round parentheses, the
    ornate parentheses ﴾ and ﴿, double quotes and colons are deleted.

    NOTE(review): no ``return`` is visible in this excerpt, which appears to
    be cut off after the last replacement -- confirm against the full file.
    """
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    text = text.replace("(", "")
    text = text.replace(")", "")
    text = text.replace("﴾", "")
    text = text.replace("﴿", "")
    text = text.replace('"', '')
    text = text.replace(":", "")

Example #2

Show file

import glob, os
from collections import Counter
from helper import process, build_corpus

# NOTE(review): `dir` shadows the built-in dir(); the name is kept because the
# statements further down refer to it.
dir = "/Users/jtim/Dropbox/Academic/sources/corpora/bahai-works/data/"
authors = ['abdulbaha']
languages = ['ar']

# Build the corpus listing for the single author/language pair above.
corpus = build_corpus(dir, authors, languages)

# Corpus entries whose name contains 'mmha1'.
mmha1 = []

arabic_counter = Counter()

for name in corpus:
    if 'mmha1' in name:
        mmha1.append(name)

# Dump every selected file into one text file, each preceded by its name and
# a separator line. The output is opened once, outside the loop: reopening it
# in 'w' mode for every file would truncate it each time and keep only the
# last file's content.
with open('/Users/jtim/Desktop/out.txt', 'w') as out:
    for file in mmha1:
        # Open the entry currently being iterated (`file`), not the leftover
        # `name` variable from the selection loop above.
        with open("{}{}".format(dir, file), 'r') as f:
            out.write(file)
            out.write('\n')
            out.write("--------------------------------------")
            out.write('\n')
            out.write(f.read())

# Count the whitespace-separated tokens of each selected file after it has
# been passed through process().
for file in mmha1:
    # Open the entry currently being iterated (`file`); the original used the
    # stale `name` variable left over from an earlier loop, so every pass read
    # the same file.
    # NOTE(review): out.txt is reopened in 'w' mode here, which truncates it
    # on every pass, and `out_c` is not used in the visible lines -- confirm
    # the intended output target.
    with open("{}{}".format(dir, file),
              'r') as f, open('/Users/jtim/Desktop/out.txt', 'w') as out_c:
        words = Counter(process(f.read()).split())

Example #3

Show file

File: chi-squared-2.py Project: jacobthill/dissertation

from helper import build_corpus, process
import glob, nltk, os
import matplotlib.pyplot as plt

# Gather corpora
base_dir = '/Users/jtim/Dropbox/Academic/sources/corpora/'

# The three author corpora are read from the same data directory, so its
# path is formatted once and reused for every build_corpus call.
_bahai_data_dir = '{}bahai-works/data/'.format(base_dir)
bahaullah = build_corpus(_bahai_data_dir, ['bahaullah'], ['ar'])
abdulbaha = build_corpus(_bahai_data_dir, ['abdulbaha'], ['ar'])
bab = build_corpus(_bahai_data_dir, ['bab'], ['ar'])

# Plain-text files matched by the glob pattern below.
murtada_ansari = glob.glob(
    '/Users/jtim/Dropbox/Academic/sources/corpora/open-arabic-1300AH/data/1281MurtadaAnsari/*/arc/*.txt'
)
bahaullah_baghdad = [
    'bahaullah-anbka-15-ar.txt',  # سورة الذكر
    'bahaullah-aqa2-67-ar.txt',  # جواهر الاسرار
    'bahaullah-aqa2-93-ar.txt',  # سورة القدير
    'bahaullah-aqa2-76-ar.txt',  # سورة الله
    'bahaullah-aqa2-101-ar.txt',  # لوح الحورية
    'bahaullah-km-1-ar.txt',  # الكلمات المكنونة العربية
    'bahaullah-st-010-1-ar.txt',  # الحروفات العاليات
    'bahaullah-st-029-ar.txt',  # (لوح آية النور (تفسير الحروفات المقطعة
    'bahaullah-st-037-ar.txt',  # لوح الفتنة
    'bahaullah-st-041-ar.txt',  # لوح الحق
    'bahaullah-st-052-ar.txt',  # لوح كل الطعام
    'bahaullah-st-087-ar.txt',  # لوح مدينة الرضا
    'bahaullah-st-088-ar.txt',  # لوح مدينة التوحيد
    'bahaullah-st-100-ar.txt',  # لوح سبحان ربي الاعلى
    'bahaullah-st-133-ar.txt',  # سورة النصح
    'bahaullah-st-138-ar.txt',  # (سورة الصبر (لوح ايوب

Example #4

Show file

File: islamicate-texts.py Project: jacobthill/dissertation

import glob
import os
import random
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

from helper import build_corpus

# Islamicate reference corpora: every .txt file in the two directories below.
# The number of matched files is printed after each glob.
arabic_islamicate_files = glob.glob(
    '/Users/jtim/Dropbox/Academic/sources/corpora/cleaned-combined-open-arabic/*.txt'
)
print(len(arabic_islamicate_files))
persian_islamicate_files = glob.glob(
    '/Users/jtim/Dropbox/Academic/sources/corpora/cleaned-persian-dh/*.txt')
print(len(persian_islamicate_files))

# Bahá'í corpora: the same authors and data directory are used for both
# languages, so the directory path is bound once and only the language differs.
authors = ['abdulbaha', 'bab', 'bahaullah', 'shoghi-effendi']
_bahai_corpus_dir = '/Users/jtim/Dropbox/Academic/sources/corpora/bahai-corpus/data/'
arabic_bahai_files = build_corpus(_bahai_corpus_dir, authors, ['ar'])
persian_bahai_files = build_corpus(_bahai_corpus_dir, authors, ['fa'])


def main():
    directory = '/Users/jtim/Dropbox/Academic/sources/corpora/bahai-corpus/output/islamicate-texts/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Islamicate variables
    arabic_islamicate_counter = Counter()
    persian_islamicate_counter = Counter()
    arabic_islamicate_vocabulary = set()