Code example #1
File: text_tools.py  Project: bchoatejr/religion
def output_json_book_chapter_verse(input_file):
    # Builds a nested {book: {chapter: {verse: verse_length}}} dictionary from a
    # tab-separated index file and writes it as JSON next to the input file.
    # Assumes module-level imports (json, operator); split_line() and
    # file_name_ext() are helpers defined elsewhere in text_tools.py.
    BOOK, CHAPTER, VERSE, LENGTH = 0, 1, 2, 5
    i_file = open(input_file, 'r')
    i_file.readline()  # skip the header row
    d_book = {}
    for txt in i_file.readlines():
        col = split_line(txt)
        d_book[col[BOOK]] = d_book.get(col[BOOK], {})
        d_book[col[BOOK]][col[CHAPTER]] = d_book[col[BOOK]].get(col[CHAPTER], {})
        d_book[col[BOOK]][col[CHAPTER]][col[VERSE]] = d_book[col[BOOK]][col[CHAPTER]].get(col[VERSE], col[LENGTH])
    i_file.close()

    # Sorted (book, chapters) view; the JSON dump below writes d_book as-is.
    sorted_chp = sorted(d_book.items(), key=operator.itemgetter(0))

    directory, file_name, extension = file_name_ext(input_file)
    output_filename = directory + '/' + file_name + '-chapter_verse.json'
    o_file = open(output_filename, 'w')

    json.dump(d_book, o_file)
    o_file.close()
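A minimal sketch of the nested-dictionary pattern above, run on a couple of hypothetical index rows (the tuples stand in for the columns split_line() would return), just to show the {book: {chapter: {verse: length}}} shape that ends up in the JSON:

import json

# Hypothetical rows: (Book, Chapter, Verse, Title Start, Title End, Verse Length)
rows = [
    ('Genesis', '1', '1', 0, 12, 120),
    ('Genesis', '1', '2', 130, 142, 95),
]

d_book = {}
for col in rows:
    book, chapter, verse, length = col[0], col[1], col[2], col[5]
    d_book.setdefault(book, {}).setdefault(chapter, {}).setdefault(verse, length)

print(json.dumps(d_book))  # {"Genesis": {"1": {"1": 120, "2": 95}}}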
Code example #2
File: mormon_tools.py  Project: bchoatejr/religion
def find_chapters(book):
    # Finds verse headings of the form "<book name> <chapter>:<verse>" in the text
    # and writes a tab-separated index, including a computed verse length, next to
    # the input file. Assumes a module-level "import re"; get_file_text() and
    # file_name_ext() are helpers defined elsewhere in the project.
    text = get_file_text(book)

    directory, file_name, extension = file_name_ext(book)
    output_filename = directory + '/' + file_name + '-chp.txt'
    o_file = open(output_filename, 'w')

    chapters_regex = r'(.+?)(\d+):(\d+)'

    fields = ['Book', 'Chapter', 'Verse', 'Title Start', 'Title End', 'Verse Length']
    o_file.write('\t'.join(fields) + '\n')
    previous_title = -1
    for m in re.finditer(chapters_regex, text):
        if previous_title != -1:
            # Length of the previous verse: distance from the end of the previous
            # heading to the start of this one.
            verse_length = m.start() - previous_title
            o_file.write('%d\n' % (verse_length))
        o_file.write('%s\t%s\t%s\t%d\t%d\t' % (m.group(1), m.group(2), m.group(3), m.start(), m.end()))
        previous_title = m.end()

    # The last verse runs up to the Project Gutenberg end-of-text marker.
    end_regex = 'End of the Project Gutenberg EBook'
    end_chapter = re.search(end_regex, text).start()

    verse_length = end_chapter - previous_title
    o_file.write('%d\n' % (verse_length))
    o_file.close()
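For reference, chapters_regex captures the text before the number, the chapter, and the verse from references such as "1 Nephi 3:7". A small, self-contained check of that pattern (the sample string is hypothetical):

import re

chapters_regex = r'(.+?)(\d+):(\d+)'
sample = '1 Nephi 3:7 And it came to pass ...'  # hypothetical heading plus verse text

m = re.search(chapters_regex, sample)
print(m.groups())  # ('1 Nephi ', '3', '7')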
Code example #3
File: text_tools.py  Project: bchoatejr/religion
def find_koran_chapters(book):
    # Finds Koran chapter headings of the form
    #   CHAPTER III.
    #
    #   ENTITLED, THE FAMILY OF IMRAN; REVEALED AT MEDINA
    # and writes a tab-separated index, including a computed length for each chapter,
    # next to the input file. Assumes a module-level "import re"; get_file_text() and
    # file_name_ext() are helpers defined elsewhere in text_tools.py.
    text = get_file_text(book)

    directory, file_name, extension = file_name_ext(book)
    output_filename = directory + '/' + file_name + '-chapters_debug.txt'
    o_file = open(output_filename, 'w')

    regex = r'(CHAPTER.+?)\.\n\nENTITLED, (.+?);'

    fields = ['Chapter', 'Title', 'Title Start', 'Title End', 'Verse Length']
    o_file.write('\t'.join(fields) + '\n')
    previous_title = -1
    for m in re.finditer(regex, text, re.M | re.S | re.U):
        if previous_title != -1:
            # Length of the previous chapter: distance from the end of the previous
            # heading to the start of this one.
            verse_length = m.start() - previous_title
            o_file.write('%d\n' % (verse_length))
        title = m.group(2)
        title = re.sub(',', '', title)  # drop commas from the chapter title
        o_file.write('%s\t%s\t%d\t%d\t' % (m.group(1), title, m.start(), m.end()))
        previous_title = m.end()

    # No Project Gutenberg end marker is used here; the last chapter runs to the end of the text.
    end_chapter = len(text)

    verse_length = end_chapter - previous_title
    o_file.write('%d\n' % (verse_length))
    o_file.close()
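The heading format being matched looks like "CHAPTER III." followed by a blank line and an "ENTITLED, ...;" line, as noted in the function's comment. A quick standalone check of the final regex against such a sample:

import re

regex = r'(CHAPTER.+?)\.\n\nENTITLED, (.+?);'
sample = 'CHAPTER III.\n\nENTITLED, THE FAMILY OF IMRAN; REVEALED AT MEDINA'

m = re.search(regex, sample, re.M | re.S | re.U)
print(m.group(1))  # CHAPTER III
print(m.group(2))  # THE FAMILY OF IMRAN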
Code example #4
File: text_tools.py  Project: bchoatejr/religion
def chords_add_header(chapter_index_filename, verse_location, n_gram_path):
    # For every verse file, reads its *_dic_index.* word list and prepends that word
    # list as a comma-separated header row to the matching *_chord.* matrix file.
    # Assumes a module-level "import os"; split_line(), get_file_text() and
    # file_name_ext() are helpers defined elsewhere in text_tools.py.
    directory, file_name, extension = file_name_ext(chapter_index_filename)
    files = os.listdir(verse_location)

    for f in files:
        directory, file_name, extension = file_name_ext(f)
        chord_filename = n_gram_path + file_name + '_chord.' + extension
        dic_filename = n_gram_path + file_name + '_dic_index.' + extension

        # Collect the words (first column of the dictionary index) in order.
        d_file = open(dic_filename, 'r')
        word_header = []
        for txt in d_file.readlines():
            cols = split_line(txt)
            word_header.append(cols[0])
        d_file.close()

        # Prepend the header: read the existing matrix text, then rewrite the file
        # with the comma-separated word list on the first line.
        c_txt = get_file_text(chord_filename)

        o_file = open(chord_filename, 'w')
        o_file.write(','.join(word_header) + '\n')
        o_file.write(c_txt)
        o_file.close()
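A minimal, self-contained sketch of the read-then-rewrite header prepend used above (the file name and word list are hypothetical; a small zero matrix stands in for a real chord matrix):

import numpy as np

chord_filename = 'example_chord.csv'
word_header = ['and', 'it', 'came', 'to', 'pass']

# Stand-in chord matrix so the sketch is runnable on its own.
np.savetxt(chord_filename, np.zeros((5, 5), dtype=int), delimiter=',', fmt='%d')

c_txt = open(chord_filename).read()         # read the existing matrix text,
o_file = open(chord_filename, 'w')          # then rewrite the file with the
o_file.write(','.join(word_header) + '\n')  # word header on the first line.
o_file.write(c_txt)
o_file.close()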
Code example #5
File: text_tools.py  Project: bchoatejr/religion
def json_out(input_file):
    # Counts how many rows each name in the first column of a tab-separated index
    # file has, then writes the sorted (name, count) pairs as JSON.
    # Assumes module-level imports (json, operator); split_line() and
    # file_name_ext() are helpers defined elsewhere in text_tools.py.
    i_file = open(input_file, 'r')
    i_file.readline()  # skip the header row
    d_chapter_names = {}
    for txt in i_file.readlines():
        fields = split_line(txt)
        d_chapter_names[fields[0]] = d_chapter_names.get(fields[0], 0) + 1
    i_file.close()

    sorted_chp = sorted(d_chapter_names.items(), key=operator.itemgetter(0))

    directory, file_name, extension = file_name_ext(input_file)
    output_filename = directory + '/' + file_name + '-chp_names.json'
    o_file = open(output_filename, 'w')

    json.dump(sorted_chp, o_file)
    o_file.close()
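Note that json.dump() serializes the sorted (name, count) tuples as JSON arrays rather than as an object. A small sketch with hypothetical counts showing the resulting shape:

import json
import operator

d_chapter_names = {'Alma': 63, 'Ether': 15, 'Enos': 1}  # hypothetical counts
sorted_chp = sorted(d_chapter_names.items(), key=operator.itemgetter(0))
print(json.dumps(sorted_chp))  # [["Alma", 63], ["Enos", 1], ["Ether", 15]]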
Code example #6
File: text_tools.py  Project: bchoatejr/religion
def n_gram_chapters(chapter_index_filename, verse_location, n_gram_path):
    # For every verse file matching a chapter key, builds a word-pair (bigram) map and a
    # word-frequency dictionary, then writes three artifacts per verse file: an *_n-gram.*
    # JSON file, a *_dic_index.* word/index list, and a *_chord.* adjacency matrix CSV.
    # Assumes module-level imports (os, fnmatch, json, numpy as np); get_file_text(),
    # file_name_ext() and make_dictionary_from_file() are helpers defined elsewhere.
    directory, file_name, extension = file_name_ext(chapter_index_filename)

    chapters = make_dictionary_from_file(chapter_index_filename, sep='\t', key_col=0)

    files = os.listdir(verse_location)

    for k, v in chapters.items():
        file_filter = k + '*'
        file_match_list = fnmatch.filter(files, file_filter)

        for f in file_match_list:
            verse_text = get_file_text(verse_location + f)
            verse_text = verse_text.replace('.', ' ')  # TODO: make punctuation replacement a function
            verse_text = verse_text.replace(',', ' ')
            verse_text = verse_text.replace('\n', ' ')
            verse_words = verse_text.split()

            n_gram = {}  # word -> {following word: 1}
            dic = {}     # word -> frequency
            prev_word = verse_words[0]
            dic[prev_word] = dic.get(prev_word, 0) + 1
            for i in range(1, len(verse_words)):
                n_gram[prev_word] = n_gram.get(prev_word, {})
                n_gram_child = n_gram[prev_word]
                # Record that this word pair occurs; the value stays 1 even if the pair repeats.
                n_gram_child[verse_words[i]] = n_gram_child.get(verse_words[i], 1)
                dic[verse_words[i]] = dic.get(verse_words[i], 0) + 1
                prev_word = verse_words[i]

            directory, file_name, extension = file_name_ext(f)
            output_filename = n_gram_path + file_name + '_n-gram.' + extension
            o_file = open(output_filename, 'w')
            json.dump(n_gram, o_file, sort_keys=True, indent=4)
            o_file.close()

            # Assign each distinct word a row/column index.
            verse_index = []
            index_lookup = {}
            i = 0
            for word, count in dic.items():
                verse_index.append(word)
                index_lookup[word] = i
                i += 1

            output_filename = n_gram_path + file_name + '_dic_index.' + extension
            o_file = open(output_filename, 'w')

            dic_size = len(verse_index)
            verse_dic = {}
            for i in range(dic_size):
                o_file.write('%s\t%d\n' % (verse_index[i], i))
                verse_dic[verse_index[i]] = i
            o_file.close()

            # Fill the word-by-word adjacency ("chord") matrix from the bigram map.
            verse_matrix = np.zeros((dic_size, dic_size), dtype=int)
            for word, followers in n_gram.items():
                row_index = index_lookup[word]
                for next_word, value in followers.items():
                    col_index = index_lookup[next_word]
                    verse_matrix[row_index, col_index] = value

            output_filename = n_gram_path + file_name + '_chord.' + extension
            np.savetxt(output_filename, verse_matrix, delimiter=',', fmt='%d')
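A condensed sketch of the same bigram bookkeeping on a hypothetical word list, showing how the n_gram map and the word index combine into the chord matrix (each distinct word pair is marked with 1, matching the .get(..., 1) pattern above):

import numpy as np

words = 'in the beginning god created the heaven and the earth'.split()  # hypothetical verse text

n_gram, dic = {}, {}
prev_word = words[0]
dic[prev_word] = dic.get(prev_word, 0) + 1
for w in words[1:]:
    n_gram.setdefault(prev_word, {}).setdefault(w, 1)  # mark the (prev_word -> w) pair
    dic[w] = dic.get(w, 0) + 1                          # per-word frequency
    prev_word = w

index_lookup = {w: i for i, w in enumerate(dic)}        # word -> row/column index
verse_matrix = np.zeros((len(dic), len(dic)), dtype=int)
for a, followers in n_gram.items():
    for b, value in followers.items():
        verse_matrix[index_lookup[a], index_lookup[b]] = value

print(int(verse_matrix.sum()))  # 9 distinct word pairs in this sample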