def clean_koran_chapters_to_file(book_filename, output_path, results_filename, book_name):
    """Write a cleaned copy of every chapter of a book to its own text file.

    Chapter titles and offsets are selected from the ``chapters`` table
    (joined to ``books``) of the connection returned by
    ``connect_db('religion')``, for the book whose title equals *book_name*.
    For each row the chapter text is sliced out of *book_filename*, a fixed
    list of boilerplate phrases is removed, and the result is written to
    ``output_path + <chapter title> + '.txt'``.  One tab-separated line
    (chapter title, length of the cleaned text) per chapter is written to
    *results_filename*.

    Args:
        book_filename: Path of the file holding the full book text.
        output_path: Prefix prepended verbatim to every chapter file name;
            no path separator is inserted.
        results_filename: Path of the summary file, opened for writing.
        book_name: Value compared against ``books.title``.
    """
    # Phrases stripped from every chapter, applied in exactly this order.
    boilerplate = (
        'REVEALED AT MECCA',
        'WHERE IT WAS REVEALED IS DISPUTED',
        'REVEALED AT MEDINA',
        'REVEALED PARTLY AT MECCA, AND PARTLY AT MEDINA',
        'IN THE NAME OF THE MOST MERCIFUL GOD.\n\n',
        'A.L.M.\n',
    )

    db = connect_db('religion')
    cursor = db.cursor(MySQLdb.cursors.DictCursor)
    # The book title is passed as a query parameter (%s placeholder) rather
    # than being interpolated into the SQL text.
    cursor.execute(
        'select chapters.title, chapter_name_start, chapter_name_end, chapter_length '
        'from chapters JOIN books ON chapters.book_id=books.book_id '
        'where books.title=%s',
        (book_name,))

    with open(book_filename, 'r') as book:
        txt = book.read()

    # ``with`` guarantees both the summary file and each chapter file are
    # closed even when a row fails half-way through.
    with open(results_filename, 'w') as results:
        for row in cursor:
            # The chapter body starts where the chapter name ends.
            body_start = row['chapter_name_end']
            verse = txt[body_start:body_start + row['chapter_length']]
            for phrase in boilerplate:
                verse = verse.replace(phrase, '')

            with open(output_path + row['title'] + '.txt', 'w') as chapter_file:
                chapter_file.write(verse)
            results.write('%s\t%d\n' % (row['title'], len(verse)))
def _koran_verse_spans(chapter_text, chapter_start):
    """Yield ``(number, start, end)`` for every verse found in one chapter.

    Args:
        chapter_text: Text of a single chapter (a slice of the whole book).
        chapter_start: Offset of ``chapter_text[0]`` within the whole book.

    Yields:
        Tuples of the verse number as a string (``'0'`` for the text that
        precedes the first numbered verse) and the verse's start and end
        offsets, both expressed relative to the whole book.

    Raises:
        AttributeError: If neither start marker matches, because ``.end()``
            is then called on ``None``.
    """
    flags = re.M | re.S | re.U
    # The first verse begins after the invocation line or, failing that,
    # after the first full stop that is followed by whitespace.
    opening = re.search(r'IN THE NAME OF THE MOST MERCIFUL GOD.+?\s', chapter_text, flags)
    if opening is None:
        opening = re.search(r'\.\s', chapter_text, flags)

    number = '0'
    start = opening.end()  # kept chapter-relative; shifted only when yielded
    # Every run of digits at the beginning of a line opens a new verse and
    # therefore closes the previous one just before it.
    for marker in re.finditer(r'^\d+', chapter_text, flags):
        yield number, chapter_start + start, chapter_start + marker.start() - 1
        start = marker.end()
        number = marker.group(0)

    # The last verse runs to the next blank line.  The search offset must be
    # relative to chapter_text, not to the whole book; if no blank line
    # follows, the verse is taken to run to the end of the chapter.
    end = chapter_text.find('\n\n', start)
    if end < 0:
        end = len(chapter_text)
    yield number, chapter_start + start, chapter_start + end


def find_koran_verses():
    """Locate every verse of the Koran and write the offsets to a TSV file.

    Chapter rows for the book titled "Koran" are read through
    ``connect_db('religion')``; the book text is read from
    ``../../data/books/Koran.txt``.  The output file
    ``../../data/books/Koran_verses.txt`` receives a header line followed by
    one ``chapter_id, number, start, end`` line per verse, tab separated,
    with offsets relative to the start of the book text.

    Raises:
        AttributeError: If a chapter contains neither start marker (see
            ``_koran_verse_spans``).
    """
    db = connect_db('religion')
    cursor = db.cursor(MySQLdb.cursors.DictCursor)
    cursor.execute(
        'select chapters.title, chapters.chapter_id, chapter_name_start, chapter_name_end, chapter_length '
        'from chapters JOIN books ON chapters.book_id=books.book_id where books.title="Koran"')

    with open('../../data/books/Koran.txt', 'r') as book:
        txt = book.read()

    with open('../../data/books/Koran_verses.txt', 'w') as o_file:
        o_file.write('\t'.join(['chapter_id', 'number', 'start', 'end']) + '\n')
        for row in cursor:
            # The chapter body starts where the chapter name ends.
            chapter_start = row['chapter_name_end']
            chapter_text = txt[chapter_start:chapter_start + row['chapter_length']]
            for number, start, end in _koran_verse_spans(chapter_text, chapter_start):
                o_file.write('%d\t%s\t%d\t%d\n' % (row['chapter_id'], number, start, end))
    return
def save_chapters_to_file_mysql(book_filename, output_path, book_name):
    """Write every chapter of a book, unmodified, to its own text file.

    Chapter titles and offsets are selected from the ``chapters`` table
    (joined to ``books``) of the connection returned by
    ``connect_db('religion')``, for the book whose title equals *book_name*.
    Each chapter is sliced out of *book_filename* and written to
    ``output_path + <chapter title> + '.txt'``.

    Args:
        book_filename: Path of the file holding the full book text.
        output_path: Prefix prepended verbatim to every chapter file name;
            no path separator is inserted.
        book_name: Value compared against ``books.title``.
    """
    db = connect_db('religion')
    cursor = db.cursor(MySQLdb.cursors.DictCursor)
    # The book title is passed as a query parameter (%s placeholder) rather
    # than being interpolated into the SQL text.
    cursor.execute(
        'select chapters.title, chapter_name_start, chapter_name_end, chapter_length '
        'from chapters JOIN books ON chapters.book_id=books.book_id '
        'where books.title=%s',
        (book_name,))

    with open(book_filename, 'r') as book:
        txt = book.read()

    for row in cursor:
        # The chapter body starts where the chapter name ends.
        body_start = row['chapter_name_end']
        chapter_text = txt[body_start:body_start + row['chapter_length']]
        # ``with`` closes the chapter file even if the write fails.
        with open(output_path + row['title'] + '.txt', 'w') as chapter_file:
            chapter_file.write(chapter_text)
def test_koran_verses():
    """Sanity-check the chapter slices used for Koran verse extraction.

    Chapter rows for the book titled "Koran" are read through
    ``connect_db('religion')`` and sliced out of
    ``../../data/books/Koran.txt``.  Two checks are available, selected by
    the local flags below:

    * numerical index test (off): print the 0-based row index of every
      chapter that has no line starting with digits;
    * end-of-chapter test (on): print the title of every chapter in which a
      blank line is missing or sits at offset 0, and count it as a failure.

    A chapter in which no verse start marker can be found is also printed
    and counted as a failure, so that one bad chapter does not abort the
    whole run.  The number of passing chapters and the number of failures
    are printed at the end.
    """
    db = connect_db('religion')
    cursor = db.cursor(MySQLdb.cursors.DictCursor)
    cursor.execute(
        'select chapters.title, chapters.chapter_id, chapter_name_start, chapter_name_end, chapter_length '
        'from chapters JOIN books ON chapters.book_id=books.book_id where books.title="Koran"')

    with open('../../data/books/Koran.txt', 'r') as book:
        txt = book.read()

    numerical_index_test = False
    end_of_chapter_test = True
    flags = re.M | re.S | re.U

    check = 0
    fail_check = 0
    for row_index, row in enumerate(cursor):
        # The chapter body starts where the chapter name ends.
        chapter_start = row['chapter_name_end']
        verse = txt[chapter_start:chapter_start + row['chapter_length']]

        # Same start markers as the verse finder: the invocation line or,
        # failing that, the first full stop followed by whitespace.
        start_match = re.search(r'IN THE NAME OF THE MOST MERCIFUL GOD.+?\s', verse, flags)
        if start_match is None:
            start_match = re.search(r'\.\s', verse, flags)
        if start_match is None:
            # Report the chapter instead of crashing on ``None.end()``.
            print(row['title'])
            fail_check += 1
            continue

        if numerical_index_test and not re.findall(r'^\d+', verse, flags):
            print(row_index)

        if end_of_chapter_test:
            # ``<= 0`` deliberately treats a blank line at offset 0 the same
            # as no blank line at all.
            if verse.find('\n\n') <= 0:
                print(row['title'])
                fail_check += 1
            else:
                check += 1

    print(check)
    print('failures:%d' % fail_check)
    return