if r.status != 200: if transCode != 'intb' or bookNo != 58 or chapterNo != 3: break content = r.read().decode(encoding) verses = verse_re.findall(content) if transCode == 'glu' and bookCodes[bookNo] == '3jo' and chapterNo == 1: verses = verses[:-1] for v, vn in izip(verses, count(1)): if int(v[0]) != chapterNo: raise Exception('strange chapterno at verse {}'.format(vn)) if int(v[1]) != vn: raise Exception('strange verseno at verse {}'.format(vn)) xml = create_xml([v[2] for v in verses]) with open('{0}_{1:03}.html'.format(bookCodes[bookNo], chapterNo), 'w') as f: f.write(xml.encode('utf-8')) c.execute( "INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)", (langCode, bookCodes[bookNo], chapterNo, xml)) print '{0}|{1}|{2}'.format(bookCodes[bookNo], chapterNo, len(verses)) sys.stdout.flush() c.execute("vacuum") db.commit()
verses[33:33] = [('34', '34', u'——')] if book[0] == 'ac' and chapterNo == 24: verses[6:6] = [('7', '7', u'——')] if book[0] == 'ac' and chapterNo == 28: verses[28:28] = [('29', '29', u'——')] for v, vn in izip(verses, count(1)): if v[0] != v[1]: raise Exception('verse no not consistent: {0} != {1}'.format( v[0], v[1])) if int(v[0]) != vn: raise Exception('invalid verse no: {0} (should be {1})'.format( v[0], vn)) xml = create_xml( v[2].replace('<br>', ' ').replace('\n', ' ').replace(' ', ' ') for v in verses) with open('{0}_{1:03}.html'.format(book[0], chapterNo), 'w') as f: f.write(xml.encode('utf-8')) c.execute( "INSERT INTO html (langCode, bookCode, chapterno, html) VALUES (?, ?, ?, ?)", ('fi', book[0], chapterNo, xml)) print '{0}|{1}|{2}'.format(book[0], chapterNo, len(verses)) sys.stdout.flush() c.execute('vacuum') db.commit()
found = verse_re.findall(content) if len(found) > 0: raw_verses += found else: break if len(raw_verses) == 0: break for i, rv in enumerate(raw_verses, 1): if int(rv[0]) != versao or \ int(rv[1]) != bookNo or \ int(rv[2]) != chapterNo or \ int(rv[3]) != i: raise Exception('Strange verse no: {0}'.format(rv)) verses = [rv[4].decode(encoding).strip() for rv in raw_verses] xml = create_xml(verses) c.execute( 'INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)', (langCode, bookCode, chapterNo, xml)) print '{0}|{1}|{2}'.format(bookCode, chapterNo, len(raw_verses)) sys.stdout.flush() c.execute('VACUUM') db.commit()
chapterNo = 0 expectedVerseNo = 1 for rv in raw_verses: cn = int(rv[0]) if cn != chapterNo: if cn != chapterNo + 1: raise Exception('Unexpected chapterNo: {0} {1}'.format( bookCode, cn)) chapterNo = cn expectedVerseNo = 1 chapters.append([]) vn = int(rv[1]) if vn != expectedVerseNo: raise Exception('Unexpected verseNo: {0} {1}:{2}'.format( bookCode, cn, vn)) expectedVerseNo += 1 chapters[cn - 1].append(rv[2].replace(' ', ' ').strip()) for c, cn in izip(chapters, count(1)): cursor.execute( "INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)", ('s', bookCode, cn, create_xml(c))) print "{0:6}, {1:03}, {2:03}".format("'{0}'".format(bookCode), cn, len(c)) cursor.execute('VACUUM') db.commit()
chapter += 1 url = '/bible/{0}/{1:02}/{2:02}/'.format(lang, bookNo, chapter) c = httplib.HTTPConnection('bibleonline.ru') c.request( 'GET', url, '', { 'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.38 Safari/535.11', 'Referer': 'http://bibleonline.ru/bible/rus/01/01/' }) content = c.getresponse().read() verses = verse_re.findall(content) if len(verses) == 0: break for n, v in itertools.izip(itertools.count(1), verses): if int(v[0]) != n: raise Exception('Strange verse number: {0} {1}:{2}'.format( bookCode, chapter, n)) texts = [v[1].decode('utf-8') for v in verses] f = open('{0}/{1}_{2:03}.html'.format(lang, bookCode, chapter), 'w') f.write(createxml.create_xml(texts).encode('utf-8')) f.close() print '{0}|{1}|{2}'.format(bookCode, chapter, len(verses)) sys.stdout.flush()
verses = [] for verseNo, verseElement in enumerate(verseElements, 1): if verseNo != int(verseElement.attributes['vnumber'].value): raise Exception('Invalid verse no {0} {1}:{2}'.format( bookCode, chapterNo, verseNo)) if verseElement.firstChild is not None and verseElement.firstChild.nextSibling is not None: raise Exception( 'Seems like child tag in VERS {0} {1}:{2}'.format( bookCode, chapterNo, verseNo)) if verseElement.firstChild is not None: verses.append(verseElement.firstChild.nodeValue) else: verses.append('') xml_text = create_xml(verses) # with open('{0}_{1}.html'.format(bookCode, chapterNo), 'w') as f: # f.write(xml_text.encode('utf-8')) c.execute( "INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)", (sys.argv[3], bookCode, chapterNo, xml_text)) print '{0}|{1}|{2}'.format(bookCode, chapterNo, len(verses)) sys.stdout.flush() c.execute('vacuum') db.commit()
text = text.replace('\n', ' ') if text.endswith('<br>'): text = text[:-4] if text.endswith('</p>'): text = text[:-4] text = text.strip() text = text.replace('<br>', '<br/>') withoutTags = re.sub(known_tags_re, '', text) if withoutTags.find('<') != -1: raise Exception('Unknown tag: {0} {1}:{2}'.format( bookCode, chapterNo, verseNo)) texts.append(text) #print chapterNo, verseNo, text html = create_xml(texts) try: xml.dom.minidom.parseString(html.encode('utf-8')) except Exception as e: print html raise Exception('Failed to parse {0} {1} {2}'.format( bookCode, chapterNo, e)) c.execute( "INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)", (langCode, bookCode, chapterNo, html)) print '{0}|{1}|{2}'.format(bookCode, chapterNo, len(verses)) c.execute('VACUUM') c.close()
payload_re = re.compile(r'<span class="arabictext">([^<]*?)<br>') http = httplib.HTTPConnection('copticchurch.net') for book in books: c.execute( "SELECT count(*) FROM chapterSize WHERE transCode='{0}' AND bookCode=?" .format(transCode), (book[0], )) chapters = c.fetchall()[0][0] for chapterNo in xrange(1, chapters + 1): url = "/cgibin/bible/index.php?r={0}+{1}&version=SVD&showVN=1".format( engname[book[0]].replace(' ', '+'), chapterNo) http.request("GET", url) content = http.getresponse().read().decode('cp1256') payload = payload_re.search(content).group(1) verses = [ v.strip().split(' ', 1)[1].strip() for v in payload.strip().split('\n') ] f = open('{0}_{1:03}.html'.format(book[0], chapterNo), 'w') f.write(create_xml(verses, True).encode('utf-8')) f.close() print '{0}\t{1}\t{2}'.format(book[0], chapterNo, len(verses)) sys.stdout.flush()
chapterNo = int(parts[1]) verseNo = int(parts[2]) if bookNo != prevBook: content.append([[]]) elif chapterNo != prevChapter: content[bookNo - 1].append([]) content[bookNo - 1][chapterNo - 1].append(parts[3].decode('utf-8')) assert len(content) == bookNo assert len(content[bookNo - 1]) == chapterNo assert len(content[bookNo - 1][chapterNo - 1]) == verseNo prevBook = bookNo prevChapter = chapterNo for bookNo, book in enumerate(content, 0): for chapterNo, chapter in enumerate(book, 1): xml = create_xml(chapter) c.execute( 'INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)', (sys.argv[3], bookCodes[bookNo], chapterNo, xml)) print '{0}|{1}|{2}'.format(bookCodes[bookNo], chapterNo, len(chapter)) sys.stdout.flush() c.execute('VACUUM') db.commit()
chapter += 1 http.request('GET', '/main.php?menu=bible&act=1&nc=50&district={0}&chapter={1}'.format( bookNo, chapter )) content = http.getresponse().read() verses = verse_re.findall(content) if len(verses) == 0: break for n, v in izip(count(1), verses): if int(v[0]) != n: raise Exception('Strange verse number: {0} {1}:{2}'.format(bookCode, chapter, n)) xml = create_xml(v[1].decode('windows-1251') for v in verses) c.execute("INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)", ( 'bl', bookCode, chapter, xml )) c.execute('select count(*) from html') print c.fetchone() print '{0}|{1}|{2}'.format( bookCode, chapter, len(verses) ) sys.stdout.flush() c.execute('vacuum')
print '{0:6}, {1:03}, {2:03}'.format("'{0}'".format(bookCodes[bookNo]), cn, v) textKey = 'teks________' textpos = content.find(textKey) text = content[textpos + len(textKey) + 4:] verses = text.split('\n') db = sqlite3.Connection(transCode + '.sqlite') c = db.cursor() c.execute("DROP TABLE IF EXISTS html") c.execute( "CREATE TABLE html (langCode, bookCode, chapterNo, html, PRIMARY KEY (langCode, bookCode, chapterNo))" ) textpos = 0 for bookNo in xrange(66): for cn in range(chapterCount[bookNo]): xml = create_xml( v.decode('utf-8') for v in verses[textpos:textpos + verseCount[bookNo][cn]]) textpos += verseCount[bookNo][cn] c.execute( "INSERT INTO html (langCode, bookCode, chapterNo, html) VALUES (?, ?, ?, ?)", (langCode, bookCodes[bookNo], cn + 1, xml)) c.execute('vacuum') db.commit()
"SELECT bookCode, bookName FROM books WHERE transCode='kjv' ORDER BY bookNo" ) books = c.fetchall() verse_re = re.compile( r"<a href=\"http://www.kingjamesbibleonline.org/[^\"]*?-(\d+?)/\" title='View more translations[^']*?'>(.*?)</a></p>" ) for book in books: c.execute( "SELECT count(*) FROM chapterSize WHERE transCode='kjv' AND bookCode=?", (book[0], )) chapters = c.fetchall()[0][0] for chapterNo in xrange(1, chapters + 1): url = '/{0}-Chapter-{1}/'.format(book[1].replace(' ', '-'), chapterNo) http = httplib.HTTPConnection('www.kingjamesbibleonline.org') http.request("GET", url) response = http.getresponse() content = response.read() verses = [v[1] for v in verse_re.findall(content)] f = open('{0}_{1:03}.html'.format(book[0], chapterNo), 'w') f.write(create_xml(verses)) f.close() print '{0}\t{1}\t{2}'.format(book[0], chapterNo, len(verses)) sys.stdout.flush()