def build_json_source_from_usx(path, lid, pid, date_modified, reporter=None):
    """
    Builds a json source object from a USX file
    :param path: the path to the USX file
    :param lid: the language id
    :param pid: the project (book) id used to look up the chunk definitions
    :param date_modified: the modification date; only the date portion is kept
    :param reporter: a lambda handler instance for reporting
    :type reporter: Handler
    :return: a dict containing the chunked book source and its modification date
    """
    # use utf-8-sig to remove the byte order mark
    with codecs.open(path, 'r', encoding='utf-8-sig') as in_file:
        usx = in_file.readlines()

    try:
        data = get_url(
            'https://cdn.door43.org/bible/txt/1/{}/chunks.json'.format(pid))
        chunks = index_chunks(json.loads(data))
    except Exception:
        raise Exception('Failed to retrieve chunk information for {}'.format(path))

    book = usx_to_chunked_json(usx, chunks, lid, pid)

    return {
        'source': {
            'chapters': book,
            'date_modified': date_modified.replace('-', '').split('T')[0]
        }
    }

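# Hedged usage sketch (not part of the original handler): illustrates how
# build_json_source_from_usx might be invoked. The path, ids, and timestamp are
# hypothetical placeholder values, and the call requires network access to
# cdn.door43.org to fetch the chunk definitions.
def _example_build_json_source():
    source = build_json_source_from_usx(
        path='/tmp/01-GEN.usx',                     # hypothetical USX file
        lid='en',                                   # language id
        pid='gen',                                  # book id used to fetch chunks.json
        date_modified='2017-01-25T00:00:00+00:00')  # only the date portion is kept
    # the timestamp above is reduced to a compact date string, e.g. '20170125'
    return source['source']['chapters']
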
def download_chunks(pid):
    """
    Downloads the chunks for the bible book
    :param pid:
    :return: the chunk json data or None
    """
    try:
        data = get_url('https://cdn.door43.org/bible/txt/1/{}/chunks.json'.format(pid))
        return json.loads(data)
    except Exception:
        return None

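# Hedged usage sketch (assumption, not part of the original handler): shows the
# intended calling pattern for download_chunks, which returns None on any failure,
# so callers should check the result before indexing it. 'gen' is a hypothetical
# book id.
def _example_download_chunks():
    chunks = download_chunks('gen')
    if chunks is None:
        return {}
    # index_chunks (defined elsewhere in this module) groups chunk ids by chapter
    return index_chunks(chunks)
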
def get_url(self, url, catch_exception=False):
    """
    Wrapper around the module-level get_url
    :param url:
    :param catch_exception: passed through to the underlying get_url
    :return:
    """
    return get_url(url, catch_exception)

def get_url(self, url):
    """
    Wrapper around the module-level get_url
    :param url:
    :return:
    """
    return get_url(url)

def index_tn_rc(lid, temp_dir, rc_dir, reporter=None):
    """
    Converts a v3 tN into its v2 equivalent.
    This will write a bunch of files and return a list of files to be uploaded.
    :param lid: the language id of the notes
    :param temp_dir: the directory where all the files will be written
    :param rc_dir: the directory of the resource container
    :param reporter: a lambda handler used for reporting
    :type reporter: Handler
    :return: a list of note files to upload
    """
    note_general_re = re.compile('^([^#]+)', re.UNICODE)
    note_re = re.compile('^#+([^#\n]+)#*([^#]*)', re.UNICODE | re.MULTILINE | re.DOTALL)
    tn_uploads = {}

    manifest = yaml.load(read_file(os.path.join(rc_dir, 'manifest.yaml')))
    dc = manifest['dublin_core']

    for project in manifest['projects']:
        pid = Handler.sanitize_identifier(project['identifier'])
        chunk_json = []
        if pid != 'obs':
            try:
                data = get_url('https://cdn.door43.org/bible/txt/1/{}/chunks.json'.format(pid))
                chunk_json = index_chunks(json.loads(data))
            except Exception:
                if reporter:
                    reporter.report_error('Failed to retrieve chunk information for {}-{}'.format(lid, pid))
                continue

        note_dir = os.path.normpath(os.path.join(rc_dir, project['path']))
        note_json = []
        if not os.path.exists(note_dir):
            raise Exception('Project directory missing. Could not find {}'.format(note_dir))
        chapters = os.listdir(note_dir)

        for chapter in chapters:
            if chapter in ['.', '..', 'front']:
                continue
            chapter_dir = os.path.join(note_dir, chapter)
            verses = os.listdir(chapter_dir)
            verses.sort()

            notes = []
            firstvs = None
            note_hashes = []
            for verse in verses:
                if verse in ['.', '..', 'intro.md']:
                    continue
                # notes = []
                verse_file = os.path.join(chapter_dir, verse)
                verse = verse.split('.')[0]
                verse_body = read_file(verse_file)
                verse_body = convert_rc_links(verse_body)
                general_notes = note_general_re.search(verse_body)

                # close chunk
                chapter_key = chapter
                if firstvs is not None and (pid != 'obs' and chapter_key not in chunk_json):
                    # attempt to recover if Psalms
                    if pid == 'psa':
                        chapter_key = chapter_key.zfill(3)
                    else:
                        if reporter:
                            reporter.report_error(
                                'Could not find chunk data for {} {} {}'.format(rc_dir, pid, chapter_key))

                if firstvs is not None and (pid == 'obs' or verse in chunk_json[chapter_key]):
                    note_json.append({
                        'id': '{}-{}'.format(chapter, firstvs),
                        'tn': notes
                    })
                    firstvs = verse
                    notes = []
                elif firstvs is None:
                    firstvs = verse

                if general_notes:
                    verse_body = note_general_re.sub('', verse_body)
                    notes.append({
                        'ref': 'General Information',
                        'text': general_notes.group(0).strip()
                    })

                for note in note_re.findall(verse_body):
                    # TRICKY: do not include translation words in the list of notes
                    if note[0].strip().lower() != 'translationwords':
                        hasher = hashlib.md5()
                        hasher.update(note[0].strip().lower().encode('utf-8'))
                        note_hash = hasher.hexdigest()
                        if note_hash not in note_hashes:
                            note_hashes.append(note_hash)
                            notes.append({
                                'ref': note[0].strip(),
                                'text': note[1].strip()
                            })

            # close last chunk
            if firstvs is not None:
                note_json.append({
                    'id': '{}-{}'.format(chapter, firstvs),
                    'tn': notes
                })

        if note_json:
            tn_key = '_'.join([lid, '*', pid, 'tn'])
            note_json.append({'date_modified': dc['modified'].replace('-', '')})
            note_upload = prep_data_upload('{}/{}/notes.json'.format(pid, lid), note_json, temp_dir)
            tn_uploads[tn_key] = note_upload

    return tn_uploads

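# Hedged usage sketch (assumption): demonstrates how index_tn_rc might be driven
# from a handler. The directories and language id below are hypothetical
# placeholders; in practice the resource container would already be extracted to
# rc_dir with a manifest.yaml before this is called.
def _example_index_tn_rc(reporter=None):
    uploads = index_tn_rc(
        lid='en',                  # language id of the notes
        temp_dir='/tmp/tn-build',  # where the generated notes.json files are written
        rc_dir='/tmp/en_tn',       # extracted resource container containing manifest.yaml
        reporter=reporter)         # optional Handler used for error reporting
    # keys look like 'en_*_gen_tn'; values are upload descriptors from prep_data_upload
    for key, upload in uploads.items():
        print(key, upload)
    return uploads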