def process_corpora(conf_list, backend, reg_dir, variant, replace):
    """
    Install the corpora described by the given configuration files.

    Each file from `conf_list` is loaded into a fresh InstallJson instance.
    If `replace` is truthy, the backend record of the corpus is removed
    first. A corpus the backend already contains is skipped; otherwise its
    configuration is saved and, if a registry file exists at
    reg_dir[/variant]/<ident>, that file is handed over to parse_registry().

    Args:
        conf_list: an iterable of paths to corpus configuration files
        backend: an object providing remove_corpus(), contains_corpus()
            and save_corpus_config()
        reg_dir: a root directory of the registry files
        variant: an optional sub-directory of `reg_dir`; a falsy value
            means the registry file is looked up directly in `reg_dir`
        replace: if truthy, an existing record is removed before the
            presence check
    """
    log = logging.getLogger(__name__)
    for conf_file in conf_list:
        log.info('Processing %s', conf_file)
        conf = InstallJson()
        # the handle is needed only while loading the configuration
        with open(conf_file) as fr:
            conf.update(fr)

        if replace:
            log.info('Removing existing record (including registry) for %s.', conf.ident)
            backend.remove_corpus(conf.ident)

        if backend.contains_corpus(conf.ident):
            log.info('Corpus %s already present - skipping.', conf.ident)
            continue

        backend.save_corpus_config(conf, reg_dir, get_corpus_size(conf.ident, reg_dir))
        log.info('Saved config for %s.', conf.ident)

        # NOTE(review): the registry import is performed only for newly saved
        # corpora (i.e. "skipping" skips the registry too) - please confirm
        if variant:
            reg_path = os.path.join(reg_dir, variant, conf.ident)
        else:
            reg_path = os.path.join(reg_dir, conf.ident)
        if os.path.isfile(reg_path):
            enc = infer_encoding(reg_path)
            with open(reg_path) as fr2:
                parse_registry(fr2, variant=variant, backend=backend, encoding=enc)
def process_corpora(conf_list, backend, reg_dir, variant, replace):
    """
    Install the corpora described by the given configuration files.

    Each file from `conf_list` is loaded into a fresh InstallJson instance.
    If `replace` is truthy, the backend record of the corpus is removed
    first. A corpus the backend already contains is skipped; otherwise its
    configuration is saved and, if a registry file exists at
    reg_dir[/variant]/<ident>, that file is handed over to parse_registry().

    Args:
        conf_list: an iterable of paths to corpus configuration files
        backend: an object providing remove_corpus(), contains_corpus()
            and save_corpus_config()
        reg_dir: a root directory of the registry files
        variant: an optional sub-directory of `reg_dir`; a falsy value
            means the registry file is looked up directly in `reg_dir`
        replace: if truthy, an existing record is removed before the
            presence check
    """
    log = logging.getLogger(__name__)
    for conf_file in conf_list:
        log.info('Processing %s', conf_file)
        conf = InstallJson()
        # the handle is needed only while loading the configuration
        with open(conf_file) as fr:
            conf.update(fr)

        if replace:
            log.info('Removing existing record (including registry) for %s.', conf.ident)
            backend.remove_corpus(conf.ident)

        if backend.contains_corpus(conf.ident):
            log.info('Corpus %s already present - skipping.', conf.ident)
            continue

        backend.save_corpus_config(conf, reg_dir, get_corpus_size(conf.ident, reg_dir))
        log.info('Saved config for %s.', conf.ident)

        # NOTE(review): the registry import is performed only for newly saved
        # corpora (i.e. "skipping" skips the registry too) - please confirm
        if variant:
            reg_path = os.path.join(reg_dir, variant, conf.ident)
        else:
            reg_path = os.path.join(reg_dir, conf.ident)
        if os.path.isfile(reg_path):
            enc = infer_encoding(reg_path)
            with open(reg_path) as fr2:
                parse_registry(fr2, variant=variant, backend=backend, encoding=enc)
def create_corp_record(node, db, shared, json_out, variant):
    """
    Insert a `kontext_corpus` row for the corpus described by `node` and
    fill in the matching entry of the JSON output.

    The row is inserted first; then the structures/structural attributes it
    refers to are created and the row is updated with them. Finally the
    per-corpus JSON record is populated and the metadata and
    token/kwic-connect parts are delegated to create_metadata_record()
    and parse_tckc().

    Args:
        node: an object exposing an `attrib` mapping; the 'ident' key is
            required, all the other keys are optional
        db: a database handle passed to new_cursor() and to the helpers
        shared: an object providing get_corpus_size(ident)
        json_out: an object providing switch_to(ident) and `current`
        variant: not used by this function; kept for interface compatibility
    """
    attrs = node.attrib
    ident = attrs['ident'].lower()
    web = attrs.get('web')
    tagset = attrs.get('tagset')
    speech_segment_struct, speech_segment_attr = fetch_structattr(attrs.get('speech_segment'))
    default_virt_keyboard = attrs.get('default_virt_keyboard')
    speaker_id_struct, speaker_id_attr = fetch_structattr(attrs.get('speaker_id_attr'))
    speech_overlap_struct, speech_overlap_attr = fetch_structattr(
        attrs.get('speech_overlap_attr'))
    speech_overlap_val = attrs.get('speech_overlap_val')
    collator_locale = attrs.get('collator_locale', 'en_US')
    use_safe_font = decode_bool(attrs.get('use_safe_font', 'false'))
    sentence_struct = attrs.get('sentence_struct')

    # 'created' and 'updated' share the same value for a new record
    created = int(time.time())
    group_name, version = InstallJson.create_sorting_values(ident)

    cursor = new_cursor(db)
    cursor.execute(
        'INSERT INTO kontext_corpus (id, group_name, version, created, updated, active, web, '
        'tagset, collator_locale, speech_overlap_val, use_safe_font, size, default_virt_keyboard) '
        'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
        (ident, group_name, version, created, created, 1, web, tagset, collator_locale,
         speech_overlap_val, use_safe_font, shared.get_corpus_size(ident),
         default_virt_keyboard))

    # dependent structures and attrs
    if speech_segment_struct and speech_segment_attr:
        create_structattr(db, ident, speech_segment_struct, speech_segment_attr)
    if speaker_id_attr and speaker_id_struct:
        create_structattr(db, ident, speaker_id_struct, speaker_id_attr)
    if speech_overlap_struct and speech_overlap_attr:
        create_structattr(db, ident, speech_overlap_struct, speech_overlap_attr)
    if sentence_struct:
        create_structure(db, ident, sentence_struct)

    cursor.execute(
        'UPDATE kontext_corpus SET '
        'sentence_struct = ?, '
        'speech_segment_struct = ?, speech_segment_attr = ?, speaker_id_struct = ?, '
        'speaker_id_attr = ?, speech_overlap_struct = ?, speech_overlap_attr = ? '
        'WHERE id = ?',
        (sentence_struct, speech_segment_struct, speech_segment_attr, speaker_id_struct,
         speaker_id_attr, speech_overlap_struct, speech_overlap_attr, ident))

    # json generator
    json_out.switch_to(ident)
    json_out.current.ident = ident
    json_out.current.web = web
    json_out.current.sentence_struct = sentence_struct
    json_out.current.tagset = tagset
    # do not emit a bogus 'None.None' when the speech segment is not configured
    if speech_segment_struct and speech_segment_attr:
        json_out.current.speech_segment = '{0}.{1}'.format(
            speech_segment_struct, speech_segment_attr)
    else:
        json_out.current.speech_segment = None
    json_out.current.speaker_id_attr = speaker_id_attr
    json_out.current.speech_overlap_attr = speech_overlap_attr
    json_out.current.speech_overlap_val = speech_overlap_val
    json_out.current.collator_locale = collator_locale
    # a per-corpus value, so it belongs to the current record (it used to be
    # set on the json_out container itself, unlike all the other fields)
    json_out.current.use_safe_font = use_safe_font
    create_metadata_record(db, shared, node, ident, json_out.current)
    # NOTE(review): was `json_out.metadata...`; assumes the metadata object
    # lives on the current per-corpus record - please confirm
    json_out.current.metadata.default_virt_keyboard = default_virt_keyboard
    parse_tckc(node, db, ident, json_out.current)
def create_corp_record(node, db, shared, json_out, variant):
    """
    Insert a `kontext_corpus` row for the corpus described by `node` and
    fill in the matching entry of the JSON output.

    The row is inserted first; then the structures/structural attributes it
    refers to are created and the row is updated with them. Finally the
    per-corpus JSON record is populated and the metadata and
    token/kwic-connect parts are delegated to create_metadata_record()
    and parse_tckc().

    Args:
        node: an object exposing an `attrib` mapping; the 'ident' key is
            required, all the other keys are optional
        db: a database handle passed to new_cursor() and to the helpers
        shared: an object providing get_corpus_size(ident)
        json_out: an object providing switch_to(ident) and `current`
        variant: not used by this function; kept for interface compatibility
    """
    attrs = node.attrib
    ident = attrs['ident'].lower()
    web = attrs.get('web')
    tagset = attrs.get('tagset')
    speech_segment_struct, speech_segment_attr = fetch_structattr(attrs.get('speech_segment'))
    speaker_id_struct, speaker_id_attr = fetch_structattr(attrs.get('speaker_id_attr'))
    speech_overlap_struct, speech_overlap_attr = fetch_structattr(
        attrs.get('speech_overlap_attr'))
    speech_overlap_val = attrs.get('speech_overlap_val')
    collator_locale = attrs.get('collator_locale', 'en_US')
    use_safe_font = decode_bool(attrs.get('use_safe_font', 'false'))
    sentence_struct = attrs.get('sentence_struct')

    # 'created' and 'updated' share the same value for a new record
    created = int(time.time())
    group_name, version = InstallJson.create_sorting_values(ident)

    cursor = new_cursor(db)
    cursor.execute(
        'INSERT INTO kontext_corpus (id, group_name, version, created, updated, active, web, '
        'tagset, collator_locale, speech_overlap_val, use_safe_font, size) '
        'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
        (ident, group_name, version, created, created, 1, web, tagset, collator_locale,
         speech_overlap_val, use_safe_font, shared.get_corpus_size(ident)))

    # dependent structures and attrs
    if speech_segment_struct and speech_segment_attr:
        create_structattr(db, ident, speech_segment_struct, speech_segment_attr)
    if speaker_id_attr and speaker_id_struct:
        create_structattr(db, ident, speaker_id_struct, speaker_id_attr)
    if speech_overlap_struct and speech_overlap_attr:
        create_structattr(db, ident, speech_overlap_struct, speech_overlap_attr)
    if sentence_struct:
        create_structure(db, ident, sentence_struct)

    cursor.execute(
        'UPDATE kontext_corpus SET '
        'sentence_struct = ?, '
        'speech_segment_struct = ?, speech_segment_attr = ?, speaker_id_struct = ?, '
        'speaker_id_attr = ?, speech_overlap_struct = ?, speech_overlap_attr = ? '
        'WHERE id = ?',
        (sentence_struct, speech_segment_struct, speech_segment_attr, speaker_id_struct,
         speaker_id_attr, speech_overlap_struct, speech_overlap_attr, ident))

    # json generator
    json_out.switch_to(ident)
    json_out.current.ident = ident
    json_out.current.web = web
    json_out.current.sentence_struct = sentence_struct
    json_out.current.tagset = tagset
    # do not emit a bogus 'None.None' when the speech segment is not configured
    if speech_segment_struct and speech_segment_attr:
        json_out.current.speech_segment = '{0}.{1}'.format(
            speech_segment_struct, speech_segment_attr)
    else:
        json_out.current.speech_segment = None
    json_out.current.speaker_id_attr = speaker_id_attr
    json_out.current.speech_overlap_attr = speech_overlap_attr
    json_out.current.speech_overlap_val = speech_overlap_val
    json_out.current.collator_locale = collator_locale
    # a per-corpus value, so it belongs to the current record (it used to be
    # set on the json_out container itself, unlike all the other fields)
    json_out.current.use_safe_font = use_safe_font
    create_metadata_record(db, shared, node, ident, json_out.current)
    parse_tckc(node, db, ident, json_out.current)
def create_corp_record(node, db, shared, json_out):
    """
    Insert a `kontext_corpus` row for the corpus described by `node`, fill in
    the matching entry of the JSON output and create the initial registry.

    Once the row and the JSON record are written, the metadata and
    token/kwic-connect parts are delegated to create_metadata_record() and
    parse_tckc(). If create_initial_registry() returns a truthy sentence
    structure id, the id is stored into the inserted row.

    Args:
        node: an object exposing an `attrib` mapping; the 'ident' key is
            required, all the other keys are optional
        db: a database connection providing cursor()
        shared: an object providing get_corpus_size(ident)
        json_out: an object providing switch_to(ident) and `current`
    """
    attrs = node.attrib
    ident = attrs['ident'].lower()
    web = attrs.get('web')
    tagset = attrs.get('tagset')
    speech_segment = attrs.get('speech_segment')
    speaker_id_attr = attrs.get('speaker_id_attr')
    speech_overlap_attr = attrs.get('speech_overlap_attr')
    speech_overlap_val = attrs.get('speech_overlap_val')
    collator_locale = attrs.get('collator_locale', 'en_US')
    use_safe_font = decode_bool(attrs.get('use_safe_font', 'false'))
    sentence_struct = attrs.get('sentence_struct')

    # 'created' and 'updated' share the same value for a new record
    created = int(time.time())
    group_name, version = InstallJson.create_sorting_values(ident)

    cursor = db.cursor()
    cursor.execute(
        'INSERT INTO kontext_corpus (id, group_name, version, created, updated, active, web, '
        'tagset, collator_locale, speech_segment, speaker_id_attr, speech_overlap_attr, '
        'speech_overlap_val, use_safe_font, size) '
        'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
        (ident, group_name, version, created, created, 1, web, tagset, collator_locale,
         speech_segment, speaker_id_attr, speech_overlap_attr, speech_overlap_val,
         use_safe_font, shared.get_corpus_size(ident)))

    # json generator
    json_out.switch_to(ident)
    json_out.current.ident = ident
    json_out.current.web = web
    json_out.current.sentence_struct = sentence_struct
    json_out.current.tagset = tagset
    json_out.current.speech_segment = speech_segment
    json_out.current.speaker_id_attr = speaker_id_attr
    json_out.current.speech_overlap_attr = speech_overlap_attr
    json_out.current.speech_overlap_val = speech_overlap_val
    json_out.current.collator_locale = collator_locale
    # a per-corpus value, so it belongs to the current record (it used to be
    # set on the json_out container itself, unlike all the other fields)
    json_out.current.use_safe_font = use_safe_font
    create_metadata_record(node, ident, db, shared, json_out.current)
    parse_tckc(node, db, ident, json_out.current)

    sentence_struct_id = create_initial_registry(db, shared, ident, sentence_struct)
    if sentence_struct_id:
        cursor.execute(
            'UPDATE kontext_corpus SET sentence_struct_id = ? WHERE id = ?',
            (sentence_struct_id, ident))
def create_corp_record(node, db, shared, json_out, variant, create_if_none):
    """
    Update the `corpora` row of the corpus described by `node` and fill in
    the matching entry of the JSON output.

    If the corpus has no row yet, a bare one is inserted when
    `create_if_none` is truthy; otherwise the function returns without
    doing anything. The row is then updated with the corpus properties,
    the structures/structural attributes it refers to are created and the
    row is updated with them. Finally the per-corpus JSON record is
    populated and the metadata and token/kwic-connect parts are delegated
    to create_metadata_record() and parse_tckc().

    Args:
        node: an object exposing an `attrib` mapping; the 'ident' key is
            required, all the other keys are optional
        db: a database handle passed to new_cursor() and to the helpers
        shared: an object providing get_corpus_size(ident)
        json_out: an object providing switch_to(ident) and `current`
        variant: not used by this function; kept for interface compatibility
        create_if_none: if truthy, a missing `corpora` row is created

    Returns:
        None
    """
    cursor = new_cursor(db)
    attrs = node.attrib
    ident = attrs['ident'].lower()
    if not _corpus_exists(db, ident):
        if not create_if_none:
            return
        cursor.execute('INSERT INTO corpora (name) VALUES (%s)', (ident,))

    web = attrs.get('web')
    tagset = attrs.get('tagset')
    speech_segment_struct, speech_segment_attr = fetch_structattr(attrs.get('speech_segment'))
    speaker_id_struct, speaker_id_attr = fetch_structattr(attrs.get('speaker_id_attr'))
    speech_overlap_struct, speech_overlap_attr = fetch_structattr(
        attrs.get('speech_overlap_attr'))
    speech_overlap_val = attrs.get('speech_overlap_val')
    collator_locale = attrs.get('collator_locale', 'en_US')
    use_safe_font = decode_bool(attrs.get('use_safe_font', 'false'))
    sentence_struct = attrs.get('sentence_struct')

    group_name, version = InstallJson.create_sorting_values(ident)
    t1 = datetime.datetime.now(
        tz=pytz.timezone('Europe/Prague')).strftime("%Y-%m-%dT%H:%M:%S%z")
    # 'updated' is assigned exactly once here; the statement used to contain
    # also 'updated = CURRENT_TIMESTAMP', which the explicit value overrode
    cursor.execute(
        'UPDATE corpora SET group_name = %s, version = %s, '
        'web = %s, tagset = %s, collator_locale = %s, speech_overlap_val = %s, use_safe_font = %s, '
        'size = %s, created = %s, updated = %s '
        'WHERE name = %s',
        (group_name, version, web, tagset, collator_locale, speech_overlap_val,
         use_safe_font, shared.get_corpus_size(ident), t1, t1, ident))

    # dependent structures and attrs
    if speech_segment_struct and speech_segment_attr:
        create_structattr(db, ident, speech_segment_struct, speech_segment_attr)
    if speaker_id_attr and speaker_id_struct:
        create_structattr(db, ident, speaker_id_struct, speaker_id_attr)
    if speech_overlap_struct and speech_overlap_attr:
        create_structattr(db, ident, speech_overlap_struct, speech_overlap_attr)
    if sentence_struct:
        create_structure(db, ident, sentence_struct)

    cursor.execute(
        'UPDATE corpora SET '
        'sentence_struct = %s, '
        'speech_segment_struct = %s, speech_segment_attr = %s, speaker_id_struct = %s, '
        'speaker_id_attr = %s, speech_overlap_struct = %s, speech_overlap_attr = %s '
        'WHERE name = %s',
        (sentence_struct, speech_segment_struct, speech_segment_attr, speaker_id_struct,
         speaker_id_attr, speech_overlap_struct, speech_overlap_attr, ident))

    # json generator
    json_out.switch_to(ident)
    json_out.current.ident = ident
    json_out.current.web = web
    json_out.current.sentence_struct = sentence_struct
    json_out.current.tagset = tagset
    # do not emit a bogus 'None.None' when the speech segment is not configured
    if speech_segment_struct and speech_segment_attr:
        json_out.current.speech_segment = '{0}.{1}'.format(
            speech_segment_struct, speech_segment_attr)
    else:
        json_out.current.speech_segment = None
    json_out.current.speaker_id_attr = speaker_id_attr
    json_out.current.speech_overlap_attr = speech_overlap_attr
    json_out.current.speech_overlap_val = speech_overlap_val
    json_out.current.collator_locale = collator_locale
    # a per-corpus value, so it belongs to the current record (it used to be
    # set on the json_out container itself, unlike all the other fields)
    json_out.current.use_safe_font = use_safe_font
    create_metadata_record(db, shared, node, ident, json_out.current)
    parse_tckc(node, db, ident, json_out.current)
def switch_to(self, corpus_id):
    """
    Make the record of the given corpus the current one.

    A new InstallJson record is registered under `corpus_id` first
    in case none is stored there yet.
    """
    records = self._data
    if corpus_id not in records:
        records[corpus_id] = InstallJson()
    self._current = records[corpus_id]
def create_corp_record(node, db, shared, json_out, variant):
    """
    Update the `corpora` row of the corpus described by `node` and fill in
    the matching entry of the JSON output.

    If the corpus has no row, the function returns without doing anything.
    Otherwise the row is updated with the corpus properties, the
    structures/structural attributes it refers to are created and the row
    is updated with them. Finally the per-corpus JSON record is populated
    and the metadata and token/kwic-connect parts are delegated to
    create_metadata_record() and parse_tckc().

    Args:
        node: an object exposing an `attrib` mapping; the 'ident' key is
            required, all the other keys are optional
        db: a database handle passed to new_cursor() and to the helpers
        shared: an object providing get_corpus_size(ident)
        json_out: an object providing switch_to(ident) and `current`
        variant: not used by this function; kept for interface compatibility

    Returns:
        None
    """
    attrs = node.attrib
    ident = attrs['ident'].lower()
    if not _corpus_exists(db, ident):
        return

    web = attrs.get('web')
    tagset = attrs.get('tagset')
    speech_segment_struct, speech_segment_attr = fetch_structattr(attrs.get('speech_segment'))
    speaker_id_struct, speaker_id_attr = fetch_structattr(attrs.get('speaker_id_attr'))
    speech_overlap_struct, speech_overlap_attr = fetch_structattr(
        attrs.get('speech_overlap_attr'))
    speech_overlap_val = attrs.get('speech_overlap_val')
    collator_locale = attrs.get('collator_locale', 'en_US')
    use_safe_font = decode_bool(attrs.get('use_safe_font', 'false'))
    sentence_struct = attrs.get('sentence_struct')

    group_name, version = InstallJson.create_sorting_values(ident)
    cursor = new_cursor(db)
    t1 = datetime.datetime.now(
        tz=pytz.timezone('Europe/Prague')).strftime("%Y-%m-%dT%H:%M:%S%z")
    # 'updated' is assigned exactly once here; the statement used to contain
    # also 'updated = CURRENT_TIMESTAMP', which the explicit value overrode
    cursor.execute(
        'UPDATE corpora SET group_name = %s, version = %s, '
        'web = %s, tagset = %s, collator_locale = %s, speech_overlap_val = %s, use_safe_font = %s, '
        'size = %s, created = %s, updated = %s '
        'WHERE name = %s',
        (group_name, version, web, tagset, collator_locale, speech_overlap_val,
         use_safe_font, shared.get_corpus_size(ident), t1, t1, ident))

    # dependent structures and attrs
    if speech_segment_struct and speech_segment_attr:
        create_structattr(db, ident, speech_segment_struct, speech_segment_attr)
    if speaker_id_attr and speaker_id_struct:
        create_structattr(db, ident, speaker_id_struct, speaker_id_attr)
    if speech_overlap_struct and speech_overlap_attr:
        create_structattr(db, ident, speech_overlap_struct, speech_overlap_attr)
    if sentence_struct:
        create_structure(db, ident, sentence_struct)

    cursor.execute(
        'UPDATE corpora SET '
        'sentence_struct = %s, '
        'speech_segment_struct = %s, speech_segment_attr = %s, speaker_id_struct = %s, '
        'speaker_id_attr = %s, speech_overlap_struct = %s, speech_overlap_attr = %s '
        'WHERE name = %s',
        (sentence_struct, speech_segment_struct, speech_segment_attr, speaker_id_struct,
         speaker_id_attr, speech_overlap_struct, speech_overlap_attr, ident))

    # json generator
    json_out.switch_to(ident)
    json_out.current.ident = ident
    json_out.current.web = web
    json_out.current.sentence_struct = sentence_struct
    json_out.current.tagset = tagset
    # do not emit a bogus 'None.None' when the speech segment is not configured
    if speech_segment_struct and speech_segment_attr:
        json_out.current.speech_segment = '{0}.{1}'.format(
            speech_segment_struct, speech_segment_attr)
    else:
        json_out.current.speech_segment = None
    json_out.current.speaker_id_attr = speaker_id_attr
    json_out.current.speech_overlap_attr = speech_overlap_attr
    json_out.current.speech_overlap_val = speech_overlap_val
    json_out.current.collator_locale = collator_locale
    # a per-corpus value, so it belongs to the current record (it used to be
    # set on the json_out container itself, unlike all the other fields)
    json_out.current.use_safe_font = use_safe_font
    create_metadata_record(db, shared, node, ident, json_out.current)
    parse_tckc(node, db, ident, json_out.current)