def main(alto_fulltext_file, language_file, chunksize, processes):
    """
    Read the documents of the corpus from ALTO_FULLTEXT_FILE where each line of the .csv file
    describes one page. For each page, classify its language by means of langid.
    Store the classification results as a pickled pandas DataFrame in LANGUAGE_FILE.
    """
    target_path = os.path.dirname(language_file)

    if len(target_path) > 0 and not os.path.exists(target_path):
        os.makedirs(target_path, exist_ok=True)

    if alto_fulltext_file.endswith('.csv'):
        chunks = get_csv_chunks(alto_fulltext_file, chunksize)
    elif alto_fulltext_file.endswith('.sqlite3'):
        chunks = get_sqlite_chunks(alto_fulltext_file, chunksize)
    else:
        raise RuntimeError('Unsupported input file format.')

    language = list()

    for lan in prun(get_chunk_tasks(chunks), processes=processes, initializer=LanguageTask.initialize):
        language.append(lan)

    language = pd.concat(language, axis=0)

    language.to_pickle(language_file)

    return
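# Hypothetical usage sketch, not part of the original module: the file names, chunk size and
# process count below are assumptions for illustration only.
def _example_language_classification():
    main(alto_fulltext_file='fulltext.csv', language_file='derived/language.pkl',
         chunksize=10000, processes=8)

    # The result is a pickled pandas DataFrame with one classification result per page.
    return pd.read_pickle('derived/language.pkl')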
def main(alto_fulltext_file, entropy_file, chunksize, processes):
    """
    Read the documents of the corpus from ALTO_FULLTEXT_FILE where each line of the .csv file
    describes one page. For each page, compute its character entropy rate and store the result
    as a pickled pandas DataFrame in ENTROPY_FILE.
    """
    os.makedirs(os.path.dirname(entropy_file), exist_ok=True)

    if alto_fulltext_file.endswith('.csv'):
        chunks = get_csv_chunks(alto_fulltext_file, chunksize)
    elif alto_fulltext_file.endswith('.sqlite3'):
        chunks = get_sqlite_chunks(alto_fulltext_file, chunksize)
    else:
        raise RuntimeError('Unsupported input file format.')

    entropy = list()

    for et in prun(get_chunk_tasks(chunks), processes=processes):
        entropy.append(et)

    entropy = pd.concat(entropy, axis=0)

    entropy.to_pickle(entropy_file)

    return
def run(entities_file, embeddings, data_sequence, split_parts, processes, n_trees,
        distance_measure, output_path, search_k, max_dist, sem=None):

    return prun(LookUpBySurface._get_all(data_sequence, set(embeddings.keys()), split_parts, sem=sem),
                processes=processes, initializer=LookUpBySurface.initialize,
                initargs=(entities_file, embeddings, n_trees, distance_measure, output_path,
                          search_k, max_dist))
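# prun() is the parallel-run helper used by all of the functions in this section. Its
# implementation is not shown here; the sketch below is only an assumption inferred from the
# call sites (an iterable of task objects, an optional per-worker initializer with initargs,
# and a process count), assuming each task object is callable and returns one result.
def _call_task(task):
    return task()


def prun_sketch(tasks, processes=4, initializer=None, initargs=()):
    from multiprocessing import Pool

    with Pool(processes=processes, initializer=initializer, initargs=initargs) as pool:
        # imap streams results in task order as the workers finish them
        for result in pool.imap(_call_task, tasks):
            yield result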
def run(entities_file, context_matrix_file, data_sequence_1, data_sequence_2, embeddings_1, ent_type_1,
        split_parts, n_trees, distance_measure_1, output_path, search_k_1, max_dist, lookup_semaphore,
        embeddings_2, ent_type_2, w_size, batch_size, embed_semaphore, processes, refine_processes=0):

    return \
        prun(RefineLookup._get_all(entities_file, data_sequence_1, data_sequence_2, embeddings_1, ent_type_1,
                                   split_parts, n_trees, distance_measure_1, output_path, search_k_1, max_dist,
                                   lookup_semaphore, embeddings_2, ent_type_2, w_size, batch_size,
                                   embed_semaphore, processes),
             initializer=RefineLookup.initialize, initargs=(context_matrix_file,), processes=refine_processes)
def collect(fulltext_file, selection_file, corpus_file, chunksize, processes, min_line_len):
    """
    Reads the fulltext from a CSV or SQLITE3 file (see also altotool) and writes it to one big text file.

    FULLTEXT_FILE: The CSV or SQLITE3 file to read from.

    SELECTION_FILE: Consider only the subset of all pages that is defined by the DataFrame
    that is stored in <selection_file>.

    CORPUS_FILE: The output file that can be used by bert-pregenerate-trainingdata.
    """
    os.makedirs(os.path.dirname(corpus_file), exist_ok=True)

    print('Open {}.'.format(corpus_file))
    corpus_fh = codecs.open(corpus_file, 'w+', 'utf-8')
    corpus_fh.write(u'\ufeff')

    if fulltext_file.endswith('.csv'):
        chunks = get_csv_chunks(fulltext_file, chunksize)
    elif fulltext_file.endswith('.sqlite3'):
        chunks = get_sqlite_chunks(fulltext_file, chunksize)
    else:
        raise RuntimeError('Unsupported input file format.')

    for text in prun(get_chunk_tasks(chunks, min_line_len), processes=processes,
                     initializer=ChunkTask.initialize, initargs=(selection_file,)):
        corpus_fh.write(text)

    corpus_fh.close()

    return
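# Hypothetical usage sketch (file names and parameter values are assumptions for illustration):
# collect all pages listed in the selection DataFrame into one large UTF-8 text file that can
# then be passed to bert-pregenerate-trainingdata.
def _example_collect_corpus():
    collect(fulltext_file='fulltext.sqlite3', selection_file='selection.pkl',
            corpus_file='corpus/corpus.txt', chunksize=10000, processes=8, min_line_len=40)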
def infinite_feature_sequence(self):

    features = []
    candidates = []
    current_entity = None

    for entity_id, candidate, fe in \
            prun(self.get_feature_tasks(), initializer=ConvertSamples2Features.initialize,
                 initargs=(self._tokenizer, self._max_seq_length), processes=self._feature_processes):

        if entity_id is None:
            # entity_id == None marks the end of the current entity's samples: emit what has been collected.
            yield current_entity, features, pd.concat(candidates) if len(candidates) > 0 else []

            features = []
            candidates = []
            current_entity = None
            continue

        if current_entity is None:
            current_entity = entity_id

        if fe is not None:
            features.append(fe)

        if candidate is not None:
            candidates.append(candidate)
def run(embeddings, data_sequence, ent_type, w_size, batch_size, processes, sem=None, start_iteration=0):

    for result in \
            prun(EmbedWithContext._get_all(data_sequence, start_iteration, ent_type, w_size, batch_size, sem),
                 processes=processes, initializer=EmbedWithContext.initialize, initargs=(embeddings,)):

        for _, link_result in result.iterrows():
            yield link_result
def run(index_file, mapping_file, distance_measure, search_k, embeddings, data_sequence, start_iteration,
        ent_type, w_size, batch_size, processes, sem=None):

    return prun(
        LookUpBySurfaceAndContext._get_all(embeddings, data_sequence, start_iteration, ent_type, w_size,
                                           batch_size, processes, sem),
        processes=3*processes, initializer=LookUpBySurfaceAndContext.initialize,
        initargs=(index_file, mapping_file, distance_measure, search_k))
def sentence_stat(tsv_file, json_file, clef_gs_file, data_set_file, min_pairs, max_pairs, processes):

    tsv = pd.read_csv(tsv_file, sep='\t', comment='#', quoting=3)
    tsv.loc[tsv.TOKEN.isnull(), 'TOKEN'] = ""

    tsv_gs = pd.read_csv(clef_gs_file, sep='\t', comment='#', quoting=3)
    tsv_gs.loc[tsv_gs.TOKEN.isnull(), 'TOKEN'] = ""

    with open(json_file, 'r') as fp_json:
        ned_result = json.load(fp_json)

    ned_result = add_ground_truth(ned_result, tsv, tsv_gs)

    applicable_results = sum(['gt' in entity_result and 'decision' in entity_result
                              for _, entity_result in ned_result.items()])

    rank_intervalls = np.linspace(0.001, 0.1, 100)
    quantiles = np.linspace(0.1, 1, 10)

    def get_tasks():
        nonlocal rank_intervalls, quantiles

        for entity_id, entity_result in ned_result.items():

            if 'gt' not in entity_result:
                continue

            if 'decision' not in entity_result:
                continue

            yield SentenceStatTask(entity_result, quantiles, rank_intervalls, min_pairs, max_pairs)

    progress = tqdm(prun(get_tasks(), processes=processes), total=applicable_results)

    data = list()
    data_len = 0

    for data_part in progress:

        if data_part is None:
            continue

        data.append(data_part)
        data_len += len(data_part)

        progress.set_description("#data: {}".format(data_len))
        progress.refresh()

    data = pd.concat(data)

    data.to_pickle(data_set_file)
def get_lookup(self):

    for entity_id, ent_type, sentences, (_, embedded, embedding_config) in \
            prun(self.get_embed(), initializer=EmbedTask.initialize, initargs=(self._embeddings,),
                 processes=self._embed_processes):

        yield LookUpByEmbeddingWrapper(entity_id, sentences, page_title=entity_id,
                                       entity_embeddings=embedded, embedding_config=embedding_config,
                                       entity_title=entity_id, entity_type=ent_type,
                                       split_parts=self._split_parts,
                                       # return all the candidates - filtering is done below
                                       max_candidates=None)
def to_csv(source_dir, output_file, processes):

    with open(output_file, 'w') as f:

        writer = csv.writer(f)
        writer.writerow(['file_name', 'text', 'wc', 'ppn'])

        for filename, text, wc, ppn in prun(ExtractTask.get_all(source_dir), processes=processes):

            if filename is None:
                continue

            writer.writerow([filename, text, wc, ppn])
def infinite_feature_sequence(self):

    results = dict()

    for job_id, entity_id, candidate, fe in \
            prun(self.get_feature_tasks(), initializer=ConvertSamples2Features.initialize,
                 initargs=(self._tokenizer, self._max_seq_length), processes=self._feature_processes):

        self._queue_final_output.add_to_job(job_id, (entity_id, candidate, fe))

        while True:
            job_id, task_info, iter_quit = self._queue_final_output.get_next_task()

            if iter_quit:
                return

            if task_info is None:
                break

            entity_id, candidate, fe, params = task_info

            if self._verbose:
                print("infinite_feature_sequence: {}:{}".format(job_id, entity_id))

            if job_id not in results:
                results[job_id] = {'features': [], 'candidates': [], 'entity_id': entity_id}

            if entity_id is None:
                result = results.pop(job_id)

                yield job_id, (result['entity_id'], result['features'],
                               (pd.concat(result['candidates']) if len(result['candidates']) > 0 else []))
                continue

            if fe is not None:
                results[job_id]['features'].append(fe)

            if candidate is not None:
                results[job_id]['candidates'].append(candidate)
def _sentence_stat(ned_result, tsv, tsv_gs, min_pairs, max_pairs, processes):

    ned_result = add_ground_truth(ned_result, tsv, tsv_gs)

    applicable_results = sum(['gt' in entity_result and 'decision' in entity_result
                              for _, entity_result in ned_result.items()])

    rank_intervalls = np.linspace(0.001, 0.1, 100)
    quantiles = np.linspace(0.1, 1, 10)

    def get_tasks():
        nonlocal rank_intervalls, quantiles

        for entity_id, entity_result in ned_result.items():

            if 'gt' not in entity_result:
                continue

            if 'decision' not in entity_result:
                continue

            yield SentenceStatTask(entity_result, quantiles, rank_intervalls, min_pairs, max_pairs)

    progress = tqdm(prun(get_tasks(), processes=processes), total=applicable_results)

    data = list()
    data_len = 0

    for data_part in progress:

        if data_part is None:
            continue

        data.append(data_part)
        data_len += len(data_part)

        progress.set_description("#data: {}".format(data_len))
        progress.refresh()

    if len(data) < 1:
        return pd.DataFrame()

    data = pd.concat(data)

    return data
def to_sqlite(source_dir, output_file, processes):

    with sqlite3.connect(output_file) as conn:

        conn.execute('pragma journal_mode=wal')

        for idx, (filename, text, wc, ppn) in \
                enumerate(prun(ExtractTask.get_all(source_dir), processes=processes)):

            if filename is None:
                continue

            pd.DataFrame({'id': idx, 'file_name': filename, 'text': text, 'wc': wc, 'ppn': ppn},
                         index=[idx]).\
                reset_index(drop=True).set_index('id').\
                to_sql('text', con=conn, if_exists='append', index_label='id')

        conn.execute('create index idx_ppn on text(ppn);')
def get_sentence_pairs(self):

    for job_id, entity_id, candidate, pairs in \
            prun(self.get_sentence_lookup(), initializer=SentenceLookup.initialize,
                 initargs=(self._ned_sql_file,), processes=self._pairing_processes):

        if entity_id is None:
            # signal entity_id == None
            self._queue_pairs.add_to_job(job_id, (entity_id, None, None))
        else:
            if pairs is None:
                self._queue_pairs.add_to_job(job_id, (entity_id, candidate, None))
            else:
                for idx, row in pairs.iterrows():
                    pair = (row.id_a, row.id_b, json.loads(row.sen_a), json.loads(row.sen_b),
                            row.pos_a, row.pos_b, row.end_a, row.end_b, row.label)

                    self._queue_pairs.add_to_job(job_id, (entity_id, candidate, pair))

                    candidate = None

        while True:
            job_id, task_info, iter_quit = self._queue_pairs.get_next_task()

            if iter_quit:
                return

            if task_info is None:
                break

            entity_id, candidate, pair, params = task_info

            if self._verbose:
                print("get_sentence_pairs: {}:{}".format(job_id, entity_id))

            yield job_id, entity_id, candidate, pair
def process_sequence(self):

    complete_result = OrderedDict()

    for eid, result in prun(self.get_decider_tasks(), initializer=DeciderTask.initialize,
                            initargs=(self._decider, self._entities), processes=self._decider_processes):

        if eid is None:
            # eid == None signals the end of a complete sequence: emit the accumulated results.
            print('process_sequence done.')

            yield complete_result

            complete_result = OrderedDict()
            continue

        if result is None:
            continue

        complete_result[eid] = result
def altoannotator(tagged_sqlite_file, source_dir, dest_dir, processes, no_gzip):
    """
    Read NER tagging results from TAGGED_SQLITE_FILE.
    Read ALTO XML files in subfolders of directory SOURCE_DIR.
    Annotate the XML content with NER information and write the annotated ALTO XML back to the
    same directory structure in DEST_DIR.
    """
    dest_dir = "{}/{}".format(dest_dir, os.path.splitext(os.path.basename(tagged_sqlite_file))[0])

    os.makedirs(dest_dir, exist_ok=True)

    for _ in prun(AnnotateTask.get_all(source_dir, dest_dir, no_gzip), processes=processes,
                  initializer=AnnotateTask.initialize, initargs=(tagged_sqlite_file,)):
        pass
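# Hypothetical usage sketch (paths are assumptions for illustration only): annotate the ALTO XML
# files below 'alto/' with the NER results stored in the tagged sqlite file; the annotated copies
# end up under 'alto-annotated/<basename of the sqlite file>/' with the original directory layout.
def _example_annotate_alto():
    altoannotator(tagged_sqlite_file='fulltext-ner-model1.sqlite3', source_dir='alto/',
                  dest_dir='alto-annotated/', processes=8, no_gzip=False)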
def infinite_process_sequence(self):

    for job_id, (eid, result) in \
            prun(self.get_decider_tasks(), initializer=DeciderTask.initialize,
                 initargs=(self._decider, self._entities), processes=self._decider_processes):

        self._queue_final_output.add_to_job(job_id, (eid, result))

        while True:
            job_id, task_info, iter_quit = self._queue_final_output.get_next_task()

            if iter_quit:
                return

            if task_info is None:
                break

            eid, result, params = task_info

            yield job_id, (eid, result)
def get_decider_tasks(self):

    for entity_id, decision, candidates in prun(self.get_classifier_tasks(),
                                                initializer=ClassifierTask.initialize,
                                                initargs=(self._no_cuda, self._model_dir,
                                                          self._model_file, self._batch_size),
                                                processes=self._classifier_processes):

        if candidates is None:
            yield DeciderTask(entity_id=None, decision=None, candidates=None, quantiles=None,
                              rank_intervalls=None, threshold=None)
            continue

        yield DeciderTask(entity_id, decision, candidates, self._quantiles, self._rank_intervalls,
                          self._threshold, self._return_full)
def get_sentence_pairs(self):

    for entity_id, candidate, pairs in \
            prun(self.get_sentence_lookup(), initializer=SentenceLookup.initialize,
                 initargs=(self._ned_sql_file,), processes=self._pairing_processes):

        if entity_id is None:
            # signal entity_id == None
            yield None, None, None
            continue

        if pairs is None:
            continue

        for idx, row in pairs.iterrows():
            pair = (row.id_a, row.id_b, json.loads(row.sen_a), json.loads(row.sen_b),
                    row.pos_a, row.pos_b, row.end_a, row.end_b, row.label)

            yield entity_id, candidate, pair

            candidate = None
def get_sentence_lookup(self):

    for sentences, (entity_id, candidates) in \
            prun(self.get_lookup(), initializer=LookUpByEmbeddings.initialize,
                 initargs=(self._entities_file, self._entity_types, self._n_trees,
                           self._distance_measure, self._entity_index_path, self._search_k,
                           self._max_dist),
                 processes=self._lookup_processes):

        if entity_id is None:
            # signal entity_id == None
            yield SentenceLookupWrapper(entity_id=None)
            continue

        candidates = candidates.merge(self._entities[['proba']], left_on="guessed_title", right_index=True)

        candidates = candidates.\
            sort_values(['match_uniqueness', 'dist', 'proba', 'match_coverage', 'len_guessed'],
                        ascending=[False, True, False, False, True])

        candidates = candidates.iloc[0:self._max_candidates]

        for idx in range(0, len(candidates)):
            yield SentenceLookupWrapper(entity_id, sentences=sentences, candidates=candidates.iloc[[idx]],
                                        max_pairs=self._max_pairs)
def get_decider_tasks(self):

    for job_id, entity_id, decision, candidates in \
            prun(self.get_classifier_tasks(), initializer=ClassifierTask.initialize,
                 initargs=(self._no_cuda, self._model_dir, self._model_file, self._batch_size),
                 processes=self._classifier_processes):

        self._queue_decider.add_to_job(job_id, (entity_id, decision, candidates))

        while True:
            job_id, task_info, iter_quit = self._queue_decider.get_next_task()

            if iter_quit:
                return

            if task_info is None:
                break

            entity_id, decision, candidates, params = task_info

            print("get_decider_tasks: {}:{}".format(job_id, entity_id))

            if entity_id is None:
                continue

            if candidates is None:
                continue

            yield DeciderTaskWrapper(job_id, entity_id=entity_id, decision=decision, candidates=candidates,
                                     quantiles=self._quantiles, rank_intervalls=self._rank_intervalls,
                                     **params)
def run(embeddings, all_entities, split_parts, processes):

    return prun(EmbedTask._get_all(all_entities, split_parts), processes=processes,
                initializer=EmbedTask.initialize, initargs=(embeddings,))
def on_db_file(fulltext_sqlite_file, selection_file, model_name, ner_endpoint, chunksize, noproxy,
               processes, outfile):
    """
    Reads the text content per page of digitized collections from the sqlite file FULLTEXT_SQLITE_FILE.

    Considers only the subset of documents that is defined by SELECTION_FILE.

    Performs NER on the text content using the REST endpoint[s] NER_ENDPOINT ....

    Writes the NER results back to another sqlite file whose name is derived from FULLTEXT_SQLITE_FILE,
    '-ner-' and the model name, or to the file specified in the --outfile option.

    Writes the results in chunks of size <chunksize>.

    Suppress the proxy with the option --noproxy.
    """
    if noproxy:
        os.environ['no_proxy'] = '*'

    logging.info('Using endpoints: {}'.format(ner_endpoint))

    model_name = model_name.replace(" ", "")

    ner_endpoint_tmp = []
    for endpoint in ner_endpoint:
        models = json.loads(requests.get("{}/models".format(endpoint)).content)

        models = pd.DataFrame.from_dict(models)[['name', 'id']]
        models['name'] = models['name'].str.replace(" ", "")
        models = models.set_index('name')

        ner_endpoint_tmp.append("{}/ner/{}".format(endpoint, models.loc[model_name]['id']))

    ner_endpoint = ner_endpoint_tmp

    if outfile is None:
        tagged_sqlite_file = os.path.splitext(
            os.path.basename(fulltext_sqlite_file))[0] + "-ner-" + model_name + ".sqlite3"
    else:
        tagged_sqlite_file = outfile

    start_row = 0
    if os.path.exists(tagged_sqlite_file):
        with create_connection(tagged_sqlite_file) as read_conn:
            start_row = read_conn.execute('select max(id) from tagged').fetchone()[0] + 1

            logger.info('Starting from idx: {}'.format(start_row))

    with create_connection(tagged_sqlite_file) as write_conn:

        tagged = []
        for num, ppn, file_name, text, tags, original_text, received_text in \
                prun(NERTask.get_all(fulltext_sqlite_file, selection_file, ner_endpoint, start_row),
                     processes=len(ner_endpoint) if processes is None else processes):

            tagged.append({'id': num, 'ppn': ppn, 'file_name': file_name, 'text': text, 'tags': tags})

            try:
                assert original_text == received_text
            except AssertionError:
                logging.warning('PPN: {}, file_name: {}\n\n\nInput and output differ:\n\nInput: {}\n\nOutput:{}'.
                                format(ppn, file_name, original_text, received_text))

            if len(tagged) > chunksize:
                # noinspection PyTypeChecker
                df_tagged = pd.DataFrame.from_dict(tagged).reset_index(drop=True).set_index('id')

                df_tagged.to_sql('tagged', con=write_conn, if_exists='append', index_label='id')

                tagged = []

        if len(tagged) > 0:
            # noinspection PyTypeChecker
            df_tagged = pd.DataFrame.from_dict(tagged).reset_index(drop=True).set_index('id')

            df_tagged.to_sql('tagged', con=write_conn, if_exists='append', index_label='id')

        try:
            write_conn.execute('create index idx_ppn on tagged(ppn);')
        except sqlite3.OperationalError:
            pass

    return
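# Hypothetical usage sketch (file names, endpoint URL and model name are assumptions for
# illustration only): tag the selected pages of a fulltext database with a single NER REST
# endpoint and write the results to an automatically named '-ner-' sqlite file.
def _example_ner_on_db_file():
    on_db_file(fulltext_sqlite_file='fulltext.sqlite3', selection_file='selection.pkl',
               model_name='model1', ner_endpoint=['http://localhost:5000'],
               chunksize=100, noproxy=True,
               processes=None,  # defaults to one process per endpoint
               outfile=None)    # derive the output file name from the input file and model name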
def ned_sentence_data(tagged_sqlite_file, ned_sqlite_file, processes, writequeue):
    """
    TAGGED_SQLITE_FILE: A sqlite database file that contains all wikipedia articles where the
    relevant entities have been tagged. This database gives per-article access to the tagged
    sentences; it can be created using 'tag-wiki-entities2sqlite'.

    NED_SQLITE_FILE: Output database. This database gives fast per-entity and per-sentence access,
    i.e., it provides a fast answer to the question: "Give me all sentences where entity X is discussed."
    """
    first_write = True

    sentence_counter = 0
    link_counter = 0

    # prevent infinite growth of multiprocessing queue
    sem = Semaphore(writequeue)

    with sqlite3.connect(ned_sqlite_file) as write_conn:

        write_conn.execute('pragma journal_mode=wal')

        for df_sentence, df_linking in prun(NEDDataTask.get_all(tagged_sqlite_file, sem=sem),
                                            processes=processes):

            if df_sentence is None:
                sem.release()
                continue

            df_sentence['id'] += sentence_counter
            df_linking['sentence'] += sentence_counter
            df_linking['id'] = [link_counter + i for i in range(len(df_linking))]

            sentence_counter += len(df_sentence)
            link_counter += len(df_linking)

            df_sentence.set_index('id').to_sql('sentences', con=write_conn, if_exists='append',
                                               index_label='id')

            df_linking.set_index('id').to_sql('links', con=write_conn, if_exists='append',
                                              index_label='id')

            if first_write:
                write_conn.execute('create index idx_target on links(target);')
                write_conn.execute('create index idx_sentence on links(sentence);')
                write_conn.execute('create index idx_page_id on sentences(page_id);')
                write_conn.execute('create index idx_page_title on sentences(page_title);')

                first_write = False

            sem.release()
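# Hypothetical query sketch against the database written by ned_sentence_data(): fetch all
# sentences in which a given entity is discussed. The join assumes that links.sentence references
# sentences.id (this matches how sentence_counter is applied to both tables above); the entity
# title passed as 'target' is an assumption for illustration.
def _example_sentences_for_entity(ned_sqlite_file, entity):
    with sqlite3.connect(ned_sqlite_file) as conn:
        return pd.read_sql(
            'select s.* from links l join sentences s on s.id = l.sentence where l.target = ?',
            con=conn, params=(entity,))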