def download_and_save_abstracts_for_search_term(search_term, dataset, max_ids):
    """Fetch PubMed abstracts for *search_term* and persist them.

    The abstracts are written to ``<global_output_directory_name>/<dataset>/abstracts.csv``
    (directory created on demand) and the dataframe is returned to the caller.

    Args:
        search_term: query string passed to the PubMed retrieval helper.
        dataset: sub-directory name under the global output directory.
        max_ids: maximum number of PubMed IDs to retrieve.

    Returns:
        The dataframe of retrieved abstracts.
    """
    df = retrieve_pubmed_abstracts([search_term], max_ids)
    out_dir = os.path.join(global_output_directory_name, dataset)
    FileUtil.create_directory_if_not_exists(out_dir)
    df.to_csv(os.path.join(out_dir, 'abstracts.csv'))
    return df
def find_and_save_food_disease_dfs(self, ids_and_abstracts, dataset):
    """Run every food/disease extractor over the abstracts and persist results.

    For each extractor, writes ``<global_output_directory_name>/<dataset>/<extractor_name>.csv``
    containing the de-duplicated extraction results; an existing file is left
    untouched (the run is skipped for that extractor). Every 1000 processed
    abstracts an intermediate checkpoint ``<extractor_name>_<i>.csv`` is written.

    Args:
        ids_and_abstracts: iterable of ``(file_name, abstract_text)`` pairs.
            NOTE(review): iterated once per extractor — presumably a list, not
            a one-shot generator; confirm at the call site.
        dataset: sub-directory name under the global output directory.
    """
    save_directory = os.path.join(global_output_directory_name, dataset)
    FileUtil.create_directory_if_not_exists(save_directory)
    for extractor in self.food_extractors + self.disease_extractors:
        print(extractor.name)
        save_file = os.path.join(
            save_directory,
            '{extractor_name}.csv'.format(extractor_name=extractor.name))
        if os.path.isfile(save_file):
            print('File already exists: {0}'.format(save_file))
            continue
        # Accumulate per-document frames in a list and concatenate once:
        # DataFrame.append was O(n^2) overall and has been removed in
        # pandas >= 2.0.
        frames = []
        # enumerate(start=1) reproduces the original counter, which was
        # incremented before the extraction attempt (errors still count).
        for i, (file_name, file_content) in enumerate(ids_and_abstracts,
                                                      start=1):
            doc = self.english_model(file_content)
            file_name = str(file_name)
            try:
                extracted_df = extractor.extract(doc, file_name, self.dataset,
                                                 save_entities=False)
                extracted_df['extractor'] = extractor.name
                extracted_df['file_name'] = file_name
                frames.append(extracted_df)
            except Exception:
                # Best-effort: one bad abstract must not abort the whole run,
                # but never a bare `except:` (it would swallow SystemExit and
                # KeyboardInterrupt too).
                if self.verbose:
                    print('Error happened')
                    traceback.print_exc(file=sys.stdout)
            if i % 1000 == 0:
                # Periodic checkpoint so a long run can be inspected/resumed.
                checkpoint = pd.concat(frames) if frames else pd.DataFrame()
                checkpoint.drop_duplicates().to_csv(
                    os.path.join(
                        save_directory,
                        '{extractor_name}_{i}.csv'.format(
                            extractor_name=extractor.name, i=i)))
        if frames:
            df_to_save = pd.concat(frames)
        else:
            # Keep the output schema stable even when nothing was extracted.
            df_to_save = pd.DataFrame(columns=[
                'start_char', 'end_char', 'entity_type', 'entity_id', 'text',
                'sentence', 'sentence_index', 'extractor', 'file_name'
            ])
        df_to_save.drop_duplicates().to_csv(save_file)
def save(self, doc: Doc, objects: List, file_name: str,
         file_subdirectory: str):
    """Persist *doc* to disk and its extracted entities as a dataframe file.

    The spaCy doc is serialized to ``<output_dir>/<file_name>`` and the
    entities attached on ``doc._.entities`` are written under
    ``<output_dir>/as_df`` via PandasUtil.

    Args:
        doc: document to serialize.
        objects: extracted objects used to prepare the doc for saving.
        file_name: base name for the on-disk artifacts.
        file_subdirectory: sub-directory key resolved by get_output_directory.
    """
    out_dir = self.get_output_directory(file_subdirectory)
    FileUtil.create_directory_if_not_exists(out_dir)
    doc, column_names = self.prepare_doc_for_saving(doc, objects)
    doc.to_disk(f'{out_dir}/{file_name}')
    PandasUtil.write_object_list_as_dataframe_file(
        doc._.entities,
        file_name,
        f'{out_dir}/as_df',
        columns=column_names)