def download_and_save_abstracts_for_search_term(search_term, dataset, max_ids):
    """Retrieve the abstracts for one search term, store them as CSV, return them.

    The term is wrapped in a one-element list and passed, together with
    ``max_ids``, to ``retrieve_pubmed_abstracts``. Whatever that call returns
    is written to ``<global_output_directory_name>/<dataset>/abstracts.csv``
    through its ``to_csv`` method and then handed back to the caller.

    Args:
        search_term: The single query term to look up.
        dataset: Name of the sub-directory (below the global output
            directory) that receives ``abstracts.csv``.
        max_ids: Forwarded unchanged to ``retrieve_pubmed_abstracts``.

    Returns:
        The object returned by ``retrieve_pubmed_abstracts``
        (presumably a pandas DataFrame, since ``to_csv`` is called on it).
    """
    abstracts = retrieve_pubmed_abstracts([search_term], max_ids)

    # Make sure the per-dataset folder is there before writing into it.
    target_dir = os.path.join(global_output_directory_name, dataset)
    FileUtil.create_directory_if_not_exists(target_dir)

    csv_path = os.path.join(target_dir, 'abstracts.csv')
    abstracts.to_csv(csv_path)
    return abstracts
    def find_and_save_food_disease_dfs(self, ids_and_abstracts, dataset):
        """Run every food and disease extractor over the abstracts, one CSV each.

        For each extractor in ``self.food_extractors + self.disease_extractors``
        the result file is ``<global_output_directory_name>/<dataset>/<name>.csv``.
        If that file already exists the extractor is skipped. Otherwise every
        abstract is parsed with ``self.english_model``, passed to
        ``extractor.extract`` and the returned frame is tagged with the
        ``extractor`` and ``file_name`` columns. A de-duplicated snapshot is
        written to ``<name>_<count>.csv`` after every 1000 abstracts, and the
        final de-duplicated result is written to ``<name>.csv``.

        A failure while extracting from one abstract is caught so the
        remaining abstracts are still processed; the traceback is printed
        only when ``self.verbose`` is truthy.

        Args:
            ids_and_abstracts: Iterable of ``(identifier, text)`` pairs. It is
                iterated once per extractor, so it must be re-iterable (for
                example a list rather than a one-shot generator).
            dataset: Name of the sub-directory (below the global output
                directory) that receives the CSV files.

        Returns:
            None. Results are only written to disk.
        """
        save_directory = os.path.join(global_output_directory_name, dataset)
        FileUtil.create_directory_if_not_exists(save_directory)

        for extractor in self.food_extractors + self.disease_extractors:
            print(extractor.name)
            save_file = os.path.join(
                save_directory,
                '{extractor_name}.csv'.format(extractor_name=extractor.name))
            if os.path.isfile(save_file):
                print('File already exists: {0}'.format(save_file))
                continue

            # Collect the per-abstract frames in a list and combine them with
            # pd.concat: DataFrame.append was removed in pandas 2.0 and, even
            # where available, copies the whole accumulated frame each call.
            extracted_frames = []
            for i, (file_name, file_content) in enumerate(ids_and_abstracts,
                                                          start=1):
                doc = self.english_model(file_content)
                file_name = str(file_name)
                try:
                    # NOTE(review): self.dataset is passed here, not the
                    # `dataset` argument used for the save directory --
                    # confirm this is intended.
                    extracted_df = extractor.extract(doc,
                                                     file_name,
                                                     self.dataset,
                                                     save_entities=False)
                    extracted_df['extractor'] = extractor.name
                    extracted_df['file_name'] = file_name
                    extracted_frames.append(extracted_df)
                except Exception:
                    # Best effort: one bad abstract must not abort the run.
                    # `Exception` (not a bare except) lets Ctrl-C and
                    # SystemExit propagate.
                    if self.verbose:
                        print('Error happened')
                        traceback.print_exc(file=sys.stdout)

                if i % 1000 == 0:
                    # Periodic snapshot so partial results survive a crash.
                    snapshot = (pd.concat(extracted_frames)
                                if extracted_frames else pd.DataFrame())
                    snapshot.drop_duplicates().to_csv(
                        os.path.join(
                            save_directory,
                            '{extractor_name}_{i}.csv'.format(
                                extractor_name=extractor.name, i=i)))

            # Index is deliberately not reset so the CSV matches what the
            # previous append-based implementation produced.
            df_to_save = (pd.concat(extracted_frames)
                          if extracted_frames else pd.DataFrame())
            if df_to_save.shape[0] == 0:
                # Nothing extracted: still write a file with the expected
                # header so the extractor is treated as done on the next run.
                df_to_save = pd.DataFrame(columns=[
                    'start_char', 'end_char', 'entity_type', 'entity_id',
                    'text', 'sentence', 'sentence_index', 'extractor',
                    'file_name'
                ])
            df_to_save.drop_duplicates().to_csv(save_file)
Exemple #3
0
 def save(self, doc: Doc, objects: List, file_name: str, file_subdirectory: str):
     """Write the document and its entities below the output directory.

     The target directory comes from ``self.get_output_directory`` for the
     given ``file_subdirectory``. ``self.prepare_doc_for_saving`` supplies
     the document to store and the column names for the entity table. The
     document is written with ``doc.to_disk`` to ``<dir>/<file_name>``;
     ``doc._.entities`` is then handed to
     ``PandasUtil.write_object_list_as_dataframe_file`` with ``<dir>/as_df``
     as the destination.

     Args:
         doc: Document to store.
         objects: Passed to ``prepare_doc_for_saving`` along with ``doc``.
         file_name: Name used for the stored document and the entity table.
         file_subdirectory: Passed to ``get_output_directory`` to pick the
             target directory.
     """
     target_dir = self.get_output_directory(file_subdirectory)
     FileUtil.create_directory_if_not_exists(target_dir)

     prepared = self.prepare_doc_for_saving(doc, objects)
     doc, column_names = prepared

     doc_path = f'{target_dir}/{file_name}'
     doc.to_disk(doc_path)

     dataframe_dir = f'{target_dir}/as_df'
     PandasUtil.write_object_list_as_dataframe_file(
         doc._.entities,
         file_name,
         dataframe_dir,
         columns=column_names,
     )