def import_orgs():
    """Import organization rows from 'profile_organization' parquet files into the DB.

    Scans all known parquet files, keeps those whose path starts with
    'profile_organization', and inserts one Organization per row, committing
    as it goes. NaN/NaT start/end dates are stored as NULL.
    """
    org_files = (p for p in get_parquet_file_paths()
                 if p.startswith('profile_organization'))
    for path in org_files:
        frame = pq.read_table(path).to_pandas()
        for row_index, record in frame.iterrows():
            # Progress indicator; row_index is the DataFrame index, not the DB id.
            print(f"org {row_index}")
            org = Organization()
            org.id = uuid.uuid4().hex
            org.created = record['created']
            org.updated = record['updated']
            org.profile_id = record['profile_id']
            # Normalize pandas NaN/NaT to None so the DB gets NULL.
            org.starts_at = None if pd.isnull(record['starts_at']) else record['starts_at']
            org.ends_at = None if pd.isnull(record['ends_at']) else record['ends_at']
            org.name = record['name']
            org.title = record['title']
            org.description = record['description']
            session.add(org)
            session.commit()
def spider_close(self):
    """Finish the crawl: persist per-domain collected data and link it to organizations.

    For every domain that was being processed, serialize the accumulated
    resources to UTF-8 JSON bytes and save the collected-data record, create
    and save an Organization named after the domain, then attach both to the
    per-domain processing record and save it.
    """
    logging.info("Spider closed")
    logging.info("Saving collected data")
    # Iterate items() so each processing record is fetched once instead of
    # re-looking up self.organization_processing_by_domain[domain] three times.
    for domain, processing in self.organization_processing_by_domain.items():
        # Lazy %s formatting — the message is only built if the level is enabled.
        logging.info("Saving for %s", domain)
        collected_data = self.collected_data_by_domain[domain]
        collected_data.resources = json.dumps(
            collected_data.resources_array).encode("utf-8")
        collected_data.save()
        organization = Organization()
        organization.name = domain
        organization.save()
        processing.organization = organization
        processing.collected_data = collected_data
        processing.save()