def copy_to_database(self, *, s_file_id):
    '''
    Define a task method, and bind it to the base task. The setup method
    of the base task will be fired by the task_prerun signal handler,
    init_task, before the code in this task method is run, e.g.,
    self.s_file and self.import_utility are available.
    '''
    table_name = self.s_file.raw_table_name

    # Stage everything as VARCHAR so the copy can't fail on dirty values.
    columns = '''
        record_id UUID DEFAULT gen_random_uuid(),
        responding_agency VARCHAR,
        employer VARCHAR,
        last_name VARCHAR,
        first_name VARCHAR,
        title VARCHAR,
        department VARCHAR,
        base_salary VARCHAR,
        extra_pay VARCHAR,
        date_started VARCHAR,
        data_year INT
    '''

    create = 'CREATE TABLE {} ({})'.format(table_name, columns)

    with connection.cursor() as cursor:
        cursor.execute(create)

    meta = CsvMeta(self.s_file.standardized_file)
    formatted_data_file = meta.trim_extra_fields()

    with open(formatted_data_file, 'r', encoding='utf-8') as f:
        with connection.cursor() as cursor:
            copy_fmt = 'COPY "{table}" ({cols}) FROM STDIN CSV HEADER'

            copy = copy_fmt.format(table=table_name,
                                   cols=','.join(meta.REQUIRED_FIELDS))

            cursor.copy_expert(copy, f)

            # Index the normalized employer name for the matching
            # queries that come later in the import.
            cursor.execute('CREATE INDEX ON {} (TRIM(LOWER(employer)))'.format(
                table_name))

    self.update_status('copied to database')

    return 'Copied {} to database'.format(formatted_data_file)
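# The init_task handler referenced in the docstring isn't shown here. A
# minimal sketch of how a task_prerun handler can hydrate the task
# instance before the task body runs; StandardizedFile and ImportUtility
# are assumed names, not confirmed by the source.
from celery.signals import task_prerun


@task_prerun.connect
def init_task(task_id=None, task=None, kwargs=None, **rest):
    # Fires before any task body, so self.s_file and self.import_utility
    # are available inside task methods like copy_to_database.
    s_file_id = (kwargs or {}).get('s_file_id')

    if s_file_id:
        task.s_file = StandardizedFile.objects.get(id=s_file_id)  # assumed model
        task.import_utility = ImportUtility(s_file_id)  # assumed helper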
def clean_standardized_file(self):
    s_file = self.cleaned_data['standardized_file']

    meta = CsvMeta(s_file)
    self._validate_filetype(meta.file_type)
    self._validate_fields(meta.field_names)

    return s_file
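# The _validate_filetype and _validate_fields helpers aren't shown. A
# plausible sketch, modeled on the management command's checks in the
# next snippet; the exact ValidationError messages are assumptions.
def _validate_filetype(self, file_type):
    # Only CSV uploads are accepted.
    if file_type != 'csv':
        raise forms.ValidationError('Standardized file must be a CSV')


def _validate_fields(self, field_names):
    # Every canonical field must be present; extra fields are trimmed later.
    missing_fields = ', '.join(
        set(CsvMeta.REQUIRED_FIELDS) - set(field_names))

    if missing_fields:
        raise forms.ValidationError(
            'Standardized file missing fields: {}'.format(missing_fields))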
def validate(self, data_file):
    with open(data_file, 'rb') as df:
        meta = CsvMeta(File(df))

        if meta.file_type != 'csv':
            raise CommandError('Data file must be a CSV')

        missing_fields = ', '.join(
            set(CsvMeta.REQUIRED_FIELDS) - set(meta.field_names))

        if missing_fields:
            message = 'Standardized file missing fields: {}'.format(
                missing_fields)
            raise CommandError(message)

        valid_file_name = meta.trim_extra_fields()

    self.stdout.write('Validated {}'.format(data_file))

    return valid_file_name
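# CsvMeta.trim_extra_fields appears in both the task and the command,
# but its body isn't shown. A sketch under the assumption that it writes
# a copy of the CSV containing only the required fields, in canonical
# order, and returns the new path; self.file_path is an assumption, and
# the field list mirrors the CREATE TABLE statement above.
import csv
import tempfile


def trim_extra_fields(self):
    _, trimmed_file = tempfile.mkstemp(suffix='.csv')

    with open(self.file_path, encoding='utf-8') as infile, \
            open(trimmed_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.DictReader(infile)

        # extrasaction='ignore' silently drops any column that isn't in
        # REQUIRED_FIELDS, which is what "trimming" amounts to here.
        writer = csv.DictWriter(outfile, fieldnames=self.REQUIRED_FIELDS,
                                extrasaction='ignore')
        writer.writeheader()

        for row in reader:
            writer.writerow(row)

    return trimmed_file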
def clean_standardized_file(self):
    s_file = self.cleaned_data['standardized_file']

    meta = CsvMeta(s_file)
    self._validate_filetype(meta.file_type)
    self._validate_fields(meta.field_names)

    # Prefix the filename with the upload timestamp, so repeat uploads
    # of the same file get distinct names.
    now = datetime.datetime.now().strftime('%Y-%m-%dT%H%M%S')
    s_file.name = '{}-{}'.format(now, s_file.name)

    return s_file
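# For example, the strftime format above stores a file uploaded as
# salaries.csv at 10:30:00 on 15 January 2019 under a timestamped name:
>>> import datetime
>>> now = datetime.datetime(2019, 1, 15, 10, 30).strftime('%Y-%m-%dT%H%M%S')
>>> '{}-{}'.format(now, 'salaries.csv')
'2019-01-15T103000-salaries.csv'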
def test_match_or_create_responding_agency(raw_table_setup,
                                           canned_data,
                                           employer,
                                           queue,
                                           raw_field,
                                           model,
                                           model_kwargs):
    s_file = raw_table_setup
    q = queue(s_file.id)

    name = canned_data[raw_field]
    item = {'id': None, 'name': name}

    if isinstance(q, ChildEmployerQueue):
        # Child employers must reference an existing parent.
        parent = canned_data['Employer']
        employer.build(name=parent, vintage=s_file.upload)
        item['parent'] = parent

    for match in (None, 'a matching agency'):
        q.match_or_create(item.copy(), match)

        with connection.cursor() as cursor:
            select = '''
                SELECT EXISTS(
                  SELECT 1 FROM {raw_payroll}
                  WHERE {processed_field} = '{item}'
                ), EXISTS(
                  SELECT 1 FROM {raw_payroll}
                  WHERE {processed_field} = '{match}'
                )
            '''.format(raw_payroll=s_file.raw_table_name,
                       processed_field=CsvMeta._clean_field(raw_field),
                       item=name,
                       match=match)

            cursor.execute(select)
            item_exists, match_exists = cursor.fetchone()

        if match:
            # A match rewrites the raw rows to the matched name.
            assert match_exists and not item_exists
        else:
            # No match leaves the raw rows alone.
            assert item_exists and not match_exists

        assert model.objects.get(name=name, **model_kwargs)
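# match_or_create itself isn't shown. A sketch of the behavior the test
# pins down: a match rewrites the raw rows to the existing record's name,
# while no match creates a new canonical record. self.field and
# self.model are assumed attributes of the queue class.
def match_or_create(self, item, match=None):
    if match:
        update = '''
            UPDATE {raw_payroll}
            SET {field} = %s
            WHERE {field} = %s
        '''.format(raw_payroll=self.s_file.raw_table_name,
                   field=self.field)

        with connection.cursor() as cursor:
            cursor.execute(update, [match, item['name']])
    else:
        self.model.objects.create(name=item['name'])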