def process_allocated(out_dir, input_path):
    # create allocated things
    allocated_csv_filename = os.path.join(out_dir, 'nimsp_allocated_contributions.csv')
    allocated_csv = open(allocated_csv_filename, 'w')
    allocated_emitter = AllocatedEmitter(allocated_csv, fieldnames=FIELDNAMES)

    # create unallocated things
    unallocated_csv_filename = os.path.join(out_dir, 'nimsp_unallocated_contributions.csv.TMP')
    unallocated_csv = open(unallocated_csv_filename, 'w')
    unallocated_emitter = UnallocatedEmitter(unallocated_csv, fieldnames=FIELDNAMES + ['contributionid'])

    input_file = open(input_path, 'r')
    input_fields = [name for (name, _, _) in CSV_SQL_MAPPING]
    source = VerifiedCSVSource(input_file, input_fields)

    output_func = chain_filters(
        unallocated_emitter,
        DCIDFilter(SALT_KEY),
        allocated_emitter)

    load_data(source, NIMSPDenormalize.get_allocated_record_processor(), output_func)

    for o in [allocated_csv, unallocated_csv]:
        o.close()

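# NOTE: the real chain_filters implementation isn't shown in this excerpt.
# As a minimal sketch only (names and semantics assumed, not the project's
# actual API): compose record-processing stages into a single output function,
# where each stage takes a record dict and returns a record, or None to drop it.
def chain_filters_sketch(*stages):
    def process(record):
        for stage in stages:
            record = stage(record)
            if record is None:  # a stage filtered this record out
                return None
        return record
    return process

# usage: lower-case a field, then drop records with no amount
pipeline = chain_filters_sketch(
    lambda r: dict(r, name=r['name'].lower()),
    lambda r: r if r.get('amount') else None,
)
print(pipeline({'name': 'ACME PAC', 'amount': 500}))  # {'name': 'acme pac', 'amount': 500}
print(pipeline({'name': 'EMPTY', 'amount': 0}))       # None
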
def load_payment(csvpath, *args, **options):
    loader = FARAPaymentLoader(
        source='DOJ',
        description='load from denormalized CSVs',
        imported_by="loadfara.py (%s)" % os.getenv('LOGNAME', 'unknown'),
    )

    payment_record_processor = chain_filters(
        CSVFieldVerifier(),
        FieldRemover('id'),
        FieldRemover('import_reference'),
        FieldAdder('import_reference', loader.import_session),
        FieldCopier({'date_asterisk': 'date'}),
        FieldModifier('date', parse_fara_date),
        FieldModifier('date_asterisk', parse_fara_asterisk),
        FieldModifier('amount', parse_decimal),
        FieldModifier(('document_id', 'client_id', 'registrant_id',
                       'record_id', 'location_id', 'subcontractor_id'), parse_int),
        UnicodeFilter(),
        StringLengthFilter(Payment))

    output_func = chain_filters(
        LoaderEmitter(loader),
        Every(REPORT_FREQUENCY, progress_tick),
    )

    input_iterator = VerifiedCSVSource(
        open(os.path.abspath(csvpath)),
        fieldnames=Payment.FIELDNAMES,
        skiprows=1)

    load_data(input_iterator, payment_record_processor, output_func)

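# The FieldModifier stages above apply parsers (parse_fara_date, parse_decimal,
# parse_int) to individual fields before loading. Those helpers aren't shown in
# this excerpt; as an illustration only (the names and behavior below are
# assumptions, not the project's code), a FieldModifier-style stage could be:
from decimal import Decimal, InvalidOperation

def make_field_modifier(fields, func):
    # apply func to each named field, leaving the rest of the record untouched
    if isinstance(fields, str):
        fields = (fields,)
    def modify(record):
        for field in fields:
            record[field] = func(record.get(field))
        return record
    return modify

def parse_decimal_sketch(value):
    # assumed behavior: empty or malformed values become None instead of raising
    try:
        return Decimal(value) if value else None
    except InvalidOperation:
        return None

modify_amount = make_field_modifier('amount', parse_decimal_sketch)
print(modify_amount({'amount': '1500.00'}))  # {'amount': Decimal('1500.00')}
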
def handle(self, input_path, year, **options):
    imp = Import.objects.create(
        source=input_path,
        imported_by=__file__)

    input_file = open(input_path, 'r')
    input_source = VerifiedCSVSource(input_file, FIELDS, skiprows=1)

    # todo: real year and import_ref
    processor = LoadTCSEarmarks.get_record_processor(int(year), imp)

    load_data(input_source, processor, save_earmark)

def denormalize(self, data_path, cycles, catcodes, candidates, committees):
    infiles = Files(*[os.path.join(data_path, 'raw', 'crp', 'pac_other%s.txt' % cycle)
                      for cycle in cycles])
    outfile = open(os.path.join(data_path, 'denormalized', 'denorm_pac2pac.txt'), 'w')

    output_func = CSVEmitter(outfile, fieldnames=FIELDNAMES).process_record
    source = VerifiedCSVSource(infiles, fieldnames=FILE_TYPES['pac_other'], quotechar="|")
    record_processor = self.get_record_processor(catcodes, candidates, committees)

    load_data(source, record_processor, output_func)

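# Files(*paths) above presumably presents the per-cycle input files as one
# continuous stream so a single VerifiedCSVSource can read them all. A minimal
# sketch under that assumption (not the project's actual class):
class FilesSketch:
    def __init__(self, *paths):
        self.paths = paths

    def __iter__(self):
        # yield the lines of each file in turn, as one iterable
        for path in self.paths:
            with open(path, 'r') as f:
                for line in f:
                    yield line
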
def issue_handler(inpath, outpath, infields, outfields):
    run_recipe(
        VerifiedCSVSource(open(inpath, 'r'), fieldnames=infields, quotechar='|'),
        FieldCountValidator(len(FILE_TYPES['lob_issue'])),
        CSVFieldVerifier(),
        FieldRenamer({
            'id': 'SI_ID',
            'transaction': 'UniqID',
            'general_issue_code': 'IssueID',
            'general_issue': 'Issue',
            'specific_issue': 'SpecIssue',
            'year': 'Year',
        }),
        #DebugEmitter(),
        CSVEmitter(open(outpath, 'w'), fieldnames=outfields),
    )

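# FieldRenamer maps the raw lob_issue column names onto the output schema.
# Judging from the call above, the mapping reads {new_name: old_name}; that
# direction is an assumption, and the sketch below is illustrative only:
def make_field_renamer(mapping):
    def rename(record):
        # build a new record keyed by the new names
        return {new: record.get(old) for new, old in mapping.items()}
    return rename

rename = make_field_renamer({'transaction': 'UniqID', 'year': 'Year'})
print(rename({'UniqID': 'A123', 'Year': '2008'}))
# {'transaction': 'A123', 'year': '2008'}
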
def run(self):
    run_recipe(
        VerifiedCSVSource(open(self.inpath)),
        CSVFieldVerifier(),
        FieldModifier('year', lambda x: int(x) if x else None),
        FieldRenamer({'transaction_id': 'transaction'}),
        NoneFilter(),
        FieldModifier('specific_issue', lambda x: '' if x is None else x),
        TRANSACTION_FILTER,
        UnicodeFilter(),
        CountEmitter(every=10000, log=self.log),
        LoaderEmitter(IssueLoader(
            source=self.inpath,
            description='load from denormalized CSVs',
            imported_by="loadlobbying (%s)" % os.getenv('LOGNAME', 'unknown'),
            log=self.log,
        ), commit_every=100),
    )

def denormalize(self, data_path, cycles, catcodes, candidates, committees):
    record_processor = self.get_record_processor(catcodes, candidates, committees)

    for cycle in cycles:
        in_path = os.path.join(data_path, 'raw', 'crp', 'indivs%s.txt' % cycle)
        infile = open(in_path, 'r')
        out_path = os.path.join(data_path, 'denormalized', 'denorm_indivs.%s.txt' % cycle)
        outfile = open(out_path, 'w')

        sys.stdout.write('Reading from %s, writing to %s...\n' % (in_path, out_path))

        input_source = VerifiedCSVSource(infile, fieldnames=FILE_TYPES['indivs'], quotechar="|")
        output_func = CSVEmitter(outfile, fieldnames=FIELDNAMES).process_record

        load_data(input_source, record_processor, output_func)

def handle(self, csvpath, *args, **options):
    loader = ContributionLoader(
        source=options.get('source'),
        description='load from denormalized CSVs',
        imported_by="loadcontributions.py (%s)" % os.getenv('LOGNAME', 'unknown'),
    )
    try:
        input_iterator = VerifiedCSVSource(
            open(os.path.abspath(csvpath)),
            FIELDNAMES,
            skiprows=1 + int(options['skip']))

        output_func = chain_filters(
            LoaderEmitter(loader),
            #Every(self.COMMIT_FREQUENCY, lambda i: transaction.commit()),
            Every(self.COMMIT_FREQUENCY, progress_tick),
            Every(self.COMMIT_FREQUENCY, lambda i: reset_queries()),
        )

        record_processor = self.get_record_processor(loader.import_session)

        load_data(input_iterator, record_processor, output_func)
        transaction.commit()
    except KeyboardInterrupt:
        traceback.print_exception(*sys.exc_info())
        transaction.rollback()
        raise
    except:
        # roll back the partial import on any other failure as well
        traceback.print_exception(*sys.exc_info())
        transaction.rollback()
        raise
    finally:
        sys.stdout.flush()
        sys.stderr.flush()

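# The Every(...) stages above fire a side effect once per COMMIT_FREQUENCY
# records: a progress tick, then reset_queries() to clear Django's query log
# so a long-running import doesn't accumulate memory. A sketch of an
# Every-style pass-through stage, with the counting behavior assumed:
def make_every(n, callback):
    state = {'count': 0}
    def tick(record):
        state['count'] += 1
        if state['count'] % n == 0:
            callback(state['count'])
        return record  # records pass through unchanged
    return tick

progress = make_every(2, lambda i: print('processed %d records' % i))
for rec in [{'a': 1}, {'a': 2}, {'a': 3}]:
    progress(rec)  # prints once, after the second record
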
def process_unallocated(out_dir, salts_db):
    # read back the temp file written by process_allocated
    unallocated_csv_filename = os.path.join(out_dir, 'nimsp_unallocated_contributions.csv.TMP')
    unallocated_csv = open(unallocated_csv_filename, 'r')

    salted_csv_filename = os.path.join(out_dir, 'nimsp_unallocated_contributions.csv')
    salted_csv = open(salted_csv_filename, 'w')

    source = VerifiedCSVSource(unallocated_csv, fieldnames=FIELDNAMES + ['contributionid'], skiprows=1)
    output_func = CSVEmitter(salted_csv, FIELDNAMES).process_record

    load_data(source, NIMSPDenormalize.get_unallocated_record_processor(salts_db), output_func)

    for f in [salted_csv, unallocated_csv]:
        f.close()

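# Together, process_allocated and process_unallocated form a two-pass pipeline:
# the first pass writes nimsp_unallocated_contributions.csv.TMP alongside the
# allocated output, and the second pass reads that temp file back through the
# salt-aware record processor to produce the final CSV. A hypothetical driver
# (paths and the salts_db value below are assumptions, not project defaults):
salts_db = ...  # assumed: handle to the salts database used by the record processor
out_dir = '/data/nimsp/denormalized'
process_allocated(out_dir, '/data/nimsp/raw/contributions.csv')
process_unallocated(out_dir, salts_db)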