Beispiel #1
0
 def run(self):
     """Load bill records from the CSV at ``self.inpath`` via ``BillLoader``.

     Three fields are derived from ``bill_name`` (source fields are kept):

     * ``bill_type_raw`` -- ``bill_name`` with every character outside
       ``A-Z`` removed.
     * ``bill_type`` -- the ``self.bill_type_map`` entry for
       ``bill_type_raw`` (``None`` when there is no entry).
     * ``bill_no`` -- first group of ``self.digits`` matched against
       ``bill_name`` (``None`` when the name is empty or does not match).
     """
     def extract_bill_no(bill_name):
         # Run the regex once per record instead of twice.
         found = self.digits.match(bill_name) if bill_name else None
         return found.groups()[0] if found else None

     # Close the input file deterministically; it used to be left open
     # until garbage collection.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldMerger({'bill_type_raw': ['bill_name']},
                         lambda x: re.sub(r'[^A-Z]*', '', x),
                         keep_fields=True),
             FieldMerger({'bill_type': ['bill_type_raw']},
                         lambda x: self.bill_type_map.get(x, None),
                         keep_fields=True),
             FieldMerger({'bill_no': ['bill_name']},
                         extract_bill_no,
                         keep_fields=True),
             NoneFilter(),
             IssueFilter(),
             UnicodeFilter(),
             CountEmitter(every=20000, log=self.log),
             LoaderEmitter(BillLoader(
                 source=self.inpath,
                 description='load from denormalized CSVs',
                 imported_by="loadlobbying (%s)" %
                 os.getenv('LOGNAME', 'unknown'),
                 log=self.log,
             ),
                           commit_every=1),
         )
def lobbying_handler(inpath, outpath, infields, outfields):
    """Rewrite the lobbying file at ``inpath`` as a cleaned CSV at ``outpath``.

    Args:
        inpath: path of the input file, read with ``|`` as quote character.
        outpath: path of the CSV to write (opened in ``'w'`` mode).
        infields: field names assigned to the input columns.
        outfields: field names passed to the output ``CSVEmitter``.
    """
    # Both handles used to be left open; the context managers guarantee the
    # output is flushed and closed even if the recipe raises.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                UnicodeFilter(),
                FieldRemover('Source'),
                FieldMerger({'registrant_name': ('Registrant', 'RegistrantRaw')}, name_proc),
                FieldMerger({'registrant_is_firm': ('IsFirm',)}, yn_proc),
                FieldMerger({'client_name': ('Client', 'Client_raw')}, name_proc),
                # Blank amounts are treated as 0.
                FieldMerger({'amount': ('Amount',)}, lambda x: float(x or 0)),
                FieldMerger({'affiliate': ('Affiliate',)}, yn_proc),
                FieldMerger({'filing_included_nsfs': ('IncludeNSFS',)}, yn_proc),
                FieldMerger({'include_in_industry_totals': ('Ind',)}, yn_proc),
                FieldMerger({'use': ('Use',)}, yn_proc),
                FieldRenamer({
                    'transaction_id': 'Uniqid',
                    'transaction_type': 'Type',
                    'transaction_type_desc': 'TypeLong',
                    'year': 'Year',
                    'client_category': 'Catcode',
                    'client_parent_name': 'Ultorg',
                    'filing_type': 'Self',
                }),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
Beispiel #3
0
def lobbying_handler(inpath, outpath, infields, outfields):
    """Convert the raw lobbying file ``inpath`` into the CSV ``outpath``.

    Args:
        inpath: input file path; parsed with ``|`` as the quote character.
        outpath: output CSV path, opened for writing.
        infields: column names applied to the input rows.
        outfields: column names handed to the output ``CSVEmitter``.
    """
    # Explicitly scoped handles: previously neither file was ever closed.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                UnicodeFilter(),
                FieldRemover('Source'),
                FieldMerger({'registrant_name': ('Registrant', 'RegistrantRaw')},
                            name_proc),
                FieldMerger({'registrant_is_firm': ('IsFirm', )}, yn_proc),
                FieldMerger({'client_name': ('Client', 'Client_raw')}, name_proc),
                # An empty amount becomes 0.0.
                FieldMerger({'amount': ('Amount', )}, lambda x: float(x or 0)),
                FieldMerger({'affiliate': ('Affiliate', )}, yn_proc),
                FieldMerger({'filing_included_nsfs': ('IncludeNSFS', )}, yn_proc),
                FieldMerger({'include_in_industry_totals': ('Ind', )}, yn_proc),
                FieldMerger({'use': ('Use', )}, yn_proc),
                FieldRenamer({
                    'transaction_id': 'Uniqid',
                    'transaction_type': 'Type',
                    'transaction_type_desc': 'TypeLong',
                    'year': 'Year',
                    'client_category': 'Catcode',
                    'client_parent_name': 'Ultorg',
                    'filing_type': 'Self',
                }),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
Beispiel #4
0
def load_cpi_areas():
    """Copy the tab-delimited BLS ``cu.area`` file into ``bls_areas.csv``."""
    headers = ('area_code','area_name','display_level','selectable','sort_sequence')
    url = "ftp://ftp.bls.gov/pub/time.series/cu/cu.area"

    # Build the remote source first so a failure there does not truncate an
    # existing output file, then make sure the output handle is closed.
    source = CSVSource(utils.RemoteFile(url), delimiter='\t')
    with open('bls_areas.csv', 'w') as outfile:
        run_recipe(
            source,
            CSVEmitter(outfile, headers)
        )
Beispiel #5
0
def load_schools():
    """Load the three NCES school CSV files into the ``nces_schools`` table.

    Each file under ``NCES_ROOT`` is run through the same recipe: unused
    columns are dropped, the remaining ones renamed, empty ``latitude``,
    ``longitude`` and ``codes`` fields are added, and the records are passed
    through ``MSACoder`` before being emitted to SQLite.
    """
    # The loop index was never used, so iterate over the names directly.
    for filename in ("sc091aai.csv", "sc091akn.csv", "sc091aow.csv"):
        # Close each input file as soon as its recipe finishes instead of
        # leaking one handle per iteration.
        with open(os.path.join(NCES_ROOT, filename)) as infile:
            run_recipe(
                CSVSource(infile),
                FieldRemover(("mzip409", "member09", "phone09", "ulocal09", "type09", "level09", "status09")),
                FieldRenamer(
                    {
                        "school_id": "ncessch",
                        "name": "schnam09",
                        "street": "mstree09",
                        "city": "mcity09",
                        "state": "mstate09",
                        "zipcode": "mzip09",
                        "grade_low": "gslo09",
                        "grade_high": "gshi09",
                    }
                ),
                FieldAdder("latitude", None),
                FieldAdder("longitude", None),
                FieldAdder("codes", None),
                MSACoder(),
                SqliteEmitter(DB, "nces_schools", fieldnames=HEADERS),
                # DebugEmitter(),
                CountEmitter(every=100),
            )
Beispiel #6
0
def load_schools():
	"""Load the three NCES school CSV files into the ``nces_schools`` table.

	Every file under ``NCES_ROOT`` has its unused columns removed and the
	remaining ones renamed; an empty ``code`` field is added and the record
	goes through ``GeoCoder`` before being emitted to SQLite.
	"""
	for filename in ('sc091aai.csv','sc091akn.csv','sc091aow.csv'):
		# Scope the handle so it is closed after each file rather than
		# leaked for the lifetime of the process.
		with open(os.path.join(NCES_ROOT, filename)) as infile:
			run_recipe(
				CSVSource(infile),
				FieldRemover((
					'mzip409',
					'member09',
					'phone09',
					'ulocal09',
					'type09',
					'level09',
					'status09',
				)),
				FieldRenamer({
					'school_id': 'ncessch',
					'name': 'schnam09',
					'street': 'mstree09',
					'city': 'mcity09',
					'state': 'mstate09',
					'zipcode': 'mzip09',
					'grade_low': 'gslo09',
					'grade_high': 'gshi09',
				}),
				FieldAdder('code', None),
				GeoCoder(),
				SqliteEmitter(DB, 'nces_schools', fieldnames=HEADERS),
				#DebugEmitter(),
			)
Beispiel #7
0
def load_occupations():
	"""Feed ``K2LocationSource`` records into ``MySQLOccupationEmitter``.

	A ``DebugEmitter`` is installed as the recipe's error stream.
	"""
	records = K2LocationSource()
	occupation_sink = MySQLOccupationEmitter('root', '', 'k2')
	#debug_sink = emitters.DebugEmitter()
	failures = emitters.DebugEmitter()

	run_recipe(records, occupation_sink, error_stream=failures)
Beispiel #8
0
def load_zipcodes():
    """Run the zipcode dataset through a per-state histogram.

    Reads the tab-delimited ``zipcodes.txt`` dataset, keeps the postal code,
    name, state code and coordinates, converts the coordinates to floats and
    merges them into a ``latlng`` tuple.

    Returns:
        ``str()`` of the ``Histogram`` collected on ``state_code``.
    """
    headers = (
        'country_code','postal_code','name',
        'state_name','state_code',
        'county_name','county_code',
        'community_name','community_code',
        'latitude','longitude','accuracy'
    )

    state_histogram = Histogram('state_code')
    state_histogram.label_length = 2

    csv_path = settings.dataset_path('default', filename='zipcodes.txt')

    # The input handle was previously never closed.
    with open(csv_path) as infile:
        run_recipe(
            CSVSource(infile, delimiter="\t", fieldnames=headers),
            FieldKeeper(('postal_code','name','state_code','latitude','longitude')),
            FieldModifier(('latitude','longitude'), float),
            FieldMerger({'latlng': ('latitude', 'longitude')}, lambda lat, lng: (lat, lng)),
            #MongoZipEmitter(),
            #DebugEmitter(),
            state_histogram,
        )

    return str(state_histogram)
Beispiel #9
0
def process_sopr_filing(sopr_xml_file):
    """Parse a SOPR filings XML file and emit its records as Django models.

    The filings yielded by ``lobbyists.parse_filings`` are flattened and
    cleaned, their ``issues`` and ``lobbyists`` lists are routed to the
    ``issue`` and ``lobbyist`` model emitters, and what remains is emitted
    as a ``filing``.
    """
    from sunlightapi import live_settings as DJ_SETTINGS
    DJ_APPLABEL = 'lobbyists'

    filings = lobbyists.parse_filings(sopr_xml_file)

    # flatten non-list dictionaries & clean up some fields
    cleanup_steps = [
        DictFlattener(['filing', 'client', 'registrant']),
        FieldRemover(['govt_entities', 'affiliated_orgs', 'foreign_entities',
                      'client_state_or_local_gov', 'client_status',
                      'filing_affiliated_orgs_url']),
        FieldRenamer({'filing_date': 'filing_filing_date'}),
    ]

    # process names & dates
    name_and_date_steps = [
        FieldAdder('client_contact_name', ''),
        NameCleaner('client_contact_name', prefix='client_contact_',
                    nomatch_name='client_raw_contact_name'),
        # drop everything after the first '.' before parsing the timestamp
        FieldModifier('filing_date', lambda stamp: stamp.split('.')[0]),
        DateCleaner('filing_date', from_format='%Y-%m-%dT%H:%M:%S',
                    to_format='%Y-%m-%d'),
    ]

    # flatten lists
    flatten_steps = [
        Flattener(['issues', 'lobbyists']),
        FieldCopier({'issues.filing_id': 'filing_id',
                     'lobbyists.filing_id': 'filing_id'}),
    ]

    # handle lists
    issue_steps = [DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'issue')]
    lobbyist_steps = [
        FieldRemover(['indicator', 'status']),
        NameCleaner('name', nomatch_name='raw_name'),
        Unique(),    # remove some duplicate lobbyists on a form
        DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'lobbyist'),
    ]
    list_steps = [
        saucebrush.filters.Splitter({
            'issues': issue_steps,
            'lobbyists': lobbyist_steps,
        }),
        FieldRemover(['issues', 'lobbyists']),
        DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'filing'),
    ]

    steps = cleanup_steps + name_and_date_steps + flatten_steps + list_steps
    saucebrush.run_recipe(filings, *steps)
Beispiel #10
0
def load_locations():
    """Replace the occupation tables with the contents of ``occupations.csv``.

    Deletes every row from ``occupation_category`` and ``occupation``, then
    runs the CSV through the occupation filters and emitters on the same
    MySQL connection. The transaction is committed only if the whole recipe
    completes; the connection is closed in every case.
    """
    conn = MySQLdb.connect(
        user=settings.MYSQL_USER,
        passwd=settings.MYSQL_PASS,
        db=settings.MYSQL_DATABASE,
        host=settings.MYSQL_HOST,
        port=settings.MYSQL_PORT,
    )

    try:
        cursor = conn.cursor()
        cursor.execute("""DELETE FROM occupation_category""")
        cursor.execute("""DELETE FROM occupation""")
        cursor.close()

        path = settings.dataset_path(None, filename='occupations.csv')

        # Close the CSV handle once the recipe has consumed it.
        with open(path) as infile:
            run_recipe(
                sources.CSVSource(infile),
                ValidOccupationFilter(),
                CategoryIDFilter(),
                CategoryEmitter(conn),
                OccupationEmitter(conn),
                #emitters.DebugEmitter(),
                error_stream = emitters.DebugEmitter(),
            )

        conn.commit()
    finally:
        # Previously an exception left the connection open.
        conn.close()
Beispiel #11
0
def code_a5():
    """Run ``A5.csv`` through ``MSAFilter`` and write ``A5.coded.csv``.

    An empty ``code`` field is added to every record before the filter runs.
    """
    # Both files used to stay open after the recipe finished; scope them so
    # the coded output is flushed and closed.
    with open("/Users/Jeremy/Downloads/A5.csv") as infile:
        with open("/Users/Jeremy/Downloads/A5.coded.csv", "w") as outfile:
            run_recipe(
                sources.CSVSource(infile),
                filters.FieldAdder("code", None),
                MSAFilter(),
                emitters.CSVEmitter(outfile, fieldnames=FIELD_NAMES),
            )
Beispiel #12
0
def load_locations():
	"""Feed ``K2LocationSource`` records into ``MySQLLocationEmitter``.

	Records pass through ``DataFilter`` and lose a fixed set of fields
	first; a ``DebugEmitter`` serves as the recipe's error stream.
	"""
	dropped_fields = ('_id','ffiec','geo','oes','naccrra','nces','rpp_local','rpp_state')

	records = K2LocationSource()
	steps = [
		DataFilter(),
		filters.FieldRemover(dropped_fields),
		MySQLLocationEmitter('root', '', 'k2'),
		#emitters.DebugEmitter(),
	]
	failures = emitters.DebugEmitter()

	run_recipe(records, *steps, error_stream=failures)
Beispiel #13
0
def load_items():
	"""Send every row of the tab-delimited ``cu.item`` file to ``DebugEmitter``."""
	# Only referenced by the disabled SqliteEmitter step below.
	headers = ('item_code','item_name','display_level','selectable','sort_sequence')

	item_path = os.path.join(CPI_ROOT, "cu.item")
	rows = csv.DictReader(local_file(item_path), delimiter='\t')

	run_recipe(
		rows,
		#SqliteEmitter(DB, 'cpi_items', fieldnames=headers),
		DebugEmitter(),
	)
Beispiel #14
0
def load_ffiec():
	"""Emit the ``ffiec_iter`` records to Mongo with an added ``avg`` field.

	``low`` and ``high`` are converted to floats and kept; ``avg`` is
	``(low + high) / 2``.
	"""
	def midpoint(low, high):
		return (low + high) / 2

	steps = (
		FieldModifier(('low','high'), float),
		FieldMerger({'avg': ('low','high')}, midpoint, keep_fields=True),
		#SqliteEmitter(DB, 'ffiec_incomes', fieldnames=HEADERS),
		GenericMSAFilter(),
		MongoEmitter(),
		#DebugEmitter(),
	)

	run_recipe(ffiec_iter(), *steps)
Beispiel #15
0
def load_areas():
	"""Send every row of the tab-delimited ``cu.area`` file to ``DebugEmitter``."""
	# Only referenced by the disabled SqliteEmitter step below.
	headers = ('area_code','area_name','msa_code','display_level','selectable','sort_sequence')

	area_path = os.path.join(CPI_ROOT, "cu.area")
	rows = csv.DictReader(local_file(area_path), delimiter='\t')

	run_recipe(
		rows,
		#FieldAdder('msa_code', None),
		#MSAFilter(),
		#SqliteEmitter(DB, 'cpi_areas', fieldnames=headers),
		DebugEmitter(),
	)
Beispiel #16
0
def agency_handler(inpath, outpath, infields, outfields):
    """Rewrite the agency file at ``inpath`` as a CSV at ``outpath``.

    Args:
        inpath: input file path; parsed with ``|`` as the quote character.
        outpath: output CSV path, opened for writing.
        infields: field names assigned to the input columns.
        outfields: field names passed to the output ``CSVEmitter``.
    """
    # Neither file was closed before; scope both handles explicitly.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                # Every record gets an empty 'id' field.
                FieldAdder('id', ''),
                FieldRenamer({
                    'transaction': 'UniqID',
                    'agency_name': 'Agency',
                    'agency_ext_id': 'AgencyID',
                }),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
def agency_handler(inpath, outpath, infields, outfields):
    """Convert the raw agency file ``inpath`` into the CSV ``outpath``.

    Args:
        inpath: path of the ``|``-quoted input file.
        outpath: path of the CSV file to write.
        infields: column names applied to the input rows.
        outfields: column names handed to the output ``CSVEmitter``.
    """
    # Context managers replace two handles that were never closed.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                # Adds an empty 'id' field to each record.
                FieldAdder('id', ''),
                FieldRenamer({
                    'transaction': 'UniqID',
                    'agency_name': 'Agency',
                    'agency_ext_id': 'AgencyID',
                }),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
def bills_handler(inpath, outpath, infields, outfields):
    """Rewrite the bills file at ``inpath`` as a CSV at ``outpath``.

    Args:
        inpath: input file path; parsed with ``|`` as the quote character.
        outpath: output CSV path, opened for writing.
        infields: field names assigned to the input columns.
        outfields: field names passed to the output ``CSVEmitter``.
    """
    # Neither file was closed before; scope both handles explicitly.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                # Every record gets an empty 'id' field.
                FieldAdder('id', ''),
                FieldRenamer({
                    'bill_id':     'B_ID',
                    'issue':       'SI_ID',
                    'congress_no': 'CongNo',
                    'bill_name':   'Bill_Name',
                }),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
Beispiel #19
0
def bills_handler(inpath, outpath, infields, outfields):
    """Convert the raw bills file ``inpath`` into the CSV ``outpath``.

    Args:
        inpath: path of the ``|``-quoted input file.
        outpath: path of the CSV file to write.
        infields: column names applied to the input rows.
        outfields: column names handed to the output ``CSVEmitter``.
    """
    # Context managers replace two handles that were never closed.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                # Adds an empty 'id' field to each record.
                FieldAdder('id', ''),
                FieldRenamer({
                    'bill_id': 'B_ID',
                    'issue': 'SI_ID',
                    'congress_no': 'CongNo',
                    'bill_name': 'Bill_Name',
                }),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
Beispiel #20
0
def load_locations():
    """Run ``locations.csv`` through the location filters into ``DebugEmitter``.

    The MongoDB emitter is currently disabled, so nothing is persisted by
    this function as written.
    """
    csv_path = settings.datasource_path('locations', filename='locations.csv')

    # Close the CSV handle when the recipe is done; it used to be leaked.
    with open(csv_path) as infile:
        run_recipe(
            CSVSource(infile),
            FieldRemover('points'),
            MSAFilter(),
            GeoJSONTestFilter(),
            CensusTestFilter(),
            GeoFilter(),
            DebugEmitter(),
            # Re-enable to load locations into mongodb:
            #MongoDBEmitter(settings.MONGO_DATABASE, 'locations'),
        )
def issue_handler(inpath, outpath, infields, outfields):
    """Rewrite the issues file at ``inpath`` as a CSV at ``outpath``.

    Newlines inside ``general_issue`` and ``specific_issue`` are replaced
    by single spaces.

    Args:
        inpath: input file path; parsed with ``|`` as the quote character.
        outpath: output CSV path, opened for writing.
        infields: field names assigned to the input columns.
        outfields: field names passed to the output ``CSVEmitter``.
    """
    # Neither file was closed before; scope both handles explicitly.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                FieldRenamer({
                    'id': 'SI_ID',
                    'transaction': 'UniqID',
                    'general_issue_code': 'IssueID',
                    'general_issue': 'Issue',
                    'specific_issue': 'SpecIssue',
                    'year': 'Year',
                }),
                FieldModifier(('general_issue', 'specific_issue'), lambda x: x.replace('\n', ' ')),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
Beispiel #22
0
 def run(self):
     """Load ``LobbyistBundle`` rows from the CSV at ``self.inpath``.

     Fields are renamed with ``self.field_map``, a fixed list of report
     level fields is removed, and ``amount`` / ``semi_annual_amount`` are
     rounded to integers (blank values become ``None``).
     """
     def to_whole_number(value):
         # Convert any stray floats to integers
         return int(round(float(value))) if value else None

     # Close the input file when the recipe finishes; it used to be leaked.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldRenamer(self.field_map),
             FieldRemover('committee_fec_id committee_name report_year report_type is_amendment start_date end_date reporting_period_amount_all semi_annual_amount_all'.split()),
             BundleFilter(),
             #FieldModifier('file_num', lambda x: Bundle.objects.get(pk=x)),
             FieldModifier('amount semi_annual_amount'.split(), to_whole_number),
             NoneFilter(),
             UnicodeFilter(),
             CountEmitter(every=500),
             #DebugEmitter(),
             DjangoModelEmitter('settings', LobbyistBundle)
         )
Beispiel #23
0
def calculate_average(locs):
	"""Print the average and standard deviation of the ffiec ``diff`` values.

	Only entries of ``locs`` that have an ``'ffiec'`` mapping containing a
	``'diff'`` key contribute to the statistics.
	"""
	def ffiec_records():
		for entry in locs:
			if 'ffiec' not in entry:
				continue
			if 'diff' not in entry['ffiec']:
				continue
			yield entry['ffiec']

	deviation = stats.StandardDeviation('diff')
	run_recipe(ffiec_records(), deviation)

	# A parenthesised single argument prints the same text as the bare
	# print statement.
	print("Average: %s" % deviation.average())
	print("stddev:  %s" % deviation.value()[0])
Beispiel #24
0
 def run(self):
     """Load ``LobbyistBundle`` rows from the CSV at ``self.inpath``.

     Fields are renamed with ``self.field_map``, a fixed list of report
     level fields is removed, and ``amount`` / ``semi_annual_amount`` are
     rounded to integers (blank values become ``None``) before the records
     reach ``SimpleDjangoModelEmitter``.
     """
     def to_whole_number(value):
         # Convert any stray floats to integers
         return int(round(float(value))) if value else None

     # Scope the input handle so it is closed after the load.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldRenamer(self.field_map),
             FieldRemover('committee_fec_id committee_name report_year report_type is_amendment start_date end_date reporting_period_amount_all semi_annual_amount_all'.split()),
             BundleFilter(),
             #FieldModifier('file_num', lambda x: Bundle.objects.get(pk=x)),
             FieldModifier('amount semi_annual_amount'.split(), to_whole_number),
             NoneFilter(),
             UnicodeFilter(),
             CountEmitter(every=500),
             #DebugEmitter(),
             SimpleDjangoModelEmitter(LobbyistBundle)
         )
Beispiel #25
0
 def run(self):
     """Load agency rows from the CSV at ``self.inpath`` via ``AgencyLoader``.

     ``year`` is converted to ``int`` (blank values become ``None``) before
     the records pass through ``TRANSACTION_FILTER`` and are handed to the
     loader, which is asked to commit every 100 records.
     """
     # Close the input file when the recipe finishes; it used to be leaked.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldModifier('year', lambda x: int(x) if x else None),
             FieldRenamer({'transaction_id': 'transaction'}),
             NoneFilter(),
             TRANSACTION_FILTER,
             UnicodeFilter(),
             CountEmitter(every=10000, log=self.log),
             LoaderEmitter(AgencyLoader(
                 source=self.inpath,
                 description='load from denormalized CSVs',
                 imported_by="loadlobbying (%s)" % os.getenv('LOGNAME', 'unknown'),
                 log=self.log,
             ), commit_every=100),
         )
def lobbyist_handler(inpath, outpath, infields, outfields):
    """Rewrite the lobbyist file at ``inpath`` as a CSV at ``outpath``.

    Args:
        inpath: input file path; parsed with ``|`` as the quote character.
        outpath: output CSV path, opened for writing.
        infields: field names assigned to the input columns.
        outfields: field names passed to the output ``CSVEmitter``.
    """
    # Neither file was closed before; scope both handles explicitly.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                # Every record gets an empty 'id' field.
                FieldAdder('id', ''),
                FieldMerger({'lobbyist_name': ('Lobbyist', 'Lobbyist_raw')}, name_proc),
                FieldMerger({'member_of_congress': ('FormerCongMem',)}, yn_proc),
                FieldRenamer({
                    'transaction': 'Uniqid',
                    'year': 'Year',
                    'lobbyist_ext_id': 'LobbyistID',
                    'candidate_ext_id': 'CID',
                    'government_position': 'OfficalPos',
                }),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
def issue_handler(inpath, outpath, infields, outfields):
    """Validate the issues file at ``inpath`` and rewrite it to ``outpath``.

    Records are checked by ``FieldCountValidator`` (against the number of
    ``FILE_TYPES['lob_issue']`` entries) and ``CSVFieldVerifier`` before
    being renamed and written.

    Args:
        inpath: input file path; parsed with ``|`` as the quote character.
        outpath: output CSV path, opened for writing.
        infields: field names assigned to the input columns.
        outfields: field names passed to the output ``CSVEmitter``.
    """
    # Neither file was closed before; scope both handles explicitly.
    with open(inpath, 'r') as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                VerifiedCSVSource(infile, fieldnames=infields, quotechar='|'),
                FieldCountValidator(len(FILE_TYPES['lob_issue'])),
                CSVFieldVerifier(),
                FieldRenamer({
                    'id': 'SI_ID',
                    'transaction': 'UniqID',
                    'general_issue_code': 'IssueID',
                    'general_issue': 'Issue',
                    'specific_issue': 'SpecIssue',
                    'year': 'Year',
                }),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
Beispiel #28
0
 def run(self):
     """Load bill rows from the CSV at ``self.inpath`` via ``BillLoader``.

     Derived fields (source fields are kept):

     * ``bill_type_raw`` -- ``bill_name`` stripped of every character that
       is not in ``A-Z``.
     * ``bill_type`` -- ``self.bill_type_map`` lookup of ``bill_type_raw``,
       ``None`` when absent.
     * ``bill_no`` -- first group of ``self.digits`` matched against
       ``bill_name``, ``None`` for an empty or non-matching name.
     """
     def strip_to_capitals(bill_name):
         return re.sub(r'[^A-Z]*', '', bill_name)

     def lookup_bill_type(raw_type):
         return self.bill_type_map.get(raw_type, None)

     def extract_bill_no(bill_name):
         # One regex match per record; the original matched twice.
         found = self.digits.match(bill_name) if bill_name else None
         return found.groups()[0] if found else None

     # Scope the input handle so it is closed after the load.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldMerger({'bill_type_raw': ['bill_name']}, strip_to_capitals, keep_fields=True),
             FieldMerger({'bill_type': ['bill_type_raw']}, lookup_bill_type, keep_fields=True),
             FieldMerger({'bill_no': ['bill_name']}, extract_bill_no, keep_fields=True),
             NoneFilter(),
             IssueFilter(),
             UnicodeFilter(),
             CountEmitter(every=20000, log=self.log),
             LoaderEmitter(BillLoader(
                 source=self.inpath,
                 description='load from denormalized CSVs',
                 imported_by="loadlobbying (%s)" % os.getenv('LOGNAME', 'unknown'),
                 log=self.log,
             ), commit_every=1),
         )
Beispiel #29
0
 def run(self):
     """Load agency records from ``self.inpath`` through ``AgencyLoader``.

     ``year`` becomes an ``int`` (``None`` when blank); records then pass
     ``TRANSACTION_FILTER`` and reach the loader, which is asked to commit
     every 100 records.
     """
     # Scope the input handle so it is closed after the load.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldModifier('year', lambda x: int(x) if x else None),
             FieldRenamer({'transaction_id': 'transaction'}),
             NoneFilter(),
             TRANSACTION_FILTER,
             UnicodeFilter(),
             CountEmitter(every=10000, log=self.log),
             LoaderEmitter(AgencyLoader(
                 source=self.inpath,
                 description='load from denormalized CSVs',
                 imported_by="loadlobbying (%s)" %
                 os.getenv('LOGNAME', 'unknown'),
                 log=self.log,
             ),
                           commit_every=100),
         )
Beispiel #30
0
def lobbyist_handler(inpath, outpath, infields, outfields):
    """Convert the raw lobbyist file ``inpath`` into the CSV ``outpath``.

    Args:
        inpath: path of the ``|``-quoted input file.
        outpath: path of the CSV file to write.
        infields: column names applied to the input rows.
        outfields: column names handed to the output ``CSVEmitter``.
    """
    # Context managers replace two handles that were never closed.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                # Adds an empty 'id' field to each record.
                FieldAdder('id', ''),
                FieldMerger({'lobbyist_name': ('Lobbyist', 'Lobbyist_raw')},
                            name_proc),
                FieldMerger({'member_of_congress': ('FormerCongMem', )}, yn_proc),
                FieldRenamer({
                    'transaction': 'Uniqid',
                    'year': 'Year',
                    'lobbyist_ext_id': 'LobbyistID',
                    'candidate_ext_id': 'CID',
                    'government_position': 'OfficalPos',
                }),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
Beispiel #31
0
 def run(self):
     """Load lobbying rows from ``self.inpath`` via ``LobbyingLoader``.

     ``year`` is converted to ``int`` and ``amount`` to ``Decimal`` (blank
     values become ``None``); the five flag fields become ``True`` only
     when their text is exactly ``'True'``.
     """
     # Close the input file when the recipe finishes; it used to be leaked.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldModifier('year', lambda x: int(x) if x else None),
             FieldModifier('amount', lambda x: Decimal(x) if x else None),
             FieldModifier((
                 'affiliate','filing_included_nsfs','include_in_industry_totals',
                 'registrant_is_firm','use'), lambda x: x == 'True'),
             NoneFilter(),
             UnicodeFilter(),
             CountEmitter(every=20000, log=self.log),
             LoaderEmitter(LobbyingLoader(
                 source=self.inpath,
                 description='load from denormalized CSVs',
                 imported_by="loadlobbying (%s)" % os.getenv('LOGNAME', 'unknown'),
                 log=self.log,
             )),
         )
Beispiel #32
0
def issue_handler(inpath, outpath, infields, outfields):
    """Verify the raw issues file ``inpath`` and write the CSV ``outpath``.

    ``FieldCountValidator`` (sized from ``FILE_TYPES['lob_issue']``) and
    ``CSVFieldVerifier`` run ahead of the rename step.

    Args:
        inpath: path of the ``|``-quoted input file.
        outpath: path of the CSV file to write.
        infields: column names applied to the input rows.
        outfields: column names handed to the output ``CSVEmitter``.
    """
    # Context managers replace two handles that were never closed.
    with open(inpath, 'r') as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                VerifiedCSVSource(infile,
                                  fieldnames=infields,
                                  quotechar='|'),
                FieldCountValidator(len(FILE_TYPES['lob_issue'])),
                CSVFieldVerifier(),
                FieldRenamer({
                    'id': 'SI_ID',
                    'transaction': 'UniqID',
                    'general_issue_code': 'IssueID',
                    'general_issue': 'Issue',
                    'specific_issue': 'SpecIssue',
                    'year': 'Year',
                }),
                #DebugEmitter(),
                CSVEmitter(outfile, fieldnames=outfields),
            )
Beispiel #33
0
def load_naccrra():
    """Load ``childcarecosts.csv`` through ``MongoNACCRRAEmitter``.

    The state and the six family/center cost columns are renamed first; a
    ``DebugEmitter`` is installed as the recipe's error stream.
    """
    csv_path = os.path.join(settings.dataset_path('default'), 'childcarecosts.csv')

    # Close the CSV handle when the recipe is done; it used to be leaked.
    with open(csv_path) as infile:
        run_recipe(
            sources.CSVSource(infile),
            filters.FieldRenamer({
                'state': 'State',
                'family_infant': 'Family-Infant',
                'family_4': 'Family-4-Year-Old',
                'family_school': 'Family-School-Age',
                'center_infant': 'Center-Infant',
                'center_4': 'Center-4-Year-Old',
                'center_school': 'Center-School-Age',
            }),
            MongoNACCRRAEmitter(),
            emitters.CountEmitter(),
            #emitters.DebugEmitter(),
            error_stream = emitters.DebugEmitter(),
        )
Beispiel #34
0
 def process_record(self, record):
     """Attach per-field mean and standard deviation for the record's occupation.

     All ``locations`` rows in the SQLite database at ``db_path`` with the
     same ``occupation`` are run through one ``StandardDeviation`` per entry
     of ``STATS_FIELDS``; the results are stored on the record as
     ``<field>_stddev`` and ``<field>_mean``. Returns the updated record.
     """
     occupation = record['occupation']

     # One statistics collector per tracked field.
     trackers = dict(
         (name, stats.StandardDeviation(name)) for name in STATS_FIELDS)

     location_rows = sources.SqliteSource(
         db_path,
         """SELECT * FROM locations WHERE occupation = ?""",
         (occupation,))
     run_recipe(
         location_rows,
         filters.FieldModifier(STATS_FIELDS, to_float),
         Recipe(*trackers.values()),
         error_stream=emitters.DebugEmitter(),
     )

     for name, tracker in trackers.iteritems():
         record['%s_stddev' % name] = tracker.value()[0]
         record['%s_mean' % name] = tracker.average()

     return record
Beispiel #35
0
 def run(self):
     """Load lobbying records from ``self.inpath`` through ``LobbyingLoader``.

     ``year`` becomes an ``int`` and ``amount`` a ``Decimal`` (``None`` when
     blank); each of the five flag fields is ``True`` only if its text is
     exactly ``'True'``.
     """
     # Scope the input handle so it is closed after the load.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldModifier('year', lambda x: int(x) if x else None),
             FieldModifier('amount', lambda x: Decimal(x) if x else None),
             FieldModifier(
                 ('affiliate', 'filing_included_nsfs',
                  'include_in_industry_totals', 'registrant_is_firm', 'use'),
                 lambda x: x == 'True'),
             NoneFilter(),
             UnicodeFilter(),
             CountEmitter(every=20000, log=self.log),
             LoaderEmitter(
                 LobbyingLoader(
                     source=self.inpath,
                     description='load from denormalized CSVs',
                     imported_by="loadlobbying (%s)" %
                     os.getenv('LOGNAME', 'unknown'),
                     log=self.log,
                 )),
         )
Beispiel #36
0
def process_fec_year(year):
    """Append one year of FEC fixed-width data to ``fec<year>.sql``.

    Reads ``<year>/foiacm.dta`` (committees), ``<year>/foiacn.dta``
    (candidates) and ``<year>/itcont.dta`` (contributions) and dumps each
    as SQL for the ``committee``, ``candidate`` and ``contribution`` tables.

    Args:
        year: value interpolated into the input directory and output file
            names with ``%s``.
    """
    sql_path = 'fec%s.sql' % year

    # Each section opens and closes its own handles. Previously six files
    # were left open, including three unflushed append handles on the same
    # .sql file whose write order depended on garbage collection.

    # committees
    #sqlite = SqliteOutput('fec%s.sqlite' % year, 'committee', [f[0] for f in CM_FIELDS if f[0] != 'filler'])
    with open('%s/foiacm.dta' % year) as infile:
        with open(sql_path, 'a') as outfile:
            run_recipe(
                FixedWidthFileSource(infile, CM_FIELDS),
                SqlDumpEmitter(outfile, 'committee',
                               [f[0] for f in CM_FIELDS if f[0] != 'filler']),
            )

    # candidate
    #sqlite = SqliteOutput('fec%s.sqlite' % year, 'candidate', [f[0] for f in CN_FIELDS if f[0] != 'filler'])
    with open('%s/foiacn.dta' % year) as infile:
        with open(sql_path, 'a') as outfile:
            run_recipe(
                FixedWidthFileSource(infile, CN_FIELDS),
                FieldRemover(('fillerA', 'fillerB')),
                SqlDumpEmitter(
                    outfile, 'candidate',
                    [f[0] for f in CN_FIELDS if not f[0].startswith('filler')]),
            )

    # contributions
    #sqlite = SqliteOutput('fec%s.sqlite' % year, 'contribution', [f[0] for f in INDIV_FIELDS if f[0] != 'filler'])
    with open('%s/itcont.dta' % year) as infile:
        with open(sql_path, 'a') as outfile:
            run_recipe(
                FixedWidthFileSource(infile, INDIV_FIELDS),
                FieldModifier(('amount', ), fix_cobol_number),
                SqlDumpEmitter(
                    outfile, 'contribution',
                    [f[0] for f in INDIV_FIELDS if f[0] != 'filler']),
            )
Beispiel #37
0
def load_prices():
	"""Send the rows of the eight regional CPI data files to ``DebugEmitter``.

	Leading whitespace is stripped from ``value`` and each record passes
	through ``SeriesIDFilter`` first.
	"""
	data_files = (
		"cu.data.3.AsizeNorthEast",
		"cu.data.4.AsizeNorthCentral",
		"cu.data.5.AsizeSouth",
		"cu.data.6.AsizeWest",
		"cu.data.7.OtherNorthEast",
		"cu.data.8.OtherNorthCentral",
		"cu.data.9.OtherSouth",
		"cu.data.10.OtherWest",
	)
	paths = [os.path.join(CPI_ROOT, name) for name in data_files]

	# Only referenced by the disabled SqliteEmitter step below.
	headers = ('area_code','item_code','year','periodicity','period','value')

	rows = csv.DictReader(local_files(paths), delimiter='\t')
	run_recipe(
		rows,
		FieldModifier('value', lambda raw: raw.lstrip()),
		SeriesIDFilter(),
		#SqliteEmitter(DB, 'cpi_prices', fieldnames=headers),
		DebugEmitter(),
	)
Beispiel #38
0
 def run(self):
     """Load ``Bundle`` rows from the CSV at ``self.inpath``.

     After renaming with ``self.field_map``: ``is_amendment`` becomes a
     boolean (``'A'`` is true), the two amount fields are rounded to
     integers, the three date fields are parsed as ``%m/%d/%Y``, and
     ``pdf_url`` is built from ``first_image_num``. Blank amounts and dates
     become ``None``.
     """
     # Close the input file when the recipe finishes; it used to be leaked.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldRenamer(self.field_map),
             # Values are [N|A]. Convert to boolean.
             FieldModifier('is_amendment',
                     lambda x: x == 'A'),
             # Convert any stray floats to integers
             FieldModifier('reporting_period_amount semi_annual_amount'.split(),
                     lambda x: int(round(float(x))) if x else None),
             # Convert date formats
             FieldModifier('start_date end_date filing_date'.split(),
                     lambda x: datetime.strptime(x, '%m/%d/%Y') if x else None),
             # TODO: These following two lines (and the field value) need to be thoroughly tested on the next bundling load
             FieldCopier({'pdf_url': 'first_image_num'}),
             FieldModifier('pdf_url',
                     lambda x: 'http://query.nictusa.com/pdf/{0}/{1}/{1}.pdf'.format(x[-3:], x)),
             NoneFilter(),
             UnicodeFilter(),
             CountEmitter(every=200),
             #DebugEmitter(),
             DjangoModelEmitter('settings', Bundle)
         )
Beispiel #39
0
def load_cpi():
    """Write the 2008-related rows of four regional CPI files to ``bls.csv``.

    The four tab-delimited BLS files are read through ``remote_files``;
    ``series_id`` and ``value`` are stripped, and records pass through
    ``SeriesIDFilter`` and ``ValueConditionalFilter('year', '2008')``
    before being written.
    """
    urls = {
        'north_east': 'ftp://ftp.bls.gov/pub/time.series/cu/cu.data.3.AsizeNorthEast',
        'north_central': 'ftp://ftp.bls.gov/pub/time.series/cu/cu.data.4.AsizeNorthCentral',
        'south': 'ftp://ftp.bls.gov/pub/time.series/cu/cu.data.5.AsizeSouth',
        'west': 'ftp://ftp.bls.gov/pub/time.series/cu/cu.data.6.AsizeWest',
    }

    # Only referenced by the disabled single-file source below.
    url = urls['west']
    headers = ('series_id','survey_abbr','seasonal_code','periodicity_code',
        'area_code','item_code','year','period','value','footnote_codes')

    reader = remote_files(*urls.values(), headers=True)

    # Close the output handle so bls.csv is flushed when the recipe ends.
    with open('bls.csv', 'w') as outfile:
        run_recipe(
            #CSVSource(utils.RemoteFile(url), delimiter='\t'),
            CSVSource(reader, delimiter='\t'),
            FieldModifier(('series_id','value'), str.strip),
            SeriesIDFilter('series_id'),
            ValueConditionalFilter('year', '2008'),
            CSVEmitter(outfile, headers)
        )
Beispiel #40
0
 def run(self):
     """Load ``Bundle`` rows from the CSV at ``self.inpath``.

     After renaming with ``self.field_map``: ``is_amendment`` becomes a
     boolean (``'A'`` is true), the two amount fields have ``$`` and ``,``
     removed and are rounded to integers, the three date fields are parsed
     as ``%m/%d/%Y``, and ``pdf_url`` is built from ``first_image_num``.
     Blank amounts and dates become ``None``.
     """
     # Close the input file when the recipe finishes; it used to be leaked.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldRenamer(self.field_map),
             # Values are [N|A]. Convert to boolean.
             FieldModifier('is_amendment',
                     lambda x: x == 'A'),
             # Convert any stray floats to integers
             FieldModifier('reporting_period_amount semi_annual_amount'.split(),
                     lambda x: int(round(float(x.replace('$','').replace(',','')))) if x else None),
             # Convert date formats
             FieldModifier('start_date end_date filing_date'.split(),
                     lambda x: datetime.strptime(x, '%m/%d/%Y') if x else None),
             # TODO: These following two lines (and the field value) need to be thoroughly tested on the next bundling load
             FieldCopier({'pdf_url': 'first_image_num'}),
             FieldModifier('pdf_url',
                     lambda x: 'http://query.nictusa.com/pdf/{0}/{1}/{1}.pdf'.format(x[-3:], x)),
             NoneFilter(),
             UnicodeFilter(),
             CountEmitter(every=200),
             #DebugEmitter(),
             SimpleDjangoModelEmitter(Bundle)
         )
Beispiel #41
0
 def run(self):
     """Load ``Bundle`` records from the CSV at ``self.inpath``.

     Steps after renaming with ``self.field_map``: ``is_amendment`` turns
     into a boolean (``"A"`` is true); the two amount fields lose ``$`` and
     ``,`` and are rounded to integers; the three date fields are parsed as
     ``%m/%d/%Y``; ``pdf_url`` is derived from ``first_image_num``. Blank
     amounts and dates become ``None``.
     """
     # Scope the input handle so it is closed after the load.
     with open(self.inpath) as infile:
         run_recipe(
             CSVSource(infile),
             FieldRenamer(self.field_map),
             # Values are [N|A]. Convert to boolean.
             FieldModifier("is_amendment", lambda x: x == "A"),
             # Convert any stray floats to integers
             FieldModifier(
                 "reporting_period_amount semi_annual_amount".split(),
                 lambda x: int(round(float(x.replace("$", "").replace(",", "")))) if x else None,
             ),
             # Convert date formats
             FieldModifier(
                 "start_date end_date filing_date".split(), lambda x: datetime.strptime(x, "%m/%d/%Y") if x else None
             ),
             # TODO: These following two lines (and the field value) need to be thoroughly tested on the next bundling load
             FieldCopier({"pdf_url": "first_image_num"}),
             FieldModifier("pdf_url", lambda x: "http://query.nictusa.com/pdf/{0}/{1}/{1}.pdf".format(x[-3:], x)),
             NoneFilter(),
             UnicodeFilter(),
             CountEmitter(every=200),
             # DebugEmitter(),
             SimpleDjangoModelEmitter(Bundle),
         )
Beispiel #42
0
from saucebrush.filters import Splitter, PhoneNumberCleaner, FieldMerger, FieldAdder
from saucebrush.emitters import DebugEmitter
import operator
from itertools import count
import saucebrush

# One sample record: a nested person mapping plus a list of phone mappings.
data = [
    dict(
        person=dict(firstname='James', lastname='Turk'),
        phones=[
            dict(phone='222-222-2222'),
            dict(phone='(202) 333-3321'),
        ],
    ),
]

# Per-key sub-pipelines handed to the Splitter below.
namemerger = FieldMerger(
    {'name': ('firstname', 'lastname')},
    lambda first, last: ' '.join((first, last)),
)
phonecleaner = PhoneNumberCleaner(('phone', ))
splitter = Splitter({
    'person': [namemerger],
    'phones': [phonecleaner],
})

# FieldAdder is given an itertools.count() iterator as the 'id' value.
ider = FieldAdder('id', count())

saucebrush.run_recipe(data, ider, splitter, DebugEmitter())
Beispiel #43
0
# Published posts (post_type 'post') joined to their author and terms,
# restricted to the terms 'act' or 'The Day in Transparency', newest first.
query = """
    SELECT p.ID, p.post_author, u.user_login,
        p.post_date, p.post_date_gmt, p.post_modified, p.post_modified_gmt,
        p.post_content, p.post_title, p.post_category, p.post_excerpt, p.guid, p.post_type
    FROM oh_posts p
    INNER JOIN oh_users u ON p.post_author = u.ID
    INNER JOIN oh_term_relationships tr ON p.ID = tr.object_id
    INNER JOIN oh_term_taxonomy tt ON tr.term_taxonomy_id = tt.term_taxonomy_id
    INNER JOIN oh_terms t ON tt.term_id = t.term_id
    WHERE p.post_status = 'publish' AND p.post_type = 'post' AND (t.name = 'act' or t.name = 'The Day in Transparency')
    ORDER BY p.post_date DESC
"""

mongo = Connection()

# Copy the query results into the 'blog' collection of the 'openhouse'
# database. `conn` is the MySQL connection created outside this snippet.
saucebrush.run_recipe(
    MySQLSource(conn, query),
    MetaFilter(conn),
    TagFilter(conn),
    ContentFilter(),
    # Wrap the two text fields in Binary before they are emitted.
    FieldModifier(('post_content', 'post_excerpt'), lambda x: Binary(x)),
    MongoDBEmitter('openhouse', 'blog', drop_collection=True, conn=mongo),
    #DebugEmitter(),
)

# Print the title of every document now in the collection.
for d in mongo['openhouse']['blog'].find():
    print "------", d['post_title']

# NOTE(review): not reached if the recipe or the loop raises.
conn.close()
Beispiel #44
0
    def test_run_recipe(self):
        """run_recipe should leave both source items in the Saver, in order."""
        source = [1, 2]
        sink = Saver()

        run_recipe(source, sink)

        self.assertEqual(sink.saved, [1, 2])