def old_sf_files(self, year, dur, states):
    """Build the download list using the pre-2009 ("old") server layout.

    Before 2009 the summaryfile/ directory contains one subdirectory per
    state, each holding a listing of the state's files plus a zip of all
    of them.  The usual pattern (with some exceptions) is:

        acsYEAR_DURyr/summaryfile/StateName/all_st.zip

    where StateName is the camel-cased, space-free state name (what
    state_filestub() returns) and st is the lowercase two-letter
    abbreviation (or "us").  In 2005 and 2006 the outermost directory is
    just acsYEAR, with no duration (both are 1-year files); in 2006 the
    zip is named st_all_2006.zip instead.

    Args:
        year: survey year (int).
        dur: duration in years.
        states: iterable of two-letter state abbreviations.

    Returns:
        List of dicts, each with a full 'url' and the 'state'
        abbreviation; one entry for the data zip and one for the
        geography file of every state whose files could be located.
    """
    files = []
    for st in states:
        is2005 = (year == 2005)
        logger.debug("State filestub: %s" % state_filestub(st, is2005))
        st_dir = self.rooturls[year][dur]['data'] + state_filestub(st, is2005)
        st_links = get_links(st_dir)
        if is2005:
            st_file = "all_{0}.zip".format(st)
            geo_file = "{0}geo.2005-1yr".format(st)
        elif "all_{0}.zip".format(st) in st_links:
            st_file = "all_{0}.zip".format(st)
            geo_file = "g{0}{1}{2}.txt".format(year, dur, st)
        elif "{0}_all_2006.zip".format(st) in st_links:
            st_file = "{0}_all_2006.zip".format(st)
            geo_file = "g{0}{1}{2}.txt".format(year, dur, st)
        else:
            # The listing has neither naming variant; skip this state.
            logger.warning("NO FILE for: {0} {1}".format(st, st_dir))
            continue
        files.append({'url': st_dir + '/' + st_file, 'state': st})
        files.append({'url': st_dir + '/' + geo_file, 'state': st})
    return files
def old_sf_files(self, year, dur, states):
    """Build the download list using the pre-2009 ("old") server layout.

    Before 2009 the summaryfile/ directory holds one subdirectory per
    state, and inside each is a zip containing all of that state's
    files.  The general pattern is:

        acsYEAR_DURyr/summaryfile/StateName/all_st.zip

    with StateName the camel-cased state name from state_filestub() and
    st the lowercase two-letter abbreviation (or "us").  Exceptions: in
    2005 and 2006 the outermost directory is just acsYEAR (both 1-year
    files), and in 2006 the zip is named st_all_2006.zip.

    Args:
        year: survey year (int).
        dur: duration in years.
        states: iterable of two-letter state abbreviations.

    Returns:
        List of dicts, each holding the full 'url' and 'state'
        abbreviation — one for the data zip, one for the geography file
        of every state whose files were found on the server.
    """
    files = []
    for st in states:
        is2005 = (year == 2005)
        logger.debug("State filestub: %s" % state_filestub(st, is2005))
        st_dir = self.rooturls[year][dur]['data'] + state_filestub(st, is2005)
        st_links = get_links(st_dir)
        if is2005:
            st_file = "all_{0}.zip".format(st)
            geo_file = "{0}geo.2005-1yr".format(st)
        elif "all_{0}.zip".format(st) in st_links:
            st_file = "all_{0}.zip".format(st)
            geo_file = "g{0}{1}{2}.txt".format(year, dur, st)
        elif "{0}_all_2006.zip".format(st) in st_links:
            st_file = "{0}_all_2006.zip".format(st)
            geo_file = "g{0}{1}{2}.txt".format(year, dur, st)
        else:
            # No known naming variant found in the listing; skip state.
            logger.warning("NO FILE for: {0} {1}".format(st, st_dir))
            continue
        files.append({'url': st_dir + '/' + st_file, 'state': st})
        files.append({'url': st_dir + '/' + geo_file, 'state': st})
    return files
def state_data_files(self, year, dur, states):
    """Return the download list for the given year, duration and states.

    Dispatches to pums_files() when this downloader is in PUMS mode;
    otherwise detects the server layout from the data-directory listing
    (an 'Alabama/' link marks the pre-2009 per-state layout) and
    delegates to old_sf_files() or new_sf_files().

    Returns an empty list, after logging an error, when no data URL is
    known for the year/duration combination.
    """
    try:
        state_urls = get_links(self.rooturls[year][dur]['data'])
    except KeyError:
        logger.error("No valid data URL for {year}, {dur}".format(
            year=year, dur=dur))
        return []
    if self.pums:
        return self.pums_files(self.rooturls[year][dur]['data'],
                               year, dur, states)
    if u'Alabama/' in state_urls:
        # Old SF layout: one subdirectory per state.
        return self.old_sf_files(year, dur, states)
    # New SF layout (2009+): flat directory of state zips.
    return self.new_sf_files(year, dur, states)
def state_data_files(self, year, dur, states):
    """Return the download list for the given year, duration and states.

    PUMS downloads go through pums_files(); summary-file downloads are
    routed to old_sf_files() or new_sf_files() depending on the server
    layout, detected by the presence of an 'Alabama/' link (per-state
    subdirectories, used before 2009) in the data directory listing.

    Logs an error and returns an empty list when no data URL exists for
    the year/duration combination.
    """
    try:
        state_urls = get_links(self.rooturls[year][dur]['data'])
    except KeyError:
        logger.error("No valid data URL for {year}, {dur}".format(
            year=year, dur=dur))
        return []
    if self.pums:
        return self.pums_files(self.rooturls[year][dur]['data'],
                               year, dur, states)
    if u'Alabama/' in state_urls:
        # Pre-2009 layout: each state has its own subdirectory.
        return self.old_sf_files(year, dur, states)
    # 2009+ layout: all state zips in one flat directory.
    return self.new_sf_files(year, dur, states)
def new_sf_files(self, year, dur, states):
    """Build the download list using the 2009+ ("new") server layout.

    Starting in 2009 all state files sit together in a single
    summaryfile/ subdirectory, one zip per state, with no per-state
    subdirectories.  Every zip link whose name contains the state's
    file stub is collected.

    Returns a list of dicts with the full 'url' and 'state'
    abbreviation.

    NOTE(review): the stub is matched as an unanchored regex substring,
    so a stub contained in another state's stub (e.g. "Virginia" inside
    "WestVirginia") could pick up extra files — confirm against the
    actual link names on the server.
    """
    files = []
    for st in states:
        st_dir = self.rooturls[year][dur]['data']
        pattern = r'{0}.*\.zip'.format(state_filestub(st))
        for link in get_links(st_dir):
            if re.search(pattern, link):
                files.append({'url': st_dir + link, 'state': st})
    return files
def new_sf_files(self, year, dur, states):
    """Build the download list using the 2009+ ("new") server layout.

    From 2009 on the Census keeps every state's zip directly inside one
    summaryfile/ subdirectory; states no longer have their own
    subdirectories.  Any zip link mentioning the state's file stub is
    collected.

    Returns a list of dicts carrying the full 'url' and the 'state'
    abbreviation.

    NOTE(review): matching is an unanchored regex search, so one
    state's stub appearing inside another's (e.g. "Virginia" in
    "WestVirginia") could over-match — verify against real link names.
    """
    files = []
    for st in states:
        st_dir = self.rooturls[year][dur]['data']
        stub_re = re.compile(r'{0}.*\.zip'.format(state_filestub(st)))
        matches = [link for link in get_links(st_dir) if stub_re.search(link)]
        files.extend({'url': st_dir + link, 'state': st} for link in matches)
    return files
def stubs_and_documentation(self, year, dur):
    """Collect URLs for stub files, technical docs and SAS example macros.

    Args:
        year: survey year (int).
        dur: duration in years.

    Returns:
        Dict with keys 'stubs', 'docs' and 'macros'.  'stubs' and
        'docs' are lists of {'url': ...} dicts ('docs' entries also
        carry 'file'); 'macros' is a one-element list or None when no
        macro location is known for the year/duration.
    """
    try:
        doc_url = self.rooturls[year][dur]['documentation']
    except KeyError:
        logger.error("No valid documentation URL for {year}, {dur}".format(
            year=year, dur=dur))
        # BUG FIX: this key was 'macro', inconsistent with the 'macros'
        # key in the normal return below.
        return {'stubs': [], 'docs': [], 'macros': []}

    # Technical documentation PDF file.
    tech_doc_re = re.compile(r'(.*_SF_Tech_Doc\.pdf)')

    def _tech_filter(href):
        if tech_doc_re.search(href):
            return href

    tech_doc = get_links(doc_url, link_filter=_tech_filter)
    logger.debug("Tech doc: %s" % tech_doc)
    docs = [{'url': doc_url + tdoc, 'file': tdoc} for tdoc in tech_doc]

    # Stub-file naming varies by era: merge_5_6* (oldest),
    # Sequence_Number_Table_Number_Lookup*, and the duration-specific
    # ACS_{dur}yr_Seq_Table_Number_Lookup*.
    old_stub_re = re.compile(r'merge_5_6.*')
    new_stub_re = re.compile(r'Sequence_?Number_?Table_?Number_?Lookup.*')
    dur_stub_re = re.compile(r'ACS_' + re.escape(str(dur)) +
                             r'yr_Seq_Table_Number_Lookup.*')

    def _match_old_or_new(url):
        # Accept any of the three known stub-file naming patterns.
        if (old_stub_re.search(url) or new_stub_re.search(url)
                or dur_stub_re.search(url)):
            return url
        return None

    if year == 2005:
        # 2005 stubs were never moved to the new server.
        stub_urls = ["http://www2.census.gov/acs2005/Chapter_6_acs_2005_tables_Sum_file_shells.xls"]
    elif year == 2006:
        stub_urls = ["http://www2.census.gov/acs2006/merge_5_6_final.txt",
                     "http://www2.census.gov/acs2006/merge_5_6_final.xls"]
    elif year == 2007:
        stub_files = get_links(doc_url, link_filter=_match_old_or_new)
        stub_urls = [doc_url + f for f in stub_files]
    else:
        # 2008 onward: stubs live under user_tools/.  (The original
        # year<=2012 and year>=2013 branches were byte-identical.)
        stub_files = get_links(doc_url + "user_tools/",
                               link_filter=_match_old_or_new)
        stub_urls = [doc_url + "user_tools/" + f for f in stub_files]
    stubs = [{'url': u} for u in stub_urls]

    # Example SAS macros: location and file name vary year to year.
    if year <= 2006:
        macro_url = None
    elif year == 2007 and dur == 3:
        macro_url = "http://www2.census.gov/programs-surveys/acs/summary_file/2007/documentation/3_year/Sample SAS Programs/summary_file_example_macros.sas"
    elif year <= 2008:
        macro_url = doc_url + "0SASExamplePrograms/summary_file_example_macros.sas"
    elif year == 2009 and dur == 3:
        macro_url = "http://www2.census.gov/programs-surveys/acs/summary_file/2009/documentation/3_year/user_tools/SF_ALL_Macro.sas"
    elif year <= 2012:
        macro_url = doc_url + "user_tools/SF_All_Macro.sas"
    elif year == 2013:
        macro_url = doc_url + "user_tools/SummaryFile_All_Macro.sas"
    elif year == 2014:
        macro_url = doc_url + "user_tools/SF_All_Macro_1YR.sas"
    else:
        # BUG FIX: years after 2014 previously left macro_url unbound,
        # raising UnboundLocalError at the check below.
        macro_url = None
    macros = None
    if macro_url:
        macros = [{'url': macro_url}]

    logger.debug("STUBS: \n%s" % pprint.pformat(stubs))
    return {'stubs': stubs, 'docs': docs, 'macros': macros}
def folders(self, years, durations):
    """List links under baseurl matching the requested years/durations."""
    def _wanted(href):
        return acs_year_dur_filter(href, years, durations)
    return get_links(self.baseurl, _wanted)
def folders(self, years, durations):
    """Return baseurl links that match the given years and durations."""
    def _accept(href):
        return acs_year_dur_filter(href, years, durations)
    return get_links(self.baseurl, _accept)
def stubs_and_documentation(self, year, dur):
    """Collect URLs for stub files, technical docs and SAS example macros.

    Args:
        year: survey year (int).
        dur: duration in years.

    Returns:
        Dict with keys 'stubs', 'docs' and 'macros'.  'stubs' and
        'docs' are lists of {'url': ...} dicts ('docs' entries also
        carry 'file'); 'macros' is a one-element list or None when no
        macro location is known for the year/duration.
    """
    try:
        doc_url = self.rooturls[year][dur]['documentation']
    except KeyError:
        logger.error("No valid documentation URL for {year}, {dur}".format(
            year=year, dur=dur))
        # BUG FIX: key was 'macro' here, but 'macros' in the normal
        # return path — callers indexing 'macros' would get a KeyError.
        return {'stubs': [], 'docs': [], 'macros': []}

    # Technical documentation PDF file.
    tech_doc_re = re.compile(r'(.*_SF_Tech_Doc\.pdf)')

    def _tech_filter(href):
        if tech_doc_re.search(href):
            return href

    tech_doc = get_links(doc_url, link_filter=_tech_filter)
    logger.debug("Tech doc: %s" % tech_doc)
    docs = [{'url': doc_url + tdoc, 'file': tdoc} for tdoc in tech_doc]

    # Three stub-file naming eras: merge_5_6* (oldest),
    # Sequence_Number_Table_Number_Lookup*, and duration-specific
    # ACS_{dur}yr_Seq_Table_Number_Lookup*.
    old_stub_re = re.compile(r'merge_5_6.*')
    new_stub_re = re.compile(r'Sequence_?Number_?Table_?Number_?Lookup.*')
    dur_stub_re = re.compile(r'ACS_' + re.escape(str(dur)) +
                             r'yr_Seq_Table_Number_Lookup.*')

    def _match_old_or_new(url):
        # Accept any of the three known stub-file naming patterns.
        if (old_stub_re.search(url) or new_stub_re.search(url)
                or dur_stub_re.search(url)):
            return url
        return None

    if year == 2005:
        # 2005 stubs were never moved to the new server.
        stub_urls = ["http://www2.census.gov/acs2005/Chapter_6_acs_2005_tables_Sum_file_shells.xls"]
    elif year == 2006:
        stub_urls = ["http://www2.census.gov/acs2006/merge_5_6_final.txt",
                     "http://www2.census.gov/acs2006/merge_5_6_final.xls"]
    elif year == 2007:
        stub_files = get_links(doc_url, link_filter=_match_old_or_new)
        stub_urls = [doc_url + f for f in stub_files]
    else:
        # 2008 onward: stubs live under user_tools/.  (The original
        # year<=2012 and year>=2013 branches were byte-identical.)
        stub_files = get_links(doc_url + "user_tools/",
                               link_filter=_match_old_or_new)
        stub_urls = [doc_url + "user_tools/" + f for f in stub_files]
    stubs = [{'url': u} for u in stub_urls]

    # Example SAS macros: location and file name vary year to year.
    if year <= 2006:
        macro_url = None
    elif year == 2007 and dur == 3:
        macro_url = "http://www2.census.gov/programs-surveys/acs/summary_file/2007/documentation/3_year/Sample SAS Programs/summary_file_example_macros.sas"
    elif year <= 2008:
        macro_url = doc_url + "0SASExamplePrograms/summary_file_example_macros.sas"
    elif year == 2009 and dur == 3:
        macro_url = "http://www2.census.gov/programs-surveys/acs/summary_file/2009/documentation/3_year/user_tools/SF_ALL_Macro.sas"
    elif year <= 2012:
        macro_url = doc_url + "user_tools/SF_All_Macro.sas"
    elif year == 2013:
        macro_url = doc_url + "user_tools/SummaryFile_All_Macro.sas"
    elif year == 2014:
        macro_url = doc_url + "user_tools/SF_All_Macro_1YR.sas"
    else:
        # BUG FIX: years after 2014 previously left macro_url unbound,
        # raising UnboundLocalError at the check below.
        macro_url = None
    macros = None
    if macro_url:
        macros = [{'url': macro_url}]

    logger.debug("STUBS: \n%s" % pprint.pformat(stubs))
    return {'stubs': stubs, 'docs': docs, 'macros': macros}