def create_csv_gastos(year, reader):
    if not exists(join(output_path, year)):
        makedirs(join(output_path, year))
    with open('%s/%s/gastos.csv' % (output_path, year), 'w') as f:
        fieldnames = [
            'EJERCICIO',
            'CENTRO GESTOR',
            'FUNCIONAL',
            'ECONOMICA',
            'FINANCIACION',
            'DESCRIPCION',
            'SANCIONADO',
        ]
        wr = CSVKitDictWriter(f, fieldnames=fieldnames, delimiter=';')
        wr.writeheader()
        for row in reader:
            # Build the zero-padded "centro gestor" code from its parts
            centro = (format_zeroes(int(row['JUR']), 2) +
                      format_zeroes(int(row['OGESE']), 3) +
                      format_zeroes(int(row['UE']), 4))
            line = {
                'EJERCICIO': year,
                'CENTRO GESTOR': centro,
                'FUNCIONAL': row['FIN'] + row['FUN'],
                'ECONOMICA': row['ECO'][1:4],
                'FINANCIACION': row['FF'],
                'DESCRIPCION': row['PARC_DESC'],
                # Normalize the decimal comma to a decimal point
                'SANCIONADO': row['SANCION'].replace(',', '.'),
            }
            wr.writerow(line)
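# format_zeroes() is not defined in this excerpt. A minimal sketch, assuming
# it simply zero-pads an integer to a fixed width (consistent with how it is
# called above):
def format_zeroes(value, width):
    """Zero-pad an integer, e.g. format_zeroes(7, 3) -> '007'."""
    return str(value).zfill(width)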
def process_armlist():
    # Create output files folder if needed
    OUTPUT_PATH = INPUT_PATH
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)

    # Initialize geocoder
    geocoder = Nominatim()

    with open('%s/%s.csv' % (INPUT_PATH, OUTPUT_FILE), 'w') as fout:
        writer = CSVKitDictWriter(fout, fieldnames=HEADER,
                                  extrasaction='ignore')
        writer.writeheader()
        with open('%s/%s.csv' % (INPUT_PATH, INPUT_FILE), 'r') as f:
            reader = CSVKitDictReader(f)
            count = 0
            for row in reader:
                count += 1
                if count % 1000 == 0:
                    print "processed %s records" % count
                if LIMIT and (count >= LIMIT_SAMPLE):
                    break
                # Clean data
                clean(row)
                # Geocode with Nominatim (see the sketch below)
                geocode_nominatim(row, geocoder)
                # Write to csv file
                writer.writerow(row)
    print('finished processing {}.csv'.format(INPUT_FILE))
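# geocode_nominatim() is defined elsewhere. A minimal sketch, assuming it
# looks up an address column with geopy's Nominatim and memoizes results in
# the module-level `cache` dict used by persist_cache() below; the 'address'
# key is an assumption:
def geocode_nominatim(row, geocoder):
    address = row.get('address')
    if not address:
        return
    if address in cache:
        # Cached values are (longitude, latitude) tuples
        row['longitude'], row['latitude'] = cache[address]
        return
    location = geocoder.geocode(address)
    if location:
        row['latitude'] = location.latitude
        row['longitude'] = location.longitude
        cache[address] = (location.longitude, location.latitude)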
def create_key_file(fname, elec="paso"):
    """generate key to assign polling station"""
    input_path = ELEC_CONFIG[elec][0]
    output_path = ELEC_CONFIG[elec][1]
    quotes = ELEC_CONFIG[elec][2]

    # Create folders
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Open output file in write mode
    with open("%s/%s_key.csv" % (output_path, fname), "w") as fout:
        results = CSVKitDictWriter(
            fout,
            encoding="utf-8",
            # Ignore keys not in fieldnames
            extrasaction="ignore",
            fieldnames=HEADER,
        )
        # Write header
        results.writeheader()
        # Open input file in read mode
        with open("%s/%s.csv" % (input_path, fname), "r") as f:
            reader = CSVKitDictReader(f, quotechar=quotes)
            count = 0
            for row in reader:
                count += 1
                if count % 10000 == 0:
                    print("processed %s polling tables" % count)
                r = clean_row(row)
                results.writerow(r)
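# ELEC_CONFIG is defined elsewhere. Judging from the indexing above it maps an
# election slug to (input_path, output_path, quotechar); the slugs and paths
# below are hypothetical placeholders:
ELEC_CONFIG = {
    'paso': ('data/paso', 'output/paso', "'"),
    'general': ('data/general', 'output/general', '"'),
}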
def get_books_goodreads_ids(
        input_filename=os.path.join('data', 'books.csv'),
        output_filename=os.path.join('data', 'goodreads_ids.csv')):
    """
    Retrieve GoodReads IDs corresponding to books in the books spreadsheet.
    """
    fieldnames = [
        # Only include enough fields to identify the book
        'title',
        'isbn',
        'goodreads_id',
    ]
    with open(input_filename) as readfile:
        reader = CSVKitDictReader(readfile, encoding='utf-8')
        reader.fieldnames = [name.strip().lower()
                             for name in reader.fieldnames]
        with open(output_filename, 'wb') as fout:
            writer = CSVKitDictWriter(fout, fieldnames=fieldnames)
            writer.writeheader()
            for book in reader:
                output_book = {
                    'title': book['title'],
                    'isbn': book['isbn'],
                    'goodreads_id': '',
                }
                if book['isbn']:
                    output_book['goodreads_id'] = Book.get_goodreads_id(
                        book['isbn'])
                writer.writerow(output_book)
                # According to the Goodreads API documentation
                # (https://www.goodreads.com/api/terms) the rate limit is
                # 1 request per second, so sleep 2 seconds to stay under it.
                time.sleep(2)
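# Book.get_goodreads_id() lives elsewhere in the project. A standalone sketch,
# assuming it called Goodreads' isbn_to_id endpoint (since retired), which
# returned the numeric book ID as plain text; GOODREADS_API_KEY is a
# hypothetical constant:
import requests

def get_goodreads_id(isbn):
    resp = requests.get(
        'https://www.goodreads.com/book/isbn_to_id/%s' % isbn,
        params={'key': GOODREADS_API_KEY})  # hypothetical API key constant
    return resp.text if resp.status_code == 200 else ''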
def persist_cache():
    """
    Persist cache to disk
    """
    with open('%s/%s.csv' % (INPUT_PATH, CACHE_FILE), 'w') as fout:
        writer = CSVKitDictWriter(fout, fieldnames=CACHE_HEADER)
        writer.writeheader()
        for k, v in cache.iteritems():
            row = {'address': k, 'latitude': v[1], 'longitude': v[0]}
            writer.writerow(row)
def persist_cache():
    """
    Persist cache to disk
    """
    with open('%s/cached_locations.csv' % CACHE_PATH, 'w') as fout:
        writer = CSVKitDictWriter(fout, fieldnames=CACHE_HEADER,
                                  quoting=QUOTE_ALL)
        writer.writeheader()
        for k, v in cache.iteritems():
            row = {'address': k, 'latitude': v[1], 'longitude': v[0]}
            writer.writerow(row)
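# load_geocoded_cache() is called in run() below but not defined in this
# excerpt. A minimal sketch mirroring persist_cache(), assuming CACHE_HEADER
# is ('address', 'latitude', 'longitude'):
def load_geocoded_cache():
    path = '%s/cached_locations.csv' % CACHE_PATH
    if not os.path.exists(path):
        return
    with open(path, 'r') as f:
        reader = CSVKitDictReader(f)
        for row in reader:
            # Store as (longitude, latitude), the order persist_cache() writes
            cache[row['address']] = (row['longitude'], row['latitude'])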
def get_books_itunes_ids(
        input_filename=os.path.join('data', 'books.csv'),
        output_filename=os.path.join('data', 'itunes_ids.csv')):
    """
    Retrieve iTunes IDs corresponding to books in the books spreadsheet.
    """
    fieldnames = [
        # Only include enough fields to identify the book
        'title',
        'isbn',
        'itunes_id',
    ]
    with open(input_filename) as readfile:
        reader = CSVKitDictReader(readfile, encoding='utf-8')
        reader.fieldnames = [
            name.strip().lower() for name in reader.fieldnames
        ]
        with open(output_filename, 'wb') as fout:
            writer = CSVKitDictWriter(fout, fieldnames=fieldnames)
            writer.writeheader()
            for book in reader:
                # Note that we don't create Book objects because the
                # parsing/lookup takes too long and we only need to look up
                # the iTunes ID.
                output_book = {k: book[k] for k in fieldnames}
                if book['title']:
                    output_book['itunes_id'] = Book.get_itunes_id(
                        book['title'])
                writer.writerow(output_book)
                # We have to wait to avoid API throttling. According to the
                # Enterprise Partner Feed documentation, the limit is ~20
                # calls per minute. See
                # https://affiliate.itunes.apple.com/resources/documentation/itunes-enterprise-partner-feed/
                # A sleep time of 5 seconds still caused many requests to
                # fail, so use 10.
                time.sleep(10)
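# Book.get_itunes_id() is defined elsewhere. A standalone sketch, assuming it
# queries the public iTunes Search API for an ebook by title and returns the
# first match's trackId:
import requests

def get_itunes_id(title):
    resp = requests.get('https://itunes.apple.com/search',
                        params={'term': title, 'media': 'ebook', 'limit': 1})
    results = resp.json().get('results', [])
    return results[0]['trackId'] if results else ''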
def get_station_coverage_headlines(
        csv_path=DEFAULT_STATION_COVERAGE_CSV_PATH,
        output_path=DEFAULT_STATION_COVERAGE_HEADLINES_CSV_PATH,
        isbn_key=DEFAULT_ISBN_KEY,
        title_key=DEFAULT_TITLE_KEY,
        url_key=DEFAULT_URL_KEY,
        headline_key=DEFAULT_HEADLINE_KEY):
    """
    Get headlines for station coverage links.

    Args:
        csv_path (str): Path to input CSV file.
        output_path (str): Path to output CSV file.
        isbn_key (str): Column name in the CSV data for the column that
            contains the book's ISBN.
        title_key (str): Column name in the CSV data for the column that
            contains the book's title.
        url_key (str): Column name in the CSV data for the column that
            contains the station coverage URL.
        headline_key (str): Column name in the CSV data for the column that
            contains the station coverage headline.
    """
    with open(csv_path) as f:
        reader = CSVKitDictReader(f)
        with open(output_path, 'wb') as fout:
            fieldnames = [title_key, isbn_key, headline_key]
            writer = CSVKitDictWriter(fout, fieldnames=fieldnames)
            writer.writeheader()
            for row in reader:
                output_row = {
                    isbn_key: row[isbn_key],
                    title_key: row[title_key],
                }
                url = row[url_key]
                if url:
                    output_row[headline_key] = get_link_title(url)
                writer.writerow(output_row)
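# get_link_title() is defined elsewhere. A minimal sketch, assuming it fetches
# the page and extracts the <title> tag; requests and BeautifulSoup are
# assumptions, the real helper may parse differently:
import requests
from bs4 import BeautifulSoup

def get_link_title(url):
    resp = requests.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, 'html.parser')
    if soup.title and soup.title.string:
        return soup.title.string.strip()
    return ''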
def run(args):
    try:
        if args.debug:
            logger.setLevel(logging.DEBUG)
        if args.no_cache:
            load_geocoded_cache()

        # Create output folder
        if not os.path.exists(OUTPUT_PATH):
            os.makedirs(OUTPUT_PATH)

        # Initialize geocoder
        geocoder = Nominatim()

        with open('%s/output.csv' % OUTPUT_PATH, 'w') as fout:
            writer = CSVKitDictWriter(fout, fieldnames=HEADER,
                                      extrasaction='ignore',
                                      quoting=QUOTE_ALL)
            writer.writeheader()
            with open(args.input, 'r') as f:
                reader = CSVKitDictReader(f)
                logger.info('start processing %s' % args.input)
                for ix, row in enumerate(reader):
                    if (ix + 1) % 100 == 0:
                        logger.debug("processed %s records" % (ix + 1))
                    if args.sample and (ix >= args.sample):
                        break
                    # Geocode
                    geocode_nominatim(row, geocoder)
                    # Write to csv file
                    writer.writerow(row)
        logger.info('finished processing %s' % args.input)
    finally:
        if args.no_cache:
            # Always persist cache file to disk
            persist_cache()
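# The argument parser is not shown in this excerpt. A hypothetical wiring,
# consistent with the attributes run() reads (input, debug, sample, no_cache);
# the flag names are assumptions:
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Geocode a CSV of addresses')
    parser.add_argument('input', help='path to the input CSV file')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--sample', type=int, default=0,
                        help='stop after this many records')
    # store_false: the cache is used unless explicitly disabled, which would
    # explain why run() loads and persists the cache when args.no_cache is
    # truthy
    parser.add_argument('--disable-cache', dest='no_cache',
                        action='store_false')
    run(parser.parse_args())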
def create_key_file(fname, elec='paso', sim=False):
    '''generate key to assign polling station'''
    input_path = ELEC_CONFIG[elec][0]
    output_path = ELEC_CONFIG[elec][1]

    # Create folders
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Open output file in write mode
    with open('%s/%s_key.csv' % (output_path, fname), 'w') as fout:
        results = CSVKitDictWriter(
            fout,
            encoding='utf-8',
            # Ignore keys not in fieldnames
            extrasaction='ignore',
            fieldnames=HEADER)
        # Write header
        results.writeheader()
        # Open input file in read mode
        fname = 'mesaspresidente' if sim else fname
        with open('%s/%s.csv' % (input_path, fname), 'r') as f:
            reader = CSVKitDictReader(f, quotechar="'")
            count = 0
            for row in reader:
                count += 1
                if count % 50000 == 0:
                    print('processed %s polling tables' % count)
                if sim:
                    rows = generate_sim_rows(row)
                    for r in rows:
                        results.writerow(r)
                else:
                    r = clean_row(row)
                    results.writerow(r)
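# Hypothetical usage of create_key_file(), assuming 'mesas' names a
# polling-table CSV in the configured input path:
# create_key_file('mesas', elec='paso')            # real results
# create_key_file('mesas', elec='paso', sim=True)  # reads mesaspresidente.csv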
        raise

    with open('www/static-data/books.json', 'wb') as writefile:
        writefile.write(json.dumps(book_list))

    with open('data/test-itunes-equiv.csv', 'w') as fout:
        writer = CSVKitDictWriter(
            fout,
            fieldnames=['title', 'isbn', 'isbn13', 'itunes_id'],
            extrasaction='ignore')
        writer.writeheader()
        writer.writerows(book_list)

    with open('data/tag-audit.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['tag', 'slug', 'count'])
        for slug, count in tags.items():
            writer.writerow([SLUGS_TO_TAGS[slug], slug, count])

    logger.info("End.")


@task
def load_books():
    """
    Loads/reloads just the book data. Does not save image files.
    """
    logger.info("start load_books")
    logger.info("get books csv")
    get_books_csv()
    logger.info("start parse_books_csv")
def merge_external_links(
        books_csv_path=DEFAULT_BOOKS_CSV,
        links_json_path=DEFAULT_EXTERNAL_LINKS_JSON_PATH,
        output_csv_path=DEFAULT_EXTERNAL_LINKS_OUTPUT_CSV_PATH):
    """
    Create a CSV file containing external links.

    Create a CSV file containing external links that can be copied into the
    books Google Spreadsheet.

    Args:
        books_csv_path (str): Path to CSV file containing data from the
            books Google Spreadsheet.
        links_json_path (str): Path to JSON file created by
            `parse_external_links_csv()`.
        output_csv_path (str): Path to output CSV file.
    """
    fieldnames = [
        # Only include enough fields to identify the book
        'title',
        'isbn',
        'external_links_html',
    ]
    with open(links_json_path) as jsonf:
        lookup = json.load(jsonf)

    matched = set()

    with open(books_csv_path) as readfile:
        reader = CSVKitDictReader(readfile, encoding='utf-8')
        reader.fieldnames = [
            name.strip().lower() for name in reader.fieldnames
        ]
        with open(output_csv_path, 'wb') as fout:
            writer = CSVKitDictWriter(fout, fieldnames=fieldnames)
            writer.writeheader()
            for book in reader:
                output_book = {
                    'title': book['title'],
                    'isbn': book['isbn'],
                    'external_links_html': '',
                }
                if book['isbn']:
                    try:
                        links, matching_isbn = lookup_links_by_isbn(
                            book['isbn'], lookup)
                        output_book['external_links_html'] = ','.join(links)
                        matched.add(matching_isbn)
                    except KeyError:
                        # No matching member station coverage. This is OK.
                        pass
                writer.writerow(output_book)

    # Do an audit to see if there are any ISBNs in the member station
    # responses that didn't match books.
    for isbn in lookup:
        if isbn not in matched:
            logger.warn("No matching book found for ISBN %s" % isbn)
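# lookup_links_by_isbn() is defined elsewhere. A minimal sketch, assuming the
# lookup dict maps ISBNs to lists of link HTML and that the helper also tries
# the ISBN-10/ISBN-13 equivalent before raising KeyError; isbnlib is an
# assumption:
import isbnlib

def lookup_links_by_isbn(isbn, lookup):
    candidates = [isbn, isbnlib.to_isbn13(isbn), isbnlib.to_isbn10(isbn)]
    for candidate in candidates:
        if candidate and candidate in lookup:
            return lookup[candidate], candidate
    raise KeyError(isbn)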