Example #1
def create_csv_gastos(year, reader):
    if not exists(join(output_path, year)):
        makedirs(join(output_path, year))
    with open('%s/%s/gastos.csv' % (
            output_path, year), 'w') as f:
        fieldnames = [
            'EJERCICIO', 'CENTRO GESTOR', 'FUNCIONAL',
            'ECONOMICA', 'FINANCIACION', 'DESCRIPCION',
            'SANCIONADO'
        ]
        wr = CSVKitDictWriter(f, fieldnames=fieldnames, delimiter=';')
        wr.writeheader()
        for row in reader:
            # Build the managing centre code by zero-padding and
            # concatenating the three identifier columns
            centro = (format_zeroes(int(row['JUR']), 2) +
                      format_zeroes(int(row['OGESE']), 3) +
                      format_zeroes(int(row['UE']), 4))

            line = {
                'EJERCICIO': year,
                'CENTRO GESTOR': centro,
                'FUNCIONAL': row['FIN']+row['FUN'],
                # 'ECONOMICA': row['inciso']+row['principal']+row['parcial'],
                'ECONOMICA': row['ECO'][1:4],
                'FINANCIACION': row['FF'],
                'DESCRIPCION': row['PARC_DESC'],
                'SANCIONADO': row['SANCION'].replace(",",".")
            }
            wr.writerow(line)
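The format_zeroes helper is defined elsewhere in the original project. Judging from how it is called above (an integer plus a target width), a minimal sketch of what it presumably does is plain zero-padding; the real helper may differ:

def format_zeroes(value, width):
    """Hypothetical reconstruction: zero-pad an integer to a fixed width,
    e.g. format_zeroes(7, 3) -> '007'."""
    return str(int(value)).zfill(width)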
def _main():
    """extract individual tables from range"""
    start_time = time()
    # Create folders
    if not os.path.exists(OUTPUT_COMMON_PATH):
        os.makedirs(OUTPUT_COMMON_PATH)

    # Open output file in write mode
    with open("%s/%s_mesas.csv" % (OUTPUT_COMMON_PATH, "establecimientos"), "w") as fout:
        results = CSVKitDictWriter(
            fout,
            encoding="utf-8",
            # Ignore keys not in fieldnames
            extrasaction="ignore",
            fieldnames=HEADER,
        )
        # Write header
        results.writeheader()

        # Open input file in read mode
        with open("%s/%s.csv" % (INPUT_COMMON_PATH, "establecimientos"), "r") as f:
            reader = CSVKitDictReader(f)
            count = 0
            for row in reader:
                count += 1
                if count % 1000 == 0:
                    print ("processed %s polling stations" % (count))
                l = extract_range(row)
                results.writerows(l)
    print "extract individual tables: %s seconds" % (time() - start_time)
Example #3
def process_armlist():
    # Create output files folder if needed
    OUTPUT_PATH = INPUT_PATH
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)

    # Initialize geocoder
    geocoder = Nominatim()

    with open('%s/%s.csv' %
              (INPUT_PATH, OUTPUT_FILE), 'w') as fout:
        writer = CSVKitDictWriter(fout, fieldnames=HEADER,
                                  extrasaction='ignore')
        writer.writeheader()
        with open('%s/%s.csv' % (INPUT_PATH, INPUT_FILE), 'r') as f:
            reader = CSVKitDictReader(f)
            count = 0
            for row in reader:
                count += 1
                if count % 1000 == 0:
                    print "processed %s records" % count
                if LIMIT and (count >= LIMIT_SAMPLE):
                    break

                # Clean data
                clean(row)
                # Geocode
                # geocode(row, geocoder)
                geocode_nominatim(row, geocoder)
                # Write to csv file
                writer.writerow(row)
            print('finished processing {}.csv'.format(INPUT_FILE))
def create_key_file(fname, elec="paso"):
    """generate key to assign polling station"""
    input_path = ELEC_CONFIG[elec][0]
    output_path = ELEC_CONFIG[elec][1]
    quotes = ELEC_CONFIG[elec][2]
    # Create folders
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Open output file in write mode
    with open("%s/%s_key.csv" % (output_path, fname), "w") as fout:
        results = CSVKitDictWriter(
            fout,
            encoding="utf-8",
            # Ignore keys not in fieldnames
            extrasaction="ignore",
            fieldnames=HEADER,
        )
        # Write header
        results.writeheader()

        # Open input file in read mode
        with open("%s/%s.csv" % (input_path, fname), "r") as f:
            reader = CSVKitDictReader(f, quotechar=quotes)
            count = 0
            for row in reader:
                count += 1
                if count % 10000 == 0:
                    print ("processed %s polling tables" % (count))
                r = clean_row(row)
                results.writerow(r)
Example #5
def persist_cache():
    """
    Persist cache to disk
    """
    with open('%s/%s.csv' %
              (INPUT_PATH, CACHE_FILE), 'w') as fout:
        writer = CSVKitDictWriter(fout, fieldnames=CACHE_HEADER)
        writer.writeheader()
        for k, v in cache.iteritems():
            row = {'address': k, 'latitude': v[1], 'longitude': v[0]}
            writer.writerow(row)
def process_telegrams(fname=None, proj=None):
    '''Download telegrams from gov site'''
    # Initialize the DocumentCloud client
    client = DocumentCloud(DOCUMENTCLOUD_USERNAME, DOCUMENTCLOUD_PASSWORD)
    with open('%s/%s.csv' %
              (OUTPUT_PATH, fname), 'w') as fout:
        writer = CSVKitDictWriter(fout, fieldnames=HEADER)
        writer.writeheader()
        r = get_proj_docs_dc(client, proj)
        writer.writerows(r)

        print('finished processing {}.csv'.format(fname))
def process_telegrams(fext='pdf', downloaded=None):
    '''Download telegrams from gov site'''
    # Create output files folder if needed
    OUTPUT_PATH = '%s/%s' % (INPUT_PATH, fext)
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)
    with open('%s/%s_%s.csv' %
              (INPUT_PATH, OUTPUT_FILE, fext), 'w') as fout:
        writer = CSVKitDictWriter(fout, fieldnames=HEADER)
        writer.writeheader()
        with open('%s/%s.csv' % (INPUT_PATH, INPUT_FILE), 'r') as f:
            reader = CSVKitDictReader(f)
            r = Parallel(n_jobs=N_CORES)(delayed(download_telegram)(downloaded,
                                                                    fext,
                                                                    row)
                                         for row in reader)
            print('finished processing {}.csv'.format(INPUT_FILE))
            r = filter(None, r)
            writer.writerows(r)
def process_telegrams(cache_set, fname=None):
    """Download telegrams from gov site"""
    # Initialize the DocumentCloud client
    client = DocumentCloud(DOCUMENTCLOUD_USERNAME, DOCUMENTCLOUD_PASSWORD)

    with open("%s/uploaded_%s.csv" % (INPUT_PATH, fname), "w") as fout:
        writer = CSVKitDictWriter(fout, fieldnames=HEADER)
        writer.writeheader()
        # Open input file in read mode
        with open("%s/%s.csv" % (INPUT_PATH, fname), "r") as f:
            reader = CSVKitDictReader(f)
            r = Parallel(n_jobs=N_CORES)(delayed(upload_telegram)(fname, client, cache_set, row) for row in reader)

            print ("finished processing {}.csv".format(fname))
        print "Longitud del dataset de resultados sin filtrar: %s" % (len(r))
        r = filter(None, r)
        print "Longitud del dataset de resultados filtrados: %s" % (len(r))
        if len(r):
            writer.writerows(r)
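The Parallel(n_jobs=N_CORES)(delayed(...) for ...) calls above use joblib's fork/join idiom: delayed(func) captures a call without executing it, and Parallel runs the captured calls across worker processes and returns their results as a list, in input order. A tiny self-contained sketch of the same pattern, with a toy function standing in for download_telegram / upload_telegram:

from joblib import Parallel, delayed

def process_one(item):
    # Stand-in for download_telegram / upload_telegram; returns None for
    # empty input to show why results go through filter(None, ...) afterwards
    return item.upper() if item else None

items = ['a', '', 'b']
results = Parallel(n_jobs=2)(delayed(process_one)(i) for i in items)
# results == ['A', None, 'B']; filter(None, results) drops the failures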
Example #9
def get_books_goodreads_ids(input_filename=os.path.join('data', 'books.csv'),
        output_filename=os.path.join('data', 'goodreads_ids.csv')):
    """
    Retrieve GoodReads slugs corresponding to books in the books spreadsheet.

    """
    fieldnames = [
        # Only include enough fields to identify the book
        'title',
        'isbn',
        'goodreads_id'
    ]

    with open(input_filename) as readfile:
        reader = CSVKitDictReader(readfile, encoding='utf-8')
        reader.fieldnames = [name.strip().lower() for name in reader.fieldnames]

        with open(output_filename, 'wb') as fout:
            writer = CSVKitDictWriter(fout, fieldnames=fieldnames)
            writer.writeheader()

            for book in reader:

                output_book = {'title': book['title'], 'isbn': book['isbn'], 'goodreads_id': ''}

                if book['isbn']:
                    output_book['goodreads_id'] = Book.get_goodreads_id(book['isbn'])

                writer.writerow(output_book)

                # According to the Goodreads API documentation (https://www.goodreads.com/api/terms)
                # the rate limit is 1 request per second.
                time.sleep(2)
Example #10
def process_armlist():
    # Create output files folder if needed
    OUTPUT_PATH = INPUT_PATH
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)

    # Initialize geocoder
    geocoder = Nominatim()

    with open('%s/%s.csv' % (INPUT_PATH, OUTPUT_FILE), 'w') as fout:
        writer = CSVKitDictWriter(fout,
                                  fieldnames=HEADER,
                                  extrasaction='ignore')
        writer.writeheader()
        with open('%s/%s.csv' % (INPUT_PATH, INPUT_FILE), 'r') as f:
            reader = CSVKitDictReader(f)
            count = 0
            for row in reader:
                count += 1
                if count % 1000 == 0:
                    print "processed %s records" % count
                if LIMIT and (count >= LIMIT_SAMPLE):
                    break

                # Clean data
                clean(row)
                # Geocode
                # geocode(row, geocoder)
                geocode_nominatim(row, geocoder)
                # Write to csv file
                writer.writerow(row)
            print('finished processing {}.csv'.format(INPUT_FILE))
Example #11
def persist_cache():
    """
    Persist cache to disk
    """
    with open('%s/%s.csv' % (INPUT_PATH, CACHE_FILE), 'w') as fout:
        writer = CSVKitDictWriter(fout, fieldnames=CACHE_HEADER)
        writer.writeheader()
        for k, v in cache.iteritems():
            row = {'address': k, 'latitude': v[1], 'longitude': v[0]}
            writer.writerow(row)
Example #12
def persist_cache():
    """
    Persist cache to disk
    """
    with open('%s/cached_locations.csv' % CACHE_PATH, 'w') as fout:
        writer = CSVKitDictWriter(fout, fieldnames=CACHE_HEADER,
                                  quoting=QUOTE_ALL)
        writer.writeheader()
        for k, v in cache.iteritems():
            row = {'address': k, 'latitude': v[1], 'longitude': v[0]}
            writer.writerow(row)
Example #13
def get_books_itunes_ids(input_filename=os.path.join('data', 'books.csv'),
                         output_filename=os.path.join('data',
                                                      'itunes_ids.csv')):
    """
    Retrieve iTunes IDs corresponding to books in the books spreadsheet.

    """
    fieldnames = [
        # Only include enough fields to identify the book
        'title',
        'isbn',
        'itunes_id',
    ]

    with open(input_filename) as readfile:
        reader = CSVKitDictReader(readfile, encoding='utf-8')
        reader.fieldnames = [
            name.strip().lower() for name in reader.fieldnames
        ]

        with open(output_filename, 'wb') as fout:
            writer = CSVKitDictWriter(fout, fieldnames=fieldnames)
            writer.writeheader()

            for book in reader:
                # Note that we don't create Book objects because the
                # parsing/lookup takes too long and we only need to lookup the
                # iTunes ID.

                output_book = {k: book[k] for k in fieldnames}

                if book['title']:
                    output_book['itunes_id'] = Book.get_itunes_id(
                        book['title'])

                writer.writerow(output_book)

                # We have to wait to avoid API throttling.  According to
                # the Enterprise Partner Feed documentation, the limit is ~20
                # calls per minute.  See
                # https://affiliate.itunes.apple.com/resources/documentation/itunes-enterprise-partner-feed/
                # I had previously tried a sleep time of 5 and many requests
                # failed
                time.sleep(10)
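Book.get_itunes_id is part of the project and not shown in this excerpt. Purely as a hypothetical illustration, a title-based lookup against the public iTunes Search API could look roughly like this; the real method may behave differently:

import requests

def get_itunes_id(title):
    """Hypothetical: look up an ebook's iTunes track ID by title."""
    resp = requests.get('https://itunes.apple.com/search',
                        params={'term': title, 'media': 'ebook', 'limit': 1})
    results = resp.json().get('results', [])
    return results[0]['trackId'] if results else ''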
def create_key_file(fname, elec='paso', sim=False):
    '''generate key to assign polling station'''
    input_path = ELEC_CONFIG[elec][0]
    output_path = ELEC_CONFIG[elec][1]
    # Create folders
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Open output file in write mode
    with open('%s/%s_key.csv'
              % (output_path, fname), 'w') as fout:
        results = CSVKitDictWriter(
            fout,
            encoding='utf-8',
            # Ignore keys not in fieldnames
            extrasaction='ignore',
            fieldnames=HEADER)
        # Write header
        results.writeheader()

        # Open input file in read mode
        fname = 'mesaspresidente' if sim else fname
        with open('%s/%s.csv'
                  % (input_path, fname), 'r') as f:
            reader = CSVKitDictReader(f, quotechar="'")
            count = 0
            for row in reader:
                count += 1
                if (count % 50000 == 0):
                    print('processed %s polling tables' % (count))
                if sim:
                    rows = generate_sim_rows(row)
                    for r in rows:
                        results.writerow(r)
                else:
                    r = clean_row(row)
                    results.writerow(r)
Example #15
def get_station_coverage_headlines(
        csv_path=DEFAULT_STATION_COVERAGE_CSV_PATH,
        output_path=DEFAULT_STATION_COVERAGE_HEADLINES_CSV_PATH,
        isbn_key=DEFAULT_ISBN_KEY,
        title_key=DEFAULT_TITLE_KEY,
        url_key=DEFAULT_URL_KEY,
        headline_key=DEFAULT_HEADLINE_KEY):
    """
    Get headlines for station coverage links.

    Args:
        csv_path (str): Path to input CSV file.
        output_path (str): Path to output CSV file.
        isbn_key (str): Column name in the CSV data for the column that
            contains the book's ISBN.
        title_key (str): Column name in the CSV data for the column that
            contains the book's title.
        url_key (str): Column name in the CSV data for the column that
            contains the station coverage URL.
        headline_key (str): Column name in the CSV data for the column that
            contains the station coverage headline.

    """
    with open(csv_path) as f:
        reader = CSVKitDictReader(f)

        with open(output_path, 'wb') as fout:
            fieldnames = [title_key, isbn_key, headline_key]
            writer = CSVKitDictWriter(fout, fieldnames=fieldnames)
            writer.writeheader()

            for row in reader:
                output_row = {}
                output_row[isbn_key] = row[isbn_key]
                output_row[title_key] = row[title_key]
                url = row[url_key]
                if url:
                    output_row[headline_key] = get_link_title(url)
                writer.writerow(output_row)
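get_link_title is another project helper that is not shown here. A minimal sketch of what it plausibly does (fetch the page and return its <title> text), using requests and BeautifulSoup:

import requests
from bs4 import BeautifulSoup

def get_link_title(url):
    """Hypothetical: return a page's <title> text, or '' if it cannot be fetched."""
    try:
        resp = requests.get(url, timeout=10)
        soup = BeautifulSoup(resp.text, 'html.parser')
        if soup.title and soup.title.string:
            return soup.title.string.strip()
        return ''
    except requests.RequestException:
        return ''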
Example #16
def run(args):
    try:
        if args.debug:
            logger.setLevel(logging.DEBUG)

        if args.no_cache:
            load_geocoded_cache()

        # Create output
        if not os.path.exists(OUTPUT_PATH):
            os.makedirs(OUTPUT_PATH)

        # Initialize geocoder
        geocoder = Nominatim()

        with open('%s/output.csv' % OUTPUT_PATH, 'w') as fout:
            writer = CSVKitDictWriter(fout, fieldnames=HEADER,
                                      extrasaction='ignore',
                                      quoting=QUOTE_ALL)
            writer.writeheader()
            with open(args.input, 'r') as f:
                reader = CSVKitDictReader(f)
                logger.info('start processing %s' % args.input)
                for ix, row in enumerate(reader):
                    if (ix + 1) % 100 == 0:
                        logger.debug("processed %s records" % (ix + 1))
                    if args.sample and (ix >= args.sample):
                        break
                    # Geocode
                    geocode_nominatim(row, geocoder)
                    # Write to csv file
                    writer.writerow(row)
                logger.info('finished processing %s' % args.input)
    finally:
        if args.no_cache:
            # Always persist cache file to disk
            persist_cache()
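load_geocoded_cache and geocode_nominatim are project helpers that do not appear in these excerpts. Given the address/latitude/longitude layout that persist_cache writes, a loader that rebuilds the in-memory cache dict might look like the sketch below (the exact cache path varies between the excerpts above); this is a guess at the real implementation, not a copy of it:

def load_geocoded_cache():
    """Hypothetical: rebuild the address -> (longitude, latitude) cache
    from the CSV written by persist_cache()."""
    path = '%s/%s.csv' % (INPUT_PATH, CACHE_FILE)
    if not os.path.exists(path):
        return
    with open(path, 'r') as f:
        reader = CSVKitDictReader(f)
        for row in reader:
            cache[row['address']] = (float(row['longitude']),
                                     float(row['latitude']))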
Example #17
    # The destination directory, `www/static-data` might not exist if you're
    # bootstrapping the project for the first time, so make sure it does before
    # trying to write the JSON.
    try:
        os.makedirs('www/static-data')
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    with open('www/static-data/books.json', 'wb') as writefile:
        writefile.write(json.dumps(book_list))

    with open('data/test-itunes-equiv.csv', 'w') as fout:
        writer = CSVKitDictWriter(fout,
                                  fieldnames=['title', 'isbn',
                                              'isbn13', 'itunes_id'],
                                  extrasaction='ignore')
        writer.writeheader()
        writer.writerows(book_list)

    with open('data/tag-audit.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['tag', 'slug', 'count'])
        for slug, count in tags.items():
            writer.writerow([SLUGS_TO_TAGS[slug], slug, count])
    logger.info("End.")


@task
def load_books():
    """
Example #18
def merge_external_links(
        books_csv_path=DEFAULT_BOOKS_CSV,
        links_json_path=DEFAULT_EXTERNAL_LINKS_JSON_PATH,
        output_csv_path=DEFAULT_EXTERNAL_LINKS_OUTPUT_CSV_PATH):
    """
    Create a CSV file containing external links.

    Create a CSV file containing external links that can be copied into the
    books Google Spreadsheet.

    Args:
        books_csv_path (str): Path to CSV file containing data from the books
            Google Spreadsheet.
        links_json_path (str): Path to JSON file created by
            `parse_external_links_csv()`.
        output_csv_path (str): Path to output CSV file.

    """
    fieldnames = [
        # Only include enough fields to identify the book
        'title',
        'isbn',
        'external_links_html',
    ]
    with open(links_json_path) as jsonf:
        lookup = json.load(jsonf)
        matched = set()

        with open(books_csv_path) as readfile:
            reader = CSVKitDictReader(readfile, encoding='utf-8')
            reader.fieldnames = [
                name.strip().lower() for name in reader.fieldnames
            ]

            with open(output_csv_path, 'wb') as fout:
                writer = CSVKitDictWriter(fout, fieldnames=fieldnames)
                writer.writeheader()

                for book in reader:
                    output_book = {
                        'title': book['title'],
                        'isbn': book['isbn'],
                        'external_links_html': '',
                    }

                    if book['isbn']:
                        try:
                            links, matching_isbn = lookup_links_by_isbn(
                                book['isbn'], lookup)
                            output_book['external_links_html'] = ','.join(
                                links)
                            matched.add(matching_isbn)
                        except KeyError:
                            # No matching member station coverage.  This is OK.
                            pass

                    writer.writerow(output_book)

            # Do an audit to see if there are any ISBNs in the member station
            # responses that didn't match books.
            for isbn in lookup:
                if isbn not in matched:
                    logger.warn("No matching book found for ISBN %s" % (isbn))
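lookup_links_by_isbn is also project code. Since the surrounding loop treats lookup as a dict keyed by ISBN and expects either a (links, matching_isbn) pair or a KeyError, a minimal sketch, ignoring any ISBN-10/ISBN-13 normalization the real helper may perform, could be:

def lookup_links_by_isbn(isbn, lookup):
    """Hypothetical: return (links, matching_isbn) for a book, raising
    KeyError when no member station coverage matches the ISBN."""
    key = isbn.replace('-', '').strip()
    return lookup[key], key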