Example #1
0
def go(path, max_attempts=5):
    """Process rows from the CSV at ``path``, then build missing stats.

    Rows are handed to ``process_row`` one at a time.  Once
    ``max_attempts`` consecutive rows come back falsy (not updated), the
    remainder of the file is skipped.  Afterwards ``make_stats()`` is called
    on every ``Interest`` matching
    ``canonical__isnull=True, stats__isnull=True``.

    :param path: location of the CSV file to read.
    :param max_attempts: number of consecutive non-updated rows tolerated
        before giving up on the rest of the file.
    :raises SystemExit: if ``path`` is not an existing file.
    """
    if not os.path.isfile(path):
        # Raise SystemExit directly: the ``exit`` builtin is injected by the
        # ``site`` module for interactive sessions and is not guaranteed to
        # exist (e.g. ``python -S`` or frozen executables).
        raise SystemExit(
            'Make sure you ran `make nomenklatura` in the data dir.')
    with open(path, 'rb') as f:
        reader = DictReader(f)
        update_attempts = 0  # consecutive rows that were not updated
        restarts = 0  # how many times the streak above was broken
        for row in reader:
            if process_row(row):
                if update_attempts:
                    update_attempts = 0  # reset counter
                    restarts += 1  # log how many times we've reset the counter
                continue
            update_attempts += 1
            if update_attempts >= max_attempts:
                print('skipping the rest after trying {} and {} resets'
                      .format(update_attempts, restarts))
                # we're just running through old entries at this point
                # since the datafile is in reverse chronological order
                break
    # these `Interest`s need updated stats
    for interest in Interest.objects.filter(
            canonical__isnull=True, stats__isnull=True):
        # Single-argument call form: same text as the old print statement,
        # and valid on both Python 2 and Python 3.
        print('update {}'.format(interest))
        interest.make_stats()
Example #2
0
def scrape(path):
    """Load the CSV at ``path`` and recreate its year's ``Compensation`` rows.

    The file is read twice: once to count lines so the progress bar has a
    total, then again through ``DictReader``.  On the first data row, every
    existing ``Compensation`` whose ``annum__year`` equals that row's
    ``YEAR_APPL`` is deleted.  Each row is passed to ``process_row`` together
    with the previous call's result, and any truthy ``compensation``
    attribute on the result is collected and bulk-created at the end.

    :param path: location of the CSV file to read.
    """
    logger.info("Processing %s" % path)
    with open(path, 'rb') as f:
        # Start from zero so the name is bound even for an empty file; a
        # ``for`` loop never assigns its target when there is nothing to
        # iterate over, which used to surface as an UnboundLocalError below.
        total_rows = 0
        # enumerate() is zero-based, so the last index is already the line
        # count minus one, i.e. the row count without the header.
        for total_rows, _line in enumerate(f):
            pass
        f.seek(0)
        reader = DictReader(f)
        prev_pass = None
        first = True
        new_compensations = []
        for row in tqdm(reader,
                        total=total_rows,
                        leave=True,
                        mininterval=1.0,
                        miniters=100):
            if first:
                # wipe all `Compensation` objects for the year to avoid double
                # counting corrected compensations
                year = row['YEAR_APPL']
                Compensation.objects.filter(annum__year=year).delete()
                first = False
            prev_pass = process_row(row, prev_pass=prev_pass)
            if prev_pass.compensation:
                new_compensations.append(prev_pass.compensation)
        logger.debug('{} new compensations'.format(len(new_compensations)))
        Compensation.objects.bulk_create(new_compensations)
Example #3
0
def scrape(path, logger=logger):
    """Run every row of the CSV at ``path`` through ``process_row``.

    Each call receives the previous call's return value as ``last_pass``
    (``None`` for the first row).

    :param path: location of the CSV file to read.
    :param logger: logger used for the "Processing" message.
    """
    message = "Processing %s" % path
    logger.info(message)
    carried = None
    with open(path, 'rb') as csv_file:
        for record in DictReader(csv_file):
            carried = process_row(record, last_pass=carried)
Example #4
0
def generate_test_row(path, **kwargs):
    """Pretty-print one randomly picked row from the CSV at ``path``.

    Every row gets a 1-in-1000 chance of being picked; reading stops at the
    first pick.  Nothing is printed when no row is picked.

    :param path: location of the CSV file to sample.
    :param kwargs: accepted and ignored.
    """
    import random
    import pprint

    logger.info("Processing %s" % path)
    with open(path, 'rb') as csv_file:
        for record in DictReader(csv_file, encoding='latin_1'):
            if random.randint(0, 999) >= 1:  # adjust this to go deeper
                continue
            pprint.pprint(record)
            break
Example #5
0
def generate_test_row(path, **kwargs):
    """Helper to replace `scrape` to print out a sample for testing.

    Every row of the CSV at ``path`` gets a 1-in-100 chance of being
    pretty-printed; reading stops at the first row that is printed.
    """
    import random
    import pprint

    with open(path, 'rb') as csv_file:
        for record in DictReader(csv_file):
            if random.randint(0, 99) >= 1:  # adjust this to go deeper
                continue
            pprint.pprint(record)
            break
Example #6
0
def go(path):
    """Process every row of the CSV at ``path``, then build missing stats.

    Each row is handed to ``process_row``.  Afterwards ``make_stats()`` is
    called on every ``Interest`` matching
    ``canonical__isnull=True, stats__isnull=True``.

    :param path: location of the CSV file to read.
    :raises SystemExit: if ``path`` is not an existing file.
    """
    if not os.path.isfile(path):
        # Raise SystemExit directly: the ``exit`` builtin is injected by the
        # ``site`` module for interactive sessions and is not guaranteed to
        # exist (e.g. ``python -S`` or frozen executables).
        raise SystemExit(
            'Make sure you ran `make nomenklatura` in the data dir.')
    with open(path, 'rb') as f:
        reader = DictReader(f)
        for row in reader:
            process_row(row)
    for interest in Interest.objects.filter(canonical__isnull=True,
                                            stats__isnull=True):
        # Single-argument call form: same text as the old print statement,
        # and valid on both Python 2 and Python 3.
        print('update {}'.format(interest))
        interest.make_stats()
Example #7
0
    def handle(self, csv_path, *args, **options):
        from tx_lobbying.models import Address

        with open(csv_path, 'rb') as f:
            reader = DictReader(f)
            for row in reader:
                address, created = Address.objects.update_or_create(
                    address1=row['address1'],
                    address2=row['address2'],
                    city=row['city'],
                    state=row['state'],
                    zipcode=row['zipcode'],
                    defaults=dict(
                        coordinate=row['coordinate'],
                        coordinate_quality=row['coordinate_quality'],
                    ))
                print address, created
Example #8
0
def process_csv(path, _inner_func, **kwargs):
    """Feed each row of the CSV at ``path`` to ``_inner_func``.

    Progress is logged every 1000 rows.  When ``YEAR_START`` is truthy, rows
    whose ``YEAR_APPL`` is below it are skipped.  A ``ValueError`` raised by
    ``_inner_func`` is logged as a warning and the row is skipped, leaving
    ``last_pass`` at its previous value.

    :param path: location of the CSV file to read (decoded as latin_1).
    :param _inner_func: callable invoked as
        ``_inner_func(row, last_pass=last_pass, **kwargs)``; its return value
        becomes ``last_pass`` for the next row.
    :param kwargs: extra keyword arguments forwarded to ``_inner_func``.
    """
    logger.info("Processing %s" % path)
    total = get_record_count(path)
    with open(path, 'rb') as f:
        reader = DictReader(f, encoding='latin_1')
        # store output from the last pass since coversheet and lobbyist don't
        # really change row to row to save some queries.
        last_pass = None
        for i, row in enumerate(reader):
            if not i % 1000:
                logger.info(u'{}/{} filed date: {} report date:{}'
                    .format(
                        i,
                        total,
                        row.get('FILED_DATE'),
                        row.get('RPT_DATE')
                    ))
            if YEAR_START and int(row['YEAR_APPL']) < YEAR_START:
                continue
            try:
                last_pass = _inner_func(row, last_pass=last_pass, **kwargs)
            except ValueError as e:
                # ``Logger.warn`` is a deprecated alias; ``warning`` is the
                # supported spelling.
                logger.warning('Row missing data: %s, %s' % (row, e))