def geocode_all(db,
                data_folder="geocoder/data",
                terms_folder="geocoder/terms",
                lines_per_insert=1000):

    print("Loading table...")
    # The query below loads every non-geocoded row into memory at once, which
    # is not very efficient; the link below suggests a streaming alternative.
    # https://stackoverflow.com/questions/7389759/memory-efficient-built-in-sqlalchemy-iterator-generator
    non_geocoded = Execucao.query.filter(Execucao.searched == False).all()
    with Geocoder(data_folder, terms_folder) as geocoder:
        counter = ProgressCounter(len(non_geocoded), print_abs=True)
        to_be_inserted = 0
        for row in non_geocoded:
            cells = get_geolocable_cells(row)
            geoent = geocoder.geocode_list(cells)
            if geoent:
                lat, lon, reg = geoent.best_coords()
                if lat:
                    row.point = "POINT(%s %s)" % (lon, lat)
            row.searched = True
            to_be_inserted += 1
            if to_be_inserted == lines_per_insert:
                db.session.commit()
                to_be_inserted = 0
            counter.update()
        if to_be_inserted:
            db.session.commit()
        counter.end()
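
The comment at the top of geocode_all flags the memory cost of loading every pending row with .all(). Below is a minimal sketch of the streaming alternative the linked answer points at, assuming SQLAlchemy's Query.yield_per; the iter_non_geocoded helper name is hypothetical and not part of the project:

def iter_non_geocoded(batch_size=1000):
    # Stream pending rows in batches instead of materializing them all.
    query = Execucao.query.filter(Execucao.searched == False)  # noqa: E712
    total = query.count()  # total is only needed to size the progress counter
    return total, query.yield_per(batch_size)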
def update_from_csv(db, csv):
    '''Update table using values from CSV. Slower than 'insert_csv' but raises
    no error if primary key already exists (just updates values).'''
    table = pd.read_csv(csv)
    pks = create_pks(table)
    counter = ProgressCounter(len(table))
    modified_counter = 0
    added_counter = 0

    for row_i, row in table.iterrows():
        code = pks.iloc[row_i]
        row_model = db.session.query(Execucao).filter_by(code=code).first()
        new_row = prepare_row(code, row)
        date = datetime.datetime.strptime(new_row['data']['datafinal'],
                                          '%Y-%m-%d')
        if row_model:
            modified = {}

            # Check if state was modified
            if row_model.state != new_row['state'].decode('utf8'):
                modified['state'] = (row_model.state, new_row['state'])
                row_model.state = new_row['state']

            # Check if a field in data was modified
            for key, new_value in new_row['data'].items():
                old_value = row_model.data.get(key)

                # Avoid confusion caused by new_value not being unicode
                if type(new_value) is str:
                    new_value = new_value.decode('utf8')
                    new_row['data'][key] = new_value

                # Skip changes between falsy values (e.g. 0 to None)
                if (old_value or new_value) and (old_value != new_value):
                    modified[key] = (old_value, new_value)

            # Avoid registering the row as modified if only datafinal changed
            if len(modified) == 1 and 'datafinal' in modified:
                modified = {}

            if modified:
                db.session.add(
                    History(event='modified',
                            code=code,
                            date=date,
                            data=modified))
                modified_counter += 1

            # Updates DB data even if only 'datafinal' changed
            row_model.data = new_row['data']
        else:
            db.session.add(
                History(event='created', code=code, date=date, data=new_row))
            db.session.add(Execucao(**new_row))
            added_counter += 1
        counter.update()
    counter.end()
    db.session.commit()
    print('Added/Modified/Total: %s/%s/%s' %
          (added_counter, modified_counter, len(table)))
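
Every loader in this listing drives the same ProgressCounter interface: build it with a total (optionally print_abs=True), call update() per row or update(n) per batch, and end() when finished. The real class lives elsewhere in the project; a rough stand-in with that interface, useful for reading these snippets in isolation:

class ProgressCounter(object):
    # Sketch only: the project's implementation may format output differently.
    def __init__(self, total, print_abs=False):
        self.total = total
        self.print_abs = print_abs
        self.done = 0

    def update(self, n=1):
        self.done += n
        if self.print_abs:
            print("%d/%d" % (self.done, self.total))
        else:
            print("%.1f%%" % (100.0 * self.done / max(self.total, 1)))

    def end(self):
        print("Done: %d rows processed." % self.done)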
def geocode_all(db, data_folder="geocoder/data",
                terms_folder="geocoder/terms",
                lines_per_insert=1000):
    print("Loading table...")
    non_geocoded = get_non_geocode(lines_per_insert)
    while non_geocoded:
        with Geocoder(data_folder, terms_folder) as geocoder:
            counter = ProgressCounter(len(non_geocoded), print_abs=True)
            to_be_inserted = 0
            for row in non_geocoded:
                cells = get_geolocable_cells(row)
                geoent = geocoder.geocode_list(cells)
                if geoent:
                    lat, lon, reg = geoent.best_coords()
                    if lat:
                        row.point = "POINT(%s %s)" % (lon, lat)
                row.searched = True
                to_be_inserted += 1
                if to_be_inserted == lines_per_insert:
                    db.session.commit()
                    to_be_inserted = 0
                counter.update()
            if to_be_inserted:
                db.session.commit()
            counter.end()
        non_geocoded = get_non_geocode(lines_per_insert)
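
This batched variant relies on a get_non_geocode helper that is not shown in this listing. A plausible sketch, assuming it simply limits the earlier query to one batch; the outer loop then terminates because each processed row is committed with searched set to True:

def get_non_geocode(limit):
    # Assumed implementation: fetch at most `limit` rows still pending.
    return (Execucao.query
            .filter(Execucao.searched == False)  # noqa: E712
            .limit(limit)
            .all())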
def insert_all(db, csv_file='../data/urls.csv', lines_per_insert=100):
    print("Getting Contratos urls from: " + csv_file)
    data = pd.read_csv(csv_file)

    total = len(data)
    counter = ProgressCounter(total)

    to_update = 0
    updated = 0
    for di, d in data.iterrows():
        to_update += 1
        stmt = update(Contrato).values({
            'file_url': d['file_url'],
            'txt_file_url': d['file_txt']
        }).where(Contrato.numero == str(d['numero']))
        db.session.execute(stmt)
        if to_update == lines_per_insert or (updated + to_update) == total:
            counter.update(to_update)
            updated += to_update
            to_update = 0
            db.session.commit()

    counter.end()

    print("Updated {} Contratos".format(updated))
def insert_all(db,
               csv_file='../data/contratos-2014.xls',
               lines_per_insert=100):
    print("Importing Contratos from: {}".format(csv_file))
    data = pd.read_excel(csv_file)
    data = data.fillna(-1)

    cache = {}
    to_insert = []
    total = len(data)
    inserted = 0
    counter = ProgressCounter(total)

    for row_i, row in data.iterrows():

        r = {}

        if len(to_insert) == lines_per_insert:
            inserted += len(to_insert)
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)

        r['numero'] = int(row_i) + 1
        r['orgao'] = row['Orgao']
        r['data_assinatura'] = parse_date(row['Data da Assinatura'])
        r['vigencia'] = int(
            row['Vigencia']) if not np.isnan(row['Vigencia']) else -1
        r['objeto'] = row['Objeto']
        r['modalidade'] = row['Modalidade']
        r['evento'] = row['Evento']
        r['processo_administrativo'] = row['Processo Administrativo']
        r['cnpj'] = row['CNPJ']
        r['nome_fornecedor'] = row['Nome']
        r['valor'] = parse_money(row['Valor'])
        r['licitacao'] = row['Licitacao\n']
        r['data_publicacao'] = parse_date(row['Data Publicacao'])

        to_insert.append(r)

    if len(to_insert) > 0:
        inserted += len(to_insert)
        insert_rows(db, to_insert)

    counter.end()

    print("Imported {} Contratos".format(inserted))
def insert_csv(db, csv, lines_per_insert):
    table = pd.read_csv(csv)
    pks = create_pks(table)
    # db = get_db()
    counter = ProgressCounter(len(table))

    to_insert = []
    for row_i, row in table.iterrows():
        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)
        to_insert.append(prepare_row(pks.iloc[row_i], row))

    if len(to_insert) > 0:
        insert_rows(db, to_insert)

    counter.end()
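
insert_rows is the shared bulk-insert helper, also not shown here. A minimal sketch for the Execucao case, assuming a plain SQLAlchemy Core multi-row INSERT; each module would target its own table, and the real helper may differ:

def insert_rows(db, rows):
    # Assumed implementation: one executemany-style INSERT per batch is far
    # cheaper than adding ORM objects one by one.
    db.session.execute(Execucao.__table__.insert(), rows)
    db.session.commit()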
def download_contratos_files(csv_file='../data/urls.csv',
                             directory='../data/contratos'):
    if not os.path.exists(directory):
        os.makedirs(directory)

    def download_and_save(url, directory):
        if not isinstance(url, basestring):
            return

        import os.path
        import urllib2

        filename = url.split('/')[-1]
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            file = urllib2.urlopen(url)
            content = file.read()
            with open(path, 'w') as f:
                f.write(content)

    print("Downloading Contratos files from: {}".format(csv_file))
    data = pd.read_csv(csv_file)

    total = len(data)
    counter = ProgressCounter(total)

    downloaded = 0
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = dict(
            (executor.submit(download_and_save, d['file_txt'], directory),
             d['file_txt']) for di, d in data.iterrows())

        for future in futures.as_completed(future_to_url):
            url = future_to_url[future]
            counter.update(1)
            downloaded += 1
            if future.exception() is not None:
                print('%r generated an exception: %s' %
                      (url, future.exception()))

    counter.end()

    print("Downloaded {} Contratos".format(downloaded))
def insert_csv(db, csv, lines_per_insert):
    print(csv)
    table = pd.read_csv(csv)
    counter = ProgressCounter(len(table))

    # ## Add code column ## #
    code_series = [
        col for name, col in table.iteritems() if name[:3].lower() == "cd_"
    ]
    # this column doesn't start with "cd_" but is a code
    code_series.append(table["projetoatividade"])
    # create table of codes
    code_table = pd.concat(code_series, axis=1)
    # create PK Series
    pks = pd.Series(
        ['.'.join([str(value) for value in row[1]])
         for row in code_table.iterrows()],
        name="code")
    # check pk uniqueness
    if pks.duplicated().values.sum() > 0:
        print("Warning: There are duplicated pks!")
    # add the pk series to the table
    # table = pd.concat([table, pks], axis=1)
    # ## --------------- ## #

    to_insert = []

    for row_i, row in table.iterrows():

        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)

        to_insert.append({"code": pks.iloc[row_i], "data": dict(row.iterkv())})

    if len(to_insert) > 0:
        insert_rows(db, to_insert)

    counter.end()
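
To illustrate the primary key built above (column names and values made up for the example): a row with cd_orgao = 16, cd_unidade = 10 and projetoatividade = 2100 gets the code "16.10.2100", i.e. every code column joined by dots in table order.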
def update_from_csv(db, csv):
    '''Update table using values from CSV. Slower than 'insert_csv' but raises
    no error if primary key already exists (just updates values).'''
    table = pd.read_csv(csv)
    pks = create_pks(table)
    counter = ProgressCounter(len(table))
    for row_i, row in table.iterrows():
        code = pks.iloc[row_i]
        row_model = db.session.query(Execucao).filter_by(code=code).first()
        new_row = prepare_row(code, row)
        if row_model:
            for key, new_value in new_row.iteritems():
                setattr(row_model, key, new_value)
                # old_value = getattr(row_model, key)
                # if old_value != new_value:
                #     print(key, old_value, new_value)
                #     setattr(row_model, key, new_value)
        else:
            db.session.add(Execucao(**new_row))
        counter.update()
    counter.end()
    db.session.commit()
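
Both update_from_csv variants and insert_csv lean on prepare_row, which is defined elsewhere. A plausible sketch of the shape it returns, judging from how new_row is used above (Execucao(**new_row), new_row['state'], new_row['data']); the 'state' value here is a placeholder, since how the project derives it is not shown:

def prepare_row(code, row):
    # Sketch only: pack a CSV row into the dict shape Execucao expects.
    return {
        'code': code,       # primary key built from the code columns
        'state': None,      # placeholder; the real helper derives a state string
        'data': dict(row),  # raw column -> value mapping for the data field
    }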
def insert_all(db, csv_file='../data/receitas_min.csv', lines_per_insert=100):
    print("Importing Revenues from: " + csv_file)
    data = pd.read_csv(csv_file, encoding='utf8')

    cache = {}
    to_insert = []
    counter = ProgressCounter(len(data))

    for row_i, row in data.iterrows():

        r = {}

        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)

        r['original_code'] = row['codigo']
        r['description'] = row['descricao']
        r['date'] = parse_date(row['data'])
        r['monthly_outcome'] = parse_money(row['realizado_mensal'])
        r['monthly_predicted'] = parse_money(row['previsto_mensal'])
        code_parsed = parse_code(row['codigo'])
        r['economical_category'] = code_parsed[0]

        # Insert code reference
        code_parts = map(int, r['original_code'].split('.'))
        len_cp = len(code_parts)

        for i in range(len_cp):
            code = '.'.join(map(str, code_parts[:len_cp - i]))
            if code not in cache:
                code_result = db.session.query(
                    RevenueCode.id).filter(RevenueCode.code == code).all()
                if code_result:
                    cache[code] = code_result[0][0]
                    r['code_id'] = code_result[0][0]
                    break
            else:
                r['code_id'] = cache[code]
                break
        else:
            r['code_id'] = None

        if len(code_parsed) >= 2:
            r['economical_subcategory'] = code_parsed[1]
        else:
            r['economical_subcategory'] = None

        if len(code_parsed) >= 3:
            r['source'] = code_parsed[2]
        else:
            r['source'] = None

        if len(code_parsed) >= 4:
            r['rubric'] = code_parsed[3]
        else:
            r['rubric'] = None

        if len(code_parsed) >= 5:
            r['paragraph'] = code_parsed[4]
        else:
            r['paragraph'] = None

        if len(code_parsed) == 6:
            r['subparagraph'] = code_parsed[5]
        else:
            r['subparagraph'] = None

        to_insert.append(r)

    if len(to_insert) > 0:
        insert_rows(db, to_insert)

    counter.end()
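
parse_code is another helper not shown in this listing. A plausible sketch, assuming the dotted revenue code (e.g. the made-up "1.1.1.2.01.00") maps onto the hierarchy consumed above (category, subcategory, source, rubric, paragraph, subparagraph), with empty trailing levels dropped:

def parse_code(code):
    # Assumed implementation: split the dotted code into its levels and drop
    # trailing zero parts so len(code_parsed) reflects the known depth.
    parts = [int(part) for part in str(code).split('.')]
    while parts and parts[-1] == 0:
        parts.pop()
    return parts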