def geocode_all(db, data_folder="geocoder/data",
                terms_folder="geocoder/terms", lines_per_insert=1000):
    print("Loading table...")
    # The query below seems not very efficient...
    # Maybe change it as the link says:
    # https://stackoverflow.com/questions/7389759/memory-efficient-built-in-sqlalchemy-iterator-generator
    non_geocoded = Execucao.query.filter(Execucao.searched == False).all()
    with Geocoder(data_folder, terms_folder) as geocoder:
        counter = ProgressCounter(len(non_geocoded), print_abs=True)
        to_be_inserted = 0
        for row in non_geocoded:
            cells = get_geolocable_cells(row)
            geoent = geocoder.geocode_list(cells)
            if geoent:
                lat, lon, reg = geoent.best_coords()
                if lat:
                    row.point = "POINT(%s %s)" % (lon, lat)
            # Mark the row as searched even when no coordinates were found
            row.searched = True
            to_be_inserted += 1
            if to_be_inserted == lines_per_insert:
                db.session.commit()
                to_be_inserted = 0
            counter.update()
        if to_be_inserted:
            db.session.commit()
        counter.end()
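# The comment above points at streaming the query instead of materializing it
# with .all(). A minimal sketch of that idea (not the project's code), assuming
# SQLAlchemy's Query.yield_per is acceptable here; len() is no longer available
# on the result, so ProgressCounter would need a separate count() query:
total = Execucao.query.filter(Execucao.searched == False).count()
non_geocoded = Execucao.query.filter(Execucao.searched == False).yield_per(1000)
counter = ProgressCounter(total, print_abs=True)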
def update_from_csv(db, csv):
    '''Update table using values from CSV.
    Slower than 'insert_csv' but raises no error if primary key already
    exists (just updates values).'''
    table = pd.read_csv(csv)
    pks = create_pks(table)
    counter = ProgressCounter(len(table))
    modified_counter = 0
    added_counter = 0
    for row_i, row in table.iterrows():
        code = pks.iloc[row_i]
        row_model = db.session.query(Execucao).filter_by(code=code).first()
        new_row = prepare_row(code, row)
        date = datetime.datetime.strptime(new_row['data']['datafinal'],
                                          '%Y-%m-%d')
        if row_model:
            modified = {}
            # Check if state was modified
            if row_model.state != new_row['state'].decode('utf8'):
                modified['state'] = (row_model.state, new_row['state'])
                row_model.state = new_row['state']
            # Check if a field in data was modified
            for key, new_value in new_row['data'].items():
                old_value = row_model.data.get(key)
                # Avoids confusion caused by new_value not being unicode
                if type(new_value) is str:
                    new_value = new_value.decode('utf8')
                    new_row['data'][key] = new_value
                # Avoids flagging data that changed from 0 to None
                if (old_value or new_value) and (old_value != new_value):
                    modified[key] = (old_value, new_value)
            # Avoids registering row as modified if only 'datafinal' changed
            if len(modified) == 1 and 'datafinal' in modified:
                modified = {}
            if modified:
                db.session.add(
                    History(event='modified', code=code, date=date,
                            data=modified))
                modified_counter += 1
            # Updates DB data even if only 'datafinal' changed
            row_model.data = new_row['data']
        else:
            db.session.add(
                History(event='created', code=code, date=date, data=new_row))
            db.session.add(Execucao(**new_row))
            added_counter += 1
        counter.update()
    counter.end()
    db.session.commit()
    print('Added/Modified/Total: %s/%s/%s'
          % (added_counter, modified_counter, len(table)))
def geocode_all(db, data_folder="geocoder/data",
                terms_folder="geocoder/terms", lines_per_insert=1000):
    print("Loading table...")
    non_geocoded = get_non_geocode(lines_per_insert)
    while non_geocoded:
        with Geocoder(data_folder, terms_folder) as geocoder:
            counter = ProgressCounter(len(non_geocoded), print_abs=True)
            to_be_inserted = 0
            for row in non_geocoded:
                cells = get_geolocable_cells(row)
                geoent = geocoder.geocode_list(cells)
                if geoent:
                    lat, lon, reg = geoent.best_coords()
                    if lat:
                        row.point = "POINT(%s %s)" % (lon, lat)
                row.searched = True
                to_be_inserted += 1
                if to_be_inserted == lines_per_insert:
                    db.session.commit()
                    to_be_inserted = 0
                counter.update()
            if to_be_inserted:
                db.session.commit()
            counter.end()
        non_geocoded = get_non_geocode(lines_per_insert)
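# geocode_all above relies on a get_non_geocode helper that is not defined in
# this file. A minimal sketch of what it is assumed to do (the real helper may
# differ): fetch at most `limit` rows that still need geocoding, so the while
# loop works through the table in batches.
def get_non_geocode(limit):
    return (Execucao.query
            .filter(Execucao.searched == False)
            .limit(limit)
            .all())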
def insert_all(db, csv_file='../data/urls.csv', lines_per_insert=100):
    print("Getting Contratos urls from: " + csv_file)
    data = pd.read_csv(csv_file)
    total = len(data)
    counter = ProgressCounter(total)
    to_update = 0
    updated = 0
    for di, d in data.iterrows():
        to_update += 1
        stmt = update(Contrato).values({
            'file_url': d['file_url'],
            'txt_file_url': d['file_txt']
        }).where(Contrato.numero == str(d['numero']))
        db.session.execute(stmt)
        if to_update == lines_per_insert or (updated + to_update) == total:
            counter.update(to_update)
            updated += to_update
            to_update = 0
            db.session.commit()
    counter.end()
    print("Updated {} Contratos".format(updated))
def insert_all(db, csv_file='../data/contratos-2014.xls', lines_per_insert=100):
    print("Importing Contratos from: {}".format(csv_file))
    data = pd.read_excel(csv_file)
    data = data.fillna(-1)
    to_insert = []
    total = len(data)
    inserted = 0
    counter = ProgressCounter(total)
    for row_i, row in data.iterrows():
        r = {}
        if len(to_insert) == lines_per_insert:
            inserted += len(to_insert)
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)
        r['numero'] = int(row_i) + 1
        r['orgao'] = row['Orgao']
        r['data_assinatura'] = parse_date(row['Data da Assinatura'])
        r['vigencia'] = (int(row['Vigencia'])
                         if not np.isnan(row['Vigencia']) else -1)
        r['objeto'] = row['Objeto']
        r['modalidade'] = row['Modalidade']
        r['evento'] = row['Evento']
        r['processo_administrativo'] = row['Processo Administrativo']
        r['cnpj'] = row['CNPJ']
        r['nome_fornecedor'] = row['Nome']
        r['valor'] = parse_money(row['Valor'])
        r['licitacao'] = row['Licitacao\n']
        r['data_publicacao'] = parse_date(row['Data Publicacao'])
        to_insert.append(r)
    if len(to_insert) > 0:
        inserted += len(to_insert)
        insert_rows(db, to_insert)
    counter.end()
    print("Imported {} Contratos".format(inserted))
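# parse_date and parse_money are helpers defined elsewhere in the project.
# Purely hypothetical sketches of what the importer above expects from them
# (formats and placeholder handling are assumptions, not the real code):
def parse_date(value):
    # 'dd/mm/yyyy' strings -> date; pass through datetimes; -1/None -> None
    if value in (-1, None):
        return None
    if isinstance(value, datetime.datetime):
        return value.date()
    return datetime.datetime.strptime(str(value).strip(), '%d/%m/%Y').date()

def parse_money(value):
    # Brazilian-formatted strings ('1.234,56') -> float; -1/None -> None
    if value in (-1, None):
        return None
    if isinstance(value, (int, float)):
        return float(value)
    return float(str(value).replace('R$', '')
                 .replace('.', '').replace(',', '.').strip())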
def insert_csv(db, csv, lines_per_insert):
    table = pd.read_csv(csv)
    pks = create_pks(table)
    counter = ProgressCounter(len(table))
    to_insert = []
    for row_i, row in table.iterrows():
        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)
        to_insert.append(prepare_row(pks.iloc[row_i], row))
    if len(to_insert) > 0:
        insert_rows(db, to_insert)
    counter.end()
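# insert_rows is another project helper that is not shown here. A minimal
# sketch of the bulk insert it is assumed to perform, targeting the Execucao
# table via SQLAlchemy Core (an illustration only; each importer presumably
# points at its own model):
def insert_rows(db, rows):
    db.session.execute(Execucao.__table__.insert(), rows)
    db.session.commit()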
def download_contratos_files(csv_file='../data/urls.csv',
                             directory='../data/contratos'):
    if not os.path.exists(directory):
        os.makedirs(directory)

    def download_and_save(url, directory):
        if not isinstance(url, basestring):
            return
        import os.path
        import urllib2
        filename = url.split('/')[-1]
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            file = urllib2.urlopen(url)
            content = file.read()
            with open(path, 'w') as f:
                f.write(content)

    print("Downloading Contratos files from: {}".format(csv_file))
    data = pd.read_csv(csv_file)
    total = len(data)
    counter = ProgressCounter(total)
    downloaded = 0
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = dict(
            (executor.submit(download_and_save, d['file_txt'], directory),
             d['file_txt'])
            for di, d in data.iterrows())
        for future in futures.as_completed(future_to_url):
            url = future_to_url[future]
            counter.update(1)
            downloaded += 1
            if future.exception() is not None:
                print('%r generated an exception: %s'
                      % (url, future.exception()))
    counter.end()
    print("Downloaded {} Contratos".format(downloaded))
def insert_csv(db, csv, lines_per_insert):
    print(csv)
    table = pd.read_csv(csv)
    counter = ProgressCounter(len(table))

    # ## Add code column ## #
    code_series = [col for name, col in table.iteritems()
                   if name[:3].lower() == "cd_"]
    # this column doesn't start with "cd_" but is a code
    code_series.append(table["projetoatividade"])
    # create table of codes
    code_table = pd.concat(code_series, axis=1)
    # create PK Series
    pks = pd.Series(['.'.join([str(value) for value in row[1]])
                     for row in code_table.iterrows()],
                    name="code")
    # check pk uniqueness
    if pks.duplicated().values.sum() > 0:
        print("Warning: There are duplicated pks!")
    # add the pk series to the table
    # table = pd.concat([table, pks], axis=1)
    # ## --------------- ## #

    to_insert = []
    for row_i, row in table.iterrows():
        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)
        to_insert.append({"code": pks.iloc[row_i],
                          "data": dict(row.iteritems())})
    if len(to_insert) > 0:
        insert_rows(db, to_insert)
    counter.end()
def update_from_csv(db, csv):
    '''Update table using values from CSV.
    Slower than 'insert_csv' but raises no error if primary key already
    exists (just updates values).'''
    table = pd.read_csv(csv)
    pks = create_pks(table)
    counter = ProgressCounter(len(table))
    for row_i, row in table.iterrows():
        code = pks.iloc[row_i]
        row_model = db.session.query(Execucao).filter_by(code=code).first()
        new_row = prepare_row(code, row)
        if row_model:
            for key, new_value in new_row.iteritems():
                setattr(row_model, key, new_value)
                # old_value = getattr(row_model, key)
                # if old_value != new_value:
                #     print(key, old_value, new_value)
                #     setattr(row_model, key, new_value)
        else:
            db.session.add(Execucao(**new_row))
        counter.update()
    counter.end()
    db.session.commit()
def insert_all(db, csv_file='../data/receitas_min.csv', lines_per_insert=100):
    print("Importing Revenues from: " + csv_file)
    data = pd.read_csv(csv_file, encoding='utf8')
    cache = {}
    to_insert = []
    counter = ProgressCounter(len(data))
    for row_i, row in data.iterrows():
        r = {}
        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)
        r['original_code'] = row['codigo']
        r['description'] = row['descricao']
        r['date'] = parse_date(row['data'])
        r['monthly_outcome'] = parse_money(row['realizado_mensal'])
        r['monthly_predicted'] = parse_money(row['previsto_mensal'])
        code_parsed = parse_code(row['codigo'])
        r['economical_category'] = code_parsed[0]
        # Insert code reference
        code_parts = map(int, r['original_code'].split('.'))
        len_cp = len(code_parts)
        for i in range(len_cp):
            code = '.'.join(map(str, code_parts[:len_cp - i]))
            if code not in cache:
                code_result = db.session.query(RevenueCode.id)\
                    .filter(RevenueCode.code == code).all()
                if code_result:
                    cache[code] = code_result[0][0]
                    r['code_id'] = code_result[0][0]
                    break
            else:
                r['code_id'] = cache[code]
                break
        else:
            r['code_id'] = None
        r['economical_subcategory'] = (code_parsed[1]
                                       if len(code_parsed) >= 2 else None)
        r['source'] = code_parsed[2] if len(code_parsed) >= 3 else None
        r['rubric'] = code_parsed[3] if len(code_parsed) >= 4 else None
        r['paragraph'] = code_parsed[4] if len(code_parsed) >= 5 else None
        r['subparagraph'] = code_parsed[5] if len(code_parsed) == 6 else None
        to_insert.append(r)
    if len(to_insert) > 0:
        insert_rows(db, to_insert)
    counter.end()
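# parse_code is also defined elsewhere in the project. The column handling
# above expects it to split a hierarchical revenue code such as
# '1.1.1.2.01.00' into its levels (category, subcategory, source, rubric,
# paragraph, subparagraph); a hypothetical sketch:
def parse_code(code):
    return str(code).split('.')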