def analyseRemote(self, parser, since=None, limit=None):
    """Fetch all pages from the remote API endpoint and feed each item
    to `parser`.

    :param parser: object exposing ``analyseWpApiObj(item)``
    :param since: currently ignored (todo: implement incremental fetch)
    :param limit: optional maximum number of items to process
    """
    if since:
        pass  # todo: implement since

    resultCount = 0
    # apiIterator = self.ApiIterator(self.service, self.endpoint_plural)
    apiIterator = self.getIterator(self.endpoint_plural)
    progressCounter = None
    for page in apiIterator:
        if progressCounter is None:
            # total_items is only known once the first page has been fetched
            total_items = apiIterator.total_items
            if limit:
                total_items = min(limit, total_items)
            progressCounter = ProgressCounter(total_items)
        progressCounter.maybePrintUpdate(resultCount)
        # if Registrar.DEBUG_API:
        #     Registrar.registerMessage('processing page: %s' % str(page))
        if self.endpoint_plural in page:
            for page_item in page.get(self.endpoint_plural):
                parser.analyseWpApiObj(page_item)
                resultCount += 1
                # BUGFIX: was `resultCount > limit`, which processed one
                # item more than the requested limit before returning.
                if limit and resultCount >= limit:
                    if Registrar.DEBUG_API:
                        Registrar.registerMessage('reached limit, exiting')
                    return
def geocode_all(db, data_folder="geocoder/data", terms_folder="geocoder/terms", lines_per_insert=1000):
    """Geocode every row not yet marked as searched, committing the
    session in batches of `lines_per_insert` rows."""
    print("Loading table...")
    # The query below seems not very efficient...
    # Maybe change it as the link says.
    # https://stackoverflow.com/questions/7389759/memory-efficient-built-in-sqlalchemy-iterator-generator
    pending_rows = Execucao.query.filter(Execucao.searched == False).all()
    with Geocoder(data_folder, terms_folder) as geocoder:
        progress = ProgressCounter(len(pending_rows), print_abs=True)
        uncommitted = 0
        for record in pending_rows:
            geoent = geocoder.geocode_list(get_geolocable_cells(record))
            if geoent:
                lat, lon, reg = geoent.best_coords()
                if lat:
                    record.point = "POINT(%s %s)" % (lon, lat)
            # Flag the row as processed whether or not coordinates were found
            record.searched = True
            uncommitted += 1
            if uncommitted == lines_per_insert:
                db.session.commit()
                uncommitted = 0
            progress.update()
        if uncommitted:
            db.session.commit()
        progress.end()
def update_from_csv(db, csv):
    """Update table using values from CSV.

    Slower than 'insert_csv' but raises no error if primary key
    already exists (just updates values).

    For each CSV row: if a row with the same code exists, diff its
    fields and record a History "modified" event for real changes;
    otherwise insert a new Execucao and record a "created" event.
    """
    table = pd.read_csv(csv)
    pks = create_pks(table)
    counter = ProgressCounter(len(table))
    modified_counter = 0
    added_counter = 0
    for row_i, row in table.iterrows():
        code = pks.iloc[row_i]
        row_model = db.session.query(Execucao).filter_by(code=code).first()
        new_row = prepare_row(code, row)
        # History events are dated by the row's "datafinal" field
        date = datetime.datetime.strptime(new_row["data"]["datafinal"], "%Y-%m-%d")
        if row_model:
            modified = {}
            # Check if state was modified
            if row_model.state != new_row["state"].decode("utf8"):
                modified["state"] = (row_model.state, new_row["state"])
                row_model.state = new_row["state"]
            # Check if a field in data was modified
            for key, new_value in new_row["data"].items():
                old_value = row_model.data.get(key)
                # Avoid false diffs caused by new_value not being unicode
                # (Python 2: compare unicode against unicode)
                if type(new_value) is str:
                    new_value = new_value.decode("utf8")
                    new_row["data"][key] = new_value
                # Avoid flagging data that only changed from 0 to None
                if (old_value or new_value) and (old_value != new_value):
                    modified[key] = (old_value, new_value)
            # Avoid registering the row as modified if only "datafinal" changed
            if len(modified) == 1 and "datafinal" in modified:
                modified = {}
            if modified:
                db.session.add(History(event="modified", code=code, date=date, data=modified))
                modified_counter += 1
            # Update the DB data even if only 'datafinal' changed
            row_model.data = new_row["data"]
        else:
            db.session.add(History(event="created", code=code, date=date, data=new_row))
            db.session.add(Execucao(**new_row))
            added_counter += 1
        counter.update()
    counter.end()
    db.session.commit()
    print("Added/Modified/Total: %s/%s/%s" % (added_counter, modified_counter, len(table)))
def insert_all(db, csv_file='../data/contratos-2014.xls', lines_per_insert=100):
    """Import Contratos rows from an Excel spreadsheet in batches.

    :param db: database handle accepted by `insert_rows`
    :param csv_file: path to the .xls source file
    :param lines_per_insert: rows accumulated before each bulk insert
    """
    print("Importing Contratos from: {}".format(csv_file))
    data = pd.read_excel(csv_file)
    # -1 is the sentinel used for missing values
    data = data.fillna(-1)
    # removed unused local `cache = {}`
    to_insert = []
    total = len(data)
    inserted = 0
    counter = ProgressCounter(total)
    for row_i, row in data.iterrows():
        r = {}
        if len(to_insert) == lines_per_insert:
            inserted += len(to_insert)
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)
        r['numero'] = int(row_i) + 1
        r['orgao'] = row['Orgao']
        r['data_assinatura'] = parse_date(row['Data da Assinatura'])
        r['vigencia'] = int(row['Vigencia']) if not np.isnan(row['Vigencia']) else -1
        r['objeto'] = row['Objeto']
        r['modalidade'] = row['Modalidade']
        r['evento'] = row['Evento']
        r['processo_administrativo'] = row['Processo Administrativo']
        r['cnpj'] = row['CNPJ']
        r['nome_fornecedor'] = row['Nome']
        r['valor'] = parse_money(row['Valor'])
        # NOTE: the source spreadsheet's column header really ends with '\n'
        r['licitacao'] = row['Licitacao\n']
        r['data_publicacao'] = parse_date(row['Data Publicacao'])
        to_insert.append(r)
    if len(to_insert) > 0:
        inserted += len(to_insert)
        insert_rows(db, to_insert)
    counter.end()
    print("Imported {} Contratos".format(inserted))
def insert_csv(db, csv, lines_per_insert):
    """Bulk-insert every CSV row, flushing in chunks of `lines_per_insert`."""
    table = pd.read_csv(csv)
    pks = create_pks(table)
    progress = ProgressCounter(len(table))
    batch = []
    for idx, row in table.iterrows():
        if len(batch) == lines_per_insert:
            insert_rows(db, batch)
            batch = []
            # Report progress once per flushed batch
            progress.update(lines_per_insert)
        batch.append(prepare_row(pks.iloc[idx], row))
    # Flush whatever is left over after the loop
    if batch:
        insert_rows(db, batch)
    progress.end()
def download_contratos_files(csv_file='../data/urls.csv', directory='../data/contratos'):
    """Download every Contratos text file listed in `csv_file` into
    `directory`, up to 10 downloads in parallel. Existing files are skipped.

    :param csv_file: CSV with a 'file_txt' column of URLs
    :param directory: destination folder (created if missing)
    """
    if not os.path.exists(directory):
        os.makedirs(directory)

    def download_and_save(url, directory):
        # Some cells may be NaN (a float) instead of a URL string
        if not isinstance(url, basestring):
            return
        import urllib2
        filename = url.split('/')[-1]
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            response = urllib2.urlopen(url)  # renamed: `file` shadowed the builtin
            content = response.read()
            # BUGFIX: write in binary mode so the downloaded bytes are
            # preserved verbatim (text mode can mangle them on some platforms)
            with open(path, 'wb') as f:
                f.write(content)

    print("Downloading Contratos files from: {}".format(csv_file))
    data = pd.read_csv(csv_file)
    total = len(data)
    counter = ProgressCounter(total)
    downloaded = 0
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = dict(
            (executor.submit(download_and_save, d['file_txt'], directory), d['file_txt'])
            for di, d in data.iterrows())
        for future in futures.as_completed(future_to_url):
            url = future_to_url[future]
            counter.update(1)
            downloaded += 1
            if future.exception() is not None:
                print('%r generated an exception: %s' % (url, future.exception()))
    counter.end()
    print("Downloaded {} Contratos".format(downloaded))
def update_from_csv(db, csv):
    '''Update table using values from CSV. Slower than 'insert_csv' but
    raises no error if the primary key already exists (existing rows
    simply have their columns overwritten).'''
    table = pd.read_csv(csv)
    codes = create_pks(table)
    progress = ProgressCounter(len(table))
    for idx, row in table.iterrows():
        code = codes.iloc[idx]
        existing = db.session.query(Execucao).filter_by(code=code).first()
        fresh = prepare_row(code, row)
        if existing is None:
            db.session.add(Execucao(**fresh))
        else:
            # Overwrite every attribute with the freshly parsed value
            for field, value in fresh.iteritems():
                setattr(existing, field, value)
        progress.update()
    progress.end()
    db.session.commit()
def insert_all(db, csv_file='../data/urls.csv', lines_per_insert=100):
    """Set file_url/txt_file_url on each Contrato listed in the CSV,
    committing in batches of `lines_per_insert` updates."""
    print("Getting Contratos urls from: " + csv_file)
    data = pd.read_csv(csv_file)
    total = len(data)
    progress = ProgressCounter(total)
    pending = 0
    updated = 0
    for _, entry in data.iterrows():
        pending += 1
        stmt = update(Contrato).values(
            {'file_url': entry['file_url'], 'txt_file_url': entry['file_txt']}
        ).where(Contrato.numero == str(entry['numero']))
        db.session.execute(stmt)
        # Commit on a full batch, or when the final row has been queued
        if pending == lines_per_insert or (updated + pending) == total:
            progress.update(pending)
            updated += pending
            pending = 0
            db.session.commit()
    progress.end()
    print("Updated {} Contratos".format(updated))
def download_contratos_files(csv_file='../data/urls.csv', directory='../data/contratos'):
    """Download every Contratos text file listed in `csv_file` into
    `directory`, up to 10 downloads in parallel. Existing files are skipped.

    :param csv_file: CSV with a 'file_txt' column of URLs
    :param directory: destination folder (created if missing)
    """
    if not os.path.exists(directory):
        os.makedirs(directory)

    def download_and_save(url, directory):
        # Some cells may be NaN (a float) instead of a URL string
        if not isinstance(url, basestring):
            return
        import urllib2
        filename = url.split('/')[-1]
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            response = urllib2.urlopen(url)  # renamed: `file` shadowed the builtin
            content = response.read()
            # BUGFIX: write in binary mode so the downloaded bytes are
            # preserved verbatim (text mode can mangle them on some platforms)
            with open(path, 'wb') as f:
                f.write(content)

    print("Downloading Contratos files from: {}".format(csv_file))
    data = pd.read_csv(csv_file)
    total = len(data)
    counter = ProgressCounter(total)
    downloaded = 0
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = dict(
            (executor.submit(download_and_save, d['file_txt'], directory), d['file_txt'])
            for di, d in data.iterrows())
        for future in futures.as_completed(future_to_url):
            url = future_to_url[future]
            counter.update(1)
            downloaded += 1
            if future.exception() is not None:
                print('%r generated an exception: %s' % (url, future.exception()))
    counter.end()
    print("Downloaded {} Contratos".format(downloaded))
def insert_all(db, csv_file='../data/contratos-2014.xls', lines_per_insert=100):
    """Import Contratos rows from an Excel spreadsheet in batches.

    :param db: database handle accepted by `insert_rows`
    :param csv_file: path to the .xls source file
    :param lines_per_insert: rows accumulated before each bulk insert
    """
    print("Importing Contratos from: " + csv_file)
    data = pd.read_excel(csv_file)
    # -1 is the sentinel used for missing values
    data = data.fillna(-1)
    # removed unused local `cache = {}`
    to_insert = []
    counter = ProgressCounter(len(data))
    for row_i, row in data.iterrows():
        r = {}
        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)
        r['numero'] = int(row_i) + 1
        r['orgao'] = row['Orgao']
        r['data_assinatura'] = parse_date(row['Data da Assinatura'])
        r['vigencia'] = int(
            row['Vigencia']) if not np.isnan(row['Vigencia']) else -1
        r['objeto'] = row['Objeto']
        r['modalidade'] = row['Modalidade']
        r['evento'] = row['Evento']
        r['processo_administrativo'] = row['Processo Administrativo']
        r['cnpj'] = row['CNPJ']
        r['nome_fornecedor'] = row['Nome']
        r['valor'] = parse_money(row['Valor'])
        # NOTE: the source spreadsheet's column header really ends with '\n'
        r['licitacao'] = row['Licitacao\n']
        r['data_publicacao'] = parse_date(row['Data Publicacao'])
        to_insert.append(r)
    if len(to_insert) > 0:
        insert_rows(db, to_insert)
    counter.end()
def insert_csv(db, csv, lines_per_insert):
    """Insert every CSV row keyed by a dot-joined code built from the
    "cd_*" columns plus "projetoatividade"."""
    print(csv)
    table = pd.read_csv(csv)
    progress = ProgressCounter(len(table))

    # ## Build the primary-key ("code") series ## #
    code_columns = []
    for name, column in table.iteritems():
        if name[:3].lower() == "cd_":
            code_columns.append(column)
    # this column doesn't start with "cd_" but is a code too
    code_columns.append(table["projetoatividade"])
    code_table = pd.concat(code_columns, axis=1)
    pks = pd.Series(
        ['.'.join(str(value) for value in values)
         for _, values in code_table.iterrows()],
        name="code")
    # warn if the generated keys are not unique
    if pks.duplicated().values.sum() > 0:
        print("Warning: There are duplicated pks!")

    batch = []
    for row_i, row in table.iterrows():
        if len(batch) == lines_per_insert:
            insert_rows(db, batch)
            batch = []
            # Report progress once per flushed batch
            progress.update(lines_per_insert)
        batch.append({"code": pks.iloc[row_i], "data": dict(row.iterkv())})
    if batch:
        insert_rows(db, batch)
    progress.end()
# Merge phase: pair each master object with its slave counterpart and
# collect the resulting sync updates.
if(Registrar.DEBUG_MESSAGE):
    print "email matches (%d)" % len(emailMatcher.matches)
    # print emailMatcher.matches.tabulate(tablefmt="simple")
# TODO: further sort emailMatcher
print debugUtils.hashify("BEGINNING MERGE (%d)" % len(globalMatches))
print timediff()
syncCols = ColData_User.getSyncCols()
if Registrar.DEBUG_PROGRESS:
    syncProgressCounter = ProgressCounter(len(globalMatches))
for count, match in enumerate(globalMatches):
    if Registrar.DEBUG_PROGRESS:
        syncProgressCounter.maybePrintUpdate(count)
    # each match holds exactly one master and one slave object
    mObject = match.mObjects[0]
    sObject = match.sObjects[0]
    syncUpdate = SyncUpdate_Usr(mObject, sObject)
    syncUpdate.update(syncCols)
    # SanitationUtils.safePrint( syncUpdate.tabulate(tablefmt = 'simple'))
    # keep master-side delta updates sorted as they are collected
    if syncUpdate.mUpdated and syncUpdate.mDeltas:
        insort(mDeltaUpdates, syncUpdate)
def insert_all(db, csv_file='../data/receitas_min.csv', lines_per_insert=100):
    """Import Revenue rows from a CSV in batches.

    For each row, the revenue code is parsed into its hierarchical
    components, and `code_id` is resolved by looking up progressively
    shorter prefixes of the code in RevenueCode (results are memoized
    in `cache`).
    """
    print("Importing Revenues from: " + csv_file)
    data = pd.read_csv(csv_file, encoding='utf8')
    # memoizes code -> RevenueCode.id lookups across rows
    cache = {}
    to_insert = []
    counter = ProgressCounter(len(data))
    for row_i, row in data.iterrows():
        r = {}
        if len(to_insert) == lines_per_insert:
            insert_rows(db, to_insert)
            to_insert = []
            # Progress counter
            counter.update(lines_per_insert)
        r['original_code'] = row['codigo']
        r['description'] = row['descricao']
        r['date'] = parse_date(row['data'])
        r['monthly_outcome'] = parse_money(row['realizado_mensal'])
        r['monthly_predicted'] = parse_money(row['previsto_mensal'])
        code_parsed = parse_code(row['codigo'])
        r['economical_category'] = code_parsed[0]
        # Resolve code_id: try the full code first, then drop one trailing
        # component at a time until a known RevenueCode prefix is found.
        # (Python 2: map() returns a list, so len() below is valid.)
        code_parts = map(int, r['original_code'].split('.'))
        len_cp = len(code_parts)
        for i in range(len_cp):
            code = '.'.join(map(str, code_parts[:len_cp - i]))
            if code not in cache:
                code_result = db.session.query(
                    RevenueCode.id).filter(RevenueCode.code == code).all()
                if code_result:
                    cache[code] = code_result[0][0]
                    r['code_id'] = code_result[0][0]
                    break
            else:
                r['code_id'] = cache[code]
                break
        else:
            # no prefix matched any RevenueCode
            r['code_id'] = None
        # Optional hierarchical components of the parsed code
        if len(code_parsed) >= 2:
            r['economical_subcategory'] = code_parsed[1]
        else:
            r['economical_subcategory'] = None
        if len(code_parsed) >= 3:
            r['source'] = code_parsed[2]
        else:
            r['source'] = None
        if len(code_parsed) >= 4:
            r['rubric'] = code_parsed[3]
        else:
            r['rubric'] = None
        if len(code_parsed) >= 5:
            r['paragraph'] = code_parsed[4]
        else:
            r['paragraph'] = None
        if len(code_parsed) == 6:
            r['subparagraph'] = code_parsed[5]
        else:
            r['subparagraph'] = None
        to_insert.append(r)
    if len(to_insert) > 0:
        insert_rows(db, to_insert)
    counter.end()
def printFileProgress(self, completed, total):
    """SFTP transfer callback: print progress, creating the counter
    lazily the first time it is invoked."""
    try:
        counter = self.progressCounter
    except AttributeError:
        # first callback: size the counter from the reported total
        counter = self.progressCounter = ProgressCounter(total)
    counter.maybePrintUpdate(completed)
class UsrSyncClient_SSH_ACT(SyncClient_Abstract):
    """Sync client that drives an ACT database on a remote host over SSH.

    Remote commands run through paramiko; CSV import/export files are
    moved with SFTP.
    """

    def __init__(self, connectParams, dbParams, fsParams):
        # dbParams: remote DB executables and credentials used to format commands
        # fsParams: local/remote folder layout and import naming
        self.dbParams = dbParams
        self.fsParams = fsParams
        super(UsrSyncClient_SSH_ACT, self).__init__(connectParams)

    def attemptConnect(self):
        """Open the SSH connection, auto-accepting unknown host keys."""
        self.service = paramiko.SSHClient()
        self.service.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.service.connect(**self.connectParams)

    @property
    def connectionReady(self):
        # NOTE(review): relies on paramiko's private _transport attribute
        return self.service and self.service._transport and self.service._transport.active

    def execSilentCommandAssert(self, command):
        """Run `command` remotely and assert it produced no output.

        Lines matching "Countries..." are printed and tolerated; any other
        stdout/stderr line raises an AssertionError.
        """
        self.assertConnect()
        stdin, stdout, stderr = self.service.exec_command(command)
        if stdin:
            pass  # gets rid of annoying unused-variable warnings
        possible_errors = stdout.readlines() + stderr.readlines()
        for error in possible_errors:
            if re.match("^Countries.*", error):
                print error
                continue
            assert not error, "command <%s> returned errors: %s" % (
                SanitationUtils.coerceUnicode(command),
                SanitationUtils.coerceUnicode(error)
            )

    def putFile(self, localPath, remotePath):
        """Upload localPath to remotePath over SFTP, creating the remote
        directory if it does not exist."""
        self.assertConnect()
        # remoteDir, remoteFileName = os.path.split(remotePath)
        remoteDir = os.path.split(remotePath)[0]
        exception = Exception()
        sftpClient = self.service.open_sftp()
        if remoteDir:
            try:
                sftpClient.stat(remoteDir)
            except:
                # NOTE(review): bare except — any stat failure is treated
                # as "directory missing"
                sftpClient.mkdir(remoteDir)
        sftpClient.put(localPath, remotePath)
        fstat = sftpClient.stat(remotePath)
        sftpClient.close()
        if not fstat:
            # NOTE(review): this warning is created but never raised
            exception = UserWarning("could not stat remote file")

    def assertRemoteFileExists(self, remotePath, assertion = ""):
        """Assert that `stat` on the remote path writes nothing to stderr."""
        self.assertConnect()
        # stdin, stdout, stderr = self.service.exec_command('stat "%s"' % remotePath)
        stderr = self.service.exec_command('stat "%s"' % remotePath)[2]
        possible_errors = stderr.readlines()
        assert not possible_errors, " ".join([assertion, "stat returned possible errors", str(possible_errors)])

    @classmethod
    def printFileProgress(self, completed, total):
        # SFTP progress callback; the counter is created lazily on first call
        if not hasattr(self, 'progressCounter'):
            self.progressCounter = ProgressCounter(total)
        self.progressCounter.maybePrintUpdate(completed)

    def getDeleteFile(self, remotePath, localPath):
        """Download the remote file, then delete it from the server."""
        self.assertRemoteFileExists(remotePath)
        sftpClient = self.service.open_sftp()
        sftpClient.get(remotePath, localPath, self.printFileProgress)
        sftpClient.remove(remotePath)
        sftpClient.close()

    def removeRemoteFile(self, remotePath):
        """Delete a file on the remote host (asserts it exists first)."""
        self.assertRemoteFileExists(remotePath)
        self.service.exec_command('rm "%s"' % remotePath)

    def uploadChanges(self, user_pkey, updates=None):
        """Write `updates` for one user to a CSV, upload it, run the remote
        ACT import command, and raise if no .imported marker file appears.

        :param user_pkey: the user's MYOB Card ID (primary key)
        :param updates: mapping of column name -> new value
        """
        if not updates:
            return
        # print "UPDATES:", updates
        self.assertConnect()
        # Force 'MYOB Card ID' (the primary key) to be the first column
        if 'MYOB Card ID' in updates:
            del updates['MYOB Card ID']
        updates = OrderedDict(
            [('MYOB Card ID', user_pkey)] \
            + updates.items()
        )
        importName = self.fsParams['importName']
        outFolder = self.fsParams['outFolder']
        remote_export_folder = self.fsParams['remote_export_folder']
        fileRoot = 'act_i_' + importName + '_' + user_pkey
        fileName = fileRoot + '.csv'
        localPath = os.path.join(outFolder, fileName)
        remotePath = os.path.join(remote_export_folder, fileName)
        importedFile = os.path.join(remote_export_folder, fileRoot + '.imported')
        with open(localPath, 'w+') as outFile:
            csvdialect = UnicodeCsvDialectUtils.act_out
            dictwriter = unicodecsv.DictWriter(
                outFile,
                dialect=csvdialect,
                fieldnames=updates.keys(),
                encoding='utf8',
                extrasaction='ignore',
            )
            dictwriter.writeheader()
            dictwriter.writerow(updates)
        self.putFile(
            localPath,
            remotePath)
        tokens = [
            'cd ' + remote_export_folder + ';',
            '{db_i_exe} "-d{db_name}" "-h{db_host}" "-u{db_user}" "-p{db_pass}"'.format(
                **self.dbParams
            ),
            ('"%s"' % fileName) if fileName else None
        ]
        command = " ".join( token for token in tokens if token)
        self.execSilentCommandAssert(command)
        # The import tool renames the csv to .imported on success; if that
        # marker is missing, removeRemoteFile's assertion fails.
        try:
            self.removeRemoteFile(importedFile)
        except:
            raise Exception("import didn't produce a .imported file")

    def analyseRemote(self, parser, since=None, limit=None):
        """Export users changed since `since` from the remote ACT database,
        download the resulting CSV, and feed it to `parser`.

        :param since: ISO date lower bound; defaults to the epoch
        :param limit: currently ignored (todo: implement limit)
        """
        if not since:
            since = '1970-01-01'
        if limit:
            # todo: implement limit
            # this gets rid of unused argument warnings
            pass
        importName = self.fsParams['importName']
        remote_export_folder = self.fsParams['remote_export_folder']
        fileRoot = 'act_x_' + importName
        fileName = fileRoot + '.csv'
        inFolder = self.fsParams['inFolder']
        localPath = os.path.join(inFolder, fileName)
        remotePath = os.path.join(remote_export_folder, fileName)
        tokens = [
            'cd ' + remote_export_folder + ';',
            '{db_x_exe} "-d{db_name}" "-h{db_host}" "-u{db_user}" "-p{db_pass}" -c"{fields}"'.format(
                **self.dbParams
            ),
            '-s"%s"' % since,
            '"%s"' % fileName
        ]
        command = " ".join([token for token in tokens if token])
        print "executing export command..."
        self.execSilentCommandAssert(command)
        print "donloading file..."
        self.getDeleteFile(remotePath, localPath)
        print "analysing file..."
        parser.analyseFile(localPath, dialect_suggestion='act_out')