def initial_load(self):
    """Perform the first-time load of dataset metadata into MariaDB.

    Initialises the schema, then reads every JSON metadata file under
    ``self.json_path`` and inserts one row per dataset.  Uses a single
    connection for the whole insert loop instead of reconnecting per
    file, and guarantees the connection is closed on error.
    """
    def _clean(text):
        # ascii-fold via decode() and strip commas; empty string for falsy input.
        return decode(text.replace(",", "")) if text else ""

    def _day(stamp):
        # keep only the YYYY-MM-DD prefix of an ISO timestamp.
        return stamp[0:10] if stamp else ""

    # Short-lived connection just to (re)create the schema.
    db_connection = MariaDB_Connect(user=self.user, password=self.password,
                                    host=self.host, database=self.database)
    db_connection.connect_db()
    db_connection.init_db()
    db_connection.close_db()

    files = os.listdir(self.json_path)

    # One connection for the whole load (the original reconnected per file).
    db_connection = MariaDB_Connect(user=self.user, password=self.password,
                                    host=self.host, database=self.database)
    db_connection.connect_db()
    try:
        for file in tqdm(files):
            with open(os.path.join(self.json_path, file), encoding="utf-8") as json_file:
                resource = json.load(json_file)[0]["resource"]
            values = {
                "id": resource["id"],
                "name": _clean(resource["name"]),
                "attribution": _clean(resource["attribution"]),
                "description": _clean(resource["description"]),
                "created_date": _day(resource["createdAt"]),
                "updated_date": _day(resource["updatedAt"]),
                "data_updated": _day(resource["data_updated_at"]),
                "metadata_updated": _day(resource["metadata_updated_at"]),
                "updated": 0,
                "new": 0,
            }
            db_connection.insert_dataset(values)
    finally:
        db_connection.close_db()
def key(author):
    """Return a 4-character sort key for *author*.

    The name is pushed through a normalisation pipeline (ascii-fold,
    accent/hyphen removal, curly-brace removal, re-arrangement to
    FIRST [MIDDLE] LAST [SUFFIX]) and the key is the first four letters
    of the resulting last name.
    """
    pipeline = (decode, remove_accents_and_hyphens, remove_curlies, reformat_author)
    for normalise in pipeline:
        author = normalise(author)
    return last_name(author)[:4]
def save_to_db(self):
    """Rebuild each open-data table from its most recent local JSON dump.

    For every tracked dataset: look up the date of the latest downloaded
    file in the control table, load that dump into a DataFrame, sanitise
    column names and string values, then DROP and re-CREATE the target
    table (all TEXT columns) and bulk-insert the rows.
    """
    # dataset id -> destination table name; iterating the dict directly
    # replaces the original parallel key list (a drift hazard).
    data_map = {
        "4n4q-k399": "sanciones",
        "rpmr-utcd": "integrado",
        "gnxj-bape": "contratos",
        "aimg-uskh": "procesos",
        "fzc7-w78v": "chip",
        "vfek-dafh": "adquisiciones",
    }
    for dataset, table in data_map.items():
        # Find which dump file to load from the control table.
        db_connection = MariaDB_Connect(user=self.user, password=self.password,
                                        host=self.host, database=self.database)
        db_connection.connect_db()
        updated_date = db_connection.updated_date(dataset)
        filetoload = updated_date["fecha_actualizado"]
        db_connection.close_db()

        print("Loading {} file".format(table))
        path = "{}/{}/{}.json".format(self.json_path, table,
                                      filetoload[0].strftime("%Y-%m-%d"))
        with open(path) as json_file:
            file_ = json.load(json_file)[0]["data"]
        file_ = pd.DataFrame(file_)

        # Column names must be valid SQL identifiers: lowercase, no spaces/parens.
        names = [
            item.replace(" ", "_").replace("(", "_").replace(")", "_").lower()
            for item in file_.columns
        ]
        file_.columns = names
        for col in names:
            file_[col] = [
                decode(row) if isinstance(row, str) else row
                for row in file_[col]
            ]
        if table == "contratos":
            # nested URL objects cannot be stored in a TEXT column
            del file_["urlproceso"]
        # NaN -> None so the driver writes SQL NULLs.
        file_ = file_.where(pd.notnull(file_), None)

        drop = "DROP TABLE IF EXISTS {};".format(table)
        # Build the DDL with a join instead of index-based concatenation.
        columns = ", ".join("{} TEXT".format(name) for name in names)
        sentence = ("CREATE TABLE {} (id INTEGER AUTO_INCREMENT PRIMARY KEY, "
                    "{});".format(table, columns))

        db_connection = MariaDB_Connect(user=self.user, password=self.password,
                                        host=self.host, database=self.database)
        db_connection.connect_db()
        metadata = db.MetaData(bind=db_connection.connection)
        metadata.reflect()
        db_connection.connection.execute(drop)
        metadata.reflect()
        db_connection.connection.execute(sentence)
        metadata.reflect()
        query = db.insert(metadata.tables[table])
        db_connection.connection.execute(query, file_.to_dict("records"))
        db_connection.close_db()
def download_dataset(self, item, updated=0, new=0, download=0):
    """Download one Socrata dataset (optionally) and build its metadata row.

    Args:
        item: dict with keys ``dataset_link``, ``dataset_name``,
            ``metadata``, ``resource`` and ``category``.
        updated: flag stored in the ``actualizado`` column.
        new: flag stored in the ``nuevo`` column.
        download: when 1, page through the dataset's records via the
            Socrata API; when 0, only the metadata row is built.

    Returns:
        ``(complete_data, values)`` — a (possibly empty) list with one
        dict holding the downloaded records, and the metadata row dict.
    """
    def _clean(text):
        # ascii-fold and strip commas; empty string for falsy input.
        return decode(text.replace(",", "")) if text else ""

    def _day(stamp):
        # keep only the YYYY-MM-DD prefix of an ISO timestamp.
        return stamp[0:10] if stamp else ""

    complete_data = []
    if download == 1:
        data_final = None
        # Total record count drives the number of pages to request.
        records = sess.get(
            "https://www.datos.gov.co/resource/{}.json?$select=count(*)".
            format(item["dataset_link"])).json()[0]
        records = int(next(iter(records.values())))
        iterations = np.ceil(records / self.limit)

        # Single unified paging loop (the original duplicated the whole
        # request/parse branch for index == 0).
        index = 0
        offset = 0
        while index < iterations:
            url = ("https://www.datos.gov.co/resource/{}.json"
                   "?$$app_token={}&$limit={}&$offset={}").format(
                       item["dataset_link"], self.token, self.limit, offset)
            resp = sess.get(url)
            if resp.status_code != 200:
                print("Bad Request!")
                break
            # json.loads()'s `encoding` kwarg was removed in Python 3.9;
            # json.loads accepts UTF-8 bytes directly.
            page = pd.DataFrame(json.loads(resp.content))
            if page.empty:
                break
            if data_final is None:
                data_final = page
            else:
                data_final = pd.concat([data_final, page], axis=0, sort=True)
            index += 1
            # Skip exactly the rows already fetched.  The original used
            # `limit * index + 1`, which re-fetched/skipped one record
            # per page (off-by-one).
            offset = self.limit * index

        complete_data.append({
            "metadata": item["metadata"],
            "resource": item["resource"],
            "dataset": item["dataset_name"],
            # Guard: if the very first request failed, data_final is still
            # None (the original crashed with AttributeError here).
            "data": data_final.to_dict() if data_final is not None else {},
        })

    values = {
        "id": item["resource"]["id"],
        "nombre": _clean(item["resource"]["name"]),
        "categoria": decode(item["category"]) if item["category"] else "",
        "entidad": _clean(item["resource"]["attribution"]),
        "descripcion": _clean(item["resource"]["description"]),
        "fecha_ejecucion": datetime.today().strftime("%Y-%m-%d"),
        "fecha_creacion": _day(item["resource"]["createdAt"]),
        "fecha_actualizacion": _day(item["resource"]["updatedAt"]),
        "fecha_datos_actualizados": _day(item["resource"]["data_updated_at"]),
        "fecha_metadata_actualizada": _day(item["resource"]["metadata_updated_at"]),
        "actualizado": updated,
        "nuevo": new,
    }
    return complete_data, values
def fix_name(player, owgr_rankings, log=None):
    """Match a PGA-formatted player name against the OWGR rankings dict.

    Tries a cascade of increasingly fuzzy strategies — exact lookup with
    punctuation stripped, ascii-folded lookup, case-insensitive scan,
    punctuation-insensitive scan, an explicit Name-table alias lookup,
    then heuristic token-by-token comparisons on candidates sharing the
    last name — returning on the first hit.

    Args:
        player: player name as it appears in the PGA data.
        owgr_rankings: dict keyed by OWGR player name; values are the
            ranking payloads (lists, from the fallback return below).
        log: truthy to print a debug trace of each strategy.

    Returns:
        ``(matched_name, ranking_value)`` on success, or
        ``(None, [9999, 9999, 9999])`` when no strategy matches.
    """
    # Imported here to avoid a circular import with this module.
    from golf_app.models import Name
    # Strategy 1: direct dict hit after stripping '.' and '-'.
    if owgr_rankings.get(player.replace('.', '').replace('-', '')) != None:
        #print ('returning match', owgr_rankings.get(player.replace('.', '').replace('-', '')))
        return (player, owgr_rankings.get(player.replace('.', '').replace('-', '')))
    # Strategy 2: direct hit on the ascii-folded (unidecoded) name.
    if owgr_rankings.get(decode(player)):
        if log:
            print('unidecoded name dict match: ', player, owgr_rankings.get(decode(player)))
        return (player, owgr_rankings.get(decode(player)))
    # Strategy 3: case-insensitive scan of all keys.
    #lower = {k:v for k,v in owgr_rankings.items() if player.lower() == k.lower()}
    lower = [
        v for k, v in owgr_rankings.items() if player.lower() == k.lower()
    ]
    if len(lower) > 0:
        return (player, lower[0])
    # Strategy 4: scan again ignoring hyphens / apostrophes (one char at a time).
    replace_list = ['-', "'"]
    for char in replace_list:
        strip = [
            v for k, v in owgr_rankings.items()
            if player.replace(char, '').lower() == k.replace(char, '').lower()
        ]
        if len(strip) > 0:
            return (player, strip[0])
    if log:
        print(['player', player])
    # Strategy 5: explicit PGA->OWGR alias stored in the Name table.
    if Name.objects.filter(PGA_name=player).exists():
        if log:
            print('player mathc')
        name = Name.objects.get(PGA_name=player)
        if owgr_rankings.get(name.OWGR_name):
            if log:
                print('returning based on name table lookup: ', player,
                      owgr_rankings.get(name.OWGR_name))
            return (player, owgr_rankings.get(name.OWGR_name))
    # Extract the last name, skipping trailing suffixes such as
    # 'Jr', 'Jr.', the amateur marker '(a)', or an all-caps token.
    last = player.split(' ')
    if last[len(last) - 1] in ['Jr', 'Jr.', '(a)'
                               ] or last[len(last) - 1].isupper():
        last_name = last[len(player.split(' ')) - 2]
    else:
        last_name = last[len(last) - 1]
    # Candidate OWGR names whose (ascii-folded) key contains the last name.
    possible_matches = {
        k: v
        for k, v in owgr_rankings.items()
        if decode(last_name.strip(',')) in decode(k)
    }
    if log:
        print('player: ', player)
    #print ('possible name mathces: ', player, possible_matches)
    # PGA name tokens with the amateur marker and punctuation stripped.
    pga_name = player.replace(' (a)', '').replace(',', '').replace('.', '').replace(
        '-', '').split(' ')
    #for k, v in owgr_rankings.items():
    # Strategy 6: heuristic token comparisons over the candidates.
    for k, v in possible_matches.items():
        owgr_name = k.replace(',', '').split(' ')
        if log:
            print('looping thru possible: ', pga_name, owgr_name)
        # 6a: tokens identical once punctuation is stripped.
        if owgr_name == pga_name:
            if log:
                print('names equal after strip spec chars', player, owgr_name)
            return player, v
        # 6b: three tokens each, first+last names equal, middle initials equal.
        if len(owgr_name) == 3 and len(pga_name) == 3 and decode(owgr_name[0]) == decode(pga_name[0]) \
                and decode(owgr_name[2].replace('.', '')) == decode(pga_name[2].replace('.', '')) and owgr_name[1][0] == pga_name[1][0]:
            if log:
                print('last name, first name match, middle first intial match',
                      player, owgr_name)
            return k, v
        #elif len(owgr_name) - 1 == len(pga_name) or len(owgr_name) == len(pga_name) - 1 \
        #    and (owgr_name[0] == pga_name[0] \
        #    and decode(owgr_name[len(owgr_name) -1]) == decode(pga_name[len(pga_name) -1])):
        #    print ('strip middle stuff, first and last match', pga_name, owgr_name)
        #    return k, v
        # 6c: OWGR name carries a trailing suffix — compare its second-to-last
        # token to the PGA last name, plus first tokens equal.
        elif decode(owgr_name[len(owgr_name)-2]) == decode(pga_name[len(pga_name)-1]) \
                and k.split(' ')[0] == player.split(' ')[0]:
            #and k[0:1] == player[0:1]: initial logic checks for charaacter of first name so causing false positives
            if log:
                print('XXXXX fix this for dru love')
                print('last name, first initial match, cut owgr suffix', k, v,
                      player, owgr_name)
            return k, v
        #elif len(owgr_name) == 3 and len(pga_name) == 3 and unidecode.unidecode(owgr_name[len(owgr_name)-2]) == unidecode.unidecode(pga_name[len(pga_name)-2]) \
        #    and unidecode.unidecode(owgr_name[0]) == unidecode.unidecode(pga_name[0]):
        #    print ('last name, first name, cut both suffix', player)
        #    return k, v
        # 6d: first and last tokens swapped (LAST FIRST vs FIRST LAST).
        elif decode(owgr_name[0].replace('-', '')) == decode(pga_name[len(pga_name)-1].replace('-', '')) \
                and decode(owgr_name[len(owgr_name)-1].replace('-', '')) == decode(pga_name[0].replace('-', '')):
            if log:
                print('names reversed', player, owgr_name)
            return k, v
        # 6e: last tokens equal and the first two characters of the full names match.
        elif decode(owgr_name[len(owgr_name)-1]) == decode(pga_name[len(pga_name)-1]) \
                and k[0:2] == player[0:2]:
            if log:
                print('last name, first two letter match', player, owgr_name)
            return k, v
    # s_name = [v for k, v in owgr_rankings.items() if k.split('(')[0] == player.split('(')[0]]
    # if len(s_name) ==1:
    #     print ('split from ( match: ', player, s_name[0])
    #     return (player, s_name[0])
    # No strategy matched: report (unless suppressed) and return the sentinel.
    if log or os.environ.get("DEBUG") != "True":
        print('fix names didnt find match', player)
    return None, [9999, 9999, 9999]