    def initial_load(self):
        # create the schema once before loading the metadata files
        db_connection = MariaDB_Connect(user=self.user,
                                        password=self.password,
                                        host=self.host,
                                        database=self.database)
        db_connection.connect_db()
        db_connection.init_db()
        db_connection.close_db()

        files = os.listdir(self.json_path)
        for file in tqdm(files):
            with open(os.path.join(self.json_path, file),
                      encoding="utf-8") as json_file:
                aux = json.load(json_file)
            resource = aux[0]["resource"]
            values = {
                "id": resource["id"],
                "name": decode(resource["name"].replace(",", ""))
                        if resource["name"] else "",
                "attribution": decode(resource["attribution"].replace(",", ""))
                               if resource["attribution"] else "",
                "description": decode(resource["description"].replace(",", ""))
                               if resource["description"] else "",
                "created_date": resource["createdAt"][0:10]
                                if resource["createdAt"] else "",
                "updated_date": resource["updatedAt"][0:10]
                                if resource["updatedAt"] else "",
                "data_updated": resource["data_updated_at"][0:10]
                                if resource["data_updated_at"] else "",
                "metadata_updated": resource["metadata_updated_at"][0:10]
                                    if resource["metadata_updated_at"] else "",
                "updated": 0,
                "new": 0
            }
            # insert one metadata row per file
            db_connection = MariaDB_Connect(user=self.user,
                                            password=self.password,
                                            host=self.host,
                                            database=self.database)
            db_connection.connect_db()
            db_connection.insert_dataset(values)
            db_connection.close_db()
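For reference, initial_load appears to expect each JSON file to hold Socrata-style metadata under a top-level "resource" key. A hypothetical minimal example of that shape (field values invented purely for illustration):

# hypothetical metadata record; values are illustrative only
example = [{
    "resource": {
        "id": "4n4q-k399",
        "name": "Sanciones",
        "attribution": "Entidad de ejemplo",
        "description": "Descripcion de ejemplo",
        "createdAt": "2020-01-01T00:00:00.000Z",
        "updatedAt": "2020-02-01T00:00:00.000Z",
        "data_updated_at": "2020-02-01T00:00:00.000Z",
        "metadata_updated_at": "2020-02-01T00:00:00.000Z"
    }
}]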
Example #2
    def key(author):
        #convert accented unicode characters to closest ascii equivalent
        author = decode(author)

        # strip accents, hyphens and curly braces, then re-arrange the
        # author name to FIRST [MIDDLE] LAST [SUFFIX]
        author = remove_accents_and_hyphens(author)
        author = remove_curlies(author)
        author = reformat_author(author)

        #get first 4 letters of last name
        return last_name(author)[:4]
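Every example on this page calls a helper named decode. Given the comment in key() above ("convert accented unicode characters to closest ascii equivalent") and the commented-out unidecode.unidecode calls in Example #5, decode is almost certainly an alias for unidecode; a minimal sketch of that assumed helper:

from unidecode import unidecode as decode  # assumed alias, not shown in the snippets

# decode("José Núñez")  ->  "Jose Nunez"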
Example #3
    def save_to_db(self):
        data_map = {
            "4n4q-k399": "sanciones",
            "rpmr-utcd": "integrado",
            "gnxj-bape": "contratos",
            "aimg-uskh": "procesos",
            "fzc7-w78v": "chip",
            "vfek-dafh": "adquisiciones"
        }
        datasets = [
            "4n4q-k399", "rpmr-utcd", "gnxj-bape", "aimg-uskh", "fzc7-w78v",
            "vfek-dafh"
        ]

        for dataset in datasets:
            db_connection = MariaDB_Connect(user=self.user,
                                            password=self.password,
                                            host=self.host,
                                            database=self.database)
            db_connection.connect_db()
            updated_date = db_connection.updated_date(dataset)
            filetoload = updated_date["fecha_actualizado"]
            db_connection.close_db()
            print("Loading {} file".format(data_map[dataset]))
            with open("{}/{}/{}.json".format(
                    self.json_path, data_map[dataset],
                    filetoload[0].strftime("%Y-%m-%d"))) as json_file:
                file_ = json.load(json_file)[0]["data"]
                file_ = pd.DataFrame(file_)
            # sanitize column names: spaces and parentheses -> underscores, lowercased
            names = [
                item.replace(" ", "_").replace("(", "_").replace(")", "_").lower()
                for item in file_.columns
            ]
            file_.columns = names
            for col in names:
                file_[col] = [
                    decode(row) if isinstance(row, str) else row
                    for row in file_[col]
                ]
            if data_map[dataset] == "contratos":
                del file_["urlproceso"]
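            # replace pandas NaN with None so the rows are inserted as SQL NULL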
            file_ = file_.where((pd.notnull(file_)), None)
            drop = "DROP TABLE IF EXISTS {};".format(data_map[dataset])
            sentence = "CREATE TABLE {} (id INTEGER AUTO_INCREMENT PRIMARY KEY, ".format(
                data_map[dataset])
            n = len(names)
            for i in range(len(names)):
                if i == n - 1:
                    sentence += names[i] + " TEXT);"
                else:
                    sentence += names[i] + " TEXT, "
            db_connection = MariaDB_Connect(user=self.user,
                                            password=self.password,
                                            host=self.host,
                                            database=self.database)
            db_connection.connect_db()
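            # reflect() is repeated after each DDL statement below so that
            # metadata.tables picks up the re-created table before the INSERT is built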
            metadata = db.MetaData(bind=db_connection.connection)
            metadata.reflect()
            db_connection.connection.execute(drop)
            metadata.reflect()
            db_connection.connection.execute(sentence)
            metadata.reflect()
            query = db.insert(metadata.tables[data_map[dataset]])
            file_ = file_.to_dict("records")
            db_connection.connection.execute(query, file_)
            db_connection.close_db()
Example #4
    def download_dataset(self, item, updated=0, new=0, download=0):
        values = None
        complete_data = []
        if download == 1:
            data_final = None
            index = 0
            offset = 0
            # ask the Socrata API how many records the dataset contains
            records = sess.get(
                "https://www.datos.gov.co/resource/{}.json?$select=count(*)".
                format(item["dataset_link"])).json()[0]
            records = int(next(iter(records.values())))
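            # number of $limit-sized pages needed to fetch every record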
            iterations = np.ceil(records / self.limit)
            if iterations > 0:
                while index < iterations:
                    url = ("https://www.datos.gov.co/resource/{}.json"
                           "?$$app_token={}&$limit={}&$offset={}").format(
                               item["dataset_link"], self.token, self.limit,
                               offset)
                    resp = sess.get(url)
                    if resp.status_code == 200:
                        aux = pd.DataFrame(json.loads(resp.content))
                        if not aux.empty:
                            if data_final is None:
                                data_final = aux
                            else:
                                data_final = pd.concat([data_final, aux],
                                                       axis=0,
                                                       sort=True)
                            index += 1
                            # the next page starts right after the rows fetched so far
                            offset = self.limit * index
                        else:
                            # an empty page means the dataset is exhausted
                            index = iterations + 1
                    else:
                        print("Bad Request!")
                        index = iterations + 1

                if data_final is not None:
                    complete_data.append({
                        "metadata": item["metadata"],
                        "resource": item["resource"],
                        "dataset": item["dataset_name"],
                        "data": data_final.to_dict()
                    })

        values = {
            "id": item["resource"]["id"],
            "nombre": decode(item["resource"]["name"].replace(",", ""))
                      if item["resource"]["name"] else "",
            "categoria": decode(item["category"]) if item["category"] else "",
            "entidad": decode(item["resource"]["attribution"].replace(",", ""))
                       if item["resource"]["attribution"] else "",
            "descripcion": decode(item["resource"]["description"].replace(",", ""))
                           if item["resource"]["description"] else "",
            "fecha_ejecucion": datetime.today().strftime("%Y-%m-%d"),
            "fecha_creacion": item["resource"]["createdAt"][0:10]
                              if item["resource"]["createdAt"] else "",
            "fecha_actualizacion": item["resource"]["updatedAt"][0:10]
                                   if item["resource"]["updatedAt"] else "",
            "fecha_datos_actualizados": item["resource"]["data_updated_at"][0:10]
                                        if item["resource"]["data_updated_at"] else "",
            "fecha_metadata_actualizada": item["resource"]["metadata_updated_at"][0:10]
                                          if item["resource"]["metadata_updated_at"] else "",
            "actualizado": updated,
            "nuevo": new
        }
        return complete_data, values
Example #5
def fix_name(player, owgr_rankings, log=None):
    '''Match a PGA player name against the owgr_rankings dict; returns a
    (name, ranking) tuple, or (None, [9999, 9999, 9999]) if no match is found.'''

    from golf_app.models import Name
    # imported here (not at module level) to avoid a circular import error

    if owgr_rankings.get(player.replace('.', '').replace('-', '')) is not None:
        #print ('returning match', owgr_rankings.get(player.replace('.', '').replace('-', '')))
        return (player,
                owgr_rankings.get(player.replace('.', '').replace('-', '')))

    if owgr_rankings.get(decode(player)):
        if log:
            print('unidecoded name dict match: ', player,
                  owgr_rankings.get(decode(player)))
        return (player, owgr_rankings.get(decode(player)))

    #lower = {k:v for k,v in owgr_rankings.items() if player.lower() == k.lower()}
    lower = [
        v for k, v in owgr_rankings.items() if player.lower() == k.lower()
    ]
    if len(lower) > 0:
        return (player, lower[0])

    replace_list = ['-', "'"]
    for char in replace_list:
        strip = [
            v for k, v in owgr_rankings.items()
            if player.replace(char, '').lower() == k.replace(char, '').lower()
        ]
        if len(strip) > 0:
            return (player, strip[0])

    if log:
        print(['player', player])
    if Name.objects.filter(PGA_name=player).exists():
        if log:
            print('player match')
        name = Name.objects.get(PGA_name=player)
        if owgr_rankings.get(name.OWGR_name):
            if log:
                print('returning based on name table lookup: ', player,
                      owgr_rankings.get(name.OWGR_name))
            return (player, owgr_rankings.get(name.OWGR_name))

    last = player.split(' ')

    # drop a trailing suffix ('Jr', 'Jr.', '(a)', or an all-caps token)
    # before taking the last name
    if last[-1] in ['Jr', 'Jr.', '(a)'] or last[-1].isupper():
        last_name = last[-2]
    else:
        last_name = last[-1]

    possible_matches = {
        k: v
        for k, v in owgr_rankings.items()
        if decode(last_name.strip(',')) in decode(k)
    }
    if log:
        print('player: ', player)
    #print ('possible name matches: ', player, possible_matches)

    # strip the amateur marker and punctuation before the token-by-token comparison
    pga_name = player.replace(' (a)', '').replace(',', '').replace(
        '.', '').replace('-', '').split(' ')

    #for k, v in owgr_rankings.items():
    for k, v in possible_matches.items():

        owgr_name = k.replace(',', '').split(' ')
        if log:
            print('looping thru possible: ', pga_name, owgr_name)
        if owgr_name == pga_name:
            if log:
                print('names equal after strip spec chars', player, owgr_name)
            return player, v

        if len(owgr_name) == 3 and len(pga_name) == 3 and decode(owgr_name[0]) == decode(pga_name[0]) \
            and decode(owgr_name[2].replace('.', '')) == decode(pga_name[2].replace('.', '')) and owgr_name[1][0] == pga_name[1][0]:
            if log:
                print('last name, first name match, middle first initial match',
                      player, owgr_name)
            return k, v
        #elif len(owgr_name) - 1 == len(pga_name) or len(owgr_name) == len(pga_name) - 1 \
        #    and (owgr_name[0] == pga_name[0] \
        #    and decode(owgr_name[len(owgr_name) -1]) == decode(pga_name[len(pga_name) -1])):
        #    print ('strip middle stuff, first and last match', pga_name, owgr_name)
        #    return k, v


        elif decode(owgr_name[len(owgr_name)-2]) == decode(pga_name[len(pga_name)-1]) \
            and k.split(' ')[0] == player.split(' ')[0]:
            #and k[0:1] == player[0:1]:  the earlier check compared only the first character of the first name, causing false positives
            if log:
                print('XXXXX fix this for dru love')
                print('last name, first initial match, cut owgr suffix', k, v,
                      player, owgr_name)
            return k, v
        #elif len(owgr_name) == 3 and len(pga_name) == 3 and unidecode.unidecode(owgr_name[len(owgr_name)-2]) == unidecode.unidecode(pga_name[len(pga_name)-2]) \
        #    and unidecode.unidecode(owgr_name[0]) == unidecode.unidecode(pga_name[0]):
        #    print ('last name, first name, cut both suffix', player)
        #    return k, v
        elif decode(owgr_name[0].replace('-', '')) == decode(pga_name[len(pga_name)-1].replace('-', '')) \
            and decode(owgr_name[len(owgr_name)-1].replace('-', '')) == decode(pga_name[0].replace('-', '')):
            if log:
                print('names reversed', player, owgr_name)
            return k, v
        elif decode(owgr_name[len(owgr_name)-1]) == decode(pga_name[len(pga_name)-1]) \
           and k[0:2] == player[0:2]:
            if log:
                print('last name, first two letter match', player, owgr_name)
            return k, v

    # s_name = [v for k, v in owgr_rankings.items() if k.split('(')[0] == player.split('(')[0]]
    # if len(s_name) ==1:
    #     print ('split from ( match: ', player, s_name[0])
    #     return (player, s_name[0])

    if log or os.environ.get("DEBUG") != "True":
        print("fix_name didn't find a match", player)
    return None, [9999, 9999, 9999]
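A hedged usage sketch of fix_name (the structure of owgr_rankings and of the ranking value are assumptions inferred from the fallback return above, and the call only works inside the golf_app Django project because Name is imported at call time):

owgr_rankings = {'Jon Rahm': [1, 10.5, 525.3]}   # hypothetical rankings data
name, ranking = fix_name('Jon Rahm', owgr_rankings)
# -> ('Jon Rahm', [1, 10.5, 525.3]); an unmatched player returns (None, [9999, 9999, 9999])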