Example #1
0
def appendCSVtoSQLite(csvFile, database, table, ignoreLines=0, dbCursor=None, dbConn=None):
    """Append the rows of a comma-delimited CSV file to an existing SQLite table.

    Args:
        csvFile: Path of the CSV file to read.
        database: Passed to ``sm.dbConnect`` when no cursor is supplied.
        table: Name of the target table. It is interpolated into the INSERT
            statement, so it must be a trusted identifier.
        ignoreLines: Number of leading CSV rows to skip (e.g. a header).
        dbCursor: Optional open cursor. When given, the table-existence check
            is skipped and ``dbConn`` must be the matching connection.
        dbConn: Connection belonging to ``dbCursor``; it is committed after
            every chunk and closed before returning.

    Exits the process with status 1 when this function opened its own
    connection and the target table does not exist.
    """
    if not dbCursor:
        dbConn, dbCursor = sm.dbConnect(database)
        # The lookup value is bound as a parameter rather than formatted in.
        dbCursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE ?",
            (table,))
        tables = [item[0] for item in dbCursor.fetchall()]
        # Appending needs an existing table; bail out when it is missing.
        if not tables:
            print("The table {table} does not exist in the database. Please use csvToSQLite or create the table.".format(table=table))
            dbConn.close()
            sys.exit(1)
    print("""Importing data, reading {csvFile} file""".format(csvFile=csvFile))
    try:
        with open(csvFile, "r") as f:
            csvReader = csv.reader(f, delimiter=",")
            for _ in range(ignoreLines):
                next(csvReader)
            csvData = list(csvReader)
        values_str = None
        for chunk in chunks(csvData):
            if values_str is None:
                # One "?" placeholder per column of the first data row.
                values_str = "(" + ",".join(["?"] * len(chunk[0])) + ")"
            dbCursor.executemany("""INSERT INTO {table} VALUES {values}""".format(table=table, values=values_str), chunk)
            dbConn.commit()
    finally:
        # Release the connection even when reading or inserting fails.
        dbConn.close()
    return
Example #2
0
def csvToSQLite(csvFile, database, table, columnDescription, ignoreLines=0):
    """Create ``table`` in the SQLite ``database`` and fill it from ``csvFile``.

    ``columnDescription`` is appended verbatim after ``CREATE TABLE <table>``.
    The process exits with status 1 if the ``sqlite_master`` lookup finds a
    table matching ``table``. The open cursor and connection are handed to
    ``appendCSVtoSQLite``, which performs the actual load.
    """
    connection, cursor = sm.dbConnect(database)

    lookupSQL = """SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '{table}'""".format(table=table)
    cursor.execute(lookupSQL)
    existing = [row[0] for row in cursor.fetchall()]
    if len(existing) > 0:
        print("A table by that name already exists in the database. Please use appendCSVtoSQLite or choose a new name.")
        sys.exit(1)

    createSQL = """CREATE TABLE {table} {colDesc}""".format(colDesc=columnDescription, table=table)
    print(createSQL)
    cursor.execute(createSQL)

    # Reuse the open cursor/connection so the loader skips its own lookup.
    appendCSVtoSQLite(csvFile, database, table,
                      ignoreLines=ignoreLines,
                      dbCursor=cursor,
                      dbConn=connection)
Example #3
0
def mySQLToCSV(database, table, csvFile, csvQuoting=csv.QUOTE_ALL):
    """Dump every row of ``table`` to ``csvFile`` with a header row.

    Args:
        database: Passed to ``mm.dbConnect``.
        table: Table to export; interpolated into ``select * from <table>``,
            so it must be a trusted identifier.
        csvFile: Destination path. Its parent directory must already exist,
            otherwise the process exits with status 1.
        csvQuoting: ``csv`` quoting constant handed to ``csv.writer``.

    Rows are written directly from an ``SSCursor``. The header is taken
    from ``cursor.description``.
    """
    csvPath = os.path.dirname(os.path.abspath(csvFile))
    if not os.path.isdir(csvPath):
        print("Path {path} does not exist".format(path=csvPath))
        sys.exit(1)
    dbConn, dbCursor, dictCursor = mm.dbConnect(database)
    ssCursor = dbConn.cursor(SSCursor)
    try:
        ssCursor.execute("select * from {table}".format(table=table))
        header = [i[0] for i in ssCursor.description]
        with open(csvFile, "w") as csv_file:
            csv_writer = csv.writer(csv_file, quoting=csvQuoting)
            csv_writer.writerow(header)
            csv_writer.writerows(ssCursor)
    finally:
        # The cursor was created here, so it is released here even on error.
        # NOTE(review): dbConn is left open because its ownership is decided
        # by mm.dbConnect, which is not visible here.
        ssCursor.close()
    return
Example #4
0
def csvToMySQL(csvFile, database, table, columnDescription, ignoreLines=0):
    """Create ``table`` in the MySQL ``database`` and fill it from ``csvFile``.

    ``columnDescription`` is placed verbatim between ``CREATE TABLE <table>``
    and ``CHARACTER SET utf8mb4``. The process exits with status 1 if
    ``SHOW TABLES LIKE`` returns any match. The open cursor is handed to
    ``appendCSVtoMySQL``, which performs the actual load.
    """
    connection, cursor, _dictCursor = mm.dbConnect(database)

    lookupSQL = """SHOW TABLES LIKE '{table}'""".format(table=table)
    cursor.execute(lookupSQL)
    existing = [row[0] for row in cursor.fetchall()]
    if len(existing) > 0:
        print("A table by that name already exists in the database. Please use appendCSVtoMySQL or choose a new name.")
        sys.exit(1)

    createSQL = """CREATE TABLE {table} {colDesc} CHARACTER SET utf8mb4""".format(colDesc=columnDescription, table=table)
    print(createSQL)
    cursor.execute(createSQL)

    # Reuse the open cursor so the loader skips its own existence check.
    appendCSVtoMySQL(csvFile, database, table,
                     ignoreLines=ignoreLines,
                     dbCursor=cursor)
Example #5
0
def _firstLineTerminator(csvFile, default="\n", blockSize=65536):
    """Return the first line terminator in csvFile: "\\n", "\\r\\n" or "\\r".

    The file is scanned as bytes, so no text decoding is involved.
    ``default`` is returned when the file contains no line break at all.
    """
    with open(csvFile, "rb") as f:
        while True:
            block = f.read(blockSize)
            if not block:
                return default
            cr = block.find(b"\r")
            lf = block.find(b"\n")
            if cr == -1 and lf == -1:
                continue
            if cr == -1 or -1 < lf < cr:
                return "\n"
            # A CR came first; look one byte ahead (possibly in the next
            # block) to tell "\r\n" from a bare "\r".
            following = block[cr + 1:cr + 2] or f.read(1)
            return "\r\n" if following == b"\n" else "\r"


def appendCSVtoMySQL(csvFile, database, table, ignoreLines=0, dbCursor=None):
    """Bulk-load a CSV file into an existing MySQL table via LOAD DATA LOCAL INFILE.

    Args:
        csvFile: Path of the CSV file; interpolated into the SQL statement.
        database: Passed to ``mm.dbConnect`` when no cursor is supplied.
        table: Target table name; interpolated into the SQL statements.
        ignoreLines: Number of leading lines LOAD DATA should skip.
        dbCursor: Optional open cursor. When given, the table-existence check
            is skipped.

    The line terminator is detected from the file's first line break and
    defaults to "\\n" when none is found. Keys are disabled for the load
    and re-enabled afterwards, even when the load fails.

    Exits the process with status 1 when this function opened its own
    connection and the target table does not exist.
    """
    if not dbCursor:
        dbConn, dbCursor, dictCursor = mm.dbConnect(database)
        dbCursor.execute("""SHOW TABLES LIKE '{table}'""".format(table=table))
        tables = [item[0] for item in dbCursor.fetchall()]
        if not tables:
            print("The table {table} does not exist in the database. Please use csvToMySQL or create the table.".format(table=table))
            sys.exit(1)
    # open(..., 'U') is rejected by Python 3.11+, and file.newlines may be
    # None or a tuple; byte-level detection always yields one terminator.
    line_termination = _firstLineTerminator(csvFile)
    disableSQL = """ALTER TABLE {table} DISABLE KEYS""".format(table=table)
    print(disableSQL)
    dbCursor.execute(disableSQL)
    try:
        print("""Importing data, reading {csvFile} file""".format(csvFile=csvFile))
        importSQL = """LOAD DATA LOCAL INFILE '{csvFile}' INTO TABLE {table}
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '{lineTermination}' IGNORE {ignoreLines} LINES""".format(csvFile=csvFile, table=table, ignoreLines=ignoreLines, lineTermination=line_termination)
        dbCursor.execute(importSQL)
    finally:
        # Never leave the table with its keys disabled.
        enableSQL = """ALTER TABLE {table} ENABLE KEYS""".format(table=table)
        print(enableSQL)
        dbCursor.execute(enableSQL)
    return
Example #6
0
def _likePattern(name):
    """Loosen a place name for LIKE matching: ' and ' / ' de ' become '%'."""
    return name.replace(' and ', '%').replace(' de ', '%')


def _locationCase(branches, alias):
    """Render ``CASE WHEN a.location LIKE "%p%" THEN v ... END AS alias``.

    ``branches`` is a list of ``(pattern, value)`` pairs. ``value`` is inserted
    verbatim, so the caller supplies any quoting. With no branches the
    expression degrades to ``(SELECT NULL) AS alias`` so the SELECT list
    stays valid.
    """
    if not branches:
        return "(SELECT NULL) AS {alias}".format(alias=alias)
    whens = " ".join(
        'WHEN a.location LIKE "%{pattern}%" THEN {value}'.format(
            pattern=pattern, value=value)
        for pattern, value in branches)
    return "CASE {whens} END AS {alias}".format(whens=whens, alias=alias)


def _placeCases(frame, column, name_alias, id_alias, strip_county=False):
    """Build the name and id CASE expressions for (place, state) rows.

    Each row of ``frame`` contributes one branch matching
    ``<place>%<state>``. Ids are the 1-based row positions, rendered as
    quoted strings. Rows with an empty place or state are skipped but still
    consume an id.
    """
    names, ids = [], []
    pairs = zip(frame[column], frame['state'])
    for position, (place, state) in enumerate(pairs, 1):
        if not place or not state:
            continue
        stem = place.replace(' County', '') if strip_county else place
        pattern = _likePattern(stem) + '%' + _likePattern(state)
        names.append((pattern, '"%s"' % place))
        ids.append((pattern, '"%s"' % position))
    return _locationCase(names, name_alias), _locationCase(ids, id_alias)


def createCountry(from_database, to_database, table, country):
    """Copy geotagged messages for one country into ``<abbr>_geotagged``.

    Args:
        from_database: Database holding the source ``table``; the INSERT
            statements are executed on this connection's cursor.
        to_database: Database that receives ``<abbr>_geotagged``; the table
            is created there if ``SHOW TABLES`` does not list it.
        table: Source table. The queries read message_id, user_id, message,
            created_at_utc, user_location, tweet_location, postal_code and
            message_lang from it.
        country: Key into the module-level ``country_dict``, whose value is
            used as the country abbreviation.

    Place names come from the module-level ``cities``: comma-separated
    strings whose fields 2-5 are read as city, state, county and country.
    For each source row the location text (``user_location``, or
    ``tweet_location`` when ``user_location`` is NULL) is LIKE-matched
    against those names to derive state, county and city plus numeric ids.
    Rows for which no state matches are not inserted.
    """
    dbConn, dbCursor, dictCursor = mm.dbConnect(from_database)
    dbToConn, dbToCursor, dictToCursor = mm.dbConnect(to_database)

    country_abbr = country_dict[country]
    df = pd.Series(cities).to_frame()
    df = df.rename(columns={0: "original"})
    df['text'] = df['original'].apply(lambda x: x
                                      if ',' + country_abbr in x else None)

    # .copy() so the column assignments below act on an independent frame.
    df = df[~df['text'].isnull()].copy()
    df['city'] = df['text'].apply(lambda x: x.split(",")[2])
    df['state'] = df['text'].apply(lambda x: x.split(",")[3])
    df['county'] = df['text'].apply(lambda x: x.split(",")[4])
    df['country'] = df['text'].apply(lambda x: x.split(",")[5])
    df = df[['city', 'county', 'state', 'country']]

    df = df[df['country'] == country_abbr]
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)

    # States are ordered by frequency; ids are 1-based positions in that
    # order and are emitted as bare integers.
    state_names, state_ids = [], []
    for position, name in enumerate(df['state'].value_counts().index, 1):
        if not name:
            continue
        pattern = _likePattern(name)
        state_names.append((pattern, '"%s"' % name))
        state_ids.append((pattern, '%i' % position))
    stateInformation = _locationCase(state_names, 'state')
    state_id = _locationCase(state_ids, 'state_id')

    df_county = df.groupby(['county', 'state']).count().reset_index()
    countyInformation, county_id = _placeCases(
        df_county, 'county', 'county', 'county_id', strip_county=True)

    df_city = df[['city', 'state']].drop_duplicates().reset_index(drop=True)
    cityInformation, city_id = _placeCases(
        df_city, 'city', 'city', 'city_id')

    columnDescription = '(message_id bigint(20), user_id bigint(20), message text, created_at_utc datetime, state varchar(256), state_id int(6), county varchar(256), county_id int(6), city varchar(256), city_id int(6), postal_code int(5), lang varchar(4))'
    table_name = country_abbr + '_geotagged'
    dbToCursor.execute(
        """SHOW TABLES LIKE '{table}'""".format(table=table_name))
    tables = [item[0] for item in dbToCursor.fetchall()]
    if not tables:
        createSQL = """CREATE TABLE {table} {colDesc}""".format(
            table=table_name, colDesc=columnDescription)
        dbToCursor.execute(createSQL)

    insertTemplate = """
    INSERT INTO {to_db}.{to_table}
    SELECT * FROM
      (SELECT message_id, user_id, message, created_at_utc, {state}, {state_id}, {county}, {county_id}, {city}, {city_id}, postal_code, message_lang AS lang
      FROM (SELECT message_id, user_id, message, created_at_utc, {location} AS location, postal_code, message_lang
            FROM {from_table} WHERE {condition}) AS a) AS b
    WHERE b.state IS NOT NULL
    """
    # First pass uses the user's profile location; the second falls back to
    # the tweet's own location for rows that have no profile location.
    sources = [
        ('user_location', 'user_location IS NOT NULL'),
        ('tweet_location',
         'user_location IS NULL AND tweet_location IS NOT NULL'),
    ]
    statements = [
        insertTemplate.format(
            to_db=to_database,
            to_table=table_name,
            state=stateInformation,
            state_id=state_id,
            county=countyInformation,
            county_id=county_id,
            city=cityInformation,
            city_id=city_id,
            location=location,
            condition=condition,
            from_table=table)
        for location, condition in sources
    ]

    # NOTE(review): no commit is issued here; persistence of the inserts
    # depends on how mm.dbConnect configures the connection - confirm.
    for statement in statements:
        dbCursor.execute(statement)

    return