def transform_data():
    """Load InputFile.txt via Spark, strip bracket/quote characters, split the
    CSV fields, and upsert the rows into the SQLite ``Currency`` table through
    the ``sqlite2`` Airflow connection.
    """
    import pyspark
    from pyspark.sql import SQLContext

    dest = SqliteHook(sqlite_conn_id='sqlite2')
    dest_conn = dest.get_conn()

    # BUG FIX: abspath("__file__") resolved the *literal string* "__file__"
    # against the current working directory; use the real __file__ so the
    # input file is found next to this script regardless of CWD.
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    sc = pyspark.SparkContext()
    try:
        # SQLContext registration is what makes RDD.toDF() available.
        sqlContext = SQLContext(sc)
        rdd = sc.textFile(os.path.join(BASE_DIR, "InputFile.txt"))
        # Removed the original rdd.collect() call: its result was discarded,
        # and it needlessly pulled the whole dataset onto the driver.
        sparkDF = rdd.map(
            lambda x: str(x).translate({ord(c): None for c in '][""'})
        ).map(lambda w: w.split(',')).toDF()
        pdDF = sparkDF.toPandas()
        sqlUpdate = 'INSERT OR REPLACE INTO Currency(USD, JPY, CAD, GBP, NZD, INR, Date_of_rate) VALUES (?, ?, ?, ?, ?, ?, ?)'
        dest_conn.executemany(sqlUpdate, pdDF.values)
        dest_conn.commit()
    finally:
        # Always release the SparkContext, even if the load/insert fails;
        # a leaked context blocks subsequent task runs on the same worker.
        sc.stop()
def createdb():
    """Create the ``Currency`` table in the ``sqlite2`` database if it does
    not already exist.
    """
    try:
        dest = SqliteHook(sqlite_conn_id='sqlite2')
        dest_conn = dest.get_conn()
        dest_conn.execute(
            '''CREATE TABLE if not exists Currency(USD text, JPY text, CAD text, GBP text, NZD text, INR text, Date_of_rate date)'''
        )
    except Exception as exc:
        # BUG FIX: was a bare `except:` that also trapped SystemExit /
        # KeyboardInterrupt and discarded the actual error. Keep the
        # best-effort behavior but surface what went wrong.
        print("SQLite Connection Failed")
        print(exc)
def get_num_active_dagruns(dag_id, conn_id='sqlite_default'):
    """Return the number of currently active runs of *dag_id*.

    A run counts as active when its state is 'running', 'queued' or
    'up_for_retry' in the Airflow metadata DB.

    :param dag_id: DAG identifier to count runs for.
    :param conn_id: Airflow connection id of the metadata database.
    :return: int — count of active dag_run rows.
    """
    # if you've opted for a different backend for airflow, you will need to
    # refactor the two lines below. For a Postgres example, please refer to
    # https://github.com/Nextdoor/airflow_examples/blob/master/dags/util.py#L8
    airflow_db = SqliteHook(sqlite_conn_id=conn_id)
    conn = airflow_db.get_conn()
    cursor = conn.cursor()
    # BUG FIX: dag_id was interpolated with str.format — SQL injection risk
    # and broken for ids containing quotes. Bind it as a parameter instead.
    sql = """
    select count(*)
    from dag_run
    where dag_id = ?
    and state in ('running', 'queued', 'up_for_retry')
    """
    cursor.execute(sql, (dag_id,))
    num_active_dagruns = cursor.fetchone()[0]
    return num_active_dagruns
def predict(classifier, **context):
    """Score every row of ``mct_talks`` with the model trained for
    *classifier* and write the predictions back into the same table.
    """
    # Restore the persisted model for this classifier.
    model = _load_model(classifier.__name__)

    # Pull the talks to score from the SQLite backend.
    hook = SqliteHook()
    talks = hook.get_pandas_df('select * from mct_talks')

    # Predict a conference for each talk title.
    talks['Conference'] = model.predict(talks['Title'].tolist())

    # Overwrite the table with the scored rows.
    with hook.get_conn() as conn:
        talks.to_sql('mct_talks', con=conn, if_exists='replace')
def split_data(**context):
    """Split ``research_papers`` into stratified train/test partitions and
    persist them as the ``training_data`` and ``test_data`` tables.
    """
    # Fetch the full sample dataset from the SQLite backend.
    hook = SqliteHook()
    papers = hook.get_pandas_df('select * from research_papers')

    # Stratify on the Conference label; fixed seed keeps the split stable
    # across runs.
    train_idx, _ = train_test_split(
        papers.index,
        test_size=0.33,
        stratify=papers['Conference'],
        random_state=42,
    )

    # Training rows go to one table, everything else to the other.
    with hook.get_conn() as conn:
        papers.iloc[train_idx].to_sql('training_data', con=conn, if_exists='replace')
        papers.drop(train_idx).to_sql('test_data', con=conn, if_exists='replace')
def createdb():
    """Create the warehouse tables in the ``sqlite2`` database if missing."""
    # NOTE(review): 'Counttry' and 'ClientIp' are misspelled in the DDL, but
    # the loader inserts into those exact column names — keep them in sync.
    ddl_statements = (
        '''CREATE TABLE if not exists ORDERS(OrderId text, DocumentNo text, OrderDate text, CatalogId text)''',
        '''CREATE TABLE if not exists CUSTOMERS(DocumentNo text, FullName text, Device text, Counttry real)''',
        '''CREATE TABLE if not exists CATALOG(CatalogId text, ProductId text, CUID text)''',
        '''CREATE TABLE if not exists PRODUCTS(ProductId text, ProductName text, CUID text)''',
        '''CREATE TABLE if not exists LEADS(Id text, CustomerId text)''',
        '''CREATE TABLE if not exists LOGS(ClientIp text, UserName text, Time text)''',
    )
    dest = SqliteHook(sqlite_conn_id='sqlite2')
    dest_conn = dest.get_conn()
    for ddl in ddl_statements:
        dest_conn.execute(ddl)
def updatedb():
    """Read the extract, marketing-lead and web-log source files and upsert
    their rows into the ORDERS, CUSTOMERS, PRODUCTS, CATALOG, LOGS and LEADS
    tables of the ``sqlite2`` database.
    """
    import re

    # BUG FIX: abspath("__file__") resolved the *literal string* "__file__"
    # against the CWD; use the real __file__ so the source files are found
    # next to this script regardless of working directory.
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    extract_file = pd.read_csv(os.path.join(BASE_DIR, "SourceData.txt"))
    LeadFile = pd.read_excel(os.path.join(BASE_DIR, "Marketing_Lead.xlsx"))
    log_file_name = os.path.join(BASE_DIR, "logFile.log")

    # Hoisted out of the per-line loop (the alternation is loop-invariant)
    # and compiled once. The alternatives match a quoted "First Last"-style
    # user name of 2-4 words, optionally with dots.
    userregex = re.compile("|".join([
        r'(\"[^\d\W]+[^\S][^\d\W]+\")',
        r'(\"[^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+\")',
        r'(\"[^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+\")',
        r'(\"[^\d\W]+\.[^\S][^\d\W]+[^\S][^\d\W]+\")',
        r'(\"[^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+\.\")',
        r'(\"[^\d\W]+\.[^\S][^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+\.\")',
        r'(\"[^\d\W]+\.[^\S][^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+\")',
    ]))
    host_regex = re.compile(r'([(\d\.)]+)')        # leading client IP
    time_regex = re.compile(r'(?P<time>\[.*?\])')  # bracketed timestamp

    # Parse each log line into (client ip, user name, timestamp). Iterating
    # the file directly replaces the original readlines() full-file load.
    Linelist = []
    with open(log_file_name, "r") as logfile:
        for line in logfile:
            # BUG FIX: the original called line.strip() and discarded the
            # result (str.strip returns a new string, it does not mutate).
            line = line.strip()
            # As in the original, a non-matching line raises AttributeError
            # (search() returns None) and fails the task loudly.
            matchHost = host_regex.search(line).group(0)
            matchUser = userregex.search(line).group(0)
            matchTime = time_regex.search(line).group(0)
            Linelist.append((matchHost, matchUser, matchTime))
    LogfileDF = pd.DataFrame(Linelist, columns=["ClientIP", "UserName", "Time"])

    # Project each source onto the columns of its destination table.
    LeadFileDF = LeadFile[["Id", "Company_Id"]]
    OrdersDF = extract_file[[
        "OrderId", "DocumentNo", "OrderDate", "CatalogId"
    ]]
    CustomersDF = extract_file[["DocumentNo", "FullName", "Device", "Country"]]
    ProductsDF = extract_file[["ProductId", "ProductName", "CUID.1"]]
    CatalogDF = extract_file[["CatalogId", "ProductId", "CUID"]]

    # TODO: write update logic for customers - only those that are not
    # already existing, catalog - only those that are new.
    dest = SqliteHook(sqlite_conn_id='sqlite2')
    dest_conn = dest.get_conn()

    # One upsert per target table; executemany binds each DataFrame row
    # positionally to the ? placeholders, so column order above must match
    # the column list in each statement.
    upserts = [
        ('INSERT OR REPLACE INTO ORDERS(OrderId, DocumentNo, OrderDate, CatalogId) VALUES (?, ?, ?, ?)', OrdersDF),
        ('INSERT OR REPLACE INTO CUSTOMERS(DocumentNo, FullName, Device, Counttry) VALUES (?, ?, ?, ?)', CustomersDF),
        ('INSERT OR REPLACE INTO PRODUCTS(ProductId, ProductName, CUID) VALUES (?, ?, ?)', ProductsDF),
        ('INSERT OR REPLACE INTO CATALOG(CatalogId, ProductId, CUID) VALUES (?, ?, ?)', CatalogDF),
        ('INSERT OR REPLACE INTO LOGS(ClientIP, UserName, Time) VALUES (?, ?, ?)', LogfileDF),
        ('INSERT OR REPLACE INTO LEADS(Id, CustomerId) VALUES (?, ?)', LeadFileDF),
    ]
    for sql, frame in upserts:
        dest_conn.executemany(sql, frame.values)
        dest_conn.commit()