Example #1
def transform_data():
    import os
    import pyspark
    from pyspark.sql import SQLContext
    # Airflow 1.x: from airflow.hooks.sqlite_hook import SqliteHook
    from airflow.providers.sqlite.hooks.sqlite import SqliteHook

    dest = SqliteHook(sqlite_conn_id='sqlite2')
    dest_conn = dest.get_conn()
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    sc = pyspark.SparkContext()
    # Instantiating the SQLContext is what attaches toDF() to RDDs.
    sqlContext = SQLContext(sc)

    # Strip brackets and quotes from each raw line, split on commas, and
    # convert the result into a Spark DataFrame, then into pandas.
    lines = sc.textFile(os.path.join(BASE_DIR, "InputFile.txt"))
    sparkDF = lines.map(lambda x: str(x).translate({ord(c): None
                                                    for c in '][""'})
                        ).map(lambda w: w.split(',')).toDF()
    pdDF = sparkDF.toPandas()

    sqlUpdate = 'INSERT OR REPLACE INTO Currency(USD, JPY, CAD, GBP, NZD, INR, Date_of_rate) VALUES (?, ?, ?, ?, ?, ?, ?)'
    data = pdDF.values.tolist()
    dest_conn.executemany(sqlUpdate, data)
    dest_conn.commit()
    sc.stop()
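In a DAG file, a callable like this would typically be registered as a PythonOperator task. A minimal sketch, assuming Airflow 2.x (the DAG id and schedule are illustrative, not from the original source):

from datetime import datetime
from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(dag_id='currency_etl',
         start_date=datetime(2021, 1, 1),
         schedule_interval='@daily',
         catchup=False) as dag:
    transform = PythonOperator(task_id='transform_data',
                               python_callable=transform_data)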
Example #2
def createdb():
    from airflow.providers.sqlite.hooks.sqlite import SqliteHook

    try:
        dest = SqliteHook(sqlite_conn_id='sqlite2')
        dest_conn = dest.get_conn()
        dest_conn.execute(
            '''CREATE TABLE IF NOT EXISTS Currency(USD text, JPY text, CAD text, GBP text, NZD text, INR text, Date_of_rate date)'''
        )
    except Exception as exc:
        # A bare except would also swallow KeyboardInterrupt; report the cause.
        print("SQLite connection failed: %s" % exc)
Example #3
from airflow.providers.sqlite.hooks.sqlite import SqliteHook

def get_num_active_dagruns(dag_id, conn_id='sqlite_default'):
    # if you've opted for a different backend for airflow, you will need to
    # refactor the two lines below. For a Postgres example, please refer to
    # https://github.com/Nextdoor/airflow_examples/blob/master/dags/util.py#L8
    airflow_db = SqliteHook(sqlite_conn_id=conn_id)
    conn = airflow_db.get_conn()
    cursor = conn.cursor()
    # Bind dag_id as a parameter instead of interpolating it into the SQL,
    # which would be open to injection.
    sql = """
         select count(*)
         from dag_run
         where dag_id = ?
         and state in ('running', 'queued', 'up_for_retry')
          """
    cursor.execute(sql, (dag_id,))
    num_active_dagruns = cursor.fetchone()[0]
    return num_active_dagruns
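A helper like this is typically used to throttle a DAG against itself. A hypothetical guard callable built on it (the threshold and exception choice are assumptions):

from airflow.exceptions import AirflowException

def ensure_single_active_run(dag_id, max_active=1, **context):
    # Fail the task (so Airflow can retry it later) while other runs are active.
    if get_num_active_dagruns(dag_id) > max_active:
        raise AirflowException('%s already has an active run' % dag_id)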
Example #4
from airflow.providers.sqlite.hooks.sqlite import SqliteHook

def predict(classifier, **context):
    """
    Makes predictions with the model trained by the given classifier and
    stores the results in the mct_talks table.
    """
    # Load model
    model = _load_model(classifier.__name__)

    # Load data
    db = SqliteHook()
    df = db.get_pandas_df('select * from mct_talks')

    # Make predictions
    df['Conference'] = model.predict(df['Title'].tolist())

    # Save predictions, overwriting mct_talks with the labelled rows
    with db.get_conn() as conn:
        df.to_sql('mct_talks', con=conn, if_exists='replace')
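The _load_model helper is not shown in this listing; a plausible pickle-based counterpart, assuming the training task saved the model under the classifier's name (path and format are guesses):

import os
import pickle

def _load_model(name):
    # Hypothetical: mirrors a _save_model step elsewhere in the pipeline.
    model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'models', name + '.pkl')
    with open(model_path, 'rb') as f:
        return pickle.load(f)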
Example #5
from airflow.providers.sqlite.hooks.sqlite import SqliteHook
from sklearn.model_selection import train_test_split

def split_data(**context):
    """
    Splits the sample data (i.e. research_papers) into training and test sets
    and stores them in the SQLite DB.
    """
    # Load full dataset
    db = SqliteHook()
    df = db.get_pandas_df('select * from research_papers')

    # Create a stratified train/test split over the row index, preserving
    # the class balance of 'Conference' in both sets
    train, _ = train_test_split(df.index,
                                test_size=0.33,
                                stratify=df['Conference'],
                                random_state=42)

    # Save training and test data in separate tables
    with db.get_conn() as conn:
        df.iloc[train].to_sql('training_data', con=conn, if_exists='replace')
        df.drop(train).to_sql('test_data', con=conn, if_exists='replace')
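The fixed random_state makes the split reproducible across DAG runs, and writing the test rows to their own table keeps downstream evaluation tasks from touching training data. Note that df.iloc[train] relies on the default RangeIndex, where positional and label-based indexing coincide.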
Example #6
def createdb():
    from airflow.providers.sqlite.hooks.sqlite import SqliteHook

    dest = SqliteHook(sqlite_conn_id='sqlite2')
    dest_conn = dest.get_conn()
    dest_conn.execute(
        '''CREATE TABLE IF NOT EXISTS ORDERS(OrderId text, DocumentNo text, OrderDate text, CatalogId text)'''
    )
    dest_conn.execute(
        '''CREATE TABLE IF NOT EXISTS CUSTOMERS(DocumentNo text, FullName text, Device text, Country text)'''
    )
    dest_conn.execute(
        '''CREATE TABLE IF NOT EXISTS CATALOG(CatalogId text, ProductId text, CUID text)'''
    )
    dest_conn.execute(
        '''CREATE TABLE IF NOT EXISTS PRODUCTS(ProductId text, ProductName text, CUID text)'''
    )
    dest_conn.execute(
        '''CREATE TABLE IF NOT EXISTS LEADS(Id text, CustomerId text)''')
    dest_conn.execute(
        '''CREATE TABLE IF NOT EXISTS LOGS(ClientIP text, UserName text, Time text)'''
    )
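Since none of these tables declare a primary or unique key, the INSERT OR REPLACE statements in Example #7 never find a conflict to replace and behave like plain INSERTs; declaring, say, OrderId as PRIMARY KEY would make the upsert semantics take effect.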
Example #7
def updatedb():
    import os
    import re
    import pandas as pd
    from airflow.providers.sqlite.hooks.sqlite import SqliteHook

    # Read the extract file and the marketing leads workbook
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    extract_file = pd.read_csv(os.path.join(BASE_DIR, "SourceData.txt"))

    lead_file_name = os.path.join(BASE_DIR, "Marketing_Lead.xlsx")
    LeadFile = pd.read_excel(lead_file_name)

    # Parse the access log into (client IP, quoted user name, [timestamp])
    # tuples. The alternatives match quoted names of two to four words,
    # with or without embedded or trailing periods.
    userregex = "|".join([
        r'(\"[^\d\W]+[^\S][^\d\W]+\")',
        r'(\"[^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+\")',
        r'(\"[^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+\")',
        r'(\"[^\d\W]+\.[^\S][^\d\W]+[^\S][^\d\W]+\")',
        r'(\"[^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+\.\")',
        r'(\"[^\d\W]+\.[^\S][^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+\.\")',
        r'(\"[^\d\W]+\.[^\S][^\d\W]+[^\S][^\d\W]+[^\S][^\d\W]+\")'
    ])
    log_file_name = os.path.join(BASE_DIR, "logFile.log")
    Linelist = []
    with open(log_file_name, "r") as logfile:
        for line in logfile:
            line = line.strip()
            matchHost = re.search(r'([(\d\.)]+)', line).group(0)
            matchUser = re.search(userregex, line).group(0)
            matchTime = re.search(r'(?P<time>\[.*?\])', line).group(0)
            Linelist.append((matchHost, matchUser, matchTime))
    LogfileDF = pd.DataFrame(Linelist,
                             columns=["ClientIP", "UserName", "Time"])

    LeadFileDF = LeadFile[["Id", "Company_Id"]]
    OrdersDF = extract_file[[
        "OrderId", "DocumentNo", "OrderDate", "CatalogId"
    ]]
    CustomersDF = extract_file[["DocumentNo", "FullName", "Device", "Country"]]
    # "CUID.1" is the second CUID column as auto-renamed by read_csv
    ProductsDF = extract_file[["ProductId", "ProductName", "CUID.1"]]
    CatalogDF = extract_file[["CatalogId", "ProductId", "CUID"]]

    # TODO: add update logic so that only new customers and new catalog
    # entries are written; for now every table is loaded with INSERT OR REPLACE.
    dest = SqliteHook(sqlite_conn_id='sqlite2')
    dest_conn = dest.get_conn()
    sqlOrders = 'INSERT OR REPLACE INTO ORDERS(OrderId, DocumentNo, OrderDate, CatalogId) VALUES (?, ?, ?, ?)'
    dataOrders = OrdersDF.values
    dest_conn.executemany(sqlOrders, dataOrders)
    dest_conn.commit()

    sqlCustomers = 'INSERT OR REPLACE INTO CUSTOMERS(DocumentNo, FullName, Device, Country) VALUES (?, ?, ?, ?)'
    dataCust = CustomersDF.values
    dest_conn.executemany(sqlCustomers, dataCust)
    dest_conn.commit()

    sqlProducts = 'INSERT OR REPLACE INTO PRODUCTS(ProductId, ProductName, CUID) VALUES (?, ?, ?)'
    dataProducts = ProductsDF.values
    dest_conn.executemany(sqlProducts, dataProducts)
    dest_conn.commit()

    sqlCatalog = 'INSERT OR REPLACE INTO CATALOG(CatalogId, ProductId, CUID) VALUES (?, ?, ?)'
    dataCatalog = CatalogDF.values
    dest_conn.executemany(sqlCatalog, dataCatalog)
    dest_conn.commit()

    sqlLogs = 'INSERT OR REPLACE INTO LOGS(ClientIP, UserName, Time) VALUES (?, ?, ?)'
    dataLog = LogfileDF.values
    dest_conn.executemany(sqlLogs, dataLog)
    dest_conn.commit()

    sqlLead = 'INSERT OR REPLACE INTO LEADS(Id, CustomerId) VALUES (?, ?)'
    dataLeadFile = LeadFileDF.values
    dest_conn.executemany(sqlLead, dataLeadFile)
    dest_conn.commit()
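The log parsing above assumes an Apache-style access log in which the client IP starts the line, the user's name appears in double quotes, and the timestamp sits in square brackets. A line missing any of these raises AttributeError, because re.search returns None and .group(0) is called on it unconditionally.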