Example #1
def process_files_district(dno_files):
    # Expected columns in every district DNO workbook.
    target_cols = [
        "RDate", "Schedule", "Route", "fRun", "DNO", "DNOLost", "DNC",
        "DNCLost", "Type", "Prob Location", "Prob Description", "Base",
        "Miles"
    ]
    parse_dates = ["RDate"]
    for file in dno_files:
        try:
            # usecols doubles as column validation: a missing column raises
            # and routes the file to failed_files below.
            df = pd.read_excel(s3_bucket_prefix + file,
                               usecols=target_cols,
                               parse_dates=parse_dates)
            df['Source'] = "District"
        except Exception as e:
            failed_files.append({
                'file_name': file,
                'reason': eC.INVALID_FILE,
                'time': datetime.datetime.now()
            })
            log_msg, sns_msg = util.get_log_template(
                dag=dag,
                task=task,
                status=sC.RUNNING,
                level=lC.ERROR,
                error=repr(e),
                remarks='Column validation failed for DNO district file ' + file,
                reason=eC.INVALID_FILE,
                sns=True)
            logger.critical(log_msg)
            sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
            continue
        try:
            write_to_s3(df, file)
        except Exception as e:
            failed_files.append({
                'file_name': file,
                'reason': eC.S3_WRITE_FAILED,
                'time': datetime.datetime.now()
            })
            log_msg, sns_msg = util.get_log_template(
                dag=dag,
                task=task,
                status=sC.FAILED,
                level=lC.ERROR,
                error=repr(e),
                remarks='S3 write failed for ' + file,
                reason=eC.S3_WRITE_FAILED,
                sns=True)
            logger.critical(log_msg)
            sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
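
write_to_s3 is defined elsewhere in this module. A minimal sketch of what it might look like, assuming pandas with s3fs installed (so to_csv can write to s3:// paths directly) and a hypothetical District/ prefix under the processing_csv_path used in Example #3:

def write_to_s3(df, file):
    # Hypothetical sketch only: name the CSV after the source workbook and
    # write it under an assumed District/ processing prefix.
    filename = (file.split("/")[-1]).split(".")[-2]
    target_path_csv = processing_csv_path + 'District' + '/' + filename + '.csv'
    df.to_csv(s3_bucket_prefix + target_path_csv, index=False)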
Example #2
def process_dno_cub(cub_files, month=None):
    res_list = []
    for file in cub_files:
        try:
            # Load the workbook once, then split and clean it per sheet.
            data = pd.ExcelFile(s3_bucket_prefix + file)
            sheet_df_mapping = get_df_by_sheet_mapping(data, month)
            new_mapping = parse_sheets(sheet_df_mapping)
            res_list.append(new_mapping)
            # Only write out workbooks that yielded at least one sheet.
            if new_mapping:
                write_dno_cub_to_s3(new_mapping, file)
        except Exception as e:
            failed_files.append({
                'file_name': file,
                'reason': eC.CSV_PROCESSING_FAILED,
                'time': datetime.datetime.now()
            })
            log_msg, sns_msg = util.get_log_template(
                dag=dag,
                task=task,
                status=sC.RUNNING,
                level=lC.ERROR,
                error=repr(e),
                remarks='CUB CSV processing failed for ' + file,
                reason=eC.CSV_PROCESSING_FAILED,
                sns=True)
            logger.critical(log_msg)
            sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
    return res_list
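
get_df_by_sheet_mapping is also defined elsewhere; a plausible sketch, assuming it keys each raw sheet's DataFrame by sheet name and, when a month is given, keeps only sheets whose name mentions it:

def get_df_by_sheet_mapping(data, month=None):
    # Hypothetical sketch: parse every sheet of the pd.ExcelFile. Sheets
    # with a blank first row come back with "Unnamed: N" columns, which
    # parse_sheets renames afterwards.
    mapping = {}
    for sheet in data.sheet_names:
        if month is None or str(month) in sheet:
            mapping[sheet] = data.parse(sheet)
    return mapping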
Example #3
def write_dno_cub_to_s3(new_mapping, file):
    # Strip the directory and extension from the source workbook name.
    filename = (file.split("/")[-1]).split(".")[-2]
    for date in new_mapping:
        try:
            # One CSV per sheet, partitioned by the sheet date's year/month.
            month = date.strftime('%b')
            year = date.year
            target_path_csv = (processing_csv_path + 'CUB' + '/' + str(year) +
                               '/' + str(month) + '/' + 'CUB_' + str(year) +
                               '_' + str(month) + '_' + filename + '.csv')
            df = new_mapping[date]
            df.to_csv(s3_bucket_prefix + target_path_csv, index=False)
        except Exception as e:
            log_msg, sns_msg = util.get_log_template(
                dag=dag,
                task=task,
                status=sC.RUNNING,
                level=lC.ERROR,
                error=repr(e),
                remarks='CUB CSV write from sheet failed for ' + str(date),
                reason=eC.CSV_PROCESSING_FAILED,
                sns=True)
            logger.critical(log_msg)
            sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
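
A quick usage sketch, assuming parse_sheets produced a mapping from each sheet's first date to its cleaned frame (all values here are placeholders):

import datetime
import pandas as pd

sample = {
    datetime.datetime(2020, 1, 6): pd.DataFrame({
        "date": [datetime.datetime(2020, 1, 6)],
        "run": [1],
        "schedule_number": ["1010A"],
        "dno_miles": [12.5],
        "dnc_miles": [0.0],
        "dno": [True],
        "dnc": [False],
    }),
}
write_dno_cub_to_s3(sample, "incoming/xls/CUB_2020.xlsx")
# -> <prefix>/CUB/2020/Jan/CUB_2020_Jan_CUB_2020.csv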
Example #4
def check_input_ready(data):
    # Validate that the config exposes the source paths and the SFTP
    # credentials before any remote work starts.
    inputs = {}
    try:
        inputs["source_sf_ud"] = data["paths"]["clipper_sftp_sf_ud"]
        inputs["source_sys_cd"] = data["paths"]["clipper_sftp_sys_cd"]
    except Exception as e:
        log_msg, sns_msg = util.get_log_template(dag=dag,
                                                 task=task,
                                                 status=sC.FAILED,
                                                 level=lC.CRITICAL,
                                                 error=repr(e),
                                                 remarks=None,
                                                 reason=eC.CONFIG_ERROR,
                                                 exit_code=1,
                                                 sns=True)
        logger.error(log_msg)
        sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
        sys.exit(1)

    try:
        inputs["host"] = data['sftp']['host']
        inputs["user"] = data['sftp']['user']
        inputs["sshk"] = data['sftp']['sshk']
        return inputs
    except Exception as e:
        log_msg, sns_msg = util.get_log_template(dag=dag,
                                                 task=task,
                                                 status=sC.FAILED,
                                                 level=lC.CRITICAL,
                                                 error=repr(e),
                                                 remarks=None,
                                                 reason=eC.NO_SFTP_CREDENTIALS,
                                                 exit_code=1,
                                                 sns=True)
        logger.error(log_msg)
        sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
        sys.exit(1)
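
For context, a minimal sketch of the config dict this function validates; the key names come from the lookups above, the values are placeholders:

data = {
    "paths": {
        "clipper_sftp_sf_ud": "/remote/path/sf_ud",    # placeholder
        "clipper_sftp_sys_cd": "/remote/path/sys_cd",  # placeholder
    },
    "sftp": {
        "host": "sftp.example.com",  # placeholder
        "user": "clipper",           # placeholder
        "sshk": "/keys/id_rsa",      # placeholder
    },
}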
Example #5
def get_sftp_connection(host, user, sshk):
    try:
        sshcon = paramiko.SSHClient()
        # Auto-accept unknown host keys so a missing known_hosts entry
        # does not abort the run.
        sshcon.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        # Key-based auth: no password needed.
        sshcon.connect(host, username=user, key_filename=sshk)
        logger.info('SUCCEEDED:CONNECTION to SFTP')
        return sshcon
    except Exception as e:
        log_msg, sns_msg = util.get_log_template(dag=dag,
                                                 task=task,
                                                 status=sC.FAILED,
                                                 level=lC.CRITICAL,
                                                 error=repr(e),
                                                 remarks=None,
                                                 reason=eC.SFTP_CONN_ERROR,
                                                 exit_code=1,
                                                 sns=True)
        logger.error(log_msg)
        sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
        sys.exit(1)
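
A usage sketch: the returned object is a paramiko SSHClient, so later code runs shell commands on the remote host through it. Note that AutoAddPolicy trades host-key verification for convenience, which is worth flagging for production use. Placeholder values throughout:

sshcon = get_sftp_connection("sftp.example.com", "clipper", "/keys/id_rsa")
stdin, stdout, stderr = sshcon.exec_command("ls /remote/path/sf_ud")
print(stdout.read().decode("ascii"))
sshcon.close()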
Example #6
def main():
    inputs = check_input_ready(data)
    copy_cmd_ud = ('aws s3 cp %s %s --recursive --exclude "newfiles"') % (
        inputs["source_sf_ud"], destination_sf_ud)
    copy_cmd_cd = ('aws s3 cp %s %s --recursive --exclude "newfiles"') % (
        inputs["source_sys_cd"], destination_sys_cd)
    list_file_cmd_ud = ('ls {}').format(inputs["source_sf_ud"])
    list_file_cmd_cd = ('ls {}').format(inputs["source_sys_cd"])
    sshcon = get_sftp_connection(inputs["host"], inputs["user"],
                                 inputs["sshk"])
    try:
        # Read each listing before issuing the next command; reusing the
        # stdout handle would silently drop the first directory's files.
        stdin, stdout, stderr = sshcon.exec_command(list_file_cmd_ud)
        output_ud = stdout.read().decode('ascii')
        stdin, stdout, stderr = sshcon.exec_command(list_file_cmd_cd)
        output_cd = stdout.read().decode('ascii')
        input_files = parse_bash_out(output_ud) + parse_bash_out(output_cd)
        msg = util.get_log_template(dag=dag,
                                    task=task,
                                    status=sC.RUNNING,
                                    level=lC.INFO,
                                    remarks='Bash List cmd Executed')
        logger.info(msg)
    except Exception as e:
        log_msg, sns_msg = util.get_log_template(
            dag=dag,
            task=task,
            status=sC.FAILED,
            level=lC.CRITICAL,
            error=repr(e),
            remarks='SFTP file listing failed',
            reason=eC.BASH_ERROR,
            exit_code=1,
            sns=True)
        logger.error(log_msg)
        sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
        sys.exit(1)

    if len(input_files) < 1:
        log_msg, sns_msg = util.get_log_template(dag=dag,
                                                 task=task,
                                                 status=sC.FINISHED,
                                                 level=lC.WARNING,
                                                 reason=eC.NO_FILES_FOUND,
                                                 remarks='No Files to Process',
                                                 exit_code=1,
                                                 sns=True)
        logger.warning(log_msg)
        sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
        sys.exit(1)

    else:
        try:
            # Run both copy commands, reading stderr from each before the
            # handle is replaced by the next command.
            stdin, stdout, stderr = sshcon.exec_command(copy_cmd_ud)
            stderr_ud = stderr.read()
            stdin, stdout, stderr = sshcon.exec_command(copy_cmd_cd)
            stderr_cd = stderr.read()
            stderr_out = stderr_ud + stderr_cd
            if len(stderr_out) == 0:
                log_msg, sns_msg = util.get_log_template(
                    dag=dag,
                    task=task,
                    status=sC.FINISHED,
                    level=lC.INFO,
                    remarks='SUCCESS-COPY SFTP to S3',
                    sns=True)
                logger.info(log_msg)
                sns.publish(TopicArn=topicarn,
                            Subject=subject,
                            Message=sns_msg)
            else:
                log_msg, sns_msg = util.get_log_template(
                    dag=dag,
                    task=task,
                    status=sC.FAILED,
                    level=lC.CRITICAL,
                    error=repr(stderr_out),
                    remarks='Copy Command sftp to s3 failed',
                    reason=eC.S3_COPY_FAILED,
                    exit_code=1,
                    sns=True)
                logger.error(log_msg)
                sns.publish(TopicArn=topicarn,
                            Subject=subject,
                            Message=sns_msg)
                sys.exit(1)
        except Exception as e:
            log_msg, sns_msg = util.get_log_template(
                dag=dag,
                task=task,
                status=sC.FAILED,
                level=lC.CRITICAL,
                error=repr(e),
                remarks='Copy Command sftp to s3 failed',
                reason=eC.S3_COPY_FAILED,
                exit_code=1,
                sns=True)
            logger.error(log_msg)
            sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
            sys.exit(1)
    log_msg = util.get_log_template(dag=dag,
                                    task=task,
                                    status=sC.FINISHED,
                                    level=lC.INFO)
    logger.info(log_msg)
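
parse_bash_out is assumed to turn the raw ls output into a list of filenames; a minimal sketch:

def parse_bash_out(output):
    # Hypothetical sketch: one entry per line, blanks dropped.
    return [line.strip() for line in output.splitlines() if line.strip()]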
Example #7
def main():
    all_files = []
    try:
        all_files = util.get_s3_keys_by_prefix(bucket, incoming_xls_prefix)
    except Exception as e:
        log_msg, sns_msg = util.get_log_template(
            dag=dag,
            task=task,
            status=sC.FAILED,
            level=lC.ERROR,
            error=repr(e),
            remarks='In S3 Key fetch -  all_files',
            reason=eC.S3_CONNECTION_ERROR,
            sns=True)
        logger.error(log_msg)
        sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
        sys.exit(1)

    if len(all_files) > 1:
        try:
            dno_dist_files = get_dno_dist_files(all_files)
            dno_cub_files = get_dno_cub_files(all_files)
        except Exception as e:
            log_msg, sns_msg = util.get_log_template(
                dag=dag,
                task=task,
                status=sC.FAILED,
                level=lC.WARNING,
                remarks='ERROR IN getting cub/district file',
                exit_code=1,
                reason=eC.ERROR_IN_PARSING_FILENAME,
                sns=True)
            logger.error(log_msg)
            sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
            sys.exit(1)

        try:
            # Anything that is neither a CUB nor a district file is noise;
            # delete it so it is not picked up again on the next run.
            useful_files = dno_cub_files + dno_dist_files
            bad_files = set(all_files) - set(useful_files)
            util.delete_file_list(bucket, list(bad_files))
        except Exception as e:
            log_msg, sns_msg = util.get_log_template(
                dag=dag,
                task=task,
                status=sC.FAILED,
                level=lC.WARNING,
                remarks='ERROR in removing unwanted files',
                reason=eC.S3_DELETE_FAILED,
                sns=True)
            logger.error(log_msg)
            sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
            sys.exit(1)

        if len(dno_dist_files) < 1:
            log_msg, sns_msg = util.get_log_template(
                dag=dag,
                task=task,
                status=sC.RUNNING,
                level=lC.CRITICAL,
                remarks='DNO District Files to be processed missing in incoming',
                reason=eC.NO_FILES_FOUND,
                sns=True)
            logger.error(log_msg)
            sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)

        else:
            try:
                process_files_district(dno_dist_files)
            except Exception as e:
                log_msg, sns_msg = util.get_log_template(
                    dag=dag,
                    task=task,
                    status=sC.RUNNING,
                    level=lC.CRITICAL,
                    error=repr(e),
                    remarks='ERROR in process_files_district',
                    reason=eC.CSV_PROCESSING_FAILED,
                    sns=True)
                logger.error(log_msg)
                sns.publish(TopicArn=topicarn,
                            Subject=subject,
                            Message=sns_msg)
        if len(dno_cub_files) < 1:
            log_msg, sns_msg = util.get_log_template(
                dag=dag,
                task=task,
                status=sC.RUNNING,
                level=lC.CRITICAL,
                remarks='DNO CUB Files to be processed missing in incoming',
                reason=eC.NO_FILES_FOUND,
                sns=True)
            logger.error(log_msg)
            sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
        else:
            try:
                # An optional cub_date_selection narrows processing to a
                # single year/month; otherwise every CUB file is processed.
                if len(cub_date_selection) > 0:
                    year = cub_date_selection[0]['year']
                    month = cub_date_selection[0]['month']
                    for file in dno_cub_files:
                        if str(year) in file:
                            process_dno_cub([file], month)
                else:
                    logger.info("Processing all DNO CUB files")
                    process_dno_cub(dno_cub_files)
            except Exception as e:
                log_msg, sns_msg = util.get_log_template(
                    dag=dag,
                    task=task,
                    status=sC.RUNNING,
                    level=lC.CRITICAL,
                    error=repr(e),
                    remarks='ERROR in process_dno_cub',
                    reason=eC.CSV_PROCESSING_FAILED,
                    sns=True)
                logger.error(log_msg)
                sns.publish(TopicArn=topicarn,
                            Subject=subject,
                            Message=sns_msg)

        if failed_files:
            logger.info("Moving failed files")
            move_failed_files(failed_files)

    else:
        log_msg, sns_msg = util.get_log_template(
            dag=dag,
            task=task,
            status=sC.FINISHED,
            level=lC.WARNING,
            remarks='Files to be processed missing in incoming',
            reason=eC.NO_FILES_FOUND,
            sns=True)
        logger.warning(log_msg)
        sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
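
get_dno_dist_files and get_dno_cub_files are defined elsewhere; a plausible sketch, assuming each classifies the S3 keys by a marker in the filename:

def get_dno_dist_files(all_files):
    # Hypothetical: district workbooks carry "district" in the filename.
    return [f for f in all_files if 'district' in f.split('/')[-1].lower()]

def get_dno_cub_files(all_files):
    # Hypothetical: CUB workbooks carry "cub" in the filename.
    return [f for f in all_files if 'cub' in f.split('/')[-1].lower()]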
Example #8
def parse_sheets(sheet_dict):
    # Raw sheets arrive with blank headers, i.e. columns named
    # "Unnamed: 0" through "Unnamed: 9"; get_col_map renames them below.
    new_dic = {}
    dno_dnc_cols = [
        "date", "run", "schedule_number", "dno_miles", "dnc_miles", "dno",
        "dnc", "prob_location"
    ]
    for key in sheet_dict.keys():
        try:
            # Skip empty sheets, plus the explicitly excluded "Sept 2017"
            # sheet.
            if sheet_dict[key].empty or key == "Sept 2017":
                log_msg, sns_msg = util.get_log_template(
                    dag=dag,
                    task=task,
                    status=sC.RUNNING,
                    level=lC.WARNING,
                    remarks='Empty or skipped sheet: ' + key,
                    reason=eC.CSV_PROCESSING_FAILED,
                    sns=True)
                logger.warning(log_msg)
                sns.publish(TopicArn=topicarn,
                            Subject=subject,
                            Message=sns_msg)
            else:
                df = sheet_dict[key]
                # get_col_map maps raw "Unnamed: N" names to canonical
                # names, so the renamed columns are col_map's values.
                col_map = get_col_map(df)
                new_df = df.rename(columns=col_map)
                if "prob_location" not in col_map.values():
                    dno_dnc_col = [
                        "date", "run", "schedule_number", "dno_miles",
                        "dnc_miles", "dno", "dnc"
                    ]
                    new_df = new_df[dno_dnc_col]
                else:
                    new_df = new_df[dno_dnc_cols]
                # Keep only rows whose date cell strictly parses as a date;
                # .copy() avoids pandas SettingWithCopyWarning on the
                # assignments below.
                final_df = new_df[new_df['date'].notna()]
                final_df = final_df[final_df['date'].apply(
                    lambda x: isinstance(
                        dateparser.parse(str(x),
                                         settings={'STRICT_PARSING': True}),
                        datetime.datetime))].copy()
                final_df["source"] = "CUB"
                # The route is the first three characters of the schedule
                # number, when one is present.
                final_df['route'] = final_df['schedule_number'].astype(
                    str).apply(lambda s: s[:3] if len(s) > 3 else " ")
                # Miles come from whichever of the DNC/DNO columns applies.
                final_df['miles'] = final_df.apply(
                    lambda row: row.dnc_miles if row.dnc == 1 else
                    (row.dno_miles if row.dno == 1 else " "),
                    axis=1)
                # Normalise the 0/1 flags to booleans.
                final_df['dno'] = final_df['dno'] == 1
                final_df['dnc'] = final_df['dnc'] == 1
                # Each sheet is keyed by its first date.
                date = final_df['date'].iloc[0]
                new_dic[date] = final_df
        except Exception as e:
            log_msg, sns_msg = util.get_log_template(
                dag=dag,
                task=task,
                status=sC.FAILED,
                level=lC.ERROR,
                error=repr(e),
                remarks='Sheet parsing failed for CUB: ' + key,
                reason=eC.INVALID_FILE,
                sns=True)
            logger.critical(log_msg)
            sns.publish(TopicArn=topicarn, Subject=subject, Message=sns_msg)
    return new_dic
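
get_col_map is defined elsewhere; a plausible sketch, assuming the real header text sits in the first body row of each sheet and each raw "Unnamed: N" column is matched to a canonical name by keyword:

def get_col_map(df):
    # Hypothetical sketch only: match first-row labels to canonical names.
    # More specific keywords come first so "dno miles" is not claimed by
    # the bare "dno" flag column.
    keywords = [
        ('dno mile', 'dno_miles'),
        ('dnc mile', 'dnc_miles'),
        ('sched', 'schedule_number'),
        ('location', 'prob_location'),
        ('date', 'date'),
        ('run', 'run'),
        ('dno', 'dno'),
        ('dnc', 'dnc'),
    ]
    col_map = {}
    for col, label in df.iloc[0].items():
        text = str(label).strip().lower()
        for kw, canonical in keywords:
            if kw in text and canonical not in col_map.values():
                col_map[col] = canonical
                break
    return col_map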