import logging
from datetime import datetime

from pyspark.sql import SQLContext
import pyspark.sql.functions as F

# Project-specific helpers (S3FilesDsl, ElasticDsl, FastDsl, Utils, Constants,
# AdminNumberTags, OperatingTags, Customer, EndCustomer, toElastic, addId,
# joinMasterEntities, getReportedSource, getOperationalManager) are assumed to
# be imported from their own modules elsewhere in the project.


def persistRelations(esIndexRel, s3confPath, spark):
    sqlContext = SQLContext(spark)
    relationsDF = esIndexRel.select("ticket_id", "related_ticket_id")
    relationsPath = S3FilesDsl.readConfigJson(s3confPath).rod_relations_parquet_path
    total = None
    try:
        total = sqlContext.read.parquet(relationsPath)
    except Exception as e:
        message = str(e)
        if "Path does not exist" not in message:
            raise e
        else:
            logging.info("caught Path does not exist (first job execution): " + str(e))
            relationsDF \
                .repartition(1) \
                .write \
                .mode("overwrite") \
                .parquet(relationsPath)
            total = sqlContext.read.parquet(relationsPath)
    total.cache()
    logging.info("total.count---------------------------------------------" + str(total.count()))
    result = total \
        .unionByName(relationsDF) \
        .distinct() \
        .repartition(1) \
        .write \
        .mode("overwrite") \
        .parquet(relationsPath)
    return result
def removeClosedAgentSmc(esIndex, s3confPath, spark):
    sqlContext = SQLContext(spark)
    closed = esIndex.select("ticket_id", "assigned_agent", "smc_cluster", "reported_source_id").distinct()
    auxOuterJoin = closed.select("ticket_id")
    agentSmcPath = S3FilesDsl.readConfigJson(s3confPath).rod_agent_smc_parquet_path
    total = sqlContext.read.parquet(agentSmcPath)
    total.cache()
    logging.info("total.count---------------------------------------------" + str(total.count()))
    # Anti-join: keep only the tickets that are not in the closed set
    totalWithoutClosed = total \
        .join(auxOuterJoin, ["ticket_id"], "left") \
        .where(auxOuterJoin["ticket_id"].isNull())
    result = totalWithoutClosed \
        .repartition(1) \
        .write \
        .mode("overwrite") \
        .parquet(agentSmcPath)
    logging.info(
        "totalWithoutClosed.count---------------------------------------------" + str(totalWithoutClosed.count()))
    return result
def writeESCorruptRecordsIndex(index, name, conf):
    prefix = S3FilesDsl.readConfigJson(conf).elastic_env_index_prefix
    config = {
        "elastic_nodes": "127.0.0.1",
        "elastic_port": "9200",
        "elastic_user": "******",
        "elastic_pass": "******"
    }
    toElastic(config, index, addId, prefix + name + datetime.now().strftime("%Y"))
    # index.write.format(
def fullPersistAgentSmc(esIndex, s3confPath):
    preload = esIndex.select("ticket_id", "assigned_agent", "smc_cluster", "reported_source_id")
    result = preload \
        .repartition(1) \
        .write \
        .mode("overwrite") \
        .parquet(S3FilesDsl.readConfigJson(s3confPath).rod_agent_smc_parquet_path)
    logging.info("total.count---------------------------------------------" + str(preload.count()))
    return result
def persistAgentSmc(esIndex, s3confPath, spark):
    sqlContext = SQLContext(spark)
    newOrUpdated = esIndex.select("ticket_id", "assigned_agent", "smc_cluster", "reported_source_id").distinct()
    auxOuterJoin = newOrUpdated.select("ticket_id")
    agentSmcPath = S3FilesDsl.readConfigJson(s3confPath).rod_agent_smc_parquet_path
    total = None
    try:
        total = sqlContext.read.parquet(agentSmcPath)
    except Exception as e:
        message = str(e)
        if "Path does not exist" not in message:
            raise e
        else:
            logging.info("caught Path does not exist (first job execution): " + str(e))
            newOrUpdated \
                .repartition(1) \
                .write \
                .mode("overwrite") \
                .parquet(agentSmcPath)
            total = sqlContext.read.parquet(agentSmcPath)
    total.cache()
    logging.info("total.count---------------------------------------------" + str(total.count()))
    # Drop the tickets being re-ingested, then append their fresh rows
    totalWithoutNewOrUpdated = total \
        .join(auxOuterJoin, ["ticket_id"], "left") \
        .where(auxOuterJoin["ticket_id"].isNull())
    updatedToParquet = totalWithoutNewOrUpdated.unionByName(newOrUpdated)
    result = updatedToParquet \
        .repartition(1) \
        .write \
        .mode("overwrite") \
        .parquet(agentSmcPath)
    return result
def getRelations(esIndex, s3confPath, spark):
    sqlContext = SQLContext(spark)
    parquet = sqlContext.read.parquet(S3FilesDsl.readConfigJson(s3confPath).rod_agent_smc_parquet_path)
    agents = parquet \
        .filter(parquet.reported_source_id != "Vendor") \
        .withColumnRenamed("ticket_id", "agent_ticket_id")
    relations = sqlContext.read.parquet(S3FilesDsl.readConfigJson(s3confPath).rod_relations_parquet_path)
    relatedAgents = relations \
        .join(agents, relations.related_ticket_id == agents.agent_ticket_id, "inner") \
        .groupBy("ticket_id") \
        .agg(F.collect_set("assigned_agent").alias("assignee"),
             F.collect_set("smc_cluster").alias("smc"))
    result = esIndex \
        .join(relatedAgents, ["ticket_id"], "left") \
        .withColumn("assignee", Utils.addToArray("assigned_agent", "assignee")) \
        .withColumn("smc", Utils.addToArray("smc_cluster", "smc")) \
        .drop("assigned_agent", "smc_cluster")
    return result
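# Illustrative sketch (not part of the original module): one plausible way the
# persistence helpers above could be chained in an incremental run. The
# function name and the s3 config path value are assumptions.
def _exampleIncrementalPersistence(esIndex, esIndexRel, spark):
    s3confPath = "s3://my-bucket/conf/rod.json"  # hypothetical config location
    # Refresh the agent/SMC snapshot with the newly ingested tickets,
    # then add the new ticket relations to their parquet store.
    persistAgentSmc(esIndex, s3confPath, spark)
    persistRelations(esIndexRel, s3confPath, spark)
    # Enrich the index with agents and SMC clusters collected from related tickets.
    return getRelations(esIndex, s3confPath, spark)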
def checkCount(indexName, fileName, dfCount, spark, conf):
    prefix = S3FilesDsl.readConfigJson(conf).elastic_env_index_prefix
    sqlContext = SQLContext(spark)
    logging.info('dfCount.. ' + str(dfCount))
    path = fileName.replace(':', '\\:').replace("/", "\\/")
    qResultDF1 = sqlContext.read \
        .option("es.resource", prefix + indexName) \
        .option("es.query", "?q=file:\"" + path + "\"") \
        .format("org.elasticsearch.spark.sql") \
        .load()
    # Equivalent of qResultDF = spark.esDF("${indexName}", "?q=file:\"" + path + "\"").select("ticket_id") ?
    qResultDF = qResultDF1.select("ticket_id")
    qResultDF.cache()
    queryCount = qResultDF.count()
    qResultDF.unpersist()
    logging.info("queryCount.. " + str(queryCount))
    if dfCount != queryCount:
        alertDataFrame = sqlContext.createDataFrame(
            [(fileName, dfCount, queryCount, datetime.now().strftime("%Y%m%d%H%M%S"))],
            ["file", "expected_count", "result_count", "date"])
        ElasticDsl.writeESAlertsIndex(alertDataFrame, conf)
def buildESIndex(detailType, detail, s3confPath, s3filePath, spark):
    sqlContext = SQLContext(spark)
    # TODO: .conf instead of .json
    confJson = S3FilesDsl.readConfigJson(s3confPath)
    rodTicketANTags = AdminNumberTags.antagsColumns(
        S3FilesDsl.readFile(confJson.tags_admin_path, spark), spark)
    parquetPath = confJson.fast_parquet_path
    rodPostgreAdminNumber = sqlContext.read.parquet(parquetPath)
    logging.info("FAST joins..")
    networkFast = sqlContext.read.parquet(confJson.fast_network_parquet_path)
    logging.info("common joins..")
    # TODO: add import of utils.constants
    # TODO: check the parameters passed to the Utils methods
    common3 = joinMasterEntities(detail, spark)
    common2 = common3.join(rodPostgreAdminNumber, ["admin_number"], "left")
    common1 = Utils.fillEmptyFastColumns(common2)
    common = common1.join(networkFast, ["admin_number"], "left") \
        .withColumn("networkinfo", Utils.networkNestedObject("fast_customer", "fast_end_customer",
                                                             "router_interface_vendor_type_set")) \
        .drop("router_interface_vendor_type_set") \
        .join(rodTicketANTags, ["admin_number"], "left") \
        .withColumn("open", F.when(common1.status_desc.isin(Constants.openStatus), Constants.OPEN_YES)
                    .otherwise(F.when(common1.status_desc.isin(Constants.notOpenStatus), Constants.OPEN_NO)
                               .otherwise(Constants.EMPTY_STRING))) \
        .withColumn("ticket_max_value_partition", Utils.getIndexPartition("ticket_id")) \
        .withColumn("admin_number_escaped", Utils.urlWhitespaces("admin_number")) \
        .withColumn("fast_max_resolution_time", Utils.validateNumeric("fast_max_resolution_time")) \
        .withColumn("file", F.lit(s3filePath)) \
        .fillna(Constants.EMPTY_STRING, ["assigned_agent"])
    if detailType == "helpdesk":
        rodTicketReportedSource = getReportedSource(spark)
        operationalManager = getOperationalManager(confJson.operational_path, spark)
        opTags = OperatingTags.operatingTagsColumns(S3FilesDsl.readFile(confJson.tags_operating_path, spark))
        customer = Customer.customerColumns(S3FilesDsl.readFile(confJson.customer_path, spark), spark)
        endCustomer = EndCustomer.endCustomerColumns(S3FilesDsl.readFile(confJson.end_customer_path, spark), spark)
        index1 = common \
            .join(rodTicketReportedSource, ["reported_source_id"], "left") \
            .drop("reported_source_id") \
            .join(operationalManager, ["operating_company_name", "operating_le"], "left") \
            .na.fill(Constants.EMPTY_STRING, ["operational_manager"]) \
            .join(opTags, ["operating_company_name", "operating_le"], "left") \
            .withColumn("tags", Utils.mergeArrays("tags", "operating_tags")) \
            .drop("operating_tags") \
            .join(customer, ["operating_company_name"], "left") \
            .fillna(Constants.EMPTY_STRING, ["customer_correct"]) \
            .join(endCustomer, ["operating_le"], "left") \
            .fillna(Constants.EMPTY_STRING, ["end_customer_correct"]) \
            .withColumn("end_customer_correct",
                        Utils.emptyEndCustomerCorrect("customer_correct", "end_customer_correct")) \
            .withColumn("ci_country", Utils.kibanaCountry("ci_country")) \
            .withColumn("end_user_country", Utils.kibanaCountry("end_user_country")) \
            .withColumn("smc_cluster", Utils.smcClusterFromGroup("assigned_support_group")) \
            .withColumn("ci_name_escaped", Utils.urlWhitespaces("ci_name")) \
            .withColumn("product_categorization_all_tiers",
                        Utils.concat3Columns("product_categorization_tier_1", "product_categorization_tier_2",
                                             "product_categorization_tier_3")) \
            .withColumn("closure_categorization_all_tiers",
                        Utils.concat3Columns("closure_categorization_tier_1", "closure_categorization_tier_2",
                                             "closure_categorization_tier_3")) \
            .withColumn("operational_categorization_all_tiers",
                        Utils.concat3Columns("operational_categorization_tier_1", "operational_categorization_tier_2",
                                             "operational_categorization_tier_3")) \
            .withColumnRenamed("reported_source_desc", "reported_source_id")
        index = FastDsl.fastCircuitFields(index1, confJson, spark)
    elif detailType == "problems":
        index1 = common \
            .withColumn("ci_country", Utils.kibanaCountry("ci_country")) \
            .withColumn("ci_name_escaped", Utils.urlWhitespaces("ci_name"))
        index = FastDsl.fastCircuitFields(index1, confJson, spark)
    elif detailType == "changes":
        rodTicketReportedSource = getReportedSource(spark)
        index = common \
            .join(rodTicketReportedSource, ["reported_source_id"], "left") \
            .drop("reported_source_id") \
            .withColumn("ci_country", Utils.kibanaCountry("ci_country")) \
            .withColumn("company_country", Utils.kibanaCountry("company_country")) \
            .withColumnRenamed("reported_source_desc", "reported_source_id")
    # The user requests that the master-table descriptions be renamed to _id
    indexRenamed = index \
        .withColumnRenamed("status_desc", "status_id") \
        .withColumnRenamed("substatus_desc", "substatus_id") \
        .withColumnRenamed("urgency_desc", "urgency_id") \
        .withColumnRenamed("priority_desc", "priority_id") \
        .withColumnRenamed("impact_desc", "impact_id")
    return indexRenamed
def getAgentSmcCluster(esIndex, s3confPath, spark):
    sqlContext = SQLContext(spark)
    parquet = sqlContext.read.parquet(S3FilesDsl.readConfigJson(s3confPath).rod_agent_smc_parquet_path)
    return esIndex.join(parquet.select("ticket_id", "smc_cluster", "assigned_agent"), ["ticket_id"], "left")
def writeMappedESIndex(index, name, mapId, config):
    prefix = S3FilesDsl.readConfigJson(config).elastic_env_index_prefix
    index.write.format('org.elasticsearch.spark.sql') \
        .option('es.mapping.id', mapId) \
        .option('es.resource', prefix + name) \
        .save()
def writeESAlertsIndex(index, config):
    prefix = S3FilesDsl.readConfigJson(config).elastic_env_index_prefix
    index.write.format('org.elasticsearch.spark.sql') \
        .mode('append') \
        .option('es.write.operation', 'index') \
        .option('es.resource', prefix + 'copt-rod-alerts') \
        .save()
def writeESLogIndex(index, name, config):
    prefix = S3FilesDsl.readConfigJson(config).elastic_env_index_prefix
    index.write.format('org.elasticsearch.spark.sql') \
        .mode('append') \
        .option('es.write.operation', 'index') \
        .option('es.resource', prefix + name + datetime.now().strftime("%Y")) \
        .save()
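# Illustrative sketch (not part of the original module): one plausible driver
# flow that builds a detail index, writes it to Elasticsearch keyed by
# ticket_id and verifies the ingested document count. The function name,
# config path, source file path and index name are hypothetical.
def _exampleHelpdeskLoad(detail, spark):
    s3confPath = "s3://my-bucket/conf/rod.json"    # hypothetical config location
    s3filePath = "s3://my-bucket/in/helpdesk.csv"  # hypothetical source file
    index = buildESIndex("helpdesk", detail, s3confPath, s3filePath, spark)
    index.cache()
    # Write keyed by ticket_id so re-runs overwrite the same documents.
    writeMappedESIndex(index, "copt-rod-helpdesk", "ticket_id", s3confPath)
    # Compare rows written against documents indexed for this source file;
    # a mismatch produces an alert document via writeESAlertsIndex.
    checkCount("copt-rod-helpdesk", s3filePath, index.count(), spark, s3confPath)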