Exemple #1
0
def words_black_list_process_new(spark, df_primary):
    """Filter black-listed words out of *df_primary* via the registered
    `black` UDF, then run the white-list pass on the result.

    Returns the white-list-filtered DataFrame, or *df_primary* unchanged
    when no black-list file / content is configured.
    """
    blackListFile = issueconstruction.g_word_segment_black_files

    # No black-list configured -> nothing to filter.
    if common.data_is_NULL(blackListFile):
        return df_primary

    blackList = common.read_file_lines_to_list(blackListFile)
    if common.data_is_NULL(blackList):
        return df_primary

    df_primary.registerTempTable("tb_black_content")
    spark.udf.register("black", black)
    df_black = spark.sql(
        "select OWNER,PMAnalysis,Rname,Vname,adminAdvice,approverComments,att_file_num1,"
        "att_file_num3,att_img_num1,att_img_num3,baseline,category,categoryStr,causeAnalysis,"
        "creationdate,currentNode,currentPerson,cut_words,defectModifier,defectNo,defect_ODCSeverity,"
        "developerComments,issueProcessor,lastProcessed,lastupdateTimestamp,lengthofstay,nodeCode,"
        "nodeName,operation_type,productLineName,productName,refresh_timestamp,solution,status,"
        "submitBy,submitDate,suspendReason,testReport,testTool,testToolStr,testerComments,"
        "name,describe,detail,black(describekey) as describekey,detailkey from  tb_black_content"
    )

    # White-list filtering.  BUG FIX: the filtered frame was previously
    # computed and discarded (the function returned None); return it.
    return words_white_list_process_new(spark, df_black)
Exemple #2
0
def white(spark, df_black):
    """Apply the white-list rules to every row of *df_black*.

    Returns *df_black* unchanged when no white-list file is configured,
    an empty list when the white-list is empty (NOTE(review):
    inconsistent return type — callers must handle both DataFrame and
    list), otherwise a new DataFrame of (name, detailkey) rows produced
    by match_rule.
    """
    whiteListFile = issueconstruction.g_word_segment_white_file
    if common.data_is_NULL(whiteListFile):
        return df_black
    whiteList = common.read_file_lines_to_list(whiteListFile)
    if common.data_is_NULL(whiteList):
        return []

    # Match every row against the white-list; match_rule is expected to
    # yield (name, detailkey) pairs — TODO confirm against its definition.
    rdd = df_black.rdd.map(lambda row: match_rule(whiteList, row))
    schema = StructType([
        StructField("name", StringType(), True),
        StructField("detailkey", StringType(), True)
    ])
    return spark.createDataFrame(rdd, schema)
Exemple #3
0
def words_black_list_process(blackListFile, words):
    """Drop the words listed in *blackListFile* from *words*.

    Returns *words* unchanged when the black-list file is missing or
    empty; otherwise a de-duplicated list (original order kept) of the
    words that are NOT black-listed.
    """
    if common.data_is_NULL(blackListFile):
        return words
    blackList = common.read_file_lines_to_list(blackListFile)
    if common.data_is_NULL(blackList):
        return words

    # BUG FIX: the original tested `d in blackListFile`, i.e. substring
    # membership in the *file name*, instead of the loaded word list.
    blocked = set(blackList)
    retList = []
    for d in words:
        if d in blocked or d in retList:
            continue
        retList.append(d)
    return retList
Exemple #4
0
def discovery_from_delhtmllabel(contentList):
    """Strip HTML markup from the "detail" field of every record in
    *contentList*, then hand the cleaned records to
    generate_output_file_overwrite.
    """
    if common.data_is_NULL(contentList):
        # BUG FIX: the original only logged and then crashed iterating
        # the NULL value below.
        logging.info("contentList is null!")
        return []
    dict_add = {}
    list_pm = []
    for param_dict in contentList:
        dict_add["name"] = param_dict.get("name")
        for pms in ["detail"]:
            pm = param_dict.get(pms)
            if len(pm) > 0:
                # html5lib is tolerant of malformed HTML fragments.
                src_soup = BeautifulSoup(pm, 'html5lib')
                if src_soup is not None:
                    # get_text() extracts the visible text of the HTML.
                    src_soup_text = src_soup.get_text()
                    if src_soup_text:
                        # Strip newlines/tabs and collapse whitespace runs.
                        destStr = src_soup_text.replace('\n', '')
                        destStr = destStr.replace('\t', '')
                        destStr = re.sub('\\s+', ' ', destStr)
                        dict_add[pms] = destStr
            else:
                dict_add[pms] = ' '
        list_pm.append(dict_add.copy())
    return generate_output_file_overwrite(contentList, list_pm)
Exemple #5
0
def discovery_from_merge(list_thr):
    """Merge the describekey/detailkey word lists of each record and
    re-resolve the record through search_update.

    NOTE(review): only the LAST dict returned by search_update survives
    per record, and an empty search_update result re-appends the
    previous record — presumably intentional, confirm with callers.
    """
    if common.data_is_NULL(list_thr):
        logging.info("数据合并失败")
    list_merge = []
    merged = {}  # renamed from `dict`: never shadow the builtin
    for param in list_thr:
        kdddict = {}
        destList = []
        destname = param["name"]
        for keys in param.keys():
            if keys == "describekey" or keys == "detailkey":
                # A literal "\[" / "\]" marker flags an embedded CSV list.
                if "\\[" in param[keys] or "\\]" in param[keys]:
                    destList = destList + common.csv_list_str_2_list(
                        param[keys])
                else:
                    destList.append(param[keys])

        # Flatten the nested lists and de-duplicate the keywords.
        kdddict.update({destname[0]: list(set(flat(destList)))})

        list_return = search_update(kdddict, list_thr)
        for dict_par in list_return:
            merged = {
                "name": dict_par["name"][0],
                "describe": dict_par["describe"],
                "detail": dict_par["detail"],
                "describekey": dict_par["describekey"],
                "detailkey": dict_par["detailkey"],
                "searchkey": dict_par["searchkey"]
            }
        list_merge.append(merged.copy())
    return list_merge
Exemple #6
0
def run(srcString, bUseCustomDict=False):
    """Segment *srcString* into a de-duplicated token list.

    Args:
        srcString: raw text to segment.
        bUseCustomDict: when True, load the user dictionary before
            segmenting.

    Returns:
        Ordered list of unique lower-cased tokens, excluding drop-list
        entries and tokens shorter than 2 characters; [] for NULL input.
    """
    global g_word_segment_local_file
    global g_drpoList
    if common.data_is_NULL(srcString):
        return []
    if bUseCustomDict:
        # Load the custom word dictionary for segmentation.
        jieba.load_userdict(g_word_segment_local_file)
        jieba.initialize()
    # Normalize characters (full-width -> half-width) before segmenting.
    user_string = common.C_trans2_E(srcString)
    ret_list = jieba.lcut(user_string.lower())
    wordList = []
    seen = set()  # O(1) dedup instead of scanning wordList each time
    for token in ret_list:
        if token in seen or token in g_drpoList or len(token) < 2:
            continue
        seen.add(token)
        wordList.append(token)
    return wordList
Exemple #7
0
 def __init__(self, operator, filterList):
     """Store *operator* and the attribute dict of each filter.

     filterList may be None/empty, in which case self.filterList stays [].
     """
     self.operator = operator
     self.filterList = []
     if not common.data_is_NULL(filterList):
         # `flt` instead of `filter`: avoid shadowing the builtin.
         for flt in filterList:
             self.filterList.append(flt.__dict__)
Exemple #8
0
def data_clean(s):
    """Coerce *s* to str and sanitize characters that break CSV/quoting.

    Double quotes become single quotes, commas become '。', and CR/LF
    are removed.  Returns "" for NULL-ish input.
    """
    # BUG FIX: check for NULL before str() — str(None) is the string
    # "None" and would slip past common.data_is_NULL unchanged.
    if common.data_is_NULL(s):
        return ""
    s = str(s)
    s = s.replace('"', "'")
    s = s.replace(',', "。")
    s = s.replace('\n', "")
    s = s.replace('\r', "")
    return s
Exemple #9
0
def get(label):
    """Return the cached entity dict for *label*, fetching on a miss.

    A missing or empty cache slot triggers a req() fetch; the (possibly
    still empty) slot is returned either way.
    """
    global g_entity_dict

    # Idiom: `label not in g_entity_dict` instead of
    # `not label in g_entity_dict.keys()`.
    if label not in g_entity_dict:
        g_entity_dict.update({label: {}})

    if common.data_is_NULL(g_entity_dict[label]):
        req([label])

    return g_entity_dict[label]
Exemple #10
0
def words_black_list_process_new(spark, df_primary):
    """Trimmed black-list pass: applies the `black` UDF to describekey
    and trims both key columns, then runs the white-list pass.

    Returns the white-list-filtered DataFrame, or *df_primary* unchanged
    when no black-list file / content is configured.
    """
    blackListFile = issueconstruction.g_word_segment_black_files
    if common.data_is_NULL(blackListFile):
        return df_primary

    blackList = common.read_file_lines_to_list(blackListFile)
    if common.data_is_NULL(blackList):
        return df_primary

    df_primary.registerTempTable("tb_black_content")
    spark.udf.register("black", black)
    df_black = spark.sql(
        "select name,describe,detail,trim(black(describekey)) as describekey,trim(detailkey) as detailkey from  tb_black_content"
    )

    # White-list filtering.  BUG FIX: return the result instead of
    # discarding it (the function previously returned None).
    return words_white_list_process_new(spark, df_black)
Exemple #11
0
def words_white_list_process(whiteListFile, words):
    """Keep only the words that appear in *whiteListFile*.

    Returns *words* unchanged when no white-list file is configured,
    [] when the white-list is empty, otherwise the ordered,
    de-duplicated intersection of *words* with the white-list.
    """
    if common.data_is_NULL(whiteListFile):
        return words
    whiteList = common.read_file_lines_to_list(whiteListFile)
    if common.data_is_NULL(whiteList):
        return []

    allowed = set(whiteList)  # O(1) membership instead of list scans
    retList = []
    for d in words:
        # Keep non-empty, white-listed words, first occurrence only.
        if len(str(d)) > 0 and d in allowed and d not in retList:
            retList.append(d)
    return retList
Exemple #12
0
def word_primary_key_align(words, isExpend=True):
    """Map each word to its canonical primary key via the alignment dict.

    With isExpend=True a word whose canonical form differs is emitted
    twice (canonical form first, then the original); with isExpend=False
    the canonical form replaces the original.  Returns *words* unchanged
    when no alignment dictionary can be loaded.
    """
    global g_primary_key_align_dict

    # Lazily load the alignment dictionary on first use.
    if common.data_is_NULL(g_primary_key_align_dict):
        g_primary_key_align_dict = load_eneity_align_dict(g_align_label)
    if common.data_is_NULL(g_primary_key_align_dict):
        return words

    aligned = []
    for word in words:
        canonical = g_primary_key_align_dict.get(word, word)
        if canonical != word:
            if isExpend:
                aligned.append(canonical)
            else:
                word = canonical
        aligned.append(word)
    return aligned
Exemple #13
0
def req(keyList):
    """Query the graph for each label in *keyList* and cache the
    name -> alias mapping into g_entity_dict under that label.

    Serialized by cache_lock.  BUG FIX: the lock is now released in a
    finally block, so an exception inside the loop no longer leaks it
    and deadlocks later callers.
    """
    global g_entity_dict

    cache_lock.acquire()
    try:
        for k in list(keyList):
            cmd = "g.V().hasLabel('{0}').valueMap('name','alias')".format(k)

            ret, result = query.get_common(cmd)
            if ret is not True:
                logging.info("get <%s> entity is NULL" % k)
                continue
            data = result.get('data')
            if common.data_is_NULL(data):
                logging.info("get <%s> entity is NULL" % k)
                continue

            for d in data:
                # Rows are stringified maps; extract name/alias with
                # regexes.  When an alias follows, the name group ends
                # with "]," instead of "]}".
                if "alias" in d:
                    name = re.findall(r"name=\[(.+?)\],", d)
                else:
                    name = re.findall(r"name=\[(.+?)\]\}", d)
                if common.data_is_NULL(name):
                    continue
                name = common.list_str_2_list(name[0], ', ')

                alias = re.findall(r"alias=\[(.+?)\]\}", d)
                if not common.data_is_NULL(alias):
                    alias = common.list_str_2_list(alias[0], ', ')
                g_entity_dict[k].update({name[0]: alias})
    finally:
        cache_lock.release()

    return
Exemple #14
0
def load_eneity_align_dict(labelList):
    """Build the alias -> canonical-name alignment dict from the cached
    entity data of every label in *labelList*.

    BUG FIX: the original `return` sat inside the `for label` loop, so
    only the first label was ever processed; it now runs over all labels
    before returning the accumulated dict.
    """
    for label in labelList:
        # Fetch the cached entity data for this label.
        data = cache.get(label)

        for name, alias in data.items():
            if common.data_is_NULL(alias):
                continue
            # alias may be a single value or a list of values.
            if isinstance(alias, list):
                for a in alias:
                    g_primary_key_align_dict.update({a: name})
            else:
                g_primary_key_align_dict.update({alias: name})

    return g_primary_key_align_dict
Exemple #15
0
def discovery_from_wordseg(line):
    """Segment and primary-key-align every field of every record in
    *line*, then run the white/black-list filtering.

    Returns True when *line* is NULL (NOTE(review): odd sentinel — kept
    for backward compatibility, confirm callers treat it as "no data").
    """
    if common.data_is_NULL(line):
        logging.info("line is null")
        return True
    record = {}   # renamed from `dict`: never shadow the builtin
    records = []  # renamed from `list`: never shadow the builtin
    for param in line:
        for keys in param.keys():
            # Segment the field text into words.
            destList = wordSegment.run(param[keys])
            # Map each word to its canonical primary key.
            destList = word_primary_key_align(destList)
            if keys == "name":
                record.update({keys: destList})
            else:
                record.update({keys + "key": destList})
        records.append(record.copy())

    return white_black(records, line)
Exemple #16
0
def discovery_from_delhtmllabel_new(contentList):
    if common.data_is_NULL(contentList):
        logging.info("contentList is null!")
    dict_add = {}
    list_pm = []
    name = eval(dict(contentList)).get("name")
    dict_add["name"] = name
    for pms in ["detail"]:
        pm = contentList.get(pms)
        destStr = ''
        if len(pm) > 0:
            src_soup = BeautifulSoup(pm, 'html5lib')
            if src_soup is not None:
                # get_text得到html内容
                src_soup_text = src_soup.get_text()
                if src_soup_text:
                    destStr = src_soup_text.replace('\n', '')
                    destStr = destStr.replace('\t', '')
                    destStr = re.sub('\\s+', ' ', destStr)
                    dict_add[pms] = destStr
        else:
            dict_add[pms] = ' '
        list_pm.append(dict_add.copy())
    print(list_pm)