Esempio n. 1
0
 def __init__(self):
     """
     Create the logger, load the configuration data and open the ODPS
     connection.
     """
     logger = LogFactory()
     settings = Config().config_data
     self.logging = logger
     self.config_data = settings
     # Opened last, after the logger and configuration are stored on self.
     self.conn = self.get_odps_conn()
Esempio n. 2
0
def main():
    """Run every configured query, then print and save the combined report.

    Each query from ``config.get_queries()`` is wrapped in a ``Finder``
    together with ``config.get_texts()``; the string forms of all finders are
    concatenated, printed, and passed to ``save`` with ``config.get_output()``.
    """
    config = Config()
    # str.join assembles the report in one pass instead of repeated ``+=``,
    # and str() replaces the direct ``__str__()`` call.
    report = "".join(
        str(Finder(query, config.get_texts()))
        for query in config.get_queries()
    )
    print(report)
    save(report, config.get_output())
Esempio n. 3
0
 def __init__(self):
     """
     Wire up the collaborators and remember yesterday's date as a
     ``YYYYMMDD`` string in ``bdp_date``.
     """
     self.cuslabelcluster = CusLabelCluster()
     self.logging = LogFactory()
     self.config = Config().config_data
     self.odps = ODPSdb()
     yesterday = datetime.date.today() - datetime.timedelta(days=1)
     self.bdp_date = yesterday.strftime("%Y%m%d")
Esempio n. 4
0
class CusLabelDistance:
    """Condensed cosine-distance matrices between stores, one per
    distribution code, built from the 'GET_CUSTOM_LABEL' query result."""

    def __init__(self):
        self.config = Config().config_data
        self.logging = LogFactory()
        self.odps = ODPSdb()

    def preprocessing(self, data):
        """
        Turn a raw label table into a normalised feature matrix.

        The input frame is not modified.

        :param data: DataFrame with ``str_code`` and ``distrib_code`` columns;
            all remaining columns are used as features
        :return: tuple ``(x_train, index)`` where ``x_train`` is the output of
            ``Normalizer().fit_transform`` and ``index`` is the array of
            ``str_code`` values in row order
        """
        index = data['str_code'].values
        # drop()/fillna() return new frames, so the caller's data (usually a
        # slice of a larger frame) is never mutated. Missing values become 0.
        features = data.drop(columns=['str_code', 'distrib_code']).fillna(0)
        # Centre every row on its own mean. sub(..., axis=0) replaces the
        # former ``m_row[:, None]`` broadcast, which indexes a Series in two
        # dimensions and is rejected by recent pandas releases.
        features = features.sub(features.mean(axis=1), axis=0)
        x_train = Normalizer().fit_transform(features)
        return x_train, index

    def distance(self, x_train):
        """
        Compute the pairwise cosine distances between the rows of ``x_train``.

        :param x_train: 2-D feature matrix, one row per store
        :return: one-dimensional condensed distance matrix. For m rows, the
            distance of pair (i, j) is stored at position
            m * i + j - ((i + 2) * (i + 1)) // 2
        """
        return pdist(x_train, metric='cosine')

    def main(self):
        """
        Compute one condensed distance matrix per distribution code.

        The 'GET_CUSTOM_LABEL' query is formatted with yesterday's date
        (``YYYYMMDD``). Groups are visited in order of first appearance.

        :return: tuple ``(d_condensed, index, distrib)`` of three parallel
            lists: distance arrays, ``str_code`` arrays and distribution
            codes. All three are empty when the query yields no frame.
        """
        yesterday = datetime.date.today() - datetime.timedelta(days=1)
        sql_cmd = self.config.get('GET_CUSTOM_LABEL').format(
            yesterday.strftime("%Y%m%d"))
        data = self.odps.get_data(sql=sql_cmd)
        d_condensed, index, distrib = [], [], []
        if data is None:
            # get_data signals a failed query with None; report it instead of
            # failing later with an AttributeError.
            self.logging.error("GET_CUSTOM_LABEL returned no data")
            return d_condensed, index, distrib
        # groupby yields each partition once, in a deterministic order,
        # instead of re-filtering the whole frame per code of an unordered set.
        for distrib_code, group in data.groupby('distrib_code', sort=False):
            x_train, idx = self.preprocessing(data=group)
            d_condensed.append(self.distance(x_train))
            index.append(idx)
            distrib.append(distrib_code)
        return d_condensed, index, distrib
Esempio n. 5
0
class DistribStyDistance():
    """Condensed cosine-distance matrices between stores, one per
    distribution code, built from each store's ``sty_code_list`` text."""

    def __init__(self):
        self.config = Config().config_data
        self.logging = LogFactory()
        self.odps = ODPSdb()

    def preprocessing(self, data):
        """
        Vectorise and normalise the style-code lists of the given stores.

        :param data: DataFrame with ``str_code`` and ``sty_code_list`` columns
        :return: tuple ``(x_train, index)`` where ``x_train`` is the
            normalised TF-IDF matrix and ``index`` is the array of
            ``str_code`` values in row order
        """
        index = data['str_code'].values
        # Vectorise the style-code text.
        x_train = TfidfVectorizer().fit_transform(data['sty_code_list'])
        # Normalise the resulting rows.
        x_train = Normalizer().fit_transform(x_train)
        return x_train, index

    def distance(self, x_train):
        """
        Compute the pairwise cosine distances between the rows of ``x_train``.

        :param x_train: matrix exposing ``toarray()`` (it is densified before
            the distances are computed)
        :return: one-dimensional condensed distance matrix. For m rows, the
            distance of pair (i, j) is stored at position
            m * i + j - ((i + 2) * (i + 1)) // 2
        """
        return pdist(x_train.toarray(), metric='cosine')

    def main(self):
        """
        Compute one condensed distance matrix per distribution code.

        Groups are visited in order of first appearance in the query result.

        :return: tuple ``(d_condensed, index, distrib)`` of three parallel
            lists: distance arrays, ``str_code`` arrays and distribution
            codes. All three are empty when the query yields no frame.
        """
        sql_cmd = self.config.get('GET_STR_STY_LABEL')
        data = self.odps.get_data(sql=sql_cmd)
        d_condensed, index, distrib = [], [], []
        if data is None:
            # A missing frame is reported instead of crashing on attribute
            # access further down.
            self.logging.error("GET_STR_STY_LABEL returned no data")
            return d_condensed, index, distrib
        # groupby yields each partition once, in a deterministic order,
        # instead of re-filtering the whole frame per code of an unordered set.
        for distrib_code, group in data.groupby('distrib_code', sort=False):
            x_train, idx = self.preprocessing(data=group)
            d_condensed.append(self.distance(x_train))
            index.append(idx)
            distrib.append(distrib_code)
        return d_condensed, index, distrib
Esempio n. 6
0
 def __init__(self):
     """Create the logger, load the configuration data and connect."""
     logger = LogFactory()
     settings = Config().config_data
     self.logging = logger
     self.config_data = settings
     # Connected last, after the logger and configuration are stored on self.
     self.conn = self.get_conn()
Esempio n. 7
0
class PostGreSQL:
    """Small helper around one psycopg2 connection configured by the
    'POSTGRESQL' section of the configuration data."""

    def __init__(self):
        self.logging = LogFactory()
        self.config_data = Config().config_data
        self.conn = self.get_conn()

    def get_conn(self):
        """
        Open a connection using the HOST, PORT, DBNAME, USER and PASSWD
        entries of the 'POSTGRESQL' configuration section.

        :return: the psycopg2 connection
        """
        pg_config = self.config_data.get('POSTGRESQL')
        conn = psycopg2.connect(host=pg_config['HOST'],
                                port=pg_config['PORT'],
                                database=pg_config['DBNAME'],
                                user=pg_config['USER'],
                                password=pg_config['PASSWD'])
        return conn

    def get_pg_records(self, query, param=None):
        """
        Fetch all rows produced by a query against the PostgreSQL database.

        :param query: SQL statement
        :param param: optional query parameters
        :return: list of rows; an empty list when the query fails (the
            traceback is logged)
        """
        cursor = self.conn.cursor()
        try:
            if param is None:
                cursor.execute(query)
            else:
                cursor.execute(query, param)
            return cursor.fetchall()
        except Exception:
            self.logging.error(traceback.format_exc())
            # A failed statement leaves the transaction aborted; without a
            # rollback every later statement on this connection would fail.
            self.conn.rollback()
            return []
        finally:
            cursor.close()

    def update_pg(self, query, param=None):
        """
        Execute an insert/update/delete statement and commit it.

        :param query: SQL statement
        :param param: optional query parameters
        :return: 1 on success; 0 on failure (the traceback is logged and the
            transaction is rolled back)
        """
        cursor = self.conn.cursor()
        try:
            if param is None:
                cursor.execute(query)
            else:
                cursor.execute(query, param)
            self.conn.commit()
            return 1
        except Exception:
            self.logging.error(traceback.format_exc())
            self.conn.rollback()
            return 0
        finally:
            cursor.close()

    def pandas_readsql(self, sql, columns=None):
        """
        Run a query through ``pd.read_sql`` on this connection.

        :param sql: SQL statement
        :param columns: forwarded to ``pd.read_sql`` as ``columns``
        :return: DataFrame
        """
        return pd.read_sql(sql, con=self.conn, columns=columns)

    def close(self):
        """Close the underlying connection."""
        self.conn.close()
Esempio n. 8
0
 def __init__(self):
     """Load the configuration data, then create the logger and ODPS helper."""
     self.config, self.logging = Config().config_data, LogFactory()
     self.odps = ODPSdb()
Esempio n. 9
0
class CusLabelCluster:
    """Clusters stores per distribution code from the 'GET_CUSTOM_LABEL'
    query result and returns one cluster label per store."""

    def __init__(self):
        self.config = Config().config_data
        self.logging = LogFactory()
        self.odps = ODPSdb()

    def preprocessing(self, data):
        """
        Turn a raw label table into a normalised feature matrix.

        The input frame is not modified.

        :param data: DataFrame with ``str_code`` and ``distrib_code`` columns;
            all remaining columns are used as features
        :return: tuple ``(x_train, index)`` where ``x_train`` is the output of
            ``Normalizer().fit_transform`` and ``index`` is the ``str_code``
            Series
        """
        index = data['str_code']
        # drop()/fillna() return new frames, so the caller's data (usually a
        # slice of a larger frame) is never mutated. Missing values become 0.
        features = data.drop(columns=['str_code', 'distrib_code']).fillna(0)
        # Centre every row on its own mean. sub(..., axis=0) replaces the
        # former ``m_row[:, None]`` broadcast, which indexes a Series in two
        # dimensions and is rejected by recent pandas releases.
        features = features.sub(features.mean(axis=1), axis=0)
        x_train = Normalizer().fit_transform(features)
        return x_train, index

    def cluster(self, x_train):
        """
        Cluster the rows of ``x_train`` and keep the best-scoring partition.

        The candidate range of cluster counts is derived from the number of
        stores: 1) each cluster holds at least 30 and at most 200 stores;
        2) there are at least 2 and at most 30 clusters. Candidates are scored
        with the cosine silhouette score and the highest score wins.

        :param x_train: 2-D feature matrix, one row per store
        :return: array of cluster ids, one per row. When the candidate range
            is empty (too few rows), every row is assigned to cluster 0.
        """
        # Local import keeps the fallback below independent of the module's
        # import block.
        import numpy as np

        n_samples = len(x_train)
        n = max(n_samples // 200, 2)
        m = min(n_samples // 30, 30) + 1
        self.logging.info("聚类数目最小值:{0},最大值:{1}".format(n, m))
        # Large ranges are scanned with a stride of 2.
        step = 1 if m < 11 else 2

        # Start below any reachable score so the first candidate is always
        # kept; starting at 0 dropped every partition scoring <= 0.
        silhouette_best = float('-inf')
        cluster_label = None
        cluster_num = None
        for i in range(n, m, step):
            clusterid, error, nfound = kcluster(x_train,
                                                nclusters=i,
                                                dist='u',
                                                npass=1000)
            silhouette_score = metrics.silhouette_score(x_train,
                                                        clusterid,
                                                        metric='cosine')
            self.logging.info("聚类数目:{0},聚类得分:{1}".format(i, silhouette_score))
            self.logging.info("找到解的次数:{0}".format(nfound))
            if cluster_label is None or silhouette_best < silhouette_score:
                silhouette_best = silhouette_score
                cluster_label = clusterid
                cluster_num = i

        if cluster_label is None:
            # range(n, m, step) was empty: returning None here used to break
            # the caller, so fall back to a single cluster instead.
            self.logging.info(
                "no candidate cluster count for {0} rows; "
                "using a single cluster".format(n_samples))
            return np.zeros(n_samples, dtype=int)

        self.logging.info("最优聚类数目:{0},最优轮廓系数得分:{1}".format(
            cluster_num, silhouette_best))
        return cluster_label

    def main(self):
        """
        Cluster the stores of every distribution code.

        The 'GET_CUSTOM_LABEL' query is formatted with yesterday's date
        (``YYYYMMDD``). Cluster ids are assigned independently per
        distribution code.

        :return: DataFrame indexed by ``str_code`` with one object-typed
            ``label`` column; empty when the query yields no frame
        """
        yesterday = datetime.date.today() - datetime.timedelta(days=1)
        sql_cmd = self.config.get('GET_CUSTOM_LABEL').format(
            yesterday.strftime("%Y%m%d"))
        data = self.odps.get_data(sql=sql_cmd)
        cluster_label, index = [], []
        if data is None:
            # A missing frame is reported instead of crashing on attribute
            # access further down.
            self.logging.error("GET_CUSTOM_LABEL returned no data")
        else:
            # groupby yields each partition once, in a deterministic order.
            for _, group in data.groupby('distrib_code', sort=False):
                x_train, idx = self.preprocessing(data=group)
                cluster_label.extend(self.cluster(x_train=x_train))
                index.extend(idx.values)
        cluster_df = pd.DataFrame(cluster_label,
                                  index=index,
                                  columns=['label'],
                                  dtype=object)
        return cluster_df
Esempio n. 10
0
class ODPSdb:
    """
    Helper around one ODPS connection configured by the 'ODPS' section of
    the configuration data.
    """

    def __init__(self):
        """
        Create the logger, load the configuration data and connect to ODPS.
        """
        self.logging = LogFactory()
        self.config_data = Config().config_data
        self.conn = self.get_odps_conn()

    def get_odps_conn(self):
        """
        Connect to ODPS using the USER, PASSWD, DBNAME and URL entries of the
        'ODPS' configuration section.

        :return: the ODPS entry object
        :raises Exception: any connection error is logged and re-raised
        """
        odps_config = self.config_data.get('ODPS')
        try:
            conn = ODPS(access_id=odps_config['USER'],
                        secret_access_key=odps_config['PASSWD'],
                        project=odps_config['DBNAME'],
                        endpoint=odps_config['URL'])
        except Exception:
            self.logging.error(traceback.format_exc())
            raise
        return conn

    def get_data(self, sql):
        """
        Run a query and load its result into a pandas DataFrame.

        :param sql: SQL statement
        :return: DataFrame with the result (an empty result is logged as an
            error but still returned); ``None`` when the query fails
        """
        self.logging.info("查询数据:" + sql)
        try:
            with self.conn.execute_sql(sql).open_reader() as reader:
                data = reader.to_pandas()
        except Exception:
            self.logging.error(traceback.format_exc())
            return None
        if len(data) == 0:
            self.logging.error("数据为空!")
        self.logging.info("read_data success!")
        return data

    def write_to_db(self, data, tablename, if_partition=1):
        """
        Persist a DataFrame into an ODPS table, overwriting existing content.

        A ``dw_date`` column holding the current timestamp is added to the
        written copy; the caller's frame is left unchanged.

        :param data: pandas DataFrame to write
        :param tablename: target table name
        :param if_partition: when truthy, write into partition
            ``ds='<yesterday as YYYYMMDD>'`` (created if necessary)
        :return: 1 on success; 0 on failure or when ``data`` is None/empty
        """
        if data is None or data.empty:
            self.logging.error("{0} 写入数据库失败!数据为空!".format(tablename))
            return 0
        try:
            # assign() builds a new frame instead of adding the column to the
            # caller's object.
            stamped = data.assign(dw_date=datetime.datetime.now())
            persist_kwargs = dict(name=tablename, overwrite=True,
                                  odps=self.conn, cast=True)
            if if_partition:
                yesterday = datetime.date.today() - datetime.timedelta(days=1)
                persist_kwargs.update(
                    partition="ds='{}'".format(yesterday.strftime("%Y%m%d")),
                    create_partition=True)
            DataFrame(stamped).persist(**persist_kwargs)
        except Exception:
            self.logging.error(traceback.format_exc())
            return 0
        self.logging.info("{0} 成功写入数据库!".format(tablename))
        return 1
Esempio n. 11
0
class NewStoreCluster:
    """Assigns a cluster label to stores that ``CusLabelCluster`` did not
    label, by borrowing the most frequent label among stores that share
    geographic attributes."""

    def __init__(self):
        self.cuslabelcluster = CusLabelCluster()
        self.logging = LogFactory()
        self.config = Config().config_data
        self.odps = ODPSdb()
        self.bdp_date = (datetime.date.today() +
                         datetime.timedelta(days=-1)).strftime("%Y%m%d")

    @staticmethod
    def _most_frequent_label(feature_matrix, criteria, count_column):
        """Return the label of the largest group matching ``criteria``
        (column -> value), or None when nothing matches."""
        if any(value is None for value in criteria.values()):
            return None
        mask = pd.Series(True, index=feature_matrix.index)
        for column, value in criteria.items():
            mask &= feature_matrix[column] == value
        candidates = feature_matrix[mask].sort_values(by=count_column)
        if len(candidates) == 0:
            return None
        # Ascending sort: the last row is the group with the highest count.
        return candidates.iloc[-1]['label']

    def main(self):
        """
        Label every store returned by the 'GET_GEOGRAPHY_LABEL' query.

        Stores already clustered keep their label. Each remaining store takes
        the most frequent label among labelled stores sharing, in order of
        preference: business-district type + city, city, region + city level,
        region. Stores matching none of these are logged.

        :return: the store DataFrame with a ``label`` column. Its dtype is
            ``int`` when every store got a label, and pandas' nullable
            ``Int64`` (missing labels as ``<NA>``) otherwise.
        """
        cluster_df = self.cuslabelcluster.main()
        sql_cmd = self.config.get('GET_GEOGRAPHY_LABEL').format(self.bdp_date)
        full_store = self.odps.get_data(sql=sql_cmd)

        full_store = pd.merge(full_store,
                              cluster_df,
                              how='left',
                              left_on=['str_code'],
                              right_index=True)

        # Fallback levels, most specific first:
        # (extra group-by columns, count column, columns matched per store).
        levels = (
            (['city_name', 'cbd_type_name'], 'feature1',
             ('cbd_type_name', 'city_name')),
            (['city_name'], 'feature2', ('city_name',)),
            (['str_org4_name', 'city_level'], 'feature3',
             ('city_level', 'str_org4_name')),
            (['str_org4_name'], 'feature4', ('str_org4_name',)),
        )
        # Store counts per (distrib_code, label, attributes) at each level.
        feature_matrices = [
            full_store.groupby(
                by=['distrib_code', 'label'] + group_columns,
                as_index=False).agg(**{count_column: ('str_code', 'count')})
            for group_columns, count_column, _ in levels
        ]

        # NOTE(review): matching below ignores distrib_code although labels
        # are produced per distribution code - confirm this is intended.
        unlabelled = full_store[full_store['label'].isnull()]
        for i, item in unlabelled.iterrows():
            for (_, count_column, match_columns), matrix in zip(
                    levels, feature_matrices):
                label = self._most_frequent_label(
                    matrix,
                    {column: item[column] for column in match_columns},
                    count_column)
                if label is not None:
                    full_store.loc[i, 'label'] = label
                    break
            else:
                self.logging.error('{0}未匹配到!'.format(item['str_code']))

        labels = full_store['label']
        if labels.isnull().any():
            # Unmatched stores were only logged above; a plain int cast would
            # raise on their missing labels, so use the nullable dtype.
            full_store['label'] = labels.astype('Int64')
        else:
            full_store['label'] = labels.astype('int')
        return full_store