def fetch_raw(self, sql):
    """Run ``sql`` and return each result row as one tab-separated string.

    A new cursor is opened on ``self.conn`` and stored on ``self.cursor``;
    it is not closed here.

    :param sql: statement to execute; it is passed through ``smart_encode``
        before being handed to the cursor.
    :return: list with one string per row, cells joined by a tab. ``None``
        cells are rendered as an empty field.
    """
    self.cursor = self.conn.cursor()
    self.cursor.execute(smart_encode(sql))
    # Only NULL (None) becomes an empty field. A plain truthiness test would
    # also blank out legitimate falsy values such as 0 or 0.0, silently
    # dropping data from the export.
    # NOTE(review): relies on smart_decode(..., cast=True) returning text for
    # non-string cells (str.join requires strings) -- confirm.
    return [
        "\t".join(
            "" if cell is None else smart_decode(cell, cast=True)
            for cell in row
        )
        for row in self.cursor
    ]
def execute(self, sql):
    """Execute every ``;``-separated statement in ``sql``, then close the connection.

    The input is split on ``;``; each piece is stripped and empty pieces are
    skipped. Each remaining statement is passed through ``smart_encode`` and
    executed on a single cursor.

    ``self.conn`` is closed when this method finishes, whether it succeeds
    or raises.

    :param sql: one or more statements separated by ``;``.
    :return: ``True`` once all statements have been executed.
    :raises: whatever the cursor raises for a failing statement; the
        connection is still closed in that case.
    """
    try:
        statements = [part.strip() for part in sql.split(";") if part.strip()]
        # The with-block releases the cursor on exit, so no explicit
        # cursor.close() is needed.
        with self.conn.cursor() as cursor:
            for statement in statements:
                cursor.execute(smart_encode(statement))
    finally:
        # Close the connection even when a statement fails; previously an
        # exception skipped this call and leaked the connection.
        self.conn.close()
    return True
def query(self, sql):
    """Run ``sql`` and return all rows as dicts keyed by column name.

    Column names are taken from the first element of each entry in
    ``cursor.description`` and passed through ``smart_decode``; the list of
    row dicts is passed through ``smart_decode`` as well.

    ``self.conn`` is closed when this method finishes, whether it succeeds
    or raises.

    :param sql: statement to execute (passed through ``smart_encode``).
    :return: the decoded list of ``{column: value}`` dicts, one per row.
    :raises: whatever the cursor raises; the connection is still closed.
    """
    try:
        # The with-block releases the cursor on exit, so no explicit
        # cursor.close() is needed.
        with self.conn.cursor() as cursor:
            cursor.execute(smart_encode(sql))
            columns = smart_decode([col[0] for col in cursor.description])
            rows = smart_decode(
                [dict(zip(columns, record)) for record in cursor]
            )
    finally:
        # Close the connection even when execution or decoding fails;
        # previously an exception skipped this call and leaked the connection.
        self.conn.close()
    return rows
def total(self, sql):
    """Return the first column of the first row produced by ``sql``.

    A new cursor is opened on ``self.conn`` and stored on ``self.cursor``.
    Column names come from the ``"columnName"`` entry of each item returned
    by ``cursor.getSchema()``. All rows are read and decoded before the
    first one is inspected.

    :param sql: statement to execute (passed through ``smart_encode``).
    :return: value stored under the first column name in the first row, or
        ``0`` when the statement yields no rows.
    """
    cur = self.conn.cursor()
    self.cursor = cur
    cur.execute(smart_encode(sql))

    header = []
    for field in cur.getSchema():
        header.append(smart_decode(field["columnName"]))

    records = []
    for raw in cur:
        decoded = [smart_decode(value) for value in raw]
        records.append(dict(zip(header, decoded)))

    if not records:
        return 0
    first = records[0]
    return first[header[0]]
def get_distance(dummy_head, word_vector, size, data):
    """Replace fields 3-6 of ``data`` with one comma-joined distance feature.

    Fields 3 and 4 of ``data`` are comma-separated strings turned into numpy
    arrays; fields 5 and 6 are comma-separated strings split into lists.
    Each pair is passed to ``build_base_feature`` to get a customer vector
    and a target vector. For every index of the customer vector, every
    callable yielded by ``gen_distance_method()`` is applied to the pair of
    elements at that index and the result is cleaned with
    ``numpy.nan_to_num``.

    :param dummy_head: forwarded unchanged to ``build_base_feature``.
    :param word_vector: forwarded unchanged to ``build_base_feature``.
    :param size: forwarded unchanged to ``build_base_feature``.
    :param data: list-like record with at least seven fields.
    :return: ``data[:3]``, followed by the comma-joined feature string,
        followed by ``data[7:]``.
    """
    cust_dummy = numpy.array(data[3].split(","))
    tgt_dummy = numpy.array(data[4].split(","))
    cust_attr = data[5].split(",")
    tgt_attr = data[6].split(",")

    cust_vec = build_base_feature(
        dummy_head, word_vector, cust_dummy, cust_attr, size)
    tgt_vec = build_base_feature(
        dummy_head, word_vector, tgt_dummy, tgt_attr, size)

    distances = []
    for idx in range(len(cust_vec)):
        # The set of distance callables is requested afresh for each index.
        for metric in gen_distance_method():
            value = metric(cust_vec[idx], tgt_vec[idx])
            distances.append(numpy.nan_to_num(value))

    encoded = ",".join(smart_encode(distances, cast=True))
    return data[:3] + [encoded] + data[7:]
def query(self, sql, meta=False, to_dict=True):
    """Run ``sql`` and return every row, optionally with the column names.

    A new cursor is opened on ``self.conn`` and stored on ``self.cursor``.
    Column names come from the ``"columnName"`` entry of each item returned
    by ``cursor.getSchema()``.

    :param sql: statement to execute (passed through ``smart_encode``).
    :param meta: when True, also return the header (column names).
    :param to_dict: when True each row is returned as a dict keyed by column
        name; when False rows are returned in list form.
    :return: ``rows``, or the tuple ``(rows, columns)`` when ``meta`` is true.
    """
    cur = self.conn.cursor()
    self.cursor = cur
    cur.execute(smart_encode(sql))

    columns = [smart_decode(field["columnName"]) for field in cur.getSchema()]

    if not to_dict:
        # Raw rows are decoded in a single call on the whole list.
        rows = smart_decode(list(cur))
    else:
        rows = []
        for raw in cur:
            cells = [smart_decode(value) for value in raw]
            rows.append(dict(zip(columns, cells)))

    return (rows, columns) if meta else rows
def make_similarity_feature(tag_dict, row):
    """Build a per-attribute similarity feature for the two items in ``row``.

    ``row[0]`` and ``row[1]`` are converted with ``attributes_to_dict``. For
    every ``(attr_name, attr_value_list)`` in ``tag_dict`` one score is
    appended to the feature:

    * For the attribute named ``"材质成分"`` (material composition) both
      values are parsed with ``material_string_to_dict``; the score is the
      sum, over the keys present in both, of the smaller of the two values,
      divided by 100 and rounded to 4 places. It is ``0.0`` when no key is
      shared.
    * For any other attribute, both items' values are restricted to
      ``attr_value_list`` and the score is the Jaccard ratio
      ``|A & B| / |A | B|`` rounded to 4 places. It is ``0.0`` when the
      attribute is missing or unusable for either item, or when both
      restricted sets are empty.

    :param tag_dict: mapping of attribute name to its known values.
    :param row: list-like record; items 0 and 1 hold the two attribute
        strings.
    :return: ``row[2:4]``, followed by the comma-joined feature string,
        followed by ``row[4:]``.
    """
    attr1 = attributes_to_dict(row[0])
    attr2 = attributes_to_dict(row[1])
    feature = []

    for attr_name, attr_value_list in iteritems(tag_dict):
        if attr_name == "材质成分":
            md1 = material_string_to_dict(attr1.get(attr_name))
            md2 = material_string_to_dict(attr2.get(attr_name))
            shared = set(md1) & set(md2)
            if shared:
                # Start from 0.0 so the division below is float division
                # even when the stored values are ints.
                overlap = sum((min(md1[k], md2[k]) for k in shared), 0.0)
                feature.append(round(overlap / 100, 4))
            else:
                feature.append(0.0)
            continue

        try:
            allowed = set(attr_value_list)
            values1 = set(attr1[attr_name])
            values2 = set(attr2[attr_name])
        except (KeyError, TypeError):
            # Attribute missing for one item, or its value is not an
            # iterable of hashables. A bare ``except`` would also swallow
            # KeyboardInterrupt / SystemExit.
            feature.append(0.0)
            continue

        set1 = allowed & values1
        set2 = allowed & values2
        union = set1 | set2
        if union:
            # float() keeps this a true ratio under Python 2 integer
            # division, where it would otherwise truncate to 0 or 1.
            feature.append(round(float(len(set1 & set2)) / len(union), 4))
        else:
            feature.append(0.0)

    return row[2:4] + [",".join(smart_encode(feature, cast=True))] + row[4:]
("促销", "预售|反季清仓|促销活动|大促销|试用装|赶紧行动|选购|赠品|热销|特价|限时|批发|大型展卖|促销商品|折上折|特价场|还在等什么|活动品牌|全场品牌|需要的抓紧|优惠劵大放送|体验价|优惠多多|购物狂欢节|限时优惠|特价处理|清仓|断码|原价|降价|打折|限时|低价|特卖|满购|聚划算|购物满|促销|优惠价|男女同款|淘宝只卖|甩卖价|天猫商城价|现价只有|最终售价|感恩节折扣|即可获得优惠|新品预售|双十一狂欢价|甩卖|开抢|快来抢|更多优惠活动等着你|全清价|每件立减|0元换购|精选上市|大减价|大处理|清场甩|优惠等你来|喜迎双11|限时\w{0,1}折|大销价|购实惠|购满+送|送+价值|开学巨献|开业价|全场\w{0,3}折|满\w{0,3}减|件W{0,3}折|全场\w{0,2}减|买\w{0,2}减|折处理|折优惠|元优惠券|包顺风|送.*元红包|分享有礼送|一大波.*来袭|买\D{0,5}送|低至\w{0,5}折|买1送1|买一送一|风衣特价|买任意款|加100元换购|加1元送一|码特价|热销价|折扣价|疯狂价|优惠价|男女同款|淘宝只卖|甩卖价|天猫商城价|现价只有|最终售价|感恩节折扣|即可获得优惠|新品预售|双十一狂欢价|甩卖|开抢|快来抢|更多优惠活动等着你|全清价|每件立减|0元换购|精选上市|大减价|大处理|清场甩|优惠等你来|喜迎双11|特价折扣|抢购|商场特卖|国庆特惠|购物节|活动时间|(春|夏|秋|冬)装折扣" ), ("投票活动", "\【|\】|\〖|\〗|\《|\》|我是雷锋|帮忙投票|好礼等你来拿|活动推荐给大家|活动详见|有机会赢|有奖竞猜|详情(请)*点击|活动热线|现在就来参加|只要分享|就有机会获得|微博抽奖|获得好礼|即可赢得|有机会获得|获奖名单|就有机会中奖|赢取|即可获赠|礼品等着您|圆满落幕|有机会赢得|报名活动|幸运奖品|诚邀您参与|有奖活动|的大力支持|快来报名吧|火热报名中|狂欢季|活动地点|活动内容|快来领取|分享你喜欢的|一起来分享吧|免费领取|活动如下:|活动请关注|关注最新活动|免费试用|线上专供|报名网址|本次活动|活动开始|红包大派送|众多好礼|拿大奖|免费大派发|踊跃投票|欢迎\D{0,1}报名参加|大力支持|活动二|小编注意到|立即申请|即可参与抽奖|敬请期待|有机会赢|有奖竞猜|详情(请)*点击|活动热线|现在就来参加|只要分享|就有机会获得|微博抽奖|获得好礼|即可赢得|有机会获得|获奖名单|就有机会中奖|赢取|即可获赠|礼品等着您|圆满落幕|有机会赢得|报名活动|幸运奖品|活动最后一天|赢取大奖|活动将送|店庆|有机会获得|活动内容详见|等你来挑战|即日起," ), ("科普", "(①|②|③|1\.|2\.|1\、|2\、|图1|图2|1\)| 2\)|一\、|二\、|三\、)|情感好文|今天就介绍大家|今天就介绍给大家|最新发现|研究指出|数据证明|研究发现|注意事项|小贴士|本款|分享给大家|请关注@|小编给大家推荐" ), ("新闻", "据媒体报道|开幕首日|隆重举行|活动现场|启动仪式|商业活动中|娱乐|日发售"), ("灌水", "顶上去|66666|牛牛牛|顶一个"), ("分享", "转走|转载|转发|成功分享到|大家帮忙多转|豪礼|一张图片测试|性格测试题|很火的心理测试|您敢挑战吗|异性眼中的你|招聘测试题|变态测试题|请朋友们留意|原文地址|看客推荐|让你在人群中|别怪我不告诉你|转发此条微博|详情点击|分享一款|供大家参考|给大家分享我|小说|转发此微博|分享赢|最变态招聘测试题" ), ("其他", "此用户暂时被停用|宜忌") ] for line in sys.stdin: try: line = smart_decode(line).replace("\n", "").replace("\r", "").replace( "\\N", "").split("\t") if len(line) <= 1: continue line[0] = mother_baby_denoise(line[0], l) print(smart_encode("\t".join(list(reversed(line))))) print(smart_encode(line[0])) except Exception as e: print( smart_encode("\t".join([ traceback.format_exc().replace("\t", " ").replace("\n", " "), "ERROR" ])))
def gen_experiment_logs_head():
    """Write the experiment log header file.

    The names returned by ``gen_print_var()`` are joined with commas,
    terminated with a newline, passed through ``smart_encode`` and written
    to the path returned by
    ``get_experiment_logs_path("experiment_head.csv")``. The file is opened
    in ``"w"`` mode.
    """
    header_line = ",".join(gen_print_var()) + "\n"
    target = get_experiment_logs_path("experiment_head.csv")
    with open(target, mode="w") as f:
        f.write(smart_encode(header_line))
def logging_process(locals_var):
    """Append one comma-separated record to the experiment log file.

    ``get_print_var(locals_var, gen_print_var())`` supplies a mapping; its
    values are stringified with ``str`` in iteration order, joined with
    commas, terminated with a newline, passed through ``smart_encode`` and
    appended to the path returned by
    ``get_experiment_logs_path("experiment_logs.csv")``.

    :param locals_var: forwarded to ``get_print_var`` as its first argument.
    """
    var_dict = get_print_var(locals_var, gen_print_var())

    cells = []
    for _name, value in iteritems(var_dict):
        cells.append(str(value))
    record = ",".join(cells) + "\n"

    target = get_experiment_logs_path("experiment_logs.csv")
    with open(target, mode="a") as f:
        f.write(smart_encode(record))
("科普", "(①|②|③|1\.|2\.|1\、|2\、|图1|图2|1\)| 2\)|一\、|二\、|三\、)|情感好文|今天就介绍大家|今天就介绍给大家|最新发现|研究指出|数据证明|研究发现|注意事项|小贴士|本款|分享给大家|请关注@|小编给大家推荐" ), ("新闻", "据媒体报道|开幕首日|隆重举行|活动现场|启动仪式|商业活动中|娱乐|日发售"), ("灌水", "顶上去|66666|牛牛牛|顶一个"), ("分享", "转走|转载|转发|成功分享到|大家帮忙多转|豪礼|一张图片测试|性格测试题|很火的心理测试|您敢挑战吗|异性眼中的你|招聘测试题|变态测试题|请朋友们留意|原文地址|看客推荐|让你在人群中|别怪我不告诉你|转发此条微博|详情点击|分享一款|供大家参考|给大家分享我|小说|转发此微博|分享赢|最变态招聘测试题" ), ("其他", "此用户暂时被停用|宜忌") ] for line in sys.stdin: line = smart_decode(line).replace("\n", "").replace("\r", "").replace("\\N", "").split("\t") try: id, channel, subject, post_id, title, tags, reply_count, view_count, collection_count, detail_url, content, is_best_answer, like_count, user_id, user_name, user_type, is_host, replied_user_id, replied_user_name, created_at, device, updated_at, baby_agethen, baby_days, floorid, noise, platform_id = line noise = mother_baby_denoise(content, l) print( smart_encode("\t".join([ id, channel, subject, post_id, title, tags, reply_count, view_count, collection_count, detail_url, content, is_best_answer, like_count, user_id, user_name, user_type, is_host, replied_user_id, replied_user_name, created_at, device, updated_at, baby_agethen, baby_days, floorid, noise, platform_id ]))) except Exception as e: print( smart_encode(traceback.format_exc().replace("\t", " ").replace( "\n", " ")))
def execute(self, sql):
    """Execute each non-empty ``;``-separated statement found in ``sql``.

    A new cursor is opened on ``self.conn`` and stored on ``self.cursor``.
    The input is split on ``;``; each piece is stripped, blank pieces are
    skipped, and the rest are passed through ``smart_encode`` and executed
    in order. Neither the cursor nor the connection is closed here.

    :param sql: one or more statements separated by ``;``.
    """
    self.cursor = self.conn.cursor()
    for chunk in sql.split(";"):
        statement = chunk.strip()
        if not statement:
            continue
        self.cursor.execute(smart_encode(statement))
def get(self, sql):
    """Return the first row produced by ``sql``, or ``None`` if there is none.

    :param sql: statement to run; it is passed through ``smart_encode`` and
        then handed to ``self.query``.
    :return: the first element of the query result, or ``None`` when the
        result is empty.
    """
    result = self.query(smart_encode(sql))
    if len(result) == 0:
        return None
    return result[0]