def update_label_id(url, label_id):
    """
    Update the label_id set based on the url: add the labels that the
    url belongs to into the label_id set.
    :param url: url visited by the user; labels are looked up from it
    :param label_id: the user's set of label ids
    :return:
    """
    url = trans_str(url)
    host = utility.url_to_host(url)
    domain = utility.host_to_domain(host)
    # The KeyError guards cover the case where a whole rule category
    # ('url' / 'host' / 'domain') is missing from label_rules.
    try:
        if url in label_rules['url']:
            label_id.update(label_rules['url'][url])
    except KeyError:
        pass
    try:
        if host in label_rules['host']:
            label_id.update(label_rules['host'][host])
    except KeyError:
        pass
    try:
        if domain in label_rules['domain']:
            label_id.update(label_rules['domain'][domain])
    except KeyError:
        pass
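# A minimal usage sketch for update_label_id. The label_rules structure is
# inferred from the lookups above (a dict of rule tables keyed by match
# type), not confirmed by this excerpt; the values are illustrative only:
#
#   label_rules = {
#       'url':    {'http://example.com/page': {101}},
#       'host':   {'example.com': {102}},
#       'domain': {'example.com': {103}},
#   }
#   label_id = set()
#   update_label_id('http://example.com/page', label_id)
#   # label_id now holds every label whose url/host/domain rule matched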
def insert_host_info(
        host_pv_file, urls_set,
        host="127.0.0.1", port=3306, user="******", password="", database="test"):
    print("urls len: {0}".format(len(urls_set)))
    conn = connect(host=host, port=port, user=user, passwd=password,
                   db=database, charset='utf8')
    cursor = conn.cursor()
    with open(host_pv_file, mode='r') as fd:
        for line in fd:
            line = line.strip()
            # Each line is "<host> <pv>", separated by a single space.
            # site_host avoids shadowing the DB-host parameter above.
            site_host, pv = line.split(" ")
            if site_host not in urls_set:
                # print("host {0} not in urls set".format(site_host))
                continue
            site_host = utility.spider_url_to_dpi_url(site_host)
            site_host = utility.url_to_host(site_host)
            suffix = utility.get_suffix(site_host)
            # Parameterized query: avoids broken statements (and injection)
            # when a host or suffix contains quote characters.
            insert_sql = ("insert into dmp_site_info "
                          "(domain, suffix, pv_yesterday) values (%s, %s, %s)")
            try:
                cursor.execute(insert_sql, (site_host, suffix, int(pv)))
            except IntegrityError:
                traceback.print_exc()
    conn.commit()
    conn.close()
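# A usage sketch for insert_host_info, assuming the "<host> <pv>" line
# format parsed above. File name and connection values are placeholders,
# not values from the source:
#
#   # host_pv.txt:
#   #   www.example.com 12345
#   #   news.example.org 678
#   urls_set = {"www.example.com"}
#   insert_host_info("host_pv.txt", urls_set, host="127.0.0.1",
#                    user="dmp_user", password="secret", database="test")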
def write_mongo(host, port, db_name, collection_name, file_name):
    client = MongoClient(host, port)
    print(client.server_info())
    dbs = client.database_names()
    print('\t'.join(dbs))
    db = client.get_database(db_name)
    collections = db.collection_names(include_system_collections=False)
    print('\t'.join(collections))
    collection = db.get_collection(collection_name)
    # Map each category tag in the input file to its rule group id.
    category_group_ids = {
        "色情": seqing_group_id,    # porn
        "赌博": dubo_group_id,      # gambling
        "小说": xiaoshuo_group_id,  # fiction
        "长尾": changwei_group_id,  # long tail
    }
    with open(file_name, "r") as fd:
        for line in fd:
            line = line.strip()
            segs = line.split("\t")
            if len(segs) < 2:
                continue
            host = utility.url_to_host(segs[1])
            domain = utility.host_to_domain(host)
            if (ValidHostnameRegex.match(domain) is not None
                    or ValidIpAddressRegex.match(domain) is not None):
                rule = host + "/*"
                group_id = category_group_ids.get(segs[0])
                if group_id is not None:
                    collection.insert_one({
                        "table": "domain_rule",
                        "group_id": group_id,
                        "domain": domain,
                        "rule": rule
                    })
                    print("{0} {1}".format(segs[0], segs[1]))
                else:
                    # "其他" = other: category tag with no group mapping.
                    print("其他 {0} {1}".format(segs[0], segs[1]))
    # Sanity check: fetch one document back from the collection.
    cursor = collection.find_one()
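# A sketch of the input write_mongo expects: one "<category>\t<url>" pair
# per line, where <category> is one of the tags mapped above. The file name
# and connection values below are illustrative only:
#
#   # rules.tsv:
#   #   色情\thttp://bad.example.com/x
#   #   小说\thttp://novel.example.net/y
#   write_mongo("127.0.0.1", 27017, "dmp", "rules", "rules.tsv")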
def get_result_urls(results_file):
    urls_set = set()
    with open(results_file, mode='r') as fd:
        for line in fd:
            line = line.strip()
            segs = line.split(",")
            if len(segs) < 6:
                continue
            host = utility.spider_url_to_dpi_url(segs[0])
            host = utility.url_to_host(host)
            # print("add host {0}".format(host))
            urls_set.add(host)
    return urls_set
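# get_result_urls chains naturally into insert_host_info: the crawl results
# file (comma-separated, url in the first of at least six fields, as parsed
# above) yields the host whitelist used when loading pv counts. File names
# here are placeholders:
#
#   urls_set = get_result_urls("spider_results.csv")
#   insert_host_info("host_pv.txt", urls_set)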
def insert_host_content(
        results_file,
        host="127.0.0.1", port=3306, user="******", password="", database="test"):
    conn = connect(host=host, port=port, user=user, passwd=password,
                   db=database, charset='utf8')
    cursor = conn.cursor()
    count = 0
    failed_count = 0
    ie_failed_count = 0
    pe_failed_count = 0
    de_failed_count = 0
    oe_failed_count = 0
    with open(results_file, mode="r") as fd:
        for line in fd:
            line = line.strip()
            segs = line.split(",")
            if len(segs) < 6:
                continue
            site_host = utility.spider_url_to_dpi_url(segs[0])
            site_host = utility.url_to_host(site_host)
            title = segs[1]
            keywords = segs[2]
            description = segs[3]
            # Skip rows whose title or keywords contain no Chinese text.
            if title is None or not chinese.search(title.decode('utf8')):
                continue
            if keywords is None or not chinese.search(keywords.decode('utf8')):
                continue
            site_id = get_site_id(site_host, conn)
            if site_id == 0:
                print("get site id for host {0} failed".format(site_host))
                failed_count += 1
                continue
            # Parameterized query: titles/keywords routinely contain quotes
            # and commas that would break an interpolated statement.
            insert_sql = ("insert into dmp_site_content "
                          "(site_id, domain, title, keywords, description) "
                          "values (%s, %s, %s, %s, %s)")
            try:
                cursor.execute(insert_sql,
                               (site_id, site_host, title, keywords, description))
            except IntegrityError:
                traceback.print_exc()
                ie_failed_count += 1
                continue
            except ProgrammingError:
                traceback.print_exc()
                print("Error row: {0}".format(line))
                pe_failed_count += 1
                continue
            except DataError:
                traceback.print_exc()
                print("Error row: {0}".format(line))
                de_failed_count += 1
                continue
            except OperationalError:
                traceback.print_exc()
                print("Error row: {0}".format(line))
                oe_failed_count += 1
                continue
            count += 1
            # Commit in batches of ~100 rows to bound transaction size.
            if count > 100:
                conn.commit()
                count = 0
    conn.commit()
    conn.close()
    print("get site id failed number: {0}".format(failed_count))
    print("IntegrityError failed number: {0}".format(ie_failed_count))
    print("ProgrammingError failed number: {0}".format(pe_failed_count))
    print("DataError failed number: {0}".format(de_failed_count))
    print("OperationalError failed number: {0}".format(oe_failed_count))
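# get_site_id is called above but not defined in this excerpt. A minimal
# sketch consistent with how it is used (returns the dmp_site_info row id,
# or 0 when the host is unknown); the id/domain column names are assumptions:
#
#   def get_site_id(site_host, conn):
#       cursor = conn.cursor()
#       cursor.execute("select id from dmp_site_info where domain = %s",
#                      (site_host,))
#       row = cursor.fetchone()
#       return row[0] if row else 0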
        middle_tokens.append(token)
    return ''.join(middle_tokens), '.'.join(suffix_tokens[::-1])


if __name__ == "__main__":
    filename = sys.argv[1]
    full_filename = os.path.split(os.path.realpath(__file__))[0] + os.path.sep + filename
    # print analyse.default_tfidf.stop_words
    # print full_filename
    with open(full_filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line or '\t' not in line:
                continue
            try:
                url, body = line.split('\t', 1)
                host = utility.url_to_host(url)
                middle, suffix = split_host(host)
                # `in` replaces dict.has_key(), which Python 3 removed.
                if suffix in suffix_dict:
                    print(u'\t'.join([suffix_dict[suffix], url, suffix]))
                    continue
                body = body.decode('utf-8', 'ignore')
                title = body.split('\1')[0]
                # title, keywords, description, p_list, a_list = body.split('\01', 4)
                body = body.replace('\01', ' ')
                if not chinese.search(body):
                    continue
                # tags = analyse.extract_tags(body, topK=20, withWeight=True)
                tags = analyse.extract_tags(body, topK=20, withWeight=False)
                out_tag = json.dumps(tags, ensure_ascii=False)