continue freq = array[1] if freq.find('.') != -1: freq = int(freq[:freq.index('.')]) else: freq = int(freq) if freq < min_freq: break word = array[0] array_word = word.split(" ") for w in array_word: if len(w) > 3 or len(w) == 1: continue if not help.is_all_chinese(w): continue if w in merge_dict or w in brand_dict or w in new_word_dict: continue if w[:2] in merge_dict or w[:2] in new_word_dict: continue query_log_new_word_dict[w] = 1 print len(query_log_new_word_dict) output_new_word = codecs.open(config.ROOT_SPIDER_PATH+"new_word_from_log.alternative", "w", "utf-8") for word in query_log_new_word_dict: output_new_word.write(word+"\n") output_new_word.close()
mergeDict[array[0]] = 1 # 京东品牌 brandDict = dict() with codecs.open(input_brand_firsthand_file, "r", "utf-8") as f: for line in f: brandDict[line.strip()] = 1 import jieba newWordDic = dict() # 天猫品牌 with codecs.open(input_brand_tmall_file, "r", "utf-8") as f: for line in f: array = line.strip().split("/") if len(array) == 2 and help.is_all_chinese(array[1]): brandDict[array[1]] = 1 elif help.is_all_chinese(array[0]): brandDict[array[0]] = 1 print "total brand number is ", len(brandDict) def get_new_word(line_info): word_alter = line_info.strip() if u"爷的剑" in word_alter: print line if word_alter in brandDict: return if len(word_alter) == 1: