continue

        # Parse the frequency column. When it contains a decimal point, only
        # the digits before the point are kept (truncation, not rounding).
        freq = array[1]
        if freq.find('.') != -1:
            freq = int(freq[:freq.index('.')])
        else:
            freq = int(freq)
        # Stop processing entirely at the first entry below min_freq
        # (presumably the input is sorted by descending frequency -- TODO confirm).
        if freq < min_freq:
            break

        # The first column may hold several space-separated tokens; each one
        # is examined independently.
        word = array[0]
        array_word = word.split(" ")
        for w in array_word:
            # Skip tokens longer than 3 or of length exactly 1.
            # NOTE(review): empty tokens (e.g. from consecutive spaces) are not
            # rejected here and reach help.is_all_chinese -- confirm intended.
            if len(w) > 3 or len(w) == 1:
                continue
            # Skip tokens that help.is_all_chinese rejects.
            if not help.is_all_chinese(w):
                continue
            # Skip tokens already present in any of the known dictionaries.
            if w in merge_dict or w in brand_dict or w in new_word_dict:
                continue

            # Skip tokens whose first two characters already form an entry in
            # merge_dict or new_word_dict (brand_dict is not consulted here).
            if w[:2] in merge_dict or w[:2] in new_word_dict:
                continue

            # Record the token as a new-word candidate; the dict is used as a
            # set, so the stored value is just a placeholder.
            query_log_new_word_dict[w] = 1

# Report how many candidate words were collected. The parenthesized
# single-argument form prints the same text under Python 2's print statement.
print(len(query_log_new_word_dict))

# Write one candidate per line. The context manager guarantees the output file
# is closed even if a write raises, which the manual open()/close() pair did not.
with codecs.open(config.ROOT_SPIDER_PATH + "new_word_from_log.alternative", "w", "utf-8") as output_new_word:
    for word in query_log_new_word_dict:
        output_new_word.write(word + "\n")
# Example no. 2
# 0
        mergeDict[array[0]] = 1

# JD.com brands: every stripped line of the file becomes a key of brandDict
# (the dict is used as a set; the value 1 is a placeholder).
brandDict = {}
with codecs.open(input_brand_firsthand_file, mode="r", encoding="utf-8") as f:
    for line in f:
        jd_brand = line.strip()
        brandDict[jd_brand] = 1

import jieba
newWordDic = dict()

# Tmall brands: each line is split on "/". When there are exactly two fields
# and the second passes help.is_all_chinese, the second field is stored;
# otherwise the first field is stored if it passes the same check.
with codecs.open(input_brand_tmall_file, mode="r", encoding="utf-8") as f:
    for line in f:
        array = line.strip().split("/")
        if len(array) == 2 and help.is_all_chinese(array[1]):
            tmall_brand = array[1]
        elif help.is_all_chinese(array[0]):
            tmall_brand = array[0]
        else:
            # Neither field qualifies; nothing to record for this line.
            continue
        brandDict[tmall_brand] = 1

# Report the number of distinct brand names collected in brandDict.
print "total brand number is ", len(brandDict)


def get_new_word(line_info):
    word_alter = line_info.strip()
    if u"爷的剑" in word_alter:
        print line
    if word_alter in brandDict:
        return

    if len(word_alter) == 1: