def zsite2keyword(z):
    """Build a keyword -> weight mapping for zsite object *z*.

    Title/url/mail tokens are weighted 2, body-text tokens 1; for career
    entries the first one weighs 2 and later ones 1.

    Returns a ``defaultdict(int)`` mapping token -> accumulated weight.

    Fixes vs. original: removed the unused ``r = []`` and
    ``man_txt_len = len(txt)`` locals, and stopped shadowing the builtin
    ``id``.
    """
    t = defaultdict(int)
    zid = z.id  # renamed from `id` to avoid shadowing the builtin
    name = z.name
    if name:
        for word in seg_title_search(name):
            t[word] += 2
    url = url_by_id(zid)
    if url:
        t[url] += 2
    if z.cid == CID_SITE:
        for word in seg_title_search(motto_get(zid)):
            t[word] += 1
    elif z.cid == CID_USER:
        mail = mail_by_user_id(zid)
        if mail:
            t[mail] += 2
            # local part of the address is indexed as well
            t[mail.split('@', 1)[0]] += 2
    txt = txt_get(zid)
    if txt:
        for word in seg_txt_search(txt):
            t[word] += 1
    for seq, career in enumerate(career_list_all(zid)):
        # first career entry counts double
        add = 2 if seq == 0 else 1
        for field in (career.unit, career.title):
            if field:
                for word in seg_title_search(field):
                    t[word] += add
        if career.txt:
            for word in seg_txt_search(career.txt):
                t[word] += add
    return t
def split_words(text):
    """Tokenize *text* with seg_txt_search and return the tokens as a list."""
    return [word for word in seg_txt_search(text)]
def guess_keywords(querystring, known_prefix=None):
    """Expand *querystring* into xapian-ready query terms.

    Splits on whitespace; a ``prefix:rest`` piece whose prefix appears in
    *known_prefix* has the mapped prefix prepended to each of its tokens.
    ASCII-leading tokens are stemmed with the English stemmer; any token
    starting with a non-ASCII character sets the returned flag (treated as
    Chinese by the caller).

    Returns ``(contains_chinese, term_list)``.
    """
    stemmer = xapian.Stem('en')
    terms = set()
    has_cjk = 0
    for chunk in querystring.split():
        term_prefix = None
        if known_prefix and ':' in chunk:
            head, tail = chunk.split(':', 1)
            if head in known_prefix:
                term_prefix = known_prefix[head]
                chunk = tail
        for token in seg_txt_search(chunk):
            if ord(token[0]) <= 127:
                # first char is ASCII -> stem as English
                token = stemmer(token)
            else:
                has_cjk = 1
            terms.add(term_prefix + token if term_prefix else token)
    return has_cjk, list(terms)
def split_words(text):
    """Tokenize *text* with seg_txt_search and return the tokens as a list.

    NOTE(review): this re-definition shadows the earlier split_words in
    this module; confirm the duplication is intentional.
    """
    return list(seg_txt_search(text))