def get_cs_by_lucene_doc(self, docs, context):
    """Tokenise the entries behind *docs* into *context* and run ``self.pe``.

    For every Lucene document the matching ``Entry`` is looked up by its
    ``link`` field and appended to ``self.entries``.  The first 200
    characters of the entry summary and then the full title are analysed
    with ``self.analyzer``; each token's term and type are appended to
    ``context.tokens`` / ``context.token_types``.  For title tokens the
    position they occupy in ``context.tokens`` is also recorded in
    ``context.title_field``.  After each document the running token count
    is appended to ``context.term_doc_range``.

    Returns a 3-tuple ``(self.pe.extract(context), lucene_ids, categories)``
    where ``lucene_ids`` holds the integer ``id`` field of every document
    and ``categories`` is always an empty list.
    """
    lucene_ids = []
    # Never filled: the per-entry category collection is disabled.
    categories = []
    for lucene_doc in docs:
        link = lucene_doc.get("link")
        lucene_ids.append(int(lucene_doc.get("id")))
        entry = dao.get_by_link(link, Entry)
        self.entries.append(entry)

        # TODO boost title field
        summary_reader = StringReader(entry.summary[:200])
        for token in self.analyzer.tokenStream("summary", summary_reader):
            context.tokens.append(token.term())
            context.token_types.append(token.type())

        title_reader = StringReader(entry.title)
        for token in self.analyzer.tokenStream("title", title_reader):
            # Index the title token is about to take in context.tokens.
            context.title_field.append(len(context.tokens))
            context.tokens.append(token.term())
            context.token_types.append(token.type())

        # End offset (exclusive) of this document's tokens.
        context.term_doc_range.append(len(context.tokens))

    extracted = self.pe.extract(context)
    return extracted, lucene_ids, categories
def c():
    """Recompute and store the category of indexed entries.

    For every index position ``i`` below the number of ``Entry`` rows, the
    Lucene document ``i`` is loaded, its ``Entry`` is looked up by the
    document's ``link`` field and ``MatrixMapper.build([doc])`` is asked for
    categories.  The first result's ``text`` / ``label_weight`` are used; if
    nothing is returned the entry falls back to the category '其他'
    ("other") with weight 0.  The category is stored through
    ``dao.save_category(cat, weight, 's')`` and the entry is saved.

    Any exception raised while handling one document is printed together
    with its index and the loop carries on with the next document.  The
    index reader is always closed before returning.
    """
    from apps.wantown import dao
    from apps.wantown.models import Entry
    from dot.matrixmapper import MatrixMapper

    STOP_WORDS = [u'a', u'an', u'and', u'are', u'as', u'at', u'be', u'but', u'by',
                  u'for', u'if', u'in', u'into', u'is', u'it',
                  u'no', u'not', u'of', u'on', u'or', u'such',
                  u'that', u'the', u'their', u'then', u'there', u'these',
                  u'they', u'this', u'to', u'was', u'will', u'with',
                  u'you',u'your',u'we',u'he',u'him',u'how',u'where',
                  # add by myself
                  u'i',u'been',u'about',u'们',u'这',u'那',u'的',u'己',u'个',u'我',u'你',u'很',u'了',u'是',u'以',u'过',u'一',u'么',u'没',u'在']
    mapper = MatrixMapper(STOP_WORDS)

    # Only the number of rows is needed, so let the database count them
    # instead of loading every Entry.
    entry_count = Entry.objects.count()

    ireader = IndexReader.open(STORE_DIR)
    try:
        for i in range(entry_count):
            try:
                doc = ireader.document(i)
                entry = dao.get_by_link(doc.get('link'), Entry)
                category = mapper.build([doc])
                if category:
                    cat = category[0].text
                    weight = category[0].label_weight
                else:
                    cat = '其他'
                    weight = 0
                entry.category = dao.save_category(cat, weight, 's')
                entry.save()
            except Exception as e:
                # Best effort: report the failing document and keep going.
                print('%s %s' % (i, e))
    finally:
        # Release the index files even if the loop is interrupted.
        ireader.close()
def sortValue(self, i):
    """Return the sort value for the scored document *i*.

    *i* is cast to a ``FieldDoc``; the stored document it points at is read
    from ``self.reader`` and its ``link`` field is used to look up the
    ``Entry``.  The value is
    ``dao.get_category_count_by_entry(self.query, entry)`` wrapped in a
    ``Double``.
    """
    hit = FieldDoc.cast_(i)
    stored = self.reader.document(hit.doc)
    entry = dao.get_by_link(stored.get('link'), Entry)
    return Double(dao.get_category_count_by_entry(self.query, entry))
def bits(self, reader):
    """Return the set of cached hits that belong to ``self.category_id``.

    The document ids of the current query are read from the cache under the
    key ``self.query`` with spaces replaced by ``'+'``.  For each id the
    stored document is loaded from *reader*, its ``Entry`` is looked up by
    the ``link`` field, and the bit for that id is set when any record
    returned by ``dao.get_qec_by_qe(self.query, entry.id)`` has a category
    whose id equals ``self.category_id``.

    Documents without a matching ``Entry`` are skipped.  When nothing is
    cached for the query the returned set is empty.

    :param reader: index reader providing ``maxDoc()`` and ``document(id)``.
    :returns: a ``BitSet`` sized to ``reader.maxDoc()``.
    """
    matched = BitSet(reader.maxDoc())
    doc_ids = cache.get(self.query.replace(' ', '+'))
    for doc_id in doc_ids or ():
        doc = reader.document(doc_id)
        entry = dao.get_by_link(doc["link"], Entry)
        if not entry:
            # Indexed document with no stored entry: it cannot match.
            continue
        for qec in dao.get_qec_by_qe(self.query, entry.id):
            if qec.category.id == self.category_id:
                matched.set(doc_id)
                break
    print(matched)
    return matched
def fetch_entries(feed,entries):
    """Store the entries of a parsed feed that are not stored yet.

    An entry is skipped (and the next one is processed) when

    * an ``Entry`` with the same link already exists,
    * its title is 200 characters or longer,
    * no summary/content text can be obtained from it, or
    * the tag-stripped summary is 100 characters or shorter.

    The entry date is taken from ``updated_parsed``; when that is missing
    the current local time is used.  The category comes from the first
    tag's ``term`` or, failing that, from the first value of
    ``categories``; entries without either are still saved, just without a
    category.  Errors raised while saving are printed and do not stop the
    loop.

    :param feed: the feed model the new entries are attached to.
    :param entries: iterable of parsed feed entries (dict-like).
    """
    for entry in entries:
        entry_link = entry['link']
        if dao.get_by_link(entry_link, Entry):
            # Already stored.
            continue
        entry_model = Entry(feed=feed, link=entry_link)

        entry_model.title = entry['title']
        if len(entry_model.title) >= 200:
            continue

        entry_model.author = entry.get('author', 'unknow')
        entry_model.summary = entry.get('summary', '')
        if not entry_model.summary:
            content = entry.get('content', '')
            try:
                if isinstance(content, unicode):
                    entry_model.summary = content
                else:
                    entry_model.summary = content[0].get('value', '')
            except (IndexError, KeyError, TypeError, AttributeError):
                # No usable content in this entry.
                continue

        # clear html tags
        entry_model.summary = strip_tags(entry_model.summary)
        if len(entry_model.summary) <= 100:
            # Too short: skip this entry only, keep processing the rest.
            continue

        # `updated` is not passed to time.localtime(): it expects a number
        # of seconds (or None) and raises TypeError for anything else.
        when = entry.get('updated_parsed') or time.localtime()
        entry_model.when = datetime.datetime(*when[:5])

        tags = None
        if entry.has_key('tags'):
            tag_list = entry.get('tags', '')
            # Guard against an empty tag list before indexing it.
            tags = tag_list[0].get('term', '') if tag_list else None
        if not tags and entry.has_key('categories'):
            categories = entry.get('categories')
            if categories:
                tags = list(categories.values())[0]

        if not tags:
            # Saved anyway, without a category.
            print('no tags.ignored...')
        else:
            entry_model.category = dao.save_category(tags)

        try:
            dao.save_model(entry_model)
        except Exception as e:
            print('save error: %s' % (e,))
# a(): ad-hoc experiment, takes no arguments and has no return statement.
#
# What the statements below do, in order:
#   * search the index for "java" with dot.searcher.Searcher and collect the
#     Lucene document of every hit;
#   * for the first 100 documents, look the Entry up by its "link" field,
#     append entry.summary to `entries`, and analyse
#     entry.summary[:200] + entry.title with a StandardAnalyzer;
#   * for each token build a dot.context.Token (text = termText(),
#     offset = termLength(), doc = running document counter), append it to
#     `allToken`, append the term to `allText`, and print dir() of the token;
#   * record `docRange[len(allText)] = id` and copy the collected data into a
#     dot.context.Context (tokens, text, token_types, docs, term_doc_range);
#   * print len(all) and call MatrixMapper().build(docs[:100]).
#
# The `if 0:` section never runs; it reads `results`, which is only assigned
# in commented-out code.  The triple-quoted text at the end is not assigned
# to anything -- presumably pasted sample output, TODO confirm.
#
# NOTE(review): the original line breaks and indentation of this function
# are lost in this copy (everything after the first '#' on a physical line
# is read as a comment), so statement nesting cannot be recovered from the
# text alone.  Restore the formatting from version control before editing.
# NOTE(review): Python 2 syntax (print statements, list-returning
# dict.keys()).
def a(): import os #loader = BSDDictLoader() #dic = loader.load() words_dict = {} from dot.searcher import Searcher, STORE_DIR from apps.wantown import dao from apps.wantown.models import Entry searcher = Searcher() hits = searcher.search("java") docs = [] for hit in hits: doc = Hit.cast_(hit).getDocument() docs.append(doc) entries = [] all = '' from dot.context import Context, Token context = Context() import re #all = re.sub('[0-9:;;/\(\)\t\[\]]()\**#&','',all) #all = re.sub('[ +=-]',' ',all) analyzer = StandardAnalyzer() # doc id id = 0 allToken = [] allText = [] pureText = '' c = 0 docRange = {} for doc in docs[0:100]: link = doc.get("link") entry = dao.get_by_link(link, Entry) entries.append(entry.summary) all = entry.summary[:200] + entry.title pureText += all tokenType = [] last_type = '' #all = """提起电吉他演奏,就必须提到布鲁斯音乐;提起最伟大的吉他演奏大师,人们首先会想到的是 Jimi Hendrix,但是说起依然在世的最伟大的吉他演奏家,名字只有一个——Eric Clapton爵士。自从上个世纪60年代布鲁斯摇滚乐以及布鲁斯吉他演奏成为了主流摇滚风格之后,在这种来源于黑人音乐的吉他演奏中,在所有除黑色外其他肤色的布鲁斯吉他演奏家之中,传奇人物Eric Clapton毫无疑问是其中最杰出的一位。在与Eric Clapton同时代的所有艺术家纷纷的离开人世,或者失去了原有的歌迷号召力之后,Eric Clapton是所有当年这些艺术家中为数不多的既然保持着自己高超的演奏技术以及强大的市场号召力的艺术家。 # #Eric Clapton为人谦逊,在与其他出色的吉他演奏者比如Jimi Hendrix,B.B. King,Duane Allman,甚至后辈Stevie Ray Vaughan相比较的时候他总是非常谦恭,在与B.B. 
 King以及Bob Dylan等人同台的时候他总是举止非常礼让,他是最有绅士风度的流行音乐家之一。同时,作为世界上最著名的吉他大师,Eric Clapton还经常热心的帮助包括英国著名流行音乐家Sting,Bon Jovi乐队主音吉他手Richie Sambora在内的其他一些音乐家去录制专辑或者拍摄音乐录影带,并且经常为一些音乐家担任吉他手作伴奏。Eric Clapton曾经协助过Bob Dylan,Aretha Franklin,Joe Cocker,Ringo Starr,Freddie King,Roger Waters等等近百位艺术家的专辑录制。 #""" stream = analyzer.tokenStream("fieldname", StringReader(all)) for s in stream: #if (last_type == '<ALPHANUM>' or last_type == '<HOST>') and (s.type() == '<ALPHANUM>' or s.type() == '<HOST>'): #all.append(' ') #pass #last_type = s.type() token = Token() token.text = s.termText() token.offset = s.termLength() token.doc = id allToken.append(token) allText.append(s.term()) print dir(s) c += 1 docRange[len(allText)] = id #all = sorted(all,cmp=lambda x,y:cmp(x.termText(),y.termText())) id += 1 context.tokens = allText #context.tokens.sort() #for i in context.tokens: #print i #print s context.text = '' context.token_types = tokenType context.docs = entries context.term_doc_range = docRange print len(all) from dot.lingo import pextractor import time start = time.time() #pe = pextractor.PhraseExtractor() #results = pe.extract(context) count = 0 r = docRange.keys() r.sort() if 0: for i in results: if len(i.text) > 1 and i.freq > 2 and len(i.text) < 20: id = i.id - 1 lcp = context.lcp[id + 1] for f in range(i.freq): begin = context.suffix[id] end = context.suffix[id] + lcp for j in range(len(r)): if begin < r[j]: break doc = docRange[r[j]] #print context.tokens[begin:end],i.freq,begin,doc if end > r[j]: print 'not in the same doc' id += 1 #print i.text.strip(), i.freq,i.doc_freq #print (time.time() - start) from dot.matrixmapper import MatrixMapper mapper = MatrixMapper() mapper.build(docs[:100]) #print pureText import sys from dot.lingo import suffixsorter as ss #for i in range(len(context.suffix)): # s = pe.list2str(context.tokens) # sys.stdout.write('%d\t%d\t%s\n' % (context.suffix[i], context.lcp[i], context.tokens[context.suffix[i]:context.suffix[i] + 10])) #dm = getDictManager() 
#words_dict= featurex.tf_idf(entries, dm.seg_dict) #doc1 = featurex.Document(entries.encode('utf-8'),dm) #doc2 = featurex.Document(entries[0].encode('utf-8'), dm) #for i in words_dict.values(): #print i.word,i.frequency,i.feature_value,i.tfidf #print similitude_doc_cos(doc1, doc2) """ ibm jdk 3 {3: 3} 不同 3 {4: 2, 7: 1} 使用 3 {8: 2, 7: 1} 可以 10 {8: 3, 3: 2, 4: 2, 7: 3} 处理 3 {8: 3} 好的 3 {8: 1, 7: 2} 字体 5 {8: 2, 4: 3} 已经 4 {9: 1, 3: 1, 4: 1, 7: 1} 平滑 4 {8: 1, 4: 3} 应用 3 {8: 1, 4: 2} 手机上 3 {7: 3} 文本 3 {8: 3} 游戏 4 {7: 4} 环境 3 {1: 1, 3: 2} 的java 6 {1: 1, 2: 1, 5: 1, 7: 3} 的文 3 {8: 3} 设置 5 {4: 5} 软件 3 {1: 1, 7: 2} 运行 3 {1: 1, 7: 2} """
# NOTE(review): this physical line runs three things together: the tail
# (`return ''.join(result)`) of a function whose beginning is not shown
# here, the complete fetch() definition, and the bare header of
# fetch_entries().  Original line breaks and indentation are lost.
#
# fetch(url, is_write=True): parse the feed at `url` and register it.
#   * prints 'parsing: ' followed by the url;
#   * returns None when `url` is empty;
#   * returns 'pass' when a Feed with rss_link == url already exists;
#   * parses the url with feedparser.parse(); on any exception prints
#     'parsing error' with the exception and returns None;
#   * looks the Feed up by the parsed feed's link through dao.get_by_link(),
#     creating Feed(link=...) when none is found;
#   * sets the title from the parsed feed and returns 'pass' when a Feed
#     with that title already exists;
#   * stores description and rss_link, then saves via dao.save_model();
#   * prints 'this feed has not entries' and returns None when the parsed
#     result has no entries;
#   * calls fetch_entries(feed, soup['entries']) when `is_write` is true.
# NOTE(review): whether the title/description/save statements are nested
# under `if not feed:` cannot be told from this collapsed text -- confirm
# against the formatted original before changing this function.
return ''.join(result) def fetch(url,is_write=True): print 'parsing: ',url if not url: return if Feed.objects.filter(rss_link=url): return 'pass' try: soup = feedparser.parse(url) except Exception,e: print 'parsing error',e return feed_link = soup.feed.get('link','') feed = dao.get_by_link(feed_link, Feed) if not feed: feed = Feed(link=feed_link) feed.title = soup.feed.get('title','') if Feed.objects.filter(title=feed.title): return 'pass' feed.description = soup.feed.get('description','') feed.rss_link = url dao.save_model(feed) if not soup['entries']: print 'this feed has not entries' return if is_write: fetch_entries(feed,soup['entries']) def fetch_entries(feed,entries):
def query(query, page,category_what,data_size=200,nobuildcategory=False):
    """Run a search and return one page of results plus category labels.

    :param query: the search keywords.
    :param page: 1-based page number; values outside ``1..pages`` are
        clamped to the nearest valid page.
    :param category_what: ``what`` of the category to filter by, or a false
        value for an unfiltered search.
    :param data_size: number of summary characters kept per result (an
        ellipsis ``"..."`` is always appended).
    :param nobuildcategory: accepted for compatibility; not used here.
    :returns: ``(results, scores, total, phrases, labels)`` where
        ``results`` are the ``Entry`` objects of the page, ``scores`` their
        hit scores, ``total`` the number of hits, and ``labels`` at most ten
        labels.  When ``QueryCategoryDisp`` rows exist for the query,
        ``phrases`` is ``{}`` and the labels are
        ``[weight, category.what, count]`` lists sorted descending;
        otherwise both come from ``discover_freq_phrases(docs, query)``.

    Side effect: the ids of all hits are cached for one hour under the
    query with spaces replaced by ``'+'``.
    """
    category_id = None
    if category_what:
        # NOTE(review): raises IndexError when no category has this `what`.
        category_ = dao.Category.objects.filter(what=category_what)[0]
        category_id = category_.id

    hits = searcher.search(query, category_id)
    doc_ids = [hits.id(i) for i in range(len(hits))]
    # Spaces are replaced with '+' here, otherwise the cache raises an
    # error; catfilter must apply the same replacement to the query when it
    # reads the value back from the cache.
    cache.add(query.replace(' ', '+'), doc_ids, 3600)

    total = hits.length()
    # Number of the last page (ceiling division, integer on every Python).
    pages_num = (total + PAGE_SIZE - 1) // PAGE_SIZE
    # Clamp into [1, pages_num]; with no hits this yields page 1 and an
    # empty slice below.
    page = max(1, min(page, pages_num))
    start = (page - 1) * PAGE_SIZE

    results = []
    scores = []
    docs = []
    for position in range(start, min(start + PAGE_SIZE, total)):
        doc = hits.doc(position)
        docs.append(doc)
        entry = dao.get_by_link(doc.get("link"), Entry)
        if entry:
            entry.summary = entry.summary[0:data_size] + "..."
            results.append(entry)
            scores.append(hits.score(position))

    dispCats = dao.QueryCategoryDisp.objects.filter(query__keyword=query)
    if dispCats:
        label = []
        for cat in dispCats:
            qec = dao.QueryEntryCategory.objects.filter(
                query__keyword=query, category=cat.category)
            # Let the database count instead of loading every row.
            label.append([cat.weight, cat.category.what, qec.count()])
        label.sort(reverse=True)
        return results, scores, total, {}, label[:10]

    phrases, label_doc = discover_freq_phrases(docs, query)
    return results, scores, total, phrases, label_doc[:10]