def test1(): obj = HtmlContent.objects.get(pk=46) key, sum, cls = summarize(obj.content) key2 = key_rank(obj.content) sents = list(cut_sentence(obj.content)) docs = [list(Tokenize(sent)) for sent in sents] sum2 = summarize4(sents) print 'key1', key, cls print 'key2', ', '.join(key2) print 'sum1', sum print 'sum2', sum2
# NOTE(review): duplicate definition — this re-defines test1 (differing from
# the earlier copy only in whitespace), so at import time this later version
# shadows the earlier one. One of the two should be deleted.
def test1():
    """Print keyword/summary output for HtmlContent pk=46 (manual smoke test)."""
    obj = HtmlContent.objects.get(pk=46)
    # summarize() yields (keywords, summary, classification); 'sum' shadows
    # the builtin here.
    key,sum,cls = summarize(obj.content)
    key2 = key_rank(obj.content)
    sents = list(cut_sentence(obj.content))
    # Tokenized sentences are computed but never used below.
    docs = [list(Tokenize(sent)) for sent in sents]
    sum2 = summarize4(sents)
    print 'key1', key, cls
    print 'key2',', '.join(key2)
    print 'sum1',sum
    print 'sum2',sum2
def main(): hashm = zerorpc.Client('tcp://yaha.v-find.com:5678') #load_hashes(hashm) sim_server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver')) r = redis.Redis() corpus = [] while True: try: # process queue as FIFO, change `blpop` to `brpop` to process as LIFO source, data = r.blpop(["groud_crawler:items", "followall:items"], timeout=20) except KeyboardInterrupt: print 'Exit' break except: #print 'No blpop', len(corpus) if len(corpus) > 0: sim_server.index(corpus) corpus = [] continue try: #print source, type(data) item = json.loads(data) except: print 'Load json error' continue url = item['url'] try: html = HtmlContent.objects.get(url=url) #Ignore the exists item, TODO use bloomfilter to ignore if html.status != 2: continue except: html = HtmlContent(url=url) try: html.title = item['title'][0:200] html.content = item['content'] tokens = list(Tokenize(html.content)) html.hash = hash_token(tokens) #html.hash = long(simhashpy(tokens)) html.tags,html.summerize,html.classify = summarize(html.content) html.summerize = html.summerize[0:400] html.preview = item['preview'] if find_duplicate(hashm, html.hash) != 0: #Mark as duplicate html.status = 1 else: html.status = 0 html.save() hashm.insert(html.hash) if html.status == 0: doc = {} doc['id'] = 'html_%d' % html.id doc['tokens'] = tokens corpus.append(doc) #print 'Append corpus', len(corpus), corpus[-1]['id'] if len(corpus) >= CORPUS_LEN: sim_server.index(corpus) corpus = [] #print 'Saved url %s' % html.url except: tb = traceback.format_exc() print 'Load json error', html.url, tb
def update_keywords():
    """Recompute tags/summary/classification for every stored page with content.

    Skips rows whose retry counter equals 3 (presumably retries exhausted —
    TODO confirm) and rows with empty content.
    """
    for html in HtmlContent.objects.filter(~Q(retry=3)).filter(~Q(content='')):
        # BUG FIX: summarize() returns a 3-tuple (tags, summary, classification)
        # — see the unpacks in test1() and main() — so the original 2-target
        # unpack raised ValueError on every row.
        html.tags, html.summerize, html.classify = summarize(html.content)
        # NOTE(review): main() truncates to 400 chars but this used 388; kept
        # as-is to preserve behavior, but the two limits should share a constant.
        html.summerize = html.summerize[0:388]
        html.save()