def classify_content(content):
    """Classify raw text using the global dictionary/tfidf_model features and the trained sg_class classifier."""
    num_terms = len(dictionary)
    test_corpus = tfidf_model[dictionary.doc2bow(list(Tokenize(content)))]
    test_sparse = matutils.corpus2csc([test_corpus], num_terms).transpose(copy=False)
    result = sg_class.predict(test_sparse)
    return id2cls[result[0]]
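# Hedged sketch (not from the original source): classify_content() relies on
# module-level globals that must be prepared elsewhere.  A minimal setup might
# look like this; every file name and the id2cls mapping are hypothetical
# placeholders, and sg_class is assumed to be a scikit-learn classifier
# persisted with joblib.
from gensim import corpora, models, matutils  # matutils is also needed by classify_content()
from sklearn.externals import joblib

dictionary = corpora.Dictionary.load('sogou.dict')        # hypothetical path
tfidf_model = models.TfidfModel.load('sogou.tfidf')       # hypothetical path
sg_class = joblib.load('sgd_classifier.pkl')              # hypothetical path
id2cls = dict(enumerate([u'IT', u'finance', u'sports']))  # hypothetical label map

print classify_content(u'some text to classify')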
def sim_search(html):
    """Find documents similar to an HtmlContent row, falling back from id-based to content-based lookup."""
    model_pks = []
    scores = []
    results = None
    try:
        results = server.find_similar('html_%d' % html.id)
        #print 'get from id', html.id, results
    except:
        doc = {}
        doc['tokens'] = [s for s in Tokenize(html.content)]
        results = server.find_similar(doc)
        #print 'get from content'
    if results:
        for result in results:
            id = int(result[0].split('_')[1])
            model_pks.append(id)
            scores.append(result[1])
        objs = []
        bulk_objs = HtmlContent.objects.in_bulk(model_pks)
        for k, v in enumerate(model_pks):
            objs.append((bulk_objs[v], scores[k]))
        return objs
        #return list(HtmlContent.objects.filter(pk__in=model_pks))
    else:
        return None
def sim_content(content):
    model_pks = []
    scores = []
    doc = {}
    '''
    if not isinstance(content, unicode):
        try:
            content = content.decode('utf-8')
        except:
            content = content.decode('gbk', 'ignore').encode('utf-8', 'replace').decode('utf-8')
    '''
    doc['tokens'] = [s for s in Tokenize(content)]
    #print doc
    results = server.find_similar(doc)
    if results:
        for result in results:
            id = int(result[0].split('_')[1])
            model_pks.append(id)
            scores.append(result[1])
        objs = []
        bulk_objs = HtmlContent.objects.in_bulk(model_pks)
        for k, v in enumerate(model_pks):
            objs.append((bulk_objs[v], scores[k]))
        return objs
    else:
        return None
def iter_documents():
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for obj in HtmlContent.objects.filter(status=0).filter(~Q(content='')):
        doc = {}
        doc['id'] = 'html_%d' % obj.id
        doc['tokens'] = list(Tokenize(obj.content))
        if obj.id % 1000 == 0:
            print 'processing', obj.id
        yield doc
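# Hedged sketch: the `server` object used by sim_index()/sim_search() looks like
# a gensim simserver instance (or a Pyro4 proxy to one, as in hash_test() below).
# A local, non-Pyro setup fed by iter_documents() might look like this; the
# storage path is a hypothetical placeholder.
from simserver import SessionServer

server = SessionServer('/tmp/sim_server')  # hypothetical path
docs = list(iter_documents())
server.train(docs)   # build the semantic model from {'id', 'tokens'} dicts
server.index(docs)   # index the same documents so find_similar() can answer queries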
def summarize4(sents, docs=None):
    if not docs:
        docs = [list(Tokenize(sent)) for sent in sents]
    sim_res = bm25_weights(docs)
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    return u'。 '.join(top_n_summary).replace('\r', '').replace('\n', '') + u'。'
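# Hedged sketch of what bm25_weights() above presumably computes: a matrix of
# pairwise BM25 scores between tokenized sentences, which TextRank then uses as
# edge weights.  This is a minimal re-implementation for illustration only, not
# the project's actual helper.
import math

def bm25_weights_sketch(docs, k1=1.5, b=0.75):
    avgdl = sum(len(d) for d in docs) / float(len(docs))
    # document frequency of each term
    df = {}
    for d in docs:
        for w in set(d):
            df[w] = df.get(w, 0) + 1
    idf = {w: math.log(len(docs) - n + 0.5) - math.log(n + 0.5) for w, n in df.items()}
    freqs = [{w: d.count(w) for w in set(d)} for d in docs]

    def score(query, idx):
        # BM25 score of document `idx` against the token list `query`
        tf = freqs[idx]
        dl = len(docs[idx])
        s = 0.0
        for w in query:
            if w not in tf:
                continue
            s += idf[w] * tf[w] * (k1 + 1) / (tf[w] + k1 * (1 - b + b * dl / avgdl))
        return s

    return [[score(q, i) for i in range(len(docs))] for q in docs]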
def summarize(txt):
    sents = list(cppjiebapy.cut_sentence(txt))
    docs = [list(Tokenize(sent)) for sent in sents]
    keys = []
    for d in docs:
        keys += d
    (top_n_words, cls) = key_words(keys, N_2)
    top_n_sum = summarize4(sents, docs)
    return ', '.join(top_n_words[:18]), top_n_sum, cls
def test3():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    bm25 = BM25(docs)
    l = len(sents)
    test_dense = np.zeros((l, l))
    for i in xrange(l):
        scores = bm25.simall(docs[i])
        test_dense[i] = scores
    print 'bm25=', test_dense
def test_rank2(obj):
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    sim_res = bm25_weights(docs)
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    print 'test_rank2', u'。 '.join(top_n_summary).replace('\r', '').replace('\n', '') + u'。'
def hash_all():
    for obj in HtmlContent.objects.filter(status__lte=2).filter(~Q(content='')):
        h = simhash.hash_tokenpy(list(Tokenize(obj.content)))
        if find_duplicate(c, h) == 0:
            obj.status = 0
        else:
            obj.status = 1
        obj.hash = h
        obj.save()
        c.insert(h)
def proxy_task(id):
    try:
        html = HtmlContent.objects.get(pk=id)
        #print 'html content', html.content
        if html.status <= 1 and html.content != '':
            return html.status
    except:  # not found
        return HttpResponse('not find')
    the_data = ''
    try:
        req = urllib2.Request(html.url)
        req.add_header('User-Agent', USER_AGENT)
        encoding_support = ContentEncodingProcessor
        opener = urllib2.build_opener(encoding_support, urllib2.HTTPHandler)
        #print 'requesting'
        proxied_request = opener.open(req, timeout=12)
        content = proxied_request.read()
        #print 'requested'
    except urllib2.HTTPError as e:
        html.status = 3
        print 'urllib2 error'
    else:
        try:
            ucontent = content.decode('utf-8')
        except UnicodeDecodeError:
            ucontent = content.decode('gbk', 'ignore')
        #print 'parsing'
        tx = TextExtract(ucontent)
        #print 'parsed'
        html.title = tx.title
        html.content = tx.content.strip()
        if tx.content == '':
            print 'Parse html error'
            html.status = 4
        else:
            html.status = 0
            html.hash = long(simhashpy(list(Tokenize(html.content))))
            html.tags, html.summerize = summarize(html.content)
            if len(html_remove.sub('', tx.preview)) < 250:
                html.preview = TextToHtml(tx.content)
            else:
                html.preview = tx.preview
            #print html.id, html.title, html.tags, html.summerize
    html.save()
    if html.status == 0:
        print 'begin sim_index'
        #sim_index(html)
    return html.status
def test1():
    obj = HtmlContent.objects.get(pk=46)
    key, sum, cls = summarize(obj.content)
    key2 = key_rank(obj.content)
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    sum2 = summarize4(sents)
    print 'key1', key, cls
    print 'key2', ', '.join(key2)
    print 'sum1', sum
    print 'sum2', sum2
def search(content):
    doc = {}
    doc['tokens'] = [s for s in Tokenize(content)]
    model_pks = []
    scores = []
    for result in server.find_similar(doc):
        id = int(result[0].split('_')[1])
        model_pks.append(id)
        scores.append(result[1])
    objs = []
    bulk_objs = HtmlContent.objects.in_bulk(model_pks)
    for k, v in enumerate(model_pks):
        objs.append((bulk_objs[v], scores[k]))
    return objs
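# Hedged usage sketch: search() returns (HtmlContent, score) pairs in the order
# produced by the similarity server; a caller might consume them like this.
# The query string is a hypothetical placeholder.
for obj, score in search(u'example query text'):
    print obj.id, score, obj.title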
def test2():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    #test_sparse = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance,
                                         itertools.product(test_a, test_a)),
                          dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res, (l, l))
    print 'lsi=', sim_res
def test_rank3(obj):
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    vect = TfidfVectorizer(min_df=1, tokenizer=Tokenize)
    tfidf = vect.fit_transform(sents)
    lsa = TruncatedSVD(5)
    lsa_res = lsa.fit_transform(tfidf)
    lsa_res = Normalizer(copy=False).fit_transform(lsa_res)
    tfidf_graph = np.dot(lsa_res, lsa_res.T)
    tfidf_graph = abs(Normalizer(copy=False).fit_transform(tfidf_graph))
    nx_graph = nx.from_numpy_matrix(tfidf_graph)
    scores = nx.pagerank(nx_graph)
    res = sorted(((scores[i], i) for i, s in enumerate(sents)), reverse=True)
    top_n_summary = [sents[i] for _, i in sorted(res[:3])]
    print 'test_rank3', u'。 '.join(top_n_summary).replace('\r', '').replace('\n', '') + u'。'
def test_rank1(obj):
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance,
                                         itertools.product(test_a, test_a)),
                          dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res, (l, l))
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in rank.top_index(5):
        top_n_summary.append(sents[index])
    print 'test_rank1 ', u'。 '.join(top_n_summary).replace('\r', '').replace('\n', '')
def save_to_db(self):
    corpus_db = []
    cnt = 0
    for c, f in self.iter_files():
        corpus = SogouCorpus()
        with open(f) as file:
            content = file.read().decode('gb2312', 'ignore').encode('utf-8').decode('utf-8', 'replace')
            corpus.content = content
            corpus.tokens = ','.join([s for s in Tokenize(content)])
            corpus.classify = c
            corpus_db.append(corpus)
        cnt += 1
        if cnt % 80 == 0:
            print '#',
            error = False
            try:
                SogouCorpus.objects.bulk_create(corpus_db)
            except:
                error = True
            if error:
                for cor in corpus_db:
                    cor.save()
            corpus_db = []
    if len(corpus_db) > 0:
        error = False
        try:
            SogouCorpus.objects.bulk_create(corpus_db)
        except:
            error = True
        if error:
            for cor in corpus_db:
                cor.save()
        corpus_db = []
    print 'complete'
def sim_index(obj):
    doc = {}
    doc['id'] = 'html_%d' % obj.id
    doc['tokens'] = [s for s in Tokenize(obj.content)]
    server.index([doc])
#hash_all()

def hash_test():
    sim_server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
    dels = []
    for obj in HtmlContent.objects.filter(status=1).filter(~Q(content='')):
        dels.append('html_%d' % obj.id)
    sim_server.delete(dels)

#hash_test()

obj1 = HtmlContent.objects.get(pk=6870)
obj2 = HtmlContent.objects.get(pk=7024)
token1 = list(Tokenize(obj1.content))
token2 = list(Tokenize(obj2.content))
h1 = simhashpy(token1, 64)
h2 = simhashpy(token2, 64)
print h1, h2
print corpus.distance(h1, h2)
h1 = simhash.hash_token(token1)
h2 = simhash.hash_token(token2)
print h1, h2
print corpus.distance(h1, h2)
h1 = simhash.hash_tokenpy(token1)
h2 = simhash.hash_tokenpy(token2)
print h1, h2
print corpus.distance(h1, h2)

'''
str1 = 'test love you'
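# Hedged sketch: corpus.distance() above appears to measure how far apart two
# 64-bit simhash fingerprints are; an equivalent pure-Python Hamming distance is
# shown below.  Near-duplicates are the pairs whose fingerprints differ in only
# a few bits, which seems to be what find_duplicate()/hash_all() rely on.
def hamming_distance(h1, h2):
    # count differing bits between the two fingerprints
    return bin(h1 ^ h2).count('1')

print hamming_distance(h1, h2)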
def tokenize2(text):
    for k in Tokenize(text):
        if len(k) > 1:
            yield k
def test5():
    for obj in HtmlContent.objects.filter(~Q(content='')):
        sents = list(cut_sentence(obj.content))
        docs = [list(Tokenize(sent)) for sent in sents]
        obj.summerize = summarize4(sents, docs)[0:400]
        obj.save()
import sys, os
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from cppjiebapy import ChineseAnalyzer, Tokenize  # copy this file from jieba project, just for testing

analyzer = ChineseAnalyzer()
str = u"来这里看看别人正在搜索什么;吧我的好朋友是李明;我爱北京天安门;IBM和Microsoft/\\/1997;... I have a dream interesting"
for t in analyzer(str):
    print t.text,
print '\n'
for t in Tokenize(str):
    print t,
print '\n'

schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("tmp"):
    os.mkdir("tmp")
ix = create_in("tmp", schema)  # for create new index
#ix = open_dir("tmp")  # for read only
writer = ix.writer()
writer.add_document(
    title=u"document1",
    path=u"/a",
    content=u"This is the first document we’ve added!"
def iter_documents():
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for obj in HtmlContent.objects.filter(~Q(retry=3)).filter(~Q(content='')):
        document = obj.content
        #yield gensim.utils.tokenize(document, lower=True)  # or whatever tokenization suits you
        yield [s for s in Tokenize(document)]
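# Hedged sketch: the dictionary/tfidf_model/lsi_model globals used in the tests
# above could be built from this iterator roughly like so.  The num_topics value
# mirrors the num_terms = 400 used in test_rank1()/test2(); the save paths are
# hypothetical placeholders.
from gensim import corpora, models

dictionary = corpora.Dictionary(iter_documents())
bow_corpus = [dictionary.doc2bow(tokens) for tokens in iter_documents()]
tfidf_model = models.TfidfModel(bow_corpus)
lsi_model = models.LsiModel(tfidf_model[bow_corpus], id2word=dictionary, num_topics=400)
dictionary.save('htmls.dict')     # hypothetical path
tfidf_model.save('htmls.tfidf')   # hypothetical path
lsi_model.save('htmls.lsi')       # hypothetical path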
tfidf = transformer.fit_transform(counts)
print tfidf.toarray()
print '\n'
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
print X
print '\n'

tags = [
    "python, tools",
    "linux, tools, ubuntu",
    "distributed systems, linux, networking, tools",
]
print list(Tokenize(tags[-1]))
vec = CountVectorizer(tokenizer=Tokenize)
data = vec.fit_transform(tags)
print data
vocab = vec.get_feature_names()
print vocab

print "#####HASHING TESTING#########"
vec = HashingVectorizer(tokenizer=Tokenize)
data = vec.fit_transform(tags)
print data
print "###END HASHING###"

train_set = ["The sky is blue.", "The sun is bright."]  # Documents