Example #1
def classify_content(content):
    num_terms = len(dictionary)
    test_corpus = tfidf_model[dictionary.doc2bow(list(Tokenize(content)))]
    test_sparse = matutils.corpus2csc([test_corpus],
                                      num_terms).transpose(copy=False)
    result = sg_class.predict(test_sparse)
    return id2cls[result[0]]
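For context, classify_content() relies on module-level globals built elsewhere (dictionary, tfidf_model, sg_class, id2cls). A minimal sketch of how they might be assembled with gensim and scikit-learn; the training texts, labels, simple_tokenize() and the label map are hypothetical stand-ins for this project's Tokenize() and corpus:

# Sketch only: builds the globals that classify_content() assumes.
from gensim import corpora, models, matutils
from sklearn.linear_model import SGDClassifier

def simple_tokenize(text):            # stand-in for the project's Tokenize()
    return text.lower().split()

train_texts = ['python linux tools', 'football match goal', 'linux kernel patch']
train_labels = [0, 1, 0]              # toy labels
id2cls = {0: 'tech', 1: 'sports'}     # hypothetical label map

token_lists = [simple_tokenize(t) for t in train_texts]
dictionary = corpora.Dictionary(token_lists)
bow_corpus = [dictionary.doc2bow(t) for t in token_lists]
tfidf_model = models.TfidfModel(bow_corpus)

# documents as rows of a sparse docs x terms matrix, same as in the example above
train_sparse = matutils.corpus2csc(tfidf_model[bow_corpus],
                                   num_terms=len(dictionary)).transpose(copy=False)
sg_class = SGDClassifier().fit(train_sparse, train_labels)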
Example #2
def sim_search(html):
    model_pks = []
    scores = []
    results = None
    try:
        results = server.find_similar('html_%d' % html.id)
        #print 'get from id',html.id,results
    except Exception:  # id not indexed yet; fall back to querying by content
        doc = {}
        doc['tokens'] = [s for s in Tokenize(html.content)]
        results = server.find_similar(doc)
        #print 'get from content'
    if results:
        for result in results:
            id = int(result[0].split('_')[1])
            model_pks.append(id)
            scores.append(result[1])
        objs = []
        bulk_objs = HtmlContent.objects.in_bulk(model_pks)
        for k, v in enumerate(model_pks):
            objs.append((bulk_objs[v], scores[k]))
        return objs
        #return list(HtmlContent.objects.filter(pk__in=model_pks))
    else:
        return None
Example #3
def sim_content(content):
    model_pks = []
    scores = []
    doc = {}
    '''
    if not isinstance(content,unicode):
        try:
            content = content.decode('utf-8')
        except:
            content = content.decode('gbk','ignore').encode('utf-8', 'replace').decode('utf-8')
    '''
    doc['tokens'] = [s for s in Tokenize(content)]
    #print doc
    results = server.find_similar(doc)
    if results:
        for result in results:
            id = int(result[0].split('_')[1])
            model_pks.append(id)
            scores.append(result[1])
        objs = []
        bulk_objs = HtmlContent.objects.in_bulk(model_pks)
        for k, v in enumerate(model_pks):
            objs.append((bulk_objs[v], scores[k]))
        return objs
    else:
        return None
Example #4
def iter_documents():
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for obj in HtmlContent.objects.filter(status=0).filter(~Q(content='')):
        doc = {}
        doc['id'] = 'html_%d' % obj.id
        doc['tokens'] = list(Tokenize(obj.content))
        if obj.id % 1000 == 0:
            print 'processing', obj.id
        yield doc
Example #5
def summarize4(sents, docs=None):
    if not docs:
        docs = [list(Tokenize(sent)) for sent in sents]
    sim_res = bm25_weights(docs)
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    return u'。 '.join(top_n_summary).replace('\r', '').replace('\n', '') + u'。'
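summarize4() ranks sentences with a project-local TextRank class over a BM25 similarity matrix and keeps the top three. A rough, self-contained stand-in for that ranking step, using networkx PageRank over a sentence-similarity matrix; the matrix below is toy data, and bm25_weights/TextRank themselves remain project-local:

# Sketch only: PageRank over a similarity matrix as a stand-in for TextRank.
import numpy as np
import networkx as nx

def top_sentence_indices(sim, n=3):
    graph = nx.from_numpy_array(np.asarray(sim, dtype=float))
    scores = nx.pagerank(graph)                    # node id -> centrality score
    return sorted(scores, key=scores.get, reverse=True)[:n]

sim = [[1.0, 0.2, 0.4],
       [0.2, 1.0, 0.1],
       [0.4, 0.1, 1.0]]
print(top_sentence_indices(sim, n=2))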
Example #6
def summarize(txt):
    sents = list(cppjiebapy.cut_sentence(txt))
    docs = [list(Tokenize(sent)) for sent in sents]
    keys = []
    for d in docs:
        keys += d
    (top_n_words, cls) = key_words(keys, N_2)
    top_n_sum = summarize4(sents, docs)

    return ', '.join(top_n_words[:18]), top_n_sum, cls
Example #7
def test3():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    bm25 = BM25(docs)
    l = len(sents)
    test_dense = np.zeros((l, l))
    for i in xrange(l):
        scores = bm25.simall(docs[i])
        test_dense[i] = scores
    print 'bm25=', test_dense
Example #8
def test_rank2(obj):
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    sim_res = bm25_weights(docs)
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    print 'test_rank2', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '') + u'。'
Example #9
def hash_all():
    for obj in HtmlContent.objects.filter(
            status__lte=2).filter(~Q(content='')):
        h = simhash.hash_tokenpy(list(Tokenize(obj.content)))
        if find_duplicate(c, h) == 0:
            obj.status = 0
        else:
            obj.status = 1
        obj.hash = h
        obj.save()
        c.insert(h)
Example #10
def proxy_task(id):
    try:
        html = HtmlContent.objects.get(pk=id)
        #print 'html content', html.content
        if html.status <= 1 and html.content != '':
            return html.status
    except HtmlContent.DoesNotExist:
        # not found
        return HttpResponse('not found')

    the_data = ''
    try:
        req = urllib2.Request(html.url)
        req.add_header('User-Agent', USER_AGENT)
        encoding_support = ContentEncodingProcessor
        opener = urllib2.build_opener(encoding_support, urllib2.HTTPHandler)
        #print 'requesting'
        proxied_request = opener.open(req, timeout=12)
        content = proxied_request.read()
        #print 'requested'
    except urllib2.HTTPError as e:
        html.status = 3
        print 'urllib2 error', e
    else:
        try:
            ucontent = content.decode('utf-8')
        except UnicodeDecodeError:
            ucontent = content.decode('gbk', 'ignore')
        #print 'parsing'
        tx = TextExtract(ucontent)
        #print 'parsed'
        html.title = tx.title
        html.content = tx.content.strip()
        if tx.content == '':
            print 'Parse html error'
            html.status = 4
        else:
            html.status = 0
            html.hash = long(simhashpy(list(Tokenize(html.content))))
            html.tags, html.summerize = summarize(html.content)
            if len(html_remove.sub('', tx.preview)) < 250:
                html.preview = TextToHtml(tx.content)
            else:
                html.preview = tx.preview

    #print html.id, html.title, html.tags, html.summerize
    html.save()
    if html.status == 0:
        print 'begin sim_index'
        #sim_index(html)

    return html.status
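The fetch-and-decode step above (urllib2 plus a gzip-aware ContentEncodingProcessor, then a utf-8/gbk fallback) can be expressed more compactly with the requests library. This is a hedged, modern stand-in, not the project's code; the User-Agent default is arbitrary:

# Sketch only: fetch + decode as in proxy_task(), redone with `requests`
# (gzip/deflate decoding is handled automatically).
import requests

def fetch_page(url, user_agent='Mozilla/5.0'):
    resp = requests.get(url, headers={'User-Agent': user_agent}, timeout=12)
    resp.raise_for_status()
    if not resp.encoding or resp.encoding.lower() == 'iso-8859-1':
        resp.encoding = resp.apparent_encoding    # fall back to detected charset (e.g. gbk)
    return resp.text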
Example #11
def test1():
    obj = HtmlContent.objects.get(pk=46)
    key, sum, cls = summarize(obj.content)
    key2 = key_rank(obj.content)

    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    sum2 = summarize4(sents)

    print 'key1', key, cls
    print 'key2', ', '.join(key2)
    print 'sum1', sum
    print 'sum2', sum2
Example #12
def search(content):
    doc = {}
    doc['tokens'] = [s for s in Tokenize(content)]
    model_pks = []
    scores = []
    for result in server.find_similar(doc):
        id = int(result[0].split('_')[1])
        model_pks.append(id)
        scores.append(result[1])
    objs = []
    bulk_objs = HtmlContent.objects.in_bulk(model_pks)
    for k,v in enumerate(model_pks):
        objs.append((bulk_objs[v],scores[k]))

    return objs
Example #13
def test2():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    #test_sparse = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance,
                                         itertools.product(test_a, test_a)),
                          dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res, (l, l))
    print 'lsi=', sim_res
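The itertools.product + cos_distance loop above computes the pairwise similarities one pair at a time. When the rows are already dense vectors, the same l x l matrix can be obtained in one step by normalizing the rows and taking a matrix product; a small sketch with toy vectors (cos_distance itself is project-local):

# Sketch only: pairwise cosine similarity via row normalization + dot product,
# equivalent to looping a cosine function over itertools.product(test_a, test_a).
import numpy as np

def pairwise_cosine(dense):
    dense = np.asarray(dense, dtype=float)
    norms = np.linalg.norm(dense, axis=1, keepdims=True)
    unit = dense / np.where(norms == 0, 1.0, norms)   # avoid division by zero
    return unit.dot(unit.T)

test_dense = np.array([[1.0, 0.0], [0.5, 0.5], [0.0, 1.0]])
print(pairwise_cosine(test_dense))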
Example #14
def test_rank3(obj):
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    vect = TfidfVectorizer(min_df=1, tokenizer=Tokenize)
    tfidf = vect.fit_transform(sents)
    lsa = TruncatedSVD(5)
    lsa_res = lsa.fit_transform(tfidf)
    lsa_res = Normalizer(copy=False).fit_transform(lsa_res)
    tfidf_graph = np.dot(lsa_res, lsa_res.T)
    tfidf_graph = abs(Normalizer(copy=False).fit_transform(tfidf_graph))
    nx_graph = nx.from_numpy_matrix(tfidf_graph)
    scores = nx.pagerank(nx_graph)
    res = sorted(((scores[i], i) for i, s in enumerate(sents)), reverse=True)
    top_n_summary = [sents[i] for _, i in sorted(res[:3])]
    print 'test_rank3', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '') + u'。'
Example #15
def test_rank1(obj):
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance,
                                         itertools.product(test_a, test_a)),
                          dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res, (l, l))
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in rank.top_index(5):
        top_n_summary.append(sents[index])
    print 'test_rank1 ', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '')
Example #16
def save_to_db(self):
    corpus_db = []
    cnt = 0
    for c, f in self.iter_files():
        corpus = SogouCorpus()
        with open(f) as file:
            content = file.read().decode(
                'gb2312', 'ignore').encode('utf-8').decode('utf-8', 'replace')
            corpus.content = content
            corpus.tokens = ','.join([s for s in Tokenize(content)])
            corpus.classify = c
            corpus_db.append(corpus)
            cnt += 1
            if cnt % 80 == 0:
                print '#',
                error = False
                try:
                    SogouCorpus.objects.bulk_create(corpus_db)
                except Exception:
                    error = True
                if error:
                    # bulk insert failed; fall back to saving one row at a time
                    for cor in corpus_db:
                        cor.save()
                corpus_db = []
    if len(corpus_db) > 0:
        error = False
        try:
            SogouCorpus.objects.bulk_create(corpus_db)
        except Exception:
            error = True
        if error:
            for cor in corpus_db:
                cor.save()
        corpus_db = []
    print 'complete'
Example #17
def sim_index(obj):
    doc = {}
    doc['id'] = 'html_%d' % obj.id
    doc['tokens'] = [s for s in Tokenize(obj.content)]
    server.index([doc])
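The server object used throughout these examples appears to be a gensim simserver SessionServer, or a Pyro4 proxy to one as in the next example; documents are indexed as dicts with id and tokens keys. A hedged sketch of the assumed setup (the path and toy corpus are arbitrary):

# Sketch only, assuming the gensim `simserver` package: train on a small
# corpus, index it, then query by document id.
from simserver import SessionServer

server = SessionServer('/tmp/sim_server')           # storage path is arbitrary
corpus = [{'id': 'html_1', 'tokens': ['python', 'linux', 'tools']},
          {'id': 'html_2', 'tokens': ['football', 'match', 'goal']}]
server.train(corpus)
server.index(corpus)
print(server.find_similar('html_1'))                # most similar indexed documents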
Example #18
#hash_all()


def hash_test():
    sim_server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
    dels = []
    for obj in HtmlContent.objects.filter(status=1).filter(~Q(content='')):
        dels.append('html_%d' % obj.id)
    sim_server.delete(dels)


#hash_test()

obj1 = HtmlContent.objects.get(pk=6870)
obj2 = HtmlContent.objects.get(pk=7024)
token1 = list(Tokenize(obj1.content))
token2 = list(Tokenize(obj2.content))
h1 = simhashpy(token1, 64)
h2 = simhashpy(token2, 64)
print h1, h2
print corpus.distance(h1, h2)
h1 = simhash.hash_token(token1)
h2 = simhash.hash_token(token2)
print h1, h2
print corpus.distance(h1, h2)
h1 = simhash.hash_tokenpy(token1)
h2 = simhash.hash_tokenpy(token2)
print h1, h2
print corpus.distance(h1, h2)
'''
str1 = 'test love you'
Example #19
def tokenize2(text):
    for k in Tokenize(text):
        if len(k) > 1:
            yield k
Example #20
def test5():
    for obj in HtmlContent.objects.filter(~Q(content='')):
        sents = list(cut_sentence(obj.content))
        docs = [list(Tokenize(sent)) for sent in sents]
        obj.summerize = summarize4(sents, docs)[0:400]
        obj.save()
Example #21
import sys,os
from whoosh.index import create_in,open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser

from cppjiebapy import ChineseAnalyzer, Tokenize

# Copied from the jieba project, just for testing

analyzer = ChineseAnalyzer()
str = u"来这里看看别人正在搜索什么;吧我的好朋友是李明;我爱北京天安门;IBM和Microsoft/\\/1997;... I have a dream interesting"
for t in analyzer(str):
    print t.text,
print '\n'

for t in Tokenize(str):
    print t,
print '\n'

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("tmp"):
    os.mkdir("tmp")

ix = create_in("tmp", schema)  # create a new index
#ix = open_dir("tmp") # for read only
writer = ix.writer()

writer.add_document(
    title=u"document1", 
    path=u"/a",
    content=u"This is the first document we’ve added!"
Example #22
def iter_documents():
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for obj in HtmlContent.objects.filter(~Q(retry=3)).filter(~Q(content='')):
        document = obj.content
        #yield gensim.utils.tokenize(document, lower=True) # or whatever tokenization suits you
        yield [s for s in Tokenize(document)]
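This streaming variant yields plain token lists, which is exactly the shape gensim's Dictionary constructor accepts, so a vocabulary and bag-of-words stream can be built without loading every document into memory. A brief usage sketch, assuming the same Django models as above:

# Sketch only: stream the token lists straight into a gensim Dictionary
# and a lazily evaluated bag-of-words corpus.
from gensim import corpora

dictionary = corpora.Dictionary(iter_documents())
dictionary.filter_extremes(no_below=5, no_above=0.5)   # optional vocabulary pruning
bow_stream = (dictionary.doc2bow(tokens) for tokens in iter_documents())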
Example #23
# Assumes earlier, truncated lines built `corpus`, a term-count matrix `counts`
# (presumably from CountVectorizer) and `transformer` (presumably TfidfTransformer);
# the imports below are what the visible snippet needs.
from sklearn.feature_extraction.text import (CountVectorizer, HashingVectorizer,
                                              TfidfTransformer, TfidfVectorizer)

tfidf = transformer.fit_transform(counts)
print tfidf.toarray()
print '\n'

vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
print X
print '\n'

tags = [
    "python, tools",
    "linux, tools, ubuntu",
    "distributed systems, linux, networking, tools",
]

print list(Tokenize(tags[-1]))

vec = CountVectorizer(tokenizer=Tokenize)
data = vec.fit_transform(tags)
print data

vocab = vec.get_feature_names()
print vocab

print "#####HASHING TESTING#########"
vec = HashingVectorizer(tokenizer=Tokenize)
data = vec.fit_transform(tags)
print data
print "###END HASHING###"

train_set = ["The sky is blue.", "The sun is bright."]  #Documents