Example #1
0
    def indexDocs(self, root, writer):
        """Walk ``root`` and add one Lucene document per file to ``writer``.

        Every file name reported by ``os.walk`` is handed to ``load``; the
        returned mapping is read for the keys ``title``, ``url``, ``ctime``,
        ``content`` and ``realimgs``.  The title is segmented with
        ``jieba.cut_for_search`` and written to ``title``, the only indexed
        field; all other fields are stored without being indexed.

        Indexing is best-effort: a failure on one file is reported on stdout
        and the walk continues with the next file.  The module-level
        ``count`` is incremented once per document that was added.

        :param root: directory whose tree is walked.
        :param writer: object exposing ``addDocument(doc)``.
        """
        global count

        # Stored verbatim; neither indexed nor tokenized.
        stored_only = FieldType()
        stored_only.setIndexed(False)
        stored_only.setStored(True)
        stored_only.setTokenized(False)

        # Stored, indexed and tokenized, keeping frequencies and positions.
        searchable = FieldType()
        searchable.setIndexed(True)
        searchable.setStored(True)
        searchable.setTokenized(True)
        searchable.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # The directory name is bound to its own variable so the ``root``
        # parameter is not overwritten while walking.
        for dirpath, dirnames, filenames in os.walk(root):
            for filename in filenames:
                try:
                    # NOTE(review): load() receives the bare file name, not a
                    # path joined with dirpath -- presumably it resolves the
                    # directory itself; confirm against its definition.
                    data = load(filename)
                    raw_title = data['title']
                    # Space-join the segments so the tokenized field can
                    # split them again.
                    title = ' '.join(jieba.cut_for_search(raw_title))
                    print(title)
                    print("adding %s" % (filename,))
                    doc = Document()
                    doc.add(Field("raw_title", raw_title, stored_only))
                    doc.add(Field("filename", filename, stored_only))
                    doc.add(Field("url", data['url'], stored_only))
                    doc.add(Field("title", title, searchable))
                    doc.add(Field("ctime", str(data['ctime']), stored_only))
                    doc.add(Field("content", data['content'], stored_only))
                    # Each entry contributes one "imgs" and one "imgurls"
                    # value, taken from index 1 and index 0 respectively.
                    for img in data['realimgs']:
                        doc.add(Field("imgs", img[1], stored_only))
                        doc.add(Field("imgurls", img[0], stored_only))
                    writer.addDocument(doc)
                    count += 1
                except Exception as e:
                    # Deliberately broad: one bad file must not abort the
                    # whole indexing run.
                    print("Failed in indexDocs: %s" % (e,))
Example #2
0
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np
import scipy as sp
import scipy.sparse
import my_load
import my_metrics
import my_utils
from sklearn.svm import SVC


########################################################################################################################
###################################################### LOAD DATA #######################################################
########################################################################################################################

# Load the train/test split via the project loader, passing make_X_sparse=True.
sparse_trainX, trainY, sparse_testX, testY = my_load.load(make_X_sparse = True)

# Keep only the first 50000 training rows and their labels.
sparse_trainX = sparse_trainX[0:50000]
trainY = trainY[0:50000]
# Train a linear-kernel SVC; probability=True enables predict_proba below.
clf = SVC(probability = True, kernel='linear')
clf.fit(sparse_trainX, trainY)


# Score the test set: the predict_proba output is flattened into one 1-D array.
score_predicted_Y_array_2 = clf.predict_proba(sparse_testX).flatten()


# NOTE(review): the data set is loaded a second time here, rebinding all four
# names (including the already-truncated trainY); whether this repeat is
# intentional cannot be told from the visible code.
sparse_trainX, trainY, sparse_testX, testY = my_load.load(make_X_sparse = True)

sparse_trainX = sparse_trainX[0:50000]
Example #3
0
from __future__ import print_function
import numpy as np
import scipy as sp
import scipy.sparse
import math
import my_load
import my_metrics
import my_utils
from sklearn.naive_bayes import MultinomialNB



# Load the train/test split via the project loader, passing make_X_sparse=True.
trainX, trainY, testX, testY = my_load.load(make_X_sparse = True)


########################################################################################################################
################################################### NAIVE BAYES ########################################################
########################################################################################################################


#count_class_features = np.zeros(shape=(12000, 40))
#P_y = np.zeros(40)


#for i in range(len(trainX)):
#    s = trainX[i]
#    label = trainY[i]-1
#    P_y[label] = P_y[label] + 1
#    for t in s.split(",")[0:-1]:
#        feature = int(t.split(":")[0])
#        count_class_features[feature, label] = count_class_features[feature, label] + 1
Example #4
0
import os
from my_load import load
from my_dump import dump
# For every record under 'data', rename its images in 'pic/' to
# "<record name minus last 4 chars>_<index>.jpg" and save the new names back
# into the record.
for _,__,files in os.walk('data'):
    for f in files:
        # NOTE(review): load/dump receive the bare file name, not a path
        # joined with the walked directory -- presumably they resolve the
        # location themselves; confirm against my_load / my_dump.
        data = load(f)
        realimgs = []
        for i, img in enumerate(data['realimgs']):
            print(img[0])
            print(img[1])
            # f[:-4] drops the last four characters of the record file name
            # (presumably a dot plus three-letter extension -- confirm).
            new_name = f[:-4] + '_' + str(i) + '.jpg'
            # Index 0 is carried over unchanged; index 1 is the file name
            # inside 'pic/' that is being replaced.
            realimgs.append((img[0], new_name))
            os.rename('pic/' + img[1], 'pic/' + new_name)
        print(realimgs)
        data['realimgs'] = realimgs
        dump(f, data)