def indexDocs(self, root, writer):
    """Walk ``root`` and add one Lucene document per data file found.

    Each document stores retrieval-only fields (raw_title, filename, url,
    ctime, content, image names/urls) and one searchable field: the title,
    segmented with jieba's search-mode tokenizer.

    Increments the module-level ``count`` once per successfully indexed file.
    Failures are reported and skipped so one bad file does not abort the run.
    """
    global count
    # t1: stored but neither indexed nor tokenized -- retrieval-only fields.
    t1 = FieldType()
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)
    # t2: stored, indexed, tokenized, with positions -- full-text searchable.
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    # NOTE(review): the loop variable deliberately(?) shadows the `root`
    # parameter; after the first iteration the parameter value is gone.
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            try:
                # NOTE(review): loads by bare filename, not
                # os.path.join(root, filename) -- presumably `load` resolves
                # the path itself; confirm if `root` has subdirectories.
                data = load(filename)
                raw_title = data['title']
                # Space-join jieba's search-mode segments so Lucene's
                # tokenizer sees individual terms.
                title = ' '.join(jieba.cut_for_search(raw_title))
                print(title)
                print("adding %s" % filename)
                doc = Document()
                doc.add(Field("raw_title", raw_title, t1))
                doc.add(Field("filename", filename, t1))
                doc.add(Field("url", data['url'], t1))
                doc.add(Field("title", title, t2))
                doc.add(Field("ctime", str(data['ctime']), t1))
                doc.add(Field("content", data['content'], t1))
                # One stored field per image: img is (url, local-name).
                for img in data['realimgs']:
                    doc.add(Field("imgs", img[1], t1))
                    doc.add(Field("imgurls", img[0], t1))
                writer.addDocument(doc)
                count += 1
            except Exception as e:  # was py2-only "except Exception, e"
                print("Failed in indexDocs: %s" % e)
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np
import scipy as sp
import scipy.sparse
import my_load
import my_metrics
import my_utils
from sklearn.svm import SVC

########################################################################################################################
###################################################### LOAD DATA #######################################################
########################################################################################################################

# load -- my_load.load yields a train/test split; variable names suggest the
# X matrices come back sparse when make_X_sparse is True.
sparse_trainX, trainY, sparse_testX, testY = my_load.load(make_X_sparse = True)
# Cap training at the first 50k rows -- presumably to keep the (quadratic-ish)
# SVC fit tractable; TODO confirm.
sparse_trainX = sparse_trainX[0:50000]
trainY = trainY[0:50000]

# train -- linear-kernel SVM with probability estimates enabled
# (probability=True is required for predict_proba below).
clf = SVC(probability = True, kernel='linear')
clf.fit(sparse_trainX, trainY)

# test -- flatten the (n_samples, n_classes) probability matrix to 1-D.
score_predicted_Y_array_2 = clf.predict_proba(sparse_testX).flatten()

# NOTE(review): the two lines below reload and re-slice exactly as above;
# looks like leftover duplication (or setup for code past this chunk) --
# confirm before removing.
sparse_trainX, trainY, sparse_testX, testY = my_load.load(make_X_sparse = True)
sparse_trainX = sparse_trainX[0:50000]
from __future__ import print_function
import numpy as np
import scipy as sp
import scipy.sparse
import math
import my_load
import my_metrics
import my_utils
from sklearn.naive_bayes import MultinomialNB

# Load the train/test split; make_X_sparse suggests X comes back as sparse
# matrices -- TODO confirm against my_load.
trainX, trainY, testX, testY = my_load.load(make_X_sparse = True)

########################################################################################################################
################################################### NAIVE BAYES ########################################################
########################################################################################################################

# NOTE(review): hand-rolled per-class feature counting below is disabled --
# presumably superseded by sklearn's MultinomialNB imported above. Kept for
# reference; shapes imply 12000 features and 40 classes, with trainX rows as
# "feature:value," strings.
#count_class_features = np.zeros(shape=(12000, 40))
#P_y = np.zeros(40)
#for i in range(len(trainX)):
#    s = trainX[i]
#    label = trainY[i]-1
#    P_y[label] = P_y[label] + 1
#    for t in s.split(",")[0:-1]:
#        feature = int(t.split(":")[0])
#        count_class_features[feature, label] = count_class_features[feature, label] + 1
import os
from my_load import load
from my_dump import dump

# Rename every scraped image under pic/ to a deterministic
# "<datafile-stem>_<index>.jpg" name and rewrite each data file's
# `realimgs` list to match, so records and files on disk stay in sync.
for _, __, files in os.walk('data'):
    for f in files:
        data = load(f)
        realimgs = []
        # enumerate replaces the original range(len(...)) loop -- same
        # indices, same order. Each img is (url, current-on-disk-name).
        for i, img in enumerate(data['realimgs']):
            print(img[0])  # image url
            print(img[1])  # current on-disk name
            # f[:-4] strips the 4-char extension from the data filename to
            # use as the image basename; computed once (the original built
            # this expression twice, for append and for rename).
            newname = f[:-4] + '_' + str(i) + '.jpg'
            realimgs.append((img[0], newname))
            os.rename('pic/' + img[1], 'pic/' + newname)
        print(realimgs)
        data['realimgs'] = realimgs
        dump(f, data)