# Example no. 1
0
# Index every file listed in idf.files: build the inverted index, record the
# per-document term frequencies and word locations, update document
# frequencies, and persist each document through the DB cursor `cur`.
# NOTE(review): `i`, `startTime`, `idf`, `index`, `con` and `cur` are not
# defined in this section; they are assumed to be set up earlier -- confirm.
size = len(idf.files)
tf = {}       # doc.docNo -> doc.TF
wordLoc = {}  # doc.docNo -> doc.wordLoc

# Index files
t = 0  # accumulated seconds spent parsing/indexing (DB writes excluded)
for f in idf.files:
    try:
        temp = time.time()
        doc = Document(f)
        index.buildIndex(doc)
        tf[doc.docNo] = doc.TF
        wordLoc[doc.docNo] = doc.wordLoc
        idf.buildDF(doc)
        t += time.time() - temp
        doc.write2DB(cur)
        i += 1
        if i % 10 == 0:
            # Commit once every 300 processed documents; the final commit
            # after the loop flushes whatever remains.
            if i % 300 == 0:
                con.commit()
            # Floor division keeps the percentage an integer on both
            # Python 2 and Python 3.
            percent = i * 100 // size
            # Kept in its own variable so the progress display does not
            # clobber the accumulated indexing time held in `t`.
            elapsed = time.time() - startTime
            sys.stdout.write('\r indexing...\033[92m%2d\033[0m%%   \033[92m%2.0f\033[0ms' % (percent, elapsed))
            sys.stdout.flush()
    except Exception:
        # Best effort: report and skip a file that fails at any step above.
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the run can still be interrupted.
        # The single-argument call form prints identically on Python 2 and 3.
        print(" some file format is not correct!")
con.commit()
idf.buildIDF()
#index.seldomWords()
#index.wordsVector()