# Inferred module imports (the aliases cv/tv and the helpers below match how
# they are used in this file); getText() is assumed to be defined or imported
# elsewhere in the project and to return (dict, list_of_documents).
import time
import json
import cPickle
import argparse
from collections import Counter

import jieba
from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.feature_extraction.text import TfidfVectorizer as tv


def jiebaCounter(max_features=5000, prefix="extraction-", begin=1, end=1, dictionary=""):
    # Load comma-separated stopwords.
    sf = open('chi_,.txt', 'r')
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]
    sf.close()

    # Build a CountVectorizer, optionally restricted to a fixed vocabulary file.
    if dictionary == "":
        vectorizer = cv(max_features=max_features, stop_words=stopwords)
    else:
        vocabulary = open(dictionary, 'r').read().split("\n")
        vectorizer = cv(vocabulary=vocabulary, max_features=max_features, stop_words=stopwords)

    # Load the raw documents.
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print getdatatime - st  # seconds spent loading data

    # Segment each document with jieba and join the tokens with spaces.
    corpus = {}
    for i in range(len(txt)):
        corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))

    # Build the term-count matrix.
    vect = vectorizer.fit_transform(corpus.values()).toarray()
    print vect.shape
    voc = vectorizer.get_feature_names()

    # Sort the vocabulary by total count (descending), skipping stopwords.
    wordssum = vect.sum(axis=0)
    order = [i for (total, word, i) in
             sorted(zip(wordssum, voc, range(len(voc))), reverse=True)
             if word not in stopwords]
    print time.time() - st  # seconds since start
    voc_sorted = [voc[i] for i in order]
    print time.time() - getdatatime  # seconds spent segmenting, vectorizing and sorting

    return vect, voc, txt
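# Usage sketch for jiebaCounter(): the file range below is hypothetical, and the
# function expects 'chi_,.txt' plus the "extraction-*" files read by getText()
# to be present on disk.  Defined but not called here.
def _demo_jiebaCounter():
    vect, voc, txt = jiebaCounter(max_features=5000, prefix="extraction-", begin=1, end=3)
    totals = vect.sum(axis=0)
    # Print the 20 most frequent terms across the loaded documents.
    for total, word in sorted(zip(totals, voc), reverse=True)[:20]:
        print word.encode('utf-8'), total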
def tfidf(max_features=5000, prefix="extraction-", begin=1, end=26):
    # Load comma-separated stopwords (kept as byte strings; vocabulary entries
    # are encoded before the comparison below).
    sf = open('chi_stopwords.txt', 'r')
    stopwords = [x.strip() for x in sf.read().split(',')]
    sf.close()

    vectorizer = tv(max_features=max_features)

    # Load the raw documents.
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print getdatatime - st  # seconds spent loading data

    # Segment each document with jieba.
    corpus = {}
    for i in range(len(txt)):
        corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))

    # Build the tf-idf matrix (dense).
    tfidf = vectorizer.fit_transform(corpus.values()).toarray()
    print tfidf.shape
    voc = vectorizer.get_feature_names()

    # Sort the vocabulary by total tf-idf weight (descending), skipping stopwords.
    wordssum = tfidf.sum(axis=0)
    order = [i for (total, word, i) in
             sorted(zip(wordssum, voc, range(len(voc))), reverse=True)
             if word.encode('utf-8') not in stopwords]
    print time.time() - st  # seconds since start
    voc_sorted = [voc[i] for i in order]
    print time.time() - getdatatime  # seconds spent segmenting, vectorizing and sorting

    return tfidf, voc, txt
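# Usage sketch for the tfidf() variant directly above (which returns a dense
# matrix); the file range is hypothetical.  Defined but not called here.
def _demo_tfidf():
    mat, voc, txt = tfidf(max_features=5000, prefix="extraction-", begin=1, end=2)
    # Print the 10 highest-weighted terms of the first document.
    row = mat[0]
    for weight, word in sorted(zip(row, voc), reverse=True)[:10]:
        print word.encode('utf-8'), weight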
def jiebaCount(max_features=5000, prefix="extraction-", begin=1, end=1):
    # Load comma-separated stopwords.
    sf = open('chi_,.txt', 'r')
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]
    sf.close()

    d, txt = getText(prefix=prefix, begin=begin, end=end)
    print "Data loaded."
    print len(txt)

    # Tokenize each document with jieba, drop empty tokens and stopwords,
    # and keep a per-document Counter of the remaining words.
    res = []
    count = []
    st = time.time()
    for i in range(len(txt)):
        r = " ".join(jieba.cut(txt[i])).split(" ")
        r = [x.strip() for x in r]
        r = filter(None, r)
        r = [x for x in r if x not in stopwords]
        res.append(r)
        count.append(Counter(r))
    print "Counting cost " + str(time.time() - st) + " seconds."

    # Show the 20 most common words of the first document.
    for word, freq in count[0].most_common(20):
        print word, freq
    print len(count), len(count[0])

    return count
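# Usage sketch for jiebaCount(): merge the per-document counters into one
# corpus-wide Counter (hypothetical file range).  Defined but not called here.
def _demo_jiebaCount():
    counters = jiebaCount(prefix="extraction-", begin=1, end=2)
    total = Counter()
    for c in counters:
        total.update(c)
    for word, freq in total.most_common(10):
        print word.encode('utf-8'), freq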
def tfidf(max_features=10000, path="/home/tingyubi/20w/data/", prefix="extraction-", begin=1, end=26):
    # Note: this later definition of tfidf() shadows the earlier one when both
    # are kept in the same module; `path` is currently unused.

    ### get stopwords (newline-separated file)
    sf = open('chi_n.txt', 'r')
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split('\n')]
    sf.close()

    ### load data
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print "Loading data cost " + str(getdatatime - st) + " seconds."

    ### cut text with jieba and cache the segmented corpus as JSON
    corpus = {}
    for i in range(len(txt)):
        corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))
    jsonfile = "tfidf_cut_" + prefix + str(begin) + "_" + str(end) + ".json"
    f = open(jsonfile, 'w')
    json.dump(corpus, f)
    f.close()

    ### tfidf vectorizer (the matrix is kept sparse here)
    vectorizer = tv(max_features=max_features, stop_words=stopwords)
    tfidf = vectorizer.fit_transform(corpus.values())
    print "Tfidf vectorizing cost " + str(time.time() - getdatatime) + " seconds."
    voc = vectorizer.get_feature_names()

    ### save the sparse matrix to a pickle file and the vocabulary to a text file
    pklfile = "tfidf_" + prefix + str(begin) + "_" + str(end) + ".mat"
    f = open(pklfile, 'wb')
    cPickle.dump(tfidf, f, -1)
    f.close()

    vocfile = "tfidf_" + prefix + str(begin) + "_" + str(end) + ".voc"
    f = open(vocfile, 'w')
    f.write("\n".join(voc).encode('utf-8'))
    f.close()

    return tfidf, voc, txt
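# Usage sketch: reload the tf-idf matrix and vocabulary written by the tfidf()
# variant above; the file-name pattern mirrors the one used there and the range
# is hypothetical.  Defined but not called here.
def _demo_load_tfidf(prefix="extraction-", begin=1, end=26):
    f = open("tfidf_" + prefix + str(begin) + "_" + str(end) + ".mat", 'rb')
    mat = cPickle.load(f)
    f.close()
    f = open("tfidf_" + prefix + str(begin) + "_" + str(end) + ".voc", 'r')
    voc = [x.decode('utf-8') for x in f.read().split("\n")]
    f.close()
    return mat, voc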
### Command-line entry point: compute tf-idf over a range of JSON files and
### report timings.
parser = argparse.ArgumentParser()
parser.add_argument("max_features", type=int, help="number of max features")
parser.add_argument("prefix", type=str, help="prefix of json files")
parser.add_argument("begin", type=int, help="begin code of json files")
parser.add_argument("end", type=int, help="end code of json files")
parser.add_argument("outputfile", type=str, help="output vocabulary file path")
args = parser.parse_args()

# get stopwords (comma-separated file)
sf = open('chi_,.txt', 'r')
stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]
sf.close()

# load data
st = time.time()
d, txt = getText(prefix=args.prefix, begin=args.begin, end=args.end)
getdatatime = time.time()
print "Loading data cost " + str(getdatatime - st) + " seconds."

# cut words with jieba
corpus = {}
for i in range(len(txt)):
    corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))

# tfidf
vectorizer = tv(max_features=args.max_features, stop_words=stopwords)
tfidf = vectorizer.fit_transform(corpus.values()).toarray()
print tfidf.shape
voc = vectorizer.get_feature_names()
print "Tfidf calculating cost " + str(time.time() - getdatatime) + " seconds."
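# The snippet above parses an "outputfile" argument but never uses it; a minimal
# sketch of the presumably intended step, mirroring the .voc dump in tfidf()
# above: write the extracted vocabulary, one term per line.
f = open(args.outputfile, 'w')
f.write("\n".join(voc).encode('utf-8'))
f.close()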