Example 1
def tfidf(max_features=5000,prefix="extraction-",begin=1, end=26):
    # get stopwords
    sf = open('chi_stopwords.txt', 'r')
    stopwords = [x.strip() for x in sf.read().split(',')]
    sf.close()
    vectorizer=tv(max_features=max_features)#tokenizer=tokenizer)
    d={}
    st=time.time()
    d,txt=getText(prefix=prefix,begin=begin,end=end)
    getdatatime=time.time()
    print getdatatime-st
    corpus={}
    for i in range(len(txt)):#d.items():
        #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
        corpus[i]=(' '.join(jieba.cut(txt[i],cut_all=False)))
    tfidf=vectorizer.fit_transform(corpus.values()).toarray()
    print tfidf.shape
    voc=vectorizer.get_feature_names()
    wordssum = tfidf.sum(axis=0)
    index=range(len(voc))
    index = [index for (y,x,index) in sorted(zip(wordssum,voc,index),reverse=True) if x.encode('utf-8') not in stopwords] 
    print time.time() - st
    voc_sorted = [voc[i] for i in index] 
    tfidfret = []
    print time.time()-getdatatime
    return tfidf,voc,txt
Example 2
def tfidf(max_features=5000, prefix="extraction-", begin=1, end=26):
    # get stopwords
    sf = open('chi_stopwords.txt', 'r')
    stopwords = [x.strip() for x in sf.read().split(',')]
    sf.close()
    vectorizer = tv(max_features=max_features)  #tokenizer=tokenizer)
    d = {}
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print getdatatime - st
    corpus = {}
    for i in range(len(txt)):  #d.items():
        #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
        corpus[i] = (' '.join(jieba.cut(txt[i], cut_all=False)))
    tfidf = vectorizer.fit_transform(corpus.values()).toarray()
    print tfidf.shape
    voc = vectorizer.get_feature_names()
    wordssum = tfidf.sum(axis=0)
    index = range(len(voc))
    index = [
        index
        for (y, x, index) in sorted(zip(wordssum, voc, index), reverse=True)
        if x.encode('utf-8') not in stopwords
    ]
    print time.time() - st
    voc_sorted = [voc[i] for i in index]
    tfidfret = []
    print time.time() - getdatatime
    return tfidf, voc, txt
Example 3
def cluster(self, processed_text):
    # vectorize the pre-processed documents with TF-IDF
    vector = tv(use_idf=True)
    self.matrix = vector.fit_transform(processed_text)
    # cluster the TF-IDF matrix with K-Means; assumes the number of
    # clusters is stored on the instance (e.g. set in __init__)
    self.model = km(n_clusters=self.clusters, n_init=1000)
    self.model.fit(self.matrix)
    self.model_pred = km(n_clusters=self.clusters, n_init=1000)
    self.model_pred.fit_predict(self.matrix)
    return vector
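A minimal usage sketch for the method above, assuming it lives on a small wrapper class that stores the number of clusters; the class name TextClusterer and the toy documents are illustrative assumptions, not part of the original example.

# Hedged usage sketch; the wrapper class and sample documents are assumed.
from sklearn.feature_extraction.text import TfidfVectorizer as tv
from sklearn.cluster import KMeans as km

class TextClusterer(object):
    def __init__(self, clusters):
        self.clusters = clusters

    def cluster(self, processed_text):
        # same method as shown in the example above
        vector = tv(use_idf=True)
        self.matrix = vector.fit_transform(processed_text)
        self.model = km(n_clusters=self.clusters, n_init=1000)
        self.model.fit(self.matrix)
        self.model_pred = km(n_clusters=self.clusters, n_init=1000)
        self.model_pred.fit_predict(self.matrix)
        return vector

docs = ["apple banana fruit", "banana orange fruit", "car engine wheel"]
tc = TextClusterer(clusters=2)
vec = tc.cluster(docs)
print(tc.model.labels_)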
Example 4
def tfidf(max_features=10000,path="/home/tingyubi/20w/data/",prefix="extraction-",begin=1, end=26):
    ### get stopwords
    sf = open('chi_n.txt','r')
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split('\n')]
    sf.close()
    ### load data
    d={}
    st=time.time()
    d,txt=getText(prefix=prefix,begin=begin,end=end)
    getdatatime=time.time()
    print "Loading data cost "+str(getdatatime-st)+" seconds."
    ### cut text
    corpus={}
    for i in range(len(txt)):#d.items():
        #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
        corpus[i]=(' '.join(jieba.cut(txt[i],cut_all=False)))
    jsonfile = "tfidf_cut_"+prefix+str(begin)+"_"+str(end)+".json"
    f = open(jsonfile,'w')
    json.dump(corpus,f)
    #f = open(jsonfile,'r')
    #corpus = json.load(f)
    f.close()
    ### tfidf vectorizer
    vectorizer=tv(max_features=max_features,stop_words=stopwords)#tokenizer=tokenizer)
    tfidf=vectorizer.fit_transform(corpus.values())#.toarray()
    print "Tfidf vectorizing cost "+str(time.time()-getdatatime)+" seconds."
    #print tfidf.shape
    voc=vectorizer.get_feature_names()
    ### sorting vocabulary
    #wordssum = tfidf.sum(axis=0)
    #index=range(len(voc))
    #index = [index for (y,x,index) in sorted(zip(wordssum,voc,index),reverse=True) if x.encode('utf-8') not in stopwords] 
    #voc_sorted = [voc[i] for i in index] 
    ### save to json file
    #jsonfile = "tfidf_"+prefix+str(begin)+"_"+str(end)+".json"
    #data={}
    #data['vocabulary']=voc
    #data['tfidf']=tfidf.tolist()
    #with open(jsonfile,'w') as f:
    #    json.dump(data,f)
    #f.close()
    ### save to pickle file
    pklfile = "tfidf_"+prefix+str(begin)+"_"+str(end)+".mat"
    f = open(pklfile,'wb')
    cPickle.dump(tfidf,f,-1)
    f.close()
    vocfile = "tfidf_"+prefix+str(begin)+"_"+str(end)+".voc"
    f = open(vocfile,'w')
    voca=voc
    f.write("\n".join(voca).encode('utf-8'))
    f.close()
    return tfidf,voc,txt
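For completeness, a short sketch of how the files written above could be read back; the file names assume the default prefix/begin/end arguments, and the snippet stays in the same Python 2 style as the example.

# Hedged sketch: reload the pickled TF-IDF matrix and the vocabulary file
# written above (file names assume the default arguments).
import cPickle

f = open("tfidf_extraction-1_26.mat", 'rb')
tfidf = cPickle.load(f)
f.close()

f = open("tfidf_extraction-1_26.voc", 'r')
voc = [line.strip().decode('utf-8') for line in f]
f.close()

print tfidf.shape, len(voc)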
Example 5
def __init__(self, revisoes, ngram_range=(1, 1)):
    # build a TF-IDF vectorizer over the review texts (revisoes)
    self.vetorizador = tv(ngram_range=ngram_range)
    self.revisoes = revisoes
    self.vetorizar()
    print("\nTFIDFVectorizer finished the %s vectorization." %
          str(ngram_range))
Example 6
# load data
d = {}
st = time.time()
d, txt = getText(prefix=args.prefix, begin=args.begin, end=args.end)
getdatatime = time.time()
print "Loading data cost " + str(getdatatime - st) + " seconds."

# cut words
corpus = {}
for i in range(len(txt)):  #d.items():
    #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
    corpus[i] = (' '.join(jieba.cut(txt[i], cut_all=False)))

# tfidf
vectorizer = tv(max_features=args.max_features,
                stop_words=stopwords)  #tokenizer=tokenizer)
tfidf = vectorizer.fit_transform(corpus.values()).toarray()
print tfidf.shape
voc = vectorizer.get_feature_names()
print "Tfidf calculating cost " + str(time.time() - getdatatime) + " seconds."

# sorting according to tfidf
wordssum = tfidf.sum(axis=0)
index = range(len(voc))
index = [
    index for (y, x, index) in sorted(zip(wordssum, voc, index), reverse=True)
]  #if x not in stopwords]
voc_sorted = [voc[i] for i in index]
f = open(args.outputfile, 'w')
for x in voc_sorted:
    f.write(x.encode("utf-8") + "\n")
f.close()
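The zip/sorted idiom above can be hard to read; the following sketch sorts the vocabulary by total tf-idf weight with numpy instead, assuming the same tfidf array and voc list as in the snippet.

# Equivalent sorting of the vocabulary by total tf-idf weight (illustrative sketch).
import numpy as np

order = np.argsort(tfidf.sum(axis=0))[::-1]   # column sums, highest weight first
voc_sorted = [voc[i] for i in order]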
Example 7
# load data
d={}
st=time.time()
d,txt=getText(prefix=args.prefix,begin=args.begin,end=args.end)
getdatatime=time.time()
print "Loading data cost "+ str(getdatatime-st)+" seconds."

# cut words
corpus={}
for i in range(len(txt)):#d.items():
    #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
    corpus[i]=(' '.join(jieba.cut(txt[i],cut_all=False)))

# tfidf
vectorizer=tv(max_features=args.max_features,stop_words=stopwords)#tokenizer=tokenizer)
tfidf=vectorizer.fit_transform(corpus.values()).toarray()
print tfidf.shape
voc=vectorizer.get_feature_names()
print "Tfidf calculating cost "+str(time.time() - getdatatime)+" seconds."

# sorting according to tfidf
wordssum = tfidf.sum(axis=0)
index = range(len(voc))
index = [index for (y,x,index) in sorted(zip(wordssum,voc,index),reverse=True)] #if x not in stopwords] 
voc_sorted = [voc[i] for i in index] 
f=open(args.outputfile,'w')
for x in voc_sorted:
    f.write(x.encode("utf-8")+"\n")
f.close()
#return tfidf,voc,txt
Example 8
    print("Accuracy: %0.2f (+/- %0.2f)" %
          (train_scores.mean(), train_scores.std() * 2))
    test_scores = classifier.score(X_test, y_test)
    print('Test Scores')
    print(test_scores)
    print("Accuracy: %0.2f (+/- %0.2f)" %
          (test_scores.mean(), test_scores.std() * 2))
    print('')
    '''
    Train a SGD classifier using unigram representation
    with tf-idf, predict sentiments on imdb_te.csv, and write
    output to unigram.output.txt
    '''

    vectorizer = tv(encoding='utf-8',
                    strip_accents='unicode',
                    ngram_range=(1, 1),
                    decode_error='replace')
    vector_data = vectorizer.fit_transform(train_text)

    model_selector = model_selection
    X_train, X_test, y_train, y_test = model_selector.train_test_split(
        vector_data, labels, stratify=labels, test_size=0.2)

    classifier = sgd(loss='hinge', penalty='l1')
    classifier.fit(X_train, y_train)

    train_scores = classifier.score(X_train, y_train)
    print('TF-IDF Unigram Results')
    print('Train Scores')
    print(train_scores)
    print("Accuracy: %0.2f (+/- %0.2f)" %
Example 9
def build_tfidf(corpus):
    # fit a TF-IDF vectorizer on the corpus and return both the matrix and the vectorizer
    vec = tv()
    tfidfs = vec.fit_transform(corpus)
    print(tfidfs.shape)
    return (tfidfs, vec)
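A small usage sketch for build_tfidf, assuming the definition above is in scope; the two-document corpus is purely illustrative.

# Usage sketch with a tiny assumed corpus.
from sklearn.feature_extraction.text import TfidfVectorizer as tv

corpus = ["the cat sat on the mat", "the dog sat on the log"]
tfidfs, vec = build_tfidf(corpus)      # prints the matrix shape, e.g. (2, n_features)
print(sorted(vec.vocabulary_.keys()))  # terms learned from the corpus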
Example 10
# Import the necessary packages for computing similarity between texts.
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.feature_extraction.text import TfidfVectorizer as tv

# Load the texts - The original & The generated
gensis = open('../Genesis.txt','r').read().split('\r')
model_gensis = open('../Model_Genesis.txt','r').read().split('\n')

# Initialize
TfV = tv()
TfV.fit(gensis)
Y = TfV.transform(gensis)

# For every generated sentence, find and print the most similar original sentence
similaritySum = 0
for sentence in model_gensis:
    X = TfV.transform([sentence])
    sims = cs(X, Y)
    print(sentence)
    print(gensis[sims.argmax()])
    print(' ')
    similaritySum += sims.max()

# Average similarity over all generated sentences
similarity = similaritySum / len(model_gensis)
print('The similarity between the original text - Genesis - and the model is: ', similarity)
Example 11
def build_tfidf(corpus):
    # fit a TF-IDF vectorizer on the corpus and return the sparse matrix
    tfidfs = tv().fit_transform(corpus)
    return tfidfs