Example #1
def build_guten():
    sentences = cPickle.load(open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl'))

    senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl'))
    gsubset = cPickle.load(open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten().tolist()
    
    senna = numpy.array(senna)[gsubset].tolist()
    hashtab = dict( zip( senna, range( len( gsubset))))


    vectorizer = tfidf(vocabulary=hashtab,stop_words='english')

    wsentences = []
    avglen = 0
    for s in sentences:
        print '*',
        news = ''
        for w in s:
            print '.',
            news +=' ' + senna[w]
        avglen += len(s)
        wsentences.append(news)
        print ''

    print 'Average sentence size:',avglen/float(len(sentences))
    tfidfmat = vectorizer.fit_transform(wsentences)
    numpy.save('/scratch/rifaisal/data/guten/guten_tfidf.npy',tfidfmat)
    cPickle.dump(vectorizer,open('gutentokenizer.pkl','w'))
    print 'Done!'
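Aside (not from the source above): build_guten() relies on the fact that TfidfVectorizer accepts a term-to-index mapping through vocabulary=, which pins the column layout of the resulting matrix. A minimal, self-contained sketch of that mechanism with a toy vocabulary:

from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

toy_vocab = {'night': 0, 'sea': 1, 'ship': 2}   # toy stand-in for hashtab
vec = tfidf(vocabulary=toy_vocab, stop_words='english')
mat = vec.fit_transform(['the ship sailed at night', 'a calm sea at night'])
print(mat.shape)   # (2, 3): columns are fixed by the supplied vocabulary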
Example #2
def main():

    traindata = (p.read_table('train.tsv'))
    tr_title, tr_body, tr_url = convert_text(traindata)

    testdata = list(np.array(p.read_table('test.tsv'))[:, 2])
    y = np.array(p.read_table('train.tsv'))[:, -1]

    wordCount = cv(stop_words='english', encoding='latin-1')
    wordTFIDF = tfidf(stop_words='english', encoding='latin-1')

    corpus = tr_body

    bag = wordCount.fit_transform(corpus)
    tfdif = wordTFIDF.fit_transform(corpus)

    tfdif = tfdif.toarray()

    kmeans_soln.getDender(bag, tr_title)

    titles = np.array(tr_title)

    vocab = wordCount.get_feature_names()
    vocabTF = wordTFIDF.get_feature_names()

    topWords(centers, vocab)
Example #3
def build_guten():
    sentences = cPickle.load(
        open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl'))

    senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl'))
    gsubset = cPickle.load(
        open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten(
        ).tolist()

    senna = numpy.array(senna)[gsubset].tolist()
    hashtab = dict(zip(senna, range(len(gsubset))))

    vectorizer = tfidf(vocabulary=hashtab, stop_words='english')

    wsentences = []
    avglen = 0
    for s in sentences:
        print '*',
        news = ''
        for w in s:
            print '.',
            news += ' ' + senna[w]
        avglen += len(s)
        wsentences.append(news)
        print ''

    print 'Average sentence size:', avglen / float(len(sentences))
    tfidfmat = vectorizer.fit_transform(wsentences)
    numpy.save('/scratch/rifaisal/data/guten/guten_tfidf.npy', tfidfmat)
    cPickle.dump(vectorizer, open('gutentokenizer.pkl', 'w'))
    print 'Done!'
Example #4
def tfidfVectorize(texts,
                   genKey,
                   vocabulary=None,
                   stop_words=None,
                   min_df=1,
                   max_df=1.0,
                   ngram_range=(1, 1),
                   max_features=None):
    '''This will likely require fixing so that I can pass some of the parameters into this function and leave the remaining ones
    unused - i.e. to have Vectorize(...vocabulary=someVocab, ngram_range=(1,3)) and Vectorize(...stopwords=someStops, max_features=1000).
    See the options here: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

    NOTE: Check by using an empty vocabulary first. This should break. The others are default values and should be okay.
    '''
    print 'vectorizing ', str(len(texts)), ' texts'
    vectorizer = tfidf(texts.values(),
                       stop_words=stop_words,
                       vocabulary=vocabulary,
                       min_df=min_df,
                       ngram_range=ngram_range,
                       max_features=max_features)
    vec = vectorizer.fit_transform(texts.values())
    labels = []
    for k in texts.keys():
        labels.append(genKey[k])
    labels = np.asarray(labels)
    return vec, labels, vectorizer
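Two cautions about the snippet above: the first positional argument of TfidfVectorizer is input (a mode flag such as 'content' or 'filename'), so passing texts.values() to the constructor is almost certainly unintended, and the docstring's wish to forward only some options can be met with keyword arguments. A hedged sketch of that refactor (tfidfVectorize2 is a hypothetical name, not from the source):

from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
import numpy as np

def tfidfVectorize2(texts, genKey, **vectorizer_kwargs):
    # forward only the options the caller supplied, e.g. vocabulary=..., ngram_range=(1, 3)
    vectorizer = tfidf(**vectorizer_kwargs)
    vec = vectorizer.fit_transform(texts.values())
    labels = np.asarray([genKey[k] for k in texts.keys()])
    return vec, labels, vectorizer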
Example #5
def search(term,desc):
    term_l=term.lower()
    desc_l=desc.lower()
    word_tok=tfidf().build_tokenizer()
    desc=word_tok(desc_l)
    terms=word_tok(term_l)
    return all([x in desc for x in terms])
Example #6
def main( questionsFile, articleFile ):
  
  
  questions, oQuestions = read_file( questionsFile, True )
  
  if not questions:
    return
  sentences, oSentences = read_file( articleFile , False  )
  transformation = tfidf( analyzer = 'word', ngram_range = (1,1) , token_pattern =  r'[^ ]+', min_df = 1 ,
                  norm = "l1", use_idf = True, smooth_idf = True, sublinear_tf = False , stop_words = "english")

  XSentences = transformation.fit_transform( sentences )
  XQuestions = transformation.transform( questions )
  y = np.arange( XSentences.shape[0] )
  
  model = KNN( n_neighbors=1, metric = "euclidean")
  model.fit( XSentences, y )

  yQuestions = model.predict( XQuestions )

  #print yQuestions
  for iQuestion, iAnswer in enumerate( yQuestions ) :
    #print iQuestion, iAnswer
    #print oQuestions[iQuestion]
    print oSentences[iAnswer]
    #print
    pass
    #print sentences[ i ]


  pass
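Since y is just the row index of each sentence, the 1-nearest-neighbour "classifier" above is really a nearest-sentence lookup. A rough equivalent (assuming the XSentences, XQuestions and oSentences built above) using NearestNeighbors directly:

from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(XSentences)
_, idx = nn.kneighbors(XQuestions)   # idx[i, 0] is the best sentence for question i
for i in idx[:, 0]:
    print(oSentences[i])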
Example #7
def get_XTrain_XTest ( text, y, cv = 5):
  #This function returns a matrix split in Train and Test data.

  #This line initializes the class that will change the list of texts to matrices. It
  #applies logarithm to the counts and considers only the features that have a 
  #minimum count of 2
  modelT = tfidf( analyzer = 'word', ngram_range = (1,1) , token_pattern = r'[^ ]+', min_df = 2 ,
                  norm = None, use_idf = False, smooth_idf = False, sublinear_tf = True )


  indices_Train, indices_Test = list(), list()
  XTrain, XTest = list(), list()
  cvI = StratifiedKFold(y, cv, indices= True)

  for train, test in cvI:
    indices_Train.append( train )
    indices_Test.append ( test  )
    
    textTrain = [ text[i] for i in train ]
    textTest  = [ text[i] for i in test  ]
    
    modelC = clone( modelT )
    modelC.fit( textTrain )

    XTrain.append( modelC.transform( textTrain ) )
    XTest.append ( modelC.transform( textTest  ) )

  return XTrain, XTest, indices_Train, indices_Test
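Note that StratifiedKFold(y, cv, indices=True) is the pre-0.18 scikit-learn API. Under current scikit-learn the same loop would read roughly as follows (a hedged sketch, reusing text, y, cv and modelT from above):

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=cv)
for train, test in skf.split(text, y):
    textTrain = [text[i] for i in train]
    textTest = [text[i] for i in test]
    modelC = clone(modelT)
    modelC.fit(textTrain)
    XTrain.append(modelC.transform(textTrain))
    XTest.append(modelC.transform(textTest))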
Example #8
def search(term, desc):
    term_l = term.lower()
    desc_l = desc.lower()
    word_tok = tfidf().build_tokenizer()
    desc = word_tok(desc_l)
    terms = word_tok(term_l)
    return all([x in desc for x in terms])
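A quick usage sketch (not from the source): search() returns True only when every token of term occurs in desc, case-insensitively.

print(search("Heart rate", "Normal resting heart rate for adults ranges from 60 to 100"))  # True
print(search("blood pressure", "Normal resting heart rate for adults"))                    # False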
Example #9
def main():

	traindata = (p.read_table('train.tsv'))
	tr_title, tr_body, tr_url = convert_text(traindata)

	testdata = list(np.array(p.read_table('test.tsv'))[:,2])
	y = np.array(p.read_table('train.tsv'))[:,-1]

	wordCount = cv(stop_words = 'english', encoding='latin-1')
	wordTFIDF = tfidf(stop_words = 'english', encoding='latin-1')

	corpus = tr_body

	bag = wordCount.fit_transform(corpus)
	tfdif = wordTFIDF.fit_transform(corpus)

	tfdif = tfdif.toarray()

	kmeans_soln.getDender(bag, tr_title)

	titles = np.array(tr_title)

	vocab = wordCount.get_feature_names()
	vocabTF = wordTFIDF.get_feature_names()

	topWords(centers, vocab)
Example #10
def bag_of_words( clean_train_reviews ):
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.  
    vectorizer = tfidf(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 5000) 

    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of 
    # strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Numpy arrays are easy to work with, so convert the result to an 
    # array
    train_data_features = train_data_features.toarray()
    
    # Take a look at the words in the vocabulary
    vocab = vectorizer.get_feature_names()

    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)

    # For each, print the vocabulary word and the number of times it 
    # appears in the training set
#    for tag, count in zip(vocab, dist):
#        print count, tag
        
    return train_data_features
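Usage is a single call on a list of cleaned review strings. One caveat worth noting: because the tfidf alias here is a TfidfVectorizer, the returned matrix holds tf-idf weights despite the bag-of-words comments, so dist sums weights rather than raw counts. A hedged usage sketch:

reviews = ["the movie was great", "the plot was thin but the acting was great"]
features = bag_of_words(reviews)
print(features.shape)   # (2, n_features), with n_features capped at 5000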
Example #11
def search(query):
	key_terms=searcher(query)
	termlist=[]
	# print(key_terms)
	if not key_terms:
		# print("here")
		key_terms=tfidf().build_tokenizer()(query)
	# print(key_terms)
	ranked={}
	for term in key_terms:
		if term not in termlist:
			ranks,authrs=topic_z_scorer(term)
			if not ranked:
				ranked=authrs
			else:
				ranked=combine_z(ranked,authrs)
			termlist.append(term)
		# print(ranked,authrs)
	for term in key_terms:
		sub_terms=tfidf().build_tokenizer()(query)
		for sub_term in sub_terms:
			if sub_term not in termlist:
				ranks,authrs=topic_z_scorer(sub_term)
				authrs={auth:authrs[auth]*0.75 for auth in authrs}
				if not ranked:
					ranked=authrs
				else:
					ranked=combine_z(ranked,authrs)
				termlist.append(sub_term)

	for term in tfidf().build_tokenizer()(query):
		for word in tfidf().build_tokenizer()(term):
			if word not in termlist:
				ranks,authrs=topic_z_scorer(word)
				authrs={auth:authrs[auth]*1.25 for auth in authrs}
				if not ranked:
					ranked=authrs
				else:
					ranked=combine_z(ranked,authrs)
				termlist.append(word)

	ranks_final=sorted([(ranked[key],key) for key in ranked.keys()])
	ranks_final.reverse()
	# print(ranks_final)
	return [name[1] for name in ranks_final]
Example #12
def search(query):
    key_terms = searcher(query)
    termlist = []
    # print(key_terms)
    if not key_terms:
        # print("here")
        key_terms = tfidf().build_tokenizer()(query)
    # print(key_terms)
    ranked = {}
    for term in key_terms:
        if term not in termlist:
            ranks, authrs = topic_z_scorer(term)
            if not ranked:
                ranked = authrs
            else:
                ranked = combine_z(ranked, authrs)
            termlist.append(term)
        # print(ranked,authrs)
    for term in key_terms:
        sub_terms = tfidf().build_tokenizer()(query)
        for sub_term in sub_terms:
            if sub_term not in termlist:
                ranks, authrs = topic_z_scorer(sub_term)
                authrs = {auth: authrs[auth] * 0.75 for auth in authrs}
                if not ranked:
                    ranked = authrs
                else:
                    ranked = combine_z(ranked, authrs)
                termlist.append(sub_term)

    for term in tfidf().build_tokenizer()(query):
        for word in tfidf().build_tokenizer()(term):
            if word not in termlist:
                ranks, authrs = topic_z_scorer(word)
                authrs = {auth: authrs[auth] * 1.25 for auth in authrs}
                if not ranked:
                    ranked = authrs
                else:
                    ranked = combine_z(ranked, authrs)
                termlist.append(word)

    ranks_final = sorted([(ranked[key], key) for key in ranked.keys()])
    ranks_final.reverse()
    # print(ranks_final)
    return [name[1] for name in ranks_final]
Example #13
def tf(sent):
    words=tfidf().build_tokenizer()(sent.lower())
    tf={}
    for word in words:
        if word in tf:
            tf[word]+=1
        else:
            tf[word]=1
    for key in tf.keys():
        tf[key]=tf[key]/len(words)
    return tf
Example #14
def tf(sent):
    words = tfidf().build_tokenizer()(sent.lower())
    tf = {}
    for word in words:
        if word in tf:
            tf[word] += 1
        else:
            tf[word] = 1
    for key in tf.keys():
        tf[key] = tf[key] / len(words)
    return tf
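The helper above computes relative term frequencies by hand (note that under Python 2 the tf[key] / len(words) division would need float() to avoid truncating to 0). A hedged equivalent using collections.Counter:

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

def tf_counter(sent):
    words = tfidf().build_tokenizer()(sent.lower())
    n = len(words)
    return {word: count / n for word, count in Counter(words).items()}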
Example #15
def activate_tf_idf(clean_text,
                    tags,
                    top_n,
                    model="All",
                    n_estimator=2000,
                    mindf=0.001,
                    ng=1):
    print(model)
    dft = pd.DataFrame({"Data": [txt], "Tag": ['M']})
    clean_text_test, tags_test = cleaner(dft)
    if model == 'None':
        print("\nActivating tfidf analysis for unsupervised learning:")
        tf_idf = tfidf(sublinear_tf=True, min_df=mindf, ngram_range=(0, ng))
        X = tf_idf.fit_transform(clean_text)
        dtm = pd.DataFrame(X.toarray())
        features = tf_idf.get_feature_names()
        return dtm, features
    else:
        print("\nActivating tfidf analysis:")
        tf_idf = tfidf(sublinear_tf=True, min_df=mindf, ngram_range=(0, ng))
        X = tf_idf.fit_transform(clean_text)
        dtm = pd.DataFrame(X.toarray())
        features = tf_idf.get_feature_names()
        print("Total number of features are:", len(features))
        # phrase_extraction_tfidf(dtm, features, clean_text, tags, top_n, tf_idf=tf_idf)
        if top_n:
            indices = np.argsort(tf_idf.idf_)[::-1]
            features = tf_idf.get_feature_names()
            top = top_n
            top_features = [features[i] for i in indices[:top]]
            print("\nHere are the top {} important features:".format(top))
            print(top_features)
        X_train, X_test, y_train, y_test = train_test_split(dtm,
                                                            tags,
                                                            test_size=0.05,
                                                            random_state=40)
        X_test = dtm.iloc[0:31, :]
        y_test = tags[0:31]
        new_test = tf_idf.transform(clean_text_test)
        model_runner(model, n_estimator, X_train, X_test, y_train, y_test,
                     new_test)
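One caveat on the top_n block above: np.argsort(tf_idf.idf_)[::-1] ranks features by descending idf, i.e. it surfaces the rarest terms rather than the most heavily weighted ones. If "important" is meant as "carrying the most tf-idf weight across the corpus", a hedged alternative is to rank the columns of the transformed matrix by their mean weight:

import numpy as np

mean_weight = np.asarray(X.mean(axis=0)).ravel()           # X = tf_idf.fit_transform(clean_text) from above
top_features = [features[i] for i in np.argsort(mean_weight)[::-1][:top_n]]
print(top_features)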
Example #16

# In[45]:

from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.metrics.pairwise import linear_kernel

df = pd.read_pickle('articles.pkl')
df.head()


# In[104]:

wordVector = cv(stop_words = 'english', encoding='latin-1')
wordWeights = tfidf(stop_words = 'english', encoding='latin-1')

corpus = df[df['section_name'] == 'Sports']['content']
corpus = corpus.append(df[df['section_name'] == 'Arts']['content'])
corpus = corpus.append(df[df['section_name'] == 'Business Day']['content'])

bag = wordVector.fit_transform(corpus)
weightybags = wordWeights.fit_transform(corpus)


# In[105]:

weightybags = weightybags.toarray()


# In[106]:
Example #17
terms_desc={}

for term in terms:
    med_term=term.string
    if len(med_term)>1:
        print(med_term+","+term.next_sibling.next_sibling.string)
        filer=tf("r+")
        filer.write(str(term.next_sibling.next_sibling.string))
        terms_desc[med_term]=str(term.next_sibling.next_sibling.string)
        filer.seek(0)
        files.append(filer)

for filer in files:
    print(filer.read())
    filer.seek(0)

from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

learnt=tfidf("file")

transd=learnt.fit_transform(files)

print(transd)

import pickle
with open("learnt-tf.pck","w+b") as ltf:
    pickle.dump(learnt,ltf)
with open("terms-desc.pck","w+b") as ted_def:
    pickle.dump(terms_desc,ted_def)

Example #18
def get_tfidf(df):
    df.label = df.label.apply(lambda x: 1 if x == 'lib' \
        else 0 if x == 'con' else -1)
    v = tfidf()
    return pd.DataFrame(v.fit_transform(df['sent']).toarray())
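A hedged variant (not in the source) that keeps the learned terms as column names; depending on the scikit-learn version the accessor is get_feature_names() or get_feature_names_out():

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

def get_tfidf_named(df):
    v = tfidf()
    X = v.fit_transform(df['sent'])
    # get_feature_names_out() on newer scikit-learn releases
    return pd.DataFrame(X.toarray(), columns=v.get_feature_names())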
Example #19
    for term in terms_desc.keys():
        if search(tfs_list_sorted[0][1], terms_desc[term]) or search(
                tfs_list_sorted[1][1], terms_desc[term]):
            possible_terms[term] = terms_desc[term]
    if len(possible_terms.keys()) <= term_threshold:
        return possible_terms.keys()
    else:
        possiblity = query_terms(tfs_list_sorted[1:], possible_terms)
        if not possiblity:
            print(possiblity)
            return list(possible_terms.keys())
        else:
            return possiblity


import pickle
import json


def searcher(query):
    with open("learnt-tf.pck", "r+b") as ltf, open("terms-desc.json",
                                                   "r") as ted_def:
        learnt = pickle.load(ltf)
        terms_desc = json.load(ted_def)
        return query_terms(tf_tdf_sent(query, learnt)[1], terms_desc)


if __name__ == '__main__':
    print(searcher("My baby is heart"))
    print(tfidf().build_tokenizer()("My baby is weighing less."))
Example #20
files = []
terms_desc = {}

for term in terms:
    med_term = term.string
    if len(med_term) > 1:
        print(med_term + "," + term.next_sibling.next_sibling.string)
        filer = tf("r+")
        filer.write(str(term.next_sibling.next_sibling.string))
        terms_desc[med_term] = str(term.next_sibling.next_sibling.string)
        filer.seek(0)
        files.append(filer)

for filer in files:
    print(filer.read())
    filer.seek(0)

from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

learnt = tfidf("file")

transd = learnt.fit_transform(files)

print(transd)

import pickle
with open("learnt-tf.pck", "w+b") as ltf:
    pickle.dump(learnt, ltf)
with open("terms-desc.pck", "w+b") as ted_def:
    pickle.dump(terms_desc, ted_def)
Example #21
    if len(tfs_list_sorted)<2:
        #print("Exhausted")
        if len(terms_desc)<10:
            return list(terms_desc.keys())
        else:
            return []
    for term in terms_desc.keys():
        if search(tfs_list_sorted[0][1],terms_desc[term]) or  search(tfs_list_sorted[1][1],terms_desc[term]):
            possible_terms[term]=terms_desc[term]
    if len(possible_terms.keys())<=term_threshold:
        return possible_terms.keys()
    else:
        possiblity=query_terms(tfs_list_sorted[1:],possible_terms)
        if not possiblity:
            print(possiblity)
            return list(possible_terms.keys())
        else:
            return possiblity
    
import pickle
import json
def searcher(query):
    with open("learnt-tf.pck","r+b") as ltf,open("terms-desc.json","r") as ted_def:
        learnt=pickle.load(ltf)
        terms_desc=json.load(ted_def)
        return query_terms(tf_tdf_sent(query,learnt)[1],terms_desc)

if __name__ == '__main__':
    print(searcher("My baby is heart"))
    print(tfidf().build_tokenizer()("My baby is weighing less."))
Example #22
    print(" new pred")
    print(clfnb.predict(new_test.toarray()))
    print ("Model Accuracy for cross validation - ", np.mean(cross_val_score(clfnb, X_test, y_test, cv=10)))
    print ("Accuracy of the model on testing data: {}% ".format(accuracy_score(y_test, prediction) * 100))
    print ("F1 score: {}".format(f1_score(y_test, prediction, average='micro')))
    print("Confusion Matrix: ")
    print (confusion_matrix(y_test, prediction))
    return True


if __name__ == "__main__":
    df = pd.read_csv("C:\\Users\Sheel\PycharmProjects\\NLP\Data_Set.txt", sep="\t", encoding='latin-1', header=None)
    dsi = {"URL": df[0].tolist(), "Tag": df[1].tolist(), "Data": df[2].tolist()}
    dsi_df = pd.DataFrame(dsi)
    clean_text, tags= cleaner(dsi_df)
    cv = tfidf(sublinear_tf=True, min_df=0.05)
    X = cv.fit_transform(clean_text)
    dtm = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())
    X_train=dtm
    y_train=tags
    X_test= dtm.iloc[0:31,:]
    y_test= tags[0:31]
    txt = """Michel Salama HerszageComcast adquire mais de 75% das ações da Sky Operação movimentou US$ 40 bilhões  A Comcast informou que garantiu mais de 75 por cento das ações da Sky, aproximando-se de finalizar a aquisição do grupo britânico de TV paga por 40 bilhões de dólares.  A empresa norte-americana de televisão a cabo Comcast disse anteriormente que espera que a aquisição seja concluída até o final de outubro.  No mês passado, a Comcast emergiu triunfante na longa batalha pela Sky depois de uma disputa com a Twenty-
8:51 PMMichel Salama HerszageFirst Century Fox, de Rupert Murdoch, em um leilão.  A Comcast informou em comunicado nesta quinta-feira que, até 9 de outubro, quando concluir a compra da participação de 39 por cento da Twenty-First Century Fox, já manterá ou terá recebido aceitações em mais de 75 por cento do capital social da Sky.  A empresa disse que um novo anúncio será feito no devido tempo. """
    dft = pd.DataFrame({"Data": [txt], "Tag": ['M']})
    clean_text_test, tags_test = cleaner(dft)
    new_test = cv.transform(clean_text_test)
    # print(new_test)
    run_random_forest_classifier()
    run_nb_classifier()
    print (dtm.shape)
Example #23
def tfidf_maker(df_parole):
    maker = tfidf()
    matrix = np.matrix(maker.fit_transform(counter_maker(df_parole)).toarray())
    df_tfidf = pd.DataFrame(matrix, index=df_parole['groupe'])
    return df_tfidf
Example #24
            # use str.replace() to remove any instances of the words
            for w in [
                    "sara", "shackleton", "chris", "germani", "sshacklensf",
                    "cgermannsf"
            ]:
                text = text.replace(w, '')

            # append the text to word_data
            word_data.append(text)
            # append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            if name == 'sara':
                from_data.append(0)
            else:
                from_data.append(1)
            email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "w"))
pickle.dump(from_data, open("your_email_authors.pkl", "w"))

print word_data[152]

# in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

vec = tfidf(stop_words='english')
matrix = vec.fit_transform(word_data)
Example #25
 def __init__(self, n_grams=[1, 2, 3], config_identifier='n_gram_123'):
     super(TfIdfVectorizer, self).__init__(config_identifier=config_identifier)
     self._vec = tfidf(ngram_range=(min(n_grams), max(n_grams)))
     self.n_grams = n_grams
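The snippet shows only the constructor of the wrapper. A hedged sketch of how the rest of such a wrapper might delegate to the underlying vectorizer (class and method names here are assumptions, not taken from the source):

from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

class TfIdfVectorizerSketch:
    def __init__(self, n_grams=(1, 2, 3)):
        self._vec = tfidf(ngram_range=(min(n_grams), max(n_grams)))
        self.n_grams = n_grams

    def fit(self, texts):
        self._vec.fit(texts)
        return self

    def transform(self, texts):
        return self._vec.transform(texts)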
Example #26
if __name__ == '__main__':
    print("读入训练数据...")
    TrainIndex, TrainTitle, TrainDescri, TrainWhole, TrainCount = loadDateSet(
        '../ag_news_csv/train.csv')
    print("读入测试数据...")
    TestIndexTarget, TestTitle, TestDescri, TestWhole, TestCount = loadDateSet(
        '../ag_news_csv/test.csv')

    print("生成TF-IDF词向量空间...")
    Title = TrainTitle + TestTitle  # --------------------------------------------------------------title合集
    Descri = TrainDescri + TestDescri  # -----------------------------------------------------------descri合集
    Whole = TrainWhole + TestWhole  # --------------------------------------------------------------Whole合集
    cvector = CountVectorizer(
        stop_words='english', min_df=2,
        max_features=100)  # -----------------------------------特征提取器,避开英文停用词
    transformer = tfidf(
    )  # -----------------------------------------------------------------------计算tfidf特征
    temp = cvector.fit_transform(
        Title
    )  # ------------------------------------------------------每一行是一篇文章的词向量
    # temp = transformer.fit_transform(cvector.fit_transform(Title))#------------------------------每一行是一篇文章的词向量
    TrainTitle_bag = temp[0:TrainCount]
    TestTitle_bag = temp[TrainCount:temp.shape[0]]
    temp = cvector.fit_transform(
        Descri
    )  # ------------------------------------------------------每一行是一篇文章的词向量
    # temp = transformer.fit_transform(cvector.fit_transform(Descri))#-----------------------------每一行是一篇文章的词向量
    TrainDescri_bag = temp[0:TrainCount]
    TestDescri_bag = temp[TrainCount:temp.shape[0]]
    temp = cvector.fit_transform(
        Whole
    )  # ------------------------------------------------------每一行是一篇文章的词向量
Example #27
#boxplot of absolute frequencies
plt.boxplot(presence)
plt.show()

#plot of absolute frequencies
plt.plot(xrange(len(presence)), sorted(presence))
plt.show()


from sklearn.metrics.pairwise import pairwise_distances
distances = pairwise_distances(data, metric='euclidean')
plt.imshow(distances); plt.title('Euclidean Similarity of initial data')
plt.colorbar()


distances = pairwise_distances(data, metric='cosine')
plt.figure(4)
plt.imshow(distances); plt.title('Cosine Similarity of initial data')
plt.colorbar()
plt.show()

""" Processing data """
from sklearn.feature_extraction.text import TfidfTransformer as tfidf
transformer = tfidf()
data = transformer.fit_transform(data)

distances = pairwise_distances(data, metric='cosine')
plt.figure(6)
plt.imshow(distances); plt.title('Cosine Similarity of tfidf data')
plt.colorbar()
plt.show()
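Aside (not from the source): TfidfTransformer, used here, reweights an existing count matrix, whereas TfidfVectorizer in the other examples starts from raw text. With default settings the two routes produce the same matrix:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

docs = ["the quick brown fox", "the lazy dog", "the quick dog"]
counts = CountVectorizer().fit_transform(docs)
a = TfidfTransformer().fit_transform(counts)
b = TfidfVectorizer().fit_transform(docs)
print(abs(a - b).max())   # 0.0 -- identical up to floating point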
Example #28

# In[3]:


np.argwhere(np.isnan(embed))


# In[4]:



document = pd.concat((train['comment_text'],test['comment_text']))
document.fillna('')

tfidf_1gram = tfidf(stop_words="english", ngram_range=(1,4), max_features=50000, sublinear_tf=True, strip_accents="unicode", min_df=3, max_df=0.9)
#tfidf_2gram = tfidf(stop_words="english", ngram_range=(2,4), max_features=20000, sublinear_tf=True, strip_accents="unicode", min_df=3)
#tfidf_chargram = tfidf(encoding='unicode', analyzer='char', ngram_range=(2,6), sublinear_tf=True, max_features=40000)

tfidf_1gram = tfidf_1gram.fit(document)
#tfidf_2gram = tfidf_2gram.fit(document)
#tfidf_chargram = tfidf_chargram.fit(document)
train_f= pd.read_csv("train_f.csv")
test_f = pd.read_csv("test_f.csv")

train_tfidf = tfidf_1gram.transform(train['comment_text'])
test_tfidf = tfidf_1gram.transform(test['comment_text'])


# In[10]:
Example #29
findElbow(x_iris)

# In[45]:

from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.metrics.pairwise import linear_kernel

df = pd.read_pickle('articles.pkl')
df.head()

# In[104]:

wordVector = cv(stop_words='english', encoding='latin-1')
wordWeights = tfidf(stop_words='english', encoding='latin-1')

corpus = df[df['section_name'] == 'Sports']['content']
corpus = corpus.append(df[df['section_name'] == 'Arts']['content'])
corpus = corpus.append(df[df['section_name'] == 'Business Day']['content'])

bag = wordVector.fit_transform(corpus)
weightybags = wordWeights.fit_transform(corpus)

# In[105]:

weightybags = weightybags.toarray()

# In[106]:

bag = bag.toarray()
Example #30
porter = PorterStemmer()
wordnet = WordNetLemmatizer()
d = []
data_stem_lem = []
for doc in data_st:
    for word in doc:
        """word = porter.stem(word) """
        word = wordnet.lemmatize(word)
        d.append(word)
    data_stem_lem.append(d)
    d = []

# reverse of tokenisation for each document
X = [(" ").join(doc) for doc in data_stem_lem]
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
T = tfidf()  # define tfidf object
T.fit(X)  # fit for your data for td * idf value
data_TFIDF = T.transform(
    X)  # transform to tf*idf array this will be a sparse matrix

#featuresNames =  T.get_feature_names() get all the feature names

# Sparsity reduction
f = []
for i in range(data_TFIDF.shape[1]):
    # removing 99% sparsity column (here removing 99% zero value columns i.e. 1% non zero)
    if (data_TFIDF[:, i].count_nonzero() / data_TFIDF.shape[0]) > 0.01:
        f.append(i)

#Final data
X = data_TFIDF[:, f]
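A hedged alternative to the manual sparsity-reduction loop above: min_df given as a float makes the vectorizer drop terms that appear in fewer than that fraction of documents before the matrix is built, which is roughly the same 1% cut applied at fit time. Assuming the same list of joined documents built above:

docs = [(" ").join(doc) for doc in data_stem_lem]   # same join as above
T2 = tfidf(min_df=0.01)                             # keep terms present in at least 1% of documents
X_alt = T2.fit_transform(docs)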
Example #31
"""

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.feature_extraction.text import CountVectorizer as vectorizer

ngram = Pipeline([('features',
                   FeatureUnion([('wrd',
                                  tfidf(binary=False,
                                        max_df=1.0,
                                        min_df=2,
                                        norm='l2',
                                        sublinear_tf=True,
                                        use_idf=True,
                                        lowercase=True)),
                                 ('char',
                                  tfidf(analyzer='char',
                                        ngram_range=(3, 6),
                                        binary=False,
                                        max_df=1.0,
                                        min_df=2,
                                        norm='l2',
                                        sublinear_tf=True,
                                        use_idf=True,
                                        lowercase=True))])),
                  ('clf', LinearSVC())])
words = Pipeline([('features',
Example #32
import networkx as nx


headerFilename="header.csv"

file=open(headerFilename)
headers=file.readlines()[0].split('\r')

filename="centroids.csv"

data=np.genfromtxt(filename,delimiter=" ")
print(data.shape)



a=tfidf()
tData=a.fit_transform(data)
tData=tData.toarray()
plt.figure()
plt.subplot(2,1,1)
plt.imshow(data,vmin=0,vmax=1)
plt.title("original centroids")
plt.colorbar()
plt.subplot(2,1,2)
plt.imshow(tData,vmin=0,vmax=1)
plt.colorbar()
plt.title("tf-idf")


# tData=tData-np.mean(tData,0)
# plt.figure()
Example #33
    test.append(t.tokenize(rm[i], wakati=True))
for i in range(len(train)):
    sumt.append(train[i])
for i in range(len(test)):
    sumt.append(test[i])

#build space-separated (wakati) documents
x_doc = []
for i in range(len(sumt)):
    doc = ""
    for j in range(len(sumt[i])):
        doc = doc + " " + sumt[i][j]
    x_doc.append(doc)

#TF-IDF
tivec = tfidf()
x = tivec.fit_transform(x_doc)
x = x.toarray()

#split the recognized documents and the script
x_train = x[0:int(len(x) / 2)]
x_test = x[int(len(x) / 2):len(x)]

out = []
#swap in the best-matching script line
for i in range(len(x_train)):
    num = 0
    sim = np.dot(x_train[i], x_test[0]) / (np.linalg.norm(x_train[i]) *
                                           np.linalg.norm(x_test[0]))
    for j in range(len(x_test)):
        sim2 = np.dot(x_train[i], x_test[j]) / (np.linalg.norm(x_train[i]) *
                                                np.linalg.norm(x_test[j]))
Example #34
        stopwords = [
            "sara", "shackleton", "chris", "germani", "sshacklensf",
            "cgermannsf"
        ]
        for word in stopwords:
            text = text.replace(word, "")
        ### append the text to word_data
        word_data.append(text)

        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)
        email.close()

print("emails processed")
print("word data:", word_data[152])

from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "wb"))
pickle.dump(from_data, open("your_email_authors.pkl", "wb"))

### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

vectorizer = tfidf(stop_words='english')
transformed = vectorizer.fit_transform(word_data)
print(len(vectorizer.get_feature_names()))
print(vectorizer.get_feature_names()[34597])
Example #35
		charged protons from one another. Under certain circumstances, the repelling electromagnetic 
		force becomes stronger than the nuclear force. In this case, the nucleus splits and leaves 
		behind different elements. This is a form of nuclear decay.'''

#dependencies
import re  #regular expression
from nltk.tokenize import sent_tokenize as st, word_tokenize as wt  #for tokenization
from nltk.corpus import stopwords  #stop words
from nltk.stem import WordNetLemmatizer as wl  #for lemmatization

wordnet = wl()  #object creation for lemmatization
corpus = []  #empty list
sentences = st(para)  #tokenizing the paragraph to sentences

for i in range(len(sentences)):
    rev = re.sub(
        '[^a-zA-Z]', ' ',
        sentences[i])  #replace all the letters by space except the alphabets
    rev = rev.lower()  #lower the senteces
    rev = rev.split()  #each word gets converted to an element of a list
    rev = [
        wordnet.lemmatize(word) for word in rev
        if word not in stopwords.words('english')
    ]
    rev = ' '.join(rev)
    corpus.append(rev)

#creating TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
cv = tfidf()  #object creation
x = cv.fit_transform(corpus).toarray()  #transforming