def __init__(self, **kwargs):
    self.vectorizer = text.TfidfVectorizer(**kwargs)
# In[30]:

skip_gram = SGHS('/DATA/1_DataCache/FinCorpus/skip_gram.model')

# In[33]:

skip_gram.key_words(['保险'])

# In[34]:

from sklearn.feature_extraction import text

# In[37]:

l = text.TfidfVectorizer()

# In[42]:

skip_gram.model.wv.most_similar('保险')

# In[27]:

skip_gram = SGHS(skip_gram)

# In[ ]:

skip_gram.key_words(ji)

# In[6]:
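# SGHS is a project-specific wrapper that is not defined in this notebook. A minimal
# sketch of what it presumably wraps, assuming a gensim skip-gram model saved at the
# path used above (class body and key_words signature are assumptions):
from gensim.models import Word2Vec

class SGHS:
    def __init__(self, model_path):
        # load a previously trained skip-gram model from disk
        self.model = Word2Vec.load(model_path)

    def key_words(self, words, topn=10):
        # return the most similar vocabulary terms for each query word
        return {w: self.model.wv.most_similar(w, topn=topn) for w in words}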
Example no. 3
def detect_topic(instances, labels, sentence, 
                 ndim=5,
                 n_gram_range=(1,1),
                 n_max_features=None):
    
    highlight_word = ""
    
    svd_model = TruncatedSVD(n_components=ndim,
                             algorithm='randomized',
                             n_iter=10, random_state=42)

    preprocessor = TokenHandler.TrTokenHandler(stopword=True, more_stopwords=None,
                                               stemming=True,
                                               remove_numbers=True,
                                               deasciify=False, remove_punkt=True)

    tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=preprocessor,
                                                  ngram_range=n_gram_range,
                                                  max_features=n_max_features)
    svd_transformer = skpipeline.Pipeline([('vectorizer', tfidf_vectorizer),
                                           #('normalizer', skprep.Normalizer()),
                                           ('scaler', skprep.StandardScaler(with_mean=False)),
                                           ('svd', svd_model)])

    docmatrix = svd_transformer.fit_transform(instances)

    input_ = preprocessor(sentence)
    if len(input_) < 1 or len("".join(input_)) < 1:
        highlight_word = ""
        return highlight_word

    inputmatrix = svd_transformer.transform(input_)
     
    termmatrix = svd_model.components_.T
    print(termmatrix.shape)

    print(inputmatrix.shape)
    print(docmatrix.shape)


    # closest docs
    # @TODO different similarity metrics
    docsim, docindices = list_utils.matrix_similarity(inputmatrix, docmatrix, top_N=10)
    for i,w in enumerate(input_):
        print(w)
        sim_docs = [labels[j] for j in docindices[i]]
        print("most similar docs: ", ", ".join(sim_docs))
        sim_vals = docsim[i]
        print(sim_vals)
        print()
    
    # closest terms -> the input word which has the largest similarity value
    termsim, termindices = list_utils.matrix_similarity(inputmatrix, termmatrix, top_N=10)
    allterms = tfidf_vectorizer.get_feature_names()
    for i,w in enumerate(input_):
        print(w)
        sim_terms = [allterms[j] for j in termindices[i]]
        print("most similar terms: ", ", ".join(sim_terms))
        sim_vals = termsim[i]
        print(sim_vals)
        print(sum(sim_vals))

    # the heaviest term
    similarity_threshold = 0.0  # @TODO should be inferred from the data_matrix
    
    total_termsim_per_instance = np.sum(termsim, axis=1)
    max_sim = total_termsim_per_instance.max()
    max_index = total_termsim_per_instance.argmax()
    #print("max -> ", input_[max_index], " : ",max_sim)
    
    if max_sim <= similarity_threshold:
        highlight_word = ""
        return highlight_word
    
    highlight_word = input_[max_index]
    return highlight_word    
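# list_utils.matrix_similarity above is a project-specific helper. A minimal sketch of
# what it appears to do, assuming cosine similarity and a (values, indices) return:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def matrix_similarity_sketch(query_matrix, target_matrix, top_N=10):
    # pairwise cosine similarities between every query row and every target row
    sims = cosine_similarity(query_matrix, target_matrix)
    # indices of the top_N most similar targets per query row, in descending order
    indices = np.argsort(-sims, axis=1)[:, :top_N]
    values = np.take_along_axis(sims, indices, axis=1)
    return values, indices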
                                                       test_size=0.25,
                                                       random_state=53)
print([(len(data), type(data)) for data in [X_train, X_test, y_train, y_test]])

#==============================================================================
# LA: Linear SVC (l2 regularized, l2-loss, dual optimization)
# Preprocessing knobs: min_df(0.02,0.033,0.04, 0.05), max_df(0.95)
#                      ngram_range: (1, 1), (1, 2)
#                      use_idf: True(tf-idf), False(tf)
#                      binary: False, True(use_idf=False, norm=None)
# LA knobs: C: {0.0001, 0.01, 0.1, 1.0, 10.0, 100.0, 500.0, 700.0, 1000.0}
#           loss: l1, l2
# CV: stratified 10-fold
#     stratified shuffle split, n_iter = 10, test_size = 0.25
#==============================================================================
tfidf = text.TfidfVectorizer()
tfidf.set_params(analyzer="word",
                 max_df=0.95,
                 ngram_range=(1, 1),
                 use_idf=True)
svc = svm.LinearSVC()
svc.set_params(verbose=1, loss="squared_hinge", random_state=53)  # "squared_hinge" is the current name for the old "l2" loss
pip_svm = pipes.Pipeline([("tfidf", tfidf), ("lalg", svc)])
parameter_grid = [{
    "tfidf__min_df": [7, 0.02, 0.033, 0.04, 0.05],
    #"tfidf__ngram_range":[(1, 1), (1, 2)],
    #"tfidf__use_idf":[True, False],
    #"tfidf__binary":[False, True],
    "lalg__C": [0.0001, 0.01, 0.1, 1.0, 10.0, 100.0, 500.0, 700.0, 1000.0],
    #"lalg__loss": ["l2", "l1"]
}]
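# A minimal sketch (not part of the original snippet) of actually running the search the
# comment block above describes, using the stratified 10-fold variant:
from sklearn import model_selection

searcher = model_selection.GridSearchCV(pip_svm,
                                        parameter_grid,
                                        cv=model_selection.StratifiedKFold(n_splits=10),
                                        scoring="accuracy",
                                        n_jobs=-1)
searcher.fit(X_train, y_train)
print(searcher.best_params_, searcher.best_score_)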
Example no. 5
    if args.ngrams > 0:
        ngram_range = (1, args.ngrams)
    else:
        ngram_range = None

    # Verify that the hyperparameter values are valid.
    assert n_estimators > 0
    assert min_child_samples > 1
    assert type(ngram_range) is tuple and len(ngram_range) == 2
    assert ngram_range[0] > 0 and ngram_range[0] <= ngram_range[1]

    # Define the pipeline that featurizes the text columns.
    featurization = [
        (column,
         make_pipeline(ItemSelector(column),
                       text.TfidfVectorizer(ngram_range=ngram_range)))
        for column in feature_columns
    ]
    features = FeatureUnion(featurization)

    # Define the estimator that learns how to classify duplicate-original question pairs.
    estimator = lgb.LGBMClassifier(n_estimators=n_estimators,
                                   min_child_samples=min_child_samples,
                                   verbose=args.verbose)

    # Define the model pipeline as feeding the features into the estimator.
    model = Pipeline([('features', features), ('model', estimator)])

    # Fit the model.
    print('Training...')
    model.fit(train_X, train_y, model__sample_weight=sample_weight)
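# ItemSelector is not part of scikit-learn. A minimal sketch of such a transformer,
# assuming it simply picks one column out of a pandas DataFrame for the vectorizer:
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    """Select a single column from a DataFrame so TfidfVectorizer receives raw strings."""

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]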
Example no. 6
        job_keyword = data["api_data"]["job_keywords"][0]
        job_keyword = job_keyword.replace(' ', '_')
        tags.append(job_keyword)

#        break

import sklearn.preprocessing as preprocessing
import sklearn.feature_extraction.text as text

encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(tags)

# all the tags that will be given labels
encoder.classes_

tfidf = text.TfidfVectorizer(stop_words="english", max_features=5000)  # corpus is passed to fit_transform below, not to the constructor

tfidf_matrix = tfidf.fit_transform(corpus)

tfidf_matrix.shape

import sklearn.model_selection as model_selection

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    tfidf_matrix, y, test_size=0.20, random_state=42)

from keras.layers import Conv2D, MaxPool2D, Flatten

from keras.layers import Dense, Activation

from keras.models import Sequential
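# A hedged sketch of the classifier these imports point to (layer sizes assumed, not from
# the original): a dense network over the TF-IDF features, predicting the encoded job tag.
# Keras needs dense arrays, hence .toarray() on the sparse TF-IDF matrices.
n_classes = len(encoder.classes_)
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(n_classes, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train.toarray(), y_train,
          epochs=5, batch_size=32,
          validation_data=(x_test.toarray(), y_test))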
Example no. 7
# Get Kanyes's Tweets
kanyeTweets = getTweets('kanyewest')

# Get Elon's Tweets
elonTweets = getTweets('elonmusk')

# ******************* Begin Setup for Logistic Regression Model *******************
kanyes = ' '.join(
    kanyeTweets
)  # For the Tfidf - Long string of every Kanye Tweet connected by spaces
elons = ' '.join(
    elonTweets
)  # For the Tfidf - Long string of every Elon Tweet connected by spaces
total = [kanyes, elons]
vector = sk.TfidfVectorizer(
)  # Create a Tfidf Vectorizer object - Used to compare importance of each word tweeted by Elon and Kanye
vector.fit(total)

kanyeTweets = zip(
    kanyeTweets, [0] * len(kanyeTweets)
)  # For the Regression model, makes a tuple connecting a 0 to each Kanye Tweet
kanyeTweets = [tweet for tweet in kanyeTweets]
elonTweets = zip(
    elonTweets, [1] * len(elonTweets)
)  # For the Regression model, makes a tuple connecting a 1 to each Elon Tweet
elonTweets = [tweet for tweet in elonTweets]
df = pandas.DataFrame(kanyeTweets + elonTweets, columns=[
    'Text', 'Target'
])  # DataFrame containing each tweet and their respective 1 or 0

# Make a dataframe specific to Kanye and Elon, will be used to send specific tweets from each public figure to the model
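# A hedged sketch of the next step the comments describe: vectorize every tweet with the
# fitted TF-IDF object and fit a logistic regression on the 0/1 targets.
from sklearn.linear_model import LogisticRegression

X = vector.transform(df['Text'])
clf = LogisticRegression()
clf.fit(X, df['Target'])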
Example no. 8
def main(config):
    """
        Function to run the training process
        :param config: SimpleNamespace config object
    """
    train_df = pd.read_csv(config.train_filepath,
                           usecols=config.data["columns"])
    train_df.rename(columns=config.data["names"], inplace=True)

    test_df = pd.read_csv(config.test_filepath, usecols=config.data["columns"])
    test_df.rename(columns=config.data["names"], inplace=True)

    print(
        f"Train labels distribution:\n{train_df.loc[:,'labels'].value_counts()}"
    )
    print(
        f"Test labels distribution:\n{test_df.loc[:,'labels'].value_counts()}\n"
    )

    # Create validation folds
    train_df["fold"] = -1
    skf = model_selection.StratifiedKFold(n_splits=config.num_folds)
    for fold, (train_index, val_index) in enumerate(
            skf.split(train_df, train_df.loc[:, "labels"])):
        train_df.loc[val_index, "fold"] = fold

    # Iterate folds and run train_models
    model_list = []
    folds = np.unique(train_df.loc[:, "fold"].values)

    print("\n~~~~Running training and cross validation~~~~\n")
    for fold in folds:
        start_time = time.perf_counter()

        train_fold = train_df.loc[train_df.loc[:,
                                               "fold"] != fold, :].reset_index(
                                                   drop=True)
        val_fold = train_df.loc[train_df.loc[:,
                                             "fold"] == fold, :].reset_index(
                                                 drop=True)

        text_encoder = text.TfidfVectorizer(tokenizer=word_tokenize)
        clf = naive_bayes.MultinomialNB()

        model = pipeline.Pipeline([('text_enc', text_encoder), ('clf', clf)])
        model.fit(train_fold.loc[:, "feature"].values,
                  train_fold.loc[:, "labels"].values)

        # Evaluate score
        val_probs = model.predict_proba(val_fold.loc[:, "feature"].values)
        roc_auc_score = metrics.roc_auc_score(val_fold.loc[:, "labels"].values,
                                              val_probs[:, 1])
        # Calculate accuracy
        val_tags = model.predict(val_fold.loc[:, "feature"].values)
        accuracy_score = metrics.accuracy_score(
            val_fold.loc[:, "labels"].values, val_tags)

        # Append in the list to evaluate on test
        model_list.append(model)

        print(
            f"fold: {fold}, roc auc socre: {roc_auc_score:.2f}, accuracy_score: {accuracy_score:.2f}, fold time: {(time.perf_counter()-start_time):.2f} seconds"
        )

    # Run evaluation on tests
    print("\n~~~~Running evaluation on held out test data~~~~\n")
    running_score, running_accuracy = 0, 0
    for fold, model in enumerate(model_list):
        test_text = test_df.loc[:, "feature"].values
        test_class = test_df.loc[:, "labels"].values

        label_probs = model.predict_proba(test_text)[:, 1]
        label_tags = model.predict(test_text)

        # Calculate accuracy
        accuracy_score = metrics.accuracy_score(test_class, label_tags)
        running_accuracy += accuracy_score

        roc_auc_score = metrics.roc_auc_score(test_class, label_probs)
        running_score += roc_auc_score

        print(
            f"FOLD: {fold}, auc_roc_score on held out test: {roc_auc_score}, accuracy: {accuracy_score}"
        )

    print(
        f"Average auc_roc_score acorss all folds on test data is: {running_score/len(model_list)}"
    )
    print(
        f"Average accuracy acorss all folds on test data is: {running_accuracy/len(model_list)}"
    )
    lambda x: re.sub("[a-zA-Z0-9():\-_ \'\.\/]", "", x))
print(orig_train2.shape)
print(orig_test2.shape)

# TFIDF on those specific chars
tfidf = text.TfidfVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='strict',
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    analyzer='char',  # stop_words=[chr(x) for x in range(97,123)]+[chr(x) for x in range(65,91)]+['_','.',':'],
    token_pattern='(?u)\\b\\w\\w+\\b',
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=None,
    vocabulary=None,
    binary=True,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False)
orig_train2 = tfidf.fit_transform(orig_train2)

# Simple naive bayes on the text features
model = naive_bayes.BernoulliNB()
model.fit(orig_train2, orig_target)
Example no. 10
if process_all:
    load_grammar = True
else:
    load_grammar = False
if load_grammar:
    grammar_processed_data = joblib.load('grammar-processed.pkl')
else:
    grammar_processed_data = grammar_processor.fit_transform(
        loaded_data, train.target)
processed_test_data = grammar_processed_data

if process_all:
    joblib.dump(grammar_processed_data, 'grammar-processed.pkl')

if False:
    text_processor = text.TfidfVectorizer(stop_words='english')
    processed_train_data = text_processor.fit_transform(
        train.data, train.target)
    # Cheating, but what we did with eval_auto_classifier.py
    processed_test_data = processed_train_data
    # Correct
    #processed_test_data = text_processor.fit_transform(train.test)

test = train


def eval_model(name, model, data):
    print '=' * 20
    print name, 'training'
    model.fit(data, train.target, sample_weight=sample_weights)
    print name, 'trained'
Example no. 11
business = pd.read_json('business.json', lines=True) 
business.drop(['address','city','state','postal_code','latitude','longitude', 'review_count', 'is_open', 'attributes', 'categories', 'hours'], axis=1, inplace=True)


df_review_agg = df.groupby('business_id')['text'].sum()
df_ready_for_sklearn = pd.DataFrame({'business_id':df_review_agg.index, 'all_reviews':df_review_agg.values})
df3 = pd.merge(df_ready_for_sklearn, business, on="business_id")
df3.drop(['business_id','name'],axis=1,inplace=True)
# df3 just contains reviews and the stars associated with each business



# model stuff
vectorizer = sk_text.CountVectorizer(min_df=1)
corpus = df3['all_reviews']
vectorizer = sk_text.TfidfVectorizer(max_features = 5000,min_df=5)
matrix = vectorizer.fit_transform(corpus)
# x = tfidf_data
x = matrix.toarray()
y = df3.iloc[:,1].values

#
#
#    
#
#
#

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=42)
# the first hidden layer has to have an input dimension matching the dimension of a row in x
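# A minimal sketch (layer sizes assumed) of the network hinted at by the comment above:
# the first hidden layer's input_dim must equal the number of TF-IDF features in a row of x.
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(1))  # single output, e.g. the star rating
model.compile(optimizer='adam', loss='mse')
model.fit(x_train, y_train, epochs=5, batch_size=32,
          validation_data=(x_test, y_test))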
Example no. 12
 def _get_vectorizers(self, cols_params):
     return [(col, text.TfidfVectorizer(**params))
             for col, params in cols_params]
Example no. 13
def generate_tfidf(text_corpora):
    vectorizer = tf.TfidfVectorizer(lowercase=False)
    vectorizer.fit(text_corpora)
    vector = vectorizer.transform(text_corpora)
    return vector
processedText = processedText.str.lower()

###Stop word removal##

stop_words = nltk.corpus.stopwords.words('english')
processedText = processedText.apply(lambda x: ' '.join(
    term for term in x.split() if term not in set(stop_words)))

### STEMMING ###

porter = nltk.PorterStemmer()
processedText = processedText.apply(
    lambda x: ' '.join(porter.stem(term) for term in x.split()))

## Feature Extraction : TF-IDF Vectorizer
vectorizer = text.TfidfVectorizer(ngram_range=(1, 1))
X_ngrams = vectorizer.fit_transform(processedText)

X_train, X_test, y_train, y_test = train_test_split(X_ngrams,
                                                    encodedLabels,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=encodedLabels)

### CLASSIFICATION ###
#1. Support Vector Machine#
clf = svm.LinearSVC(loss='hinge')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score_svm = metrics.f1_score(y_test, y_pred)
Example no. 15
import pandas as pd
import re

base_dir = "E:\\Kaggle\\ted-talks"
os.chdir(base_dir)
transcripts = pd.read_csv("transcripts.csv")
transcripts['title'] = transcripts['url'].map(lambda x: x.split("/")[-1])

## Extract key words using tfidf
pattern = re.compile(r'\d+')
transcripts['transcript'] = transcripts['transcript'].map(
    lambda x: re.sub(pattern, "", x))
from sklearn.feature_extraction import text

Text = transcripts['transcript'].tolist()
tfidf = text.TfidfVectorizer(stop_words="english")  # Text is passed to fit_transform below, not to input=
matrix = tfidf.fit_transform(Text)
matrix.shape


def get_imp_terms(x):
    x = x.todense()
    x = x.tolist()[0]
    x = pd.Series(x, index=tfidf.get_feature_names())
    x = x.sort_values(ascending=False)
    return x.head(4).index.tolist()


transcripts['imp_terms'] = [get_imp_terms(x) for x in matrix]
transcripts['imp_terms_tfidf'] = transcripts['imp_terms'].map(
    lambda x: ",".join(x))
Example no. 16
print("Reduced vocabulary to %i words" % (len(vocabulary.vocab_reduced)),
      flush=True)

vocabulary.write_voc_reduced(out_vocab_text)

with open(out_vocab_pickle, 'wb') as f:
    print("Pickling reduced vocabulary to", out_vocab_pickle, flush=True)
    pickle.dump(vocabulary.vocab_reduced, f)

vocnow = vocabulary.get_vocab_reduced_dict()

vectorizer = fe.TfidfVectorizer(input='content',
                                lowercase=False,
                                preprocessor=preprocessor,
                                token_pattern="",
                                tokenizer=tokenizer,
                                stop_words=None,
                                vocabulary=vocabulary.get_vocab_reduced_dict(),
                                norm='l1')

print("Created tf-idf vectorizer\nFitting to the training data", flush=True)

tfidf_train = vectorizer.fit_transform(
    [texts[idnow] for idnow in df_train[idcolname].values])

print("Created tf-idf training vectors\nCreating test vectors", flush=True)

tfidf_test = vectorizer.transform(
    [texts[idnow] for idnow in df_test[idcolname].values], copy=True)

print("Created tf-idf test vectors", flush=True)
Example no. 17
    def Get_Data(self,batch_idx,mscoco,split,caption_path):
        '''  Show an example of how to read the dataset   '''
        V_input  = np.zeros((self.batch_size,3,64,64))
        V_target = np.zeros((self.batch_size,3,32,32))
        V_target_64by64 = np.zeros((self.batch_size,3,64,64))
        V_Y_Carre = np.zeros((24,self.batch_size,3,32,32))
        V_X_Carre = np.zeros((24,self.batch_size,3,32,32))
        V_caption_dict = []
        x = T.tensor3()
        f = theano.function([x],outputs=x.dimshuffle(2,0,1))
        #m = T.ftensor3()
        #m_new = T.stack(T.concatenate([V_input[0],m]))
        #f = theano.function([V_input,m], outputs=[m_new])
        data_path = os.path.join(mscoco,split)
        caption_path = os.path.join(mscoco,caption_path)
        with open(caption_path,'rb') as fd:
            caption_dict = pickle.load(fd)
        
            #print (data_path + "/*.jpg")
        imgs = glob.glob(data_path + "/*.jpg")
        batch_imgs = imgs[batch_idx*self.batch_size:(batch_idx+1)*self.batch_size]
        j = 0
        for i, img_path in enumerate(batch_imgs):
            #print(i)
            img = Image.open(img_path)
            img_array = np.array(img)
            Input     = np.array(img)
            center = (int(np.floor(img_array.shape[0] / 2.)), int(np.floor(img_array.shape[1] / 2.)))  
            cap_id = os.path.basename(img_path)[:-4]
            #k=0 
            ### Get input/target from the images: true image 64*64, image with mask 64*64, true center 32*32
            
            if len(img_array.shape) == 3:
                V_target_64by64[j,:,:,:] = f(img_array)
                Input[center[0]-16:center[0]+16, center[1]-16:center[1]+16, :] = 0
                V_input[j,:,:,:] = f(Input) 
               
               #####
                
                    
                ##############################################################################
                target0 = img_array[center[0]-32:center[0], center[1] - 32:center[1], :]
                V_Y_Carre[0,j,:,:,:] = f(target0)
                target0 =     Input[center[0]-32:center[0], center[1] - 32:center[1], :]
                V_X_Carre[0,j,:,:,:] = f(target0)
                ###
                target0 = img_array[center[0]-32:center[0], center[1]:center[1]+32, :]
                V_Y_Carre[1,j,:,:,:] = f(target0)
                target0 =     Input[center[0]-32:center[0], center[1]:center[1]+32, :]
                V_X_Carre[1,j,:,:,:] = f(target0)
                ###
                target0 = img_array[center[0]:center[0]+32, center[1] - 32:center[1], :]
                V_Y_Carre[2,j,:,:,:] = f(target0)
                target0 =     Input[center[0]:center[0]+32, center[1] - 32:center[1], :]
                V_X_Carre[2,j,:,:,:] = f(target0)
                ###
                target0 = img_array[center[0]:center[0]+32, center[1]:center[1]+32, :]
                V_Y_Carre[3,j,:,:,:] = f(target0)
                target0 =     Input[center[0]:center[0]+32, center[1]:center[1]+32, :]
                V_X_Carre[3,j,:,:,:] = f(target0)
                ### ### ### ###
                target0 = img_array[center[0]-32+8:center[0]+8, center[1] - 32+8:center[1]+8, :]
                V_Y_Carre[4,j,:,:,:] = f(target0)
                target0 =     Input[center[0]-32+8:center[0]+8, center[1] - 32+8:center[1]+8, :]
                V_X_Carre[4,j,:,:,:] = f(target0)
                ###
                target0 = img_array[center[0]-32+8:center[0]+8, center[1]-8:center[1]+32-8, :]
                V_Y_Carre[5,j,:,:,:] = f(target0)
                target0 =     Input[center[0]-32+8:center[0]+8, center[1]-8:center[1]+32-8, :]
                V_X_Carre[5,j,:,:,:] = f(target0)
                ###
                target0 = img_array[center[0]-8:center[0]+32-8, center[1] - 32+8:center[1]+8, :]
                V_Y_Carre[6,j,:,:,:] = f(target0)
                target0 =     Input[center[0]-8:center[0]+32-8, center[1] - 32+8:center[1]+8, :]
                V_X_Carre[6,j,:,:,:] = f(target0)
                ###
                target0 = img_array[center[0]-8:center[0]+32-8, center[1]-8:center[1]+32-8, :]
                V_Y_Carre[7,j,:,:,:] = f(target0)
                target0 =     Input[center[0]-8:center[0]+32-8, center[1]-8:center[1]+32-8, :]
                V_X_Carre[7,j,:,:,:] = f(target0)
                               
                ##############################################################################
                ### true image 64*64, image with mask 64*64, true center 32*32
                 #input[center[0]-16:center[0]+16, center[1]-16:center[1]+16, :] = 0
                target = img_array[center[0]-16:center[0]+16, center[1] - 16:center[1]+16, :]
               
                V_target[j,:,:,:] = f(target)
                V_caption_dict.append(caption_dict[cap_id])
                j = j+1  
                #print (i,cap_id,caption_dict[cap_id])  
                #print(input)
                #plt.imshow(input)  
                #eeee
            
            
            
            
        V_Temp_I = V_input        
        if (V_Temp_I.shape[0]!=j):
            
            V_input = V_input[:j,:,:,:]
            V_target = V_target[:j,:,:,:]  
            V_target_64by64 = V_target_64by64[:j,:,:,:]  
            
            V_Y_Carre = V_Y_Carre[:,:j,:,:,:] 
            V_X_Carre = V_X_Carre[:,:j,:,:,:]
            V_Temp_I = V_Temp_I[:j,:,:,:]
        Index_Image  = j  
        #print("fin first Party",V_input[0],Index_Image)     
            #else:
            #    try:
            #        input = np.copy(img_array)
            #        input[center[0]-16:center[0]+16, center[1]-16:center[1]+16, :] = 0
            #        target = img_array[center[0]-16:center[0]+16, center[1] - 16:center[1]+16]
            #    except:
            #        pass
           
            
            #Image.fromarray(img_array).show()
            #Image.fromarray(input).show()
            #Image.fromarray(target).show()
        ###################################################################  Identify identical descriptions
        # we have 100 features for each of the 4 blocks of 3*32*32 and 100 for the masked 3*64*64 block
        Feature_Kernel = np.zeros((Index_Image, 5*self.size_Feature))
        # drop NaNs
        # Ajust_Data  = Input.dropna()
        # merge all the descriptions to build one corpus
        Corpus_Train = np.array([''.join(Doc) for Doc in V_caption_dict])
        # count and normalize the corpus
        #Vectorizer = text.TfidfVectorizer(min_df=1,ngram_range=(1,1), stop_words = 'english', strip_accents='unicode',norm='l2')
        #Vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words = stop_words )
        Vectorizer = text.TfidfVectorizer(min_df=1, ngram_range=(1, 2), stop_words=stop_words)  # ngram_range must be (min, max)
        #Vectorizer = text.HashingVectorizer(stop_words = stop_words,non_negative=True, n_features = 2 ** 8)
        Count_Train_Vector = Vectorizer.fit_transform(Corpus_Train) 
        # project to a few dimensions (or compute the similarity directly); 2000 is used as the random seed
        LS_Model = TruncatedSVD(n_components = 5,random_state=2000)
        LS_Model.fit(Count_Train_Vector)
        # new vectors with reduced dimensionality
        Vect_Proj = LS_Model.fit_transform(Count_Train_Vector,y=None)
        Temp1 = pd.DataFrame(Vect_Proj, columns = ['dim1','dim2','dim3','dim4','dim5'])
        Temp2 = pd.DataFrame(Vect_Proj[:,0],columns = ['Cos_Sim'])
        Temp1.index = range(0,Index_Image)
        Temp2.index = range(0,Index_Image)
        #############################################  Feature creation
        for ind in range(0,Index_Image):
            Temp = Temp1.iloc[ind:ind+1]
            temp = pd.DataFrame(cosine_similarity(Temp.values,Temp1.values))
            Temp2['Cos_Sim'] = temp.transpose()
            Temp2 = Temp2.sort_values(by=['Cos_Sim'])
            #print(Temp2.index[-2],Temp2.index[-3])
            #print(Temp2.iloc[-2])
            # merge with the first, the first two, or the first five and use feature extraction in the autoencoder
            # vector of size 5 where elements are added to V_X_Carre_train[:,:Index_Image+1,:,:,:]
            # 8*Index_Image*3*32*32, Index_Image: number of training-sample images; 0 to 3: split of the original image into 4 quadrants; 4 to 7: offsets 16 to 48
            V_X_Carre[8:15,ind,:,:,:] = V_X_Carre[0:7,Temp2.index[-2],:,:,:]
            V_Y_Carre[8:15,ind,:,:,:] = V_Y_Carre[0:7,Temp2.index[-2],:,:,:]
            V_X_Carre[16:23,ind,:,:,:] = V_X_Carre[0:7,Temp2.index[-3],:,:,:]
            V_Y_Carre[16:23,ind,:,:,:] = V_Y_Carre[0:7,Temp2.index[-3],:,:,:]
            #print(J,Temp2.index[-2],Temp2.index[-3])
            
            
            # A few features for the 64 by 64 image with 0 at the center
            Nbre = 0
            params = {'bandwidth': np.logspace(-1, 1,20)}
            grid = GridSearchCV(KernelDensity(), params)
            temp0 = np.reshape(V_Temp_I[ind,:,:,:],(64,192)) # 3*64*64
            temp0 = temp0/(255/2)
            temp0 = temp0-1
            #temp /= (255/2)
            #temp -= 1
                  
            n_components = 5
            pca = PCA(n_components = n_components, svd_solver='full')
            data = pca.fit_transform(temp0)
            grid.fit(data)
            # use the best estimator to compute the kernel density estimate
            kde = grid.best_estimator_
            new_data = kde.sample(self.size_Feature//n_components,random_state=2000)
            temp00 = new_data.min()
            temp01 = new_data.max()
            new_data = new_data - temp00
            new_data = new_data/(temp00-temp01)
            #new_data += 1
            new_data = new_data*255
            new_data = np.uint8(new_data)
            new_data = new_data.astype('float32') 
            Feature_Kernel[ind,Nbre:Nbre + self.size_Feature] = new_data.reshape(1,-1)
            Nbre =Nbre+self.size_Feature 
           
             
#           project the 3*32*32 -dimensional data to a lower dimension
#           use grid search cross-validation to optimize the bandwidth
            
            for K in(0,4):
                params = {'bandwidth': np.logspace(-1, 1,20)}
                grid = GridSearchCV(KernelDensity(), params)
                temp0 = np.reshape(V_X_Carre[K,Temp2.index[-1],:,:,:],(32,96)) # one quarter of the 3*32*32 array; initial split into quadrants 0-32/0-32, 32-64/0-32, 0-32/32-64, 32-64/32-64
               
                temp0 = temp0/(255/2)
                temp0 = temp0-1
                
                
                pca = PCA(n_components = 5,svd_solver='full')
                data = pca.fit_transform(temp0)
                grid.fit(data)
                # use the best estimator to compute the kernel density estimate
                kde = grid.best_estimator_
                new_data = kde.sample(self.size_Feature//5,random_state=2000)
                #new_data += 1
                temp00 = new_data.min()
                temp01 = new_data.max()
                new_data = new_data - temp00
                new_data = new_data/(temp00-temp01)
                new_data = new_data*255
                new_data = np.uint8(new_data)
                #print(np.sum(pca.explained_variance_ratio_)) 
                #print(new_data)
                # sample new points from the data; 2000 acts as the seed of the simulated distribution
                #new_data = pca.inverse_transform(new_data)
                Feature_Kernel[ind,Nbre:Nbre + self.size_Feature] = new_data.reshape(1,-1)
                Nbre = Nbre + self.size_Feature
            
            
            
            
        return V_input,V_target,V_target_64by64,V_caption_dict,Index_Image,V_X_Carre,V_Y_Carre,Feature_Kernel
Example no. 18
def makePredictions(train, test_melt, Windows, look_back=49):
    r = 1.61803398875
    # Windows = np.round(r**np.arange(1,9) * 7)
    # Windows = [11, 18, 30, 48, 78, 126, 203, 329]
    # Windows = [7, 13, 20, 33, 53, 86, 139, 225]

    n = train.shape[1] - 1  #  550
    Visits = np.zeros(train.shape[0])
    for i, row in train.iterrows():
        M = []
        start = row[1:].nonzero()[0]
        if len(start) == 0:
            continue
        if n - start[0] < Windows[0]:
            Visits[i] = row.iloc[start[0] + 1:].median()
            continue
        for W in Windows:
            if W > n - start[0]:
                break
            M.append(row.iloc[-W:].median())
        Visits[i] = np.median(M)

    Visits[np.where(Visits < 1)] = 0.

    train['Predicted'] = Visits
    #print(train.head())
    #test1 = pd.read_csv("../input/key_2.csv")
    #test1['Page'] = test1.Page.apply(lambda x: x[:-11])
    test1 = test_melt.merge(train[['Page', 'Predicted']],
                            on='Page',
                            how='left')
    #print('MODEL 1 SMAPE: ', smape(test1['Visits'], test1['Predicted']))

    # add model 2

    #determine the language from the URL
    train['origine'] = train['Page'].apply(
        lambda x: re.split(".wikipedia.org", x)[0][-2:])
    '''
    This is what you get with a value counts on train.origine
    en    24108
    ja    20431
    de    18547
    fr    17802
    zh    17229
    ru    15022
    es    14069
    ts    13556
    er     4299
    '''
    #we have English, Japanese, German, French, Chinese (Taiwanese?), Russian, Spanish
    #ts and er are undetermined; in the next lines, I try to infer them by learning from special characters
    #Note: this step wasn't tuned and can't be perfect, because other languages also appear in those Pages (Portuguese, for example)

    #let's make a train, target, and test to predict language on ts and er pages
    orig_train = train.loc[~train.origine.isin(['ts', 'er']), 'Page']
    orig_target = train.loc[~train.origine.isin(['ts', 'er']), 'origine']
    orig_test = train.loc[train.origine.isin(['ts', 'er']), 'Page']
    #keep only interesting chars
    orig_train2 = orig_train.apply(lambda x: x.split(".wikipedia")[
        0][:-3]).apply(lambda x: re.sub("[a-zA-Z0-9():\-_ \'\.\/]", "", x))
    orig_test2 = orig_test.apply(lambda x: x.split(".wikipedia")[
        0][:-3]).apply(lambda x: re.sub("[a-zA-Z0-9():\-_ \'\.\/]", "", x))
    #run TFIDF on those specific chars
    tser_model = True
    try:
        tfidf = text.TfidfVectorizer(
            input='content',
            encoding='utf-8',
            decode_error='strict',
            strip_accents=None,
            lowercase=True,
            preprocessor=None,
            tokenizer=None,
            analyzer='char',  # stop_words=[chr(x) for x in range(97,123)]+[chr(x) for x in range(65,91)]+['_','.',':'],
            token_pattern='(?u)\\b\\w\\w+\\b',
            ngram_range=(1, 1),
            max_df=1.0,
            min_df=1,
            max_features=None,
            vocabulary=None,
            binary=True,
            norm='l2',
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=False)
        orig_train2 = tfidf.fit_transform(orig_train2)
        #apply a simple naive bayes on the text features
        model = naive_bayes.BernoulliNB()
        model.fit(orig_train2, orig_target)
        result = model.predict(tfidf.transform(orig_test2))
        result = pd.DataFrame(result, index=orig_test)
        result.columns = ['origine']
    except:
        tser_model = False
    #result will be used later to replace 'ts' and 'er' values
    #we need to remove train.origine so that the train can be flattened with melt
    del train['origine']
    del train['Predicted']

    #let's flatten the train as the clustifier did and initialize a "ferie" (public holiday) column instead of a weekend column
    #look_back=49
    #look_back=50
    #look_back=51
    #look_back=54
    #look_back=60

    train = pd.melt(train[list(train.columns[-look_back:]) + ['Page']],
                    id_vars='Page',
                    var_name='date',
                    value_name='Visits')
    train['date'] = train['date'].astype('datetime64[ns]')
    train['ferie'] = ((train.date.dt.dayofweek) >= 5).astype(float)
    train['origine'] = train['Page'].apply(
        lambda x: re.split(".wikipedia.org", x)[0][-2:])

    #let's join with result to replace 'ts' and 'er'
    if tser_model:
        join = train.loc[train.origine.isin(["ts", "er"]), ['Page']]
        join['origine'] = 0  #init
        join.index = join["Page"]
        join.origine = result
        train.loc[train.origine.isin(["ts", "er"]),
                  ['origine']] = join.origine.values  #replace

    #official non working days by country (manual search with google)
    #I took a lot of shortcuts, assuming that only the US and UK use English,
    #only Spain Spanish, only France French, etc.
    train_us=['2015-07-04','2015-11-26','2015-12-25']+\
    ['2016-07-04','2016-11-24','2016-12-26']
    test_us = []
    train_uk=['2015-12-25','2015-12-28'] +\
    ['2016-01-01','2016-03-28','2016-05-02','2016-05-30','2016-12-26','2016-12-27']
    test_uk = ['2017-01-01']
    train_de=['2015-10-03', '2015-12-25', '2015-12-26']+\
    ['2016-01-01', '2016-03-25', '2016-03-26', '2016-03-27', '2016-01-01', '2016-05-05', '2016-05-15', '2016-05-16', '2016-10-03', '2016-12-25', '2016-12-26']
    test_de = ['2017-01-01']
    train_fr=['2015-07-14', '2015-08-15', '2015-11-01', '2015-11-11', '2015-12-25']+\
    ['2016-01-01','2016-03-28', '2016-05-01', '2016-05-05', '2016-05-08', '2016-05-16', '2016-07-14', '2016-08-15', '2016-11-01','2016-11-11', '2016-12-25']
    test_fr = ['2017-01-01']
    train_ru=['2015-11-04']+\
    ['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04', '2016-01-05', '2016-01-06', '2016-01-07', '2016-02-23', '2016-03-08', '2016-05-01', '2016-05-09', '2016-06-12', '2016-11-04']
    test_ru = [
        '2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05',
        '2017-01-06', '2017-01-07', '2017-02-23'
    ]
    train_es=['2015-08-15', '2015-10-12', '2015-11-01', '2015-12-06', '2015-12-08', '2015-12-25']+\
    ['2016-01-01', '2016-01-06', '2016-03-25', '2016-05-01', '2016-08-15', '2016-10-12', '2016-11-01', '2016-12-06', '2016-12-08', '2016-12-25']
    test_es = ['2017-01-01', '2017-01-06']
    train_ja=['2015-07-20','2015-09-21', '2015-10-12', '2015-11-03', '2015-11-23', '2015-12-23']+\
    ['2016-01-01', '2016-01-11', '2016-02-11', '2016-03-20', '2016-04-29', '2016-05-03', '2016-05-04', '2016-05-05', '2016-07-18', '2016-08-11', '2016-09-22', '2016-10-10', '2016-11-03', '2016-11-23', '2016-12-23']
    test_ja = ['2017-01-01', '2017-01-09', '2017-02-11']
    train_zh=['2015-09-27', '2015-10-01', '2015-10-02','2015-10-03','2015-10-04','2015-10-05','2015-10-06','2015-10-07']+\
    ['2016-01-01', '2016-01-02', '2016-01-03', '2016-02-08', '2016-02-09', '2016-02-10', '2016-02-11', '2016-02-12', '2016-04-04', '2016-05-01', '2016-05-02', '2016-06-09', '2016-06-10', '2016-09-15', '2016-09-16', '2016-10-03', '2016-10-04','2016-10-05','2016-10-06','2016-10-07']
    test_zh = ['2017-01-02', '2017-02-27', '2017-02-28', '2017-03-01']
    #in China some Saturdays and Sundays are working days
    train_o_zh = [
        '2015-10-10', '2016-02-06', '2016-02-14', '2016-06-12', '2016-09-18',
        '2016-10-08', '2016-10-09'
    ]
    test_o_zh = ['2017-01-22', '2017-02-04']

    #let's replace values in the 'ferie' column
    train.loc[(train.origine == 'en') & (train.date.isin(train_us + train_uk)),
              'ferie'] = 1
    train.loc[(train.origine == 'de') & (train.date.isin(train_de)),
              'ferie'] = 1
    train.loc[(train.origine == 'fr') & (train.date.isin(train_fr)),
              'ferie'] = 1
    train.loc[(train.origine == 'ru') & (train.date.isin(train_ru)),
              'ferie'] = 1
    train.loc[(train.origine == 'es') & (train.date.isin(train_es)),
              'ferie'] = 1
    train.loc[(train.origine == 'ja') & (train.date.isin(train_ja)),
              'ferie'] = 1
    train.loc[(train.origine == 'zh') & (train.date.isin(train_zh)),
              'ferie'] = 1
    train.loc[(train.origine == 'zh') & (train.date.isin(train_o_zh)),
              'ferie'] = 0

    #same with test
    #test = pd.read_csv("../input/key_2.csv")
    test = test_melt
    del test['Visits']
    #test['date'] = test.Page.apply(lambda a: a[-10:])
    #test['Page'] = test.Page.apply(lambda a: a[:-11])
    test['date'] = test['date'].astype('datetime64[ns]')
    test['ferie'] = ((test.date.dt.dayofweek) >= 5).astype(float)
    test['origine'] = test['Page'].apply(
        lambda x: re.split(".wikipedia.org", x)[0][-2:])

    #join with result
    if tser_model:
        join = test.loc[test.origine.isin(["ts", "er"]), ['Page']]
        join['origine'] = 0
        join.index = join["Page"]
        join.origine = result
        test.loc[test.origine.isin(["ts", "er"]),
                 ['origine']] = join.origine.values

    test.loc[(test.origine == 'en') & (test.date.isin(test_us + test_uk)),
             'ferie'] = 1
    test.loc[(test.origine == 'de') & (test.date.isin(test_de)), 'ferie'] = 1
    test.loc[(test.origine == 'fr') & (test.date.isin(test_fr)), 'ferie'] = 1
    test.loc[(test.origine == 'ru') & (test.date.isin(test_ru)), 'ferie'] = 1
    test.loc[(test.origine == 'es') & (test.date.isin(test_es)), 'ferie'] = 1
    test.loc[(test.origine == 'ja') & (test.date.isin(test_ja)), 'ferie'] = 1
    test.loc[(test.origine == 'zh') & (test.date.isin(test_zh)), 'ferie'] = 1
    test.loc[(test.origine == 'zh') & (test.date.isin(test_o_zh)), 'ferie'] = 0

    train_page_per_dow = train.groupby(['Page',
                                        'ferie']).median().reset_index()
    test = test.merge(train_page_per_dow, on=['Page', 'ferie'], how='left')

    test['Pred2'] = test['Visits']
    test.loc[test.Pred2.isnull(), 'Pred2'] = 0
    test['PredC'] = (
        (test['Pred2'] * 10).astype('int') / 10 + test1['Predicted']) / 2

    test['Visits'] = test1['Visits']
    test['Pred1'] = test1['Predicted']

    #print("MODEL 2 SMAPE: ", smape(test['Visits'], test['Pred2']))

    combinedSmape = smape(test['Visits'], test['PredC'])
    print("Combined SMAPE: ", combinedSmape)
    print("look_back:", look_back)
    print("------------------------------------")
    #test[['Id','Visits']].to_csv('sub.csv', index=False)
    return (combinedSmape)
Example no. 19
    feats += [(strict_words[i], strict_words[i+1]) for i in xrange(len(strict_words)-1)]
    ## synonyms of strict words
    #for sw in strict_words:
    #    if sw in POS_KEY_WORDS:
    #        feats += synonyms(w)
    for w in words:
        try:
            pos = corpus.wordnet.synsets(w)[0].pos
            if pos in ('n', 'a', 'v'):
                feats += [(w, pos)]
        except: pass
    return feats

## building features on training data
tfidf_vectorizer = text.TfidfVectorizer(charset = 'latin-1', lowercase=False, 
                                            sublinear_tf=True, tokenizer = my_tokenizer,#vocabulary = CHAT_WORDS,
                                            max_df=1.0)#, norm = 'l1')
print 'extracting tfidf from training set...'
t0 = time()
train_X = tfidf_vectorizer.fit_transform(train_X)
print 'done in %0.2fs' % (time() - t0)
print 'shape of training data', train_X.shape

# <codecell>

## add extra features to tfidf
'@HasPositive' in tfidf_vectorizer.get_feature_names()
#print filter(lambda f: f[0]=='you', tfidf_vectorizer.get_feature_names())

# <codecell>
def select_vectorizer(vectorizer_type, req_ngram_range=[1, 2]):
    """
	Select the desired vectorizer for either text or tweet
	@ text_tfidf_std
	@ text_tfidf_custom
	@ text_count_std

	@ tweet_tfidf_std
	@ tweet_tfidf_custom
	"""

    # SPECIFY VECTORIZER ALGORITHM
    #---------------------------------#

    ngram_lengths = req_ngram_range

    if vectorizer_type == "text_tfidf_std":
        # Standard TFIDF Vectorizer (Text)
        vectorizer = text.TfidfVectorizer(input='filename',
                                          analyzer='word',
                                          ngram_range=(ngram_lengths),
                                          stop_words='english',
                                          min_df=2)
        return vectorizer
    elif vectorizer_type == "text_tfidf_custom":
        # TFIDF Vectorizer with NLTK Tokenizer (Text)
        vectorizer = text.TfidfVectorizer(input='filename',
                                          analyzer='word',
                                          ngram_range=(ngram_lengths),
                                          stop_words='english',
                                          min_df=2,
                                          tokenizer=tokenize_nltk)
        print("User specified custom stopwords: {} ...".format(
            str(custom_stopwords)[1:-1]))
        return vectorizer
    elif vectorizer_type == "text_count_std":
        vectorizer = text.CountVectorizer(input='filename',
                                          analyzer='word',
                                          ngram_range=(ngram_lengths),
                                          stop_words='english',
                                          min_df=2)
        return vectorizer
    elif vectorizer_type == "tweet_tfidf_std":
        # Standard TFIDF Vectorizer (Content)
        vectorizer = text.TfidfVectorizer(input='content',
                                          analyzer='word',
                                          ngram_range=(ngram_lengths),
                                          stop_words='english',
                                          min_df=2)
        return vectorizer
    elif vectorizer_type == "tweet_tfidf_custom":
        # Standard TFIDF Vectorizer (Content)
        vectorizer = text.TfidfVectorizer(input='content',
                                          analyzer='word',
                                          ngram_range=(ngram_lengths),
                                          stop_words='english',
                                          min_df=2,
                                          tokenizer=tokenize_nltk)
        print("User specified custom stopwords: {} ...".format(
            str(custom_stopwords)[1:-1]))
        return vectorizer
    else:
        print("error in vectorizer specification...")
        pass
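# A short usage sketch; "corpus_files" is a hypothetical list of file paths, since the
# "*_tfidf_std" text variants are built with input='filename'.
vectorizer = select_vectorizer("text_tfidf_std", req_ngram_range=[1, 2])
doc_term_matrix = vectorizer.fit_transform(corpus_files)
print(doc_term_matrix.shape)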
Example no. 21
import numpy as np
import pandas as pd
import shutil
import os
import csv
import sklearn.feature_extraction.text as sk_text
from sklearn.model_selection import train_test_split
from sklearn import metrics

path = './data/'
filename_write = os.path.join(path, "class_vectorized_review.csv")

data = pd.read_csv("./data/class_example.csv", encoding="utf-8")

vectorizer = sk_text.TfidfVectorizer(stop_words='english',
                                     max_features=1000,
                                     min_df=1)

matrix = vectorizer.fit_transform(
    data['all_reviews'])  #turns all reviews into a tf-idf vector
tfidf_data = matrix.toarray().tolist(
)  #vector converted into a list for the data frame
data.drop('all_reviews', axis=1, inplace=True)
#data.drop('Unnamed: 0', axis = 1, inplace= True)
#data.drop('Unnamed: 0.1', axis = 1, inplace= True)
data.insert(2, 'all_reviews', tfidf_data)  #inserts the tfidf vector into data
text_data = pd.DataFrame(data['all_reviews'].values.tolist(),
                         columns=vectorizer.get_feature_names())
data.drop('all_reviews', axis=1, inplace=True)
data = pd.concat([data, text_data], axis=1)
data.to_csv(filename_write, index=False)
Example no. 22
def result():
    # get select list
    sid = request.form['sid']
    sid = sid.split()
    sid = [int(i) for i in sid]
    a = session.get('abstract', None)
    t = session.get('title', None)

    # get selected abstract
    sa = []
    for index, item in enumerate(sid):
        sa.append(a[item - 1])

    # get abstract excluded from selected item
    sid.sort(reverse=True)
    for index, item in enumerate(sid):
        a.pop(item - 1)
        t.pop(item - 1)
    #
    # make non-selected articles a data frame for calculation
    ta_df = pd.DataFrame({'Title': t, 'Abstract': a})

    # combine 2 lists to one
    s = " "
    sa = s.join(sa)  # make all selected abstracts into one
    ca = a
    ca.append(sa)  # append the selected abstracts
    #
    # Stopwords
    my_additional_stop_words = ["author", "and", "of", "the", "research", "\n"]
    stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)
    #
    # STEM
    ca = [[stem(word) for word in sentence.split(" ")] for sentence in ca]
    y = len(ca)
    for i in range(0, y):
        ca[i] = " ".join(ca[i])

    # build tf-idf matrix
    # ca itself is passed to fit_transform below; it does not belong in the input argument
    tfidf = text.TfidfVectorizer(stop_words=stop_words,
                                 analyzer='word',
                                 lowercase=True)
    matrix = tfidf.fit_transform(ca)

    # calculate similarity score
    sim_unigram = cosine_similarity(matrix)

    # build function to get similar score for all articles by locating them
    def get_similar_papers(x):
        return ta_df.loc[np.argsort(-x)]

    recomPaper = get_similar_papers(sim_unigram[-1])

    sim_unigram[-1][::-1].sort()  # descending
    # # sim_unigram[-1].sort() #ascending

    recomPaper["Similar_score"] = sim_unigram[-1]

    # get the 50 papers with highest similarity
    recomPaper50 = recomPaper[:51]
    recomPaper50 = recomPaper50.dropna().reset_index(drop=True)
    tl = recomPaper50['Title'].tolist()
    al = recomPaper50['Abstract'].tolist()
    sl = recomPaper50['Similar_score'].tolist()
    listc = [[x, y, z] for x, y, z in zip(tl, al, sl)]

    # output file
    # recomPaper50.to_csv("recom50papers.csv")

    return render_template('result.html',
                           sid=sid,
                           a=a,
                           t=t,
                           result=recomPaper50,
                           lc=listc)
Example no. 23
talks = df.text.tolist()

# We are not going to need the identifiers for this run, so I'm leaving them commented out.
# =-=-=-=-=-=-=-=-=-=-=
# Create citations to identify individual texts
# =-=-=-=-=-=-=-=-=-=-=

# authors = df.author.tolist()
# dates = df.date.tolist()
# years = [re.sub('[A-Za-z ]', '', item) for item in dates]
# authordate = [author+" "+year for author, year in zip(authors, years)]

import sklearn.feature_extraction.text as sktext
from sklearn.decomposition import NMF
import numpy as np

# Import stoplist
stopwords = re.split('\s+',
                     open('../data/stopwords_2.txt', 'r').read().lower())

# TFIDF parameters
max_percent = 0.85
min_percent = 0.01  # One percent = 20 talks (so not enough to warrant a topic?)

# Create TFIDF matrix
vectorizer = sktext.TfidfVectorizer(lowercase=True,
                                    stop_words=stopwords,
                                    max_df=max_percent,
                                    min_df=min_percent)
td_matrix = vectorizer.fit_transform(talks)
print(td_matrix.shape)
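# NMF is imported above but never used in this excerpt; a minimal sketch (topic count
# assumed) of fitting it to td_matrix and listing the top words per topic:
n_topics = 20
nmf = NMF(n_components=n_topics, random_state=42)
doc_topic = nmf.fit_transform(td_matrix)
terms = vectorizer.get_feature_names()
for k, component in enumerate(nmf.components_):
    top_idx = component.argsort()[::-1][:10]
    print(k, " ".join(terms[i] for i in top_idx))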
Example no. 24
    with open(fil, 'r') as csv_file:
        csv_reader = csv.reader(csv_file)

        for line in csv_reader:

            temp_str = ''.join(line)
            temp_doc.append(temp_str)

    doc_string = ''.join(temp_doc)
    corpus.append(doc_string)
    temp_doc.clear()

print(len(corpus))
"""tf-idf αναπαράσταση των επιχειρήσεων"""

vectorizer = sk_text.TfidfVectorizer(stop_words='english', min_df=10)
X = vectorizer.fit_transform(corpus)
print(X.toarray())
print(vectorizer.get_feature_names())
"""# 1.

k-means
"""

kmeans = sk_cluster.KMeans(init='k-means++', n_clusters=3, n_init=10)
kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_
kmeans_labels = kmeans.labels_
error = kmeans.inertia_
print("The total error of the clustering is: ", error)
print('\nCluster labels')
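# A hedged follow-up to the print above: show the heaviest tf-idf terms in each centroid.
terms = vectorizer.get_feature_names()
for c, centroid in enumerate(centroids):
    top_idx = centroid.argsort()[::-1][:10]
    print("Cluster", c, ":", ", ".join(terms[i] for i in top_idx))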
Example no. 25
import sklearn.feature_extraction.text as text

cv = text.CountVectorizer()  # doc is passed to fit/transform below, not to the constructor

cv.fit(doc)

cv.get_feature_names()

cv.transform(doc)

cv.transform(doc).toarray()

pd.DataFrame(data=cv.transform(doc).toarray(), columns=cv.get_feature_names())

tfid = text.TfidfVectorizer()  # doc is passed to fit/transform below, not to the constructor

tfid.fit(doc)

tfid.transform(doc).toarray()

pd.DataFrame(data=tfid.transform(doc).toarray(),
             columns=tfid.get_feature_names())

data.columns

y = data['Rank']
x = data['Raw_joke']

tfid = text.TfidfVectorizer()
tfid.fit(x.tolist())
dataset = datasets.fetch_20newsgroups(shuffle=True,
                                      random_state=1,
                                      remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

preprocessor = tokenhandler._ENSimpleTokenHandler(stem=apply_stemming,
                                                  stopword=remove_stopwords)

tf_vectorizer = txtfeatext.CountVectorizer(
    tokenizer=preprocessor, ngram_range=ngram_range, max_features=n_features
)  # @TODO encoding?? (default utf8 but may depend on the user per application needs)
tf_matrix = tf_vectorizer.fit_transform(data_samples)

tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=preprocessor,
                                              ngram_range=ngram_range,
                                              max_features=n_features)
tfidf_matrix = tfidf_vectorizer.fit_transform(data_samples)

# apply NMF
print(
    "Applying NMF on tf*idf weighted terms, n_samples=%d and n_features=%d..."
    % (n_samples, n_features))
nmf = decomposer.NMF(n_components=n_topics,
                     random_state=1,
                     alpha=.1,
                     l1_ratio=.5).fit(tfidf_matrix)

print("Applying LDA on tf weighted terms, n_samples=%d and n_features=%d..." %
      (n_samples, n_features))
lda = decomposer.LatentDirichletAllocation(n_topics=n_topics,
Example no. 27

@static_var('test_num', 1)
def print_ans(*args, sep=' '):
    with open('{}.txt'.format(print_ans.test_num), 'w') as fout:
        fout.write(sep.join(list(map(str, args))))
    print_ans.test_num += 1


newsgroups = datasets.fetch_20newsgroups(
    subset='all', 
    categories=['alt.atheism', 'sci.space']
)

pipe = pipeline.Pipeline([
    ('tfidf', sktext.TfidfVectorizer()),
    ('svc', svm.SVC(kernel='linear', random_state=241)),
])

parameters = {'svc__C': [math.pow(10, i) for i in range(-5, 6)]}
kfold = cross_validation.KFold(len(newsgroups.data), n_folds=5, shuffle=True, random_state=241)

clf = grid_search.GridSearchCV(pipe, parameters, scoring='roc_auc', cv=kfold, n_jobs=8)
clf.fit(newsgroups.data, newsgroups.target)

print(clf.best_params_)

estimator = clf.best_estimator_
estimator.fit(newsgroups.data, newsgroups.target)

words = estimator.named_steps['tfidf'].get_feature_names()
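# A minimal sketch (not in the original) of the usual next step: rank vocabulary terms by
# absolute SVM weight; coef_ may be sparse when the tf-idf matrix is sparse.
import numpy as np
import scipy.sparse as sp

coef = estimator.named_steps['svc'].coef_
coef = coef.toarray().ravel() if sp.issparse(coef) else np.ravel(coef)
top_idx = np.argsort(np.abs(coef))[-10:]
print(sorted(words[i] for i in top_idx))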
Example no. 28
            ycolor.append(1)
            # for p in paras:
            #     bad_results.append(q2v.filter_sentence(q['document'], is_stop_words=True))
            #     questions.append(p)

        elif q['success_outcome'] == 0:
            # paras = q['document'].split('\n')
            bad_results.append(
                q2v.filter_sentence(q['document'], is_stop_words=True))
            ycolor.append(0)
            # for p in paras:
            #     bad_results.append(q2v.filter_sentence(p, is_stop_words=True))
            #     questions.append(p)

    # print(filtered_questions[:20])
    cv = ft.TfidfVectorizer()
    # tf_good_mat = cv.fit_transform(good_results).toarray()
    tf_bad_mat = cv.fit_transform(bad_results).toarray()
    words = cv.get_feature_names()
    print("len(words):", len(words))

    # for t in tfmat[:30]:
    #     print(list(t))
    """ dimension reduction """
    pca = PCA(n_components=2)
    pca.fit(tf_bad_mat)
    X = pca.transform(tf_bad_mat)
    """ Kmeans """
    y_pred = KMeans(n_clusters=10, random_state=9).fit_predict(X)
    """ show points with colors and labels """
    my = My_show(X, ycolor, ycolor, "Paragraph tf-idf features")
Example no. 29
from sklearn.feature_extraction import text
import os

# %%
dir = os.path.dirname(__file__)
file = os.path.join(dir, "jones_t_mails.csv")
file

# %%
df = pd.read_csv(file)
df

# %%
notblank = df["content"].apply(lambda x: len(str(x)) > 3)
df = df[notblank]

# %%
X = df["content"]

# %%
TfIdfVectorizer = text.TfidfVectorizer()
TfIdfVectorizer.fit(X)

# %%
idfdf = pd.DataFrame({
    "names": TfIdfVectorizer.get_feature_names(),
    "idf": TfIdfVectorizer.idf_
})
idfdf.sort_values(by=["idf"], ascending=False)
# %%
            parsed_emails.replace("shackleton ", "")
            parsed_emails.replace("chris ", "")
            parsed_emails.replace("germani ", "")

            ### append the text to word_data

            word_data.append(parsed_emails)

            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            if name == "sara":
                from_data.append(0)
            if name == "chris":
                from_data.append(1)
            email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "w"))
pickle.dump(from_data, open("your_email_authors.pkl", "w"))

#print (word_data[152])

### in Part 4, do TfIdf vectorization here

from sklearn.feature_extraction import text
vec = text.TfidfVectorizer()  # word_data should go to fit/fit_transform, not to the constructor

print(vec)