Ejemplo n.º 1
0
 def test_persistence(self):
     # Round-trip check: a trained model, and separately its KeyedVectors,
     # must compare equal to what is read back after save()/load().
     trained = FT_gensim(sentences, min_count=1)
     trained.save(testfile())
     reloaded_model = FT_gensim.load(testfile())
     self.models_equal(trained, reloaded_model)

     # Persist only the word vectors and compare them with the reloaded copy.
     original_kv = trained.wv
     original_kv.save(testfile())
     reloaded_kv = FastTextKeyedVectors.load(testfile())

     ngram_weights_match = np.allclose(
         original_kv.syn0_ngrams, reloaded_kv.syn0_ngrams)
     self.assertTrue(ngram_weights_match)
     # Vocabulary and n-gram tables must keep their sizes.
     self.assertEqual(len(original_kv.vocab), len(reloaded_kv.vocab))
     self.assertEqual(len(original_kv.ngrams), len(reloaded_kv.ngrams))
Ejemplo n.º 2
0
 def test_persistence(self):
     # Round-trip check: a trained model, and separately its KeyedVectors,
     # must compare equal to what is read back after save()/load().
     save_path = get_tmpfile('gensim_fasttext.tst')
     trained = FT_gensim(sentences, min_count=1)
     trained.save(save_path)
     reloaded_model = FT_gensim.load(save_path)
     self.models_equal(trained, reloaded_model)

     # Persist only the word vectors (same path is reused) and compare them
     # with the reloaded copy.
     original_kv = trained.wv
     original_kv.save(save_path)
     reloaded_kv = FastTextKeyedVectors.load(save_path)

     ngram_weights_match = np.allclose(
         original_kv.syn0_ngrams, reloaded_kv.syn0_ngrams)
     self.assertTrue(ngram_weights_match)
     self.assertEqual(len(original_kv.vocab), len(reloaded_kv.vocab))
Ejemplo n.º 3
0
    def test_norm_vectors_not_saved(self):
        # After init_sims() has been called on the model (presumably filling
        # the normalized-vector caches), neither a saved full model nor its
        # saved KeyedVectors may come back from load() with the norm
        # attributes set.
        model = FT_gensim(sentences, min_count=1)
        model.init_sims()
        model.save(testfile())
        loaded_model = FT_gensim.load(testfile())
        # assertIsNone reports the unexpected value on failure, unlike
        # assertTrue(x is None), which only says "False is not true".
        self.assertIsNone(loaded_model.wv.syn0norm)
        self.assertIsNone(loaded_model.wv.syn0_ngrams_norm)

        # Same expectation when only the word vectors are persisted.
        wv = model.wv
        wv.save(testfile())
        loaded_kv = FastTextKeyedVectors.load(testfile())
        self.assertIsNone(loaded_kv.syn0norm)
        self.assertIsNone(loaded_kv.syn0_ngrams_norm)
    def test_norm_vectors_not_saved(self):
        # init_sims() is invoked before saving; the test then verifies that
        # the norm attributes are None on whatever load() returns, both for
        # the whole model and for its standalone KeyedVectors.
        model = FT_gensim(sentences, min_count=1)
        model.init_sims()
        model.save(testfile())
        loaded_model = FT_gensim.load(testfile())
        # Use assertIsNone so a failure shows the actual non-None value
        # instead of the uninformative "False is not true".
        self.assertIsNone(loaded_model.wv.syn0norm)
        self.assertIsNone(loaded_model.wv.syn0_ngrams_norm)

        # Repeat the check for the word vectors saved on their own.
        wv = model.wv
        wv.save(testfile())
        loaded_kv = FastTextKeyedVectors.load(testfile())
        self.assertIsNone(loaded_kv.syn0norm)
        self.assertIsNone(loaded_kv.syn0_ngrams_norm)
Ejemplo n.º 5
0
    def test_norm_vectors_not_saved(self):
        # After init_sims() has been called on the model (presumably filling
        # the normalized-vector caches), neither a saved full model nor its
        # saved KeyedVectors may come back from load() with vectors_norm /
        # vectors_ngrams_norm set.
        tmpf = get_tmpfile('gensim_fasttext.tst')
        model = FT_gensim(sentences, min_count=1)
        model.init_sims()
        model.save(tmpf)
        loaded_model = FT_gensim.load(tmpf)
        # assertIsNone reports the unexpected value on failure, unlike
        # assertTrue(x is None), which only says "False is not true".
        self.assertIsNone(loaded_model.wv.vectors_norm)
        self.assertIsNone(loaded_model.wv.vectors_ngrams_norm)

        # Same expectation when only the word vectors are persisted
        # (the same temporary path is reused).
        wv = model.wv
        wv.save(tmpf)
        loaded_kv = FastTextKeyedVectors.load(tmpf)
        self.assertIsNone(loaded_kv.vectors_norm)
        self.assertIsNone(loaded_kv.vectors_ngrams_norm)
Ejemplo n.º 6
0
    def test_persistence_fromfile(self):
        # Same round-trip check as the in-memory persistence test, but the
        # model is trained from a corpus file written in line-sentence format.
        corpus_tmp = get_tmpfile('gensim_fasttext1.tst')
        with temporary_file(corpus_tmp) as corpus_path:
            utils.save_as_line_sentence(sentences, corpus_path)

            save_path = get_tmpfile('gensim_fasttext.tst')
            trained = FT_gensim(corpus_file=corpus_path, min_count=1)
            trained.save(save_path)
            reloaded_model = FT_gensim.load(save_path)
            self.models_equal(trained, reloaded_model)

            # Persist only the word vectors (same path is reused) and compare
            # them with the reloaded copy.
            original_kv = trained.wv
            original_kv.save(save_path)
            reloaded_kv = FastTextKeyedVectors.load(save_path)

            ngram_weights_match = np.allclose(
                original_kv.syn0_ngrams, reloaded_kv.syn0_ngrams)
            self.assertTrue(ngram_weights_match)
            self.assertEqual(len(original_kv.vocab), len(reloaded_kv.vocab))
Ejemplo n.º 7
0
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle as pkl
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel
import datetime

def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes instead of being left for the garbage collector.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only
    # point this at files produced by this project.
    with open(path, 'rb') as fh:
        return pkl.load(fh)


def cut_date(x):
    """Return a ``datetime.date`` built from the year, month and day of *x*."""
    return datetime.date(x.year, x.month, x.day)


tokenizer = RegexTokenizer()
model = FastTextSocialNetworkModel(tokenizer=tokenizer)

# morph analyzer for text lemmatization
# NOTE(review): `pymorphy2` and `pd` are not in the visible import lines;
# confirm they are imported elsewhere in the file.
morph = pymorphy2.MorphAnalyzer()
fasttext = FastTextKeyedVectors.load('187/model.model')

# Pickled objects; the names suggest positive/negative logistic-regression
# classifiers (the *_dost variants presumably for dostoevsky features) —
# TODO confirm against the training code.
pos_log_reg = _load_pickle('pickles/pos_log_reg.pkl')
neg_log_reg = _load_pickle('pickles/neg_log_reg.pkl')
pos_log_reg_dost = _load_pickle('pickles/pos_log_reg_dost.pkl')
neg_log_reg_dost = _load_pickle('pickles/neg_log_reg_dost.pkl')

old_data = pd.read_pickle('data/new_data.pkl')
# Expose the frame's index as a regular column so it can serve as a merge key.
old_data['index'] = old_data.index
training_data = pd.read_csv('data/training_data_with_razmetka_final.csv')

# Join the two frames on rows that agree on both index and message.
data_new = training_data.merge(old_data, on=['index', 'message'])
# Parse local_datetime and reduce each value to a plain date (time dropped).
data_new['local_datetime'] = pd.to_datetime(
    data_new.local_datetime).apply(cut_date)