Example no. 1
    def fit(self, train, save=True, load=True):
        '''
        Trains the predictor.

        Parameters
        ----------
        train : pandas.DataFrame
            Training data. It contains the transactions of the sessions.
            It has one column for session IDs, one for item IDs and one for the
            timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must
            correspond to the ones you set during the initialization of the
            network (session_key, item_key, time_key properties).

        save : bool
            Flag to save the model in file after training. (default: True)
        load : bool
            Flag to load trained model if exists. (default: True)
        '''

        if load and os.path.isfile(self.path_trained + self.file_prefix +
                                   self.file_suffix()):
            print("Model already trained! Loading...", end="")
            self.load_w2v_model()
            print("done.")
        else:
            print("Creating session vocabulary...", end="")
            train[self.item_key] = train[self.item_key].astype(str)
            sequences = train.groupby(self.session_key)[self.item_key] \
                             .apply(list)
            print("done.")
            print("Training model...", end="")
            if self.seed > 0:
                # Use a single worker so training is reproducible with the fixed seed
                self.model = w2v(sequences,
                                 size=self.factors,
                                 sg=self.sg,
                                 window=self.window,
                                 workers=1,
                                 hs=self.hs,
                                 iter=self.epochs,
                                 min_count=1,
                                 seed=self.seed)
            else:
                self.model = w2v(sequences,
                                 size=self.factors,
                                 sg=self.sg,
                                 window=self.window,
                                 workers=self.workers,
                                 hs=self.hs,
                                 iter=self.epochs,
                                 min_count=1)

            print("done.")
            if save:
                self.save_w2v_model()
                print("Model saved!")

            train[self.item_key] = train[self.item_key].astype(int)
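Note: the size= and iter= keyword arguments above are gensim 3.x names; gensim >= 4.0 renamed them to vector_size= and epochs=. A minimal sketch of the seeded branch under the newer API (the names mirror the attributes used above):

# gensim >= 4.0 equivalent of the seeded call above
self.model = w2v(sequences,
                 vector_size=self.factors,  # was size=
                 sg=self.sg,
                 window=self.window,
                 workers=1,                 # one worker keeps seeded runs deterministic
                 hs=self.hs,
                 epochs=self.epochs,        # was iter=
                 min_count=1,
                 seed=self.seed)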
Example no. 2
	def make_model(self):
		""" Model and train the word2vec model on words from tweets"""

		# Define parameters for the w2v model
		num_features = 300
		min_word_count = 3
		num_workers = multiprocessing.cpu_count()
		context_size = 7
		downsampling = 1e-3
		seed = 1

		# Build the model
		self.tweet2vec = w2v(
		    sg=1,
		    seed=seed,
		    workers=num_workers,
		    size=num_features,
		    min_count=min_word_count,
		    window=context_size,
		    sample=downsampling
		)

		# Build the vocabulary
		self.tweet2vec.build_vocab(self.sentences)
		# Train the model
		self.tweet2vec.train(self.sentences, epochs=10, total_examples=len(self.sentences))
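A note on the explicit train() call: gensim requires total_examples and epochs here. corpus_count, which build_vocab() records, is a safer total than len(self.sentences) when the corpus is a streaming iterator:

		# Equivalent call using the count recorded by build_vocab()
		self.tweet2vec.train(self.sentences,
		                     epochs=10,
		                     total_examples=self.tweet2vec.corpus_count)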
Example no. 3
    def fit_word_vectors(self, train, save_model=True):
        # Load word vectors if model already exists
        self.fit_session_vocabulary(train)
        if os.path.isfile(self.path_trained + self.file_prefix +
                          self.file_suffix()):
            print("Model already trained! Loading...", end="")
            self.load_w2v_model()
            print("done.")
        else:
            # Generate word vectors
            print("Generating word vectors...", end="")
            self.model = w2v(self.all_session_items.values,
                             size=self.factors,
                             window=self.window,
                             sg=self.sg,
                             workers=4,
                             hs=self.hs,
                             iter=self.epochs,
                             min_count=1)
            print("done.")

            if save_model:
                self.save_w2v_model()
                print("Model saved!")
        self.wv = self.model.wv
        del self.model  # Discard the full model; only the word vectors (self.wv) are kept
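Keeping only model.wv (a KeyedVectors instance) and deleting the full model, as done above, drops the training-only weights and saves memory. A sketch of persisting just the vectors (the filename is illustrative):

from gensim.models import KeyedVectors

# Save and reload only the word vectors, not the trainable model
self.wv.save("item_vectors.kv")
wv = KeyedVectors.load("item_vectors.kv")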
Example no. 4
def train_model(text, filename, phrases=True, workers=8, window=3, overwrite=False):
    if os.path.exists(filename) and not overwrite:
        return w2v.load(filename)
    pool = Pool(workers)
    tokens = pool.map(tokenize, text)
    cs = flatten(tokens)
    if phrases:
        sentences_phrases = Phrases(cs)
        sentences = sentences_phrases[cs]
    else:
        sentences = cs
    model_titulo = w2v(sentences, workers=workers, window=window, min_count=1, size=300)
    model_titulo.save(filename)
    return model_titulo
Example no. 5
def create_w2v(data,
               emb_dim=300,
               window=5,
               min_count=5,
               negative=5,
               iterations=10):

    from gensim.models import Word2Vec as w2v
    workers = multiprocessing.cpu_count()

    model = w2v(data,
                size=emb_dim,
                window=window,
                min_count=min_count,
                negative=negative,
                iter=iterations,
                workers=workers)

    print('Word2Vec model created.')
    return model
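An illustrative call with a toy corpus (data must be an iterable of token lists; min_count is lowered so the toy tokens survive):

toy_corpus = [["deep", "learning"], ["word", "embeddings"], ["deep", "embeddings"]]
model = create_w2v(toy_corpus, emb_dim=50, min_count=1)
print(model.wv["deep"].shape)  # -> (50,)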
Example no. 6
#-------------------------------------------------------------------------------
with open('s1test_x', 'rb') as test_x:
    s1test_x = pickle.load(test_x)
with open('s1train_x', 'rb') as train_x:
    s1train_x = pickle.load(train_x)

with open('s2test_x', 'rb') as test_x:
    s2test_x = pickle.load(test_x)
with open('s2train_x', 'rb') as train_x:
    s2train_x = pickle.load(train_x)
#-------------------------------------------------------------------------------
cores = multiprocessing.cpu_count()
w2v_model = w2v(min_count=10,
                window=10,
                size=300,
                sample=1e-5,
                alpha=0.03,
                min_alpha=0.0007,
                negative=5,
                workers=cores - 1)
sentences = s1train_x + s2train_x
print('Corpus size: {} sentences'.format(len(sentences)))
#-------------------------------------------------------------------------------
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
t = time()
w2v_model.train(sentences,
                total_examples=w2v_model.corpus_count,
                epochs=30,
                report_delay=1.0)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
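The model above is trained but never persisted; a minimal save/reload sketch (the filename is an assumption):

w2v_model.save("s1s2_w2v.model")       # hypothetical filename
w2v_model = w2v.load("s1s2_w2v.model")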
Example no. 7
from gensim.models import Word2Vec as w2v

class MySentences(object):
    def __init__(self, filename):
        self.filename = filename
 
    def __iter__(self):
        for line in open(self.filename):
            yield line.split()

if __name__ == '__main__':
    srcfile = MySentences('/global-mt/lpeng/academic/neural-moses/corpus/bilingual/1227K-lowercase/3-clean/1227K-lowercase.chi-eng.tok.norm.clean.chi')
    cnmodel = w2v(srcfile, workers=4, size=4)
    cnmodel.save('/data/disk1/private/zy/phrase_str2vec/src/input/cnmodel')
    trgfile = MySentences('/global-mt/lpeng/academic/neural-moses/corpus/bilingual/1227K-lowercase/3-clean/1227K-lowercase.chi-eng.tok.norm.clean.eng')
    enmodel = w2v(trgfile, workers=4, size=4)
    enmodel.save('/data/disk1/private/zy/phrase_str2vec/src/input/enmodel')
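MySentences streams the file line by line instead of reading it all into memory; gensim's built-in LineSentence does the same for whitespace-tokenized, one-sentence-per-line files:

from gensim.models.word2vec import LineSentence

# Equivalent streaming corpus over the same tokenized file
srcfile = LineSentence('/global-mt/lpeng/academic/neural-moses/corpus/bilingual/1227K-lowercase/3-clean/1227K-lowercase.chi-eng.tok.norm.clean.chi')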
Example no. 8
def get_w2v_model(sentences):
    model = w2v(sentences, min_count=1, workers=4)
    print("Word2Vec Model Loaded Successfully.")
    features = model[model.wv.vocab]
    return features
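Indexing the model with model.wv.vocab relies on gensim 3.x; in gensim >= 4.0 the vocabulary dict became key_to_index/index_to_key and the embedding matrix is exposed directly:

# gensim >= 4.0 equivalents of features = model[model.wv.vocab]
features = model.wv[model.wv.index_to_key]  # vectors in vocabulary order
features = model.wv.vectors                 # the raw embedding matrix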
Example no. 9
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

base_dir = "/Users/clavance/Desktop/Dropbox/Individual_project/EURLEX/html_tokenised_lemmatised/"
directory = os.fsencode(base_dir)

#initialise an empty model
# min_count: ignore words with lower frequency than the count
# window: maximum distance between the current and predicted word within a sentence
# size: dimensionality of the feature vectors
# alpha: learning rate
model = w2v(min_count=10,
            window=2,
            sample=6e-5,
            negative=20,
            alpha=0.03,
            min_alpha=0.0007,
            size=300)

#initialise empty list of dictionaries to create pandas dataframe
items = []

for file in os.listdir(directory):
    item = {}
    filename = os.fsdecode(file)
    doc_id = filename.split(".txt", 1)[0]
    item["ID"] = doc_id

    #text is already tokenised using lexnlp
    r = open(base_dir + filename, "r", encoding='latin1').read()
Example no. 10
 def __init__(self, train_data, size=200, window=5, min_count=2, workers=8, sg=1, hs=1):
     self.model = w2v(train_data, size=size, window=window, min_count=min_count, workers=workers, sg=sg, hs=hs)
Example no. 11
sns.set_style("darkgrid")
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

base_dir = "/Users/clavance/Desktop/Dropbox/Individual_project/EURLEX/html_test/"
directory = os.fsencode(base_dir)

#initialise an empty model
# min_count: ignore words with lower frequency than the count
# window: maximum distance between the current and predicted word within a sentence
# size: dimensionality of the feature vectors
# alpha: learning rate
model = w2v(min_count=10, window=2, size=300)

#initialise empty list of dictionaries to create pandas dataframe
items = []

for file in os.listdir(directory):
    item = {}
    filename = os.fsdecode(file)
    doc_id = filename.split(".txt", 1)[0]
    item["ID"] = doc_id

    #text is already tokenised using lexnlp
    r = open(base_dir + filename, "r", encoding='latin1').read()
    s = r.split("Class: ", 1)[1]
    classes = s.split("\nText: `` ", 1)[0]
    dict["Class"] = classes
Example no. 12
tokenizer = nltk.tokenize.RegexpTokenizer(
    r'\w+')  # Keep only alphanumeric characters as tokens
for idx_e in range(len(data)):
    for idx_n in range(len(data[idx_e]["tweets"])):
        text = data[idx_e]["tweets"][idx_n]["text"]
        text = re.sub(r"http\S+", "", text)  # remove urls
        text = text.lower()  # convert to lowercase
        tokens = tokenizer.tokenize(text)  # tokenize
        tokenSet.append(tokens)
        magLabels.append(
            data[idx_e]
            ["magnitude"])  # every tokenized tweet has a magnitude label

# Remove stopwords, numbers, singleton characters, and lemmatize
stopwords_nltk = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
tokenSet = [[
    lemmatizer.lemmatize(token) for token in doc
    if not (token in stopwords_nltk or token.isnumeric() or len(token) <= 1)
] for doc in tokenSet]  # filter and lemmatize each tokenized tweet

print('Preprocessing Completed. Total earthquakes: ', len(data),
      '. Total tweets: ', len(tokenSet))

print(tokenSet[0:10])

### Deploy w2v
#models = w2v(tokenSet[0:20], min_count=1, size=10)
model = w2v(tokenSet, min_count=1, size=10)
print('Vector for \'earthquake\': ', model['earthquake'])
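Direct item access on the model (model['earthquake']) was deprecated in gensim 3.x and removed in 4.0; vectors are accessed through the wv attribute:

print("Vector for 'earthquake':", model.wv['earthquake'])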
Example no. 13
from nltk.corpus import brown

def train_corpus(corpus):
    # Trains on the NLTK Brown corpus; the `corpus` argument goes unused here
    b = w2v(brown.sents())
    return b
Example no. 14
 def create_model(self, min_count=1, size=100, window=5, sg=0):
     # Assumes `filename` and `text` are attributes set on the instance
     if self.filename is not None:
         token = file_tokenize(self.filename)
     else:
         token = text_tokenize(self.text)
     return w2v(token, min_count=min_count, size=size, window=window, sg=sg)
Example no. 15
 def train(self, sentences):
     corpus = [[str(token) for token in sent] for sent in sentences]
     model = w2v(corpus, size=self.size, min_count=self.min_count)
     for word in model.wv.vocab.keys():
         self.vocab.embedding[word] = model.wv[word]
Example no. 16
TEST = sys.argv[3]

## parameters of word2vec
WINDOW = 10
VEC_DIM = 100

# load (label, sentence)
y_train, x_train = utils.load_data(TRAIN, file_type='train')
x_train_nolab = utils.load_data(TRAIN_NO_LAB, file_type='train_nolabel')
_, x_test = utils.load_data(TEST, file_type='test')

sentences = x_train + x_train_nolab + x_test

# sentence to word list e.g. 'fxxk you' to ['fxxk', 'you']
sentence_split = []
for line in sentences:
    words = text.text_to_word_sequence(
        line, filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
    sentence_split.append(words)

# print(sentence_split[0:10]) # just for test
dic = corpora.Dictionary(sentence_split)
dic.save('./dictionary')
print(dic)

# word to vector
model = w2v(sentence_split, window=WINDOW, size=VEC_DIM)
model.save('./word_vec')

# print(model.most_similar('get'))
# print(model.similarity('get', 'getting'))
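The commented-out queries above work once training completes; under gensim >= 4.0 they also move to the wv attribute:

print(model.wv.most_similar('get'))
print(model.wv.similarity('get', 'getting'))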