Example no. 1
    def fit(self, train, save=True, load=True):
        '''
        Trains the predictor.

        Parameters
        ----------
        train : pandas.DataFrame
            Training data. It contains the transactions of the sessions.
            It has one column for session IDs, one for item IDs and one for the
            timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must
            correspond to the ones you set during the initialization of the
            network (session_key, item_key, time_key properties).

        save : bool
            Flag to save the model in file after training. (default: True)
        load : bool
            Flag to load trained model if exists. (default: True)
        '''

        if load and os.path.isfile(self.path_trained + self.file_prefix +
                                   self.file_suffix()):
            print("Model already trained! Loading...", end="")
            self.load_w2v_model()
            print("done.")
        else:
            print("Creating session vocabulary...", end="")
            train[self.item_key] = train[self.item_key].astype(str)
            sequences = train.groupby(self.session_key)[self.item_key] \
                             .apply(list)
            print("done.")
            print("Training model...", end="")
            if self.seed > 0:
                # Use a single worker so training is reproducible with the fixed seed
                self.model = w2v(sequences,
                                 size=self.factors,
                                 sg=self.sg,
                                 window=self.window,
                                 workers=1,
                                 hs=self.hs,
                                 iter=self.epochs,
                                 min_count=1,
                                 seed=self.seed)
            else:
                self.model = w2v(sequences,
                                 size=self.factors,
                                 sg=self.sg,
                                 window=self.window,
                                 workers=self.workers,
                                 hs=self.hs,
                                 iter=self.epochs,
                                 min_count=1)

            print("done.")
            if save:
                self.save_w2v_model()
                print("Model saved!")

            train[self.item_key] = train[self.item_key].astype(int)
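Note: the size= and iter= keyword arguments above are gensim 3.x names; gensim >= 4.0 renamed them to vector_size= and epochs=. A minimal sketch of the seeded branch under the newer API (the names mirror the attributes used above):

# gensim >= 4.0 equivalent of the seeded call above
self.model = w2v(sequences,
                 vector_size=self.factors,  # was size=
                 sg=self.sg,
                 window=self.window,
                 workers=1,                 # one worker keeps seeded runs deterministic
                 hs=self.hs,
                 epochs=self.epochs,        # was iter=
                 min_count=1,
                 seed=self.seed)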
Example no. 2
	def make_model(self):
		""" Model and train the word2vec model on words from tweets"""

		# Define parameters for the w2v model
		num_features = 300
		min_word_count = 3
		num_workers = multiprocessing.cpu_count()
		context_size = 7
		downsampling = 1e-3
		seed = 1

		# Build the model
		self.tweet2vec = w2v(
		    sg=1,
		    seed=seed,
		    workers=num_workers,
		    size=num_features,
		    min_count=min_word_count,
		    window=context_size,
		    sample=downsampling
		)

		# Build the vocabulary
		self.tweet2vec.build_vocab(self.sentences)
		# Train the model
		self.tweet2vec.train(self.sentences, epochs=10, total_examples=len(self.sentences))
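A note on the explicit train() call: gensim requires total_examples and epochs here. corpus_count, which build_vocab() records, is a safer total than len(self.sentences) when the corpus is a streaming iterator:

		# Equivalent call using the count recorded by build_vocab()
		self.tweet2vec.train(self.sentences,
		                     epochs=10,
		                     total_examples=self.tweet2vec.corpus_count)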
Example no. 3
    def fit_word_vectors(self, train, save_model=True):
        # Load word vectors if model already exists
        self.fit_session_vocabulary(train)
        if os.path.isfile(self.path_trained + self.file_prefix +
                          self.file_suffix()):
            print("Model already trained! Loading...", end="")
            self.load_w2v_model()
            print("done.")
        else:
            # Generate word vectors
            print("Generating word vectors...", end="")
            self.model = w2v(self.all_session_items.values,
                             size=self.factors,
                             window=self.window,
                             sg=self.sg,
                             workers=4,
                             hs=self.hs,
                             iter=self.epochs,
                             min_count=1)
            print("done.")

            if save_model:
                self.save_w2v_model()
                print("Model saved!")
        self.wv = self.model.wv
        del self.model  # Discard the full model; only the word vectors (self.wv) are kept
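Keeping only model.wv (a KeyedVectors instance) and deleting the full model, as done above, drops the training-only weights and saves memory. A sketch of persisting just the vectors (the filename is illustrative):

from gensim.models import KeyedVectors

# Save and reload only the word vectors, not the trainable model
self.wv.save("item_vectors.kv")
wv = KeyedVectors.load("item_vectors.kv")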
Example no. 4
def train_model(text, filename, phrases=True, workers=8, window=3, overwrite=False):
    if os.path.exists(filename) and not overwrite:
        return w2v.load(filename)
    pool = Pool(workers)
    tokens = pool.map(tokenize, text)
    cs = flatten(tokens)
    if phrases:
        sentences_phrases = Phrases(cs)
        sentences = sentences_phrases[cs]
    else:
        sentences = cs
    model_titulo = w2v(sentences, workers=workers, window=window, min_count=1, size=300)
    model_titulo.save(filename)
    return model_titulo
Example no. 5
def create_w2v(data,
               emb_dim=300,
               window=5,
               min_count=5,
               negative=5,
               iterations=10):

    from gensim.models import Word2Vec as w2v
    workers = multiprocessing.cpu_count()

    model = w2v(data,
                size=emb_dim,
                window=window,
                min_count=min_count,
                negative=negative,
                iter=iterations,
                workers=workers)

    print('Word2Vec model created.')
    return model
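An illustrative call with a toy corpus (data must be an iterable of token lists; min_count is lowered so the toy tokens survive):

toy_corpus = [["deep", "learning"], ["word", "embeddings"], ["deep", "embeddings"]]
model = create_w2v(toy_corpus, emb_dim=50, min_count=1)
print(model.wv["deep"].shape)  # -> (50,)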
Example no. 6
#-------------------------------------------------------------------------------
with open('s1test_x', 'rb') as test_x:
    s1test_x = pickle.load(test_x)
with open('s1train_x', 'rb') as train_x:
    s1train_x = pickle.load(train_x)

with open('s2test_x', 'rb') as test_x:
    s2test_x = pickle.load(test_x)
with open('s2train_x', 'rb') as train_x:
    s2train_x = pickle.load(train_x)
#-------------------------------------------------------------------------------
cores = multiprocessing.cpu_count()
w2v_model = w2v(min_count=10,
                window=10,
                size=300,
                sample=1e-5,
                alpha=0.03,
                min_alpha=0.0007,
                negative=5,
                workers=cores - 1)
sentences = s1train_x + s2train_x
print('Corpus size: {} sentences'.format(len(sentences)))
#-------------------------------------------------------------------------------
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
t = time()
w2v_model.train(sentences,
                total_examples=w2v_model.corpus_count,
                epochs=30,
                report_delay=1.0)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
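The model above is trained but never persisted; a minimal save/reload sketch (the filename is an assumption):

w2v_model.save("s1s2_w2v.model")       # hypothetical filename
w2v_model = w2v.load("s1s2_w2v.model")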
Example no. 7
from gensim.models import Word2Vec as w2v

class MySentences(object):
    def __init__(self, filename):
        self.filename = filename
 
    def __iter__(self):
        for line in open(self.filename):
            yield line.split()

if __name__ == '__main__':
    srcfile = MySentences('/global-mt/lpeng/academic/neural-moses/corpus/bilingual/1227K-lowercase/3-clean/1227K-lowercase.chi-eng.tok.norm.clean.chi')
    cnmodel = w2v(srcfile, workers=4, size=4)
    cnmodel.save('/data/disk1/private/zy/phrase_str2vec/src/input/cnmodel')
    trgfile = MySentences('/global-mt/lpeng/academic/neural-moses/corpus/bilingual/1227K-lowercase/3-clean/1227K-lowercase.chi-eng.tok.norm.clean.eng')
    enmodel = w2v(trgfile, workers=4, size=4)
    enmodel.save('/data/disk1/private/zy/phrase_str2vec/src/input/enmodel')
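MySentences streams the file line by line instead of reading it all into memory; gensim's built-in LineSentence does the same for whitespace-tokenized, one-sentence-per-line files:

from gensim.models.word2vec import LineSentence

# Equivalent streaming corpus over the same tokenized file
srcfile = LineSentence('/global-mt/lpeng/academic/neural-moses/corpus/bilingual/1227K-lowercase/3-clean/1227K-lowercase.chi-eng.tok.norm.clean.chi')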
Example no. 8
def get_w2v_model(sentences):
    model = w2v(sentences, min_count=1, workers=4)
    print("Word2Vec Model Loaded Successfully.")
    features = model[model.wv.vocab]
    return features
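Indexing the model with model.wv.vocab relies on gensim 3.x; in gensim >= 4.0 the vocabulary dict became key_to_index/index_to_key and the embedding matrix is exposed directly:

# gensim >= 4.0 equivalents of features = model[model.wv.vocab]
features = model.wv[model.wv.index_to_key]  # vectors in vocabulary order
features = model.wv.vectors                 # the raw embedding matrix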
Example no. 9
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

base_dir = "/Users/clavance/Desktop/Dropbox/Individual_project/EURLEX/html_tokenised_lemmatised/"
directory = os.fsencode(base_dir)

#initialise an empty model
# min_count: ignore words with lower frequency than the count
# window: maximum distance between the current and predicted word within a sentence
# size: dimensionality of the feature vectors
# alpha: learning rate
model = w2v(min_count=10,
            window=2,
            sample=6e-5,
            negative=20,
            alpha=0.03,
            min_alpha=0.0007,
            size=300)

#initialise empty list of dictionaries to create pandas dataframe
items = []

for file in os.listdir(directory):
    item = {}
    filename = os.fsdecode(file)
    doc_id = filename.split(".txt", 1)[0]
    item["ID"] = doc_id

    #text is already tokenised using lexnlp
    r = open(base_dir + filename, "r", encoding='latin1').read()
Example no. 10
 def __init__(self, train_data, size=200, window=5, min_count=2, workers=8, sg=1, hs=1):
     self.model = w2v(train_data, size=size, window=window, min_count=min_count, workers=workers, sg=sg, hs=hs)
Example no. 11
sns.set_style("darkgrid")
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

base_dir = "/Users/clavance/Desktop/Dropbox/Individual_project/EURLEX/html_test/"
directory = os.fsencode(base_dir)

#initialise an empty model
# min_count: ignore words with lower frequency than the count
# window: maximum distance between the current and predicted word within a sentence
# size: dimensionality of the feature vectors
# alpha: learning rate
model = w2v(min_count=10, window=2, size=300)

#initialise empty list of dictionaries to create pandas dataframe
items = []

for file in os.listdir(directory):
    item = {}
    filename = os.fsdecode(file)
    doc_id = filename.split(".txt", 1)[0]
    item["ID"] = doc_id

    #text is already tokenised using lexnlp
    r = open(base_dir + filename, "r", encoding='latin1').read()
    s = r.split("Class: ", 1)[1]
    classes = s.split("\nText: `` ", 1)[0]
    dict["Class"] = classes
Example no. 12
tokenizer = nltk.tokenize.RegexpTokenizer(
    r'\w+')  # Keep only alphanumeric characters as tokens
for idx_e in range(len(data)):
    for idx_n in range(len(data[idx_e]["tweets"])):
        text = data[idx_e]["tweets"][idx_n]["text"]
        text = re.sub(r"http\S+", "", text)  # remove urls
        text = text.lower()  # convert to lowercase
        tokens = tokenizer.tokenize(text)  # tokenize
        tokenSet.append(tokens)
        magLabels.append(
            data[idx_e]
            ["magnitude"])  # every tokenized tweet has a magnitude label

# Remove stopwords, numbers, singleton characters, and lemmatize
stopwords_nltk = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
tokenSet = [[
    lemmatizer.lemmatize(token) for token in doc
    if not (token in stopwords_nltk or token.isnumeric() or len(token) <= 1)
] for doc in tokenSet]  # filter and lemmatize each tokenized tweet

print('Preprocessing Completed. Total earthquakes: ', len(data),
      '. Total tweets: ', len(tokenSet))

print(tokenSet[0:10])

### Deploy w2v
#models = w2v(tokenSet[0:20], min_count=1, size=10)
model = w2v(tokenSet, min_count=1, size=10)
print('Vector for \'earthquake\': ', model['earthquake'])
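Direct item access on the model (model['earthquake']) was deprecated in gensim 3.x and removed in 4.0; vectors are accessed through the wv attribute:

print("Vector for 'earthquake':", model.wv['earthquake'])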
Example no. 13
from nltk.corpus import brown

def train_corpus(corpus):
    # Trains on the NLTK Brown corpus; the `corpus` argument goes unused here
    b = w2v(brown.sents())
    return b
Example no. 14
 def create_model(self, min_count=1, size=100, window=5, sg=0):
     # Assumes `filename` and `text` are attributes set on the instance
     if self.filename is not None:
         token = file_tokenize(self.filename)
     else:
         token = text_tokenize(self.text)
     return w2v(token, min_count=min_count, size=size, window=window, sg=sg)
Example no. 15
 def train(self, sentences):
     corpus = [[str(token) for token in sent] for sent in sentences]
     model = w2v(corpus, size=self.size, min_count=self.min_count)
     for word in model.wv.vocab.keys():
         self.vocab.embedding[word] = model.wv[word]
Example no. 16
TEST = sys.argv[3]

## parameters of word2vec
WINDOW = 10
VEC_DIM = 100

# load (label, sentence)
y_train, x_train = utils.load_data(TRAIN, file_type='train')
x_train_nolab = utils.load_data(TRAIN_NO_LAB, file_type='train_nolabel')
_, x_test = utils.load_data(TEST, file_type='test')

sentences = x_train + x_train_nolab + x_test

# sentence to word list e.g. 'fxxk you' to ['fxxk', 'you']
sentence_split = []
for line in sentences:
    words = text.text_to_word_sequence(
        line, filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
    sentence_split.append(words)

# print(sentence_split[0:10]) # just for test
dic = corpora.Dictionary(sentence_split)
dic.save('./dictionary')
print(dic)

# word to vector
model = w2v(sentence_split, window=WINDOW, size=VEC_DIM)
model.save('./word_vec')

# print(model.most_similar('get'))
# print(model.similarity('get', 'getting'))
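The commented-out queries above work once training completes; under gensim >= 4.0 they also move to the wv attribute:

print(model.wv.most_similar('get'))
print(model.wv.similarity('get', 'getting'))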