import numpy as np
from tqdm import tqdm


class sentence2vector:

    def __init__(self, sentences, method='TF-IDF', vector_size=100, min_count=2):
        import spacy
        try:
            self.spacy_nlp = spacy.load("fr")
        except OSError:
            # if neither model is installed: python -m spacy download fr_core_news_sm
            self.spacy_nlp = spacy.load("fr_core_news_sm")
        self.unknown_token = '<ukn>'
        self.sentences = sentences
        self.method = method
        self.vector_size = vector_size
        self.min_count = min_count
        print('Size of documents:', len(self.sentences))
        print('Method of vectorization:', self.method)
        self.preprocessing()
        self.count_word()
        self.vectorize()

    def preprocessing(self):
        print('Preprocessing sentences...')
        try:
            with tqdm(self.sentences) as t:
                for i, _ in enumerate(t):
                    self.sentences[i] = self.raw_to_tokens(self.sentences[i])
        except KeyboardInterrupt:
            t.close()
            raise
        t.close()

    def normalize_accent(self, string):
        string = string.replace('á', 'a')
        string = string.replace('â', 'a')
        string = string.replace('é', 'e')
        string = string.replace('è', 'e')
        string = string.replace('ê', 'e')
        string = string.replace('ë', 'e')
        string = string.replace('î', 'i')
        string = string.replace('ï', 'i')
        string = string.replace('ö', 'o')
        string = string.replace('ô', 'o')
        string = string.replace('ò', 'o')
        string = string.replace('ó', 'o')
        string = string.replace('ù', 'u')
        string = string.replace('û', 'u')
        string = string.replace('ü', 'u')
        string = string.replace('ç', 'c')
        return string

    def raw_to_tokens(self, raw_string):
        # Lower-case the string
        string = raw_string.lower()
        # Normalize the accents
        string = self.normalize_accent(string)
        # Tokenize with spaCy
        doc = self.spacy_nlp(string)
        # Remove punctuation, stop words and non-alphabetic tokens
        tokens = [
            token.orth_ for token in doc
            if not token.is_punct and not token.is_stop and token.orth_.isalpha()
        ]
        return tokens

    def vectorize(self):
        if self.method == 'TF-IDF':
            self.tfidf()
        elif self.method == 'doc2vec':
            self.doc2vec()

    def tfidf(self):
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.decomposition import PCA

        print('Transform TF-IDF vectors...')
        # Create a TfidfVectorizer object
        self.vectorizer = TfidfVectorizer(min_df=self.min_count)
        # Vectorize the text (join token lists back into strings)
        x = [" ".join(sentence) for sentence in self.sentences]
        sparse_result = self.vectorizer.fit_transform(x)
        self.vocabulary = self.vectorizer.vocabulary_
        print('Vocabulary size:', len(self.vocabulary))
        self.X = sparse_result.toarray()
        # Reduce the feature dimension of X
        pca = PCA(n_components=self.vector_size, copy=False)
        self.X = pca.fit_transform(self.X)

    def doc2vec(self):
        from gensim.models.doc2vec import Doc2Vec, TaggedDocument

        documents = [
            TaggedDocument(doc, [i]) for i, doc in enumerate(self.sentences)
        ]
        print('Training Doc2vec model...')
        self.vectorizer = Doc2Vec(vector_size=self.vector_size,
                                  window=5,
                                  min_count=self.min_count,
                                  hs=0,
                                  negative=5,
                                  workers=4,  # gensim expects a positive worker count
                                  alpha=0.025,
                                  min_alpha=1e-5)
        self.vectorizer.build_vocab(documents)
        self.vocabulary = self.vectorizer.wv.vocab
        print('Vocabulary size:', len(self.vocabulary))
        self.vectorizer.train(documents,
                              total_examples=self.vectorizer.corpus_count,
                              epochs=self.vectorizer.epochs)
        # Collect the trained document vectors, indexed by their integer tags
        self.X = np.array(
            [self.vectorizer.docvecs[i] for i in range(len(self.sentences))])

    def count_word(self):
        print('Building word2count dict...')
        self.word2count = {}
        try:
            with tqdm(self.sentences) as t:
                for sentence in t:
                    for word in sentence:
                        if word in self.word2count:
                            self.word2count[word] += 1
                        else:
                            self.word2count[word] = 1
        except KeyboardInterrupt:
            t.close()
            raise
        t.close()

    def __getitem__(self, key):
        # Sentence vectors are stored in self.X for both methods
        return self.X[key]

    def __len__(self):
        return len(self.sentences)
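# --- Example usage (illustrative sketch, not part of the original class) ---
# Assumes a small list of French sentences and that the fr_core_news_sm spaCy
# model is installed; with TF-IDF, vector_size must not exceed the number of
# sentences, since PCA cannot produce more components than samples.
demo_corpus = [
    "Le chat dort sur le canapé.",
    "Le chien joue dans le jardin.",
    "Un chat et un chien dorment dans le jardin.",
]

demo_model = sentence2vector(demo_corpus, method='TF-IDF', vector_size=2, min_count=1)

print(len(demo_model), 'sentences vectorized')
print('Vector of the first sentence:', demo_model[0])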
import os
import pickle
import re
import sys

import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 'platform' (a project-local helper exposing filename()) and 'Preprocessor'
# are assumed to be provided elsewhere in this codebase.


class Vectorizer:

    vector_size = 300

    bowargs = {
        "max_features": vector_size,
        "stop_words": 'english',
        "max_df": 0.5,
        "min_df": 0.01
    }

    tfidfargs = {
        "max_df": 1.0,
        "min_df": 1,
        "max_features": vector_size,
        "stop_words": 'english'
    }

    w2vargs = {
        "size": vector_size,
        "window": 5,
        "min_count": 2,
        "sg": 1,
        "hs": 0,
        "negative": 10,
        "workers": 2,
        "seed": 34
    }

    supported_methods = {'word2vec', 'bagofwords', 'tfidf'}

    def __init__(self, method='word2vec'):
        # Normalize the method name, e.g. 'bag-of-words' -> 'bagofwords'
        self.method = re.sub(r'[-_ ]', '', method)

        if self.method == 'word2vec':
            self.underlying = Word2Vec(**self.w2vargs)
        elif self.method == 'bagofwords':
            self.underlying = CountVectorizer(**self.bowargs)
        elif self.method == 'tfidf':
            self.underlying = TfidfVectorizer(**self.tfidfargs)
        else:
            raise ValueError("'" + self.method + "' is not supported")

    def vectorize(self, preprocessor, dictionary, save=True):
        # A list of dataset names means the vectors were computed earlier; load them
        if isinstance(preprocessor, list):
            path = platform.filename(
                preprocessor,
                ['preprocessed', self.method] +
                (['augmented'] if dictionary else [])) + '.pkl'

            if not os.path.isfile(path):
                raise ValueError("'" + path + "' is not a file")

            with open(path, 'rb') as file:
                labels, vectors = pickle.load(file)

            print('<LOG>: Loaded', len(vectors), 'vectors from', path,
                  '[' + str(len(list(vectors.values())[0])), 'features each]',
                  file=sys.stderr)

            return dict(zip(vectors.keys(), labels)), vectors

        path = '_'.join([preprocessor.path, self.method] +
                        (['augmented'] if dictionary else [])) + '.pkl'

        if not isinstance(preprocessor, Preprocessor):
            raise ValueError("'preprocessor' is not an instance of 'Preprocessor'")

        return self.process(preprocessor, dictionary, path if save else None)

    def process(self, preprocessor, dictionary, path):
        tweets = list(preprocessor.tweets.values())

        if self.method == 'word2vec':
            self.underlying.build_vocab(tweets)
            self.underlying.train(sentences=tweets,
                                  total_examples=len(tweets),
                                  epochs=20)

            vectors = [None] * len(tweets)

            for i, tweet in enumerate(tweets):
                vector = [None] * len(tweet)

                for j, token in enumerate(tweet):
                    if token in self.underlying.wv:
                        vector[j] = self.underlying.wv[token]
                    else:
                        # Fall back to a random vector for out-of-vocabulary tokens
                        vector[j] = 2.0 * np.random.randn(self.vector_size) - 1.0

                # Represent the tweet as the mean of its token vectors
                vectors[i] = np.mean(vector, axis=0)
        else:
            concatenated = [' '.join(tweet) for tweet in tweets]
            vectors = self.underlying.fit_transform(concatenated).toarray()

        if dictionary:
            # Append the dictionary's valence features to each tweet vector
            flattened = list(np.asarray(vectors).flatten())
            vmin, vmax = min(flattened), max(flattened)

            augmented = [None] * len(vectors)

            for i, valences in enumerate(dictionary.per_tweet(tweets, (vmin, vmax))):
                augmented[i] = np.concatenate((vectors[i], valences))

            vectors = augmented

            print('<LOG>: The augmented vectors\' values are in the range',
                  '[' + '{0:.4f}'.format(vmin), ',', '{0:.4f}'.format(vmax) + ']',
                  file=sys.stderr)

        vectors = dict(zip(preprocessor.tweets.keys(), vectors))

        if path:
            with open(path, 'wb') as file:
                pickle.dump((list(preprocessor.labels.values()), vectors), file)

            print('<LOG>: Saved', len(vectors), 'vectors to', path,
                  '[' + str(len(list(vectors.values())[0])), 'features each]',
                  file=sys.stderr)

        return preprocessor.labels, vectors
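# --- Example usage (hypothetical sketch, not part of the original module) ---
# It assumes a project-local Preprocessor whose .tweets and .labels are dicts
# keyed by tweet id and whose .path points at the preprocessed corpus, as the
# methods above expect; the constructor argument shown here is made up.
# Passing dictionary=None skips the valence-augmentation step.
demo_preprocessor = Preprocessor('./datasets/train.tsv')   # hypothetical path

demo_vectorizer = Vectorizer(method='tf-idf')   # normalized to 'tfidf' internally
labels, vectors = demo_vectorizer.vectorize(demo_preprocessor, dictionary=None, save=False)

print(len(vectors), 'tweets vectorized with', demo_vectorizer.method, file=sys.stderr)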