def transform(self, X, y=None):
    """
    Transforms the list of documents and returns tokens with their
    features. Each document should represent a sentence.
    """
    log.info("Generating features for {} documents...".format(len(X)))
    features = []
    for doc in X:
        doc_features = []
        for token in document_to_tokens(doc):
            if token in self.model.wv:
                doc_features.append((token, self.model.wv[token]))
        features.append(doc_features)
    return features
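# Usage sketch (the extractor class name below is hypothetical, and
# fit() must have been called first so that self.model exists):
#
#     extractor = Word2VecFeatureExtractor().fit(train_docs)
#     features = extractor.transform([["the", "cat", "sat"]])
#     token, vector = features[0][0]
#     # token is "the" (if it met min_count during training);
#     # vector is a numpy array of shape (size,), e.g. (100,).
#
# Out-of-vocabulary tokens are silently dropped, so a document's
# feature list may be shorter than its token list.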
def fit(self, X, y=None, size=100, min_count=5, workers=1, window=5,
        sample=1e-3, skipgram=False, min_n=3, max_n=6):
    """
    Trains a FastText model on given documents. Each document should
    represent a sentence.

    Args:
        X: list(Document | AnnotatedDocument | list(str))
        y: optional labels
        size: Size of embeddings to be learnt (Default 100), i.e.
            word vector dimensionality
        min_count: Minimum word count. Ignore words with number of
            occurrences below this (Default 5).
        workers: Number of threads to run in parallel
        window: Context window size
        sample: Threshold for downsampling higher-frequency words
            (Default 0.001)
        skipgram: Use skip-gram if True and CBOW otherwise
        min_n: Min length of char n-grams (Default 3)
        max_n: Max length of char n-grams (Default 6)
    """
    log.info("Checking parameters...")
    self.config.set_parameters({
        "size": size,
        "min_count": min_count,
        "workers": workers,
        "window": window,
        "sample": sample,
        "min_n": min_n,
        "max_n": max_n
    })
    self.config.validate()

    # Get sentences as lists of tokens
    log.info("Tokenizing {} documents...".format(len(X)))
    sentences = []
    for idx, doc in enumerate(X):
        sentences.append(document_to_tokens(doc))
        log_progress(log, idx, len(X))

    # Initialize and train the model (this will take some time)
    log.info("Training FastText on {} sentences...".format(len(X)))
    self.model = FastText(
        sentences,
        workers=self.config.get_parameter("workers"),
        size=self.config.get_parameter("size"),
        min_count=self.config.get_parameter("min_count"),
        window=self.config.get_parameter("window"),
        sample=self.config.get_parameter("sample"),
        sg=1 if skipgram else 0,
        min_n=self.config.get_parameter("min_n"),
        max_n=self.config.get_parameter("max_n"))

    # If you don't plan to train the model any further, calling
    # init_sims() will make the model much more memory-efficient.
    self.model.init_sims(replace=True)

    return self
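# Usage sketch for the FastText-based extractor (assumes the enclosing
# class is Char2VecFeatureExtractor, as referenced elsewhere in this
# module; the keyword names mirror gensim's pre-4.0 API, where the
# embedding dimensionality is called "size" rather than "vector_size"):
#
#     extractor = Char2VecFeatureExtractor()
#     extractor.fit(train_docs, size=100, skipgram=True, min_n=3, max_n=6)
#
# Because FastText composes word vectors from character n-grams, it can
# produce embeddings for words never seen during training, unlike plain
# Word2Vec.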
def fit(self, X, y=None, size=100, min_count=5, workers=1, window=5,
        sample=1e-3, skipgram=False):
    """
    Trains a Word2vec model on given documents. Each document should
    represent a sentence.

    Args:
        X: list(Document | AnnotatedDocument | list(str))
        y: optional labels
        size: Word vector dimensionality
        min_count: Minimum word count
        workers: Number of threads to run in parallel
        window: Context window size
        sample: Downsample setting for frequent words
        skipgram: Use skip-gram if True and CBOW otherwise
    """
    log.info("Checking parameters...")
    self.config.set_parameters({
        "size": size,
        "min_count": min_count,
        "workers": workers,
        "window": window,
        "sample": sample
    })
    self.config.validate()

    # Get sentences as lists of tokens
    log.info("Tokenizing {} documents...".format(len(X)))
    sentences = []
    for idx, doc in enumerate(X):
        sentences.append(document_to_tokens(doc))
        log_progress(log, idx, len(X))

    # Initialize and train the model (this will take some time)
    log.info("Training Word2vec on {} sentences...".format(len(X)))
    self.model = Word2Vec(
        sentences,
        workers=self.config.get_parameter("workers"),
        size=self.config.get_parameter("size"),
        min_count=self.config.get_parameter("min_count"),
        window=self.config.get_parameter("window"),
        sample=self.config.get_parameter("sample"),
        sg=1 if skipgram else 0)

    # If you don't plan to train the model any further, calling
    # init_sims() will make the model much more memory-efficient.
    self.model.init_sims(replace=True)

    return self
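# Usage sketch (extractor class name is hypothetical):
#
#     extractor = Word2VecFeatureExtractor()
#     extractor.fit(train_docs, size=100, window=5, skipgram=False)
#     features = extractor.transform(test_docs)
#
# Note that after init_sims(replace=True) only the normalized vectors
# are kept, so the model cannot be trained further on new sentences.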
def fit(self, X, y=None, size=100, min_count=5, workers=1, window=5,
        sample=1e-3, skipgram=False, min_n=3, max_n=6):
    """
    Trains word, character, and part-of-speech embeddings (see
    Char2VecFeatureExtractor for the description of arguments).
    """
    # Get sentences as lists of tokens
    log.info("Tokenizing {} documents...".format(len(X)))
    sentences = []
    for idx, doc in enumerate(X):
        sentences.append(document_to_tokens(doc))
        log_progress(log, idx, len(X))

    self.word_vectorizer.fit(
        sentences, y, size=size, min_count=min_count, workers=workers,
        window=window, sample=sample, skipgram=skipgram)
    self.pos_vectorizer.fit(
        sentences, y, size=size, min_count=min_count, workers=workers,
        window=window, sample=sample, skipgram=skipgram)
    self.char_vectorizer.fit(
        sentences, y, size=size, min_count=min_count, workers=workers,
        window=window, sample=sample, skipgram=skipgram,
        min_n=min_n, max_n=max_n)

    return self
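# Usage sketch for the combined extractor (class name is hypothetical;
# word_vectorizer, pos_vectorizer, and char_vectorizer are assumed to
# be created in __init__):
#
#     extractor = WordCharPOSFeatureExtractor()
#     extractor.fit(train_docs, size=100, min_n=3, max_n=6)
#
# All three vectorizers share size/min_count/window/sample/skipgram;
# min_n and max_n apply only to the character-level (FastText) model.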