def test_closer_to(start, close, far):
    # Embed each text with the shared word2vec model via docvector, then assert
    # that `close` is nearer to `start` than `far` is under cosine distance.
    start = docvector(word2vec, simple_tokenize(start))
    close = docvector(word2vec, simple_tokenize(close))
    far = docvector(word2vec, simple_tokenize(far))
    close_dist = distance.cosine(start, close)
    far_dist = distance.cosine(start, far)
    assert close_dist < far_dist
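
# Illustrative usage sketch (not from the source): assumes `word2vec` is a loaded
# word-embedding model covering these words and `docvector` averages token vectors.
# With a reasonable model, the related pair should be closer than the unrelated one.
test_closer_to(
    start="the cat sat on the mat",
    close="a kitten rested on the rug",
    far="quarterly revenue grew sharply",
)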

def preprocess(self, text):
    # Tokenize each sentence and drop duplicates while preserving input order.
    seen = set()
    sents = []
    for sent in text:
        processed = tuple(simple_tokenize(sent))
        if processed not in seen:
            sents.append(processed)
            seen.add(processed)
    return sents
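
# Illustrative usage (assumption): `preprocess` does not use `self`, so any object can
# be passed for it here; assumes simple_tokenize is gensim.utils.simple_tokenize
# (case-preserving, alphabetic tokens). Duplicate sentences collapse to one tuple.
demo = preprocess(None, ["Hello world", "Hello world", "Bye now"])
assert demo == [('Hello', 'world'), ('Bye', 'now')]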

def most_common_words(self):
    """
    Find the most common words in the procurement tender descriptions
    and return the top 12.

    :return: List of String
    """
    # Check whether the result is already computed and cached
    if self.cache_most_common_words is None:
        word_counts = {}
        for p in data_holder.procurements:
            for token in simple_tokenize(p.tender_description.lower()):
                if token in gensim.parsing.preprocessing.STOPWORDS:
                    continue
                word_counts.setdefault(token, 0)
                word_counts[token] += 1
        sorted_keys = sorted(word_counts.keys(), key=word_counts.get, reverse=True)
        self.cache_most_common_words = sorted_keys[:12]
    # Return the cached result
    return self.cache_most_common_words
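
# Illustrative usage note (assumption): called on an object whose
# cache_most_common_words attribute starts as None, with data_holder.procurements
# populated. The first call computes and caches; later calls return the cached list.
# top_words = stats.most_common_words()
# top_words_again = stats.most_common_words()  # served from cache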

def tokenizer(string: str):
    return list(simple_tokenize(string))
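
# Illustrative usage (assumption: simple_tokenize is gensim.utils.simple_tokenize,
# which yields case-preserving alphabetic tokens and drops digits and punctuation).
assert tokenizer("Hello, world 42!") == ["Hello", "world"]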

def review2idx(review):
    return [word2idx(word) for word in simple_tokenize(review)]
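
# Illustrative usage note (assumption): with word_index/word2idx defined as in
# imdb_preprocess below, out-of-vocabulary words map to index 1 (<UNKNOWN>).
# review2idx("a great movie") -> e.g. [7, 84, 1], depending on the learned vocabulary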

# Imports needed by imdb_preprocess; simple_tokenize is assumed to be
# gensim.utils.simple_tokenize, consistent with the snippets above.
import glob
import json
import os
from collections import Counter
from random import shuffle

from gensim.utils import simple_tokenize


def imdb_preprocess():
    imdb_dir = './data/aclImdb'
    subdirs = [
        'train/neg',
        'train/pos',
        'test/neg',
        'test/pos'
    ]

    # Load reviews into memory
    reviews = dict()
    for subdir in subdirs:
        reviews[subdir] = []
        working_dir = os.path.join(imdb_dir, subdir)
        for filepath in glob.glob(working_dir + "/*"):
            with open(filepath, 'r') as f:
                reviews[subdir].append(f.read())

    # Create vocabulary from the training reviews only, keeping words seen more than twice
    vocab_counts = Counter()
    for review_set in [reviews['train/neg'], reviews['train/pos']]:
        for review in review_set:
            vocab_counts.update(simple_tokenize(review))
    word_counts = vocab_counts.most_common()
    word_counts = [pair for pair in word_counts if pair[1] > 2]
    word_index = {a[0]: i + 2 for i, a in enumerate(word_counts)}
    word_index["<EMPTY>"] = 0
    word_index["<UNKNOWN>"] = 1
    with open('./data/word-index.json', 'w') as f:
        json.dump(word_index, f)

    # Encode reviews using word_index
    def word2idx(word):
        if word in word_index:
            return word_index[word]
        else:
            return 1

    def review2idx(review):
        return [word2idx(word) for word in simple_tokenize(review)]

    transformed_reviews = dict()
    for subdir in subdirs:
        transformed_reviews[subdir] = []
        for review in reviews[subdir]:
            transformed_reviews[subdir].append(review2idx(review))

    # Create shuffled train and test datasets of (encoded review, label) pairs
    train = [(r, 1) for r in transformed_reviews['train/pos']]
    train += [(r, 0) for r in transformed_reviews['train/neg']]
    shuffle(train)
    test = [(r, 1) for r in transformed_reviews['test/pos']]
    test += [(r, 0) for r in transformed_reviews['test/neg']]
    shuffle(test)
    with open('./data/imdb-reviews.json', 'w') as f:
        json.dump(
            {
                'train': train,
                'test': test
            },
            f
        )
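
# Illustrative follow-up sketch (assumption, not from the source): how the emitted
# JSON might be consumed later, padding each encoded review to a fixed length with
# the <EMPTY> index (0) so the sequences can be batched.
with open('./data/imdb-reviews.json') as f:
    data = json.load(f)

def pad(encoded, length=200, pad_idx=0):
    # Truncate long reviews and right-pad short ones with the <EMPTY> index.
    return encoded[:length] + [pad_idx] * max(0, length - len(encoded))

x_train = [pad(review) for review, _label in data['train']]
y_train = [label for _review, label in data['train']]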