def get_best_thread(self, question, tag_name):
    """
    Returns id of the most similar thread for the question.

    The search is performed across the threads with a given tag.

    Parameters
    ----------
    question : str
        The question asked
    tag_name : str
        The tag for the question

    Returns
    -------
    int
        The id of the most similar thread of the question
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

    # Embed the question with the same dimensionality as the stored thread
    # vectors; the dimension is read off the loaded matrix itself.
    question_vec = question_to_vec(question=question,
                                   embeddings=self.word_embeddings,
                                   dim=thread_embeddings.shape[1])

    # A single query row compared against every thread row yields a
    # one-element result: the row index of the closest thread (cosine).
    best_thread = pairwise_distances_argmin(question_vec[np.newaxis, ...],
                                            thread_embeddings,
                                            metric='cosine')
    best_position = int(best_thread[0])

    # Pick the id by *position*. Indexing the container with the result
    # array itself only works for ndarray-like containers: a plain list
    # rejects it, and a pandas Series would treat it as index labels.
    if hasattr(thread_ids, 'iloc'):
        return thread_ids.iloc[best_position]
    return thread_ids[best_position]
def get_best_thread(self, question, tag_name):
    """Returns id of the most similar thread for the question.

    The search is performed across the threads with a given tag.

    Parameters
    ----------
    question : str
        The question asked
    tag_name : str
        The tag whose threads are searched

    Returns
    -------
    The id of the thread whose embedding is closest (cosine distance) to
    the embedding of the question.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

    question_vec = question_to_vec(question, self.word_embeddings, self.embeddings_dim)

    # The question must be the FIRST argument: pairwise_distances_argmin
    # returns, for each row of its first argument, the index of the nearest
    # row of the second. With the arguments the other way round every thread
    # is matched against the single question row, so the result is all zeros
    # and the first thread would always be returned.
    # Cosine distance is used so that vector length does not affect ranking.
    best_thread = pairwise_distances_argmin(question_vec.reshape(1, -1),
                                            thread_embeddings,
                                            metric='cosine')
    best_position = int(best_thread[0])

    # Select by position so a pandas Series of ids is not looked up by label.
    if hasattr(thread_ids, 'iloc'):
        return thread_ids.iloc[best_position]
    return thread_ids[best_position]
def get_best_thread(self, question, tag_name):
    """Find the thread that best matches a question within one tag.

    Prints the tag name, loads the ids and embeddings stored for
    `tag_name`, embeds `question`, and returns the id located at the
    position of the thread embedding with the smallest cosine distance
    to the question embedding.
    """
    print(tag_name)

    ids, vectors = self.__load_embeddings_by_tag(tag_name)
    query = question_to_vec(question, self.word_embeddings, self.embeddings_dim)

    # Wrapping the query in a list makes it a one-row batch, so the result
    # has exactly one entry: the position of the nearest thread.
    nearest = pairwise_distances_argmin([query], vectors, metric='cosine')
    winner = nearest[0]
    return ids[winner]
def main():
    """Precompute thread embeddings and store them in one pickle per tag.

    Reads word embeddings from 'word_embeddings.tsv' and posts from
    'data/tagged_posts.tsv'. For every tag, each post title is converted to
    a vector with `question_to_vec`, and the tuple
    ``(post_ids, vectors)`` is pickled to ``<tag>.pkl`` inside
    ``RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER']`` (created if missing).
    ``post_ids`` is a list and ``vectors`` is a float32 array with one row
    per post of that tag.
    """
    starspace_embeddings, embeddings_dim = load_embeddings('word_embeddings.tsv')
    posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\t')

    output_dir = RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER']
    os.makedirs(output_dir, exist_ok=True)

    # Iterate the groups directly instead of re-filtering the whole frame
    # for every tag; each group is exactly the rows carrying that tag.
    for tag, tag_posts in posts_df.groupby("tag"):
        tag_post_ids = tag_posts["post_id"].tolist()

        # Size the matrix from the rows actually iterated below, so a
        # missing post_id cannot make it shorter than the title list.
        tag_vectors = np.zeros((len(tag_posts), embeddings_dim), dtype=np.float32)
        for i, title in enumerate(tag_posts['title']):
            tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim)

        # Dump post ids and vectors to a file; the context manager makes
        # sure the handle is flushed and closed for every tag.
        filename = os.path.join(output_dir, os.path.normpath('%s.pkl' % tag))
        with open(filename, 'wb') as out_file:
            pickle.dump((tag_post_ids, tag_vectors), out_file)
embeddings_dim = 300 posts_df = pd.read_csv('models\\tagged_posts.tsv', sep='\t') gPosts = posts_df.groupby("tag").count() counts_by_tag = dict(zip(gPosts.index,gPosts.post_id)) for tag, count in counts_by_tag.items(): tag_posts = posts_df[posts_df['tag'] == tag] tag_post_ids = tag_posts.post_id.values titles = tag_posts.title.values tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32) for i, title in enumerate(titles): tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim) # Dump post ids and vectors to a file. filename = os.path.join("thread_embeddings_by_tags", os.path.normpath('%s.pkl' % tag)) pickle.dump((tag_post_ids, titles, tag_vectors), open(filename, 'wb')) ''' vectorizer = CountVectorizer(lowercase=True, stop_words="english", min_df=2) dialogue_df = pd.read_csv('models\\dialogues.tsv', sep='\t') posts_df = pd.read_csv('models\\tagged_posts.tsv', sep='\t') corpus = list(dialogue_df['text'].values) + list(posts_df.title.values) vectorizer.fit(corpus) common_words = vectorizer.get_feature_names()
# NOTE(review): this section continues a larger script; `intent_recognizer`
# and `stackoverflow_df` are created before this excerpt.

# Persist the intent recognizer; `with` guarantees the file is closed.
with open(utils.RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb') as out_file:
    pickle.dump(intent_recognizer, out_file)

print('3. Programming language classification ...')

X = stackoverflow_df['title'].values
y = stackoverflow_df['tag'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

# Reuse the stored TF-IDF vectorizer so titles are featurized with the same
# vocabulary as the one saved under TFIDF_VECTORIZER.
with open(utils.RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb') as in_file:
    vectorizer = pickle.load(in_file)
X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)

tag_classifier = OneVsRestClassifier(LogisticRegression(penalty="l2", C=5, random_state=0))
tag_classifier.fit(X_train_tfidf, y_train)

y_test_pred = tag_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

with open(utils.RESOURCE_PATH['TAG_CLASSIFIER'], 'wb') as out_file:
    pickle.dump(tag_classifier, out_file)

print('4. Ranking questions with embeddings ...')

_, _, embeddings, _, embeddings_dim = utils.load_embeddings()
posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\t')

thread_embeddings_dir = utils.RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER']
os.makedirs(thread_embeddings_dir, exist_ok=True)

# Walk the per-tag groups directly: each group is exactly the rows with that
# tag, so the frame is not re-filtered (twice) for every tag.
for tag, tag_posts in posts_df.groupby("tag"):
    tag_post_ids = tag_posts.post_id

    # One row per post of this tag. Using the group's own length avoids
    # deriving the row count from per-column non-null counts, which can be
    # too small when every column contains a missing value.
    tag_vectors = np.zeros((len(tag_posts), embeddings_dim), dtype=np.float32)
    for i, title in enumerate(tag_posts['title']):
        tag_vectors[i, :] = utils.question_to_vec(title, embeddings, embeddings_dim)

    # Dump post ids and vectors to a file.
    filename = os.path.join(thread_embeddings_dir, os.path.normpath('%s.pkl' % tag))
    with open(filename, 'wb') as out_file:
        pickle.dump((tag_post_ids, tag_vectors), out_file)