from sklearn.decomposition import PCA
from top2vec import Top2Vec

# module-level PCA instance implied by the original snippet; two components
# (an assumption here) is the usual choice for plotting
pca = PCA(n_components=2)


def load_vectors():
    model = Top2Vec.load("./models/top2vec.model")
    topic_vectors = model.topic_vectors
    tweet_vectors = model.model.docvecs.vectors_docs
    # fit on the tweet vectors, then project the topic vectors with the same
    # fit so both sets land in the same reduced space
    pca_tweet_vec = pca.fit_transform(tweet_vectors)
    pca_topic_vec = pca.transform(topic_vectors)
    return pca_tweet_vec, pca_topic_vec, model
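# A minimal usage sketch of load_vectors, assuming matplotlib is installed
# and the two-component PCA above (neither is part of the original snippet):
import matplotlib.pyplot as plt

tweets_2d, topics_2d, _ = load_vectors()
plt.scatter(tweets_2d[:, 0], tweets_2d[:, 1], s=2, alpha=0.3, label="tweets")
plt.scatter(topics_2d[:, 0], topics_2d[:, 1], c="red", marker="x", label="topics")
plt.legend()
plt.show()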
def _test_topics():
    def _get_noun_phrases(text):
        pass

    model_path = os.path.join(Config.top2vec_models_dir(),
                              'top2vec_model_with_doc_ids')
    model = Top2Vec.load(model_path)
    print('Creating topics.')
    for topic_size, topic_num in zip(*model.get_topic_sizes()):
        # skip topic numbers below 200
        if topic_num < 200:
            continue
        _, doc_ids = model.search_documents_by_topic(
            topic_num, num_docs=topic_size, return_documents=False)
        topic_words = model.topic_words[topic_num]
        word_scores = model.topic_word_scores[topic_num]
        topic = Topic(topic_words, word_scores, topic_num, doc_ids)
        neg_words = topic.get_negative_terms()
data = data.dropna(subset=["pubdate"])
# data = data.loc[(data.pubdate >= 2010) & (data.pubdate <= 2020)]
data = data.sort_values("pubdate")
data = data.reset_index()
print("TOTAL RECORDS: " + str(len(data)))
documents = data["combined"].tolist()

import time

start = time.perf_counter()
model = Top2Vec(documents, speed="deep-learn", workers=8)
stop = time.perf_counter()
print(f"Runtime {stop - start:0.4f} seconds")

model.save("models/top2vec_d2v")
model = Top2Vec.load("models/top2vec_d2v")
print("Number of Topics Identified: " + str(model.get_num_topics()))

model.model.init_sims()
data = model.model.docvecs.vectors_docs

import umap

# embed the document vectors into a lower-dimensional space
umap_args = {'n_neighbors': 15, 'n_components': 5, 'metric': 'cosine'}
umap_model = umap.UMAP(**umap_args).fit(model.model.docvecs.vectors_docs)

# find dense areas of document vectors
hdbscan_args = {
    'min_cluster_size': 15,
    'metric': 'euclidean',
    'cluster_selection_method': 'eom',
}
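# The hdbscan_args above match Top2Vec's own defaults; a plausible next step
# (a sketch, not the original code) is to cluster the UMAP embedding:
import hdbscan

cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
labels = cluster.labels_  # -1 marks points outside any dense area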
from top2vec import Top2Vec

# transcriptsLocation is a pathlib.Path to the transcript folder,
# defined upstream of this snippet
all_txt_files = []
for file in transcriptsLocation.rglob("*.txt"):
    all_txt_files.append(file.name)
all_txt_files.sort()

all_docs = []
# add each transcript to a list in string format for processing
for txt_file in all_txt_files:
    with open(transcriptsLocation / txt_file, encoding="utf-8") as f:
        txt_file_as_string = f.read()
    all_docs.append(txt_file_as_string)

model = Top2Vec(all_docs, speed="fast-learn")
model.save("modelTwo")
model = Top2Vec.load("modelTwo")


def printMetaData():
    # get the total number of model topics
    print("\nThe total number of model topics is: " + str(model.get_num_topics()))
    # get_topic_sizes returns the number of documents most similar to each topic
    topic_sizes, topic_nums = model.get_topic_sizes()
    print("\nThe number of documents most similar to each topic: " + str(topic_sizes))
    print("The unique indexes of each topic are: " + str(topic_nums))


def printModelTopics():
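    # body missing from the original snippet; a plausible sketch using
    # Top2Vec's get_topics() API (an assumption, not the original code):
    topic_words, word_scores, topic_nums = model.get_topics()
    for words, num in zip(topic_words, topic_nums):
        print(f"Topic {num}: {', '.join(words[:10])}")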
from threading import Lock

from flask import Flask
from flask_cors import CORS
from top2vec import Top2Vec

# {
#     "paperTitle1": "paperContent1",
#     "paperTitle2": "paperContent2",
#     ...
# }
paper_content = {}

# a two-dimensional dictionary:
# paper_similarity["paperTitle1"]["paperTitle2"] gives the similarity
# between two papers
paper_similarity = {}

# model.documents --> content
# model.document_ids --> title
model = Top2Vec.load('model.thelibrarian')
model_lock = Lock()

# precompute the similarity of every paper to every other paper
for title in model.document_ids:
    paper_similarity[title] = {}
    doc_scores, doc_ids = model.search_documents_by_documents(
        doc_ids=[title],
        num_docs=len(model.documents) - 1,
        return_documents=False)
    for score, doc_id in zip(doc_scores, doc_ids):
        paper_similarity[title][doc_id] = score

app = Flask(__name__)
CORS(app)
paper_positions = None
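# A hypothetical route showing how the precomputed table could be served
# (the path and response shape are assumptions, not the original code):
from flask import jsonify

@app.route("/similarity/<title>")
def similarity(title):
    # scores are numpy floats; cast to float so they serialize to JSON
    return jsonify({other: float(s)
                    for other, s in paper_similarity.get(title, {}).items()})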
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import List
from top2vec import Top2Vec

app = FastAPI(
    title="Top2Vec API",
    description="Speak REST to a Top2Vec trained model.",
    version="1.0.0",
)

top2vec = Top2Vec.load("top2vec_model/top2vec_20newsgroups")


# surface Top2Vec's ValueErrors (e.g. an unknown topic number) as 404s
@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    return JSONResponse(
        status_code=404,
        content={"message": str(exc)},
    )


# response models for the endpoints
class NumTopics(BaseModel):
    num_topics: int


class TopicSizes(BaseModel):
    topic_nums: List[int]
    topic_sizes: List[int]


class Topic(BaseModel):
    ...
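# A sketch of how these response models back the endpoints (the route path
# is an assumption, not part of the original snippet):
@app.get("/topics/number", response_model=NumTopics)
async def get_number_of_topics():
    return NumTopics(num_topics=top2vec.get_num_topics())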
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel, BaseSettings  # BaseSettings is pydantic v1; v2 moved it to pydantic-settings
from typing import List
from top2vec import Top2Vec
import numpy as np


class Settings(BaseSettings):
    model_name: str = "Top2Vec API"
    model_path: str


settings = Settings()
top2vec = Top2Vec.load(settings.model_path)

app = FastAPI(
    title=settings.model_name,
    description="RESTful Top2Vec API",
    version="1.0.0",
)


@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    return JSONResponse(
        status_code=404,
        content={"message": str(exc)},
    )
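# Settings fields without defaults must come from the environment: pydantic
# matches variables to field names case-insensitively, so MODEL_PATH
# populates model_path. E.g. (module name "main" is an assumption):
#
#   MODEL_PATH=top2vec_model/top2vec_20newsgroups uvicorn main:app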
import sys, multiprocessing
from top2vec import Top2Vec
import umap, hdbscan
import logging
from joblib import dump, load

# forward gensim's progress output to stderr
logger = logging.getLogger('gensim')
logger.setLevel(logging.INFO)
sh = logging.StreamHandler(sys.stderr)
sh.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(sh)

model_file = 'top2vec.model'
model = Top2Vec.load(model_file)
wvs = model.model.wv.vectors
docvecs = model._get_document_vectors()

# sweep embedding dimensionalities and distance metrics
dims = [5, 3, 2]
metrics = ['cosine', 'euclidean']
for dim in dims:
    for metric in metrics:
        logger.info('Creating ' + str(dim) + ' dimension embedding of documents with ' + metric)
        umap_model = umap.UMAP(n_neighbors=15, n_components=dim,
                               metric=metric).fit(docvecs)
        # logger.info('Finding dense areas of documents')
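        # The commented log line suggests an HDBSCAN step followed; a sketch
        # of clustering each embedding and persisting both artifacts with the
        # joblib dump imported above (filenames are hypothetical):
        clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(umap_model.embedding_)
        dump(umap_model, f'umap_{dim}d_{metric}.joblib')
        dump(clusterer, f'hdbscan_{dim}d_{metric}.joblib')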
import os

from top2vec import Top2Vec

from config import Config
from risk_detection.preprocessing.report_parser import (
    report_info_from_risk_path)
from risk_detection.utils import (get_company_industry_mapping,
                                  get_sik_industry_name_mapping,
                                  get_risk_filenames)


def _get_noun_phrases(txt):
    pass


model_path = os.path.join(Config.top2vec_models_dir(),
                          'top2vec_model_with_doc_ids')
model = Top2Vec.load(model_path)

cik_sic_df = get_company_industry_mapping()
sic_name_df = get_sik_industry_name_mapping()
risk_files = get_risk_filenames()

# TODO: Add diversification detection code
for risk_file in risk_files:
    report_info = report_info_from_risk_path(risk_file)
    doc_id = report_info.get_document_id()
    topic = model.get_documents_topics(doc_ids=(doc_id, ))
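    # get_documents_topics returns parallel arrays; a sketch (not the
    # original code) of unpacking the result for this one document:
    topic_nums, topic_scores, topic_words, word_scores = topic
    print(doc_id, topic_nums[0], topic_words[0][:10])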
def __post_init__(self):
    # dataclass hook: runs after the generated __init__, once model_path is set
    self.model = Top2Vec.load(self.model_path)
    self.vectors = None
    self.topic_labels = None
def load_model():
    return Top2Vec.load("data/doc2vec_production")
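# Loading a saved Top2Vec model is comparatively slow, so callers usually
# load once and reuse; a sketch using functools.lru_cache (an addition,
# not part of the original snippet):
from functools import lru_cache

@lru_cache(maxsize=1)
def load_model_cached():
    return load_model()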
import tempfile

from top2vec import Top2Vec


# pytest-style test; the top2vec_model argument is supplied by a fixture
def test_similar_words_index(top2vec_model):
    temp = tempfile.NamedTemporaryFile(mode='w+b')
    top2vec_model.save(temp.name)
    Top2Vec.load(temp.name)
    temp.close()
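# Note: on Windows, NamedTemporaryFile keeps the file open and locked, so
# save/load re-opening it by name can fail there; delete=False plus an
# explicit os.remove is the more portable pattern.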