def train(
    load: Path = typer.Argument(
        ...,
        help="Path to a file containing documents for training.",
        exists=True),
    save: Path = typer.Argument(
        ...,
        help="Output path for the model.",
        callback=check_parent_exists),
    embedding_model: Model = typer.Option(
        Model.DOC2VEC,
        help="Which model is used to generate the document and word embeddings.",
    ),
    training_speed: Speed = typer.Option(
        Speed.LEARN, help="How fast the model takes to train"),
    workers: int = typer.Option(
        128, help="The amount of worker threads to be used in training the model"),
    min_count: int = typer.Option(
        50, help="Ignores all words with total frequency lower than this"),
    noun_phrases: bool = typer.Option(True, help="Use noun-phrases for training."),
):
    """Train the Top2Vec algorithm."""
    typer.echo("Loading data...")
    docs = load_json(load) if load.suffix == ".json" else load_csv(load, noun_phrases)
    typer.echo(f"Loaded {len(docs)} documents")

    speed = training_speed.value
    model = embedding_model.value
    tokenizer = tokenize if noun_phrases else None

    if model == "doc2vec":
        typer.echo(
            f"Training the model with the following parameters: "
            f"{model=}, {speed=}, {workers=}, {min_count=}, {noun_phrases=}"
        )
        t2v = Top2Vec(
            documents=docs,
            embedding_model=model,
            speed=speed,
            workers=workers,
            tokenizer=tokenizer,
            min_count=min_count,
        )
    else:
        typer.echo(
            f"Training the model with the following parameters: {model=}, {noun_phrases=}"
        )
        t2v = Top2Vec(
            documents=docs,
            embedding_model=model,
            tokenizer=tokenizer,
            min_count=min_count,
        )

    typer.echo(f"Saving the model to {save}")
    t2v.save(save)
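# Example invocation (illustrative only; assumes this command is registered on a
# typer app exposed through a `cli.py` entry point, and that the Model/Speed enum
# values are "doc2vec" and "learn"):
#
#   python cli.py train data/documents.json models/top2vec.model \
#       --embedding-model doc2vec --training-speed learn \
#       --workers 16 --min-count 50 --noun-phrases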
def load_vectors():
    # `pca` is assumed to be an sklearn PCA instance defined at module level.
    model = Top2Vec.load("./models/top2vec.model")
    topic_vectors = model.topic_vectors
    tweet_vectors = model.model.docvecs.vectors_docs
    pca_tweet_vec = pca.fit_transform(tweet_vectors)
    pca_topic_vec = pca.fit_transform(topic_vectors)
    return pca_tweet_vec, pca_topic_vec, model
def top2vec_(df):
    import re
    import string

    def custom_preprocessing(x):
        x = x.lower()
        # strip URLs
        x = re.sub(
            r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""",
            "",
            x,
        )
        # strip punctuation and digits
        for p in list(string.punctuation) + list(string.digits):
            x = x.replace(p, "")
        # strip non-breaking spaces
        x = x.replace('\xa0', '')
        # keep ASCII-only tokens
        x = [
            a.encode('ascii', 'ignore').decode('ascii')
            for a in x.split() if a
        ]
        return x

    docs = df["tweet"].apply(custom_preprocessing).values
    model = Top2Vec(docs, speed="deep-learn", workers=4)
    model.save("./top2vec.model")

    topic_sizes, topic_nums = model.get_topic_sizes()
    print(f'vocab learned: {len(model.model.wv.vocab.keys())}')
    print(topic_sizes)
    print(topic_nums)
def run_yearly():
    """
    Creates yearly topic models and saves them to disk.
    """
    print(f'Reading files from {Config.risk_dir()}')
    corpus = get_corpus()
    yearly_doc_ids = defaultdict(list)
    for k in corpus.keys():
        yearly_doc_ids[ReportInfo.from_doc_id(k).start_date.year].append(k)
    print(f'Read {len(corpus)} files.')

    base_dir = os.path.join(Config.top2vec_models_dir(), 'yearly_models')
    create_dir_if_not_exists(base_dir)
    print(f'Storing yearly models to {base_dir}.')

    for year, doc_ids in tqdm(yearly_doc_ids.items(), total=len(yearly_doc_ids)):
        yearly_corpus = [corpus[d] for d in doc_ids]
        try:
            model = Top2Vec(documents=yearly_corpus, document_ids=doc_ids,
                            tokenizer=RiskSectionCleaner(), keep_documents=False,
                            speed='learn', workers=24)
            model.save(os.path.join(base_dir, f'{year}_topics'))
        except Exception:
            print(f'Could not create topic model for year: {year}')
            continue
def create_topic(self):
    corpus = self.get_corpus()
    doc_ids, docs = list(zip(*corpus.items()))
    return Top2Vec(docs, document_ids=doc_ids, speed='learn',
                   tokenizer=RiskSectionCleaner(), workers=16)
def main():
    st.set_option('deprecation.showfileUploaderEncoding', False)

    @st.cache(suppress_st_warning=True)
    def load_data(uploaded_file):
        df = pd.read_csv(uploaded_file)
        return df

    uploaded_file = st.file_uploader('Upload CSV file to begin', type='csv')
    # if a file is uploaded, show the left bar
    if uploaded_file is not None:
        df = load_data(uploaded_file)
        st.sidebar.subheader("Text column to analyse")
        st_ms = st.sidebar.selectbox("Select Text Columns To Analyse",
                                     (df.columns.tolist()))

        import nltk
        import top2vec
        from top2vec import Top2Vec

        # Initialise an empty dataframe, convert the text into strings and
        # append it to the new column.
        d1 = pd.DataFrame()
        d1['text'] = ""
        d1['text'] = df[st_ms]
        d1['text'] = d1['text'].astype(str)

        for x in range(len(d1)):
            d1.text.iloc[x] = d1.text.iloc[x].lower()  # to lower case
            d1.text.iloc[x] = re.sub(r"@\S+", "", d1.text.iloc[x])  # remove mentions
            d1.text.iloc[x] = re.sub(r"http\S+", "", d1.text.iloc[x])  # remove hyperlinks
            d1.text.iloc[x] = ''.join([
                word for word in d1.text.iloc[x] if not word.isdigit()
            ])  # remove numbers
            d1.text.iloc[x] = nltk.word_tokenize(d1.text.iloc[x])  # tokenising
            d1.text.iloc[x] = [
                i for i in d1.text.iloc[x] if i not in english_stop_words
            ]  # remove English stop words
            d1.text.iloc[x] = [
                i for i in d1.text.iloc[x] if i not in malay_stop_words
            ]  # remove Malay stop words
            d1.text.iloc[x] = [i for i in d1.text.iloc[x] if len(i) > 2]  # drop words that are too short
            print('Completed line : ', x)

        # Initialise the Top2Vec model and fit the text.
        # model.build_vocab(df_list, update=False)
        model = Top2Vec(documents=d1['text'], speed="learn", workers=10)
        topic_sizes, topic_nums = model.get_topic_sizes()
        for topic in topic_nums:
            st.pyplot(model.generate_topic_wordcloud(topic))
def top2vec_model():
    newsgroups_train = fetch_20newsgroups(subset='all',
                                          remove=('headers', 'footers', 'quotes'))
    top2vec = Top2Vec(newsgroups_train.data[0:1000], speed="fast-learn", workers=8)
    return top2vec
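# Illustrative only: assuming top2vec_model is registered as a pytest fixture
# (e.g. decorated with @pytest.fixture elsewhere in the test module), a test
# could consume it like this. The test name and assertions are assumptions.
def test_model_has_topics(top2vec_model):
    topic_sizes, topic_nums = top2vec_model.get_topic_sizes()
    # every discovered topic should be reported exactly once
    assert top2vec_model.get_num_topics() == len(topic_nums)
    # no more topic assignments than there are training documents
    assert sum(topic_sizes) <= 1000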
def main():
    if not sys.argv[1:]:
        print("need to give dataset path!")
        exit(1)
    data_file = sys.argv[1:][0]
    print(data_file)
    save_file = "top2vec.model"

    docs, doc_ids = parser(data_file)
    model = Top2Vec(documents=docs, document_ids=doc_ids, min_count=20,
                    keep_documents=False, use_corpus_file=True,
                    workers=multiprocessing.cpu_count(), verbose=True)
    model.save(save_file)
def _test_topics():
    def _get_noun_phrases(text):
        pass

    model_path = os.path.join(Config.top2vec_models_dir(),
                              'top2vec_model_with_doc_ids')
    model = Top2Vec.load(model_path)

    print('Creating topics.')
    for topic_size, topic_num in zip(*model.get_topic_sizes()):
        if topic_num < 200:
            continue
        _, doc_ids = model.search_documents_by_topic(topic_num,
                                                     num_docs=topic_size,
                                                     return_documents=False)
        topic_words = model.topic_words[topic_num]
        word_scores = model.topic_word_scores[topic_num]
        topic = Topic(topic_words, word_scores, topic_num, doc_ids)
        neg_words = topic.get_negative_terms()
        a = 1
def train_top2vec(**kwargs):
    print('Building corpus')
    t2v_corpus = build_top2vec_corpus(
        build_spans(sentences, keywords, **kwargs))
    print(f'corpus length: {len(t2v_corpus)}')

    print('Training...')
    t2v = Top2Vec(
        documents=list(t2v_corpus.values()),
        document_ids=list(t2v_corpus.keys()),
        min_count=20,
        speed='learn',
        workers=multiprocessing.cpu_count(),
        embedding_model_path='../data/models/word2vec_270422/keyedvectors.txt')

    num_topics = t2v.get_num_topics()
    print(f'Found {num_topics} topics')
    return t2v
def run_all():
    """
    Creates a topic model for the entire corpus and saves it to disk.
    """
    print(f'Reading files from {Config.risk_dir()}')
    corpus = get_corpus()
    print(f'Read {len(corpus)} files.')

    doc_ids, docs = list(zip(*corpus.items()))
    model = Top2Vec(docs, document_ids=doc_ids, tokenizer=RiskSectionCleaner(),
                    keep_documents=False, speed='learn', workers=24)

    model_path = os.path.join(Config.top2vec_models_dir(), 'top2vec_model_phrases')
    model.save(model_path)
    print(f'Saved model to {model_path}')
import pytest
from top2vec import Top2Vec
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# get 20 newsgroups data
newsgroups_train = fetch_20newsgroups(subset='all',
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_documents = newsgroups_train.data[0:2000]

# train top2vec model without doc_ids provided
top2vec = Top2Vec(documents=newsgroups_documents, speed="fast-learn", workers=8)

# train top2vec model with doc_ids provided
doc_ids = [str(num) for num in range(0, len(newsgroups_documents))]
top2vec_docids = Top2Vec(documents=newsgroups_documents, document_ids=doc_ids,
                         speed="fast-learn", workers=8)

# train top2vec model without saving documents
top2vec_no_docs = Top2Vec(documents=newsgroups_documents, keep_documents=False,
                          speed="fast-learn", workers=8)

# train top2vec model with corpus_file
top2vec_corpus_file = Top2Vec(documents=newsgroups_documents,
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import List
from top2vec import Top2Vec

app = FastAPI(title="Top2Vec API",
              description="Speak REST to a Top2Vec trained model.",
              version="1.0.0",
              )

top2vec = Top2Vec.load("top2vec_model/top2vec_20newsgroups")


@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    return JSONResponse(
        status_code=404,
        content={"message": str(exc)},
    )


class NumTopics(BaseModel):
    num_topics: int


class TopicSizes(BaseModel):
    topic_nums: List[int]
    topic_sizes: List[int]


class Topic(BaseModel):
count_by_customer = customers_reviews.groupby(['externalHandle'])['channel'] \
    .count().reset_index(name='count')

plt.figure(figsize=(8, 4))
x1 = count_by_customer['count']
plt.hist(x1, color='#7F0442', label='Customers', bins=30)
plt.title("Distribution of Number of Reviews per Customer",
          fontweight='bold', fontsize=14, family='monospace')
plt.xlabel("Number of Reviews")
plt.legend()
plt.savefig('images/distribution_per_customer.png')
plt.show()

customers_reviews['comment'][0]

## Train the model
customers_reviews_list = list(customers_reviews["comment"])
model = Top2Vec(customers_reviews_list, speed="deep-learn", workers=8)
"""
documents: Input corpus, should be a list of strings.

speed: This parameter determines how long the model takes to train. The
    'fast-learn' option is the fastest and will generate the lowest quality
    vectors. The 'learn' option will learn better quality vectors but takes
    longer to train. The 'deep-learn' option will learn the best quality
    vectors but will take significant time to train.

workers: The amount of worker threads to be used in training the model. A
    larger amount will lead to faster training.
"""

## Get the number of topics
model.get_num_topics()

## Get the topics
topic_words, word_scores, topic_nums = model.get_topics(117)

## View the topics
topic_words[1]
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel, BaseSettings
from typing import List
from top2vec import Top2Vec
import numpy as np


class Settings(BaseSettings):
    model_name: str = "Top2Vec API"
    model_path: str


settings = Settings()
top2vec = Top2Vec.load(settings.model_path)

app = FastAPI(
    title=settings.model_name,
    description="RESTful Top2Vec API",
    version="1.0.0",
)


@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    return JSONResponse(
        status_code=404,
        content={"message": str(exc)},
    )
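# Illustrative sketch, not part of the original file: an endpoint wiring a
# Pydantic response model to the loaded Top2Vec model could look like this.
# The route path, function name, and NumTopics model are assumptions here.
class NumTopics(BaseModel):
    num_topics: int


@app.get("/topics/number", response_model=NumTopics)
async def get_number_of_topics():
    # Top2Vec.get_num_topics() returns how many topics were found in the corpus.
    return NumTopics(num_topics=top2vec.get_num_topics())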
import pytest
from top2vec import Top2Vec
from sklearn.datasets import fetch_20newsgroups
import numpy as np

# get 20 newsgroups data
newsgroups_train = fetch_20newsgroups(subset='all',
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_documents = newsgroups_train.data[0:2000]

# train top2vec model without doc_ids provided
top2vec = Top2Vec(documents=newsgroups_documents, speed="fast-learn", workers=8)

# train top2vec model with doc_ids provided
doc_ids = [str(num) for num in range(0, len(newsgroups_documents))]
top2vec_docids = Top2Vec(documents=newsgroups_documents, document_ids=doc_ids,
                         speed="fast-learn", workers=8)

# train top2vec model without saving documents
top2vec_no_docs = Top2Vec(documents=newsgroups_documents, keep_documents=False,
                          speed="fast-learn", workers=8)

# train top2vec model with corpus_file
top2vec_corpus_file = Top2Vec(documents=newsgroups_documents,
                              use_corpus_file=True,
def load_model():
    return Top2Vec.load("data/doc2vec_production")
import pickle
import random

with open('sentences.bin', 'rb') as f:
    sentences = pickle.load(f)

# Commented out IPython magic to ensure Python compatibility.
# import tensorflow as tf
import tensorflow_datasets as tfds
from tensorboard.plugins import projector

# load tensorboard with magics
# %tensorflow_version 2.x
# %load_ext tensorboard

documents = random.choices(sentences, k=100000)
documents = [d.lower() for d in documents]

from top2vec import Top2Vec

model = Top2Vec(documents, embedding_model='universal-sentence-encoder', workers=4)
model.hierarchical_topic_reduction(100)
# model.get_topic_hierarchy()
# topic_sizes, topic_nums = model.get_topic_sizes()

print("Original Number of topics: ", model.get_num_topics())
print("Reduced number of topics: ", model.get_num_topics(reduced=True))

# word similarity
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(
    keywords=["swimming"], num_topics=3, reduced=False)  # swimming, cleanliness
# keywords_neg=stop,
for topic in topic_nums:
    model.generate_topic_wordcloud(topic)

# reduce the number of topics and do hierarchical clustering
# topic_words, word_scores, topic_nums = model.get_topics(reduced=True)
# topic_nums = random.choices(topic_nums, k=20)
def main():
    st.set_option('deprecation.showfileUploaderEncoding', False)
    st.title("HATI.AI")
    image = Image.open('macroview.jpg')
    # st.image(image, use_column_width=False)
    st.sidebar.image(image)
    st.sidebar.title("Hati.Ai Web App")

    menu = ["Login", "SignUp"]
    choice = st.sidebar.selectbox("Menu", menu)

    if choice == "Login":
        st.subheader("Login Section")
        username = st.sidebar.text_input("User Name")
        password = st.sidebar.text_input("Password", type='password')
        if st.sidebar.checkbox("Login"):
            # if password == '12345':
            create_usertable()
            hashed_pswd = make_hashes(password)
            result = login_user(username, check_hashes(password, hashed_pswd))
            if result:
                st.success("Logged In as {}".format(username))

                def process_text(text):
                    processed_data = []
                    # Make all the strings lowercase and remove non-alphabetic characters
                    # text = re.sub('[^A-Za-z]', ' ', text.lower())
                    # Tokenize the text, i.e. separate every sentence into a list of words.
                    # Since the text is already split into sentences there is no need to call sent_tokenize.
                    tokenized_text = word_tokenize(text)
                    # Append the result to a new list called processed_data.
                    processed_data.append(tokenized_text)
                    # Remember, this final output is a list of words.
                    return processed_data

                @st.cache(suppress_st_warning=True)
                def load_data(uploaded_file):
                    df = pd.read_csv(uploaded_file)
                    return df

                st.sidebar.subheader("Choose What Do You Want To Do")
                classifier = st.sidebar.selectbox(" ", ("Find new topics automatically",
                                                        "POWER BI Dashboard",
                                                        "Interact with our chatbot"))

                if classifier == 'POWER BI Dashboard':
                    import streamlit.components.v1 as components
                    from urllib.request import urlopen
                    html = urlopen(
                        "https://app.powerbi.com/view?r=eyJrIjoiZTA4NWU4MjYtOTk3Yi00N2ZhLTgwZWQtZWFhMzNkNDk1Zjk3IiwidCI6Ijk5NmQwYTI3LWUwOGQtNDU1Ny05OWJlLTY3ZmQ2Yjk3OTA0NCIsImMiOjEwfQ%3D%3D&pageName=ReportSection06db5928b6af61b2868f"
                    ).read()
                    # components.html(html, width=None, height=600, scrolling=True)
                    st.markdown("""
                        <iframe width="900" height="606" src="https://app.powerbi.com/view?r=eyJrIjoiZTA4NWU4MjYtOTk3Yi00N2ZhLTgwZWQtZWFhMzNkNDk1Zjk3IiwidCI6Ijk5NmQwYTI3LWUwOGQtNDU1Ny05OWJlLTY3ZmQ2Yjk3OTA0NCIsImMiOjEwfQ%3D%3D&pageName=ReportSection06db5928b6af61b2868f" frameborder="0" style="border:0" allowfullscreen></iframe>
                        """, unsafe_allow_html=True)

                if classifier == 'Interact with our chatbot':
                    import pickle
                    with open('tnb_topic_classifier_svm', 'rb') as training_model:
                        topic_model = pickle.load(training_model)
                    import malaya
                    model = malaya.sentiment.transformer(model='albert', size='base')
                    # from src import model
                    # malay_bert = model.BertModel()
                    # eng_flair = model.Flair()
                    # eng_vader = model.Vader()
                    test = pd.DataFrame()
                    test['Positive'] = ''
                    test['Neutral'] = ''
                    test['Negative'] = ''

                    st.title("Sentiment Analyzer")
                    message = st.text_area("Enter Text", "Type Here ..")
                    if st.button("Analyze"):
                        with st.spinner("Analyzing the text …"):
                            result = model.predict_proba([message])
                            # result = malay_bert.predict(message)
                            message = [message]
                            topic = topic_model.predict(message)
                            # output = "Result is: Positive:" + str(result[0]) + "Neutral:" + str(result[1]) + "Negative:" + str(result[2]) + "topic is: " + str(topic)
                            output = "result is:" + str(result) + "topic is: " + str(topic)
                            st.write(output)
                    else:
                        st.warning("Not sure! Try to add some more words")

                from stop_words import get_stop_words

                if classifier == 'Find new topics automatically':
                    uploaded_file = st.file_uploader('Upload CSV file to begin', type='csv')
                    # if a file is uploaded, show the left bar
                    if uploaded_file is not None:
                        df = load_data(uploaded_file)
                        if st.sidebar.checkbox("Show raw data", False):
                            st.subheader("Uploaded Data Set")
                            st.write(df)
                        st.sidebar.subheader("Text column to analyse")
                        st_ms = st.sidebar.selectbox("Select Text Columns To Analyse",
                                                     (df.columns.tolist()))
                        df_list = list(df)

                        import top2vec
                        from top2vec import Top2Vec

                        # Initialise an empty dataframe, convert the text into strings
                        # and append it to the new column.
                        d1 = pd.DataFrame()
                        d1['text'] = ""
                        d1['text'] = df[st_ms]
                        d1['text'] = d1['text'].astype(str)

                        # Initialise the Top2Vec model and fit the text.
                        # model.build_vocab(df_list, update=False)
                        model = Top2Vec(documents=d1['text'], speed="learn", workers=10)
                        topic_sizes, topic_nums = model.get_topic_sizes()
                        for topic in topic_nums:
                            st.pyplot(model.generate_topic_wordcloud(topic))
                            # Display the generated image:
            else:
                st.warning("Incorrect Username/Password")

    elif choice == "SignUp":
        st.subheader("Create New Account")
        new_user = st.text_input("Username")
        new_password = st.text_input("Password", type='password')

        if st.button("Signup"):
            create_usertable()
            add_userdata(new_user, make_hashes(new_password))
            st.success("You have successfully created a valid Account")
            st.info("Go to Login Menu to login")
all_txt_files = []
# add each transcript filename to a list
for file in transcriptsLocation.rglob("*.txt"):
    all_txt_files.append(file.name)
all_txt_files.sort()

all_docs = []
# add each transcript to a list as a string for processing
for txt_file in all_txt_files:
    with open(transcriptsLocation / txt_file, encoding="utf-8") as f:
        txt_file_as_string = f.read()
    all_docs.append(txt_file_as_string)

model = Top2Vec(all_docs, speed="fast-learn")
model.save("modelTwo")
model = Top2Vec.load("modelTwo")


def printMetaData():
    # get the total number of model topics
    print("\nThe total number of model topics is: " + str(model.get_num_topics()))
    # This returns the number of documents most similar to each topic.
    topic_sizes, topic_nums = model.get_topic_sizes()
    print("\nThe number of documents most similar to each topic is: " + str(topic_sizes))
    print("The unique indexes of each topic are: " + str(topic_nums))
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from top2vec import Top2Vec

content = pd.read_csv('../data/content-clean.csv')
documents = content['text'].values.tolist()

model = Top2Vec(documents, speed='learn', workers=8)
model.save("content-model.bin")
def __init__(self, *args, **kwargs):
    """Initialize Top2Vec with the given args."""
    self.__model__ = Top2Vec(*args, **kwargs)
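# Illustrative sketch, not from the original class: a wrapper like this would
# typically expose the underlying Top2Vec API through thin delegating methods;
# the method chosen here is an assumption.
def get_topics(self, *args, **kwargs):
    """Proxy straight through to the wrapped Top2Vec instance."""
    return self.__model__.get_topics(*args, **kwargs)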
from sklearn.datasets import fetch_20newsgroups
from top2vec import Top2Vec
import numpy as np

news_groups_train = fetch_20newsgroups(subset='all',
                                       remove=('headers', 'footers', 'quotes'))
# news_groups_documents = news_groups_train[]

model = Top2Vec(documents=news_groups_train.data[0:2000], use_corpus_file=True,
                speed='fast-learn', workers=8)

print("Number of Topics:- ", model.get_num_topics())

# Get Topic Sizes
topic_sizes, topic_nums = model.get_topic_sizes()
for i in range(len(topic_nums)):
    print(str(topic_nums[i]) + ":- " + str(topic_sizes[i]))

# Get Topics
topic_words, word_scores, topic_nums = model.get_topics()
for i in range(len(topic_nums)):
    print(str(topic_nums[i]) + ": " + str(topic_sizes[i]) + ": " + str(topic_words[i]))

# topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["medicine"], num_topics=5)
# for i in range(topic_nums):
#     print

print("Done!!!!")
import pandas as pd
from top2vec import Top2Vec

June = pd.read_excel('./dataset/NewsResult_20200601-20200630.xlsx')
July = pd.read_excel('./dataset/NewsResult_20200701-20200731.xlsx')
August = pd.read_excel('./dataset/NewsResult_20200801-20200831.xlsx')
September = pd.read_excel('./dataset/NewsResult_20200901-20200930.xlsx')
October = pd.read_excel('./dataset/NewsResult_20201001-20201029.xlsx')

# Group the keyword column ('키워드', "keywords") by date ('일자', "date") and
# join each day's keywords into one comma-separated document.
daylist_June = June[['일자', '키워드']].groupby('일자').apply(lambda d: ",".join(d['키워드']))
daylist_July = July[['일자', '키워드']].groupby('일자').apply(lambda d: ",".join(d['키워드']))
daylist_August = August[['일자', '키워드']].groupby('일자').apply(lambda d: ",".join(d['키워드']))
daylist_September = September[['일자', '키워드']].groupby('일자').apply(lambda d: ",".join(d['키워드']))
daylist_October = October[['일자', '키워드']].groupby('일자').apply(lambda d: ",".join(d['키워드']))

model = Top2Vec(documents=daylist_June[20200601].split(','), speed="learn", workers=5)

topic_sizes, topic_nums = model.get_topic_sizes()
print(topic_sizes)
print(topic_nums)

model.get_topics()[0]
model.generate_topic_wordcloud(0)
import os

from top2vec import Top2Vec

from config import Config
from risk_detection.preprocessing.report_parser import (
    report_info_from_risk_path)
from risk_detection.utils import (get_company_industry_mapping,
                                  get_sik_industry_name_mapping,
                                  get_risk_filenames)


def _get_noun_phrases(txt):
    pass


model_path = os.path.join(Config.top2vec_models_dir(),
                          'top2vec_model_with_doc_ids')
model = Top2Vec.load(model_path)

cik_sic_df = get_company_industry_mapping()
sic_name_df = get_sik_industry_name_mapping()
risk_files = get_risk_filenames()

# TODO: Add diversification detection code
for risk_file in risk_files:
    report_info = report_info_from_risk_path(risk_file)
    doc_id = report_info.get_document_id()
    topic = model.get_documents_topics(doc_ids=(doc_id, ))
def __post_init__(self):
    self.model = Top2Vec.load(self.model_path)
    self.vectors = None
    self.topic_labels = None
from threading import Lock

from flask import Flask
from flask_cors import CORS
from top2vec import Top2Vec

# {
#     "paperTitle1": "paperContent1",
#     "paperTitle2": "paperContent2",
#     ...
# }
paper_content = {}

# a two-dimensional dictionary:
# paper_similarity["paperTitle1"]["paperTitle2"] gives the similarity
# between two papers
paper_similarity = {}

# model.documents --> content
# model.document_ids --> title
model = Top2Vec.load('model.thelibrarian')
model_lock = Lock()

for title in model.document_ids:
    paper_similarity[title] = {}
    doc_scores, doc_ids = model.search_documents_by_documents(
        doc_ids=[title],
        num_docs=len(model.documents) - 1,
        return_documents=False)
    for score, doc_id in zip(doc_scores, doc_ids):
        paper_similarity[title][doc_id] = score

app = Flask(__name__)
CORS(app)

paper_positions = None
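# Illustrative only: a route of this shape could serve the precomputed
# similarity scores; the path, function name, and response layout are
# assumptions, not part of the original app.
from flask import jsonify


@app.route('/similarity/<title>')
def get_similarity(title):
    # Return the similarity of `title` to every other paper, or an error
    # payload if the title is unknown.
    if title not in paper_similarity:
        return jsonify({"message": f"Unknown paper: {title}"}), 404
    scores = {doc_id: float(score)
              for doc_id, score in paper_similarity[title].items()}
    return jsonify(scores)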
import sys, multiprocessing
from top2vec import Top2Vec
import umap, hdbscan
import logging
from joblib import dump, load

logger = logging.getLogger('gensim')
logger.setLevel(logging.INFO)
sh = logging.StreamHandler(sys.stderr)
sh.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(sh)

model_file = 'top2vec.model'
model = Top2Vec.load(model_file)

wvs = model.model.wv.vectors
docvecs = model._get_document_vectors()

dims = [5, 3, 2]
metrics = ['cosine', 'euclidean']
for dim in dims:
    for metric in metrics:
        logger.info('Creating ' + str(dim) +
                    ' dimension embedding of documents with ' + metric)
        umap_model = umap.UMAP(n_neighbors=15, n_components=dim,
                               metric=metric).fit(docvecs)
        # logger.info('Finding dense areas of documents')
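        # Sketch of the step the commented log line points to (not in the
        # original): cluster the UMAP embedding with HDBSCAN and persist both
        # models with joblib. The parameter values mirror Top2Vec's usual
        # defaults and the output filenames are assumptions.
        cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                                  metric='euclidean',
                                  cluster_selection_method='eom'
                                  ).fit(umap_model.embedding_)
        dump(umap_model, f'umap_{dim}d_{metric}.joblib')
        dump(cluster, f'hdbscan_{dim}d_{metric}.joblib')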
data["pubdate"] = data["pubdate"].astype(str).str[0:4] data = data[pd.to_numeric(data["pubdate"], errors="coerce").notnull()] data = data.dropna(subset=["pubdate"]) data["pubdate"] = data["pubdate"].astype(float) data = data.dropna(subset=["pubdate"]) #data = data.loc[(data.pubdate >= 2010) & (data.pubdate <=2020)] data = data.sort_values("pubdate") data = data.reset_index() print("TOTAL RECORDS:" + str(len(data))) documents = data["combined"].tolist() import time start = time.perf_counter() model = Top2Vec(documents, speed="deep-learn", workers=8) stop = time.perf_counter() print(f"Runtime {start - stop:0.4f} seconds") model.save("models/top2vec_d2v") model = Top2Vec.load("models/top2vec_d2v") print("Number of Topics Identified:" + str(model.get_num_topics())) model.model.init_sims() data = model.model.docvecs.vectors_docs umap_args = {'n_neighbors': 15, 'n_components': 5, 'metric': 'cosine'} umap_model = umap.UMAP(**umap_args).fit(model.model.docvecs.vectors_docs) # find dense areas of document vectors
def test_similar_words_index(top2vec_model):
    temp = tempfile.NamedTemporaryFile(mode='w+b')
    top2vec_model.save(temp.name)
    Top2Vec.load(temp.name)
    temp.close()