Example no. 1
from sklearn.decomposition import PCA
from top2vec import Top2Vec

# `pca` is never defined in the original snippet; a 2-component PCA is assumed.
pca = PCA(n_components=2)


def load_vectors():
    model = Top2Vec.load("./models/top2vec.model")
    topic_vectors = model.topic_vectors
    tweet_vectors = model.model.docvecs.vectors_docs
    pca_tweet_vec = pca.fit_transform(tweet_vectors)
    pca_topic_vec = pca.fit_transform(topic_vectors)
    return pca_tweet_vec, pca_topic_vec, model
def _test_topics():
    def _get_noun_phrases(text):
        # Noun-phrase extraction is stubbed out in this snippet.
        pass

    model_path = os.path.join(Config.top2vec_models_dir(),
                              'top2vec_model_with_doc_ids')
    model = Top2Vec.load(model_path)
    print('Creating topics.')
    for topic_size, topic_num in zip(*model.get_topic_sizes()):
        # Only topics numbered 200 and above are processed.
        if topic_num < 200:
            continue
        # Collect the ids of every document assigned to this topic.
        _, doc_ids = model.search_documents_by_topic(topic_num,
                                                     num_docs=topic_size,
                                                     return_documents=False)
        topic_words = model.topic_words[topic_num]
        word_scores = model.topic_word_scores[topic_num]
        topic = Topic(topic_words, word_scores, topic_num, doc_ids)
        neg_words = topic.get_negative_terms()
        a = 1  # no-op; likely a leftover debugger breakpoint anchor
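A minimal usage sketch for the loader above (it assumes the model file exists and that `pca` reduces to two components, as assumed in the snippet; the plotting code is not part of the original):

import matplotlib.pyplot as plt

pca_tweet_vec, pca_topic_vec, _ = load_vectors()
plt.scatter(pca_tweet_vec[:, 0], pca_tweet_vec[:, 1], s=2, label="tweets")
plt.scatter(pca_topic_vec[:, 0], pca_topic_vec[:, 1], marker="x", label="topics")
plt.legend()
plt.show()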
Example no. 3
# `data` is a pandas DataFrame loaded earlier in the original script.
data = data.dropna(subset=["pubdate"])
#data = data.loc[(data.pubdate >= 2010) & (data.pubdate <=2020)]
data = data.sort_values("pubdate")
data = data.reset_index()
print("TOTAL RECORDS:" + str(len(data)))

documents = data["combined"].tolist()
import time

start = time.perf_counter()

model = Top2Vec(documents, speed="deep-learn", workers=8)
stop = time.perf_counter()
print(f"Runtime {start - stop:0.4f} seconds")
model.save("models/top2vec_d2v")
model = Top2Vec.load("models/top2vec_d2v")

print("Number of Topics Identified:" + str(model.get_num_topics()))
model.model.init_sims()
data = model.model.docvecs.vectors_docs

umap_args = {'n_neighbors': 15, 'n_components': 5, 'metric': 'cosine'}

umap_model = umap.UMAP(**umap_args).fit(model.model.docvecs.vectors_docs)

# find dense areas of document vectors

hdbscan_args = {
    'min_cluster_size': 15,
    'metric': 'euclidean',
    'cluster_selection_method': 'eom'
}
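The example is cut off at this point; a minimal sketch of the clustering step the comment above refers to, assuming `import hdbscan` and the `umap_model` fitted earlier, might look like this:

cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)

# cluster.labels_ holds one label per document; -1 marks noise points
print("Clusters found:", cluster.labels_.max() + 1)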
Example no. 4
# `transcriptsLocation` (a pathlib.Path) and `all_txt_files` (a list) are
# assumed to be defined earlier in the original script.
for file in transcriptsLocation.rglob("*.txt"):
    all_txt_files.append(file.name)

all_txt_files.sort()

all_docs = []
# adds each transcript to a list in string format for processing
for txt_file in all_txt_files:
    with open(transcriptsLocation / txt_file, encoding="utf-8") as f:
        txt_file_as_string = f.read()
    all_docs.append(txt_file_as_string)

model = Top2Vec(all_docs, speed="fast-learn")
model.save("modelTwo")

model = Top2Vec.load("modelTwo")


def printMetaData():
    # get total number of model topics
    print("\nThe total number of model topics are: " +
          str(model.get_num_topics()))

    # This will return the number of documents most similar to each topic.
    topic_sizes, topic_nums = model.get_topic_sizes()
    print("\nThe number of documents most similar to each topic are: " +
          str(topic_sizes))
    print("The unique indexes of each topic are: " + str(topic_nums))


def printModelTopics():
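    # (The original snippet is truncated here. A plausible body, using
    # Top2Vec's get_topics(), is sketched below as an assumption.)
    topic_words, word_scores, topic_nums = model.get_topics()
    for words, num in zip(topic_words, topic_nums):
        print(f"Topic {num}: {', '.join(words[:10])}")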
Example no. 5
# Imports assumed by this snippet (Lock is taken to be threading.Lock):
from threading import Lock

from flask import Flask
from flask_cors import CORS
from top2vec import Top2Vec

# {
#   "paperTitle1": "paperContent1",
#   "paperTitle2": "paperContent2",
#   ...
# }
paper_content = {}

# a two dimensional dictionary
# paper_similarity["paperTitle1"]["paperTitle2"] gives the similarity
# between two papers
paper_similarity = {}

# model.documents --> content
# model.document_ids --> title

model = Top2Vec.load('model.thelibrarian')
model_lock = Lock()

for title in model.document_ids:
    paper_similarity[title] = {}
    doc_scores, doc_ids = model.search_documents_by_documents(
        doc_ids=[title],
        num_docs=len(model.documents) - 1,
        return_documents=False)
    for score, doc_id in zip(doc_scores, doc_ids):
        paper_similarity[title][doc_id] = score

app = Flask(__name__)
CORS(app)

paper_positions = None
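The snippet ends right after the Flask setup; a hypothetical route serving the `paper_similarity` map built above (the endpoint name, response shape, and result limit are assumptions, not part of the original) could look like:

from flask import jsonify


@app.route("/similar/<title>")
def similar_papers(title):
    # Return the ten most similar papers to `title`, using the
    # pre-computed similarity scores.
    scores = paper_similarity.get(title, {})
    ranked = sorted(((doc_id, float(score)) for doc_id, score in scores.items()),
                    key=lambda kv: kv[1], reverse=True)
    return jsonify(ranked[:10])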
Example no. 6
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import List
from top2vec import Top2Vec

app = FastAPI(title="Top2Vec API",
              description="Speak REST to a Top2Vec trained model.",
              version="1.0.0", )

top2vec = Top2Vec.load("top2vec_model/top2vec_20newsgroups")


@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    return JSONResponse(
        status_code=404,
        content={"message": str(exc)},
    )


class NumTopics(BaseModel):
    num_topics: int


class TopicSizes(BaseModel):
    topic_nums: List[int]
    topic_sizes: List[int]


class Topic(BaseModel):
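    # (The original is truncated here; these fields mirror what Top2Vec's
    # get_topics() returns and are an assumed completion.)
    topic_words: List[str]
    word_scores: List[float]
    topic_num: int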
Example no. 7
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel, BaseSettings
from typing import List
from top2vec import Top2Vec
import numpy as np


class Settings(BaseSettings):
    model_name: str = "Top2Vec API"
    model_path: str


settings = Settings()

top2vec = Top2Vec.load(settings.model_path)

app = FastAPI(
    title=settings.model_name,
    description="RESTful Top2Vec API",
    version="1.0.0",
)


@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    return JSONResponse(
        status_code=404,
        content={"message": str(exc)},
    )
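The snippet stops before any routes are declared; a minimal sketch of one endpoint exposing the loaded model (the path and response model are assumptions) might be:

class NumTopics(BaseModel):
    num_topics: int


@app.get("/topics/number", response_model=NumTopics)
async def get_number_of_topics():
    # Report how many topics the loaded Top2Vec model contains.
    return NumTopics(num_topics=top2vec.get_num_topics())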
Example no. 8
import sys, multiprocessing
from top2vec import Top2Vec
import umap, hdbscan
import logging

from joblib import dump, load

logger = logging.getLogger('gensim')
logger.setLevel(logging.INFO)
sh = logging.StreamHandler(sys.stderr)
sh.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(sh)

model_file = 'top2vec.model'
model = Top2Vec.load(model_file)

wvs = model.model.wv.vectors
docvecs = model._get_document_vectors()

dims = [5, 3, 2]
metrics = ['cosine', 'euclidean']

for dim in dims:
    for metric in metrics:
        logger.info('Creating ' + str(dim) +
                    ' dimension embedding of documents with ' + metric)
        umap_model = umap.UMAP(n_neighbors=15, n_components=dim,
                               metric=metric).fit(docvecs)

        #logger.info('Finding dense areas of documents')
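        # (The original snippet is cut off here. A plausible continuation,
        # clustering the UMAP embedding with HDBSCAN and persisting the
        # results with joblib's dump, is sketched below; the file names are
        # assumptions.)
        cluster = hdbscan.HDBSCAN(min_cluster_size=15, metric='euclidean',
                                  cluster_selection_method='eom'
                                  ).fit(umap_model.embedding_)
        dump(umap_model, f'umap_{dim}d_{metric}.joblib')
        dump(cluster, f'hdbscan_{dim}d_{metric}.joblib')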
Example no. 9
import os

from top2vec import Top2Vec

from config import Config
from risk_detection.preprocessing.report_parser import (
    report_info_from_risk_path)
from risk_detection.utils import (get_company_industry_mapping,
                                  get_sik_industry_name_mapping,
                                  get_risk_filenames)


def _get_noun_phrases(txt):
    # Noun-phrase extraction is stubbed out in this snippet.
    pass


model_path = os.path.join(Config.top2vec_models_dir(),
                          'top2vec_model_with_doc_ids')
model = Top2Vec.load(model_path)

cik_sic_df = get_company_industry_mapping()
sic_name_df = get_sik_industry_name_mapping()
risk_files = get_risk_filenames()
# TODO: Add diversification detection code

for risk_file in risk_files:
    report_info = report_info_from_risk_path(risk_file)
    doc_id = report_info.get_document_id()
    topic = model.get_documents_topics(doc_ids=(doc_id, ))
Example no. 10
    # __post_init__ of a dataclass whose surrounding class definition is not
    # shown in this snippet.
    def __post_init__(self):
        self.model = Top2Vec.load(self.model_path)
        self.vectors = None
        self.topic_labels = None
Example no. 11
def load_model():
    return Top2Vec.load("data/doc2vec_production")
Example no. 12
def test_similar_words_index(top2vec_model):
    # `top2vec_model` is a trained Top2Vec model supplied by the test setup;
    # `tempfile` and `Top2Vec` are assumed imported at the top of the test module.
    temp = tempfile.NamedTemporaryFile(mode='w+b')
    top2vec_model.save(temp.name)
    Top2Vec.load(temp.name)
    temp.close()