def train(
    load: Path = typer.Argument(
        ...,
        help="Path to a file containing documents for training.",
        exists=True),
    save: Path = typer.Argument(...,
                                help="Output path for the model.",
                                callback=check_parent_exists),
    embedding_model: Model = typer.Option(
        Model.DOC2VEC,
        help="Which model is used to generate the document and word embeddings.",
    ),
    training_speed: Speed = typer.Option(
        Speed.LEARN, help="How fast the model trains."),
    workers: int = typer.Option(
        128,
        help="The number of worker threads used to train the model."),
    min_count: int = typer.Option(
        50, help="Ignores all words with total frequency lower than this"),
    noun_phrases: bool = typer.Option(True,
                                      help="Use noun-phrases for training."),
):
    """Train Top2Vec algorithm."""
    typer.echo("Loading data...")
    docs = load_json(load) if load.suffix == ".json" else load_csv(
        load, noun_phrases)
    typer.echo(f"Loaded {len(docs)} documents")
    speed = training_speed.value
    model = embedding_model.value
    tokenizer = tokenize if noun_phrases else None
    if model == "doc2vec":
        typer.echo(
            f"Training the model with the following parameters: {model=}, {speed=}, {workers=}, {min_count=}, {noun_phrases=}"
        )
        t2v = Top2Vec(
            documents=docs,
            embedding_model=model,
            speed=speed,
            workers=workers,
            tokenizer=tokenizer,
            min_count=min_count,
        )
    else:
        typer.echo(
            f"Training the model with the following parameters: {model=}, {noun_phrases=}"
        )
        t2v = Top2Vec(
            documents=docs,
            embedding_model=model,
            tokenizer=tokenizer,
            min_count=min_count,
        )
    typer.echo(f"Saving the model to {save}")
    t2v.save(save)
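
The check_parent_exists callback referenced in the save argument is not part of this listing; a minimal sketch of what such a Typer callback might look like (a hypothetical implementation, shown only for context):

def check_parent_exists(path: Path) -> Path:
    """Reject an output path whose parent directory does not exist."""
    if not path.parent.exists():
        raise typer.BadParameter(f"Directory {path.parent} does not exist.")
    return path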
Example No. 2
def load_vectors():
    model = Top2Vec.load("./models/top2vec.model")
    topic_vectors = model.topic_vectors
    tweet_vectors = model.model.docvecs.vectors_docs
    pca_tweet_vec = pca.fit_transform(tweet_vectors)
    pca_topic_vec = pca.fit_transform(topic_vectors)
    return pca_tweet_vec, pca_topic_vec, model
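
The pca object used above is assumed to be configured elsewhere in the original script; a minimal sketch of that setup, assuming scikit-learn's PCA reduced to two components (the component count is a guess):

from sklearn.decomposition import PCA

pca = PCA(n_components=2)  # assumed global used by load_vectors() above
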
def top2vec_(df):
    import re
    import string

    def custom_preprocessing(x):
        x = x.lower()
        x = re.sub(
            r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""",
            "",
            x,
        )
        for p in list(string.punctuation) + list(string.digits):
            x = x.replace(p, "")
        x = x.replace('\xa0', '')  # strip non-breaking spaces (not a raw string)
        x = [
            a.encode('ascii', 'ignore').decode('ascii') for a in x.split() if a
        ]
        return x

    docs = df["tweet"].apply(custom_preprocessing).values
    model = Top2Vec(docs, speed="deep-learn", workers=4)
    model.save("./top2vec.model")
    topic_sizes, topic_nums = model.get_topic_sizes()
    print(f'vocab learned: {len(model.model.wv.vocab.keys())}')
    print(topic_sizes)
    print(topic_nums)
def run_yearly():
    """
    Creates yearly topic models and saves them to disk.
    """
    print(f'Reading files from {Config.risk_dir()}')
    corpus = get_corpus()
    yearly_doc_ids = defaultdict(list)
    for k in corpus.keys():
        yearly_doc_ids[ReportInfo.from_doc_id(k).start_date.year].append(k)
    print(f'Read {len(corpus)} files.')

    base_dir = os.path.join(Config.top2vec_models_dir(), 'yearly_models')
    create_dir_if_not_exists(base_dir)
    print(f'Storing yearly models to {base_dir}.')

    for year, doc_ids in tqdm(yearly_doc_ids.items(),
                              total=len(yearly_doc_ids)):
        yearly_corpus = [corpus[d] for d in doc_ids]
        try:
            model = Top2Vec(documents=yearly_corpus,
                            document_ids=doc_ids,
                            tokenizer=RiskSectionCleaner(),
                            keep_documents=False,
                            speed='learn',
                            workers=24)
            model.save(os.path.join(base_dir, f'{year}_topics'))
        except Exception:
            print(f'Could not create topic model for year: {year}')
            continue
Example No. 5
def create_topic(self):
    corpus = self.get_corpus()
    doc_ids, docs = list(zip(*corpus.items()))
    return Top2Vec(docs,
                   document_ids=doc_ids,
                   speed='learn',
                   tokenizer=RiskSectionCleaner(),
                   workers=16)
Example No. 6
def main():
    st.set_option('deprecation.showfileUploaderEncoding', False)

    @st.cache(suppress_st_warning=True)
    def load_data(uploaded_file):

        df = pd.read_csv(uploaded_file)

        return df

    uploaded_file = st.file_uploader('Upload CSV file to begin', type='csv')

    #if upload then show left bar
    if uploaded_file is not None:
        df = load_data(uploaded_file)

        st.sidebar.subheader("Text column to analyse")
        st_ms = st.sidebar.selectbox("Select Text Columns To Analyse",
                                     (df.columns.tolist()))
        import nltk

        import top2vec
        from top2vec import Top2Vec

        #INITIALIZE AN EMPTY DATAFRAME, CONVERT THE TEXT INTO STRING AND APPEND INTO THE NEW COLUMN
        d1 = pd.DataFrame()
        d1['text'] = ""
        d1['text'] = df[st_ms]
        d1['text'] = d1['text'].astype(str)

        for x in range(len(d1)):
            d1.text.iloc[x] = d1.text.iloc[x].lower()  #to lower case
            d1.text.iloc[x] = re.sub(r"@\S+", "",
                                     d1.text.iloc[x])  #remove mentions
            d1.text.iloc[x] = re.sub(r"http\S+", "",
                                     d1.text.iloc[x])  #remove hyperlinks
            d1.text.iloc[x] = ''.join([
                word for word in d1.text.iloc[x] if not word.isdigit()
            ])  #remove numbers
            d1.text.iloc[x] = nltk.word_tokenize(d1.text.iloc[x])  #tokenising
            d1.text.iloc[x] = [
                i for i in d1.text.iloc[x] if not i in english_stop_words
            ]  #remove stop words
            d1.text.iloc[x] = [
                i for i in d1.text.iloc[x] if not i in malay_stop_words
            ]
            d1.text.iloc[x] = [i for i in d1.text.iloc[x]
                               if len(i) > 2]  #drop tokens that are too short
            print('Completed line : ', x)

        #INITIALIZE THE TOP2VEC MODEL AND FIT THE TEXT
        #model.build_vocab(df_list, update=False)
        model = Top2Vec(documents=d1['text'], speed="learn", workers=10)

        topic_sizes, topic_nums = model.get_topic_sizes()
        for topic in topic_nums:
            st.pyplot(model.generate_topic_wordcloud(topic))
Example No. 7
def top2vec_model():
    newsgroups_train = fetch_20newsgroups(subset='all',
                                          remove=('headers', 'footers',
                                                  'quotes'))
    top2vec = Top2Vec(newsgroups_train.data[0:1000],
                      speed="fast-learn",
                      workers=8)

    return top2vec
Example No. 8
def main():
    if not sys.argv[1:]:
        print("need to give dataset path!")
        exit(1)

    data_file = sys.argv[1:][0]

    print(data_file)
    save_file = "top2vec.model"
    docs, doc_ids = parser(data_file)

    model = Top2Vec(documents=docs, document_ids=doc_ids, min_count=20, keep_documents=False,
                    use_corpus_file=True, workers=multiprocessing.cpu_count(),
                    verbose=True)

    model.save(save_file)
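
The parser helper is not defined in this snippet; a hedged sketch of one possible implementation, assuming a plain-text corpus with one document per line (purely illustrative, not the original helper):

def parser(path):
    """Hypothetical parser: one document per line, line index as document id."""
    with open(path, encoding="utf-8") as f:
        docs = [line.strip() for line in f if line.strip()]
    doc_ids = list(range(len(docs)))
    return docs, doc_ids
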
def _test_topics():
    def _get_noun_phrases(text):
        pass

    model_path = os.path.join(Config.top2vec_models_dir(),
                              'top2vec_model_with_doc_ids')
    model = Top2Vec.load(model_path)
    print('Creating topics.')
    for topic_size, topic_num in zip(*model.get_topic_sizes()):
        if topic_num < 200:
            continue
        _, doc_ids = model.search_documents_by_topic(topic_num,
                                                     num_docs=topic_size,
                                                     return_documents=False)
        topic_words = model.topic_words[topic_num]
        word_scores = model.topic_word_scores[topic_num]
        topic = Topic(topic_words, word_scores, topic_num, doc_ids)
        neg_words = topic.get_negative_terms()
        a = 1
Example No. 10
def train_top2vec(**kwargs):

    print('Building corpus')
    t2v_corpus = build_top2vec_corpus(
        build_spans(sentences, keywords, **kwargs))
    print(f'corpus length: {len(t2v_corpus)}')

    print('Training...')
    t2v = Top2Vec(
        documents=list(t2v_corpus.values()),
        document_ids=list(t2v_corpus.keys()),
        min_count=20,
        speed='learn',
        workers=multiprocessing.cpu_count(),
        embedding_model_path='../data/models/word2vec_270422/keyedvectors.txt')

    num_topics = t2v.get_num_topics()
    print(f'Found {num_topics} topics')

    return t2v
def run_all():
    """
    Creates a topic model for the entire corpus and saves it to disk.
    """
    print(f'Reading files from {Config.risk_dir()}')
    corpus = get_corpus()
    print(f'Read {len(corpus)} files.')

    doc_ids, docs = list(zip(*corpus.items()))
    model = Top2Vec(docs,
                    document_ids=doc_ids,
                    tokenizer=RiskSectionCleaner(),
                    keep_documents=False,
                    speed='learn',
                    workers=24)

    model_path = os.path.join(Config.top2vec_models_dir(),
                              'top2vec_model_phrases')
    model.save(model_path)
    print(f'Saved model to {model_path}')
Example No. 12
import pytest
from top2vec import Top2Vec
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# get 20 newsgroups data
newsgroups_train = fetch_20newsgroups(subset='all',
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_documents = newsgroups_train.data[0:2000]

# train top2vec model without doc_ids provided
top2vec = Top2Vec(documents=newsgroups_documents,
                  speed="fast-learn",
                  workers=8)

# train top2vec model with doc_ids provided
doc_ids = [str(num) for num in range(0, len(newsgroups_documents))]
top2vec_docids = Top2Vec(documents=newsgroups_documents,
                         document_ids=doc_ids,
                         speed="fast-learn",
                         workers=8)

# train top2vec model without saving documents
top2vec_no_docs = Top2Vec(documents=newsgroups_documents,
                          keep_documents=False,
                          speed="fast-learn",
                          workers=8)

# train top2vec model with corpus_file
top2vec_corpus_file = Top2Vec(documents=newsgroups_documents,
                              use_corpus_file=True,
                              speed="fast-learn",
                              workers=8)
Example No. 13
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import List
from top2vec import Top2Vec

app = FastAPI(title="Top2Vec API",
              description="Speak REST to a Top2Vec trained model.",
              version="1.0.0", )

top2vec = Top2Vec.load("top2vec_model/top2vec_20newsgroups")


@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    return JSONResponse(
        status_code=404,
        content={"message": str(exc)},
    )


class NumTopics(BaseModel):
    num_topics: int


class TopicSizes(BaseModel):
    topic_nums: List[int]
    topic_sizes: List[int]


class Topic(BaseModel):
Example No. 14
count_by_customer = customers_reviews.groupby(['externalHandle'])['channel'].count().reset_index(name = 'count')
plt.figure(figsize=(8,4))
x1 = count_by_customer['count']
plt.hist(x1, color='#7F0442', label='Customers', bins = 30)
plt.title("Distribution Number of Reviews per Customer", fontweight='bold', fontsize = 14, family = 'monospace')
plt.xlabel("Number of Reviews")
plt.legend()
plt.savefig('images/distribution_per_customer.png')
plt.show()

customers_reviews['comment'][0]

## Train the model
customers_reviews_list = list(customers_reviews["comment"])

model = Top2Vec(customers_reviews_list, speed="deep-learn", workers=8)

"""
documents: Input corpus, should be a list of strings.
speed: This parameter will determine how fast the model takes to train. The 'fast-learn' option is the fastest and will generate the lowest quality vectors. The 'learn' option will learn better quality vectors but take a longer time to train. The 'deep-learn' option will learn the best quality vectors but will take significant time to train.
workers: The amount of worker threads to be used in training the model. Larger amount will lead to faster training.
"""

## Get the number of topics
model.get_num_topics()

## Get the topics
topic_words, word_scores, topic_nums = model.get_topics(117)

## View the topics
topic_words[1]
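
A possible next step with the model trained above, using search_documents_by_topic (the same call used in other examples on this page); the topic number and document count below are arbitrary:

documents, document_scores, document_ids = model.search_documents_by_topic(
    topic_num=1, num_docs=5)
for doc, score in zip(documents, document_scores):
    # Print the similarity score and the first 100 characters of each review.
    print(f"{score:.3f}: {doc[:100]}")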
Example No. 15
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel, BaseSettings
from typing import List
from top2vec import Top2Vec
import numpy as np


class Settings(BaseSettings):
    model_name: str = "Top2Vec API"
    model_path: str


settings = Settings()

top2vec = Top2Vec.load(settings.model_path)

app = FastAPI(
    title=settings.model_name,
    description="RESTful Top2Vec API",
    version="1.0.0",
)


@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    return JSONResponse(
        status_code=404,
        content={"message": str(exc)},
    )
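
The listing stops after the exception handler; a hedged sketch of one route such an API could expose, reusing the get_num_topics call seen elsewhere on this page (the path and response model are assumptions, not part of the original):

class NumTopics(BaseModel):
    num_topics: int


@app.get("/topics/number", response_model=NumTopics)
async def get_number_of_topics():
    # Return how many topics the loaded model found.
    return NumTopics(num_topics=top2vec.get_num_topics())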
Example No. 16
import pytest
from top2vec import Top2Vec
from sklearn.datasets import fetch_20newsgroups
import numpy as np

# get 20 newsgroups data
newsgroups_train = fetch_20newsgroups(subset='all',
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_documents = newsgroups_train.data[0:2000]

# train top2vec model without doc_ids provided
top2vec = Top2Vec(documents=newsgroups_documents,
                  speed="fast-learn",
                  workers=8)

# train top2vec model with doc_ids provided
doc_ids = [str(num) for num in range(0, len(newsgroups_documents))]
top2vec_docids = Top2Vec(documents=newsgroups_documents,
                         document_ids=doc_ids,
                         speed="fast-learn",
                         workers=8)

# train top2vec model without saving documents
top2vec_no_docs = Top2Vec(documents=newsgroups_documents,
                          keep_documents=False,
                          speed="fast-learn",
                          workers=8)

# train top2vec model with corpus_file
top2vec_corpus_file = Top2Vec(documents=newsgroups_documents,
                              use_corpus_file=True,
                              speed="fast-learn",
                              workers=8)
Example No. 17
def load_model():
    return Top2Vec.load("data/doc2vec_production")
import pickle
import random

with open('sentences.bin', 'rb') as f:
    sentences = pickle.load(f)

# Commented out IPython magic to ensure Python compatibility.
# import tensorflow as tf
import tensorflow_datasets as tfds
from tensorboard.plugins import projector
#load tensorboards with magics
# %tensorflow_version 2.x
# %load_ext tensorboard

documents = random.choices(sentences,k = 100000)
documents = [d.lower() for d in documents]

from top2vec import Top2Vec
model = Top2Vec(documents, embedding_model='universal-sentence-encoder', workers=4)

model.hierarchical_topic_reduction(100)
# model.get_topic_hierarchy()
# topic_sizes, topic_nums = model.get_topic_sizes()
print("Original Number of topics: ", model.get_num_topics())
print("Reduced number of topics: ", model.get_num_topics(reduced=True))

#word similarity
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["swimming"], num_topics=3, reduced=False) #swimming, cleanliness #keywords_neg=stop,
for topic in topic_nums:
    model.generate_topic_wordcloud(topic)

#reduce the number of topics and do hierarchical clustering
# topic_words, word_scores, topic_nums = model.get_topics(reduced=True)
# topic_nums = random.choices(topic_nums, k=20)
Example No. 19
def main():
    st.set_option('deprecation.showfileUploaderEncoding', False)

    st.title("HATI.AI")
    image = Image.open('macroview.jpg')
    #st.image(image, use_column_width=False)
    st.sidebar.image(image)
    st.sidebar.title("Hati.Ai Web App")
    
    menu = ["Login","SignUp"]
    choice = st.sidebar.selectbox("Menu",menu)


    if choice == "Login":
        st.subheader("Login Section")

        username = st.sidebar.text_input("User Name")
        password = st.sidebar.text_input("Password",type='password')
        if st.sidebar.checkbox("Login"):
            # if password == '12345':
            create_usertable()
            hashed_pswd = make_hashes(password)

            result = login_user(username,check_hashes(password,hashed_pswd))
            if result:

                st.success("Logged In as {}".format(username))
                def process_text(text):
                    processed_data = []
                    # Make all the strings lowercase and remove non alphabetic characters
                    #text = re.sub('[^A-Za-z]', ' ', text.lower())
                
                    # Tokenize the text; this is, separate every sentence into a list of words
                    # Since the text is already split into sentences you don't have to call sent_tokenize
                    tokenized_text = word_tokenize(text)
                
                    #append the result into a new list called processed_data
                    processed_data.append(tokenized_text)
                
                
                    # Remember, this final output is a list of words
                    return processed_data
            
                @st.cache(suppress_st_warning=True)
                def load_data(uploaded_file):
                    
            
                    df = pd.read_csv(uploaded_file)
                            
             
                    return df
                
                st.sidebar.subheader("Choose What Do You Want To Do")
                classifier = st.sidebar.selectbox(" ", ("Find new topics automatically", "POWER BI Dashboard", "Interact with our chatbot"))
                if classifier == 'POWER BI Dashboard':
                    import streamlit.components.v1 as components
                    from urllib.request import urlopen
                    html = urlopen("https://app.powerbi.com/view?r=eyJrIjoiZTA4NWU4MjYtOTk3Yi00N2ZhLTgwZWQtZWFhMzNkNDk1Zjk3IiwidCI6Ijk5NmQwYTI3LWUwOGQtNDU1Ny05OWJlLTY3ZmQ2Yjk3OTA0NCIsImMiOjEwfQ%3D%3D&pageName=ReportSection06db5928b6af61b2868f").read()
                    #components.html(html, width=None, height=600, scrolling=True)
                    st.markdown("""
                        <iframe width="900" height="606" src="https://app.powerbi.com/view?r=eyJrIjoiZTA4NWU4MjYtOTk3Yi00N2ZhLTgwZWQtZWFhMzNkNDk1Zjk3IiwidCI6Ijk5NmQwYTI3LWUwOGQtNDU1Ny05OWJlLTY3ZmQ2Yjk3OTA0NCIsImMiOjEwfQ%3D%3D&pageName=ReportSection06db5928b6af61b2868f" frameborder="0" style="border:0" allowfullscreen></iframe>
                        """, unsafe_allow_html=True)

              
                if classifier == 'Interact with our chatbot':    
                    import pickle
                    with open('tnb_topic_classifier_svm', 'rb') as training_model:
                        topic_model = pickle.load(training_model)
                    import malaya
                    model = malaya.sentiment.transformer(model = 'albert', size = 'base')
                    #from src import model          
                    #malay_bert = model.BertModel()
                    # eng_flair = model.Flair()
                    # eng_vader = model.Vader()
                    test = pd.DataFrame()
                    test['Positive'] = ''
                    test['Neutral'] = ''
                    test['Negative'] = ''
                    
                    st.title("Sentiment Analyzer")
                    message = st.text_area("Enter Text","Type Here ..")
                    if st.button("Analyze"):
                        with st.spinner("Analyzing the text …"):
                            result = model.predict_proba([message])
                            #result = malay_bert.predict(message)
                            message = [message]
                            topic = topic_model.predict(message)
                            #output = "Result is: Positive:" + str(result[0]) + "Neutral:" + str(result[1]) + "Negative:" + str(result[2]) + "topic is: " + str(topic)
                            output = "result is:" + str(result) + "topic is: " + str(topic)
                            st.write(output)
                    else:
                        st.warning("Not sure! Try to add some more words")
    
                from stop_words import get_stop_words
                if classifier == 'Find new topics automatically':
            
                    
                    uploaded_file = st.file_uploader('Upload CSV file to begin', type='csv')
                
                    #if upload then show left bar
                    if uploaded_file is not None:
                        df = load_data(uploaded_file)
                
                
                
                        if st.sidebar.checkbox("Show raw data", False):
                            st.subheader("Uploaded Data Set")
                            st.write(df)
                
                
            
                        st.sidebar.subheader("Text column to analyse")
                        st_ms = st.sidebar.selectbox("Select Text Columns To Analyse", (df.columns.tolist()))
                        

                        df_list = list(df)
     
        
                        import top2vec
                        from top2vec import Top2Vec
                        
                        #INITIALIZE AN EMPTY DATAFRAME, CONVERT THE TEXT INTO STRING AND APPEND INTO THE NEW COLUMN
                        d1 = pd.DataFrame()
                        d1['text'] = ""
                        d1['text'] = df[st_ms]
                        d1['text'] = d1['text'].astype(str)
                        
                
                        #INITIALIZE THE TOP2VEC MODEL AND FIT THE TEXT
                        #model.build_vocab(df_list, update=False)
                        model = Top2Vec(documents=d1['text'], speed="learn", workers=10)
                        
                        topic_sizes, topic_nums = model.get_topic_sizes()
                        for topic in topic_nums:
                            st.pyplot(model.generate_topic_wordcloud(topic))
                            # Display the generated image:

        


            else:
                st.warning("Incorrect Username/Password")


    elif choice == "SignUp":
        st.subheader("Create New Account")
        new_user = st.text_input("Username")
        new_password = st.text_input("Password",type='password')

        if st.button("Signup"):
            create_usertable()
            add_userdata(new_user,make_hashes(new_password))
            st.success("You have successfully created a valid Account")
            st.info("Go to Login Menu to login")
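
make_hashes, check_hashes, create_usertable, login_user and add_userdata are assumed to be defined elsewhere in this app; a minimal hedged sketch of the two hashing helpers using hashlib (illustrative only, not the original implementation):

import hashlib


def make_hashes(password):
    # Hash a plaintext password with SHA-256.
    return hashlib.sha256(str.encode(password)).hexdigest()


def check_hashes(password, hashed_text):
    # Re-hash the candidate password and compare it with the stored hash.
    if make_hashes(password) == hashed_text:
        return hashed_text
    return False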
Example No. 20
all_txt_files = []

# adds each transcript filename to a list
for file in transcriptsLocation.rglob("*.txt"):
    all_txt_files.append(file.name)

all_txt_files.sort()

all_docs = []
# adds each transcript to a list in string format for processing
for txt_file in all_txt_files:
    with open(transcriptsLocation / txt_file, encoding="utf-8") as f:
        txt_file_as_string = f.read()
    all_docs.append(txt_file_as_string)

model = Top2Vec(all_docs, speed="fast-learn")
model.save("modelTwo")

model = Top2Vec.load("modelTwo")


def printMetaData():
    # get total number of model topics
    print("\nThe total number of model topics are: " +
          str(model.get_num_topics()))

    # This will return the number of documents most similar to each topic.
    topic_sizes, topic_nums = model.get_topic_sizes()
    print("\nThe number of documents most similar to each topic are: " +
          str(topic_sizes))
    print("The unique indexes of each topic are: " + str(topic_nums))
Example No. 21
import warnings

warnings.filterwarnings('ignore')

import pandas as pd
from top2vec import Top2Vec

content = pd.read_csv('../data/content-clean.csv')
documents = content['text'].values.tolist()
model = Top2Vec(documents, speed='learn', workers=8)
model.save("content-model.bin")
Example No. 22
def __init__(self, *args, **kwargs):
    """Initialize Top2Vec with the given args"""
    self.__model__ = Top2Vec(*args, **kwargs)
Example No. 23
from sklearn.datasets import fetch_20newsgroups
from top2vec import Top2Vec
import numpy as np

news_groups_train = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
# news_groups_documents = news_groups_train[]


model = Top2Vec(documents=news_groups_train.data[0:2000], use_corpus_file=True, speed='fast-learn', workers=8)
print("Number of Topics:- ", model.get_num_topics())
# Get Topic Sizes
topic_sizes, topic_nums = model.get_topic_sizes()
for i in range(len(topic_nums)):
    print(str(topic_nums[i]) + ":- " + str(topic_sizes[i]))

# Get Topics
topic_words, word_scores, topic_nums = model.get_topics()
for i in range(len(topic_nums)):
    print(str(topic_nums[i]) + ": " + str(topic_sizes[i]) + ": " + str(topic_words[i]))

# topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["medicine"], num_topics=5)
# for i in range(topic_nums):
#     print

print("Done!!!!")




Example No. 24
import pandas as pd
from top2vec import Top2Vec

June= pd.read_excel('./dataset/NewsResult_20200601-20200630.xlsx')
July= pd.read_excel('./dataset/NewsResult_20200701-20200731.xlsx')
August= pd.read_excel('./dataset/NewsResult_20200801-20200831.xlsx')
September= pd.read_excel('./dataset/NewsResult_20200901-20200930.xlsx')
October= pd.read_excel('./dataset/NewsResult_20201001-20201029.xlsx')

daylist_June= June[['일자','키워드']].groupby('일자').apply(lambda d: ",".join(d['키워드']))
daylist_July= July[['일자','키워드']].groupby('일자').apply(lambda d: ",".join(d['키워드']))
daylist_August= August[['일자','키워드']].groupby('일자').apply(lambda d: ",".join(d['키워드']))
daylist_September= September[['일자','키워드']].groupby('일자').apply(lambda d: ",".join(d['키워드']))
daylist_October= October[['일자','키워드']].groupby('일자').apply(lambda d: ",".join(d['키워드']))

model = Top2Vec(documents=daylist_June[20200601].split(','), speed="learn", workers=5)
topic_sizes, topic_nums = model.get_topic_sizes()
print(topic_sizes)
print(topic_nums)

model.get_topics()[0]

model.generate_topic_wordcloud(0)
import os

from top2vec import Top2Vec

from config import Config
from risk_detection.preprocessing.report_parser import (
    report_info_from_risk_path)
from risk_detection.utils import (get_company_industry_mapping,
                                  get_sik_industry_name_mapping,
                                  get_risk_filenames)


def _get_noun_phrases(txt):
    pass


model_path = os.path.join(Config.top2vec_models_dir(),
                          'top2vec_model_with_doc_ids')
model = Top2Vec.load(model_path)

cik_sic_df = get_company_industry_mapping()
sic_name_df = get_sik_industry_name_mapping()
risk_files = get_risk_filenames()
# TODO: Add diversification detection code

for risk_file in risk_files:
    report_info = report_info_from_risk_path(risk_file)
    doc_id = report_info.get_document_id()
    topic = model.get_documents_topics(doc_ids=(doc_id, ))
def __post_init__(self):
    self.model = Top2Vec.load(self.model_path)
    self.vectors = None
    self.topic_labels = None
Example No. 27
# {
#   "paperTitle1": "paperContent1",
#   "paperTitle2": "paperContent2",
#   ...
# }
from threading import Lock

from flask import Flask
from flask_cors import CORS

paper_content = {}

# a two dimensional dictionary
# paper_similarity["paperTitle1"]["paperTitle2"] gives the similarity
# between two papers
paper_similarity = {}

# model.documents --> content
# model.document_ids --> title

model = Top2Vec.load('model.thelibrarian')
model_lock = Lock()

for title in model.document_ids:
    paper_similarity[title] = {}
    doc_scores, doc_ids = model.search_documents_by_documents(
        doc_ids=[title],
        num_docs=len(model.documents) - 1,
        return_documents=False)
    for score, doc_id in zip(doc_scores, doc_ids):
        paper_similarity[title][doc_id] = score

app = Flask(__name__)
CORS(app)

paper_positions = None
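
The Flask routes themselves are not included in the listing; a hedged sketch of one route that could serve the precomputed paper_similarity map (the URL pattern and response shape are assumptions):

from flask import jsonify  # needed only for this sketch


@app.route('/similarity/<title>')
def similarity_for(title):
    # Serve the precomputed similarity scores for one paper title.
    if title not in paper_similarity:
        return jsonify({'error': 'unknown title'}), 404
    return jsonify({other: float(score)
                    for other, score in paper_similarity[title].items()})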
Example No. 28
import sys, multiprocessing
from top2vec import Top2Vec
import umap, hdbscan
import logging

from joblib import dump, load

logger = logging.getLogger('gensim')
logger.setLevel(logging.INFO)
sh = logging.StreamHandler(sys.stderr)
sh.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(sh)

model_file = 'top2vec.model'
model = Top2Vec.load(model_file)

wvs = model.model.wv.vectors
docvecs = model._get_document_vectors()

dims = [5, 3, 2]
metrics = ['cosine', 'euclidean']

for dim in dims:
    for metric in metrics:
        logger.info('Creating ' + str(dim) +
                    ' dimension embedding of documents with ' + metric)
        umap_model = umap.UMAP(n_neighbors=15, n_components=dim,
                               metric=metric).fit(docvecs)

        #logger.info('Finding dense areas of documents')
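        # Hedged continuation (not part of the original listing): persist each
        # embedding with joblib's dump, which is imported above but unused in
        # the truncated snippet; the file naming scheme here is an assumption.
        dump(umap_model, f'umap_{dim}d_{metric}.joblib')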
Example No. 29
data["pubdate"] = data["pubdate"].astype(str).str[0:4]
data = data[pd.to_numeric(data["pubdate"], errors="coerce").notnull()]
data = data.dropna(subset=["pubdate"])
data["pubdate"] = data["pubdate"].astype(float)
data = data.dropna(subset=["pubdate"])
#data = data.loc[(data.pubdate >= 2010) & (data.pubdate <=2020)]
data = data.sort_values("pubdate")
data = data.reset_index()
print("TOTAL RECORDS:" + str(len(data)))

documents = data["combined"].tolist()
import time

start = time.perf_counter()

model = Top2Vec(documents, speed="deep-learn", workers=8)
stop = time.perf_counter()
print(f"Runtime {start - stop:0.4f} seconds")
model.save("models/top2vec_d2v")
model = Top2Vec.load("models/top2vec_d2v")

print("Number of Topics Identified:" + str(model.get_num_topics()))
model.model.init_sims()
data = model.model.docvecs.vectors_docs

umap_args = {'n_neighbors': 15, 'n_components': 5, 'metric': 'cosine'}

umap_model = umap.UMAP(**umap_args).fit(model.model.docvecs.vectors_docs)

# find dense areas of document vectors
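
The snippet ends at this comment; a hedged sketch of the density-based clustering step that typically follows, using hdbscan directly (the parameters mirror Top2Vec's defaults, but this block is an illustration, not the original code):

import hdbscan

cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                          metric='euclidean',
                          cluster_selection_method='eom').fit(umap_model.embedding_)
print("Cluster labels found:", set(cluster.labels_))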
Example No. 30
def test_similar_words_index(top2vec_model):
    temp = tempfile.NamedTemporaryFile(mode='w+b')
    top2vec_model.save(temp.name)
    Top2Vec.load(temp.name)
    temp.close()