def build_lda_by_keywords(keywords, num_article_for_search, num_topics=0):
    if num_topics == 0:
        num_topics = len(keywords)

    dir_name = '../data'
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    filename = os.path.join(dir_name, "_".join(keywords) + "_" + str(num_article_for_search))

    with open(filename, 'w', encoding="utf-8") as file:
        articles = []
        for keyword in keywords:
            articles_keyword = fetch_articles(keyword, number=num_article_for_search, days=-1)
            articles.extend(articles_keyword)
            log(file, "%s : %d" % (keyword, len(articles_keyword)))

        texts = []
        for article in articles:
            tokens = cut(
                article.title + article.content, using_stopwords=True, simplified_convert=True)
            texts.append(tokens)

        start = time.time()
        model = lda.build_lda_model(texts, num_topics)
        for topic_key, tokens in lda.get_topic(model, num_topics=num_topics, num_words=15).items():
            log(file, tokens)

        end = time.time()
        log(file, "model train time : " + str(end - start))

        print("\n\n\n\n", file=file)
        for article in articles:
            print(article.title, end="\n", file=file)
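A minimal usage sketch for build_lda_by_keywords; the keyword list and article count below are illustrative placeholders, not values taken from the project.

# Hypothetical call: fetch 100 articles per keyword, train one LDA model with
# num_topics defaulting to len(keywords), and log the topics under ../data/.
if __name__ == '__main__':
    build_lda_by_keywords(['movie', 'music', 'travel'], num_article_for_search=100)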
    # Excerpt from a test class: self._model, _print_test_result and
    # _save_as_csv are defined on that class, not in this module.
    def compare_clustering_using_real_data(self,
                                           start_month='2016/06',
                                           days=30):
        file_name = 'compare_using_real_data {} days={}'.format(
            ''.join(start_month.split('/')), days)
        print(file_name)
        result_table = {}
        for day_counter in range(1, days + 1):
            target_day = '{}/{}'.format(start_month, str(day_counter).zfill(2))
            print(target_day)
            articles = fetcher.fetch_articles('*',
                                              4000,
                                              end_day=target_day,
                                              days=0)
            if len(articles) == 0:
                continue
            main.compute_article_vector(self._model, articles)
            for algorithm in [
                    main.clustering1, main.clustering2, main.clustering3,
                    main.clustering4
            ]:
                clusters = algorithm(self._model, articles)
                result = main.internal_validate(clusters)
                algorithm_name = algorithm.__name__
                print(algorithm_name, result)
                if algorithm_name not in result_table:
                    result_table[algorithm_name] = []
                result_table[algorithm_name].append(result)

        self._print_test_result(result_table)
        self._save_as_csv(result_table, '', file_name)
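The result_table built above maps each clustering algorithm's name to its per-day validation results. A short sketch of one way to summarise that table, under the assumption (not stated in this listing) that main.internal_validate returns a single numeric score per run:

# Sketch only: mean score per algorithm, assuming numeric validation results.
def average_scores(result_table):
    return {name: sum(scores) / len(scores)
            for name, scores in result_table.items() if scores}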
Example #3
def term_expansion(keyword, num_article_for_search=5):
    articles = fetch_articles(keyword, num_article_for_search, days=3)
    if len(articles) == 0:
        print('articles not found...try another search range?')
        return
    input_data = [a.title + " " + a.content for a in articles]
    model = build_lda_model(input_data, 1)
    topic_tokens = get_topic(model)[0]
    return topic_tokens
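A hedged usage sketch for term_expansion; the seed keyword is an illustrative placeholder and the function is assumed to be importable from this module.

# Hypothetical call: expand a seed keyword into related tokens taken from a
# one-topic LDA model over recent articles (returns None if nothing was found).
expanded = term_expansion('movie', num_article_for_search=5)
if expanded is not None:
    print(expanded)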
Example #4
def get_sentence(keyword, number, page=1):
    articles = fetch_articles(keyword,
                              number,
                              page=page,
                              fl='title, content',
                              desc=False)

    result_sentences = []
    for article in articles:
        tokens = cut(article.title,
                     using_stopwords=False,
                     simplified_convert=True)
        result_sentences.append(tokens)
        if hasattr(article, 'content_sentence'):
            for sen in article.content_sentence:
                result_sentences.append(
                    cut(sen, using_stopwords=False, simplified_convert=True))
        # if hasattr(article, 'content'):
        #     result_sentences.append(cut(article.content, using_stopwords=False, simplified_convert=True))
    return result_sentences
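get_sentence returns one token list per sentence, which is the shape word-embedding trainers expect. A sketch assuming gensim (4.x) is available, which is my assumption rather than something shown in this listing; the query and parameters are illustrative.

from gensim.models import Word2Vec

# Assumption: gensim >= 4.0 is installed; parameter values are illustrative.
sentences = get_sentence('*', number=100)
w2v_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=2)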
from python_code.model import ptt_article_fetcher
from python_code.model.my_tokenize.tokenizer import cut

articles = ptt_article_fetcher.fetch_articles('', number=10, page=6)
using_stopwords = False

equals_tokens = []
for article in articles:
    # Compare tokenization with and without simplified-Chinese conversion.
    token1 = cut(article.title, using_stopwords, simplified_convert=True)
    token2 = cut(article.title, using_stopwords, simplified_convert=False)
    if token1 == token2:
        equals_tokens.append(token1)
    else:
        print('經轉換' + str(token1))  # tokens after simplified-Chinese conversion
        print('未轉換' + str(token2))  # tokens without conversion

for i in equals_tokens:
    print(i)
def store_one_day_data(day):
    obj = fetcher.fetch_articles('', 5000, end_day=day, days=1)
    day = day.replace('/', '', 3)  # strip slashes from the date so it can be used as a file name
    store_data(file_name=day, data=obj)
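A sketch of driving store_one_day_data over a whole month; the month is taken from the '2016/06' default used elsewhere in these examples, and the day range is illustrative.

# Illustrative loop: dump one file per day for June 2016.
for day_number in range(1, 31):
    store_one_day_data('2016/06/{:02d}'.format(day_number))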
def get_ptt_articles(day='NOW/DAY', number=2000):
    return fetcher.fetch_articles('*', number=number, end_day=day, days=1)
Example #8
def get_ptt_articles(number=2000):
    return fetcher.fetch_articles('*', number=number, days=1)