import os
import time

# cut and fetch_articles are project-local helpers; the import paths are taken
# from elsewhere in this repo. lda and log are assumed to be in scope.
from python_code.model.my_tokenize.tokenizer import cut
from python_code.model.ptt_article_fetcher import fetch_articles


def build_lda_by_keywords(keywords, num_article_for_search, num_topics=0):
    if num_topics == 0:
        num_topics = len(keywords)

    dir_name = '../data'
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    filename = dir_name + '/' + "_".join(keywords) + "_" + str(num_article_for_search)

    with open(filename, 'w', encoding="utf-8") as file:
        articles = []
        for keyword in keywords:
            articles_keyword = fetch_articles(keyword, number=num_article_for_search, days=-1)
            articles.extend(articles_keyword)
            log(file, "%s : %d" % (keyword, len(articles_keyword)))

        texts = []
        for article in articles:
            tokens = cut(
                article.title + article.content, using_stopwords=True, simplified_convert=True)
            texts.append(tokens)

        start = time.time()
        model = lda.build_lda_model(texts, num_topics)
        for topic_key, tokens in lda.get_topic(model, num_topics=num_topics, num_words=15).items():
            log(file, tokens)

        end = time.time()
        log(file, "model train time : " + str(end - start))

        print("\n\n\n\n", file=file)
        for article in articles:
            print(article.title, end="\n", file=file)
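A minimal usage sketch for the function above; the keywords and article count are illustrative, and with num_topics left at 0 the model gets one topic per keyword:

# Illustrative only: fetches articles for both keywords, trains one LDA model
# over all of them, and logs the per-topic terms to ../data/地震_日本_100.
build_lda_by_keywords(['地震', '日本'], num_article_for_search=100)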
    def _compute_vector(self, input_data):
        weights = None
        if isinstance(input_data, list):
            if len(input_data) == 0:
                tokens = []
            elif isinstance(input_data[0], tuple):
                tokens = [data_tuple[0] for data_tuple in input_data]
                weights = [data_tuple[1] for data_tuple in input_data]
            else:
                tokens = input_data
        else:
            tokens = cut(input_data,
                         using_stopwords=True,
                         simplified_convert=True)

        if len(tokens) > 0 and (tokens[-1] in ['八卦', '卦']):
            del tokens[-1]
        v1 = []
        for index, word in enumerate(tokens):
            if word in self._model:
                word_vector = self._model[word]
                if weights:
                    # use the loop index so repeated tokens keep their own weights
                    word_vector = word_vector * weights[index]
                v1.append(word_vector)
        if len(v1) == 0:
            print('invalid article:', input_data)
            return None
        return sum(v1)
Example #3
def get_sentence(keyword, number, page=1):
    articles = fetch_articles(keyword,
                              number,
                              page=page,
                              fl='title, content',
                              desc=False)

    result_sentences = []
    for article in articles:
        tokens = cut(article.title,
                     using_stopwords=False,
                     simplified_convert=True)
        result_sentences.append(tokens)
        if hasattr(article, 'content_sentence'):
            for sen in article.content_sentence:
                result_sentences.append(
                    cut(sen, using_stopwords=False, simplified_convert=True))
        # if hasattr(article, 'content'):
        #     result_sentences.append(cut(article.content, using_stopwords=False, simplified_convert=True))
    return result_sentences
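The return value is a list of token lists, which is the shape gensim's Word2Vec expects as training sentences; a minimal sketch, with the keyword and corpus size chosen purely for illustration:

from gensim.models import Word2Vec

# Illustrative only: build word vectors from tokenized PTT titles and sentences.
sentences = get_sentence('日本', number=1000)
w2v_model = Word2Vec(sentences, min_count=5)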
def n_similarity_test():
    # 'models' below is assumed to be a module-level list of trained word2vec models.
    sentence_list = [
        '馬總統走光照 蔡正元:經專家鑑定為光影', '馬走光照瘋傳總統府譴責', '2016 全球 軍事力量排名',
        '舉債也最低... 「六都還款王」第2名令人', '【北捷殺人案】鄭捷判死定讞5大理由曝光', '地震',
        '日本紅十字會:捐款不用手續費 善款100%', '日本熊本強震 屏縣府擬捐香蕉賑災', '有沒有日本重新定義島的八卦',
        '日本只在利益不衝突時才是朋友'
    ]

    tokens_list = [cut(sentence) for sentence in sentence_list]

    for i in range(1, len(tokens_list)):
        print(tokens_list[i - 1])
        print(tokens_list[i])
        for model in models:
            print(model.n_similarity(tokens_list[i], tokens_list[i - 1]))
from gensim import matutils
from numpy import array

# cut and log are project-local helpers assumed to be in scope.
def compute_vector(model, input_data, need_log=False):
    if isinstance(input_data, str):
        tokens = cut(input_data, using_stopwords=True, simplified_convert=True, log=need_log)
    else:
        tokens = input_data
    if len(tokens) > 0 and (tokens[-1] in ['八卦', '卦']):
        del tokens[-1]
    if need_log:
        print(tokens)
    tokens_not_found = [word for word in tokens if word not in model]
    if tokens_not_found:
        log('token not in model :' + " ".join(tokens_not_found))
    v1 = [model[word] for word in tokens if word in model]
    if len(v1) == 0:
        print('invalid article:', input_data)
        return None
    vector = matutils.unitvec(array(v1, float).mean(axis=0))
    return vector
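Because the result is unit-normalized by matutils.unitvec, the cosine similarity of two outputs is just their dot product; a sketch assuming w2v_model is a loaded set of word vectors (e.g. gensim KeyedVectors):

from numpy import dot

v_a = compute_vector(w2v_model, '日本熊本強震 屏縣府擬捐香蕉賑災')
v_b = compute_vector(w2v_model, '日本紅十字會:捐款不用手續費 善款100%')
if v_a is not None and v_b is not None:
    print('cosine similarity:', dot(v_a, v_b))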
Example #6
from gensim import corpora, models

# cut is a project-local tokenizer helper assumed to be in scope.
def build_lda_model(input_data, num_topics=1):
    if len(input_data) == 0:
        print('data is empty')
        return

    if isinstance(input_data, str):
        input_data = [input_data]

    texts = []
    for data in input_data:
        tokens = cut(data, using_stopwords=True, simplified_convert=True)
        texts.append(tokens)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda_model = models.ldamodel.LdaModel(
        corpus, num_topics=num_topics, id2word=dictionary, passes=1)

    return lda_model
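A minimal sketch of inspecting the returned model; the two input strings are illustrative, and show_topics is the standard gensim accessor for the learned topics:

lda_model = build_lda_model(['日本熊本強震 屏縣府擬捐香蕉賑災', '北捷殺人案 鄭捷判死定讞'], num_topics=2)
for topic in lda_model.show_topics(num_topics=2, num_words=10):
    print(topic)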
    def _compute_vector(self, input_data, tfidf_vectorizer=None):
        weights = None
        if isinstance(input_data, list):
            if len(input_data) == 0:
                tokens = []
            elif isinstance(input_data[0], tuple):
                tokens = [data_tuple[0] for data_tuple in input_data]
                weights = [data_tuple[1] for data_tuple in input_data]
            else:
                tokens = input_data
        else:
            tokens = cut(input_data,
                         using_stopwords=True,
                         simplified_convert=True)

        if len(tokens) > 0 and (tokens[-1] in ['八卦', '卦']):
            del tokens[-1]
        v1 = []
        if tfidf_vectorizer is not None:
            idf_table = self.build_idf_table(tfidf_vectorizer)
        for index, word in enumerate(tokens):
            if word in self.model:
                word_vector = self.model[word]
                if weights:
                    # use the loop index so repeated tokens keep their own weights
                    word_vector = word_vector * weights[index]
                if tfidf_vectorizer is not None and word in idf_table:
                    word_vector = word_vector * idf_table[word]
                v1.append(word_vector)
        if len(v1) == 0:
            print('invalid article:', input_data)
            return None

        # v1 = [self.model[word] for word in tokens if word in self.model]
        if tfidf_vectorizer is None:
            return array(v1, float).mean(axis=0)
        else:
            return sum(v1)
Example #8
def _split_string(article, split_content=True):
    tokens = cut(article.title)
    if split_content:
        tokens.extend(keywords_extraction([article], 1))
    return ' '.join(tokens)
from python_code.model import ptt_article_fetcher
from python_code.model.my_tokenize.tokenizer import cut

articles = ptt_article_fetcher.fetch_articles('', number=10, page=6)
using_stopwords = False

equals_tokens = []
for article in articles:
    token1 = cut(article.title, using_stopwords, True)
    token2 = cut(article.title, using_stopwords, False)
    if token1 == token2:
        equals_tokens.append(token1)
    else:
        print('converted  : ' + str(token1))
        print('unconverted: ' + str(token2))

for i in equals_tokens:
    print(i)