コード例 #1
0
ファイル: train_lda_de.py プロジェクト: Quving/newsminer
def prepare_articles(articles, from_cache=False):
    """Build (or load from cache) the tokenized corpus used to train the LDA model.

    For every article the description, the title and the body (``fulltext``
    when it is truthy, otherwise ``content``) are cleaned, lemmatized, split
    on whitespace and filtered against the NLTK German stop word list.
    The result is pickled to ``data/lda-trainingdata.pickle`` so a later call
    with ``from_cache=True`` can skip the preprocessing.

    Args:
        articles: Iterable of objects exposing ``description``, ``title``,
            ``fulltext`` and ``content`` attributes. Ignored when
            ``from_cache`` is true.
        from_cache: When true, return the previously pickled corpus instead
            of processing ``articles``.

    Returns:
        A list with one entry per article, each entry being that article's
        list of tokens (when loaded from cache: whatever object was pickled).

    Raises:
        FileNotFoundError: If ``from_cache`` is true and no cache file exists.
    """
    filename = "data/lda-trainingdata.pickle"

    if from_cache:
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # cache file that this function wrote itself.
        with open(filename, 'rb') as file:
            return pickle.load(file)

    # Built only on the non-cache path: the cache path never needs them.
    lemmatizer = Lemmatizer()
    # A set gives O(1) membership tests for the per-token stop word filter.
    german_stop_words = frozenset(stopwords.words('german'))
    # Matches bracketed spans such as the '[+ xxx chars]' marker in 'content'.
    bracketed = re.compile(r'\[.*?\]')

    texts = []
    for article in progressbar(articles):
        body = article.fulltext if article.fulltext else article.content
        lemmatized_parts = []
        for text in (article.description, article.title, body):
            if not text:
                continue
            text = bracketed.sub('', text)
            # Keep purely alphanumeric tokens and tokens containing a period.
            text = " ".join(x for x in text.split() if x.isalnum() or '.' in x)
            lemmatized_parts.append(
                lemmatizer.lemmatize_text(text=text, verbose=False))

        # Join with a space so the last word of one field is not fused with
        # the first word of the next one.
        tokens = " ".join(lemmatized_parts).split()
        texts.append([x for x in tokens if x not in german_stop_words])

    # Cache lda-trainingdata
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'wb') as file:
        pickle.dump(texts, file)

    return texts