def prepare_articles(articles, from_cache=False):
    """Tokenize, lemmatize and stop-word-filter articles for LDA training.

    Parameters
    ----------
    articles : iterable
        Article objects exposing ``description``, ``title``, ``content``
        and ``fulltext`` attributes (fulltext may be falsy).
    from_cache : bool
        When True, load previously pickled training data from disk and
        skip all processing.

    Returns
    -------
    list[list[str]]
        One list of lemmatized, stop-word-filtered tokens per article.
    """
    filename = "data/lda-trainingdata.pickle"

    if from_cache:
        # NOTE(review): pickle.load is only safe because this file is our
        # own cache; never point `filename` at untrusted data.
        with open(filename, 'rb') as file:
            return pickle.load(file)

    texts = []
    lemmatizer = Lemmatizer()
    # Use a set so the per-token membership test below is O(1), not O(n).
    german_stop_words = set(stopwords.words('german'))
    # Compile once outside the loop; raw string avoids the invalid-escape
    # DeprecationWarning on '\['. Strips bracketed spans such as the
    # '... [+ xxx chars]' truncation marker in 'content'.
    bracket_pattern = re.compile(r'\[.*?\]')

    for article in progressbar(articles):
        article_text = ""
        # Prefer fulltext over the (possibly truncated) content field.
        for text in [article.description, article.title,
                     article.fulltext if article.fulltext else article.content]:
            if text:
                text = bracket_pattern.sub('', text)
                # Keep purely alphanumeric tokens plus dotted tokens
                # (abbreviations, URLs, decimal numbers).
                text = " ".join([x for x in text.split()
                                 if x.isalnum() or '.' in x])
                article_text += lemmatizer.lemmatize_text(text=text,
                                                          verbose=False)
        tokens = [x for x in article_text.split()
                  if x not in german_stop_words]
        texts.append(tokens)

    # Cache the training data for subsequent from_cache=True calls.
    # exist_ok avoids the check-then-act race of exists()+makedirs().
    os.makedirs("data", exist_ok=True)
    with open(filename, 'wb') as file:
        pickle.dump(texts, file)
    return texts