import pickle
import pandas as pd
import dataProcessHelper as dph
import pyLDAvis.sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.preprocessing import normalize
from sklearn.model_selection import GridSearchCV

numberTopics = 10
columnName = 'combCleanLemma'

# Load the preprocessed press releases and join each token list back into a single string.
path = dph.getDataPath('pressBiTriLemma.json')
df = pd.read_json(path)
df2 = pd.DataFrame(df[columnName])
df2[columnName] = df2.apply(lambda row: ' '.join(map(str, row[columnName])), axis=1)

# Build a TF-IDF matrix over unigrams and bigrams.
vectorizer = TfidfVectorizer(strip_accents='unicode', ngram_range=(1, 2))
xTrainTfidf = vectorizer.fit_transform(df2[columnName])

# Grid search over the LDA hyperparameters.
searchParams = {'n_components': [10], 'learning_decay': [.5]}
if True:
    model = LDA()
    model = GridSearchCV(model, searchParams)
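# The grid search above is set up but never fitted in this snippet. What follows
# is a minimal sketch of fitting it and inspecting the winner, not the original
# author's code: the pyLDAvis call assumes the older pyLDAvis.sklearn API that
# is imported above, and the pickle file name is illustrative.
model.fit(xTrainTfidf)
bestLda = model.best_estimator_
print('Best params:', model.best_params_)
print('Best log-likelihood:', model.best_score_)

with open('ldaGridBest.pkl', 'wb') as f:  # illustrative path
    pickle.dump(bestLda, f)

# Interactive topic visualization in a notebook environment.
panel = pyLDAvis.sklearn.prepare(bestLda, xTrainTfidf, vectorizer, mds='tsne')
pyLDAvis.display(panel)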
import os
import uuid
import dataProcessHelper as dph

extraDict = {
    'removeWords': ['bundes'],  # Bug: words with umlauts such as 'für' cannot be filtered.
    'alpha': 1,
    'T': 150,
    'tau': 64
}

# Data preparation:
# 'settingsDict', 'modelType' and 'corpusDict' are presumably defined earlier in the original file.

## Complete the settings dict with default parameters.
settingsDict = dph.completeSettingsDict(settingsDict)

## Build the paths for the data and the model. The file name for the data is
## assembled from the settings dict.
filePath = dph.getDataPath(os.path.join('autoCreation', dph.getName(settingsDict)))
modelPath = dph.getModelPath(os.path.join(modelType, str(uuid.uuid1())))

## Create the data, or load it if it already exists.
df = dph.dataCleaningPipeline('press42.json', settingsDict)
df = dph.removingWords(df, settingsDict['gensimPreProcess'], extraDict['removeWords'])

## Optionally lowercase the words afterwards.
if False:
    df = dph.toLower(df, settingsDict['gensimPreProcess'])
    extraDict['toLower'] = True

## Build the bag-of-words corpus and a dictionary.
bowCorpus, dictionary = dph.getCorpus(df[settingsDict['gensimPreProcess']],
                                      corpusDict['noBelow'],
                                      corpusDict['noAbove'])  # final argument assumed; the original snippet is truncated here
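# Hedged sketch, not in the original snippet: alpha=1, T=150 and tau=64 match
# gensim's HdpModel defaults, so the model saved at 'modelPath' is presumably a
# hierarchical Dirichlet process model along these lines.
import gensim

hdpModel = gensim.models.HdpModel(bowCorpus,
                                  id2word=dictionary,
                                  alpha=extraDict['alpha'],
                                  T=extraDict['T'],
                                  tau=extraDict['tau'])
hdpModel.save(modelPath)

# Show the top words of the first few topics.
for topicId, topic in hdpModel.print_topics(num_topics=5, num_words=8):
    print(topicId, topic)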
import locale
import pandas as pd
import gensim
import de_core_news_sm
import dataProcessHelper as dph

nlp = de_core_news_sm.load()
locale.setlocale(locale.LC_ALL, 'de_DE')

richTextFilter = ['Zur externen Meldung', 'Zur externen Pressemitteilung']
allowedPostags = ['NOUN', 'VERB']  # ['NOUN', 'ADJ', 'VERB', 'ADV']
# Candidates for removal: 'Deutschland', 'Bundesregierung', 'wichtig', 'erklärte'

# Combine the columns 'shortText' and 'richText'. What about the title?
if True:
    path = dph.getDataPath('press42.json')
    df = pd.read_json(path)
    df['combText'] = df.apply(
        lambda row: df['shortText'].iloc[row.name]
        if df['richText'].iloc[row.name] in richTextFilter
        else dph.combineStrings(df, row),
        axis=1)
    dph.saveAsJson(df, 'press42Combined.json')

# Load the combined data.
if True:
    path = dph.getDataPath('press42Combined.json')
    df = pd.read_json(path)

# Remove stop words, punctuation, and whitespace.
if True:
    df['combClean'] = df.apply(
        lambda row: dph.removeStopPunctSpace(df['combText'].iloc[row.name]),
        axis=1)
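# Hedged sketch, not in the original snippet: 'allowedPostags' is defined above
# but unused in the visible code. A lemmatization step restricted to those POS
# tags, producing the 'combCleanLemma' column consumed downstream, would
# plausibly look like this; the output file name is illustrative.
def lemmatizeFiltered(text, allowed=allowedPostags):
    # Keep only the lemmas of tokens whose POS tag is in 'allowed'.
    return [token.lemma_ for token in nlp(text) if token.pos_ in allowed]

if True:
    df['combCleanLemma'] = df['combText'].apply(lemmatizeFiltered)
    dph.saveAsJson(df, 'press42LemmaExample.json')  # illustrative file name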