Example #1
import os
import pickle
import uuid

import pandas as pd
import pyLDAvis.sklearn
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import normalize

import dataProcessHelper as dph

numberTopics = 10
columnName = 'combCleanLemma'

path = dph.getDataPath('pressBiTriLemma.json')
df = pd.read_json(path)

dfBigramLemma = df[columnName]

# TfidfVectorizer expects plain strings, so join each document's lemma list
# back into a single space-separated string.
df2 = pd.DataFrame(dfBigramLemma)
df2[columnName] = df2[columnName].apply(lambda lemmas: ' '.join(map(str, lemmas)))


vectorizer = TfidfVectorizer(strip_accents='unicode', ngram_range=(1, 2))
xTrainTfidf = vectorizer.fit_transform(df2[columnName])
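
# A quick sanity check of the vectorizer output (get_feature_names_out needs
# scikit-learn >= 1.0; older versions use get_feature_names):
print(xTrainTfidf.shape)                        # (number of documents, number of terms)
print(vectorizer.get_feature_names_out()[:10])  # a few of the unigram/bigram features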

# Parameter grid for the LDA grid search (currently one candidate per parameter).
searchParams = {'n_components': [10], 'learning_decay': [.5]}
if True:
    model = LDA()
    model = GridSearchCV(model, searchParams)
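    # The grid search is built but never run in this excerpt; a minimal,
    # assumed completion using the TF-IDF matrix from above:
    model.fit(xTrainTfidf)
    print(model.best_params_)        # best combination from searchParams
    bestLda = model.best_estimator_  # refitted LDA with those parameters
    # pyLDAvis (imported above) can then visualize the fitted model; note that
    # pyLDAvis.sklearn was renamed pyLDAvis.lda_model in pyLDAvis >= 3.4.
    vis = pyLDAvis.sklearn.prepare(bestLda, xTrainTfidf, vectorizer)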
    extraDict = {
        'removeWords': ['bundes'],  # Bug: words with umlauts such as 'für' cannot be filtered out.
        'alpha': 1,
        'T': 150,
        'tau': 64
    }
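
    # The umlaut bug noted above is usually a Unicode-normalization mismatch:
    # 'für' can be stored composed (NFC) or decomposed (NFD), and the two forms
    # are not equal as strings. A sketch of one possible fix, normalizing words
    # to NFC before filtering (normalizeNfc is not part of the project):
    import unicodedata

    def normalizeNfc(words):
        # Map every word to its composed form so both spellings of 'für' compare equal.
        return [unicodedata.normalize('NFC', w) for w in words]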

    #   Data preparation:
    #   (settingsDict, modelType and corpusDict are not defined in this
    #   excerpt; they presumably come from earlier in the original script.)

    ##  Complete the settings dict with default parameters.
    settingsDict = dph.completeSettingsDict(settingsDict)
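
    # dph.completeSettingsDict is project-specific. Judging by the comment
    # above, it fills in missing keys with defaults; a sketch of that pattern
    # (completeWithDefaults and its default values are invented):
    def completeWithDefaults(settings):
        defaults = {'noBelow': 5, 'noAbove': 0.5}  # invented example defaults
        return {**defaults, **settings}  # caller-supplied keys win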

    ##  Build the paths for the data and the model. The data file name is assembled from the settings dict.
    filePath = dph.getDataPath(
        os.path.join('autoCreation', dph.getName(settingsDict)))
    modelPath = dph.getModelPath(os.path.join(modelType, str(uuid.uuid1())))

    ##  Create the data, or load it if it already exists.
    df = dph.dataCleaningPipeline('press42.json', settingsDict)
    df = dph.removingWords(df, settingsDict['gensimPreProcess'],
                           extraDict['removeWords'])

    ##  Subsequent lowercasing of the words (disabled here).
    if False:
        df = dph.toLower(df, settingsDict['gensimPreProcess'])
        extraDict['toLower'] = True

    ##  Build the bag-of-words corpus and a dictionary.
    bowCorpus, dictionary = dph.getCorpus(df[settingsDict['gensimPreProcess']],
                                          corpusDict['noBelow'])
    ##  (The getCorpus call is cut off in the source; the closing parenthesis
    ##  is added so the excerpt parses, and any further arguments are unknown.)
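
    # dph.getCorpus is project-specific. Given the 'noBelow' argument, it
    # presumably wraps the standard gensim corpus construction; a sketch under
    # that assumption (getCorpusSketch is not part of the project):
    from gensim.corpora import Dictionary

    def getCorpusSketch(docs, noBelow):
        # Map tokens to ids and drop tokens occurring in fewer than noBelow documents.
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(no_below=noBelow)
        # Turn each tokenized document into a (tokenId, count) bag-of-words vector.
        bow = [dictionary.doc2bow(doc) for doc in docs]
        return bow, dictionary
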
Example #2

import locale

import gensim
import pandas as pd

import de_core_news_sm
import dataProcessHelper as dph

# Load the German spaCy model and set a German locale for the processing below.
nlp = de_core_news_sm.load()
locale.setlocale(locale.LC_ALL, 'de_DE')

richTextFilter = ['Zur externen Meldung', 'Zur externen Pressemitteilung']
allowedPostags = ['NOUN', 'VERB']  # alternative: ['NOUN', 'ADJ', 'VERB', 'ADV']
#'Deutschland', 'Bundesregierung', 'wichtig', 'erklärte'

# Combine the columns 'shortText' and 'richText'. (What about the title?)
if True:
    path = dph.getDataPath('press42.json')
    df = pd.read_json(path)
    # Fall back to 'shortText' when 'richText' only holds an external-link
    # placeholder; otherwise combine both columns.
    df['combText'] = df.apply(lambda row: row['shortText']
                              if row['richText'] in richTextFilter
                              else dph.combineStrings(df, row),
                              axis=1)
    dph.saveAsJson(df, 'press42Combined.json')
# Load the combined data
if True:
    path = dph.getDataPath('press42Combined.json')
    df = pd.read_json(path)

# Remove stopwords, punctuation and whitespace from the combined text.
if True:
    df['combClean'] = df.apply(
        lambda row: dph.removeStopPunctSpace(row['combText']),
        axis=1)
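
# dph.removeStopPunctSpace is project-specific. With the spaCy model loaded
# above, such a cleaning step is usually a plain token filter; a sketch under
# that assumption (the name mirrors the helper, the body is a guess):
def removeStopPunctSpaceSketch(text):
    # Keep only tokens that are not stopwords, punctuation or whitespace.
    doc = nlp(str(text))
    return [token.text for token in doc
            if not (token.is_stop or token.is_punct or token.is_space)]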