Beispiel #1
0
def obtainNgrams(tweetListPreProcessed, maxNgram, individualLanguage=True):
    """Build per-language n-gram frequency distributions from tweets.

    The tweets are first concatenated per language, optionally reduced to
    individual languages only, stripped of repeated spaces, and finally
    handed to ``freqDistributions``.

    Args:
        tweetListPreProcessed: Pre-processed tweets, passed unchanged to
            ``concatenateLanguageTweets``.
        maxNgram: Largest n-gram order requested. ``maxNgram + 1`` is passed
            to ``freqDistributions`` (presumably an exclusive upper bound --
            confirm against that helper).
        individualLanguage: When true (the default, matching the previous
            hard-coded behaviour) only individual languages (en, es, ...)
            are kept via ``separateIndividualLanguages``. When false the
            mixed-language corpus (en+es, pt+gl, ...) is used as returned by
            ``concatenateLanguageTweets``.

    Returns:
        A tuple ``(corpusNgrams, arrayLanguages, arrayLanguagesFull)``.
        ``arrayLanguages`` is the list produced by
        ``separateIndividualLanguages`` when ``individualLanguage`` is true;
        otherwise it is ``arrayLanguagesFull``.
    """
    # Join all the tweets of each language into one corpus entry per language.
    corpus, arrayLanguagesFull = concatenateLanguageTweets(tweetListPreProcessed)

    if individualLanguage:
        corpus, arrayLanguages = separateIndividualLanguages(corpus)
    else:
        # Without this branch the name would be unbound at the return below.
        # NOTE(review): falling back to the full language list is assumed to
        # be the intended value for the mixed-language mode -- confirm.
        arrayLanguages = arrayLanguagesFull

    # Concatenation can leave double spaces behind; collapse them. Only
    # existing keys are reassigned, so iterating the dict directly is safe.
    for key in corpus:
        corpus[key] = preprocess.remove_multiple_spaces(corpus[key])

    corpusNgrams = freqDistributions(corpus, maxNgram + 1)

    return corpusNgrams, arrayLanguages, arrayLanguagesFull