Beispiel #1
0
def obtainNgrams(tweetListPreProcessed, maxNgram):
    """Build per-language n-gram frequency distributions from pre-processed tweets.

    The tweets are first concatenated per language, then (because the
    ``individualLanguage`` switch below is hard-coded to True) reduced to the
    individual languages (en, es, ...) rather than mixed labels
    (en+es, pt+gl, ...).

    Returns a 3-tuple ``(corpusNgrams, arrayLanguages, arrayLanguagesFull)``:
    the result of ``freqDistributions`` called with ``maxNgram + 1``, the
    language array returned by ``separateIndividualLanguages``, and the
    language array returned by ``concatenateLanguageTweets``.
    """
    # One text per language label, plus the full list of labels.
    textByLanguage, arrayLanguagesFull = concatenateLanguageTweets(
        tweetListPreProcessed)

    # True  -> keep only individual languages (en, es, ...)
    # False -> keep mixed languages (en+es, pt+gl, ...)
    individualLanguage = True
    if individualLanguage:
        textByLanguage, arrayLanguages = separateIndividualLanguages(
            textByLanguage)

    # Concatenation leaves doubled spaces behind; collapse them per language.
    for language in list(textByLanguage.keys()):
        textByLanguage[language] = preprocess.remove_multiple_spaces(
            textByLanguage[language])

    return (freqDistributions(textByLanguage, maxNgram + 1),
            arrayLanguages,
            arrayLanguagesFull)
# 1-. Read the training and test datasets into tweet lists
#     (presumably lists of Tweet objects -- confirm against `read`).

dataset_train = "../Dataset/output_complete.txt"
dataset_test = "../Dataset/test_complete.txt"
# Path of a coefficients file; it is only assigned here, not read, in this fragment.
LI_Coefficients = "../Dataset/LICoefficients_10gram_for-train_complete_clean.txt"

tweetList_train = read.read_tweets_dataset(dataset_train)
tweetList_test = read.read_tweets_dataset(dataset_test)


# 2-. Pre-processing stage
#   Raw data   -> tweetList_train / tweetList_test
#   Clean data -> tweetListPreProcessed_train / tweetListPreProcessed_test

tweetListPreProcessed_train = preprocess.main(tweetList_train)
tweetListPreProcessed_test = preprocess.main(tweetList_test)
# shuffle(tweetListPreProcessed)

# 3-. Algorithms

# 3.1-. Algorithms: Bayesian Networks
#   3.2.1-. Linear interpolation
#       Generate linear coefficients: input (n-grams and language)
#       Smooth data

# cv.crossValidationLinearInterpolation(tweetListPreProcessed_train, 3, maxNgram)
linearCoefficientsAll = list()

# NOTE(review): maxNgram is not assigned anywhere in this fragment; it must
# already be defined before this line runs.
trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(tweetListPreProcessed_train, maxNgram)
for gram in xrange(1, maxNgram+1):
# _____________________________________________________________________________


# 1-. Read dataset and create tweetList fullfilled of Tweet object*

# Command line: <dataset path> <maximum n-gram order>
dataset = sys.argv[1]
maxNgram = int(sys.argv[2])

# Base name of the dataset split on '.'; the first piece names the output file.
filename = os.path.basename(dataset).split('.')

tweetList = read.read_tweets_dataset(dataset)

# 2-. Pre-processing stage
#   Raw data   -> tweetList
#   Clean data -> tweetListPreProcessed
tweetListPreProcessed = preprocess.main(tweetList)

# 3-. Obtain n-grams and linear-interpolation coefficients

# The output path does not depend on the loop variable, so build it once.
coefficientsPath = ('../Dataset/LICoefficients_' + str(maxNgram) + 'gram_for-'
                    + str(filename[0]) + '.txt')

for i in xrange(5, maxNgram+1):
    corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(tweetListPreProcessed, i+1)
    linearCoefficients = linear.getlinearcoefficientsForLanguageArray(arrayLanguages, i, corpusNgrams)
    # print linearCoefficients
    # Append this order's rows and close the handle before the next iteration.
    # The previous version reopened the file on every pass and closed only the
    # last handle after the loop, which also failed when the loop body never
    # ran (maxNgram < 5) because no file had been opened.
    with open(coefficientsPath, 'a+') as coefficientsFile:
        for li in linearCoefficients:
            # Row layout: i, li[0], li[1], ..., li[i] separated by tabs.
            fields = [str(i)] + [str(li[co]) for co in xrange(i + 1)]
            coefficientsFile.write("\t".join(fields) + "\n")
Beispiel #4
0
import sys

# Highest n-gram order used by this script.
maxNgram = 5

# 1-. Read the datasets into tweet lists
#     (presumably lists of Tweet objects -- confirm against `read`).

dataset = "../Dataset/output_complete.txt"
test = "../Dataset/mezclado.txt"
# Path of a coefficients file; it is only assigned here, not read, in this fragment.
LI_Coefficients = "../Dataset/LICoefficients_5gram_for-output_complete.txt"

tweetList = read.read_tweets_dataset(dataset)
tweetListtest = read.read_tweets_dataset(test)
# 2-. Pre-processing stage
#   Raw data   -> tweetList
#   Clean data -> tweetListPreProcessed

tweetListPreProcessed = preprocess.main(tweetList)
tweetListPreProcessedtest= preprocess.main(tweetListtest)
# NOTE(review): shuffle is not imported in the visible code; presumably
# random.shuffle, which reorders the list in place -- confirm.
shuffle(tweetListPreProcessed)

#utils.printTweets(tweetListPreProcessed)

# 3-. Algorithms
#
# 3.1-. Obtain n-grams

corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(tweetListPreProcessed, maxNgram+1)
arrayLanguagesFull = utils.orderVector(arrayLanguagesFull)

# Example:  print(corpusNgrams.get(str(3)).get('pt'))
import sys

# Highest n-gram order used by this script.
maxNgram = 5

# 1-. Read the datasets into tweet lists
#     (presumably lists of Tweet objects -- confirm against `read`).

dataset = "../Dataset/output_complete.txt"
test = "../Dataset/mezclado.txt"
# Path of a coefficients file; it is only assigned here, not read, in this fragment.
LI_Coefficients = "../Dataset/LICoefficients_5gram_for-output_complete.txt"

tweetList = read.read_tweets_dataset(dataset)
tweetListtest = read.read_tweets_dataset(test)
# 2-. Pre-processing stage
#   Raw data   -> tweetList
#   Clean data -> tweetListPreProcessed

tweetListPreProcessed = preprocess.main(tweetList)
tweetListPreProcessedtest = preprocess.main(tweetListtest)
# NOTE(review): shuffle is not imported in the visible code; presumably
# random.shuffle, which reorders the list in place -- confirm.
shuffle(tweetListPreProcessed)

#utils.printTweets(tweetListPreProcessed)

# 3-. Algorithms
#
# 3.1-. Obtain n-grams

corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(
    tweetListPreProcessed, maxNgram + 1)
arrayLanguagesFull = utils.orderVector(arrayLanguagesFull)