def obtainNgrams(tweetListPreProcessed, maxNgram):
    """Run the pre-processed tweets through the n-gram pipeline.

    The tweets are joined per language with ``concatenateLanguageTweets``,
    the joined corpus is split with ``separateIndividualLanguages``, repeated
    spaces are removed from every corpus entry, and the cleaned corpus is
    handed to ``freqDistributions``.

    Args:
        tweetListPreProcessed: pre-processed tweets, forwarded unchanged to
            ``concatenateLanguageTweets``.
        maxNgram: n-gram limit; note that ``maxNgram + 1`` is the value
            actually passed to ``freqDistributions``.

    Returns:
        A 3-tuple ``(corpusNgrams, arrayLanguages, arrayLanguagesFull)``:
        the result of ``freqDistributions``, the language list returned by
        ``separateIndividualLanguages``, and the language list returned by
        ``concatenateLanguageTweets``.
    """
    # One corpus entry per language, plus the full list of languages seen.
    joined_corpus, arrayLanguagesFull = concatenateLanguageTweets(
        tweetListPreProcessed)

    # Hard-wired switch:
    #   True  -> only individual languages (en, es, ..)
    #   False -> mixed languages (en+es, pt+gl, ..)
    # NOTE(review): arrayLanguages is only assigned on the True path.
    only_individual_languages = True
    if only_individual_languages:
        joined_corpus, arrayLanguages = separateIndividualLanguages(
            joined_corpus)

    # Concatenation can leave doubled spaces behind; collapse them per entry.
    for language in joined_corpus.keys():
        raw_text = joined_corpus.get(language)
        joined_corpus[language] = preprocess.remove_multiple_spaces(raw_text)

    ngram_distributions = freqDistributions(joined_corpus, maxNgram + 1)
    return ngram_distributions, arrayLanguages, arrayLanguagesFull
# Step 1: read the train and test datasets into lists of Tweet objects.
dataset_train = "../Dataset/output_complete.txt"
dataset_test = "../Dataset/test_complete.txt"
LI_Coefficients = "../Dataset/LICoefficients_10gram_for-train_complete_clean.txt"

tweetList_train, tweetList_test = (
    read.read_tweets_dataset(dataset_train),
    read.read_tweets_dataset(dataset_test),
)

# Step 2: pre-processing.
#   raw data   -> tweetList_*
#   clean data -> tweetListPreProcessed_*
tweetListPreProcessed_train, tweetListPreProcessed_test = (
    preprocess.main(tweetList_train),
    preprocess.main(tweetListPreProcessed_test) if False else preprocess.main(tweetList_test),
)
# shuffle(tweetListPreProcessed)

# Step 3: algorithms.
#   3.1   Bayesian networks
#   3.2.1 Linear interpolation
#         - generate linear coefficients: input (n-grams and language)
#         - smooth data
# cv.crossValidationLinearInterpolation(tweetListPreProcessed_train, 3, maxNgram)

linearCoefficientsAll = []
trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(
    tweetListPreProcessed_train, maxNgram)

for gram in xrange(1, maxNgram+1):
    # NOTE(review): the body of this loop is not visible in this chunk; the
    # placeholder below only keeps the block parseable. Restore the real body.
    pass
# _____________________________________________________________________________
# Compute linear-interpolation coefficients per language and append them to a
# tab-separated text file under ../Dataset/.
#
# Command-line arguments:
#   sys.argv[1]  path of the dataset to read
#   sys.argv[2]  maximum n-gram order (integer)

# 1-. Read the dataset and build the list of Tweet objects
dataset = sys.argv[1]
maxNgram = int(sys.argv[2])
# Dataset file name split on '.'; element 0 is used in the output file name.
filename = os.path.basename(dataset).split('.')

tweetList = read.read_tweets_dataset(dataset)

# 2-. Pre-process stage
#   Raw data   -> tweetList
#   Clean data -> tweetListPreProcessed
tweetListPreProcessed = preprocess.main(tweetList)

# 3-. OBTAIN N-GRAMS and Linear Coefficients
# The output path does not depend on the loop variable, so build it once.
coefficientsPath = ('../Dataset/LICoefficients_' + str(maxNgram) +
                    'gram_for-' + str(filename[0]) + '.txt')

# NOTE(review): the range starts at 5, so nothing is computed or written when
# maxNgram < 5 -- confirm this lower bound is intended.
for i in xrange(5, maxNgram+1):
    corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(
        tweetListPreProcessed, i+1)
    linearCoefficients = linear.getlinearcoefficientsForLanguageArray(
        arrayLanguages, i, corpusNgrams)
    # print linearCoefficients

    # 'a+' appends, so rows from earlier orders (and earlier runs) are kept.
    # The context manager closes the file even if a write raises, and the
    # handle no longer shadows the Python 2 builtin ``file``.
    with open(coefficientsPath, 'a+') as coefficientsFile:
        for li in linearCoefficients:
            # One row per entry: order i, then li[0] .. li[i], tab-separated.
            fields = [str(i)]
            for co in xrange(0, i+1):
                fields.append(str(li[co]))
            coefficientsFile.write("\t".join(fields) + "\n")
import sys

maxNgram = 5

# Step 1: read both datasets into lists of Tweet objects.
dataset = "../Dataset/output_complete.txt"
test = "../Dataset/mezclado.txt"
LI_Coefficients = "../Dataset/LICoefficients_5gram_for-output_complete.txt"

tweetList, tweetListtest = (
    read.read_tweets_dataset(dataset),
    read.read_tweets_dataset(test),
)

# Step 2: pre-processing.
#   raw data   -> tweetList
#   clean data -> tweetListPreProcessed
tweetListPreProcessed, tweetListPreProcessedtest = (
    preprocess.main(tweetList),
    preprocess.main(tweetListtest),
)
# Only the tweets coming from `dataset` are shuffled; the return value of
# shuffle() is not used.
shuffle(tweetListPreProcessed)
# utils.printTweets(tweetListPreProcessed)

# Step 3: algorithms.
# 3.1 Obtain the n-grams (note: maxNgram + 1 is what gets passed on).
corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(
    tweetListPreProcessed, maxNgram + 1)
arrayLanguagesFull = utils.orderVector(arrayLanguagesFull)
# Example: print(corpusNgrams.get(str(3)).get('pt'))
import sys

maxNgram = 5

# Step 1: read both datasets into lists of Tweet objects.
dataset = "../Dataset/output_complete.txt"
test = "../Dataset/mezclado.txt"
LI_Coefficients = "../Dataset/LICoefficients_5gram_for-output_complete.txt"

tweetList, tweetListtest = (
    read.read_tweets_dataset(dataset),
    read.read_tweets_dataset(test),
)

# Step 2: pre-processing.
#   raw data   -> tweetList
#   clean data -> tweetListPreProcessed
tweetListPreProcessed, tweetListPreProcessedtest = (
    preprocess.main(tweetList),
    preprocess.main(tweetListtest),
)
# Only the tweets coming from `dataset` are shuffled; the return value of
# shuffle() is not used.
shuffle(tweetListPreProcessed)
# utils.printTweets(tweetListPreProcessed)

# Step 3: algorithms.
# 3.1 Obtain the n-grams (note: maxNgram + 1 is what gets passed on), then
#     reorder the full language list with utils.orderVector.
corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(
    tweetListPreProcessed, maxNgram + 1)
arrayLanguagesFull = utils.orderVector(arrayLanguagesFull)