def create_sets(self):
    """Create training/test set index lists for self.tweets.

    In debug mode the index lists are reloaded from self.DEBUG_SETS; on
    any read failure the split is regenerated from scratch.  Otherwise
    each tweet index is appended to self.trainset with probability
    self.distribution[0] and to self.testset otherwise.  When self.dump
    is set, the resulting [trainset, testset] pair is written back to
    self.DEBUG_SETS.
    """
    debug = self.debug
    if debug:
        try:
            print("reading from file")
            # NOTE(review): sibling methods read caches via
            # helpers.read_from_file; confirm self.read_from_file is the
            # intended entry point here.
            totallist = self.read_from_file(self.DEBUG_SETS)
            self.trainset = totallist[0]
            self.testset = totallist[1]
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; any read failure falls back to a
            # fresh random split.
            print("! Error in reading from file debug.txt. Redo create_sets")
            debug = False
    if not debug:
        for i in range(len(self.tweets)):
            # random.random() is in [0.0, 1.0): index i joins the training
            # set with probability self.distribution[0].
            r_nr = random.random()
            if r_nr < self.distribution[0]:
                self.trainset.append(i)
            else:
                self.testset.append(i)
        if self.dump:
            helpers.dump_to_file(self.DEBUG_SETS, [self.trainset, self.testset])
def create_wordpostuples(self, array):
    """Create (word, POS-tag) tuples for every tweet in `array`.

    In debug mode the tuples are reloaded from <topic>_wordpos.txt; on any
    read failure they are regenerated.  Regeneration starts the local Frog
    server, tags each tweet through a FrogClient, appends each result to
    self.tuples, dumps the list to <topic>_wordpos.txt, then stops the
    server.
    """
    filename = self.TOPICFILE.split('.')[0]
    wordpos_filename = filename + "_wordpos.txt"
    readfromfile = self.debug
    if readfromfile:
        try:
            self.tuples = helpers.read_from_file(wordpos_filename)
        except Exception:
            # Narrowed from a bare `except:`; an unreadable cache falls
            # through to full regeneration below.
            print("! Error in reading from file. Redo posword tuples")
            readfromfile = False
    if not readfromfile:
        self.startFrogServer('start')
        time.sleep(20)  # Give the Frog server time to start up.
        frogclient = FrogClient('localhost', self.PORTNUMBER)
        print("** START frog analysis.")
        print("** Creating POS tags.. (This may take a while)")
        for item in array:
            lemmapos_array = self.frog_tweets(frogclient, item)
            self.tuples.append(lemmapos_array)
        helpers.dump_to_file(wordpos_filename, self.tuples)
        self.startFrogServer('stop')
def tryout_tuples(self, array):
    """Experimental run: POS-tag every item of `array` with Frog and dump
    the accumulated tuples to 'te-tuples.txt'."""
    self.startFrogServer('start')
    # Give the freshly started Frog server time to come up.
    time.sleep(15)
    client = FrogClient('localhost', self.PORTNUMBER)
    print("** START frog analysis.")
    print("** Creating POS tags.. (This may take a while)")
    for tweet in array:
        tagged = self.frog_tweets(client, tweet)
        self.tuples.append(tagged)
    helpers.dump_to_file('te-tuples.txt', self.tuples)
    self.startFrogServer('stop')
def __init__(self, mode, corpusfile, referencefile):
    """Initialize tweets from files and dictionaries.

    mode          -- option string; if it contains '--debug', tweets are
                     reloaded from the *_lda_testing.txt cache files
                     instead of being re-extracted.
    corpusfile    -- path of the corpus tweet file.
    referencefile -- path of the reference tweet file.

    Side effects: in non-debug mode both tweet lists are dumped to
    'corpusfile_lda_testing.txt' / 'referencefile_lda_testing.txt' so a
    later debug run can skip extraction.  Sets self.corpus,
    self.referencecorpus and self.loglikelihood.
    """
    self.load_stopword_file()
    if '--debug' in mode:
        # Debug run: reuse previously dumped tweet lists to avoid the
        # (slow) extraction step.
        self.corpusfile_tweets = helpers.read_from_file("corpusfile_lda_testing.txt")
        self.referencefile_tweets = helpers.read_from_file("referencefile_lda_testing.txt")
    else:
        # Extract tweets and immediately cache them for future debug runs.
        self.corpusfile_tweets = self.get_tweets(corpusfile)
        helpers.dump_to_file("corpusfile_lda_testing.txt", self.corpusfile_tweets)
        self.referencefile_tweets = self.get_tweets(referencefile)
        helpers.dump_to_file("referencefile_lda_testing.txt", self.referencefile_tweets)
    # Build word dictionaries for both corpora, then score the corpus
    # against the reference via log-likelihood.
    self.corpus = self.create_dictionary(self.corpusfile_tweets)
    self.referencecorpus = self.create_dictionary(self.referencefile_tweets)
    self.loglikelihood = self.calculate_loglikelihood(self.corpus, self.referencecorpus)
def write_all_to_file(self):
    """Dump every non-empty preprocessing array to its debug file."""
    print("WRITE TO FILE")
    targets = [
        (self.DEBUG_STEM, self.stemmed_tweets_array),
        (self.DEBUG_TOKEN, self.tokenized_tweets_array),
        (self.DEBUG_LEMMA, self.lemmatized_tweets_array),
        (self.DEBUG_POS, self.pos_tweets_array),
    ]
    for destination, data in targets:
        if data:
            helpers.dump_to_file(destination, data)
def dump_classifier(self, filename):
    """Serialize the trained classifier together with its scaler to `filename`."""
    helpers.dump_to_file(filename, (self.classifier, self.scaler))