def to_txt(write_path, read_path=None, verbose=False):
    '''
    Function that takes in a path to the StanfordTweetData CSV file, opens it,
    and writes each tweet on its own line to the output file at write_path.
    '''
    read_path = load_data(file_path=read_path, verbose=verbose)
    with open(read_path, 'r') as twitter_csv, \
            open(write_path, 'w', encoding='utf-8') as output:
        reader = latin_csv_reader(twitter_csv, delimiter=',')
        # For each row in the CSV, write the tweet text followed by a newline
        for line in reader:
            output.write(line[5] + '\n')
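# Rough usage sketch for to_txt (file names are hypothetical). Note that
# to_txt calls a module-level load_data(file_path=...) and treats its return
# value as a local CSV path; the method-style load_data defined below has a
# different signature, so this sketch assumes that module-level variant exists.
# Wrapped in a function so nothing runs at import time.
def _example_dump_tweets():
    # Dump one tweet per line so the text can be fed to tools that expect
    # plain, line-oriented input.
    to_txt('sentiment140_tweets.txt',
           read_path='training.1600000.processed.noemoticon.csv',
           verbose=True)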
def load_data(self, feat_extractor=None, verbose=False, return_iter=True,
              rng_seed=None):
    '''
    Function that opens the StanfordTweetData CSV file at self.file_path and
    collects each tweet string (or its extracted features) together with its
    sentiment label.

    The Stanford Sentiment140 dataset
    (training.1600000.processed.noemoticon.csv, or others of a similar
    structure) has the following format per row:
        [polarity, tweet id, tweet date, query, user, tweet text]

    @Arguments:
        feat_extractor -- (optional) a function that takes a tweet text string
            and returns a dictionary of features
        verbose -- if True, this function logs progress as it parses the file
        return_iter -- if True, return an iterator over tuples of
            (record, sentiment); if False, return a list of such tuples
        rng_seed -- (optional) seed for the random shuffle, for reproducibility

    @Return:
        An iterator (or list) of tuples of the following format:
            (tweet text or features, sentiment label)
    '''
    tweet_to_sentiment = list()

    # Open the CSV file; if it is missing, download it first
    try:
        twitter_csv = open(self.file_path, 'r')
    except IOError:
        logger.exception("File I/O error, will try downloading...")
        logger.info("Downloading...")
        self.download_data(self.file_path)
        twitter_csv = open(self.file_path, 'r')

    # Perform parsing of the CSV file
    reader = latin_csv_reader(twitter_csv, delimiter=',')
    for i, tweet in enumerate(reader):
        # Log progress every 10,000 tweets read
        if verbose and i % 10000 == 0:
            logger.info("PROGRESS: at tweet #%s", i)
        # Tweet text is the sixth field of each row
        tweet_string = tweet[5]
        # Ensure the polarity code is in the sentiment dictionary
        try:
            sent = sentiment_binary[int(tweet[0])]
            # If a feat_extractor function was provided, apply it to the tweet
            if feat_extractor:
                features = feat_extractor(tweet_string)
                tweet_to_sentiment.append((features, sent))
            else:
                tweet_to_sentiment.append((tweet_string, sent))
            # Tally the number of samples
            self.samples += 1
        except KeyError:
            logger.debug("Sentiment score of {} skipped.".format(tweet[0]))
    twitter_csv.close()

    # Shuffle the dataset (optionally with a fixed seed)
    random.seed(rng_seed)
    random.shuffle(tweet_to_sentiment)

    # Return a list or an iterator
    if return_iter:
        return iter(tweet_to_sentiment)
    return tweet_to_sentiment
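# Usage sketch for load_data. It is written as a method, so it needs an object
# that provides file_path, samples, and (for the download fallback) a
# download_data method. The Sentiment140Dataset wrapper and bag_of_words
# extractor below are hypothetical stand-ins; sentiment_binary, logger, random,
# and latin_csv_reader are assumed to be defined elsewhere in this module
# (sentiment_binary presumably maps the Sentiment140 polarity codes 0 and 4 to
# negative/positive labels, with anything else skipped). The sketch also
# assumes the CSV already exists locally, so download_data is never called.
class Sentiment140Dataset(object):
    def __init__(self, file_path):
        self.file_path = file_path   # path to the Sentiment140 CSV
        self.samples = 0             # incremented by load_data

    load_data = load_data            # attach the function above as a method


def bag_of_words(text):
    # Toy feature extractor: token-presence features.
    return {token: True for token in text.lower().split()}


def _example_load_features():
    dataset = Sentiment140Dataset('training.1600000.processed.noemoticon.csv')
    # A fixed rng_seed makes the shuffle reproducible between runs.
    for features, label in dataset.load_data(feat_extractor=bag_of_words,
                                             rng_seed=42):
        pass  # feed (features, label) pairs into a classifier here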
# Excerpt: earlier module-level variant of the CSV-loading logic, which
# downloads the Sentiment140 zip from Stanford when no local file_path is given.

# Open file path
if file_path:
    try:
        twitter_csv = open(file_path, 'r')
    except IOError as e:
        print("IO Error:", e.strerror, file_path)
else:
    # Downloads and saves locally the zip file from the internet
    file_path = get_file(
        "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip")
    # Keep the archive open so the CSV member can be streamed below
    zp = ZipFile(file_path, 'r')
    twitter_csv = zp.open('training.1600000.processed.noemoticon.csv')

# Perform parsing of the CSV file
reader = latin_csv_reader(twitter_csv, delimiter=',')
for i, tweet in enumerate(reader):
    # Log progress every 10,000 tweets read
    if verbose and i % 10000 == 0:
        logging.info("PROGRESS: at tweet #%s", i)
    # Tweet text is the sixth field of each row
    tweet_string = tweet[5]
    # Look up the sentiment label for the polarity code
    sent = Sentiment[int(tweet[0])]
    # If a feat_extractor function was provided, apply it to the tweet
    if feat_extractor:
        features = feat_extractor(tweet_string)
        tweet_to_sentiment.append((features, sent))
    else:
        tweet_to_sentiment.append((tweet_string, sent))
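# Every snippet above relies on a latin_csv_reader helper that is not shown
# here. A minimal sketch of what such a helper might look like, assuming its
# only job is to run csv.reader over Latin-1 encoded input (the encoding the
# Sentiment140 CSV ships in), whether the underlying stream yields bytes
# (e.g. ZipFile.open) or text:
import csv


def latin_csv_reader(csv_file, **kwargs):
    # Decode each raw line as Latin-1 when needed, then hand the text lines
    # to csv.reader, forwarding options such as delimiter.
    lines = (line.decode('latin-1') if isinstance(line, bytes) else line
             for line in csv_file)
    return csv.reader(lines, **kwargs)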