Esempio n. 1
0
 def get_iterator(self):
     """Yield rows of the CSV file at ``self.filepath`` that pass the filters.

     The file is opened with ``bz2.open`` or ``gzip.open`` when
     ``self.compression`` is ``'bz2'`` or ``'gzip'``, and with the builtin
     ``open`` otherwise. For the two compressed cases ``self.mode`` is
     replaced by ``binary_mode(self.mode)`` before opening.

     Rows are read through ``csv.DictReader``. A row is yielded only when
     it passes both ``tweet_passes_filter`` (with ``self.filter``) and
     ``tweet_passes_custom_filter_list`` (with ``self.custom_filters``).

     ``self.limit`` caps the number of rows *read* (filtered-out rows
     count too); ``0`` means no cap.

     Yields:
         ``strip_tweet(self.keep_fields, row)`` when ``self.should_strip``
         is truthy, otherwise a plain ``dict`` copy of the row.
     """
     tweet_parser = TweetParser()
     # NOTE(review): bz2.open/gzip.open reject ``encoding`` when the mode is
     # binary; this presumably relies on what binary_mode() returns - confirm.
     if self.compression == 'bz2':
         self.mode = binary_mode(self.mode)
         csv_handle = bz2.open(self.filepath,
                               self.mode,
                               encoding=self.encoding)
     elif self.compression == 'gzip':
         self.mode = binary_mode(self.mode)
         csv_handle = gzip.open(self.filepath,
                                self.mode,
                                encoding=self.encoding)
     else:
         csv_handle = open(self.filepath, self.mode, encoding=self.encoding)
     # try/finally guarantees the handle is released on every exit path:
     # normal exhaustion, hitting the limit, an exception raised by a filter,
     # or the consumer abandoning/closing the generator part-way through.
     try:
         for count, tweet in enumerate(csv.DictReader(csv_handle)):
             if self.limit != 0 and count + 1 > self.limit:
                 return
             if tweet_parser.tweet_passes_filter(self.filter, tweet) \
             and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                 if self.should_strip:
                     yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                 else:
                     yield dict(tweet)
     finally:
         csv_handle.close()
Esempio n. 2
0
 def get_iterator(self):
     """Yield documents decoded from the BSON file at ``self.filepath``.

     Documents are produced by ``bson.decode_file_iter``. A document is
     yielded only when it passes both ``tweet_passes_filter`` (with
     ``self.filter``) and ``tweet_passes_custom_filter_list`` (with
     ``self.custom_filters``).

     ``self.limit`` caps the number of documents *read* (filtered-out
     documents count too); ``0`` means no cap.

     Yields:
         ``strip_tweet(self.keep_fields, doc)`` when ``self.should_strip``
         is truthy, otherwise the decoded document itself.
     """
     tweet_parser = TweetParser()
     # The context manager closes the file on every exit path, including
     # when the consumer stops iterating early or a filter raises.
     with open(self.filepath, 'rb') as bson_handle:
         for count, tweet in enumerate(bson.decode_file_iter(bson_handle)):
             if self.limit != 0 and count + 1 > self.limit:
                 return
             if tweet_parser.tweet_passes_filter(self.filter, tweet) \
             and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                 if self.should_strip:
                     yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                 else:
                     yield tweet
Esempio n. 3
0
	def get_iterator(self):
		"""Yield documents decoded from the BSON file at ``self.filepath``.

		Documents are produced by ``bson.decode_file_iter``. A document is
		yielded only when it passes both ``tweet_passes_filter`` (with
		``self.filter``) and ``tweet_passes_custom_filter_list`` (with
		``self.custom_filters``).

		``self.limit`` caps the number of documents *read* (filtered-out
		documents count too); ``0`` means no cap.

		Yields:
			``strip_tweet(self.keep_fields, doc)`` when ``self.should_strip``
			is truthy, otherwise the decoded document itself.
		"""
		tweet_parser = TweetParser()
		# The context manager closes the file on every exit path, including
		# when the consumer stops iterating early or a filter raises.
		with open(self.filepath, 'rb') as bson_handle:
			for count, tweet in enumerate(bson.decode_file_iter(bson_handle)):
				if self.limit != 0 and count + 1 > self.limit:
					return
				if tweet_parser.tweet_passes_filter(self.filter, tweet) \
				and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
					if self.should_strip:
						yield tweet_parser.strip_tweet(self.keep_fields, tweet)
					else:
						yield tweet
Esempio n. 4
0
	def get_iterator(self):
		"""Yield objects parsed from the line-delimited JSON file at ``self.filepath``.

		Each line is parsed with ``json_util.loads``. A parsed object is
		yielded only when it passes both ``tweet_passes_filter`` (with
		``self.filter``) and ``tweet_passes_custom_filter_list`` (with
		``self.custom_filters``).

		``self.limit`` caps the number of lines *read* (filtered-out lines
		count too); ``0`` means no cap.

		Yields:
			``strip_tweet(self.keep_fields, obj)`` when ``self.should_strip``
			is truthy, otherwise the parsed object itself.

		Raises:
			Whatever ``json_util.loads`` raises for a line it cannot parse.
		"""
		tweet_parser = TweetParser()
		# The context manager closes the file on every exit path; previously
		# the early return taken when the limit was reached skipped close().
		with open(self.filepath, 'r') as json_handle:
			for count, line in enumerate(json_handle):
				# Check the limit before parsing so the line past the limit
				# is neither decoded nor able to raise.
				if self.limit != 0 and self.limit <= count:
					return
				tweet = json_util.loads(line)
				if tweet_parser.tweet_passes_filter(self.filter, tweet) \
				and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
					if self.should_strip:
						yield tweet_parser.strip_tweet(self.keep_fields, tweet)
					else:
						yield tweet
Esempio n. 5
0
 def get_iterator(self):
     """Yield rows of the CSV file at ``self.filepath`` that pass the filters.

     The file is opened in binary mode and read through
     ``unicodecsv.DictReader``. A row is yielded only when it passes both
     ``tweet_passes_filter`` (with ``self.filter``) and
     ``tweet_passes_custom_filter_list`` (with ``self.custom_filters``).

     ``self.limit`` caps the number of rows *read* (filtered-out rows
     count too); ``0`` means no cap.

     Yields:
         ``strip_tweet(self.keep_fields, row)`` when ``self.should_strip``
         is truthy, otherwise the row as produced by the reader.
     """
     tweet_parser = TweetParser()
     # The context manager closes the file on every exit path, including
     # when the consumer stops iterating early or a filter raises.
     with open(self.filepath, "rb") as csv_handle:
         for count, tweet in enumerate(unicodecsv.DictReader(csv_handle)):
             if self.limit != 0 and count + 1 > self.limit:
                 return
             if tweet_parser.tweet_passes_filter(self.filter, tweet) \
             and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                 if self.should_strip:
                     yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                 else:
                     yield tweet
Esempio n. 6
0
 def get_iterator(self):
     """Yield objects parsed from the line-delimited JSON file at ``self.filepath``.

     The file is opened with ``bz2.open`` or ``gzip.open`` when
     ``self.compression`` is ``'bz2'`` or ``'gzip'``, and with the builtin
     ``open`` otherwise. For the two compressed cases ``self.mode`` is
     replaced by ``binary_mode(self.mode)`` before opening.

     Each line is parsed with ``json_util.loads``. When
     ``self.throw_error`` is truthy a parse failure propagates; otherwise
     the line is counted as corrupt and skipped. A parsed object is
     yielded only when it passes both ``tweet_passes_filter`` (with
     ``self.filter``) and ``tweet_passes_custom_filter_list`` (with
     ``self.custom_filters``).

     ``self.limit`` caps the number of lines *read* (corrupt and
     filtered-out lines count too); ``0`` means no cap.

     When ``self.verbose`` is truthy and the file is read to the end
     (the limit was not hit), a two-line summary of good and corrupt row
     counts is printed.

     Yields:
         ``strip_tweet(self.keep_fields, obj)`` when ``self.should_strip``
         is truthy, otherwise the parsed object itself.
     """
     tweet_parser = TweetParser()
     # NOTE(review): bz2.open/gzip.open reject ``encoding`` when the mode is
     # binary; this presumably relies on what binary_mode() returns - confirm.
     if self.compression == 'bz2':
         self.mode = binary_mode(self.mode)
         json_handle = bz2.open(self.filepath,
                                self.mode,
                                encoding=self.encoding)
     elif self.compression == 'gzip':
         self.mode = binary_mode(self.mode)
         json_handle = gzip.open(self.filepath,
                                 self.mode,
                                 encoding=self.encoding)
     else:
         json_handle = open(self.filepath,
                            self.mode,
                            encoding=self.encoding)
     bad_lines = 0
     # Counted explicitly (instead of reusing the loop index) so the summary
     # is correct for an empty file and is not off by one.
     lines_read = 0
     # try/finally guarantees the handle is released on every exit path,
     # including the early return taken when the limit is reached.
     try:
         for line in json_handle:
             # Check the limit before parsing so the line past the limit
             # is neither decoded nor counted.
             if self.limit != 0 and self.limit <= lines_read:
                 return
             lines_read += 1
             if self.throw_error:
                 tweet = json_util.loads(line)
             else:
                 try:
                     tweet = json_util.loads(line)
                 except Exception:
                     # Best-effort mode: count the corrupt line and skip it
                     # rather than handing the raw string to the filters.
                     bad_lines += 1
                     continue
             if tweet_parser.tweet_passes_filter(self.filter, tweet) \
             and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                 if self.should_strip:
                     yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                 else:
                     yield tweet
         if self.verbose:
             print("{} rows are ok.".format(lines_read - bad_lines))
             print("{} rows are corrupt.".format(bad_lines))
     finally:
         json_handle.close()