def make_sqlite_db_json(output, input_file, fields):
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)
    # dotted field paths become double-underscore column names
    column_str = ','.join(fields).replace('.', '__')
    question_marks = ','.join('?' for _ in fields)
    con = sqlite3.connect(output)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))
    json_col = JsonCollection(input_file)
    insert_list = []
    tp = TweetParser()
    # batch insert every 10k tweets to keep memory use bounded
    for count, tweet in enumerate(json_col.get_iterator()):
        ret = tp.parse_columns_from_tweet(tweet, fields)
        row = [replace_none(col_val[1]) for col_val in ret]
        insert_list.append(tuple(row))
        if (count % 10000) == 0:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
            insert_list = []
    # flush whatever is left over after the last full batch
    if insert_list:
        cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
        con.commit()
    con.close()
    logger.info('Finished processing input: {}, output is: {}'.format(input_file, output))
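# Illustrative usage sketch for make_sqlite_db_json. The file names are
# hypothetical, sqlite3/logging are assumed to be imported at module level, and
# 'id_str' / 'user.screen_name' stand in for real tweet field paths.
make_sqlite_db_json('tweets.db', 'tweets.json', ['id_str', 'user.screen_name'])
con = sqlite3.connect('tweets.db')
# dotted field paths are stored under double-underscore column names
for row in con.execute('SELECT id_str, user__screen_name FROM data LIMIT 5'):
    print(row)
con.close()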
def get_iterator(self):
    tweet_parser = TweetParser()
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        csv_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        csv_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        csv_handle = open(self.filepath, self.mode, encoding=self.encoding)
    for count, tweet in enumerate(csv.DictReader(csv_handle)):
        # stop once the configured limit is reached (a limit of 0 means no limit)
        if self.limit < count + 1 and self.limit != 0:
            csv_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield dict(tweet)
    csv_handle.close()
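# Illustrative sketch: iterating a CSV of tweets row by row. The constructor
# signature (a single file path) mirrors the CSV test below; the file name is
# hypothetical and set_limit is assumed to be the shared limit setter that
# populates self.limit.
collection = CsvCollection('tweets.csv')
for tweet in collection.set_limit(3).get_iterator():
    # each row comes back as a plain dict keyed by the CSV header
    print(tweet['id_str'], tweet['text'])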
def dump_to_sqlite_db(self, output_db, input_fields, top_level=False):
    def replace_none(s):
        if s is None:
            return 'NULL'
        return s

    tweet_parser = TweetParser()
    column_str = ','.join(input_fields).replace('.', '__')
    question_marks = ','.join('?' for _ in input_fields)
    con = sqlite3.connect(output_db)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

    insert_list = []
    # batch insert every 10k tweets to keep memory use bounded
    for count, tweet in enumerate(self.get_iterator()):
        if top_level:
            ret = list(zip(input_fields, [tweet.get(field) for field in input_fields]))
        else:
            ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
        row = [replace_none(col_val[1]) for col_val in ret]
        insert_list.append(tuple(row))
        if (count % 10000) == 0:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
            insert_list = []
    # flush whatever is left over after the last full batch
    if insert_list:
        cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
        con.commit()
    con.close()
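# Illustrative sketch of dump_to_sqlite_db (hypothetical file names). With
# top_level=False, dotted paths are resolved inside the nested tweet object;
# with top_level=True, fields are looked up directly on the top-level dict,
# which suits flat rows such as those coming from a CsvCollection.
json_collection = JsonCollection('tweets.json')
json_collection.dump_to_sqlite_db('nested.db', ['id_str', 'user.screen_name'])

csv_collection = CsvCollection('tweets.csv')
csv_collection.dump_to_sqlite_db('flat.db', ['id_str', 'text'], top_level=True)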
def test_strip_tweets_keeps_fields(self):
    tweet_parser = TweetParser()
    collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid'])
    self.maxDiff = None
    it = collection.strip_tweets(['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        for tweet in iterator:
            keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
            for elem in fields:
                if elem not in keys:
                    return False
        return True

    self.assertTrue(tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))
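# A small illustration of why the expected fields in this test are lists of
# path segments: judging from the assertions, TweetParser.flatten_dict appears
# to yield (key_path, value) pairs where key_path is the list of nested keys.
# The tweet below is a made-up fragment, not real data.
tp = TweetParser()
sample = {'id': 1, 'user': {'profile_image_url_https': 'https://example.com/a.png'}}
for key_path, value in tp.flatten_dict(sample):
    print(key_path, value)   # e.g. ['id'] 1  and  ['user', 'profile_image_url_https'] ...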
def get_iterator(self):
    tweet_parser = TweetParser()
    bson_handle = open(self.filepath, 'rb')
    for count, tweet in enumerate(bson.decode_file_iter(bson_handle)):
        # stop once the configured limit is reached (a limit of 0 means no limit)
        if self.limit < count + 1 and self.limit != 0:
            bson_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    bson_handle.close()
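# Illustrative sketch: the BSON iterator above honours self.custom_filters, so
# a caller can restrict the stream with a predicate. The setter name
# set_custom_filter is an assumption here (only the self.custom_filters
# attribute is visible in this code), and the .bson path is hypothetical.
collection = BsonCollection('tweets.bson')
collection.set_custom_filter(lambda tweet: tweet.get('lang') == 'en')
for tweet in collection.get_iterator():
    print(tweet['id'])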
def get_iterator(self):
    tweet_parser = TweetParser()
    mongo_cursor = self.mongo_collection.find(
        filter=self.filter,
        no_cursor_timeout=False,
        limit=self.limit
    )
    for tweet in mongo_cursor:
        if tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    mongo_cursor.close()
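# Illustrative sketch: the Mongo-backed iterator pushes the filter and limit
# down to the server via find(), while custom filters run client side. The
# connection parameters are hypothetical placeholders; the constructor and
# set_limit usage mirror the Mongo test below.
collection = MongoCollection('localhost', 27017, 'user', 'password', 'tweet_db', 'tweets')
for tweet in collection.set_limit(10).get_iterator():
    print(tweet['id'])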
def test_strip_tweets_keeps_fields(self):
    tweet_parser = TweetParser()
    collection = CsvCollection(os.path.dirname(os.path.realpath(__file__)) + '/' + config['csv']['valid'])
    self.maxDiff = None
    it = collection.strip_tweets(['source', 'text', 'id_str']).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        for tweet in iterator:
            keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
            for elem in fields:
                if elem not in keys:
                    return False
        return True

    self.assertTrue(tweets_have_right_keys(it, [['source'], ['text'], ['id_str']]))
def dump_to_csv(self, output_csv, input_fields, write_header=True, top_level=False,
                mode='a', encoding='utf-8', compression=None):
    if compression == 'bz2':
        mode = binary_mode(mode)
        filehandle = bz2.open(output_csv, mode)
    elif compression == 'gzip':
        mode = binary_mode(mode)
        filehandle = gzip.open(output_csv, mode)
    else:
        filehandle = open(output_csv, mode)
    writer = csv.writer(filehandle)
    if write_header:
        writer.writerow(input_fields)
    tweet_parser = TweetParser()
    for tweet in self.get_iterator():
        if top_level:
            ret = list(zip(input_fields, [tweet.get(field) for field in input_fields]))
        else:
            ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
        ret_values = [col_val[1] for col_val in ret]
        writer.writerow(ret_values)
    filehandle.close()
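# Illustrative sketch of dump_to_csv (hypothetical file names). Each tweet is
# written as one CSV row with the requested dotted field paths as columns;
# passing compression='bz2' or 'gzip' selects bz2.open or gzip.open for the
# output handle instead of the built-in open.
collection = JsonCollection('tweets.json')
collection.dump_to_csv('tweets.csv', ['id_str', 'user.screen_name', 'text'], mode='w')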
def get_iterator(self):
    tweet_parser = TweetParser()
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        json_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        json_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        json_handle = open(self.filepath, self.mode, encoding=self.encoding)
    bad_lines = 0
    for count, tweet in enumerate(json_handle):
        if not self.throw_error:
            try:
                tweet = json_util.loads(tweet)
            except:
                # count the corrupt line and skip it rather than passing the
                # raw string on to the filters below
                bad_lines += 1
                continue
        else:
            tweet = json_util.loads(tweet)
        # stop once the configured limit is reached (a limit of 0 means no limit)
        if self.limit != 0 and self.limit <= count:
            json_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    if self.verbose:
        print("{} rows are ok.".format(count - bad_lines))
        print("{} rows are corrupt.".format(bad_lines))
    json_handle.close()
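# Illustrative sketch: reading a line-delimited JSON file of tweets
# (hypothetical path). The iterator above honours throw_error (raise on a
# corrupt line versus count and skip it) and verbose (print how many lines
# parsed cleanly); how those attributes are set on the collection is assumed
# rather than shown in this excerpt.
collection = JsonCollection('tweets.json')
for tweet in collection.get_iterator():
    print(tweet['id_str'])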
def test_strip_tweets_keeps_fields(self):
    tweet_parser = TweetParser()
    collection = MongoCollection(
        config['mongo']['host'],
        config['mongo']['port'],
        config['mongo']['user'],
        config['mongo']['password'],
        config['mongo']['database'],
        config['mongo']['collection'])
    self.maxDiff = None
    it = collection.set_limit(10).strip_tweets(
        ['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        for tweet in iterator:
            keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
            for elem in fields:
                if elem not in keys:
                    return False
        return True

    self.assertTrue(tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))