    def test_user_description_contains(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['json']['valid'])
        collection = SmappCollection('json', file_path)
        count = len(
            [tweet for tweet in collection.user_description_contains('JESUS')])
        self.assertEqual(15, count)
    def test_dump_to_csv(self):
        if os.path.exists(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.csv'):
            os.remove(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.csv')

        output_path = os.path.dirname(
            os.path.realpath(__file__)) + '/' + 'data/output.csv'
        collection = SmappCollection(
            'bson',
            os.path.dirname(os.path.realpath(__file__)) + '/' +
            config['bson']['valid'])
        collection.dump_to_csv(
            output_path,
            ['id_str', 'entities.hashtags.0', 'entities.hashtags.1'])
        self.assertTrue(os.path.getsize(output_path) > 0)

        if os.path.exists(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.csv'):
            os.remove(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.csv')
    def test_user_id_is(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['json']['valid'])
        collection = SmappCollection('json', file_path)
        count = len(
            [tweet for tweet in collection.user_id_is(379851447, 149751818)])
        self.assertEqual(77, count)
    def test_sample_chains_and_dumps(self):
        if os.path.exists(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.bson.json'):
            os.remove(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.bson.json')

        output_path = '{}/{}'.format(
            os.path.dirname(os.path.realpath(__file__)),
            'data/output.bson.json')
        collection = SmappCollection(
            'bson',
            os.path.dirname(os.path.realpath(__file__)) + '/' +
            config['bson']['valid'])
        sample_tweets = collection.sample(10)
        sample_tweets.dump_to_json(output_path)
        self.assertTrue(os.path.getsize(output_path) > 0)
        with open(output_path) as f:
            self.assertEqual(10, len([line for line in f]))

        if os.path.exists(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.bson.json'):
            os.remove(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.bson.json')
    def test_detect_tweet_language(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        count = len(
            [tweet for tweet in collection.detect_tweet_language('en')])
        self.assertEqual(907, count)
    def test_get_tweets_containing(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        count = len(
            [tweet for tweet in collection.get_tweets_containing('jade')])
        self.assertEqual(167, count)
    def test_get_date_range(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        count = len([
            tweet for tweet in collection.get_date_range(
                datetime(2015, 11, 2), datetime(2015, 11, 3))
        ])
        self.assertEqual(26, count)
    def test_place_name_contains_country(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['json']['valid'])
        collection = SmappCollection('json', file_path)
        count = len([
            tweet for tweet in collection.place_name_contains_country(
                'United States')
        ])
        self.assertEqual(6, count)
    def test_find_date_range(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        range_obj = collection.find_date_range()
        self.assertEqual(datetime(2015, 11, 2, 19, 56, 33),
                         range_obj['date_min'])
        self.assertEqual(datetime(2015, 11, 6, 21, 35, 54),
                         range_obj['date_max'])
    def test_base_top_entities_returns_counts(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        returndict = collection.get_top_entities({'urls': 5, 'symbols': 3})
        if len(returndict['urls']) > 0:
            self.assertTrue(len(returndict['urls']) == 5)
        if len(returndict['symbols']) > 0:
            self.assertTrue(len(returndict['symbols']) == 3)
    def test_get_tweet_texts(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        texts = [
            text
            for text in collection.limit_number_of_tweets(1).get_tweet_texts()
        ]
        self.assertEqual(str, type(texts[0]))
    def test_base_top_entities_returns_hashtags_and_media(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        returndict = collection.get_top_entities({
            'user_mentions': 5,
            'media': 3
        })
        self.assertTrue('user_mentions' in returndict
                        and 'media' in returndict)
    def test_within_geobox(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['json']['valid'])
        collection = SmappCollection('json', file_path)
        # The geobox below covers US Mountain Time. The last object in the
        # data file was given a coordinate inside it, [-105.29, 40.33], and
        # a coordinate outside of it, [-123.007053, 44.824997], was added to
        # the json as well, so exactly one tweet should match.
        count = len([
            tweet for tweet in collection.within_geobox(
                -113.95, 28.81, -100.05, 48.87)
        ])
        self.assertEqual(1, count)
    def test_get_top_urls(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        urls = collection.get_top_urls(5)
        base_urls = {
            'urls': {
                'https://t.co/ATzXpRciyr': 18,
                'https://t.co/dpz7vZ1JWy': 39,
                'https://t.co/l9OEuvRlt8': 24,
                'https://t.co/nkc4hnukLX': 21,
                'https://t.co/rsNUItS48U': 60
            }
        }
        self.assertTrue(set(urls.keys()) == set(base_urls.keys()))
    def test_get_top_hashtags(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        base_hashtags = {
            'hashtags': {
                '2a': 26,
                'pjnet': 26,
                'jadehelm': 111,
                'falseflag': 32,
                'JadeHelm': 118
            }
        }
        hashtags = collection.get_top_hashtags(5)
        self.assertTrue(set(hashtags.keys()) == set(base_hashtags.keys()))
    def test_get_top_media(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        top_media = collection.get_top_media(5)
        base_top_media = {
            'media': {
                'https://t.co/pAfigDPcNc': 27,
                'https://t.co/MaOGn6wH40': 17,
                'https://t.co/TH8TmGuYww': 24,
                'https://t.co/YpqDPqA2UO': 14,
                'https://t.co/ORaTXOM2oX': 55
            }
        }
        self.assertTrue(set(top_media.keys()) == set(base_top_media.keys()))
    def test_get_top_symbols(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        top_symbols = collection.get_top_symbols(5)
        base_top_symbols = {
            'symbols': {
                0: None,
                'hould': 1,
                2: None,
                3: None,
                1: None
            }
        }
        self.assertTrue(
            set(top_symbols.keys()) == set(base_top_symbols.keys()))
    def test_get_top_mentions(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        top_mentions = collection.get_top_mentions(5)
        base_top_mentions = {
            'user_mentions': {
                '233498836': 58,
                '27234909': 56,
                '10228272': 75,
                '1619936671': 41,
                '733417892': 121
            }
        }
        self.assertTrue(
            set(top_mentions.keys()) == set(base_top_mentions.keys()))
Example #19
def dump_tweets(filename, retweets, fields=None):
    # NOTE: `args` is assumed to be defined at module level (e.g. parsed
    # command-line arguments); args[1] supplies the output-name suffix below.
    collection = SmappCollection('json', filename)
    collection = collection.user_language_is('en')
    if fields is None:
        fields = ['id', 'text', 'timestamp_ms', 'user.id_str']

    if retweets:
        collection = collection.get_retweets()
        collection.dump_to_csv('/scratch/en919/retw_' + args[1] + '.csv',
                               fields)
    else:
        collection = collection.exclude_retweets()
        collection.dump_to_csv('/scratch/en919/no_retw_' + args[1] + '.csv',
                               fields)
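dump_tweets reads a module-level args that the snippet does not define; a minimal driver sketch, assuming args is simply the CLI argument list:

import sys

args = sys.argv  # hypothetical: args[1] supplies the output-name suffix
dump_tweets('/scratch/en919/tweets.json', retweets=True)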
Example #20
    def test_smapp_dataset_takes_collections_datasets_and_base_input_types(self):
        file_path_bson = '{}/{}'.format(
            os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
        file_path_bson_2 = '{}/{}'.format(
            os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
        file_path_json = '{}/{}'.format(
            os.path.dirname(os.path.realpath(__file__)), config['json']['valid'])
        file_path_csv = '{}/{}'.format(
            os.path.dirname(os.path.realpath(__file__)), config['csv']['valid'])
        collection = SmappCollection('bson', file_path_bson_2)
        dataset_1 = SmappDataset(['bson', file_path_bson],
                                 ['csv', file_path_csv])
        dataset_2 = SmappDataset(dataset_1, ['json', file_path_json],
                                 collection)
        self.assertTrue(len(list(dataset_2)) > 0)
def json2csv(f, f_out, cols=cols):
    '''
    Reads a json file into a SmappCollection, dumps it to csv,
    and gzips the csv.
    '''
    # `cols` (the default column list) and `gzip` (a compression helper,
    # not the stdlib module) are assumed to be defined at module level.
    collection = SmappCollection('json', f).dump_to_csv(f_out, cols)
    gzip(f_out)
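The gzip call above shadows the standard-library module name, so it must be a local helper; a minimal sketch, assuming it compresses the file in place and removes the original:

import gzip as gz
import os
import shutil

def gzip(path):
    # Hypothetical helper: write path.gz and delete the uncompressed file.
    with open(path, 'rb') as src, gz.open(path + '.gz', 'wb') as dst:
        shutil.copyfileobj(src, dst)
    os.remove(path)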
    def test_get_top_terms(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        top_counts = collection.get_top_terms(10)
        base_top_counts = {
            'Jade': 538,
            'Duty:': 146,
            'Ops': 265,
            'Sevenfold': 216,
            'III': 173,
            'RT': 524,
            'Black': 235,
            'Helm': 415,
            'Avenged': 220,
            '-': 193
        }
        self.assertTrue(set(top_counts.keys()) == set(base_top_counts.keys()))
    def test_set_custom_filter_properly_filters(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection_one = SmappCollection('bson', file_path)
        full_collection_len = len(list(collection_one))

        def is_tweet_a_retweet(tweet):
            if 'retweeted' in tweet and tweet['retweeted']:
                return True
            else:
                return False

        num_retweets = len(
            list(collection_one.set_custom_filter(is_tweet_a_retweet)))

        collection_two = SmappCollection('bson', file_path)

        def is_not_a_retweet(tweet):
            if 'retweeted' in tweet and tweet['retweeted']:
                return False
            else:
                return True

        num_non_retweets = len(
            list(collection_two.set_custom_filter(is_not_a_retweet)))
        self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
Example #24
def split_json(f, chunksize=120000, remove=True):
    '''
    Splits a json file into chunks of `chunksize` tweets,
    optionally removing the original file.
    '''
    # `grouper` is assumed to be a module-level helper (see sketch below).
    collection = SmappCollection('json', f)
    for i, group in enumerate(grouper(collection, chunksize)):
        f_out = f.replace('.json', '___pt{:03d}.json'.format(i))
        if os.path.isfile(f_out):
            os.remove(f_out)
        with open(f_out, 'w') as outputfile:
            json.dump(list(group), outputfile, ensure_ascii=False)
    if remove:
        os.remove(f)
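split_json leans on a grouper helper that is not shown; a minimal sketch based on the classic itertools recipe, adjusted so the final chunk is truncated rather than padded with fill values:

from itertools import zip_longest

def grouper(iterable, n):
    # Yield lists of up to n items each; drop the padding zip_longest adds
    # so no None entries end up in the json dump.
    args = [iter(iterable)] * n
    for group in zip_longest(*args, fillvalue=None):
        yield [item for item in group if item is not None]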
    def test_sample_returns_dif_tweets_than_first_10_tweets(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection_one = SmappCollection('bson', file_path)
        sample_tweets = list(collection_one.sample(10))
        collection_two = SmappCollection('bson', file_path)
        first_ten_tweets = list(collection_two.limit_number_of_tweets(10))
        self.assertNotEqual(sample_tweets, first_ten_tweets)
    def test_dump_to_bson(self):
        if os.path.exists(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.bson'):
            os.remove(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.bson')

        output_path = os.path.dirname(
            os.path.realpath(__file__)) + '/' + 'data/output.bson'
        collection = SmappCollection(
            'bson',
            os.path.dirname(os.path.realpath(__file__)) + '/' +
            config['bson']['valid'])
        collection.dump_to_bson(output_path)
        self.assertTrue(os.path.getsize(output_path) > 0)

        if os.path.exists(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.bson'):
            os.remove(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.bson')
Example #27
def json2csv(f, cols=cols, keep=True):
    '''
    Reads a json file into a SmappCollection and dumps it to csv,
    skipping files that have already been dumped, then splits the csv.
    '''
    # `bootstrap` (derives the output path), `split_csv`, and `cols` are
    # assumed to be defined at module level; see the sketch below.
    f_out = bootstrap(f)
    if not os.path.isfile(f_out):
        collection = SmappCollection('json', f).dump_to_csv(f_out, cols)
    if not keep:
        os.remove(f)

    split_csv(f_out)
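Neither bootstrap nor split_csv is shown here; a minimal sketch of bootstrap, assuming it does nothing more than derive the csv output path from the json input path:

def bootstrap(f):
    # Hypothetical: map e.g. tweets.json -> tweets.csv.
    return f.replace('.json', '.csv')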
Example #28
    def test_tweet_field_grouped_by_timeslice_years(self):
        output_path = '{}/chart_tests/Bar-{}-bar.png'.format(
            os.path.dirname(os.path.realpath(__file__)), datetime.now())
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['json']['valid'])
        collection = SmappCollection('json', file_path)

        def custom_filter(tweet):
            return True

        plots.bar_graph_tweet_field_grouped_by_period(
            collection, '', [], custom_filter, 'years', datetime(2015, 9, 1),
            datetime(2015, 11, 30), output_path, 'date', 'tweet counts',
            'filtered tweets by hour')
Example #29
def date_filter(output, input_file, dateone, datetwo):
    # configure logging
    logger = logging.getLogger(__name__)
    logger.info('Iterating through your file : %s', input_file)

    _, file_extension = splitext(input_file)
    file_extension = file_extension[1:]

    # if the dateone input exists, make a datetime object with it
    if dateone:
        startdate = datetime.datetime.strptime(dateone, '%Y-%m-%d %H:%M:%S')
    # if the datetwo input exists, make a datetime object with it
    if datetwo:
        enddate = datetime.datetime.strptime(datetwo, '%Y-%m-%d %H:%M:%S')

    # the user gave two dates and wants a range
    if dateone and datetwo:
        logger.info(
            'creating smapp collection and query for dates {} and {}'.format(
                startdate, enddate))
        collection = SmappCollection(file_extension, input_file)
        collection.get_date_range(startdate, enddate).dump_to_bson(output)

    # the user gave only the first date and wants objects since then
    elif dateone:
        enddate = datetime.datetime.now()
        logger.info(
            'creating smapp collection and query for dates {} and {}'.format(
                startdate, enddate))
        collection = SmappCollection(file_extension, input_file)
        collection.get_date_range(startdate, enddate).dump_to_bson(output)

    # the user gave only the second date and wants objects up to that point
    elif datetwo:
        startdate = datetime.datetime.min
        logger.info(
            'creating smapp collection and query for dates {} and {}'.format(
                startdate, enddate))
        collection = SmappCollection(file_extension, input_file)
        collection.get_date_range(startdate, enddate).dump_to_bson(output)

    else:
        logger.info('Couldn\'t find a date, exiting at %s!',
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M'))
        return

    logger.info('Finished filtering input file to path : %s', output)
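A hypothetical invocation of date_filter; the date strings must match the '%Y-%m-%d %H:%M:%S' format that strptime parses above, and the file names are placeholders:

date_filter('filtered.bson', 'tweets.bson',
            '2015-11-02 00:00:00', '2015-11-03 00:00:00')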
def check_dump_integrity(hostname, port, dbname, username, password, authdb,
                         dump_dir, check_type=COUNT_COLLECTIONS):
	# COUNT_COLLECTIONS, COUNT_DOCUMENTS and the two RELEVANT_COLLECTIONS
	# regexes are assumed to be defined at module level.

	# connect to the db
	mongo = pymongo.MongoClient(hostname, int(port))
	if username and password:
		mongo[authdb].authenticate(username, password)

	db = mongo[dbname]

	# Get a list of relevant collections from the database
	db_collections = db.collection_names()
	db_relevant_collections = [match.group(1) for coll in db_collections for match in [DB_RELEVANT_COLLECTIONS_REGEX.search(coll)] if match]
	db_relevant_collections.sort()
	db_relevant_collections.sort(key=len)

	#Get a list of relevant collections from the dump
	dump_exists = True
	dump_path = dump_dir + dbname if dump_dir[-1:] == '/' else dump_dir + '/' + dbname
	dump_collections = []
	try:
		dump_collections = [file for file in os.listdir(dump_path)]
	except OSError as e:
		if e.errno == 2:
			dump_exists = False
		else:
			logging.error(e)
			print(e)

	dump_relevant_collections = [match.group(1) for coll in dump_collections for match in [DUMP_RELEVANT_COLLECTIONS_REGEX.search(coll)] if match]
	dump_relevant_collections.sort()
	dump_relevant_collections.sort(key=len)

	# Find out the format of the dump (.json or .bson); guard against an
	# empty list in case the dump does not exist
	dump_format = 'bson'
	if dump_relevant_collections and \
			dump_relevant_collections[0].split('.', 1)[1] == 'json':
		dump_format = 'json'

	#CHECK NUMBER OF COLLECTIONS
	if check_type == COUNT_COLLECTIONS:
		num_collections_in_db = len(db_relevant_collections)
		num_collections_dumped = len(dump_relevant_collections)

		#Print integrity of number of collections in dump
		log_output = 'DUMP FOR {} '.format(dbname)
		if not dump_exists:
			log_output += 'DOES NOT EXIST. '
		elif num_collections_dumped < num_collections_in_db:
			log_output += 'IS MISSING COLLECTIONS. '
		elif num_collections_dumped > num_collections_in_db:
			log_output += 'HAS TOO MANY COLLECTIONS. '
		else:
			log_output += 'IS OK ON COLLECTIONS. '
		log_output += 'Number of collections in database: {}, Number of collections dumped: {}'.format(num_collections_in_db, num_collections_dumped)
		logging.info('\n' + log_output)
		print('\n' + log_output)

		#Print list of any collections missing from dump
		if dump_exists and (num_collections_dumped < num_collections_in_db):
			dump_relevant_collections_split = [dump_coll.split('.' + dump_format, 1)[0] for dump_coll in dump_relevant_collections]
			missing_collections = [coll for coll in db_relevant_collections if coll not in dump_relevant_collections_split]
			missing_collections.sort()
			missing_collections.sort(key=len)
			logging.info('\n' + 'Missing Collections: {}'.format(missing_collections))
			print('\n' + 'Missing Collections: {}'.format(missing_collections))

	#CHECK NUMBER OF DOCUMENTS
	elif check_type == COUNT_DOCUMENTS:
		logging.info('\n' + 'Counting number of documents in {} database'.format(dbname))
		print('\n' + 'Counting number of documents in {} database'.format(dbname))

		total_documents_in_db = 0
		db_collection_doc_counts = {}

		#Sum total documents in db
		for coll in db_relevant_collections:
			num_docs_in_coll = db[coll].count()
			total_documents_in_db += num_docs_in_coll

			#Save document count for db collection
			db_collection_doc_counts[coll] = num_docs_in_coll

			logging.info("Database {} {} document count: {}".format(dbname, coll, num_docs_in_coll))
			print("Database {} {} document count: {}".format(dbname, coll, num_docs_in_coll))
	
		logging.info('\n' + 'Counting number of documents in {} dump'.format(dbname))
		print('\n' + 'Counting number of documents in {} dump'.format(dbname))

		total_documents_dumped = 0
		dump_collection_doc_counts = {}

		#Sum up total number of documents in dump
		for coll in dump_relevant_collections:
			collection = SmappCollection(dump_format, dump_path + '/' + coll)
			num_docs_in_coll = collection.count_tweets()
			total_documents_dumped += num_docs_in_coll

			#Save document count for dump collection
			dump_collection_doc_counts[coll.split('.' + dump_format, 1)[0]] = num_docs_in_coll

			#Calculate # and % missing documents for collection
			num_docs_in_db_coll = db_collection_doc_counts[coll.split('.' + dump_format, 1)[0]]
			num_docs_missing = num_docs_in_db_coll - num_docs_in_coll
			percentage_docs_missing = 0
			if num_docs_in_db_coll != 0:
				percentage_docs_missing = (num_docs_missing/num_docs_in_db_coll) * 100

			logging.info("Dump {} {} document count: {}".format(dbname, coll, num_docs_in_coll))
			print("Dump {0} {1} document count: {2} (Missing {3}, {4:.2f}%)".format(dbname, coll, num_docs_in_coll, num_docs_missing, percentage_docs_missing))
			# print("".format())

		#Calculate # and % missing documents overall
		total_docs_missing = total_documents_in_db - total_documents_dumped
		percentage_total_docs_missing = 0
		if total_documents_in_db != 0:
			percentage_total_docs_missing = (total_docs_missing/total_documents_in_db) * 100

		#Print integrity of number of documents in dump
		log_output = 'DUMP FOR {} '.format(dbname)
		if not dump_exists:
			log_output += 'DOES NOT EXIST. '
		elif total_documents_dumped < total_documents_in_db:
			log_output += 'IS MISSING DOCUMENTS. '
		elif total_documents_dumped > total_documents_in_db:
			log_output += 'HAS TOO MANY DOCUMENTS. '
		else:
			log_output += 'IS OK ON DOCUMENTS. '
		log_output += 'Total documents in database: {0}, Total documents dumped: {1} (Missing {2}, {3:.2f}%)'.format(total_documents_in_db, total_documents_dumped, total_docs_missing, percentage_total_docs_missing)
		logging.info('\n' + log_output)
		print('\n' + log_output)

		#Print list of any collections from dump missing documents
		if dump_exists and (total_documents_dumped < total_documents_in_db):
			collections_missing_docs = [coll for coll, count in db_collection_doc_counts.items() if (coll not in dump_collection_doc_counts or dump_collection_doc_counts[coll] != count)]
			collections_missing_docs.sort()
			collections_missing_docs.sort(key=len)
			logging.info('\n' + 'Collections Missing Documents: {}'.format(collections_missing_docs))
			print('\n' + 'Collections Missing Documents: {}'.format(collections_missing_docs))
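A hypothetical call, assuming the module-level constants noted at the top of the function are defined; host, credentials, and paths are placeholders:

check_dump_integrity('localhost', 27017, 'tweet_db',
                     'user', 'secret', 'admin',
                     '/scratch/dumps', check_type=COUNT_DOCUMENTS)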
    def test_count_tweets(self):
        file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                                   config['bson']['valid'])
        collection = SmappCollection('bson', file_path)
        count = collection.count_tweets()
        self.assertEqual(1187, count)