def jobsfs_to_mongo(guid, buid, name):
    """Composed method for responding to a guid update."""
    assert re.match(r'^[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}$',
                    guid.upper()), \
        "%s is not a valid guid" % guid
    assert re.match(r'^\d+$', str(buid)), "%s is not a valid buid" % buid

    logger.info("Updating Job Source %s", guid)

    # Make the BusinessUnit and Company
    create_businessunit(buid)
    bu = BusinessUnit.objects.get(id=buid)
    bu.title = name
    bu.save()
    add_company(bu)

    # Look up the jobs, filter them, transform them, and then load them.
    zf = get_jobsfs_zipfile(guid)
    jobs = get_jobs_from_zipfile(zf, guid)
    jobs = filter_current_jobs(jobs, bu)
    jobs = (hr_xml_to_json(job, bu) for job in jobs)
    jobs = list(jobs)
    for job in jobs:
        job['guid'] = job['guid'].lower()

    if len(jobs) > 0:
        collection = connect_db().db.jobs
        bulk = collection.initialize_unordered_bulk_op()
        for job in jobs:
            bulk.find({'guid': job['guid']}).upsert().replace_one(job)
        bulk.execute()
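# A minimal sketch of the same unordered bulk upsert using the PyMongo 3+
# bulk_write() API; initialize_unordered_bulk_op() was removed in PyMongo 4.
# "upsert_jobs" is a hypothetical helper, not part of the original code, and
# it only assumes the "jobs" dictionaries carry a 'guid' key as above.
from pymongo import ReplaceOne


def upsert_jobs(collection, jobs):
    """Replace-or-insert each job document, keyed on its 'guid'."""
    requests = [ReplaceOne({'guid': job['guid']}, job, upsert=True)
                for job in jobs]
    if requests:
        # ordered=False mirrors the unordered bulk op used above.
        collection.bulk_write(requests, ordered=False)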
def get_mongo_db():
    """
    Retrieve the current mongo database (defined in settings.MONGO_DBNAME).

    :return: a mongo database

    """
    return connect_db().db
def populate_mongo_data(self):
    job_views = connect_db().db.job_views
    mongo_data = [
        {
            "time_first_viewed": dateparser.parse("10/17/2012 01:00:00"),
            "country": "GER",
            "state": "Gutentag",
            "city": "Braunshweiger",
            "found_on": "www.google.de",
            "view_count": 3,
        },
        {
            "time_first_viewed": dateparser.parse("10/18/2012 01:00:00"),
            "country": "GER",
            "state": "Gutentag",
            "city": "Braunshweiger",
            "found_on": "www.google.de",
            "view_count": 3,
        },
        {
            "time_first_viewed": dateparser.parse("10/18/2012 01:00:00"),
            "country": "USA",
            "state": "IN",
            "city": "Peru",
            "found_on": "www.google.com",
            "view_count": 2,
        },
        {
            "time_first_viewed": dateparser.parse("10/18/2012 01:00:00"),
            "country": "USA",
            "state": "IN",
            "city": "Indianapolis",
            "found_on": "www.google.com",
            "view_count": 7,
        },
        {
            "time_first_viewed": dateparser.parse("10/18/2012 01:00:00"),
            "country": "USA",
            "state": "MI",
            "city": "Muskegon",
            "found_on": "www.google.com",
            "view_count": 1,
        },
    ]
    job_views.insert_many(mongo_data)
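# A minimal sketch of reading the seeded job_views data back, assuming the
# same connect_db() helper used above. "view_counts_by_country" is a
# hypothetical helper for illustration; the aggregation simply sums
# view_count per country.
def view_counts_by_country():
    job_views = connect_db().db.job_views
    pipeline = [
        {"$group": {"_id": "$country", "views": {"$sum": "$view_count"}}},
        {"$sort": {"views": -1}},
    ]
    return list(job_views.aggregate(pipeline))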
def seoxml_to_mongo(buid, data_dir=DATA_DIR):
    filepath = download_feed_file(buid, data_dir=data_dir)
    jobfeed = DEv2JobFeed(filepath, jsid=buid, markdown=False, company=None)

    # If the feed file did not pass validation, log the failure and raise a
    # FeedImportError carrying the error details.
    if jobfeed.errors:
        error = jobfeed.error_messages
        logging.error("BUID:%s - Feed file has failed validation on line %s. "
                      "Exception: %s" % (buid, error['line'],
                                         error['exception']))
        raise FeedImportError(error)

    # A dictionary of uids
    jobfeed.jobparse()
    jobs = jobfeed.solr_jobs()

    collection = connect_db().db.jobs
    bulk = collection.initialize_unordered_bulk_op()
    for job in jobs:
        bulk.find({'guid': job['guid']}).upsert().replace_one(job)
    bulk.execute()
def to_mongo(file_name):
    """
    Inserts the constituent lines of the file pointed to by "file_name" into
    a Mongo database.

    """
    analytics = connect_db().db
    base_name = os.path.basename(file_name)

    # If you're comparing this with store.py from MyJobs-Analytics, you'll
    # notice that we're using file_name here instead of base_name. Analytics
    # log files are stored on the analytics servers themselves, while redirect
    # stores its logs in s3. This is one way to differentiate the two.
    count = analytics.files.find_one({"file": file_name}) or 0
    if count:
        # This file was already processed.
        # TODO: Add ability to upsert based on this?
        return

    lines = get_log_lines(file_name)

    # All redirect logs are named using the same format, which differs from
    # analytics logs - "ex%y%m%d%H%M%S.log"; strip "ex" and the extension to
    # get our timestamp.
    timestamp = os.path.splitext(base_name)[0][2:]
    file_created = datetime.strptime(timestamp, "%y%m%d%H%M%S")
    file_id = analytics.files.insert_one({
        "file": file_name,
        "line_count": len(lines),
        "file_created": file_created}).inserted_id

    json_lines = []
    # Denotes if the file was inserted successfully in its entirety. Toggled
    # if we can't parse a line as JSON.
    success = True
    for line in lines:
        # Try to parse each line as JSON. There may or may not be invalid
        # data in the file; don't crash and burn if so.
        try:
            json_line = json.loads(line)
        except ValueError:
            success = False
        else:
            # '"-"' is valid JSON but insert_many requires a list of
            # dictionaries. If "json_line" is a string, it's not a
            # document we wish to keep.
            if not isinstance(json_line, basestring):
                for key, value in json_line.items():
                    if key in ['to', 'sv', 'nv', 'fv'] and value:
                        # parser.parse('') results in today's date; we
                        # probably don't want that. Ensure the parameter
                        # has a value.
                        try:
                            json_line[key] = parser.parse(value)
                        except (ValueError, TypeError):
                            pass
                    elif isinstance(value, basestring) and value.isdigit():
                        json_line[key] = int(value)
                        if key == 'time':
                            json_line[key] = datetime.fromtimestamp(
                                json_line[key])
                json_line['file_id'] = file_id
                json_lines.append(json_line)

    if json_lines:
        # It's possible the file is blank or we failed parsing all lines.
        # TODO: Auditing procedure that compares "line_count" from the
        # "files" collection with the number of items related to that file
        # in the "analytics" collection.
        analytics.analytics.insert_many(json_lines)
    analytics.files.update({'_id': file_id}, {'$set': {'success': success}})
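# A minimal sketch of the auditing procedure mentioned in the TODO above:
# compare the "line_count" recorded in the "files" collection against the
# number of documents in "analytics" referencing the same file. "audit_file"
# is a hypothetical helper, not part of the original code, and it assumes a
# PyMongo version that provides count_documents().
def audit_file(file_id):
    """Return True if every recorded log line has a matching document."""
    db = connect_db().db
    file_doc = db.files.find_one({'_id': file_id})
    if file_doc is None:
        return False
    stored = db.analytics.count_documents({'file_id': file_id})
    return stored == file_doc.get('line_count', 0)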