Example 1
def jobsfs_to_mongo(guid, buid, name):
    """Composed method for resopnding to a guid update."""

    assert re.match(r'^[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}$', guid.upper()), \
           "%s is not a valid guid" % guid
    assert re.match(r'^\d+$', str(buid)), "%s is not a valid buid" % buid

    logger.info("Updating Job Source %s", guid)
    # Make the BusinessUnit and Company
    create_businessunit(buid)
    bu = BusinessUnit.objects.get(id=buid)
    bu.title = name
    bu.save()
    add_company(bu)

    # Look up the jobs, filter them, transform them, and then load them
    zf = get_jobsfs_zipfile(guid)
    jobs = get_jobs_from_zipfile(zf, guid)
    jobs = filter_current_jobs(jobs, bu)
    jobs = (hr_xml_to_json(job, bu) for job in jobs)
    jobs = list(jobs)
    for job in jobs:
        job['guid'] = job['guid'].lower()

    if len(jobs) > 0:
        collection = connect_db().db.jobs
        bulk = collection.initialize_unordered_bulk_op()
        for job in jobs:
            bulk.find({'guid': job['guid']}).upsert().replace_one(job)
        bulk.execute()
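The upsert loop above relies on pymongo's legacy bulk API (initialize_unordered_bulk_op), which later driver releases drop. A rough equivalent on a newer pymongo, sketched here against the same jobs list, would build ReplaceOne requests and submit them with bulk_write:

from pymongo import ReplaceOne

def upsert_jobs_by_guid(collection, jobs):
    """Replace-or-insert each job document keyed on its guid.

    Sketch only: assumes a pymongo version that provides ReplaceOne and
    Collection.bulk_write; the example above targets the older bulk API.
    """
    if not jobs:
        return None
    requests = [ReplaceOne({'guid': job['guid']}, job, upsert=True)
                for job in jobs]
    # ordered=False mirrors the unordered bulk op used above.
    return collection.bulk_write(requests, ordered=False)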
Example 2
def get_mongo_db():
    """
    Retrieve the current mongo database (defined in settings.MONGO_DBNAME).

    :return: a mongo database

    """
    return connect_db().db
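A minimal usage sketch, assuming settings.MONGO_DBNAME points at the database the loaders above write to; the guid value is purely illustrative:

db = get_mongo_db()

# Jobs are stored with lowercased guids (see Example 1), so query in lower case.
job = db.jobs.find_one({'guid': '0f8fad5b-d9cb-469f-a165-70867728950e'})
if job is None:
    print("guid not found")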
Example 3
def populate_mongo_data(self):
    job_views = connect_db().db.job_views
    mongo_data = [
        {
            "time_first_viewed": dateparser.parse("10/17/2012 01:00:00"),
            "country": "GER",
            "state": "Gutentag",
            "city": "Braunshweiger",
            "found_on": "www.google.de",
            "view_count": 3,
        },
        {
            "time_first_viewed": dateparser.parse("10/18/2012 01:00:00"),
            "country": "GER",
            "state": "Gutentag",
            "city": "Braunshweiger",
            "found_on": "www.google.de",
            "view_count": 3,
        },
        {
            "time_first_viewed": dateparser.parse("10/18/2012 01:00:00"),
            "country": "USA",
            "state": "IN",
            "city": "Peru",
            "found_on": "www.google.com",
            "view_count": 2,
        },
        {
            "time_first_viewed": dateparser.parse("10/18/2012 01:00:00"),
            "country": "USA",
            "state": "IN",
            "city": "Indianapolis",
            "found_on": "www.google.com",
            "view_count": 7,
        },
        {
            "time_first_viewed": dateparser.parse("10/18/2012 01:00:00"),
            "country": "USA",
            "state": "MI",
            "city": "Muskegon",
            "found_on": "www.google.com",
            "view_count": 1,
        },
    ]
    job_views.insert_many(mongo_data)
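The fixture above supports a simple sanity check; this hypothetical snippet (not part of the original test) totals view_count per country for the documents just inserted:

job_views = connect_db().db.job_views
pipeline = [
    {'$group': {'_id': '$country', 'views': {'$sum': '$view_count'}}},
    {'$sort': {'views': -1}},
]
for row in job_views.aggregate(pipeline):
    # With the data above this prints "USA 10" and "GER 6".
    print("%s %s" % (row['_id'], row['views']))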
Example 4
def seoxml_to_mongo(buid, data_dir=DATA_DIR):
    filepath = download_feed_file(buid, data_dir=data_dir)

    jobfeed = DEv2JobFeed(filepath, jsid=buid, markdown=False, company=None)
    # If the feed file did not pass validation, log the failure and raise
    # FeedImportError so the caller can handle it.
    if jobfeed.errors:
        error = jobfeed.error_messages
        logging.error("BUID:%s - Feed file has failed validation on line %s. "
                      "Exception: %s" %
                      (buid, error['line'], error['exception']))
        raise FeedImportError(error)

    # jobparse() returns a dictionary of uids; the return value is unused here.
    jobfeed.jobparse()
    jobs = jobfeed.solr_jobs()

    collection = connect_db().db.jobs
    bulk = collection.initialize_unordered_bulk_op()
    for job in jobs:
        bulk.find({'guid': job['guid']}).upsert().replace_one(job)
    bulk.execute()
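Because validation failures raise FeedImportError, a caller importing several feeds might log and continue rather than abort. A hedged sketch; the buids list is hypothetical:

buids = [12345, 67890]  # hypothetical business unit ids

for buid in buids:
    try:
        seoxml_to_mongo(buid)
    except FeedImportError as e:
        # The error carries the 'line' and 'exception' details logged above.
        logging.warning("Skipping BUID %s: %s", buid, e)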
Example 5
def to_mongo(file_name):
    """
    Inserts the constituent lines of the file pointed to by "file_name" into
    a Mongo database.
    """
    analytics = connect_db().db

    base_name = os.path.basename(file_name)

    # If you're comparing this with store.py from MyJobs-Analytics, you'll
    # notice that we're using file_name here instead of base_name. Analytics
    # log files are stored on the analytics servers themselves, while redirect
    # stores its logs in s3. This is one way to differentiate the two.
    count = analytics.files.find_one({"file": file_name}) or 0
    if count:
        # This file was already processed.
        # TODO: Add ability to upsert based on this?
        return

    lines = get_log_lines(file_name)

    # All redirect logs are named using the same format, which differs from
    # analytics logs - "ex%y%m%d%H%M%S.log"; strip "ex" and the extension to
    # get our timestamp.
    timestamp = os.path.splitext(base_name)[0][2:]
    file_created = datetime.strptime(timestamp, "%y%m%d%H%M%S")

    file_id = analytics.files.insert_one({
        "file": file_name,
        "line_count": len(lines),
        "file_created": file_created
    }).inserted_id

    json_lines = []
    # Denotes if the file was inserted successfully in its entirety. Toggled if
    # we can't parse a line as JSON.
    success = True
    for line in lines:
        # Try to parse each line as JSON. There may or may not be invalid data
        # in the file; Don't crash and burn if so.
        try:
            json_line = json.loads(line)
        except ValueError:
            success = False
        else:
            # '"-"' is valid JSON but insert_many requires a list of
            # dictionaries. If "json_line" is a string, it's not a
            # document we wish to keep.
            if not isinstance(json_line, basestring):
                for key, value in json_line.items():
                    if key in ['to', 'sv', 'nv', 'fv'] and value:
                        # parser.parse('') results in today's date; we probably
                        # don't want that. Ensure the parameter has a value.
                        try:
                            json_line[key] = parser.parse(value)
                        except (ValueError, TypeError):
                            pass
                    elif isinstance(value, basestring) and value.isdigit():
                        json_line[key] = int(value)
                        if key == 'time':
                            json_line[key] = datetime.fromtimestamp(
                                json_line[key])
                json_line['file_id'] = file_id
                json_lines.append(json_line)

    if json_lines:
        # It's possible the file is blank or we failed parsing all lines.
        # TODO: Auditing procedure that compares "line_count" from the "files"
        #     collection with the number of items related to that file in the
        #     "analytics" collection.
        analytics.analytics.insert_many(json_lines)
    analytics.files.update({'_id': file_id}, {'$set': {'success': success}})
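A sketch of the audit described in the TODO, assuming the same older pymongo driver used throughout these examples (cursor.count(); newer drivers would use count_documents). It flags files whose recorded line_count disagrees with the number of analytics documents that reference them, which is exactly what a partially failed parse would produce:

def audit_files():
    """Hypothetical helper, not in the original module."""
    db = connect_db().db
    for doc in db.files.find({}, {'file': 1, 'line_count': 1}):
        found = db.analytics.find({'file_id': doc['_id']}).count()
        expected = doc.get('line_count', 0)
        if found != expected:
            print("%s: %s lines in file, %s documents stored" % (
                doc['file'], expected, found))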