Example #1
def main():
    '''Run classification on job data

    Process directly from partner feeds

    Current configuration:

    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Label using Naive Bayes Classifier
    + Write results back to s3
    '''

    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'
    model_path = '/home/ubuntu/job-classifier/models/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    cv_model = load_pickle(os.path.join(model_path, 'CV_nb_bow_model.pckl'))
    # update cv model above
    #mnb_model = load_pickle(os.path.join(model_path, 'multi_nb_model.pckl'))
    cnb_model = load_pickle(
        os.path.join(model_path, 'complement_nb_model.pckl'))

    # cities & roles
    city_roles = {
        'driver': ['new york', 'los angeles'],
        'nurse': ['dallas', 'los angeles'],
        'tech': ['san francisco', 'boston']
    }

    # pull xml from url and parse into df
    for partner in partners:
        print('\nProcessing:', partner.upper())
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # select data to predict from
        X_classify = df['title'].tolist()

        # get count vec
        X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

        # predict with model
        #y_label = mnb_model.predict(X_classify_counts)
        y_label = cnb_model.predict(X_classify_counts)

        # assign predictions to jobs & prune dataframe
        df['label'] = y_label

        label_cols = ['label', 'company', 'title', 'city', 'state', 'url']
        #labels_to_drop = ['ignore','driver','service']
        labels_to_drop = ['ignore', 'service']

        df_tmp = df[~(df['label'].isin(labels_to_drop))][label_cols]

        for role in city_roles:
            print('processing:', role.upper())

            for city in city_roles[role]:
                print('city:', city.upper())

                df_to_write = get_city_df(df_tmp, city, role)

                # write labeled roles
                city_file = city.replace(' ', '_')
                label_key = partner + '/' + role + '/' + city_file + '/' + 'jobs.csv'
                bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
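
Example #1 calls two helpers that are not defined in the snippet: load_pickle and get_city_df. The sketch below is an assumption about what they might look like, not the repository's actual code; it assumes get_city_df filters on the predicted label and an already lower-cased city column.

import pickle


def load_pickle(path):
    # Assumed helper: unpickle a single object from disk.
    with open(path, 'rb') as f:
        return pickle.load(f)


def get_city_df(df, city, role):
    # Assumed helper: keep rows whose predicted label matches the requested
    # role and whose (standardized, lower-case) city matches the request.
    return df[(df['label'] == role) & (df['city'].str.lower() == city)]
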
Example #2
def main():
    '''Run classification on job data

    Process directly from partner feeds

    Current configuration:

    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Label using Naive Bayes Classifier
    + Write results back to s3
    '''

    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'
    model_path = '/home/ubuntu/job-classifier/models/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    cv_model = load_pickle(os.path.join(model_path, 'CV_nb_bow_model.pckl'))
    # update cv model above
    #clf_model = load_pickle(os.path.join(model_path, 'lr_bow_train_only_model.pckl'))
    #mnb_model = load_pickle(os.path.join(model_path, 'multi_nb_model.pckl'))
    cnb_model = load_pickle(os.path.join(model_path, 'complement_nb_model.pckl'))

    # pull xml from url and parse into df
    for partner in partners:
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # select data to predict from
        X_classify = df['title'].tolist()

        # get count vec
        X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

        # predict with model
        #y_label = mnb_model.predict(X_classify_counts)
        y_label = cnb_model.predict(X_classify_counts)

        # assign predictions to jobs & prune dataframe
        df['label'] = y_label

        label_cols = ['label', 'company', 'title', 'city', 'state', 'url']
        labels_to_drop = ['ignore', 'driver', 'service']
        df_to_write = df[~(df['label'].isin(labels_to_drop))][label_cols]
        #df_to_write = df[~(df['label']=='ignore')][label_cols]

        # SAMPLE DF_TO_WRITE for smaller dataset
        df_to_write = df_to_write.sample(n=100)

        # write labeled roles
        label_key = partner + '/' + 'labeled_jobs.csv'
        bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
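
Examples #1 and #2 both unpickle a fitted CountVectorizer (CV_nb_bow_model.pckl) and a ComplementNB classifier (complement_nb_model.pckl); the training step is outside these snippets. A rough sketch of how such artifacts could be produced with scikit-learn, using hypothetical training data and reusing the file names from the examples:

import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB

# Hypothetical labeled training data: job titles and their role labels.
titles = ['registered nurse icu nights', 'delivery driver van', 'software engineer backend']
labels = ['nurse', 'driver', 'tech']

# Bag-of-words counts over the raw titles.
cv = CountVectorizer()
X_counts = cv.fit_transform(titles)

# Complement Naive Bayes tends to behave better than multinomial NB on
# imbalanced text classes, which may be why multi_nb_model is commented out.
cnb = ComplementNB()
cnb.fit(X_counts, labels)

# Persist both artifacts so the feed-processing jobs can reload them.
with open('CV_nb_bow_model.pckl', 'wb') as f:
    pickle.dump(cv, f)
with open('complement_nb_model.pckl', 'wb') as f:
    pickle.dump(cnb, f)
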
Example #3
def main():
    '''Run classification on job data

    Current configuration:

    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Predict with Logistic Regression
    + Select results on LogReg class probability
    + Write results back to s3
    '''

    # add custom modules to path
    path_to_module = '/home/ubuntu/job-classifier/tools/'
    sys.path.append(path_to_module)

    # load s3 connector & preprocessing functions
    import bototools as bt
    import xmltools as xt
    import nlp_preprocessing as nlp

    # load s3 bucket/key/url details from local config
    path_to_data = '/home/ubuntu/job-classifier/.keys/'
    file_name = 'cron_data_file.json'
    bucket, key, url, target = bt.load_s3_location(path_to_data, file_name)

    # pull xml from url and parse into df
    df = xt.xml_from_url(url)

    # standardize text format
    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    # select data to predict from
    X_classify = df['title'].tolist()

    # define model path
    path_to_models = '/home/ubuntu/job-classifier/models/'

    # load count vectorizer & transform data to predict
    cv_pickle = 'CV_lr_bow_train_only_model.pckl'
    cv_path = os.path.join(path_to_models, cv_pickle)
    cv_model = pickle.load(open(cv_path, 'rb'))
    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    # load pre-trained model & predict
    clf_pickle = 'lr_bow_train_only_model.pckl'
    clf_path = os.path.join(path_to_models, clf_pickle)
    clf_model = pickle.load(open(clf_path, 'rb'))
    y_predicted = clf_model.predict(X_classify_counts)
    y_prob = clf_model.predict_proba(X_classify_counts)

    # assign predictions to jobs & prune dataframe
    df['gig'] = y_predicted
    cols_to_write = ['company','title','city','state','url']

    df_to_write = df[df['gig']==1][cols_to_write]

    # write jobs to accessible location on s3
    # custom name by date -- test overlap between days
    timestr = time.strftime("%Y-%m-%d")
    prefix, fn = target.split('/')
    file_to_write = prefix + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_to_write, bucket, file_to_write, comp=False)

    # add labeled samples to validate for future training
    df_positive = df[df['gig']==1].sample(1000)
    file_positive = 'positive' + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_positive, bucket, file_positive, comp=False)

    df_negative = df[df['gig']==0].sample(1000)
    file_negative = 'negative' + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_negative, bucket, file_negative, comp=False)
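
All of the examples lean on two nlp_preprocessing helpers, standardize_text and get_cv_test_counts, whose implementation is not shown. The sketch below is one plausible version (an assumption): standardize_text lower-cases a column and strips punctuation, and get_cv_test_counts simply applies the already-fitted CountVectorizer.

import pandas as pd


def standardize_text(df, col):
    # Assumed cleanup: lower-case, replace anything that is not a letter,
    # digit, or space, then collapse repeated whitespace.
    df[col] = (df[col].astype(str)
                      .str.lower()
                      .str.replace(r'[^a-z0-9 ]', ' ', regex=True)
                      .str.replace(r'\s+', ' ', regex=True)
                      .str.strip())
    return df


def get_cv_test_counts(texts, cv_model):
    # Transform only -- reuse the vocabulary fitted at training time.
    return cv_model.transform(texts)


# Quick check on a throwaway frame:
df = standardize_text(pd.DataFrame({'title': ['Sr. Nurse (ICU) - Nights!']}), 'title')
print(df['title'].tolist())  # ['sr nurse icu nights']
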
Example #4
def main():
    '''Run classification on job data

    Current configuration:

    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Predict with Logistic Regression
    + Select results on LogReg class probability
    + Write results back to s3
    '''

    # add custom modules to path
    path_to_module = '../tools/'
    sys.path.append(path_to_module)

    # load s3 connector & preprocessing functions
    import bototools as bt
    import xmltools as xt
    import nlp_preprocessing as nlp

    # load s3 bucket/key/url details from local config
    path_to_data = '../.keys/'
    file_name = 'eda_data_file.json'
    bucket, key, url, target = bt.load_s3_location(path_to_data, file_name)

    # pull xml from url and parse into df
    #df = bt.load_df_from_s3(bucket, key, comp='gzip')
    df = xt.xml_from_url(url)

    # standardize text format
    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    # select data to predict from
    X_classify = df['title'].tolist()

    # define model path
    path_to_models = '../models/'

    # load count vectorizer & transform data to predict
    cv_pickle = 'CV_lr_bow_train_only_model.pckl'
    cv_path = os.path.join(path_to_models, cv_pickle)
    cv_model = pickle.load(open(cv_path, 'rb'))
    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    # load pre-trained model & predict
    clf_pickle = 'lr_bow_train_only_model.pckl'
    clf_path = os.path.join(path_to_models, clf_pickle)
    clf_model = pickle.load(open(clf_path, 'rb'))

    y_predicted = clf_model.predict(X_classify_counts)
    #y_prob = clf_model.predict_proba(X_classify_counts)

    # assign predictions to jobs & prune dataframe
    df['gig'] = y_predicted
    #df['prob'] = y_prob[:,0] # failed last test
    cols_to_write = ['company','title','city','state','url']
    #cols_to_write = ['company','title','city','state','posted_at','url']

    # only keep listings with over 95% probability of being a gig job
    # tighten/loosen requirement depending on model
    #df_to_write = df[(df['gig']==1) & (df['prob']>=0.95)][cols_to_write]
    df_to_write = df[df['gig']==1][cols_to_write]

    # write jobs to accessible location on s3
    #file_to_write = 'gigs/streamed_full_daily_job_list.csv'
    file_to_write = target
    bt.write_df_to_s3(df_to_write, bucket, file_to_write, comp=False)
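
Examples #1 through #4 parse partner feeds with xt.xml_from_url and xt.xml_from_url_compressed, which are not shown. A hedged sketch of what those xmltools functions might do, assuming a flat feed where each <job> element carries child tags such as title, company, city, state, and url (the real schema is not documented in these snippets):

import gzip
import urllib.request
import xml.etree.ElementTree as ET

import pandas as pd


def _jobs_to_df(root):
    # One row per <job> element, one column per child tag (assumed layout).
    rows = [{child.tag: (child.text or '').strip() for child in job}
            for job in root.iter('job')]
    return pd.DataFrame(rows)


def xml_from_url(url):
    # Fetch the feed and parse it straight from the response bytes.
    with urllib.request.urlopen(url) as resp:
        return _jobs_to_df(ET.fromstring(resp.read()))


def xml_from_url_compressed(url):
    # Same, but gunzip first for feeds ending in .xml.gz.
    with urllib.request.urlopen(url) as resp:
        return _jobs_to_df(ET.fromstring(gzip.decompress(resp.read())))
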
Example #5
def main():
    '''Run classification
    '''

    path_to_module = '../tools/'
    sys.path.append(path_to_module)
    # load s3 read & write functions
    import bototools as bt # add these to actual path

    print('classifying new jobs...\n')

    path_to_data = '../.keys/'
    #file_name = 'csv_to_classify.json'
    file_name = 'eda_data_file.json'
    #bucket, key = bt.load_s3_location(path_to_data, file_name)
    # use a sample file for testing -- avoid large files for now
    bucket, _ = bt.load_s3_location(path_to_data, file_name)
    key = 'eda_sample_data_file.csv'
    df = bt.load_df_from_s3(bucket, key) # sample not gzipped
    #df = bt.load_df_from_s3(bucket, key, comp='gzip')

    # import preprocessing tools
    import nlp_preprocessing as nlp
    # cleanup the dataframe to prepare for classification
    # while only SOME columns are used, ALL need to be returned for ops team

    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    X_classify = df['title'].tolist()

    # load count vectorizer
    path_to_models = '../models/'

    cv_pickle = 'CV_lr_bow_train_only_model.pckl' # use private file too
    cv_path = os.path.join(path_to_models, cv_pickle)

    cv_model = pickle.load(open(cv_path, 'rb'))

    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    from sklearn.linear_model import LogisticRegression # move outside main

    # load pre-trained model
    #path_to_model = '../models/'

    # functionalize loading & training
    clf_pickle = 'lr_bow_train_only_model.pckl' # use private file too
    clf_path = os.path.join(path_to_models, clf_pickle)

    clf_model = pickle.load(open(clf_path, 'rb'))
    y_predicted = clf_model.predict(X_classify_counts)

    df['gig'] = y_predicted
    cols_to_write = ['company','title','city','state','posted_at','url']
    df_to_write = df[df['gig']==1][cols_to_write]
    #df_to_write = df[df['gig']==1]
    #df[df['gig']==1].to_csv('../data/classified_gig_jobs.csv', index=False)
    #print('Gig jobs found: {}'.format(df[df['gig']==1].shape[0]))

    # write output (use a prefix!)
    file_to_write = 'gigs/sample_daily_job_list.csv'
    bt.write_df_to_s3(df_to_write, bucket, file_to_write)
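
The bototools helpers (load_s3_location, load_df_from_s3, write_df_to_s3) are also external to these snippets. A minimal boto3-based sketch is below; the JSON config layout and the returned tuple are assumptions (Example #5 unpacks only two values from load_s3_location, so the real helper presumably adapts its return value to the config file).

import gzip
import io
import json
import os

import boto3
import pandas as pd


def load_s3_location(path_to_data, file_name):
    # Assumed config layout; Examples #3 and #4 unpack four values.
    with open(os.path.join(path_to_data, file_name)) as f:
        cfg = json.load(f)
    return cfg['bucket'], cfg['key'], cfg['url'], cfg['target']


def load_df_from_s3(bucket, key, comp=None):
    # Stream the object from S3 and let pandas parse it; comp='gzip'
    # mirrors the commented-out call sites above.
    obj = boto3.client('s3').get_object(Bucket=bucket, Key=key)
    return pd.read_csv(io.BytesIO(obj['Body'].read()), compression=comp)


def write_df_to_s3(df, bucket, key, comp=False):
    # Serialize to CSV in memory and upload; gzip only when comp is truthy.
    body = df.to_csv(index=False).encode('utf-8')
    if comp:
        body = gzip.compress(body)
    boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=body)
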