def extract():
    logging.info('Begin extract')
    
    """
    # Use for batch parsing
    candidate_file_agg = list()

    for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):
        folder_files = map(lambda x: os.path.join(root, x), files) # [os.path.join(root, x) for x in files]
        candidate_file_agg.extend(folder_files)
    """

    candidate_file = list()
    candidate_file.append(sys.argv[1])

    # Convert list to a pandas DataFrame
    observations = pandas.DataFrame(data=candidate_file, columns=['file_path'])
    logging.info('Found {} candidate file(s)'.format(len(observations.index)))

    # Subset candidate files to supported extensions
    observations['extension'] = observations['file_path'].apply(lambda x: os.path.splitext(x)[1])
    observations = observations[observations['extension'].isin(lib.AVAILABLE_EXTENSIONS)]
    logging.info('Took candidate file(s) with appropriate file format(s). {} file(s) remain'.
                 format(len(observations.index)))

    with io.open(candidate_file[0], 'r', encoding='utf-8') as cv:  # input is expected to be utf-8 encoded
        text = cv.read()

    # Attempt to extract text from files
    observations['text'] = text # observations['file_path'].apply(text_extract_utf8)

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
def load(observations, transformation_pipeline, trained_model):
    logging.info('Begin load')

    # Reference variables
    lib.get_temp_dir()

    observations_path = os.path.join(lib.get_temp_dir(), 'observations.csv')
    logging.info('Saving observations to path: {}'.format(observations_path))
    observations.to_csv(observations_path, index=False)

    if transformation_pipeline is not None:
        transformation_pipeline_path = os.path.join(
            lib.get_temp_dir(), 'transformation_pipeline.pkl')
        logging.info('Saving transformation_pipeline to path: {}'.format(
            transformation_pipeline_path))
        cPickle.dump(transformation_pipeline,
                     open(transformation_pipeline_path, 'wb'))

    if trained_model is not None:
        trained_model_path = os.path.join(lib.get_temp_dir(),
                                          'trained_model.pkl')
        logging.info(
            'Saving trained_model to path: {}'.format(trained_model_path))
        cPickle.dump(trained_model, open(trained_model_path, 'wb'))

    lib.archive_dataset_schemas('load', locals(), globals())
    logging.info('End load')
    pass
def transform(observations, nlp):
    logging.info('Begin transform')

    print("Extracting name, email, phone, GPA, and dates of work experience")
    observations = observations.fillna('')
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))
    observations['GPA'] = observations['text'].apply(
        field_extraction.gpa_extractor)
    observations['years_experience'] = observations['Work'].apply(
        field_extraction.years_of_experience)
    observations['mos_experience'] = field_extraction.months_of_experience(
        observations['years_experience'])

    # observations['work_dates'] = observations['Work'].apply(
    #     lambda x: field_extraction.spacy_extractor_by_type(str(x).replace('\n', '. '), nlp, 'DATE', 2))

    # observations['uni'] = observations['Edu'].apply(
    #     lambda x: field_extraction.spacy_extractor_by_type(str(x), nlp, 'ORG', 2))

    observations = field_extraction.extract_fields(observations)  # search for terms in whole resume

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations
def extract():
    logging.info('Begin extract')
    observations = pandas.DataFrame()

    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
Example #5
def transform(observations):
    logging.info('Begin transform')

    # Transform newsgroup20 data set
    # Newsgroup20: Extract article filename from document path
    observations['filename'] = observations['document_path'].apply(lambda x: ntpath.basename(x))

    # Newsgroup20: Extract article category from document path
    observations['category'] = observations['document_path'].apply(lambda x: ntpath.basename(os.path.dirname(x)))

    # Newsgroup20: Extract article text (and strip article headers), from document path
    observations['text'] = observations['document_path'].apply(lambda x: lib.strip_header(open(x).readlines()))

    # Remove non-ascii characters
    observations['text'] = observations['text'].apply(lambda x: x.decode('ascii', errors='ignore'))

    # Newsgroup20: Convert text to normalized tokens. Unknown tokens will map to 'UNK'.
    observations['tokens'] = observations['text'].apply(simple_preprocess)

    # Newsgroup20: Create bigrams
    observations['bigrams'] = observations['text'].apply(lambda x: lib.find_ngrams(x, n=2))

    # Newsgroup20: Create modeling text
    observations['modeling_text_list'] = observations['tokens'] + observations['bigrams']
    observations['modeling_text'] = observations['modeling_text_list'].apply(lambda x: ' '.join(x))

    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations
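
The bigram step leans on a `lib.find_ngrams` helper that is not shown in this listing. Below is a minimal sketch of one plausible implementation, assuming whitespace tokenization and underscore-joined n-grams so each bigram survives the later `' '.join` and CountVectorizer steps as a single token; the real helper may differ.

# Hypothetical sketch of lib.find_ngrams (the actual helper is not shown here).
# Joins each sliding window of n whitespace tokens with an underscore so a
# bigram behaves as one "word" downstream.
def find_ngrams(text, n=2):
    tokens = text.split()
    return ['_'.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]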
Example #6
def extract():
    logging.info('Begin extract')

    candidate_file_agg = list()  # for creating list of resume file paths
    for root, subdirs, files in os.walk(lib.get_conf(
            'resume_directory')):  # gets path to resumes from yaml file
        # os.walk(parentdir + '/data/input/example_resumes'): would do the same thing
        files = filter(lambda f: f.endswith(('.pdf', '.PDF')),
                       files)  # only read pdfs
        folder_files = map(lambda x: os.path.join(root, x), files)
        candidate_file_agg.extend(folder_files)

    observations = pd.DataFrame(data=candidate_file_agg,
                                columns=['file_path'])  # convert to df
    logging.info('Found {} candidate files'.format(len(observations.index)))
    observations['extension'] = observations['file_path'].apply(
        lambda x: os.path.splitext(x)[1])  # e.g. pdf or doc
    observations = observations[observations['extension'].isin(
        lib.AVAILABLE_EXTENSIONS)]
    logging.info(
        'Subset candidate files to extensions w/ available parsers. {} files remain'
        .format(len(observations.index)))
    observations['text'] = observations['file_path'].apply(
        lib.convert_pdf)  # get text from .pdf files

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(),
                                globals())  # saving the schema
    logging.info('End extract')
    return observations
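
`lib.convert_pdf` is another helper the listing does not include. A hedged sketch of what it could look like using pdfminer.six's high-level API follows; this is an assumption for illustration, not the repository's actual implementation.

# Hypothetical convert_pdf helper, assuming pdfminer.six is installed.
from pdfminer.high_level import extract_text

def convert_pdf(file_path):
    # Return the full plain-text content of a PDF file
    return extract_text(file_path)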
Example #7
def model(observations):
    logging.info('Begin model')

    # Resources
    vocabulary = set(itertools.chain.from_iterable(observations['modeling_text_list']))
    vectorizer = CountVectorizer(vocabulary=vocabulary)

    # Create train, test sets
    msk = numpy.random.rand(len(observations)) < 0.8
    train = observations[msk]
    test = observations[~msk]

    # Create X, y vectors
    X_train = vectorizer.fit_transform(train['modeling_text']).todense()
    y_train = train['category']

    X_test = vectorizer.transform(test['modeling_text']).todense()
    y_test = test['category']

    # Create, train model
    nb = GaussianNB()
    nb.fit(X_train, y_train)

    # Create predictions, using trained model
    test['preds'] = nb.predict(X_test)
    scores = nb.score(X_test, y_test)
    logging.info('Scores: {}'.format(scores))

    lib.archive_dataset_schemas('model', locals(), globals())
    logging.info('End model')
    return observations, vectorizer, nb, test
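
A hedged usage sketch of the returned artifacts: the vectorizer and the trained GaussianNB can classify a new document, provided it is preprocessed into the same `modeling_text` form. The example string below is made up.

# Hypothetical scoring of one new document with the returned vectorizer + nb.
new_doc = 'engine bike helmet engine_bike bike_helmet'  # made-up modeling_text
X_new = vectorizer.transform([new_doc]).todense()
print(nb.predict(X_new))  # predicted newsgroup category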
Example #8
def extract():
    logging.info('Begin extract')

    # Reference variables
    candidate_file_agg = list()

    # Create list of candidate files
    for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):
        folder_files = map(lambda x: os.path.join(root, x), files)
        candidate_file_agg.extend(folder_files)

    # Convert list to a pandas DataFrame
    observations = pandas.DataFrame(data=candidate_file_agg,
                                    columns=['file_path'])
    logging.info('Found {} candidate files'.format(len(observations.index)))

    # Subset candidate files to supported extensions
    observations['extension'] = observations['file_path'].apply(
        lambda x: os.path.splitext(x)[1])
    observations = observations[observations['extension'].isin(
        lib.AVAILABLE_EXTENSIONS)]
    logging.info(
        'Subset candidate files to extensions w/ available parsers. {} files remain'
        .format(len(observations.index)))

    # Attempt to extract text from files
    observations['text'] = observations['file_path'].apply(text_extract_utf8)

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
Example #9
def transform(observations, nlp):
    # TODO Docstring
    logging.info('Begin transform')

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))

    # Extract university
    observations['universities'] = observations['text'].apply(
        field_extraction.extract_universities)
    # Extract skills
    observations['skills'] = observations['text'].apply(
        field_extraction.extract_skills)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp
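
`lib.term_match` is referenced throughout these transforms but not shown. A plausible sketch, assuming it simply returns the first regex match in the text (or None); the repository's version may behave differently.

# Hypothetical sketch of lib.term_match (actual implementation not shown).
import re

def term_match(text, pattern):
    match = re.search(pattern, text)
    return match.group(0) if match else None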
Example #10
def transform(observations, nlp):
    # TODO Docstring
    logging.info('Begin transform')

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: extract_entities.candidate_name_extractor(x, nlp))
    # Extract nationality
    observations['nationality'] = observations['text'].apply(
        lambda x: extract_entities.nationality_extractor(x, nlp))

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.PHONE_REGEX))
    observations['birthdate'] = observations['text'].apply(
        lambda x: lib.birthdate_match(x, extract_entities.BIRTHDATE_REGEX))
    observations['unit_postcode'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.UNIT_POSTCODE_REGEX))
    observations['url'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.URL_REGEX))

    # Extract skills
    observations = extract_entities.extract_fields(observations)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp
Example #11
def load(mapper, large_model):
    """

     - Save mapper to pkl file
     - Save large model to h5py file

    :param mapper: Mapper, to translate pandas dataframe to usable numpy matrix
    :type mapper: DataFrameMapper
    :param large_model: A trained keras model
    :type large_model: keras.Model

    :return:
    """
    logging.info('Begin load')

    # Save mapper to file
    cPickle.dump(mapper, open('../data/output/mapper.pkl', 'wb'))

    # Save model to file
    large_model.save('../data/output/large_model.h5py')

    # Archive & return
    lib.archive_dataset_schemas('load', locals(), globals())
    logging.info('End load')
    pass
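
A hedged sketch of the matching read side, assuming the same output paths as above; `cPickle.load` and `keras.models.load_model` are the standard counterparts of the dump/save calls used here.

# Reloading the artifacts written above (paths assumed unchanged).
import cPickle
from keras.models import load_model

mapper = cPickle.load(open('../data/output/mapper.pkl', 'rb'))
large_model = load_model('../data/output/large_model.h5py')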
Example #12
def model(observations, mapper):
    logging.info('Begin model')

    cat_vars = ['air_store_id']
    cont_vars = ['reserve_visitors']
    date_vars = ['visit_datetime', 'reserve_datetime']
    response_var = 'visitors'

    Xs, y, x_inputs, input_nub, output_nub = df_prep.create_model_layers(
        observations, mapper, cat_vars, cont_vars, date_vars, response_var)

    # Create model
    x = input_nub
    preds = output_nub(x)

    regression_model = Model(x_inputs, preds)
    opt = optimizers.Adam()
    regression_model.compile(loss=lib.root_mean_squared_log_error,
                             optimizer=opt)

    regression_model.fit(Xs, y, batch_size=2**12, validation_split=.2)

    regression_model.save('../data/models/regression.h5')

    lib.archive_dataset_schemas('model', locals(), globals())
    logging.info('End model')
    pass
Example #13
def extract():
    logging.info('Begin extract')

    # Extract appropriate model
    char_model = load_model(filepath=lib.get_conf('generate_model_path'))

    # Extract posts to be completed
    observations = pandas.read_csv(lib.get_conf('post_seed_path'))

    logging.info('End extract')
    lib.archive_dataset_schemas('generate_extract', locals(), globals())
    return char_model, observations
def model(observations):
    logging.info('Begin model')

    mapper = None

    transformation_pipeline = None

    trained_model = None

    lib.archive_dataset_schemas('model', locals(), globals())
    logging.info('End model')
    return observations, transformation_pipeline, trained_model
def extract():
    # TODO Extract

    # Extract all posts for given subreddit, going back given number of days
    logging.info('Downloading submissions from Reddit')
    observations = scrape_subreddit(lib.get_conf('subreddit'),
                                    lib.get_conf('history_num_days'))
    logging.info('Found {} submissions'.format(len(observations.index)))

    logging.info('End extract')
    lib.archive_dataset_schemas('extract', locals(), globals())
    return observations
Example #16
def extract():
    logging.info('Begin extract')

    reservations = pandas.read_csv('../data/input/air_reserve.csv')
    visits = pandas.read_csv('../data/input/air_visit_data.csv')

    observations = pandas.merge(reservations, visits)
    observations = observations.sample(frac=1.0, replace=False)
    observations = observations.head(100000)
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
Example #17
def extract():
    """
    Extract necessary data / resources from upstream. This method will:

     - Validate that newsgroup data set is available, and read in
     - Validate that text embeddings are available, and read in
     - Validate that text to embedding index lookup is available, and read in


    :return: observations, embedding_matrix, word_to_index
    :rtype: (pandas.DataFrame, numpy.array, dict)
    """

    logging.info('Begin extract')
    logging.info('Performing extract for batch: {}, from newsgroup_path: {}'
                 .format(lib.get_batch_name(), lib.get_conf('newsgroup_path')))

    # Download resources

    # Confirm newsgroup data set is downloaded
    resources.download_newsgroup()

    # Confirm that embedding is downloaded
    resources.download_embedding()

    # Extract resources from file system

    # Newsgroup20: Get list of all candidate documents
    glob_pattern = os.path.join(lib.get_conf('newsgroup_path'), '*', '*')
    logging.info('Searching for glob_pattern: {}'.format(glob_pattern))
    document_candidates = glob.glob(glob_pattern)

    # Newsgroup20: Create observations data set
    observations = pandas.DataFrame(document_candidates, columns=['document_path'])
    logging.info('Shape of observations data frame created from glob matches: {}'.format(observations.shape))

    # Newsgroup20: Re-order rows
    observations = observations.sample(frac=1)

    # Newsgroup20: Subset number of observations, if it's a test run
    if lib.get_conf('test_run'):
        logging.info('Reducing file size for test run')
        observations = observations.head(100)
        logging.info('Test run number of records: {}'.format(len(observations.index)))

    # Embedding: Load embedding
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    logging.info('word_to_index max index: {}'.format(max(word_to_index.values())))

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations, embedding_matrix, word_to_index
Example #18
def transform(observations, label_encoder):
    logging.info('Begin transform')

    # Feature engineering
    observations.columns = map(lambda x: '_'.join(x.lower().split()),
                               observations.columns)
    observations['lat'] = observations['location_1'].apply(
        lambda x: eval(x)[0])
    observations['long'] = observations['location_1'].apply(
        lambda x: eval(x)[1])

    # TODO Feature engineering
    observations['is_manhattan'] = observations['borough'] == 'MANHATTAN'
    observations['is_ny_police'] = observations[
        'jurisdiction'] == 'N.Y. POLICE DEPT'
    # observations['occurence_epoch'] = pandas.to_datetime(observations['occurrence_datetime'], format='%m/%d/%y %I:%M:%S %p')
    # print observations['occurence_epoch'][0], type(observations['occurence_epoch'][0])
    # observations['compstat_date'] = observations['compstat_year'].astype(str) + '-' + observations['compstat_month'].astype(str)  + '-' + \
    #                                 observations['compstat_day'].astype(str)
    #

    # Dummy out response variable
    if label_encoder is None:
        label_encoder = lib.create_label_encoder(observations['offense'])
    observations['response'] = observations['offense'].apply(
        lambda x: label_encoder[x])

    observations['is_grand_larceny'] = observations['offense'].apply(
        lambda x: x == 'GRAND LARCENY')
    logging.info('is_grand_larceny value counts: {}'.format(
        observations['is_grand_larceny'].value_counts()))

    lib.archive_dataset_schemas('transform', locals(), globals())

    regressors = [
        'occurrence_day', 'occurrence_year', 'compstat_month', 'compstat_day',
        'compstat_year', 'lat', 'long'
    ]
    response_var = 'response'

    # TODO Normalization should always be based on training set, not just set at hand
    for regressor in regressors:
        max_value = observations[regressor].max()
        min_value = observations[regressor].min()
        observations[regressor] = (observations[regressor] -
                                   min_value) / (max_value - min_value)
    regressors.extend(['is_manhattan', 'is_ny_police'])
    X = observations[regressors].as_matrix().astype(numpy.float32)
    y = numpy.array(observations[response_var].tolist()).astype(numpy.float32)

    logging.info('End transform')
    return observations, X, y, label_encoder
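
On the TODO above: a minimal sketch of normalizing against training-set statistics only, so the same bounds are reused for any held-out data. The train/test split itself is assumed to exist; this is illustrative, not the repository's code.

# Sketch only: fit min/max bounds on the training frame, reuse them elsewhere.
def fit_minmax(train_df, columns):
    return {c: (train_df[c].min(), train_df[c].max()) for c in columns}

def apply_minmax(df, bounds):
    for c, (lo, hi) in bounds.items():
        df[c] = (df[c] - lo) / (hi - lo)
    return df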
Example #19
def load(char_model, observations, generated_posts):
    logging.info('Begin load')

    # Export observations
    observations.to_csv(
        path_or_buf=lib.get_conf('generated_observations_path'), index=False)

    # Export generated posts
    generated_posts.to_csv(path_or_buf=lib.get_conf('generated_posts_path'),
                           index=False)

    logging.info('End load')
    lib.archive_dataset_schemas('generate_load', locals(), globals())
    pass
Example #20
def extract():
    """

     - Extract data from CSV

    :return:
    """
    logging.info('Begin extract')

    # Read files from CSV
    observations = pandas.read_csv('../data/input/titanic.csv')

    # Archive & return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
Example #21
def model(x_train, x_test, y_train, y_test):
    """

     - Train multiple models, return a trained model

    :param x_train:
    :param x_test:
    :param y_train:
    :param y_test:
    :return:
    """
    logging.info('Begin model')

    # Baseline model
    baseline_model = models.baseline()
    baseline_model.fit(
        x_train,
        y_train,
        epochs=20,
        validation_split=.3,
        callbacks=[
            TensorBoard(log_dir=os.path.expanduser('~/.logs/baseline'))
        ])

    # Small model
    intermediate_model = models.small()
    intermediate_model.fit(
        x_train,
        y_train,
        epochs=20,
        validation_split=.3,
        callbacks=[TensorBoard(log_dir=os.path.expanduser('~/.logs/small'))])

    # Large
    large_model = models.large()
    large_model.fit(
        x_train,
        y_train,
        epochs=20,
        validation_split=.3,
        callbacks=[TensorBoard(log_dir=os.path.expanduser('~/.logs/large'))])

    # Archive & return
    lib.archive_dataset_schemas('model', locals(), globals())
    logging.info('End model')

    return large_model
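
A hedged follow-up: the returned large_model can be evaluated on the held-out split produced by transform(). What evaluate() returns depends on how models.large() was compiled, so the result is printed as-is rather than unpacked.

# Hedged usage sketch: score the returned model on the held-out split.
score = large_model.evaluate(x_test, y_test)
print(score)  # loss (and any compiled metrics) on the test set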
Example #22
def transform(observations):
    """

     - Convert Sex to boolean male indicator
     - Create train / test split
     - Create SKLearn-Pandas mapper
     - Train SKLearn
     - Transform train and test data

    :param observations:
    :type observations: pandas.DataFrame
    :return:
    """
    logging.info('Begin transform')

    # Convert Sex field into boolean male indicator
    observations['male'] = observations['Sex'] == 'male'
    logging.info('Converted Sex to binary class. Value counts: {}'.format(
        observations['male'].value_counts()))

    # Split into train / test split
    mask = numpy.random.rand(len(observations)) < 0.8
    observations_train = observations[mask]
    observations_test = observations[~mask]

    logging.info('Creating dataframe mapper')
    mapper = DataFrameMapper([(['Age'], [Imputer(),
                                         StandardScaler()]),
                              (['SibSp'], [Imputer(),
                                           StandardScaler()]),
                              (['Parch'], [Imputer(),
                                           StandardScaler()]),
                              (['male'], [Imputer(strategy='most_frequent')])])

    logging.info('Fitting and transforming training data set')
    x_train = mapper.fit_transform(observations_train)
    y_train = observations_train['Survived'].values

    logging.info('Transforming response data set')
    x_test = mapper.transform(observations_test)
    y_test = observations_test['Survived'].values

    # Archive & return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return x_train, x_test, y_train, y_test, mapper
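
A hedged usage sketch: the fitted mapper can project a new passenger record into the same feature space used for training. The record values below are made up; the column names match those fed to the mapper above.

# Hypothetical new passenger, transformed with the already-fitted mapper.
import pandas
new_passenger = pandas.DataFrame([{'Age': 29, 'SibSp': 0, 'Parch': 0, 'male': True}])
x_new = mapper.transform(new_passenger)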
Example #23
def model(observations, X, y, label_encoder):
    logging.info('Beginning model')

    # Data split, formatting
    dummy_X = observations[['lat', 'long']].as_matrix()
    dummy_y = observations['is_grand_larceny']

    # ZeroR Model
    dummy_clf = DummyClassifier(strategy='constant', constant=1)
    dummy_clf.fit(dummy_X, dummy_y)
    print('Dummy model accuracy: {}'.format(dummy_clf.score(dummy_X, dummy_y)))

    # Keras model

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    train_test_mask = numpy.random.random(size=len(observations.index))
    num_train = sum(train_test_mask < .8)
    num_validate = sum(train_test_mask >= .8)
    logging.info(
        'Proceeding w/ {} train observations, and {} test observations'.format(
            num_train, num_validate))

    ff_model = models.gen_stupid_ff_network(X.shape[1], y.shape[1])

    ff_model.fit(X_train,
                 y_train,
                 batch_size=1024,
                 epochs=4,
                 validation_data=(X_test, y_test))

    # Add predictions to data set
    preds = ff_model.predict(X)
    ff_model.metrics, ff_model.metrics_names

    observations['max_probability'] = map(max, preds)
    observations['prediction_index'] = map(lambda x: numpy.argmax(x), preds)
    observations['modeling_prediction'] = map(
        lambda x: lib.prop_to_label(x, label_encoder), preds)
    trained_model = ff_model
    logging.info('End model')
    lib.archive_dataset_schemas('model', locals(), globals())
    return observations, X, y, label_encoder, trained_model
def load(train, test, transformation_pipeline, trained_model):
    """
    Load all assets for downstream use

    :param train:
    :param test:
    :param transformation_pipeline:
    :param trained_model:
    :return:
    """
    logging.info('Begin load')

    # Serialize train
    train_path = os.path.join(lib.get_batch_output_folder(), 'train.csv')
    logging.info('Saving train to path: {}'.format(train_path))
    train.to_csv(train_path, index=False)

    # Serialize test
    test_path = os.path.join(lib.get_batch_output_folder(), 'test.csv')
    logging.info('Saving test to path: {}'.format(test_path))
    test.to_csv(test_path, index=False)

    # Serialize transformation_pipeline
    if transformation_pipeline is not None:
        transformation_pipeline_path = os.path.join(
            lib.get_batch_output_folder(), 'transformation_pipeline.pkl')
        logging.info('Saving transformation_pipeline to path: {}'.format(
            transformation_pipeline_path))
        pickle.dump(transformation_pipeline,
                    open(transformation_pipeline_path, 'wb'))

    # Serialize trained_model
    if trained_model is not None:
        # Serialize trained_model
        trained_model_path = os.path.join(lib.get_batch_output_folder(),
                                          'trained_model.pkl')
        logging.info(
            'Saving trained_model to path: {}'.format(trained_model_path))
        pickle.dump(trained_model, open(trained_model_path, 'wb'))

        # Capture model results
        print(trained_model.cv_results_)

    lib.archive_dataset_schemas('load', locals(), globals())
    logging.info('End load')
    pass
def model(train, test):
    """
    Create a pipeline and train a grid searched model

    :param train:
    :param test:
    :return:
    """

    logging.info('Begin model')

    mapper = DataFrameMapper([
        ('honorific', [CountVectorizer(vocabulary=lib.HONORIFIC_VOCABULARY)]),
        (['pclass'], [Imputer(), StandardScaler()]),
        (['male'], [Imputer(), StandardScaler()]),
        (['siblings_spouses_aboard'], [Imputer(), StandardScaler()]),
        (['parents_children_aboard'], [Imputer(), StandardScaler()]),
        (['fare'], [Imputer(), StandardScaler()]),
    ])
    transformation_pipeline = Pipeline([('featureizer', mapper),
                                        ('svc', SVC())])

    param_grid = {
        'svc__gamma': numpy.logspace(-9, 3, 1),
        'svc__C': numpy.logspace(-2, 10, 1),
        'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svc__degree': range(2, 8)
    }

    trained_model = GridSearchCV(transformation_pipeline,
                                 param_grid=param_grid,
                                 scoring='accuracy',
                                 cv=2,
                                 n_jobs=-1)
    logging.info('Training model')
    trained_model.fit(train.copy(), y=train['survived'])

    # Set prediction
    for data_set in [train, test]:
        data_set['pred'] = trained_model.predict(data_set)

    lib.archive_dataset_schemas('model', locals(), globals())
    logging.info('End model')
    return train, test, transformation_pipeline, trained_model
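
A hedged usage note: because the featurization pipeline is wrapped inside the GridSearchCV estimator, the returned trained_model can report its winning hyper-parameters and score a raw data frame directly.

# Hedged usage sketch with the objects returned by model().
print(trained_model.best_params_)
print(trained_model.score(test, test['survived']))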
Example #26
def transform(observations):
    """
    Perform light feature transformation, ahead of feature transformation pipeline

    :param observations:
    :return:
    """
    logging.info('Begin transform')

    # Convert the gender column to a male or not column
    observations['male'] = observations['sex'] == 'male'

    # Get the honorific (e.g. `Mr.` from `,Mr. Henry Jr Sutehall`)
    observations['honorific'] = observations['name'].apply(lambda x: str(x).split()[0])
    train, test = train_test_split(observations, test_size=0.2)

    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return train, test
Example #27
def load(observations, vectorizer, nb, test):
    logging.info('Begin load')

    logging.info('Writing observations to CSV')
    observations.to_csv(os.path.join(lib.get_batch_output_folder(), 'observations.csv'))

    logging.info('Writing test observations to CSV ')
    test.to_csv(os.path.join(lib.get_batch_output_folder(), 'test.csv'))

    logging.info('Writing vectorizer to file')
    cPickle.dump(vectorizer, open(os.path.join(lib.get_batch_output_folder(), 'vectorizer.pkl'), 'wb'))

    logging.info('Writing model to file')
    cPickle.dump(nb, open(os.path.join(lib.get_batch_output_folder(), 'model.pkl'), 'wb'))


    lib.archive_dataset_schemas('load', locals(), globals())
    logging.info('End load')
    pass
Example #28
def extract():
    """
    Extract the data set from upstream

    :return:
    """
    logging.info('Begin extract')

    # Load the data set
    observations = lib.load_titanic()

    # Subset observation for speedier test iterations
    if lib.get_conf('test_run'):
        logging.warn('test_run is set to True. Subsetting to a much smaller data set for testing purposes.')
        observations = observations.sample(100)
        observations = observations.reset_index()

    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
Example #29
def extract():
    # TODO Docstring

    logging.info('Begin extract')

    # Extract all posts for given subreddit, going back given number of days
    logging.info('Downloading submissions from Reddit')
    observations = scrape_subreddit(lib.get_conf('subreddit'),
                                    lib.get_conf('history_num_days'))
    logging.info('Found {} submissions'.format(len(observations.index)))

    # Load embedding matrix
    resources.download_embedding()
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    logging.info('word_to_index max index: {}'.format(
        max(word_to_index.values())))

    logging.info('End extract')
    lib.archive_dataset_schemas('extract', locals(), globals())
    return embedding_matrix, word_to_index, observations
Example #30
def transform(observations):
    logging.info('Begin transform')

    cat_vars = ['air_store_id']
    cont_vars = ['reserve_visitors', 'visitors']
    date_vars = ['visit_datetime', 'reserve_datetime']

    # Convert datetime vars
    for date_var in date_vars:
        logging.info('Converting date_var: {}'.format(date_var))
        observations[date_var] = pandas.to_datetime(observations[date_var],
                                                    format='%Y-%m-%d %H:%M:%S')

    mapper = df_prep.create_mapper(observations,
                                   cat_vars=cat_vars,
                                   cont_vars=cont_vars,
                                   date_vars=date_vars)

    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, mapper