Example #1
def api_post(code, refresh):
    import http.client
    clientID = lib.get_conf('client_id')
    clientSecret = lib.get_conf('client_secret')
    redirect_URL = 'http://127.0.0.1:5000/'
    conn = http.client.HTTPSConnection("api.dexcom.com")

    if refresh:
        payload = "client_secret=" + clientSecret + "&client_id=" +clientID \
        + "&refresh_token=" + code + "&grant_type=refresh_token&redirect_uri=" + redirect_URL

    else:
        payload = "client_secret=" + clientSecret + "&client_id=" +clientID \
        + "&code=" + code + "&grant_type=authorization_code&redirect_uri=" + redirect_URL

    headers = {
        'content-type': "application/x-www-form-urlencoded",
        'cache-control': "no-cache"
    }

    conn.request("POST", "/v2/oauth2/token", payload, headers)

    res = conn.getresponse()
    data = res.read()

    return data
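A variant of the request-body construction, shown as a minimal sketch: it assumes the same lib.get_conf helper and Dexcom endpoint as above, and uses urllib.parse.urlencode so that secrets containing reserved characters are encoded correctly. The function name api_post_encoded is hypothetical.

def api_post_encoded(code, refresh):
    # Sketch only: same endpoint and configuration keys as api_post above
    import http.client
    import urllib.parse

    params = {
        'client_secret': lib.get_conf('client_secret'),
        'client_id': lib.get_conf('client_id'),
        'redirect_uri': 'http://127.0.0.1:5000/',
    }
    if refresh:
        params.update({'refresh_token': code, 'grant_type': 'refresh_token'})
    else:
        params.update({'code': code, 'grant_type': 'authorization_code'})

    headers = {
        'content-type': "application/x-www-form-urlencoded",
        'cache-control': "no-cache"
    }

    conn = http.client.HTTPSConnection("api.dexcom.com")
    conn.request("POST", "/v2/oauth2/token", urllib.parse.urlencode(params), headers)
    return conn.getresponse().read()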
Example #2
def evaluate(x_test, y_test):
    model_dir_path = lib.get_conf('model_directory')

    # Load saved model
    model_name = listdir(model_dir_path)[-1]

    model_path = model_dir_path + '\\' + model_name

    classifier = load_model(model_path)

    y_pred = classifier.predict_classes(x_test)

    # Instantiate empty output dataframe
    pd_eval_output = pd.DataFrame()
    pd_eval_output['Actual'] = pd.Series(y_test)
    pd_eval_output['Predicted'] = pd.Series(y_pred)

    eval_output_directory = lib.get_conf('prediction_output_directory')
    eval_output_file_name = 'tf_evaluation_output.csv'
    eval_output_path = eval_output_directory + '\\' + eval_output_file_name

    # Write out evaluation output
    pd_eval_output.to_csv(eval_output_path)

    score = metrics.accuracy_score(y_test, y_pred)

    cm = metrics.confusion_matrix(y_test, y_pred)

    print('Model Accuracy: {}'.format(score))  #For Demo purposes
    logging.info('Model Accuracy: {}'.format(score))
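The '\\' separators above assume a Windows file system. A portable sketch (hypothetical helper; it assumes the same directory layout and that a plain filename sort picks the newest model) builds the same paths with os.path.join:

import os

def latest_model_path(model_dir_path):
    # Join directory and file name with the platform's separator
    model_name = sorted(os.listdir(model_dir_path))[-1]
    return os.path.join(model_dir_path, model_name)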
Example #3
def extract():

    # Extract appropriate model
    char_model = load_model(filepath=lib.get_conf('generate_model_path'))

    # Extract posts to be completed
    observations = pandas.read_csv(lib.get_conf('post_seed_path'))

    logging.info('End extract')
    lib.archive_dataset_schemas('generate_extract', locals(), globals())
    return char_model, observations
Example #4
def extract():
    """
    Extract necessary data / resources from upstream. This method will:

     - Validate that newsgroup data set is available, and read in
     - Validate that text embeddings are available, and read in
     - Validate that text to embedding index lookup is available, and read in


    :return: observations, embedding_matrix, word_to_index
    :rtype: (pandas.DataFrame, numpy.array, dict)
    """

    logging.info('Begin extract')
    logging.info('Performing extract for batch: {}, from newsgroup_path: {}'
                 .format(lib.get_batch_name(), lib.get_conf('newsgroup_path')))

    # Download resources

    # Confirm newsgroup data set is downloaded
    resources.download_newsgroup()

    # Confirm that embedding is downloaded
    resources.download_embedding()

    # Extract resources from file system

    # Newsgroup20: Get list of all candidate documents
    glob_pattern = os.path.join(lib.get_conf('newsgroup_path'), '*', '*')
    logging.info('Searching for glob_pattern: {}'.format(glob_pattern))
    document_candidates = glob.glob(glob_pattern)

    # Newsgroup20: Create observations data set
    observations = pandas.DataFrame(document_candidates, columns=['document_path'])
    logging.info('Shape of observations data frame created from glob matches: {}'.format(observations.shape))

    # Newsgroup20: Re-order rows
    observations = observations.sample(frac=1)

    # Newsgroup20: Subset number of observations, if it's a test run
    if lib.get_conf('test_run'):
        logging.info('Reducing file size for test run')
        observations = observations.head(100)
        logging.info('Test run number of records: {}'.format(len(observations.index)))

    # Embedding: Load embedding
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    logging.info('word_to_index max index: {}'.format(max(word_to_index.values())))

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations, embedding_matrix, word_to_index
Example #5
def extract():
    # TODO Extract

    # Extract all posts for given subreddit, going back given number of days
    logging.info('Downloading submissions from Reddit')
    observations = scrape_subreddit(lib.get_conf('subreddit'),
                                    lib.get_conf('history_num_days'))
    logging.info('Found {} submissions'.format(len(observations.index)))

    logging.info('End extract')
    lib.archive_dataset_schemas('extract', locals(), globals())
    return observations
Example #6
def load(char_model, observations, generated_posts):
    logging.info('Begin load')

    # Export observations
    observations.to_csv(
        path_or_buf=lib.get_conf('generated_observations_path'), index=False)

    # Export generated posts
    generated_posts.to_csv(path_or_buf=lib.get_conf('generated_posts_path'),
                           index=False)

    logging.info('End load')
    lib.archive_dataset_schemas('generate_load', locals(), globals())
    pass
Example #7
def scrape_subreddit(subreddit_name, num_days):
    # TODO Docstring

    logging.info(
        'Beginning Reddit scraper, for subreddit: {}, and num_days: {}'.format(
            subreddit_name, num_days))

    # Reference variables
    parsed_submission_agg = list()

    # Create connection. For details, see https://www.reddit.com/prefs/apps/

    logging.info('Creating reddit connection')
    reddit = praw.Reddit(client_id=lib.get_conf('client_id'),
                         client_secret=lib.get_conf('client_secret'),
                         user_agent='upvote_estimator:0.0.1')

    # Find correct subreddit
    logging.info('Searching for subreddit: {}'.format(subreddit_name))
    subreddit = reddit.subreddit(subreddit_name)
    logging.debug('Searched for subreddit: {}, found subreddit: {}, {}'.format(
        subreddit_name, subreddit.display_name, subreddit.title))

    # Compute correct time range (current datetime - num_days to current datetime)
    end_datetime = datetime.datetime.utcnow()
    end_datetime_unix = time.mktime(end_datetime.timetuple())
    start_datetime = end_datetime - datetime.timedelta(days=num_days)
    start_datetime_unix = time.mktime(start_datetime.timetuple())
    logging.debug('Time range: {} to {}'.format(start_datetime, end_datetime))

    # Iterate through posts chronologically
    for index, submission in enumerate(
            subreddit.submissions(start_datetime_unix, end_datetime_unix)):
        logging.info('Working number {}, submission: {}'.format(
            index, submission))

        # Parse each submission and extract essential fields
        parsed_submission = submission_parser(submission)

        # Add info from each post to aggregator
        parsed_submission_agg.append(parsed_submission)

        if lib.get_conf('test_run') and index >= 49:
            break
    # Create DataFrame from pulled data
    posts = pandas.DataFrame(parsed_submission_agg)

    # Return
    return posts
Example #8
def create_trials():
    # TODO Docstring

    logging.info('Beginning trials')

    # Create pre-trial DataFrame (one observation is one trial), with trial number and starting dollar amount
    logging.info('Creating pre-trial dataframe')
    index = range(1, lib.get_conf('num_trials') + 1)
    trials = pandas.DataFrame(index=index)
    trials['trial_num'] = trials.index
    trials['starting_amount'] = lib.get_conf('starting_amount')

    logging.info(
        'Created trials table with starting info: \n{}'.format(trials))

    # Generate observed standard deviations
    logging.info('Generating standard_deviations')
    st_devs = [
        numpy.random.normal(size=20) for i in range(lib.get_conf('num_trials'))
    ]
    trials['st_devs'] = st_devs

    # Translate observed standard deviations to dollar amounts (with inflation)

    # Iterate through portfolios
    for portfolio_dict in lib.get_conf('portfolios'):
        logging.info(
            'Generating results for portfolio dict: {}'.format(portfolio_dict))

        # Compute list of balances (one for every year)
        balances_columns = portfolio_dict['portfolio'] + '_balances'
        trials[balances_columns] = trials.apply(
            func=lambda x: lib.compute_balances(
                x,
                return_mean=portfolio_dict['return_mean'],
                return_std_dev=portfolio_dict['return_std_dev']),
            axis=1)

        # Extract final dollar amount for each trial

        trials[portfolio_dict['portfolio'] +
               '_final_balance'] = trials[balances_columns].apply(
                   lambda x: x[-1])

    # TODO Archive data and return trial data
    logging.info('Trials complete')

    return trials
Example #9
def transform(observations, false_y=False):

    # Reference variables
    char_indices = lib.get_char_indices()
    indices_char = lib.get_indices_char()
    x_agg = list()
    y_agg = list()

    if lib.get_conf('test_run'):
        observations = observations.head(100).copy()

    # Create a single field with all text. < and > serve as start and end tokens
    observations['model_text'] = observations['title'] + ' ' + observations['selftext']

    # Iterate through individual observations
    for text in observations['model_text']:

        # Generate x and y for observations
        observation_x, observation_y = lib.gen_x_y(text, false_y=false_y)
        x_agg.extend(observation_x)
        y_agg.extend(observation_y)

    x = numpy.matrix(x_agg)
    y = numpy.matrix(y_agg)
    return observations, char_indices, indices_char, x, y
Example #10
def create_embedding_matrix():
    """
    Load embedding assets from file.

     - Load embedding binaries w/ gensim
     - Extract embedding matrix from gensim model
     - Extract word to index lookup from gensim model
    :return: embedding_matrix, word_to_index
    :rtype: (numpy.array, {str:int})
    """

    logging.info(
        'Reading embedding matrix and word to index dictionary from file')

    # Get word weights from file via gensim
    model = gensim.models.KeyedVectors.load_word2vec_format(
        get_conf('embedding_path'), binary=True)
    embedding_matrix = model.syn0

    # Build word-to-index lookup from the gensim vocabulary
    word_to_index = dict([(k, v.index) for k, v in model.vocab.items()])

    logging.info('Created embedding matrix, of shape: {}'.format(
        embedding_matrix.shape))
    logging.info(
        'Created word to index lookup, with min index: {}, max index: {}'.
        format(min(word_to_index.values()), max(word_to_index.values())))

    return embedding_matrix, word_to_index
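The syn0 and vocab attributes belong to older gensim releases; gensim 4.x replaced them with vectors and key_to_index. A sketch of the equivalent extraction for 4.x (assuming the same embedding_path configuration key; the function name is hypothetical):

def create_embedding_matrix_gensim4():
    # Same extraction as above, written against the gensim 4.x KeyedVectors API
    model = gensim.models.KeyedVectors.load_word2vec_format(
        get_conf('embedding_path'), binary=True)
    embedding_matrix = model.vectors
    word_to_index = dict(model.key_to_index)
    return embedding_matrix, word_to_index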
Example #11
def extract():
    logging.info('Begin extract')

    candidate_file_agg = list()  # for creating list of resume file paths
    for root, subdirs, files in os.walk(lib.get_conf(
            'resume_directory')):  # gets path to resumes from yaml file
        # os.walk(parentdir + '/data/input/example_resumes'): would do the same thing
        files = filter(lambda f: f.endswith(('.pdf', '.PDF')),
                       files)  # only read pdfs
        folder_files = map(lambda x: os.path.join(root, x), files)
        candidate_file_agg.extend(folder_files)

    observations = pd.DataFrame(data=candidate_file_agg,
                                columns=['file_path'])  # convert to df
    logging.info('Found {} candidate files'.format(len(observations.index)))
    observations['extension'] = observations['file_path'].apply(
        lambda x: os.path.splitext(x)[1])  # e.g. pdf or doc
    observations = observations[observations['extension'].isin(
        lib.AVAILABLE_EXTENSIONS)]
    logging.info(
        'Subset candidate files to extensions w/ available parsers. {} files remain'
        .format(len(observations.index)))
    observations['text'] = observations['file_path'].apply(
        lib.convert_pdf)  # get text from .pdf files

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(),
                                globals())  # saving the schema
    logging.info('End extract')
    return observations
Example #12
def extract_skills(resume_text):
    potential_skills_dict = dict()
    matched_skills = set()

    # TODO This skill input formatting could happen once per run, instead of once per observation.
    for skill_input in lib.get_conf('skills'):

        # Format list inputs
        if type(skill_input) is list and len(skill_input) >= 1:
            potential_skills_dict[skill_input[0]] = skill_input

        # Format string inputs
        elif type(skill_input) is str:
            potential_skills_dict[skill_input] = [skill_input]
        else:
            logging.warn(
                'Unknown skill listing type: {}. Please format as either a single string or a list of strings'
                ''.format(skill_input))

    for (skill_name, skill_alias_list) in potential_skills_dict.items():

        skill_matches = 0
        # Iterate through aliases
        for skill_alias in skill_alias_list:
            # Add the number of matches for each alias
            skill_matches += lib.term_count(resume_text, skill_alias.lower())

        # If at least one alias is found, add skill name to set of skills
        if skill_matches > 0:
            matched_skills.add(skill_name)

    return matched_skills
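A hypothetical usage sketch, assuming the configured 'skills' list contains a plain string entry 'python' and an alias list ['aws', 'amazon web services']:

# Aliases count toward the same skill, so either AWS spelling matches once
resume_text = 'built etl pipelines on amazon web services using python'
print(extract_skills(resume_text))  # e.g. {'python', 'aws'}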
Example #13
def send_to_Cosmos():
    input_path = os.path.join(lib.get_conf('summary_output_directory'), 'resume_summary.json')
    client = dc.DocumentClient(config_cosmos.COSMOSDB_HOST, {'masterKey': config_cosmos.COSMOSDB_KEY})
    with open(input_path) as read_file:
        data = json.load(read_file)
    for i in data["data"]:
        client.CreateDocument(CosmosConnection.get_collection_link(config_cosmos.COSMOSDB_DATABASE, config_cosmos.COSMOSDB_COLLECTION), i)
Example #14
def extract():
    logging.info('Begin extract')

    # Reference variables
    candidate_file_agg = list()

    # Create list of candidate files
    for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):
        folder_files = map(lambda x: os.path.join(root, x), files)
        candidate_file_agg.extend(folder_files)

    # Convert list to a pandas DataFrame
    observations = pandas.DataFrame(data=candidate_file_agg,
                                    columns=['file_path'])
    logging.info('Found {} candidate files'.format(len(observations.index)))

    # Subset candidate files to supported extensions
    observations['extension'] = observations['file_path'].apply(
        lambda x: os.path.splitext(x)[1])
    observations = observations[observations['extension'].isin(
        lib.AVAILABLE_EXTENSIONS)]
    logging.info(
        'Subset candidate files to extensions w/ available parsers. {} files remain'
        .format(len(observations.index)))

    # Attempt to extract text from files
    observations['text'] = observations['file_path'].apply(text_extract_utf8)

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
Example #15
def create_embedding_matrix():
    """
    Load embedding assets from file.

     - Load embedding binaries w/ gensim
     - Extract embedding matrix from gensim model
     - Extract word to index lookup from gensim model
    :return: embedding_matrix, word_to_index
    :rtype: (numpy.array, {str:int})
    """

    logging.info(
        'Reading embedding matrix and word to index dictionary from file')

    # Get word weights from file via gensim
    model = gensim.models.KeyedVectors.load_word2vec_format(
        get_conf('embedding_path'), binary=True)
    embedding_matrix = model.syn0

    # Build word-to-index lookup from the gensim vocabulary
    word_to_index = dict([(k, v.index) for k, v in model.vocab.items()])

    # Transform embedding resources
    # Embedding: Update embedding to map any unknown words (words not in training vocabulary) to the unknown value
    default_dict_instance = defaultdict(lambda: word_to_index['UNK'])
    default_dict_instance.update(word_to_index)
    word_to_index = default_dict_instance

    logging.info('Created embedding matrix, of shape: {}'.format(
        embedding_matrix.shape))
    logging.info(
        'Created word to index lookup, with min index: {}, max index: {}'.
        format(min(word_to_index.values()), max(word_to_index.values())))

    return embedding_matrix, word_to_index
Example #16
def download_embedding():
    """
    Prepare GoogleNews pre-trained word embeddings.

     - Check if compressed embeddings are available
     - If compressed embeddings are not available, download them
     - Check if uncompressed embeddings are available
     - If uncompressed embeddings are not available, uncompress them

    :return: None
    :rtype: None
    """

    logging.info(
        'Attempting to either validate or download and extract embeddings.')

    # Reference variables
    embedding_download_link = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
    embedding_downloaded_path = '../resources/compressed/GoogleNews-vectors-negative300.bin.gz'

    # Download embeddings, if necessary
    if not os.path.exists(embedding_downloaded_path):
        logging.warn(
            'embedding_downloaded_path does not yet exist. Downloading embedding. This occurs once, and will take '
            '10-20 minutes (due to large file size)')
        print(
            'embedding_downloaded_path does not yet exist. Downloading embedding. This occurs once, and will take '
            '10-20 minutes (due to large file size)')
        logging.info('Downloading embedding data from: {} to: {}'.format(
            embedding_download_link, embedding_downloaded_path))

        download_file(embedding_download_link, embedding_downloaded_path)

    # Extract embeddings, if necessary
    if not os.path.exists(get_conf('embedding_path')):
        logging.warn('embedding_path does not exist. Extracting embedding.')
        logging.info('Extracting embedding data from: {} to: {}'.format(
            embedding_downloaded_path, get_conf('embedding_path')))

        with gzip.open(embedding_downloaded_path, 'rb') as zipped, \
                open(get_conf('embedding_path'), 'wb') as unzipped:
            for line in zipped:
                unzipped.write(line)

    logging.info('Embeddings available at: {}'.format(
        get_conf('embedding_path')))
Example #17
def model(observation, char_indices, indices_char, x, y):

    char_model = models.rnn_embedding_model(x, y)

    # Set up model training variables
    optimizer = RMSprop(lr=0.01)
    char_model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    batch_size = 4096
    num_epochs = 200

    if lib.get_conf('test_run'):
        num_epochs = 2

    # Set up callbacks
    tf_log_path = os.path.join(os.path.expanduser('~/log_dir'),
                               lib.get_batch_name())
    logging.info('Using Tensorboard path: {}'.format(tf_log_path))

    mc_log_path = os.path.join(
        lib.get_conf('model_checkpoint_path'),
        lib.get_batch_name() + '_epoch_{epoch:03d}_loss_{loss:.2f}.h5py')
    logging.info('Using mc_log_path path: {}'.format(mc_log_path))

    sentence_generator = SentenceGenerator(verbose=1)

    clr_step_size = numpy.floor((float(x.shape[0]) / batch_size) * 4)
    clr = CyclicLR(base_lr=.005,
                   max_lr=.02,
                   mode='triangular2',
                   step_size=clr_step_size)
    logging.info('Using CLR step size: {}'.format(clr_step_size))

    callbacks = [
        TensorBoard(log_dir=tf_log_path),
        ModelCheckpoint(mc_log_path), sentence_generator, clr
    ]

    # Train the model, output generated text after each iteration
    char_model.fit(x,
                   y,
                   batch_size=batch_size,
                   epochs=num_epochs,
                   callbacks=callbacks)

    print(sentence_generator.sentences)
Example #18
def download_file(url, local_file_path, auth=False):
    """
    Download the file at `url` in chunks, to the location at `local_file_path`
    :param url: URL to a file to be downloaded
    :type url: str
    :param local_file_path: Path to download the file to
    :type local_file_path: str
    :param auth: is authentication required to download file
    :type auth: Boolean
    :return: The path to the file on the local machine (same as input `local_file_path`)
    :rtype: str
    """

    # Get user name and password
    username = lib.get_conf('user_name')
    password = lib.get_conf('password')

    # Reference variables
    chunk_count = 0

    if auth:

        # Create connection to the stream
        r = requests.get(url, auth=(username, password), stream=True)
    else:

        # Create connection without password
        r = requests.get(url, stream=True)

    # Open output file
    with open(local_file_path, 'wb') as f:

        # Iterate through chunks of file
        for chunk in r.iter_content(chunk_size=64 * 1024):

            logging.debug('Downloading chunk: {} for file: {}'.format(
                chunk_count, local_file_path))

            # Write chunk to file
            f.write(chunk)

            # Increase chunk counter
            chunk_count = chunk_count + 1

    return local_file_path
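A short usage sketch (the URL and local path here are hypothetical):

# Download a public file without authentication, then log where it landed
saved_path = download_file('https://example.com/sample.csv', '/tmp/sample.csv')
logging.info('Downloaded file available at: {}'.format(saved_path))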
Example #19
def load(observations):
    logging.info('Begin load')
    output_path = os.path.join(lib.get_conf('summary_output_directory'), 'resume_summary.csv')

    logging.info('Results being output to {}'.format(output_path))
    print('Results output to {}'.format(output_path))

    observations.to_csv(path_or_buf=output_path, index_label='index')
    logging.info('End load')
    pass
Example #20
    def on_epoch_end(self, epoch, logs={}):

        # Reference variables
        sentence_agg = list()

        seed_chars = '<Jiggling around cheap iPhone chargers to find the sweet spot is the millennial version of ' \
                     'tweaking a TV antenna.>'[:lib.get_conf('ngram_len')]

        for diversity in [0.2, 0.5, 1.0, 1.2]:

            generated = ''
            sentence = seed_chars
            generated += sentence

            # Generate next characters, using a rolling window
            for next_char_index in range(lib.get_conf('pred_length')):
                x_pred, text_y = lib.gen_x_y(sentence, false_y=True)

                preds = self.model.predict(x_pred, verbose=0)[-1]

                next_index = lib.sample(preds, diversity)
                next_char = lib.get_indices_char()[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

            local_dict = dict()
            local_dict['epoch'] = epoch
            local_dict['seed'] = seed_chars
            local_dict['diversity'] = diversity
            local_dict['generated_post'] = generated
            sentence_agg.append(local_dict)

            if self.verbose >= 1:
                print('Diversity: {}, generated post: {}'.format(
                    local_dict['diversity'], local_dict['generated_post']))

        epoch_sentences = pandas.DataFrame(sentence_agg)

        self.sentences = pandas.concat(objs=[self.sentences, epoch_sentences])
        if self.output_path is not None:
            self.sentences.to_csv(path_or_buf=self.output_path, index=False)
Example #21
def load_to_json(observations, nlp):
    logging.info('Begin load to json')
    output_path = os.path.join(lib.get_conf('summary_output_directory'),
                               'resume_summary.json')

    logging.info('Results being output to {}'.format(output_path))
    print('Results output to {}'.format(output_path))

    observations.to_json(path_or_buf=output_path, orient='table')
    logging.info('End load to json')
    pass
Example #22
def load_data():
    logging.info('Loading evaluation data')
    eval_data_directory = lib.get_conf('raw_labeled_data_directory')

    eval_data_file_name = listdir(eval_data_directory)[0]

    eval_data_path = eval_data_directory + '\\' + eval_data_file_name

    pd_eval_data = pd.read_csv(eval_data_path)

    return pd_eval_data
Example #23
def extract():
    # TODO Docstring

    logging.info('Begin extract')

    # Extract all posts for given subreddit, going back given number of days
    logging.info('Downloading submissions from Reddit')
    observations = scrape_subreddit(lib.get_conf('subreddit'),
                                    lib.get_conf('history_num_days'))
    logging.info('Found {} submissions'.format(len(observations.index)))

    # Load embedding matrix
    resources.download_embedding()
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    logging.info('word_to_index max index: {}'.format(
        max(word_to_index.values())))

    logging.info('End extract')
    lib.archive_dataset_schemas('extract', locals(), globals())
    return embedding_matrix, word_to_index, observations
Example #24
def download_newsgroup():
    """
    Validate that newsgroup20 data set is available

      - Check if newsgroup20 data set is available
      - If newsgroup20 data set is not available:
        - Download files
        - Un-tar files

    :return: None
    :rtype: None
    """
    # TODO Docstring

    # Reference variables
    newsgroup_20_download_link = 'http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz'
    newsgroup_20_downloaded_path = '../resources/compressed/news20.tar.gz'

    logging.info(
        'Attempting to either validate or download and extract newsgroup_20 data set from {}'
        .format(newsgroup_20_download_link))

    # Download and expand newsgroup 20, if necessary
    if not os.path.exists(get_conf('newsgroup_path')):
        logging.warn(
            'newsgroup_path does not exist. Downloading and extracting data set'
        )
        logging.info(
            'Downloading newsgroup 20 data set from: {}, to: {}'.format(
                newsgroup_20_download_link, newsgroup_20_downloaded_path))
        download_file(newsgroup_20_download_link, newsgroup_20_downloaded_path)
        logging.info('Expanding newsgroup data set')
        tar = tarfile.open(newsgroup_20_downloaded_path)
        tar.extractall(os.path.dirname(get_conf('newsgroup_path')))
        tar.close()

    logging.info('Newsgroup dataset available at: {}'.format(
        os.path.dirname(get_conf('newsgroup_path'))))
Example #25
def predict(x_pred):
    # Predict.
    model_dir_path = lib.get_conf('model_directory')

    # Load saved model
    model_name = listdir(model_dir_path)[-1]

    model_path = model_dir_path + '\\' + model_name

    logging.info('Loading saved model')
    classifier = load_model(model_path)

    predictions = classifier.predict_classes(x_pred)

    # To output probability of classes
    # prediction_probability = classifier.predict(x_pred)

    output_dir = lib.get_conf('prediction_output_directory')
    output_path = output_dir + '\\tf_prediction_output.csv'

    pd_prediction_output = pd.DataFrame(predictions)

    logging.info('Writing prediction output to {}'.format(output_dir))
    pd_prediction_output.to_csv(output_path)
Example #26
def extract_fields(df):
    for extractor, items_of_interest in lib.get_conf(
            'case_agnostic_whole_resume').items():
        # column name is title of the sections in the yaml file
        df[extractor] = df['text'].apply(
            lambda x: extract_skills_case_agnostic(x, items_of_interest))
    # get universities
    for extractor, items_of_interest in lib.get_conf(
            'case_agnostic_education').items():
        df[extractor] = df['Edu'].apply(lambda x: extract_skills_case_agnostic(
            str(x).replace(' - ', ' ').replace('-', ' ').replace(',', ''),
            items_of_interest))
    # get level
    for extractor, items_of_interest in lib.get_conf(
            'case_sensitive_education').items():
        df[extractor] = df['Edu'].apply(
            lambda x: extract_skills_case_sensitive(x, items_of_interest))
    # get languages spoken
    for extractor, items_of_interest in lib.get_conf(
            'case_agnostic_languages').items():
        df[extractor] = df['Language'].apply(
            lambda x: extract_skills_case_agnostic(x, items_of_interest))

    return df
Example #27
def load_data():
    logging.info('Loading train data')

    # Get data directory
    train_data_directory = lib.get_conf('raw_labeled_data_directory')

    # Get file name
    train_data_file_name = listdir(train_data_directory)[0]

    # Generate complete file path
    train_data_path = train_data_directory + '\\' + train_data_file_name

    pd_data = pd.read_csv(train_data_path)

    return pd_data
Example #28
def extract_universities(resume_text):

    # Reference variables
    matched_universities = set()
    normalized_resume_text = ' '.join(simple_preprocess(resume_text))

    # Iterate through possible universities
    for university in lib.get_conf('universities'):

        university = ' '.join(simple_preprocess(university))
        university_count = lib.term_count(normalized_resume_text, university)

        if university_count > 0:
            matched_universities.add(university)

    return matched_universities
Example #29
def summarize_trials(trials):
    # TODO Docstring

    # TODO Reference variables
    summary_agg = list()

    # TODO Iterate through portfolios
    for portfolio_dict in lib.get_conf('portfolios'):

        logging.info(
            'Creating summary statistics for portfolio_dict: {}'.format(
                portfolio_dict))

        observation_dict = dict()
        portfolio = portfolio_dict['portfolio']
        observation_dict['portfolio'] = portfolio

        final_balances = trials[portfolio + '_final_balance'].tolist()

        # Compute median for each portfolio
        observation_dict['median'] = numpy.median(final_balances)

        # Compute 90th percentile (top 10% cutoff)
        observation_dict['top_10_perc'] = numpy.percentile(final_balances, 90)

        # Compute 10th percentile (bottom 10% cutoff)
        observation_dict['bottom_10_perc'] = numpy.percentile(
            final_balances, 10)
        summary_agg.append(observation_dict)

    # Format results
    summary_df = pandas.DataFrame(summary_agg)
    summary_df = summary_df[[
        'portfolio', 'bottom_10_perc', 'median', 'top_10_perc'
    ]]

    # Round decimal places
    round_cols = ['bottom_10_perc', 'median', 'top_10_perc']
    for round_col in round_cols:

        summary_df[round_col] = summary_df[round_col].apply(
            lambda x: numpy.round(x, decimals=2))

    # Return results
    return summary_df
Example #30
def load(observations, nlp):

    # Extract file name from path
    filename = os.path.basename(sys.argv[1])

    logging.info('Begin load')
    output_path = os.path.join(lib.get_conf('summary_output_directory'), 'resume_summary.csv')
    json_path = os.path.splitext(filename)[0] + '.json'

    logging.info('Results being output to {}'.format(output_path))
    # print('Results output to {}'.format(output_path))
    
    education = pandas.DataFrame(columns=['university', 'degree', 'gpa', 'year'])
    experience = pandas.DataFrame(columns=['company', 'position', 'totalExperience'])
    
    for university, degree, gpa in zip(observations['university'].iloc[0], observations['degree'].iloc[0], observations['gpa'].iloc[0]):
        education = education.append({'university':university,'degree':degree, 'gpa':gpa}, ignore_index=True)
    
    #for company in observations['company'].iloc[0]:
        #experience.append({'company':company}, ignore_index=True)
    
    for job in observations['jobs'].iloc[0]:
        experience = experience.append({'position':job}, ignore_index=True)
    
    observations = observations.drop(columns=['file_path', 'extension', 'text', 'gpa', 'university', 'degree', 'jobs'])
    
    if type(observations['phone'].iloc[0]) is tuple:
        observations['phone'] = ''.join(observations['phone'].iloc[0]).rstrip()
    
    education_dict = {"university":"","degree":"","gpa":0,"year":""}
    experience_dict = {"company":"","position":"","totalExperience":0}
    
    observations['education'] = [education_dict]
    observations['experience'] = [experience_dict]
    observations['education'].iloc[0] = education.to_dict('records')
    observations['experience'].iloc[0] = experience.to_dict('records')
    
    observations.to_csv(path_or_buf=output_path, index_label='index', encoding='utf-8', sep=";")
    print(observations.to_json(orient='records'))
    
    # Send JSON to stdout to be handled by Node.JS
    #print(json.dumps(observations))
    
    logging.info('End load')
    pass