def api_post(code, refresh):
    import http.client

    clientID = lib.get_conf('client_id')
    clientSecret = lib.get_conf('client_secret')
    redirect_URL = 'http://127.0.0.1:5000/'

    conn = http.client.HTTPSConnection("api.dexcom.com")

    if refresh:
        payload = "client_secret=" + clientSecret + "&client_id=" + clientID \
                  + "&refresh_token=" + code + "&grant_type=refresh_token&redirect_uri=" + redirect_URL
    else:
        payload = "client_secret=" + clientSecret + "&client_id=" + clientID \
                  + "&code=" + code + "&grant_type=authorization_code&redirect_uri=" + redirect_URL

    headers = {
        'content-type': "application/x-www-form-urlencoded",
        'cache-control': "no-cache"
    }

    conn.request("POST", "/v2/oauth2/token", payload, headers)
    res = conn.getresponse()
    data = res.read()

    return data
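# Illustrative usage sketch (not part of the original source): api_post() returns the raw
# response body from Dexcom's /v2/oauth2/token endpoint. The function name and the JSON
# field names below (access_token, refresh_token) are assumptions based on a standard
# OAuth2 token response, not confirmed by the source.
def example_token_exchange(auth_code):
    import json

    # Exchange an authorization code for tokens
    token_response = json.loads(api_post(auth_code, refresh=False))

    # Later, a stored refresh token can be used to obtain a new access token
    refreshed = json.loads(api_post(token_response.get('refresh_token', ''), refresh=True))

    return token_response.get('access_token'), refreshed.get('access_token')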
def evaluate(x_test, y_test):
    model_dir_path = lib.get_conf('model_directory')

    # Load saved model
    model_name = listdir(model_dir_path)[-1]
    model_path = model_dir_path + '\\' + model_name
    classifier = load_model(model_path)

    y_pred = classifier.predict_classes(x_test)

    # Instantiate empty output dataframe
    pd_eval_output = pd.DataFrame()
    pd_eval_output['Actual'] = pd.Series(y_test)
    pd_eval_output['Predicted'] = pd.Series(y_pred)

    eval_output_directory = lib.get_conf('prediction_output_directory')
    eval_output_file_name = 'tf_evaluation_output.csv'
    eval_output_path = eval_output_directory + '\\' + eval_output_file_name

    # Write out evaluation output
    pd_eval_output.to_csv(eval_output_path)

    score = metrics.accuracy_score(y_test, y_pred)
    cm = metrics.confusion_matrix(y_test, y_pred)

    print('Model Accuracy: {}'.format(score))  # For demo purposes
    logging.info('Model Accuracy: {}'.format(score))
def extract():
    # Extract appropriate model
    char_model = load_model(filepath=lib.get_conf('generate_model_path'))

    # Extract posts to be completed
    observations = pandas.read_csv(lib.get_conf('post_seed_path'))

    logging.info('End extract')
    lib.archive_dataset_schemas('generate_extract', locals(), globals())
    return char_model, observations
def extract(): """ Extract necessary data / resources from upstream. This method will: - Validate that newsgroup data set is available, and read in - Validate that text embeddings are available, and read in - Validate that text to embedding index lookup is available, and read in :return: observations, embedding_matrix, word_to_index :rtype: (pandas.DataFrame, numpy.array, dict) """ logging.info('Begin extract') logging.info('Performing extract for batch: {}, from newgroup_path: {}' .format(lib.get_batch_name(), lib.get_conf('newsgroup_path'))) # Download resources # Confirm newsgroup data set is downloaded resources.download_newsgroup() # Confirm that embedding is downloaded resources.download_embedding() # Extract resources from file system # Newsgroup20: Get list of all candidate documents glob_pattern = os.path.join(lib.get_conf('newsgroup_path'), '*', '*') logging.info('Searching for glob_pattern: {}'.format(glob_pattern)) document_candidates = glob.glob(glob_pattern) # Newsgroup20: Create observations data set observations = pandas.DataFrame(document_candidates, columns=['document_path']) logging.info('Shape of observations data frame created from glob matches: {}'.format(observations.shape)) # Newsgroup20: Re-order rows observations = observations.sample(frac=1) # Newsgroup20: Subset number of observations, if it's a test run if lib.get_conf('test_run'): logging.info('Reducing file size for test run') observations = observations.head(100) logging.info('Test run number of records: {}'.format(len(observations.index))) # Embedding: Load embedding embedding_matrix, word_to_index = resources.create_embedding_matrix() logging.info('word_to_index max index: {}'.format(max(word_to_index.values()))) # Archive schema and return lib.archive_dataset_schemas('extract', locals(), globals()) logging.info('End extract') return observations, embedding_matrix, word_to_index
def extract():
    # TODO Extract

    # Extract all posts for given subreddit, going back given number of days
    logging.info('Downloading submissions from Reddit')
    observations = scrape_subreddit(lib.get_conf('subreddit'), lib.get_conf('history_num_days'))
    logging.info('Found {} submissions'.format(len(observations.index)))

    logging.info('End extract')
    lib.archive_dataset_schemas('extract', locals(), globals())
    return observations
def load(char_model, observations, generated_posts):
    logging.info('Begin load')

    # Export observations
    observations.to_csv(path_or_buf=lib.get_conf('generated_observations_path'), index=False)

    # Export generated posts
    generated_posts.to_csv(path_or_buf=lib.get_conf('generated_posts_path'), index=False)

    logging.info('End load')
    lib.archive_dataset_schemas('generate_load', locals(), globals())
    pass
def scrape_subreddit(subreddit_name, num_days):
    # TODO Docstring
    logging.info('Beginning Reddit scraper, for subreddit: {}, and num_days: {}'.format(
        subreddit_name, num_days))

    # Reference variables
    parsed_submission_agg = list()

    # Create connection. For details, see https://www.reddit.com/prefs/apps/
    logging.info('Creating reddit connection')
    reddit = praw.Reddit(client_id=lib.get_conf('client_id'),
                         client_secret=lib.get_conf('client_secret'),
                         user_agent='upvote_estimator:0.0.1')

    # Find correct subreddit
    logging.info('Searching for subreddit: {}'.format(subreddit_name))
    subreddit = reddit.subreddit(subreddit_name)
    logging.debug('Searched for subreddit: {}, found subreddit: {}, {}'.format(
        subreddit_name, subreddit.display_name, subreddit.title))

    # Compute correct time range (current datetime - num_days to current datetime)
    end_datetime = datetime.datetime.utcnow()
    end_datetime_unix = time.mktime(end_datetime.timetuple())
    start_datetime = end_datetime - datetime.timedelta(days=num_days)
    start_datetime_unix = time.mktime(start_datetime.timetuple())
    logging.debug('Time range: {} to {}'.format(start_datetime, end_datetime))

    # Iterate through posts chronologically
    for index, submission in enumerate(subreddit.submissions(start_datetime_unix, end_datetime_unix)):
        logging.info('Working on submission number {}: {}'.format(index, submission))

        # Parse each submission and extract essential fields
        parsed_submission = submission_parser(submission)

        # Add info from each post to aggregator
        parsed_submission_agg.append(parsed_submission)

        if lib.get_conf('test_run') and index >= 49:
            break

    # Create DataFrame from pulled data
    posts = pandas.DataFrame(parsed_submission_agg)

    # Return
    return posts
def create_trials():
    # TODO Docstring
    logging.info('Beginning trials')

    # Create pre-trial DataFrame (one observation is one trial), with trial number and starting dollar amount
    logging.info('Creating pre-trial dataframe')
    index = range(1, lib.get_conf('num_trials') + 1)
    trials = pandas.DataFrame(index=index)
    trials['trial_num'] = trials.index
    trials['starting_amount'] = lib.get_conf('starting_amount')
    logging.info('Created trials table with starting info: \n{}'.format(trials))

    # Generate observed standard deviations
    logging.info('Generating standard_deviations')
    st_devs = [numpy.random.normal(size=20) for i in range(lib.get_conf('num_trials'))]
    trials['st_devs'] = st_devs

    # Translate observed standard deviations to dollar amounts (with inflation)

    # Iterate through portfolios
    for portfolio_dict in lib.get_conf('portfolios'):
        logging.info('Generating results for portfolio dict: {}'.format(portfolio_dict))

        # Compute list of balances (one for every year)
        balances_columns = portfolio_dict['portfolio'] + '_balances'
        trials[balances_columns] = trials.apply(
            func=lambda x: lib.compute_balances(
                x,
                return_mean=portfolio_dict['return_mean'],
                return_std_dev=portfolio_dict['return_std_dev']),
            axis=1)

        # Extract final dollar amount for each trial
        trials[portfolio_dict['portfolio'] + '_final_balance'] = trials[balances_columns].apply(lambda x: x[-1])

    # TODO Archive data and return
    logging.info('Trials complete')
    return trials
def transform(observations, false_y=False):
    # Reference variables
    char_indices = lib.get_char_indices()
    indices_char = lib.get_indices_char()
    x_agg = list()
    y_agg = list()

    if lib.get_conf('test_run'):
        observations = observations.head(100).copy()

    # Create a single field with all text. < and > serve as start and end tokens
    observations['model_text'] = observations['title'] + ' ' + observations['selftext']

    # Iterate through individual observations
    for text in observations['model_text']:
        # Generate x and y for observations
        observation_x, observation_y = lib.gen_x_y(text, false_y=false_y)
        x_agg.extend(observation_x)
        y_agg.extend(observation_y)

    x = numpy.matrix(x_agg)
    y = numpy.matrix(y_agg)

    return observations, char_indices, indices_char, x, y
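# Illustrative wiring sketch (not part of the original source), assuming the extract(),
# transform(), and model() functions of this post-generation pipeline live in the same
# importable module: transform() turns scraped posts into character-level matrices,
# which model() then trains on.
def example_training_run():
    observations = extract()
    observations, char_indices, indices_char, x, y = transform(observations)
    model(observations, char_indices, indices_char, x, y)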
def create_embedding_matrix(): """ Load embedding assets from file. - Load embedding binaries w/ gsensim - Extract embedding matrix from gensim model - Extract word to index lookup from gensim model :return: embedding_matrix, word_to_index :rtype: (numpy.array, {str:int}) """ logging.info( 'Reading embedding matrix and word to index dictionary from file') # Get word weights from file via gensim model = gensim.models.KeyedVectors.load_word2vec_format( get_conf('embedding_path'), binary=True) embedding_matrix = model.syn0 # Filter out words with index not in w2v range word_to_index = dict([(k, v.index) for k, v in model.vocab.items()]) logging.info('Created embedding matrix, of shape: {}'.format( embedding_matrix.shape)) logging.info( 'Created word to index lookup, with min index: {}, max index: {}'. format(min(word_to_index.values()), max(word_to_index.values()))) return embedding_matrix, word_to_index
def extract():
    logging.info('Begin extract')

    candidate_file_agg = list()  # for creating list of resume file paths

    for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):  # gets path to resumes from yaml file
        # os.walk(parentdir + '/data/input/example_resumes'): would do the same thing
        files = filter(lambda f: f.endswith(('.pdf', '.PDF')), files)  # only read pdfs
        folder_files = map(lambda x: os.path.join(root, x), files)
        candidate_file_agg.extend(folder_files)

    observations = pd.DataFrame(data=candidate_file_agg, columns=['file_path'])  # convert to df
    logging.info('Found {} candidate files'.format(len(observations.index)))

    observations['extension'] = observations['file_path'].apply(lambda x: os.path.splitext(x)[1])  # e.g. pdf or doc
    observations = observations[observations['extension'].isin(lib.AVAILABLE_EXTENSIONS)]
    logging.info('Subset candidate files to extensions w/ available parsers. {} files remain'
                 .format(len(observations.index)))

    observations['text'] = observations['file_path'].apply(lib.convert_pdf)  # get text from .pdf files

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())  # saving the schema
    logging.info('End extract')
    return observations
def extract_skills(resume_text):
    potential_skills_dict = dict()
    matched_skills = set()

    # TODO This skill input formatting could happen once per run, instead of once per observation.
    for skill_input in lib.get_conf('skills'):

        # Format list inputs
        if type(skill_input) is list and len(skill_input) >= 1:
            potential_skills_dict[skill_input[0]] = skill_input

        # Format string inputs
        elif type(skill_input) is str:
            potential_skills_dict[skill_input] = [skill_input]
        else:
            logging.warn('Unknown skill listing type: {}. Please format as either a single string or a list of '
                         'strings'.format(skill_input))

    for (skill_name, skill_alias_list) in potential_skills_dict.items():

        skill_matches = 0
        # Iterate through aliases
        for skill_alias in skill_alias_list:
            # Add the number of matches for each alias
            skill_matches += lib.term_count(resume_text, skill_alias.lower())

        # If at least one alias is found, add skill name to set of skills
        if skill_matches > 0:
            matched_skills.add(skill_name)

    return matched_skills
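# Illustrative sketch (not part of the original source) of the 'skills' configuration that
# extract_skills() expects via lib.get_conf('skills'): each entry is either a single string,
# or a list whose first element is the canonical skill name followed by aliases. The skill
# names below and the example_skills_conf name are hypothetical.
example_skills_conf = {
    'skills': [
        'python',                    # single-string entry: the name is its only alias
        ['machine learning', 'ml'],  # list entry: canonical name plus aliases
    ]
}
# With such a config loaded, a call would look like:
# matched = extract_skills('Worked on ML pipelines in Python')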
def send_to_Cosmos():
    input_path = os.path.join(lib.get_conf('summary_output_directory'), 'resume_summary.json')
    client = dc.DocumentClient(config_cosmos.COSMOSDB_HOST, {'masterKey': config_cosmos.COSMOSDB_KEY})

    with open(input_path) as read_file:
        data = json.load(read_file)

    for i in data["data"]:
        client.CreateDocument(
            CosmosConnection.get_collection_link(config_cosmos.COSMOSDB_DATABASE, config_cosmos.COSMOSDB_COLLECTION),
            i)
def extract():
    logging.info('Begin extract')

    # Reference variables
    candidate_file_agg = list()

    # Create list of candidate files
    for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):
        folder_files = map(lambda x: os.path.join(root, x), files)
        candidate_file_agg.extend(folder_files)

    # Convert list to a pandas DataFrame
    observations = pandas.DataFrame(data=candidate_file_agg, columns=['file_path'])
    logging.info('Found {} candidate files'.format(len(observations.index)))

    # Subset candidate files to supported extensions
    observations['extension'] = observations['file_path'].apply(lambda x: os.path.splitext(x)[1])
    observations = observations[observations['extension'].isin(lib.AVAILABLE_EXTENSIONS)]
    logging.info('Subset candidate files to extensions w/ available parsers. {} files remain'
                 .format(len(observations.index)))

    # Attempt to extract text from files
    observations['text'] = observations['file_path'].apply(text_extract_utf8)

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
def create_embedding_matrix(): """ Load embedding assets from file. - Load embedding binaries w/ gsensim - Extract embedding matrix from gensim model - Extract word to index lookup from gensim model :return: embedding_matrix, word_to_index :rtype: (numpy.array, {str:int}) """ logging.info( 'Reading embedding matrix and word to index dictionary from file') # Get word weights from file via gensim model = gensim.models.KeyedVectors.load_word2vec_format( get_conf('embedding_path'), binary=True) embedding_matrix = model.syn0 # Filter out words with index not in w2v range word_to_index = dict([(k, v.index) for k, v in model.vocab.items()]) # Transform embedding resources # Embedding: Update embedding to map any unknown words (words not in training vocabulary) to the unknown value default_dict_instance = defaultdict(lambda: word_to_index['UNK']) default_dict_instance.update(word_to_index) word_to_index = default_dict_instance logging.info('Created embedding matrix, of shape: {}'.format( embedding_matrix.shape)) logging.info( 'Created word to index lookup, with min index: {}, max index: {}'. format(min(word_to_index.values()), max(word_to_index.values()))) return embedding_matrix, word_to_index
def download_embedding(): """ Prepare GoogleNews pre-trained word embeddings. - Check if compressed embeddings are available - If compressed embeddings are not available, download them - Check if uncompressed embeddings are available - If compressed embeddings are not available, uncompress embeddings :return: None :rtype: None """ logging.info( 'Attempting to either validate or download and extract embeddings.') # Reference variables embedding_download_link = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz' embedding_downloaded_path = '../resources/compressed/GoogleNews-vectors-negative300.bin.gz' # Download embeddings, if necessary if not os.path.exists(embedding_downloaded_path): logging.warn( 'embedding_downloaded_path does not yet exist. Downloading embedding. This occurs once, and will take ' '10-20 minutes (due to large file size)') print( 'embedding_downloaded_path does not yet exist. Downloading embedding. This occurs once, and will take ' '10-20 minutes (due to large file size)') logging.info('Downloading embedding data from: {} to: {}'.format( embedding_download_link, embedding_downloaded_path)) download_file(embedding_download_link, embedding_downloaded_path) # Extract embeddings, if necessary if not os.path.exists(get_conf('embedding_path')): logging.warn('embedding_path does not exist. Extracting embedding.') logging.info('Extracting embedding data from: {} to: {}'.format( embedding_downloaded_path, get_conf('embedding_path'))) with gzip.open(embedding_downloaded_path, 'rb') as zipped, \ open(get_conf('embedding_path'), 'w+') as unzipped: for line in zipped: unzipped.write(line) logging.info('Embeddings available at: {}'.format( get_conf('embedding_path')))
def model(observation, char_indices, indices_char, x, y):
    char_model = models.rnn_embedding_model(x, y)

    # Set up model training variables
    optimizer = RMSprop(lr=0.01)
    char_model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    batch_size = 4096
    num_epochs = 200
    if lib.get_conf('test_run'):
        num_epochs = 2

    # Set up callbacks
    tf_log_path = os.path.join(os.path.expanduser('~/log_dir'), lib.get_batch_name())
    logging.info('Using Tensorboard path: {}'.format(tf_log_path))

    mc_log_path = os.path.join(
        lib.get_conf('model_checkpoint_path'),
        lib.get_batch_name() + '_epoch_{epoch:03d}_loss_{loss:.2f}.h5py')
    logging.info('Using mc_log_path path: {}'.format(mc_log_path))

    sentence_generator = SentenceGenerator(verbose=1)

    clr_step_size = numpy.floor((float(x.shape[0]) / batch_size) * 4)
    clr = CyclicLR(base_lr=.005, max_lr=.02, mode='triangular2', step_size=clr_step_size)
    logging.info('Using CLR step size: {}'.format(clr_step_size))

    callbacks = [TensorBoard(log_dir=tf_log_path),
                 ModelCheckpoint(mc_log_path),
                 sentence_generator,
                 clr]

    # Train the model, output generated text after each iteration
    char_model.fit(x, y,
                   batch_size=batch_size,
                   epochs=num_epochs,
                   callbacks=callbacks)

    print(sentence_generator.sentences)
def download_file(url, local_file_path, auth=False):
    """
    Download the file at `url` in chunks, to the location at `local_file_path`

    :param url: URL to a file to be downloaded
    :type url: str
    :param local_file_path: Path to download the file to
    :type local_file_path: str
    :param auth: is authentication required to download file
    :type auth: bool
    :return: The path to the file on the local machine (same as input `local_file_path`)
    :rtype: str
    """
    # Get user name and password
    username = lib.get_conf('user_name')
    password = lib.get_conf('password')

    # Reference variables
    chunk_count = 0

    if auth:
        # Create connection to the stream
        r = requests.get(url, auth=(username, password), stream=True)
    else:
        # Create connection without password
        r = requests.get(url, stream=True)

    # Open output file
    with open(local_file_path, 'wb') as f:
        # Iterate through chunks of file
        for chunk in r.iter_content(chunk_size=64 * 1024):
            logging.debug('Downloading chunk: {} for file: {}'.format(chunk_count, local_file_path))

            # Write chunk to file
            f.write(chunk)

            # Increase chunk counter
            chunk_count = chunk_count + 1

    return local_file_path
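# Illustrative usage sketch (not part of the original source): download_file() streams a URL
# to a local path in 64 KB chunks. The URLs and destinations below are placeholders; with
# auth=True the credentials come from lib.get_conf('user_name') and lib.get_conf('password').
def example_download():
    public_copy = download_file('https://example.com/archive.tar.gz', '/tmp/archive.tar.gz')
    protected_copy = download_file('https://example.com/private.csv', '/tmp/private.csv', auth=True)
    return public_copy, protected_copy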
def load(observations):
    logging.info('Begin load')

    output_path = os.path.join(lib.get_conf('summary_output_directory'), 'resume_summary.csv')

    logging.info('Results being output to {}'.format(output_path))
    print('Results output to {}'.format(output_path))

    observations.to_csv(path_or_buf=output_path, index_label='index')

    logging.info('End load')
    pass
def on_epoch_end(self, epoch, logs={}):
    # Reference variables
    sentence_agg = list()
    seed_chars = '<Jiggling around cheap iPhone chargers to find the sweet spot is the millennial version of ' \
                 'tweaking a TV antenna.>'[:lib.get_conf('ngram_len')]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        generated = ''
        sentence = seed_chars
        generated += sentence

        # Generate next characters, using a rolling window
        for next_char_index in range(lib.get_conf('pred_length')):
            x_pred, text_y = lib.gen_x_y(sentence, false_y=True)

            preds = self.model.predict(x_pred, verbose=0)[-1]
            next_index = lib.sample(preds, diversity)
            next_char = lib.get_indices_char()[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

        local_dict = dict()
        local_dict['epoch'] = epoch
        local_dict['seed'] = seed_chars
        local_dict['diversity'] = diversity
        local_dict['generated_post'] = generated
        sentence_agg.append(local_dict)

        if self.verbose >= 1:
            print('Diversity: {}, generated post: {}'.format(
                local_dict['diversity'], local_dict['generated_post']))

    epoch_sentences = pandas.DataFrame(sentence_agg)
    self.sentences = pandas.concat(objs=[self.sentences, epoch_sentences])

    if self.output_path is not None:
        self.sentences.to_csv(path_or_buf=self.output_path, index=False)
def load_to_json(observations, nlp):
    logging.info('Begin load to json')

    output_path = os.path.join(lib.get_conf('summary_output_directory'), 'resume_summary.json')

    logging.info('Results being output to {}'.format(output_path))
    print('Results output to {}'.format(output_path))

    observations.to_json(path_or_buf=output_path, orient='table')

    logging.info('End load to json')
    pass
def load_data():
    logging.info('Loading evaluation data')

    eval_data_directory = lib.get_conf('raw_labeled_data_directory')
    eval_data_file_name = listdir(eval_data_directory)[0]
    eval_data_path = eval_data_directory + '\\' + eval_data_file_name

    pd_eval_data = pd.read_csv(eval_data_path)
    return pd_eval_data
def extract():
    # TODO Docstring
    logging.info('Begin extract')

    # Extract all posts for given subreddit, going back given number of days
    logging.info('Downloading submissions from Reddit')
    observations = scrape_subreddit(lib.get_conf('subreddit'), lib.get_conf('history_num_days'))
    logging.info('Found {} submissions'.format(len(observations.index)))

    # Load embedding matrix
    resources.download_embedding()
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    logging.info('word_to_index max index: {}'.format(max(word_to_index.values())))

    logging.info('End extract')
    lib.archive_dataset_schemas('extract', locals(), globals())
    return embedding_matrix, word_to_index, observations
def download_newsgroup(): """ Validate that newsgroup20 data set is available - Check if newsgroup20 data set is available - If newsgroup20 data set is not available: - Download files - Un-tar files :return: None :rtype: None """ # TODO Docstring # Reference variables newsgroup_20_download_link = 'http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz' newsgroup_20_downloaded_path = '../resources/compressed/news20.tar.gz' logging.info( 'Attempting to either validate or download and extract newsgroup_20 data set from {}' .format(newsgroup_20_download_link)) # Download and expand newsgroup 20, if necessary if not os.path.exists(get_conf('newsgroup_path')): logging.warn( 'newsgroup_path does not exist. Downloading and extracting data set' ) logging.info( 'Downloading newgroup 20 data set from: {}, to: {}'.format( newsgroup_20_download_link, newsgroup_20_downloaded_path)) download_file(newsgroup_20_download_link, newsgroup_20_downloaded_path) logging.info('Expanding newgroup data set') tar = tarfile.open(newsgroup_20_downloaded_path) tar.extractall(os.path.dirname(get_conf('newsgroup_path'))) tar.close() logging.info('Newsgroup dataset available at: {}'.format( os.path.dirname(get_conf('newsgroup_path'))))
def predict(x_pred):
    # Predict
    model_dir_path = lib.get_conf('model_directory')

    # Load saved model
    model_name = listdir(model_dir_path)[-1]
    model_path = model_dir_path + '\\' + model_name
    logging.info('Loading saved model')
    classifier = load_model(model_path)

    predictions = classifier.predict_classes(x_pred)

    # To output probability of classes
    # prediction_probability = classifier.predict(x_pred)

    output_dir = lib.get_conf('prediction_output_directory')
    output_path = output_dir + '\\tf_prediction_output.csv'
    pd_prediction_output = pd.DataFrame(predictions)

    logging.info('Writing prediction output to {}'.format(output_dir))
    pd_prediction_output.to_csv(output_path)
def extract_fields(df):
    # Column name is the title of the section in the yaml file
    for extractor, items_of_interest in lib.get_conf('case_agnostic_whole_resume').items():
        df[extractor] = df['text'].apply(
            lambda x: extract_skills_case_agnostic(x, items_of_interest))

    # get universities
    for extractor, items_of_interest in lib.get_conf('case_agnostic_education').items():
        df[extractor] = df['Edu'].apply(lambda x: extract_skills_case_agnostic(
            str(x).replace(' - ', ' ').replace('-', ' ').replace(',', ''), items_of_interest))

    # get level
    for extractor, items_of_interest in lib.get_conf('case_sensitive_education').items():
        df[extractor] = df['Edu'].apply(
            lambda x: extract_skills_case_sensitive(x, items_of_interest))

    # get languages spoken
    for extractor, items_of_interest in lib.get_conf('case_agnostic_languages').items():
        df[extractor] = df['Language'].apply(
            lambda x: extract_skills_case_agnostic(x, items_of_interest))

    return df
def load_data():
    logging.info('Loading train data')

    # Get data directory
    train_data_directory = lib.get_conf('raw_labeled_data_directory')

    # Get file name
    train_data_file_name = listdir(train_data_directory)[0]

    # Generate complete file path
    train_data_path = train_data_directory + '\\' + train_data_file_name

    pd_data = pd.read_csv(train_data_path)
    return pd_data
def extract_universities(resume_text):
    # Reference variables
    matched_universities = set()
    normalized_resume_text = ' '.join(simple_preprocess(resume_text))

    # Iterate through possible universities
    for university in lib.get_conf('universities'):
        university = ' '.join(simple_preprocess(university))
        university_count = lib.term_count(normalized_resume_text, university)

        if university_count > 0:
            matched_universities.add(university)

    return matched_universities
def summarize_trials(trials):
    # TODO Docstring

    # Reference variables
    summary_agg = list()

    # Iterate through portfolios
    for portfolio_dict in lib.get_conf('portfolios'):
        logging.info('Creating summary statistics for portfolio_dict: {}'.format(portfolio_dict))
        observation_dict = dict()

        portfolio = portfolio_dict['portfolio']
        observation_dict['portfolio'] = portfolio
        final_balances = trials[portfolio + '_final_balance'].tolist()

        # Compute median for each portfolio
        observation_dict['median'] = numpy.median(final_balances)

        # Compute 90th percentile (top 10% cutoff)
        observation_dict['top_10_perc'] = numpy.percentile(final_balances, 90)

        # Compute 10th percentile (bottom 10% cutoff)
        observation_dict['bottom_10_perc'] = numpy.percentile(final_balances, 10)

        summary_agg.append(observation_dict)

    # Format results
    summary_df = pandas.DataFrame(summary_agg)
    summary_df = summary_df[['portfolio', 'bottom_10_perc', 'median', 'top_10_perc']]

    # Round decimal places
    round_cols = ['bottom_10_perc', 'median', 'top_10_perc']
    for round_col in round_cols:
        summary_df[round_col] = summary_df[round_col].apply(lambda x: numpy.round(x, decimals=2))

    # Return results
    return summary_df
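# Illustrative wiring sketch (not part of the original source), assuming create_trials() and
# summarize_trials() belong to the same Monte Carlo simulation module: create_trials() builds
# the per-trial *_final_balance columns that summarize_trials() aggregates into percentile
# statistics. The example_simulation name is hypothetical.
def example_simulation():
    trials = create_trials()
    summary = summarize_trials(trials)
    logging.info('Simulation summary: \n{}'.format(summary))
    return summary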
def load(observations, nlp):
    # Extract file name from path
    filename = os.path.basename(sys.argv[1])

    logging.info('Begin load')
    output_path = os.path.join(lib.get_conf('summary_output_directory'), 'resume_summary.csv')
    json_path = os.path.splitext(filename)[0] + '.json'

    logging.info('Results being output to {}'.format(output_path))
    # print('Results output to {}'.format(output_path))

    education = pandas.DataFrame(columns=['university', 'degree', 'gpa', 'year'])
    experience = pandas.DataFrame(columns=['company', 'position', 'totalExperience'])

    for university, degree, gpa in zip(observations['university'].iloc[0],
                                       observations['degree'].iloc[0],
                                       observations['gpa']):
        education = education.append({'university': university, 'degree': degree, 'gpa': gpa}, ignore_index=True)

    # for company in observations['company'].iloc[0]:
    #     experience.append({'company': company}, ignore_index=True)

    for job in observations['jobs'].iloc[0]:
        experience = experience.append({'position': job}, ignore_index=True)

    observations = observations.drop(columns=['file_path', 'extension', 'text', 'gpa', 'university', 'degree', 'jobs'])

    if type(observations['phone'].iloc[0]) is tuple:
        observations['phone'] = ''.join(observations['phone'].iloc[0]).rstrip()

    education_dict = {"university": "", "degree": "", "gpa": 0, "year": ""}
    experience_dict = {"company": "", "position": "", "totalExperience": 0}

    observations['education'] = [education_dict]
    observations['experience'] = [experience_dict]
    observations['education'].iloc[0] = education.to_dict('records')
    observations['experience'].iloc[0] = experience.to_dict('records')

    observations.to_csv(path_or_buf=output_path, index_label='index', encoding='utf-8', sep=";")

    # Send JSON to stdout to be handled by Node.JS
    print(observations.to_json(orient='records'))
    # print(json.dumps(observations))

    logging.info('End load')
    pass