def test_check_db():
    """Verify that check_db hands back an SCDB for both supported inputs."""

    # Case 1: no database supplied (None) -> result must be an SCDB
    from_none = check_db(None)
    assert isinstance(from_none, SCDB)

    # Case 2: an existing SCDB supplied -> result must still be an SCDB
    existing = SCDB()
    from_existing = check_db(existing)
    assert isinstance(from_existing, SCDB)
def save_summary(self, db=None):
    """Write the summary of the scraped term paper data to a JSON file.

    The file is written to '<db.words_path>/summary/<self.label>.json'.

    Parameters
    ----------
    db : SCDB object, optional
        Database object providing `words_path`; passed through `check_db`
        before use.
    """

    db = check_db(db)
    summary_file = db.words_path + '/summary/' + self.label + '.json'

    with open(summary_file, 'w') as out_json:
        json.dump(self.summary, out_json)
def plot_years(year_counts, label, disp_fig=True, save_fig=False, db=None):
    """Plot publication counts across years as a line with point markers.

    Parameters
    ----------
    year_counts : sequence of (year, count) pairs
        Data to plot; element 0 of each pair goes on the x-axis and element 1
        on the y-axis. May be empty, in which case empty axes are drawn.
    label : str
        Base name of the saved figure file (used only if `save_fig` is True).
    disp_fig : boolean, optional
        If False, the figure is closed after plotting (and saving).
    save_fig : boolean, optional
        Whether to save the figure to '<db.figs_path>/year/<label>.svg'.
    db : SCDB object, optional
        Database object providing `figs_path`; only consulted when saving.
    """

    # Create the figure; the returned handles are not needed, since
    # everything below goes through the pyplot state machine
    plt.subplots(figsize=(10, 5))

    # Fixed x-axis range of the plot
    year_min, year_max = 1985, 2015

    # Extract x & y data to plot
    x_dat = [pair[0] for pair in year_counts]
    y_dat = [pair[1] for pair in year_counts]

    # Add line and points to plot
    plt.plot(x_dat, y_dat)
    plt.plot(x_dat, y_dat, '.', markersize=16)

    # Set plot limits - guard against empty data, for which max() would raise
    y_top = max(y_dat) if y_dat else 0
    plt.xlim([year_min, year_max])
    plt.ylim([0, y_top + 5])

    # Add title & labels
    plt.title('Publication History', fontsize=24, fontweight='bold')
    plt.xlabel('Year', fontsize=18)
    plt.ylabel('# Pubs', fontsize=18)

    if save_fig:
        db = check_db(db)
        s_file = os.path.join(db.figs_path, 'year', label + '.svg')
        plt.savefig(s_file, transparent=True)

    if not disp_fig:
        plt.close()
def make_wc(freq_dist, n_words, label, disp_fig=True, save_fig=False, db=None):
    """Create and display wordcloud.

    Parameters
    ----------
    freq_dist : frequency distribution
        Word frequency data; passed to `conv_freqs` to build the wordcloud input.
    n_words : int
        Number of top words to include in the wordcloud.
    label : str
        Base name of the saved figure file (used only if `save_fig` is True).
    disp_fig : boolean, optional
        If False, the figure is closed after plotting (and saving).
    save_fig : boolean, optional
        Whether to save out the wordcloud, to '<db.figs_path>/wc/<label>.svg'.
    db : SCDB object, optional
        Database object providing `figs_path`; only consulted when saving.
    """

    # Honour the requested number of words (previously a hard-coded 20 was used
    # and `n_words` was silently ignored).
    # NOTE(review): assumes the second argument of conv_freqs is the number of
    # top words to keep - confirm against its definition.
    wc = create_wc(conv_freqs(freq_dist, n_words))

    plt.figure(figsize=(10, 10))
    plt.imshow(wc)
    plt.axis("off")

    if save_fig:
        db = check_db(db)
        s_file = os.path.join(db.figs_path, 'wc', label + '.svg')
        plt.savefig(s_file, transparent=True)

    if not disp_fig:
        plt.close()
def load_pickle_obj(f_name, db=None):
    """Load a custom object, from a pickle file, for SCANR project.

    The file '<f_name>.p' is looked up first in `db.counts_path` and then in
    `db.words_path`; the first match is loaded.

    Parameters
    ----------
    f_name : str
        File name of the object to be loaded, without the '.p' extension.
    db : SCDB object, optional
        Database object for the SCANR project.

    Returns
    -------
    object
        The unpickled object.

    Raises
    ------
    InconsistentDataError
        If no file with the requested name exists in either directory.
    """

    # Check for database object, initialize if not provided
    db = check_db(db)

    pickle_name = f_name + '.p'

    # Get all available files, for Count and Words pickled objects
    counts_objs = os.listdir(db.counts_path)
    words_objs = os.listdir(db.words_path)

    # Search for object in saved Count files, and set path if found
    if pickle_name in counts_objs:
        load_path = os.path.join(db.counts_path, pickle_name)

    # Search for object in saved Words files, and set path if found
    elif pickle_name in words_objs:
        load_path = os.path.join(db.words_path, pickle_name)

    # Raise an error if the file name is not found
    else:
        raise InconsistentDataError('Can not find requested file name.')

    # Load and return the data; the context manager guarantees the file handle
    # is closed even if unpickling fails.
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(load_path, 'rb') as load_file:
        return pickle.load(load_file)
def plot_clustermap(dat, cmap='purple', save_fig=False, save_name='Clustermap',
                    db=None):
    """Plot clustermap.

    Parameters
    ----------
    dat : pandas.DataFrame
        Data to create clustermap from.
    cmap : optional
        Colourmap to use. The strings 'purple' and 'blue' select predefined
        cubehelix palettes; any other value is passed to seaborn unchanged.
    save_fig : boolean, optional
        Whether to save the figure to '<db.figs_path>/<save_name>.svg'.
    save_name : str, optional
        Base name of the saved figure file.
    db : SCDB object, optional
        Database object providing `figs_path`; only consulted when saving.
        Added so that saving no longer fails on an undefined `db`.
    """

    # Set up plotting and aesthetics
    sns.set()
    sns.set_context("paper", font_scale=1.5)

    # Set colourmap
    if cmap == 'purple':
        cmap = sns.cubehelix_palette(as_cmap=True)
    elif cmap == 'blue':
        cmap = sns.cubehelix_palette(as_cmap=True, rot=-.3, light=0.9, dark=0.2)

    # Create the clustermap
    cg = sns.clustermap(dat, cmap=cmap, method='complete', metric='cosine',
                        figsize=(12, 10))

    # Fix axes
    cg.cax.set_visible(True)
    plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=60, ha='right')
    plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    # Save out - if requested
    if save_fig:
        db = check_db(db)
        s_file = os.path.join(db.figs_path, save_name + '.svg')
        cg.savefig(s_file, transparent=True)
def save(self, db=None):
    """Write all attached data to a json file, one article per line.

    The file is written to '<db.words_path>/raw/<self.label>.json'.

    Parameters
    ----------
    db : SCDB object, optional
        Database object providing `words_path`; passed through `check_db`
        before use.
    """

    db = check_db(db)
    raw_file = db.words_path + '/raw/' + self.label + '.json'

    # Each article is dumped as its own JSON document followed by a newline
    with open(raw_file, 'w') as out_json:
        for article in self:
            json.dump(article, out_json)
            out_json.write('\n')

    # Record the save in the object's history
    self.update_history('Saved')
def plot_matrix(dat, x_labels, y_labels, square=False, figsize=(10, 12),
                save_fig=False, save_name='Matrix', db=None):
    """Plot the matrix of percent associations between terms.

    Parameters
    ----------
    dat : 2d array-like
        Matrix of data to plot as a heatmap.
    x_labels : list of str
        Tick labels for the x-axis.
    y_labels : list of str
        Tick labels for the y-axis.
    square : boolean, optional
        Passed to seaborn's heatmap as its `square` argument.
    figsize : tuple, optional
        Size of the figure to create.
    save_fig : boolean, optional
        Whether to save the figure to '<db.figs_path>/<save_name>.svg'.
    save_name : str, optional
        Base name of the saved figure file.
    db : SCDB object, optional
        Database object providing `figs_path`; only consulted when saving.
        Added so that saving no longer fails on an undefined `db`.
    """

    fig, _ = plt.subplots(figsize=figsize)

    sns.heatmap(dat, square=square, xticklabels=x_labels, yticklabels=y_labels)
    fig.tight_layout()

    # Save out - if requested
    if save_fig:
        db = check_db(db)
        s_file = os.path.join(db.figs_path, save_name + '.svg')
        plt.savefig(s_file)
def plot_dendrogram(dat, labels, save_fig=False, save_name='Dendrogram',
                    db=None):
    """Plot dendrogram.

    Parameters
    ----------
    dat : 2d array-like
        Data to cluster; passed to the hierarchical linkage function.
    labels : list of str
        Labels for the leaves of the dendrogram.
    save_fig : boolean, optional
        Whether to save the figure to '<db.figs_path>/<save_name>.svg'.
    save_name : str, optional
        Base name of the saved figure file.
    db : SCDB object, optional
        Database object providing `figs_path`; only consulted when saving.
        Added so that saving no longer fails on an undefined `db`.
    """

    plt.figure(figsize=(3, 15))

    # Compute the linkage (complete linkage on cosine distance) and draw it
    linkage = hier.linkage(dat, method='complete', metric='cosine')
    hier.dendrogram(linkage, orientation='left', labels=labels,
                    color_threshold=0.25, leaf_font_size=12)

    # Save out - if requested
    if save_fig:
        db = check_db(db)
        s_file = os.path.join(db.figs_path, save_name + '.svg')
        # Save the current pyplot figure (previously this referenced an
        # undefined `cg` object and raised a NameError)
        plt.savefig(s_file, transparent=True)
def load(self, db=None):
    """Populate this object from its raw json data file.

    Data is read from '<db.words_path>/raw/<self.label>.json'. Every parsed
    entry is added field by field and the article count is incremented once
    per entry; `check_results` is called after all entries are loaded.

    Parameters
    ----------
    db : SCDB object, optional
        Database object providing `words_path`; passed through `check_db`
        before use.
    """

    db = check_db(db)
    raw_file = db.words_path + '/raw/' + self.label + '.json'

    for entry in _parse_json_dat(raw_file):

        self.add_id(entry['id'])
        self.add_title(entry['title'])

        # The journal field holds two elements, added as separate arguments
        journal = entry['journal']
        self.add_journal(journal[0], journal[1])

        self.add_authors(entry['authors'])
        self.add_words(entry['words'])
        self.add_kws(entry['kws'])
        self.add_pub_date([entry['year'], entry['month']])
        self.add_doi(entry['doi'])

        self.increment_n_articles()

    self.check_results()
def save_pickle_obj(obj, f_name, db=None):
    """Save a custom object from LISC as a pickle file.

    Count objects are saved to `db.counts_path` as '<f_name>_counts.p', and
    Words objects to `db.words_path` as '<f_name>_words.p'.

    Parameters
    ----------
    obj : {Counts() object, Words() object}
        LISC custom object to save out.
    f_name : str
        Name to append to saved out file name.
    db : SCDB() object, optional
        Database object for the LISC project.

    Raises
    ------
    InconsistentDataError
        If `obj` is neither a Count nor a Words object.
    """

    # Check for database object, initialize if not provided
    db = check_db(db)

    # If it's a Counts object, set path and name
    if isinstance(obj, Count):
        save_name = f_name + '_counts.p'
        save_path = db.counts_path

    # If it's a Words object, set path and name
    elif isinstance(obj, Words):
        save_name = f_name + '_words.p'
        save_path = db.words_path

    # If neither, raise error as object type is unclear
    else:
        raise InconsistentDataError('Object type unclear - can not save.')

    # Save pickle file; the context manager ensures the data is flushed and
    # the file handle closed, even if pickling fails part way through
    save_file = os.path.join(save_path, save_name)
    with open(save_file, 'wb') as out_file:
        pickle.dump(obj, out_file)