def categorical_predict(X, y_org, method_name, config):
    # Make sure the sizes match
    msg = "X shape {}, y_org shape {} (mismatch!)"
    assert X.shape[0] == y_org.shape[0], msg.format(X.shape[0], y_org.shape[0])

    enc = LabelEncoder()
    y = enc.fit_transform(y_org)
    label_n = np.unique(y).shape[0]

    use_SMOTE = config["use_SMOTE"]
    if use_SMOTE:
        print(" Adjusting class balance using SMOTE")

    is_PARALLEL = config["_PARALLEL"]

    clf_args = {
        "n_jobs": -1 if is_PARALLEL else 1,
        "n_estimators": int(config["n_estimators"]),
    }

    skf = StratifiedKFold(n_splits=10, shuffle=False).split(X, y)

    scores = []
    F1_scores = []

    INPUT_ITR = ((clf_args, idx, X, y, use_SMOTE) for idx in skf)
    ITR = jobmap(clf_extratree_predictor, INPUT_ITR, True)

    error_counts = np.zeros(y.size, dtype=float)
    predict_scores = np.zeros([y.size, label_n], dtype=float)

    for result in ITR:
        idx, pred, pred_proba = result
        train_index, test_index = idx
        y_test = y[test_index]

        errors = y_test != pred
        scores.append(1 - errors.mean())
        error_counts[test_index[errors]] += 1

        F1_scores.append(f1_score(y_test, pred))
        predict_scores[test_index] = pred_proba

    # For StratifiedKFold, each sample appears in exactly one test fold,
    # so the error counts need no further normalization.

    return np.array(scores), np.array(F1_scores), error_counts, predict_scores
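# clf_extratree_predictor is the per-fold worker handed to jobmap above but is
# not shown in this file. The sketch below is only an assumption inferred from
# how it is called (a (clf_args, fold-indices, X, y, use_SMOTE) tuple in, an
# (idx, pred, pred_proba) tuple out); the SMOTE step additionally assumes
# imbalanced-learn is installed.
from sklearn.ensemble import ExtraTreesClassifier


def clf_extratree_predictor(item):
    clf_args, idx, X, y, use_SMOTE = item
    train_index, test_index = idx

    X_train, y_train = X[train_index], y[train_index]
    if use_SMOTE:
        # Hypothetical rebalancing of the training fold only
        from imblearn.over_sampling import SMOTE
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    clf = ExtraTreesClassifier(**clf_args)
    clf.fit(X_train, y_train)

    pred = clf.predict(X[test_index])
    pred_proba = clf.predict_proba(X[test_index])
    return idx, pred, pred_proba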
def phrases_from_config(config):
    """
    Identify parenthetical phrases in the documents as they are being
    imported to the pipeline. import_data_from_config() and
    phrases_from_config() are the entry points for this step of the
    pipeline.

    Args:
        config: a config file
    """
    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABBR = collections.Counter()

    INPUT_ITR = db_utils.CSV_database_iterator(
        F_CSV, target_column, progress_bar=True
    )
    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABBR.update(result)

    logger.info("{} total abbrs found.".format(len(ABBR)))

    # Merge abbreviations that are similar
    logger.debug("Deduping abbr list.")
    df = dedupe_abbr(ABBR)
    logger.info("{} abbrs remain after deduping.".format(len(df)))

    # Output the top phrases
    logger.info("Top 5 abbreviations")
    msg = "({}) {}, {}, {}"
    for k, (_, row) in enumerate(df[:5].iterrows()):
        logger.info(msg.format(k + 1, row.name, row["abbr"], row["count"]))

    mkdir(output_dir)
    f_csv = os.path.join(
        output_dir, config["phrase_identification"]["f_abbreviations"]
    )
    df.to_csv(f_csv)
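# func_parenthetical is defined elsewhere in the pipeline. Purely as an
# illustrative assumption (not the project's implementation), a worker with a
# compatible interface could count "long form (ABBR)" patterns and key them by
# (phrase-token tuple, abbreviation), the shape expected by dedupe_abbr below.
import collections
import re


def func_parenthetical(row, col=None):
    counts = collections.Counter()
    # Up to five words followed by a short upper-case token in parentheses
    pattern = r"((?:\w+\s+){1,5})\(([A-Z]{2,10})\)"
    for match in re.finditer(pattern, row[col]):
        phrase = tuple(match.group(1).split())
        counts[(phrase, match.group(2))] += 1
    return counts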
def compute(self, config):
    func = compute_affinity
    ITR = jobmap(func, self, self.PARALLEL)

    print("Computing affinity propagation")
    for result in tqdm.tqdm(ITR):
        self.save(config, result)

    # Save the size of the vocabulary
    self.h5["documents"].attrs["vocab_n"] = self.vocab_n
    self.h5["documents"].attrs["cluster_n"] = self.cluster_n
    self.h5.close()
def compute(self, config):
    func = compute_document_affinity
    ITR = jobmap(func, self, self.PARALLEL)

    doc_data = []
    print("Computing document affinity scoring")
    for result in ITR:
        doc_data.append(result)

    df = pd.DataFrame(data=doc_data, columns=["V", "idx", "f_sql"])
    self.save(config, df)
def csv_iterator(f_csv, clean=True, _PARALLEL=False):
    '''
    Creates an iterator over a CSV file, optionally cleaning it.
    '''
    with open(f_csv) as FIN:
        CSV = csv.DictReader(FIN)

        if clean and _PARALLEL:
            CSV = jobmap(clean_row, CSV, FLAG_PARALLEL=_PARALLEL)
        elif clean and not _PARALLEL:
            CSV = map(clean_row, CSV)

        for row in CSV:
            yield row
def import_directory_csv(d_in, d_out, output_table):
    F_CSV = []
    F_SQL = {}

    INPUT_FILES = grab_files("*.csv", d_in)

    if not INPUT_FILES:
        print("No matching CSV files found, exiting")
        exit(2)

    for f_csv in INPUT_FILES:
        f_sql = '.'.join(os.path.basename(f_csv).split('.')[:-1])
        f_sql += ".sqlite"
        f_sql = os.path.join(d_out, f_sql)

        if os.path.exists(f_sql) and not _FORCE:
            print("{} already exists, skipping".format(f_sql))
            continue

        F_CSV.append(f_csv)
        F_SQL[f_csv] = f_sql

    # Create the output directory if needed
    mkdir(d_out)

    ITR = jobmap(load_csv, F_CSV, _PARALLEL)

    # Create a reference ID for each item
    _ref_counter = itertools.count()

    for (f_csv, df) in ITR:
        f_sql = F_SQL[f_csv]
        engine = create_engine('sqlite:///' + f_sql)

        n_data_items = len(df)
        df["_ref"] = [next(_ref_counter) for _ in range(n_data_items)]
        df.set_index("_ref", inplace=True)

        df.to_sql(output_table, engine, if_exists='replace')

        print("Finished {}, {}, {}".format(f_csv, len(df), list(df.columns)))
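# load_csv is not shown here; from the way its results are unpacked above, a
# minimal stand-in could simply pair each filename with its parsed DataFrame.
# This is an assumption, not the original implementation.
import pandas as pd


def load_csv(f_csv):
    return f_csv, pd.read_csv(f_csv)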
def dedupe_abbr(ABR):
    data = {}

    ITR = jobmap(dedupe_item, tqdm.tqdm(ABR.items()), True)

    for result in ITR:
        # Only add the most common result
        max_val, max_item = 0, None
        total_counts = 0

        for item in result:
            current_val = ABR[item]
            total_counts += current_val

            if current_val > max_val:
                max_val = current_val
                max_item = item

        data[(' '.join(max_item[0]), max_item[1])] = total_counts

    ABR = collections.Counter(data)
    return ABR
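# dedupe_item, the jobmap worker used above, receives a single (key, count)
# pair and is expected to return the group of keys in ABR that it considers
# equivalent. A trivial stand-in that keeps the interface intact (it performs
# no merging at all) would be:
def dedupe_item(item):
    key, count = item
    return [key]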
def cluster_affinity_states(self, INPUT_ITR, size=0):
    func = compute_local_affinity
    ITR = jobmap(func, INPUT_ITR, self.PARALLEL)

    Z = []
    pbar = tqdm.tqdm(total=size // self.batch_size)

    for result in ITR:
        V, z_labels = result

        for i in np.unique(z_labels):
            z = V[i == z_labels].mean(axis=0)
            z /= np.linalg.norm(z)
            Z.append(z)

        pbar.update()

    pbar.close()
    return np.array(Z)
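# compute_local_affinity is not included in this snippet. Judging only from
# how its output is consumed (a batch of vectors V plus integer cluster
# labels), a rough sketch using scikit-learn's AffinityPropagation might look
# like the following; the interface is an assumption.
from sklearn.cluster import AffinityPropagation


def compute_local_affinity(V):
    labels = AffinityPropagation(damping=0.95).fit_predict(V)
    return V, labels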
def csv_iterator(f_csv, clean=True, _PARALLEL=False):
    '''
    Creates an iterator over a CSV file, optionally cleaning it.

    Args:
        f_csv (str): Filename of the CSV file to open and iterate over
        clean (bool): Set whether to clean each row of the CSV file
        _PARALLEL (bool): Set whether the cleaning should be run in parallel
    '''
    with open(f_csv) as FIN:
        CSV = csv.DictReader(FIN)

        if clean and _PARALLEL:
            CSV = jobmap(clean_row, CSV, FLAG_PARALLEL=_PARALLEL)
        elif clean and not _PARALLEL:
            CSV = map(clean_row, CSV)

        try:
            for row in CSV:
                yield row
        except Exception:
            pass
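# Minimal usage sketch for csv_iterator; the filename is a placeholder and
# clean_row/jobmap must be importable for the cleaning path to work.
if __name__ == "__main__":
    for row in csv_iterator("data/documents.csv", clean=True):
        print(row)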
def phrases_from_config(config):
    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABR = collections.Counter()

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, target_column, progress_bar=True)
    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABR.update(result)

    msg = "\n{} total abbrs found."
    print(msg.format(len(ABR)))

    # Merge abbreviations that are similar
    print("Deduping abbr list.")
    df = dedupe_abbr(ABR)
    print("{} abbrs remain after deduping".format(len(df)))

    # Output the top phrases
    print("Top 5 abbreviations")
    print(df[:5])

    mkdir(output_dir)
    f_csv = os.path.join(
        output_dir, config["phrase_identification"]["f_abbreviations"]
    )
    df.to_csv(f_csv)
FILE_COL_ITR = itertools.product(F_SQL, target_columns)

for f_sql, column_name in FILE_COL_ITR:

    conn = sqlite3.connect(f_sql, check_same_thread=False)

    INPUT_ITR = dfunc(
        column_name,
        input_table,
        conn,
        limit=global_limit,
        offset=global_offset,
        progress_bar=True,
    )

    ITR = jobmap(evaluate_document, INPUT_ITR, _PARALLEL)

    for result in ITR:
        ABR.update(result)

    msg = "Completed {} {}. {} total abbrs found."
    print(msg.format(f_sql, column_name, len(ABR)))

# Merge abbreviations that are similar
print("Deduping list")
ABR = dedupe_abbr(ABR)
print("{} abbrs remain after deduping".format(len(ABR)))

# Convert abbrs to a list
data_insert = [(phrase, abbr, count)
               for (phrase, abbr), count in ABR.most_common()]
_PARALLEL = config.as_bool("_PARALLEL")
output_dir = config["phrase_identification"]["output_data_directory"]

target_column = config["target_column"]

import_config = config["import_data"]
input_data_dir = import_config["output_data_directory"]

F_CSV = grab_files("*.csv", input_data_dir)
ABR = collections.Counter()
P = parenthesis_nester()

dfunc = db_utils.CSV_database_iterator
INPUT_ITR = dfunc(F_CSV, target_column, progress_bar=True)
ITR = jobmap(evaluate_document, INPUT_ITR, _PARALLEL, col=target_column)

for result in ITR:
    ABR.update(result)

msg = "\n{} total abbrs found."
print(msg.format(len(ABR)))

# Merge abbreviations that are similar
print("Deduping abbr list.")
ABR = dedupe_abbr(ABR)
print("{} abbrs remain after deduping".format(len(ABR)))

# Convert abbrs to a list
data_insert = [(phrase, abbr, count)
               for (phrase, abbr), count in ABR.most_common()]
def parse_from_config(config):
    _PARALLEL = config.as_bool("_PARALLEL")

    import_config = config["import_data"]
    parse_config = config["parse"]

    input_data_dir = import_config["output_data_directory"]
    output_dir = parse_config["output_data_directory"]
    mkdir(output_dir)

    for name in parse_config["pipeline"]:
        obj = getattr(nlpre, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in parse_config:
            kwargs = dict(parse_config[name])

        # Handle the special case of the precomputed acronyms
        if name == "replace_acronyms":
            f_abbr = os.path.join(
                config["phrase_identification"]["output_data_directory"],
                config["phrase_identification"]["f_abbreviations"])
            ABBR = load_phrase_database(f_abbr)
            kwargs["counter"] = ABBR

        parser_functions.append(obj(**kwargs))

    col = config["target_column"]
    F_CSV = grab_files("*.csv", input_data_dir)

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, col, include_filename=True, progress_bar=False)

    ITR = jobmap(
        dispatcher,
        INPUT_ITR,
        _PARALLEL,
        batch_size=_global_batch_size,
        target_column=col,
    )

    F_CSV_OUT = {}
    F_WRITERS = {}

    for k, row in enumerate(ITR):
        f = row.pop("_filename")

        # Create a CSV file object for all outputs
        if f not in F_CSV_OUT:
            f_csv_out = os.path.join(output_dir, os.path.basename(f))
            F = open(f_csv_out, 'w')
            F_CSV_OUT[f] = F
            F_WRITERS[f] = csv.DictWriter(F, fieldnames=['_ref', col])
            F_WRITERS[f].writeheader()

        F_WRITERS[f].writerow(row)

    # Close the open files
    for F in F_CSV_OUT.values():
        F.close()
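# dispatcher, the jobmap worker used above, is defined elsewhere. A
# hypothetical sketch of the idea (ignoring the batching details) is to run
# every parser in parser_functions over the target column and keep the
# reference ID and filename so the output can be routed to the right file.
def dispatcher(row, target_column=None, **kwargs):
    text = row[target_column]
    for parser in parser_functions:
        text = str(parser(text))
    return {
        "_ref": row["_ref"],
        "_filename": row["_filename"],
        target_column: text,
    }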
conn_out.execute("DROP TABLE {}".format(target_col))

print("Parsing {}:{}".format(f_sql, target_col))

args = {
    "column_name": target_col,
    "table_name": import_config["output_table"],
    "conn": conn,
    "limit": global_limit,
    "progress_bar": True,
}

INPUT_ITR = database_iterator(**args)
ITR = jobmap(dispatcher, INPUT_ITR, _PARALLEL)

cmd_create = '''
DROP TABLE IF EXISTS {table_name};
CREATE TABLE IF NOT EXISTS {table_name} (
    _ref INTEGER PRIMARY KEY,
    text STRING,
    meta STRING
);
'''.format(table_name=target_col)

conn_out.executescript(cmd_create)

cmd_insert = '''
INSERT INTO {table_name} (_ref,text,meta)
VALUES (?,?,?)
def categorical_predict(
    X, y_org, method_name, n_estimators=50, use_SMOTE=False, use_PARALLEL=True
):
    # Make sure the sizes match
    msg = "X shape {}, y_org shape {} (mismatch!)"
    assert X.shape[0] == y_org.shape[0], msg.format(X.shape[0], y_org.shape[0])

    enc = LabelEncoder()
    y = enc.fit_transform(y_org)
    label_n = np.unique(y).shape[0]

    if use_SMOTE:
        logger.info(" Adjusting class balance using SMOTE")

    clf_args = {
        "n_jobs": -1 if use_PARALLEL else 1,
        "n_estimators": n_estimators,
    }

    skf = StratifiedKFold(n_splits=10, shuffle=False).split(X, y)

    scores = []
    F1_scores = []

    INPUT_ITR = ((clf_args, idx, X, y, use_SMOTE) for idx in skf)
    ITR = jobmap(clf_extratree_predictor, INPUT_ITR, True)

    error_counts = np.zeros(y.size, dtype=float)
    predict_scores = np.zeros([y.size, label_n], dtype=float)

    df = pd.DataFrame(index=range(y.shape[0]))
    df["y_truth"] = y
    df[method_name] = -1

    for result in ITR:
        idx, pred, pred_proba = result
        train_index, test_index = idx
        y_test = y[test_index]

        errors = y_test != pred
        scores.append(1 - errors.mean())
        error_counts[test_index[errors]] += 1

        F1_scores.append(f1_score(y_test, pred))
        predict_scores[test_index] = pred_proba

        df.loc[test_index, method_name] = pred

    # Make sure all items have been scored
    assert ~(df[method_name] == -1).any()

    # For StratifiedKFold, each sample appears in exactly one test fold,
    # so the error counts need no further normalization.

    return (
        np.array(scores),
        np.array(F1_scores),
        error_counts,
        predict_scores,
        df,
    )
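# Illustrative smoke test with synthetic data; the sizes and method name are
# arbitrary placeholders, and jobmap plus clf_extratree_predictor must be
# importable for the call to run.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
scores, F1, err_counts, proba, df_out = categorical_predict(
    X_demo, y_demo, "demo_method", n_estimators=20
)
print(scores.mean(), F1.mean())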