def phrases_from_config(config):
    """
    Identify parenthetical phrases in the documents as they are being
    imported to the pipeline. import_data_from_config() and
    phrases_from_config() are the entry points for this step of the
    pipeline.

    Args:
        config: a loaded configuration object for the pipeline
    """
    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABBR = collections.Counter()

    INPUT_ITR = db_utils.CSV_database_iterator(
        F_CSV, target_column, progress_bar=True
    )
    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABBR.update(result)

    logger.info("{} total abbrs found.".format(len(ABBR)))

    # Merge abbreviations that are similar
    logger.debug("Deduping abbr list.")
    df = dedupe_abbr(ABBR)
    logger.info("{} abbrs remain after deduping.".format(len(df)))

    # Output the top five abbreviations
    logger.info("Top 5 abbreviations")
    msg = "({}) {}, {}, {}"
    for k, (_, row) in enumerate(df[:5].iterrows()):
        logger.info(msg.format(k + 1, row.name, row["abbr"], row["count"]))

    mkdir(output_dir)
    f_csv = os.path.join(
        output_dir, config["phrase_identification"]["f_abbreviations"]
    )
    df.to_csv(f_csv)
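# A minimal sketch of the config layout phrases_from_config() expects.
# The keys mirror the lookups made above; the INI-style syntax assumes a
# ConfigObj-like loader (config.as_bool hints at that), which is an
# assumption rather than something stated in this file. The file names
# and directory values below are illustrative only.
#
#   _PARALLEL = True
#   target_column = "text"
#
#   [import_data]
#   output_data_directory = "data_import"
#
#   [phrase_identification]
#   output_data_directory = "data_phrases"
#   f_abbreviations = "abbreviations.csv"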
def import_directory_csv(d_in, d_out, output_table):
    F_CSV = []
    F_SQL = {}

    INPUT_FILES = grab_files("*.csv", d_in)

    if not INPUT_FILES:
        print("No matching CSV files found, exiting")
        exit(2)

    for f_csv in INPUT_FILES:
        f_sql = '.'.join(os.path.basename(f_csv).split('.')[:-1])
        f_sql += ".sqlite"
        f_sql = os.path.join(d_out, f_sql)

        if os.path.exists(f_sql) and not _FORCE:
            print("{} already exists, skipping".format(f_sql))
            continue

        F_CSV.append(f_csv)
        F_SQL[f_csv] = f_sql

    # Create the output directory if needed
    mkdir(d_out)

    ITR = jobmap(load_csv, F_CSV, _PARALLEL)

    # Create a reference ID for each item
    _ref_counter = itertools.count()

    for (f_csv, df) in ITR:
        f_sql = F_SQL[f_csv]
        engine = create_engine('sqlite:///' + f_sql)

        n_data_items = len(df)
        df["_ref"] = [next(_ref_counter) for _ in range(n_data_items)]
        df.set_index("_ref", inplace=True)

        df.to_sql(output_table, engine, if_exists='replace')

        print("Finished {}, {}, {}".format(f_csv, len(df), list(df.columns)))
def import_directory_csv(d_in, d_out, target_column, merge_columns):
    '''
    Takes an input directory and output directory, builds a cleaned
    (free of encoding errors) CSV for each input file, and attaches a
    unique _ref number to each entry.
    '''
    INPUT_FILES = grab_files("*.csv", d_in)

    if not INPUT_FILES:
        print("No matching CSV files found, exiting")
        exit(2)

    for f_csv in INPUT_FILES:
        f_csv_out = os.path.join(d_out, os.path.basename(f_csv))
        vals = (f_csv, f_csv_out, target_column, merge_columns)
        import_csv(vals)
def import_directory_csv(d_in, d_out, output_table):
    '''
    Takes an input directory and output directory, builds a cleaned
    (free of encoding errors) CSV for each input file, and attaches a
    unique _ref number to each entry.
    '''
    F_CSV = []
    F_CSV_OUT = {}
    F_CSV_OUT_HANDLE = {}

    INPUT_FILES = grab_files("*.csv", d_in)

    if not INPUT_FILES:
        print("No matching CSV files found, exiting")
        exit(2)

    for f_csv in INPUT_FILES:
        f_csvx = os.path.join(d_out, os.path.basename(f_csv))

        if os.path.exists(f_csvx):
            print("{} already exists, skipping".format(f_csvx))
            continue

        F_CSV.append(f_csv)
        F_CSV_OUT[f_csv] = open(f_csvx, 'w')
        F_CSV_OUT_HANDLE[f_csv] = None

    # _ref_counter is assumed to be a module-level itertools.count()
    for f_csv in F_CSV:
        for k, row in tqdm(enumerate(csv_iterator(f_csv))):
            row["_ref"] = next(_ref_counter)

            # Lazily create the writer once the first row's columns are known
            if F_CSV_OUT_HANDLE[f_csv] is None:
                F_CSV_OUT_HANDLE[f_csv] = csv.DictWriter(
                    F_CSV_OUT[f_csv], sorted(row.keys()))
                F_CSV_OUT_HANDLE[f_csv].writeheader()

            F_CSV_OUT_HANDLE[f_csv].writerow(row)

        msg = "Imported {}, {} entries"
        print(msg.format(f_csv, k + 1))
def import_directory_csv(d_in, d_out, target_column, merge_columns):
    """
    Takes an input directory and output directory, builds a cleaned
    (free of encoding errors) CSV for each input file, and attaches a
    unique _ref number to each entry.

    Args:
        d_in (str): Directory of the csv files to open
        d_out (str): Directory where the cleaned documents are saved
        target_column (str): Name of the column with concatenated text
        merge_columns (list): Names of the text columns that are to be
            concatenated
    """
    INPUT_FILES = grab_files("*.csv", d_in)

    if not INPUT_FILES:
        logger.warning("No matching CSV files found, exiting")
        exit(2)

    for f_csv in INPUT_FILES:
        f_csv_out = os.path.join(d_out, os.path.basename(f_csv))
        vals = (f_csv, f_csv_out, target_column, merge_columns)
        import_csv(vals)
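# A minimal usage sketch (not taken from the pipeline itself) showing how
# the import entry point might wire the config into import_directory_csv().
# import_data_from_config is the entry point named in the
# phrases_from_config docstring, but its exact body, and the config keys
# input_data_directories and merge_columns, are assumptions here.
def import_data_from_config(config):
    import_config = config["import_data"]

    import_directory_csv(
        d_in=import_config["input_data_directories"][0],  # assumed key
        d_out=import_config["output_data_directory"],
        target_column=config["target_column"],
        merge_columns=import_config["merge_columns"],     # assumed key
    )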
def phrases_from_config(config):
    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABR = collections.Counter()

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, target_column, progress_bar=True)
    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABR.update(result)

    msg = "\n{} total abbrs found."
    print(msg.format(len(ABR)))

    # Merge abbreviations that are similar
    print("Deduping abbr list.")
    df = dedupe_abbr(ABR)
    print("{} abbrs remain after deduping".format(len(df)))

    # Output top phrase
    print("Top 5 abbreviations")
    print(df[:5])

    mkdir(output_dir)
    f_csv = os.path.join(output_dir,
                         config["phrase_identification"]["f_abbreviations"])
    df.to_csv(f_csv)
    return ABR


if __name__ == "__main__":

    import simple_config
    config = simple_config.load()

    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABR = collections.Counter()
    P = parenthesis_nester()

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, target_column, progress_bar=True)
    ITR = jobmap(evaluate_document, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABR.update(result)

    msg = "\n{} total abbrs found."
    print(msg.format(len(ABR)))

    # Merge abbreviations that are similar
def parse_from_config(config):

    _PARALLEL = config.as_bool("_PARALLEL")

    import_config = config["import_data"]
    parse_config = config["parse"]

    input_data_dir = import_config["output_data_directory"]
    output_dir = parse_config["output_data_directory"]

    mkdir(output_dir)

    for name in parse_config["pipeline"]:
        obj = getattr(nlpre, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in parse_config:
            kwargs = dict(parse_config[name])

        # Handle the special case of the precomputed acronyms
        if name == "replace_acronyms":
            f_abbr = os.path.join(
                config["phrase_identification"]["output_data_directory"],
                config["phrase_identification"]["f_abbreviations"])
            ABBR = load_phrase_database(f_abbr)
            kwargs["counter"] = ABBR

        parser_functions.append(obj(**kwargs))

    col = config["target_column"]
    F_CSV = grab_files("*.csv", input_data_dir)

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, col, include_filename=True, progress_bar=False)

    ITR = jobmap(
        dispatcher,
        INPUT_ITR,
        _PARALLEL,
        batch_size=_global_batch_size,
        target_column=col,
    )

    F_CSV_OUT = {}
    F_WRITERS = {}

    for k, row in enumerate(ITR):
        f = row.pop("_filename")

        # Create a CSV file object for all outputs
        if f not in F_CSV_OUT:
            f_csv_out = os.path.join(output_dir, os.path.basename(f))
            F = open(f_csv_out, 'w')
            F_CSV_OUT[f] = F
            F_WRITERS[f] = csv.DictWriter(F, fieldnames=['_ref', col])
            F_WRITERS[f].writeheader()

        F_WRITERS[f].writerow(row)

    # Close the open files
    for F in F_CSV_OUT.values():
        F.close()
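# A hedged sketch of what the dispatcher used above might do for a single
# row: run each parser collected in the module-level parser_functions list
# over the target column and keep only the fields that are written out
# (_ref, _filename, and the parsed text). The real dispatcher's signature,
# batching behaviour, and return shape are not shown in this file, so this
# is an illustration under those assumptions, not the actual implementation.
def dispatcher_sketch(row, target_column, **kwargs):
    text = row[target_column]

    # parser_functions is assumed to be the module-level list populated
    # by parse_from_config() above
    for parser in parser_functions:
        text = str(parser(text))

    return {
        "_ref": row["_ref"],
        "_filename": row["_filename"],
        target_column: text,
    }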
if __name__ == "__main__": import simple_config config = simple_config.load("phrase_identification") _PARALLEL = config.as_bool("_PARALLEL") _FORCE = config.as_bool("_FORCE") output_dir = config["output_data_directory"] target_columns = config["target_columns"] import_config = simple_config.load("import_data") input_data_dir = import_config["output_data_directory"] input_table = import_config["output_table"] F_SQL = grab_files("*.sqlite", input_data_dir) ABR = collections.Counter() P = parenthesis_nester() dfunc = utils.db_utils.database_iterator FILE_COL_ITR = itertools.product(F_SQL, target_columns) for f_sql,column_name in FILE_COL_ITR: conn = sqlite3.connect(f_sql,check_same_thread=False) INPUT_ITR = dfunc(column_name, input_table, conn,
use_meta = config["predict"]['use_meta'] use_reduced = config["predict"]['use_reduced'] # For now, we can only deal with one column using meta! assert(len(config["predict"]["categorical_columns"]) == 1) f_h5 = os.path.join( config["score"]["output_data_directory"], config["score"]["document_scores"]["f_db"], ) h5 = h5py.File(f_h5, 'r') methods = h5.keys() pred_dir = config["import_data"]["output_data_directory"] pred_files = grab_files('*.csv', pred_dir) pred_col = config["target_column"] # Load the categorical columns cols = ['_ref', ] + config["predict"]["categorical_columns"] ITR = (pd.read_csv(x, usecols=cols).set_index('_ref') for x in pred_files) df = pd.concat(list(ITR)) ITR = itertools.product(methods, config["predict"]["categorical_columns"]) X_META = [] cfg = config["predict"] cfg["_PARALLEL"] = config["_PARALLEL"] for (method, cat_col) in ITR: