Example #1
def phrases_from_config(config):
    """
    Identify parenthetical phrases in the documents as they are being
    imported to the pipeline.

    import_data_from_config() and phrases_from_config() are the entry
    points for this step of the pipeline.

    Args:
        config: a configuration object for the pipeline
    """

    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABBR = collections.Counter()

    INPUT_ITR = db_utils.CSV_database_iterator(
        F_CSV, target_column, progress_bar=True
    )

    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABBR.update(result)

    logger.info("{} total abbrs found.".format(len(ABBR)))

    # Merge abbreviations that are similar
    logger.debug("Deduping abbr list.")
    df = dedupe_abbr(ABBR)
    logger.info("{} abbrs remain after deduping.".format(len(df)))

    # Output top phrase
    logger.info("Top 5 abbreviations")
    msg = "({}) {}, {}, {}"
    for k, (_, row) in enumerate(df[:5].iterrows()):
        logger.info(msg.format(k + 1, row.name, row["abbr"], row["count"]))

    mkdir(output_dir)
    f_csv = os.path.join(
        output_dir, config["phrase_identification"]["f_abbreviations"]
    )
    df.to_csv(f_csv)
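# A minimal usage sketch (an assumption, not part of the original example):
# phrases_from_config() is described above as an entry point, so it can be
# driven from a __main__ block the same way the later examples load their
# settings with simple_config.
if __name__ == "__main__":
    import simple_config

    config = simple_config.load()
    phrases_from_config(config)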
Example #2
def import_directory_csv(d_in, d_out, output_table):

    F_CSV = []
    F_SQL = {}

    INPUT_FILES = grab_files("*.csv",d_in)
    
    if not INPUT_FILES:
        print "No matching CSV files found, exiting"
        exit(2)

    for f_csv in INPUT_FILES:
        f_sql = '.'.join(os.path.basename(f_csv).split('.')[:-1])
        f_sql += ".sqlite"                        
        f_sql = os.path.join(d_out,f_sql)

        if os.path.exists(f_sql) and not _FORCE:
            print "{} already exists, skipping".format(f_sql)
            continue

        F_CSV.append(f_csv)
        F_SQL[f_csv] = f_sql


    # Create the output directory if needed
    mkdir(d_out)
    ITR = jobmap(load_csv, F_CSV, _PARALLEL)

    # Create a reference ID for each item
    _ref_counter = itertools.count()

    for (f_csv,df) in ITR:

        f_sql = F_SQL[f_csv]
        engine = create_engine('sqlite:///'+f_sql)

        n_data_items = len(df)
        df["_ref"] = [_ref_counter.next()
                      for _ in range(n_data_items)]
        df.set_index("_ref",inplace=True)

        df.to_sql(output_table,
                  engine,
                  if_exists='replace')

        print "Finished {}, {}, {}".format(f_csv, len(df), list(df.columns))
Example #3
def import_directory_csv(d_in, d_out, target_column, merge_columns):
    '''
    Takes an input directory and an output directory, builds a cleaned
    (free of encoding errors) CSV for each input file, and attaches a
    unique _ref number to each entry.
    '''

    INPUT_FILES = grab_files("*.csv", d_in)

    if not INPUT_FILES:
        print("No matching CSV files found, exiting")
        exit(2)

    for f_csv in INPUT_FILES:
        f_csv_out = os.path.join(d_out, os.path.basename(f_csv))
        vals = (f_csv, f_csv_out, target_column, merge_columns)
        import_csv(vals)
Example #4
def import_directory_csv(d_in, d_out, output_table):
    '''
    Takes an input directory and an output directory, builds a cleaned
    (free of encoding errors) CSV for each input file, and attaches a
    unique _ref number to each entry.
    '''

    F_CSV = []
    F_CSV_OUT = {}
    F_CSV_OUT_HANDLE = {}

    INPUT_FILES = grab_files("*.csv", d_in)

    if not INPUT_FILES:
        print("No matching CSV files found, exiting")
        exit(2)

    for f_csv in INPUT_FILES:
        f_csvx = os.path.join(d_out, os.path.basename(f_csv))

        if os.path.exists(f_csvx):
            print("{} already exists, skipping".format(f_csvx))
            continue

        F_CSV.append(f_csv)
        F_CSV_OUT[f_csv] = open(f_csvx, 'w')
        F_CSV_OUT_HANDLE[f_csv] = None

    # Create a reference ID for each item
    _ref_counter = itertools.count()

    for f_csv in F_CSV:

        for k, row in tqdm(enumerate(csv_iterator(f_csv))):
            row["_ref"] = next(_ref_counter)

            if F_CSV_OUT_HANDLE[f_csv] is None:
                F_CSV_OUT_HANDLE[f_csv] = csv.DictWriter(
                    F_CSV_OUT[f_csv], sorted(row.keys()))
                F_CSV_OUT_HANDLE[f_csv].writeheader()

            F_CSV_OUT_HANDLE[f_csv].writerow(row)

        msg = "Imported {}, {} entries"
        print(msg.format(f_csv, k))
Example #5
def import_directory_csv(d_in, d_out, target_column, merge_columns):
    """
    Takes an input directory and an output directory, builds a cleaned
    (free of encoding errors) CSV for each input file, and attaches a
    unique _ref number to each entry.

    Args:
        d_in (str): Directory containing the input CSV files
        d_out (str): Directory where the cleaned output files are saved
        target_column (str): Name of the column with concatenated text
        merge_columns (list): Names of the text columns that are to be
           concatenated
    """

    INPUT_FILES = grab_files("*.csv", d_in)

    if not INPUT_FILES:
        logger.warning("No matching CSV files found, exiting")
        exit(2)

    for f_csv in INPUT_FILES:
        f_csv_out = os.path.join(d_out, os.path.basename(f_csv))
        vals = (f_csv, f_csv_out, target_column, merge_columns)
        import_csv(vals)
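# A hypothetical call following the Args documented above (directory and
# column names are illustrative, not from the source): concatenate the
# "title" and "abstract" columns into "text" for every CSV found in d_in and
# write the cleaned copies, with _ref numbers attached, to d_out.
import_directory_csv(
    d_in="data/import",
    d_out="data/cleaned",
    target_column="text",
    merge_columns=["title", "abstract"],
)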
Example #6
def phrases_from_config(config):

    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABR = collections.Counter()

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, target_column, progress_bar=True)
    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABR.update(result)

    msg = "\n{} total abbrs found."
    print(msg.format(len(ABR)))

    # Merge abbreviations that are similar
    print("Deduping abbr list.")
    df = dedupe_abbr(ABR)
    print("{} abbrs remain after deduping".format(len(df)))

    # Output top phrase
    print("Top 5 abbreviations")
    print(df[:5])

    mkdir(output_dir)
    f_csv = os.path.join(output_dir,
                         config["phrase_identification"]["f_abbreviations"])
    df.to_csv(f_csv)
Example #7
    return ABR


if __name__ == "__main__":

    import simple_config
    config = simple_config.load()
    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)

    ABR = collections.Counter()
    P = parenthesis_nester()

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, target_column, progress_bar=True)
    ITR = jobmap(evaluate_document, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABR.update(result)

    msg = "\n{} total abbrs found."
    print(msg.format(len(ABR)))

    # Merge abbreviations that are similar
Example #8
def parse_from_config(config):

    _PARALLEL = config.as_bool("_PARALLEL")

    import_config = config["import_data"]
    parse_config = config["parse"]

    input_data_dir = import_config["output_data_directory"]
    output_dir = parse_config["output_data_directory"]

    mkdir(output_dir)

    for name in parse_config["pipeline"]:
        obj = getattr(nlpre, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in parse_config:
            kwargs = dict(parse_config[name])

        # Handle the special case of the precomputed acronyms
        if name == "replace_acronyms":
            f_abbr = os.path.join(
                config["phrase_identification"]["output_data_directory"],
                config["phrase_identification"]["f_abbreviations"])
            ABBR = load_phrase_database(f_abbr)
            kwargs["counter"] = ABBR

        parser_functions.append(obj(**kwargs))

    col = config["target_column"]
    F_CSV = grab_files("*.csv", input_data_dir)

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, col, include_filename=True, progress_bar=False)

    ITR = jobmap(
        dispatcher,
        INPUT_ITR,
        _PARALLEL,
        batch_size=_global_batch_size,
        target_column=col,
    )

    F_CSV_OUT = {}
    F_WRITERS = {}

    for k, row in enumerate(ITR):
        f = row.pop("_filename")

        # Create a CSV file object for all outputs
        if f not in F_CSV_OUT:
            f_csv_out = os.path.join(output_dir, os.path.basename(f))

            F = open(f_csv_out, 'w')
            F_CSV_OUT[f] = F
            F_WRITERS[f] = csv.DictWriter(F, fieldnames=['_ref', col])
            F_WRITERS[f].writeheader()

        F_WRITERS[f].writerow(row)

    # Close the open files
    for F in F_CSV_OUT.values():
        F.close()
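# A hedged sketch (an assumption, not from the source) of what the dispatcher
# step above amounts to for a single document: each configured nlpre step is
# a callable object, so the parser_functions list assembled in
# parse_from_config() can be applied to the target text column in sequence.
def apply_parser_functions(text, parser_functions):
    for parser in parser_functions:
        text = parser(str(text))
    return text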
if __name__ == "__main__":

    import simple_config
    config = simple_config.load("phrase_identification")
    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")
    output_dir = config["output_data_directory"]

    target_columns = config["target_columns"]

    import_config = simple_config.load("import_data")
    input_data_dir = import_config["output_data_directory"]
    input_table = import_config["output_table"]
    
    F_SQL = grab_files("*.sqlite", input_data_dir)

    ABR = collections.Counter()
    P = parenthesis_nester()

    dfunc = utils.db_utils.database_iterator

    FILE_COL_ITR = itertools.product(F_SQL, target_columns)

    for f_sql,column_name in FILE_COL_ITR:

        conn = sqlite3.connect(f_sql,check_same_thread=False)

        INPUT_ITR = dfunc(column_name,
                          input_table,
                          conn,
    use_meta = config["predict"]['use_meta']
    use_reduced = config["predict"]['use_reduced']

    # For now, we can only deal with one column using meta!
    assert(len(config["predict"]["categorical_columns"]) == 1)

    f_h5 = os.path.join(
        config["score"]["output_data_directory"],
        config["score"]["document_scores"]["f_db"],
    )

    h5 = h5py.File(f_h5, 'r')

    methods = h5.keys()
    pred_dir = config["import_data"]["output_data_directory"]
    pred_files = grab_files('*.csv', pred_dir)
    pred_col = config["target_column"]

    # Load the categorical columns
    cols = ['_ref', ] + config["predict"]["categorical_columns"]
    ITR = (pd.read_csv(x, usecols=cols).set_index('_ref') for x in pred_files)
    df = pd.concat(list(ITR))

    ITR = itertools.product(methods, config["predict"]["categorical_columns"])

    X_META = []

    cfg = config["predict"]
    cfg["_PARALLEL"] = config["_PARALLEL"]

    for (method, cat_col) in ITR: