Example 1
def embed_from_config(config):

    mkdir(config["embedding"]["output_data_directory"])

    # If there is a whitelist only keep the matching filename
    try:
        whitelist = config["score"]["input_file_whitelist"]
    except KeyError:
        whitelist = []

    #
    # Run the functions that act globally on the data

    for name in config["embedding"]["embedding_commands"]:
        obj = getattr(mb, name)

        # Load any kwargs in the config file
        kwargs = config["embedding"].copy()

        if name in kwargs:
            kwargs.update(kwargs[name])
        kwargs['target_column'] = config['target_column']

        func = obj(**kwargs)
        func.set_iterator_function(item_iterator,
                                   config["embedding"],
                                   whitelist,
                                   section="parse")
        func.compute(**kwargs)
Example 2
def embed_from_config(config):
    '''
    Args:
        config (dict): Import parameters
    '''

    # Only load options from the embedding section
    target_column = config['target_column']
    econfig = config['embed']

    # Create any missing directories
    d_out = econfig['output_data_directory']
    mkdir(d_out)

    # Train each embedding model
    for name in econfig["embedding_commands"]:

        # Load any kwargs in the config file
        kwargs = econfig.copy()

        if name in kwargs:
            kwargs.update(kwargs[name])

        model = getattr(mb, name)(**kwargs)
        model.set_iterator_function(text_iterator)
        model.compute(target_column)

        f_save = os.path.join(d_out, kwargs[name]['f_db'])
        model.save(f_save)
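The snippet above only reads a handful of keys, so the expected config shape can be sketched. A minimal, hypothetical example; "w2v_embedding" and every key not read directly in the code above are guesses:

config = {
    "target_column": "text",
    "embed": {
        "output_data_directory": "data/embeddings",
        "embedding_commands": ["w2v_embedding"],
        # Per-command options are merged into kwargs; f_db is the
        # filename the trained model is saved under.
        "w2v_embedding": {"f_db": "w2v.gensim"},
    },
}
embed_from_config(config)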
Example 3
def import_data_from_config(config):
    """
    Import parameters from the config file. import_data_from_config()
    and phrases_from_config() are the entry points for this step of the
    pipeline.

    Args:
        config: a config file
    """

    merge_columns = config["import_data"]["merge_columns"]

    if not isinstance(merge_columns, list):
        msg = "merge_columns (if used) must be a list"
        raise ValueError(msg)

    data_out = config["import_data"]["output_data_directory"]
    mkdir(data_out)

    # Require 'input_data_directories' to be a list
    data_in_list = config["import_data"]["input_data_directories"]
    if not isinstance(data_in_list, list):
        msg = "input_data_directories must be a list"
        raise ValueError(msg)

    target_column = config["target_column"]

    for d_in in data_in_list:
        import_directory_csv(d_in, data_out, target_column, merge_columns)
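For reference, a minimal config dict that satisfies the checks above; the directory names and column names are placeholders:

config = {
    "target_column": "abstract",
    "import_data": {
        "input_data_directories": ["datasets/raw"],
        "output_data_directory": "data/import",
        "merge_columns": ["title", "abstract"],
    },
}
import_data_from_config(config)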
Example 4
def phrases_from_config(config):
    """
    Identify parenthetical phrases in the documents as they are being
    imported to the pipeline.

    import_data_from_config() and phrases_from_config() are the entry
    points for this step of the pipeline.

    Args:
        config: a config file
    """

    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABBR = collections.Counter()

    INPUT_ITR = db_utils.CSV_database_iterator(
        F_CSV, target_column, progress_bar=True
    )

    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABBR.update(result)

    logger.info("{} total abbrs found.".format(len(ABBR)))

    # Merge abbreviations that are similar
    logger.debug("Deduping abbr list.")
    df = dedupe_abbr(ABBR)
    logger.info("{} abbrs remain after deduping.".format(len(df)))

    # Output top phrase
    logger.info("Top 5 abbreviations")
    msg = "({}) {}, {}, {}"
    for k, (_, row) in enumerate(df[:5].iterrows()):
        logger.info(msg.format(k + 1, row.name, row["abbr"], row["count"]))

    mkdir(output_dir)
    f_csv = os.path.join(
        output_dir, config["phrase_identification"]["f_abbreviations"]
    )
    df.to_csv(f_csv)
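Note that phrases_from_config() calls config.as_bool(), which is the ConfigObj API rather than a plain dict. A sketch of how such a config would plausibly be loaded, assuming the project uses ConfigObj (the filename is hypothetical):

from configobj import ConfigObj

config = ConfigObj("config.ini")  # hypothetical filename
phrases_from_config(config)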
Example 5
def import_directory_csv(d_in, d_out, output_table):

    F_CSV = []
    F_SQL = {}

    INPUT_FILES = grab_files("*.csv", d_in)

    if not INPUT_FILES:
        print("No matching CSV files found, exiting")
        sys.exit(2)

    for f_csv in INPUT_FILES:
        f_sql = '.'.join(os.path.basename(f_csv).split('.')[:-1])
        f_sql += ".sqlite"
        f_sql = os.path.join(d_out, f_sql)

        if os.path.exists(f_sql) and not _FORCE:
            print("{} already exists, skipping".format(f_sql))
            continue

        F_CSV.append(f_csv)
        F_SQL[f_csv] = f_sql


    # Create the output directory if needed
    mkdir(d_out)
    ITR = jobmap(load_csv, F_CSV, _PARALLEL)

    # Create a reference ID for each item
    _ref_counter = itertools.count()

    for (f_csv, df) in ITR:

        f_sql = F_SQL[f_csv]
        engine = create_engine('sqlite:///' + f_sql)

        n_data_items = len(df)
        df["_ref"] = [next(_ref_counter)
                      for _ in range(n_data_items)]
        df.set_index("_ref", inplace=True)

        df.to_sql(output_table,
                  engine,
                  if_exists='replace')

        print("Finished {}, {}, {}".format(f_csv, len(df), list(df.columns)))
Example 6
def import_data_from_config(config):

    merge_columns = (config["import_data"]["merge_columns"]
                     if "merge_columns" in config["import_data"] else [])

    if not isinstance(merge_columns, list):
        msg = "merge_columns (if used) must be a list"
        raise ValueError(msg)

    data_out = config["import_data"]["output_data_directory"]
    mkdir(data_out)

    # Require `input_data_directories` to be a list
    data_in_list = config["import_data"]["input_data_directories"]
    if not isinstance(data_in_list, list):
        msg = "input_data_directories must be a list"
        raise ValueError(msg)

    target_column = config["target_column"]

    for d_in in data_in_list:
        import_directory_csv(d_in, data_out, target_column, merge_columns)
Example 7
def score_from_config(global_config):

    config = global_config["score"]
    mkdir(config["output_data_directory"])

    # Run the functions that can sum over the data (e.g. TF counts)
    for name in config["count_commands"]:

        model, kwargs = _load_model(name, config)
        logger.info("Starting mapreduce {}".format(model.function_name))
        for row in db.text_iterator():
            model(row)
        model.save(**kwargs)

    # Load the reduced representation model
    RREP = ds.reduced_representation()

    # Run the functions that act per document (e.g. word2vec)
    for name in config["score_commands"]:

        model, kwargs = _load_model(name, config)
        f_db = os.path.join(kwargs["output_data_directory"], kwargs["f_db"])

        logger.info("Starting score model {}".format(model.method))

        for f_csv in db.get_section_filenames('parse'):
            data = {}
            for row in db.text_iterator([f_csv]):
                data[row["_ref"]] = model(row['text'])

            model.save(data, f_csv, f_db)

        # If required, compute the reduced representation
        if kwargs["compute_reduced_representation"]:
            nc = kwargs['reduced_representation']['n_components']
            rdata = RREP.compute(model.method, n_components=nc)
            RREP.save(model.method, rdata, f_db)
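_load_model is not shown in this example; judging from how its return value is used, it plausibly resolves the named scoring class and merges per-command options over the section-wide ones. A sketch under that assumption:

def _load_model(name, config):
    # Hypothetical reconstruction: merge the command's own options
    # over the section defaults, then instantiate the scoring class.
    kwargs = dict(config)
    if name in config:
        kwargs.update(config[name])
    return getattr(ds, name)(**kwargs), kwargs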
Example 8
def phrases_from_config(config):

    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABR = collections.Counter()

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, target_column, progress_bar=True)
    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABR.update(result)

    msg = "\n{} total abbrs found."
    print(msg.format(len(ABR)))

    # Merge abbreviations that are similar
    print("Deduping abbr list.")
    df = dedupe_abbr(ABR)
    print("{} abbrs remain after deduping".format(len(df)))

    # Output top phrase
    print("Top 5 abbreviations")
    print(df[:5])

    mkdir(output_dir)
    f_csv = os.path.join(output_dir,
                         config["phrase_identification"]["f_abbreviations"])
    df.to_csv(f_csv)
Example 9
            row["_ref"] = next(_ref_counter)

            if F_CSV_OUT_HANDLE[f_csv] is None:
                F_CSV_OUT_HANDLE[f_csv] = csv.DictWriter(
                    F_CSV_OUT[f_csv], sorted(row.keys()))
                F_CSV_OUT_HANDLE[f_csv].writeheader()

            F_CSV_OUT_HANDLE[f_csv].writerow(row)

        msg = "Imported {}, {} entries"
        print(msg.format(f_csv, k))


if __name__ == "__main__":

    import simple_config
    config = simple_config.load()
    _PARALLEL = config.as_bool("_PARALLEL")

    data_out = config["import_data"]["output_data_directory"]
    mkdir(data_out)

    output_table = config["import_data"]["output_table"]

    # Require `input_data_directories` to be a list
    data_in_list = config["import_data"]["input_data_directories"]
    assert isinstance(data_in_list, list)

    for d_in in data_in_list:
        import_directory_csv(d_in, data_out, output_table)
Example 10
    ITR = jobmap(evaluate_document, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABR.update(result)

    msg = "\n{} total abbrs found."
    print(msg.format(len(ABR)))

    # Merge abbreviations that are similar
    print("Deduping abbr list.")
    ABR = dedupe_abbr(ABR)
    print("{} abbrs remain after deduping".format(len(ABR)))

    # Convert abbrs to a list
    data_insert = [(phrase, abbr, count)
                   for (phrase, abbr), count in ABR.most_common()]

    # Convert the list to a dataframe and sort
    df = pd.DataFrame(data_insert, columns=("phrase", "abbr", "count"))
    df = df.sort_values(["count", "phrase"],
                        ascending=False).set_index("phrase")

    # Output top phrase
    print("Top 5 abbreviations")
    print(df[:5])

    mkdir(output_dir)
    f_csv = os.path.join(output_dir,
                         config["phrase_identification"]["f_abbreviations"])
    df.to_csv(f_csv)
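The Counter-to-DataFrame conversion above is self-contained and easy to verify with toy data:

import collections
import pandas as pd

ABR = collections.Counter({
    ("magnetic resonance imaging", "MRI"): 12,
    ("polymerase chain reaction", "PCR"): 7,
})
data_insert = [(phrase, abbr, count)
               for (phrase, abbr), count in ABR.most_common()]
df = pd.DataFrame(data_insert, columns=("phrase", "abbr", "count"))
df = df.sort_values(["count", "phrase"],
                    ascending=False).set_index("phrase")
print(df)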
Example 11
def parse_from_config(config):

    _PARALLEL = config.as_bool("_PARALLEL")

    import_config = config["import_data"]
    parse_config = config["parse"]

    input_data_dir = import_config["output_data_directory"]
    output_dir = parse_config["output_data_directory"]

    mkdir(output_dir)

    parser_functions = []
    for name in parse_config["pipeline"]:
        obj = getattr(nlpre, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in parse_config:
            kwargs = dict(parse_config[name])

        # Handle the special case of the precomputed acronyms
        if name == "replace_acronyms":
            f_abbr = os.path.join(
                config["phrase_identification"]["output_data_directory"],
                config["phrase_identification"]["f_abbreviations"])
            ABBR = load_phrase_database(f_abbr)
            kwargs["counter"] = ABBR

        parser_functions.append(obj(**kwargs))

    col = config["target_column"]
    F_CSV = grab_files("*.csv", input_data_dir)

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, col, include_filename=True, progress_bar=False)

    ITR = jobmap(
        dispatcher,
        INPUT_ITR,
        _PARALLEL,
        batch_size=_global_batch_size,
        target_column=col,
    )

    F_CSV_OUT = {}
    F_WRITERS = {}

    for k, row in enumerate(ITR):
        f = row.pop("_filename")

        # Create a CSV file object for all outputs
        if f not in F_CSV_OUT:
            f_csv_out = os.path.join(output_dir, os.path.basename(f))

            F = open(f_csv_out, 'w')
            F_CSV_OUT[f] = F
            F_WRITERS[f] = csv.DictWriter(F, fieldnames=['_ref', col])
            F_WRITERS[f].writeheader()

        F_WRITERS[f].writerow(row)

    # Close the open files
    for F in F_CSV_OUT.values():
        F.close()
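A hypothetical "parse" section for the config consumed above; each pipeline entry must name a class exported by nlpre, and the names here are only illustrative:

parse_config = {
    "output_data_directory": "data/parsed",
    "pipeline": ["dedash", "titlecaps", "replace_acronyms"],
    # replace_acronyms gets its counter kwarg injected by the special
    # case above, so no options are listed for it here.
}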
Example 12
def predict_from_config(config):

    ERROR_MATRIX = {}
    PREDICTIONS = {}

    use_meta = config["predict"]['use_meta']

    # For now, we can only deal with one column using meta!
    assert len(config["predict"]["categorical_columns"]) == 1

    methods = uds.get_score_methods()

    pred_col = config["target_column"]

    pred_output_dir = config["predict"]["output_data_directory"]
    extra_cols = config["predict"]["extra_columns"]
    mkdir(pred_output_dir)

    # Load the categorical columns
    df = uds.load_ORG_data(config["predict"]["categorical_columns"])
    ITR = itertools.product(methods, config["predict"]["categorical_columns"])

    X_META = []

    cfg = config["predict"]
    cfg["_PARALLEL"] = config["_PARALLEL"]
    df_scores = None

    for (method, cat_col) in ITR:

        text = "Predicting [{}] [{}:{}]"
        logger.info(text.format(method, cat_col, pred_col))

        DV = uds.load_document_vectors(method)
        X = DV["docv"]

        if use_meta:
            X_META.append(X)

        Y = np.hstack(df[cat_col].values)
        counts = np.array(list(collections.Counter(Y).values()), dtype=float)
        counts /= counts.sum()

        msg = " Class balance for categorical prediction: {}"
        logger.info(msg.format(counts))

        # Determine the baseline prediction
        y_counts = collections.Counter(Y).values()
        baseline_score = max(y_counts) / float(sum(y_counts))

        # Predict
        scores, F1, errors, pred, dfs = categorical_predict(
            X=X,
            y_org=Y,
            method_name=method,
            use_SMOTE=int(cfg['use_SMOTE']),
            use_PARALLEL=int(cfg['_PARALLEL']),
            n_estimators=int(cfg['n_estimators']),
        )

        text = "  F1 {:0.3f}; Accuracy {:0.3f}; baseline ({:0.3f})"
        logger.info(text.format(scores.mean(), F1.mean(), baseline_score))

        PREDICTIONS[method] = pred
        ERROR_MATRIX[method] = errors

        if df_scores is None:
            df_scores = dfs
        else:
            df_scores[method] = dfs[method]

    if use_meta:
        # Build meta predictor
        # META_X = np.hstack([PREDICTIONS[method] for method
        #                    in config["predict"]["meta_methods"]])
        X_META = np.hstack(X_META)
        method = "meta"

        text = "Predicting [{}] [{}:{}]"
        logger.info(text.format(method, cat_col, pred_col))

        scores, F1, errors, pred, dfs = categorical_predict(
            X=X_META,
            y_org=Y,
            method_name=method,
            n_estimators=int(cfg['n_estimators']),
            use_PARALLEL=int(cfg['_PARALLEL']),
        )

        text = "  F1 {:0.3f}; Accuracy {:0.3f}; baseline ({:0.3f})"
        logger.info(text.format(scores.mean(), F1.mean(), baseline_score))

        PREDICTIONS[method] = pred
        ERROR_MATRIX[method] = errors
        df_scores[method] = dfs[method]

    # Save the predictions
    if extra_cols:
        df_ORG = uds.load_ORG_data(extra_columns=extra_cols)
        for col in extra_cols:
            df_scores[col] = df_ORG[col]

    f_save = os.path.join(pred_output_dir,
                          "{}_prediction.csv".format(cat_col))
    df_scores.index.name = '_ref'
    df_scores.to_csv(f_save)

    names = list(methods)

    if use_meta:
        names += ["meta"]

    # Plotting methods here

    df = pd.DataFrame(0, index=names, columns=names)

    max_offdiagonal = 0
    for na, nb in itertools.product(names, repeat=2):
        if na != nb:
            idx = (ERROR_MATRIX[na] == 0) * (ERROR_MATRIX[nb] == 1)
            max_offdiagonal = max(max_offdiagonal, idx.sum())
        else:
            idx = ERROR_MATRIX[na] == 0

        df.loc[nb, na] = idx.sum()

    print(df) # Output result to stdout

    sns.heatmap(df, annot=True, vmin=0, vmax=1.2 * max_offdiagonal, fmt="d")
    plt.yticks(rotation=0)
    plt.xticks(rotation=45)

    plt.show()
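The co-error bookkeeping at the end is compact; with toy 0/1 error vectors it reads as "documents method a got right but method b got wrong":

import numpy as np

err_a = np.array([0, 0, 1, 0])
err_b = np.array([1, 0, 1, 1])

# Boolean product keeps positions where a was correct and b was not.
idx = (err_a == 0) * (err_b == 1)
print(idx.sum())  # 2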
Example 13
                    yield val
                    progress_bar.update()

            if not self.yield_single:
                yield data

if __name__ == "__main__":

    import simple_config
    config = simple_config.load("score")
    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")

    n_jobs = -1 if _PARALLEL else 1

    mkdir(config["output_data_directory"])

    ###########################################################
    # Fill the pipeline with function objects

    mapreduce_functions = []
    for name in config["mapreduce_commands"]:

        obj = getattr(ds, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in config:
            kwargs = config[name]

        # Add in the embedding configuration options
Example 14
        ITR = jobmap(evaluate_document, INPUT_ITR, _PARALLEL)

        for result in ITR:
            ABR.update(result)

        msg = "Completed {} {}. {} total abbrs found."
        print msg.format(f_sql,column_name,len(ABR))

    # Merge abbreviations that are similar
    print "Deduping list"    
    ABR = dedupe_abbr(ABR)
    print "{} abbrs remain after deduping".format(len(ABR))


    # Convert abbrs to a list
    data_insert = [(phrase, abbr, count)
                   for (phrase, abbr), count in ABR.most_common()]

    # Convert the list to a dataframe for insert
    df = pd.DataFrame(data_insert,
                      columns=("phrase", "abbr", "count"))

    mkdir(output_dir)
    f_sql = os.path.join(output_dir, config["f_abbreviations"])
    engine = create_engine('sqlite:///'+f_sql)

    # Save the abbrs to a table
    df.to_sql(config["output_table"],
              engine,
              if_exists='replace')
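The final to_sql step is plain pandas/SQLAlchemy and can be exercised in isolation; the table and file names below are placeholders:

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('sqlite:///abbreviations.sqlite')
df = pd.DataFrame([("magnetic resonance imaging", "MRI", 12)],
                  columns=("phrase", "abbr", "count"))
df.to_sql("abbreviations", engine, if_exists='replace')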
Example 15
def score_from_config(global_config):

    config = global_config["score"]

    mkdir(config["output_data_directory"])

    #
    # Fill the pipeline with function objects

    mapreduce_functions = []
    for name in config["mapreduce_commands"]:

        obj = getattr(ds, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in config:
            kwargs = config[name]

        # Add in the embedding configuration options
        kwargs["embedding"] = global_config["embedding"]
        kwargs["score"] = global_config["score"]

        val = name, obj(**kwargs)
        mapreduce_functions.append(val)

    col = global_config['target_column']

    # Run the functions that can act like mapreduce (e.g. TF counts)

    for name, func in mapreduce_functions:
        print("Starting mapreduce {}".format(func.table_name))
        INPUT_ITR = db.item_iterator(
            config,
            text_column=col,
            progress_bar=True,
            include_filename=True,
        )

        for result in map(func, INPUT_ITR):
            func.reduce(result)

        func.save(config)

    # Run the functions that act globally on the data

    for name in config["globaldata_commands"]:
        obj = getattr(ds, name)

        # Load any kwargs in the config file; copy first so the shared
        # config section is not mutated between commands
        kwargs = dict(config)
        if name in config:
            kwargs.update(config[name])

        # Add in the embedding configuration options
        func = obj(**kwargs)

        F_CSV = db.get_section_filenames("parse")

        for f_csv in F_CSV:
            ITR = db.single_file_item_iterator(f_csv)
            func.compute_single(ITR)
            func.save_single()

        func.compute_reduced_representation()
Example 16
import itertools
from utils.os_utils import mkdir
import document_scoring as ds
import simple_config
from utils.db_utils import item_iterator

if __name__ == "__main__":

    global_config = simple_config.load()
    _PARALLEL = global_config.as_bool("_PARALLEL")

    config = global_config["score"]

    n_jobs = -1 if _PARALLEL else 1
    mkdir(config["output_data_directory"])

    #
    # Fill the pipeline with function objects

    mapreduce_functions = []
    for name in config["mapreduce_commands"]:

        obj = getattr(ds, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in config:
            kwargs = config[name]

        # Add in the embedding configuration options
        kwargs["embedding"] = global_config["embedding"]
Example 17
def explain_metaclusters(config):
    save_dest = config["postprocessing"]["output_data_directory"]
    uos.mkdir(save_dest)

    args = config["postprocessing"]["LIME_explainer"]

    f_csv_out = os.path.join(save_dest, "cluster_LIME.csv")

    data = uds.load_metacluster_data()
    centroids = data["meta_centroids"]
    labels = data["meta_labels"]

    # Find out which centroids are close, and find their index locations
    C = cdist(centroids, centroids, metric="cosine")
    cidx = np.where(C > float(args["metacluster_cosine_minsim"]))

    n_lime_samples = int(args["n_lime_samples"])
    n_lime_features = int(args["n_lime_features"])
    n_estimators = int(args["n_estimators"])

    INPUT_ITR = udb.text_iterator()
    ALL_TEXT = np.array([row["text"] for row in INPUT_ITR])

    data = []
    for i, j in zip(*cidx):
        # Only take the upper triangle, so each pair is visited once
        if i >= j:
            continue

        logger.info("Computing LIME for clusters {} and {}".format(i, j))

        labels_i = labels == i
        labels_j = labels == j
        idx = labels_i | labels_j

        LE = sklearn.preprocessing.LabelEncoder()
        Y = LE.fit_transform(labels[idx])

        n_samples = min(labels_i.sum(), labels_j.sum(), n_lime_samples)

        new_idx = _select_even_subset(Y, n_samples)
        Y = Y[new_idx]
        TEXT = ALL_TEXT[idx][new_idx]

        df = _compute_LIME(TEXT, Y, n_estimators, n_lime_features)

        # Remove words that contribute < 0.5%
        df.score /= np.abs(df.score).sum()
        df = df[np.abs(df.score) > 0.005]

        # Normalize the scores and make human friendly
        df.score /= np.abs(df.score).sum()
        df.score *= 100

        class_names = LE.classes_
        df["negative_class"] = class_names[0]
        df["positive_class"] = class_names[1]

        data.append(df)

    df = pd.concat(data).set_index(["negative_class", "positive_class"])
    df.to_csv(f_csv_out)
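The pair selection at the top of the function is worth isolating: cdist builds the full pairwise cosine-distance matrix, np.where picks the entries over the threshold, and the i >= j guard keeps only the upper triangle. A toy run:

import numpy as np
from scipy.spatial.distance import cdist

centroids = np.random.RandomState(0).rand(4, 8)
C = cdist(centroids, centroids, metric="cosine")
cidx = np.where(C > 0.2)
pairs = [(i, j) for i, j in zip(*cidx) if i < j]
print(pairs)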
Example 18
from utils.os_utils import mkdir
import model_building as mb
import simple_config
from utils.db_utils import item_iterator

if __name__ == "__main__":

    config = simple_config.load()
    mkdir(config["embedding"]["output_data_directory"])

    # If there is a whitelist only keep the matching filename
    try:
        whitelist = config["score"]["input_file_whitelist"]
    except KeyError:
        whitelist = []

    #
    # Run the functions that act globally on the data

    for name in config["embedding"]["embedding_commands"]:
        obj = getattr(mb, name)

        # Load any kwargs in the config file
        kwargs = config["embedding"].copy()

        if name in kwargs:
            kwargs.update(kwargs[name])
        kwargs['target_column'] = config['target_column']

        func = obj(**kwargs)
        func.set_iterator_function(item_iterator,