def categorical_predict(X, y_org, method_name, config):

    # Make sure the sizes match
    msg = "X shape {}, y_org shape {} (mismatch!)"
    assert X.shape[0] == y_org.shape[0], msg.format(X.shape[0],
                                                    y_org.shape[0])

    enc = LabelEncoder()
    y = enc.fit_transform(y_org)

    label_n = np.unique(y).shape[0]
    #msg = "[{}] number of unique entries in [y {}]: {}"
    #print msg.format(method_name, X.shape, label_n)

    use_SMOTE = config["use_SMOTE"]
    if use_SMOTE:
        print("  Adjusting class balance using SMOTE")

    is_PARALLEL = config["_PARALLEL"]
    
    clf_args = {
        "n_jobs" : -1 if is_PARALLEL else 1,
        "n_estimators" : int(config["n_estimators"]),
    }
    
    skf = StratifiedKFold(n_splits=10, shuffle=False).split(X, y)
    scores = []
    F1_scores = []

    INPUT_ITR = ((clf_args, idx, X, y, use_SMOTE) for idx in skf)

    ITR = jobmap(clf_extratree_predictor, INPUT_ITR, True)

    error_counts   = np.zeros(y.size,dtype=float)
    predict_scores = np.zeros([y.size,label_n],dtype=float)

    for result in ITR:
        idx,pred,pred_proba = result
        train_index, test_index = idx

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        errors = y_test!=pred
        
        scores.append(1-errors.mean())        
        error_counts[test_index[errors]] += 1

        F1_scores.append(f1_score(y_test, pred))
        predict_scores[test_index] = pred_proba

    # For StratifiedKFold, each test set is hit only once
    # so normalization is simple
    error_counts /= 1.0

    return np.array(scores), np.array(F1_scores), error_counts, predict_scores
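# Every example in this collection funnels work through a jobmap() helper
# that is not shown. Below is a minimal, hypothetical sketch inferred only
# from the visible call pattern jobmap(func, INPUT_ITR, FLAG_PARALLEL,
# **kwargs); the real helper may differ (chunking, batch_size handling,
# error reporting, etc.).
import multiprocessing
from functools import partial

def jobmap(func, INPUT_ITR, FLAG_PARALLEL=False, **kwargs):
    """Yield func(item) for each item, serially or via a process pool."""
    if kwargs:
        func = partial(func, **kwargs)

    if not FLAG_PARALLEL:
        for item in INPUT_ITR:
            yield func(item)
        return

    # Worker processes require func (and any partial kwargs) to be picklable.
    with multiprocessing.Pool() as pool:
        for result in pool.imap(func, INPUT_ITR):
            yield result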
def categorical_predict(X, y_org, method_name, config):

    # Make sure the sizes match
    msg = "X shape {}, y_org shape {} (mismatch!)"
    assert X.shape[0] == y_org.shape[0], msg.format(X.shape[0], y_org.shape[0])

    enc = LabelEncoder()
    y = enc.fit_transform(y_org)

    label_n = np.unique(y).shape[0]
    # msg = "[{}] number of unique entries in [y {}]: {}"
    # print msg.format(method_name, X.shape, label_n)

    use_SMOTE = config["use_SMOTE"]
    if use_SMOTE:
        print("  Adjusting class balance using SMOTE")

    is_PARALLEL = config["_PARALLEL"]

    clf_args = {
        "n_jobs": -1 if is_PARALLEL else 1,
        "n_estimators": int(config["n_estimators"]),
    }

    skf = StratifiedKFold(n_splits=10, shuffle=False).split(X, y)
    scores = []
    F1_scores = []

    INPUT_ITR = ((clf_args, idx, X, y, use_SMOTE) for idx in skf)

    ITR = jobmap(clf_extratree_predictor, INPUT_ITR, True)

    error_counts = np.zeros(y.size, dtype=float)
    predict_scores = np.zeros([y.size, label_n], dtype=float)

    for result in ITR:
        idx, pred, pred_proba = result
        train_index, test_index = idx

        y_test = y[test_index]

        errors = y_test != pred

        scores.append(1 - errors.mean())
        error_counts[test_index[errors]] += 1

        F1_scores.append(f1_score(y_test, pred))
        predict_scores[test_index] = pred_proba

    # For StratifiedKFold, each test set is hit only once
    # so normalization is simple
    error_counts /= 1.0

    return np.array(scores), np.array(F1_scores), error_counts, predict_scores
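# A hypothetical sketch of the clf_extratree_predictor worker consumed by
# jobmap above. Its argument and return shapes are inferred from the call
# sites; the SMOTE oversampling step and the ExtraTrees classifier are
# assumptions, not the project's confirmed implementation.
from sklearn.ensemble import ExtraTreesClassifier

def clf_extratree_predictor(item):
    clf_args, idx, X, y, use_SMOTE = item
    train_index, test_index = idx

    X_train, y_train = X[train_index], y[train_index]

    if use_SMOTE:
        # Oversample minority classes on the training fold only
        # (imbalanced-learn; assumed dependency).
        from imblearn.over_sampling import SMOTE
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    clf = ExtraTreesClassifier(**clf_args)
    clf.fit(X_train, y_train)

    pred = clf.predict(X[test_index])
    pred_proba = clf.predict_proba(X[test_index])
    return idx, pred, pred_proba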
def phrases_from_config(config):
    """
    Identify parenthetical phrases in the documents as they are being
    imported to the pipeline.

    import_data_from_config() and phrases_from_config() are the entry
    points for this step of the pipeline.

    Args:
        config: configuration object for the pipeline (e.g., a ConfigObj)

    Returns:
        None. The deduplicated abbreviation table is written to a CSV file.
    """

    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABBR = collections.Counter()

    INPUT_ITR = db_utils.CSV_database_iterator(
        F_CSV, target_column, progress_bar=True
    )

    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABBR.update(result)

    logger.info("{} total abbrs found.".format(len(ABBR)))

    # Merge abbreviations that are similar
    logger.debug("Deduping abbr list.")
    df = dedupe_abbr(ABBR)
    logger.info("{} abbrs remain after deduping.".format(len(df)))

    # Output top phrase
    logger.info("Top 5 abbreviations")
    msg = "({}) {}, {}, {}"
    for k, (_, row) in enumerate(df[:5].iterrows()):
        logger.info(msg.format(k + 1, row.name, row["abbr"], row["count"]))

    mkdir(output_dir)
    f_csv = os.path.join(
        output_dir, config["phrase_identification"]["f_abbreviations"]
    )
    df.to_csv(f_csv)
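# Hypothetical usage sketch for phrases_from_config. The config object is
# indexed like nested dictionaries and exposes as_bool(), which matches the
# ConfigObj API; the file name and values below are illustrative, only the
# keys are taken from the function body.
from configobj import ConfigObj

config = ConfigObj("pipeline.ini")
# pipeline.ini is expected to provide, e.g.:
#   _PARALLEL = True
#   target_column = text
#   [import_data]
#   output_data_directory = data/import
#   [phrase_identification]
#   output_data_directory = data/phrases
#   f_abbreviations = abbreviations.csv

phrases_from_config(config)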
    def compute(self, config):

        func = compute_affinity
        ITR = jobmap(func, self, self.PARALLEL)
        print("Computing affinity propagation")

        for result in tqdm.tqdm(ITR):
            self.save(config, result)

        # Save the size of the vocabulary
        self.h5["documents"].attrs["vocab_n"] = self.vocab_n
        self.h5["documents"].attrs["cluster_n"] = self.cluster_n

        self.h5.close()
    def compute(self, config):

        func = compute_document_affinity
        ITR = jobmap(func, self, self.PARALLEL)

        doc_data = []

        print("Computing document affinity scoring")
        for result in ITR:
            doc_data.append(result)

        df = pd.DataFrame(data=doc_data, columns=["V", "idx", "f_sql"])

        self.save(config, df)
    def compute(self, config):

        func = compute_affinity
        ITR = jobmap(func, self, self.PARALLEL)
        print("Computing affinity propagation")

        for result in tqdm.tqdm(ITR):
            self.save(config, result)
                        
        # Save the size of the vocabulary
        self.h5["documents"].attrs["vocab_n"] = self.vocab_n        
        self.h5["documents"].attrs["cluster_n"] = self.cluster_n

        self.h5.close()
def csv_iterator(f_csv, clean=True, _PARALLEL=False):
    '''
    Creates an iterator over a CSV file, optionally cleans it.
    '''
    with open(f_csv) as FIN:
        CSV = csv.DictReader(FIN)

        if clean and _PARALLEL:
            CSV = jobmap(clean_row, CSV, FLAG_PARALLEL=_PARALLEL)
        elif clean and not _PARALLEL:
            CSV = map(clean_row, CSV)

        for row in CSV:
            yield row
    def compute(self, config):

        func = compute_document_affinity
        ITR = jobmap(func, self, self.PARALLEL)

        doc_data = []

        print("Computing document affinity scoring")
        for result in ITR:
            doc_data.append(result)

        df = pd.DataFrame(data=doc_data,
                          columns=["V","idx","f_sql"])

        self.save(config, df)
def import_directory_csv(d_in, d_out, output_table):

    F_CSV = []
    F_SQL = {}

    INPUT_FILES = grab_files("*.csv",d_in)
    
    if not INPUT_FILES:
        print("No matching CSV files found, exiting")
        exit(2)

    for f_csv in INPUT_FILES:
        f_sql = '.'.join(os.path.basename(f_csv).split('.')[:-1])
        f_sql += ".sqlite"                        
        f_sql = os.path.join(d_out,f_sql)

        if os.path.exists(f_sql) and not _FORCE:
            print("{} already exists, skipping".format(f_sql))
            continue

        F_CSV.append(f_csv)
        F_SQL[f_csv] = f_sql


    # Create the output directory if needed
    mkdir(d_out)
    ITR = jobmap(load_csv, F_CSV, _PARALLEL)

    # Create a reference ID for each item
    _ref_counter = itertools.count()

    for (f_csv,df) in ITR:

        f_sql = F_SQL[f_csv]
        engine = create_engine('sqlite:///'+f_sql)

        n_data_items = len(df)
        df["_ref"] = [next(_ref_counter)
                      for _ in range(n_data_items)]
        df.set_index("_ref", inplace=True)

        df.to_sql(output_table,
                  engine,
                  if_exists='replace')

        print("Finished {}, {}, {}".format(f_csv, len(df), list(df.columns)))
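# Each input CSV ends up as one table inside its own SQLite file, indexed by
# the generated _ref column. A quick, hypothetical way to inspect an output
# (the file name below is illustrative; the table name is whatever was passed
# as output_table):
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("sqlite:///data/sql/documents.sqlite")
df_check = pd.read_sql("SELECT * FROM output_table LIMIT 5",
                       engine, index_col="_ref")
print(df_check)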
def dedupe_abbr(ABR):
    data = {}

    ITR = jobmap(dedupe_item, tqdm.tqdm(ABR.items()), True)
    for result in ITR:

        # Only add the most common result
        max_val, max_item = 0, None
        total_counts = 0
        for item in result:
            current_val = ABR[item]
            total_counts += current_val
            if current_val > max_val:
                max_val = current_val
                max_item = item

        data[(' '.join(max_item[0]), max_item[1])] = total_counts

    ABR = collections.Counter(data)

    return ABR
def dedupe_abbr(ABR):
    data = {}
    
    ITR = jobmap(dedupe_item, tqdm.tqdm(ABR.items()), True)
    for result in ITR:

        # Only add the most common result
        max_val,max_item = 0, None
        total_counts = 0
        for item in result:
            current_val = ABR[item]
            total_counts += current_val
            if current_val > max_val:
                max_val = current_val
                max_item = item
                
        data[(' '.join(max_item[0]), max_item[1])] = total_counts

    ABR = collections.Counter(data)

    return ABR
    def cluster_affinity_states(self, INPUT_ITR, size=0):

        func = compute_local_affinity
        ITR = jobmap(func, INPUT_ITR, self.PARALLEL)
        
        Z = []

        pbar = tqdm.tqdm(total=size//self.batch_size)
            
        for result in ITR:
            V,z_labels = result
            
            for i in np.unique(z_labels):
                z = V[i==z_labels].mean(axis=0)
                z /= np.linalg.norm(z)
                Z.append(z)
                
            pbar.update()
            
        pbar.close()
        
        return np.array(Z)
    def cluster_affinity_states(self, INPUT_ITR, size=0):

        func = compute_local_affinity
        ITR = jobmap(func, INPUT_ITR, self.PARALLEL)

        Z = []

        pbar = tqdm.tqdm(total=size // self.batch_size)

        for result in ITR:
            V, z_labels = result

            for i in np.unique(z_labels):
                z = V[i == z_labels].mean(axis=0)
                z /= np.linalg.norm(z)
                Z.append(z)

            pbar.update()

        pbar.close()

        return np.array(Z)
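# The loop above collapses each batch into unit-normalized cluster centroids:
# every worker returns a matrix V of vectors plus a label per row, and each
# label's rows are averaged and projected onto the unit sphere. The core step
# in isolation, on made-up data:
import numpy as np

V = np.random.rand(6, 4)                 # six vectors in one batch
z_labels = np.array([0, 0, 1, 1, 1, 2])  # cluster label for each row

Z = []
for i in np.unique(z_labels):
    z = V[i == z_labels].mean(axis=0)    # centroid of cluster i
    z /= np.linalg.norm(z)               # unit-normalize
    Z.append(z)
Z = np.array(Z)                          # shape (3, 4): one row per cluster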
def csv_iterator(f_csv, clean=True, _PARALLEL=False):
    '''
    Creates an iterator over a CSV file, optionally cleans it.

    Args
        f_csv (str): Filename of the csv to open and iterate over
        clean (bool): Set whether to clean the csv file
        _PARALLEL (bool): Set whether the iterator should be run in parallel
    '''

    with open(f_csv) as FIN:
        CSV = csv.DictReader(FIN)

        if clean and _PARALLEL:
            CSV = jobmap(clean_row, CSV, FLAG_PARALLEL=_PARALLEL)
        elif clean and not _PARALLEL:
            CSV = map(clean_row, CSV)

        try:
            for row in CSV:
                yield row
        except Exception:
            # A read/parse error silently ends the iteration
            pass
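# Hypothetical usage of csv_iterator; "documents.csv" is an invented file
# name and the function is assumed to be importable from this module.
for k, row in enumerate(csv_iterator("documents.csv", clean=False)):
    print(row)   # each row is a dict keyed by the CSV header
    if k >= 2:   # peek at the first few rows only
        break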
def phrases_from_config(config):

    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)
    ABR = collections.Counter()

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, target_column, progress_bar=True)
    ITR = jobmap(func_parenthetical, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABR.update(result)

    msg = "\n{} total abbrs found."
    print(msg.format(len(ABR)))

    # Merge abbreviations that are similar
    print("Deduping abbr list.")
    df = dedupe_abbr(ABR)
    print("{} abbrs remain after deduping".format(len(df)))

    # Output top phrase
    print("Top 5 abbreviations")
    print(df[:5])

    mkdir(output_dir)
    f_csv = os.path.join(output_dir,
                         config["phrase_identification"]["f_abbreviations"])
    df.to_csv(f_csv)
    FILE_COL_ITR = itertools.product(F_SQL, target_columns)

    for f_sql,column_name in FILE_COL_ITR:

        conn = sqlite3.connect(f_sql,check_same_thread=False)

        INPUT_ITR = dfunc(column_name,
                          input_table,
                          conn,
                          limit=global_limit,
                          offset=global_offset,
                          progress_bar=True,
        )

        ITR = jobmap(evaluate_document, INPUT_ITR, _PARALLEL)

        for result in ITR:
            ABR.update(result)

        msg = "Completed {} {}. {} total abbrs found."
        print(msg.format(f_sql, column_name, len(ABR)))

    # Merge abbreviations that are similar
    print("Deduping list")
    ABR = dedupe_abbr(ABR)
    print("{} abbrs remain after deduping".format(len(ABR)))


    # Convert abbrs to a list
    data_insert = [(phrase,abbr,count) 
    _PARALLEL = config.as_bool("_PARALLEL")
    output_dir = config["phrase_identification"]["output_data_directory"]

    target_column = config["target_column"]

    import_config = config["import_data"]
    input_data_dir = import_config["output_data_directory"]

    F_CSV = grab_files("*.csv", input_data_dir)

    ABR = collections.Counter()
    P = parenthesis_nester()

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, target_column, progress_bar=True)
    ITR = jobmap(evaluate_document, INPUT_ITR, _PARALLEL, col=target_column)

    for result in ITR:
        ABR.update(result)

    msg = "\n{} total abbrs found."
    print(msg.format(len(ABR)))

    # Merge abbreviations that are similar
    print("Deduping abbr list.")
    ABR = dedupe_abbr(ABR)
    print("{} abbrs remain after deduping".format(len(ABR)))

    # Convert abbrs to a list
    data_insert = [(phrase, abbr, count)
                   for (phrase, abbr), count in ABR.most_common()]
def parse_from_config(config):

    _PARALLEL = config.as_bool("_PARALLEL")

    import_config = config["import_data"]
    parse_config = config["parse"]

    input_data_dir = import_config["output_data_directory"]
    output_dir = parse_config["output_data_directory"]

    mkdir(output_dir)

    for name in parse_config["pipeline"]:
        obj = getattr(nlpre, name)

        # Load any kwargs in the config file
        kwargs = {}
        if name in parse_config:
            kwargs = dict(parse_config[name])

        # Handle the special case of the precomputed acronyms
        if name == "replace_acronyms":
            f_abbr = os.path.join(
                config["phrase_identification"]["output_data_directory"],
                config["phrase_identification"]["f_abbreviations"])
            ABBR = load_phrase_database(f_abbr)
            kwargs["counter"] = ABBR

        parser_functions.append(obj(**kwargs))

    col = config["target_column"]
    F_CSV = grab_files("*.csv", input_data_dir)

    dfunc = db_utils.CSV_database_iterator
    INPUT_ITR = dfunc(F_CSV, col, include_filename=True, progress_bar=False)

    ITR = jobmap(
        dispatcher,
        INPUT_ITR,
        _PARALLEL,
        batch_size=_global_batch_size,
        target_column=col,
    )

    F_CSV_OUT = {}
    F_WRITERS = {}

    for k, row in enumerate(ITR):
        f = row.pop("_filename")

        # Create a CSV file object for all outputs
        if f not in F_CSV_OUT:
            f_csv_out = os.path.join(output_dir, os.path.basename(f))

            F = open(f_csv_out, 'w', newline='')
            F_CSV_OUT[f] = F
            F_WRITERS[f] = csv.DictWriter(F, fieldnames=['_ref', col])
            F_WRITERS[f].writeheader()

        F_WRITERS[f].writerow(row)

    # Close the open files
    for F in F_CSV_OUT.values():
        F.close()
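# The loop above builds the parser chain purely from configuration: each name
# listed under parse_config["pipeline"] must be an attribute of nlpre, and a
# same-named config sub-section supplies its keyword arguments. A hand-rolled
# equivalent might look like this (class names other than replace_acronyms
# are illustrative placeholders):
import nlpre

parser_functions = [
    nlpre.dedash(),                        # illustrative pipeline entry
    nlpre.replace_acronyms(counter=ABBR),  # ABBR comes from the phrase step
]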
            conn_out.execute("DROP TABLE {}".format(target_col))
        
        
        print("Parsing {}:{}".format(f_sql, target_col))

        args = {
            "column_name":target_col,
            "table_name":import_config["output_table"],
            "conn":conn,
            "limit":global_limit,
            "progress_bar":True,
        }
            
        INPUT_ITR = database_iterator(**args)

        ITR = jobmap(dispatcher, INPUT_ITR, _PARALLEL)

        cmd_create = '''
        DROP TABLE IF EXISTS {table_name};
        CREATE TABLE IF NOT EXISTS {table_name} (
        _ref INTEGER PRIMARY KEY,
        text STRING,
        meta STRING
        );
        '''.format(table_name=target_col)
        
        conn_out.executescript(cmd_create)

        cmd_insert = '''
        INSERT INTO {table_name} (_ref,text,meta)
        VALUES (?,?,?)
def categorical_predict(
    X, y_org, method_name, n_estimators=50, use_SMOTE=False, use_PARALLEL=True
):

    # Make sure the sizes match
    msg = "X shape {}, y_org shape {} (mismatch!)"
    assert X.shape[0] == y_org.shape[0], msg.format(X.shape[0], y_org.shape[0])

    enc = LabelEncoder()
    y = enc.fit_transform(y_org)

    label_n = np.unique(y).shape[0]
    # msg = "[{}] number of unique entries in [y {}]: {}"
    # logger.info(msg.format(method_name, X.shape, label_n))

    if use_SMOTE:
        logger.info("  Adjusting class balance using SMOTE")

    clf_args = {
        "n_jobs": -1 if use_PARALLEL else 1,
        "n_estimators": n_estimators,
    }

    skf = StratifiedKFold(n_splits=10, shuffle=False).split(X, y)
    scores = []
    F1_scores = []

    INPUT_ITR = ((clf_args, idx, X, y, use_SMOTE) for idx in skf)

    ITR = jobmap(clf_extratree_predictor, INPUT_ITR, True)

    error_counts = np.zeros(y.size, dtype=float)
    predict_scores = np.zeros([y.size, label_n], dtype=float)

    df = pd.DataFrame(index=range(y.shape[0]))
    df["y_truth"] = y
    df[method_name] = -1

    for result in ITR:
        idx, pred, pred_proba = result
        train_index, test_index = idx

        y_test = y[test_index]

        errors = y_test != pred

        scores.append(1 - errors.mean())
        error_counts[test_index[errors]] += 1

        F1_scores.append(f1_score(y_test, pred))
        predict_scores[test_index] = pred_proba

        df.loc[test_index, method_name] = pred

    # Make sure all items have been scored
    assert not (df[method_name] == -1).any()

    # For StratifiedKFold, each test set is hit only once
    # so normalization is simple
    error_counts /= 1.0

    return (
        np.array(scores),
        np.array(F1_scores),
        error_counts,
        predict_scores,
        df,
    )
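# Hypothetical call on synthetic data; make_classification only fabricates
# example inputs and is not part of the pipeline. The returned DataFrame df
# holds the encoded ground truth and the out-of-fold predictions per item.
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=20, random_state=0)

scores, F1_scores, error_counts, predict_scores, df = categorical_predict(
    X, y, method_name="demo", n_estimators=50, use_SMOTE=False,
    use_PARALLEL=True)

print("mean accuracy:", np.mean(scores))
print("mean F1:", np.mean(F1_scores))
print(df.head())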