Example #1
def single_file_item_iterator(
        f_csv,
        config=None,
        section='parse',
        progress_bar=False,
        text_column=None,
        include_filename=True,
):
    '''
    Iterates over a single file
    '''

    if config is None:
        config = simple_config.load()

    # Make sure the requested file exists in the given section
    assert f_csv in get_section_filenames(section)

    INPUT_ITR = CSV_database_iterator(
        [f_csv],
        config["target_column"],
        progress_bar=progress_bar,
        include_filename=include_filename,
    )

    for row in INPUT_ITR:
        if text_column is not None:
            row['text'] = row[text_column]
        yield row
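
A minimal usage sketch for the iterator above (assuming the 'parse' step has already written CSV output; get_section_filenames and the row keys come from the pipeline's own configuration):

# Usage sketch: iterate the first parsed CSV file and inspect one row.
f_csv = get_section_filenames('parse')[0]
for row in single_file_item_iterator(f_csv, progress_bar=True):
    print(row)  # a dict keyed by the configured columns for one document
    break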
Example #2
def load_document_vectors():
    config_score = simple_config.load()["score"]
    config_MC = simple_config.load()["metacluster"]

    score_method = config_MC['score_method']

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score['document_scores']["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:
        g = h5[score_method]

        # Load the _refs
        _refs = g["_ref"][:]

        # Require the _refs to be in order as a sanity check
        if not (np.sort(_refs) == _refs).all():
            msg = "WARNING, data out of sort order from _refs"
            raise ValueError(msg)

        docv = g["V"][:]

        return {"docv": docv, "_refs": _refs}
Example #3
def load_document_vectors():
    config_score = simple_config.load("score")
    config_MC = simple_config.load("metacluster")

    score_method = config_MC['score_method']
    text_column  = config_MC['score_column']
    
    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score['document_scores']["f_db"],
    )

    with h5py.File(f_h5,'r') as h5:
        g = h5[score_method][text_column]
        corpus_keys = g.keys()

        # Load the _refs
        _refs = np.hstack([g[key]["_ref"][:] for key in corpus_keys])
        
        # Require the _refs to be in order as a sanity check
        if not (np.sort(_refs) == _refs).all():
            msg = "WARNING, data out of sort order from _refs"
            raise ValueError(msg)
        
        docv = np.vstack([g[k]["V"][:] for k in corpus_keys])

        return {
            "docv" : docv,
            "_refs": _refs
        }
Example #4
    def __init__(self, *args, **kwargs):
        '''
        Computes various measures of central tendency of a document.
        For Z_X scores, the raw word tokens are summed over the partition
        function. For I_X scores, the same statistics are computed over
        the similarity of all word pairs for words with top 10% Z values.
        This will precompute the partition function if it doesn't exist.
        '''
        cfg_embed = simple_config.load()["embedding"]
        cfg_score = simple_config.load()["score"]

        f_w2v = os.path.join(
            cfg_embed["output_data_directory"],
            cfg_embed["w2v_embedding"]["f_db"],
        )

        f_partition_function = os.path.join(
            cfg_embed["output_data_directory"],
            cfg_score["document_log_probability"]["f_partition_function"],
        )

        if not os.path.exists(f_partition_function):
            self.create_partition_function(f_w2v, f_partition_function)

        self.Z = self.load_partition_function(f_partition_function)
        self.scores = []

        val = cfg_score["document_log_probability"]["intra_document_cutoff"]
        self.intra_document_cutoff = float(val)

        self.model = load_w2vec()
Example #5
    def __init__(self, *args, **kwargs):
        super(generic_document_score, self).__init__(*args, **kwargs)

        # Load the model from disk
        self.M = load_w2vec()
        self.shape = self.M.wv.syn0.shape

        # Build the dictionary
        vocab_n = self.shape[0]
        self.word2index = dict(zip(self.M.wv.index2word, range(vocab_n)))

        # Set parallel option (currently does nothing)
        # self._PARALLEL = kwargs["_PARALLEL"]

        if "negative_weights" in kwargs:
            NV = []
            for word, weight in kwargs["negative_weights"].items():

                if not self.check_word_vector(word):
                    msg = "Negative weight word '{}' not found in dictionary"
                    print(msg.format(word))
                    continue

                vec = self.get_word_vector(word)
                scale = np.exp(-float(weight) * self.M.wv.syn0.dot(vec))

                # Don't oversample, max out weights to unity
                scale[scale > 1] = 1.0
                NV.append(scale)

            self.negative_weights = np.array(NV).T.sum(axis=1)

        else:
            self.negative_weights = np.ones(vocab_n, dtype=float)

        # Save the target column to compute
        self.target_column = simple_config.load()["target_column"]

        # Make sure nothing has been set yet
        self.V = self._ref = None

        # Set the variables for reduced representation
        config_score = simple_config.load()["score"]
        self.compute_reduced = config_score["compute_reduced_representation"]

        if self.compute_reduced:
            sec = config_score['reduced_representation']
            self.reduced_n_components = sec['n_components']

        self.h5py_args = {"compression": "gzip"}
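
A standalone sketch of the negative-weighting step above, using toy arrays in place of the trained embedding (every name here is illustrative, not part of the pipeline):

import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(1000, 50))  # stands in for M.wv.syn0
neg_vec = rng.normal(size=50)             # stands in for one negative word's vector
weight = 2.0                              # stands in for its configured weight

scale = np.exp(-weight * embeddings.dot(neg_vec))  # down-weight words similar to neg_vec
scale[scale > 1] = 1.0                             # don't oversample: cap weights at unity
print(scale.min(), scale.max())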
Example #6
def load_ORG_data(extra_columns=None):
    print("Loading import data")

    cols = [
        "_ref",
    ]

    if extra_columns is not None:
        cols += extra_columns

    config_import = simple_config.load()["import_data"]

    # Load the input columns
    F_CSV = grab_files("*.csv", config_import["output_data_directory"])

    ITR = (pd.read_csv(f, usecols=cols) for f in F_CSV)
    df = pd.concat(list(ITR))

    # Require the _refs to be in order as a sanity check
    if not (np.sort(df._ref) == df._ref).all():
        msg = "WARNING, data out of sort order from _refs"
        raise ValueError(msg)

    df = df.set_index('_ref')
    df['_ref'] = df.index

    return df
Example #7
def load_document_vectors(score_method, use_reduced=False):
    config_score = simple_config.load()["score"]

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score['document_scores']["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:

        assert (score_method in h5)
        g = h5[score_method]

        _refs = np.hstack([g[k]["_ref"][:] for k in g.keys()])

        vector_key = "VX" if use_reduced else "V"
        X = np.vstack([g[k][vector_key][:] for k in g.keys()])

        assert (X.shape[0] == _refs.size)

        # Sort to the proper order
        sort_idx = np.argsort(_refs)
        _refs = _refs[sort_idx]
        X = X[sort_idx]

    return {"docv": X, "_refs": _refs}
Example #8
def load_ORG_data(extra_columns=None):
    print("Loading import data")

    cols = [
        "_ref",
    ]

    if extra_columns is not None:
        cols += extra_columns

    config = simple_config.load()
    config_import = config["import_data"]

    CORES = -1 if config["_PARALLEL"] else 1

    # Load the input columns
    F_CSV = grab_files("*.csv", config_import["output_data_directory"])

    with joblib.Parallel(CORES) as MP:
        func = joblib.delayed(simple_CSV_read)
        data = MP(func(x, cols) for x in F_CSV)

    # Require the _refs to be in order
    df = pd.concat(data).sort_values('_ref').set_index('_ref')

    # Use _ref as an index, but keep it as a row
    df['_ref'] = df.index

    return df
Example #9
def main():
    args = docopt(__doc__)
    config = simple_config.load()

    if args["import_data"]:
        import_data_from_config(config)
        phrases_from_config(config)

    if args["parse"]:
        parse_from_config(config)

    if args["embed"]:
        embed_from_config(config)

    if args["score"]:
        score_from_config(config)

    if args["predict"]:
        predict_from_config(config)

    if args["metacluster"]:
        metacluster_from_config(config)

    if args["analyze"]:
        func = args["<target_function>"]
        if func == 'metacluster':
            analyze_metacluster_from_config(config)
        else:
            raise KeyError("Analyze Function {} not known".format(func))
Example #10
def load_ORG_data(extra_columns=None):
    """
    DOCUMENTATION_UNKNOWN
    """
    logger.info("Loading original data")

    cols = []

    if extra_columns is not None:
        cols += extra_columns

    config = simple_config.load()
    config_import = config["import_data"]

    CORES = -1 if config["_PARALLEL"] else 1

    # Load the input columns
    F_CSV_REF = grab_files("*.csv", config_import["output_data_directory"])
    F_CSV = grab_files("*.csv", config_import["input_data_directories"][0])

    with joblib.Parallel(CORES) as MP:
        func = joblib.delayed(simple_CSV_read)
        data = MP(func(x, cols) for x in F_CSV)
        _refs = MP(func(x, ["_ref"]) for x in F_CSV_REF)

    for df, df_refs in zip(data, _refs):
        df["_ref"] = df_refs["_ref"].values

    # Require the _refs to be in order
    df = pd.concat(data).sort_values("_ref").set_index("_ref")

    # Use _ref as an index, but keep it as a row
    df["_ref"] = df.index

    return df
Example #11
    def save_single(self):

        assert (self.V is not None)
        assert (self._ref is not None)

        # Set the size explicitly as a sanity check
        size_n, dim_V = self.V.shape

        config_score = simple_config.load()["score"]
        f_db = os.path.join(config_score["output_data_directory"],
                            config_score["document_scores"]["f_db"])

        h5 = touch_h5(f_db)
        g = h5.require_group(self.method)
        gx = g.require_group(self.current_filename)

        # Save the data array
        msg = "Saving {} {} ({})"
        print(msg.format(self.method, self.current_filename, size_n))

        for col in [
                "V", "_ref", "VX", "VX_explained_variance_ratio_",
                "VX_components_"
        ]:
            if col in gx:
                #print "  Clearing", self.method, self.current_filename, col
                del gx[col]

        gx.create_dataset("V", data=self.V, **self.h5py_args)
        gx.create_dataset("_ref", data=self._ref, **self.h5py_args)
Example #12
def load_metacluster_data(*args):

    config_metacluster = simple_config.load()["metacluster"]

    f_h5 = os.path.join(config_metacluster["output_data_directory"],
                        config_metacluster["f_centroids"])

    return load_h5_file(f_h5, *args)
Example #13
def item_iterator(
        config=None,
        randomize_file_order=False,
        whitelist=[],
        section='parse',
        progress_bar=False,
        text_column=None,
        include_filename=False,
):
    '''
    Iterates over the parsed corpus items and respects a given whitelist.
    '''

    if config is None:
        config = simple_config.load()

    input_data_dir = config['parse']["output_data_directory"]
    F_CSV = grab_files("*.csv", input_data_dir, verbose=False)

    if whitelist:
        assert(isinstance(whitelist, list))

        F_CSV2 = set()
        for f_csv in F_CSV:
            for token in whitelist:
                if token in f_csv:
                    F_CSV2.add(f_csv)
        F_CSV = F_CSV2

    # Randomize the order of the input files each time we get here
    if randomize_file_order:
        F_CSV = random.sample(sorted(F_CSV), len(F_CSV))

    INPUT_ITR = CSV_database_iterator(
        F_CSV,
        config["target_column"],
        progress_bar=progress_bar,
        include_filename=include_filename,
    )

    for row in INPUT_ITR:
        if text_column is not None:
            row['text'] = row[text_column]
        yield row
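
A minimal usage sketch (the whitelist token 'abstracts' is hypothetical; it keeps only parsed files whose name contains that substring):

rows = []
for row in item_iterator(whitelist=['abstracts'], randomize_file_order=True):
    rows.append(row)  # collect parsed rows from the matching files
print(len(rows))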
Example #14
def load_metacluster_data(*args):

    config_metacluster = simple_config.load("metacluster")

    f_h5 = os.path.join(
        config_metacluster["output_data_directory"],
        config_metacluster["f_centroids"])

    return load_h5_file(f_h5, *args)
Example #15
    def __init__(self):

        config = simple_config.load("metacluster")

        self.subcluster_m = int(config["subcluster_m"])
        self.subcluster_pcut = float(config["subcluster_pcut"])
        self.subcluster_repeats = int(config["subcluster_repeats"])
        self.subcluster_kn = int(config["subcluster_kn"])

        config_score = simple_config.load("score")

        self.f_h5_docvecs = os.path.join(
            config_score["output_data_directory"],
            config_score["document_scores"]["f_db"],
        )

        self.f_h5_centroids = os.path.join(
            config["output_data_directory"],
            config["f_centroids"],
        )

        score_method = config["score_method"]
        text_column = config["score_column"]

        self._load_data(self.f_h5_docvecs, score_method, text_column)
Example #16
def load_dispersion_data():
    print("Loading dispersion data")

    config_post = simple_config.load("postprocessing")

    f_h5 = os.path.join(
        config_post["output_data_directory"],
        "cluster_dispersion.h5")

    return load_h5_file(f_h5)
Example #17
def get_score_methods():
    config_score = simple_config.load()["score"]

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score['document_scores']["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:
        return h5.keys()
Example #18
def load_w2vec(config=None):
    if config is None:
        config = simple_config.load()

    config_embed = config["embedding"]

    f_w2v = os.path.join(
        config_embed["output_data_directory"],
        config_embed["w2v_embedding"]["f_db"],
    )

    return W2V.Word2Vec.load(f_w2v)
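
A minimal usage sketch (assumes the embedding step has been run; the query word is an assumed vocabulary entry and the exact attribute path depends on the gensim version):

M = load_w2vec()
print(M.wv.most_similar('patient', topn=5))  # nearest neighbors of an assumed word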
Example #19
def get_section_filenames(section='parse'):
    '''
    Grab filenames in given section of pipeline.

    Args:
        section (str): The section to grab the filenames (default: parse)

    Returns:
         list: files found in directory specified in config
    '''

    config = simple_config.load()
    input_data_dir = config[section]["output_data_directory"]
    return grab_files("*.csv", input_data_dir)
Example #20
def load_embeddings():
    '''
    Loads the gensim word embedding model.
    '''
    config = simple_config.load("embedding")
    
    from gensim.models.word2vec import Word2Vec

    f_w2v = os.path.join(
        config["output_data_directory"],
        config["w2v_embedding"]["f_db"],
    )

    return Word2Vec.load(f_w2v)
Example #21
    def __init__(self, *args, **kwargs):
        super(generic_document_score, self).__init__(*args, **kwargs)

        # Load the model from disk
        self.M = load_w2vec()
        self.shape = self.M.syn0.shape

        # Build the dictionary
        vocab_n = self.shape[0]
        self.word2index = dict(zip(self.M.index2word, range(vocab_n)))

        # Set parallel option (currently does nothing)
        # self._PARALLEL = kwargs["_PARALLEL"]

        # Load the negative weights
        if "negative_weights" in kwargs:
            neg_W = kwargs["negative_weights"]
            self.neg_W = dict((k, float(v)) for k, v in neg_W.items())
            self.neg_vec = dict((k, self.get_word_vector(k))
                                for k, v in neg_W.items())
        else:
            self.neg_W = {}
            self.neg_vec = {}

        # Save the target column to compute
        self.target_column = simple_config.load()["target_column"]

        # Make sure nothing has been set yet
        self.V = self._ref = None

        # Set the variables for reduced representation
        config_score = simple_config.load()["score"]
        self.compute_reduced = config_score["compute_reduced_representation"]

        if self.compute_reduced:
            sec = config_score['reduced_representation']
            self.reduced_n_components = sec['n_components']
Example #22
    def __init__(self):

        config = simple_config.load()["metacluster"]

        self.subcluster_m = int(config["subcluster_m"])
        self.subcluster_pcut = float(config["subcluster_pcut"])
        self.subcluster_repeats = int(config["subcluster_repeats"])
        self.subcluster_kn = int(config["subcluster_kn"])

        config_score = simple_config.load()["score"]

        self.f_h5_docvecs = os.path.join(
            config_score["output_data_directory"],
            config_score['document_scores']["f_db"],
        )

        self.f_h5_centroids = os.path.join(
            config["output_data_directory"],
            config["f_centroids"],
        )

        score_method = config['score_method']

        self._load_data(self.f_h5_docvecs, score_method)
Example #23
def get_score_methods():
    '''
    Determines which scoring methods to return for each document,
    based on what's set in the config file

    Returns:
         h5.keys(): DOCUMENTATION_UNKNOWN
    '''
    config_score = simple_config.load()["score"]

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:
        return h5.keys()
Example #24
def item_iterator(name, cmd_config=None):

    score_config = simple_config.load("parse")
    input_data_dir = score_config["output_data_directory"]

    F_SQL = glob.glob(os.path.join(input_data_dir,'*'))

    # If there is a whitelist, only keep the matching filenames
    try:
        whitelist = cmd_config["command_whitelist"].strip()
    except (TypeError, KeyError):
        whitelist = None
    if whitelist:
        assert isinstance(whitelist, list)

        F_SQL2 = set()
        for f_sql in F_SQL:
            for token in whitelist:
                if token in f_sql:
                    F_SQL2.add(f_sql)
        F_SQL = F_SQL2

    
    # Randomize the order of the input files
    F_SQL = random.sample(sorted(F_SQL), len(F_SQL))  
    DB_ITR = itertools.product(F_SQL, config["target_columns"])

    for f_sql, target_col in DB_ITR:

        #print ("Computing {}:{}".format(f_sql, target_col))
        
        conn = sqlite3.connect(f_sql, check_same_thread=False)

        args = {
            "column_name": "text",
            "table_name": target_col,
            "conn": conn,
            "limit": _global_limit,
            "shuffle": False,
            "include_table_name": True,
        }

        INPUT_ITR = database_iterator(**args)
        for item in INPUT_ITR:
            yield list(item) + [f_sql,]
Example #25
    def save(self):

        assert(self.V is not None)
        assert(self._ref is not None)

        # Set the size explicitly as a sanity check
        size_n, dim_V = self.V.shape

        # print "Saving the scored documents"
        config_score = simple_config.load()["score"]
        f_db = os.path.join(
            config_score["output_data_directory"],
            config_score["document_scores"]["f_db"]
        )

        h5 = touch_h5(f_db)

        # Clear the dataset if it already exists
        if self.method in h5:
            del h5[self.method]

        g = h5.require_group(self.method)

        # Save the data array
        print("Saving {} ({})".format(self.method, size_n))

        g.create_dataset("V", data=self.V, compression='gzip')
        g.create_dataset("_ref", data=self._ref)

        # Compute the reduced representation if required
        if self.compute_reduced:
            nc = self.reduced_n_components
            clf = IncrementalPCA(n_components=nc)

            msg = "Performing PCA on {}, ({})->({})"
            print(msg.format(self.method, self.V.shape[1], nc))

            VX = clf.fit_transform(self.V)
            g.create_dataset("VX", data=VX, compression='gzip')
            g.create_dataset("VX_explained_variance_ratio_",
                             data=clf.explained_variance_ratio_)
            g.create_dataset("VX_components_",
                             data=clf.components_)

        h5.close()
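
A standalone sketch of the reduced-representation step above, run on toy document vectors instead of the pipeline's scores:

import numpy as np
from sklearn.decomposition import IncrementalPCA

V = np.random.rand(500, 300)           # 500 toy document vectors of dimension 300
clf = IncrementalPCA(n_components=32)  # plays the role of reduced_n_components
VX = clf.fit_transform(V)              # reduced document vectors
print(VX.shape)                        # (500, 32)
print(clf.explained_variance_ratio_.sum())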
Example #26
    def __init__(self):

        config = simple_config.load()["metacluster"]

        self.subcluster_m = int(config["subcluster_m"])
        self.subcluster_pcut = float(config["subcluster_pcut"])
        self.subcluster_repeats = int(config["subcluster_repeats"])
        self.subcluster_kn = int(config["subcluster_kn"])

        self.f_h5_centroids = os.path.join(config["output_data_directory"],
                                           config["f_centroids"])

        score_method = config["score_method"]
        DV = uds.load_document_vectors(score_method)
        self._ref = DV["_refs"]
        self.docv = DV["docv"]

        self.N, self.dim = self.docv.shape
Example #27
def main():
    args = docopt(__doc__)
    config = simple_config.load()

    if args["import_data"]:
        from import_data import import_data_from_config

        import_data_from_config(config)

    elif args["phrase"]:
        from phrase import phrases_from_config

        phrases_from_config(config)

    if args["parse"]:
        from parse import parse_from_config

        parse_from_config(config)

    if args["embed"]:
        from embed import embed_from_config

        embed_from_config(config)

    if args["score"]:
        from score import score_from_config

        score_from_config(config)

    if args["predict"]:
        from predict import predict_from_config

        predict_from_config(config)

    if args["metacluster"]:
        from metacluster import metacluster_from_config

        metacluster_from_config(config)

    if args["analyze"]:

        import postprocessing.analyze_metaclusters as pam

        pam.analyze_metacluster_from_config(config)
Example #28
def load_document_vectors(score_method, use_reduced=False):
    '''
    Load the word2vec document vectors for each document from the h5 file
    saved in pipeline

    Args:
        score_method: string, score method to load
        use_reduced: boolean, flag to determine whether to use reduced
        dimension vectors, or the original vectors

    Returns:
        {"docv": X, "_refs": _refs}: dictionary, contains a list of document
        vectors and corresponding references
    '''

    config_score = simple_config.load()["score"]

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:

        assert(score_method in h5)
        g = h5[score_method]

        _refs = np.hstack([g[k]["_ref"][:] for k in g.keys()])

        vector_key = "VX" if use_reduced else "V"
        X = np.vstack([g[k][vector_key][:] for k in g.keys()])

        assert(X.shape[0] == _refs.size)

        # Sort to the proper order
        sort_idx = np.argsort(_refs)
        _refs = _refs[sort_idx]
        X = X[sort_idx]

    return {
        "docv": X,
        "_refs": _refs
    }
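
A minimal usage sketch ('unique_IDF' is a hypothetical score method name; it must already exist as a group in the saved h5 file):

DV = load_document_vectors('unique_IDF', use_reduced=False)
print(DV['docv'].shape)  # (n_documents, vector_dimension)
print(DV['_refs'][:5])   # sorted document references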
Example #29
def load_metacluster_data(*args):
    '''
    Load information on metaclusters from where they're saved in the pipeline

    Args:
        *args: DOCUMENTATION_UNKNOWN

    Returns:
        load_h5_file(f_h5, *args): the data on each cluster found in the
        h5 file
    '''

    config_metacluster = simple_config.load()["metacluster"]

    f_h5 = os.path.join(
        config_metacluster["output_data_directory"],
        config_metacluster["f_centroids"])

    return load_h5_file(f_h5, *args)
Example #30
def main():
    args = docopt(__doc__)
    config = simple_config.load()

    if args["import_data"]:
        from import_data import import_data_from_config
        import_data_from_config(config)

    elif args["phrase"]:
        from phrase import phrases_from_config
        phrases_from_config(config)

    if args["parse"]:
        from parse import parse_from_config
        parse_from_config(config)

    if args["embed"]:
        from embed import embed_from_config
        embed_from_config(config)

    if args["score"]:
        from score import score_from_config
        score_from_config(config)

    if args["predict"]:
        from predict import predict_from_config
        predict_from_config(config)

    if args["metacluster"]:
        from metacluster import metacluster_from_config
        metacluster_from_config(config)

    if args["analyze"]:

        func = args["<target_function>"]
        if func == 'metacluster':
            import postprocessing.analyze_metaclusters as pam
            pam.analyze_metacluster_from_config(config)
        elif func == 'LIME':
            import postprocessing.lime_explainer as le
            le.explain_metaclusters(config)
        else:
            raise KeyError("Analyze Function {} not known".format(func))
Example #31
def get_score_methods():
    """
    Determines which scoring methods to return for each document,
    based on what's set in the config file.

    Returns:
         h5.keys(): DOCUMENTATION_UNKNOWN
    """
    config_score = simple_config.load()["score"]

    f_h5 = os.path.join(config_score["output_data_directory"],
                        config_score["f_db"])

    if not os.path.exists(f_h5):
        raise FileNotFoundError(f_h5)

    with h5py.File(f_h5, "r") as h5:
        keys = list(h5.keys())

    return keys
Example #32
    def __init__(self, name, cmd_config=None, yield_single=False):

        # yield_single returns one item at a time,
        # not in chunks like (table_name, f_sql)
        
        self.yield_single = yield_single

        score_config = simple_config.load("parse")
        input_data_dir = score_config["output_data_directory"]

        F_SQL = sorted(glob.glob(os.path.join(input_data_dir,'*')))

        # If there is a whitelist, only keep the matching filenames
        try:
            whitelist = cmd_config["command_whitelist"].strip()
        except (TypeError, KeyError):
            whitelist = None
        if whitelist:
            assert isinstance(whitelist, list)

            F_SQL2 = set()
            for f_sql in F_SQL:
                for token in whitelist:
                    if token in f_sql:
                        F_SQL2.add(f_sql)
            F_SQL = F_SQL2

        # Randomize the order of the input files (why? not needed for scoring)
        # F_SQL = random.sample(sorted(F_SQL), len(F_SQL))

        DB_ITR = itertools.product(F_SQL, config["target_columns"])

        # Get database sizes for progress bar
        self.total_items = 0
        for f_sql, target_col in DB_ITR:
            conn = sqlite3.connect(f_sql, check_same_thread=False)
            self.total_items += count_rows(conn, target_col)
            conn.close()
        
        self.F_SQL = F_SQL
        self.config = config
Example #33
    def __init__(self, *args, **kwargs):
        super(score_simple, self).__init__(*args, **kwargs)

        f_db = os.path.join(kwargs['output_data_directory'],
                            kwargs['term_frequency']['f_db'])
        if not os.path.exists(f_db):
            msg = "{} not computed yet, needed for TF methods!"
            raise ValueError(msg.format(f_db))

        score_config = simple_config.load()["score"]
        f_csv = os.path.join(
            score_config["output_data_directory"],
            score_config["term_document_frequency"]["f_db"],
        )
        IDF = pd.read_csv(f_csv)
        IDF = dict(zip(IDF["word"].values, IDF["count"].values))
        self.corpus_N = IDF.pop("__pipeline_document_counter")

        # Compute the IDF
        for key in IDF:
            IDF[key] = np.log(float(self.corpus_N) / (IDF[key] + 1))
        self.IDF = IDF
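
A standalone sketch of the smoothed IDF formula used above, with toy document-frequency counts:

import numpy as np

corpus_N = 1000                                        # toy document count
counts = {'cell': 120, 'protein': 45, 'rare_term': 2}  # toy document frequencies
IDF = {w: np.log(float(corpus_N) / (c + 1)) for w, c in counts.items()}
print(IDF)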
Example #34
    def __init__(self, *args, **kwargs):
        '''
        The reduced representation takes an incremental PCA decomposition
        and adds new negative weights based off the previous components
        of PCA.
        '''

        # Remove the bias applied to negative_weights
        kwargs["negative_weights"] = {}

        super(reduced_representation, self).__init__(*args, **kwargs)

        config = simple_config.load()['score']
        f_db = os.path.join(
            config["output_data_directory"],
            config["document_scores"]["f_db"]
        )

        with h5py.File(f_db, 'r') as h5:

            # Make sure the column has a value
            col = config['reduced_representation']['rescored_command']
            assert(col in h5)

            # Make sure the VX has been computed
            assert("VX" in h5[col])
            c = h5[col]['VX_components_'][:]
            ex_var = h5[col]['VX_explained_variance_ratio_'][:]

        bais = config['reduced_representation']['bais_strength']

        self.word_vecs = {}
        for w in self.M.wv.index2word:
            weight = c.dot(self.M[w])
            weight *= bais
            weight *= ex_var
            adjust_v = (weight.reshape(-1, 1) * c).sum(axis=0)
            self.word_vecs[w] = self.M[w] - adjust_v
Example #35
    def compute_reduced_representation(self):

        if not self.compute_reduced:
            return None

        config_score = simple_config.load()["score"]
        f_db = os.path.join(config_score["output_data_directory"],
                            config_score["document_scores"]["f_db"])

        h5 = touch_h5(f_db)
        g = h5[self.method]

        keys = g.keys()
        V = np.vstack([g[x]["V"][:] for x in keys])
        sizes = [g[x]["_ref"].shape[0] for x in keys]

        nc = self.reduced_n_components
        clf = IncrementalPCA(n_components=nc)

        msg = "Performing PCA on {}, ({})->({})"
        print(msg.format(self.method, V.shape[1], nc))

        VX = clf.fit_transform(V)
        EVR = clf.explained_variance_ratio_
        COMPONENTS = clf.components_

        for key, size in zip(keys, sizes):

            # Take slices equal to the size
            vx, VX = VX[:size, :], VX[size:, :]
            evr, EVR = EVR[:size], EVR[size:]
            com, COMPONENTS = COMPONENTS[:size, :], COMPONENTS[size:, :]

            g[key].create_dataset("VX", data=vx, **self.h5py_args)
            g[key].create_dataset("VX_explained_variance_ratio_", data=evr)
            g[key].create_dataset("VX_components_", data=com)

        h5.close()
Example #36
def load_w2vec(config=None):
    """
    Loads gensim word2vec model saved in pipeline.

    Args:
        config: config file to get parameters from

    Returns:
        W2V.Word2Vec.load(f_w2v): gensim word2vec model
    """
    import gensim.models.word2vec as W2V

    if config is None:
        config = simple_config.load()

    config_embed = config["embed"]

    f_w2v = os.path.join(
        config_embed["output_data_directory"],
        config_embed["w2v_embedding"]["f_db"],
    )

    return W2V.Word2Vec.load(f_w2v)
Example #37
    """

    merge_columns = config["import_data"]["merge_columns"]

    if not isinstance(merge_columns, list):
        msg = "merge_columns (if used) must be a list"
        raise ValueError(msg)

    data_out = config["import_data"]["output_data_directory"]
    mkdir(data_out)

    # Require 'input_data_directories' to be a list
    data_in_list = config["import_data"]["input_data_directories"]
    if not isinstance(data_in_list, list):
        msg = "input_data_directories must be a list"
        raise ValueError(msg)

    target_column = config["target_column"]

    for d_in in data_in_list:
        import_directory_csv(d_in, data_out, target_column, merge_columns)


if __name__ == "__main__":

    import simple_config

    config = simple_config.load()

    import_data_from_config(config)
Example #38
            "column_name":"text",
            "table_name" :target_col,
            "conn":conn,
            "limit":_global_limit,
            "shuffle":False,
            "include_table_name":True,
        }

        INPUT_ITR = database_iterator(**args)
        for item in INPUT_ITR:
            yield list(item) + [f_sql,]

if __name__ == "__main__":

    import simple_config
    config = simple_config.load("embedding")
    _FORCE = config.as_bool("_FORCE")

    mkdir(config["output_data_directory"])
    
    ###########################################################
    # Run the functions that act globally on the data

    for name in config["embedding_commands"]:
        obj = getattr(mb, name)

        # Load any kwargs in the config file
        kwargs = config
        if name in config:
            kwargs.update(config[name])
            
Example #39
    for f in parser_functions:
        result = f(text)
        text = unicode(result)

        if hasattr(result, "meta"):
            meta.update(result.meta)

    # Convert the meta information into a unicode string for serialization
    meta = unicode(meta)

    return idx, text, meta

if __name__ == "__main__":

    import simple_config
    config = simple_config.load("parse")
    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")

    import_config = simple_config.load("import_data")
    input_data_dir = import_config["output_data_directory"]
    output_dir = config["output_data_directory"]

    import_column = import_config["output_table"]

    mkdir(output_dir)

    # Fill the pipeline with function objects
    parser_functions = []
    for name in config["pipeline"]:
        obj = getattr(pre, name)
Example #40
        for i in range(n_clusters):
            v = meta_clusters[i]

            dist = W.syn0.dot(v)
            idx = np.argsort(dist)[::-1][:10]

            words = [W.index2word[i].replace("PHRASE_", "") for i in idx]

            all_words.append(u" ".join(words))

        return np.array(all_words)


if __name__ == "__main__":

    config = simple_config.load("metacluster")

    os.system("mkdir -p {}".format(config["output_data_directory"]))

    CO = cluster_object()
    f_h5 = CO.f_h5_centroids

    if not os.path.exists(f_h5):
        h5 = h5py.File(f_h5, "w")
        h5.close()

    h5 = h5py.File(f_h5, "r+")

    keys = ["subcluster_kn", "subcluster_pcut", "subcluster_m", "subcluster_repeats"]
    args = dict([(k, config[k]) for k in keys])
Example #41
                    val = list(item) + [f_sql,]
                    data.append(val)

            if self.yield_single:
                for item in INPUT_ITR:
                    val = list(item) + [f_sql,]
                    yield val
                    progress_bar.update()

            if not self.yield_single:
                yield data

if __name__ == "__main__":

    import simple_config
    config = simple_config.load("score")
    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")

    n_jobs = -1 if _PARALLEL else 1

    mkdir(config["output_data_directory"])

    ###########################################################
    # Fill the pipeline with function objects

    mapreduce_functions = []
    for name in config["mapreduce_commands"]:

        obj = getattr(ds, name)
Example #42
        n_data_items = len(df)
        df["_ref"] = [next(_ref_counter)
                      for _ in range(n_data_items)]
        df.set_index("_ref", inplace=True)

        df.to_sql(output_table,
                  engine,
                  if_exists='replace')

        print("Finished {}, {}, {}".format(f_csv, len(df), list(df.columns)))


if __name__ == "__main__":

    import simple_config
    config = simple_config.load("import_data")
    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")

    data_out = config["output_data_directory"]
    output_table = config["output_table"]

    # Require `input_data_directories` to be a list
    data_in_list = config["input_data_directories"]
    assert isinstance(data_in_list, list)

    for d_in in data_in_list:
        import_directory_csv(d_in, data_out, output_table)


Example #43
    if config["command_whitelist"]:
        keys = [k for k in keys if k in config["command_whitelist"]]
        print("Only computing over", keys)

    X = np.vstack([g[key]["V"] for key in keys])
    h5_score.close()

    return X



if __name__ == "__main__":

    import simple_config   

    config = simple_config.load("cluster")
    output_dir = config["output_data_directory"]
    mkdir(output_dir)

    method = config['score_method']
    target_column = config['score_column']
    
    f_sim = os.path.join(output_dir, config["f_cluster"])

    if config.as_bool("_FORCE"):
        try:
            os.remove(f_sim)
        except OSError:
            pass

    if not os.path.exists(f_sim):
Example #44
            total_counts += current_val
            if current_val > max_val:
                max_val = current_val
                max_item = item
                
        data[(' '.join(max_item[0]), max_item[1])] = total_counts

    ABR = collections.Counter(data)

    return ABR
    

if __name__ == "__main__":

    import simple_config
    config = simple_config.load("phrase_identification")
    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")
    output_dir = config["output_data_directory"]

    target_columns = config["target_columns"]

    import_config = simple_config.load("import_data")
    input_data_dir = import_config["output_data_directory"]
    input_table = import_config["output_table"]
    
    F_SQL = grab_files("*.sqlite", input_data_dir)

    ABR = collections.Counter()
    P = parenthesis_nester()
Example #45
import numpy as np
import pandas as pd
import h5py
import os, glob, itertools, collections
from sqlalchemy import create_engine

from predictions import categorical_predict

ERROR_MATRIX = {}
PREDICTIONS = {}

if __name__ == "__main__":

    import simple_config

    config = simple_config.load("predict")
    score_config = simple_config.load("score")
    import_config = simple_config.load("import_data")

    # For now, we can only deal with one column using meta!
    assert len(config["categorical_columns"]) == 1

    f_h5 = os.path.join(
        score_config["output_data_directory"],
        score_config["document_scores"]["f_db"],
    )

    h5 = h5py.File(f_h5, "r")

    methods = h5.keys()
    pred_dir = import_config["output_data_directory"]

    input_glob = os.path.join(pred_dir, "*")
    input_files = glob.glob(input_glob)
Example #46
        if i == j:
            d = pdist(X[labels == i], metric='cosine')
        else:
            d = cdist(X[labels == i], X[labels == j], metric='cosine')
            # Only take upper diagonal (+diagonal elements)
            d = d[np.triu_indices(n=d.shape[0], m=d.shape[1], k=0)]

        dist[i, j] = dist[j, i] = d.mean()

    return dist

if __name__ == "__main__" and __package__ is None:

    import simple_config
    config = simple_config.load("postprocessing")

    save_dest = config['output_data_directory']
    os.system('mkdir -p {}'.format(save_dest))

    SQL = load_SQL_data(config["master_columns"])

    MC = load_metacluster_data()
    C = MC["meta_centroids"]
    counts = collections.Counter(MC["meta_labels"])

    DV = load_document_vectors()

    # Build the results for the metaclusters
    labels = np.unique(MC["meta_labels"])