def single_file_item_iterator(
        f_csv,
        config=None,
        section='parse',
        progress_bar=False,
        text_column=None,
        include_filename=True,
):
    '''
    Iterates over a single parsed file.
    '''
    if config is None:
        config = simple_config.load()

    # Make sure the file we requested exists
    assert(f_csv in get_section_filenames(section))

    INPUT_ITR = CSV_database_iterator(
        [f_csv],
        config["target_column"],
        progress_bar=progress_bar,
        include_filename=include_filename,
    )

    for row in INPUT_ITR:
        if text_column is not None:
            row['text'] = row[text_column]
        yield row
def load_document_vectors():
    config_score = simple_config.load()["score"]
    config_MC = simple_config.load()["metacluster"]
    score_method = config_MC['score_method']

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score['document_scores']["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:
        g = h5[score_method]

        # Load the _refs
        _refs = g["_ref"][:]

        # Require the _refs to be in order as a sanity check
        if not (np.sort(_refs) == _refs).all():
            msg = "WARNING, data out of sort order from _refs"
            raise ValueError(msg)

        docv = g["V"][:]

    return {"docv": docv, "_refs": _refs}
def load_document_vectors():
    config_score = simple_config.load("score")
    config_MC = simple_config.load("metacluster")

    score_method = config_MC['score_method']
    text_column = config_MC['score_column']

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score['document_scores']["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:
        g = h5[score_method][text_column]
        corpus_keys = g.keys()

        # Load the _refs
        _refs = np.hstack([g[key]["_ref"][:] for key in corpus_keys])

        # Require the _refs to be in order as a sanity check
        if not (np.sort(_refs) == _refs).all():
            msg = "WARNING, data out of sort order from _refs"
            raise ValueError(msg)

        docv = np.vstack([g[k]["V"][:] for k in corpus_keys])

    return {"docv": docv, "_refs": _refs}
def __init__(self, *args, **kwargs):
    '''
    Computes various measures of central tendency of a document.
    For Z_X scores, the raw word tokens are summed over the partition
    function. For I_X scores, the same statistics are computed over the
    similarity of all word pairs for words with top 10% Z values.
    This will precompute the partition function if it doesn't exist.
    '''
    cfg_embed = simple_config.load()["embedding"]
    cfg_score = simple_config.load()["score"]

    f_w2v = os.path.join(
        cfg_embed["output_data_directory"],
        cfg_embed["w2v_embedding"]["f_db"],
    )

    f_partition_function = os.path.join(
        cfg_embed["output_data_directory"],
        cfg_score["document_log_probability"]["f_partition_function"],
    )

    if not os.path.exists(f_partition_function):
        self.create_partition_function(f_w2v, f_partition_function)

    self.Z = self.load_partition_function(f_partition_function)
    self.scores = []

    val = cfg_score["document_log_probability"]["intra_document_cutoff"]
    self.intra_document_cutoff = float(val)

    self.model = load_w2vec()
def __init__(self, *args, **kwargs):
    super(generic_document_score, self).__init__(*args, **kwargs)

    # Load the model from disk
    self.M = load_w2vec()
    self.shape = self.M.wv.syn0.shape

    # Build the dictionary
    vocab_n = self.shape[0]
    self.word2index = dict(zip(self.M.wv.index2word, range(vocab_n)))

    # Set parallel option (currently does nothing)
    # self._PARALLEL = kwargs["_PARALLEL"]

    if "negative_weights" in kwargs:
        NV = []
        for word, weight in kwargs["negative_weights"].items():

            if not self.check_word_vector(word):
                msg = "Negative weight word '{}' not found in dictionary"
                print(msg.format(word))
                continue

            vec = self.get_word_vector(word)
            scale = np.exp(-float(weight) * self.M.wv.syn0.dot(vec))

            # Don't oversample, max out weights to unity
            scale[scale > 1] = 1.0

            NV.append(scale)

        self.negative_weights = np.array(NV).T.sum(axis=1)
    else:
        self.negative_weights = np.ones(vocab_n, dtype=float)

    # Save the target column to compute
    self.target_column = simple_config.load()["target_column"]

    # Make sure nothing has been set yet
    self.V = self._ref = None

    # Set the variables for reduced representation
    config_score = simple_config.load()["score"]
    self.compute_reduced = config_score["compute_reduced_representation"]

    if self.compute_reduced:
        sec = config_score['reduced_representation']
        self.reduced_n_components = sec['n_components']

    self.h5py_args = {"compression": "gzip"}
def load_ORG_data(extra_columns=None):
    print("Loading import data")

    cols = ["_ref", ]

    if extra_columns is not None:
        cols += extra_columns

    config_import = simple_config.load()["import_data"]

    # Load the input columns
    F_CSV = grab_files("*.csv", config_import["output_data_directory"])
    ITR = (pd.read_csv(f, usecols=cols) for f in F_CSV)
    df = pd.concat(list(ITR))

    # Require the _refs to be in order as a sanity check
    if not (np.sort(df._ref) == df._ref).all():
        msg = "WARNING, data out of sort order from _refs"
        raise ValueError(msg)

    df = df.set_index('_ref')
    df['_ref'] = df.index

    return df
def load_document_vectors(score_method, use_reduced=False):
    config_score = simple_config.load()["score"]

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score['document_scores']["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:
        assert(score_method in h5)
        g = h5[score_method]

        _refs = np.hstack([g[k]["_ref"][:] for k in g.keys()])

        vector_key = "VX" if use_reduced else "V"
        X = np.vstack([g[k][vector_key][:] for k in g.keys()])

        assert(X.shape[0] == _refs.size)

    # Sort to the proper order
    sort_idx = np.argsort(_refs)
    _refs = _refs[sort_idx]
    X = X[sort_idx]

    return {"docv": X, "_refs": _refs}
def load_ORG_data(extra_columns=None):
    print("Loading import data")

    cols = ["_ref", ]

    if extra_columns is not None:
        cols += extra_columns

    config = simple_config.load()
    config_import = config["import_data"]
    CORES = -1 if config["_PARALLEL"] else 1

    # Load the input columns
    F_CSV = grab_files("*.csv", config_import["output_data_directory"])

    with joblib.Parallel(CORES) as MP:
        func = joblib.delayed(simple_CSV_read)
        data = MP(func(x, cols) for x in F_CSV)

    # Require the _refs to be in order
    df = pd.concat(data).sort_values('_ref').set_index('_ref')

    # Use _ref as an index, but keep it as a column too
    df['_ref'] = df.index

    return df
def main():
    args = docopt(__doc__)
    config = simple_config.load()

    if args["import_data"]:
        import_data_from_config(config)
        phrases_from_config(config)

    if args["parse"]:
        parse_from_config(config)

    if args["embed"]:
        embed_from_config(config)

    if args["score"]:
        score_from_config(config)

    if args["predict"]:
        predict_from_config(config)

    if args["metacluster"]:
        metacluster_from_config(config)

    if args["analyze"]:
        func = args["<target_function>"]

        if func == 'metacluster':
            analyze_metacluster_from_config(config)
        else:
            raise KeyError("Analyze Function {} not known".format(func))
def load_ORG_data(extra_columns=None):
    """
    Load the original imported CSV data and attach the `_ref` identifiers
    generated by the pipeline.
    """
    logger.info("Loading original data")

    cols = []
    if extra_columns is not None:
        cols += extra_columns

    config = simple_config.load()
    config_import = config["import_data"]
    CORES = -1 if config["_PARALLEL"] else 1

    # Load the input columns
    F_CSV_REF = grab_files("*.csv", config_import["output_data_directory"])
    F_CSV = grab_files("*.csv", config_import["input_data_directories"][0])

    with joblib.Parallel(CORES) as MP:
        func = joblib.delayed(simple_CSV_read)
        data = MP(func(x, cols) for x in F_CSV)
        _refs = MP(func(x, ["_ref"]) for x in F_CSV_REF)

    for df, df_refs in zip(data, _refs):
        df["_ref"] = df_refs["_ref"].values

    # Require the _refs to be in order
    df = pd.concat(data).sort_values("_ref").set_index("_ref")

    # Use _ref as an index, but keep it as a column too
    df["_ref"] = df.index

    return df
def save_single(self):
    assert(self.V is not None)
    assert(self._ref is not None)

    # Set the size explicitly as a sanity check
    size_n, dim_V = self.V.shape

    config_score = simple_config.load()["score"]
    f_db = os.path.join(
        config_score["output_data_directory"],
        config_score["document_scores"]["f_db"],
    )

    h5 = touch_h5(f_db)
    g = h5.require_group(self.method)
    gx = g.require_group(self.current_filename)

    # Save the data array
    msg = "Saving {} {} ({})"
    print(msg.format(self.method, self.current_filename, size_n))

    for col in [
            "V",
            "_ref",
            "VX",
            "VX_explained_variance_ratio_",
            "VX_components_",
    ]:
        if col in gx:
            # print " Clearing", self.method, self.current_filename, col
            del gx[col]

    gx.create_dataset("V", data=self.V, **self.h5py_args)
    gx.create_dataset("_ref", data=self._ref, **self.h5py_args)
def load_metacluster_data(*args):
    config_metacluster = simple_config.load()["metacluster"]

    f_h5 = os.path.join(
        config_metacluster["output_data_directory"],
        config_metacluster["f_centroids"],
    )

    return load_h5_file(f_h5, *args)
def item_iterator(
        config=None,
        randomize_file_order=False,
        whitelist=[],
        section='parse',
        progress_bar=False,
        text_column=None,
        include_filename=False,
):
    '''
    Iterates over the parsed corpus items and respects a given whitelist.
    '''
    if config is None:
        config = simple_config.load()

    input_data_dir = config['parse']["output_data_directory"]
    F_CSV = grab_files("*.csv", input_data_dir, verbose=False)

    if whitelist:
        assert(isinstance(whitelist, list))

        F_CSV2 = set()
        for f_csv in F_CSV:
            for token in whitelist:
                if token in f_csv:
                    F_CSV2.add(f_csv)
        F_CSV = F_CSV2

    # Randomize the order of the input files each time we get here
    if randomize_file_order:
        F_CSV = random.sample(sorted(F_CSV), len(F_CSV))

    INPUT_ITR = CSV_database_iterator(
        F_CSV,
        config["target_column"],
        progress_bar=progress_bar,
        include_filename=include_filename,
    )

    for row in INPUT_ITR:
        if text_column is not None:
            row['text'] = row[text_column]
        yield row
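# Usage sketch (not part of the pipeline source): a minimal example of
# consuming item_iterator, assuming the parse step has already written its
# CSV output and that simple_config.load() can find the project config.
# The whitelist token 'example' and the text_column 'abstract' are
# hypothetical placeholders; each yielded row is a dict keyed by the
# parsed CSV columns.
def _example_iterate_parsed_corpus():
    for row in item_iterator(
            whitelist=['example'],
            progress_bar=True,
            text_column='abstract',
    ):
        print(len(row['text'].split()))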
def load_metacluster_data(*args):
    config_metacluster = simple_config.load("metacluster")

    f_h5 = os.path.join(
        config_metacluster["output_data_directory"],
        config_metacluster["f_centroids"],
    )

    return load_h5_file(f_h5, *args)
def __init__(self):
    config = simple_config.load("metacluster")
    self.subcluster_m = int(config["subcluster_m"])
    self.subcluster_pcut = float(config["subcluster_pcut"])
    self.subcluster_repeats = int(config["subcluster_repeats"])
    self.subcluster_kn = int(config["subcluster_kn"])

    config_score = simple_config.load("score")

    self.f_h5_docvecs = os.path.join(
        config_score["output_data_directory"],
        config_score["document_scores"]["f_db"],
    )

    self.f_h5_centroids = os.path.join(
        config["output_data_directory"],
        config["f_centroids"],
    )

    score_method = config["score_method"]
    text_column = config["score_column"]

    self._load_data(self.f_h5_docvecs, score_method, text_column)
def load_dispersion_data():
    print("Loading dispersion data")

    config_post = simple_config.load("postprocessing")

    f_h5 = os.path.join(
        config_post["output_data_directory"],
        "cluster_dispersion.h5",
    )

    return load_h5_file(f_h5)
def get_score_methods():
    config_score = simple_config.load()["score"]

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score['document_scores']["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:
        return list(h5.keys())
def load_w2vec(config=None):
    if config is None:
        config = simple_config.load()

    config_embed = config["embedding"]

    f_w2v = os.path.join(
        config_embed["output_data_directory"],
        config_embed["w2v_embedding"]["f_db"],
    )

    return W2V.Word2Vec.load(f_w2v)
def get_section_filenames(section='parse'):
    '''
    Grab the filenames from a given section of the pipeline.

    Args:
        section (str): the pipeline section to grab filenames from
            (default: 'parse')

    Returns:
        list: files found in the directory specified in the config
    '''
    config = simple_config.load()
    input_data_dir = config[section]["output_data_directory"]

    return grab_files("*.csv", input_data_dir)
def load_embeddings():
    '''
    Loads the gensim word embedding model.
    '''
    config = simple_config.load("embedding")

    from gensim.models.word2vec import Word2Vec

    f_w2v = os.path.join(
        config["output_data_directory"],
        config["w2v_embedding"]["f_db"],
    )

    return Word2Vec.load(f_w2v)
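# Usage sketch (assumption, not from the source): once the embedding step has
# saved its gensim model, the object returned by load_embeddings() supports
# the standard gensim similarity queries. The query word 'research' is a
# hypothetical placeholder and must exist in the trained vocabulary.
def _example_query_embedding():
    W = load_embeddings()
    for word, score in W.wv.most_similar('research', topn=5):
        print(word, score)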
def __init__(self, *args, **kwargs):
    super(generic_document_score, self).__init__(*args, **kwargs)

    # Load the model from disk
    self.M = load_w2vec()
    self.shape = self.M.syn0.shape

    # Build the dictionary
    vocab_n = self.shape[0]
    self.word2index = dict(zip(self.M.index2word, range(vocab_n)))

    # Set parallel option (currently does nothing)
    # self._PARALLEL = kwargs["_PARALLEL"]

    # Load the negative weights
    if "negative_weights" in kwargs:
        neg_W = kwargs["negative_weights"]
        self.neg_W = dict((k, float(v)) for k, v in neg_W.items())
        self.neg_vec = dict((k, self.get_word_vector(k))
                            for k, v in neg_W.items())
    else:
        self.neg_W = {}
        self.neg_vec = {}

    # Save the target column to compute
    self.target_column = simple_config.load()["target_column"]

    # Make sure nothing has been set yet
    self.V = self._ref = None

    # Set the variables for reduced representation
    config_score = simple_config.load()["score"]
    self.compute_reduced = config_score["compute_reduced_representation"]

    if self.compute_reduced:
        sec = config_score['reduced_representation']
        self.reduced_n_components = sec['n_components']
def __init__(self):
    config = simple_config.load()["metacluster"]
    self.subcluster_m = int(config["subcluster_m"])
    self.subcluster_pcut = float(config["subcluster_pcut"])
    self.subcluster_repeats = int(config["subcluster_repeats"])
    self.subcluster_kn = int(config["subcluster_kn"])

    config_score = simple_config.load()["score"]

    self.f_h5_docvecs = os.path.join(
        config_score["output_data_directory"],
        config_score['document_scores']["f_db"],
    )

    self.f_h5_centroids = os.path.join(
        config["output_data_directory"],
        config["f_centroids"],
    )

    score_method = config['score_method']
    self._load_data(self.f_h5_docvecs, score_method)
def get_score_methods():
    '''
    Determines which scoring methods to return for each document,
    based on what's set in the config file.

    Returns:
        list: the scoring method names stored in the score h5 file
    '''
    config_score = simple_config.load()["score"]

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:
        return list(h5.keys())
def item_iterator(name, cmd_config=None):
    score_config = simple_config.load("parse")
    input_data_dir = score_config["output_data_directory"]

    F_SQL = glob.glob(os.path.join(input_data_dir, '*'))

    # If there is a whitelist only keep the matching filenames
    try:
        whitelist = cmd_config["command_whitelist"].strip()
    except Exception:
        whitelist = None

    if whitelist:
        assert(type(whitelist) == list)

        F_SQL2 = set()
        for f_sql in F_SQL:
            for token in whitelist:
                if token in f_sql:
                    F_SQL2.add(f_sql)
        F_SQL = F_SQL2

    # Randomize the order of the input files
    F_SQL = random.sample(sorted(F_SQL), len(F_SQL))

    DB_ITR = itertools.product(F_SQL, config["target_columns"])

    for f_sql, target_col in DB_ITR:
        # print("Computing {}:{}".format(f_sql, target_col))
        conn = sqlite3.connect(f_sql, check_same_thread=False)

        args = {
            "column_name": "text",
            "table_name": target_col,
            "conn": conn,
            "limit": _global_limit,
            "shuffle": False,
            "include_table_name": True,
        }

        INPUT_ITR = database_iterator(**args)

        for item in INPUT_ITR:
            yield list(item) + [f_sql, ]
def save(self):
    assert(self.V is not None)
    assert(self._ref is not None)

    # Set the size explicitly as a sanity check
    size_n, dim_V = self.V.shape

    # print "Saving the scored documents"
    config_score = simple_config.load()["score"]

    f_db = os.path.join(
        config_score["output_data_directory"],
        config_score["document_scores"]["f_db"],
    )

    h5 = touch_h5(f_db)

    # Clear the dataset if it already exists
    if self.method in h5:
        del h5[self.method]

    g = h5.require_group(self.method)

    # Save the data array
    print("Saving {} ({})".format(self.method, size_n))

    g.create_dataset("V", data=self.V, compression='gzip')
    g.create_dataset("_ref", data=self._ref)

    # Compute the reduced representation if required
    if self.compute_reduced:
        nc = self.reduced_n_components
        clf = IncrementalPCA(n_components=nc)

        msg = "Performing PCA on {}, ({})->({})"
        print(msg.format(self.method, self.V.shape[1], nc))

        VX = clf.fit_transform(self.V)
        g.create_dataset("VX", data=VX, compression='gzip')
        g.create_dataset("VX_explained_variance_ratio_",
                         data=clf.explained_variance_ratio_)
        g.create_dataset("VX_components_",
                         data=clf.components_)

    h5.close()
def __init__(self):
    config = simple_config.load()["metacluster"]
    self.subcluster_m = int(config["subcluster_m"])
    self.subcluster_pcut = float(config["subcluster_pcut"])
    self.subcluster_repeats = int(config["subcluster_repeats"])
    self.subcluster_kn = int(config["subcluster_kn"])

    self.f_h5_centroids = os.path.join(
        config["output_data_directory"],
        config["f_centroids"],
    )

    score_method = config["score_method"]

    DV = uds.load_document_vectors(score_method)
    self._ref = DV["_refs"]
    self.docv = DV["docv"]

    self.N, self.dim = self.docv.shape
def main():
    args = docopt(__doc__)
    config = simple_config.load()

    if args["import_data"]:
        from import_data import import_data_from_config
        import_data_from_config(config)

    elif args["phrase"]:
        from phrase import phrases_from_config
        phrases_from_config(config)

    if args["parse"]:
        from parse import parse_from_config
        parse_from_config(config)

    if args["embed"]:
        from embed import embed_from_config
        embed_from_config(config)

    if args["score"]:
        from score import score_from_config
        score_from_config(config)

    if args["predict"]:
        from predict import predict_from_config
        predict_from_config(config)

    if args["metacluster"]:
        from metacluster import metacluster_from_config
        metacluster_from_config(config)

    if args["analyze"]:
        import postprocessing.analyze_metaclusters as pam
        pam.analyze_metacluster_from_config(config)
def load_document_vectors(score_method, use_reduced=False):
    '''
    Load the word2vec document vectors for each document from the h5 file
    saved in the pipeline.

    Args:
        score_method (str): the score method to load
        use_reduced (bool): if True, load the reduced-dimension vectors
            instead of the original vectors

    Returns:
        dict: {"docv": X, "_refs": _refs}, the document vectors and their
            corresponding references
    '''
    config_score = simple_config.load()["score"]

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score["f_db"],
    )

    with h5py.File(f_h5, 'r') as h5:
        assert(score_method in h5)
        g = h5[score_method]

        _refs = np.hstack([g[k]["_ref"][:] for k in g.keys()])

        vector_key = "VX" if use_reduced else "V"
        X = np.vstack([g[k][vector_key][:] for k in g.keys()])

        assert(X.shape[0] == _refs.size)

    # Sort to the proper order
    sort_idx = np.argsort(_refs)
    _refs = _refs[sort_idx]
    X = X[sort_idx]

    return {"docv": X, "_refs": _refs}
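# Usage sketch (assumption, not from the source): pull the document vectors
# for one scoring method and confirm each vector lines up with a _ref. The
# method name 'unique_IDF' is a hypothetical placeholder; in practice use one
# of the keys returned by get_score_methods().
def _example_load_docv():
    DV = load_document_vectors('unique_IDF', use_reduced=False)
    assert DV['docv'].shape[0] == DV['_refs'].size
    print(DV['docv'].shape)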
def load_metacluster_data(*args):
    '''
    Load information on the metaclusters from where they are saved in the
    pipeline.

    Args:
        *args: optional arguments forwarded to load_h5_file

    Returns:
        dict: the data on each cluster found in the h5 file
    '''
    config_metacluster = simple_config.load()["metacluster"]

    f_h5 = os.path.join(
        config_metacluster["output_data_directory"],
        config_metacluster["f_centroids"],
    )

    return load_h5_file(f_h5, *args)
def main():
    args = docopt(__doc__)
    config = simple_config.load()

    if args["import_data"]:
        from import_data import import_data_from_config
        import_data_from_config(config)

    elif args["phrase"]:
        from phrase import phrases_from_config
        phrases_from_config(config)

    if args["parse"]:
        from parse import parse_from_config
        parse_from_config(config)

    if args["embed"]:
        from embed import embed_from_config
        embed_from_config(config)

    if args["score"]:
        from score import score_from_config
        score_from_config(config)

    if args["predict"]:
        from predict import predict_from_config
        predict_from_config(config)

    if args["metacluster"]:
        from metacluster import metacluster_from_config
        metacluster_from_config(config)

    if args["analyze"]:
        func = args["<target_function>"]

        if func == 'metacluster':
            import postprocessing.analyze_metaclusters as pam
            pam.analyze_metacluster_from_config(config)
        elif func == 'LIME':
            import postprocessing.lime_explainer as le
            le.explain_metaclusters(config)
        else:
            raise KeyError("Analyze Function {} not known".format(func))
def get_score_methods():
    """
    Determines which scoring methods to return for each document,
    based on what's set in the config file.

    Returns:
        list: the scoring method names stored in the score h5 file
    """
    config_score = simple_config.load()["score"]

    f_h5 = os.path.join(
        config_score["output_data_directory"],
        config_score["f_db"],
    )

    if not os.path.exists(f_h5):
        raise FileNotFoundError(f_h5)

    with h5py.File(f_h5, "r") as h5:
        keys = list(h5.keys())

    return keys
def __init__(self, name, cmd_config=None, yield_single=False):
    # yield_single returns one item at a time,
    # not in chunks like (table_name, f_sql)
    self.yield_single = yield_single

    score_config = simple_config.load("parse")
    input_data_dir = score_config["output_data_directory"]

    F_SQL = sorted(glob.glob(os.path.join(input_data_dir, '*')))

    # If there is a whitelist only keep the matching filenames
    try:
        whitelist = cmd_config["command_whitelist"].strip()
    except Exception:
        whitelist = None

    if whitelist:
        assert(type(whitelist) == list)

        F_SQL2 = set()
        for f_sql in F_SQL:
            for token in whitelist:
                if token in f_sql:
                    F_SQL2.add(f_sql)
        F_SQL = F_SQL2

    # Randomize the order of the input files (why? not needed for scoring)
    # F_SQL = random.sample(sorted(F_SQL), len(F_SQL))

    DB_ITR = itertools.product(F_SQL, config["target_columns"])

    # Get database sizes for progress bar
    self.total_items = 0
    for f_sql, target_col in DB_ITR:
        conn = sqlite3.connect(f_sql, check_same_thread=False)
        self.total_items += count_rows(conn, target_col)
        conn.close()

    self.F_SQL = F_SQL
    self.config = config
def __init__(self, *args, **kwargs):
    super(score_simple, self).__init__(*args, **kwargs)

    f_db = os.path.join(
        kwargs['output_data_directory'],
        kwargs['term_frequency']['f_db'],
    )

    if not os.path.exists(f_db):
        msg = "{} not computed yet, needed for TF methods!"
        raise ValueError(msg.format(f_db))

    score_config = simple_config.load()["score"]

    f_csv = os.path.join(
        score_config["output_data_directory"],
        score_config["term_document_frequency"]["f_db"],
    )

    IDF = pd.read_csv(f_csv)
    IDF = dict(zip(IDF["word"].values, IDF["count"].values))
    self.corpus_N = IDF.pop("__pipeline_document_counter")

    # Compute the IDF
    for key in IDF:
        IDF[key] = np.log(float(self.corpus_N) / (IDF[key] + 1))

    self.IDF = IDF
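# Worked example of the IDF weighting used above (toy numbers, not pipeline
# output): with a corpus of 100 documents and a term appearing in 9 of them,
# IDF = log(corpus_N / (document_count + 1)) = log(100 / 10), roughly 2.303.
def _example_idf_weight():
    import numpy as np
    print(np.log(100.0 / (9 + 1)))  # ~2.302585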
def __init__(self, *args, **kwargs):
    '''
    The reduced representation takes an incremental PCA decomposition
    and adds new negative weights based on the previous components of
    the PCA.
    '''
    # Remove the bias to negative_weights
    kwargs["negative_weights"] = {}

    super(reduced_representation, self).__init__(*args, **kwargs)

    config = simple_config.load()['score']

    f_db = os.path.join(
        config["output_data_directory"],
        config["document_scores"]["f_db"],
    )

    with h5py.File(f_db, 'r') as h5:
        # Make sure the column has a value
        col = config['reduced_representation']['rescored_command']
        assert(col in h5)

        # Make sure the VX has been computed
        assert("VX" in h5[col])

        c = h5[col]['VX_components_'][:]
        ex_var = h5[col]['VX_explained_variance_ratio_'][:]

    bias = config['reduced_representation']['bais_strength']

    self.word_vecs = {}
    for w in self.M.wv.index2word:
        weight = c.dot(self.M[w])
        weight *= bias
        weight *= ex_var

        adjust_v = (weight.reshape(-1, 1) * c).sum(axis=0)
        self.word_vecs[w] = self.M[w] - adjust_v
def compute_reduced_representation(self):
    if not self.compute_reduced:
        return None

    config_score = simple_config.load()["score"]

    f_db = os.path.join(
        config_score["output_data_directory"],
        config_score["document_scores"]["f_db"],
    )

    h5 = touch_h5(f_db)
    g = h5[self.method]

    keys = g.keys()
    V = np.vstack([g[x]["V"][:] for x in keys])
    sizes = [g[x]["_ref"].shape[0] for x in keys]

    nc = self.reduced_n_components
    clf = IncrementalPCA(n_components=nc)

    msg = "Performing PCA on {}, ({})->({})"
    print(msg.format(self.method, V.shape[1], nc))

    VX = clf.fit_transform(V)
    EVR = clf.explained_variance_ratio_
    COMPONENTS = clf.components_

    for key, size in zip(keys, sizes):
        # Take slices equal to the size
        vx, VX = VX[:size, :], VX[size:, :]
        evr, EVR = EVR[:size], EVR[size:]
        com, COMPONENTS = COMPONENTS[:size, :], COMPONENTS[size:, :]

        g[key].create_dataset("VX", data=vx, **self.h5py_args)
        g[key].create_dataset("VX_explained_variance_ratio_", data=evr)
        g[key].create_dataset("VX_components_", data=com)

    h5.close()
def load_w2vec(config=None):
    """
    Loads the gensim word2vec model saved in the pipeline.

    Args:
        config: config to get parameters from (loaded from disk if None)

    Returns:
        gensim.models.word2vec.Word2Vec: the trained word2vec model
    """
    import gensim.models.word2vec as W2V

    if config is None:
        config = simple_config.load()

    config_embed = config["embed"]

    f_w2v = os.path.join(
        config_embed["output_data_directory"],
        config_embed["w2v_embedding"]["f_db"],
    )

    return W2V.Word2Vec.load(f_w2v)
""" merge_columns = config["import_data"]["merge_columns"] if not isinstance(merge_columns, list): msg = "merge_columns (if used) must be a list" raise ValueError(msg) data_out = config["import_data"]["output_data_directory"] mkdir(data_out) # Require 'input_data_directories' to be a list data_in_list = config["import_data"]["input_data_directories"] if not isinstance(data_in_list, list): msg = "input_data_directories must be a list" raise ValueError(msg) target_column = config["target_column"] for d_in in data_in_list: import_directory_csv(d_in, data_out, target_column, merge_columns) if __name__ == "__main__": import simple_config config = simple_config.load() import_data_from_config(config)
"column_name":"text", "table_name" :target_col, "conn":conn, "limit":_global_limit, "shuffle":False, "include_table_name":True, } INPUT_ITR = database_iterator(**args) for item in INPUT_ITR: yield list(item) + [f_sql,] if __name__ == "__main__": import simple_config config = simple_config.load("embedding") _FORCE = config.as_bool("_FORCE") mkdir(config["output_data_directory"]) ########################################################### # Run the functions that act globally on the data for name in config["embedding_commands"]: obj = getattr(mb,name) # Load any kwargs in the config file kwargs = config if name in config: kwargs.update(config[name])
    for f in parser_functions:
        result = f(text)
        text = unicode(result)

        if hasattr(result, "meta"):
            meta.update(result.meta)

    # Convert the meta information into a unicode string for serialization
    meta = unicode(meta)

    return idx, text, meta


if __name__ == "__main__":

    import simple_config
    config = simple_config.load("parse")
    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")

    import_config = simple_config.load("import_data")
    input_data_dir = import_config["output_data_directory"]
    output_dir = config["output_data_directory"]

    import_column = import_config["output_table"]

    mkdir(output_dir)

    # Fill the pipeline with function objects
    parser_functions = []
    for name in config["pipeline"]:
        obj = getattr(pre, name)
    for i in range(n_clusters):
        v = meta_clusters[i]
        dist = W.syn0.dot(v)
        idx = np.argsort(dist)[::-1][:10]

        words = [W.index2word[j].replace("PHRASE_", "") for j in idx]
        all_words.append(u" ".join(words))

    return np.array(all_words)


if __name__ == "__main__":

    config = simple_config.load("metacluster")
    os.system("mkdir -p {}".format(config["output_data_directory"]))

    CO = cluster_object()

    f_h5 = CO.f_h5_centroids
    if not os.path.exists(f_h5):
        h5 = h5py.File(f_h5, "w")
        h5.close()

    h5 = h5py.File(f_h5, "r+")

    keys = ["subcluster_kn", "subcluster_pcut",
            "subcluster_m", "subcluster_repeats"]
    args = dict([(k, config[k]) for k in keys])
                    val = list(item) + [f_sql, ]
                    data.append(val)

            if self.yield_single:
                for item in INPUT_ITR:
                    val = list(item) + [f_sql, ]
                    yield val
                    progress_bar.update()

        if not self.yield_single:
            yield data


if __name__ == "__main__":

    import simple_config
    config = simple_config.load("score")
    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")

    n_jobs = -1 if _PARALLEL else 1

    mkdir(config["output_data_directory"])

    ###########################################################
    # Fill the pipeline with function objects

    mapreduce_functions = []
    for name in config["mapreduce_commands"]:
        obj = getattr(ds, name)
    n_data_items = len(df)
    df["_ref"] = [next(_ref_counter) for _ in range(n_data_items)]
    df.set_index("_ref", inplace=True)

    df.to_sql(output_table, engine, if_exists='replace')

    print("Finished {}, {}, {}".format(f_csv, len(df), list(df.columns)))


if __name__ == "__main__":

    import simple_config
    config = simple_config.load("import_data")
    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")

    data_out = config["output_data_directory"]
    output_table = config["output_table"]

    # Require `input_data_directories` to be a list
    data_in_list = config["input_data_directories"]
    assert(type(data_in_list) == list)

    for d_in in data_in_list:
        import_directory_csv(d_in, data_out, output_table)
if config["command_whitelist"]: keys = [k for k in keys if k in config["command_whitelist"]] print "Only computing over", keys X = np.vstack(g[key]["V"] for key in keys) h5_score.close() return X if __name__ == "__main__": import simple_config config = simple_config.load("cluster") output_dir = config["output_data_directory"] mkdir(output_dir) method = config['score_method'] target_column = config['score_column'] f_sim = os.path.join(output_dir, config["f_cluster"]) if config.as_bool("_FORCE"): try: os.remove(f_sim) except: pass if not os.path.exists(f_sim):
        total_counts += current_val

        if current_val > max_val:
            max_val = current_val
            max_item = item

    data[(' '.join(max_item[0]), max_item[1])] = total_counts

    ABR = collections.Counter(data)
    return ABR


if __name__ == "__main__":

    import simple_config
    config = simple_config.load("phrase_identification")
    _PARALLEL = config.as_bool("_PARALLEL")
    _FORCE = config.as_bool("_FORCE")

    output_dir = config["output_data_directory"]
    target_columns = config["target_columns"]

    import_config = simple_config.load("import_data")
    input_data_dir = import_config["output_data_directory"]
    input_table = import_config["output_table"]

    F_SQL = grab_files("*.sqlite", input_data_dir)

    ABR = collections.Counter()
    P = parenthesis_nester()
import numpy as np
import pandas as pd
import h5py
import os, glob, itertools, collections
from sqlalchemy import create_engine

from predictions import categorical_predict

ERROR_MATRIX = {}
PREDICTIONS = {}

if __name__ == "__main__":

    import simple_config

    config = simple_config.load("predict")
    score_config = simple_config.load("score")
    import_config = simple_config.load("import_data")

    # For now, we can only deal with one column using meta!
    assert len(config["categorical_columns"]) == 1

    f_h5 = os.path.join(
        score_config["output_data_directory"],
        score_config["document_scores"]["f_db"],
    )

    h5 = h5py.File(f_h5, "r")
    methods = h5.keys()

    pred_dir = import_config["output_data_directory"]
    input_glob = os.path.join(pred_dir, "*")
    input_files = glob.glob(input_glob)
        if i == j:
            d = pdist(X[labels == i], metric='cosine')
        else:
            d = cdist(X[labels == i], X[labels == j], metric='cosine')

            # Only take upper diagonal (+diagonal elements)
            d = d[np.triu_indices(n=d.shape[0], m=d.shape[1], k=0)]

        dist[i, j] = dist[j, i] = d.mean()

    return dist


if __name__ == "__main__" and __package__ is None:

    import simple_config

    config = simple_config.load("postprocessing")
    save_dest = config['output_data_directory']
    os.system('mkdir -p {}'.format(save_dest))

    SQL = load_SQL_data(config["master_columns"])

    MC = load_metacluster_data()
    C = MC["meta_centroids"]

    counts = collections.Counter(MC["meta_labels"])

    DV = load_document_vectors()

    # Build the results for the metaclusters
    labels = np.unique(MC["meta_labels"])