def get_meta(base_dir):
    '''get_meta
    collect the meta files of every corpus folder under [base_dir]/corpus

    Parameters
    ==========
    base_dir: path
        directory that contains the "corpus" folder

    Returns
    =======
    dict
        {corpus folder name: list of paths matching *meta*}
        folders without any matching file are left out
    '''
    meta_lookup = {}
    # find_directories is defined elsewhere; its items are used as folder paths
    for corpus_path in find_directories("%s/corpus" % (base_dir)):
        name = os.path.basename(corpus_path)
        matches = glob("%s/*meta*" % (corpus_path))
        if not matches:
            continue
        meta_lookup[name] = matches
    return meta_lookup
def get_corpus(analysis_dir):
    '''get_corpus
    collect the sentences files of every corpus folder under
    [analysis_dir]/corpus

    Parameters
    ==========
    analysis_dir: path
        directory that contains the "corpus" folder

    Returns
    =======
    dict
        {corpus folder name: list of paths matching *sentences*}
        folders without any matching file are left out
    '''
    sentences_lookup = {}
    # find_directories is defined elsewhere; its items are used as folder paths
    for corpus_path in find_directories("%s/corpus" % (analysis_dir)):
        name = os.path.basename(corpus_path)
        matches = glob("%s/*sentences*" % (corpus_path))
        if not matches:
            continue
        sentences_lookup[name] = matches
    return sentences_lookup
def get_relations(base_dir, tags=None, read=False):
    '''get_relations
    find (and optionally read) the relations files stored under
    [base_dir]/relations/[tag]

    Parameters
    ==========
    base_dir: path
        directory that contains the "relations" folder
    tags: str or list
        one tag, or a list of tags (sub folders of "relations") to look up.
        If None, every directory found under "relations" is used
    read: boolean
        if True, the file list of each tag is handed to read_relations and
        whatever it returns is stored. If False (default), the sorted list
        of file paths is stored

    Returns
    =======
    edges: dict
        {tag: relations}, only tags with at least one *_relations.json file
        are included
    '''
    if isinstance(tags, str):
        tags = [tags]
    relations_dir = "%s/relations" % (os.path.abspath(base_dir))
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]

    edges = dict()
    for tag in tags:
        print("Finding relations for %s" % (tag))
        # glob returns files in arbitrary order, sort for reproducible output
        relations_files = sorted(glob("%s/%s/*_relations.json" % (relations_dir, tag)))
        if not relations_files:
            continue
        if read:
            edges[tag] = read_relations(relations_files)
        else:
            edges[tag] = relations_files
    return edges
def get_relations(base_dir, tags=None, read=False):
    '''get_relations
    find (and optionally read) the relations files stored under
    [base_dir]/relations/[tag]

    Parameters
    ==========
    base_dir: path
        directory that contains the "relations" folder
    tags: str or list
        one tag, or a list of tags (sub folders of "relations") to look up.
        If None, every directory found under "relations" is used
    read: boolean
        if True, the file list of each tag is handed to read_relations and
        whatever it returns is stored. If False (default), the sorted list
        of file paths is stored

    Returns
    =======
    edges: dict
        {tag: relations}, only tags with at least one *_relations.json file
        are included
    '''
    if isinstance(tags, str):
        tags = [tags]
    relations_dir = "%s/relations" % (os.path.abspath(base_dir))
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]

    edges = dict()
    for tag in tags:
        print("Finding relations for %s" % (tag))
        # glob returns files in arbitrary order, sort for reproducible output
        relations_files = sorted(glob("%s/%s/*_relations.json" % (relations_dir, tag)))
        if not relations_files:
            continue
        if read:
            edges[tag] = read_relations(relations_files)
        else:
            edges[tag] = relations_files
    return edges
def get_relations_df(base_dir, tags=None):
    '''get_relations_df
    build one symmetric term x term data frame per tag from the relations
    files found under [base_dir]/relations/[tag]

    File names are parsed as [term1]_[term2]_relations.json, and the "value"
    field of each file is written to both (term1, term2) and (term2, term1).
    NOTE(review): terms that themselves contain "_" cannot be told apart by
    this naming scheme - confirm that term ids never contain underscores.

    Parameters
    ==========
    base_dir: path
        directory that contains the "relations" folder
    tags: str or list
        one tag, or a list of tags (sub folders of "relations") to parse.
        If None, every directory found under "relations" is used

    Returns
    =======
    relations: dict
        {tag: pandas.DataFrame} with the sorted term names as index and
        columns. A tag without relations files gets an empty data frame
    '''
    if isinstance(tags, str):
        tags = [tags]
    relations_dir = "%s/relations" % (os.path.abspath(base_dir))
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]

    # The original never created this dict and failed with a NameError
    relations = dict()
    for tag in tags:
        print("Finding relations for %s" % (tag))
        relations_files = sorted(glob("%s/%s/*_relations.json" % (relations_dir, tag)))

        # Term names come from the file name only (not the folder path), and
        # both terms of every pair must be present in the index and columns
        term_pairs = [os.path.basename(x).split("_")[0:2] for x in relations_files]
        term_names = sorted(set(term for pair in term_pairs for term in pair))
        edges = pandas.DataFrame(columns=term_names, index=term_names)

        for r, relation_file in enumerate(relations_files):
            print("Parsing %s of %s" % (r, len(relations_files)))
            term1, term2 = term_pairs[r]
            # Read each file once and fill both halves of the matrix
            value = read_json(relation_file)["value"]
            edges.loc[term1, term2] = value
            edges.loc[term2, term1] = value
        relations[tag] = edges
    return relations
def get_relations_df(base_dir, tags=None):
    '''get_relations_df
    build one symmetric term x term data frame per tag from the relations
    files found under [base_dir]/relations/[tag]

    File names are parsed as [term1]_[term2]_relations.json, and the "value"
    field of each file is written to both (term1, term2) and (term2, term1).
    NOTE(review): terms that themselves contain "_" cannot be told apart by
    this naming scheme - confirm that term ids never contain underscores.

    Parameters
    ==========
    base_dir: path
        directory that contains the "relations" folder
    tags: str or list
        one tag, or a list of tags (sub folders of "relations") to parse.
        If None, every directory found under "relations" is used

    Returns
    =======
    relations: dict
        {tag: pandas.DataFrame} with the sorted term names as index and
        columns. A tag without relations files gets an empty data frame
    '''
    if isinstance(tags, str):
        tags = [tags]
    relations_dir = "%s/relations" % (os.path.abspath(base_dir))
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]

    # The original never created this dict and failed with a NameError
    relations = dict()
    for tag in tags:
        print("Finding relations for %s" % (tag))
        relations_files = sorted(glob("%s/%s/*_relations.json" % (relations_dir, tag)))

        # Term names come from the file name only (not the folder path), and
        # both terms of every pair must be present in the index and columns
        term_pairs = [os.path.basename(x).split("_")[0:2] for x in relations_files]
        term_names = sorted(set(term for pair in term_pairs for term in pair))
        edges = pandas.DataFrame(columns=term_names, index=term_names)

        for r, relation_file in enumerate(relations_files):
            print("Parsing %s of %s" % (r, len(relations_files)))
            term1, term2 = term_pairs[r]
            # Read each file once and fill both halves of the matrix
            value = read_json(relation_file)["value"]
            edges.loc[term1, term2] = value
            edges.loc[term2, term1] = value
        relations[tag] = edges
    return relations
def get_plugins(plugin_repo=None, load=False):
    '''get_plugins
    return the valid plugins of a downloaded wordfish-plugins folder. When no
    folder is given, custom_app_download is called for the "plugins" repo
    type and the "plugins" folder inside the returned directory is used.

    Parameters
    ==========
    plugin_repo: path
        path to plugins repo. If None, the repo is downloaded first
    load: boolean
        if True, returns loaded json (dict) as produced by load_plugins.
        If False (default), returns the paths of the valid plugin folders

    Returns
    =======
    valid_plugins: list or dict
        plugin folders for which validate returned a true value, or the
        result of load_plugins on them when load is set
    '''
    if plugin_repo is None:
        tmpdir = custom_app_download(repo_types=["plugins"])
        plugin_repo = "%s/plugins" % (tmpdir)

    plugins = find_directories(plugin_repo)
    valid_plugins = [p for p in plugins if validate(p)]
    print("Found %s valid plugins" % (len(valid_plugins)))

    if load:
        valid_plugins = load_plugins(valid_plugins)
    return valid_plugins
def get_plugins(plugin_repo=None, load=False):
    '''get_plugins
    return the valid plugins of a downloaded wordfish-plugins folder. When no
    folder is given, custom_app_download is called for the "plugins" repo
    type and the "plugins" folder inside the returned directory is used.

    Parameters
    ==========
    plugin_repo: path
        path to plugins repo. If None, the repo is downloaded first
    load: boolean
        if True, returns loaded json (dict) as produced by load_plugins.
        If False (default), returns the paths of the valid plugin folders

    Returns
    =======
    valid_plugins: list or dict
        plugin folders for which validate returned a true value, or the
        result of load_plugins on them when load is set
    '''
    if plugin_repo is None:
        tmpdir = custom_app_download(repo_types=["plugins"])
        plugin_repo = "%s/plugins" % (tmpdir)

    plugins = find_directories(plugin_repo)
    valid_plugins = [p for p in plugins if validate(p)]
    print("Found %s valid plugins" % (len(valid_plugins)))

    if load:
        valid_plugins = load_plugins(valid_plugins)
    return valid_plugins
def get_terms(analysis_dir, subset=True):
    '''get_terms
    For all terms defined, and relationships for the terms, parse into a
    single data structure. This (maybe) won't work for larger datasets (we
    will use a database) but it will for testing.

        nodes: {"[plugin]::[uid]": [node]}
        edges: {"[plugin]::[source]<>[plugin]::[target]":
                   {"source": ..., "target": ..., "value": ...}}

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory, its "terms" folder is parsed
    subset: boolean
        if True (default), returns terms in a dictionary based on source
        plugin folder: {plugin_name: {"nodes": ..., "edges": ...}}.
        If False, the nodes and edges of all plugins are merged and
        returned as {"all": {"nodes": ..., "edges": ...}}

    Returns
    =======
    result: dict
        also written to [analysis_dir]/terms/terms.json with
        save_pretty_json when the terms folder exists. When the folder is
        missing an empty result of the same shape is returned and nothing
        is written
    '''
    terms_dir = "%s/terms" % (os.path.abspath(analysis_dir))
    has_terms = os.path.exists(terms_dir)

    # Shared containers, filled across all plugins when subset is False
    nodes = dict()
    edges = dict()
    results = dict()

    term_plugins = find_directories(terms_dir) if has_terms else []
    for term_plugin in term_plugins:
        plugin_name = os.path.basename(term_plugin)

        if subset:
            # Every plugin gets its own nodes and edges
            nodes = dict()
            edges = dict()
            results[plugin_name] = {"nodes": nodes, "edges": edges}

        # Here we parse together terms
        terms_file = "%s/terms.json" % term_plugin
        if os.path.exists(terms_file):
            for node in read_json(terms_file)["nodes"]:
                if "uid" in node:
                    uid = "%s::%s" % (plugin_name, node["uid"])
                else:
                    # Nodes without a uid are keyed by their name
                    feature_name = node["name"].replace(" ", "_")
                    uid = "%s::%s" % (plugin_name, feature_name)
                nodes[uid] = node

        # Here we parse together relationships
        # Currently only supported for terms within the same family
        relationships_file = "%s/term_relationships.json" % term_plugin
        if os.path.exists(relationships_file):
            for relation in read_json(relationships_file)["edges"]:
                uid_1 = "%s::%s" % (plugin_name, relation["source"])
                uid_2 = "%s::%s" % (plugin_name, relation["target"])
                relation_uid = "%s<>%s" % (uid_1, uid_2)
                edges[relation_uid] = {"source": uid_1,
                                       "target": uid_2,
                                       "value": relation["value"]}

    # Built here (not inside the loop) so that the result is also defined
    # when there is no plugin folder at all
    if subset:
        result = results
    else:
        result = {"all": {"nodes": nodes, "edges": edges}}

    # Save the result with all edges and nodes defined. analysis_dir cannot
    # be None at this point, os.path.abspath would have failed above
    if has_terms:
        save_pretty_json(result, "%s/terms/terms.json" % (analysis_dir))
    return result
def get_terms(analysis_dir, subset=True):
    '''get_terms
    For all terms defined, and relationships for the terms, parse into a
    single data structure. This (maybe) won't work for larger datasets (we
    will use a database) but it will for testing.

        nodes: {"[plugin]::[uid]": [node]}
        edges: {"[plugin]::[source]<>[plugin]::[target]":
                   {"source": ..., "target": ..., "value": ...}}

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory, its "terms" folder is parsed
    subset: boolean
        if True (default), returns terms in a dictionary based on source
        plugin folder: {plugin_name: {"nodes": ..., "edges": ...}}.
        If False, the nodes and edges of all plugins are merged and
        returned as {"all": {"nodes": ..., "edges": ...}}

    Returns
    =======
    result: dict
        also written to [analysis_dir]/terms/terms.json with
        save_pretty_json when the terms folder exists. When the folder is
        missing an empty result of the same shape is returned and nothing
        is written
    '''
    terms_dir = "%s/terms" % (os.path.abspath(analysis_dir))
    has_terms = os.path.exists(terms_dir)

    # Shared containers, filled across all plugins when subset is False
    nodes = dict()
    edges = dict()
    results = dict()

    term_plugins = find_directories(terms_dir) if has_terms else []
    for term_plugin in term_plugins:
        plugin_name = os.path.basename(term_plugin)

        if subset:
            # Every plugin gets its own nodes and edges
            nodes = dict()
            edges = dict()
            results[plugin_name] = {"nodes": nodes, "edges": edges}

        # Here we parse together terms
        terms_file = "%s/terms.json" % term_plugin
        if os.path.exists(terms_file):
            for node in read_json(terms_file)["nodes"]:
                if "uid" in node:
                    uid = "%s::%s" % (plugin_name, node["uid"])
                else:
                    # Nodes without a uid are keyed by their name
                    feature_name = node["name"].replace(" ", "_")
                    uid = "%s::%s" % (plugin_name, feature_name)
                nodes[uid] = node

        # Here we parse together relationships
        # Currently only supported for terms within the same family
        relationships_file = "%s/term_relationships.json" % term_plugin
        if os.path.exists(relationships_file):
            for relation in read_json(relationships_file)["edges"]:
                uid_1 = "%s::%s" % (plugin_name, relation["source"])
                uid_2 = "%s::%s" % (plugin_name, relation["target"])
                relation_uid = "%s<>%s" % (uid_1, uid_2)
                edges[relation_uid] = {
                    "source": uid_1,
                    "target": uid_2,
                    "value": relation["value"]
                }

    # Built here (not inside the loop) so that the result is also defined
    # when there is no plugin folder at all
    if subset:
        result = results
    else:
        result = {"all": {"nodes": nodes, "edges": edges}}

    # Save the result with all edges and nodes defined. analysis_dir cannot
    # be None at this point, os.path.abspath would have failed above
    if has_terms:
        save_pretty_json(result, "%s/terms/terms.json" % (analysis_dir))
    return result