import os
import re
from glob import glob

import numpy
import pandas

# read_json, save_pretty_json, find_directories, and DeepTextAnalyzer are
# assumed to be provided elsewhere in this package.


def read_relations(relations_list, search_expression=None):
    '''Read relation json files into a list, optionally keeping only those
    whose file paths match a regular expression.'''
    if search_expression is not None:
        expression = re.compile(search_expression)
        return [read_json(x) for x in relations_list if expression.search(x)]
    relations = []
    for x in range(len(relations_list)):
        print("Parsing %s of %s" % (x, len(relations_list)))
        relations.append(read_json(relations_list[x]))
    return relations
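# Usage sketch for read_relations (hypothetical paths): collect relation
# files with glob, then keep only those whose names mention "anxiety".
#
#   relations_files = glob("analysis/relations/pubmed/*_relations.json")
#   anxiety_relations = read_relations(relations_files,
#                                      search_expression="anxiety")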
def get_relations_df(base_dir, tags=None):
    '''Build a symmetric term-by-term DataFrame of relation values for each
    tag directory under [base_dir]/relations. Returns a dictionary of
    DataFrames keyed by tag.'''
    if isinstance(tags, str):
        tags = [tags]
    relations_dir = "%s/relations" % (os.path.abspath(base_dir))
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]
    relations = dict()
    for tag in tags:
        print("Finding relations for %s" % (tag))
        relations_files = glob("%s/%s/*_relations.json" % (relations_dir, tag))
        term_names = numpy.unique(
            [os.path.basename(x).split("_")[0] for x in relations_files]
        ).tolist()
        edges = pandas.DataFrame(columns=term_names, index=term_names)
        for r in range(len(relations_files)):
            relation_file = relations_files[r]
            print("Parsing %s of %s" % (r, len(relations_files)))
            term1, term2 = os.path.basename(relation_file).split("_")[0:2]
            value = read_json(relation_file)["value"]
            edges.loc[term1, term2] = value
            edges.loc[term2, term1] = value
        relations[tag] = edges
    return relations
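# Usage sketch (hypothetical layout): given relation files saved under
# analysis/relations/<tag>/, build one symmetric adjacency DataFrame per tag.
#
#   relations = get_relations_df("analysis", tags="pubmed")
#   relations["pubmed"].loc["anxiety", "depression"]  # relation value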
def featurize_to_corpus(model, meta, size=300, fillna=True):
    '''featurize_to_corpus

    Generate average (mean) feature vectors for a set of documents, along
    with a matching label matrix, based on an existing model. Each meta
    json file should describe a document's text and labels.
    '''
    analyzer = DeepTextAnalyzer(model)
    vectors = pandas.DataFrame(columns=range(size))

    # Get all unique term names from the meta objects
    term_names = get_labels(meta=meta)

    # Create a matrix of labels, keep track of which documents are labeled
    labels = pandas.DataFrame(columns=term_names)

    # Build a mean vector for each document, one row per document
    count = 1
    for r in range(len(meta)):
        meta_file = meta[r]
        text = meta_file.replace("_meta", "_sentences")
        try:
            print("Processing %s of %s" % (r, len(meta)))
            vectors.loc[count] = analyzer.text2mean_vector(text)
            labels.loc[count, read_json(meta_file)["labels"]] = 1
            count += 1
        except Exception:
            # Skip documents that cannot be read or vectorized
            pass

    if fillna:
        labels = labels.fillna(0)
        vectors = vectors.fillna(0)
    return vectors, labels
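# Usage sketch (hedged: assumes the model is a gensim Word2Vec model, as
# suggested by text2mean_vector, and uses hypothetical paths). Vector rows
# align with label rows, so the pair can feed directly into a classifier.
#
#   from gensim.models import Word2Vec
#   model = Word2Vec.load("analysis/models/corpus.word2vec")  # hypothetical
#   meta = glob("analysis/corpus/*_meta.json")
#   vectors, labels = featurize_to_corpus(model, meta, size=300)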
def get_terms(analysis_dir, subset=True):
    '''
    For all terms defined, and relationships for the terms, parse into a
    single data structure. This (maybe) won't work for larger datasets (we
    will use a database) but it will for testing.

    nodes: {"[plugin]::[uid]": [node]}

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory
    subset: boolean
        if True, returns terms in a dictionary keyed by source tag.
        Default is True.
    '''
    nodes = dict()
    edges = dict()
    result = {"nodes": nodes, "edges": edges}
    results = dict()
    terms_dir = "%s/terms" % (os.path.abspath(analysis_dir))
    if os.path.exists(terms_dir):
        term_plugins = find_directories(terms_dir)
        for term_plugin in term_plugins:
            plugin_name = os.path.basename(term_plugin)
            if subset:
                nodes = dict()
                edges = dict()

            # Here we parse together terms
            if os.path.exists("%s/terms.json" % term_plugin):
                terms_json = read_json("%s/terms.json" % term_plugin)["nodes"]
                for node in terms_json:
                    if "uid" in node:
                        uid = "%s::%s" % (plugin_name, node["uid"])
                    else:
                        feature_name = node["name"].replace(" ", "_")
                        uid = "%s::%s" % (plugin_name, feature_name)
                    nodes[uid] = node

            # Here we parse together relationships
            # Currently only supported for terms within the same family
            if os.path.exists("%s/term_relationships.json" % term_plugin):
                terms_json = read_json("%s/term_relationships.json" % term_plugin)["edges"]
                for relation in terms_json:
                    uid_1 = "%s::%s" % (plugin_name, relation["source"])
                    uid_2 = "%s::%s" % (plugin_name, relation["target"])
                    relation_uid = "%s<>%s" % (uid_1, uid_2)
                    edges[relation_uid] = {"source": uid_1,
                                           "target": uid_2,
                                           "value": relation["value"]}

            result = {"nodes": nodes, "edges": edges}
            if subset:
                results[plugin_name] = result

    if subset:
        result = results
    else:
        result = {"all": result}

    # Save the result with all edges and nodes defined, then return it
    if analysis_dir is not None:
        save_pretty_json(result, "%s/terms/terms.json" % (analysis_dir))
    return result
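# Usage sketch (hypothetical directory layout and plugin name): parse all
# plugin term definitions under analysis/terms/ into nodes and edges,
# keyed by plugin when subset=True.
#
#   terms = get_terms("analysis", subset=True)
#   terms["cognitiveatlas"]["nodes"]  # {"cognitiveatlas::<uid>": node, ...}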
def get_labels(meta):
    '''Return the sorted unique set of labels across all meta json files.'''
    labels = []
    for r in range(len(meta)):
        meta_file = meta[r]
        labels = numpy.unique(labels + read_json(meta_file)["labels"]).tolist()
    return labels
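# Usage sketch (hypothetical paths): collect meta files and list every
# unique label that appears across them.
#
#   meta = glob("analysis/corpus/*_meta.json")
#   term_names = get_labels(meta)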