def save_relations(relations, output_dir=None):
    '''save_relations

    Turn (source, target, value) tuples into link dictionaries and,
    when an output directory is given, write each link to its own json
    file (one file per sorted source/target pair).

    Parameters
    ==========
    relations: list of tuples
        [(source,target,relation)] the relation "value" is stored as is,
        so it may be a string or a number.
    output_dir: path
        path to save output. If None, nothing is written and the links
        are only returned.

    Returns
    =======
    links: list
        list of links
        [{"source":"node1","target":"node2","value":0.5}]
    '''
    links = []

    for tup in relations:
        relation = {"source": tup[0], "target": tup[1], "value": tup[2]}
        links.append(relation)

        # Without an output directory there is no file name to build;
        # checking os.path.exists on an undefined name used to crash here.
        if output_dir is None:
            continue

        # Sort the pair so (a,b) and (b,a) map to the same file name
        pair = sorted([tup[0], tup[1]])
        output_file = "%s/%s_relations.json" % (
            output_dir, "_".join(pair).replace(" ", ""))

        # Never overwrite a relation file that is already on disk
        if not os.path.exists(output_file):
            save_pretty_json(relation, output_file)

    return links
def save_relations(relations, output_dir=None):
    '''save_relations

    Turn (source, target, value) tuples into link dictionaries and,
    when an output directory is given, write each link to its own json
    file (one file per sorted source/target pair).

    Parameters
    ==========
    relations: list of tuples
        [(source,target,relation)] the relation "value" is stored as is,
        so it may be a string or a number.
    output_dir: path
        path to save output. If None, nothing is written and the links
        are only returned.

    Returns
    =======
    links: list
        list of links
        [{"source":"node1","target":"node2","value":0.5}]
    '''
    links = []

    for tup in relations:
        relation = {"source": tup[0], "target": tup[1], "value": tup[2]}
        links.append(relation)

        # Without an output directory there is no file name to build;
        # checking os.path.exists on an undefined name used to crash here.
        if output_dir is None:
            continue

        # Sort the pair so (a,b) and (b,a) map to the same file name
        pair = sorted([tup[0], tup[1]])
        output_file = "%s/%s_relations.json" % (
            output_dir, "_".join(pair).replace(" ", ""))

        # Never overwrite a relation file that is already on disk
        if not os.path.exists(output_file):
            save_pretty_json(relation, output_file)

    return links
def save_meta(uid, meta, output_dir, prefix=""):
    '''save_meta

    Write the meta dictionary for one article to
    [output_dir]/[uid]_[prefix_]meta.txt via save_pretty_json, and hand
    back whatever save_pretty_json returns.

    Parameters
    ==========
    uid: int or string
        a unique ID for the article
    meta: dict
        dictionary with meta info, and labels
    output_dir: path
        full path to a plugins corpus directory
    prefix: string
        optional tag placed between the uid and "meta" in the file name;
        an empty string (the default) adds nothing.
    '''
    # A non-empty prefix gets a trailing underscore so the name reads
    # uid_prefix_meta.txt rather than uid_prefixmeta.txt
    tag = "%s_" % (prefix) if prefix != "" else prefix
    destination = "%s/%s_%smeta.txt" % (output_dir, uid, tag)
    return save_pretty_json(meta, destination)
def save_terms(input_terms, output_dir=None):
    '''save_terms

    Build the "nodes" structure for a set of terms and optionally save
    it to [output_dir]/terms.json.

    Parameters
    ==========
    input_terms: str,list,dict
        a single term, a list of terms, or a dictionary of terms.
        For a list (or single string) each node is
        {"name": lowercased term, "uid": position in the list as string}.
        If meta data are used to describe the input_terms, provide a
        dictionary {term: {"meta_label":"meta_value"}}; each node is then
        a copy of the meta dictionary with "uid" set to the lowercased
        term. The caller's dictionaries are not modified.
    output_dir: path
        path to save output. If None, will just return the dictionary

    Returns
    =======
    result: dict
        dictionary structure with the following format (parallel to what
        many d3 algorithms use to define graphs)
        {"nodes":[{"name":"node1"}, {"name":"node2"}]}
        None is returned (after printing a message) when input_terms is
        not a str, list or dict.
    '''
    if isinstance(input_terms, str):
        input_terms = [input_terms]

    if isinstance(input_terms, list):
        nodes = [{"name": term.lower(), "uid": str(index)}
                 for index, term in enumerate(input_terms)]

    elif isinstance(input_terms, dict):
        nodes = []
        # items() works on both Python 2 and 3 (iteritems is Python 2 only)
        for node, meta in input_terms.items():
            # Copy so adding "uid" does not leak into the caller's data
            entry = dict(meta)
            entry["uid"] = str(node).lower()
            nodes.append(entry)

    else:
        print("Invalid input_terms, must be str, dict, or list.")
        return None

    result = {"nodes": nodes}
    if output_dir is not None:
        save_pretty_json(result, "%s/terms.json" % (output_dir))
    return result
def save_terms(input_terms, output_dir=None):
    '''save_terms

    Build the "nodes" structure for a set of terms and optionally save
    it to [output_dir]/terms.json.

    Parameters
    ==========
    input_terms: str,list,dict
        a single term, a list of terms, or a dictionary of terms.
        For a list (or single string) each node is
        {"name": lowercased term, "uid": position in the list as string}.
        If meta data are used to describe the input_terms, provide a
        dictionary {term: {"meta_label":"meta_value"}}; each node is then
        a copy of the meta dictionary with "uid" set to the lowercased
        term. The caller's dictionaries are not modified.
    output_dir: path
        path to save output. If None, will just return the dictionary

    Returns
    =======
    result: dict
        dictionary structure with the following format (parallel to what
        many d3 algorithms use to define graphs)
        {"nodes":[{"name":"node1"}, {"name":"node2"}]}
        None is returned (after printing a message) when input_terms is
        not a str, list or dict.
    '''
    if isinstance(input_terms, str):
        input_terms = [input_terms]

    if isinstance(input_terms, list):
        nodes = [{"name": term.lower(), "uid": str(index)}
                 for index, term in enumerate(input_terms)]

    elif isinstance(input_terms, dict):
        nodes = []
        # items() works on both Python 2 and 3 (iteritems is Python 2 only)
        for node, meta in input_terms.items():
            # Copy so adding "uid" does not leak into the caller's data
            entry = dict(meta)
            entry["uid"] = str(node).lower()
            nodes.append(entry)

    else:
        print("Invalid input_terms, must be str, dict, or list.")
        return None

    result = {"nodes": nodes}
    if output_dir is not None:
        save_pretty_json(result, "%s/terms.json" % (output_dir))
    return result
def get_terms(analysis_dir, subset=True):
    '''get_terms

    For all terms defined, and relationships for the terms, parse into
    a single data structure. This (maybe) won't work for larger datasets
    (we will use a database) but it will for testing.

        nodes: {"[plugin]::[uid]":[node]}
        edges: {"[plugin]::[source]<>[plugin]::[target]":
                    {"source":..., "target":..., "value":...}}

    The result is also written to [analysis_dir]/terms/terms.json when
    the terms directory exists.

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory
    subset: boolean
        if True (the default), returns {plugin_name: {"nodes","edges"}}
        with one entry per plugin folder. If False, nodes and edges of
        all plugins are merged and returned as
        {"all": {"nodes","edges"}}.

    Returns
    =======
    result: dict
        structure described above. When [analysis_dir]/terms does not
        exist, or holds no plugin folders, the same structure is
        returned empty instead of failing on an undefined variable;
        nothing is written if the terms directory is missing.
    '''
    terms_dir = "%s/terms" % (os.path.abspath(analysis_dir))

    # Defined up front so both return shapes exist even with no plugins
    results = dict()
    combined = {"nodes": dict(), "edges": dict()}

    if not os.path.exists(terms_dir):
        return results if subset else {"all": combined}

    for term_plugin in find_directories(terms_dir):
        plugin_name = os.path.basename(term_plugin)

        # Per-plugin containers when subsetting, shared ones otherwise
        if subset:
            result = {"nodes": dict(), "edges": dict()}
            results[plugin_name] = result
        else:
            result = combined
        nodes = result["nodes"]
        edges = result["edges"]

        # Here we parse together terms
        terms_file = "%s/terms.json" % term_plugin
        if os.path.exists(terms_file):
            for node in read_json(terms_file)["nodes"]:
                if "uid" in node:
                    uid = "%s::%s" % (plugin_name, node["uid"])
                else:
                    # No uid: fall back to the name with spaces removed
                    feature_name = node["name"].replace(" ", "_")
                    uid = "%s::%s" % (plugin_name, feature_name)
                nodes[uid] = node

        # Here we parse together relationships
        # Currently only supported for terms within the same family
        relations_file = "%s/term_relationships.json" % term_plugin
        if os.path.exists(relations_file):
            for relation in read_json(relations_file)["edges"]:
                uid_1 = "%s::%s" % (plugin_name, relation["source"])
                uid_2 = "%s::%s" % (plugin_name, relation["target"])
                relation_uid = "%s<>%s" % (uid_1, uid_2)
                edges[relation_uid] = {"source": uid_1,
                                       "target": uid_2,
                                       "value": relation["value"]}

    result = results if subset else {"all": combined}

    # Save, then return the result with all edges and nodes defined
    save_pretty_json(result, "%s/terms/terms.json" % (analysis_dir))
    return result
entry = { 'categories': result.categories, 'title': result.title, 'method': method, 'url': result.url, 'summary': result.summary, 'images': result.images } # We can use links to calculate relatedness entry['links'] = get_attribute(result, 'links') entry['references'] = get_attribute(result, 'references') results[method] = entry save_pretty_json(results, "wikipedia_math_articles.json") ## STEP 2: EQUATIONS ########################################################### equations = dict() for pair in pages: domain = pair[0] method = pair[1] if method not in equations: print("Extracting equations from %s" % (method)) result = WikipediaPage(method) html = result.html() soup = BeautifulSoup(html, 'lxml') equation_list = []
'title': result.title, 'method': method, 'url': result.url, 'summary': result.summary, 'images': result.images } # We can use links to calculate relatedness entry['links'] = get_attribute(result, 'links') entry['references'] = get_attribute(result, 'references') key = result.url.split('/')[-1] results[method] = entry # Save to pickle and json, just for fallback save_pretty_json(results, "wikipedia_statistics_articles.json") pickle.dump(results, open('wikipedia_statistics_articles.pkl', 'wb')) len(results) # 2807 ## Step 3: Equations ########################################################### equations = dict() for method in methods: if method not in equations: print("Extracting equations from %s" %(method)) result = WikipediaPage(method) html = result.html() soup = BeautifulSoup(html, 'lxml')
def get_terms(analysis_dir, subset=True):
    '''get_terms

    For all terms defined, and relationships for the terms, parse into
    a single data structure. This (maybe) won't work for larger datasets
    (we will use a database) but it will for testing.

        nodes: {"[plugin]::[uid]":[node]}
        edges: {"[plugin]::[source]<>[plugin]::[target]":
                    {"source":..., "target":..., "value":...}}

    The result is also written to [analysis_dir]/terms/terms.json when
    the terms directory exists.

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory
    subset: boolean
        if True (the default), returns {plugin_name: {"nodes","edges"}}
        with one entry per plugin folder. If False, nodes and edges of
        all plugins are merged and returned as
        {"all": {"nodes","edges"}}.

    Returns
    =======
    result: dict
        structure described above. When [analysis_dir]/terms does not
        exist, or holds no plugin folders, the same structure is
        returned empty instead of failing on an undefined variable;
        nothing is written if the terms directory is missing.
    '''
    terms_dir = "%s/terms" % (os.path.abspath(analysis_dir))

    # Defined up front so both return shapes exist even with no plugins
    results = dict()
    combined = {"nodes": dict(), "edges": dict()}

    if not os.path.exists(terms_dir):
        return results if subset else {"all": combined}

    for term_plugin in find_directories(terms_dir):
        plugin_name = os.path.basename(term_plugin)

        # Per-plugin containers when subsetting, shared ones otherwise
        if subset:
            result = {"nodes": dict(), "edges": dict()}
            results[plugin_name] = result
        else:
            result = combined
        nodes = result["nodes"]
        edges = result["edges"]

        # Here we parse together terms
        terms_file = "%s/terms.json" % term_plugin
        if os.path.exists(terms_file):
            for node in read_json(terms_file)["nodes"]:
                if "uid" in node:
                    uid = "%s::%s" % (plugin_name, node["uid"])
                else:
                    # No uid: fall back to the name with spaces removed
                    feature_name = node["name"].replace(" ", "_")
                    uid = "%s::%s" % (plugin_name, feature_name)
                nodes[uid] = node

        # Here we parse together relationships
        # Currently only supported for terms within the same family
        relations_file = "%s/term_relationships.json" % term_plugin
        if os.path.exists(relations_file):
            for relation in read_json(relations_file)["edges"]:
                uid_1 = "%s::%s" % (plugin_name, relation["source"])
                uid_2 = "%s::%s" % (plugin_name, relation["target"])
                relation_uid = "%s<>%s" % (uid_1, uid_2)
                edges[relation_uid] = {"source": uid_1,
                                       "target": uid_2,
                                       "value": relation["value"]}

    result = results if subset else {"all": combined}

    # Save, then return the result with all edges and nodes defined
    save_pretty_json(result, "%s/terms/terms.json" % (analysis_dir))
    return result