コード例 #1
0
ファイル: terms.py プロジェクト: word-fish/wordfish-python
def save_relations(relations,output_dir=None):
    '''save_relations
    Convert relation tuples into link dictionaries and, when an output
    directory is given, write each link to its own json file.

    Parameters
    =========
    relations: list of tuples [(source,target,relation)]
        source and target are term names (strings). The original author's note
        says all keys must be in input_terms; that is not checked here.
        Not yet decided what a "relation" should be, but for now assume you
        can have it be a string or number.
    output_dir: path
        path to save output. If None, nothing is written and the links are
        only returned.
    Returns
    =======
    links: list
         list of links, one per input tuple, in input order
         [{"source":"node1","target":"node2","value":0.5}]

    '''
    links = []

    for tup in relations:
        relation = {"source":tup[0],"target":tup[1],"value":tup[2]}
        # Always collect the link. Previously this happened only when
        # output_dir was set, so callers without an output_dir got [].
        links.append(relation)

        if output_dir is not None:
            # Sort the pair so (a,b) and (b,a) map to the same file name
            pair = sorted([tup[0],tup[1]])
            output_file = "%s/%s_relations.json" %(output_dir,"_".join(pair).replace(" ",""))
            # Never overwrite a relation file that already exists
            if not os.path.exists(output_file):
                save_pretty_json(relation,output_file)

    return links
コード例 #2
0
ファイル: terms.py プロジェクト: pvk444/wordfish-python
def save_relations(relations, output_dir=None):
    '''save_relations
    Convert relation tuples into link dictionaries and, when an output
    directory is given, write each link to its own json file.

    Parameters
    =========
    relations: list of tuples [(source,target,relation)]
        source and target are term names (strings). The original author's note
        says all keys must be in input_terms; that is not checked here.
        Not yet decided what a "relation" should be, but for now assume you
        can have it be a string or number.
    output_dir: path
        path to save output. If None, nothing is written and the links are
        only returned.
    Returns
    =======
    links: list
         list of links, one per input tuple, in input order
         [{"source":"node1","target":"node2","value":0.5}]

    '''
    links = []

    for tup in relations:
        relation = {"source": tup[0], "target": tup[1], "value": tup[2]}
        # Always collect the link. Previously this happened only when
        # output_dir was set, so callers without an output_dir got [].
        links.append(relation)

        if output_dir is not None:
            # Sort the pair so (a,b) and (b,a) map to the same file name
            pair = sorted([tup[0], tup[1]])
            output_file = "%s/%s_relations.json" % (
                output_dir, "_".join(pair).replace(" ", ""))
            # Never overwrite a relation file that already exists
            if not os.path.exists(output_file):
                save_pretty_json(relation, output_file)

    return links
コード例 #3
0
def save_meta(uid, meta, output_dir, prefix=""):
    '''save_meta
    Write the meta data for one article to
    [output_dir]/[uid]_[prefix_]meta.txt using save_pretty_json.

    Parameters
    ==========
    uid: int or string
        a unique ID for the article
    meta: dict
        dictionary with meta info, and labels
    output_dir: path
        full path to a plugins corpus directory
    prefix: string
        optional tag placed (followed by an underscore) between the uid and
        "meta.txt" in the file name. Default "" adds nothing.

    Returns
    =======
    whatever save_pretty_json returns for the written file
    '''
    # Only a non-empty prefix gets the trailing underscore separator
    tag = prefix if prefix == "" else "%s_" % prefix
    meta_path = "%s/%s_%smeta.txt" % (output_dir, uid, tag)
    return save_pretty_json(meta, meta_path)
コード例 #4
0
ファイル: terms.py プロジェクト: word-fish/wordfish-python
def save_terms(input_terms,output_dir=None):
    '''save_terms
    Convert terms into a d3-style node list and optionally save it as
    [output_dir]/terms.json

    Parameters
    =========
    input_terms: str,list,dict
        a single term, a list of terms, or a dictionary of terms. if meta data
        are used to describe the input_terms, provide the input_terms as a
        dictionary {"term":{"meta_label":"meta_value"}}
    output_dir: path
        path to save output. If None, will just return the dictionary
    Returns
    =======
    result: dict
        dictionary structure with the following format (parallel to what many d3
        algorithms use to define graphs)

        {"nodes":[{"name":"node1","uid":"0"},
                  {"name":"node2","uid":"1"}]}

        For str/list input each node gets the lowercased term as "name" and its
        list index (as a string) as "uid". For dict input each node is a copy
        of the term's meta dictionary with "uid" set to the lowercased key.
        Returns None (after printing a message) for any other input type.

    '''
    if isinstance(input_terms,str):
        input_terms = [input_terms]

    if isinstance(input_terms,list):
        nodes = [{"name":term.lower(),"uid":str(idx)}
                 for idx,term in enumerate(input_terms)]
    elif isinstance(input_terms,dict):
        nodes = []
        # items() works on Python 2 and 3 (iteritems() is Python 2 only)
        for node, meta in input_terms.items():
            # Copy so that the caller's meta dictionaries are not modified
            entry = dict(meta)
            entry["uid"] = str(node).lower()
            nodes.append(entry)
    else:
        print("Invalid input_terms, must be str, dict, or list.")
        return

    result = {"nodes":nodes}
    if output_dir is not None:
        save_pretty_json(result,"%s/terms.json" %(output_dir))
    return result
コード例 #5
0
ファイル: terms.py プロジェクト: pvk444/wordfish-python
def save_terms(input_terms, output_dir=None):
    '''save_terms
    Convert terms into a d3-style node list and optionally save it as
    [output_dir]/terms.json

    Parameters
    =========
    input_terms: str,list,dict
        a single term, a list of terms, or a dictionary of terms. if meta data
        are used to describe the input_terms, provide the input_terms as a
        dictionary {"term":{"meta_label":"meta_value"}}
    output_dir: path
        path to save output. If None, will just return the dictionary
    Returns
    =======
    result: dict
        dictionary structure with the following format (parallel to what many d3
        algorithms use to define graphs)

        {"nodes":[{"name":"node1","uid":"0"},
                  {"name":"node2","uid":"1"}]}

        For str/list input each node gets the lowercased term as "name" and its
        list index (as a string) as "uid". For dict input each node is a copy
        of the term's meta dictionary with "uid" set to the lowercased key.
        Returns None (after printing a message) for any other input type.

    '''
    if isinstance(input_terms, str):
        input_terms = [input_terms]

    if isinstance(input_terms, list):
        nodes = [{"name": term.lower(), "uid": str(idx)}
                 for idx, term in enumerate(input_terms)]
    elif isinstance(input_terms, dict):
        nodes = []
        # items() works on Python 2 and 3 (iteritems() is Python 2 only)
        for node, meta in input_terms.items():
            # Copy so that the caller's meta dictionaries are not modified
            entry = dict(meta)
            entry["uid"] = str(node).lower()
            nodes.append(entry)
    else:
        print("Invalid input_terms, must be str, dict, or list.")
        return

    result = {"nodes": nodes}
    if output_dir is not None:
        save_pretty_json(result, "%s/terms.json" % (output_dir))
    return result
コード例 #6
0
ファイル: terms.py プロジェクト: word-fish/wordfish-python
def get_terms(analysis_dir,subset=True):
    '''
    For all terms defined, and relationships for the terms, parse into a single data structure
    This (maybe) won't work for larger datasets (we will use a database) but it will for testing.

        nodes:

            {"[plugin]::[uid]":[node]}

        edges:

            {"[plugin]::[source]<>[plugin]::[target]":{"source":..,"target":..,"value":..}}

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory
    subset: boolean
        if True, returns terms in dictionary based on source tag
        {"[plugin]":{"nodes":..,"edges":..}}. If False, nodes and edges of all
        plugins are merged into {"all":{"nodes":..,"edges":..}}. Default==True

    Returns
    =======
    result: dict
        the structure described above. When [analysis_dir]/terms exists the
        result is also saved to [analysis_dir]/terms/terms.json. When it does
        not exist an empty structure is returned and nothing is written.
    '''

    nodes = dict()
    edges = dict()
    results = dict()

    terms_dir = "%s/terms" %(os.path.abspath(analysis_dir))
    have_terms = os.path.exists(terms_dir)

    # With no terms folder there are no plugins to parse. Previously this case
    # raised NameError because results/result were never defined.
    term_plugins = find_directories(terms_dir) if have_terms else []

    for term_plugin in term_plugins:
        plugin_name = os.path.basename(term_plugin)

        # For subsets, each plugin gets its own fresh nodes and edges
        if subset:
            nodes = dict()
            edges = dict()

        # Here we parse together terms
        if os.path.exists("%s/terms.json" %term_plugin):
            terms_json = read_json("%s/terms.json" %term_plugin)["nodes"]
            for node in terms_json:
                if "uid" in node:
                    uid = "%s::%s" %(plugin_name,node["uid"])
                else:
                    # No uid provided, fall back to the name of the node
                    feature_name = node["name"].replace(" ","_")
                    uid = "%s::%s" %(plugin_name,feature_name)
                nodes[uid] = node

        # Here we parse together relationships
        # Currently only supported for terms within the same family
        if os.path.exists("%s/term_relationships.json" %term_plugin):
            terms_json = read_json("%s/term_relationships.json" %term_plugin)["edges"]
            for relation in terms_json:
                uid_1 = "%s::%s" %(plugin_name,relation["source"])
                uid_2 = "%s::%s" %(plugin_name,relation["target"])
                relation_uid = "%s<>%s" %(uid_1,uid_2)
                edges[relation_uid] = {"source": uid_1,
                                       "target": uid_2,
                                       "value": relation["value"]}

        if subset:
            results[plugin_name] = {"nodes":nodes,"edges":edges}

    if subset:
        result = results
    else:
        # nodes and edges were accumulated across all plugins
        result = {"all":{"nodes":nodes,"edges":edges}}

    # Return the result to user with all edges and nodes defined
    # Only save when the terms folder exists, otherwise there is nowhere to write
    if have_terms:
        save_pretty_json(result,"%s/terms/terms.json" %(analysis_dir))
    return result
コード例 #7
0
        entry = {
            'categories': result.categories,
            'title': result.title,
            'method': method,
            'url': result.url,
            'summary': result.summary,
            'images': result.images
        }

        # We can use links to calculate relatedness
        entry['links'] = get_attribute(result, 'links')
        entry['references'] = get_attribute(result, 'references')

        results[method] = entry

save_pretty_json(results, "wikipedia_math_articles.json")

## STEP 2: EQUATIONS ###########################################################

equations = dict()

for pair in pages:
    domain = pair[0]
    method = pair[1]
    if method not in equations:
        print("Extracting equations from %s" % (method))
        result = WikipediaPage(method)
        html = result.html()
        soup = BeautifulSoup(html, 'lxml')

        equation_list = []
コード例 #8
0
                  'title': result.title,
                  'method': method,
                  'url': result.url,
                  'summary': result.summary,
                  'images': result.images }

        # We can use links to calculate relatedness
        entry['links'] = get_attribute(result, 'links')
        entry['references'] = get_attribute(result, 'references')
        key = result.url.split('/')[-1]

        results[method] = entry
        

# Save to pickle and json, just for fallback
save_pretty_json(results, "wikipedia_statistics_articles.json")
# Use a context manager so the pickle file is flushed and closed right away
with open('wikipedia_statistics_articles.pkl', 'wb') as pickle_file:
    pickle.dump(results, pickle_file)
len(results)
# 2807

## Step 3: Equations ###########################################################

equations = dict()

for method in methods:
    if method not in equations:
        print("Extracting equations from %s" %(method))
        result = WikipediaPage(method)
        html = result.html()
        soup = BeautifulSoup(html, 'lxml')
コード例 #9
0
ファイル: terms.py プロジェクト: pvk444/wordfish-python
def get_terms(analysis_dir, subset=True):
    '''
    For all terms defined, and relationships for the terms, parse into a single data structure
    This (maybe) won't work for larger datasets (we will use a database) but it will for testing.

        nodes:

            {"[plugin]::[uid]":[node]}

        edges:

            {"[plugin]::[source]<>[plugin]::[target]":{"source":..,"target":..,"value":..}}

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory
    subset: boolean
        if True, returns terms in dictionary based on source tag
        {"[plugin]":{"nodes":..,"edges":..}}. If False, nodes and edges of all
        plugins are merged into {"all":{"nodes":..,"edges":..}}. Default==True

    Returns
    =======
    result: dict
        the structure described above. When [analysis_dir]/terms exists the
        result is also saved to [analysis_dir]/terms/terms.json. When it does
        not exist an empty structure is returned and nothing is written.
    '''

    nodes = dict()
    edges = dict()
    results = dict()

    terms_dir = "%s/terms" % (os.path.abspath(analysis_dir))
    have_terms = os.path.exists(terms_dir)

    # With no terms folder there are no plugins to parse. Previously this case
    # raised NameError because results/result were never defined.
    term_plugins = find_directories(terms_dir) if have_terms else []

    for term_plugin in term_plugins:
        plugin_name = os.path.basename(term_plugin)

        # For subsets, each plugin gets its own fresh nodes and edges
        if subset:
            nodes = dict()
            edges = dict()

        # Here we parse together terms
        if os.path.exists("%s/terms.json" % term_plugin):
            terms_json = read_json("%s/terms.json" % term_plugin)["nodes"]
            for node in terms_json:
                if "uid" in node:
                    uid = "%s::%s" % (plugin_name, node["uid"])
                else:
                    # No uid provided, fall back to the name of the node
                    feature_name = node["name"].replace(" ", "_")
                    uid = "%s::%s" % (plugin_name, feature_name)
                nodes[uid] = node

        # Here we parse together relationships
        # Currently only supported for terms within the same family
        if os.path.exists("%s/term_relationships.json" % term_plugin):
            terms_json = read_json("%s/term_relationships.json" %
                                   term_plugin)["edges"]
            for relation in terms_json:
                uid_1 = "%s::%s" % (plugin_name, relation["source"])
                uid_2 = "%s::%s" % (plugin_name, relation["target"])
                relation_uid = "%s<>%s" % (uid_1, uid_2)
                edges[relation_uid] = {
                    "source": uid_1,
                    "target": uid_2,
                    "value": relation["value"]
                }

        if subset:
            results[plugin_name] = {"nodes": nodes, "edges": edges}

    if subset:
        result = results
    else:
        # nodes and edges were accumulated across all plugins
        result = {"all": {"nodes": nodes, "edges": edges}}

    # Return the result to user with all edges and nodes defined
    # Only save when the terms folder exists, otherwise there is nowhere to write
    if have_terms:
        save_pretty_json(result, "%s/terms/terms.json" % (analysis_dir))
    return result