Ejemplo n.º 1
0
def read_relations(relations_list,search_expression=None):
    '''
    Read a list of relation json files, optionally filtered by a regular expression.

    Parameters
    ==========
    relations_list: list
        paths of the relation json files to read
    search_expression: str
        optional regular expression; when given, only paths for which
        re.search finds a match are read. Default==None (read everything)

    Returns
    =======
    list with one entry (the result of read_json) per file that was read
    '''
    if search_expression is not None:
        expression = re.compile(search_expression)
        return [read_json(x) for x in relations_list if expression.search(x)]

    relations = []
    total = len(relations_list)
    for x,relation_file in enumerate(relations_list):
        # Progress counter is zero-based, as in the original output
        print("Parsing %s of %s" %(x,total))
        relations.append(read_json(relation_file))
    return relations
Ejemplo n.º 2
0
def read_relations(relations_list, search_expression=None):
    '''
    Read a list of relation json files, optionally filtered by a regular expression.

    Parameters
    ==========
    relations_list: list
        paths of the relation json files to read
    search_expression: str
        optional regular expression; when given, only paths for which
        re.search finds a match are read. Default==None (read everything)

    Returns
    =======
    list with one entry (the result of read_json) per file that was read
    '''
    if search_expression is not None:
        expression = re.compile(search_expression)
        return [
            read_json(path) for path in relations_list
            if expression.search(path)
        ]

    relations = []
    total = len(relations_list)
    for position, relation_file in enumerate(relations_list):
        # Progress counter is zero-based, as in the original output
        print("Parsing %s of %s" % (position, total))
        relations.append(read_json(relation_file))
    return relations
Ejemplo n.º 3
0
def get_relations_df(base_dir,tags=None):
    '''
    Build one symmetric term-by-term data frame of relation values per tag.

    Relation files are found with the pattern
    <base_dir>/relations/<tag>/*_relations.json and the two terms are taken
    from the first two "_"-separated fields of each file name.

    Parameters
    ==========
    base_dir: path
        directory that contains the "relations" folder
    tags: str or list
        a single tag or a list of tags to parse. Default==None, in which case
        every directory returned by find_directories for the relations
        folder is used

    Returns
    =======
    dict of {tag: pandas.DataFrame}, each frame indexed (rows and columns)
    by term name and holding the "value" field of each relation file
    '''
    if isinstance(tags,str):
        tags = [tags]
    relations_dir = "%s/relations" %(os.path.abspath(base_dir))
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]

    relations = dict()
    for tag in tags:
        print("Finding relations for %s" %(tag))
        relations_files = glob("%s/%s/*_relations.json" %(relations_dir,tag))
        # Terms come from the file name (not the full path) so that they agree
        # with the labels used to index the frame below; both terms are kept
        term_pairs = [os.path.basename(x).split("_")[0:2] for x in relations_files]
        term_names = sorted(set(term for pair in term_pairs for term in pair))
        edges = pandas.DataFrame(columns=term_names,index=term_names)
        total = len(relations_files)
        for r,relation_file in enumerate(relations_files):
            print("Parsing %s of %s" %(r,total))
            term1,term2 = term_pairs[r]
            # Read the file once and mirror the value across the diagonal
            value = read_json(relation_file)["value"]
            edges.loc[term1,term2] = value
            edges.loc[term2,term1] = value
        relations[tag] = edges
    return relations
Ejemplo n.º 4
0
def get_relations_df(base_dir,tags=None):
    '''
    Build one symmetric term-by-term data frame of relation values per tag.

    Relation files are found with the pattern
    <base_dir>/relations/<tag>/*_relations.json and the two terms are taken
    from the first two "_"-separated fields of each file name.

    Parameters
    ==========
    base_dir: path
        directory that contains the "relations" folder
    tags: str or list
        a single tag or a list of tags to parse. Default==None, in which case
        every directory returned by find_directories for the relations
        folder is used

    Returns
    =======
    dict of {tag: pandas.DataFrame}, each frame indexed (rows and columns)
    by term name and holding the "value" field of each relation file
    '''
    if isinstance(tags,str):
        tags = [tags]
    relations_dir = "%s/relations" %(os.path.abspath(base_dir))
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]

    relations = dict()
    for tag in tags:
        print("Finding relations for %s" %(tag))
        relations_files = glob("%s/%s/*_relations.json" %(relations_dir,tag))
        # Use the file name (not the full path) so the names match the labels
        # used for indexing; sorting makes the row/column order deterministic
        term_pairs = [os.path.basename(x).split("_")[0:2] for x in relations_files]
        unique_terms = set()
        for pair in term_pairs:
            unique_terms.update(pair)
        term_names = sorted(unique_terms)
        edges = pandas.DataFrame(columns=term_names,index=term_names)
        total = len(relations_files)
        for r,(relation_file,pair) in enumerate(zip(relations_files,term_pairs)):
            print("Parsing %s of %s" %(r,total))
            term1,term2 = pair
            # Read the file once and mirror the value across the diagonal
            value = read_json(relation_file)["value"]
            edges.loc[term1,term2] = value
            edges.loc[term2,term1] = value
        relations[tag] = edges
    return relations
Ejemplo n.º 5
0
def featurize_to_corpus(model, meta, size=300, fillna=True):
    '''featurize_to_corpus
    generate average feature vectors for a set of documents and their labels based
    on an existing model (model). The meta json file should describe text and labels

    Parameters
    ==========
    model:
        model handed to DeepTextAnalyzer
    meta: list
        paths of "_meta" json files; the matching text file is found by
        replacing "_meta" with "_sentences" in each path
    size: int
        number of columns of the vectors data frame. Default==300
    fillna: boolean
        if True, missing values in both data frames are replaced with 0

    Returns
    =======
    (vectors, labels) tuple of pandas.DataFrame sharing the same row index
    '''
    analyzer = DeepTextAnalyzer(model)
    vectors = pandas.DataFrame(columns=range(size))
    # Get all unique term names from the meta objects
    term_names = get_labels(meta=meta)
    # Create a matrix of labels, keep track of which abstracts are labeled
    labels = pandas.DataFrame(columns=term_names)
    total = len(meta)

    # NOTE(review): the original loop only ever used the values of its final
    # iteration, so exactly one row keyed by the label prefix of the LAST
    # meta file is added here. That behavior is preserved; confirm whether a
    # row per distinct label was intended instead.
    if total > 0:
        last = total - 1
        meta_file = meta[last]
        text = meta_file.replace("_meta", "_sentences")
        label = os.path.basename(text).split("_")[0]
        try:
            print("Processing %s of %s" % (last, total))
            # Compute both values before writing so a failure cannot leave
            # a vectors row without its labels row
            vector = analyzer.text2mean_vector(text)
            document_labels = read_json(meta_file)["labels"]
            vectors.loc[label] = vector
            labels.loc[label, document_labels] = 1
        except Exception as error:
            # Best effort: report and skip instead of silently swallowing
            print("Skipping %s: %s" % (meta_file, error))

    # One row per successfully processed document, keyed by a running counter
    count = 1
    for r, meta_file in enumerate(meta):
        post = meta_file.replace("_meta", "_sentences")
        # Build a model by taking the mean vector
        if count not in vectors.index:
            try:
                print("Processing %s of %s" % (r, total))
                vector = analyzer.text2mean_vector(post)
                document_labels = read_json(meta_file)["labels"]
                vectors.loc[count] = vector
                labels.loc[count, document_labels] = 1
                count += 1
            except Exception as error:
                print("Skipping %s: %s" % (meta_file, error))
    if fillna:
        labels = labels.fillna(0)
        vectors = vectors.fillna(0)
    return vectors, labels
Ejemplo n.º 6
0
def featurize_to_corpus(model,meta,size=300,fillna=True):
    '''featurize_to_corpus
    generate average feature vectors for a set of documents and their labels based
    on an existing model (model). The meta json file should describe text and labels

    Parameters
    ==========
    model:
        model handed to DeepTextAnalyzer
    meta: list
        paths of "_meta" json files; the matching text file is found by
        replacing "_meta" with "_sentences" in each path
    size: int
        number of columns of the vectors data frame. Default==300
    fillna: boolean
        if True, missing values in both data frames are replaced with 0

    Returns
    =======
    (vectors, labels) tuple of pandas.DataFrame sharing the same row index
    '''
    analyzer = DeepTextAnalyzer(model)
    vectors = pandas.DataFrame(columns=range(size))
    # Get all unique term names from the meta objects
    term_names = get_labels(meta=meta)
    # Create a matrix of labels, keep track of which abstracts are labeled
    labels = pandas.DataFrame(columns=term_names)
    total = len(meta)

    # NOTE(review): the original loop only ever used the values of its final
    # iteration, so exactly one row keyed by the label prefix of the LAST
    # meta file is added here. That behavior is preserved; confirm whether a
    # row per distinct label was intended instead.
    if total > 0:
        last = total - 1
        meta_file = meta[last]
        text = meta_file.replace("_meta","_sentences")
        label = os.path.basename(text).split("_")[0]
        try:
            print("Processing %s of %s" %(last,total))
            # Compute both values before writing so a failure cannot leave
            # a vectors row without its labels row
            vector = analyzer.text2mean_vector(text)
            document_labels = read_json(meta_file)["labels"]
            vectors.loc[label] = vector
            labels.loc[label,document_labels] = 1
        except Exception as error:
            # Best effort: report and skip instead of silently swallowing
            print("Skipping %s: %s" %(meta_file,error))

    # One row per successfully processed document, keyed by a running counter
    count = 1
    for r,meta_file in enumerate(meta):
        post = meta_file.replace("_meta","_sentences")
        # Build a model by taking the mean vector
        if count not in vectors.index:
            try:
                print("Processing %s of %s" %(r,total))
                vector = analyzer.text2mean_vector(post)
                document_labels = read_json(meta_file)["labels"]
                vectors.loc[count] = vector
                labels.loc[count,document_labels] = 1
                count+=1
            except Exception as error:
                print("Skipping %s: %s" %(meta_file,error))
    if fillna:
        labels = labels.fillna(0)
        vectors = vectors.fillna(0)
    return vectors,labels
Ejemplo n.º 7
0
def get_terms(analysis_dir,subset=True):
    '''
    For all terms defined, and relationships for the terms, parse into a single data structure
    This (maybe) won't work for larger datasets (we will use a database) but it will for testing.

        nodes:

            {"[plugin]::[uid]":[node]}

        edges:

            {"[uid_1]<>[uid_2]":{"source":uid_1,"target":uid_2,"value":value}}

    When <analysis_dir>/terms exists the result is also written to
    <analysis_dir>/terms/terms.json with save_pretty_json. When it does not
    exist an empty result is returned and nothing is written.

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory
    subset: boolean
        if True, returns terms in dictionary based on source tag
        ({plugin_name: {"nodes":..,"edges":..}}). If False, all plugins are
        merged and returned as {"all": {"nodes":..,"edges":..}}. Default==True
    '''

    nodes = dict()
    edges = dict()
    results = dict()

    terms_dir = "%s/terms" %(os.path.abspath(analysis_dir))
    has_terms_dir = os.path.exists(terms_dir)

    if has_terms_dir:
        for term_plugin in find_directories(terms_dir):
            plugin_name = os.path.basename(term_plugin)

            # Each plugin gets its own containers when subsetting; otherwise
            # the same dictionaries accumulate across every plugin
            if subset:
                nodes = dict()
                edges = dict()

            # Here we parse together terms
            terms_file = "%s/terms.json" %term_plugin
            if os.path.exists(terms_file):
                for node in read_json(terms_file)["nodes"]:
                    if "uid" in node:
                        uid = "%s::%s" %(plugin_name,node["uid"])
                    else:
                        # No uid: fall back to the name with spaces replaced
                        feature_name = node["name"].replace(" ","_")
                        uid = "%s::%s" %(plugin_name,feature_name)
                    nodes[uid] = node

            # Here we parse together relationships
            # Currently only supported for terms within the same family
            relationships_file = "%s/term_relationships.json" %term_plugin
            if os.path.exists(relationships_file):
                for relation in read_json(relationships_file)["edges"]:
                    uid_1 = "%s::%s" %(plugin_name,relation["source"])
                    uid_2 = "%s::%s" %(plugin_name,relation["target"])
                    relation_uid = "%s<>%s" %(uid_1,uid_2)
                    edges[relation_uid] = {"source": uid_1,
                                           "target": uid_2,
                                           "value": relation["value"]}

            if subset:
                results[plugin_name] = {"nodes":nodes,"edges":edges}

    if subset:
        result = results
    else:
        result = {"all":{"nodes":nodes,"edges":edges}}

    # Return the result to user with all edges and nodes defined; only save
    # when the terms directory that should hold the file actually exists
    if has_terms_dir:
        save_pretty_json(result,"%s/terms/terms.json" %(analysis_dir))
    return result
Ejemplo n.º 8
0
def get_labels(meta):
    '''
    Collect the unique labels found across a list of meta json files.

    Parameters
    ==========
    meta: list
        paths of meta json files, each read with read_json and expected to
        hold a "labels" list

    Returns
    =======
    list of unique labels, as produced by numpy.unique(...).tolist()
    '''
    collected = []
    for meta_file in meta:
        # Merge this file's labels in, then de-duplicate the running list
        merged = collected + read_json(meta_file)["labels"]
        collected = numpy.unique(merged).tolist()
    return collected
Ejemplo n.º 9
0
def get_labels(meta):
    '''
    Return the unique labels declared by a list of meta json files.

    Parameters
    ==========
    meta: list
        paths of meta json files; the "labels" entry of each file
        (as returned by read_json) is merged into the result

    Returns
    =======
    list of unique labels, as produced by numpy.unique(...).tolist()
    '''
    unique_labels = []
    for path in meta:
        file_labels = read_json(path)["labels"]
        # De-duplicate after every file, keeping the running list small
        unique_labels = numpy.unique(unique_labels + file_labels).tolist()
    return unique_labels
Ejemplo n.º 10
0
def get_terms(analysis_dir, subset=True):
    '''
    For all terms defined, and relationships for the terms, parse into a single data structure
    This (maybe) won't work for larger datasets (we will use a database) but it will for testing.

        nodes:

            {"[plugin]::[uid]":[node]}

        edges:

            {"[uid_1]<>[uid_2]":{"source":uid_1,"target":uid_2,"value":value}}

    When <analysis_dir>/terms exists the result is also written to
    <analysis_dir>/terms/terms.json with save_pretty_json. When it does not
    exist an empty result is returned and nothing is written.

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory
    subset: boolean
        if True, returns terms in dictionary based on source tag
        ({plugin_name: {"nodes":..,"edges":..}}). If False, all plugins are
        merged and returned as {"all": {"nodes":..,"edges":..}}. Default==True
    '''

    nodes = dict()
    edges = dict()
    results = dict()

    terms_dir = "%s/terms" % (os.path.abspath(analysis_dir))
    has_terms_dir = os.path.exists(terms_dir)

    if has_terms_dir:
        for term_plugin in find_directories(terms_dir):
            plugin_name = os.path.basename(term_plugin)

            # Each plugin gets its own containers when subsetting; otherwise
            # the same dictionaries accumulate across every plugin
            if subset:
                nodes = dict()
                edges = dict()

            # Here we parse together terms
            terms_file = "%s/terms.json" % term_plugin
            if os.path.exists(terms_file):
                for node in read_json(terms_file)["nodes"]:
                    if "uid" in node:
                        uid = "%s::%s" % (plugin_name, node["uid"])
                    else:
                        # No uid: fall back to the name with spaces replaced
                        feature_name = node["name"].replace(" ", "_")
                        uid = "%s::%s" % (plugin_name, feature_name)
                    nodes[uid] = node

            # Here we parse together relationships
            # Currently only supported for terms within the same family
            relationships_file = "%s/term_relationships.json" % term_plugin
            if os.path.exists(relationships_file):
                for relation in read_json(relationships_file)["edges"]:
                    uid_1 = "%s::%s" % (plugin_name, relation["source"])
                    uid_2 = "%s::%s" % (plugin_name, relation["target"])
                    relation_uid = "%s<>%s" % (uid_1, uid_2)
                    edges[relation_uid] = {
                        "source": uid_1,
                        "target": uid_2,
                        "value": relation["value"]
                    }

            if subset:
                results[plugin_name] = {"nodes": nodes, "edges": edges}

    if subset:
        result = results
    else:
        result = {"all": {"nodes": nodes, "edges": edges}}

    # Return the result to user with all edges and nodes defined; only save
    # when the terms directory that should hold the file actually exists
    if has_terms_dir:
        save_pretty_json(result, "%s/terms/terms.json" % (analysis_dir))
    return result