Example #1
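This example parses a short action list with ActionsParser, loads stored message #2 (the acetaminophen proteins query result) from the RTXFeedback database, converts the returned dict into a Message object, and applies ARAXFilter with the parameters of the first parsed action before printing the response, per-result essences, and message stats.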
# NOTE: this snippet assumes the ARAX Response and ARAXFilter classes are imported
# or defined elsewhere in this module; the standard-library imports below are needed
import sys
import os
import json
import ast

def main():

    #### Create a response object
    response = Response()

    #### Create an ActionsParser object
    from actions_parser import ActionsParser
    actions_parser = ActionsParser()
 
    #### Set a simple list of actions
    actions_list = [
        "filter(start_node=1, maximum_results=10, minimum_confidence=0.5)",
        "return(message=true,store=false)"
    ]

    #### Parse the action_list and print the result
    result = actions_parser.parse(actions_list)
    response.merge(result)
    if result.status != 'OK':
        print(response.show(level=Response.DEBUG))
        return response
    actions = result.data['actions']

    #### Read message #2 from the database. This should be the acetaminophen proteins query result message
    sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../../UI/Feedback")
    from RTXFeedback import RTXFeedback
    araxdb = RTXFeedback()
    message_dict = araxdb.getMessage(2)

    #### The stored message comes back as a dict. Transform it to objects
    sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../OpenAPI/python-flask-server/")
    from swagger_server.models.message import Message
    message = Message().from_dict(message_dict)

    #### Create a filter object and use it to apply action[0] from the list
    filter = ARAXFilter()
    result = filter.apply(message,actions[0]['parameters'])
    response.merge(result)
    if result.status != 'OK':
        print(response.show(level=Response.DEBUG))
        return response
    response.data = result.data

    #### Show the final message
    print(response.show(level=Response.DEBUG))
    response.data['message_stats'] = { 'n_results': message.n_results, 'id': message.id,
        'reasoner_id': message.reasoner_id, 'tool_version': message.tool_version }
    print(json.dumps(ast.literal_eval(repr(response.data['parameters'])),sort_keys=True,indent=2))
    for result in message.results:
        if result.essence is not None:
            essence = result.essence
        else:
            essence = f"{len(result.node_bindings)} node bindings, {len(result.edge_bindings)} edge bindings"
        print(f" - {essence}")
    print(json.dumps(ast.literal_eval(repr(response.data['message_stats'])),sort_keys=True,indent=2))
Example #2
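This example manually performs the steps that ARAXQuery() would normally run: it parses an action list, then dispatches each command to ARAXMessenger (create_message, add_qnode, add_qedge) or ARAXExpander (expand), merging each result into the response and stopping at the first error.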
# NOTE: this snippet assumes the ARAX Response class and ARAXExpander are imported
# or defined elsewhere in this module (json and ast would also be needed to
# uncomment the knowledge-graph dump at the end)
def main():

    # Note that most of this is just manually doing what ARAXQuery() would normally do for you
    response = Response()
    from actions_parser import ActionsParser
    actions_parser = ActionsParser()
    actions_list = [
        "create_message",
        "add_qnode(id=n00, curie=CHEMBL.COMPOUND:CHEMBL112)",  # acetaminophen
        "add_qnode(id=n01, type=protein, is_set=true)",
        "add_qedge(id=e00, source_id=n00, target_id=n01)",
        "expand(edge_id=e00, kp=BTE)",
        "return(message=true, store=false)",
    ]

    # Parse the raw action_list into commands and parameters
    result = actions_parser.parse(actions_list)
    response.merge(result)
    if result.status != 'OK':
        print(response.show(level=Response.DEBUG))
        return response
    actions = result.data['actions']

    from ARAX_messenger import ARAXMessenger
    messenger = ARAXMessenger()
    expander = ARAXExpander()
    for action in actions:
        if action['command'] == 'create_message':
            result = messenger.create_message()
            message = result.data['message']
            response.data = result.data
        elif action['command'] == 'add_qnode':
            result = messenger.add_qnode(message, action['parameters'])
        elif action['command'] == 'add_qedge':
            result = messenger.add_qedge(message, action['parameters'])
        elif action['command'] == 'expand':
            result = expander.apply(message, action['parameters'])
        elif action['command'] == 'return':
            break
        else:
            response.error(f"Unrecognized command {action['command']}",
                           error_code="UnrecognizedCommand")
            print(response.show(level=Response.DEBUG))
            return response

        # Merge down this result and end if we're in an error state
        response.merge(result)
        if result.status != 'OK':
            print(response.show(level=Response.DEBUG))
            return response

    # Show the final response
    # print(json.dumps(ast.literal_eval(repr(message.knowledge_graph)),sort_keys=True,indent=2))
    print(response.show(level=Response.DEBUG))
Example #3
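This example is a worker's run() method that reclusters previously processed sentences: it reads the reclustering parameters from self.params, calls topex.recluster(), and packages the updated cluster assignments, linkage matrix, and cluster topics into a Response.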
    # NOTE: this method assumes module-level imports of json, pandas as pd, and
    # numpy as np, plus the topex package, the cast_int helper, and a Response
    # class defined elsewhere in the module
    def run(self):
        res = Response()
        params = self.params

        # Parse the reclustering parameters sent with the request
        max_thresh = cast_int(params['max_thresh'])
        n = cast_int(params['n'])
        data = pd.DataFrame.from_dict(json.loads(params['data']))
        viz_df = pd.DataFrame.from_dict(json.loads(params['viz_df']))
        cluster_method = params['clusteringMethod']
        linkage_matrix = np.array([
            float(x) for x in params['linkage_matrix'].split(',')
        ]).reshape(n - 1, 4) if cluster_method == "hac" else None
        height = cast_int(params['threshold']) if cluster_method == "hac" else None
        k = cast_int(params['threshold']) if cluster_method == "kmeans" else None
        min_cluster_size = cast_int(params['minClusterSize'])
        topics_per_cluster = cast_int(params['topicsPerCluster'])

        # Recluster
        self.status = 'Reclustering...'
        data, cluster_df = topex.recluster(
            data,
            viz_df,
            linkage_matrix=linkage_matrix,
            cluster_method=cluster_method,
            height=height,
            k=k,
            min_cluster_size=min_cluster_size,
            topics_per_cluster=topics_per_cluster,
            show_chart=False)
        viz_df.cluster = data.cluster
        viz_df['valid'] = data.valid

        # Package the results for the response (res was already created above)
        res.viz_df = viz_df.to_json()
        # Only return the needed subset of data columns
        res.data = data[['id', 'text', 'tokens', 'phrase', 'vec', 'cluster', 'valid']].to_json()
        res.linkage_matrix = [list(row) for row in linkage_matrix] if linkage_matrix is not None else []
        res.main_cluster_topics = list(cluster_df.topics)
        res.count = len(data)
        res.max_thresh = max_thresh
        res.thresh = height if cluster_method == "hac" else k

        self.result = dict(res)
        self.status = 'Complete'
Example #4
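This example is a worker's run() method for the full clustering pipeline: it loads the uploaded documents, parses the clustering and visualization parameters, runs the topex steps (import_data, create_tfidf, get_phrases, get_vectors, assign_clusters, visualize_clustering, get_cluster_topics), and returns the results in a Response.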
    # NOTE: this method assumes module-level imports of json and pandas as pd, plus
    # the topex package, the cast_int and str_valid helpers, and a Response class
    # defined elsewhere in the module
    def run(self):
        res = Response()
        params = self.params
        files = self.files

        # Process input from request
        self.status = 'Loading files'
        names = []
        docs = []
        for file in files:
            fileob = files[file]
            print(f"File: {fileob}")
            if fileob.content_type == 'application/json':
                scriptArgs = json.loads(fileob.stream.read())
            else:
                fileText = fileob.read().decode()
                docs.append(fileText)
                names.append(fileob.filename)
        docs = [doc.replace('\n', ' ').replace('\r', ' ') for doc in docs]
        df = pd.DataFrame(dict(doc_name=names, text=docs))

        self.status = 'Parsing params'
        stopwords = ([s.strip() for s in params['stopwords'].split('\n')]
                     if str_valid(params['stopwords']) else None)
        window_size = cast_int(params['windowSize'])
        vectorization_method = params['wordVectorType'] if str_valid(params['wordVectorType']) else 'svd'
        dimensions = cast_int(params['dimensions'])
        tfidf_corpus = params['tfidfCorpus'] if str_valid(params['tfidfCorpus']) else 'both'
        include_sentiment = params['include_sentiment'] != 'false'
        custom_stopwords_only = params['custom_stopwords_only'] != 'false'

        clustering_method = params['clusteringMethod']
        cluster_dist_metric = params['cluster_dist_metric'] if str_valid(params['cluster_dist_metric']) else 'euclidean'
        height = cast_int(params['threshold']) if clustering_method == "hac" else None
        k = cast_int(params['threshold']) if clustering_method == "kmeans" else None

        visualization_method = params['visualizationMethod'] if str_valid(params['visualizationMethod']) else 'umap'
        viz_dist_metric = params['viz_dist_metric'] if str_valid(params['viz_dist_metric']) else 'cosine'
        umap_neighbors = cast_int(params['umap_neighbors'])
        umap_neighbors = cast_int(params['umap_neighbors'])

        if str_valid(params['expansionCorpus']):
            expansionCorpus = params['expansionCorpus'].rstrip("<newdoc>")
            expansion_docs = expansionCorpus.split(
                "<newdoc>") if len(expansionCorpus) > 0 else []
            expansion_names = [
                f"expansion_{i}" for i in range(len(expansion_docs))
            ]
            expansion_df = pd.DataFrame(
                dict(doc_name=expansion_names, text=expansion_docs))
        else:
            expansion_df = None
            tfidf_corpus = 'clustering'

        # Cluster the sentences in a dataframe
        self.status = 'Importing data'
        data, doc_df = topex.import_data(
            df,
            save_results=False,
            file_name=None,
            stop_words_list=stopwords,
            custom_stopwords_only=custom_stopwords_only)
        self.status = 'Creating TF-IDF'
        tfidf, dictionary = topex.create_tfidf(tfidf_corpus,
                                               doc_df,
                                               expansion_df=expansion_df)

        if dimensions is None or dimensions >= tfidf.shape[1]:
            new_dim = min(200, tfidf.shape[1] - 1)
            res.msg += f"Dimensions changed from {dimensions} to {new_dim}.\n"
            dimensions = 2 if vectorization_method == 'umap' else new_dim

        self.status = 'Getting phrases'
        data = topex.get_phrases(data,
                                 dictionary.token2id,
                                 tfidf,
                                 tfidf_corpus=tfidf_corpus,
                                 window_size=window_size,
                                 include_sentiment=include_sentiment)
        self.status = 'Vectorizing phrases'
        data = topex.get_vectors(vectorization_method,
                                 data,
                                 dictionary=dictionary,
                                 tfidf=tfidf,
                                 dimensions=dimensions,
                                 umap_neighbors=umap_neighbors)

        if clustering_method == 'kmeans' and k > len(data):
            res.msg += f"k exceeds number of sentences. Changed from {k} to {len(data)}.\n"
            k = len(data)

        self.status = 'Clustering sentences'
        data, linkage_matrix, max_thresh, thresh = topex.assign_clusters(
            data,
            method=clustering_method,
            k=k,
            height=height,
            dist_metric=cluster_dist_metric)
        self.status = 'Visualizing sentences'
        viz_df = topex.visualize_clustering(data,
                                            method=visualization_method,
                                            dist_metric=viz_dist_metric,
                                            show_chart=False,
                                            return_data=True,
                                            umap_neighbors=umap_neighbors)
        viz_df['valid'] = True
        data['valid'] = True  # Show all points on the first run
        cluster_df = topex.get_cluster_topics(data, doc_df)

        res.viz_df = viz_df.to_json()
        # Only return the needed subset of data columns
        res.data = data[['id', 'text', 'tokens', 'phrase', 'vec', 'cluster', 'valid']].to_json()
        res.linkage_matrix = [list(row) for row in linkage_matrix] if linkage_matrix is not None else []
        res.main_cluster_topics = list(cluster_df.topics)
        res.count = len(data)
        res.max_thresh = max_thresh
        res.thresh = thresh
        self.result = dict(res)
        self.status = 'Complete'