Ejemplo n.º 1
0
def query(query: str, input_dir: str, output_dir: str) -> None:
    """Run a knowledge-graph query by delegating to ``run_query``.

    All three arguments are forwarded unchanged, by keyword.

    Args:
        query: Identifies the query to execute (annotated as ``str``;
            presumably the name of a query class in query_utils -- confirm
            against ``run_query``).
        input_dir: Directory holding the input files the query needs
            (typically 'data', where the transformed and merged graph
            files are).
        output_dir: Directory that receives the query results.

    Returns:
        None.

    """
    run_query(
        output_dir=output_dir,
        input_dir=input_dir,
        query=query,
    )
Ejemplo n.º 2
0
def query(yaml: str,
          output_dir: str,
          query_key: str = 'query',
          endpoint_key: str = 'endpoint',
          outfile_ext: str = ".tsv") -> None:
    """Run the SPARQL query described in a YAML file and save the result.

    The file is parsed with ``parse_query_yaml``; the values stored under
    ``query_key`` and ``endpoint_key`` are handed to ``run_query``; the
    result is written with ``result_dict_to_tsv`` to
    ``<output_dir>/<basename of yaml without extension><outfile_ext>``.

    Args:
        yaml: A YAML file containing a SPARQL query (see queries/sparql/ for examples)
        output_dir: Directory to output results of query (created if missing)
        query_key: the key in the yaml file containing the query string
        endpoint_key: the key in the yaml file containing the sparql endpoint URL
        outfile_ext: file extension for output file [.tsv]
    Returns:
        None.

    Raises:
        FileExistsError: if ``output_dir`` exists but is not a directory.

    """
    # Named ``parsed`` rather than ``query`` so the local does not shadow
    # this function's own name.
    parsed = parse_query_yaml(yaml)
    result_dict = run_query(query=parsed[query_key],
                            endpoint=parsed[endpoint_key])

    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # fail if another process created the directory between the two calls.
    os.makedirs(output_dir, exist_ok=True)

    stem = os.path.splitext(os.path.basename(yaml))[0]
    outfile = os.path.join(output_dir, stem + outfile_ext)
    result_dict_to_tsv(result_dict, outfile)
Ejemplo n.º 3
0
def query(yaml: str, output_dir: str,
          query_key: str = 'query', endpoint_key: str = 'endpoint',
          outfile_ext: str = ".tsv") -> None:
    """Run the SPARQL query described in an rq file and save the result.

    The file is parsed with ``parse_query_rq``; the values stored under
    ``query_key`` and ``endpoint_key`` are handed to ``run_query``; the
    result is written with ``result_dict_to_tsv`` to
    ``<output_dir>/<basename of yaml without extension><outfile_ext>``.

    Args:
        yaml: A rq file containing a SPARQL query in grlc format:
        https://github.com/CLARIAH/grlc/blob/master/README.md
        output_dir: Directory to output results of query (created if missing)
        query_key: the key in the parsed file containing the query string
        endpoint_key: the key in the parsed file containing the sparql endpoint URL
        outfile_ext: file extension for output file [.tsv]
    Returns:
        None.

    Raises:
        FileExistsError: if ``output_dir`` exists but is not a directory.

    """
    # Named ``parsed`` rather than ``query`` so the local does not shadow
    # this function's own name.
    parsed = parse_query_rq(yaml)
    result_dict = run_query(query=parsed[query_key], endpoint=parsed[endpoint_key])

    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # fail if another process created the directory between the two calls.
    os.makedirs(output_dir, exist_ok=True)

    stem = os.path.splitext(os.path.basename(yaml))[0]
    outfile = os.path.join(output_dir, stem + outfile_ext)
    result_dict_to_tsv(result_dict, outfile)
    if not os.path.exists(kg_tar):
        wget.download('http://kg-hub.berkeleybop.io/kg-covid-19.tar.gz', data_dir)
    with tarfile.open(kg_tar) as tar:
        tar.extractall(data_dir)

# Download the node and edge tables for each listed source, skipping any file
# that already exists locally.
# NOTE(review): data_dir, intact_path, sars_genes_path and drug_central are
# defined outside this view; the '/'.join calls require them to be strings.
for path in [intact_path, sars_genes_path, drug_central]:
    # Local destination paths and remote URLs share the same relative layout.
    node_file = '/'.join([data_dir, path, 'nodes.tsv'])
    edge_file = '/'.join([data_dir, path, 'edges.tsv'])
    node_url = '/'.join(['http://kg-hub.berkeleybop.io', path, 'nodes.tsv'])
    edge_url = '/'.join(['http://kg-hub.berkeleybop.io', path, 'edges.tsv'])
    # NOTE(review): assumes the directory data_dir/path already exists before
    # downloading into it -- confirm.
    if not os.path.exists(node_file):
        wget.download(node_url, node_file)
    if not os.path.exists(edge_file):
        wget.download(edge_url, edge_file)

# Run the 'TargetCandidates' query, using data_dir for both input and output.
run_query('TargetCandidates', input_dir=data_dir, output_dir=data_dir)

# NOTE(review): this path is hard-coded under 'data/' while the query above
# writes to data_dir -- presumably data_dir == 'data' and this file is the
# query's output; confirm.
target_info = read_csv('data/target_candidates.tsv', sep="\t")

# Load the PNNL target list and rename its 'Uniprot' column to 'protein ID'
# (presumably to match the column name used in target_info -- confirm).
pnnl_data = read_csv('data/pnnl/PNNLTargetList_2020_06_08 - TargetList_2020_06_05.tsv', sep="\t")
pnnl_data.rename(columns={'Uniprot': 'protein ID'}, inplace=True)

# Prefix every protein ID with 'UniProtKB:'.
pnnl_data['protein ID'] = \
    pnnl_data['protein ID'].apply(lambda x: "{}{}".format('UniProtKB:', x))

# Remove the 'confidence score' and 'comments' columns from the query output.
target_info.drop(['confidence score','comments'], axis=1, inplace=True)

# Stack the PNNL rows on top of the query rows (axis=0), keeping the union of
# columns (join='outer') and the original index labels (ignore_index=False).
# NOTE(review): the remaining keyword arguments appear to restate pandas
# defaults -- verify against the installed pandas version.
target_info = pd.concat([pnnl_data, target_info], axis=0, join='outer', ignore_index=False, keys=None,
          levels=None, names=None, verify_integrity=False, copy=True)

# NOTE(review): host_only is not read anywhere in the visible code;
# presumably it is consumed further down the script.
host_only = False