Example #1
def analyze(filename, plot=False):
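    """Compare gene-GO process statements against GO annotations.

    For each GO ID found in the loaded statements, the genes linked to
    it by the statements are compared to the genes annotated with that
    GO ID in GO itself; the resulting map is pickled and optionally
    plotted.
    """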
    # Load the file
    results = load_file(filename)

    # Put together a list of all statements
    all_stmts = [stmt for paper_stmts in results.values()
                      for stmt in paper_stmts]

    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)
    map_stmts = gmap.rename_agents(map_stmts)

    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()

    # Map GO IDs to genes and associated statements
    logger.info('Building map from GO IDs to stmts')
    go_gene_map = {}
    go_name_map = {}
    for stmt in pa.unique_stmts:
        (bp_name, go, gene) = go_gene_pair(stmt)
        if bp_name is None and go is None and gene is None:
            continue
        go_gene_map.setdefault(go, []).append((gene, stmt))
        go_name_map.setdefault(go, set()).add(bp_name)

    # Iterate over all of the GO IDs and compare the annotated genes in GO
    # to the ones from the given statements
    go_stmt_map = {}
    for ix, go_id in enumerate(go_gene_map):
        logger.info('Getting genes for %s (%s) from GO (%d of %d)' %
                    (go_id, ','.join(list(go_name_map[go_id])),
                     ix+1, len(go_gene_map)))
        genes_from_go = get_genes_for_go_id(go_id)
        gene_stmt_list = go_gene_map[go_id]
        in_go = []
        not_in_go = []
        for (gene, stmt) in gene_stmt_list:
            if gene in genes_from_go:
                in_go.append(stmt)
            else:
                not_in_go.append(stmt)
        go_stmt_map[go_id] = {'names': list(go_name_map[go_id]),
                              'in_go': in_go, 'not_in_go': not_in_go}

    with open('go_stmt_map.pkl', 'wb') as f:
        pickle.dump(go_stmt_map, f, protocol=2)

    if plot:
        plot_stmt_counts(go_stmt_map, 'go_stmts.pdf')
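
A minimal usage sketch (the filename is hypothetical); load_file() is assumed to return a dict mapping paper IDs to lists of INDRA Statements:

analyze('reach_stmts.pkl', plot=True)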

Example #2
def assemble_model(requester_name):
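    """Ground, preassemble, and mechanism-link the global stmts list.

    Inferred statements proposed by the MechLinker are announced via
    say() and appended to the model before the layout is updated.
    """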
    global stmts
    # Perform grounding mapping on the statements
    gmapper = gm.GroundingMapper(gm.default_grounding_map)
    stmts = gmapper.map_agents(stmts)
    pa = Preassembler(hierarchies, stmts)
    pa.combine_related()
    stmts = pa.related_stmts
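    # Use the MechLinker to propose inferred statements from linked mechanisms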
    ml = MechLinker(stmts)
    linked_stmts = ml.link_statements()
    if linked_stmts:
        for linked_stmt in linked_stmts:
            if linked_stmt.inferred_stmt:
                question = mechlinker_queries.print_linked_stmt(linked_stmt)
                say(question)
                stmts.append(linked_stmt.inferred_stmt)
    say("%s: Done, updating layout." % requester_name)
    update_layout()
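
A minimal usage sketch, assuming the module-level stmts list is already populated (the requester name is hypothetical):

assemble_model('clare')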

Example #3
def remove_mechanism(text, user_name):
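    """Remove model statements matching a natural-language description.

    The text is read with TRIPS; any model statement that is a
    refinement of one of the extracted statements is dropped, and the
    model is reassembled.
    """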
    global stmts
    tp = trips.process_text(text)
    gmapper = gm.GroundingMapper(gm.default_grounding_map)
    tp_stmts = gmapper.map_agents(tp.statements)
    print "Statements to remove:", tp.statements
    stmts_to_keep = []
    for model_stmt in stmts:
        remove_stmt = False
        for stmt_to_remove in tp_stmts:
            if model_stmt.refinement_of(stmt_to_remove, hierarchies):
                remove_stmt = True
                break
        if not remove_stmt:
            stmts_to_keep.append(model_stmt)

    if stmts_to_keep == stmts:
        say("Sorry, I couldn't find any matching mechanisms to remove.")
    else:
        say("OK, I removed it.")
        stmts = stmts_to_keep
        assemble_model(user_name)
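
A minimal usage sketch (both arguments are hypothetical); the text should describe the mechanism to remove in natural language, since it is read by TRIPS:

remove_mechanism('MAP2K1 phosphorylates MAPK1.', 'clare')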

Example #4
        if args.date_range:
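            # Restrict raw statements to the requested creation-date window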
            min_date_str, max_date_str = args.date_range.split(':')
            if min_date_str:
                min_date = datetime.strptime(min_date_str, '%Y%m%d%H%M%S')
                clauses.add(db.RawStatements.create_date > min_date)
            if max_date_str:
                max_date = datetime.strptime(max_date_str, '%Y%m%d%H%M%S')
                clauses.add(db.RawStatements.create_date < max_date)

        all_stmts, results = load_stmts_from_db(clauses, db)

    report_stmt_counts(results['reach'], plot_prefix='raw_reach')
    report_stmt_counts(results['sparser'], plot_prefix='raw_sparser')
    report_grounding(all_stmts, plot_prefix='raw')
    report_stmt_types(all_stmts, plot_prefix='raw')
    report_stmt_participants(all_stmts)

    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)

    report_grounding(map_stmts, plot_prefix='preassembled')

    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()

    report_evidence_distribution(pa.unique_stmts, plot_prefix='preassembled')

Example #5
def analyze(filename):
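    """Compare INDRA protein complexes against BioGRID interactions.

    Complexes whose agents are all grounded to HGNC are checked
    against BioGRID statements for the same genes, and the unique
    statements are partitioned by evidence source.
    """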
    results = load_file(filename)

    all_stmts = [stmt for paper_stmts in results.values()
                      for stmt in paper_stmts]

    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)
    map_stmts = gmap.rename_agents(map_stmts)

    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()

    # Get complexes
    complexes = [s for s in pa.unique_stmts if isinstance(s, Complex)]
    # Get HGNC grounding
    protein_complexes = [s for s in complexes
                         if all('HGNC' in ag.db_refs
                                for ag in s.agent_list())]

    logger.info('Mapping gene IDs to gene symbols')
    gene_ids = list(set([ag.db_refs['HGNC'] for stmt in protein_complexes
                                            for ag in stmt.members]))
    genes = [hgnc_client.get_hgnc_name(gene_id) for gene_id in gene_ids]

    # Get complexes from BioGrid and combine duplicates
    num_genes_per_query = 50
    bg_complexes = []
    for i in range(0, len(genes), num_genes_per_query):
        genes_chunk = genes[i:i + num_genes_per_query]
        logger.info("Querying biogrid for %s" % str(genes_chunk))
        bg_complexes += bg.get_statements(genes_chunk)

    # Filter out BioGRID statements not involving genes in the gene list
    # (this will make duplicate removal more efficient)
    bg_filt = [stmt for stmt in bg_complexes
               if stmt.members[0].name in genes
               and stmt.members[1].name in genes]
    # Might as well free up some memory
    del bg_complexes

    logger.info("Combining duplicates with biogrid...")
    pa = Preassembler(hierarchies, bg_filt + protein_complexes)
    pa.combine_duplicates()

    indra_only = []
    bg_only = []
    indra_and_bg = []
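    # Partition the unique statements by their evidence sources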
    for stmt in pa.unique_stmts:
        evidence_sources = set(e.source_api for e in stmt.evidence)
        if 'reach' in evidence_sources and 'biogrid' in evidence_sources:
            indra_and_bg.append(stmt)
        elif 'reach' in evidence_sources:
            indra_only.append(stmt)
        elif 'biogrid' in evidence_sources:
            bg_only.append(stmt)

    rows = []
    for stmt in indra_only:
        rows.append([stmt.members[0].name, stmt.members[1].name,
                     str(len(stmt.evidence))])
    write_unicode_csv('unmatched_complexes.tsv', rows, delimiter='\t')

    return {'indra_only': indra_only,
            'bg_only': bg_only,
            'indra_and_bg': indra_and_bg}
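
A minimal usage sketch (the filename is hypothetical):

results = analyze('reach_stmts.pkl')
print('%d complexes found only by INDRA' % len(results['indra_only']))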