Example 1
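TEES processor constructor: stores the PMID, loads the REACH-derived grounding map from the resources directory, runs TEES on the input text and parses the output into a networkx graph, extracts phosphorylation, binding, and expression-amount statements, and grounds their agents with GroundingMapper.map_agents.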
    def __init__(self, text, pmid, tees_path, python2_path):
        # Store pmid
        self.pmid = pmid

        # Load grounding information
        path_this = os.path.dirname(os.path.abspath(__file__))
        gm_fname = os.path.join(path_this, '../../resources/',
                                'extracted_reach_grounding_map.csv')
        try:
            gm = load_grounding_map(gm_fname)
        except BaseException:
            raise Exception('Could not load the grounding map from ' +
                            gm_fname)
        mapper = GroundingMapper(gm)

        # Run TEES and parse into networkx graph
        self.G = run_and_parse_tees(text, tees_path, python2_path)

        # Extract statements from the TEES graph
        self.statements = []
        self.statements.extend(self.process_phosphorylation_statements())
        self.statements.extend(self.process_binding_statements())
        self.statements.extend(self.process_increase_expression_amount())
        self.statements.extend(self.process_decrease_expression_amount())

        # Ground statements
        self.statements = mapper.map_agents(self.statements)
Example 2
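Helper that runs the machine on an explicit list of PMIDs: reads config.yaml from the model directory, picks up the belief threshold and Twitter/NDEx credentials, loads an optional grounding map if grounding_map_path is configured, and calls run_machine.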
def run_with_pmids_helper(model_path, pmids):
    default_config_fname = os.path.join(model_path, 'config.yaml')
    config = get_machine_config(default_config_fname)

    belief_threshold = config.get('belief_threshold', 0.95)
    twitter_cred = get_twitter_cred(config)
    ndex_cred = get_ndex_cred(config)

    # Get optional grounding map
    gm_path = config.get('grounding_map_path')
    if gm_path:
        try:
            from indra.preassembler.grounding_mapper import load_grounding_map
            grounding_map = load_grounding_map(gm_path)
        except Exception as e:
            logger.error('Could not load grounding map from %s' % gm_path)
            logger.error(e)
            grounding_map = None
    else:
        grounding_map = None

    run_machine(model_path, {'enumerated': [pmid.strip() for pmid in pmids]},
                belief_threshold,
                ndex_cred=ndex_cred,
                twitter_cred=twitter_cred,
                grounding_map=grounding_map)
Example 3
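A variant of the helper in Example 2; the logic is the same and only the formatting of the final run_machine call differs.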
def run_with_pmids_helper(model_path, pmids):
    default_config_fname = os.path.join(model_path, 'config.yaml')
    config = get_machine_config(default_config_fname)

    belief_threshold = config.get('belief_threshold', 0.95)
    twitter_cred = get_twitter_cred(config)
    ndex_cred = get_ndex_cred(config)

    # Get optional grounding map
    gm_path = config.get('grounding_map_path')
    if gm_path:
        try:
            from indra.preassembler.grounding_mapper import load_grounding_map
            grounding_map = load_grounding_map(gm_path)
        except Exception as e:
            logger.error('Could not load grounding map from %s' % gm_path)
            logger.error(e)
            grounding_map = None
    else:
        grounding_map = None

    run_machine(
        model_path,
        {'enumerated': [pmid.strip() for pmid in pmids]},
        belief_threshold,
        ndex_cred=ndex_cred,
        twitter_cred=twitter_cred,
        grounding_map=grounding_map
    )
Example 4
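Constructor variant that grounds statements parsed from pre-computed TEES output (a1/a2 text and sentence segmentations) instead of running TEES itself.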
    def __init__(self, a1_text, a2_text, sentence_segmentations, pmid):
        # Store pmid
        self.pmid = pmid

        # Load grounding information
        path_this = os.path.dirname(os.path.abspath(__file__))
        gm_fname = os.path.join(path_this, '../../resources/',
                                'extracted_reach_grounding_map.csv')
        try:
            gm = load_grounding_map(gm_fname)
        except BaseException:
            raise Exception('Could not load the grounding map from ' +
                            gm_fname)
        mapper = GroundingMapper(gm)

        # Run TEES and parse into networkx graph
        self.G = parse_output(a1_text, a2_text, sentence_segmentations)

        # Extract statements from the TEES graph
        self.statements = []
        self.statements.extend(self.process_phosphorylation_statements())
        self.statements.extend(self.process_binding_statements())
        self.statements.extend(self.process_increase_expression_amount())
        self.statements.extend(self.process_decrease_expression_amount())

        # Ground statements
        self.statements = mapper.map_agents(self.statements)
Example 5
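The grounding step in isolation: add_grounding loads the bundled grounding map, wraps it in a GroundingMapper, and remaps the agents of the already-extracted statements.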
 def add_grounding(self):
     # Load grounding information
     path_this = os.path.dirname(os.path.abspath(__file__))
     gm_fname = os.path.join(path_this, '../../resources/',
                             'extracted_reach_grounding_map.csv')
     try:
         gm = load_grounding_map(gm_fname)
     except BaseException:
         raise Exception('Could not load the grounding map from ' +
                         gm_fname)
     mapper = GroundingMapper(gm)
     self.statements = mapper.map_agents(self.statements)
Example 6
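End-to-end assembly for one PMCID: grounding mapping with a TRIPS-specific map merged into the default map, grounding-based filtering, preassembly (combining duplicates and related statements), belief scoring, mechanism linking via a MechLinker instance, index card output, and diagnostic dumps.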
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    '''Run assembly on a list of statements, for a given PMCID.'''
    # Folder for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Folder for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add to the default
    # (REACH-oriented) grounding map:
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)

    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)
    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Instantiate the mechanism linker
    ml = MechLinker(related_stmts)
    # Link statements
    linked_stmts = ml.link_statements()
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = ml.statements + [ls.inferred_stmt for ls in linked_stmts]

    # Instantiate a new preassembler
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [
            stmt for stmt in related_stmts if stmt not in background_assertions
        ]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh, protocol=2)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Sort by amount of evidence
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief,
                     reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        if st.belief < belief_cutoff:
            print('SKIP', st.belief, st)

        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue

        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards, so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
            if card_counter > card_lim:
                break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model())
    print('=======================')

    # Print the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
Example 7
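Search-driven machine runner: loads the configuration, collects PMIDs from Gmail and from PubMed searches on the configured search terms, loads an optional grounding map, and hands everything to run_machine.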
def run_with_search_helper(model_path, config, num_days=None):
    logger.info('-------------------------')
    logger.info(time.strftime('%c'))

    if not os.path.isdir(model_path):
        logger.error('%s is not a directory', model_path)
        sys.exit()

    default_config_fname = os.path.join(model_path, 'config.yaml')

    if config:
        config = get_machine_config(config)
    elif os.path.exists(default_config_fname):
        logger.info('Loading default configuration from %s',
                    default_config_fname)
        config = get_machine_config(default_config_fname)
    else:
        logger.error('Configuration file argument missing.')
        sys.exit()

    # Probability cutoff for filtering statements
    default_belief_threshold = 0.95
    belief_threshold = config.get('belief_threshold')
    if belief_threshold is None:
        belief_threshold = default_belief_threshold
        msg = 'Belief threshold argument (belief_threshold) not specified.' + \
              ' Using default belief threshold %.2f' % default_belief_threshold
        logger.info(msg)
    else:
        logger.info('Using belief threshold: %.2f' % belief_threshold)

    twitter_cred = get_twitter_cred(config)
    if twitter_cred:
        logger.info('Using Twitter with given credentials.')
    else:
        logger.info('Not using Twitter due to missing credentials.')

    gmail_cred = get_gmail_cred(config)
    if gmail_cred:
        logger.info('Using Gmail with given credentials.')
    else:
        logger.info('Not using Gmail due to missing credentials.')

    ndex_cred = get_ndex_cred(config)
    if ndex_cred:
        logger.info('Using NDEx with given credentials.')
    else:
        logger.info('Not using NDEx due to missing information.')

    pmids = {}
    # Get email PMIDs
    if gmail_cred:
        logger.info('Getting PMIDs from emails.')
        try:
            email_pmids = get_email_pmids(gmail_cred)
            # Put the email_pmids into the pmids dictionary
            pmids['Gmail'] = email_pmids
            logger.info('Collected %d PMIDs from Gmail', len(email_pmids))
        except Exception:
            logger.exception('Could not get PMIDs from Gmail, continuing.')

    # Get PMIDs for general search_terms and genes
    search_genes = config.get('search_genes')
    search_terms = config.get('search_terms')
    if not search_terms:
        logger.info('No search terms argument (search_terms) specified.')
    else:
        if search_genes is not None:
            search_terms += search_genes
        logger.info('Using search terms: %s' % ', '.join(search_terms))

        if num_days is None:
            num_days = int(config.get('search_terms_num_days', 5))
        logger.info('Searching the last %d days', num_days)

        pmids_term = get_searchterm_pmids(search_terms, num_days=num_days)
        num_pmids = len(set(itt.chain.from_iterable(pmids_term.values())))
        logger.info('Collected %d PMIDs from PubMed search_terms.', num_pmids)
        pmids = _extend_dict(pmids, pmids_term)

    # Get optional grounding map
    gm_path = config.get('grounding_map_path')
    if gm_path:
        try:
            from indra.preassembler.grounding_mapper import load_grounding_map
            grounding_map = load_grounding_map(gm_path)
        except Exception as e:
            logger.error('Could not load grounding map from %s' % gm_path)
            logger.error(e)
            grounding_map = None
    else:
        grounding_map = None

    '''
    # Get PMIDs for search_genes
    # Temporarily removed because Entrez-based article searches
    # are lagging behind and cannot be time-limited
    if not search_genes:
        logger.info('No search genes argument (search_genes) specified.')
    else:
        logger.info('Using search genes: %s' % ', '.join(search_genes))
        pmids_gene = get_searchgenes_pmids(search_genes, num_days=5)
        num_pmids = sum([len(pm) for pm in pmids_gene.values()])
        logger.info('Collected %d PMIDs from PubMed search_genes.' % num_pmids)
        pmids = _extend_dict(pmids, pmids_gene)
    '''
    run_machine(
        model_path,
        pmids,
        belief_threshold,
        search_genes=search_genes,
        ndex_cred=ndex_cred,
        twitter_cred=twitter_cred,
        grounding_map=grounding_map
    )
Example 8
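Variant of the assembly pipeline in Example 6 that calls MechLinker's static infer_* methods instead of instantiating a MechLinker, and pickles the results with the default protocol.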
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    '''Run assembly on a list of statements, for a given PMCID.'''
    # Folder for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Folder for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add to the default
    # (REACH-oriented) grounding map:
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)

    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)
    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Instantiate the mechanism linker
    # Link statements
    linked_stmts = MechLinker.infer_active_forms(related_stmts)
    linked_stmts += MechLinker.infer_modifications(related_stmts)
    linked_stmts += MechLinker.infer_activations(related_stmts)
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = related_stmts + [ls.inferred_stmt for ls in linked_stmts]

    # Instantiate a new preassembler
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [stmt for stmt in related_stmts
                       if stmt not in background_assertions]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Sort by amount of evidence
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief, reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        if st.belief < belief_cutoff:
            print('SKIP', st.belief, st)

        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue

        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards, so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
            if card_counter > card_lim:
                break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model().encode('utf-8'))
    print('=======================')

    # Print the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
Example 9
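Same logic as the search-driven runner in Example 7; only the formatting of the final run_machine call differs.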
def run_with_search_helper(model_path, config, num_days=None):
    logger.info('-------------------------')
    logger.info(time.strftime('%c'))

    if not os.path.isdir(model_path):
        logger.error('%s is not a directory', model_path)
        sys.exit()

    default_config_fname = os.path.join(model_path, 'config.yaml')

    if config:
        config = get_machine_config(config)
    elif os.path.exists(default_config_fname):
        logger.info('Loading default configuration from %s',
                    default_config_fname)
        config = get_machine_config(default_config_fname)
    else:
        logger.error('Configuration file argument missing.')
        sys.exit()

    # Probability cutoff for filtering statements
    default_belief_threshold = 0.95
    belief_threshold = config.get('belief_threshold')
    if belief_threshold is None:
        belief_threshold = default_belief_threshold
        msg = 'Belief threshold argument (belief_threshold) not specified.' + \
              ' Using default belief threshold %.2f' % default_belief_threshold
        logger.info(msg)
    else:
        logger.info('Using belief threshold: %.2f' % belief_threshold)

    twitter_cred = get_twitter_cred(config)
    if twitter_cred:
        logger.info('Using Twitter with given credentials.')
    else:
        logger.info('Not using Twitter due to missing credentials.')

    gmail_cred = get_gmail_cred(config)
    if gmail_cred:
        logger.info('Using Gmail with given credentials.')
    else:
        logger.info('Not using Gmail due to missing credentials.')

    ndex_cred = get_ndex_cred(config)
    if ndex_cred:
        logger.info('Using NDEx with given credentials.')
    else:
        logger.info('Not using NDEx due to missing information.')

    pmids = {}
    # Get email PMIDs
    if gmail_cred:
        logger.info('Getting PMIDs from emails.')
        try:
            email_pmids = get_email_pmids(gmail_cred)
            # Put the email_pmids into the pmids dictionary
            pmids['Gmail'] = email_pmids
            logger.info('Collected %d PMIDs from Gmail', len(email_pmids))
        except Exception:
            logger.exception('Could not get PMIDs from Gmail, continuing.')

    # Get PMIDs for general search_terms and genes
    search_genes = config.get('search_genes')
    search_terms = config.get('search_terms')
    if not search_terms:
        logger.info('No search terms argument (search_terms) specified.')
    else:
        if search_genes is not None:
            search_terms += search_genes
        logger.info('Using search terms: %s' % ', '.join(search_terms))

        if num_days is None:
            num_days = int(config.get('search_terms_num_days', 5))
        logger.info('Searching the last %d days', num_days)

        pmids_term = get_searchterm_pmids(search_terms, num_days=num_days)
        num_pmids = len(set(itt.chain.from_iterable(pmids_term.values())))
        logger.info('Collected %d PMIDs from PubMed search_terms.', num_pmids)
        pmids = _extend_dict(pmids, pmids_term)

    # Get optional grounding map
    gm_path = config.get('grounding_map_path')
    if gm_path:
        try:
            from indra.preassembler.grounding_mapper import load_grounding_map
            grounding_map = load_grounding_map(gm_path)
        except Exception as e:
            logger.error('Could not load grounding map from %s' % gm_path)
            logger.error(e)
            grounding_map = None
    else:
        grounding_map = None
    '''
    # Get PMIDs for search_genes
    # Temporarily removed because Entrez-based article searches
    # are lagging behind and cannot be time-limited
    if not search_genes:
        logger.info('No search genes argument (search_genes) specified.')
    else:
        logger.info('Using search genes: %s' % ', '.join(search_genes))
        pmids_gene = get_searchgenes_pmids(search_genes, num_days=5)
        num_pmids = sum([len(pm) for pm in pmids_gene.values()])
        logger.info('Collected %d PMIDs from PubMed search_genes.' % num_pmids)
        pmids = _extend_dict(pmids, pmids_gene)
    '''
    run_machine(model_path,
                pmids,
                belief_threshold,
                search_genes=search_genes,
                ndex_cred=ndex_cred,
                twitter_cred=twitter_cred,
                grounding_map=grounding_map)
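
Every example above follows the same grounding pattern: load a CSV grounding map with load_grounding_map, wrap it in a GroundingMapper, and pass the extracted statements through map_agents. The sketch below distills that pattern. It assumes the import path shown in Examples 2, 3, 7, and 9 and that GroundingMapper is importable from the same indra.preassembler.grounding_mapper module; the fallback behavior on a failed load mirrors the run_with_* helpers.

import logging

from indra.preassembler.grounding_mapper import GroundingMapper, load_grounding_map

logger = logging.getLogger(__name__)


def ground_statements(statements, gm_path):
    """Remap agent groundings in `statements` using the CSV map at `gm_path`.

    Returns the statements unchanged if the map cannot be loaded, mirroring
    the defensive handling in the helpers above.
    """
    try:
        # load_grounding_map reads a CSV that maps agent text to database IDs
        grounding_map = load_grounding_map(gm_path)
    except Exception as e:
        logger.error('Could not load grounding map from %s', gm_path)
        logger.error(e)
        return statements
    mapper = GroundingMapper(grounding_map)
    # map_agents returns a new list of statements with corrected groundings
    return mapper.map_agents(statements)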