Example #1
import os
import json
import tempfile

import boto3

from adeft_app.locations import DATA_PATH
from adeft_app.filenames import escape_filename

# S3_BUCKET and get_s3_models are assumed to come from the surrounding
# adeft_app module

def model_to_s3(model_name):
    model_name = escape_filename(model_name)
    local_models_path = os.path.join(DATA_PATH, 'models', model_name)
    with open(
            os.path.join(local_models_path,
                         f'{model_name}_grounding_dict.json')) as f:
        grounding_dict = json.load(f)
    # map each shortform key covered by this model to the model's name and
    # merge it into the index of models already on S3
    model_map = {key: model_name for key in grounding_dict}
    s3_models = get_s3_models()
    s3_models.update(model_map)

    client = boto3.client('s3')
    # write the updated model index to a temporary file and upload it
    with tempfile.NamedTemporaryFile() as temp:
        with open(temp.name, 'w') as f:
            json.dump(s3_models, f)
        client.upload_file(temp.name, S3_BUCKET, 's3_models.json')

    file_names = [
        f'{model_name}_{end}'
        for end in ('model.gz', 'grounding_dict.json', 'names.json')
    ]

    # upload the serialized model together with its grounding dict and names
    for file_name in file_names:
        client.upload_file(os.path.join(local_models_path, file_name),
                           S3_BUCKET, os.path.join(model_name, file_name))
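
A minimal usage sketch, with a hypothetical model name; escape_filename is
applied inside the function, so the raw name can be passed directly:

model_to_s3('ER')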
Example #2
import os
import json
import argparse

from adeft.discover import DeftMiner

from adeft_app.locations import DATA_PATH
from adeft_app.filenames import escape_filename

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Use adeft to find longforms'
                                     ' associated with given shortforms')
    parser.add_argument('vars', nargs='*')
    args = parser.parse_args()
    shortforms = args.vars
    agg_name = ':'.join(
        sorted([escape_filename(shortform) for shortform in shortforms]))
    texts_path = os.path.join(DATA_PATH, 'texts', agg_name,
                              f'{agg_name}_texts.json')
    with open(texts_path, 'r') as f:
        texts = json.load(f)
    # keep only entries that actually have text content
    texts = [text for text in texts.values() if text]
    for shortform in shortforms:
        dm = DeftMiner(shortform)
        dm.process_texts(texts)
        longforms = dm.get_longforms()
        escaped_shortform = escape_filename(shortform)
        out_path = os.path.join(DATA_PATH, 'longforms',
                                f'{escaped_shortform}_longforms.json')
        with open(out_path, 'w') as f:
            json.dump(longforms, f)
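
The mining step can also be run interactively; a minimal sketch with a
hypothetical shortform and text:

from adeft.discover import DeftMiner

miner = DeftMiner('ER')
miner.process_texts(['... the endoplasmic reticulum (ER) mediates ...'])
# candidate longforms mined from the processed texts
longforms = miner.get_longforms()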
Example #3
import os
import json
import argparse

from indra.literature.adeft_tools import universal_extract_text
from indra_db.util.content_scripts import get_text_content_from_stmt_ids

from adeft_app.locations import DATA_PATH
from adeft_app.filenames import escape_filename

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Get texts for statements'
                                     ' with an agent from a list of'
                                     ' shortforms')
    parser.add_argument('vars', nargs='*')
    args = parser.parse_args()
    shortforms = args.vars
    all_stmts = set()
    cased_shortforms = [
        escape_filename(shortform) for shortform in sorted(shortforms)
    ]
    for shortform in shortforms:
        cased_shortform = escape_filename(shortform)
        path = os.path.join(DATA_PATH, 'statements',
                            f'{cased_shortform}_statements.json')
        with open(path, 'r') as f:
            stmts = json.load(f)
        all_stmts.update(stmts)
    # fetch text content once for the full set of statement ids
    ref_dict, text_dict = get_text_content_from_stmt_ids(all_stmts)
    # extract readable text from each article, keeping content containing
    # the shortforms
    text_dict = {
        text_ref: universal_extract_text(article, contains=shortforms)
        for text_ref, article in text_dict.items()
    }
    agg_name = ':'.join(cased_shortforms)
    dir_path = os.path.join(DATA_PATH, 'texts', agg_name)
Example #4
import os
import json
from collections import Counter

import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# DATA_PATH, escape_filename, check_grounding_dict, adeft_stats and the
# DeftCorpusBuilder, DeftClassifier and DeftRecognizer classes are assumed
# to come from the surrounding adeft and adeft_app modules

def train(shortforms, additional=None, n_jobs=1):
    """Train a deft model and produce quality statistics"""
    if additional is None:
        additional = []
    # gather needed data
    groundings_path = os.path.join(DATA_PATH, 'groundings')
    texts_path = os.path.join(DATA_PATH, 'texts')
    models_path = os.path.join(DATA_PATH, 'models')

    grounding_dict = {}
    names = {}
    pos_labels = set()
    # combine grounding maps and names from multiple shortforms into one model
    for shortform in shortforms:
        cased_shortform = escape_filename(shortform)
        with open(
                os.path.join(groundings_path, cased_shortform,
                             f'{cased_shortform}_grounding_map.json'),
                'r') as f:
            grounding_map = json.load(f)
            grounding_dict[shortform] = grounding_map
        with open(
                os.path.join(groundings_path, cased_shortform,
                             f'{cased_shortform}_names.json'), 'r') as f:
            names.update(json.load(f))
        with open(
                os.path.join(groundings_path, cased_shortform,
                             f'{cased_shortform}_pos_labels.json'), 'r') as f:
            pos_labels.update(json.load(f))

    if not check_grounding_dict(grounding_dict):
        raise RuntimeError('Inconsistent grounding maps for shortforms.')
    pos_labels = sorted(pos_labels)

    cased_shortforms = [
        escape_filename(shortform) for shortform in sorted(shortforms)
    ]

    # model name is built up from shortforms in model
    # (most models only have one shortform)
    agg_name = ':'.join(cased_shortforms)
    with open(os.path.join(texts_path, agg_name, f'{agg_name}_texts.json'),
              'r') as f:
        text_dict = json.load(f)
    with open(os.path.join(texts_path, agg_name, f'{agg_name}_text_map.json'),
              'r') as f:
        ref_dict = json.load(f)

    # get statistics for matches to standard patterns
    stats = adeft_stats(grounding_dict, names, text_dict, ref_dict)

    # build corpus for training models; drop entries with no text
    texts = [text for text in text_dict.values() if text is not None]
    deft_cb = DeftCorpusBuilder(grounding_dict)
    corpus = deft_cb.build_from_texts(texts)

    # gather additional texts
    for grounding, name, agent_text in additional:
        names[grounding] = name
        with open(
                os.path.join(texts_path, agent_text,
                             f'{agent_text}_texts.json'), 'r') as f:
            additional_texts = json.load(f)
            corpus.extend([(text, grounding)
                           for text_ref, text in additional_texts.items()
                           if text_ref not in text_dict])
            pos_labels.append(grounding)

    pos_labels = sorted(set(pos_labels))

    train, labels = zip(*corpus)
    deft_cl = DeftClassifier(shortforms, pos_labels)
    params = {'C': [100.0], 'max_features': [10000], 'ngram_range': [(1, 2)]}
    deft_cl.cv(train, labels, params, n_jobs=n_jobs, cv=5)
    cv = deft_cl.grid_search.cv_results_

    preds = cross_val_predict(deft_cl.estimator,
                              train,
                              labels,
                              n_jobs=n_jobs,
                              cv=5)
    conf_matrix = confusion_matrix(labels, preds)
    cv_results = {
        'labels': sorted(set(labels)),
        'conf_matrix': conf_matrix.tolist(),
        'f1': {
            'mean': cv['mean_test_f1'][0],
            'std': cv['std_test_f1'][0]
        },
        'precision': {
            'mean': cv['mean_test_pr'][0],
            'std': cv['std_test_pr'][0]
        },
        'recall': {
            'mean': cv['mean_test_rc'][0],
            'std': cv['std_test_rc'][0]
        }
    }

    logit = deft_cl.estimator.named_steps['logit']
    coef = logit.coef_
    classes = logit.classes_

    # calculate feature importance
    feature_names = deft_cl.estimator.named_steps['tfidf'].get_feature_names()
    important_terms = {}
    # when there are more than two classes, the logistic regression model
    # has a row of coefficients for each class; with only two classes
    # there is a single row
    if len(classes) > 2:
        for index, label in enumerate(classes):
            fi = pd.DataFrame({
                'name': feature_names,
                'importance': coef[index, :]
            })
            fi.sort_values('importance', ascending=False, inplace=True)
            top = fi.head(20)
            bottom = fi.tail(20)
            important_terms[label] = {
                'top': list(zip(top['name'], top['importance'])),
                'bottom': list(zip(bottom['name'], bottom['importance']))
            }
    else:
        # with binary labels there is a single row of coefficients,
        # corresponding to the label in LogisticRegression.classes_[1]
        fi = pd.DataFrame({'name': feature_names, 'importance': coef[0, :]})
        fi.sort_values('importance', ascending=False, inplace=True)
        top = fi.head(20)
        bottom = fi.tail(20)
        important_terms[classes[1]] = list(zip(top['name'], top['importance']))
        important_terms[classes[0]] = list(
            zip(bottom['name'], bottom['importance']))

    unlabeled = []
    recognizers = [
        DeftRecognizer(shortform, grounding_map)
        for shortform, grounding_map in grounding_dict.items()
    ]
    for text in texts:
        for rec in recognizers:
            if rec.recognize(text):
                break
        else:
            unlabeled.append(text)

    # predict groundings for texts the pattern-based recognizers missed
    preds = deft_cl.estimator.predict(unlabeled)
    preds = dict(Counter(preds))
    data = {
        'stats': stats,
        'cv_results': cv_results,
        'preds_on_unlabeled': preds,
        'important_terms': important_terms
    }
    os.makedirs(os.path.join(models_path, agg_name), exist_ok=True)
    deft_cl.dump_model(
        os.path.join(models_path, agg_name, f'{agg_name}_model.gz'))
    with open(
            os.path.join(models_path, agg_name,
                         f'{agg_name}_grounding_dict.json'), 'w') as f:
        json.dump(grounding_dict, f)
    with open(os.path.join(models_path, agg_name, f'{agg_name}_names.json'),
              'w') as f:
        json.dump(names, f)
    with open(os.path.join(models_path, agg_name, f'{agg_name}_stats.json'),
              'w') as f:
        json.dump(data, f)
    return deft_cl
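
A minimal usage sketch with a hypothetical shortform and grounding; each
entry of additional has the form (grounding, name, agent_text):

deft_cl = train(['ER'], n_jobs=4)
deft_cl = train(['ER'], additional=[('HGNC:3467', 'ESR1', 'ESR1')], n_jobs=4)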
Example #5
import os
import re
import json
import argparse

from indra_db.util.content_scripts import get_stmts_with_agent_text_like

from adeft_app.locations import DATA_PATH
from adeft_app.filenames import escape_filename

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Get statements with agent'
                                     ' text matching a pattern')
    parser.add_argument('pattern')
    parser.add_argument('keep', nargs='?')

    args = parser.parse_args()
    pattern = args.pattern
    keep = args.keep
    if not keep:
        # an empty pattern matches every shortform
        keep = ''
    keep = re.compile(keep)
    stmt_dict = get_stmts_with_agent_text_like(pattern, filter_genes=True)
    for shortform, stmts in stmt_dict.items():
        # skip shortforms that would produce problematic file names
        if (shortform[0] in ['-', '.'] or set(shortform) & set(': ')):
            continue
        cased_shortform = escape_filename(shortform)
        if re.match(keep, shortform):
            with open(
                    os.path.join(DATA_PATH, 'statements',
                                 f'{cased_shortform}_statements.json'),
                    'w') as f:
                json.dump(stmts, f)