Example #1
def concept_features_for_sentence(sentence, chunk_inds):
    """
    concept_features_for_sentence()

    @param  sentence.   A sentence in list of chunk format
    @param  chunk_inds. A list of indices for non-None-labeled chunks
    @return             A list of feature dictionaries
    """

    global dependency_parser

    # Get a feature set for each word in the sentence
    features_list = []
    for ind in chunk_inds:
        features_list.append(
            feat_word.concept_features_for_chunk(sentence, ind))

    dependencies = None

    if dependency_parser is not None:
        dependencies = dependency_parser.get_collapsed_dependencies(sentence)

    # Allow for particular features to be enabled
    for feature in enabled_concept_features:

        # Features: UMLS features
        if (feature == "UMLS") and enabled['UMLS']:
            umls_features = feat_umls.concept_features_for_chunks(
                sentence, chunk_inds)
            for i in range(len(chunk_inds)):
                features_list[i].update(umls_features[i])

        if (feature == "grammar_features" and enabled["PY4J"]):
            print "getting grammar features"
            for i, target_index in enumerate(chunk_inds):
                if dependencies is not None:
                    features_list[i].update(
                        dependency_parser.get_related_tokens(
                            target_index, sentence, dependencies))

    if enabled_modules()["WORD2VEC"]:
        print "getting vectors..."
        for i, chunk_index in enumerate(chunk_inds):

            chunk = sentence[chunk_index]
            cluster = predict_sequence_cluster(chunk)

            features_list[i].update({("cluster", cluster): 1})

    return features_list
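
The feature dictionaries built above are plain Python dicts, one per selected chunk, that successive feature sources merge into with update(). Below is a minimal self-contained sketch of that accumulation pattern; the chunk texts, feature names, and cluster id are made up for illustration and do not come from the real feature modules.

# Illustrative only: per-chunk feature dicts aligned with chunk_inds,
# each extended in place by successive feature sources via update().
chunk_inds = [3, 5]
features_list = [{('word', 'chest pain'): 1}, {('word', 'dyspnea'): 1}]

# e.g. UMLS features computed per chunk, in the same order as chunk_inds
umls_features = [{('umls_semtype', 'sosy'): 1}, {('umls_semtype', 'sosy'): 1}]
for i in range(len(chunk_inds)):
    features_list[i].update(umls_features[i])

# e.g. a word2vec cluster id added as a single indicator feature
features_list[0].update({('cluster', 42): 1})

print features_list[0]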
Example #2
import cPickle as pickle
import sys
import os

import atexit

features_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if features_dir not in sys.path:
    sys.path.append(features_dir)

# find where umls tables are located
from read_config import enabled_modules
enabled = enabled_modules()
umls_tables = enabled['UMLS']

from utilities import load_pickled_obj


class UmlsCache:

    # static class variables
    filename = None
    cache = None

    def __init__(self):

        try:
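
Given the imports above (cPickle, atexit, load_pickled_obj), UmlsCache appears to implement a load-on-construction, write-back-at-exit pickle cache. Here is a minimal self-contained sketch of that pattern; the class name, method names, and cache path are invented for illustration and are not the actual UmlsCache API.

import os
import atexit
import cPickle as pickle


class SimplePickleCache:
    """Illustrative stand-in: load a pickled dict if present, save it at exit."""

    def __init__(self, filename):
        self.filename = filename
        try:
            with open(filename, 'rb') as f:
                self.cache = pickle.load(f)
        except IOError:
            # no cache on disk yet; start empty
            self.cache = {}
        # write the (possibly updated) cache back to disk when the process exits
        atexit.register(self.save)

    def has_key(self, key):
        return key in self.cache

    def add_map(self, key, value):
        self.cache[key] = value

    def get_map(self, key):
        return self.cache[key]

    def save(self):
        with open(self.filename, 'wb') as f:
            pickle.dump(self.cache, f)


# usage (the path is illustrative)
cache = SimplePickleCache(os.path.join('/tmp', 'umls_cache_demo.pkl'))
if not cache.has_key('fever'):
    cache.add_map('fever', ['sign or symptom'])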
Example #3
__date__   = 'Jan. 27, 2014'


# What modules are available
from utilities import load_pos_tagger
from read_config import enabled_modules
import word_features as feat_word



################################################
# Build a few expensive one-time objects


# what to build requires knowing what tools are enabled
enabled = enabled_modules()


# Import feature modules
feat_genia = None
if enabled['GENIA']:
    from genia_dir.genia_features import GeniaFeatures


# Only create UMLS cache if module is available
if enabled['UMLS']:
    from umls_dir import interface_umls
    from umls_dir import interpret_umls
    import umls_dir.umls_features as feat_umls
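
The module-level setup above only imports the GENIA and UMLS helpers when the corresponding tools are enabled in the configuration. Below is a small self-contained sketch of that config-gated import pattern, using a stand-in enabled_modules() whose flags are hard-coded (and all disabled) purely for illustration.

# Stand-in for read_config.enabled_modules(); in the real code the flags
# come from a configuration file rather than a hard-coded dict.
def enabled_modules():
    return {'GENIA': None, 'UMLS': None, 'WORD2VEC': None}

enabled = enabled_modules()

# Optional, expensive dependencies are only imported when configured,
# so a missing tool never breaks the rest of the feature pipeline.
feat_genia = None
if enabled['GENIA']:
    from genia_dir.genia_features import GeniaFeatures

feat_umls = None
if enabled['UMLS']:
    import umls_dir.umls_features as feat_umls

print 'enabled modules:', [name for name, flag in enabled.items() if flag]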

Example #4
                word = line[0]
                vector = map(float, line[1:-1])

                assert len(vector) == vector_size

                word_vecs[word] = vector

    print '\n\tword2vec embeddings complete'
    return word_vecs



if embeddings is None:

    # Load word vectors
    vectors_bin = read_config.enabled_modules()["WORD2VEC"]

    pretrained = load_bin(vectors_bin, bin_mode=True)

    # handle OOV words with a tiny (but nonzero) default vector, so the
    # cosine_similarity below never divides by a zero norm
    embeddings = defaultdict(lambda:np.array([.0000000000000000000000000001]*len(pretrained.values()[0])))
    embeddings.update(pretrained)

    print "\tsuccessfully loaded word2vec embeddings\n"


def cosine_similarity(x, y):
    return (np.inner(x,y) / (np.linalg.norm(x) * np.linalg.norm(y)))
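
A quick self-contained check of cosine_similarity using numpy directly; the function is repeated here so the snippet runs on its own, and the vectors are arbitrary. The near-zero OOV default above exists precisely so this denominator is never a zero norm.

import numpy as np

def cosine_similarity(x, y):
    return np.inner(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])   # same direction as a
c = np.array([-3.0, 0.0, 1.0])  # orthogonal to a

print cosine_similarity(a, b)   # 1.0 (parallel vectors)
print cosine_similarity(a, c)   # 0.0 (orthogonal vectors)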

def get_word_from_vec(vector):