Example #1
    def __init__(self, language, window_width=2):
        self.language = language
        self.tagger = TTPosTagger(language)
        self.feature_index = SortedSet()
        self.role_index = SortedSet()
        self.window_width = window_width
        self.features = []
        self.unk_index = self.feature_index.put('UNK')
Example #2
def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence
    """
    global splitter, tagger, parser, all_verbs
    splitter = PunktSentenceSplitter('en')
    tagger = TTPosTagger('en')
    parser = StanfordParser(
        path_to_jar='dev/stanford-corenlp-3.6.0.jar',
        path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
        java_options=' -mx1G -Djava.ext.dirs=dev/'
    )  # no way to make classpath work
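    # collect every verb token from the verbs JSON mapping into a single set;
    # auxiliaries are discarded just below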
    all_verbs = reduce(lambda x, y: x.union(y),
                       imap(set,
                            json.load(verbs).values()), set())
    all_verbs.discard('be')
    all_verbs.discard('have')

    args = load_corpus(corpus, 'bio', text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences
    counter = defaultdict(int)

    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.iteritems():
            counter[k] += v

        if (i + 1) % 10000 == 0:
            logger.info('Processed %d documents', i + 1)

    counter = OrderedDict(sorted(counter.items(), key=lambda (k, v): k))
    for k, v in counter.iteritems():
        print k, v

    json.dump(counter, outfile, indent=2)
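
The loop above just merges the per-document dicts returned by the workers, sorts the totals by key and dumps them as JSON. A minimal, self-contained sketch of that aggregation pattern (toy counts stand in for the worker output; `parallel.map`, the workers and `load_corpus` are not shown here):

import json
from collections import OrderedDict, defaultdict

# toy per-document counts, standing in for what each worker would return
worker_outputs = [{1: 2, 2: 1}, {1: 1, 3: 4}]

counter = defaultdict(int)
for counts in worker_outputs:
    for k, v in counts.items():  # the Python 2 code above uses iteritems()
        counter[k] += v

# sort by key (LUs per sentence) and serialize, as main() does with `outfile`
counter = OrderedDict(sorted(counter.items(), key=lambda kv: kv[0]))
print(json.dumps(counter, indent=2))
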
Example #3
    def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
        """ Initializes the extractor.

            :param language: The language of the sentences that will be used
            :param window_width: How many tokens to look before and after each
             token when building its features.
            :param collapse_fes: Whether to collapse FEs to a single token
             or to keep them split.
            :param target_size: If given, reduce the feature space to this many
             dimensions with truncated SVD.
        """
        self.language = language
        self.tagger = TTPosTagger(language)
        self.window_width = window_width
        self.collapse_fes = collapse_fes
        self.unk_feature = 'UNK'
        self.vectorizer = DictVectorizer()
        self.target_size = target_size
        self.reducer = TruncatedSVD(target_size) if target_size else None
        self.vocabulary = set()
        self.label_index = {}
        self.lu_index = {}
        self.stopwords = set(w.lower() for w in StopWords().words(language))
        self.start()
Example #4
    def __init__(self, corpus, document_key, sentences_key, language,
                 lemma_to_token, match_base_form):
        """ Initializes the extractor.

            :param iterable corpus: The corpus, iterable of `dict`s
            :param str document_key: The key from which to retrieve the textual document
            :param str sentences_key: The key to which the extracted sentences should be stored
            :param str language: The language the text is in
            :param dict lemma_to_token: Mapping from lemma to list of tokens
            :param bool match_base_form: Whether to keep base forms in the
             mapping; when False they are filtered out
        """
        self.corpus = corpus
        self.sentences_key = sentences_key
        self.document_key = document_key
        self.language = language
        self.lemma_to_token = lemma_to_token if match_base_form else self._filter_base_form(
            lemma_to_token)
        self.tokenizer = Tokenizer(self.language)
        self.tagger = TTPosTagger(self.language)
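
Only the constructor is shown in this example; a minimal usage sketch, assuming the enclosing class is a sentence extractor (the name `SentenceExtractor` below is hypothetical, as is the toy corpus):

# hypothetical usage of the constructor above
corpus = [{'bio': u'Johann Sebastian Bach composed the Goldberg Variations.'}]
lemma_to_token = {u'compose': [u'compose', u'composed', u'composes', u'composing']}

extractor = SentenceExtractor(
    corpus=corpus,
    document_key='bio',          # key holding the raw text of each document
    sentences_key='sentences',   # key under which extracted sentences are stored
    language='en',
    lemma_to_token=lemma_to_token,
    match_base_form=True,        # keep base forms in the lemma -> tokens mapping
)
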
Example #5
class FactExtractorFeatureExtractor(BaseFeatureExtractor):
    """ Feature extractor inspired from the fact-extractor
    """

    def __init__(self, language, window_width=2):
        self.language = language
        self.tagger = TTPosTagger(language)
        self.feature_index = SortedSet()
        self.role_index = SortedSet()
        self.window_width = window_width
        self.features = []
        self.unk_index = self.feature_index.put('UNK')

    def sentence_to_tokens(self, sentence, fes):
        """ Transforms a sentence into a list of tokens

            :param unicode sentence: Text of the sentence
            :param dict fes: mapping FE -> chunk
            :return: List of tokens
        """

        tagged = self.tagger.tag_one(sentence, skip_unknown=False)

        # find entities and group them into single tokens
        for fe, chunk in fes.iteritems():
            if chunk is None:
                continue

            fe_tokens = self.tagger.tokenize(chunk)
            if not fe_tokens:
                continue

            # look for fe_tokens within tagged
            found = False
            i = j = 0
            while i < len(tagged):
                if fe_tokens[j].lower() == tagged[i][0].lower():
                    j += 1
                    if j == len(fe_tokens):
                        found = True
                        break
                else:
                    j = 0
                i += 1

            if found:
                position = i - len(fe_tokens) + 1
                pos = 'ENT' if len(fe_tokens) > 1 else tagged[position][1]
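                # replace the matched span with a single merged token carrying the FE label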
                tagged = tagged[:position] + [[chunk, pos, chunk, fe]] + tagged[position + len(fe_tokens):]
            else:
                logger.debug('chunk "%s" of fe "%s" not found in sentence "%s". Overlapping chunks?',
                             chunk, fe, sentence)

        return tagged

    def feature_for(self, term, type_, position, add_unknown):
        """ Returns the feature for the given token, i.e. the column of the feature in a sparse matrix

            :param str term: Actual term
            :param str type_: Type of the term, for example token, pos or lemma
            :param int position: Relative position (used for context windows)
            :param bool add_unknown: Whether to add previously unseen terms to the dictionary
             or use the UNK token instead
            :return: Column of the corresponding feature
        """
        feat = '%s_%s_%+d' % (term.lower(), type_.lower(), position)
        if add_unknown:
            index = self.feature_index.put(feat)
        else:
            index = self.feature_index.index(feat)
            if index == -1:
                index = self.unk_index
        return index

    def token_to_features(self, tokens, position, add_unknown, gazetteer):
        """ Extracts the features for the token in the given position

            :param list tokens: POS-tagged tokens of the sentence
            :param int position: position of the token for which features are requested
            :param bool add_unknown: Whether to add previously unseen features to the index
             or use the UNK feature instead
            :param dict gazetteer: mapping chunk -> additional features
            :return: sparse set of features (i.e. numbers are indexes in a row of a sparse matrix)
        """
        features = set()

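        # look at the tokens within window_width positions on either side of `position`,
        # clipped to the sentence boundaries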
        for i in xrange(max(position - self.window_width, 0), min(position + self.window_width + 1, len(tokens))):
            rel = i - position
            features.add(self.feature_for(tokens[i][0], 'TERM', rel, add_unknown))
            features.add(self.feature_for(tokens[i][1], 'POS', rel, add_unknown))
            features.add(self.feature_for(tokens[i][2], 'LEMMA', rel, add_unknown))
            for feat in gazetteer.get(tokens[i][0], []):
                features.add(self.feature_for(feat, 'GAZ', rel, add_unknown))

        return features

    def extract_features(self, sentence, fes, add_unknown, gazetteer):
        """ Extracts the features for each token of the sentence

            :param unicode sentence: Text of the sentence
            :param dict fes: mapping FE -> chunk
            :param bool add_unknown: Whether to add previously unseen features to the index
             or use the UNK feature instead
            :param dict gazetteer: mapping chunk -> additional features
            :return: List of features, each one as a sparse row
             (i.e. with the indexes of the relevant columns)
        """
        tagged = self.sentence_to_tokens(sentence, fes)
        features = []

        for i in xrange(len(tagged)):
            feat = self.token_to_features(tagged, i, add_unknown, gazetteer)
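            # tokens merged with an FE carry a fourth element (the FE name);
            # plain tagger tokens have three elements and get the 'O' label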
            label = 'O' if len(tagged[i]) == 3 else tagged[i][3]
            features.append((feat, self.role_index.put(label)))

        return tagged, features

    def process_sentence(self, sentence, fes, add_unknown, gazetteer):
        tagged, features = self.extract_features(sentence, fes, add_unknown, gazetteer)
        self.features.extend(features)
        return tagged

    def start(self):
        self.features = []

    def get_features(self):
        x, y = [], []
        data, indices, indptr = [], [], []

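        # build the matrix in CSR form: `indptr` marks where each sample's features
        # start in `data`/`indices`; every feature index becomes a column set to 1.0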
        for sample, label in self.features:
            y.append(label)

            indptr.append(len(data))
            for feature in sample:
                indices.append(int(feature))
                data.append(1.0)

        indptr.append(len(data))
        x = csr_matrix((data, indices, indptr),
                       shape=(len(indptr) - 1, len(self.feature_index.items)),
                       dtype=np.float32)
        y = np.array(y)

        return x, y

    def __getstate__(self):
        return (self.language, self.unk_index, self.window_width, self.role_index.items,
                self.feature_index.items, self.features)

    def __setstate__(self, (language, unk_index, window_width, role_index, feature_index, features)):
        self.__init__(language, window_width)
        self.feature_index.items = feature_index
        self.role_index.items = role_index
        self.features = features
        self.unk_index = unk_index
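
A minimal sketch of how FactExtractorFeatureExtractor is driven, based only on the methods above (the sentence, FEs and gazetteer are made up, and TTPosTagger needs working TreeTagger models for the language):

extractor = FactExtractorFeatureExtractor('en', window_width=2)
extractor.start()  # clears any previously accumulated samples

sentence = u'Bach composed the Goldberg Variations in 1741.'
fes = {u'Composer': u'Bach', u'Work': u'the Goldberg Variations'}
gazetteer = {}  # optional mapping chunk -> additional features

# accumulate samples; add_unknown=True grows the feature index as new features appear
extractor.process_sentence(sentence, fes, add_unknown=True, gazetteer=gazetteer)

x, y = extractor.get_features()  # sparse csr_matrix of features, array of role indexes

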
class BagOfTermsFeatureExtractor(object):
    """ Extracts features from sentences. Will process sentences one by one
        accumulating their features and finalizes them into the final
        training set.

        It should be used to extract features prior to classification,
        in which case the fe arguments can be used to group tokens of
        the same entity into a single chunk while ignoring the actual
        frame element name, e.g. `fes = dict(enumerate(entities))`
    """

    def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
        """ Initializes the extractor.

            :param language: The language of the sentences that will be used
            :param window_width: How many tokens to look before and after each
             token when building its features.
            :param collapse_fes: Whether to collapse FEs to a single token
             or to keep them split.
            :param target_size: If given, reduce the feature space to this many
             dimensions with truncated SVD.
        """
        self.language = language
        self.tagger = TTPosTagger(language)
        self.window_width = window_width
        self.collapse_fes = collapse_fes
        self.unk_feature = 'UNK'
        self.vectorizer = DictVectorizer()
        self.target_size = target_size
        self.reducer = TruncatedSVD(target_size) if target_size else None
        self.vocabulary = set()
        self.label_index = {}
        self.lu_index = {}
        self.stopwords = set(w.lower() for w in StopWords().words(language))
        self.start()

    def start(self):
        """ Clears the samples accumulated so far and starts over.
        """
        self.samples = []

    def lu_column(self):
        return self.vectorizer.vocabulary_['lu'] if not self.target_size else None

    def process_sentence(self, sentence, lu, fes, add_unknown, gazetteer):
        """ Extracts and accumulates features for the given sentence

            :param unicode sentence: Text of the sentence
            :param unicode lu: lexical unit of the sentence
            :param dict fes: Dictionary with FEs and corresponding chunks
            :param bool add_unknown: Whether unknown tokens should be added
             to the index or treated as a special, unknown token.
             Set to True when building the training set and to False
             when building the features used to classify new sentences
            :param dict gazetteer: Additional features to add when a given
             chunk is found in the sentence. Keys should be chunks and
             values should be list of features
            :return: List of tuples whose first elements are chunks of words
             and the second ones indicate whether the chunk was used as a
             sample or skipped altogether
            :rtype: list of tuples (chunk, is_sample)
        """

        gazetteer = gazetteer or {}
        tagged = self.sentence_to_tokens(sentence, fes)

        ret = []
        for position in xrange(len(tagged)):
            if tagged[position][0].lower() in self.stopwords or tagged[position][2] != 'ENT':
                ret.append((tagged[position][0], False))
                continue
            else:
                ret.append((tagged[position][0], True))

            # add the unknown feature to every sample to trick the dict vectorizer into
            # thinking that there is a feature like that. will be useful when add_unknown
            # is false, because by default the dict vectorizer skips unseen labels
            self.lu_index[lu] = self.lu_index.get(lu, len(self.lu_index))
            sample = {'unk': self.unk_feature, 'lu': self.lu_index[lu]}

            for i in xrange(max(position - self.window_width, 0),
                            min(position + self.window_width + 1, len(tagged))):

                rel = i - position
                self.add_feature_to(sample, 'TERM%+d' % rel, tagged[i][0], add_unknown)
                self.add_feature_to(sample, 'POS%+d' % rel, tagged[i][1], add_unknown)
                self.add_feature_to(sample, 'LEMMA%+d' % rel, tagged[i][2], add_unknown)

                for feat in gazetteer.get(tagged[i][0], []):
                    sample['GAZ%+d' % rel] = feat

            # the label belongs to the token at `position`, not to the last token of the window
            label = 'O' if len(tagged[position]) == 3 else tagged[position][3]
            self.label_index[label] = self.label_index.get(label, len(self.label_index))
            self.samples.append((sample, label))

        return ret

    def add_feature_to(self, sample, feature_name, feature_value, add_unknown):
        if add_unknown or feature_value in self.vocabulary:
            sample[feature_name] = feature_value
            self.vocabulary.add(feature_value)
        else:
            sample[feature_name] = self.unk_feature

    def get_features(self, refit):
        """ Returns the final features matrix

            :param bool refit: whether to refit the features or use the previous model.
             use refit=True when training and refit=False when retrieving features
             for classifying unknown samples
            :return: A matrix whose rows are samples and columns are features and a
             row vector with the sample label (i.e. the correct answer for the classifier)
            :rtype: tuple
        """
        samples, labels = zip(*self.samples)

        if refit:
            features = self.vectorizer.fit_transform(samples)
            if self.target_size:
                features = self.reducer.fit_transform(features)
        else:
            features = self.vectorizer.transform(samples)
            if self.target_size:
                features = self.reducer.transform(features)

        labels = np.array([self.label_index[label] for label in labels])

        return features, labels

    def sentence_to_tokens(self, sentence, fes):
        """ Transforms a sentence into a list of tokens. Appends the FE type
            to all tokens composing a certain FE and optionally groups them into
            a single token.

            :param unicode sentence: Text of the sentence
            :param dict fes: mapping FE -> chunk
            :return: List of tokens
        """

        if not sentence.strip():
            return []

        tagged = self.tagger.tag_one(sentence, skip_unknown=False)
        for fe, chunk in fes.iteritems():
            if chunk is None:
                continue

            fe_tokens = self.tagger.tokenize(chunk)
            if not fe_tokens:
                continue

            # look for fe_tokens within tagged
            found = False
            i = j = 0
            while i < len(tagged):
                if len(tagged[i]) == 3 and fe_tokens[j].lower() == tagged[i][0].lower():
                    j += 1
                    if j == len(fe_tokens):
                        found = True
                        break
                else:
                    j = 0
                i += 1

            if found:
                position = i - len(fe_tokens) + 1
                pos = 'ENT' if len(fe_tokens) > 1 else tagged[position][1]

                if self.collapse_fes:
                    # make a single token with the whole chunk
                    tagged = tagged[:position] + [[chunk, pos, 'ENT', fe]] + tagged[position + len(fe_tokens):]
                else:
                    # set custom lemma and label for the tokens of the FE
                    for i in xrange(position, position + len(fe_tokens)):
                        token, pos, _ = tagged[i]
                        tagged[i] = (token, pos, 'ENT', fe)
            else:
                logger.debug('chunk "%s" of fe "%s" not found in sentence "%s". Overlapping chunks?',
                             chunk, fe, sentence)

        return tagged

    def __getstate__(self):
        return (self.language, self.unk_feature, self.window_width, self.samples,
                self.vocabulary, self.label_index, self.vectorizer, self.collapse_fes,
                self.reducer, self.target_size)

    def __setstate__(self, (language, unk_feature, window_width, samples, vocabulary,
                     label_index, vectorizer, collapse_fes, reducer, target_size)):
        self.__init__(language, window_width, collapse_fes, target_size)
        self.samples = samples
        self.vocabulary = vocabulary
        self.unk_feature = unk_feature
        self.label_index = label_index
        self.vectorizer = vectorizer
        self.reducer = reducer
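
A corresponding sketch for BagOfTermsFeatureExtractor, again with made-up inputs; note that only tokens belonging to an FE (and not in the stopword list) produce samples, so the `fes` chunks must actually occur in the sentence:

extractor = BagOfTermsFeatureExtractor('en', window_width=2, collapse_fes=True)

sentence = u'Bach composed the Goldberg Variations in 1741.'
fes = {u'Composer': u'Bach', u'Work': u'the Goldberg Variations'}

# accumulate samples for the lexical unit 'compose'
extractor.process_sentence(sentence, u'compose', fes, add_unknown=True, gazetteer=None)

# refit=True fits the DictVectorizer (and the TruncatedSVD reducer when target_size is set)
features, labels = extractor.get_features(refit=True)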