Example #1
    def map_mention(self, mention):
        (start, stop), text = mention

        pre = list(ngrams(text[:start], 1))
        ins = list(ngrams(text[start:stop], 1))
        post = list(ngrams(text[stop:], 1))

        return (len(pre), len(pre)+len(ins)), pre+ins+post
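
All of the examples on this page call an ngrams helper from the surrounding project. A minimal stand-in, assuming whitespace tokenization and a (text, max_n, min_n, lowercase) signature inferred from call sites such as ngrams(text, 1), ngrams(anchor, n, n) and ngrams(text, n, lowercase=...) below; the project's real implementation may differ:

    def ngrams(text, max_n, min_n=1, lowercase=False):
        # Hypothetical stand-in for the project's ngram helper: yields
        # space-joined n-grams of sizes min_n..max_n over whitespace tokens.
        tokens = (text.lower() if lowercase else text).split()
        for size in range(min_n, max_n + 1):
            for i in range(len(tokens) - size + 1):
                yield ' '.join(tokens[i:i + size])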
Example #2
    def transform(item, vocab):
        target, source, text, span = item
        vocab = vocab.value

        start, stop = span
        pre = list(ngrams(text[:start], 1))
        ins = list(ngrams(text[start:stop], 1))
        post = list(ngrams(text[stop:], 1))
        indexes = [vocab.get(t, len(vocab) - 1) for t in (pre + ins + post)]

        return target, source, indexes, (len(pre), len(pre) + len(ins))
Example #3
    def transform(item, vocab):
        target, source, text, span = item
        vocab = vocab.value

        start, stop = span
        pre = list(ngrams(text[:start], 1))
        ins = list(ngrams(text[start:stop], 1))
        post = list(ngrams(text[stop:], 1))
        indexes = [vocab.get(t, len(vocab)-1) for t in (pre+ins+post)]

        return target, source, indexes, (len(pre), len(pre)+len(ins))
Example #4
 def build(self, corpus, n=1):
     return (
         corpus.flatMap(lambda d: set(ngrams(d["text"], n)))
         .map(lambda t: (t, 1))
         .reduceByKey(add)
         .filter(lambda (k, v): v > 1)
     )
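
The set(...) around ngrams makes this a document-frequency count: each term is counted at most once per document before the > 1 filter (contrast Example #5, which counts every occurrence). A plain-Python equivalent of the same logic, assuming unigram whitespace tokens:

    from collections import Counter

    docs = [{'text': 'the cat sat'}, {'text': 'the cat ran'}]
    df = Counter()
    for d in docs:
        df.update(set(d['text'].split()))   # set(): one count per document
    print([(t, c) for t, c in df.items() if c > 1])
    # [('the', 2), ('cat', 2)]  (order may vary)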
Example #5
 def build(self, corpus, n=1):
     return (
         corpus.flatMap(lambda d: ngrams(d["text"], n, lowercase=self.lowercase))
         .map(lambda t: (t, 1))
         .reduceByKey(add)
         .filter(lambda (k, v): v > 1)
     )
Example #6
 def iter_span_count_types(anchor, n):
     parts = list(ngrams(anchor, n, n))
     if parts:
         yield parts[0], 'B'
         yield parts[-1], 'E'
         for i in xrange(1, len(parts)-1):
             yield parts[i], 'I'
Example #7
 def iter_span_count_types(anchor, n):
     parts = list(ngrams(anchor, n, n))
     if parts:
         yield parts[0], 'B'
         yield parts[-1], 'E'
         for i in xrange(1, len(parts) - 1):
             yield parts[i], 'I'
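
Under Python 2 (note the xrange), and assuming the whitespace ngrams stand-in sketched at Example #1, the generator labels the first part of an anchor 'B' (begin), the last 'E' (end), and interior parts 'I':

    # ngrams('barack obama jr', 1, 1) -> ['barack', 'obama', 'jr']
    print(list(iter_span_count_types('barack obama jr', 1)))
    # [('barack', 'B'), ('jr', 'E'), ('obama', 'I')]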
Example #8
    def build(self, mentions):
        from gensim.models.word2vec import Word2Vec
        sentences = mentions\
            .filter(lambda (target, source, text, span): target.startswith(self.filter_target))

        sentences = sentences\
            .map(lambda (target, source, text, (s,e)): list(chain(ngrams(text[:s],1), [target], ngrams(text[e:],1))))

        if self.coalesce:
            sentences = sentences.coalesce(self.coalesce)

        sentences = sentences.cache()

        model = Word2Vec(sample=1e-5,
                         size=self.dimensions,
                         workers=self.workers)

        log.info('Preparing corpus...')
        model.corpus_count = sentences.count()

        log.info('Computing vocab statistics...')
        term_counts = sentences\
            .flatMap(lambda tokens: ((t, 1) for t in tokens))\
            .reduceByKey(add)\
            .filter(lambda (t, count): \
                (t.startswith(self.filter_target) and count >= self.min_entity_count) or \
                (count >= self.min_word_count))

        model.raw_vocab = dict(term_counts.collect())
        model.scale_vocab(trim_rule=self.get_trim_rule())
        model.finalize_vocab()

        log.info('Training local word2vec model...')
        model.train(sentences.toLocalIterator())

        log.info('Normalising embeddings...')
        model.init_sims(replace=True)

        total_entities = sum(1 if t.startswith(self.filter_target) else 0
                             for t in model.vocab.iterkeys())
        total_words = len(model.vocab) - total_entities

        vocab_sz = 0
        if not self.exclude_entities:
            log.info('Including %i entity embeddings in exported vocab...',
                     total_entities)
            vocab_sz += total_entities
        if not self.exclude_words:
            log.info('Including %i word embeddings in exported vocab...',
                     total_words)
            vocab_sz += total_words

        log.info('Parallelizing %i learned embeddings...', vocab_sz)
        return mentions\
            .context\
            .parallelize(
                (t, model.syn0[vi.index].tolist())
                for t, vi in model.vocab.iteritems()
                    if (not self.exclude_entities and t.startswith(self.filter_target)) or
                       (not self.exclude_words and not t.startswith(self.filter_target)))
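
This snippet targets gensim's pre-1.0 vocabulary API (scale_vocab, finalize_vocab, syn0) and relies on a get_trim_rule method defined elsewhere in the class. A hypothetical sketch of what that method might return, using gensim's RULE_KEEP/RULE_DISCARD/RULE_DEFAULT constants to mirror the entity/word thresholds above (make_trim_rule and its parameters are illustrative, not from the source):

    from gensim import utils

    def make_trim_rule(filter_target, min_entity_count):
        # Hypothetical: keep entity terms that meet the entity threshold,
        # drop entity terms that don't, and defer plain words to gensim's
        # ordinary min_count handling.
        def rule(word, count, min_count):
            if word.startswith(filter_target):
                return (utils.RULE_KEEP if count >= min_entity_count
                        else utils.RULE_DISCARD)
            return utils.RULE_DEFAULT
        return rule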
Example #9
    def build(self, corpus):
        from gensim.models.word2vec import Word2Vec
        sentences = corpus\
            .flatMap(EntityMentions.iter_mentions)\
            .filter(lambda (target, (span, text)): target.startswith(self.filter_target))

        if self.lowercase:
            sentences = sentences.map(lambda (target, (span, text)): (target, (span, text.lower())))

        sentences = sentences\
            .map(lambda (target, ((s,e), text)): list(chain(ngrams(text[:s],1), [target], ngrams(text[e:],1))))

        if self.coalesce:
            sentences = sentences.coalesce(self.coalesce)

        sentences = sentences.cache()

        model = Word2Vec(sample=1e-5, size=self.dimensions, workers=self.workers)

        log.info('Preparing corpus...')
        model.corpus_count = sentences.count()

        log.info('Computing vocab statistics...')
        term_counts = sentences\
            .flatMap(lambda tokens: ((t, 1) for t in tokens))\
            .reduceByKey(add)\
            .filter(lambda (t, count): \
                (t.startswith(self.filter_target) and count >= self.min_entity_count) or \
                (count >= self.min_word_count))

        model.raw_vocab = dict(term_counts.collect())
        model.scale_vocab(trim_rule=self.get_trim_rule())
        model.finalize_vocab()

        log.info('Training local word2vec model...')
        model.train(sentences.toLocalIterator())

        log.info('Normalising embeddings...')
        model.init_sims(replace=True)

        total_entities = sum(1 if t.startswith(self.filter_target) else 0 for t in model.vocab.iterkeys())
        total_words = len(model.vocab) - total_entities

        vocab_sz = 0
        if not self.exclude_entities:
            log.info('Including %i entity embeddings in exported vocab...', total_entities)
            vocab_sz += total_entities
        if not self.exclude_words:
            log.info('Including %i word embeddings in exported vocab...', total_words)
            vocab_sz += total_words

        log.info('Parallelizing %i learned embeddings...', vocab_sz)
        return corpus\
            .context\
            .parallelize(
                (t, model.syn0[vi.index].tolist())
                for t, vi in model.vocab.iteritems()
                    if (not self.exclude_entities and t.startswith(self.filter_target)) or
                       (not self.exclude_words and not t.startswith(self.filter_target)))
Example #10
    def build(self, docs):
        m = docs.map(lambda d: d['text'])
        if self.lowercase:
            m = m.map(unicode.lower)

        return m\
            .flatMap(lambda text: ngrams(text, self.max_ngram))\
            .map(lambda t: (t, 1))\
            .reduceByKey(add)\
            .filter(lambda (k,v): v > 1)
Example #11
    def build(self, docs):
        m = docs.map(lambda d: d['text'])
        if self.lowercase:
            m = m.map(lambda text: text.lower())

        return m\
            .flatMap(lambda text: set(ngrams(text, self.max_ngram)))\
            .map(lambda t: (t, 1))\
            .reduceByKey(add)\
            .filter(lambda (k,v): v > self.min_df)
Example #12
class IndexMappedMentions(EntityMentions, IndexedMentions):
    """ Entity mention corpus with terms mapped to numeric indexes """
    def build(self, sc, docs, vocab):
        tv = sc.broadcast(
            dict(vocab.map(lambda r: (r['_id'], r['rank'])).collect()))
        return super(IndexMappedMentions, self)\
            .build(docs)\
            .map(lambda m: self.transform(m, tv))

    @staticmethod
    def transform((target, source, text, span), vocab):
        vocab = vocab.value

        start, stop = span
        pre = list(ngrams(text[:start], 1))
        ins = list(ngrams(text[start:stop], 1))
        post = list(ngrams(text[stop:], 1))
        indexes = [vocab.get(t, len(vocab) - 1) for t in (pre + ins + post)]

        return target, source, indexes, (len(pre), len(pre) + len(ins))
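
A quick local check of transform (under Python 2, given the tuple parameter), assuming the whitespace ngrams stand-in from Example #1; FakeBroadcast is an illustrative stand-in for pyspark's broadcast wrapper, of which only .value is used:

    class FakeBroadcast(object):
        def __init__(self, value):
            self.value = value   # mimics pyspark.Broadcast.value

    vocab = FakeBroadcast({'the': 0, 'cat': 1, 'sat': 2})
    mention = ('Cat', 'doc1', 'the cat sat', (4, 7))
    print(IndexMappedMentions.transform(mention, vocab))
    # ('Cat', 'doc1', [0, 1, 2], (1, 2))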
Example #13
    def build(self, corpus):
        max_ngram = self.max_ngram

        m = corpus.map(lambda d: d['text'])
        if self.lowercase:
            m = m.map(unicode.lower)

        return m\
            .flatMap(lambda text: ngrams(text, max_ngram))\
            .map(lambda t: (t, 1))\
            .reduceByKey(add)\
            .filter(lambda (k,v): v > 1)
Example #14
    def build(self, corpus):
        log.info('Building df model: max-ngram=%i, min-df=%i', self.max_ngram, self.min_df)

        m = corpus.map(lambda d: d['text'])

        if self.lowercase:
            m = m.map(lambda text: text.lower())

        return m\
            .flatMap(lambda text: set(ngrams(text, self.max_ngram)))\
            .map(lambda t: (t, 1))\
            .reduceByKey(add)\
            .filter(lambda (k,v): v > self.min_df)
Example #15
    def build(self, mentions, idfs):
        m = mentions\
            .map(lambda (target, (span, text)): (target, text))\
            .mapValues(lambda v: ngrams(v, self.max_ngram))\
            .flatMap(lambda (target, tokens): (((target, t), 1) for t in tokens))\
            .reduceByKey(add)\
            .map(lambda ((target, token), count): (token, (target, count)))\
            .leftOuterJoin(idfs)\
            .filter(lambda (token, ((target, count), idf)): idf is not None)\
            .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count)*idf)))\
            .groupByKey()

        return m.mapValues(self.normalize_counts if self.normalize else list)
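
The per-term weight here is a sublinear term frequency (the square root of the mention count) scaled by the term's idf; for example:

    import math
    count, idf = 9, 2.5               # sample values
    weight = math.sqrt(count) * idf   # 3.0 * 2.5 = 7.5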
Example #16
    def build(self, mentions, idfs):
        m = mentions\
            .map(lambda target_span_text: (target_span_text[0], target_span_text[1][1]))\
            .mapValues(lambda v: ngrams(v, self.max_ngram))\
            .flatMap(lambda target_tokens: (((target_tokens[0], t), 1) for t in target_tokens[1]))\
            .reduceByKey(add)\
            .map(lambda target_token_count: (target_token_count[0][1], (target_token_count[0][0], target_token_count[1])))\
            .leftOuterJoin(idfs)\
            .filter(lambda token_target_count_idf: token_target_count_idf[1][1] is not None)\
            .map(lambda token_target_count_idf: (token_target_count_idf[1][0][0], (token_target_count_idf[0], math.sqrt(token_target_count_idf[1][0][1]) * token_target_count_idf[1][1])))\
            .groupByKey()

        return m.mapValues(self.normalize_counts if self.normalize else list)
Example #17
    def build(self, mentions, idfs):
        m = mentions \
            .map(lambda r: (r[0], r[1][1])) \
            .mapValues(lambda v: ngrams(v, self.max_ngram)) \
            .flatMap(lambda r: (((r[0], t), 1) for t in r[1])) \
            .reduceByKey(add) \
            .map(lambda r: (r[0][1], (r[0][0], r[1]))) \
            .leftOuterJoin(idfs) \
            .filter(lambda r: r[1][1] is not None) \
            .map(lambda r: (r[1][0][0], (r[0], math.sqrt(r[1][0][1]) * r[1][1]))) \
            .groupByKey()
        # .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count) * idf))) \
        # .groupByKey()

        return m.mapValues(self.normalize_counts if self.normalize else list)
Example #18
    def build(self, docs):
        part_counts = docs\
            .flatMap(self.iter_anchors)\
            .flatMap(lambda a: chain.from_iterable(self.iter_span_count_types(a, i) for i in xrange(1, self.max_ngram+1)))\
            .map(lambda p: (p, 1))\
            .reduceByKey(add)\
            .map(lambda ((term, spantype), count): (term, (spantype, count)))

        part_counts += docs\
            .flatMap(lambda d: ngrams(d['text'], self.max_ngram))\
            .map(lambda t: (t, 1))\
            .reduceByKey(add)\
            .filter(lambda (t, c): c > 1)\
            .map(lambda (t, c): (t, ('O', c)))

        return part_counts\
            .groupByKey()\
            .mapValues(dict)\
            .filter(lambda (t, cs): 'O' in cs and len(cs) > 1)
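
The final filter keeps only terms seen both inside anchors (B/I/E counts) and as plain text (an 'O' count), which is the shape a downstream link-probability estimate needs. A hypothetical example of such an estimate over one term's count dict (link_probability is illustrative, not from the source):

    def link_probability(counts):
        # counts as produced above, e.g. {'B': 12, 'I': 3, 'E': 9, 'O': 80}
        anchored = sum(c for t, c in counts.items() if t != 'O')
        return float(anchored) / (anchored + counts['O'])

    print(link_probability({'B': 12, 'I': 3, 'E': 9, 'O': 80}))  # ~0.23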
Example #19
    def build(self, corpus):
        m = corpus.flatMap(EntityMentions.iter_mentions)

        if self.filter_target:
            log.info('Filtering mentions targeting: %s', self.filter_target)
            m = m.filter(lambda (target, _): target.startswith(self.filter_target))

        m = m\
            .map(lambda (target, (span, text)): (target, text))\
            .mapValues(lambda v: ngrams(v, self.max_ngram))\
            .flatMap(lambda (target, tokens): (((target, t), 1) for t in tokens))\
            .reduceByKey(add)\
            .map(lambda ((target, token), count): (token, (target, count)))\
            .leftOuterJoin(self.idf_model)\
            .filter(lambda (token, ((target, count), idf)): idf is not None)\
            .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count)*idf)))\
            .groupByKey()

        return m.mapValues(self.normalize_counts if self.normalize else list)
Example #20
    def build(self, corpus):
        part_counts = corpus\
            .flatMap(self.iter_anchors)\
            .flatMap(lambda a: chain.from_iterable(self.iter_span_count_types(a, i) for i in xrange(1, self.max_ngram+1)))\
            .map(lambda p: (p, 1))\
            .reduceByKey(add)\
            .map(lambda ((term, spantype), count): (term, (spantype, count)))

        part_counts += corpus\
            .flatMap(lambda d: ngrams(d['text'], self.max_ngram))\
            .map(lambda t: (t, 1))\
            .reduceByKey(add)\
            .filter(lambda (t, c): c > 1)\
            .map(lambda (t, c): (t, ('O', c)))

        return part_counts\
            .groupByKey()\
            .mapValues(dict)\
            .filter(lambda (t, cs): 'O' in cs and len(cs) > 1)
Example #21
    def build(self, docs):
        part_counts = docs\
            .flatMap(self.iter_anchors)\
            .flatMap(lambda a: chain.from_iterable(self.iter_span_count_types(a, i) for i in range(1, self.max_ngram+1)))\
            .map(lambda p: (p, 1))\
            .reduceByKey(add)\
            .map(lambda term_spantype_count: (term_spantype_count[0][0], (term_spantype_count[0][1], term_spantype_count[1])))

        part_counts += docs\
            .flatMap(lambda d: ngrams(d['text'], self.max_ngram))\
            .map(lambda t: (t, 1))\
            .reduceByKey(add)\
            .filter(lambda t_c2: t_c2[1] > 1)\
            .map(lambda t_c3: (t_c3[0], ('O', t_c3[1])))

        return part_counts\
            .groupByKey()\
            .mapValues(dict)\
            .filter(lambda t_cs: 'O' in t_cs[1] and len(t_cs[1]) > 1)
Example #22
    def build(self, corpus):
        log.info(
            "Building tf-idf model: ngrams=%i, df-range=(%i, %i), norm=%s",
            self.max_ngram,
            self.min_rank,
            self.max_rank,
            str(self.normalize),
        )
        idfs = TermIdfs(max_ngram=self.max_ngram, min_rank=self.min_rank, max_rank=self.max_rank)

        m = (
            corpus.flatMap(EntityMentions.iter_mentions)
            .mapValues(lambda v: ngrams(v, self.max_ngram))
            .flatMap(lambda (target, tokens): (((target, t), 1) for t in tokens))
            .reduceByKey(add)
            .map(lambda ((target, token), count): (token, (target, count)))
            .leftOuterJoin(idfs)
            .filter(lambda (token, ((target, count), idf)): idf is not None)
            .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count) * idf)))
            .groupByKey()
        )

        return m.mapValues(self.normalize_counts if self.normalize else list)