def map_mention(self, mention):
    (start, stop), text = mention
    pre = list(ngrams(text[:start], 1))
    ins = list(ngrams(text[start:stop], 1))
    post = list(ngrams(text[stop:], 1))
    return (len(pre), len(pre)+len(ins)), pre+ins+post
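# Minimal sketch of what map_mention above produces for a toy mention: the
# character span is re-expressed as a token span over the concatenated
# pre/inside/post unigrams. The whitespace tokenizer below is a hypothetical
# stand-in for the real ngrams helper, and the function drops self for brevity.
def ngrams(text, n):
    return iter(text.split())

def map_mention(mention):
    (start, stop), text = mention
    pre = list(ngrams(text[:start], 1))
    ins = list(ngrams(text[start:stop], 1))
    post = list(ngrams(text[stop:], 1))
    return (len(pre), len(pre) + len(ins)), pre + ins + post

span, tokens = map_mention(((0, 12), "Barack Obama visited Paris"))
print(span)    # (0, 2) - the mention covers the first two tokens
print(tokens)  # ['Barack', 'Obama', 'visited', 'Paris']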
def transform(item, vocab):
    target, source, text, span = item
    vocab = vocab.value
    start, stop = span
    pre = list(ngrams(text[:start], 1))
    ins = list(ngrams(text[start:stop], 1))
    post = list(ngrams(text[stop:], 1))
    indexes = [vocab.get(t, len(vocab) - 1) for t in (pre + ins + post)]
    return target, source, indexes, (len(pre), len(pre) + len(ins))
def transform(item, vocab):
    target, source, text, span = item
    vocab = vocab.value
    start, stop = span
    pre = list(ngrams(text[:start], 1))
    ins = list(ngrams(text[start:stop], 1))
    post = list(ngrams(text[stop:], 1))
    indexes = [vocab.get(t, len(vocab)-1) for t in (pre+ins+post)]
    return target, source, indexes, (len(pre), len(pre)+len(ins))
def build(self, corpus, n=1):
    return (
        corpus.flatMap(lambda d: set(ngrams(d["text"], n)))
        .map(lambda t: (t, 1))
        .reduceByKey(add)
        .filter(lambda (k, v): v > 1)
    )
def build(self, corpus, n=1):
    return (
        corpus.flatMap(lambda d: ngrams(d["text"], n, lowercase=self.lowercase))
        .map(lambda t: (t, 1))
        .reduceByKey(add)
        .filter(lambda (k, v): v > 1)
    )
def iter_span_count_types(anchor, n):
    parts = list(ngrams(anchor, n, n))
    if parts:
        yield parts[0], 'B'
        yield parts[-1], 'E'
        for i in xrange(1, len(parts)-1):
            yield parts[i], 'I'
def iter_span_count_types(anchor, n):
    parts = list(ngrams(anchor, n, n))
    if parts:
        yield parts[0], 'B'
        yield parts[-1], 'E'
        for i in xrange(1, len(parts) - 1):
            yield parts[i], 'I'
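# Sketch of the span-count labelling above. The parts list passed in stands in
# for the output of ngrams(anchor, n, n): the first part is tagged 'B', the
# last 'E', and interior parts 'I'. Note that a single-part anchor is tagged
# both 'B' and 'E'.
def iter_span_count_types_from_parts(parts):
    if parts:
        yield parts[0], 'B'
        yield parts[-1], 'E'
        for i in range(1, len(parts) - 1):
            yield parts[i], 'I'

print(list(iter_span_count_types_from_parts(['new', 'york', 'city'])))
# [('new', 'B'), ('city', 'E'), ('york', 'I')]
print(list(iter_span_count_types_from_parts(['paris'])))
# [('paris', 'B'), ('paris', 'E')]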
def build(self, mentions):
    from gensim.models.word2vec import Word2Vec

    # keep only mentions that link into the target namespace
    sentences = mentions\
        .filter(lambda (target, source, text, span): target.startswith(self.filter_target))

    # build training sentences: left-context unigrams, the link target as a
    # single pseudo-token, then right-context unigrams
    sentences = sentences\
        .map(lambda (target, source, text, (s,e)): list(chain(ngrams(text[:s],1), [target], ngrams(text[e:],1))))

    if self.coalesce:
        sentences = sentences.coalesce(self.coalesce)
    sentences = sentences.cache()

    model = Word2Vec(sample=1e-5, size=self.dimensions, workers=self.workers)

    log.info('Preparing corpus...')
    model.corpus_count = sentences.count()

    log.info('Computing vocab statistics...')
    term_counts = sentences\
        .flatMap(lambda tokens: ((t, 1) for t in tokens))\
        .reduceByKey(add)\
        .filter(lambda (t, count): \
            (t.startswith(self.filter_target) and count >= self.min_entity_count) or \
            (count >= self.min_word_count))
    model.raw_vocab = dict(term_counts.collect())
    model.scale_vocab(trim_rule=self.get_trim_rule())
    model.finalize_vocab()

    log.info('Training local word2vec model...')
    model.train(sentences.toLocalIterator())

    log.info('Normalising embeddings...')
    model.init_sims(replace=True)

    total_entities = sum(1 if t.startswith(self.filter_target) else 0 for t in model.vocab.iterkeys())
    total_words = len(model.vocab) - total_entities

    vocab_sz = 0
    if not self.exclude_entities:
        log.info('Including %i entity embeddings in exported vocab...', total_entities)
        vocab_sz += total_entities
    if not self.exclude_words:
        log.info('Including %i word embeddings in exported vocab...', total_words)
        vocab_sz += total_words

    log.info('Parallelizing %i learned embeddings...', vocab_sz)
    return mentions\
        .context\
        .parallelize(
            (t, model.syn0[vi.index].tolist())
            for t, vi in model.vocab.iteritems()
            if (not self.exclude_entities and t.startswith(self.filter_target))
            or (not self.exclude_words and not t.startswith(self.filter_target)))
def build(self, corpus):
    from gensim.models.word2vec import Word2Vec

    # extract mentions that link into the target namespace
    sentences = corpus\
        .flatMap(EntityMentions.iter_mentions)\
        .filter(lambda (target, (span, text)): target.startswith(self.filter_target))

    if self.lowercase:
        sentences = sentences.map(lambda (target, (span, text)): (target, (span, text.lower())))

    # build training sentences: left-context unigrams, the link target as a
    # single pseudo-token, then right-context unigrams
    sentences = sentences\
        .map(lambda (target, ((s,e), text)): list(chain(ngrams(text[:s],1), [target], ngrams(text[e:],1))))

    if self.coalesce:
        sentences = sentences.coalesce(self.coalesce)
    sentences = sentences.cache()

    model = Word2Vec(sample=1e-5, size=self.dimensions, workers=self.workers)

    log.info('Preparing corpus...')
    model.corpus_count = sentences.count()

    log.info('Computing vocab statistics...')
    term_counts = sentences\
        .flatMap(lambda tokens: ((t, 1) for t in tokens))\
        .reduceByKey(add)\
        .filter(lambda (t, count): \
            (t.startswith(self.filter_target) and count >= self.min_entity_count) or \
            (count >= self.min_word_count))
    model.raw_vocab = dict(term_counts.collect())
    model.scale_vocab(trim_rule=self.get_trim_rule())
    model.finalize_vocab()

    log.info('Training local word2vec model...')
    model.train(sentences.toLocalIterator())

    log.info('Normalising embeddings...')
    model.init_sims(replace=True)

    total_entities = sum(1 if t.startswith(self.filter_target) else 0 for t in model.vocab.iterkeys())
    total_words = len(model.vocab) - total_entities

    vocab_sz = 0
    if not self.exclude_entities:
        log.info('Including %i entity embeddings in exported vocab...', total_entities)
        vocab_sz += total_entities
    if not self.exclude_words:
        log.info('Including %i word embeddings in exported vocab...', total_words)
        vocab_sz += total_words

    log.info('Parallelizing %i learned embeddings...', vocab_sz)
    return corpus\
        .context\
        .parallelize(
            (t, model.syn0[vi.index].tolist())
            for t, vi in model.vocab.iteritems()
            if (not self.exclude_entities and t.startswith(self.filter_target))
            or (not self.exclude_words and not t.startswith(self.filter_target)))
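# Sketch of the sentence construction used for word2vec training above: the
# mention text is replaced by its link target, giving left-context unigrams,
# the entity identifier as a single pseudo-token, then right-context unigrams.
# The tokenizer and the example target below are hypothetical stand-ins.
from itertools import chain

def ngrams(text, n):
    return iter(text.split())

target = 'en.wikipedia.org/wiki/Paris'
text = 'She moved to Paris in 1998'
s, e = 13, 18  # character span of the mention "Paris"
sentence = list(chain(ngrams(text[:s], 1), [target], ngrams(text[e:], 1)))
print(sentence)
# ['She', 'moved', 'to', 'en.wikipedia.org/wiki/Paris', 'in', '1998']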
def build(self, docs):
    m = docs.map(lambda d: d['text'])
    if self.lowercase:
        m = m.map(unicode.lower)
    return m\
        .flatMap(lambda text: ngrams(text, self.max_ngram))\
        .map(lambda t: (t, 1))\
        .reduceByKey(add)\
        .filter(lambda (k,v): v > 1)
def build(self, docs):
    m = docs.map(lambda d: d['text'])
    if self.lowercase:
        m = m.map(lambda text: text.lower())
    return m\
        .flatMap(lambda text: set(ngrams(text, self.max_ngram)))\
        .map(lambda t: (t, 1))\
        .reduceByKey(add)\
        .filter(lambda (k,v): v > self.min_df)
class IndexMappedMentions(EntityMentions, IndexedMentions):
    """ Entity mention corpus with terms mapped to numeric indexes """
    def build(self, sc, docs, vocab):
        tv = sc.broadcast(
            dict(vocab.map(lambda r: (r['_id'], r['rank'])).collect()))
        return super(IndexMappedMentions, self)\
            .build(docs)\
            .map(lambda m: self.transform(m, tv))

    @staticmethod
    def transform((target, source, text, span), vocab):
        vocab = vocab.value
        start, stop = span
        pre = list(ngrams(text[:start], 1))
        ins = list(ngrams(text[start:stop], 1))
        post = list(ngrams(text[stop:], 1))
        indexes = [vocab.get(t, len(vocab) - 1) for t in (pre + ins + post)]
        return target, source, indexes, (len(pre), len(pre) + len(ins))
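# Sketch of IndexMappedMentions.transform with a toy broadcast vocabulary:
# each unigram is mapped to its rank, and unknown terms fall back to the last
# index, len(vocab) - 1. The tokenizer, vocab, and broadcast wrapper below are
# stand-ins for the real ngrams helper and pyspark Broadcast object.
def ngrams(text, n):
    return iter(text.split())

class FakeBroadcast(object):           # mimics the .value attribute of a broadcast
    def __init__(self, value):
        self.value = value

vocab = FakeBroadcast({'the': 0, 'capital': 1, 'of': 2, 'france': 3})
text = 'the capital of france is paris'
start, stop = 25, 30                   # character span of "paris"
pre = list(ngrams(text[:start], 1))
ins = list(ngrams(text[start:stop], 1))
post = list(ngrams(text[stop:], 1))
v = vocab.value
indexes = [v.get(t, len(v) - 1) for t in (pre + ins + post)]
print(indexes)                           # [0, 1, 2, 3, 3, 3] - 'is' and 'paris' fall back to 3
print((len(pre), len(pre) + len(ins)))   # (5, 6) - token span of the mention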
def build(self, corpus):
    max_ngram = self.max_ngram
    m = corpus.map(lambda d: d['text'])
    if self.lowercase:
        m = m.map(unicode.lower)
    return m\
        .flatMap(lambda text: ngrams(text, max_ngram))\
        .map(lambda t: (t, 1))\
        .reduceByKey(add)\
        .filter(lambda (k,v): v > 1)
def build(self, corpus):
    log.info('Building df model: max-ngram=%i, min-df=%i', self.max_ngram, self.min_df)
    m = corpus.map(lambda d: d['text'])
    if self.lowercase:
        m = m.map(lambda text: text.lower())
    return m\
        .flatMap(lambda text: set(ngrams(text, self.max_ngram)))\
        .map(lambda t: (t, 1))\
        .reduceByKey(add)\
        .filter(lambda (k,v): v > self.min_df)
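# Local sketch of the document-frequency computation above: each document
# contributes the *set* of its ngrams once, counts are summed across documents,
# and terms at or below min_df are dropped. The tokenizer and min_df value are
# stand-ins for the real model parameters.
from collections import Counter

def ngrams(text, n):
    return text.split()                # unigrams only, for illustration

docs = ['the cat sat', 'the dog sat', 'a cat']
min_df = 1
df = Counter()
for text in docs:
    df.update(set(ngrams(text, 1)))    # set() -> count documents, not occurrences
print({t: c for t, c in df.items() if c > min_df})
# {'the': 2, 'cat': 2, 'sat': 2}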
def build(self, mentions, idfs):
    m = mentions\
        .map(lambda (target, (span, text)): (target, text))\
        .mapValues(lambda v: ngrams(v, self.max_ngram))\
        .flatMap(lambda (target, tokens): (((target, t), 1) for t in tokens))\
        .reduceByKey(add)\
        .map(lambda ((target, token), count): (token, (target, count)))\
        .leftOuterJoin(idfs)\
        .filter(lambda (token, ((target, count), idf)): idf != None)\
        .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count)*idf)))\
        .groupByKey()
    return m.mapValues(self.normalize_counts if self.normalize else list)
def build(self, mentions, idfs):
    # records after the leftOuterJoin are (token, ((target, count), idf))
    m = mentions\
        .map(lambda target_span_text: (target_span_text[0], target_span_text[1][1]))\
        .mapValues(lambda v: ngrams(v, self.max_ngram))\
        .flatMap(lambda target_tokens: (((target_tokens[0], t), 1) for t in target_tokens[1]))\
        .reduceByKey(add)\
        .map(lambda target_token_count: (target_token_count[0][1], (target_token_count[0][0], target_token_count[1])))\
        .leftOuterJoin(idfs)\
        .filter(lambda token_target_count_idf: token_target_count_idf[1][1] != None)\
        .map(lambda token_target_count_idf: (token_target_count_idf[1][0][0], (token_target_count_idf[0], math.sqrt(token_target_count_idf[1][0][1])*token_target_count_idf[1][1])))\
        .groupByKey()
    return m.mapValues(self.normalize_counts if self.normalize else list)
def build(self, mentions, idfs):
    m = mentions \
        .map(lambda r: (r[0], r[1][1])) \
        .mapValues(lambda v: ngrams(v, self.max_ngram)) \
        .flatMap(lambda r: (((r[0], t), 1) for t in r[1])) \
        .reduceByKey(add) \
        .map(lambda r: (r[0][1], (r[0][0], r[1]))) \
        .leftOuterJoin(idfs) \
        .filter(lambda r: r[1][1] != None) \
        .map(lambda r: (r[1][0][0], (r[0], math.sqrt(r[1][0][1]) * r[1][1]))) \
        .groupByKey()
        # .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count) * idf))) \
        # .groupByKey()
    return m.mapValues(self.normalize_counts if self.normalize else list)
def build(self, docs):
    part_counts = docs\
        .flatMap(self.iter_anchors)\
        .flatMap(lambda a: chain.from_iterable(self.iter_span_count_types(a, i) for i in xrange(1, self.max_ngram+1)))\
        .map(lambda p: (p, 1))\
        .reduceByKey(add)\
        .map(lambda ((term, spantype), count): (term, (spantype, count)))

    part_counts += docs\
        .flatMap(lambda d: ngrams(d['text'], self.max_ngram))\
        .map(lambda t: (t, 1))\
        .reduceByKey(add)\
        .filter(lambda (t, c): c > 1)\
        .map(lambda (t, c): (t, ('O', c)))

    return part_counts\
        .groupByKey()\
        .mapValues(dict)\
        .filter(lambda (t, cs): 'O' in cs and len(cs) > 1)
def build(self, corpus):
    m = corpus.flatMap(EntityMentions.iter_mentions)

    if self.filter_target:
        log.info('Filtering mentions targeting: %s', self.filter_target)
        m = m.filter(lambda (target, _): target.startswith(self.filter_target))

    m = m\
        .map(lambda (target, (span, text)): (target, text))\
        .mapValues(lambda v: ngrams(v, self.max_ngram))\
        .flatMap(lambda (target, tokens): (((target, t), 1) for t in tokens))\
        .reduceByKey(add)\
        .map(lambda ((target, token), count): (token, (target, count)))\
        .leftOuterJoin(self.idf_model)\
        .filter(lambda (token, ((target, count), idf)): idf != None)\
        .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count)*idf)))\
        .groupByKey()
    return m.mapValues(self.normalize_counts if self.normalize else list)
def build(self, corpus):
    part_counts = corpus\
        .flatMap(self.iter_anchors)\
        .flatMap(lambda a: chain.from_iterable(self.iter_span_count_types(a, i) for i in xrange(1, self.max_ngram+1)))\
        .map(lambda p: (p, 1))\
        .reduceByKey(add)\
        .map(lambda ((term, spantype), count): (term, (spantype, count)))

    part_counts += corpus\
        .flatMap(lambda d: ngrams(d['text'], self.max_ngram))\
        .map(lambda t: (t, 1))\
        .reduceByKey(add)\
        .filter(lambda (t, c): c > 1)\
        .map(lambda (t, c): (t, ('O', c)))

    return part_counts\
        .groupByKey()\
        .mapValues(dict)\
        .filter(lambda (t, cs): 'O' in cs and len(cs) > 1)
def build(self, docs):
    part_counts = docs\
        .flatMap(self.iter_anchors)\
        .flatMap(lambda a: chain.from_iterable(self.iter_span_count_types(a, i) for i in range(1, self.max_ngram+1)))\
        .map(lambda p: (p, 1))\
        .reduceByKey(add)\
        .map(lambda term_spantype_count: (term_spantype_count[0][0], (term_spantype_count[0][1], term_spantype_count[1])))

    part_counts += docs\
        .flatMap(lambda d: ngrams(d['text'], self.max_ngram))\
        .map(lambda t: (t, 1))\
        .reduceByKey(add)\
        .filter(lambda t_c2: t_c2[1] > 1)\
        .map(lambda t_c3: (t_c3[0], ('O', t_c3[1])))

    return part_counts\
        .groupByKey()\
        .mapValues(dict)\
        .filter(lambda t_cs: 'O' in t_cs[1] and len(t_cs[1]) > 1)
def build(self, corpus):
    # N is the total document count reported in the log and used by the idf model
    N = corpus.count()
    log.info(
        "Building tf-idf model: N=%i, ngrams=%i, df-range=(%i, %i), norm=%s",
        N, self.max_ngram, self.min_rank, self.max_rank, str(self.normalize),
    )
    # assumes TermIdfs yields (token, idf) pairs suitable for the join below
    idfs = TermIdfs(max_ngram=self.max_ngram, min_rank=self.min_rank, max_rank=self.max_rank)
    m = (
        corpus.flatMap(EntityMentions.iter_mentions)
        .map(lambda (target, (span, text)): (target, text))
        .mapValues(lambda v: ngrams(v, self.max_ngram))
        .flatMap(lambda (target, tokens): (((target, t), 1) for t in tokens))
        .reduceByKey(add)
        .map(lambda ((target, token), count): (token, (target, count)))
        .leftOuterJoin(idfs)
        .filter(lambda (token, ((target, count), idf)): idf != None)
        .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count) * idf)))
        .groupByKey()
    )
    return m.mapValues(self.normalize_counts if self.normalize else list)
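# Local sketch of the per-entity term weighting used in the tf-idf models above:
# raw mention-context counts are damped with a square root and scaled by idf,
# mirroring sqrt(count) * idf in the pipeline. The counts and idf values below
# are made up, and the L2 normalisation is only one plausible reading of
# normalize_counts; the real implementation may differ.
import math

counts = {'football': 9, 'league': 4, 'the': 25}       # term counts for one entity
idfs = {'football': 3.2, 'league': 2.7, 'the': 0.1}    # hypothetical idf scores
weights = [(t, math.sqrt(c) * idfs[t]) for t, c in counts.items()]
norm = math.sqrt(sum(w * w for _, w in weights))
normalized = [(t, w / norm) for t, w in weights]
print(sorted(normalized, key=lambda tw: -tw[1]))
# 'football' dominates despite 'the' having the highest raw count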