import os
import collections
from copy import deepcopy
from itertools import chain
from math import ceil

from numpy import mean
from pymongo import DESCENDING
from gensim.models import Word2Vec
from sklearn.externals import joblib
import fasttext

# Third-party import paths above are inferred from usage in this file.
# Project-internal dependencies (Segmenter, Tagger, word_filter, dbcon, dbutil,
# dicts, hants, UndirectWeightedGraph, logger_feeder, and the model path globals
# word2vec_model / viptag_model_20171221) are assumed to be provided by the
# surrounding package and are not redefined here.


class NewsFeeder(object):

    def __init__(self):
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()

    def feed(self, record, granularity='default'):
        """Turn a mongo news record into a filtered list of tokens and tags."""
        global logger_feeder
        try:
            contents = self.wfilter(self.seg.cut(record['title'].replace('\n', ' ')))
            original_tags = record.get('original_tags', [])
            if original_tags and isinstance(original_tags, list):
                contents.extend(original_tags)
            body = ' '.join([piece['content'].replace('\n', ' ') for piece in record['contents']])
            if granularity == 'fine':
                contents.extend(self.wfilter(self.seg.cut4search(body)))
            else:
                contents.extend(self.wfilter(self.seg.cut(body)))
            return contents
        except Exception as e:
            logger_feeder.error('Failed to feed, %s, %s' % (record['_id'], e))
            return []
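
# Illustrative sketch (not part of the original module): the shape of a mongo
# news record expected by NewsFeeder.feed. The field names ('_id', 'title',
# 'original_tags', 'contents') come from the accesses in feed() above; the
# concrete values are made up.
def _example_news_feed():
    feeder = NewsFeeder()
    record = {
        '_id': 'example-id',
        'title': u'Example news title',
        'original_tags': [u'tag1', u'tag2'],
        'contents': [{'content': u'First paragraph.'},
                     {'content': u'Second paragraph.'}],
    }
    # coarse segmentation by default; granularity='fine' uses cut4search instead
    return feeder.feed(record), feeder.feed(record, granularity='fine')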
class News(object):
    """Corpus iterator: yields token lists from recent news and verified company descriptions."""

    def __init__(self):
        self.mongo = dbcon.connect_mongo()
        self.db = dbcon.connect_torndb()
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()

    def __iter__(self):
        for news in self.mongo.article.news.find({'processStatus': 1}).sort('_id', DESCENDING).limit(200000):
            try:
                content = []
                content.extend(self.wfilter(self.seg.cut4search(news.get('title', ''))))
                for piece in news.get('contents', []):
                    content.extend(self.wfilter(self.seg.cut(piece.get('content', ''))))
                if len(content) > 10:
                    yield content
            except Exception:
                continue
        for c in self.db.query('select description from company where verify="Y" and modifyTime>"2016-06-01";'):
            try:
                if len(c.description) > 10:
                    yield self.wfilter(self.seg.cut4search(c.description))
            except Exception:
                continue
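
# Illustrative sketch (not part of the original module): News yields token
# lists, so it can serve directly as a gensim Word2Vec training corpus such as
# the model loaded by KeywordExtractor below. The vector size, other training
# parameters, and output path are assumptions, and the pre-4.0 gensim keyword
# names (size=...) are assumed to match this Python 2 era codebase.
def _example_train_word2vec(output_path='word2vec.example.model'):
    corpus = list(News())  # materialize once; News streams mongo and torndb rows
    model = Word2Vec(corpus, size=100, window=5, min_count=5, workers=4)
    model.save(output_path)
    return model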
class Feeder(object):
    """Collects weighted description texts for a company from several sources."""

    def __init__(self):
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()
        self.non_trusted_discount = 0.5
        self.brief_promote = 1.5
        self.trusted_sources = dicts.get_known_company_source()
        self.wfilter = word_filter.get_default_filter()
        self.seg = Segmenter(tag=True)

    def feed(self, cid, mode='default', quanlity='low'):
        feeds = {
            'default': self.__feed_default,
            'with_tag': self.__feed_with_tag
        }.get(mode, self.__feed_default)(cid)
        feeds = list(feeds)
        if quanlity == 'medium':
            # keep only feeds whose weight reaches the (capped) average weight
            ave = min(mean([feed[1] for feed in feeds]), 2)
            return filter(lambda x: x[1] >= ave, feeds)
        if quanlity == 'low':
            return feeds

    def feed_string(self, cid, mode='default'):
        feeds = list(self.feed(cid, mode, 'medium'))
        return ' '.join([feed[0].strip() for feed in feeds])

    def feed_seged(self, cid, feed_mode='default'):
        return self.wfilter(self.seg.cut(self.feed_string(cid, feed_mode)))

    def feed_seged_fine(self, cid, feed_mode='default'):
        return self.wfilter(self.seg.cut4search(self.feed_string(cid, feed_mode)))

    def feed_relevant_string(self, cid):
        pass

    def __feed_with_tag(self, cid):
        for feed in self.__feed_default(cid):
            yield feed
        for source_tag in dbutil.get_source_company_tags(self.db, cid, self.trusted_sources):
            if source_tag and source_tag.strip():
                yield source_tag, 2

    def __feed_default(self, cid):
        cscore = dbutil.get_company_score(self.db, cid, 37010)
        # company info
        info = dbutil.get_company_info(self.db, cid)
        score = 1.5 if cscore > 0.5 else 1
        if info.verify and info.verify == 'Y':
            score += 1
        if info.brief and info.brief.strip():
            yield self.__preprocess(info.brief.strip()), score
        if info.description and info.description.strip():
            yield self.__preprocess(info.description.strip()), score
        # source company
        for info in dbutil.get_source_company_infos(self.db, cid):
            discount = self.non_trusted_discount if info.source not in self.trusted_sources else 1
            if info.brief and info.brief.strip():
                yield self.__preprocess(info.brief.strip()), discount * self.brief_promote
            if info.description and info.description.strip():
                yield self.__preprocess(info.description.strip()), discount
        # iOS
        info = dbutil.get_recommend_artifact(self.db, cid)
        if info and info.description and info.description.strip():
            ascore = 1 if (info.verify and info.verify == 'Y') else 0.5
            yield self.__preprocess(info.description.strip()), ascore

    def __preprocess(self, content):
        # clean and narrow down candidates
        # convert traditional Chinese characters to simplified
        content = hants.translate(unicode(content))
        # lowercase
        content = content.lower()
        return content.strip()
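
# Illustrative sketch (not part of the original module): Feeder.feed yields
# (text, weight) pairs, which the helper methods above join or segment. The
# company id below is a placeholder.
def _example_feed_company(cid=12345):
    feeder = Feeder()
    weighted_texts = feeder.feed(cid, mode='with_tag', quanlity='medium')
    merged = feeder.feed_string(cid, mode='with_tag')
    tokens = feeder.feed_seged(cid, feed_mode='with_tag')
    return weighted_texts, merged, tokens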
class KeywordExtractor(object):
    """Extracts weighted keyword tags for a company using a vip classifier,
    word2vec similarity and weighted TextRank over segmented descriptions."""

    def __init__(self):
        global word2vec_model, viptag_model_20171221
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()
        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(tags=True)
        self.wfilter = word_filter.get_default_filter()
        self.w2v = Word2Vec.load(word2vec_model)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)
        self.yellows = dbutil.get_yellow_tags(self.db)
        self.vip_tags = {t.name: t.id for t in dbutil.get_sectored_tags(self.db, 1)}
        self.hyponym = {vip_name: set([dbutil.get_tag_name(self.db, tid)
                                       for tid in dbutil.get_hyponym_tags(self.db, vip_id)])
                        for vip_name, vip_id in self.vip_tags.iteritems()}
        self.importants = set(t.name.lower() for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_tag_novelties()
        self.thesaurus_ids = self.__load_tag_novelties(tid=True)
        self.tag_types = self.__load_tag_types()
        self.trusted_sources = dicts.get_known_company_source()
        self.replacements = {dbutil.get_tag_name(self.db, r['source']):
                             [dbutil.get_tag_name(self.db, rtid) for rtid in r['replacement']]
                             for r in self.mongo.keywords.replacement.find()}
        self.junk_terms = set(tag.name for tag in dbutil.get_tags_by_type(self.db, typeset=([11001])))
        self.similarity_threshold = 0.4
        self.textrank_window_size = 2
        self.textrank_threshold = 0
        self.source_tag_default_weight = 2
        self.vip_lower = 0.3
        self.important_threshold = 0.2
        self.important_max_count = 5
        print 'model inited'

    def __load_trained_clfs(self):
        model_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'models')
        clfs = {}
        for model_file in os.listdir(model_dir):
            if model_file.endswith('.model'):
                # only files whose stem is a numeric tag id are per-tag classifiers
                tid = model_file.split('.')[0]
                if not tid.isdigit():
                    continue
                clfs[dbutil.get_tag_name(self.db, int(tid))] = joblib.load(os.path.join(model_dir, model_file))
        return clfs

    def __load_tag_novelties(self, tid=False):
        if not tid:
            return {tag.name: (tag.novelty or 1) for tag in dbutil.get_tags_by_type(self.db)}
        else:
            return {tag.id: (tag.novelty or 1) for tag in dbutil.get_tags_by_type(self.db)}

    def __load_tag_types(self):
        return {tag.name: (tag.type or 0) for tag in dbutil.get_tags_by_type(self.db)}

    def __extract_source_tag(self, cid):
        tags = dbutil.get_source_company_tags(self.db, cid, self.trusted_sources)
        if tags:
            return set(chain(*[dbutil.analyze_source_tag(self.db, tname, self.replacements)
                               for tname in tags if tname and tname.strip()]))
        return set([])

    def __extract_vecrank(self, candidates, candidates_important, candidates_vips, topn):
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        # hyponyms of the predicted vip tags get a fixed promotion weight of 2;
        # fall back to an empty set for vip tags without a hyponym entry
        proper_hyponym = dict.fromkeys(
            set(chain(*[self.hyponym.get(dbutil.get_tag_name(self.db, cv), set())
                        for cv in candidates_vips.iterkeys()])), 2)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
            if candidates[i] not in self.w2v:
                continue
            for word, weight in candidates_important.items():
                if word == candidates[i] or word not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidates[i], word)
                if similarity > self.similarity_threshold:
                    weights[(candidates[i], word)] += similarity * weight
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus, proper_hyponym)
        topn = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(), key=lambda x: -x[1])[:topn]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def extract_vip(self, cid):
        desc = ' '.join(self.wfilter(self.seg.cut4search(self.feeder.feed_string(cid, 'with_tag'))))
        if not desc:
            return {}
        classifier_vips = [(int(tag.replace(u'__label__', '')), weight)
                           for (tag, weight) in self.vip_classifier.predict_proba([desc], 2)[0]
                           if weight > self.vip_lower]
        classifier_vips.sort(key=lambda x: -x[1])
        # if there are two candidate vip labels, check whether their probabilities
        # are comparable; otherwise keep only the dominant one
        if len(classifier_vips) == 2 and classifier_vips[0][1] > classifier_vips[1][1] * 2:
            return {classifier_vips[0][0]: classifier_vips[0][1]}
        return dict(classifier_vips)

    def __extract_important(self, contents, candidates):
        # support assignment
        supports = deepcopy(candidates)
        for word in contents:
            if word not in self.w2v:
                continue
            for candidate in candidates.keys():
                if candidate not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidate, word)
                if similarity > self.similarity_threshold:
                    supports[candidate] = supports.get(candidate, 0) + similarity
        # support selection
        results = {}
        candi_size, content_size = len(candidates), len(''.join(candidates))
        for candidate, weight in supports.iteritems():
            if candi_size >= 2 and weight < content_size / 20:
                continue
            results[candidate] = weight * self.thesaurus.get(candidate, 1)
        if len(results) == 0:
            return results
        # normalization
        normalizer = max(results.values())
        for k, v in results.items():
            results[k] = round(v / normalizer, 2)
        # narrow down result size
        if len(results) < 4:
            pass
        else:
            results = dict(filter(lambda x: x[1] > self.important_threshold, results.iteritems()))
            if len(results) > self.important_max_count:
                size = min(10, max(int(ceil(len(results) / 2.0)), self.important_max_count))
                results = dict(sorted(results.iteritems(), key=lambda x: -x[1])[:size])
        return results

    def __extract_textrank(self, candidates, topn=15):
        """ weighted textrank, weights use tags' novelties """
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus)
        index = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(), key=lambda x: -x[1])[:index]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def __prepare_tag_contents(self, cid):
        # prepare contents
        contents = list(self.feeder.feed(cid, quanlity='medium'))
        candidates = []
        for content, _ in contents:
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        source_tags = self.__extract_source_tag(cid)
        candidates_important = {}
        for content, weight in contents:
            for tag in [x[0] for x in self.tagger.tag(content)
                        if x[1] == 'itag' or x[0] in self.importants]:
                candidates_important[tag] = candidates_important.get(tag, 0) + weight
        for tag in source_tags:
            candidates_important[tag] = candidates_important.get(tag, 0) + self.source_tag_default_weight
        return source_tags, candidates, candidates_important

    def __normalize_replacement(self, tags):
        if type(tags) is dict:
            normalized_tags = {}
            for tag, weight in tags.items():
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags[replacement] = weight
                else:
                    normalized_tags[tag] = weight
        else:
            normalized_tags = []
            for tag in tags:
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags.append(replacement)
                else:
                    normalized_tags.append(tag)
        return normalized_tags

    def __normalize(self, d):
        if not d:
            return d
        normalizer = max(d.values()) + 1.0
        for tag, weight in d.items():
            type_promotion = {11011: 1, 11013: 1.5, 11012: 2.5}.get(self.tag_types.get(tag, 0), 0)
            d[tag] = round(weight / normalizer, 2) + type_promotion
        return d

    def merge(self, d1, d2, weight=0):
        # weight is a bonus weight
        for k, v in d2.iteritems():
            d1[k] = d1.get(k, 0) + v + weight
        return d1

    def extract(self, cid, topn=15):
        # prepare contents
        source_tags, candidates, candidates_important = self.__prepare_tag_contents(cid)
        candidates_vips = self.extract_vip(cid)
        # generate results
        results = dict(self.__extract_vecrank(candidates, candidates_important, candidates_vips, topn))
        results = self.merge(results, {dbutil.get_tag_name(self.db, tid): w
                                       for tid, w in candidates_vips.iteritems()})
        # results = self.merge(results, self.__extract_important(candidates, candidates_important), 1)
        # results = self.merge(results, dict(self.__extract_textrank(candidates, topn)))
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        return results

    def extract_from_text(self, text):
        candidates = []
        for content, _ in text.iteritems():
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        candidates_important = {}
        for content, weight in text.iteritems():
            for tag in [x[0] for x in self.tagger.tag(content)
                        if x[1] == 'itag' or x[0] in self.importants]:
                candidates_important[tag] = candidates_important.get(tag, 0) + weight
        desc = ' '.join(self.wfilter(self.seg.cut4search(' '.join(text.keys()))))
        candidates_vips = {int(tag.replace(u'__label__', '')): weight
                           for (tag, weight) in self.vip_classifier.predict_proba([desc], 3)[0]
                           if weight > self.vip_lower}
        results = {}
        results = self.merge(results, self.__extract_important(candidates, candidates_important), 1)
        results = self.merge(results, dict(self.__extract_textrank(candidates, 10)))
        # results = dict(self.__extract_vecrank(candidates, candidates_important, candidates_vips, 10))
        results = self.merge(results, {dbutil.get_tag_name(self.db, tid): w
                                       for tid, w in candidates_vips.iteritems()})
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        deducts = self.__deduct_2nd(results)
        if len(deducts) < 3:
            results = self.merge(results, deducts)
        return results

    def __deduct_2nd(self, tags):
        deduct = []
        tags = [(dbutil.get_tag_id(self.db, t)[0], t) for t in tags.keys()]
        for (tid, tag) in tags:
            if self.tag_types.get(tag, 0) == 11013:
                t1s = dbutil.get_hypernym_tags(self.db, tid, 1)
                for t1 in set(t1s) & set([t[0] for t in tags]):
                    t2s = set(dbutil.get_hyponym_tags(self.db, t1, 2)) & set(dbutil.get_hypernym_tags(self.db, tid, 2))
                    for t2 in t2s:
                        if t2 not in set([t[0] for t in tags]):
                            deduct.append(t2)
        return {dbutil.get_tag_name(self.db, t2): 2.49 for t2 in deduct}
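
# Illustrative sketch (not part of the original module): the two public entry
# points of KeywordExtractor. extract() works from a company id, while
# extract_from_text() expects a {text: weight} mapping, as its iteritems()
# usage above implies. The company id and texts below are placeholders.
def _example_extract_keywords(cid=12345):
    extractor = KeywordExtractor()
    by_company = extractor.extract(cid, topn=15)
    by_text = extractor.extract_from_text({u'Example company description': 1.5,
                                           u'Another piece of text': 1})
    return by_company, by_text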