def feed_doc_s(sid):
    mongo = dbcon.connect_mongo()
    tagger = Tagger(tags=True)
    wfilter = get_default_filter()
    for record in mongo.article.news.find({'sectors': sid}):
        yield chain(*[wfilter([w[0] for w in tagger.tag(piece['content'].strip())
                               if w[1] in ('tag', 'itag')])
                      for piece in record['contents'] if piece['content'].strip()])
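# Hedged usage sketch (added for illustration, not part of the original module):
# feed_doc_s yields one lazy chain of filtered tag tokens per article in a sector,
# which matches the "list of token lists" shape that gensim's Word2Vec expects.
# The helper name and the training parameters below are assumptions, and the
# size/window/min_count keyword names assume the pre-4.0 gensim API this repo uses.
def _train_sector_word2vec(sid, size=100, window=5, min_count=5):
    from gensim.models import Word2Vec
    sentences = [list(doc) for doc in feed_doc_s(sid)]
    return Word2Vec(sentences, size=size, window=window, min_count=min_count)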
class NewsFeatures(object):

    def __init__(self):
        self.db = dbcon.connect_torndb()
        self.tagger = Tagger()
        self.stopwords = stopword.get_standard_stopwords()
        self.source = json.load(codecs.open(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                                         'dumps/news.domain'), encoding='utf-8'))
        self.failed = 0

    def featurize(self, cid, **kwargs):
        features = {}
        instance = dict(**kwargs)
        if instance.get('name') and instance.get('name_update', True):
            self.tagger.add_word(instance.get('name'), tag='cn')
        name, title, content, link = instance.get('name'), instance.get('title'), \
            instance.get('content'), instance.get('link')
        if title and title.strip():
            title = self.tagger.tag(title)
            matches = [x[0] for x in title if x[1] == 'cn']
            features['title_ne'] = matches.count(name)
        if content and content.strip():
            content = list(self.tagger.tag(content))
            # content ne
            matches = [x[0] for x in content if x[1] == 'cn']
            features['content_ne'] = round(float(matches.count(name))/len(content), 4)
            # features['content_length'] = len(content)
            # content similarity
            try:
                odesc = Counter([item[0] for item in
                                 self.tagger.tag(dbutil.get_company_solid_description(self.db, cid))
                                 if item[0].strip() and item[0] not in self.stopwords
                                 and len(item[0]) > 1 and not item[0].isnumeric()])
                idesc = Counter([item[0] for item in content
                                 if item[0].strip() and item[0] not in self.stopwords
                                 and len(item[0]) > 1 and not item[0].isnumeric()])
                length = max(min(50, len(idesc), len(odesc)), 20)
                odesc, idesc = odesc.most_common(length), idesc.most_common(length)
                odesc.extend([(x[1], x[2]*5) for x in dbutil.get_company_tags_idname(self.db, cid)])
                simi = weighted_jaccard(odesc, idesc)
                # if simi > 0.05:
                #     print simi, cid, instance
                features['content_simi'] = simi
            except:
                self.failed += 1
        # if link and link.strip():
        #     features['source'] = self.source.get(urlparse.urlparse(link).netloc, 0)
        return features
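# The weighted_jaccard helper used by featurize above is imported from elsewhere in
# the repo and is not shown in this excerpt. Below is a minimal sketch of one common
# definition (generalized Jaccard over term weights: sum of per-term minima divided
# by sum of per-term maxima), assuming inputs are lists of (term, weight) pairs like
# the Counter.most_common output above. The name is hypothetical, not the repo's helper.
def _weighted_jaccard_sketch(a, b):
    wa, wb = dict(a), dict(b)
    terms = set(wa) | set(wb)
    if not terms:
        return 0.0
    inter = sum(min(wa.get(t, 0), wb.get(t, 0)) for t in terms)
    union = sum(max(wa.get(t, 0), wb.get(t, 0)) for t in terms)
    return round(float(inter) / union, 4) if union else 0.0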
class RelatednessScorer(object):

    global logger_relate

    def __init__(self, model='default'):
        # self.segmenter = Segmenter()
        self.tagger = Tagger(cname=True)
        # self.extractor = Extractor(record=False)
        if model == 'easy':
            self.clf = joblib.load(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                                'dumps/backup/news.score.lrmodel'))
            self.vec = joblib.load(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                                'dumps/backup/news.featurizer'))
        else:
            self.clf = joblib.load(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                                'dumps/news.score.lrmodel'))
            self.vec = joblib.load(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                                'dumps/news.featurizer'))
        self.nf = NewsFeatures()
        self.db = dbcon.connect_torndb()

    def compare(self, cid, **kwargs):
        logger_relate.info('Compare news of company#%s' % cid)
        instance = dict(**kwargs)
        if instance.get('name'):
            self.tagger.add_word(instance.get('name'), tag='cn')
        name, title, content = instance.get('name'), instance.get('title'), instance.get('content')
        if len(content) <= 20:
            return False, 0
        # print self.nf.featurize(cid, name=name, title=title, content=content)
        news = self.vec.transform([self.nf.featurize(cid, name=name, title=title, content=content)])
        result = self.clf.predict_proba(news)[0]
        return (float(result[1]) > float(result[0])), round(result[1], 4)
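# Hedged usage sketch (illustration only): compare() returns a (related, score) pair,
# True when the logistic-regression model scores the news item as more likely related
# to the company than not; note that content must be longer than 20 characters or the
# method short-circuits to (False, 0). The company id and texts below are placeholders.
# scorer = RelatednessScorer()
# related, score = scorer.compare('some-company-id',
#                                 name=u'XX科技', title=u'XX科技完成A轮融资',
#                                 content=u'...full article text, longer than 20 characters...')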
class Extractor(object):

    def __init__(self):
        global word2vec_model, viptag_model_20171221, viptag_model_traditional, logger_tag
        logger_tag.info('Extractor model initing')
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()
        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(itags=True)
        self.wfilter = word_filter.get_default_filter()
        self.gang = GangTag()
        self.w2v = Word2Vec.load(word2vec_model)
        self.similarity_threshold = 0.4
        self.chain_simi_threshold = 0.25
        self.vip_tags = {t.name: t.id for t in dbutil.get_sectored_tags(self.db, 1)}
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)
        self.traditional_classifier = fasttext.load_model(viptag_model_traditional)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.important_lower = 0.1
        self.important_threshold = 0.2
        self.relevant_threshold = 0.4
        self.vip_lower = 0.3
        self.vip_threshold = 0.25
        self.important_max_num = 5
        self.max_contents_length = 20
        self.yellows = dbutil.get_yellow_tags(self.db)
        self.importants = set(t.name.lower() for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_weighted_tags()
        self.thesaurus_ids = self.__load_weighted_tags(tid=True)
        self.junk_terms = self.__load_junk_tags()
        self.replacements = {r['source']: r['replacement'] for r in self.mongo.keywords.replacement.find()}
        self.trusted_sources = dicts.get_known_company_source()
        self.general_tagger = GeneralTagger()
        logger_tag.info('Extractor model inited')

    def __load_trained_clfs(self):
        model_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'models')
        return {175747: joblib.load(os.path.join(model_dir, '175747.20180311.model'))}

    def __extract_source_tag(self, cid):
        tags = dbutil.get_source_company_tags(self.db, cid, self.trusted_sources)
        if tags:
            return set(chain(*[dbutil.analyze_source_tag(self.db, tname, self.replacements)
                               for tname in tags if tname and tname.strip()]))
        return set([])

    def __extract_important(self, contents, source_tags=None):
        # candidates generation
        candidates = {} if not source_tags else {}.fromkeys(source_tags, 1)
        for content, weight in contents:
            for tag in [x[0] for x in self.tagger.tag(content) if x[1] == 'itag' or x[0] in self.importants]:
                candidates[tag] = candidates.get(tag, 0) + weight
        if len(candidates) < 1:
            return {}
        # support assignment
        content_length = 0
        supports = {}
        for index, (content, dweight) in enumerate(contents):
            for word in self.wfilter([x[0] for x in self.tagger.tag(content)]):
                if word not in self.w2v:
                    continue
                content_length += 1
                for candidate in candidates.keys():
                    if candidate not in self.w2v:
                        continue
                    similarity = self.w2v.similarity(candidate, word)
                    if similarity > self.similarity_threshold:
                        supports.setdefault(candidate, []).append((index, dweight, similarity))
        # for k, v in supports.iteritems():
        #     print k, v
        # support selection
        results = {}
        csize = len(candidates)
        for candidate, v in supports.iteritems():
            # if (csize >= 2) and \
            #         (sum([y[1] for y in set([(x[0], x[1]) for x in v])]) < min(6, ceil(float(len(contents))/3))):
            #     continue
            support = sum([round(item[1]*item[2], 2) for item in v])
            if csize >= 2 and sum([round(item[2], 2) for item in v]) < content_length / 20:
                continue
            results[candidate] = support * self.thesaurus.get(candidate, 1)
        if len(results) == 0:
            return results
        # normalization
        normalizer = max(results.values())
        for k, v in results.items():
            # if round(v/normalizer, 2) < self.important_lower:
            #     continue
            results[k] = round(v/normalizer, 2)
        # narrow down results size
        if len(results) < 4:
            pass
        else:
            results = dict(filter(lambda x: x[1] > self.important_threshold, results.iteritems()))
        if len(results) > self.important_max_num:
            size = min(10, max(int(ceil(len(results)/2.0)), self.important_max_num))
            results = dict(sorted(results.iteritems(), key=lambda x: -x[1])[:size])
        return results

    def __extract_vectorrank(self, contents):
        pass

    def __extract_textrank(self, contents, topn=15):
        """
        weighted textrank, weights use tags' novelties
        """
        global textrank_window_size, textrank_threshold
        candidates = []
        for content, _ in contents:
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        # filter
        candidates = self.wfilter(candidates)
        # print ' '.join(candidates)
        if len(candidates) < 5:
            return
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        for i in xrange(len(candidates)):
            for j in xrange(i+1, i+textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus)
        index = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(), key=lambda x: -x[1])[:index]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def extract(self, cid, topn=15, fast=False, update_only=False):
        # general tag
        new_general = self.general_tagger.label(cid)
        if new_general:
            logger_tag.info('General Tag of %s, %s' % (cid, ','.join([str(tid) for tid in new_general])))
        contents = list(self.feeder.feed(cid, quanlity='medium'))
        results = {}
        if len(contents) > self.max_contents_length:
            contents = sorted(contents, key=lambda x: -x[1])[:self.max_contents_length]
        # source tags
        source_tags = self.__extract_source_tag(cid)
        # print ','.join(source_tags)
        # results = self.merge(results, {}.fromkeys(source_tags, 0.5))
        # important tag
        results = self.merge(results, self.__extract_important(contents, source_tags), 1)
        # regular tag
        results = self.merge(results, dict(self.__extract_textrank(contents, topn)))
        # verified tag
        results = self.merge(results, dict.fromkeys(dbutil.get_company_tags_verified(self.db, cid), 1))
        # topic tag
        results = self.merge(results, dict.fromkeys(dbutil.get_company_topics_tags(self.db, cid), 1.5))
        # normalize
        results = self.__normalize(results)
        # vip tags
        vips = self.update_vip_tags(cid, results, source_tags)
        # update contents based tags
        results = self.__normalize_replacement(results)
        try:
            new_tags, remove_tags = self.update_contents_tags(cid, results, source_tags, vips, topn)
        except Exception, e:
            new_tags, remove_tags = [], []
            logger_tag.info('Fail to update contents tags, %s, %s' % (cid, e))
        if not update_only:
            for remove_tag in remove_tags:
                dbutil.update_company_tag(self.db, cid, remove_tag, 0, active="N")
            logger_tag.info('Processed %s, new tags %s, removed %s'
                            % (cid, ','.join([str(tid) for tid in new_tags]),
                               ','.join([str(tid) for tid in remove_tags])))
        # process gang tags (faction tags)
        gangtag_ids = self.gang.predict(cid)
        for gangtagid in gangtag_ids:
            dbutil.update_company_tag(self.db, cid, gangtagid, 1.001)
        try:
            self.review(cid, contents)
        except Exception, e:
            logger_tag.exception('Review failed, %s, due to %s' % (cid, e))
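# UndirectWeightedGraph is defined elsewhere in this repo; the interface used above is
# add_edge(u, v, weight) and rank(word_weights[, promotions]). Below is a minimal sketch
# of a weighted TextRank over an undirected graph, with the final score of each node
# scaled by its thesaurus weight (novelty) and any promotion factor. The class name,
# damping factor and iteration count are assumptions for illustration, not the repo's
# actual implementation.
import collections


class _UndirectWeightedGraphSketch(object):

    d = 0.85  # damping factor, the usual TextRank default

    def __init__(self):
        self.graph = collections.defaultdict(list)

    def add_edge(self, start, end, weight):
        # undirected: register the edge in both directions
        self.graph[start].append((end, weight))
        self.graph[end].append((start, weight))

    def rank(self, word_weights=None, promotions=None, iterations=10):
        word_weights = word_weights or {}
        promotions = promotions or {}
        scores = dict.fromkeys(self.graph, 1.0)
        out_sum = {node: sum(w for _, w in edges) or 1.0
                   for node, edges in self.graph.iteritems()}
        for _ in xrange(iterations):
            for node, edges in self.graph.iteritems():
                rank = sum(scores[other] * weight / out_sum[other]
                           for other, weight in edges)
                scores[node] = (1 - self.d) + self.d * rank
        # scale by novelty / promotion weights, mirroring how the callers above pass them in
        return {node: score * word_weights.get(node, 1) * promotions.get(node, 1)
                for node, score in scores.iteritems()}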
class SubSector(object):

    def __init__(self):
        global word2vec_model
        self.mongo = dbcon.connect_mongo()
        self.w2v = Word2Vec.load(word2vec_model)
        self.similarity_threshold = 0.4
        self.important_lower = 0.1
        self.important_threshold = 0.2
        self.important_max_num = 5
        self.tagger = Tagger(tags=True)
        self.wfilter = word_filter.get_default_filter()

    def train(self):
        pass

    def extract_tag(self, nid):
        return self.__extract_itag(nid)

    def __extract_itag(self, nid):
        contents = list(self.mongo.article.news.find({"_id": nid}))[0]['contents']
        # candidates generation
        candidates = {}
        for content in contents:
            for tag in [x[0] for x in self.tagger.tag(content['content'])
                        if x[1] == 'itag' and x[0] in self.w2v.vocab]:
                candidates[tag] = candidates.get(tag, 0)
        # support assignment
        total = 0
        supports = {}
        for index, content in enumerate(contents):
            for word in self.wfilter([x[0] for x in self.tagger.tag(content['content'])]):
                if word not in self.w2v.vocab:
                    continue
                total += 1
                for candidate in candidates.keys():
                    similarity = self.w2v.similarity(candidate, word)
                    if similarity > self.similarity_threshold:
                        supports.setdefault(candidate, []).append((index, 1, similarity))
        # support selection
        results = {}
        for candidate, v in supports.iteritems():
            # if (csize >= 2) and \
            #         (sum([y[1] for y in set([(x[0], x[1]) for x in v])]) < min(6, ceil(float(len(contents))/3))):
            #     continue
            support = sum([round(item[1] * item[2], 2) for item in v])
            results[candidate] = support
        if len(results) == 0:
            return results
        # normalization, max weight equals to 1
        normalizer = max(results.values())
        for k, v in results.items():
            # if round(v/normalizer, 2) < self.important_lower:
            #     continue
            results[k] = round(v / normalizer, 2)
        # narrow down results size
        if len(results) < 4:
            pass
        else:
            results = dict(filter(lambda x: x[1] > self.important_threshold, results.iteritems()))
        if len(results) > self.important_max_num:
            size = min(10, max(int(ceil(len(results) / 2.0)), self.important_max_num))
            results = dict(sorted(results.iteritems(), key=lambda x: -x[1])[:size])
        return results
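# The tail of __extract_itag above (and the similar __extract_important methods in the
# other classes) normalizes candidate supports so the strongest candidate scores 1.0,
# drops weak candidates once there are at least four, and caps the result size. A
# standalone sketch of that trimming logic with made-up numbers; the helper name and
# example values are illustrative only.
def _trim_supports_sketch(results, threshold=0.2, max_num=5):
    from math import ceil
    if not results:
        return results
    normalizer = float(max(results.values()))
    results = {k: round(v / normalizer, 2) for k, v in results.items()}
    if len(results) >= 4:
        results = {k: v for k, v in results.items() if v > threshold}
    if len(results) > max_num:
        size = min(10, max(int(ceil(len(results) / 2.0)), max_num))
        results = dict(sorted(results.items(), key=lambda x: -x[1])[:size])
    return results

# e.g. _trim_supports_sketch({'ai': 8.0, 'saas': 4.0, 'cloud': 1.2, 'misc': 0.4})
# -> {'ai': 1.0, 'saas': 0.5}   (cloud and misc normalize to 0.15 and 0.05, below 0.2)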
import sys
sys.path.append('..')
reload(sys)  # in Python 2, site.py removes setdefaultencoding; reload(sys) restores it
sys.setdefaultencoding('utf-8')

import torndb
import os
import codecs
import time
import re
from random import random

from common import nlpconfig
from common.zhtools import stopword
from common.zhtools import hants
from common.zhtools.postagger import Tagger

tagger = Tagger('ltp')
doc_len_threshold = 10
year = re.compile(u'\d+年')
month = re.compile(u'\d+月')
day = re.compile(u'\d+日')
stopwords = stopword.get_standard_stopwords()


class Corpus(object):

    def __init__(self, extrc_func=None, dirs=None):
        self.extract = extrc_func if extrc_func else lambda x: x.strip()
        self.dirs = dirs

    def __iter__(self):
class KeywordExtractor(object):

    def __init__(self):
        global word2vec_model, viptag_model_20171221
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()
        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(tags=True)
        self.wfilter = word_filter.get_default_filter()
        self.w2v = Word2Vec.load(word2vec_model)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)
        self.yellows = dbutil.get_yellow_tags(self.db)
        self.vip_tags = {t.name: t.id for t in dbutil.get_sectored_tags(self.db, 1)}
        self.hyponym = {vip_name: set([dbutil.get_tag_name(self.db, tid)
                                       for tid in dbutil.get_hyponym_tags(self.db, vip_id)])
                        for vip_name, vip_id in self.vip_tags.iteritems()}
        self.importants = set(t.name.lower() for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_tag_novelties()
        self.thesaurus_ids = self.__load_tag_novelties(tid=True)
        self.tag_types = self.__load_tag_types()
        self.trusted_sources = dicts.get_known_company_source()
        self.replacements = {dbutil.get_tag_name(self.db, r['source']):
                             [dbutil.get_tag_name(self.db, rtid) for rtid in r['replacement']]
                             for r in self.mongo.keywords.replacement.find()}
        self.junk_terms = set(tag.name for tag in dbutil.get_tags_by_type(self.db, typeset=[11001]))
        self.similarity_threshold = 0.4
        self.textrank_window_size = 2
        self.textrank_threshold = 0
        self.source_tag_default_weight = 2
        self.vip_lower = 0.3
        self.important_threshold = 0.2
        self.important_max_count = 5
        print 'model inited'

    def __load_trained_clfs(self):
        model_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'models')
        clfs = {}
        for model_file in os.listdir(model_dir):
            if model_file.endswith('.model'):
                tid = model_file.split('.')[0]
                if not tid.isdigit():
                    # skip models whose filename prefix is not a numeric tag id
                    continue
                clfs[dbutil.get_tag_name(self.db, int(tid))] = joblib.load(os.path.join(model_dir, model_file))
        return clfs

    def __load_tag_novelties(self, tid=False):
        if not tid:
            return {tag.name: (tag.novelty or 1) for tag in dbutil.get_tags_by_type(self.db)}
        else:
            return {tag.id: (tag.novelty or 1) for tag in dbutil.get_tags_by_type(self.db)}

    def __load_tag_types(self):
        return {tag.name: (tag.type or 0) for tag in dbutil.get_tags_by_type(self.db)}

    def __extract_source_tag(self, cid):
        tags = dbutil.get_source_company_tags(self.db, cid, self.trusted_sources)
        if tags:
            return set(chain(*[dbutil.analyze_source_tag(self.db, tname, self.replacements)
                               for tname in tags if tname and tname.strip()]))
        return set([])

    def __extract_vecrank(self, candidates, candidates_important, candidates_vips, topn):
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        proper_hyponym = dict.fromkeys(
            set(chain(*[self.hyponym.get(dbutil.get_tag_name(self.db, cv))
                        for cv in candidates_vips.iterkeys()])), 2)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
            if candidates[i] not in self.w2v:
                continue
            for word, weight in candidates_important.items():
                if word == candidates[i] or word not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidates[i], word)
                if similarity > self.similarity_threshold:
                    weights[(candidates[i], word)] += similarity * weight
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus, proper_hyponym)
        topn = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(), key=lambda x: -x[1])[:topn]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def extract_vip(self, cid):
        desc = ' '.join(self.wfilter(self.seg.cut4search(self.feeder.feed_string(cid, 'with_tag'))))
        if not desc:
            return {}
        classifier_vips = [(int(tag.replace(u'__label__', '')), weight)
                           for (tag, weight) in self.vip_classifier.predict_proba([desc], 2)[0]
                           if weight > self.vip_lower]
        classifier_vips.sort(key=lambda x: -x[1])
        # if there are two candidate vip labels, check whether their probabilities are comparable
        if len(classifier_vips) == 2 and classifier_vips[0][1] > classifier_vips[1][1] * 2:
            return {classifier_vips[0][0]: classifier_vips[0][1]}
        return dict(classifier_vips)

    def __extract_important(self, contents, candidates):
        # support assignment
        supports = deepcopy(candidates)
        for word in contents:
            if word not in self.w2v:
                continue
            for candidate in candidates.keys():
                if candidate not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidate, word)
                if similarity > self.similarity_threshold:
                    supports[candidate] = supports.get(candidate, 0) + similarity
        # support selection
        results = {}
        candi_size, content_size = len(candidates), len(''.join(candidates))
        for candidate, weight in supports.iteritems():
            if candi_size >= 2 and weight < content_size / 20:
                continue
            results[candidate] = weight * self.thesaurus.get(candidate, 1)
        if len(results) == 0:
            return results
        # normalization
        normalizer = max(results.values())
        for k, v in results.items():
            results[k] = round(v / normalizer, 2)
        # narrow down results size
        if len(results) < 4:
            pass
        else:
            results = dict(filter(lambda x: x[1] > self.important_threshold, results.iteritems()))
        if len(results) > self.important_max_count:
            size = min(10, max(int(ceil(len(results) / 2.0)), self.important_max_count))
            results = dict(sorted(results.iteritems(), key=lambda x: -x[1])[:size])
        return results

    def __extract_textrank(self, candidates, topn=15):
        """
        weighted textrank, weights use tags' novelties
        """
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus)
        index = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(), key=lambda x: -x[1])[:index]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def __prepare_tag_contents(self, cid):
        # prepare contents
        contents = list(self.feeder.feed(cid, quanlity='medium'))
        candidates = []
        for content, _ in contents:
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        source_tags = self.__extract_source_tag(cid)
        candidates_important = {}
        for content, weight in contents:
            for tag in [x[0] for x in self.tagger.tag(content) if x[1] == 'itag' or x[0] in self.importants]:
                candidates_important[tag] = candidates_important.get(tag, 0) + weight
        for tag in source_tags:
            candidates_important[tag] = candidates_important.get(tag, 0) + self.source_tag_default_weight
        return source_tags, candidates, candidates_important

    def __normalize_replacement(self, tags):
        if type(tags) is dict:
            normalized_tags = {}
            for tag, weight in tags.items():
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags[replacement] = weight
                else:
                    normalized_tags[tag] = weight
        else:
            normalized_tags = []
            for tag in tags:
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags.append(replacement)
                else:
                    normalized_tags.append(tag)
        return normalized_tags

    def __normalize(self, d):
        if not d:
            return d
        normalizer = max(d.values()) + 1.0
        for tag, weight in d.items():
            type_promotion = {11011: 1, 11013: 1.5, 11012: 2.5}.get(self.tag_types.get(tag, 0), 0)
            d[tag] = round(weight / normalizer, 2) + type_promotion
        return d

    def merge(self, d1, d2, weight=0):
        # weight is a bonus weight
        for k, v in d2.iteritems():
            d1[k] = d1.get(k, 0) + v + weight
        return d1

    def extract(self, cid, topn=15):
        # prepare contents
        source_tags, candidates, candidates_important = self.__prepare_tag_contents(cid)
        candidates_vips = self.extract_vip(cid)
        # generate results
        results = dict(self.__extract_vecrank(candidates, candidates_important, candidates_vips, topn))
        results = self.merge(results, {dbutil.get_tag_name(self.db, tid): w
                                       for tid, w in candidates_vips.iteritems()})
        # results = self.merge(results, self.__extract_important(candidates, candidates_important), 1)
        # results = self.merge(results, dict(self.__extract_textrank(candidates, topn)))
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        return results

    def extract_from_text(self, text):
        candidates = []
        for content, _ in text.iteritems():
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        candidates_important = {}
        for content, weight in text.iteritems():
            for tag in [x[0] for x in self.tagger.tag(content) if x[1] == 'itag' or x[0] in self.importants]:
                candidates_important[tag] = candidates_important.get(tag, 0) + weight
        desc = ' '.join(self.wfilter(self.seg.cut4search(' '.join(text.keys()))))
        candidates_vips = {int(tag.replace(u'__label__', '')): weight
                           for (tag, weight) in self.vip_classifier.predict_proba([desc], 3)[0]
                           if weight > self.vip_lower}
        results = {}
        results = self.merge(results, self.__extract_important(candidates, candidates_important), 1)
        results = self.merge(results, dict(self.__extract_textrank(candidates, 10)))
        # results = dict(self.__extract_vecrank(candidates, candidates_important, candidates_vips, 10))
        results = self.merge(results, {dbutil.get_tag_name(self.db, tid): w
                                       for tid, w in candidates_vips.iteritems()})
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        deducts = self.__deduct_2nd(results)
        if len(deducts) < 3:
            results = self.merge(results, deducts)
        return results

    def __deduct_2nd(self, tags):
        deduct = []
        tags = [(dbutil.get_tag_id(self.db, t)[0], t) for t in tags.keys()]
        for (tid, tag) in tags:
            if self.tag_types.get(tag, 0) == 11013:
                t1s = dbutil.get_hypernym_tags(self.db, tid, 1)
                for t1 in set(t1s) & set([t[0] for t in tags]):
                    t2s = set(dbutil.get_hyponym_tags(self.db, t1, 2)) & set(dbutil.get_hypernym_tags(self.db, tid, 2))
                    for t2 in t2s:
                        if t2 not in set([t[0] for t in tags]):
                            deduct.append(t2)
        return {dbutil.get_tag_name(self.db, t2): 2.49 for t2 in deduct}
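# Hedged usage sketch (illustration only): extract() returns a {tag_name: weight} dict
# for a company id, combining the vecrank, vip-classifier and replacement-normalization
# steps above; extract_from_text() does the same for a free-form {text: weight} mapping.
# The company id and text below are placeholder values, not data from this repo.
# extractor = KeywordExtractor()
# company_tags = extractor.extract('some-company-id', topn=15)
# text_tags = extractor.extract_from_text({u'一家做企业级SaaS服务的公司': 1})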