Example 1
    def for_pivot(label, pred, output):
        # Cross-tabulate actual top-1 labels against top-1 predictions and dump the pivot as TSV.
        pivot_map = dict()
        for i_ in xrange(len(label)):
            l_, p_ = label[i_][0], pred[i_][0][0]
            pivot_map[l_] = pivot_map.get(l_, dict())
            pivot_map[l_][p_] = pivot_map[l_].get(p_, 0) + 1
        with codecs.open(output, 'w', 'utf-8') as fo:
            fo.write('%-8s\t%-8s\t%s\n' % ('Actual', 'Predict', 'Count'))
            for k, v in pivot_map.items():
                for k_, v_ in v.items():
                    fo.write('%-8s\t%-8s\t%d\n' % (dbutil.get_tag_name(
                        db, k), dbutil.get_tag_name(db, k_), v_))
Example 2
    def __init__(self):

        global word2vec_model, viptag_model_20171221
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(tags=True)
        self.wfilter = word_filter.get_default_filter()

        self.w2v = Word2Vec.load(word2vec_model)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)

        self.yellows = dbutil.get_yellow_tags(self.db)
        self.vip_tags = {
            t.name: t.id
            for t in dbutil.get_sectored_tags(self.db, 1)
        }
        self.hyponym = {
            vip_name: set([
                dbutil.get_tag_name(self.db, tid)
                for tid in dbutil.get_hyponym_tags(self.db, vip_id)
            ])
            for vip_name, vip_id in self.vip_tags.iteritems()
        }
        self.importants = set(
            t.name.lower()
            for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_tag_novelties()
        self.thesaurus_ids = self.__load_tag_novelties(tid=True)
        self.tag_types = self.__load_tag_types()
        self.trusted_sources = dicts.get_known_company_source()
        self.replacements = {
            dbutil.get_tag_name(self.db, r['source']):
            [dbutil.get_tag_name(self.db, rtid) for rtid in r['replacement']]
            for r in self.mongo.keywords.replacement.find()
        }
        self.junk_terms = set(
            tag.name
            for tag in dbutil.get_tags_by_type(self.db, typeset=([11001])))

        self.similarity_threshold = 0.4
        self.textrank_window_size = 2
        self.textrank_threshold = 0
        self.source_tag_default_weight = 2
        self.vip_lower = 0.3
        self.important_threshold = 0.2
        self.important_max_count = 5

        print 'model inited'
Example 3
def stat(output, *sources):
    with codecs.open(output, 'w', 'utf-8') as fo:
        for s in sources:
            count = _count(s)
            summary = '\nTagId\t\t\tTag Name\t\tCount\n'\
               + '\n'.join(['%-8s\t\t%-8s\t\t%-8d' % (k, dbutil.get_tag_name(db, k), v) for k, v in count.items()])\
               + '\nSum\t:%-8d Max\t:%-8d Min\t:%-8d' % (sum(count.values()), max(count.values()), min(count.values()))
            fo.write('%s:\n------\n' % (s.split('/')[-1]) + summary + '\n\n\n')
Example 4
def dump_sectors():

    db = dbcon.connect_torndb()
    with codecs.open('dumps/xiniu.tag', 'w', 'utf-8') as fo:
        for t1 in dbutil.get_sectored_tags(db, 1):
            for t2id in dbutil.get_tags_by_relation(db, t1.id, 54041):
                for t3id in dbutil.get_tags_by_relation(db, t2id, 54041):
                    fo.write('%s\t%s\t%s\n' % (t1.name, dbutil.get_tag_name(db, t2id), dbutil.get_tag_name(db, t3id)))
    db.close()
Example 5
def dump():

    global mapping
    mongo = dbcon.connect_mongo()
    db = dbcon.connect_torndb()
    ke = KeywordExtractor()
    raw = mongo.raw.qmp.find(
        {
            "url": "http://vip.api.qimingpian.com/d/c3",
            "processed": True
        }, {
            'postdata': 1,
            'data.basic': 1
        })
    results = {}
    fo = codecs.open('dumps/20180726', 'w', 'utf-8')
    for qmp in raw:
        basic = qmp.get('data', {}).get('basic')
        # skip records whose crawl payload has no basic profile
        if not basic:
            continue
        tags = []
        tags.append(basic.get('hangye1', ''))
        tags.append(basic.get('hangye2', ''))
        tags.extend(basic.get('tags_match', '').split('|'))
        tags = [tag for tag in tags if tag.strip()]
        sc = db.get(
            'select companyId from source_company where source=13121 and sourceId=%s;',
            qmp['postdata']['id'])
        tag_qmp = set(tags) & set(mapping.keys())
        if not tag_qmp:
            continue
        if not (sc and sc.companyId):
            continue
        original = copy(tag_qmp)
        tag_qmp = [mapping.get(tag) for tag in tag_qmp]
        tag_xiniu = [
            dbutil.get_tag_name(db, tid)
            for tid in ke.extract_vip(sc.companyId).keys()
        ]
        url = 'http://www.xiniudata.com/company/%s/overview' % dbutil.get_company_code(
            db, sc.companyId)
        desc = db.get('select brief from company where id=%s;',
                      sc.companyId).brief
        desc = desc.replace('\n', '') if desc else ''
        if set(tag_qmp) & set(tag_xiniu):
            # results[1] = results.get(1, 0) + 1
            fo.write('%s\t%s\t1\t%s\t%s\n' %
                     (','.join(original), ','.join(tag_xiniu), url, desc))
        else:
            fo.write('%s\t%s\t0\t%s\t%s\n' %
                     (','.join(original), ','.join(tag_xiniu), url, desc))
            # results[0] = results.get(0, 0) + 1
    fo.close()
    for k, v in results.items():
        print k, v
Example 6
    def __load_trained_clfs(self):

        model_dir = os.path.join(
            os.path.split(os.path.realpath(__file__))[0], 'models')
        clfs = {}
        for model_file in os.listdir(model_dir):
            if model_file.endswith('.model'):
                tid = model_file.split('.')[0]
                # model files are expected to be named by a numeric tag id; tid is a
                # string here, so check that it is numeric before converting
                if not tid.isdigit():
                    continue
                clfs[dbutil.get_tag_name(self.db, int(tid))] = joblib.load(
                    os.path.join(model_dir, model_file))
        return clfs
Example 7
def predict(model, k=3, cid=None, raw_info=None):
    clf = fasttext.load_model(model, encoding='utf-8')
    if cid or raw_info:
        content = dbutil.get_company_info(db,
                                          cid).description if cid else raw_info
        content = [
            ' '.join(nf.wfilter(nf.seg.cut4search(content.replace('\n', ''))))
        ]
        return '\n'.join([
            '%-8s\t%f' %
            (dbutil.get_tag_name(db, l.replace(u'__label__', u'')), p)
            for l, p in clf.predict_proba(content, k=k)[0]
        ])
    return 'No company id or text found.'
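
A minimal call sketch, assuming the surrounding module provides db, nf and dbutil as used above; the model path and company id below are hypothetical placeholders:

# hypothetical model path and company id, for illustration only
print predict('models/viptag_20171221.bin', k=3, cid=123456)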
Example 8
    def __deduct_2nd(self, tags):

        deduct = []
        tags = [(dbutil.get_tag_id(self.db, t)[0], t) for t in tags.keys()]
        for (tid, tag) in tags:
            if self.tag_types.get(tag, 0) == 11013:
                t1s = dbutil.get_hypernym_tags(self.db, tid, 1)
                for t1 in set(t1s) & set([t[0] for t in tags]):
                    t2s = set(dbutil.get_hyponym_tags(self.db, t1, 2)) & set(
                        dbutil.get_hypernym_tags(self.db, tid, 2))
                    for t2 in t2s:
                        if t2 not in set([t[0] for t in tags]):
                            deduct.append(t2)
        return {dbutil.get_tag_name(self.db, t2): 2.49 for t2 in deduct}
Example 9
    def summary(label, pred, output):
        # Write a per-tag precision/recall summary as TSV.
        pt, pos, true = dict(), dict(), dict()
        for i_ in xrange(len(label)):
            for l_ in label[i_]:
                true[l_] = true.get(l_, 0) + 1
            for p_ in pred[i_]:
                pos[p_] = pos.get(p_, 0) + 1
                if p_ in label[i_]:
                    pt[p_] = pt.get(p_, 0) + 1
        with codecs.open(output, 'w', 'utf-8') as fo:
            fo.write('Tag\t\tPrecision\tRecall\t\tPredict\t\tActual\n')
            for k in true:
                # guard against tags that were never predicted
                tp, predicted = pt.get(k, 0), pos.get(k, 0)
                precision = float(tp) / predicted if predicted else 0.0
                recall = float(tp) / true[k]
                fo.write(
                    '%-8s\t%-8f\t%-8f\t%-8d\t%-8d\n' %
                    (dbutil.get_tag_name(db, k), precision, recall, predicted, true[k]))
Example 10
def get_investor_portfolio_companies(db, mongo, iid):

    companies = {}
    pfls = db.query(
        'select distinct company.id cid '
        'from company, funding, funding_investor_rel rel, corporate cp '
        'where rel.investorId=%s and funding.corporateId = company.corporateId '
        'and (company.active is null or company.active="Y") '
        'and company.corporateId=cp.id and (cp.active is null or cp.active="Y") '
        'and rel.fundingId=funding.id and (funding.active is null or funding.active="Y") '
        'and (rel.active is null or rel.active="Y") '
        'and funding.fundingDate>="2013-01-01" and funding.fundingDate<="2018-06-01" '
        'order by fundingDate asc;', iid)
    cids = [pfl.cid for pfl in pfls]
    # tags
    tags = {}
    for cid in cids:
        for t in dbutil.get_company_tags_info(db, cid, [11012, 11013]):
            tags[t.tid] = tags.get(t.tid, 0) + 1
    normalizer = sum(tags.values())
    ntags = {
        dbutil.get_tag_name(db, tid): round(count * 1.0 / normalizer, 4)
        for tid, count in tags.items()
    }
    companies['tags'] = sorted(ntags.items(), key=lambda x: -x[1])[:20]
    # count of news
    y2017 = datetime.strptime('2017-01-01', '%Y-%m-%d')
    companies['news'] = len(
        list(
            mongo.article.news.find({
                'investorIds': iid,
                'processStatus': 1,
                'date': {
                    '$gte': y2017
                }
            })))
    # locations
    locations = [dbutil.get_company_location(db, cid)[1] for cid in cids]
    locations = {
        l: round(locations.count(l) * 1.0 / len(locations), 4)
        for l in set(locations)
    }
    companies['location'] = locations
    return companies
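
A usage sketch, assuming the dbcon connection helpers shown in the other examples; the investor id below is a placeholder:

# illustrative call site; investor id 42 is a placeholder
db = dbcon.connect_torndb()
mongo = dbcon.connect_mongo()
portfolio = get_investor_portfolio_companies(db, mongo, 42)
print portfolio['tags'][:5], portfolio['news'], portfolio['location']
db.close()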
Example 11
def predict(model, data_path=None, out_path=None, text=None):
    if not text and not data_path:
        raise ValueError('Provide either raw text or a data path')
    clf = fasttext.load_model(model, encoding='utf-8')
    if text:
        content = [' '.join(seg.cut4search(i)) for i in text]
    else:
        df = pd.read_csv(data_path, index_col='ID', encoding='utf_8_sig')
        content = [' '.join(seg.cut4search(i)) for i in df[u'原文本']]
    preds = clf.predict_proba(content, k=10)
    tags = []
    for pred in preds:
        pred_sum = sum(p[1] for p in pred)
        tags.append(' '.join(
            dbutil.get_tag_name(db, int(p[0].replace(u'__label__', '')))
            for p in pred if p[1] > 0.05 * pred_sum))
    if text:
        return tags
    df['tag'] = tags
    df.to_csv(out_path, encoding='utf_8_sig')
Example 12
    def extract(self, cid, topn=15):

        # prepare contents
        source_tags, candidates, candidates_important = self.__prepare_tag_contents(
            cid)
        candidates_vips = self.extract_vip(cid)

        # generate results
        results = dict(
            self.__extract_vecrank(candidates, candidates_important,
                                   candidates_vips, topn))
        results = self.merge(
            results, {
                dbutil.get_tag_name(self.db, tid): w
                for tid, w in candidates_vips.iteritems()
            })
        # results = self.merge(results, self.__extract_important(candidates, candidates_important), 1)
        # results = self.merge(results, dict(self.__extract_textrank(candidates, topn)))
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        return results
Example 13
    def extract_from_text(self, text):

        candidates = []
        for content, _ in text.iteritems():
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        candidates_important = {}
        for content, weight in text.iteritems():
            for tag in [
                    x[0] for x in self.tagger.tag(content)
                    if x[1] == 'itag' or x[0] in self.importants
            ]:
                candidates_important[tag] = candidates_important.get(
                    tag, 0) + weight
        desc = ' '.join(
            self.wfilter(self.seg.cut4search(' '.join(text.keys()))))
        candidates_vips = {
            int(tag.replace(u'__label__', '')): weight
            for (tag,
                 weight) in self.vip_classifier.predict_proba([desc], 3)[0]
            if weight > self.vip_lower
        }
        results = {}
        results = self.merge(
            results, self.__extract_important(candidates,
                                              candidates_important), 1)
        results = self.merge(results,
                             dict(self.__extract_textrank(candidates, 10)))
        # results = dict(self.__extract_vecrank(candidates, candidates_important, candidates_vips, 10))
        results = self.merge(
            results, {
                dbutil.get_tag_name(self.db, tid): w
                for tid, w in candidates_vips.iteritems()
            })
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        deducts = self.__deduct_2nd(results)
        if len(deducts) < 3:
            results = self.merge(results, deducts)
        return results
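
The text argument is expected to map each piece of content to a weight (it is iterated with iteritems and the weights accumulate into candidates_important). A minimal sketch, assuming these methods live on the KeywordExtractor class instantiated in Example 5; the sample strings and weights are made up:

# hypothetical input: {content: weight}
ke = KeywordExtractor()
tags = ke.extract_from_text({u'AI-driven medical imaging platform': 1.0,
                             u'deep learning diagnosis tools': 0.5})
print tags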
Example 14
    def __extract_vecrank(self, candidates, candidates_important,
                          candidates_vips, topn):

        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        # hyponyms of the predicted vip tags get a fixed boost weight of 2; default to
        # an empty set so a vip tag without a hyponym entry does not break chain()
        proper_hyponym = dict.fromkeys(
            set(
                chain(*[
                    self.hyponym.get(dbutil.get_tag_name(self.db, cv), set())
                    for cv in candidates_vips.iterkeys()
                ])), 2)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
            if candidates[i] not in self.w2v:
                continue
            for word, weight in candidates_important.items():
                if word == candidates[i] or word not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidates[i], word)
                if similarity > self.similarity_threshold:
                    weights[(candidates[i], word)] += similarity * weight
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus, proper_hyponym)
        topn = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(),
                                  key=lambda x: -x[1])[:topn]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1
Example 15
def testing(test, model, path, one_label=True, pivot=True, auc=True):
    clf = fasttext.load_model(model, encoding='utf-8')
    tids, labels, contents = list(), list(), list()
    for line in codecs.open(test, encoding='utf-8'):
        tid, rest = line.split(' ', 1)
        tids.append(tid), labels.append([])
        while rest.startswith(u'__label__'):
            label, rest = rest.split(' ', 1)
            labels[-1].append(label.replace(u'__label__', u''))
        contents.append(rest)
    preds = [[(l.replace(u'__label__', u''), p) for l, p in lp]
             for lp in clf.predict_proba(contents, k=3)]
    with codecs.open(path + 'predict', 'w', 'utf-8') as fo:
        for i in xrange(len(tids)):
            ab_res = 'T' if labels[i][0] == preds[i][0][0] else 'F'
            res = 'T' if labels[i][0] in [l for l, _ in preds[i]] else 'F'
            fo.write(
                '%-8s%s\t%s\t%-30s%-30s%-s\n' %
                (tids[i], ab_res, res, dbutil.get_company_name(db, tids[i]),
                 '&'.join([dbutil.get_tag_name(db, l)
                           for l in labels[i]]), '\t'.join([
                               dbutil.get_tag_name(db, l) + ' ' + str(p)
                               for l, p in preds[i]
                           ])))

    def summary(label, pred, output):
        pt, pos, true = dict(), dict(), dict()
        for i_ in xrange(len(label)):
            for l_ in label[i_]:
                true[l_] = true.get(l_, 0) + 1
            for p_ in pred[i_]:
                pos[p_] = pos.get(p_, 0) + 1
                if p_ in label[i_]:
                    pt[p_] = pt.get(p_, 0) + 1
        with codecs.open(output, 'w', 'utf-8') as fo:
            fo.write('Tag\t\tPrecision\tRecall\t\tPredict\t\tActual\n')
            for k in true:
                # guard against tags that were never predicted
                tp, predicted = pt.get(k, 0), pos.get(k, 0)
                precision = float(tp) / predicted if predicted else 0.0
                recall = float(tp) / true[k]
                fo.write(
                    '%-8s\t%-8f\t%-8f\t%-8d\t%-8d\n' %
                    (dbutil.get_tag_name(db, k), precision, recall, predicted, true[k]))

    def for_pivot(label, pred, output):
        pivot_map = dict()
        for i_ in xrange(len(label)):
            l_, p_ = label[i_][0], pred[i_][0][0]
            pivot_map[l_] = pivot_map.get(l_, dict())
            pivot_map[l_][p_] = pivot_map[l_].get(p_, 0) + 1
        with codecs.open(output, 'w', 'utf-8') as fo:
            fo.write('%-8s\t%-8s\t%s\n' % ('Actual', 'Predict', 'Count'))
            for k, v in pivot_map.items():
                for k_, v_ in v.items():
                    fo.write('%-8s\t%-8s\t%d\n' % (dbutil.get_tag_name(
                        db, k), dbutil.get_tag_name(db, k_), v_))

    def roc_auc(label, pred):
        y_true, y_prob = list(), list()
        for i_ in xrange(len(label)):
            # count a top-1 hit as the positive class; score with the top-1 probability
            y_true.append(1 if label[i_][0] == pred[i_][0][0] else 0)
            y_prob.append(pred[i_][0][1])
        fpr, tpr, thresholds = metrics.roc_curve(y_true, y_prob)
        auc_score = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, label='ROC curve (area = %.2f)' % auc_score)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.show()
        return auc_score

    if one_label:
        summary([[l[0]] for l in labels],
                [[ps[0][0].replace(u'__label__', u'')] for ps in preds],
                path + 'one_label')
    else:
        summary(labels, [[p[0].replace(u'__label__', u'') for p in ps]
                         for ps in preds], path + 'mul_labels')
    if pivot:
        for_pivot(labels, preds, path + 'pivot')
    if auc:
        print 'AUC: %f' % roc_auc(labels, preds)
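
A call sketch, assuming a fastText-style test file whose lines start with a company id followed by one or more __label__ tags; all paths below are hypothetical:

# hypothetical paths; writes the 'predict', 'one_label' and 'pivot' reports under reports/
testing('data/tag.test', 'models/tag.bin', 'reports/', one_label=True, pivot=True, auc=False)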
Example 16
    def __get_sector_filter(self, source, ftype):

        sector_filters = self.mongo.keywords.sector_filters.find_one({'source': source, 'filter_type': ftype})
        sector_filters = sector_filters.get('sectors', []) if sector_filters else []
        sector_filters = [dbutil.get_tag_name(self.db, tid) for tid in sector_filters]
        return sector_filters