Exemple #1
0
    def __find_industry(self, codes):
        """Look up the active industries attached to the given company codes.

        :param codes: iterable of company codes; each one is resolved to a
            company id through ``dbutil.get_id_from_code``.
        :return: list of ``{"id", "code", "name"}`` dicts, one per related
            industry whose ``active`` flag is ``None`` or ``'Y'``.
        """
        # Step 1: translate every code into its company id.
        company_ids = []
        for code in codes:
            company_ids.append(dbutil.get_id_from_code(self.db, code))

        # Step 2: gather the industry ids related to those companies.
        relations = dbutil.get_company_industries(self.db, company_ids, True)
        industry_ids = [relation.industryId for relation in relations]

        # Step 3: load each industry record.
        industries = [dbutil.get_industry_info(self.db, industry_id)
                      for industry_id in industry_ids]

        # Step 4: keep only the industries that are not deactivated.
        found = []
        for industry in industries:
            if industry.active is None or industry.active == 'Y':
                found.append({"id": industry.id, "code": industry.code, "name": industry.name})
        return found
Exemple #2
0
    def fit_company(self, client):
        """Link companies to this industry from its tag and its search rules.

        Tag step (only when ``self.tag`` is set): every company returned by
        ``dbutil.get_company_from_tag`` is stored with ``active='P'``,
        ``source='tag'`` and ``modify=139``.  Promotion to ``'Y'`` based on
        the ``verify`` column is currently disabled, so the ``verify`` and
        ``modify`` values coming back from the query are ignored.

        Rule step (only when ``self.company_rules`` is set): the rules are
        sent to ``client.search('topic', ...)`` and each returned company
        code, walked in reversed order, is stored with ``source='rule'``.

        Finally ``self.expand()`` is invoked when ``self.auto_expand`` is set.

        :param client: search client exposing ``search(index, query=...)``.
        """
        if self.tag:
            tagged = dbutil.get_company_from_tag(self.db, self.tag, with_verify=True)
            for company_id, _verify, _modify in tagged:
                # With activation pinned to 'P' the modifier is always 139.
                dbutil.update_industry_company(
                    self.db, self.idid, company_id,
                    active='P', source='tag', modify=139)

        if self.company_rules:
            response = client.search('topic', query=self.company_rules)
            matched = response.get('company', {}).get('data', [])
            if matched:
                # The result list is reversed in place before being processed.
                matched.reverse()
                for code in matched:
                    company_id = dbutil.get_id_from_code(self.db, code)
                    dbutil.update_industry_company(
                        self.db, self.idid, company_id, source='rule')

        if self.auto_expand:
            self.expand()
Exemple #3
0
    def __process_visit_web(self, message):
        """Record a ``visit_local`` task for a web visit by a verified investor.

        The task is created only when all of these hold, checked in order:
        the company needs verification, the message carries a user id
        (``detail``), that user is not one of ``self.xiniu_users``, and
        ``dbutil.exist_verified_investor`` confirms the user.

        :param message: mapping with the company code under ``'id'`` and the
            visiting user id under ``'detail'``.
        """
        company_id = dbutil.get_id_from_code(self.db, message.get('id'))
        # A missing 'detail' is mapped to the sentinel -1 ("no user").
        user_id = int(message.get('detail', -1))

        if not self.need_verify(company_id):
            return
        if user_id == -1 or user_id in self.xiniu_users:
            return
        if not dbutil.exist_verified_investor(self.db, user_id):
            return

        self.update_task(company_id, 'visit_local', message.get('detail'))
Exemple #4
0
def extract_pack():
    """Run the key extractor over every company code in ``files/todo.pack``.

    The pack file is read line by line.  Each stripped line is treated as a
    company code, resolved to a company id with ``dbutil.get_id_from_code``
    and handed to ``Extractor.extract``; the code is printed once it has been
    processed.

    The pack file and the database connection are both released when the
    function finishes, including when an extraction raises.
    """
    from key import Extractor
    extractor = Extractor()
    db = dbcon.connect_torndb()
    try:
        # ``with`` closes the pack file even if an extraction fails midway.
        with open('files/todo.pack') as pack:
            for line in pack:
                code = line.strip()
                extractor.extract(dbutil.get_id_from_code(db, code))
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print(code)
    finally:
        # Do not leave the connection open after the batch is done.
        db.close()
Exemple #5
0
def dump():
    """Write the train/test corpus files for the "traditional" label.

    Output files are ``tmp/traditional.basic.train`` and
    ``tmp/traditional.basic.test``; each line has the form
    ``__label__<0|1> <space-joined tokens>``, the tokens coming from
    ``Feeder.feed_seged_fine``.

    * Positive samples (label 1): companies carrying tag 604330 that are
      verified and active, plus every company code listed in
      ``files/traditional``.
    * Negative samples (label 0): up to 15000 randomly ordered companies
      modified after 2018-03-01 with a non-null ``modifyuser``, excluding
      any company already used as a positive.

    Each sample goes to the test file when ``randint(0, 4)`` returns 0 and to
    the train file otherwise (roughly a 20/80 split).

    All files and the database connection are closed even when a step fails.
    """
    db = dbcon.connect_torndb()
    try:
        feeder = Feeder()
        mode = 'basic'
        with codecs.open('tmp/traditional.%s.train' % mode, 'w', 'utf-8') as ftrain, \
                codecs.open('tmp/traditional.%s.test' % mode, 'w', 'utf-8') as ftest:

            def write_samples(company_ids, label):
                # Emit one labelled line per company into the train or test file.
                for company_id in company_ids:
                    content = ' '.join(feeder.feed_seged_fine(company_id))
                    target = ftrain if randint(0, 4) else ftest
                    target.write('__label__%d %s\n' % (label, content))

            cids = [
                c.companyId for c in db.query(
                    "select distinct companyId from company_tag_rel where tagId=604330 "
                    "and verify='Y' and (active is null or active='Y');")
            ]
            # Extra positives are listed by company code, one per line.
            with codecs.open('files/traditional') as seeds:
                cids.extend([
                    dbutil.get_id_from_code(db, code.strip()) for code in seeds
                ])
            write_samples(cids, 1)
            print('Positive Done')

            candidates = [
                c.id for c in db.query(
                    "select id from company where modifytime>'2018-03-01' "
                    "and modifyuser is not null order by rand() limit 15000;")
            ]
            # Set lookup keeps the positive-exclusion check O(1) per candidate.
            positives = set(cids)
            write_samples(
                [cid for cid in candidates if cid not in positives], 0)
            print('Negetive Done')
    finally:
        db.close()
Exemple #6
0
    def infer_rules(self):
        """Tag the companies matched by each rule-based tag.

        For every tag returned by ``dbutil.get_ruled_tags`` the rule text is
        normalised (full-width comma and parentheses mapped to their ASCII
        forms, spaces removed, lower-cased), turned into a search query with
        ``generate_rule_based_query`` and run against the ``topic`` index.
        Every matched company that does not already carry the tag receives it
        with weight 1.505.

        A rule matching more than 2000 companies is skipped with a warning.
        Any failure while handling one tag is logged with its traceback and
        processing continues with the next tag.
        """
        for t in dbutil.get_ruled_tags(self.db):
            logger_tag.info('Processing rule for %s' % t.name)
            try:
                # Escapes keep the full-width characters unambiguous and make
                # the literals independent of the source file encoding.
                rule = (t.rule
                        .replace(u'\uff0c', u',')
                        .replace(u'\uff08', u'(')
                        .replace(u'\uff09', u')')
                        .replace(u' ', u'')
                        .lower())
                rule = generate_rule_based_query(rule)
                if not rule:
                    continue
                codes = self.client.search('topic', query=rule).get('company', {}).get('data', [])
                if len(codes) > 2000:
                    # Not inside an exception handler, so there is no
                    # traceback to attach: log a warning, not exception().
                    logger_tag.warning('To many results, %s, %s' % (t.name, len(codes)))
                    continue
                logger_tag.info('%s processed' % t.name)
                for code in codes:
                    cid = dbutil.get_id_from_code(self.db, code)
                    if not dbutil.exist_company_tag(self.db, cid, t.id):
                        dbutil.update_company_tag(self.db, cid, t.id, 1.505)
            except Exception as e:
                logger_tag.exception('Fail to process tag rules %s, due to %s' % (t.name, e))
Exemple #7
0
def train():
    """Collect labelled news feature samples into ``setx`` / ``sety``.

    Positive samples (label 1) come from up to 2000 ``article.news`` documents
    with ``source`` 13030 whose joined content is longer than 50 characters.
    Negative samples (label 0) come from the ``corpus/news.badcase`` file that
    sits next to this module: each line is tab-separated, the first field is
    a company code and the remaining non-blank fields are news ``_id`` values.

    NOTE(review): nothing in the lines shown here consumes ``setx``, ``sety``
    or ``vec`` -- presumably fitting happens in code following this point;
    confirm against the full file.
    """

    nf = NewsFeatures()
    db = dbcon.connect_torndb()
    mongo = dbcon.connect_mongo()
    vec = DictVectorizer()
    # Sources whose value in nf.source is at most 20; only referenced by the
    # disabled "makeup" sample generation further down.
    random_source = [item[0] for item in nf.source.items() if item[1] <= 20]
    # Parallel lists: setx holds featurize() results, sety the 1/0 labels.
    setx, sety = [], []

    print 'init'

    for index, item in enumerate(mongo.article.news.find({'source': 13030}).limit(2000)):

        # Progress indicator every 50 documents.
        if index % 50 == 0:
            print index

        link = item.get('link')
        cid = item.get('companyId')
        name = dbutil.get_company_name(db, cid)
        title = item.get('title')
        # Join the non-blank, stripped 'content' fragments with newlines.
        content = []
        for v in item.get('contents'):
            if v.get('content') and v.get('content').strip():
                content.append(v.get('content').strip())
        content = '\n'.join(content)

        # Only articles with more than 50 characters of content become samples.
        if len(content) > 50:
            normal = nf.featurize(cid, name=name, title=title, content=content, link=link)
            setx.append(normal)
            sety.append(1)

            # Disabled: synthetic negatives built from a random company id.
            # rcid = dbutil.random_company_id(db)
            # rname = name[:-1]
            # makeup = nf.featurize(rcid, name=rname, title=title, content=content,
            #                       link=choice(random_source), name_update=False)
            # # print makeup
            # if makeup.get('content_simi', 0) > 0.02:
            #     print 'makeup', makeup, rcid
            # print 'mongo', cid, item.get('_id'), normal
            # setx.append(makeup)
            # sety.append(0)

    # load bad cases
    with codecs.open(os.path.join(os.path.split(os.path.realpath(__file__))[0], 'corpus/news.badcase'),
                     encoding='utf-8') as f:
        for line in f:
            try:
                # First tab-separated field: company code; the rest: news ids.
                cid = dbutil.get_id_from_code(db, line.split('\t')[0])
                nids = [nid for nid in line.strip().split('\t')[1:] if nid.strip()]
            except Exception, e:
                # A line that cannot be parsed is reported and skipped.
                print 'fail to load bad case', line
                continue
            for nid in nids:
                try:
                    item = mongo.article.news.find({'_id': ObjectId(nid)})[0]
                    link = item.get('link')
                    name = dbutil.get_company_name(db, cid)
                    title = item.get('title')
                    content = []
                    for v in item.get('contents'):
                        if v.get('content') and v.get('content').strip():
                            content.append(v.get('content').strip())
                    content = '\n'.join(content)
                    # Bad cases are featurized with name_update=False and labelled 0.
                    setx.append(nf.featurize(cid, name=name, title=title, content=content, link=link, name_update=False))
                    sety.append(0)
                except Exception, e:
                    # A failure on one news id is reported; the loop moves on.
                    print 'nid', nid, e