Beispiel #1
0
def generate_sector_filters():
    """Rebuild the documents of the ``keywords.sector_filters`` collection.

    One document is upserted per source, keyed by ``source`` and
    ``filter_type``:

    * ``tag``      - every tag of type 11054, keyed by tag name
    * ``industry`` - every industry, keyed by industry id
    * ``topic``    - every topic, keyed by topic id
    * ``investor`` - every investor, keyed by investor name

    For tags, industries and topics the sectors come from the companies
    attached to the source; sources without companies are skipped. For
    investors the sectors are filtered out of the investor's own tags.
    """
    db = dbcon.connect_torndb()
    try:
        mongo = dbcon.connect_mongo()
        collection = mongo.keywords.sector_filters

        def upsert(source, filter_type, sectors):
            # Third positional argument is presumably pymongo's ``upsert``
            # flag: create the document when no match exists.
            collection.update(
                {'source': source, 'filter_type': filter_type},
                {'$set': {'sectors': sectors,
                          'modifyTime': datetime.utcnow()}},
                True)

        def company_sectors(cids):
            return dbutil.get_companies_sector_tag(db, cids, [1], 'novelty')

        for tag in dbutil.get_tags_by_type(db, [11054]):
            cids = dbutil.get_company_from_tag(db, tag.id)
            if cids:
                upsert(tag.name, 'tag', company_sectors(cids))

        for idid, _ in dbutil.get_industries(db):
            cids = [c.companyId
                    for c in dbutil.get_industry_companies(db, idid)]
            if cids:
                upsert(idid, 'industry', company_sectors(cids))

        for tpid in dbutil.get_all_topics(db):
            cids = [c.companyId for c in dbutil.get_topic_companies(db, tpid)]
            if cids:
                upsert(tpid, 'topic', company_sectors(cids))

        for iid, name in dbutil.get_all_investor(db):
            investor_tags = json.loads(dbutil.get_investor_tags(db, iid, 0))
            upsert(name, 'investor',
                   dbutil.filter_sector_tags(db, investor_tags.keys()))
    finally:
        # Release the SQL connection even when a query or upsert fails.
        db.close()
Beispiel #2
0
    def __init__(self, query):
        """Store the query and load the names of all English-word tags.

        :param query: the raw query; kept unchanged on ``self.query``.
        """
        self.query = query
        self.intent = None
        db = dbcon.connect_torndb()
        try:
            self.en_tags = [
                t.name for t in dbutil.get_tags_by_type(db)
                if is_engish_word(t.name)
            ]
        finally:
            # Do not leak the connection if loading the tags fails.
            db.close()
Beispiel #3
0
    def reload_tags(self):
        """Reload ``en_tags`` and ``tags`` from the database.

        The reload is skipped when the current hour of day equals
        ``self.reload_control``; otherwise that marker is updated and both
        tag sets (lower-cased) are rebuilt.
        """
        # Read the clock once so the check and the stored marker cannot
        # disagree across an hour boundary.
        current_hour = datetime.now().hour
        if current_hour == self.reload_control:
            return
        # NOTE(review): this writes an instance attribute; if
        # ``reload_control`` is meant to be shared through the class, the
        # throttle does not carry over to other instances - confirm.
        self.reload_control = current_hour
        db = dbcon.connect_torndb()
        try:
            self.en_tags = set([
                t.name.lower() for t in dbutil.get_tags_by_type(db)
                if is_engish_word(t.name)
            ])
            self.tags = set(
                [t.lower() for t in dbutil.get_searchable_tags(db)])
        finally:
            # Do not leak the connection if a query fails.
            db.close()
Beispiel #4
0
class NewsQuery(object):
    """Build a search query dict for news from a text input plus filters."""

    seg = Segmenter()
    # Inputs up to this many characters are matched as a single term.
    tag_max_len = 5
    stopwords = stopword.get_standard_stopwords()
    db = dbcon.connect_torndb()
    try:
        # Tag name -> tag id for tags of types 11500 and 11801.
        news_tags = {
            t.name: t.id
            for t in dbutil.get_tags_by_type(db, [11500, 11801])
        }
    finally:
        # Close the connection even if loading the tags fails.
        db.close()

    def __init__(self, query_input, filters):
        """
        :param query_input: raw search text; lower-cased, with spaces and
            slashes removed, before being stored on ``self.input``.
        :param filters: mapping of field name -> accepted values (may be
            empty or None).
        """
        self.input = query_input.lower().replace(' ', '').replace('/', '')
        self.filters = filters
        self.len = len(self.input)

    def _search_terms(self):
        """Return the segmented input as one space-joined string.

        Stopwords and single-character tokens are dropped.
        """
        tokens = filter(
            lambda x: (x not in self.stopwords) and len(x) > 1,
            self.seg.cut4search(self.input))
        return ' '.join(tokens)

    # field is search target field, can be 'content' or 'title'
    def generate_query(self, field, extend=False):
        """Return the query dict for ``field``.

        :param field: target field of the text match.
        :param extend: when true, only inputs no longer than
            ``tag_max_len`` produce a text clause, and an empty result is
            returned as-is instead of falling back to ``match_all``.
        :return: a query dict; ``{"match_all": {}}`` when ``extend`` is
            false and nothing else was generated.
        """
        query = dict()
        if self.input:
            text = self.input.strip()
            if text in self.news_tags:
                # Exact news-tag hit: search by the tag id.
                # NOTE(review): when filters are also present, a 'bool'
                # key is added next to whatever get_term returned -
                # confirm the resulting shape is a valid query.
                query = templates.get_term('features',
                                           self.news_tags.get(text))
            elif extend:
                if self.len <= self.tag_max_len:
                    query.setdefault('bool', {}).setdefault('must', []).append(
                        templates.get_string_template(
                            field, self._search_terms(), '100%'))
            elif isinstance(self.input, str) or isinstance(
                    self.input, unicode):
                must = query.setdefault('bool', {}).setdefault('must', [])
                if self.len <= self.tag_max_len:
                    # Short input: match it verbatim, no segmentation.
                    must.append(templates.get_term(field, self.input))
                elif self.len <= 20:
                    must.append(templates.get_string_template(
                        field, self._search_terms(), '100%'))
                else:
                    # Longer input gets the looser '95%' setting.
                    must.append(templates.get_string_template(
                        field, self._search_terms(), '95%'))
        if self.filters:
            for name, values in self.filters.items():
                if values:
                    query.setdefault('bool', {}).setdefault('must', []).append(
                        templates.get_terms(name, values))
        if not extend and (not query):
            return {"match_all": {}}

        return query
Beispiel #5
0
class InvestorQuery(object):
    """Build a search query dict for investors from text input and filters.

    The tag vocabularies (``en_tags``, ``tags``) are class-level caches
    shared by all instances; ``reload_tags`` refreshes them when the hour
    of day changes.
    """

    # Hour of day for which the shared tag caches are considered fresh.
    reload_control = 0
    db = dbcon.connect_torndb()
    try:
        en_tags = set([
            t.name.lower() for t in dbutil.get_tags_by_type(db)
            if is_engish_word(t.name)
        ])
        tags = set([t.lower() for t in dbutil.get_searchable_tags(db)])
    finally:
        # Close the connection even if loading the tags fails.
        db.close()

    def __init__(self, inputs, filters, online=True):
        """
        :param inputs: raw search text (stripped and lower-cased).
        :param filters: mapping with optional keys ``round``, ``location``,
            ``tag`` and ``domestic``; may be empty or None.
        :param online: when true, restrict results to ``online == True``.
        """
        self.input = inputs.strip().lower()
        self.filters = filters
        self.online = online

        # Defined up front so get_intent()/get_tag() are safe to call
        # before generate_query() and after an empty search.
        self.intent = None
        self.query = None

        self.reload_tags()

    def __intent_classify(self):
        """Set ``self.intent`` ('empty', 'tag' or 'general') and ``self.query``."""
        # Empty search.
        if not self.input.strip():
            self.intent = 'empty'
            return
        # Input that is alphanumeric once ' ', '+' and '-' are removed is
        # handled directly.
        if self.input.replace(' ', '').replace('+', '').replace(
                '-', '').encode('utf-8').isalnum():
            if self.input.lower() in self.en_tags:
                self.intent = 'tag'
                self.query = [self.input.strip().lower().replace(' ', '')]
            else:
                self.intent = 'general'
                self.input = self.input.strip().lower()
                self.query = self.input
            return
        words = self.input.split(' ')
        # A tag search requires every space-separated word to be a known tag.
        if all(w.lower().strip() in self.tags for w in words):
            self.intent = 'tag'
            self.query = [word.strip().lower() for word in words]
        else:
            self.intent = 'general'
            self.query = self.input.strip().lower()

    def generate_query(self):
        """Classify the input and return the resulting query dict."""
        query = {}
        self.__intent_classify()
        if self.intent == 'tag':
            query_piece = templates.get_nested_template(
                'investor_tag', 'investor_tag.tag', self.query)
            query.setdefault('bool', {}).setdefault('must',
                                                    []).append(query_piece)
        elif self.intent == 'general':
            query.setdefault(
                'bool', {})['should'] = templates.get_investor_name_completion(
                    self.query)
            query['bool']['minimum_number_should_match'] = 1
        if self.online:
            query.setdefault('bool', {}).setdefault('must', []).append(
                templates.get_term('online', True))
        if self.filters:
            query.setdefault('bool', {}).setdefault('must', []).extend(
                self.__generate_filter(self.filters))
        return query

    def __generate_filter(self, kv):
        """Translate the filter mapping ``kv`` into a list of query clauses."""
        pending = []
        if kv.get('round'):
            pending.append(templates.get_round(kv.get('round')))
        if kv.get('location'):
            pending.append(templates.get_terms('location', kv.get('location')))
        if kv.get('tag'):
            # One nested clause per tag.
            for t in kv.get('tag'):
                pending.append(
                    templates.get_nested_template('investor_tag',
                                                  'investor_tag.tag',
                                                  [t.lower().strip()]))
        # ``domestic`` may legitimately be False, so test against None.
        if kv.get('domestic', None) is not None:
            pending.append(self.__generate_domestic_filter(kv.get('domestic')))
        return pending

    def get_intent(self):
        """Return the classified intent, or None before classification."""
        return self.intent

    def get_tag(self):
        """Return the first filter tag, else the first element of the query.

        Returns None when neither is available (for example before
        ``generate_query`` has run or after an empty search).
        """
        tags = self.filters.get('tag', []) if self.filters else []
        if tags:
            return tags[0]
        if self.query:
            return self.query[0]
        return None

    def __generate_domestic_filter(self, domestic):
        """Return the location clause for the ``domestic`` flag.

        True selects ``0 < location <= 370``; False selects
        ``location > 370`` or ``location == 0``.
        """
        if domestic:
            return {'range': {'location': {'gt': 0, 'lte': 370}}}
        return {
            'bool': {
                'should': [
                    {'range': {'location': {'gt': 370}}},
                    {'term': {'location': 0}},
                ],
                'minimum_number_should_match': 1
            }
        }

    def reload_tags(self):
        """Refresh the shared tag caches when the hour of day has changed.

        The marker and the caches are stored on the class, so a refresh
        done by one instance is seen by every later instance instead of
        each construction reloading from the database.
        """
        cls = type(self)
        current_hour = datetime.now().hour
        if current_hour == cls.reload_control:
            return
        cls.reload_control = current_hour
        db = dbcon.connect_torndb()
        try:
            cls.en_tags = set([
                t.name.lower() for t in dbutil.get_tags_by_type(db)
                if is_engish_word(t.name)
            ])
            cls.tags = set(
                [t.lower() for t in dbutil.get_searchable_tags(db)])
        finally:
            # Do not leak the connection if a query fails.
            db.close()
Beispiel #6
0
class GeneralQuery(object):
    """Build a search query dict from a text (or pre-parsed dict) input.

    A dict input is used directly as the tag query and may carry the keys
    ``and``, ``or`` and ``not``, each a list of tags.
    """

    seg = Segmenter(tags=True)
    yellows = dicts.get_yellow_tags_name()
    name_parser = NameSegmenter()

    db = dbcon.connect_torndb()
    try:
        en_tags = set([
            t.name.lower() for t in dbutil.get_tags_by_type(db)
            if is_engish_word(t.name)
        ])
        tags = set([t.name.lower() for t in dbutil.get_tags_by_type(db)])
    finally:
        # Close the connection even if loading the tags fails.
        db.close()

    def __init__(self, input, filters):
        """
        :param input: search text (lower-cased and stripped) or a dict
            tag query, which is stored unchanged.
        :param filters: mapping of filter name -> value. Yellow tags found
            in the input are appended to its ``yellow`` list in place.
            None is replaced by a fresh dict.
        """
        self.input = input if isinstance(input,
                                         dict) else input.lower().strip()
        # Keep the caller's dict object (it is mutated in place for yellow
        # tags) and only substitute a new one for None.
        self.filters = {} if filters is None else filters

        self.intent = None
        self.query = None

    def __intent_classify(self, logger):
        """Set ``self.intent`` ('empty', 'tag' or 'general') and ``self.query``."""
        if isinstance(self.input, dict):
            self.intent = 'tag'
            self.query = self.input
            return
        # Empty search.
        if not self.input.strip():
            self.intent = 'empty'
            return
        # Input that is alphanumeric once ' ', '+' and '-' are removed is
        # handled directly.
        if self.input.replace(' ', '').replace('+', '').replace(
                '-', '').encode('utf-8').isalnum():
            if self.input.lower() in self.en_tags:
                self.intent = 'tag'
                self.query = {
                    'or': [self.input.strip().lower().replace(' ', '')]
                }
            else:
                self.intent = 'general'
                self.input = self.input.strip().lower()
                self.query = self.input
            return
        # Any query that contains a yellow tag: yellow tags become filters,
        # the remaining words become 'or' tags.
        words = self.input.split(' ')
        if set(words).intersection(set(self.yellows)):
            self.intent = 'tag'
            self.query = {}
            for tag in words:
                if tag in self.yellows:
                    self.filters.setdefault('yellow', []).append(tag)
                else:
                    self.query.setdefault('or', []).append(tag)
            return
        # Default analysis: share of segmented tokens that are known tags.
        tokens = set(self.seg.cut(self.input))
        # Guard against a segmenter result with no tokens, which would
        # otherwise divide by zero; treat it as a general search.
        tagratio = float(len(self.tags & tokens)) / len(tokens) \
            if tokens else 0.0
        if tagratio > 0.8 or (len(self.input) > 5 and tagratio > 0.5):
            # Judged to be a tag search.
            self.intent = 'tag'
            self.query = {'or': [word.strip().lower() for word in tokens]}
        else:
            self.intent = 'general'
            self.query = self.input.strip().lower()

    def generate_query(self, logger=None):
        """Classify the input and return the resulting query dict.

        :param logger: optional logger; when given, the active filters are
            logged at info level.
        """
        self.__intent_classify(logger)
        query = {}

        def clause(kind):
            # Create 'bool' and the clause list lazily so an untouched
            # query stays an empty dict.
            return query.setdefault('bool', {}).setdefault(kind, [])

        if isinstance(self.input, str) or isinstance(self.input, unicode):
            clause('should').append(
                templates.get_term('name', self.input, 10))
        if self.intent == 'tag':
            for tag in self.query.get('and', []):
                clause('must').append(
                    templates.get_terms('tags', [tag.lower()]))
            for tag in self.query.get('or', []):
                clause('should').extend(
                    templates.get_fast_tag_template(tag.lower()))
            if self.query.get('or'):
                query['bool']['minimum_number_should_match'] = 1
            for tag in self.query.get('not', []):
                clause('must_not').append(
                    templates.get_tag_template(tag.lower()))
        if self.intent == 'general':
            name = self.query
            should = clause('should')
            should.append(templates.get_name_template(name))
            should.append(templates.get_fuzzy('name', name, 5))
            should.append(templates.get_fuzzy('alias', name))
            should.append(templates.get_term('members', name))
            query.setdefault('bool', {})['minimum_number_should_match'] = 1

        if self.filters:
            if logger:
                # Use a placeholder so the filters are formatted into the
                # message instead of being passed as stray arguments.
                logger.info('Filter %s', self.filters)
            clause('must').extend(self.__generate_filter(self.filters))
        return query

    def __generate_filter(self, kv):
        """Translate the filter mapping ``kv`` into a list of query clauses."""
        pending = []
        if kv.get('round'):
            pending.append(templates.get_round(kv.get('round')))
        if kv.get('date'):
            pending.append(
                templates.get_terms('established',
                                    self.__extend_date(kv.get('date'))))
        if kv.get('location'):
            pending.append(templates.get_terms('location', kv.get('location')))
        # ``domestic`` may legitimately be False, so test against None.
        if kv.get('domestic', None) is not None:
            pending.append(self.__generate_domestic_filter(kv.get('domestic')))
        if kv.get('team'):
            pending.append(templates.get_terms('team', kv.get('team')))
        if kv.get('threshold'):
            # collection_ranking_threshold is a module-level value.
            pending.append(
                templates.get_range('ranking_score', 1,
                                    collection_ranking_threshold))
        if kv.get('yellow'):
            pending.append(
                templates.get_terms(
                    'yellows',
                    [yellow.lower().strip() for yellow in kv.get('yellow')]))
        if kv.get('tag'):
            pending.append(templates.get_terms('tags', kv.get('tag')))
        if kv.get('category'):
            pending.append(templates.get_terms('category', kv.get('category')))
        return pending

    def __generate_domestic_filter(self, domestic):
        """Return the location clause for the ``domestic`` flag.

        True selects ``0 < location <= 370``; False selects
        ``location > 370`` or ``location == 0``.
        """
        if domestic:
            return {'range': {'location': {'gt': 0, 'lte': 370}}}
        return {
            'bool': {
                'should': [
                    {'range': {'location': {'gt': 370}}},
                    {'term': {'location': 0}},
                ],
                'minimum_number_should_match': 1
            }
        }

    def __extend_date(self, dates):
        """Expand years into ``YYYYMM`` integers, one per month.

        When 2013 is among ``dates``, the years 1990-2012 are added too.
        """
        dates = list(dates)
        if 2013 in dates:
            dates.extend(range(1990, 2013))
        extends = []
        for date in dates:
            for month in range(1, 13):
                # Zero-pad the month to two digits.
                extends.append(int('%s%02d' % (date, month)))
        return extends

    def get_intent(self):
        """Return the classified intent, or None before classification."""
        return self.intent
Beispiel #7
0
class UniversalQuery(object):
    """Build a search query dict from text (or dict) input plus filters.

    The tag vocabularies (``en_tags``, ``tags``) are class-level caches
    shared by all instances; ``reload_tags`` refreshes them when the hour
    of day changes. A dict input is used directly as the tag query and may
    carry the keys ``and``, ``or`` and ``not``, each a list of tags.
    """

    db = dbcon.connect_torndb()
    # Hour of day for which the shared tag caches are considered fresh.
    reload_control = 0
    try:
        en_tags = set([
            t.name.lower() for t in dbutil.get_tags_by_type(db)
            if is_engish_word(t.name)
        ])
        tags = set([t.lower() for t in dbutil.get_searchable_tags(db)])
    finally:
        # Close the connection even if loading the tags fails.
        db.close()

    def __init__(self, input, filters, nested=None):
        """
        :param input: search text (lower-cased and stripped) or a dict
            tag query, which is stored unchanged.
        :param filters: mapping of filter name -> value; may be empty.
        :param nested: optional second filter mapping, translated with the
            same rules as ``filters``.
        """
        self.input = input if isinstance(input,
                                         dict) else input.lower().strip()
        self.filters = filters
        self.nested = nested

        self.intent = None
        self.query = None

        self.reload_tags()

    def __intent_classify(self):
        """Set ``self.intent`` ('empty', 'tag' or 'general') and ``self.query``."""
        # __init__ accepts a dict input; treat it as a ready-made tag
        # query instead of calling string methods on it.
        if isinstance(self.input, dict):
            self.intent = 'tag'
            self.query = self.input
            return
        # Empty search.
        if not self.input.strip():
            self.intent = 'empty'
            return
        # Input that is alphanumeric once ' ', '+' and '-' are removed is
        # handled directly.
        if self.input.replace(' ', '').replace('+', '').replace(
                '-', '').encode('utf-8').isalnum():
            if self.input.lower() in self.en_tags:
                self.intent = 'tag'
                self.query = {
                    'or': [self.input.strip().lower().replace(' ', '')]
                }
            else:
                self.intent = 'general'
                self.input = self.input.strip().lower()
                self.query = self.input
            return
        words = self.input.split(' ')
        # A tag search requires every space-separated word to be a known tag.
        if all(w.lower().strip() in self.tags for w in words):
            self.intent = 'tag'
            self.query = {'or': [word.strip().lower() for word in words]}
        else:
            self.intent = 'general'
            self.query = self.input.strip().lower()

    def generate_query(self):
        """Classify the input and return the resulting query dict."""
        self.__intent_classify()
        query = {}

        def clause(kind):
            # Create 'bool' and the clause list lazily so an untouched
            # query stays an empty dict.
            return query.setdefault('bool', {}).setdefault(kind, [])

        if self.intent == 'tag':
            for tag in self.query.get('and', []):
                clause('must').append(
                    templates.get_terms('tags', [tag.lower()]))
            for tag in self.query.get('or', []):
                clause('should').extend(
                    templates.get_fast_tag_template(tag.lower()))
            if self.query.get('or'):
                query['bool']['minimum_number_should_match'] = 1
            for tag in self.query.get('not', []):
                clause('must_not').append(
                    templates.get_tag_template(tag.lower()))
        if self.intent == 'general':
            name = self.query
            should = clause('should')
            should.append(templates.get_name_template(name))
            should.append(templates.get_fuzzy('name', name, 5))
            should.append(templates.get_fuzzy('alias', name))
            should.append(templates.get_term('members', name))
            query.setdefault('bool', {})['minimum_number_should_match'] = 1
        if self.filters:
            clause('must').extend(self.__generate_filter(self.filters))
        if self.nested:
            clause('must').extend(self.__generate_filter(self.nested))
        return query

    def __generate_filter(self, kv):
        """Translate the filter mapping ``kv`` into a list of query clauses."""
        pending = []
        if kv.get('round'):
            pending.append(templates.get_terms('round', kv.get('round', [])))
        if kv.get('date'):
            pending.append(
                templates.get_terms('established',
                                    self.__extend_date(kv.get('date'))))
        if kv.get('location'):
            pending.append(templates.get_terms('location', kv.get('location')))
        # ``domestic`` may legitimately be False, so test against None.
        if kv.get('domestic', None) is not None:
            pending.append(self.__generate_domestic_filter(kv.get('domestic')))
        if kv.get('team'):
            pending.append(templates.get_terms('team', kv.get('team')))
        if kv.get('threshold'):
            # collection_ranking_threshold is a module-level value.
            pending.append(
                templates.get_range('ranking_score', 1,
                                    collection_ranking_threshold))
        if kv.get('yellow'):
            pending.append(templates.get_terms('yellows', kv.get('yellow')))
        if kv.get('status'):
            pending.append(templates.get_terms('status', kv.get('status')))
        if kv.get('tag'):
            if kv.get('operator', 'and') == 'or':
                # 'or': a single terms clause over all tags.
                pending.append(
                    templates.get_terms('tags',
                                        [t.lower() for t in kv.get('tag')]))
            else:
                # Default 'and': one term clause per tag.
                for t in kv.get('tag', []):
                    pending.append(
                        templates.get_term('tags',
                                           t.lower().strip()))
        if kv.get('category'):
            pending.append(templates.get_terms('category', kv.get('category')))
        if kv.get('industry'):
            pending.append(
                templates.get_nested_term('nested_tag.id', 'industry',
                                          kv.get('industry')))
        if kv.get('topic'):
            pending.append(
                templates.get_nested_term('nested_tag.id', 'topic',
                                          kv.get('topic')))
        if kv.get('funding_date'):
            today = datetime.today().date()
            # Rolling windows, expressed as a number of days back from today.
            recent_days = {'latest7': 7, 'latest30': 30, 'latest90': 90}
            for fd in kv.get('funding_date'):
                if fd in recent_days:
                    start = today - timedelta(days=recent_days[fd])
                    end = today
                    pending.append(
                        templates.get_range('last_funding_date', end, start))
                elif fd.isalnum():
                    # Otherwise the value is read as a year covering
                    # Jan 1 - Dec 31.
                    # NOTE(review): a non-numeric alphanumeric value makes
                    # strptime raise ValueError - confirm callers only
                    # pass years here.
                    start = datetime.strptime('%s-01-01' % fd, '%Y-%m-%d')
                    end = datetime.strptime('%s-12-31' % fd, '%Y-%m-%d')
                    pending.append(
                        templates.get_range('last_funding_date', end, start))
        return pending

    def __generate_domestic_filter(self, domestic):
        """Return the location clause for the ``domestic`` flag.

        True selects ``0 < location <= 370``; False selects
        ``location > 370`` or ``location == 0``.
        """
        if domestic:
            return {'range': {'location': {'gt': 0, 'lte': 370}}}
        return {
            'bool': {
                'should': [
                    {'range': {'location': {'gt': 370}}},
                    {'term': {'location': 0}},
                ],
                'minimum_number_should_match': 1
            }
        }

    def __extend_date(self, dates):
        """Expand years into ``YYYYMM`` integers, one per month.

        When 2013 is among ``dates``, the years 1990-2012 are added too.
        """
        dates = list(dates)
        if 2013 in dates:
            dates.extend(range(1990, 2013))
        extends = []
        for date in dates:
            for month in range(1, 13):
                # Zero-pad the month to two digits.
                extends.append(int('%s%02d' % (date, month)))
        return extends

    def get_intent(self):
        """Return the classified intent, or None before classification."""
        return self.intent

    def reload_tags(self):
        """Refresh the shared tag caches when the hour of day has changed.

        The marker and the caches are stored on the class, so a refresh
        done by one instance is seen by every later instance instead of
        each construction reloading from the database.
        """
        cls = type(self)
        current_hour = datetime.now().hour
        if current_hour == cls.reload_control:
            return
        cls.reload_control = current_hour
        db = dbcon.connect_torndb()
        try:
            cls.en_tags = set([
                t.name.lower() for t in dbutil.get_tags_by_type(db)
                if is_engish_word(t.name)
            ])
            cls.tags = set(
                [t.lower() for t in dbutil.get_searchable_tags(db)])
        finally:
            # Do not leak the connection if a query fails.
            db.close()