import os
import collections
from copy import deepcopy
from itertools import chain
from math import ceil

from numpy import mean
from pymongo import DESCENDING
from gensim.models import Word2Vec
from sklearn.externals import joblib
import fasttext

# Third-party import paths above are inferred from usage in this file.
# Project-internal dependencies (Segmenter, Tagger, word_filter, dbcon, dbutil,
# dicts, hants, UndirectWeightedGraph, logger_feeder, and the model path globals
# word2vec_model / viptag_model_20171221) are assumed to be provided by the
# surrounding package and are not redefined here.


class NewsFeeder(object):

    def __init__(self):
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()

    def feed(self, record, granularity='default'):
        """Turn a mongo news record into a filtered list of tokens and tags."""
        global logger_feeder
        try:
            contents = self.wfilter(self.seg.cut(record['title'].replace('\n', ' ')))
            original_tags = record.get('original_tags', [])
            if original_tags and isinstance(original_tags, list):
                contents.extend(original_tags)
            body = ' '.join([piece['content'].replace('\n', ' ') for piece in record['contents']])
            if granularity == 'fine':
                contents.extend(self.wfilter(self.seg.cut4search(body)))
            else:
                contents.extend(self.wfilter(self.seg.cut(body)))
            return contents
        except Exception as e:
            logger_feeder.error('Failed to feed, %s, %s' % (record['_id'], e))
            return []
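
# Illustrative sketch (not part of the original module): the shape of a mongo
# news record expected by NewsFeeder.feed. The field names ('_id', 'title',
# 'original_tags', 'contents') come from the accesses in feed() above; the
# concrete values are made up.
def _example_news_feed():
    feeder = NewsFeeder()
    record = {
        '_id': 'example-id',
        'title': u'Example news title',
        'original_tags': [u'tag1', u'tag2'],
        'contents': [{'content': u'First paragraph.'},
                     {'content': u'Second paragraph.'}],
    }
    # coarse segmentation by default; granularity='fine' uses cut4search instead
    return feeder.feed(record), feeder.feed(record, granularity='fine')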
class News(object):
    """Corpus iterator: yields token lists from recent news and verified company descriptions."""

    def __init__(self):
        self.mongo = dbcon.connect_mongo()
        self.db = dbcon.connect_torndb()
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()

    def __iter__(self):
        for news in self.mongo.article.news.find({'processStatus': 1}).sort('_id', DESCENDING).limit(200000):
            try:
                content = []
                content.extend(self.wfilter(self.seg.cut4search(news.get('title', ''))))
                for piece in news.get('contents', []):
                    content.extend(self.wfilter(self.seg.cut(piece.get('content', ''))))
                if len(content) > 10:
                    yield content
            except Exception:
                continue
        for c in self.db.query('select description from company where verify="Y" and modifyTime>"2016-06-01";'):
            try:
                if len(c.description) > 10:
                    yield self.wfilter(self.seg.cut4search(c.description))
            except Exception:
                continue
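
# Illustrative sketch (not part of the original module): News yields token
# lists, so it can serve directly as a gensim Word2Vec training corpus such as
# the model loaded by KeywordExtractor below. The vector size, other training
# parameters, and output path are assumptions, and the pre-4.0 gensim keyword
# names (size=...) are assumed to match this Python 2 era codebase.
def _example_train_word2vec(output_path='word2vec.example.model'):
    corpus = list(News())  # materialize once; News streams mongo and torndb rows
    model = Word2Vec(corpus, size=100, window=5, min_count=5, workers=4)
    model.save(output_path)
    return model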
class Feeder(object):
    """Collects weighted description texts for a company from several sources."""

    def __init__(self):
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()
        self.non_trusted_discount = 0.5
        self.brief_promote = 1.5
        self.trusted_sources = dicts.get_known_company_source()
        self.wfilter = word_filter.get_default_filter()
        self.seg = Segmenter(tag=True)

    def feed(self, cid, mode='default', quanlity='low'):
        feeds = {
            'default': self.__feed_default,
            'with_tag': self.__feed_with_tag
        }.get(mode, self.__feed_default)(cid)
        feeds = list(feeds)
        if quanlity == 'medium':
            # keep only feeds whose weight reaches the (capped) average weight
            ave = min(mean([feed[1] for feed in feeds]), 2)
            return filter(lambda x: x[1] >= ave, feeds)
        if quanlity == 'low':
            return feeds

    def feed_string(self, cid, mode='default'):
        feeds = list(self.feed(cid, mode, 'medium'))
        return ' '.join([feed[0].strip() for feed in feeds])

    def feed_seged(self, cid, feed_mode='default'):
        return self.wfilter(self.seg.cut(self.feed_string(cid, feed_mode)))

    def feed_seged_fine(self, cid, feed_mode='default'):
        return self.wfilter(self.seg.cut4search(self.feed_string(cid, feed_mode)))

    def feed_relevant_string(self, cid):
        pass

    def __feed_with_tag(self, cid):
        for feed in self.__feed_default(cid):
            yield feed
        for source_tag in dbutil.get_source_company_tags(self.db, cid, self.trusted_sources):
            if source_tag and source_tag.strip():
                yield source_tag, 2

    def __feed_default(self, cid):
        cscore = dbutil.get_company_score(self.db, cid, 37010)
        # company info
        info = dbutil.get_company_info(self.db, cid)
        score = 1.5 if cscore > 0.5 else 1
        if info.verify and info.verify == 'Y':
            score += 1
        if info.brief and info.brief.strip():
            yield self.__preprocess(info.brief.strip()), score
        if info.description and info.description.strip():
            yield self.__preprocess(info.description.strip()), score
        # source company
        for info in dbutil.get_source_company_infos(self.db, cid):
            discount = self.non_trusted_discount if info.source not in self.trusted_sources else 1
            if info.brief and info.brief.strip():
                yield self.__preprocess(info.brief.strip()), discount * self.brief_promote
            if info.description and info.description.strip():
                yield self.__preprocess(info.description.strip()), discount
        # iOS
        info = dbutil.get_recommend_artifact(self.db, cid)
        if info and info.description and info.description.strip():
            ascore = 1 if (info.verify and info.verify == 'Y') else 0.5
            yield self.__preprocess(info.description.strip()), ascore

    def __preprocess(self, content):
        # clean and narrow down candidates
        # convert traditional Chinese characters to simplified
        content = hants.translate(unicode(content))
        # lowercase
        content = content.lower()
        return content.strip()
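
# Illustrative sketch (not part of the original module): Feeder.feed yields
# (text, weight) pairs, which the helper methods above join or segment. The
# company id below is a placeholder.
def _example_feed_company(cid=12345):
    feeder = Feeder()
    weighted_texts = feeder.feed(cid, mode='with_tag', quanlity='medium')
    merged = feeder.feed_string(cid, mode='with_tag')
    tokens = feeder.feed_seged(cid, feed_mode='with_tag')
    return weighted_texts, merged, tokens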
class KeywordExtractor(object):
    """Extracts weighted keyword tags for a company using a vip classifier,
    word2vec similarity and weighted TextRank over segmented descriptions."""

    def __init__(self):
        global word2vec_model, viptag_model_20171221
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()
        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(tags=True)
        self.wfilter = word_filter.get_default_filter()
        self.w2v = Word2Vec.load(word2vec_model)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)
        self.yellows = dbutil.get_yellow_tags(self.db)
        self.vip_tags = {t.name: t.id for t in dbutil.get_sectored_tags(self.db, 1)}
        self.hyponym = {vip_name: set([dbutil.get_tag_name(self.db, tid)
                                       for tid in dbutil.get_hyponym_tags(self.db, vip_id)])
                        for vip_name, vip_id in self.vip_tags.iteritems()}
        self.importants = set(t.name.lower() for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_tag_novelties()
        self.thesaurus_ids = self.__load_tag_novelties(tid=True)
        self.tag_types = self.__load_tag_types()
        self.trusted_sources = dicts.get_known_company_source()
        self.replacements = {dbutil.get_tag_name(self.db, r['source']):
                             [dbutil.get_tag_name(self.db, rtid) for rtid in r['replacement']]
                             for r in self.mongo.keywords.replacement.find()}
        self.junk_terms = set(tag.name for tag in dbutil.get_tags_by_type(self.db, typeset=([11001])))
        self.similarity_threshold = 0.4
        self.textrank_window_size = 2
        self.textrank_threshold = 0
        self.source_tag_default_weight = 2
        self.vip_lower = 0.3
        self.important_threshold = 0.2
        self.important_max_count = 5
        print 'model inited'

    def __load_trained_clfs(self):
        model_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'models')
        clfs = {}
        for model_file in os.listdir(model_dir):
            if model_file.endswith('.model'):
                # only files whose stem is a numeric tag id are per-tag classifiers
                tid = model_file.split('.')[0]
                if not tid.isdigit():
                    continue
                clfs[dbutil.get_tag_name(self.db, int(tid))] = joblib.load(os.path.join(model_dir, model_file))
        return clfs

    def __load_tag_novelties(self, tid=False):
        if not tid:
            return {tag.name: (tag.novelty or 1) for tag in dbutil.get_tags_by_type(self.db)}
        else:
            return {tag.id: (tag.novelty or 1) for tag in dbutil.get_tags_by_type(self.db)}

    def __load_tag_types(self):
        return {tag.name: (tag.type or 0) for tag in dbutil.get_tags_by_type(self.db)}

    def __extract_source_tag(self, cid):
        tags = dbutil.get_source_company_tags(self.db, cid, self.trusted_sources)
        if tags:
            return set(chain(*[dbutil.analyze_source_tag(self.db, tname, self.replacements)
                               for tname in tags if tname and tname.strip()]))
        return set([])

    def __extract_vecrank(self, candidates, candidates_important, candidates_vips, topn):
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        # hyponyms of the predicted vip tags get a fixed promotion weight of 2;
        # fall back to an empty set for vip tags without a hyponym entry
        proper_hyponym = dict.fromkeys(
            set(chain(*[self.hyponym.get(dbutil.get_tag_name(self.db, cv), set())
                        for cv in candidates_vips.iterkeys()])), 2)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
            if candidates[i] not in self.w2v:
                continue
            for word, weight in candidates_important.items():
                if word == candidates[i] or word not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidates[i], word)
                if similarity > self.similarity_threshold:
                    weights[(candidates[i], word)] += similarity * weight
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus, proper_hyponym)
        topn = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(), key=lambda x: -x[1])[:topn]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def extract_vip(self, cid):
        desc = ' '.join(self.wfilter(self.seg.cut4search(self.feeder.feed_string(cid, 'with_tag'))))
        if not desc:
            return {}
        classifier_vips = [(int(tag.replace(u'__label__', '')), weight)
                           for (tag, weight) in self.vip_classifier.predict_proba([desc], 2)[0]
                           if weight > self.vip_lower]
        classifier_vips.sort(key=lambda x: -x[1])
        # if there are two candidate vip labels, check whether their probabilities
        # are comparable; otherwise keep only the dominant one
        if len(classifier_vips) == 2 and classifier_vips[0][1] > classifier_vips[1][1] * 2:
            return {classifier_vips[0][0]: classifier_vips[0][1]}
        return dict(classifier_vips)

    def __extract_important(self, contents, candidates):
        # support assignment
        supports = deepcopy(candidates)
        for word in contents:
            if word not in self.w2v:
                continue
            for candidate in candidates.keys():
                if candidate not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidate, word)
                if similarity > self.similarity_threshold:
                    supports[candidate] = supports.get(candidate, 0) + similarity
        # support selection
        results = {}
        candi_size, content_size = len(candidates), len(''.join(candidates))
        for candidate, weight in supports.iteritems():
            if candi_size >= 2 and weight < content_size / 20:
                continue
            results[candidate] = weight * self.thesaurus.get(candidate, 1)
        if len(results) == 0:
            return results
        # normalization
        normalizer = max(results.values())
        for k, v in results.items():
            results[k] = round(v / normalizer, 2)
        # narrow down result size
        if len(results) < 4:
            pass
        else:
            results = dict(filter(lambda x: x[1] > self.important_threshold, results.iteritems()))
            if len(results) > self.important_max_count:
                size = min(10, max(int(ceil(len(results) / 2.0)), self.important_max_count))
                results = dict(sorted(results.iteritems(), key=lambda x: -x[1])[:size])
        return results

    def __extract_textrank(self, candidates, topn=15):
        """ weighted textrank, weights use tags' novelties """
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus)
        index = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(), key=lambda x: -x[1])[:index]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def __prepare_tag_contents(self, cid):
        # prepare contents
        contents = list(self.feeder.feed(cid, quanlity='medium'))
        candidates = []
        for content, _ in contents:
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        source_tags = self.__extract_source_tag(cid)
        candidates_important = {}
        for content, weight in contents:
            for tag in [x[0] for x in self.tagger.tag(content)
                        if x[1] == 'itag' or x[0] in self.importants]:
                candidates_important[tag] = candidates_important.get(tag, 0) + weight
        for tag in source_tags:
            candidates_important[tag] = candidates_important.get(tag, 0) + self.source_tag_default_weight
        return source_tags, candidates, candidates_important

    def __normalize_replacement(self, tags):
        if type(tags) is dict:
            normalized_tags = {}
            for tag, weight in tags.items():
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags[replacement] = weight
                else:
                    normalized_tags[tag] = weight
        else:
            normalized_tags = []
            for tag in tags:
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags.append(replacement)
                else:
                    normalized_tags.append(tag)
        return normalized_tags

    def __normalize(self, d):
        if not d:
            return d
        normalizer = max(d.values()) + 1.0
        for tag, weight in d.items():
            type_promotion = {11011: 1, 11013: 1.5, 11012: 2.5}.get(self.tag_types.get(tag, 0), 0)
            d[tag] = round(weight / normalizer, 2) + type_promotion
        return d

    def merge(self, d1, d2, weight=0):
        # weight is a bonus weight
        for k, v in d2.iteritems():
            d1[k] = d1.get(k, 0) + v + weight
        return d1

    def extract(self, cid, topn=15):
        # prepare contents
        source_tags, candidates, candidates_important = self.__prepare_tag_contents(cid)
        candidates_vips = self.extract_vip(cid)
        # generate results
        results = dict(self.__extract_vecrank(candidates, candidates_important, candidates_vips, topn))
        results = self.merge(results, {dbutil.get_tag_name(self.db, tid): w
                                       for tid, w in candidates_vips.iteritems()})
        # results = self.merge(results, self.__extract_important(candidates, candidates_important), 1)
        # results = self.merge(results, dict(self.__extract_textrank(candidates, topn)))
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        return results

    def extract_from_text(self, text):
        candidates = []
        for content, _ in text.iteritems():
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        candidates_important = {}
        for content, weight in text.iteritems():
            for tag in [x[0] for x in self.tagger.tag(content)
                        if x[1] == 'itag' or x[0] in self.importants]:
                candidates_important[tag] = candidates_important.get(tag, 0) + weight
        desc = ' '.join(self.wfilter(self.seg.cut4search(' '.join(text.keys()))))
        candidates_vips = {int(tag.replace(u'__label__', '')): weight
                           for (tag, weight) in self.vip_classifier.predict_proba([desc], 3)[0]
                           if weight > self.vip_lower}
        results = {}
        results = self.merge(results, self.__extract_important(candidates, candidates_important), 1)
        results = self.merge(results, dict(self.__extract_textrank(candidates, 10)))
        # results = dict(self.__extract_vecrank(candidates, candidates_important, candidates_vips, 10))
        results = self.merge(results, {dbutil.get_tag_name(self.db, tid): w
                                       for tid, w in candidates_vips.iteritems()})
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        deducts = self.__deduct_2nd(results)
        if len(deducts) < 3:
            results = self.merge(results, deducts)
        return results

    def __deduct_2nd(self, tags):
        deduct = []
        tags = [(dbutil.get_tag_id(self.db, t)[0], t) for t in tags.keys()]
        for (tid, tag) in tags:
            if self.tag_types.get(tag, 0) == 11013:
                t1s = dbutil.get_hypernym_tags(self.db, tid, 1)
                for t1 in set(t1s) & set([t[0] for t in tags]):
                    t2s = set(dbutil.get_hyponym_tags(self.db, t1, 2)) & set(dbutil.get_hypernym_tags(self.db, tid, 2))
                    for t2 in t2s:
                        if t2 not in set([t[0] for t in tags]):
                            deduct.append(t2)
        return {dbutil.get_tag_name(self.db, t2): 2.49 for t2 in deduct}
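
# Illustrative sketch (not part of the original module): the two public entry
# points of KeywordExtractor. extract() works from a company id, while
# extract_from_text() expects a {text: weight} mapping, as its iteritems()
# usage above implies. The company id and texts below are placeholders.
def _example_extract_keywords(cid=12345):
    extractor = KeywordExtractor()
    by_company = extractor.extract(cid, topn=15)
    by_text = extractor.extract_from_text({u'Example company description': 1.5,
                                           u'Another piece of text': 1})
    return by_company, by_text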