Example #1
import os
import codecs

# dbcon, Segmenter and stopword come from the project's common package; their
# import paths are omitted in the original snippet.
def dump_thesaurus(theme='source', topn=1000):

    db = dbcon.connect_torndb()
    seg = Segmenter()
    stopwords = stopword.get_standard_stopwords()
    tags = set(x.name for x in db.query('select name from tag where type>11001;'))
    vocab = {}

    # Both branches of the original if/else issued the identical query, so the
    # theme only affects the output file name below.
    query = 'select * from source_company where (active is null or active="Y");'

    for index, item in enumerate(db.iter(query)):
        # Count each distinct, informative token once per description.
        for word in set(w for w in seg.cut(item.description)
                        if w.strip() and len(w) > 1 and not w.isnumeric()
                        and w not in stopwords and w not in tags):
            vocab[word] = vocab.get(word, 0) + 1
        if index % 10000 == 0:
            # Periodically prune words seen fewer than 20 times to bound memory.
            for lowword in [w for w, c in vocab.items() if c < 20]:
                vocab.pop(lowword)
            print(index, 'processed, size of vocab', len(vocab))
    db.close()

    vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:topn]
    with codecs.open(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                  'thesaurus/%s.%s.lowidf' % (theme, topn)), 'w', 'utf-8') as fo:
        fo.write('\n'.join([x[0] for x in vocab]))
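A quick smoke test, assuming the surrounding module imports resolve and the
database is reachable; the output path follows the pattern built above:

# Hypothetical invocation: writes the 500 most frequent surviving tokens
# from source_company descriptions to thesaurus/source.500.lowidf.
dump_thesaurus(theme='source', topn=500)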
Example #2
    def __init__(self):

        self.db = dbcon.connect_torndb()
        self.tagger = Tagger()
        self.stopwords = stopword.get_standard_stopwords()
        # Load the bundled news domain list relative to this module's location.
        self.source = json.load(codecs.open(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                                         'dumps/news.domain'), encoding='utf-8'))
        self.failed = 0
Example #3
class UniversalIndexCreator(object):

    stopwords = stopword.get_standard_stopwords()
    seg = Segmenter()
    nameseg = NameSegmenter()

    def __init__(self, es=None):

        # Fall back to the configured host and port when no client is injected.
        if not es:
            host, port = tsbconfig.get_es_config()
            self.es = Elasticsearch([{'host': host, 'port': port}])
        else:
            self.es = es
        self.topic_tags = {}
        logger_universal_index.info('Universal Index Creator initialized')

    def __check(self):

        if not self.es.indices.exists(["xiniudata2"]):
            logger_universal_index.info('Creating index xiniudata2')
            self.es.indices.create("xiniudata2")
            logger_universal_index.info('Created')
        # Register the "universal" doc-type mapping on the xiniudata2 index
        # (positional arguments: doc type, mapping body, index name).
        self.es.indices.put_mapping("universal",
                                    mappings.get_universal_company_mapping(),
                                    "xiniudata2")
        logger_universal_index.info('Universal Company mapping created')

    def create_indice(self):

        self.__check()
        db = dbcon.connect_torndb()
        self.topic_tags = dbutil.get_topic_corresponding_tags(db)
        logger_universal_index.info('Starting to create indices')
        logger_universal_index.info(str(self.es.info()))
        logger_universal_index.info('ES Config %s' %
                                    str(tsbconfig.get_es_config()))
        for cid in dbutil.get_all_company_id(db):
            try:
                self.create_single(db, cid)
                logger_universal_index.info(
                    '%s index created, %s' %
                    (cid, dbutil.get_company_name(db, cid)))
            except Exception as e:
                logger_universal_index.exception('%s failed # %s' % (cid, e))
        db.close()
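A minimal driver for the class above; host and port here are placeholders, and
in the original the client is read from tsbconfig when none is injected:

from elasticsearch import Elasticsearch

creator = UniversalIndexCreator(es=Elasticsearch([{'host': 'localhost', 'port': 9200}]))
creator.create_indice()  # ensures index xiniudata2 exists, then indexes every company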
Example #4
    def __init__(self, opt=None):

        if not isinstance(opt, dict):
            opt = {}

        # Use the injected segmenter when provided, else build a default one.
        self.seg = opt.get('segmenter') or Segmenter()
        self.vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            stop_words=stopword.get_standard_stopwords(),
            max_df=opt.get('max_df', 0.5),
            min_df=opt.get('min_df', 50),
            max_features=5000)
        # Chi-squared feature selection; keeps all features unless 'topk' is given.
        self.selector = SelectKBest(chi2, k=opt.get('topk', 'all'))
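The two objects compose in the usual scikit-learn fashion. A sketch, where clf
is an instance of the class above and docs/labels are a hypothetical corpus of
raw texts and their class labels:

# Segment each document into space-joined tokens, then fit TF-IDF and chi2.
segmented = [' '.join(clf.seg.cut(doc)) for doc in docs]
features = clf.vectorizer.fit_transform(segmented)
selected = clf.selector.fit_transform(features, labels)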
Example #5
    def __init__(self):

        # Treat both standard stopwords and punctuation marks as noise tokens.
        self.stopwords = stopword.get_standard_stopwords()
        self.stopwords.update(punctuation)
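The punctuation name is imported elsewhere in the original module (for example
string.punctuation or a Chinese punctuation list). The merged set then serves as
a single membership filter; a sketch, with obj an instance of the class above
and tokens any segmented word list:

# Drop stopwords and punctuation marks in one membership test.
content_words = [t for t in tokens if t not in obj.stopwords]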
Example #6
import os
import codecs
import time
import re
from random import random

from common import nlpconfig
from common.zhtools import stopword
from common.zhtools import hants
from common.zhtools.postagger import Tagger


tagger = Tagger('ltp')
doc_len_threshold = 10
# Patterns matching Chinese date fragments, e.g. 2015年, 3月, 21日.
year = re.compile(r'\d+年')
month = re.compile(r'\d+月')
day = re.compile(r'\d+日')
stopwords = stopword.get_standard_stopwords()


class Corpus(object):

    def __init__(self, extract_func=None, dirs=None):

        # Default extractor just strips whitespace from the raw line.
        self.extract = extract_func if extract_func else lambda x: x.strip()
        self.dirs = dirs

    def __iter__(self):

        for subdir in self.dirs:
            for f in os.listdir(subdir):
                # Skip macOS Finder metadata files.
                if u'DS_Store' in f:
                    continue
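The listing is truncated inside __iter__. The module-level date patterns above
are independently usable; a small sketch of scrubbing Chinese date fragments:

# Remove date fragments such as 2015年3月21日 before further processing.
text = u'公司成立于2015年3月21日'
for pattern in (year, month, day):
    text = pattern.sub(u'', text)
# text is now u'公司成立于'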