Example #1
def deal_with_text(data_list, mode='full'):

    if len(data_list) == 1 and mode == 'train':
        cache_text = get_config_values('cache', 'text_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_text = get_config_values('cache', 'text_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_text = get_config_values('cache', 'text_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_text = get_config_values('cache', 'text_full')
    else:
        logger.warning('Unexpected data format when dealing with text...')
        # fail fast instead of hitting a NameError on cache_text below
        raise ValueError('unsupported data_list/mode combination for text')

    if not os.path.exists(cache_text):
        logger.info("dealing with text...")
        text = []
        for dataset in tqdm(data_list):
            text.extend([
                Converter('zh-hans').convert(line['text']) for line in dataset
            ])
        save_pickle(cache_text, text)
    else:
        logger.info("loading with text...")
        text = load_pickle(cache_text)
    logger.info("text total num: {0}".format(len(text)))
    return text
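These examples all lean on a handful of project utilities that are never shown (get_config_values, save_pickle, load_pickle, load_json, plus a module-level logger). Below is a minimal sketch of what they might look like, assuming an INI-style config file, pickle caches, and JSON-lines dataset files; the project's real helpers may differ.

# Hypothetical sketches of the shared helpers, assuming an INI-style config
# ('config.ini'), pickle caches, and JSON-lines dataset files; the real
# project utilities may differ.
import json
import logging
import pickle
from configparser import ConfigParser

logger = logging.getLogger(__name__)

_config = ConfigParser()
_config.read('config.ini')  # assumed config path


def get_config_values(section, key):
    # e.g. get_config_values('cache', 'text_train') -> a file path
    return _config.get(section, key)


def save_pickle(path, obj):
    with open(path, 'wb') as fp:
        pickle.dump(obj, fp)


def load_pickle(path):
    with open(path, 'rb') as fp:
        return pickle.load(fp)


def load_json(path):
    # assuming one JSON object per line (JSON Lines); a plain json.load
    # would be used instead if the file were a single JSON array
    with open(path, encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]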
Example #2
def deal_with_postag(data_list, mode='full'):

    if len(data_list) == 1 and mode == 'train':
        cache_postag = get_config_values('cache', 'postag_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_postag = get_config_values('cache', 'postag_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_postag = get_config_values('cache', 'postag_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_postag = get_config_values('cache', 'postag_full')
    else:
        logger.warning('Unexpected data format when dealing with postag...')
        # fail fast instead of hitting a NameError on cache_postag below
        raise ValueError('unsupported data_list/mode combination for postag')

    if not os.path.exists(cache_postag):
        logger.info("dealing with postag...")
        postag = []
        for dataset in tqdm(data_list):
            for line in dataset:
                postag.append([[
                    Converter('zh-hans').convert(word['word'].strip().replace(
                        ' ', '')), word['pos'],
                    len(word['word'])
                ] for word in line['postag']])
        save_pickle(cache_postag, postag)
    else:
        logger.info("loading with postag...")
        postag = load_pickle(cache_postag)
    logger.info("postag total num: {0}".format(len(postag)))
    logger.info("postag 5: {0}".format(postag[:5]))
    return postag
Example #3
def full_data():

    file_train = get_config_values('dataset', 'train')
    file_dev = get_config_values('dataset', 'dev')
    file_test = get_config_values('dataset', 'test')
    data_train = load_json(file_train)
    data_dev = load_json(file_dev)
    data_test = load_json(file_test)

    return [data_train, data_dev, data_test]
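For reference, the records returned by load_json are dicts whose keys the other examples read directly. Here is an illustrative, made-up record covering those keys (text, postag, spo_list); the real dataset entries differ.

# Illustrative record only, inferred from the keys accessed in these
# examples; not taken from the actual dataset.
sample_record = {
    'text': '周杰伦出生于台湾',
    'postag': [
        {'word': '周杰伦', 'pos': 'nr'},
        {'word': '出生于', 'pos': 'v'},
        {'word': '台湾', 'pos': 'ns'},
    ],
    'spo_list': [
        {
            'predicate': '出生地',
            'subject': '周杰伦',
            'subject_type': '人物',
            'object': '台湾',
            'object_type': '地点',
        },
    ],
}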
Example #4
def tasks():

    # stock symbol
    symbols = get_config_values('spider', 'symbols')
    # article category
    categorys = get_config_values('spider', 'categorys')

    # get cookies
    result = app.send_task('tasks.spider.get_cookies', queue='crawl_queue')
    while not result.ready():
        time.sleep(1)
    cookies = result.get()

    # send tasks
    for symbol in symbols:
        app.send_task('tasks.spider.comment', [cookies, symbol],
                      queue='crawl_queue')
    for category in categorys:
        app.send_task('tasks.spider.article', [cookies, category],
                      queue='crawl_queue')
Example #5
def main():

    # 1. load the raw data
    file_train = get_config_values('dataset', 'train')
    file_dev = get_config_values('dataset', 'dev')
    file_test = get_config_values('dataset', 'test')
    data_train = load_json(file_train)
    data_dev = load_json(file_dev)
    data_test = load_json(file_test)
    # 2. use all the data to build the vocab
    text = deal_with_text([data_train, data_dev, data_test], mode='full')
    postag = deal_with_postag([data_train, data_dev, data_test], mode='full')
    spo = deal_with_spo([data_train, data_dev], mode='full')
    char = build_char_dict(text)
    segm = build_segm_dict([data_train, data_dev, data_test])
    # save_for_corpus([data_train,data_dev,data_test])
    # 3. load the embeddings
    w2v_char = get_config_values('vector', 'w2v_char')
    # w2v_segm1 = get_config_values('vector', 'w2v_segm1')
    # w2v_segm2 = get_config_values('vector', 'w2v_segm2')
    twe_segm = get_config_values('vector', 'twe_segm')
    embeddings_index1 = KeyedVectors.load_word2vec_format(w2v_char,
                                                          binary=False)
    # embeddings_index2 = KeyedVectors.load_word2vec_format(w2v_segm1, binary=False)
    # embeddings_index3 = KeyedVectors.load_word2vec_format(w2v_segm2, binary=False)
    embeddings_index4 = KeyedVectors.load_word2vec_format(twe_segm,
                                                          binary=False)
    # 4. build the vocabs
    char_vocab = Char_vocab([data_train, data_dev, data_test],
                            embeddings_index1, 10000)
    word_vocab = Word_vocab([data_train, data_dev, data_test],
                            embeddings_index4, 500000)
    postag_vocab = Postag_vocab([data_train, data_dev, data_test])
    schemas_vocab = Schema_vocab()
    logger.info('vocab char num: {0}'.format(char_vocab.size()))
    logger.info('vocab word num: {0}'.format(word_vocab.size()))
    logger.info('vocab postag num: {0}'.format(postag_vocab.size()))
    logger.info('schemas object num: {0}'.format(schemas_vocab.object_size()))
    logger.info('schemas label num: {0}'.format(schemas_vocab.label_size()))
Example #6
def build_char_dict(text):
    cache_char = get_config_values('cache', 'char')
    if not os.path.exists(cache_char):
        logger.info('dealing with char data...')
        char = Counter()
        for line in tqdm(text):
            char.update(line)
        save_pickle(cache_char, char)
    else:
        logger.info('loading cached char data...')
        char = load_pickle(cache_char)
    logger.info('char total num: {0}'.format(len(dict(char))))
    logger.info('most frequent chars: {0}'.format(char.most_common()[:10]))
    return char
Example #7
def save_for_corpus(data_list):
    file_corpus = get_config_values('corpus', 'text')
    if not os.path.exists(file_corpus):
        with codecs.open(file_corpus, 'w', encoding='utf-8') as fp:
            total = sum([len(dataset) for dataset in data_list])
            fp.write((str(total) + '\n'))
            for dataset in tqdm(data_list):
                for line in dataset:
                    for word in line['postag']:
                        fp.write((Converter('zh-hans').convert(
                            word['word'].strip().replace(' ', '')) + ' '))
                    if line['postag'] is not None:
                        fp.write('\n')
    else:
        logger.info('corpus already done...')
Example #8
def build_postag_dict(postags):
    cache_postag = get_config_values('cache', 'postag')
    if not os.path.exists(cache_postag):
        logger.info('dealing with postag...')
        postag = Counter()
        for line in postags:
            for tag in line:
                # wrap in a list so the POS tag is counted as a whole,
                # not character by character
                postag.update([tag[1]])
        save_pickle(cache_postag, postag)
    else:
        logger.info('loading cached postag data...')
        postag = load_pickle(cache_postag)
    logger.info('postag total num: {0}'.format(len(dict(postag))))
    logger.info('most frequent postags: {0}'.format(
        postag.most_common()[:10]))
    return postag
Example #9
def build_segm_dict(data_list):
    cache_segm = get_config_values('cache', 'segm')
    if not os.path.exists(cache_segm):
        logger.info("dealing with segm...")
        segm = Counter()
        for dataset in tqdm(data_list):
            for line in dataset:
                segm.update([
                    Converter('zh-hans').convert(word['word'].strip().replace(
                        ' ', '')) for word in line['postag']
                ])
        save_pickle(cache_segm, segm)
    else:
        logger.info("loading with segn...")
        segm = load_pickle(cache_segm)
    logger.info('segm total num: {0}'.format(len(dict(segm))))
    logger.info('most frequent segms: {0}'.format(segm.most_common()[:10]))
    return segm
Example #10
def read_schemas(self):
    filename = get_config_values('dataset', 'schemas')
    self.schemas = load_json(filename)
Example #11
def deal_with_spo(data_list, mode='full'):

    if len(data_list) == 1 and mode == 'train':
        cache_spo = get_config_values('cache', 'spo_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_spo = get_config_values('cache', 'spo_dev')
    elif len(data_list) == 2 and mode == 'full':
        cache_spo = get_config_values('cache', 'spo_full')
    else:
        logger.warning('Unexpected data format when dealing with spo...')
        # fail fast instead of hitting a NameError on cache_spo below
        raise ValueError('unsupported data_list/mode combination for spo')

    if not os.path.exists(cache_spo):
        logger.info("dealing with spo...")
        spos = []
        for dataset in tqdm(data_list):
            for line in dataset:
                pairs = []
                # func(x, y): (start, end) span of x inside y, case-insensitive;
                # raises AttributeError if x does not occur in y
                func = lambda x, y: list(
                    re.search(re.escape(x.lower()), y.lower()).span())

                # true classes
                cnt = 0
                for spo in line['spo_list']:
                    if (cnt % 2) == 0:
                        pairs.append([
                            spo['predicate'],
                            [spo['object_type'], spo['subject_type']],
                            func(spo['object'], line['text']) +
                            func(spo['subject'], line['text']),
                        ])
                    else:
                        pairs.append([
                            spo['predicate'],
                            [spo['subject_type'], spo['object_type']],
                            func(spo['subject'], line['text']) +
                            func(spo['object'], line['text']),
                        ])
                    cnt += 1

                # NA classes
                data = {}
                # {data:{obj/sub_type, pos, co-occur}}
                for spo in line['spo_list']:
                    if spo['object'] not in data.keys():
                        data[spo['object']] = {}
                        data[spo['object']]['type'] = spo['object_type']
                        data[spo['object']]['pos'] = func(
                            spo['object'], line['text'])
                        data[spo['object']]['co-occur'] = []
                    if spo['subject'] not in data.keys():
                        data[spo['subject']] = {}
                        data[spo['subject']]['type'] = spo['subject_type']
                        data[spo['subject']]['pos'] = func(
                            spo['subject'], line['text'])
                        data[spo['subject']]['co-occur'] = []

                for spo in line['spo_list']:
                    for key in data.keys():
                        if (spo['object'] == key) or (spo['subject'] == key):
                            data[key]['co-occur'].append(1)
                        else:
                            data[key]['co-occur'].append(0)

                # [('',{}),('',{}),('',{})]
                data = list(data.items())

                # judge by co-occurrence
                for idx1 in range(len(data) - 1):
                    for idx2 in range(idx1 + 1, len(data)):
                        co1 = np.array(data[idx1][1]['co-occur'])
                        co2 = np.array(data[idx2][1]['co-occur'])
                        if 2 not in co1 + co2:
                            pairs.append([
                                'NA',
                                [data[idx1][1]['type'], data[idx2][1]['type']],
                                func(data[idx1][0], line['text']) +
                                func(data[idx2][0], line['text']),
                            ])

                spos.append(pairs)

                # spos.append(
                #    [[
                #    spo['predicate'],
                #    [spo['object_type'], spo['subject_type']],
                #    list(re.search(re.escape(spo['object'].lower()), line['text'].lower()).span())
                #        + list(re.search(re.escape(spo['subject'].lower()), line['text'].lower()).span()),
                #    ] for spo in line['spo_list']]
                #    )
                # except:
                #    logger.info('what the f**k is that ??? text:{0} ; spo:{1}'.format(line['text'], line['spo_list']))

        save_pickle(cache_spo, spos)
    else:
        logger.info("loading with spo...")
        spos = load_pickle(cache_spo)
    logger.info("spo total num: {0}".format(len(spos)))
    logger.info("spo 5: {0}".format(spos[:5]))
    return spos
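To make the co-occurrence test above concrete: every entity in a sentence gets a 0/1 vector over the spo triples, and two entities form an 'NA' pair only if their vectors never share a 1, i.e. their element-wise sum never reaches 2. A tiny standalone check with made-up entities:

# Standalone illustration of the NA test used in deal_with_spo; the
# entities and vectors here are made up.
import numpy as np

co_occur = {
    'A': np.array([1, 0]),  # A appears in triple 0
    'B': np.array([1, 0]),  # B appears in triple 0 (together with A)
    'C': np.array([0, 1]),  # C appears only in triple 1
}

for e1, e2 in [('A', 'B'), ('A', 'C'), ('B', 'C')]:
    is_na_pair = 2 not in (co_occur[e1] + co_occur[e2])
    print(e1, e2, 'NA pair' if is_na_pair else 'linked by a triple')
# A B linked by a triple
# A C NA pair
# B C NA pair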
Example #12
def monitor():
    # turn on the monitor
    broker = get_config_values('celery', 'broker')
    daemon = PrometheusMonitor(app=app, broker=broker)
    daemon.run_loop()
Example #13
def auth_handler(url, method, timeout, headers, data):
    username = get_config_values('pushgateway', 'username')
    password = get_config_values('pushgateway', 'password')
    return basic_auth_handler(url, method, timeout, headers, data,
                              username, password)
Example #14
import os
from celery import Celery, platforms
from kombu import Exchange, Queue
from utils import get_config_values

platforms.C_FORCE_ROOT = True

broker = get_config_values('celery', 'broker')
backend = get_config_values('celery', 'backend')
tasks = get_config_values('celery', 'tasks')

app = Celery('tasks', broker=broker, backend=backend, include=tasks)

app.conf.update(

    CELERY_TIMEZONE='Asia/Shanghai',
    CELERY_ENABLE_UTC=True,
    CELERY_ACCEPT_CONTENT=['json', 'pickle'],
    CELERY_TASK_SERIALIZER='json',
    CELERY_RESULT_SERIALIZER='json',
    CELERYD_MAX_TASKS_PER_CHILD=500,
    CELERY_BROKER_HEARTBEAT=0,
    CELERYD_SEND_EVENTS=True,
    CELERYD_PREFETCH_MULTIPLIER=2,
    CELERY_QUEUES=(
        Queue('crawl_queue', exchange=Exchange('crawl_queue', type='direct'), routing_key='crawl'),
        Queue('parse_queue', exchange=Exchange('parse_queue', type='direct'), routing_key='parse'),
    ),

)
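For completeness, a hypothetical sketch of the worker side that would consume these queues; the task names follow the 'tasks.spider.*' strings dispatched in Example #4, but the real task bodies are not shown in any of these snippets, and 'celery_app' is an assumed module name for the app configured above.

# Hypothetical worker-side tasks matching the names used by send_task above;
# the real tasks.spider module may look different. 'celery_app' is an assumed
# module name for the Celery app configured in this example.
from celery_app import app


@app.task(name='tasks.spider.get_cookies')
def get_cookies():
    # fetch and return login cookies for the crawlers (placeholder)
    return {}


@app.task(name='tasks.spider.comment')
def comment(cookies, symbol):
    # crawl comments for a single stock symbol (placeholder)
    pass


@app.task(name='tasks.spider.article')
def article(cookies, category):
    # crawl articles for a single category (placeholder)
    pass

A worker consuming the crawl queue could then be started with the usual Celery CLI, e.g. celery -A celery_app worker -Q crawl_queue.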