def deal_with_text(data_list, mode='full'):
    if len(data_list) == 1 and mode == 'train':
        cache_text = get_config_values('cache', 'text_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_text = get_config_values('cache', 'text_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_text = get_config_values('cache', 'text_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_text = get_config_values('cache', 'text_full')
    else:
        logger.warning('Wrong data format when dealing with text...')
        raise ValueError('deal_with_text: unsupported data_list/mode combination')
    if not os.path.exists(cache_text):
        logger.info("dealing with text...")
        text = []
        for dataset in tqdm(data_list):
            # normalize every sentence to simplified Chinese
            text.extend([
                Converter('zh-hans').convert(line['text']) for line in dataset
            ])
        save_pickle(cache_text, text)
    else:
        logger.info("loading cached text...")
        text = load_pickle(cache_text)
    logger.info("text total num: {0}".format(len(text)))
    return text
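# Note on the conversion above (assumed behaviour of the zhtools-style
# Converter, not verified in this repo): 'zh-hans' maps traditional
# characters to simplified ones, e.g.
#   Converter('zh-hans').convert('漢語')  # -> '汉语'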
def deal_with_postag(data_list, mode='full'):
    if len(data_list) == 1 and mode == 'train':
        cache_postag = get_config_values('cache', 'postag_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_postag = get_config_values('cache', 'postag_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_postag = get_config_values('cache', 'postag_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_postag = get_config_values('cache', 'postag_full')
    else:
        logger.warning('Wrong data format when dealing with postag...')
        raise ValueError('deal_with_postag: unsupported data_list/mode combination')
    if not os.path.exists(cache_postag):
        logger.info("dealing with postag...")
        postag = []
        for dataset in tqdm(data_list):
            for line in dataset:
                # each token becomes [simplified word, POS tag, original word length]
                postag.append([[
                    Converter('zh-hans').convert(
                        word['word'].strip().replace(' ', '')),
                    word['pos'],
                    len(word['word'])
                ] for word in line['postag']])
        save_pickle(cache_postag, postag)
    else:
        logger.info("loading cached postag...")
        postag = load_pickle(cache_postag)
    logger.info("postag total num: {0}".format(len(postag)))
    logger.info("postag 5: {0}".format(postag[:5]))
    return postag
def full_data():
    file_train = get_config_values('dataset', 'train')
    file_dev = get_config_values('dataset', 'dev')
    file_test = get_config_values('dataset', 'test')
    data_train = load_json(file_train)
    data_dev = load_json(file_dev)
    data_test = load_json(file_test)
    return [data_train, data_dev, data_test]
def tasks():
    # stock symbols
    symbols = get_config_values('spider', 'symbols')
    # article categories (config key spelled 'categorys' in the config file)
    categories = get_config_values('spider', 'categorys')
    # fetch cookies first and block until the crawl worker returns them
    result = app.send_task('tasks.spider.get_cookies', queue='crawl_queue')
    while not result.ready():
        time.sleep(1)
    cookies = result.get()
    # dispatch one crawl task per symbol and per category
    for symbol in symbols:
        app.send_task('tasks.spider.comment', [cookies, symbol],
                      queue='crawl_queue')
    for category in categories:
        app.send_task('tasks.spider.article', [cookies, category],
                      queue='crawl_queue')
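# --- Illustrative sketch (not from the original source): the worker-side
# tasks that tasks() dispatches might be registered like this. The bodies
# are assumptions; only the task names and argument order (cookies first)
# are taken from the send_task calls above. 'celery_app' is a hypothetical
# module exposing the Celery app.
from celery_app import app

@app.task(name='tasks.spider.get_cookies')
def get_cookies():
    # return whatever cookie payload the crawlers need
    return {}

@app.task(name='tasks.spider.comment')
def comment(cookies, symbol):
    # crawl comments for one stock symbol using the shared cookies
    pass

@app.task(name='tasks.spider.article')
def article(cookies, category):
    # crawl articles for one category using the shared cookies
    pass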
def main():
    # 1. load raw data
    file_train = get_config_values('dataset', 'train')
    file_dev = get_config_values('dataset', 'dev')
    file_test = get_config_values('dataset', 'test')
    data_train = load_json(file_train)
    data_dev = load_json(file_dev)
    data_test = load_json(file_test)

    # 2. use all the data to build the vocabularies
    text = deal_with_text([data_train, data_dev, data_test], mode='full')
    postag = deal_with_postag([data_train, data_dev, data_test], mode='full')
    spo = deal_with_spo([data_train, data_dev], mode='full')
    char = build_char_dict(text)
    segm = build_segm_dict([data_train, data_dev, data_test])
    # save_for_corpus([data_train, data_dev, data_test])

    # 3. load the pretrained embeddings
    w2v_char = get_config_values('vector', 'w2v_char')
    # w2v_segm1 = get_config_values('vector', 'w2v_segm1')
    # w2v_segm2 = get_config_values('vector', 'w2v_segm2')
    twe_segm = get_config_values('vector', 'twe_segm')
    embeddings_index1 = KeyedVectors.load_word2vec_format(w2v_char, binary=False)
    # embeddings_index2 = KeyedVectors.load_word2vec_format(w2v_segm1, binary=False)
    # embeddings_index3 = KeyedVectors.load_word2vec_format(w2v_segm2, binary=False)
    embeddings_index4 = KeyedVectors.load_word2vec_format(twe_segm, binary=False)

    # 4. build vocabs
    char_vocab = Char_vocab([data_train, data_dev, data_test],
                            embeddings_index1, 10000)
    word_vocab = Word_vocab([data_train, data_dev, data_test],
                            embeddings_index4, 500000)
    postag_vocab = Postag_vocab([data_train, data_dev, data_test])
    schemas_vocab = Schema_vocab()
    logger.info('vocab char num: {0}'.format(char_vocab.size()))
    logger.info('vocab word num: {0}'.format(word_vocab.size()))
    logger.info('vocab postag num: {0}'.format(postag_vocab.size()))
    logger.info('schemas object num: {0}'.format(schemas_vocab.object_size()))
    logger.info('schemas label num: {0}'.format(schemas_vocab.label_size()))
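# Hypothetical entry point (not in the original source), assuming this
# module is meant to be executed directly to build all caches and vocabs.
if __name__ == '__main__':
    main()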
def build_char_dict(text):
    cache_char = get_config_values('cache', 'char')
    if not os.path.exists(cache_char):
        logger.info('dealing with char data...')
        char = Counter()
        for line in tqdm(text):
            # updating a Counter with a string counts its characters
            char.update(line)
        save_pickle(cache_char, char)
    else:
        logger.info('loading cached char data...')
        char = load_pickle(cache_char)
    logger.info('char total num: {0}'.format(len(char)))
    logger.info('most frequent chars: {0}'.format(char.most_common(10)))
    return char
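# Minimal illustration of the per-character counting above (toy data,
# not from the original source): Counter.update over a string counts
# individual characters, so build_char_dict yields per-character
# frequencies across the whole corpus.
from collections import Counter as _Counter

_demo = _Counter()
for _line in ['abca', 'ab']:
    _demo.update(_line)
assert _demo.most_common(2) == [('a', 3), ('b', 2)]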
def save_for_corpus(data_list):
    file_corpus = get_config_values('corpus', 'text')
    if not os.path.exists(file_corpus):
        # explicit utf-8 so the simplified-Chinese corpus is portable
        with codecs.open(file_corpus, 'w', encoding='utf-8') as fp:
            total = sum(len(dataset) for dataset in data_list)
            fp.write(str(total) + '\n')
            for dataset in tqdm(data_list):
                for line in dataset:
                    for word in line['postag']:
                        fp.write(Converter('zh-hans').convert(
                            word['word'].strip().replace(' ', '')) + ' ')
                    if line['postag'] is not None:
                        fp.write('\n')
    else:
        logger.info('corpus already done...')
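# The corpus file written above has this shape (toy illustration derived
# from the code, with placeholder words):
#   3
#   word1 word2 word3
#   word4 word5
#   word6
# i.e. the first line is the total sentence count, followed by one
# space-separated segmented sentence per line.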
def build_postag_dict(postags):
    cache_postag = get_config_values('cache', 'postag')
    if not os.path.exists(cache_postag):
        logger.info('dealing with postag...')
        postag = Counter()
        for line in postags:
            for tag in line:
                # tag is [word, pos, length]; count the whole POS label.
                # (Counter.update(tag[1]) would split multi-character tags
                # such as 'nr' into single characters.)
                postag[tag[1]] += 1
        save_pickle(cache_postag, postag)
    else:
        logger.info('loading cached postag data...')
        postag = load_pickle(cache_postag)
    logger.info('postag total num: {0}'.format(len(postag)))
    logger.info('most frequent postags: {0}'.format(postag.most_common(10)))
    return postag
def build_segm_dict(data_list):
    cache_segm = get_config_values('cache', 'segm')
    if not os.path.exists(cache_segm):
        logger.info("dealing with segm...")
        segm = Counter()
        for dataset in tqdm(data_list):
            for line in dataset:
                segm.update([
                    Converter('zh-hans').convert(
                        word['word'].strip().replace(' ', ''))
                    for word in line['postag']
                ])
        save_pickle(cache_segm, segm)
    else:
        logger.info("loading cached segm...")
        segm = load_pickle(cache_segm)
    logger.info('segm total num: {0}'.format(len(segm)))
    logger.info('most frequent segms: {0}'.format(segm.most_common(10)))
    return segm
def read_schemas(self):
    filename = get_config_values('dataset', 'schemas')
    self.schemas = load_json(filename)
def deal_with_spo(data_list, mode='full'):
    if len(data_list) == 1 and mode == 'train':
        cache_spo = get_config_values('cache', 'spo_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_spo = get_config_values('cache', 'spo_dev')
    elif len(data_list) == 2 and mode == 'full':
        cache_spo = get_config_values('cache', 'spo_full')
    else:
        logger.warning('Wrong data format when dealing with spo...')
        raise ValueError('deal_with_spo: unsupported data_list/mode combination')
    if not os.path.exists(cache_spo):
        logger.info("dealing with spo...")
        spos = []
        for dataset in tqdm(data_list):
            for line in dataset:
                pairs = []
                # [start, end) span of a mention inside the sentence;
                # re.search returns None when the mention is missing from
                # the text, so the upstream data is assumed clean
                func = lambda x, y: list(
                    re.search(re.escape(x.lower()), y.lower()).span())
                # true classes: alternate object-first / subject-first ordering
                cnt = 0
                for spo in line['spo_list']:
                    if (cnt % 2) == 0:
                        pairs.append([
                            spo['predicate'],
                            [spo['object_type'], spo['subject_type']],
                            func(spo['object'], line['text']) +
                            func(spo['subject'], line['text']),
                        ])
                    else:
                        pairs.append([
                            spo['predicate'],
                            [spo['subject_type'], spo['object_type']],
                            func(spo['subject'], line['text']) +
                            func(spo['object'], line['text']),
                        ])
                    cnt += 1
                # NA classes: data maps entity -> {type, pos, co-occur}
                data = {}
                for spo in line['spo_list']:
                    if spo['object'] not in data:
                        data[spo['object']] = {
                            'type': spo['object_type'],
                            'pos': func(spo['object'], line['text']),
                            'co-occur': [],
                        }
                    if spo['subject'] not in data:
                        data[spo['subject']] = {
                            'type': spo['subject_type'],
                            'pos': func(spo['subject'], line['text']),
                            'co-occur': [],
                        }
                # one indicator column per gold triple: 1 if the entity
                # participates in that triple, else 0
                for spo in line['spo_list']:
                    for key in data:
                        if spo['object'] == key or spo['subject'] == key:
                            data[key]['co-occur'].append(1)
                        else:
                            data[key]['co-occur'].append(0)
                # [('entity', {...}), ('entity', {...}), ...]
                data = list(data.items())
                # two entities that never appear in the same gold triple
                # (their indicator vectors never sum to 2) form an NA pair
                for idx1 in range(len(data) - 1):
                    for idx2 in range(idx1 + 1, len(data)):
                        co1 = np.array(data[idx1][1]['co-occur'])
                        co2 = np.array(data[idx2][1]['co-occur'])
                        if 2 not in co1 + co2:
                            pairs.append([
                                'NA',
                                [data[idx1][1]['type'], data[idx2][1]['type']],
                                func(data[idx1][0], line['text']) +
                                func(data[idx2][0], line['text']),
                            ])
                spos.append(pairs)
        save_pickle(cache_spo, spos)
    else:
        logger.info("loading cached spo...")
        spos = load_pickle(cache_spo)
    logger.info("spo total num: {0}".format(len(spos)))
    logger.info("spo 5: {0}".format(spos[:5]))
    return spos
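# Worked example of the NA-pair logic above (toy data, not from the
# original source). With gold triples (s1, p, o1) and (s2, p, o2), the
# per-triple indicator vectors are s1=[1,0], o1=[1,0], s2=[0,1], o2=[0,1].
# s1+o1 = [2,0] contains 2, so they co-occur and no NA pair is added;
# s1+o2 = [1,1] never reaches 2, so (s1, o2) becomes an 'NA' pair.
import numpy as np

co_s1, co_o2 = np.array([1, 0]), np.array([0, 1])
assert 2 not in co_s1 + co_o2   # -> labelled 'NA'
co_s1, co_o1 = np.array([1, 0]), np.array([1, 0])
assert 2 in co_s1 + co_o1       # -> already covered by a gold triple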
def monitor():
    # turn on the monitor
    broker = get_config_values('celery', 'broker')
    daemon = PrometheusMonitor(app=app, broker=broker)
    daemon.run_loop()
def auth_handler(url, method, timeout, headers, data):
    username = get_config_values('pushgateway', 'username')
    password = get_config_values('pushgateway', 'password')
    return basic_auth_handler(url, method, timeout, headers, data, username,
                              password)
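# Possible usage (a sketch, not from the original source): wiring
# auth_handler into prometheus_client's pushgateway helper. The gateway
# address, job name, and metric are placeholders.
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway

registry = CollectorRegistry()
inflight = Gauge('crawl_jobs_inflight', 'Crawl jobs currently running',
                 registry=registry)
inflight.set(3)
push_to_gateway('localhost:9091', job='spider',
                registry=registry, handler=auth_handler)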
import os

from celery import Celery, platforms
from kombu import Exchange, Queue

from utils import get_config_values

platforms.C_FORCE_ROOT = True

broker = get_config_values('celery', 'broker')
backend = get_config_values('celery', 'backend')
tasks = get_config_values('celery', 'tasks')

app = Celery('tasks', broker=broker, backend=backend, include=tasks)
app.conf.update(
    CELERY_TIMEZONE='Asia/Shanghai',
    CELERY_ENABLE_UTC=True,
    CELERY_ACCEPT_CONTENT=['json', 'pickle'],
    CELERY_TASK_SERIALIZER='json',
    CELERY_RESULT_SERIALIZER='json',
    CELERYD_MAX_TASKS_PER_CHILD=500,
    CELERY_BROKER_HEARTBEAT=0,
    CELERYD_SEND_EVENTS=True,
    CELERYD_PREFETCH_MULTIPLIER=2,
    CELERY_QUEUES=(
        Queue('crawl_queue',
              exchange=Exchange('crawl_queue', type='direct'),
              routing_key='crawl'),
        Queue('parse_queue',
              exchange=Exchange('parse_queue', type='direct'),
              routing_key='parse'),
    ),
)
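# Illustrative commands for consuming the queues above (not from the
# source; <module_name> depends on this file's name):
#   celery -A <module_name> worker -Q crawl_queue -l info
#   celery -A <module_name> worker -Q parse_queue -l info
# The upper-case CELERY_* keys are the legacy setting names used by
# older Celery releases.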