def main():
    global logger
    args = parser.parse_args()
    if args.evaluate:
        args.save = os.path.join(os.path.dirname(args.resume), "linear",
                                 os.path.basename(args.resume), args.eval_data)
    os.makedirs(args.save, exist_ok=True)
    logger = get_logger(logpath=os.path.join(args.save, 'logs'),
                        filepath=os.path.abspath(__file__))
    logger.info(args)
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')
    main_worker(args)
def __init__(self, target_id):
    super().__init__(target_id)
    self.module = 'Youtube'
    self.api_key = api_key
    # quality settings
    self.database = Database('Queues')
    self.logger = get_logger('Youtube')
def parse_opts(opts):
    global tcp_port, udp_port, http_port, logger
    # create a logger
    logger = get_logger()
    for arg, param in opts:
        if arg == '-h':
            usage()
            sys.exit(0)
        elif arg == '-u':
            try:
                udp_port = int(param)
            except Exception:
                usage()
                sys.exit(0)
        elif arg == '-p':
            try:
                tcp_port = int(param)
            except Exception:
                usage()
                sys.exit(0)
        elif arg == '-t':
            try:
                http_port = int(param)
            except Exception:
                usage()
                sys.exit(0)
def parse_opts(opts):
    global tcp_port, udp_port, host, logger, global_options
    # create a logger
    logger = get_logger()
    for arg, param in opts:
        if arg == '-h':
            usage()
            sys.exit(0)
        elif arg == '-u':
            try:
                udp_port = int(param)
                global_options["run_mode"] = "udp_mode"
            except Exception:
                usage()
                sys.exit(0)
        elif arg == '-t':
            try:
                tcp_port = int(param)
                global_options["run_mode"] = "tcp_mode"
            except Exception:
                usage()
                sys.exit(0)
        elif arg == '-r':
            try:
                host = param.strip()
            except Exception:
                usage()
                sys.exit(0)
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    preprocess = []
    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist'])).readlines()
        stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]

        def remove_stopwords(sentence):
            return [word for word in sentence if not word in stoplist]
        preprocess.append(remove_stopwords)
    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    if p['input'].endswith('.json'):
        cor = JsonCorpus(path.join(base_path, p['input']),
                         no_below=p['no_below'],
                         no_above=p['no_above'],
                         preprocess=preprocess)
    else:
        cor = TextFilesCorpus(path.join(base_path, p['input']),
                              no_below=p['no_below'],
                              no_above=p['no_above'],
                              preprocess=preprocess)
    MmCorpus.serialize(path.join(output_dir, p['corpus_name']), cor, progress_cnt=10000)
    cor.dictionary.save(path.join(output_dir, p['dict_name']))
def __init__(self, nick, logfile=None, verbosity='INFO'):
    self.nick = self.base_nick = nick
    self.logger = get_logger('ircconnection.logger', logfile, verbosity)
    # gevent pool
    self.gpool = Pool(10)
    self._valid_orders = []
def __init__(self, octobot):
    self.octobot = octobot
    # Logger
    self.logger = get_logger(self.__class__.__name__)
    self.performance_analyser = None
    self.time_frames = None
    self.relevant_evaluators = []
def __init__(self, spiders):
    # initialize the RequestHandler's logger
    self.logger = tools.get_logger(__name__)
    # initialize the spider download middlewares
    self.middlewares = defaultdict(list)
    self.__init_first_middleware()
    self.__init_spider_middleware(spiders)
    # interval to sleep between downloads
    self.sleep_time = getattr(setting, 'SLEEP_TIME', 3)
    # do not retry a request when one of these HTTP status codes is returned
    self.abandon_code = {400, 401, 403, 404, 405, 406, 407, 410,
                         411, 413, 416, 500, 501, 502, 505}
def main(param_file=None): # setup p, base_path, output_dir = tools.setup(param_file) logger = tools.get_logger('gensim', path.join(output_dir, "run.log")) logger.info("running %s" % ' '.join(sys.argv)) # in test case if param_file: files = [path.join(base_path, p['wiki_txt'])] else: files = glob.glob(path.join(base_path, p['wiki_txt']) + '*.txt') out = codecs.open(os.path.join(output_dir, 'wiki.json'), mode='w', encoding='utf-8') headline = re.compile('\[\[(.*)\]\]') level2 = re.compile('== (.*) ==') t0 = time.time() c = 0 res = {} for file in files: print 'work on: %s' % file with codecs.open(file, encoding='utf-8') as f: for line in f: # ignore linebreaks if line == '\n': continue # if headline found if headline.search(line): if len(res) > 0: out.write(json.dumps(res, encoding='utf-8', ensure_ascii=False) + '\n') topic = headline.search(line).groups()[0] res = {topic: {}} sub = None elif level2.search(line): sub = level2.search(line).groups()[0] else: if not sub: res[topic].setdefault('desc', []).append(line.strip()) else: res[topic].setdefault(sub, []).append(line.strip()) c += 1 print 'average execution time: %f' % ((time.time() - t0) / c) out.write(json.dumps(res, encoding='utf-8', ensure_ascii=False) + '\n') print time.time() - t0
def main(param_file=None): # setup p, base_path, output_dir = tools.setup(param_file) logger = tools.get_logger('gensim', path.join(output_dir, "run.log")) logger.info("running %s" % ' '.join(sys.argv)) inp = codecs.open(os.path.join(p['base_path'], p['corpora_path'], p['corpus_name']), mode='r', encoding='utf-8') out = codecs.open(os.path.join(output_dir, p['result_name']), mode='w', encoding='utf-8') pair = re.compile('\d\.(\w+):(\w+)') exclude = set(string.punctuation) line_count = 0 res = [] for line in inp: # skip empty lines if line == "\n": continue # finished one entry if line_count % 5 == 0: print pair.search(line).groups() res.append({'terms': pair.search(line).groups(), 'sentences': [], 'sentences_tagged': [], 'values': []}) # annotate sentence and add it to result if line_count % 5 == 1 or line_count % 5 == 2: res[-1]['sentences'].append(line.strip()) cleaned = "".join(ch for ch in line.strip() if ch not in exclude) tagged = tools.tag(cleaned, p['senna_path']) res[-1]['sentences_tagged'].append(tagged) # add the ratings if line_count % 5 == 3 or line_count % 5 == 4: res[-1]['values'].append(float(line)) line_count = line_count+1 # store the output json.dump(res, out, indent=2)
def process_video(video_dict):
    """
    Handle a live video: send the bot notification, download the video,
    upload it, and store it in the database.
    :param video_dict: dict holding the live video's data
    :return: None
    """
    bot(f"[Live Alert] {video_dict['Provide']}{video_dict.get('Title')} is now live. "
        f"Link: {video_dict['Target']}")
    logger = get_logger('Process Video')
    logger.info(f'{video_dict["Provide"]} Found A Live, starting downloader')
    video_dict['Title'] = AdjustFileName(video_dict['Title']).adjust()
    if video_dict["Provide"] == 'Youtube':
        downloader(r"https://www.youtube.com/watch?v=" + video_dict['Ref'],
                   video_dict['Title'], proxy, '720p')
    else:
        downloader(video_dict['Ref'], video_dict['Title'], proxy)
    upload_queue.put_nowait(video_dict)
def main(param_file=None): # setup p, base_path, output_dir = tools.setup(param_file) result_path = path.join(base_path, p['result_path']) logger = tools.get_logger('gensim', path.join(output_dir, "run.log")) logger.info("running %s" % ' '.join(sys.argv)) logger.info('load the articles..') article_path = path.join(result_path, p['article_label']) wiki = pickle.load(open(path.join(article_path, 'articles.pickle'))) logger.info('load dictionary and models') dictionary = Dictionary.load(path.join(result_path, p['model_label'], 'dic.dict')) model_path = path.join(result_path, p['model_label']) lsi = LsiModel.load(path.join(model_path, 'lsi.model')) pre = pickle.load(open(path.join(model_path, 'pre.model'))) if int(p['num_topics']) > lsi.num_topics: logger.error('model to small') lsi.num_topics = int(p['num_topics']) data = {} for topic, entries in wiki.iteritems(): logger.info('working on: %s' % topic) data[topic] = {} data[topic]['keys'] = [] vecs = [] data[topic]['ratings'] = [] for key, val in entries.iteritems(): data[topic]['keys'].append(key) vecs.append(lsi[pre[dictionary.doc2bow(val['text'])]]) data[topic]['ratings'].append(val['rating']) data[topic]['vecs'] = np.squeeze(np.array(vecs)[:, :, 1:2]).T U, d, _ = np.linalg.svd(data[topic]['vecs'], full_matrices=False) data[topic]['U'] = U data[topic]['d'] = d f = open(os.path.join(output_dir, "data.pickle"), 'wb') pickle.dump(data, f)
def start_temp_daemon():
    db = Database('Queues')
    while True:
        event = []
        for target_url in db.select():
            p = YoutubeTemp(target_url)
            event.append(p)
            p.start()
        # busy-wait until every YoutubeTemp process has finished
        is_running = True
        while is_running:
            has_running = False
            for p in event:
                if p.is_alive():
                    has_running = True
            if not has_running:
                is_running = False
        logger = get_logger('YoutubeTemp')
        logger.info('A check has finished.')
        sleep(sec)
def bd_upload(file):
    logger = get_logger('bd_upload')
    if enable_upload:
        if 'nt' in name:
            command = [f"{ABSPATH}\\BaiduPCS-Go\\BaiduPCS-Go.exe", "upload", "--nofix"]
            command2 = [f'{ABSPATH}\\BaiduPCS-GO\\BaiduPCS-Go.exe', "share", "set"]
        else:
            command = [f"{ABSPATH}/BaiduPCS-Go/BaiduPCS-Go", "upload", "--nofix"]
            command2 = [f"{ABSPATH}/BaiduPCS-Go/BaiduPCS-Go", "share", "set"]
        command.append(f"{ddir}/{file}")
        command.append("/")
        command2.append(file)
        subprocess.run(command)
        s2 = subprocess.run(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            encoding='utf-8', universal_newlines=True)
        share_info = s2.stdout
        if 'https' in share_info:
            share_info = share_info.replace('\n', '')
            logger.info(f'{file}: Share successful {share_info}')
        else:
            logger.error('Share failed')
            raise RuntimeError(f'{file} share failed')
        reg = r'https://pan\.baidu\.com/s/([A-Za-z0-9_-]{23})'
        linkre = re.compile(reg)
        link = re.search(linkre, share_info)
        try:
            link = 'https://pan.baidu.com/s/' + link.group(1)
            return link
        except AttributeError:
            logger.exception('get share link error')
            raise RuntimeError('get share link error')
    return None
def downloader(link, title, dl_proxy, quality='best'):
    logger = get_logger('Downloader')
    # co = ["streamlink", "--hls-live-restart", "--loglevel", "trace", "--force"]
    co = ["streamlink", "--hls-live-restart", "--force"]
    if enable_proxy:
        co.append('--http-proxy')
        co.append(f'http://{dl_proxy}')
        co.append('--https-proxy')
        co.append(f'https://{dl_proxy}')
    co.append("-o")
    co.append(f"{ddir}/{title}")
    co.append(link)
    co.append(quality)
    subprocess.run(co)
    paths = f'{ddir}/{title}'
    if isfile(paths):
        logger.info(f'{title} has been downloaded.')
        bot(f"[Download Alert] {title} has finished downloading and is waiting for upload")
    else:
        logger.error(f'{title} Download error, link: {link}')
        raise RuntimeError(f'{title} Download error, link: {link}')
# -*- coding: utf-8 -*-
from sources.osm_source import OSMSource
import tools
import os
import requests
import tempfile
from osm_lint_entity import OsmLintEntity

logger = tools.get_logger(__name__)


class PBFSource(OSMSource):
    """
    Source reading from .pbf file
    """
    def __init__(self, context, process_entity_callback, map_name, pbf_url):
        super(PBFSource, self).__init__(context, map_name, process_entity_callback)
        self.pbf_url = pbf_url

    def _download_map(self):
        """
        Downloads map from the internet. It is up to the caller to remove this
        temporary file.
        :param map_name: Name of the map to download
        :param map_uri: URI of the map to download
        :return: Temporary filename where map is downloaded
        """
        logger.info('[%s] Downloading %s', self.map_name, self.pbf_url)
        r = requests.get(self.pbf_url, stream=True)
        if not r.ok:
def __init__(self, target_id):
    super().__init__(target_id)
    self.module = 'Youtube'
    self.api_key = api_key
    # quality settings
    self.logger = get_logger('Youtube')
# -*- coding: utf-8 -*-
# @Author: lim
# @Email: [email protected]
# @Date: 2018-04-04 10:06:34
# @Last Modified by: lim
# @Last Modified time: 2018-04-10 11:47:00

import time

from tools import get_logger
from dbs.redis_db import redis_task
from config import LOOP_INTERVAL, DOING_CLEAR

CURSOR = redis_task()
clear_redis_log = get_logger('clear_redis')


def clear_redis():
    """Periodically trim the redis finish & failed sets."""
    record = 0
    total = DOING_CLEAR * 86400
    while True:
        time.sleep(LOOP_INTERVAL)
        try:
            record += LOOP_INTERVAL
            CURSOR.trim_finish_set()
            CURSOR.trim_failed_set()
            if record >= total:
                CURSOR.handle_bad_doing()
                record = 0
        except Exception as e:
            clear_redis_log.error('000:clear redis error {}'.format(e))
from multiprocessing import Process

from bilibili import Bilibili
from config import config
from mirrativ import Mirrativ
from openrec import Openrec
from tools import check_ddir_is_exist, get_logger
from twitcasting import Twitcasting
from youtube import Youtube, start_temp_daemon

logger = get_logger()


class Event:
    def __init__(self):
        self.events_multi = []
        self.gen_process()
        logger.info(self.events_multi)

    def start(self):
        self.start_multi_task()
        if config['youtube']['enable_temp']:
            temp = Process(target=start_temp_daemon)
            # start() runs the daemon in a child process; run() would block here
            temp.start()
        for event in self.events_multi:
            event.join()

    def gen_process(self):
        if config['youtube']['enable']:
            for user_config in config['youtube']['users']:
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    # load model and corpus
    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(result_path, p['run'], p['dict_extension']))
    model_path = path.join(result_path, p['run'], p['lsi_ext'])
    logger.info('load model from: %s' % model_path)
    lsi = LsiModel.load(model_path)
    pre = SaveLoad.load(path.join(result_path, p['run'], p['pre_model_ext']))

    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text, allow_update=False, return_missing=False)
                     for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (only pre model)')
    corpus_pre = pre[bow_lee_texts]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(path.join(base_path, p['human_data_file']))
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    max_topics = lsi.num_topics
    logger.info("iterate from %d to %d dimensions (stepsize: %d)" %
                (p['min_dim'], max_topics, p['dim_step']))

    iter_range = range(p['min_dim'], max_topics, p['dim_step'])
    res = np.zeros(len(iter_range))
    for k, l in enumerate(iter_range):
        # do the lower dimensionality transformation
        lsi.num_topics = l
        corpus_lsi = lsi[corpus_pre]

        # compute pairwise similarity matrix of transformed corpus
        sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                sim_matrix[i, j] = matutils.cossim(par1, par2)
        sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

        # compute correlations
        cor = np.corrcoef(sim_vector, human_sim_vector)
        logger.info("step %d: correlation with lee data: %f" % (k, cor[0, 1]))
        res[k] = cor[0, 1]

    plt.figure()
    plt.plot(iter_range, res)
    plt.savefig(os.path.join(output_dir, 'cor_plot.' + p['plot_extension']))
    plt.close()
    np.save(path.join(output_dir, 'model_dim_res.npy'), res)

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
def main(param_file=None): # setup p, base_path, output_dir = tools.setup(param_file) result_path = path.join(base_path, p['result_path']) logger = tools.get_logger('gensim', path.join(output_dir, "run.log")) logger.info("running %s" % ' '.join(sys.argv)) logger.info('loading models and dictionary') dictionary = Dictionary.load(path.join(result_path, p['model_label'], 'dic.dict')) model_path = path.join(result_path, p['model_label']) lsi = LsiModel.load(path.join(model_path, 'lsi.model')) pre = pickle.load(open(path.join(model_path, 'pre.model'))) lsi.num_topics = p['num_topics'] logger.info('load wikipedia articles') article_path = path.join(result_path, p['article_label']) wiki = pickle.load(open(path.join(article_path, 'articles.pickle'))) times = np.zeros((1, len(wiki))) count = 0 for query_key, query in wiki.iteritems(): logger.info("working on: %s" % query_key) n = len(query) human = [val['rating'] for val in query.itervalues()] t0 = time.time() corpus = [lsi[pre[dictionary.doc2bow(val['text'])]] for val in query.itervalues()] sim_res = MatrixSimilarity(corpus)[corpus] sim_res.save(path.join(output_dir, 'sim_' + query_key)) avg = np.mean(sim_res, axis=0) idx = np.argsort(avg) times[count] = time.time() - t0 # compute correlation with human rating res = np.zeros((n, 1)) for i in range(n): human_r = [human[j] for j in idx[i:]] res[i, 0] = np.mean(human_r) # plot correlation fig = plt.figure() ax = fig.add_subplot(3, 1, 1) ax.plot(res) ax = fig.add_subplot(3, 1, 2) ratings = [val['rating'] for val in query.itervalues()] ax.scatter(avg[idx], [ratings[i] for i in idx]) # plot similarity distribution ax = fig.add_subplot(3, 1, 3) ax.bar(range(n), avg[idx]) # Set the x tick labels to the group_labels defined above and rotate ax.set_xticks(range(n)) k = [key + ' ' + str(query[key]['rating']) for key in query.keys()] ax.set_xticklabels([k[i] for i in idx]) fig.autofmt_xdate() plt.savefig(path.join(output_dir, query_key + '.' + p['format'])) plt.close() logger.info('average similarity calculation time: %f' % np.mean(times))
def __init__(self):
    self.PICS_DIR = "data/images"
    self.ROOT_URL = "https://www.doutula.com/photo/list/"
    self.LOGGER = tools.get_logger("doutu")
# -*- coding: utf-8 -*-
# @Author: lim
# @Email: [email protected]
# @Date: 2018-04-02 14:31:54
# @Last Modified by: lim
# @Last Modified time: 2018-04-09 16:11:05

import psycopg2

from tools import get_logger, error_record
from config import PG_DB, PG_USER, PG_PWD, PG_HOST, PG_PORT

pg_db_log = get_logger('pgsql_db')


class PgSql(object):

    def __init__(self):
        self.conn = self.get_conn()
        self.cursor = self.get_cursor()
        #self.table_1 = self.create_table_1()
        #self.table_2 = self.create_table_2()
        #self.table_3 = self.create_table_3()

    def get_conn(self):
        try:
            return psycopg2.connect(database=PG_DB, user=PG_USER, password=PG_PWD,
                                    host=PG_HOST, port=PG_PORT)
        except Exception as e:
            error_record('200')
            pg_db_log.warning('200:Can not establish a connection to guangzhou pg DB: {}'.format(e))
def __init__(self):
    self.logger = get_logger('BDUpload')
def __init__(self):
    super().__init__()
    self.queue = upload_queue
    self.logger = get_logger('VideoUpload')
    self.video_info = None
import asyncio
import os
import json

from nats.aio.client import Client

from tools import get_logger, make_fake_tick

logger = get_logger(__name__)

NATS_HOSTNAME = os.environ.get('NATS_HOSTNAME', 'localhost')
NATS_SERVERS = [f'nats://{NATS_HOSTNAME}:4222']
TICKER_SUBJECT_NAME = 'ticker'


async def main(event_loop):
    nats_client = Client()
    await nats_client.connect(NATS_SERVERS, loop=event_loop)
    logger.info(f"Connected to NATS at {nats_client.connected_url.netloc}...")
    logger.info(f'Publishing ticks to [{TICKER_SUBJECT_NAME}]')
    while True:
        tick = make_fake_tick()
        await nats_client.publish(TICKER_SUBJECT_NAME, json.dumps(tick).encode())
        await nats_client.flush(timeout=1)
        logger.info(f'Published: {tick}')
        # a blocking time.sleep() would stall the event loop; yield instead
        await asyncio.sleep(3)
import logging
import random
import time
from collections import defaultdict
from datetime import datetime

from tools import get_clients, get_conversion, get_need_volumes
from structs import MarketInfo, Deal, ArbOpp
from tools import get_logger, send_notifier
from tools import (get_sum_on_volume, get_base_and_coin, get_usd_price,
                   get_arb_amount)

log = get_logger('runner_v2.log')


def fetch_order_book(func, market):
    # retry a few times when the exchange rate-limits us
    for _ in range(10):
        try:
            return func(market)
        except Exception as e:
            if 'too often' in str(e):
                wait = 0.1 + random.random()
                time.sleep(wait)
            else:
                break
    raise RuntimeError


def process_coins(worker_id, coins_markets, config):
    open_arbs_all_coins = defaultdict(lambda: [])
import io
import os
import time
from datetime import datetime
from threading import Event

import numpy as np
import zmq
from PIL import Image

import tools
from picamera import PiCamera, PiCameraCircularIO, array

logger = tools.get_logger('vigilant')

RECORD_RESOLUTION = (1920, 1080)
MOTION_RESOLUTION = (640, 480)
SAVE_FOLDER = '/home/pi/mnt'
CV_THRESHOLD = .9
PRESECONDS = 3
ANALYSE_PERIOD = .5
MACROBLOCK_THRESHOLD = 60
MACROBLOCK_COUNT_FOR_MOTON = 10


class Watcher(array.PiMotionAnalysis):
    def __init__(self, camera, motion_event):
        super().__init__(camera)
        self.are_some_movement = motion_event
        self.kernel = None
def __init__(self, target_id):
    super().__init__(target_id)
    self.logger = get_logger('Twitcasting')
    self.module = 'Twitcasting'
def __init__(self, target_id):
    super().__init__(target_id)
    self.logger = get_logger('Mirrativ')
    self.module = 'Mirrativ'
def main():
    args = parse_option()
    os.makedirs(args.checkpoint_path, exist_ok=True)

    if not args.debug:
        os.environ['PYTHONBREAKPOINT'] = '0'
        logger = get_logger(logpath=os.path.join(args.checkpoint_path, 'logs'),
                            filepath=os.path.abspath(__file__))

        def print_pass(*args):
            logger.info(*args)
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    print(args)

    train_loader = get_train_loader(args)

    isd = ISD(args.arch, K=args.queue_size, m=args.momentum, T=args.temp)
    isd.data_parallel()
    isd = isd.cuda()
    print(isd)

    criterion = KLD().cuda()

    params = [p for p in isd.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.learning_rate,
                                momentum=args.sgd_momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True

    args.start_epoch = 1
    if args.resume:
        print('==> resume from checkpoint: {}'.format(args.resume))
        ckpt = torch.load(args.resume)
        print('==> resume from epoch: {}'.format(ckpt['epoch']))
        isd.load_state_dict(ckpt['state_dict'], strict=True)
        optimizer.load_state_dict(ckpt['optimizer'])
        args.start_epoch = ckpt['epoch'] + 1

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        loss = train_student(epoch, train_loader, isd, criterion, optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # saving the model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'state_dict': isd.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            save_file = os.path.join(
                args.checkpoint_path,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            # help release GPU memory
            del state
            torch.cuda.empty_cache()
#coding=utf-8
import redis
import traceback

from tools import get_current_day, get_logger, error_record
from config import *

redis_db_log = get_logger('redis_db')
clear_redis_log = get_logger('clear_redis')


class redis_task(object):

    def __init__(self):
        self.R = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=0, password=REDIS_PWD)
        self.test = self.connection_test()

    def connection_test(self):
        try:
            self.R.set('test', 'test')
        except Exception:
            error_record('101')
            redis_db_log.warning(
                '101:Can not establish a connection to local redis DB')

    def save_task_to_redis(self, task):
        """save task to todo list"""
        if self.R.dbsize() >= REDIS_AMOUNT:
            return 'full'
        try:
def main(param_file=None): # setup p, base_path, output_dir = tools.setup(param_file) logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log")) logger.info("running %s" % ' '.join(sys.argv)) # initializations articles = {} all_missing = [] redir_on = {} collisions = {} non_ascii = [] site = mwclient.Site('en.wikipedia.org', '/w/api.php/') # get all txt files in a folder and iterate over them filelist = glob.glob(os.path.join(base_path, p['folder_path'], "*.txt")) for f in filelist: # get the word we are working on f_name = os.path.basename(f) k_word = os.path.splitext(f_name)[0] logger.info("working on file: %s" % f_name) # try to convert the word into ascii for the http query file_obj = codecs.open(f, "r", "utf-16") counter = 0 words = [] for w in file_obj.readlines(): try: s = w.strip().decode('ascii') words.append(s) except Exception: counter += 1 non_ascii.append(w.strip()) logger.info("\t%d words containing non ascii are ommited" % counter) articles[k_word] = {} logger.info("\tfound %d words in file" % len(words)) for word in words: data = {} page = site.Pages[word] # follow the redirect and check for collisions if page.redirect: res = re.search('\[\[(.+)\]\]', page.edit()) redir_word = urllib.unquote(res.groups()[0]) if redir_word in redir_on: logger.warning("[%s AND %s] both redirect on --> %s" % (word, redir_on[redir_word], redir_word)) collisions[redir_word] = redir_on[redir_word] else: logger.info("[%s] redir from [%s]" % (redir_word, word)) redir_on[redir_word] = word text = site.Pages[redir_word].edit() data['redirected'] = redir_word else: text = page.edit() # check for missing wikipedia articles if text == "": all_missing.append(word) continue # preprocess the received article data['text'] = wikicorpus.filter_wiki(text) in_ascii = ud.normalize('NFKD', data['text']).encode('ascii', 'ignore') data['text'] = preprocess_string(in_ascii) articles[k_word][word] = data logger.info('add human rating to the articles') id_word = {} sparql_path = os.path.join(base_path, p['sparql_path']) with open(os.path.join(sparql_path, 'id_word.txt')) as f: for line in f.readlines(): idx, word = line.strip().split('\t') id_word[idx] = word #add human rating to the wikipedia data not_found = [] with open(os.path.join(sparql_path, p['human_file'])) as f: for line in f.readlines(): arr = line.split() word = id_word[arr[0]] term = arr[3] try: articles[word][term]['rating'] = int(arr[4]) except KeyError: not_found.append(term) logger.info("%d words from the ref queries not found" % len(not_found)) f = open(os.path.join(output_dir, "articles.pickle"), 'wb') pickle.dump(articles, f) f.close info = {} info['missing'] = all_missing info['redirs'] = redir_on info['collisions'] = collisions info['not_found'] = not_found info['non_ascii'] = non_ascii f = open(os.path.join(output_dir, "info.pickle"), 'wb') pickle.dump(info, f) f.close logger.info("%d redirecting collisions (see info.pkl)" % len(collisions))
# -*- coding: utf-8 -*-
import Image
import tools
import os

logger = tools.get_logger(__name__)


class HandlingImage(object):

    def __init__(self, path, folder_out_title, last_index):
        self.path = path
        self.folder_out_title = folder_out_title
        self.last_index = last_index

    def execute(self, index):
        size_thumb = (160, 120)
        size_light = (1024, 768)
        image = Image.open(self.path)
        filename = str(self.last_index + index).zfill(5)
        image.save(os.path.join(self.folder_out_title, filename + ".jpg"), "JPEG")
        image_light = image.copy()
        image_light.thumbnail(size_light, Image.ANTIALIAS)
        image_light.save(os.path.join(self.folder_out_title, filename + "_light.jpg"), "JPEG")
        image_thumb = image.copy()
        image_thumb.thumbnail(size_thumb, Image.ANTIALIAS)
        image_thumb.save(os.path.join(self.folder_out_title, filename + "_thumb.jpg"), "JPEG")
        logger.info(u"Media %s=>%s processed successfully" % (self.path, filename))
from tools import get_redis, get_logger
import defaultsettings

# built-in
import sys
try:
    import cPickle as pickle
except ImportError:
    import pickle
import time

# need import gevent
reload(sys)
sys.setdefaultencoding('utf-8')

logger = get_logger(__name__)
redis_conn = get_redis()


class EmptyError(Exception):
    pass


def log_push(func):
    def _log(request, spider):
        func(request, spider)
        logger.debug('Push request<%s> into queue', request)
    return _log


class RequestQueue(object):
def __init__(self):
    self.logger = get_logger('S3Upload')
    self.minio = Minio(s3_server, access_key=s3_access_key,
                       secret_key=s3_secret_key, secure=True)
def main(param_file=None): # setup p, base_path, output_dir = tools.setup(param_file) logger = tools.get_logger('gensim', path.join(output_dir, "run.log")) logger.info("running %s" % ' '.join(sys.argv))
# pylint: disable=E1101, E1103, W0632
import collections
import itertools
import operator

import numpy as np
import pandas as pd
from numpy import linalg
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

import tools
import ranking_constants

LOGGER = tools.get_logger(__name__)


@tools.timeit
def load_data(kpi):
    """
    Load data from the metrics csv files and hammer file.
    :param kpi: name of the kpi to analyze
    :return: dataframe containing data
    """
    LOGGER.info("Loading hammer data")
    # the first three lines do not contain meaningful data: they are dropped
    statistics = pd.read_csv(
        ranking_constants.CSV_FILES["hammer_statistics"]
    ).iloc[3:, :].set_index("timestamp")
def __init__(self, vinfo):
    super().__init__(None)
    self.vinfo = vinfo
    self.vid = None
    self.db = Database('Queues')
    self.logger = get_logger('YoutubeTemp')
def __init__(self):
    self.API = BilibiliAPI()
    self.logger = get_logger('Bilibili')
    self.old_video_num = None
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path, p['corpus_path'], p['dict_name']))
    Dictionary.save(dictionary, path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow, id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow], id2word=dictionary,
                          num_topics=p['num_topics'])
    lsi.save(os.path.join(output_dir, p['lsi_extension']))
    logger.info('finished --> lsi model saved to: %s' %
                os.path.join(output_dir, p['lsi_extension']))

    # check for correlation with lee human data
    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text, allow_update=False, return_missing=False)
                     for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (LSI)')
    corpus_lsi = lsi[pre_model[bow_lee_texts]]

    # compute pairwise similarity matrix of transformed corpus
    sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
from settings import IDC_TAG
from settings import SALT_CHECK
from consul import consul
from tools import get_logger, switch
import getinfo
import sys
import os
import requests
import re
import fire
import simplejson

scmd = {'sh': '/bin/sh', 'py': '/usr/local/bin/python'}
logger = get_logger('Jenkins publish', '/www/logs/', True)
_, upstreams = getinfo.main()


def getHostname(ip):
    res = requests.get(CMDB + ip)
    hosts = res.json()
    if not hosts:
        logger.error('%s: cant find the host.' % ip)
        sys.exit(4)
    hostnames = []
    for h in hosts:
        hname = h.get('hostname')
        email = h.get('email')
        if IDC_TAG not in hname:
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    model_path = path.join(base_path, p['result_path'], p['model_label'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # train the model on the small marketing corpus
    preprocess = []
    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist'])).readlines()
        stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]

        def remove_stopwords(sentence):
            return [word for word in sentence if not word in stoplist]
        preprocess.append(remove_stopwords)
    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    if not p['model_label']:
        cor = TextFilesCorpus(path.join(base_path, p['corpus_path']),
                              no_below=p['no_below'],
                              no_above=p['no_above'],
                              preprocess=preprocess)
        dictionary = cor.dictionary
        pre = LogEntropyModel(cor, id2word=dictionary, normalize=True)
        lsi = LsiModel(pre[cor], id2word=dictionary, num_topics=p['num_topics'])
    else:
        dictionary = Dictionary.load(path.join(model_path, p['dict_name']))
        pre = SaveLoad.load(path.join(model_path, 'pre.model'))
        lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
        lsi.num_topics = p['num_topics']

    test_cor_path = path.join(base_path, p['test_cor_path'])
    test_answers, gold_answers, ratings = [], [], []

    flist = glob.glob(path.join(test_cor_path, 'corpus_3', '*.txt'))
    for file in flist:
        match = re.search(r'data3_(\d)_\d+.txt', file)
        ratings.append(int(match.group(1)))
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            test_answers.append(corpus)

    flist = glob.glob(path.join(test_cor_path, 'corpus_3_golden', '*.txt'))
    for file in flist:
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            gold_answers.append(corpus)

    sim = MatrixSimilarity(test_answers)[gold_answers]
    mean_sim = np.mean(sim, axis=0)
    print 'pearsons corrcoef: %f' % np.corrcoef(ratings, mean_sim)[0, 1]
    print 'spearmans r: %f with p: %f' % stats.spearmanr(ratings, mean_sim)
# define what should happen when a point is picked
def onpick(event):
    plt.subplot(2, 1, 1)
    event.artist.figure.axes[0].texts = []
    plt.annotate(event.artist.name, (event.artist._x, event.artist._y))

# setup
p = build_parameters(sys.argv[1])
result_path = path.join(p['base_path'], p['result_path'])
output_dir = path.join(result_path, p['sumatra_label'])
if not path.exists(output_dir):
    os.mkdir(output_dir)
logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
logger.info("running %s" % ' '.join(sys.argv))

data = pickle.load(open(path.join(result_path, p['data_label'], 'data.pickle')))

for key, val in data.iteritems():
    # for bla in [1]:
    #     key, val = 'eagle', data['eagle']
    fig = plt.figure()
    fig.canvas.mpl_connect('pick_event', onpick)

    plt.subplot(3, 1, 1)
    plt.title(key)
def __init__(self, target_id):
    super().__init__(target_id)
    self.logger = get_logger('Openrec')
    self.module = 'Openrec'
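
# Every snippet above assumes a project-local get_logger helper, and the
# signatures vary by project: some take only a name, some a name plus a log
# file path (and verbosity), and the training scripts pass logpath=/filepath=
# keywords. None of those helpers are shown in this section, so the sketch
# below is not any of them; it is a minimal, hypothetical name-plus-logfile
# variant built only on the standard library, for orientation.
import logging
import sys


def get_logger(name, logfile=None, verbosity='INFO'):
    """Minimal sketch of a get_logger helper; illustrative, not from any project above."""
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, verbosity.upper(), logging.INFO))
    if not logger.handlers:  # avoid stacking handlers on repeated calls
        formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s: %(message)s')
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)
        if logfile is not None:
            file_handler = logging.FileHandler(logfile)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)
    return logger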