def test():
    logger = loghelper.get_logger('wrf_forecast')
    #logger.debug('running test')
    template = '/home/slha/code/wrftools/devel/queue/template.sge'
    job_script = '/home/slha/code/wrftools/devel/queue/job.sge'
    executable = '/home/slha/forecasting/development/run/wrf.exe'
    run_dir = '/home/slha/forecasting/development/run'
    jobname = 'WRF'
    qname = 'all.q'
    nprocs = 8

    replacements = {'<executable>': executable,
                    '<jobname>': jobname,
                    '<qname>': qname,
                    '<nprocs>': nprocs}

    fill_template(template, job_script, replacements)
    os.chdir(run_dir)
    job_id = qsub(job_script)

    for i in range(3):
        status = qstat(job_id)
        print status
        if status is None:
            print 'job not in queue, presume complete'
            break
        if 'E' in status:
            raise QueueError('job %s has queue status of %s' % (job_id, status))
        time.sleep(5)
def _filter(frame, variables=None, dimspec=None, log_name=LOGGER):
    logger = loghelper.get_logger(log_name)

    # filter by variables
    if variables:
        use_var = map(str, variables)
        logger.debug("filtering on variable: %s" % str(use_var))
        frame = frame[frame['variable'].isin(use_var)]
        logger.debug("%d rows" % len(frame))

    # filter by location
    if dimspec and 'location' in dimspec:
        use_loc = map(str, dimspec['location'])
        logger.debug("filtering on location: %s" % str(use_loc))
        frame = frame[frame['location'].isin(use_loc)]
        logger.debug("%d rows" % len(frame))

    # filter by height. How do we treat surface here?
    if dimspec and 'height' in dimspec:
        use_hgt = dimspec['height']
        use_hgt = [HGT2DNUM if h == HGT2DSTR else h for h in use_hgt]
        logger.debug("filtering on height: %s" % str(use_hgt))
        ind = frame['height'].isin(use_hgt)
        frame = frame[ind]
        logger.debug("%d rows" % len(frame))

    return frame
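# A minimal usage sketch (not from the source): _filter() expects a molten
# frame with 'variable', 'location' and 'height' columns. The values below
# are illustrative only.
def _filter_example():
    frame = pd.DataFrame({'variable': ['SPEED', 'DIRECTION'],
                          'location': ['UTH', 'UTH'],
                          'height': [70, 70],
                          'value': [8.7, 352.0]})
    # keep only the SPEED rows at height 70
    return _filter(frame, variables=['SPEED'], dimspec={'height': [70]})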
def ncdump(config):
    logger = loghelper.get_logger(config['log.name'])

    # subset of config to be used for expanding filenames
    scope = {'init_time': config['init_time'],
             'grid_id': config['grid_id']}

    for name, entry in config['ncdump'].items():
        logger.debug("processing entry %s " % name)
        if config.get('<files>'):
            files = config['<files>']
            if type(files) != type([]):
                files = [files]
        else:
            tseries_files = expand(entry['tseries_file'], config)
            logger.debug("expanding file list from pattern and init time")
            #logger.debug(tseries_files)
            files = glob.glob(tseries_files)
            logger.debug("found %d files" % len(files))
        dump(files, entry, scope, log_name=config['log.name'])
def dispatch_entry(config, entry, dry_run=None, log_name=LOGGER):
    """Dispatches one entry of the distribution list"""
    logger = loghelper.get_logger(log_name)
    address = expand(entry['mailto'], config)
    subject = expand(entry['subject'], config)
    body = expand(entry['body'], config)
    from_addr = expand(entry['from'], config)
    attachments = [expand(a, config) for a in entry['attach']]
    logger.debug('dispatch_entry() called')

    if type(attachments) == type([]):
        a_arg = ' '.join(['-a %s' % a for a in attachments])
    else:
        a_arg = '-a %s' % attachments

    if 'cc' in entry:
        cc_arg = '-c %s' % entry['cc']
    else:
        cc_arg = ''

    if 'content_type' in entry:
        ct_arg = '-e "my_hdr Content-Type: :%s"' % entry['content_type']
    else:
        ct_arg = ''

    cmd = """EMAIL="%s" mutt %s -s"%s" %s %s -- %s < %s """ % (from_addr, ct_arg, subject, a_arg, cc_arg, address, body)
    logger.debug(cmd)
    logger.debug(dry_run)
    if not dry_run:
        subprocess.call(cmd, shell=True)
def dispatch(config):
    if __name__ == "__main__":
        logger = loghelper.create_logger(config)
    else:
        logger = loghelper.get_logger(config['log.name'])

    dist = config['dispatch.list']
    logger.info("dispatch.py sending files via email")
    dry_run = config['dry_run']

    for name, entry in dist.items():
        logger.info("dispatching files for entry: %s" % name)
        dispatch_entry(config, entry, dry_run=dry_run, log_name=config['log.name'])
def write_csv_files(frame, out_dir, out_name, variables, dimspec, drop, values, rows, cols,
                    sort_by=None, rename=None, float_format='%0.3f', na_rep="", log_name=LOGGER):
    """Writes each variable and height into a separate column.
    Columns will be labelled variable_height, where height is formatted as %03d int(height).

    Takes as input a DataFrame in a record-based format, e.g.
    init_time, valid_time, height, location, variable, units, value."""

    logger = loghelper.get_logger(log_name)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # drop columns first, this will cause problems later if someone else wants to use it!
    #if drop:
    #    for col in drop:
    #        del(frame[col])
    #logger.debug(frame)

    # drop columns by subsetting to create a view
    if drop:
        _drop(frame, drop)

    # subset based on variable, location, height
    frame = _filter(frame, variables, dimspec, log_name)

    #frame = frame.set_index(['init_time','valid_time'])
    logger.debug(frame)
    logger.debug("about to pivot")
    logger.debug("values: %s" % values)
    logger.debug("rows: %s" % rows)
    logger.debug("cols: %s" % cols)

    frame = pd.pivot_table(frame, values=values, rows=rows, cols=cols)
    frame = frame.reset_index()

    if sort_by:
        frame.sort(sort_by, inplace=True)

    if rename:
        frame = _rename(frame, rename)

    logger.debug("outputting csv file: %s/%s " % (out_dir, out_name))
    logger.debug(na_rep)
    frame.to_csv('%s/%s' % (out_dir, out_name), index=False, float_format=float_format, na_rep=na_rep)
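# Hedged usage sketch (not from the source): pivot a molten frame into a CSV
# with one column per (variable, height) pair; the path and row/col choices
# here are illustrative assumptions.
def _write_csv_example(frame):
    write_csv_files(frame, '/tmp', 'tseries.csv',
                    variables=['SPEED'], dimspec={'height': [70]},
                    drop=None, values='value',
                    rows=['init_time', 'valid_time', 'location'],
                    cols=['variable', 'height'])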
def dump(files, entry, scope, log_name=LOGGER):
    logger = loghelper.get_logger(log_name)

    vars = entry['tseries_vars']
    global_atts = entry['global_atts']
    var_atts = entry['var_atts']
    coord_vars = entry['coord_vars']
    format = entry['format'].strip()
    #logger.warn("subsetting at read time is not implemented")

    # Read all data into memory as pandas Series objects
    logger.debug("ncdump called with arguments")
    logger.debug("\t files: %s" % str(files))
    logger.debug("\t vars: %s" % str(vars))
    logger.debug("\t global_atts: %s" % str(global_atts))
    logger.debug("\t var_atts: %s" % str(var_atts))
    logger.debug("\t coord_vars: %s" % str(coord_vars))
    logger.debug("\t log_name: %s" % str(log_name))

    for file in files:
        logger.debug(file)
        frame = frame_from_nc([file], vars, global_atts, var_atts, coord_vars, log_name)

        if format not in FORMATS:
            logger.error("format %s not understood" % format)
            raise UnknownFormat("format not understood")

        if format == 'txt':
            pass
            #write_txt_files(frame, entry['dir'], entry['dimspec'], log_name)
        elif format == 'json':
            write_json_files(frame, entry['dir'], expand(entry['fname'], scope),
                             entry['tseries_vars'], entry['dimspec'], entry['drop'],
                             entry['rename'], entry['float_format'], log_name)
        elif format == 'csv':
            write_csv_files(frame, entry['dir'], expand(entry['fname'], scope),
                            entry['tseries_vars'], entry['dimspec'], entry['drop'],
                            values='value', rows=entry['rows'], cols=entry['cols'],
                            sort_by=entry['sort_by'], rename=entry['rename'],
                            float_format=entry['float_format'], na_rep=entry['na_rep'],
                            log_name=log_name)
        elif format == 'aot':
            write_aot_files(frame, entry['dir'])
def qsub(job_script):
    """Submits a PBS job via qsub

    Arguments:
        @job_script -- full path to a pbs job script file

    Returns:
        @job_id -- the job id returned by the PBS system"""

    logger = loghelper.get_logger('wrf_forecast')
    #logger.debug('submitting job %s' % job_script)
    cmd = 'qsub %s ' % job_script
    #
    # The output from PBS is of the format
    # "Your job 3681 ("TEST") has been submitted"
    #
    proc = subprocess.Popen([cmd], stdout=subprocess.PIPE, shell=True)
    output = proc.stdout.read()
    job_id = output.split(' ')[2]
    logger.debug("%s ------> %s" % (cmd, job_id))
    return job_id
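# Usage sketch (assumption, not from the source): submit a script and poll the
# returned id with qstat(), mirroring test() above. The path is illustrative.
def _qsub_example(job_script='/tmp/job.sge'):
    job_id = qsub(job_script)   # e.g. '3681'
    status = qstat(job_id)      # None once the job has left the queue
    return job_id, status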
# -*- coding: utf-8 -*-
import os, sys
import datetime
import time

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, db

#logger
loghelper.init_logger("openapi_funding", stream=True)
logger = loghelper.get_logger("openapi_funding")


def add_new_created_fundings():
    today = datetime.datetime.now().date()
    date1 = today + datetime.timedelta(days=-30)
    date2 = today + datetime.timedelta(days=-365 * 2)
    logger.info("today: %s, date1: %s, date2: %s", today, date1, date2)
    conn = db.connect_torndb()
    fundings = conn.query(
        "select * "
        "from funding "
        "where companyId is not null and "
        "createTime>=%s and "
        "("
        "(publishDate is not null and publishDate>=%s) "
        "or "
        "(publishDate is null and fundingDate>=%s)"
        ")", today, date1, date2)
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

import time

DATE = None

# logger
loghelper.init_logger("crawler_feixiaohao_marketdata", stream=True)
logger = loghelper.get_logger("crawler_feixiaohao_marketdata")


def save_marketdata(content):
    fileName = 'file/market_data_%s.xls' % datetime.datetime.now().strftime(
        "%Y-%m-%d%H:%M:%S")
    path = os.path.join(os.path.split(os.path.realpath(__file__))[0], fileName)
    logger.info('saving file:%s', path)
    with open(path, "wb") as file:
        file.write(content)
    return fileName


def run(crawler):
def frame_from_nc(ncfiles, vars, global_atts, var_atts, coord_vars, log_name):
    """Build a Pandas DataFrame from a series of netcdf files"""

    logger = loghelper.get_logger(log_name)
    frames = []

    # Open files one-by-one
    for f in ncfiles:
        logger.debug("reading: %s" % f)
        dataset = Dataset(f, 'r')
        #logger.debug(dataset)
        variables = dataset.variables

        # lookup global attributes in dataset
        # shouldn't really use this, but it works
        dataset_atts = dataset.__dict__

        # if no vars specified, use all in ncfiles
        if vars == None:
            vars = list(variables.keys())

        # get coordinate variables
        time = variables['time']
        datetimes = num2date(time, units=time.units, calendar=time.calendar)
        ntime = len(datetimes)
        init_time = datetimes[0]

        # hack to catch thanet files which have location_id rather than location
        try:
            location = variables['location']
        except KeyError:
            location = variables['location_id']

        # Unmask string and strip, convert from unicode to string
        nloc = location.shape[0]
        loc_masked = np.ma.array(location)
        loc_id_raw = [''.join(loc_masked[l, :].filled('')) for l in range(nloc)]
        location = map(string.strip, loc_id_raw)
        location = map(str, location)

        height = variables['height']
        nheight = len(height)

        varnames = [v for v in vars if v not in coord_vars]
        vars2D = [v for v in varnames if len(variables[v].shape) == 2]
        vars3D = [v for v in varnames if len(variables[v].shape) == 3]

        # can't really avoid nested loop here without making code unintelligible
        for v in vars2D:
            for l in range(nloc):
                # create dataframe then append columns; avoids copying each series
                df = pd.DataFrame(datetimes, index=range(len(datetimes)), columns=['valid_time'])
                df['init_time'] = init_time
                df['location'] = location[l]  # this creates an object-dtype column
                df['location'] = df['location'].astype(str)
                df['height'] = HGT2DNUM
                df['variable'] = v
                df['value'] = variables[v][:, l]
                for att in global_atts:
                    df[str(att)] = dataset_atts[att]
                for att in var_atts:
                    df[str(att)] = variables[v].getncattr(att)
                frames.append(df)

        for v in vars3D:
            for l in range(nloc):
                for h in range(nheight):
                    # create dataframe then append columns; avoids copying each series
                    df = pd.DataFrame(datetimes, index=range(len(datetimes)), columns=['valid_time'])
                    df['init_time'] = init_time
                    df['location'] = location[l]
                    df['height'] = height[h]
                    df['variable'] = v
                    df['value'] = variables[v][:, l, h]
                    for att in global_atts:
                        df[str(att)] = dataset_atts[att]
                    for att in var_atts:
                        df[str(att)] = variables[v].getncattr(att)
                    frames.append(df)
        dataset.close()

    df = pd.concat(frames)
    cols = df.columns

    # re-order the columns for cleaner output
    pre_cols = ['init_time', 'valid_time', 'location']
    data_cols = [c for c in cols if c not in pre_cols]
    new_cols = pre_cols + data_cols
    df = df[new_cols]
    df.index = range(len(df))
    return df
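# Usage sketch (assumption): read one time-series file into a molten frame,
# carrying the GRID_ID global attribute and each variable's units along.
# The filename is illustrative.
def _frame_from_nc_example():
    return frame_from_nc(['/tmp/tseries.nc'], None,
                         ['GRID_ID'], ['units'],
                         ['time', 'location', 'height'], LOGGER)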
# -*- coding: utf-8 -*-
import os, sys
import datetime

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, db

#logger
loghelper.init_logger("patch_ether_fa", stream=True)
logger = loghelper.get_logger("patch_ether_fa")


def main():
    utokens = {}
    mongo = db.connect_mongo()
    items = list(
        mongo.xiniudata.user_cookie.find({
            "type": "utoken",
            "active": 'Y'
        }))
    for item in items:
        userCookie = item["userCookie"]
        utokenUserId = item["utokenUserId"]
        if utokens.has_key(userCookie):
            logger.info(userCookie)
        else:
from email.mime.text import MIMEText
from email.header import Header
from email.utils import formataddr
import requests, json
from aliyun_monitor import AliyunMonitor

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, util, db, config

#logger
loghelper.init_logger("email_send_patch", stream=True)
logger = loghelper.get_logger("email_send_patch")


def merge_users(to_list, from_list):
    for user in from_list:
        exist = False
        for u in to_list:
            if user["id"] == u["id"]:
                exist = True
        if exist is False:
            to_list.append(user)


if __name__ == "__main__":
    cnt = 0
    conn = db.connect_torndb()
import random

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../support'))
import loghelper, db
import proxy_pool

#logger
loghelper.init_logger("BaseCrawler", stream=True)
logger = loghelper.get_logger("BaseCrawler")

#mongo
#mongo = db.connect_mongo()
#collection = mongo.raw.projectdata


class RedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        pass

    def http_error_302(self, req, fp, code, msg, headers):
        pass


class BaseCrawler:
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import GlobalValues

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db

#logger
loghelper.init_logger("crawler_36kr_company2", stream=True)
logger = loghelper.get_logger("crawler_36kr_company2")

TYPE = 36001
SOURCE = 13022
URLS = []
CURRENT_PAGE = 1
linkPattern = "/article/\d+"
Nocontents = []
columns0 = [
    {
        "column": None,
        "max": 3
    },
    {
        "column": "FARMING",
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import config
import loghelper
import name_helper
import db

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import aggregator_db_util
import helper

#logger
loghelper.init_logger("company_aggregator_baseinfo", stream=True)
logger = loghelper.get_logger("company_aggregator_baseinfo")

#
mongo = db.connect_mongo()
collection = mongo.trend.android
collection_alexa = mongo.trend.alexa
gongshang = mongo.info.gongshang


def get_company_code(name, test=False):
    table_names = helper.get_table_names(test)
    conn = db.connect_torndb()
    if len(name) < 8:
        pinyin = lazy_pinyin(name.decode('utf-8'))
        company_code = ''.join(pinyin)
    else:
        pinyin = lazy_pinyin(name.decode('utf-8'), style=pypinyin.INITIALS)
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db, extract

# logger
loghelper.init_logger("crawler_szse_an", stream=True)
logger = loghelper.get_logger("crawler_szse_an")


class AnnounceCrawler(BaseCrawler.BaseCrawler):
    def __init__(self, timeout=20):
        BaseCrawler.BaseCrawler.__init__(self, timeout=timeout)

    # implements the base-class hook
    def is_crawl_success(self, url, content):
        try:
            res = content.replace('var szzbAffiches=', '')[:-2]
            # logger.info(res)
            contentnew = eval(res.decode("gbk").strip())
            logger.info(contentnew)
            if len(contentnew) > 0:
import os, sys
import time, datetime
import traceback
import requests
import json

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, db

#logger
loghelper.init_logger("patch_wechat_unionid", stream=True)
logger = loghelper.get_logger("patch_wechat_unionid")

appid = "wx766854150052d912"
appsecret = "d4fc5ea387e938c7641dd434a4d7a891"
ACCESS_TOKEN = None  # valid for 7200 seconds; developers must cache access_token globally in their own service
ACCESS_TOKEN_TIME = 0


def refreshToken():
    global ACCESS_TOKEN, ACCESS_TOKEN_TIME
    if ACCESS_TOKEN is None or ACCESS_TOKEN_TIME + 7000 < time.time():
        print "get ACCESS_TOKEN and JSAPI_TICKET"
        ACCESS_TOKEN_TIME = time.time()
        url = "https://api.weixin.qq.com/cgi-bin/token?grant_type=client_credential&appid=%s&secret=%s" % (
            appid, appsecret)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import db
import name_helper
import config
import image_helper
import url_helper

#logger
loghelper.init_logger("card_v1", stream=True)
logger = loghelper.get_logger("card_v1")


# parse data from qimingpian directly; bamy called it step 1, to check out the company
def find_companies_by_full_name_corporate(full_names, idmax=0):
    companyIds = []
    for full_name in full_names:
        if full_name is None or full_name == "":
            continue
        # full_name = name_helper.company_name_normalize(full_name)
        conn = db.connect_torndb()
        corporate_aliases = conn.query(
            "select a.* from corporate_alias a join corporate c on c.id=a.corporateId where "
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, traceback_decorator

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import proxy_pool

# logger
loghelper.init_logger("crawler_forbes_news", stream=True)
logger = loghelper.get_logger("crawler_forbes_news")

NEWSSOURCE = "forbes"
RETRY = 3
TYPE = 60004
SOURCE = 13883
URLS = []
CURRENT_PAGE = 1
# https://www.avcj.com/avcj/news/3010878/gic-advises-caution-predicts-rising-global-volatility
linkPattern = "www.pintu360.com/a\d+.html"
Nocontents = [
]
columns = [
    {"column": "innovation", "max": 1, 'sourceValue': 'channel_74'},
    {"column": "entrepreneurs", "max": 1, 'sourceValue': 'channel_4'},
    {"column": "small-business", "max": 1, 'sourceValue': 'channel_21'},
def power(config):
    """Reads 'time series' from netcdf time series file, and adds power as a variable."""

    if __name__ == "__main__":
        logger = loghelper.create_logger(config)
    else:
        logger = loghelper.get_logger(config['log.name'])

    # listify ensures they are returned as a list, even if it is one file
    files = shared._listify(config['<files>'])

    # Number of samples to use should be in here
    # Whether to normalise power should be in here
    start = config.get('start')
    delay = config.get('delay')
    cycles = shared._listify(config.get('cycles'))
    pnorm = config.get('pnorm')
    pdist = config.get('pdist')
    sstd = config.get('sstd')
    dstd = config.get('dstd')
    pquants = config.get('pquants')
    quantiles = np.array(pquants)
    pcurve_dir = config.get('pcurve-dir')
    ts_dir = config.get('tseries-dir')
    out = config.get('out')
    metadata = config.get('metadata')

    basetime = start if start else datetime.datetime.today()
    prior = shared._prior_time(basetime, delay=delay, hours=cycles)
    logger.debug("using %s as a start time" % prior)

    if not files:
        logger.debug("no files specified, finding using options")
        file_pattern = config.get('file-pattern')
        if not file_pattern:
            raise ConfigError('either supply files or specify file-pattern')
        expanded = substitute.sub_date(file_pattern, init_time=prior)
        files = glob.glob(expanded)
        logger.debug(files)

    # if we get to this point and there are still no files, then we have a problem
    if not files:
        raise IOError("no files found")

    logger.debug("input files: ")
    logger.debug(files)
    for f in files:
        logger.debug("\t%s" % f)

    # if pdist
    if pdist:
        n = pdist

    #grid_id = config['grid_id']
    out_pattern = config.get('out')

    for tseries_file in files:
        dataset_in = Dataset(tseries_file, 'a')

        # Get dimensions
        dims = dataset_in.dimensions
        nreftime = len(dims['reftime'])
        ntime = len(dims['leadtime'])
        nloc = len(dims['location'])
        nheight = len(dims['height'])
        loc_str_len = len(dims['loc_str_length'])

        # Get coordinate variables
        reftime = dataset_in.variables['reftime']
        leadtime = dataset_in.variables['leadtime']
        validtime = nctools._valid_time(reftime, leadtime)
        refdt = num2date(reftime[:], reftime.units)

        power_file = substitute.sub_date(out, init_time=refdt[0])

        logger.info('Estimating power from time series: %s ' % tseries_file)
        logger.info('Writing power time series to: %s ' % power_file)

        location = [''.join(l.filled(' ')).strip() for l in dataset_in.variables['location']]
        height = dataset_in.variables['height']

        if power_file == tseries_file:
            dataset_out = dataset_in
        else:
            dataset_out = Dataset(power_file, 'w')

        # Get number of quantiles
        nq = len(quantiles)
        pdata = np.ma.zeros((ntime, nloc, nheight, nq + 1), np.float)  # mean will be 1st value

        use_locs = []
        # loop through locations and look for power-curve file
        for l, loc in enumerate(location):
            pcurve_file = '%s/%s.csv' % (pcurve_dir, loc)

            # mask power data if no power curve found for this park
            if not os.path.exists(pcurve_file):
                #logger.debug("Power curve: %s not found, skipping" % pcurve_file)
                pdata[:, l, :, :] = np.ma.masked
                continue

            logger.info('Predicting power output for %s' % loc)

            #
            # Open power curve
            #
            use_locs.append(l)
            pcurve = from_file(pcurve_file)
            for h in range(nheight):
                speed = dataset_in.variables['SPEED'][0, :, l, h]
                direction = dataset_in.variables['DIRECTION'][0, :, l, h]
                #pwr = pcurve.power(speed,direction)

                # pdist will create a distribution for each timestep based on sampling
                # n times from a normal distribution.
                pdist = pcurve.power_dist(speed, direction, sstd=sstd, dstd=dstd, n=n, normalise=pnorm)
                pmean = np.mean(pdist, axis=1)
                pquants = scipy.stats.mstats.mquantiles(pdist, prob=quantiles / 100.0, axis=1, alphap=0.5, betap=0.5)

                pdata[:, l, h, 0] = pmean
                pdata[:, l, h, 1:] = pquants[:, :]

            #logger.info('finished %s' % loc)

        use_inds = np.array(use_locs)

        if dataset_out != dataset_in:
            dataset_out.createDimension('reftime', None)
            dataset_out.createVariable('reftime', 'float', ('reftime',))
            dataset_out.variables['reftime'][:] = reftime[:]
            dataset_out.variables['reftime'].units = reftime.units
            dataset_out.variables['reftime'].calendar = reftime.calendar
            dataset_out.variables['reftime'].long_name = reftime.long_name
            dataset_out.variables['reftime'].standard_name = reftime.standard_name

            dataset_out.createDimension('leadtime', len(leadtime))
            dataset_out.createVariable('leadtime', 'int', ('leadtime',))
            dataset_out.variables['leadtime'][:] = leadtime[:]
            dataset_out.variables['leadtime'].units = leadtime.units
            dataset_out.variables['leadtime'].long_name = leadtime.long_name
            dataset_out.variables['leadtime'].standard_name = leadtime.standard_name

            dataset_out.createDimension('location', len(use_locs))
            dataset_out.createDimension('loc_str_length', loc_str_len)
            loc_data = np.array([list(l.ljust(loc_str_len, ' ')) for l in location])
            dataset_out.createVariable('location', 'c', ('location', 'loc_str_length'))
            dataset_out.variables['location'][:] = loc_data[use_inds, :]

            dataset_out.createDimension('height', nheight)
            dataset_out.createVariable('height', 'i', ('height',))
            dataset_out.variables['height'][:] = height[:]
            dataset_out.GRID_ID = dataset_in.GRID_ID
            dataset_out.DX = dataset_in.DX
            dataset_out.DY = dataset_in.DY

            try:
                dataset_out.variables['height'].units = height.units
            except Exception:
                logger.warn("height units missing")

            pdata = pdata[:, use_inds, :, :]
            for key in metadata.keys():
                key = key.upper()
                dataset_out.setncattr(key, dataset_in.getncattr(key))

        pavg = dataset_out.createVariable('POWER', 'f', ('reftime', 'leadtime', 'location', 'height'))
        pavg.units = 'kW'
        pavg.description = 'forecast power output'
        pavg[0, :, :, :] = pdata[:, :, :, 0]

        for q, qval in enumerate(quantiles):
            varname = 'POWER.P%02d' % qval
            var = dataset_out.createVariable(varname, 'f', ('reftime', 'leadtime', 'location', 'height'))
            if pnorm:
                var.units = 'ratio'
            else:
                var.units = 'kW'
            var.description = 'forecast power output'
            var[0, :, :, :] = pdata[:, :, :, q + 1]

        #logger.debug(dataset_out)
        dataset_in.close()
        if dataset_out != dataset_in:
            dataset_out.close()
def write_json_files(frame, out_dir, out_name, variables, dimspec, drop, rename=None,
                     float_format="%0.3f", log_name=LOGGER):
    """Writes each variable and init_time series into one json file.
    If vars is None, then export all variables."""

    logger = loghelper.get_logger(log_name)
    logger.info("*** outputting data as json ***")

    # drop columns by subsetting to create a view
    if drop:
        _drop(frame, drop)

    # subset based on variable, location, height
    frame = _filter(frame, variables, dimspec, log_name)

    if rename:
        frame = _rename(frame, rename)

    # Bit of a hack to ease output formatting, convert init_time to string
    frame['init_time'] = frame['init_time'].apply(str)

    # we need to group by everything except valid time and value
    group_by = [c for c in frame.columns if c not in ["valid_time", "value"]]
    gb = frame.groupby(group_by)

    # Convert time to milliseconds since epoch
    convert = lambda t: time.mktime(t.timetuple()) * 1000

    series = []
    for name, group in gb:
        #logger.debug("processing %s" % str(name))
        # create a dictionary from all the fields except valid time and value
        d = dict(zip(group_by, list(name)))

        timestamp = map(convert, group['valid_time'])
        values = group['value']
        mvals = np.ma.masked_invalid(np.array(values))
        data = [(timestamp[n], mvals[n]) for n in range(len(timestamp))]
        ldata = map(list, data)
        d['data'] = ldata
        s = str(d)

        # this is an ugly hack which could potentially lead to errors if " u'" occurs at the end of a string
        s = s.replace(" u'", " '")

        # change single quotes to double
        s = s.replace("'", '"')

        # replace masked values. Again, ugly
        s = s.replace('masked', 'null')

        series.append(s)

    json_str = ','.join(series)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    fout = open('%s/%s' % (out_dir, out_name), 'w')
    fout.write('[')
    fout.write(json_str)
    fout.write(']')
    fout.close()
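# Minimal sketch (assumption): write one variable's series to JSON; the
# arguments mirror write_csv_files() above and are illustrative only.
def _write_json_example(frame):
    write_json_files(frame, '/tmp', 'tseries.json',
                     variables=['SPEED'], dimspec={'height': [70]}, drop=None)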
def write_aot_files(frame, out_dir, log_name=LOGGER):
    """Writes files in the same format as AOT's existing supplier, which is:

    "Location","Date/time (utc)","Date/time (local)","Forecast (hours)","Windspeed 21m (m/sec)","Winddirection 21m (degrees)","Windspeed 70m (m/sec)","Winddirection 70m (degrees)","Windspeed 110m (m/sec)","Winddirection 110m (degrees)","Percentile 10 (m/sec) 70m","Percentile 20 (m/sec) 70m","Percentile 30 (m/sec) 70m","Percentile 40 (m/sec) 70m","Percentile 50 (m/sec) 70m","Percentile 60 (m/sec) 70m","Percentile 70 (m/sec) 70m","Percentile 80 (m/sec) 70m","Percentile 90 (m/sec) 70m
    "Thanet",2013-05-31 00:00,2013-05-31 02:00,0,7.80,351,8.70,352,8.93,352,7.27,7.83,8.20,8.52,8.70,8.97,9.27,9.66,10.16

    Arguments:
        @frame -- DataFrame of time-series records to process
        @out_dir -- directory to write output to
        @dimspec -- use this to restrict dimensions"""

    logger = loghelper.get_logger(log_name)

    # Format is too bespoke, just hard code it all here!

    # ensure sorted by init_time, valid_time, location
    frame.sort(['init_time', 'valid_time', 'location'], inplace=True)
    #init_time = frame.init_time[0]

    #
    # The AOT files require a local time, as well as UTC time. This requires a mapping between location
    # and timezone. The quickest way to do this is to hardcode this here. This is not very elegant or
    # extensible, but it works.
    #
    import pytz
    tz_map = {
        "FRE": "Europe/Amsterdam", "HAM": "Europe/Amsterdam", "DTK": "Europe/Amsterdam",
        "SNB": "Europe/Amsterdam", "AVS": "Europe/Amsterdam", "FN1": "Europe/Amsterdam",
        "FN3": "Europe/Amsterdam", "AMS": "Europe/Amsterdam", "NEZ": "Europe/Amsterdam",
        "ZDB": "Europe/Amsterdam", "RCN": "Europe/Amsterdam", "BEK": "Europe/Amsterdam",
        "DEB": "Europe/Amsterdam", "DKY": "Europe/Amsterdam", "DLN": "Europe/Amsterdam",
        "HGV": "Europe/Amsterdam", "LWN": "Europe/Amsterdam", "LYD": "Europe/Amsterdam",
        "SPL": "Europe/Amsterdam", "SVN": "Europe/Amsterdam", "VLK": "Europe/Amsterdam",
        "ZSN": "Europe/Amsterdam", "STO": "Europe/Amsterdam", "SLG": "Europe/Amsterdam",
        "YTS": "Europe/Amsterdam", "UTG": "Europe/Amsterdam", "EA1": "Europe/London",
        "EAZ": "Europe/London", "EDI": "Europe/London", "LON": "Europe/London",
        "UKF": "Europe/London", "UTH": "Europe/London", "UOR": "Europe/London",
        "UEB": "Europe/London", "WHM": "Europe/Amsterdam"}

    name_map = {
        "FRE": "Fredericia", "EDI": "Edinburgh", "LON": "London", "STO": "Stockholm",
        "HAM": "Hamburg", "AMS": "Amsterdam", "UKF": "Kentish Flats", "UTH": "Thanet",
        "UOR": "Ormonde", "SLG": "Lillgrund", "DTK": "Dan Tysk", "SNB": "Sand Bank",
        "UEB": "Edinbane", "NEZ": "Egmonde an Zee", "ZDB": "Zuidlob", "AVS": "Alpha Ventus",
        "RCN": "RCN Mast", "FN1": "Fino 1 Platform", "FN3": "Fino 3 Platform",
        "BEK": "Beek", "DEB": "Debilt", "DKY": "Dekooy", "DLN": "Deelen",
        "HGV": "Hoogeveen", "LWN": "Leeuwarden", "LYD": "Lelystad", "SPL": "Schipol",
        "SVN": "Stavoren", "VLK": "Valkenburg", "ZSN": "Zestienhoven",
        "EA1": "East Anglia 1B", "EAZ": "East Anglia ZE", "YTS": "Yttre Stengrund",
        "UTG": "Utgrunded", "WHM": "Hagesholm"}

    inv_name_map = {v: k for k, v in name_map.items()}

    #
    # This renames columns in the input into columns in the output
    # Only columns named here will be exported
    #
    col_map = OrderedDict([
        ("long_name", "Location"),
        ("valid_time", "Date/time (utc)"),
        ("local_time", "Date/time (local)"),
        ("lead_time", "Forecast (hours)"),
        ("SPEED.20", "Windspeed 21m (m/sec)"),
        ("DIRECTION.20", "Winddirection 21m (degrees)"),
        ("SPEED.70", "Windspeed 70m (m/sec)"),
        ("DIRECTION.70", "Winddirection 70m (degrees)"),
        ("SPEED.110", "Windspeed 110m (m/sec)"),
        ("DIRECTION.110", "Winddirection 110m (degrees)"),
        ("SPEED.70.P10", "Percentile 10 (m/sec) 70m"),
        ("SPEED.70.P20", "Percentile 20 (m/sec) 70m"),
        ("SPEED.70.P30", "Percentile 30 (m/sec) 70m"),
        ("SPEED.70.P40", "Percentile 40 (m/sec) 70m"),
        ("SPEED.70.P50", "Percentile 50 (m/sec) 70m"),
        ("SPEED.70.P60", "Percentile 60 (m/sec) 70m"),
        ("SPEED.70.P70", "Percentile 70 (m/sec) 70m"),
        ("SPEED.70.P80", "Percentile 80 (m/sec) 70m"),
        ("SPEED.70.P90", "Percentile 90 (m/sec) 70m")])

    utc = pytz.UTC

    # weeeeee, what a lot of chained operators! converts to local time
    convert = lambda row: utc.localize(row['valid_time']).astimezone(pytz.timezone(tz_map[row['location']])).strftime('%Y-%m-%d %H:%M')

    # Now we apply our uber-lambda function to insert local time
    frame['local_time'] = frame.apply(convert, axis=1)

    # Calculate lead time as integer number of hours
    deltas = frame['valid_time'] - frame['init_time']
    hours = lambda x: x / np.timedelta64(1, 'h')
    lead_ints = deltas.apply(hours)
    frame['lead_time'] = lead_ints.astype(int)

    # Expand short names to long names
    rename = lambda x: name_map[x]
    long_names = frame['location'].apply(rename)
    frame['long_name'] = long_names

    # *******************************************
    # WARNING - this is a hack and does not belong
    # here long term. This is just a quick way of
    # adding statistical percentiles to the output
    # time series. We need to think carefully about
    # where these should be calculated.
    # ********************************************
    # "SPEED_070.P10" : "Percentile 10 (m/sec) 70m",
    # "SPEED_070.P20" : "Percentile 20 (m/sec) 70m",
    # "SPEED_070.P30" : "Percentile 30 (m/sec) 70m",
    # "SPEED_070.P40" : "Percentile 40 (m/sec) 70m",
    # "SPEED_070.P50" : "Percentile 50 (m/sec) 70m",
    # "SPEED_070.P60" : "Percentile 60 (m/sec) 70m",
    # "SPEED_070.P70" : "Percentile 70 (m/sec) 70m",
    # "SPEED_070.P80" : "Percentile 80 (m/sec) 70m",
    # "SPEED_070.P90" : "Percentile 90 (m/sec) 70m"
    # *******************************************

    # Unstacking is causing missing values to propagate
    frame = pd.pivot_table(frame, values="value",
                           rows=["init_time", "valid_time", "local_time", "lead_time", "location", "long_name", "GRID_ID"],
                           cols=["variable", "height"])

    # The columns are now tuples. We want to collapse these to single strings
    tuples = frame.columns
    columns = map(collapse, tuples)

    # ensure string and not unicode
    columns = map(str, columns)

    # set frame's columns
    frame.columns = columns

    # reset index to make column selection easier
    frame = frame.reset_index()

    logger.debug("adding percentiles")
    percentiles = [10, 20, 30, 40, 50, 60, 70, 80, 90]
    for p in percentiles:
        pname = 'SPEED.70.P%02d' % p
        pfunc = lambda x: stats.norm.ppf(p / 100.0, x, x * 0.10)
        frame[pname] = frame['SPEED.70'].apply(pfunc)

    gb = frame.groupby(by=['init_time', 'GRID_ID', 'location'])
    groups = dict(list(gb))

    for key, group in gb:
        logger.debug("processing group %s" % str(key))
        init_time = key[0]
        grid_id = key[1]
        location = key[2]

        # subset and rename
        subset = group[col_map.keys()]
        subset.columns = col_map.values()

        d = '%s/%s/d%02d' % (out_dir, location, grid_id)
        if not os.path.exists(d):
            os.makedirs(d)

        out_name = '%s/%s.txt' % (d, init_time.strftime('%y%m%d%H'))
        logger.debug("writing time series out to %s" % out_name)
        subset.to_csv(out_name, index=False, float_format='%0.2f')
def frame_from_nc_old(ncfiles, vars, dimspec, global_atts, var_atts, log_name):
    """Build a Pandas DataFrame from a series of netcdf files.

    This is horrendously inefficient! A better way would be to build up Index
    objects from the coordinate variables, then create DataFrames for each
    variable with the coordinate indexes, then concatenate together.
    Unstacking a coordinate, e.g. height, would have to be implemented somehow."""

    logger = loghelper.get_logger(log_name)
    logger.debug(vars)
    logger.debug(dimspec)
    logger.debug(global_atts)
    logger.debug(var_atts)

    rows = []

    # subsetting defaults to full selection
    ts, te = 0, None
    ls, le = 0, None
    hs, he = 0, None

    if dimspec != None:
        for dim, ind in dimspec.items():
            if dim == 'time':
                ts = ind[0]
                te = ind[1]
            if dim == 'location':
                ls = ind[0]
                le = ind[1]
            if dim == 'height':
                hs = ind[0]
                he = ind[1]

    for f in ncfiles:
        dataset = Dataset(f, 'r')
        variables = dataset.variables
        dataset_atts = dataset.__dict__

        if vars == None:
            vars = list(variables.keys())

        fulltime = variables['time']
        fulldatetimes = num2date(fulltime, units=fulltime.units, calendar=fulltime.calendar)

        time = fulltime[ts:te]
        datetimes = fulldatetimes[ts:te]
        ntime = len(datetimes)
        init_time = fulldatetimes[0]

        # hack to catch thanet
        try:
            location = variables['location'][ls:le]
        except KeyError:
            location = variables['location_id'][ls:le]

        nloc = location.shape[0]
        loc_id_raw = [''.join(location[l, :].filled('')) for l in range(nloc)]
        loc_id = map(string.strip, loc_id_raw)

        height = variables['height'][hs:he]
        nheight = len(height)

        # this will force the reading of all the required variable data into memory
        varnames = [v for v in vars if v not in COORD_VARS]
        vardata = dict([(v, variables[v][:]) for v in varnames])

        # Argh! Nested loop hell.
        for t in range(ntime):
            for l in range(nloc):
                rowdict = OrderedDict()

                for a in global_atts:
                    logger.debug('adding value of attribute: %s' % a)
                    rowdict[a] = dataset_atts[a]

                rowdict['valid_time'] = datetimes[t]
                rowdict['location'] = loc_id[l]
                rowdict['init_time'] = init_time

                for v in varnames:
                    vatts = variables[v].__dict__
                    data = vardata[v]
                    for att in var_atts:
                        rowdict[att] = vatts[att]

                    # 2D variable
                    if len(data.shape) == 2:
                        rowdict[v] = data[t, l]

                    # 3D variable, unstack height
                    if len(data.shape) == 3:
                        for h in range(nheight):
                            key = '%s_%03d' % (v, int(height[h]))
                            rowdict[key] = data[t, l, h]

                rows.append(rowdict)
        dataset.close()

    df = pd.DataFrame(rows)

    # re-arrange columns
    cols = df.columns
    pre_cols = ['init_time', 'valid_time', 'location']
    data_cols = [c for c in cols if c not in pre_cols]
    new_cols = pre_cols + data_cols
    df = df[new_cols]
    return df
from gevent.event import Event
from gevent import monkey
monkey.patch_all()

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import config
import db
import loghelper
import url_helper

#logger
loghelper.init_logger("remove_bd_sshot", stream=True)
logger = loghelper.get_logger("remove_bd_sshot")

#mongo
mongo = db.connect_mongo()
collection_android = mongo.market.android
collection_itunes = mongo.market.itunes
collection_android_market = mongo.market.android_market

cnt = 0


def copy_from_itunes(app, artifactId):
    conn = db.connect_torndb()
    if app.has_key("description"):
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import db

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import aggregator_db_util
import helper

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../artifact'))
import artifact_recommend
import set_artifact_rank

#logger
loghelper.init_logger("company_aggregator_artifact", stream=True)
logger = loghelper.get_logger("company_aggregator_artifact")


def aggregate_artifact(company_id, source_company_id, test=False):
    table_names = helper.get_table_names(test)

    # artifact
    conn = db.connect_torndb()
    sas = list(conn.query("select * from source_artifact where sourceCompanyId=%s", source_company_id))
    for sa in sas:
        if sa["active"] == "Y" or sa["active"] is None:
            if sa["domain"] is not None and sa["domain"].strip() != "":
                artifact = conn.get("select * from " + table_names["artifact"] +
                                    " where companyId=%s and type=%s and domain=%s limit 1",
                                    company_id, sa["type"], sa["domain"])
            else:
                artifact = conn.get("select * from " + table_names["artifact"] +
                                    " where companyId=%s and type=%s and link=%s limit 1",
from pymongo import MongoClient
import pymongo
from bson.objectid import ObjectId

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper, config, util, url_helper
import db

# join amac for find_investor_alias_by_fund with active and verify
# todo: query for 12010 but not active

#logger
loghelper.init_logger("amac_util", stream=True)
logger = loghelper.get_logger("amac_util")

# investor_alias            amacType amacId
# investor_alias_candidate  amacType amacId


def get_websit_domains(managerIds):
    domains = []
    mongo = db.connect_mongo()
    collection_manager = mongo.amac.manager
    for managerId in managerIds:
        manager = collection_manager.find_one({"_id": ObjectId(managerId)})
        if manager is not None and manager.has_key("domain") is True \
                and manager["domain"] is not None and manager["domain"].strip() != "" \
                and manager["domain"].strip() not in ["www.com", "baidu.com"]:
            if manager["domain"] not in domains:
                domains.append(manager["domain"])
import datetime
from pymongo import MongoClient
import pymongo
from bson.objectid import ObjectId
import amac_util

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper, config, util, url_helper
import db

#logger
loghelper.init_logger("amac_findy", stream=True)
logger = loghelper.get_logger("amac_findy")

# investor_alias            amacType amacId
# investor_alias_candidate  amacType amacId
#
# 拉萨合业投资管理有限公司
# 北京聚信远业投资咨询有限公司
# 北京云瀚锦科技中心(有限合伙)
# 天津红杉资本投资管理中心
# 北京红杉嘉禾资产管理中心(有限合伙)
# 北京红杉盛远管理咨询有限公司
# 红杉资本投资管理(天津)有限公司
# 北京锋业股权投资中心(有限合伙)
# 上海桓远投资管理有限公司
# 北京创想天地投资管理有限公司
from pymongo import MongoClient
import pymongo
from kafka import (KafkaClient, SimpleProducer)

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import config
import loghelper
import my_request
import util

#logger
loghelper.init_logger("gen_messages", stream=True)
logger = loghelper.get_logger("gen_messages")

#mongo
(mongodb_host, mongodb_port) = config.get_mongodb_config()
mongo = MongoClient(mongodb_host, mongodb_port)

#kafka
(kafka_url) = config.get_kafka_config()
kafka = KafkaClient(kafka_url)
# HashedPartitioner is default
kafka_producer = SimpleProducer(kafka)

#
company_collection = mongo.crawler_v2.company

if __name__ == "__main__":
# -*- coding: utf-8 -*-
import os, sys

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, config
import db

#logger
loghelper.init_logger("score_2_list", stream=True)
logger = loghelper.get_logger("score_2_list")


def process(score, name):
    scores = conn.query(
        "select * from deal_user_score where userId=%s and score=%s",
        user_id, score)
    if len(scores) > 0:
        mylist = conn.get(
            "select * from mylist where createUser=%s and name=%s",
            user_id, name)
        if mylist is None:
            mylistId = conn.insert(
                "insert mylist(name,isPublic,active,createTime,createUser) values(%s,'N','Y',now(),%s)",
                name, user_id)
        else:
            mylistId = mylist["id"]
        umr = conn.get(
            "select * from user_mylist_rel where userId=%s and mylistId=%s",
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, db, util, url_helper

#logger
loghelper.init_logger("crawler_miit", stream=True)
logger = loghelper.get_logger("crawler_miit")


class miitCrawler(BaseCrawler.BaseCrawler):
    def __init__(self, max_crawl=1, timeout=30, use_proxy=False):
        BaseCrawler.BaseCrawler.__init__(self, max_crawl=max_crawl, timeout=timeout, use_proxy=use_proxy)
        self._post_url = 'http://www.miitbeian.gov.cn/icp/publish/query/icpMemoInfo_searchExecute.action'
        self.token = None
        self.jsessionid = None
        self._jsl_clearance = None
        self._jsluid = None
import config
import loghelper
import util

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import proxy_pool

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

#logger
loghelper.init_logger("crawler_gonshang_qixinbao", stream=True)
logger = loghelper.get_logger("crawler_gonshang_qixinbao")


class SocksiPyConnection(httplib.HTTPConnection):
    def __init__(self, proxytype, proxyaddr, proxyport=None, rdns=True,
                 username=None, password=None, *args, **kwargs):
        self.proxyargs = (proxytype, proxyaddr, proxyport, rdns, username, password)
        httplib.HTTPConnection.__init__(self, *args, **kwargs)
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../support'))
import loghelper
import util, name_helper, url_helper, download
import db

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import parser_db_util

# logger
loghelper.init_logger("xtecher_company_repair", stream=True)
logger = loghelper.get_logger("xtecher_company_repair")


def run():
    conn = db.connect_torndb()
    sql = '''select name,fullname,sourceid,id from source_company where source=13821 '''
    results = conn.query(sql)
    # TODO
    conn.close()
    for c in results:
        if c['fullname'] is not None and not name_helper.name_check(c['fullname'])[1] == True:
            logger.info('%s not company', c['fullname'])
            conn = db.connect_torndb()
# action: create, delete
msg = {"source": action, "id": company_id, "detail": source}
flag = False
while flag is False:
    try:
        kafkaProducer.send_messages("task_company", json.dumps(msg))
        flag = True
    except Exception, e:
        logger.exception(e)
        time.sleep(60)

#logger
loghelper.init_logger("sh_import", stream=True)
logger = loghelper.get_logger("sh_import")


def insert(shortname, name, brief, fullNames):
    name = name.replace("(开业)", "")
    sourceId = util.md5str(name)
    sid = parser_db_util.save_company_yitai(shortname, name, 13100, sourceId, brief)
    # logger.info("sid:%s->sourceId:%s", sid, sourceId)
    parser_db_util.save_source_company_name(sid, shortname, 12020)
    for fullName in [name] + fullNames:
        parser_db_util.save_source_company_name(sid, fullName, 12010)
    return sid
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import db, config, util
import loghelper
import proxy_pool

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

#logger
loghelper.init_logger("crawler_lagou_job", stream=True)
logger = loghelper.get_logger("crawler_lagou_job")


class LagouJobCrawler():
    def __init__(self, timeout=20):
        self.timeout = timeout
        self.opener = None
        self.socks_proxy = {"type": "socks4", "ip": "180.173.153.98", "port": 1080}

    def is_crawl_success(self, url, content):
        # '操作成功' means "operation succeeded"
        if content.find('操作成功') == -1:
            logger.info(content)
            return False

        r = "companyId=(.*?)&pageSize"
        result = util.re_get_result(r, url)
        (id,) = result
def power(config):
    """Reads 'time series' from netcdf time series file, and adds power as a variable."""

    if __name__ == "__main__":
        logger = loghelper.create_logger(config)
    else:
        logger = loghelper.get_logger(config['log.name'])

    # Number of samples to use should be in here
    # Whether to normalise power should be in here
    pnorm = config['pnorm']
    pdist = config['pdist']
    sstd = config['sstd']
    dstd = config['dstd']
    pquants = config['pquants']
    quantiles = np.array(pquants)
    logger.debug(pnorm)

    if pdist:
        n = pdist

    grid_id = config['grid_id']
    init_time = config['init_time']
    pcurve_dir = config['pcurve_dir']
    ts_dir = config['tseries_dir']
    tseries_file = expand(config['tseries_file'], config)
    power_file = expand(config['power_file'], config)

    logger.info('Estimating power from time series: %s ' % tseries_file)
    logger.info('Writing power time series to: %s ' % power_file)

    dataset_in = Dataset(tseries_file, 'a')

    # Get dimensions
    dims = dataset_in.dimensions
    ntime = len(dims['time'])
    nloc = len(dims['location'])
    nheight = len(dims['height'])
    loc_str_len = len(dims['loc_str_length'])

    # Get coordinate variables
    nctime = dataset_in.variables['time']
    datetimes = netcdftime.num2date(nctime, nctime.units)
    location = [''.join(l.filled(' ')).strip() for l in dataset_in.variables['location']]
    height = dataset_in.variables['height']

    # Get attributes
    metadata = config['metadata']

    if power_file == tseries_file:
        dataset_out = dataset_in
    else:
        dataset_out = Dataset(power_file, 'w')

    # Get number of quantiles
    nq = len(quantiles)
    pdata = np.ma.zeros((ntime, nloc, nheight, nq + 1), np.float)  # mean will be 1st value

    use_locs = []
    for l, loc in enumerate(location):
        pcurve_file = '%s/%s.csv' % (pcurve_dir, loc)

        # mask power data if no power curve found for this park
        if not os.path.exists(pcurve_file):
            #logger.debug("Power curve: %s not found, skipping" % pcurve_file)
            pdata[:, l, :, :] = np.ma.masked
            continue

        logger.info('Predicting power output for %s' % loc)

        #
        # Open power curve
        #
        use_locs.append(l)
        pcurve = from_file(pcurve_file)
        for h in range(nheight):
            speed = dataset_in.variables['SPEED'][:, l, h]
            direction = dataset_in.variables['DIRECTION'][:, l, h]
            #pwr = pcurve.power(speed,direction)

            # pdist will create a distribution for each timestep based on sampling
            # n times from a normal distribution.
            pdist = pcurve.power_dist(speed, direction, sstd=sstd, dstd=dstd, n=n, normalise=pnorm)
            pmean = np.mean(pdist, axis=1)
            pquants = scipy.stats.mstats.mquantiles(pdist, prob=quantiles / 100.0, axis=1, alphap=0.5, betap=0.5)

            pdata[:, l, h, 0] = pmean
            pdata[:, l, h, 1:] = pquants[:, :]

        logger.info('finished %s' % loc)

    use_inds = np.array(use_locs)
    logger.debug(use_inds)
    logger.debug(pdata.shape)
    logger.debug(pdata[:, use_inds, :, :].shape)

    if dataset_out != dataset_in:
        dataset_out.createDimension('time', None)
        dataset_out.createVariable('time', 'float', ('time',))
        dataset_out.variables['time'][:] = nctime[:]
        dataset_out.variables['time'].units = nctime.units
        dataset_out.variables['time'].calendar = nctime.calendar

        dataset_out.createDimension('location', len(use_locs))
        dataset_out.createDimension('loc_str_length', loc_str_len)
        loc_data = np.array([list(l.ljust(loc_str_len, ' ')) for l in location])
        dataset_out.createVariable('location', 'c', ('location', 'loc_str_length'))
        dataset_out.variables['location'][:] = loc_data[use_inds, :]

        dataset_out.createDimension('height', nheight)
        dataset_out.createVariable('height', 'i', ('height',))
        dataset_out.variables['height'][:] = height[:]
        dataset_out.GRID_ID = dataset_in.GRID_ID
        dataset_out.DX = dataset_in.DX
        dataset_out.DY = dataset_in.DY

        try:
            dataset_out.variables['height'].units = height.units
        except Exception:
            logger.warn("height units missing")

        pdata = pdata[:, use_inds, :, :]
        for key in metadata.keys():
            key = key.upper()
            logger.debug(key)
            dataset_out.setncattr(key, dataset_in.getncattr(key))

    pavg = dataset_out.createVariable('POWER', 'f', ('time', 'location', 'height'))
    pavg.units = 'kW'
    pavg.description = 'forecast power output'
    pavg[:] = pdata[:, :, :, 0]

    for q, qval in enumerate(quantiles):
        varname = 'POWER.P%02d' % qval
        logger.debug("creating variable %s" % varname)
        var = dataset_out.createVariable(varname, 'f', ('time', 'location', 'height'))
        if pnorm:
            var.units = 'ratio'
        else:
            var.units = 'kW'
        var.description = 'forecast power output'
        var[:] = pdata[:, :, :, q + 1]

    dataset_in.close()
    if dataset_out != dataset_in:
        dataset_out.close()
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db, extract

#logger
loghelper.init_logger("cninfo", stream=True)
logger = loghelper.get_logger("cninfo")

rmap = [
    {
        "type": 1,
        "typeDesc": "资产负债表",  # balance sheet
        "ue": "balancesheet",
    },
    {
        "type": 2,
        "typeDesc": "利润表",  # income statement
        "ue": "incomestatements",
    },
    {
        "type": 3,
        "typeDesc": "现金流量表",  # cash flow statement
import os, sys
import datetime, time
import json
import pymongo

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import util
import config
import db
import loghelper

#logger
loghelper.init_logger("investor_ranking", stream=True)
logger = loghelper.get_logger("investor_ranking")

conn = None


def get_today_date():
    today = datetime.datetime.now()
    tomorrow = today + datetime.timedelta(days=1)
    start_date = "%s-%s-%s" % (today.year, today.month, today.day)
    end_date = "%s-%s-%s" % (tomorrow.year, tomorrow.month, tomorrow.day)
    return start_date, end_date


def get_thisweek_date():
    date1 = datetime.date.today() - datetime.timedelta(
        days=datetime.date.today().weekday())
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_feixiaohao", stream=True)
logger = loghelper.get_logger("crawler_feixiaohao")

NEWSSOURCE = "feixiaohao"
URLS = []
CURRENT_PAGE = 1
linkPattern = "feixiaohao.com/currencies"
Nocontents = []
columns = [
    # {"column": "jmd", "max": 2},
    {
        "column": "None",
        "max": 30
    },
]
SOURCE = 13511
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import util
import proxy_pool
import db

#logger
loghelper.init_logger("appstore_rank_2", stream=True)
logger = loghelper.get_logger("appstore_rank_2")

#mongo
mongo = db.connect_mongo()
appstore_rank_collection = mongo.trend.appstore_rank

total = 0

types = {
    "free": 27,
    "charge": 30,
    "grossing": 38,
}

genres = [
# -*- coding: utf-8 -*-
# tag task
import os, sys
import time

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper
import db, datetime

# logger
loghelper.init_logger("export_audit", stream=True)
logger = loghelper.get_logger("export_audit")

# funding-round labels (values kept in Chinese as stored in the database)
rmap = {
    1000: '未融资',    # not funded
    1010: '天使轮',    # angel round
    1011: '天使轮',    # angel round
    1020: 'pre-A',
    1030: 'A',
    1031: 'A+',
    1039: 'Pre-B',
    1040: 'B',
    1041: 'B+',
    1050: 'C',
    1060: 'D',
    1070: 'E',
    1080: 'F',
    1090: '后期阶段',  # late stage
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../support'))
import loghelper, config
import db, name_helper, url_helper
import json, config, traceback, time, util
from bson.objectid import ObjectId
import random

# logger
loghelper.init_logger("mt_cnt", stream=True)
logger = loghelper.get_logger("mt_cnt")

mongo = db.connect_mongo()
collection = mongo['open-maintain'].task
collectionUser = mongo['open-maintain'].user
conn = db.connect_torndb()


def start_run():
    while True:
        taskCnt = list(mongo['open-maintain'].task.aggregate([{
            '$match': {
                'taskUser': -666,
                'active': "Y"
            }
        }, {
def get_logger():
    return loghelper.get_logger(LOGGER)
import loghelper, extract, db, util, url_helper, download, traceback_decorator

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import proxy_pool

#logger
loghelper.init_logger("crawler_scmp_news", stream=True)
logger = loghelper.get_logger("crawler_scmp_news")

NEWSSOURCE = "scmp"
RETRY = 3
TYPE = 60004
SOURCE = 13887
URLS = []
CURRENT_PAGE = 1
linkPattern = ".*?/\d+/.*"
Nocontents = []
columns = [
    {
        "column": "news",
        "max": 1
    },
]
# -*- coding: utf-8 -*-
import os, sys

from BaseCrawler import BaseCrawler
from pyquery import PyQuery as pq

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper

#logger
loghelper.init_logger("crawler_itjuzi_investfirm", stream=True)
logger = loghelper.get_logger("crawler_itjuzi_investfirm")

SOURCE = 13030  # ITJUZI
TYPE = 36004    # individual investor


class ItjuziCrawler(BaseCrawler):
    def __init__(self, start):
        BaseCrawler.__init__(self, header=True)
        self.set_start(start)

    def set_start(self, start):
        self.current = start
        self.latest = start

    def get_url(self):
        key = str(self.current)
def melt(ncfiles, vars=None, global_atts=None, var_atts=None, coord_vars=None, missing=None):
    """Build a (molten) Pandas DataFrame from a series of netcdf files.

    This is a flexible, but very memory-inefficient data structure, so be
    careful calling this with large netcdf files.

    Arguments:
        ncfiles     -- the input filenames
        vars        -- the variables to read, if None all variables in files read
        var_atts    -- variable attributes to include in each line of output, default all
        global_atts -- global attributes to include in each row of output
        coord_vars  -- variables to treat as coordinates, if None will use
                       variables with the same name as dimensions"""

    logger = loghelper.get_logger(LOGGER)

    frames = []

    if len(ncfiles) == 1:
        dataset = Dataset(ncfiles[0])
    else:
        dataset = MFDataset(ncfiles)

    coord_vars = get_coordinate_vars(dataset, coord_vars)
    variables = dataset.variables

    # get global attributes in dataset
    # shouldn't really use this, but it works
    dataset_atts = dataset.__dict__
    use_global_atts = _lookup(global_atts, dataset_atts, missing)

    # if no vars specified, use all in ncfiles
    if (vars == None or vars == ["all"]):
        vars = list(variables.keys())

    # variables are a function of var(reftime,leadtime,height,location)
    # or var(reftime,leadtime,location)
    usevars = [v for v in vars if v not in coord_vars]
    logger.debug("usevars: %s" % usevars)

    # There must be a clean way of doing this in a general
    # way, but I don't have the time to code this properly,
    # so I'm looping over fixed and hard-coded dimension names
    location = coord_vars['location']
    reftime = coord_vars['reftime']
    leadtime = coord_vars['leadtime']
    height = coord_vars['height']
    #lat = coord_vars['lat']
    #lon = coord_vars['lon']

    nloc = len(location)
    nreftime = len(reftime)
    nleadtime = len(leadtime)

    # dimension order is reftime, leadtime, location, height
    # or reftime, leadtime, location
    vars2D = [v for v in usevars if len(variables[v].shape) == 3]
    vars3D = [v for v in usevars if len(variables[v].shape) == 4]

    series = []
    for v in vars2D:
        vname = v
        variable = variables[v]
        use_var_atts = _lookup(var_atts, variable.__dict__, missing)

        factors = [reftime, leadtime, [HGT2DNUM], location, [vname]] + map(_listify, use_global_atts.values()) + map(_listify, use_var_atts.values())
        names = ['reftime', 'leadtime', 'height', 'location', 'variable'] + use_global_atts.keys() + use_var_atts.keys()
        index = pd.MultiIndex.from_product(factors, names=names)
        #index = pd.MultiIndex.from_tuples([(ref,lead,loc,HGT2DNUM,vname) for ref in reftime for lead in leadtime for loc in location], names=['reftime', 'leadtime', 'location', 'height','variable'])

        if type(variable[:]) == np.ma.core.MaskedArray:
            data = variable[:].flatten().filled(np.nan).astype(np.float)
        else:
            data = variable[:].flatten().astype(np.float)
        series.append(pd.Series(data=data, index=index, name='value'))

    for v in vars3D:
        variable = variables[v]
        vname = v
        use_var_atts = _lookup(var_atts, variable.__dict__, missing)

        for h, hgt in enumerate(height):
            subvar = variable[:, :, :, h]
            vname = "%s.%03d" % (v, hgt)
            vname = v

            factors = [reftime, leadtime, [hgt], location, [vname]] + map(_listify, use_global_atts.values()) + map(_listify, use_var_atts.values())
            names = ['reftime', 'leadtime', 'height', 'location', 'variable'] + use_global_atts.keys() + use_var_atts.keys()
            index = pd.MultiIndex.from_product(factors, names=names)
            #index = pd.MultiIndex.from_tuples([(ref,lead,loc,hgt,vname) for ref in reftime for lead in leadtime for loc in location], names=['reftime', 'leadtime', 'location','height', 'variable'])

            if type(subvar) == np.ma.core.MaskedArray:
                data = subvar[:].flatten().filled(np.nan).astype(np.float)
            else:
                data = subvar[:].flatten().astype(np.float)
            series.append(pd.Series(data=data, index=index, name='value'))

    # this is molten data, to use Hadley Wickham's terminology
    # or perhaps 5th normal form?
    result = pd.concat(series, axis=0).reset_index()
    return result
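# Usage sketch (assumption): melt a forecast file into one row per
# (reftime, leadtime, height, location, variable) combination. The filename
# and attribute names are illustrative.
def _melt_example():
    return melt(['/tmp/forecast.nc'], vars=['SPEED'],
                global_atts=['GRID_ID'], var_atts=['units'])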