Example #1
def test():
    loghelper.get_logger('wrf_forecast')

    #logger.debug('running test')
    

    template   = '/home/slha/code/wrftools/devel/queue/template.sge'
    job_script = '/home/slha/code/wrftools/devel/queue/job.sge'
    executable = '/home/slha/forecasting/development/run/wrf.exe'
    run_dir    = '/home/slha/forecasting/development/run'
    jobname    = 'WRF'
    qname      = 'all.q'
    nprocs     = 8
 
    replacements = {'<executable>': executable,
                    '<jobname>': jobname,
                    '<qname>'  : qname,
                    '<nprocs>' : nprocs}
    
    fill_template(template, job_script, replacements)

    os.chdir(run_dir)
    job_id = qsub(job_script)
    
    for i in range(3):
        status = qstat(job_id)
        print status
        if status is None:
            print 'job not in queue, presume complete'
            break
        if 'E' in status:
            raise QueueError('job %s has queue status of %s' %(job_id, status))
        

        time.sleep(5)
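
Example #1 calls a fill_template helper that is not shown on this page. A minimal sketch of what such a function might look like, assuming plain string substitution of the placeholder keys (hypothetical, not the project's actual implementation):

def fill_template(template, target, replacements):
    """Hypothetical sketch: copy template to target, replacing each
    placeholder key (e.g. '<jobname>') with str(value)."""
    with open(template) as fin:
        text = fin.read()
    for key, value in replacements.items():
        text = text.replace(key, str(value))
    with open(target, 'w') as fout:
        fout.write(text)
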
Example #2
def _filter(frame, variables=None, dimspec=None, log_name=LOGGER):
    
    logger = loghelper.get_logger(log_name)
    
    # filter by variables
    if variables:
        use_var = map(str,variables)
        logger.debug("filtering on variable: %s" % str(use_var))
        frame = frame[frame['variable'].isin(use_var)]
        logger.debug("%d rows" % len(frame))
    
    # filter by location
    if dimspec and 'location' in dimspec:
        use_loc = map(str,dimspec['location'])
        logger.debug("filtering on location: %s" % str(use_loc))
        frame = frame[frame['location'].isin(use_loc)]
        logger.debug("%d rows" % len(frame))
    
    # filter by height. How do we treat surface here?
    if dimspec and 'height' in dimspec:
        use_hgt = dimspec['height']
        use_hgt = [HGT2DNUM if h==HGT2DSTR else h for h in use_hgt]
        
        logger.debug("filtering on height: %s" % str(use_hgt))
        ind = frame['height'].isin(use_hgt)
        frame = frame[ind]
        logger.debug("%d rows" % len(frame))
    
    return frame
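
A usage sketch for _filter, assuming a record-based DataFrame with the columns described in Example #6 (variable, location, height, value); the constants HGT2DNUM/HGT2DSTR and the default LOGGER name are module-level assumptions, and the data values below are made up purely for illustration:

import pandas as pd

# toy record-based frame (hypothetical data)
records = pd.DataFrame({'variable': ['SPEED', 'SPEED', 'DIRECTION'],
                        'location': ['UTH', 'UKF', 'UTH'],
                        'height':   [70, 70, 70],
                        'value':    [8.70, 7.80, 352.0]})

# keep only SPEED rows at location UTH and height 70
subset = _filter(records, variables=['SPEED'],
                 dimspec={'location': ['UTH'], 'height': [70]})
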
Example #3
def ncdump(config):
    
    logger = loghelper.get_logger(config['log.name'])
    
    # subset of config to be used for expanding filenames
    scope = {'init_time' : config['init_time'],
             'grid_id'   : config['grid_id']}
             

    
    for name, entry in config['ncdump'].items():
        logger.debug("procesing entry %s " % name)
        
        if config.get('<files>'):
            files = config['<files>']
            if type(files)!=type([]):
                files = [files]
        else:
            tseries_files = expand(entry['tseries_file'], config)
            logger.debug("expanding file list from pattern and init time")
            #logger.debug(tseries_files)
            
            files = glob.glob(tseries_files)
            logger.debug("found %d files" % len(files))

        dump(files, entry, scope, log_name=config['log.name'])
Example #4
def dispatch_entry(config, entry, dry_run=None, log_name=LOGGER):    
    """Dispacthes one entry of distribution list"""
    
    logger          = loghelper.get_logger(log_name)
    
    address   = expand(entry['mailto'], config)
    subject   = expand(entry['subject'], config)
    body      = expand(entry['body'], config)
    from_addr = expand(entry['from'], config)
    attachments = [expand(a, config) for a in entry['attach']]

    logger.debug('dispatch_entry() called')
    
    if type(attachments)==type([]):
        a_arg = ' '.join(['-a %s' % a for a in attachments])
    else:
        a_arg = '-a %s' % attachments 
    
    if 'cc' in  entry:
        cc_arg = '-c %s' % entry['cc']
    else:
        cc_arg = ''

    if 'content_type' in entry:
        ct_arg = '-e "my_hdr Content-Type: %s"' % entry['content_type']
    else:
        ct_arg = ''

        
    cmd = """EMAIL="%s" mutt %s -s"%s" %s %s -- %s < %s """ %(from_addr, ct_arg, subject, a_arg, cc_arg, address, body)
    logger.debug(cmd)
    logger.debug(dry_run)
    if not dry_run:
        subprocess.call(cmd, shell=True)
Example #5
def dispatch(config):

    if __name__ == "__main__":
        logger = loghelper.create_logger(config)
    else:
        logger = loghelper.get_logger(config['log.name'])

    dist = config['dispatch.list']
    logger.info("dispatch.py sending files via email")
    dry_run=config['dry_run']
    
    for name, entry in dist.items():
        logger.info("dispatching files for entry: %s" % name)
        dispatch_entry(config, entry, dry_run=dry_run, log_name=config['log.name'])
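
Examples #4 and #5 together read each entry of config['dispatch.list'] as a small mail specification. Below is a hypothetical entry illustrating the keys dispatch_entry expects ('mailto', 'subject', 'body', 'from', 'attach', plus optional 'cc' and 'content_type'); all values are placeholders, and the string values would normally be run through expand() against the config:

entry = {'mailto':       'ops@example.com',
         'from':         'forecast@example.com',
         'subject':      'WRF forecast',
         'body':         '/path/to/body.txt',
         'attach':       ['/path/to/forecast.csv'],
         'cc':           'backup@example.com',   # optional
         'content_type': 'text/html'}            # optional
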
Example #6
def write_csv_files(frame, out_dir, out_name, variables, dimspec, drop, values, rows, cols, sort_by=None, rename=None, float_format='%0.3f', na_rep="",log_name=LOGGER):
    """Writes each variable and height into a seperate column.  Columns will be labelled variable_height where height if formatted as %03d int(height)
    
    Takes as input a DataFrame in a record based format, e.g. init_time, valid_time, height, location, variable, units, value."""
    
    logger = loghelper.get_logger(log_name)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    
    # drop columns first, this will cause problems later if someone else wants to use it!
    #if drop:
    #    for col in drop:
    #        del(frame[col])
    #logger.debug(frame)
    
    # drop columns by subsetting to create a view
    if drop: _drop(frame, drop)
      
    # subset based on variable, location, height
    frame = _filter(frame, variables, dimspec, log_name)
    
    #frame = frame.set_index(['init_time','valid_time'])    
    logger.debug(frame)
    logger.debug("about to pivot")
    logger.debug("values: %s" % values)
    logger.debug("rows: %s" % rows)
    logger.debug("cols: %s" % cols)
    frame = pd.pivot_table(frame, values=values, rows=rows,cols=cols)

    frame = frame.reset_index()


    if sort_by:
        frame.sort(sort_by, inplace=True)

    if rename:
        frame = _rename(frame, rename)
    
    logger.debug("outputting csv file: %s/%s " % (out_dir, out_name))
    logger.debug(na_rep)
    frame.to_csv('%s/%s' % (out_dir, out_name), index=False, float_format=float_format, na_rep=na_rep)    
Example #7
def dump(files,entry,scope,log_name=LOGGER):
    
    logger = loghelper.get_logger(log_name)
    vars        = entry['tseries_vars']
    global_atts = entry['global_atts']
    var_atts    = entry['var_atts']
    coord_vars  = entry['coord_vars']
    format      = entry['format'].strip()
   
    #logger.warn("subsetting at read time is not implemented")
    # Read all data into memory as pandas Series objects
    logger.debug("ncdump called with arguments")
    logger.debug("\t files: %s"       % str(files))
    logger.debug("\t vars: %s"        % str(vars))
    logger.debug("\t global_atts: %s" % str(global_atts))
    logger.debug("\t var_atts: %s"    % str(var_atts))
    logger.debug("\t coord_vars: %s"  % str(coord_vars))
    logger.debug("\t log_name: %s"    % str(log_name))
    
    
    for file in files:
        logger.debug(file)
        frame = frame_from_nc([file], vars, global_atts, var_atts, coord_vars,log_name)
            
        if format not in FORMATS:
            logger.error("format %s not understood" % format)
            raise UnknownFormat("format not understood")
        
        if format=='txt' :
            pass
            #write_txt_files(frame, entry['dir'], entry['dimspec'], log_name)
            
        elif format=='json':
            write_json_files(frame, entry['dir'], expand(entry['fname'], scope), entry['tseries_vars'], entry['dimspec'], entry['drop'], entry['rename'], entry['float_format'], log_name)

        elif format=='csv':
            write_csv_files(frame, entry['dir'], expand(entry['fname'], scope), entry['tseries_vars'],entry['dimspec'], entry['drop'], values='value', rows=entry['rows'],cols=entry['cols'],sort_by=entry['sort_by'],rename=entry['rename'],float_format=entry['float_format'], na_rep=entry['na_rep'], log_name=log_name)
                
        elif format=='aot':
            write_aot_files(frame, entry['dir'])
Example #8
def qsub(job_script):
    """Submits a PBS job via qsub
    
    Arguments:
        @job_script -- full path to a pbs job script file
    Returns:
        @job_id -- the job id returned by the PBS system """
 
    logger = loghelper.get_logger('wrf_forecast')
    #logger.debug('submitting job %s' % job_script)
    
    cmd  = 'qsub %s ' % job_script
    
    #
    # The output from PBS is of the format
    # "Your job 3681 ("TEST") has been submitted"
    #
    proc = subprocess.Popen([cmd], stdout=subprocess.PIPE, shell=True)
    output = proc.stdout.read()
    job_id = output.split(' ')[2]
    logger.debug("%s ------> %s" % (cmd, job_id))
    return job_id
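
Example #1 also polls a qstat(job_id) helper that is not shown. Below is a sketch under the assumption of SGE-style qstat output, where each job line starts with the job id and carries the state (e.g. 'r', 'qw', 'Eqw') in the fifth column; it returns None once the job has left the queue. The output format is an assumption, not confirmed by the source:

import subprocess

def qstat(job_id):
    """Hypothetical sketch: return the queue state string for job_id,
    or None if the job no longer appears in the qstat listing."""
    proc = subprocess.Popen(['qstat'], stdout=subprocess.PIPE)
    output = proc.stdout.read()
    for line in output.splitlines():
        fields = line.split()
        if fields and fields[0] == str(job_id):
            return fields[4]
    return None
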
Example #9
# -*- coding: utf-8 -*-
import os, sys
import datetime
import time

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, db

#logger
loghelper.init_logger("openapi_funding", stream=True)
logger = loghelper.get_logger("openapi_funding")


def add_new_created_fundings():
    today = datetime.datetime.now().date()
    date1 = today + datetime.timedelta(days=-30)
    date2 = today + datetime.timedelta(days=-365 * 2)
    logger.info("today: %s, date1: %s, date2: %s", today, date1, date2)

    conn = db.connect_torndb()
    fundings = conn.query(
        "select * "
        "from funding "
        "where companyId is not null and "
        "createTime>=%s and "
        "("
        "(publishDate is not null and publishDate>=%s) "
        "or "
        "(publishDate is null and fundingDate>=%s)"
        ")", today, date1, date2)
Example #10
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util
import time

DATE = None

# logger
loghelper.init_logger("crawler_feixiaohao_marketdata", stream=True)
logger = loghelper.get_logger("crawler_feixiaohao_marketdata")


def save_marketdata(content):
    fileName = 'file/market_data_%s.xls' % datetime.datetime.now().strftime(
        "%Y-%m-%d%H:%M:%S")
    path = os.path.join(os.path.split(os.path.realpath(__file__))[0], fileName)

    logger.info('saving file:%s', path)
    with open(path, "wb") as file:
        file.write(content)

    return fileName


def run(crawler):
Example #11
def frame_from_nc(ncfiles, vars, global_atts, var_atts, coord_vars,log_name):

    """ Build a Pandas DataFrame from a series of netcdf files"""

    logger = loghelper.get_logger(log_name)
    frames = []
    
    # Open files one-by-one
    for f in ncfiles:
        logger.debug("reading:  %s" % f)
        dataset = Dataset(f, 'r')
        #logger.debug(dataset)
        variables = dataset.variables
        # lookup global attributes in dataset
        # shouldn't really use this, but it works
        dataset_atts = dataset.__dict__
        
        
        # if no vars specified, use all in ncfiles
        if vars==None:
            vars = list(variables.keys())
            
        # get coordinate variables
        time      = variables['time']
        datetimes = num2date(time,units=time.units,calendar=time.calendar)
        ntime     = len(datetimes) 
        init_time = datetimes[0]
      
      
        # hack to catch thanet files which have location_id rather than location
        try:
            location    = variables['location']
        except KeyError:
            location    = variables['location_id']
            
        # Unmask string and strip, convert from unicode to string
        nloc        = location.shape[0]
        loc_masked  = np.ma.array(location)
        loc_id_raw  = [''.join(loc_masked[l,:].filled('')) for l in range(nloc)]
        location    = map(string.strip, loc_id_raw)
        location    = map(str,location)
        
        height      = variables['height']
        nheight     = len(height)

        varnames = [v for v in vars if v not in coord_vars]
        vars2D = [v for v in varnames if len(variables[v].shape)==2]
        vars3D = [v for v in varnames if len(variables[v].shape)==3]
       

        #can't really avoid nested loop here without making code unintelligible
        for v in vars2D:
            for l in range(nloc):
                # create dataframe then append columns avoids copying each series
                df = pd.DataFrame(datetimes, index=range(len(datetimes)), columns=['valid_time'])
                df['init_time']  = init_time
                df['location']   = location[l]  # this creates an object-dtype column
                df['location']   = df['location'].astype(str)
                df['height']     = HGT2DNUM
                df['variable']   = v
                df['value']      = variables[v][:,l]
                for att in global_atts:
                    df[str(att)] = dataset_atts[att]
                for att in var_atts:
                    df[str(att)] = variables[v].getncattr(att)
                frames.append(df)
        
        for v in vars3D:
            for l in range(nloc):
                for h in range(nheight):
                    # create dataframe then append columns avoids copying each series
                    df = pd.DataFrame(datetimes, index=range(len(datetimes)),columns=['valid_time'])
                    df['init_time']  = init_time
                    df['location']   = location[l]
                    df['height']     = height[h]
                    df['variable']   = v
                    df['value']      = variables[v][:,l,h]
                    for att in global_atts:
                        df[str(att)] = dataset_atts[att]
                    for att in var_atts:
                        df[str(att)] = variables[v].getncattr(att)
                    frames.append(df)
        dataset.close()
    
    df = pd.concat(frames)
    
    cols = df.columns

    # re-order the columns for cleaner output
    pre_cols = ['init_time','valid_time','location']
    data_cols = [c for c in cols if c not in pre_cols]
    new_cols = pre_cols + data_cols
    df = df[new_cols]
    df.index = range(len(df))
    
    return df
Example #12
# -*- coding: utf-8 -*-
import os, sys
import datetime

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, db

#logger
loghelper.init_logger("patch_ether_fa", stream=True)
logger = loghelper.get_logger("patch_ether_fa")


def main():
    utokens = {}

    mongo = db.connect_mongo()
    items = list(
        mongo.xiniudata.user_cookie.find({
            "type": "utoken",
            "active": 'Y'
        }))
    for item in items:
        userCookie = item["userCookie"]
        utokenUserId = item["utokenUserId"]
        if utokens.has_key(userCookie):
            logger.info(userCookie)
        else:
Example #13
from email.mime.text import MIMEText
from email.header import Header
from email.utils import formataddr
import requests, json
from aliyun_monitor import AliyunMonitor

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, util, db, config

#logger
loghelper.init_logger("email_send_patch", stream=True)
logger = loghelper.get_logger("email_send_patch")


def merge_users(to_list, from_list):
    for user in from_list:
        exist = False
        for u in to_list:
            if user["id"] == u["id"]:
                exist = True
        if exist is False:
            to_list.append(user)


if __name__ == "__main__":
    cnt = 0
    conn = db.connect_torndb()
Example #14
import random

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../support'))

import loghelper, db
import proxy_pool

#logger
loghelper.init_logger("BaseCrawler", stream=True)
logger = loghelper.get_logger("BaseCrawler")

#mongo
#mongo = db.connect_mongo()
#collection = mongo.raw.projectdata


class RedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        pass

    def http_error_302(self, req, fp, code, msg, headers):
        pass


class BaseCrawler:
Example #15
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import GlobalValues

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db

#logger
loghelper.init_logger("crawler_36kr_company2", stream=True)
logger = loghelper.get_logger("crawler_36kr_company2")

TYPE = 36001
SOURCE = 13022
URLS = []
CURRENT_PAGE = 1
linkPattern = "/article/\d+"
Nocontents = []

columns0 = [
    {
        "column": None,
        "max": 3
    },
    {
        "column": "FARMING",
Example #16
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import config
import loghelper
import name_helper
import db

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import aggregator_db_util
import helper

#logger
loghelper.init_logger("company_aggregator_baseinfo", stream=True)
logger = loghelper.get_logger("company_aggregator_baseinfo")

#
mongo = db.connect_mongo()
collection = mongo.trend.android
collection_alexa = mongo.trend.alexa
gongshang = mongo.info.gongshang

def get_company_code(name, test=False):
    table_names = helper.get_table_names(test)
    conn = db.connect_torndb()
    if len(name) <8 :
        pinyin = lazy_pinyin(name.decode('utf-8'))
        company_code = ''.join(pinyin)
    else:
        pinyin = lazy_pinyin(name.decode('utf-8'), style=pypinyin.INITIALS)
Example #17
reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db, extract

# logger
loghelper.init_logger("crawler_szse_an", stream=True)
logger = loghelper.get_logger("crawler_szse_an")


class AnnounceCrawler(BaseCrawler.BaseCrawler):
    def __init__(self, timeout=20):
        BaseCrawler.BaseCrawler.__init__(self, timeout=timeout)

    # implementation
    def is_crawl_success(self, url, content):
        try:

            res = content.replace('var szzbAffiches=', '')[:-2]
            # logger.info(res)
            contentnew = eval(res.decode("gbk").strip())
            logger.info(contentnew)
            if len(contentnew) > 0:
Example #18
import os, sys
import time, datetime
import traceback
import requests
import json

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, db

#logger
loghelper.init_logger("patch_wechat_unionid", stream=True)
logger = loghelper.get_logger("patch_wechat_unionid")

appid = "wx766854150052d912"
appsecret = "d4fc5ea387e938c7641dd434a4d7a891"
ACCESS_TOKEN = None  # valid for 7200 seconds; the developer must cache the access_token globally in their own service
ACCESS_TOKEN_TIME = 0


def refreshToken():
    global ACCESS_TOKEN, ACCESS_TOKEN_TIME

    if ACCESS_TOKEN is None or ACCESS_TOKEN_TIME + 7000 < time.time():
        print "get ACCESS_TOKEN and JSAPI_TICKET"
        ACCESS_TOKEN_TIME = time.time()
        url = "https://api.weixin.qq.com/cgi-bin/token?grant_type=client_credential&appid=%s&secret=%s" % (
            appid, appsecret)
Example #19
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import db
import name_helper
import config
import image_helper
import url_helper
#logger
loghelper.init_logger("card_v1", stream=True)
logger = loghelper.get_logger("card_v1")

#parse data from qimingpian directly; bamy called it step 1 to check out companies


def find_companies_by_full_name_corporate(full_names, idmax=0):
    companyIds = []
    for full_name in full_names:
        if full_name is None or full_name == "":
            continue

        # full_name = name_helper.company_name_normalize(full_name)

        conn = db.connect_torndb()
        corporate_aliases = conn.query(
            "select a.* from corporate_alias a join corporate c on c.id=a.corporateId where "
Example #20
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, traceback_decorator

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import proxy_pool

# logger
loghelper.init_logger("crawler_forbes_news", stream=True)
logger = loghelper.get_logger("crawler_forbes_news")

NEWSSOURCE = "forbes"
RETRY = 3
TYPE = 60004
SOURCE = 13883
URLS = []
CURRENT_PAGE = 1
# https://www.avcj.com/avcj/news/3010878/gic-advises-caution-predicts-rising-global-volatility
linkPattern = "www.pintu360.com/a\d+.html"
Nocontents = [
]
columns = [
    {"column": "innovation", "max": 1, 'sourceValue': 'channel_74'},
    {"column": "entrepreneurs", "max": 1, 'sourceValue': 'channel_4'},
    {"column": "small-business", "max": 1, 'sourceValue': 'channel_21'},
Example #21
def power(config):
    """Reads 'time series' from netcdf time series file, and adds power as a variable. """
    
    if __name__ == "__main__":
        logger = loghelper.create_logger(config)
    else:
        logger = loghelper.get_logger(config['log.name'])
    
    # listify ensures they are returned as a list, even if it is one file
    files = shared._listify(config['<files>'])
    
    # Number of samples to use should be in here
    # Whether to normalise power should be in here    
    start     = config.get('start')
    delay     = config.get('delay')
    cycles    = shared._listify(config.get('cycles'))
    pnorm     = config.get('pnorm')
    pdist     = config.get('pdist')
    sstd      = config.get('sstd')
    dstd      = config.get('dstd')
    pquants   = config.get('pquants')
    quantiles = np.array(pquants)
    pcurve_dir = config.get('pcurve-dir')
    ts_dir     = config.get('tseries-dir')
    out        = config.get('out')
    metadata   = config.get('metadata')


    
    basetime = start if start else datetime.datetime.today()
    prior = shared._prior_time(basetime, delay=delay, hours=cycles)

    logger.debug("using %s as a start time" % prior)

    if not files:
        logger.debug("no files specified, finding using options")
        file_pattern = config.get('file-pattern')
        if not file_pattern: raise ConfigError('either supply files or specify file-pattern')
        
        expanded = substitute.sub_date(file_pattern, init_time=prior)
        files = glob.glob(expanded)
        print("hello")
        logger.debug(files)

    # if we get to this point and there are still no files, then we have a problem
    if not files: raise IOError("no files found")
    
    logger.debug("input files: ")
    logger.debug(files)
    for f in files:
        logger.debug("\t%s" % f)
    
    # if pdist 
    if pdist: n=pdist
    
    
    #grid_id         = config['grid_id']
    
    
    out_pattern     = config.get('out')
    
    
    for tseries_file in files:
        dataset_in = Dataset(tseries_file, 'a')
            
        # Get dimensions
        dims      = dataset_in.dimensions
        nreftime  = len(dims['reftime'])
        ntime     = len(dims['leadtime'])
        nloc      = len(dims['location'])
        nheight   = len(dims['height'])
        loc_str_len = len(dims['loc_str_length'])
        
        # Get coordinate variables
        reftime   = dataset_in.variables['reftime']
        leadtime  = dataset_in.variables['leadtime']
        validtime = nctools._valid_time(reftime, leadtime)
        
        refdt     = num2date(reftime[:], reftime.units)
        
        power_file = substitute.sub_date(out, init_time=refdt[0])
        

        logger.info('Estimating power from time series: %s ' % tseries_file)
        logger.info('Writing power time series to: %s ' % power_file)

        
        location = [''.join(l.filled(' ')).strip() for l in dataset_in.variables['location']]
        height   = dataset_in.variables['height']

        
        if power_file == tseries_file:
            dataset_out = dataset_in
        else:
            dataset_out = Dataset(power_file, 'w')

            
        # Get number of quantiles
        nq    = len(quantiles)
        pdata = np.ma.zeros((ntime,nloc,nheight,nq+1), np.float) # mean will be 1st value
        
        use_locs = []
        # loop through locations and look for power-curve file
        
        for l,loc in enumerate(location):
            pcurve_file = '%s/%s.csv' %(pcurve_dir, loc)
            
            # mask power data if no power curve found for this park
            if not os.path.exists(pcurve_file):
                #logger.debug("Power curve: %s not found, skipping" % pcurve_file)
                pdata[:,l,:,:] = np.ma.masked
                continue
            
            logger.info('Predicting power output for %s' % loc )
            #
            # Open power curve
            #
            use_locs.append(l)
            pcurve = from_file(pcurve_file)

        
            for h in range(nheight):
                speed     = dataset_in.variables['SPEED'][0,:,l,h]
                direction = dataset_in.variables['DIRECTION'][0,:,l,h]
                
                #pwr = pcurve.power(speed,direction)
        
                # pdist will create a distribution for each timestep based on sampling
                # n times from a normal distribution. 
                pdist   = pcurve.power_dist(speed, direction, sstd=sstd,dstd=dstd,n=n, normalise=pnorm)
                pmean   = np.mean(pdist, axis=1)
                pquants = scipy.stats.mstats.mquantiles(pdist, prob=quantiles/100.0,axis=1, alphap=0.5, betap=0.5)
                

                pdata[:,l,h,0]  = pmean
                pdata[:,l,h,1:] = pquants[:,:]

            #logger.info('finished %s' % loc)            



        use_inds = np.array(use_locs)

        
        if dataset_out != dataset_in:
            dataset_out.createDimension('reftime', None)
            dataset_out.createVariable('reftime', 'float', ('reftime',))
            dataset_out.variables['reftime'][:] = reftime[:]
            dataset_out.variables['reftime'].units = reftime.units
            dataset_out.variables['reftime'].calendar = reftime.calendar
            dataset_out.variables['reftime'].long_name = reftime.long_name
            dataset_out.variables['reftime'].standard_name = reftime.standard_name

            
            dataset_out.createDimension('leadtime', len(leadtime))
            dataset_out.createVariable('leadtime', 'int', ('leadtime',))
            dataset_out.variables['leadtime'][:] = leadtime[:]
            dataset_out.variables['leadtime'].units = leadtime.units
            dataset_out.variables['leadtime'].long_name = leadtime.long_name
            dataset_out.variables['leadtime'].standard_name = leadtime.standard_name
            
            dataset_out.createDimension('location', len(use_locs))
            dataset_out.createDimension('loc_str_length', loc_str_len)
            
            loc_data =np.array([list(l.ljust(loc_str_len, ' ')) for l in location])
            dataset_out.createVariable('location', 'c', ('location', 'loc_str_length'))
            dataset_out.variables['location'][:] = loc_data[use_inds,:]
            
            dataset_out.createDimension('height', nheight)        
            dataset_out.createVariable('height', 'i', ('height',))
            dataset_out.variables['height'][:] = height[:]
            dataset_out.GRID_ID = dataset_in.GRID_ID
            dataset_out.DX = dataset_in.DX
            dataset_out.DY = dataset_in.DY
            
            try:
                dataset_out.variables['height'].units = height.units
            except Exception:
                logger.warn("height units missing")
            
            
            pdata = pdata[:, use_inds, :, :]
            for key in metadata.keys():
                key = key.upper()
                dataset_out.setncattr(key,dataset_in.getncattr(key))
                
            
        
        pavg    = dataset_out.createVariable('POWER','f',('reftime','leadtime','location','height'))
        pavg.units = 'kW'
        pavg.description = 'forecast power output'
        pavg[0,:,:,:] = pdata[:,:,:,0]

        
        for q, qval in enumerate(quantiles):

            varname = 'POWER.P%02d' % qval

            var  = dataset_out.createVariable(varname,'f',('reftime','leadtime','location','height'))
            if pnorm:
                var.units = 'ratio'
            else:
                var.units = 'kW'
            var.description = 'forecast power output'

            var[0,:,:,:] = pdata[:,:,:,q+1]
        
                
        #logger.debug(dataset_out)
        
        dataset_in.close()
        if dataset_out!=dataset_in:
            dataset_out.close()
Example #22
def write_json_files(frame, out_dir, out_name, variables, dimspec, drop, rename=None, float_format="%0.3f",log_name=LOGGER ):
    """ Writes each variable and init_time series into one json file. If vars is None, then all export all variables"""

    logger = loghelper.get_logger(log_name)

    logger.info("*** outputting data as json ***")
    # drop columns by subsetting to create a view
    if drop: _drop(frame, drop)
      
      
    # subset based on variable, location, height
    frame = _filter(frame, variables, dimspec, log_name)


    if rename:
        frame = _rename(frame, rename)

        
    # Bit of a hack to ease output formatting, convert init_time to string
    frame['init_time'] = frame['init_time'].apply(str)
    
    
    
    # we need to group by everything except valid time and value
    group_by = [c for c in frame.columns if c not in ["valid_time", "value"]]
    gb = frame.groupby(group_by)

        
    # Convert time to milliseconds since epoch
    convert = lambda t: time.mktime(t.timetuple())*1000        
    
    series = []
    for name, group in gb:
        #logger.debug("processing %s" % str(name))
        # create a dictionary from all the fields except valid time and value
        d = dict(zip(group_by,list(name)))
        
        timestamp = map(convert, group['valid_time'])
        values  = group['value']
        mvals = np.ma.masked_invalid(np.array(values))
        data    = [ (timestamp[n],mvals[n]) for n in range(len(timestamp))]
        ldata   = map(list, data)
        d['data'] = ldata
        s = str(d)
    
        # this is an ugly hack which could potentially lead to errors if " u'" occurs at the end of a string
        s =  s.replace(" u'", " '")
                
        # change single quotes to double
        s = s.replace("'", '"')
        
        # replace masked values. Again, ugly
        s = s.replace('masked', 'null')
        
        series.append(s)

    json_str = ','.join(series)
    
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    
    fout = open('%s/%s' % (out_dir, out_name), 'w')
    fout.write('[')
    fout.write(json_str)
    fout.write(']')
    fout.close()
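
The comments above flag the quote and 'masked' string replacements as fragile. Here is a hedged alternative sketch of the same grouping loop built on json.dumps and native Python types, which makes the quoting and null handling automatic (a suggestion, not the module's actual code; it reuses gb and group_by from the function above):

import json
import time
import numpy as np

series = []
for name, group in gb:
    # convert numpy scalars to native Python types so json.dumps accepts them
    d = dict(zip(group_by, [k.item() if hasattr(k, 'item') else k for k in name]))
    timestamps = [time.mktime(t.timetuple()) * 1000 for t in group['valid_time']]
    values = np.ma.masked_invalid(np.array(group['value']))
    mask = np.ma.getmaskarray(values)
    # masked entries become None, which json.dumps serialises as null
    d['data'] = [[timestamps[n], None if mask[n] else float(values[n])]
                 for n in range(len(timestamps))]
    series.append(json.dumps(d))
json_str = ','.join(series)
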
Example #23
def write_aot_files(frame, out_dir, log_name=LOGGER):        
    """Writes file format the same as AOTs existing supplier, which is:
  
    "Location","Date/time (utc)","Date/time (local)","Forecast (hours)","Windspeed 21m (m/sec)","Winddirection 21m (degrees)","Windspeed 70m (m/sec)","Winddirection 70m (degrees)","Windspeed 110m (m/sec)","Winddirection 110m (degrees)","Percentile  10 (m/sec) 70m","Percentile  20 (m/sec) 70m","Percentile  30 (m/sec) 70m","Percentile  40 (m/sec) 70m","Percentile  50 (m/sec) 70m","Percentile  60 (m/sec) 70m","Percentile  70 (m/sec) 70m","Percentile  80 (m/sec) 70m","Percentile  90 (m/sec) 70m
    "Thanet",2013-05-31 00:00,2013-05-31 02:00,0,7.80,351,8.70,352,8.93,352,7.27,7.83,8.20,8.52,8.70,8.97,9.27,9.66,10.16
    
    Arguments:
        @frame DataFrame of time-series records to write
        @out_dir directory to write output to
        @log_name name of the logger to use"""
    
    logger = loghelper.get_logger(log_name)
    
    # Format is too bespoke, just hard code it all here!
    
    
    # ensure sorted by init_time, valid_time, location
    frame.sort(['init_time', 'valid_time', 'location'], inplace=True)
    #init_time = frame.init_time[0]
    
    
    #
    # The AOT files require a local time, as well as UTC time. This requires a mapping between location 
    # and timezone. The quickest way to do this is to hardcode this here. This is not very elegant or
    # extensible, but it works.
    #
    import pytz
    tz_map = { "FRE": "Europe/Amsterdam",
                    "HAM":	"Europe/Amsterdam",
                    "DTK":	"Europe/Amsterdam",
                    "SNB":	"Europe/Amsterdam",
                    "AVS":	"Europe/Amsterdam",
                    "FN1":	"Europe/Amsterdam",
                    "FN3":	"Europe/Amsterdam",
                    "AMS":	"Europe/Amsterdam",
                    "NEZ":	"Europe/Amsterdam",
                    "ZDB":	"Europe/Amsterdam",
                    "RCN":	"Europe/Amsterdam",
                    "BEK":	"Europe/Amsterdam",
                    "DEB":	"Europe/Amsterdam",
                    "DKY":	"Europe/Amsterdam",
                    "DLN":	"Europe/Amsterdam",
                    "HGV":	"Europe/Amsterdam",
                    "LWN":	"Europe/Amsterdam",
                    "LYD":	"Europe/Amsterdam",
                    "SPL":	"Europe/Amsterdam",
                    "SVN":	"Europe/Amsterdam",
                    "VLK":	"Europe/Amsterdam",
                    "ZSN":	"Europe/Amsterdam",
                    "STO":	"Europe/Amsterdam",
                    "SLG":	"Europe/Amsterdam",
                    "YTS":	"Europe/Amsterdam",
                    "UTG":	"Europe/Amsterdam",
                    "EA1":	"Europe/London",
                    "EAZ":	"Europe/London",
                    "EDI":	"Europe/London",
                    "LON":	"Europe/London",
                    "UKF":	"Europe/London",
                    "UTH":	"Europe/London",
                    "UOR":	"Europe/London",
                    "UEB":	"Europe/London",
                    "WHM":  "Europe/Amsterdam"}

    name_map = {"FRE" : "Fredericia",
                "EDI" : "Edinburgh",
                "LON" : "London",
                "STO" : "Stockholm",
                "HAM" : "Hamburg",
                "AMS" : "Amsterdam",
                "UKF" : "Kentish Flats",
                "UTH" : "Thanet",
                "UOR" : "Ormonde",
                "SLG" : "Lillgrund",
                "DTK" : "Dan Tysk",
                "SNB" : "Sand Bank",
                "UEB" : "Edinbane",
                "NEZ" : "Egmonde an Zee",
                "ZDB" : "Zuidlob",
                "AVS" : "Alpha Ventus",
                "RCN" : "RCN Mast",
                "FN1" : "Fino 1 Platform",
                "FN3" : "Fino 3 Platform",
                "BEK" : "Beek",
                "DEB" : "Debilt",
                "DKY" : "Dekooy",
                "DLN" : "Deelen",
                "HGV" : "Hoogeveen",
                "LWN" : "Leeuwarden",
                "LYD" : "Lelystad",
                "SPL" : "Schipol",
                "SVN" : "Stavoren",
                "VLK" : "Valkenburg",
                "ZSN" : "Zestienhoven",
                "EA1" : "East Anglia 1B",
                "EAZ" : "East Anglia ZE",
                "YTS" : "Yttre Stengrund",
                "UTG" : "Utgrunded",
                "WHM" : "Hagesholm"}

    inv_name_map = {v:k for k, v in name_map.items()}
    
    
    #
    # This renames columns in the input into columns in the output
    # Only columns named here will be exported
    #
    #
    col_map = OrderedDict([("long_name",   "Location"),
                               ("valid_time",    "Date/time (utc)"),
                               ("local_time",    "Date/time (local)"),
                               ("lead_time",     "Forecast (hours)"),
                               ("SPEED.20",     "Windspeed 21m (m/sec)"),
                               ("DIRECTION.20", "Winddirection 21m (degrees)"),
                               ("SPEED.70",     "Windspeed 70m (m/sec))"),
                               ("DIRECTION.70", "Winddirection 70m (degrees)"),
                               ("SPEED.110",     "Windspeed 110m (m/sec)"),
                               ("DIRECTION.110", "Winddirection 110m (degrees)"),
                               ("SPEED.70.P10", "Percentile  10 (m/sec) 70m"),
                               ("SPEED.70.P20", "Percentile  20 (m/sec) 70m"),
                               ("SPEED.70.P30", "Percentile  30 (m/sec) 70m"),
                               ("SPEED.70.P40", "Percentile  40 (m/sec) 70m"),
                               ("SPEED.70.P50", "Percentile  50 (m/sec) 70m"),
                               ("SPEED.70.P60", "Percentile  60 (m/sec) 70m"),
                               ("SPEED.70.P70", "Percentile  70 (m/sec) 70m"),
                               ("SPEED.70.P80", "Percentile  80 (m/sec) 70m"),
                               ("SPEED.70.P90", "Percentile  90 (m/sec) 70m") ])


    utc = pytz.UTC
    # weeeeee, what a lot of chained operators!
    # converts to local time
    convert = lambda row: utc.localize(row['valid_time']).astimezone(pytz.timezone(tz_map[row['location']])).strftime('%Y-%m-%d %H:%M')
    
    
    # Now we apply our uber-lambda function to insert local time
    frame['local_time'] = frame.apply(convert, axis=1)

    # Calculate lead time as integer number of hours
    deltas = frame['valid_time'] - frame['init_time']
    hours = lambda x: x / np.timedelta64(1, 'h')
    lead_ints = deltas.apply(hours)
    frame['lead_time'] = lead_ints.astype(int)

        
    # Expand short names to long names
    rename = lambda x: name_map[x]
    long_names = frame['location'].apply(rename)
    frame['long_name'] = long_names
    
    
    # *******************************************
    # WARNING - this is a hack and does not belong
    # here long term. This is just a quick way of
    # adding statistical percentiles to the output
    # time series. We need to think carefully about
    # where these should be calculated.
    #********************************************
    #     "SPEED_070.P10" : "Percentile  10 (m/sec) 70m",
    #     "SPEED_070.P20" : "Percentile  20 (m/sec) 70m",
    #     "SPEED_070.P30" : "Percentile  30 (m/sec) 70m",
    #     "SPEED_070.P40" : "Percentile  40 (m/sec) 70m",
    #     "SPEED_070.P50" : "Percentile  50 (m/sec) 70m",
    #     "SPEED_070.P60" : "Percentile  60 (m/sec) 70m",
    #     "SPEED_070.P70" : "Percentile  70 (m/sec) 70m",
    #     "SPEED_070.P80" : "Percentile  80 (m/sec) 70m",
    #     "SPEED_070.P90" : "Percentile  90 (m/sec) 70m" 
    # *******************************************


    # Unstacking is causing missing values to propagate
    
    frame = pd.pivot_table(frame, values="value", rows=["init_time", "valid_time", "local_time", "lead_time", "location", "long_name", "GRID_ID"], cols=["variable","height"])
   
    
    # The columns are now tuples
    # We want to collapse these to single strings
    tuples  = frame.columns
    columns = map(collapse, tuples)
    # ensure string and not unicode
    columns = map(str,columns)
    # set frames columns
    frame.columns = columns
    # reset index to make column selection easier
    frame = frame.reset_index()
    

        
    logger.debug("adding percentiles")
    percentiles = [10, 20, 30, 40, 50, 60, 70, 80, 90]
    for p in percentiles:
        pname = 'SPEED.70.P%02d' % p
        pfunc = lambda x : stats.norm.ppf(p/100.0, x, x*0.10)
        frame[pname] = frame['SPEED.70'].apply(pfunc)

    
    gb = frame.groupby(by=['init_time','GRID_ID','location'])
    groups = dict(list(gb))
    
    for key, group in gb:
        
        
        logger.debug("processing group %s" %str(key))
        init_time = key[0]
        grid_id=key[1]
        location = key[2]
        
        # subset and rename
        subset = group[col_map.keys()]
        subset.columns = col_map.values()
    
        d = '%s/%s/d%02d' % (out_dir, location, grid_id)
        if not os.path.exists(d):
            os.makedirs(d)
        
        out_name = '%s/%s.txt' % (d, init_time.strftime('%y%m%d%H'))
        logger.debug("writing times series out to %s" % out_name)
        subset.to_csv(out_name, index=False, float_format='%0.2f')
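
The collapse function used above to flatten the (variable, height) column tuples from the pivot into names such as 'SPEED.70' is not shown; a plausible reconstruction (hypothetical) is:

def collapse(col):
    """Hypothetical sketch: flatten a ('SPEED', 70.0) column tuple into 'SPEED.70'."""
    variable, height = col
    return '%s.%d' % (variable, int(height))
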
Example #24
def frame_from_nc_old(ncfiles, vars, dimspec, global_atts, var_atts, log_name):

    """ Build a Pandas DataFrame from a series of netcdf files
    
    This is horrendously inefficient! A better way would be to build up 
    Index objects from the coordinate variables, then create DataFrames for 
    each variable with the coordinate indexes, then concatenate together. 
    
    Unstacking a coordinate, e.g. height would have to be implemented somehow."""

    logger = loghelper.get_logger(log_name)
    logger.debug(vars)
    logger.debug(dimspec)
    logger.debug(global_atts)
    logger.debug(var_atts)
    
    rows = []
    
    # subsetting defaults to full selection
    ts,te = 0,None
    ls,le = 0,None
    hs,he = 0,None
   
    if dimspec!=None:
        for dim,ind in dimspec.items():
            if dim=='time':
                ts = ind[0]
                te = ind[1]
            if dim=='location':
                ls = ind[0]
                le = ind[1]
            if dim=='height':
                hs = ind[0]
                he = ind[1]
    
    
    for f in ncfiles:
        dataset = Dataset(f, 'r')
                
        variables   = dataset.variables
        dataset_atts = dataset.__dict__
        
        if vars==None:
            vars = list(variables.keys())
            
        fulltime      = variables['time']
        fulldatetimes = num2date(fulltime,units=fulltime.units,calendar=fulltime.calendar)
        
        time      = fulltime[ts:te]
        datetimes = fulldatetimes[ts:te]
        ntime     = len(datetimes) 
        init_time = fulldatetimes[0]
      
        # hack to catch thanet
        try:
            location    = variables['location'][ls:le]
        except KeyError:
            location    = variables['location_id'][ls:le] 
            
        
        nloc        = location.shape[0]
        loc_id_raw  = [''.join(location[l,:].filled('')) for l in range(nloc)]
        loc_id      = map(string.strip, loc_id_raw)
        height      = variables['height'][hs:he]
        nheight     = len(height)

        # this will force the reading all of the required variable data into memory
        varnames = [v for v in vars if v not in COORD_VARS]
        vardata  = dict([(v, variables[v][:]) for v in varnames])

        # Argh! Nested loop hell.
        for t in range(ntime):
            for l in range(nloc):
                rowdict = OrderedDict()
                
                for a in global_atts:
                    logger.debug('adding value of attribute: %s' % a)
                    rowdict[a] = dataset_atts[a]
        
                rowdict['valid_time']  = datetimes[t]
                rowdict['location']    = loc_id[l]
                rowdict['init_time']   = init_time
                
                for v in varnames:
                    vatts = variables[v].__dict__
                    data = vardata[v]
                    
                    for att in var_atts:
                        rowdict[att] = vatts[att]
                        
                    # 2D variable
                    if len(data.shape)==2:
                        rowdict[v] = data[t,l]
                    
                    # 3D variable, unstack height
                    if len(data.shape)==3:
                        for h in range(nheight):
                            key = '%s_%03d' %(v, int(height[h]))
                            rowdict[key] = data[t,l,h]

                rows.append(rowdict)
        dataset.close()
    
    df = pd.DataFrame(rows)
    
    #re-arrange columns
    cols = df.columns
    pre_cols = ['init_time','valid_time','location']
    data_cols = [c for c in cols if c not in pre_cols]
    new_cols = pre_cols + data_cols
    df = df[new_cols]
    return df
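
The docstring above suggests a faster approach: build Index objects from the coordinate variables and let pandas do the alignment. A rough sketch of that idea for a single 3D variable, reusing the names from the loop above (datetimes, location, height, variables); the variable name 'SPEED' is only an example:

import numpy as np
import pandas as pd

# index rows are ordered time-major, matching a C-order flatten of (time, loc, height)
index = pd.MultiIndex.from_product([datetimes, location, height[:]],
                                   names=['valid_time', 'location', 'height'])
values = np.ma.filled(variables['SPEED'][:], np.nan).ravel()
speed = pd.DataFrame({'variable': 'SPEED', 'value': values}, index=index).reset_index()
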
Example #25
from gevent.event import Event
from gevent import monkey
monkey.patch_all()
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import config
import db
import loghelper
import url_helper

#logger
loghelper.init_logger("remove_bd_sshot", stream=True)
logger = loghelper.get_logger("remove_bd_sshot")

#mongo
mongo = db.connect_mongo()

collection_android = mongo.market.android
collection_itunes = mongo.market.itunes

collection_android_market = mongo.market.android_market

cnt = 0


def copy_from_itunes(app, artifactId):
    conn = db.connect_torndb()
    if app.has_key("description"):
Example #26
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import db
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import aggregator_db_util
import helper

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../artifact'))
import artifact_recommend
import set_artifact_rank

#logger
loghelper.init_logger("company_aggregator_artifact", stream=True)
logger = loghelper.get_logger("company_aggregator_artifact")


def aggregate_artifact(company_id,source_company_id, test=False):
    table_names = helper.get_table_names(test)

    # artifact
    conn = db.connect_torndb()
    sas = list(conn.query("select * from source_artifact where sourceCompanyId=%s", source_company_id))
    for sa in sas:
        if sa["active"] == "Y" or sa["active"] is None:
            if sa["domain"] is not None and sa["domain"].strip() != "":
                artifact = conn.get("select * from " + table_names["artifact"] + " where companyId=%s and type=%s and domain=%s limit 1",
                                    company_id, sa["type"], sa["domain"])
            else:
                artifact = conn.get("select * from " + table_names["artifact"] + " where companyId=%s and type=%s and link=%s limit 1",
Example #27
from pymongo import MongoClient
import pymongo
from bson.objectid import ObjectId

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper, config, util, url_helper
import db

#join amac for find_investor_alias_by_fund with active and verify
#todo query for 12010 but not active
#logger
loghelper.init_logger("amac_util", stream=True)
logger = loghelper.get_logger("amac_util")

# investor_alias amacType amacId
# investor_alias_candidate amacType amacId

def get_websit_domains(managerIds):
    domains = []
    mongo = db.connect_mongo()
    collection_manager = mongo.amac.manager
    for managerId in managerIds:
        manager = collection_manager.find_one({"_id": ObjectId(managerId)})
        if manager is not None and manager.has_key("domain") is True \
            and manager["domain"] is not None and manager["domain"].strip() != "" \
            and manager["domain"].strip() not in ["www.com","baidu.com"]:

            if manager["domain"] not in domains: domains.append(manager["domain"])
Example #28
import datetime
from pymongo import MongoClient
import pymongo
from bson.objectid import ObjectId
import amac_util
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper, config, util, url_helper
import db


#logger
loghelper.init_logger("amac_findy", stream=True)
logger = loghelper.get_logger("amac_findy")

# investor_alias amacType amacId
# investor_alias_candidate amacType amacId

#
# 拉萨合业投资管理有限公司
# 北京聚信远业投资咨询有限公司
# 北京云瀚锦科技中心(有限合伙)
# 天津红杉资本投资管理中心
# 北京红杉嘉禾资产管理中心(有限合伙)
# 北京红杉盛远管理咨询有限公司
# 红杉资本投资管理(天津)有限公司
# 北京锋业股权投资中心(有限合伙)
# 上海桓远投资管理有限公司
# 北京创想天地投资管理有限公司
Example #29
from pymongo import MongoClient
import pymongo
from kafka import (KafkaClient, SimpleProducer)

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import config
import loghelper
import my_request
import util

#logger
loghelper.init_logger("gen_messages", stream=True)
logger = loghelper.get_logger("gen_messages")

#mongo
(mongodb_host, mongodb_port) = config.get_mongodb_config()
mongo = MongoClient(mongodb_host, mongodb_port)

#kafka
(kafka_url) = config.get_kafka_config()
kafka = KafkaClient(kafka_url)
# HashedPartitioner is default
kafka_producer = SimpleProducer(kafka)

#
company_collection = mongo.crawler_v2.company

if __name__ == "__main__":
Example #30
# -*- coding: utf-8 -*-
import os, sys

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, config
import db

#logger
loghelper.init_logger("score_2_list", stream=True)
logger = loghelper.get_logger("score_2_list")


def process(score, name):
    scores = conn.query(
        "select * from deal_user_score where userId=%s and score=%s", user_id,
        score)
    if len(scores) > 0:
        mylist = conn.get(
            "select * from mylist  where createUser=%s and name=%s", user_id,
            name)
        if mylist is None:
            mylistId = conn.insert(
                "insert mylist(name,isPublic,active,createTime,createUser) values(%s,'N','Y',now(),%s)",
                name, user_id)
        else:
            mylistId = mylist["id"]
        umr = conn.get(
            "select * from user_mylist_rel where userId=%s and mylistId=%s",
Example #31
reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, db, util, url_helper

#logger
loghelper.init_logger("crawler_miit", stream=True)
logger = loghelper.get_logger("crawler_miit")


class miitCrawler(BaseCrawler.BaseCrawler):
    def __init__(self, max_crawl=1, timeout=30, use_proxy=False):
        BaseCrawler.BaseCrawler.__init__(self,
                                         max_crawl=max_crawl,
                                         timeout=timeout,
                                         use_proxy=use_proxy)

        self._post_url = 'http://www.miitbeian.gov.cn/icp/publish/query/icpMemoInfo_searchExecute.action'

        self.token = None
        self.jsessionid = None
        self._jsl_clearance = None
        self._jsluid = None
Example #32
import config
import loghelper
import util

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import proxy_pool

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

#logger
loghelper.init_logger("crawler_gonshang_qixinbao", stream=True)
logger = loghelper.get_logger("crawler_gonshang_qixinbao")


class SocksiPyConnection(httplib.HTTPConnection):
    def __init__(self,
                 proxytype,
                 proxyaddr,
                 proxyport=None,
                 rdns=True,
                 username=None,
                 password=None,
                 *args,
                 **kwargs):
        self.proxyargs = (proxytype, proxyaddr, proxyport, rdns, username,
                          password)
        httplib.HTTPConnection.__init__(self, *args, **kwargs)
Example #33
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../support'))
import loghelper
import util, name_helper, url_helper, download
import db

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import parser_db_util

# logger
loghelper.init_logger("xtecher_company_repair", stream=True)
logger = loghelper.get_logger("xtecher_company_repair")


def run():
    conn = db.connect_torndb()
    sql = '''select name,fullname,sourceid,id  from source_company where source=13821
    '''
    results = conn.query(sql)  # TODO
    conn.close()

    for c in results:
        if c['fullname'] is not None and not name_helper.name_check(
                c['fullname'])[1] == True:
            logger.info('%s not company', c['fullname'])

            conn = db.connect_torndb()
Example #34
    # action: create, delete
    msg = {"source": action, "id": company_id, "detail": source}
    flag = False
    while flag is False:
        try:
            kafkaProducer.send_messages("task_company", json.dumps(msg))
            flag = True
        except Exception, e:
            logger.exception(e)
            time.sleep(60)


#logger
loghelper.init_logger("sh_import", stream=True)
logger = loghelper.get_logger("sh_import")


def insert(shortname, name, brief, fullNames):
    name = name.replace("(开业)", "")
    sourceId = util.md5str(name)
    sid = parser_db_util.save_company_yitai(shortname, name, 13100, sourceId,
                                            brief)
    # logger.info("sid:%s->sourceId:%s",sid, sourceId)
    parser_db_util.save_source_company_name(sid, shortname, 12020)
    for fullName in [name] + fullNames:
        parser_db_util.save_source_company_name(sid, fullName, 12010)

    return sid

Example #35
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import db, config, util
import loghelper

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import proxy_pool


sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

#logger
loghelper.init_logger("crawler_lagou_job", stream=True)
logger = loghelper.get_logger("crawler_lagou_job")


class LagouJobCrawler():
    def __init__(self, timeout=20):
        self.timeout=timeout
        self.opener = None
        self.socks_proxy = {"type": "socks4", "ip": "180.173.153.98", "port": 1080}

    def is_crawl_success(self, url, content):
        if content.find('操作成功') == -1:
            logger.info(content)
            return False
        r = "companyId=(.*?)&pageSize"
        result = util.re_get_result(r, url)
        (id,) = result
Example #36
def power(config):
    """Reads 'time series' from netcdf time series file, and adds power as a variable. """
    
    if __name__ == "__main__":
        logger = loghelper.create_logger(config)
    else:
        logger = loghelper.get_logger(config['log.name'])
    
    # Number of samples to use should be in here
    # Whether to normalise power should be in here    
    pnorm           = config['pnorm']
    pdist           = config['pdist']
    sstd            = config['sstd']
    dstd            = config['dstd']
    pquants         = config['pquants']
    quantiles       = np.array(pquants)
    
    
    logger.debug(pnorm)

    # pdist is the number of samples drawn per timestep when building the
    # power distribution; note that n stays undefined if pdist is not set.
    if pdist:
        n = pdist

    grid_id         = config['grid_id']
    init_time       = config['init_time']
    pcurve_dir      = config['pcurve_dir']
    ts_dir          = config['tseries_dir']
    
    tseries_file    = expand(config['tseries_file'], config)
    power_file      = expand(config['power_file'], config)

    logger.info('Estimating power from time series: %s ' % tseries_file)
    logger.info('Writing power time series to: %s ' % power_file)
    
    dataset_in = Dataset(tseries_file, 'a')

    # Get dimensions
    dims    = dataset_in.dimensions
    ntime   = len(dims['time'])
    nloc    = len(dims['location'])
    nheight = len(dims['height'])
    loc_str_len = len(dims['loc_str_length'])
    
    # Get coordinate variables
    nctime    = dataset_in.variables['time']
    datetimes = netcdftime.num2date(nctime, nctime.units)
    location = [''.join(l.filled(' ')).strip() for l in dataset_in.variables['location']]
    height   = dataset_in.variables['height']

    # Get attributes
    metadata = config['metadata']

    
    if power_file == tseries_file:
        dataset_out = dataset_in
    else:
        dataset_out = Dataset(power_file, 'w')

    # Get number of quantiles
    nq    = len(quantiles)
    pdata = np.ma.zeros((ntime,nloc,nheight,nq+1), np.float) # mean will be 1st value
    
    use_locs = []
    for l,loc in enumerate(location):
    
        pcurve_file = '%s/%s.csv' %(pcurve_dir, loc)
        
        # mask power data if no power curve found for this park
        if not os.path.exists(pcurve_file):
            #logger.debug("Power curve: %s not found, skipping" % pcurve_file)
            pdata[:,l,:,:] = np.ma.masked
            continue
        
        logger.info('Predicting power output for %s' % loc )
        #
        # Open power curve
        #
        use_locs.append(l)
        pcurve = from_file(pcurve_file)

    
        for h in range(nheight):
            speed     = dataset_in.variables['SPEED'][:,l,h]
            direction = dataset_in.variables['DIRECTION'][:,l,h]
            
            #pwr = pcurve.power(speed,direction)
    
            # pdist will create a distribution for each timetep based on sampling
            # n times from a normal distribution. 
            pdist   = pcurve.power_dist(speed, direction, sstd=sstd,dstd=dstd,n=n, normalise=pnorm)
            pmean   = np.mean(pdist, axis=1)
            pquants = scipy.stats.mstats.mquantiles(pdist, prob=quantiles/100.0,axis=1, alphap=0.5, betap=0.5)
            

            pdata[:,l,h,0]  = pmean
            pdata[:,l,h,1:] = pquants[:,:]

        logger.info('finished %s' % loc)

    use_inds = np.array(use_locs)
    logger.debug(use_inds)
    logger.debug(pdata.shape)
    logger.debug(pdata[:,use_inds,:,:].shape)

    if dataset_out != dataset_in:

        dataset_out.createDimension('time', None)
        dataset_out.createVariable('time', 'float', ('time',))
        dataset_out.variables['time'][:] = nctime[:]
        dataset_out.variables['time'].units = nctime.units
        dataset_out.variables['time'].calendar = nctime.calendar
        
        
        dataset_out.createDimension('location', len(use_locs))
        dataset_out.createDimension('loc_str_length', loc_str_len)
        
        loc_data =np.array([list(l.ljust(loc_str_len, ' ')) for l in location])
        dataset_out.createVariable('location', 'c', ('location', 'loc_str_length'))
        dataset_out.variables['location'][:] = loc_data[use_inds,:]
        
        dataset_out.createDimension('height', nheight)        
        dataset_out.createVariable('height', 'i', ('height',))
        dataset_out.variables['height'][:] = height[:]
        dataset_out.GRID_ID = dataset_in.GRID_ID
        dataset_out.DX = dataset_in.DX
        dataset_out.DY = dataset_in.DY
        
        try:
            dataset_out.variables['height'].units = height.units
        except Exception:
            logger.warn("height units missing")
        
        
        pdata = pdata[:, use_inds, :, :]
        for key in metadata.keys():
            key = key.upper()
            logger.debug(key)
            dataset_out.setncattr(key,dataset_in.getncattr(key))

    pavg    = dataset_out.createVariable('POWER','f',('time','location','height'))
    pavg.units = 'kW'
    pavg.description = 'forecast power output'
    pavg[:] = pdata[:,:,:,0]

    
    for q, qval in enumerate(quantiles):

        varname = 'POWER.P%02d' % qval
        logger.debug("creating variable %s" % varname)
        var  = dataset_out.createVariable(varname,'f',('time','location','height'))
        if pnorm:
            var.units = 'ratio'
        else:
            var.units = 'kW'
        var.description = 'forecast power output'
        logger.debug(pdata[:,:,:,q+1])
        var[:] = pdata[:,:,:,q+1]

    dataset_in.close()
    if dataset_out!=dataset_in:
        dataset_out.close()
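
# A compressed, standalone sketch of the sampling-and-quantile step inside
# power() above: forecast speeds are perturbed with normally distributed
# noise, pushed through a toy linear power curve, and summarised by the mean
# and percentiles. The curve and every number here are made up for
# illustration; the real code uses the per-site curve loaded by from_file().
import numpy as np
import scipy.stats

ntime, n = 4, 200                          # timesteps, samples per timestep
quantiles = np.array([10.0, 50.0, 90.0])   # e.g. P10/P50/P90, like pquants
speed = np.array([4.0, 7.0, 10.0, 13.0])   # forecast wind speed [m/s]
sstd = 1.0                                 # assumed speed standard deviation

# one sampled speed distribution per timestep: shape (ntime, n)
samples = speed[:, None] + np.random.normal(0.0, sstd, size=(ntime, n))

# toy power curve: linear between cut-in (3 m/s) and rated (12 m/s), 2000 kW
pwr = np.clip((samples - 3.0) / (12.0 - 3.0), 0.0, 1.0) * 2000.0

pmean = np.mean(pwr, axis=1)                                    # shape (4,)
pquants = scipy.stats.mstats.mquantiles(pwr, prob=quantiles / 100.0,
                                        axis=1, alphap=0.5, betap=0.5)
print(pquants.shape)                                            # (4, 3)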
Beispiel #37
0
reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db, extract

#logger
loghelper.init_logger("cninfo", stream=True)
logger = loghelper.get_logger("cninfo")

rmap = [
    {
        "type": 1,
        "typeDesc": "资产负债表",
        "ue": "balancesheet",
    },
    {
        "type": 2,
        "typeDesc": "利润表",
        "ue": "incomestatements",
    },
    {
        "type": 3,
        "typeDesc": "现金流量表",
Beispiel #38
0
import os, sys
import datetime, time
import json
import pymongo
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import util
import config
import db
import loghelper

#logger
loghelper.init_logger("investor_ranking", stream=True)
logger = loghelper.get_logger("investor_ranking")

conn = None


def get_today_date():
    today = datetime.datetime.now()
    tomorrow = today + datetime.timedelta(days=1)
    start_date = "%s-%s-%s" % (today.year, today.month, today.day)
    end_date = "%s-%s-%s" % (tomorrow.year, tomorrow.month, tomorrow.day)
    return start_date, end_date
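
# Note: the "%s-%s-%s" formatting above yields un-padded months and days
# (e.g. "2023-1-5"). A strftime-based variant, shown here only as an
# alternative sketch, gives zero-padded "YYYY-MM-DD" strings instead:
def get_today_date_padded():
    today = datetime.datetime.now()
    tomorrow = today + datetime.timedelta(days=1)
    return today.strftime("%Y-%m-%d"), tomorrow.strftime("%Y-%m-%d")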


def get_thisweek_date():
    date1 = datetime.date.today() - datetime.timedelta(
        days=datetime.date.today().weekday())
Beispiel #39
0
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_feixiaohao", stream=True)
logger = loghelper.get_logger("crawler_feixiaohao")

NEWSSOURCE = "feixiaohao"

URLS = []
CURRENT_PAGE = 1
linkPattern = "feixiaohao.com/currencies"
Nocontents = []
columns = [
    # {"column": "jmd", "max": 2},
    {
        "column": "None",
        "max": 30
    },
]
SOURCE = 13511
Beispiel #40
0
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import util
import proxy_pool
import db

#logger
loghelper.init_logger("appstore_rank_2", stream=True)
logger = loghelper.get_logger("appstore_rank_2")

#mongo

mongo = db.connect_mongo()
appstore_rank_collection = mongo.trend.appstore_rank

total = 0

types = {
    "free": 27,
    "charge": 30,
    "grossing": 38,
}

genres = [
Beispiel #41
0
# -*- coding: utf-8 -*-
# tag tasks
import os, sys
import time

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper
import db, datetime

# logger
loghelper.init_logger("export_audit", stream=True)
logger = loghelper.get_logger("export_audit")

rmap = {
    1000: '未融资',  # not yet funded
    1010: '天使轮',  # angel round
    1011: '天使轮',  # angel round
    1020: 'pre-A',
    1030: 'A',
    1031: 'A+',
    1039: 'Pre-B',
    1040: 'B',
    1041: 'B+',
    1050: 'C',
    1060: 'D',
    1070: 'E',
    1080: 'F',
    1090: '后期阶段',  # later stage
Beispiel #42
0
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../support'))
import loghelper, config
import db, name_helper, url_helper
import json, traceback, time, util
from bson.objectid import ObjectId
import random

# logger
loghelper.init_logger("mt_cnt", stream=True)
logger = loghelper.get_logger("mt_cnt")

mongo = db.connect_mongo()
collection = mongo['open-maintain'].task
collectionUser = mongo['open-maintain'].user
conn = db.connect_torndb()


def start_run():
    while True:
        taskCnt = list(mongo['open-maintain'].task.aggregate([{
            '$match': {
                'taskUser': -666,
                'active': "Y"
            }
        }, {
Beispiel #43
0
def get_logger():
    return loghelper.get_logger(LOGGER)
Beispiel #44
0
import loghelper, extract, db, util, url_helper, download, traceback_decorator

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import proxy_pool

#logger
loghelper.init_logger("crawler_scmp_news", stream=True)
logger = loghelper.get_logger("crawler_scmp_news")

NEWSSOURCE = "scmp"
RETRY = 3
TYPE = 60004
SOURCE = 13887
URLS = []
CURRENT_PAGE = 1
linkPattern = ".*?/\d+/.*"
Nocontents = []
columns = [
    {
        "column": "news",
        "max": 1
    },
]
Beispiel #45
0
# -*- coding: utf-8 -*-
import os, sys
from BaseCrawler import BaseCrawler
from pyquery import PyQuery as pq

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper

#logger
loghelper.init_logger("crawler_itjuzi_investfirm", stream=True)
logger = loghelper.get_logger("crawler_itjuzi_investfirm")

SOURCE = 13030  #ITJUZI
TYPE = 36004  # 投资个人 (individual investor)


class ItjuziCrawler(BaseCrawler):
    def __init__(self, start):
        BaseCrawler.__init__(self, header=True)
        self.set_start(start)

    def set_start(self, start):
        self.current = start
        self.latest = start

    def get_url(self):
        key = str(self.current)
Beispiel #46
0
def melt(ncfiles, vars=None, global_atts=None,var_atts=None, coord_vars=None, missing=None):

    """ Build a (molten) Pandas DataFrame from a series of netcdf files. This is a flexible, but very 
    memory-inneficient data structure, so be careful calling this with large netcdf files.
    
    Arguments:
      ncfiles     -- the input filenames
      vars        -- the variables to read, if None all variables in files read
      var_atts    -- variable attributes to include in each line of output, default all
      global_atts -- global attributes to include in each row of output
      coord_vars  -- variables to treat as coordinates, if None will use variables with 
                     the same name as dimensions"""

    logger = loghelper.get_logger(LOGGER)
    frames = []  # note: unused; results are accumulated in 'series' below

    if len(ncfiles)==1:
        dataset = Dataset(ncfiles[0])
    else:
        dataset = MFDataset(ncfiles)

    coord_vars = get_coordinate_vars(dataset, coord_vars)
    variables = dataset.variables
    
    # get global attributes in dataset
    # shouldn't really use this, but it works
    dataset_atts = dataset.__dict__

    use_global_atts = _lookup(global_atts, dataset_atts, missing)

    # if no vars specified, use all in ncfiles
    if vars is None or vars == ["all"]:
        vars = list(variables.keys())

    # variables are a function of var(reftime,leadtime,height,location)
    # or var(reftime,leadtime,location)
    usevars = [v for v in vars if v not in coord_vars]
    
   
    logger.debug("usevars: %s" % usevars)

    # There must be a clean way of doing this in a general 
    # way, but I don't have the time to code this properly,
    # so I'm looping over fixed and hard-coded dimension names
    
    location = coord_vars['location']
    reftime  = coord_vars['reftime']
    leadtime = coord_vars['leadtime']
    height   = coord_vars['height']
    #lat      = coord_vars['lat']
    #lon      = coord_vars['lon']
    
    nloc = len(location)
    nreftime = len(reftime)
    nleadtime = len(leadtime)
    
    # dimension order is reftime, leadtime, location, height
    # or reftime, leadtime, location
    vars2D = [v for v in usevars if len(variables[v].shape)==3]
    vars3D = [v for v in usevars if len(variables[v].shape)==4]
    
    series = []
    
    for v in vars2D:
        vname = v
        variable = variables[v]

        use_var_atts = _lookup(var_atts, variable.__dict__, missing)
        
        factors = [reftime, leadtime, [HGT2DNUM], location, [vname]] + map(_listify, use_global_atts.values()) + map(_listify,use_var_atts.values())
        names = ['reftime', 'leadtime', 'height', 'location','variable'] + use_global_atts.keys() + use_var_atts.keys()
        
        index = pd.MultiIndex.from_product(factors, names=names)
        #index = pd.MultiIndex.from_tuples([(ref,lead,loc,HGT2DNUM,vname) for ref in reftime for lead in leadtime for loc in location], names=['reftime', 'leadtime', 'location', 'height','variable'])
        
        if type(variable[:]) == np.ma.core.MaskedArray:
            data = variable[:].flatten().filled(np.nan).astype(np.float)
        else:
            data = variable[:].flatten().astype(np.float)

        series.append( pd.Series(data=data, index=index, name='value'))

    for v in vars3D:
        variable = variables[v]
        vname = v
        use_var_atts = _lookup(var_atts, variable.__dict__, missing)
        for h,hgt in enumerate(height):
            subvar = variable[:,:,:,h]
            vname = "%s.%03d" % (v,hgt)
            vname = v
            factors = [reftime, leadtime, [hgt], location, [vname]] + map(_listify, use_global_atts.values()) + map(_listify,use_var_atts.values())
            names = ['reftime', 'leadtime', 'height', 'location','variable'] + use_global_atts.keys() + use_var_atts.keys()
            index = pd.MultiIndex.from_product(factors, names=names)
            #index = pd.MultiIndex.from_tuples([(ref,lead,loc,hgt,vname) for ref in reftime for lead in leadtime for loc in location], names=['reftime', 'leadtime', 'location','height', 'variable'])
            if type(subvar) == np.ma.core.MaskedArray:
                data = subvar[:].flatten().filled(np.nan).astype(np.float)
            else:
                data = subvar[:].flatten().astype(np.float)
            
            series.append(pd.Series(data=data, index=index, name='value'))
    
    
    # this is molten data, to use Hadley Wickham's terminology
    # or perhaps 5th normal form?
    result = pd.concat(series, axis=0).reset_index()
    return result
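
# A toy, standalone illustration of the molten layout melt() produces: a tiny
# synthetic array is flattened against a MultiIndex built with
# pd.MultiIndex.from_product, giving one 'value' row per
# (reftime, leadtime, height, location, variable) combination. All data here
# is made up.
import numpy as np
import pandas as pd

reftime  = [np.datetime64("2016-01-01T00:00")]
leadtime = [0, 3, 6]                  # hours
height   = [80]                       # metres
location = ["park_A", "park_B"]

# synthetic SPEED values with shape (reftime, leadtime, location)
speed = np.arange(6, dtype=float).reshape(1, 3, 2)

index = pd.MultiIndex.from_product(
    [reftime, leadtime, height, location, ["SPEED"]],
    names=["reftime", "leadtime", "height", "location", "variable"])

molten = pd.Series(speed.flatten(), index=index, name="value").reset_index()
print(molten)   # one row per coordinate combination, e.g. leadtime 0 / park_A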