Example #1
0
def savedata():
    """Bulk-insert the records of every ``.json`` file in ``./leaveinfo``.

    For each file whose extension is exactly ``.json``, the file is loaded
    with ``utils.getjsondata`` and one row per record is inserted into
    ``spider_data.v_list``.  Each row is the record's values followed by the
    file's base name (used as ``v_type``) and the constant source ``'ppx'``.

    The base name is also forwarded as the third argument of
    ``db.executemany``.
    """
    basesql = 'insert into spider_data.v_list (title, url, v_url,v_type, v_source) values (%s, %s, %s, %s, %s)'
    lpath = './leaveinfo'
    for filename in os.listdir(lpath):
        basename, ext = os.path.splitext(filename)
        # Compare the real extension: a substring search for '.json' would
        # also pick up names such as 'x.jsonl' or 'x.json.bak'.
        if ext != '.json':
            continue
        data = utils.getjsondata(os.path.join(lpath, filename))
        # NOTE(review): assumes each record is a mapping whose values come in
        # (title, url, v_url) order -- confirm against the JSON producer.
        sqldata = [tuple(record.values()) + (basename, 'ppx') for record in data]
        db.executemany(basesql, sqldata, basename)
Example #2
0
def parse(fname):
    """Load a delimited text file into the ``alexandria`` table in batches.

    The table is created via ``create_table``, the first line of ``fname`` is
    skipped, and every remaining line is converted to a parameter tuple by
    ``parse_line``.  Tuples are upserted (update on ``sid``/``ts``, insert
    otherwise) through ``executemany`` in batches of ``batch_size`` rows, and
    the rows left over after the last full batch are submitted as well.

    Args:
        fname: path of the input file.
    """

    name = 'alexandria'
    create_table(name)

    sql = '''
    WITH upsert AS (
        UPDATE ''' + name + ''' 
            SET day = DATE(%s)
            , val = %s
            , storyid = %s
            , country = %s
            , sentiment = %s
            , confidence = %s
            , novelty = %s
            , subjects = %s
            , relevance = %s
        WHERE 
            sid = (SELECT sid FROM security_master WHERE symbol = %s LIMIT 1)
            AND ts = %s
        RETURNING *
    ) 
    INSERT INTO ''' + name + ''' (
        sid, day, val, storyid, ts, country, sentiment, confidence, novelty,
        subjects, relevance )
    SELECT 
        (SELECT sid FROM security_master WHERE symbol = %s LIMIT 1)
        , DATE(%s), %s, %s, %s, %s, %s, %s, %s, %s, %s
    WHERE NOT EXISTS (SELECT 1 FROM upsert);
    '''

    batch_size = 10000

    def submit(batch):
        # Send one batch and report how long the round trip took.
        start = time.time()
        executemany(sql, batch, info=cs)
        print('Submitted %d rows in %f seconds' % (len(batch),
                                                   time.time() - start))

    args = []
    # The context manager guarantees the file is closed, even on error.
    with open(fname, 'r') as handle:
        # Skip the first line; the default keeps an empty file from raising.
        next(handle, None)
        for line in handle:
            args.append(parse_line(line))
            if len(args) >= batch_size:
                submit(args)
                args = []

    # Flush the final partial batch, which would otherwise be dropped.
    if args:
        submit(args)
def parse(fname):
    """Load a delimited text file into the ``alexandria`` table in batches.

    The table is created via ``create_table``, the first line of ``fname`` is
    skipped, and every remaining line is converted to a parameter tuple by
    ``parse_line``.  Tuples are upserted (update on ``sid``/``ts``, insert
    otherwise) through ``executemany`` in batches of ``batch_size`` rows, and
    the rows left over after the last full batch are submitted as well.

    Args:
        fname: path of the input file.
    """

    name = 'alexandria'
    create_table(name)

    sql = '''
    WITH upsert AS (
        UPDATE ''' + name + ''' 
            SET day = DATE(%s)
            , val = %s
            , storyid = %s
            , country = %s
            , sentiment = %s
            , confidence = %s
            , novelty = %s
            , subjects = %s
            , relevance = %s
        WHERE 
            sid = (SELECT sid FROM security_master WHERE symbol = %s LIMIT 1)
            AND ts = %s
        RETURNING *
    ) 
    INSERT INTO ''' + name + ''' (
        sid, day, val, storyid, ts, country, sentiment, confidence, novelty,
        subjects, relevance )
    SELECT 
        (SELECT sid FROM security_master WHERE symbol = %s LIMIT 1)
        , DATE(%s), %s, %s, %s, %s, %s, %s, %s, %s, %s
    WHERE NOT EXISTS (SELECT 1 FROM upsert);
    '''

    batch_size = 10000

    def submit(batch):
        # Send one batch and report how long the round trip took.
        start = time.time()
        executemany(sql, batch, info=cs)
        print('Submitted %d rows in %f seconds' % (len(batch), time.time() - start))

    args = []
    # The context manager guarantees the file is closed, even on error.
    with open(fname, 'r') as handle:
        # Skip the first line; the default keeps an empty file from raising.
        next(handle, None)
        for line in handle:
            args.append(parse_line(line))
            if len(args) >= batch_size:
                submit(args)
                args = []

    # Flush the final partial batch, which would otherwise be dropped.
    if args:
        submit(args)
def parse(fname):
    """Ingest one XML report file into the database.

    Processing order:
      1. Strip suffixes previously appended to the file name and rename the
         file back to its stripped name.
      2. Read RepNo, company name, exchange code and the SEDOL / Ticker /
         DisplayRIC cross references.  If any is empty the file is renamed
         with the ``FIELD`` suffix and processing stops.
      3. Map the exchange code through ``lookup``; unknown exchanges get the
         ``EXCHANGE`` suffix and processing stops.
      4. Update the matching ``security_master`` row (update only, no insert).
      5. Files whose reporting currency is not ``'USD'`` get the ``CURRENCY``
         suffix and processing stops.
      6. Upsert one ``filter_descriptions`` row per Layout/MapItem and create
         the corresponding filter table.
      7. For each Period (only the first when ``onlyMostRecent`` is set),
         upsert every FV value into its per-COA table for each day found in
         ``days`` between the period's end date and the previous period's
         end date (today for the first period).
      8. Rename the file with the ``EXT`` suffix.

    Relies on module-level names ``db``, ``lookup``, ``source``,
    ``onlyMostRecent``, ``FIELD``, ``EXCHANGE``, ``CURRENCY`` and ``EXT``.

    Args:
        fname: path of the XML file to ingest.
    """

    print(fname)

    # remove any previously appended extensions
    # Only dots in the base name are counted.  Counting dots in the whole
    # path would strip the real extension for './dir/x.xml' and would never
    # terminate for '../dir/x.xml', because splitext leaves such a path
    # unchanged once the base name has no extension left.
    originalName = fname
    while os.path.basename(fname).count('.') > 1:
        fname = os.path.splitext(fname)[0]
    os.rename(originalName, fname)

    tree = ET.parse(fname)
    root = tree.getroot()

    # for reference

    # NOTE(review): if any of these elements is absent, find() presumably
    # returns None and the attribute access fails before the missing-field
    # check below is reached -- confirm whether that can happen in practice.
    repno = root.find('RepNo').text
    description = root.find('ReferenceInformation/CompanyInformation/Company').get('Name')
    exchange = root.find('ReferenceInformation/Issues/Issue/Exchange').get('Code')

    sedol = None
    symbol = None
    ric = None
    for issue in root.findall('ReferenceInformation/Issues/Issue/IssueXref'):
        if issue.get('Type') == 'SEDOL':
            sedol = issue.text
        elif issue.get('Type') == 'Ticker':
            symbol = issue.text
        elif issue.get('Type') == 'DisplayRIC':
            ric = issue.text

    # every reference field is required; tag the file and stop otherwise
    for i in [ repno, description, exchange, sedol, symbol, ric ]:
        if not i:
            print("Missing field.. skipping %s" % fname)
            os.rename(fname, fname + FIELD)
            return

    name = None
    if exchange and symbol:
        exch = lookup.get(exchange)
        if not exch:
            print("Unsupported exchange '%s'.. skipping %s" % (exchange, fname))
            os.rename(fname, fname + EXCHANGE)
            return
        else:
            exchange = exch
        # key used in the per-COA filter tables below
        name = '%s:%s' % (exchange, symbol)

    currency = root.find('ReferenceInformation/CompanyInformation/ReportingCurrency').get('Code')

    # only updates for now to ensure companies that don't have eoddata aren't inserted
    # sids,exchange,symbol get inserted by ingest_eoddata.py
    buf = '''
	UPDATE security_master 
	SET repno = %s
	, ric = %s
	, sedol = %s
	, display_name = %s
	WHERE exchange = %s AND symbol = %s
    '''
    args = (repno,ric,sedol,description,exchange,symbol)
    db.execute(buf, args)
    
    # only want companies reporting in USD 
    if currency != 'USD':
        print('Non USD currency field in %s .. skipping' % fname)
        os.rename(fname, fname + CURRENCY)
        return

    # delimiter for the statement type + coa combo
    delim = '_'

    # upsert filter_descriptions on filter name 
    buf = '''
    WITH upsert AS (
        UPDATE filter_descriptions
        SET description = %s, source = %s
        WHERE filter = %s
        RETURNING *
    )   INSERT INTO filter_descriptions (filter,description,source)
        SELECT %s,%s,%s
        WHERE NOT EXISTS (SELECT 1 FROM upsert);
    '''
    args = []
    for layout in root.iter('Layout'):
        layoutType = layout.get('Type')
        for mapitem in layout.iter('MapItem'):
            coa = layoutType + delim + mapitem.get('COA')
            coa = coa.upper()
            description = mapitem.text
            # parameters appear twice: once for the UPDATE, once for the INSERT
            args.append((description, source, coa, coa, description, source))

            # create the filter table if doesn't exist
            db.createFilterTable(coa)

    db.executemany(buf, sorted(args)) 

    # periodEnd is most recent date the current statement applies to. 
    # Because we iterate from newest to oldest, we can save the periodStart 
    # from the previous statement and use it as the stop for the current statement.
    # We use today's date as the stop for the most recent statement.
    periodEnd = datetime.date.today().strftime('%Y-%m-%d')

    for i,period in enumerate(root.iter('Period')):

        # only process the most recent period
        if onlyMostRecent and i > 0:
            break

        periodStart = period.get('PeriodEndDate')

        # write one day,val of filter data for each day we have intraday data for
        buf = "SELECT DISTINCT day FROM days WHERE day BETWEEN %s AND %s"
        args = (periodStart, periodEnd)
        rows = db.execute(buf, args, returndata=True)

        # skip periods that don't have corresponding intraday data 
        if not rows:
            continue

        print('%s %s %s' % (fname, periodStart, periodEnd))
        days = [ x[0] for x in sorted(rows) ]

        for statement in period.iter('Statement'):
            statementType = statement.get('Type')
            for fv in statement.iter('FV'):
                # The table name cannot be a bound parameter, so it is
                # spliced into the SQL text; it comes from the XML file.
                coa = statementType + delim + fv.get('COA')
                val = fv.text

                # upsert on name,day
                # update the val for this table's name,day or insert if no val for this name,day
                buf = '''
                WITH upsert AS (
                    UPDATE ''' + coa + ''' 
                    SET val = %s 
                    WHERE name = %s AND day = %s
                    RETURNING *
                )
                    INSERT INTO ''' + coa + ''' (name,day,val) 
                    SELECT %s,%s,%s 
                    WHERE NOT EXISTS (SELECT 1 FROM upsert);
                '''
                    
                args = [(val, name, day, name, day, val) for day in days]
                # bc processed chronologically, 'Reclassified Normal' entries will be submitted
                # first and should NOT be replaced by 'Update Normal' entries
                db.executemany(buf, args)

        periodEnd = periodStart

    # rename file to help us track what files still need to be processed
    os.rename(fname, fname + EXT)
    print('Finished %s' % fname)
Example #5
0
def savetypeinfo(ctt):
    """Insert the rows in ``ctt`` into the ``articles`` table.

    ``ctt`` is handed to ``db.executemany`` unchanged; the statement has
    eleven placeholders, one per listed column.  Prints ``savetypeinfo``
    once the call returns.
    """
    insert_stmt = (
        'insert into articles (row_key, date, hot_news, lb_img, thumbnail_pics, '
        'source, topic, url, url_from, category, url_pv) '
        'values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    )
    db.executemany(insert_stmt, ctt)
    print('savetypeinfo')
Example #6
0
def savecontentdata(data):
    """Insert the rows in ``data`` into the ``article_contents`` table.

    ``data`` is handed to ``db.executemany`` unchanged; the statement has
    six placeholders, one per listed column.  Prints ``savecontentdata``
    once the call returns.
    """
    insert_stmt = (
        'insert into article_contents (row_key, date, topic, content, source, url) '
        'values (%s, %s, %s, %s, %s, %s)'
    )
    db.executemany(insert_stmt, data)
    print('savecontentdata')