def savedata():
    """Bulk-insert every ./leaveinfo/*.json file into spider_data.v_list.

    For each JSON file, every record's values are extended with
    (basename-of-file, 'ppx') to fill the v_type and v_source columns.

    Fixes vs. original:
    - the list comprehension reused ``i`` as its loop variable, shadowing
      the outer directory-entry loop variable;
    - ``path.find('.json') != -1`` matched the substring anywhere in the
      path (e.g. ``x.json.bak``); now the real extension is checked.
    """
    basesql = ('insert into spider_data.v_list '
               '(title, url, v_url,v_type, v_source) '
               'values (%s, %s, %s, %s, %s)')
    lpath = './leaveinfo'
    for fname in os.listdir(lpath):
        basename, ext = os.path.splitext(fname)
        # only process JSON exports; ignore anything else in the directory
        if ext != '.json':
            continue
        path = os.path.join(lpath, fname)
        data = utils.getjsondata(path)
        # NOTE(review): relies on each record dict preserving column order
        # (title, url, v_url) -- confirm against utils.getjsondata
        sqldata = [tuple(rec.values()) + (basename, 'ppx') for rec in data]
        db.executemany(basesql, sqldata, basename)
def parse(fname): name = 'alexandria' create_table(name) lines = (line for line in open(fname, 'r')) lines.next() sql = ''' WITH upsert AS ( UPDATE ''' + name + ''' SET day = DATE(%s) , val = %s , storyid = %s , country = %s , sentiment = %s , confidence = %s , novelty = %s , subjects = %s , relevance = %s WHERE sid = (SELECT sid FROM security_master WHERE symbol = %s LIMIT 1) AND ts = %s RETURNING * ) INSERT INTO ''' + name + ''' ( sid, day, val, storyid, ts, country, sentiment, confidence, novelty, subjects, relevance ) SELECT (SELECT sid FROM security_master WHERE symbol = %s LIMIT 1) , DATE(%s), %s, %s, %s, %s, %s, %s, %s, %s, %s WHERE NOT EXISTS (SELECT 1 FROM upsert); ''' batch_size = 10000 args = [] for line in lines: arg = parse_line(line) args.append(arg) if len(args) % batch_size == 0: start = time.time() executemany(sql, args, info=cs) print 'Submitted %d rows in %f seconds' % (batch_size, time.time() - start) args = []
def parse(fname):
    """Parse a company fundamentals XML report and upsert its reference
    data and per-period COA (chart-of-accounts) values into the database.

    Side effects: renames *fname* to tag the processing outcome (the
    module-level FIELD / EXCHANGE / CURRENCY / EXT suffixes) and writes
    to security_master, filter_descriptions, and one table per COA code
    via the module-level ``db`` handle.

    NOTE(review): indentation reconstructed from a whitespace-collapsed
    source -- verify nesting (esp. the ``name = ...`` assignment) against
    the canonical file.
    """
    print fname
    # remove any previously appended extensions
    originalName = fname
    while fname.count('.') > 1:
        fname, extension = os.path.splitext(fname)
    os.rename(originalName, fname)
    tree = ET.parse(fname)
    root = tree.getroot()
    # for reference
    repno = root.find('RepNo').text
    description = root.find('ReferenceInformation/CompanyInformation/Company').get('Name')
    exchange = root.find('ReferenceInformation/Issues/Issue/Exchange').get('Code')
    sedol = None
    symbol = None
    ric = None
    # collect the issue's cross-reference identifiers by type
    for issue in root.findall('ReferenceInformation/Issues/Issue/IssueXref'):
        if issue.get('Type') == 'SEDOL':
            sedol = issue.text
        elif issue.get('Type') == 'Ticker':
            symbol = issue.text
        elif issue.get('Type') == 'DisplayRIC':
            ric = issue.text
    # every identifier is required; tag and skip incomplete files
    for i in [ repno, description, exchange, sedol, symbol, ric ]:
        if not i:
            print "Missing field.. skipping %s" % fname
            os.rename(fname, fname + FIELD)
            return
    name = None
    if exchange and symbol:
        # map the vendor exchange code to our internal code via ``lookup``
        exch = lookup.get(exchange)
        if not exch:
            print "Unsupported exchange '%s'.. skipping %s" % (exchange, fname)
            os.rename(fname, fname + EXCHANGE)
            return
        else:
            exchange = exch
        name = '%s:%s' % (exchange, symbol)
    currency = root.find('ReferenceInformation/CompanyInformation/ReportingCurrency').get('Code')
    # only updates for now to ensure companies that don't have eoddata aren't inserted
    # sids,exchange,symbol get inserted by ingest_eoddata.py
    buf = '''
    UPDATE security_master
       SET repno = %s
         , ric = %s
         , sedol = %s
         , display_name = %s
     WHERE exchange = %s
       AND symbol = %s
    '''
    args = (repno,ric,sedol,description,exchange,symbol)
    db.execute(buf, args)
    # only want companies reporting in USD
    if currency != 'USD':
        print 'Non USD currency field in', fname,'.. skipping'
        os.rename(fname, fname + CURRENCY)
        return
    # delimiter for the statement type + coa combo
    delim = '_'
    # upsert filter_descriptions on filter name
    buf = '''
    WITH upsert AS (
        UPDATE filter_descriptions
           SET description = %s, source = %s
         WHERE filter = %s
     RETURNING *
    )
    INSERT INTO filter_descriptions (filter,description,source)
    SELECT %s,%s,%s
     WHERE NOT EXISTS (SELECT 1 FROM upsert);
    '''
    args = []
    for layout in root.iter('Layout'):
        layoutType = layout.get('Type')
        for mapitem in layout.iter('MapItem'):
            # filter key is STATEMENTTYPE_COA, upper-cased
            coa = layoutType + delim + mapitem.get('COA')
            coa = coa.upper()
            description = mapitem.text
            # NOTE(review): ``source`` is presumably a module-level constant -- confirm
            args.append((description, source, coa, coa, description, source))
            # create the filter table if doesn't exist
            db.createFilterTable(coa)
    db.executemany(buf, sorted(args))
    # periodEnd is most recent date the current statement applies to.
    # Because we iterate from newest to oldest, we can save the periodStart
    # from the previous statement and use it as the stop for the current statement.
    # We use today's date as the stop for the most recent statement.
    periodEnd = datetime.date.today().strftime('%Y-%m-%d')
    for i,period in enumerate(root.iter('Period')):
        # only process the most recent period
        if onlyMostRecent and i > 0:
            break
        periodStart = period.get('PeriodEndDate')
        # write one day,val of filter data for each day we have intraday data for
        buf = "SELECT DISTINCT day FROM days WHERE day BETWEEN %s AND %s"
        args = (periodStart, periodEnd)
        rows = db.execute(buf, args, returndata=True)
        # skip periods that don't have corresponding intraday data
        if not rows:
            continue
        print fname,periodStart,periodEnd
        days = [ x[0] for x in sorted(rows) ]
        for statement in period.iter('Statement'):
            statementType = statement.get('Type')
            for fv in statement.iter('FV'):
                coa = statementType + '_' + fv.get('COA')
                val = fv.text
                # upsert on name,day
                # update the val for this table's name,day or insert if no val for this name,day
                buf = '''
                WITH upsert AS (
                    UPDATE ''' + coa + '''
                       SET val = %s
                     WHERE name = %s AND day = %s
                 RETURNING *
                )
                INSERT INTO ''' + coa + ''' (name,day,val)
                SELECT %s,%s,%s
                 WHERE NOT EXISTS (SELECT 1 FROM upsert);
                '''
                args = []
                for day in days:
                    args.append((val,name,day,name,day,val))
                # bc processed chronologically, 'Reclassified Normal' entries will be submitted
                # first and should NOT be replaced by 'Update Normal' entries
                db.executemany(buf, args)
        periodEnd = periodStart
    # rename file to help us track what files still need to be processed
    os.rename(fname, fname + EXT)
    print 'Finished',fname
def savetypeinfo(ctt):
    """Bulk-insert pre-built article-listing rows into the articles table.

    ``ctt`` is a sequence of 11-tuples matching the column list below.
    """
    insert_sql = (
        'insert into articles '
        '(row_key, date, hot_news, lb_img, thumbnail_pics, source, '
        'topic, url, url_from, category, url_pv) '
        'values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    )
    db.executemany(insert_sql, ctt)
    print('savetypeinfo')
def savecontentdata(data):
    """Bulk-insert article body rows into the article_contents table.

    ``data`` is a sequence of 6-tuples matching the column list below.
    """
    insert_sql = (
        'insert into article_contents '
        '(row_key, date, topic, content, source, url) '
        'values (%s, %s, %s, %s, %s, %s)'
    )
    db.executemany(insert_sql, data)
    print('savecontentdata')