def load_time_series(symbol, start_date=None, end_date=None, downsample_days=1):
    log.info("loading %s for %s to %s" % (symbol, start_date, end_date))
    filename = "%s.csv" % symbol
    if not os.path.exists(filename):
        fetch.fetch_data(symbol)
    data = pandas.read_csv(filename, parse_dates=True, index_col=0)
    data = data.drop(["Open", "High", "Low", "Close", "Volume"], axis=1)
    data = data.rename(columns={"Adj Close": symbol})
    data = data.sort()
    if data.index[0] > start_date:
        log.warning("no data for %s before %s" % (symbol, data.index[0]))
        return None
    data = data.truncate(before=start_date, after=end_date)
    log.info("%d rows after truncating" % len(data))
    # downsample if necessary
    if downsample_days > 1:
        drange = pandas.DateRange(start_date, end_date,
                                  offset=downsample_days * datetools.day)
        grouped = data.groupby(drange.asof)
        means = grouped.mean()
        log.info("%d rows after downsampling" % len(means))
        return means
    else:
        return data

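# Hypothetical usage sketch for load_time_series above. The ticker symbol, date
# range, and downsampling interval are illustrative assumptions, not taken from
# the original code; the function itself relies on the `log`, `fetch`, `pandas`,
# and `datetools` objects set up by its own module.
def example_load_time_series():
    import datetime
    prices = load_time_series("SPY",
                              start_date=datetime.datetime(2012, 1, 1),
                              end_date=datetime.datetime(2012, 12, 31),
                              downsample_days=5)
    if prices is not None:
        print(prices.head())
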
def get_new_data():
    print("++++++++++\nIn get_new_data module ...")
    for api in cfg.config['apis']:
        data = fetch.fetch_data(s_url=api['url'], l_filter=api['filter'])
        raw_posts = parse_feed(data)
        posts = munge_feed(raw_posts)
        db_insert(posts)
        time.sleep(1)

def get_weather(root):
    create_table()
    city = textField.get()
    api_data = fetch_data(city)
    if api_data['cod'] == '404':
        print('\n```````````````````````````````````````')
        print(city + ' ' + api_data['message'])
        print('```````````````````````````````````````')
        final_info = 'null' + '\n' + 'null'
        final_data = ('\nCity name: ' + 'null' + '\nCity id: ' + 'null' + '\n' +
                      '\nHumidity: ' + 'null' + '\nWind speed: ' + 'null' +
                      '\nVisibility: ' + 'null')
        label1.config(text=final_info)
        label2.config(text=final_data)
    else:
        weather_desc = str(api_data['weather'][0]['description'])
        temp_city = str(float(api_data['main']['temp']) - 273.15)[:4] + '°C'
        humid = str(api_data['main']['humidity']) + '%'
        wind_spd = str(api_data['wind']['speed']) + ' kmph'
        city_name = str(api_data['name'])
        city_id = str(api_data['id'])
        date_time = '[' + str(datetime.now().strftime("%d-%b-%Y | %I:%M %p")) + ']'
        visibility = str(float(api_data['visibility']) / 1000)[:4] + ' km'
        insert_into_table(city_id, city_name, date_time, temp_city,
                          weather_desc, wind_spd, humid, visibility)
        print('\n\n------------------------------------------------------------------------------------')
        print('Weather stats for -> {} | City-id : {} | {}'.format(
            city_name, city_id, date_time))
        print('------------------------------------------------------------------------------------\n')
        print('Current Temperature : {}'.format(temp_city))
        print('Weather Description : {}'.format(weather_desc))
        print('Wind Speed : {}'.format(wind_spd))
        print('Humidity : {}'.format(humid))
        print('Visibility : {}\n'.format(visibility))
        final_info = weather_desc + '\n' + temp_city
        final_data = ('\nCity name: ' + city_name + '\nCity id: ' + city_id +
                      '\n' + date_time + '\nHumidity: ' + humid +
                      '\nWind speed: ' + wind_spd + '\nVisibility: ' + visibility)
        label1.config(text=final_info)
        label2.config(text=final_data)

def get_new_data(page="default"):
    print("++++++++++\nIn get_new_data module ...")
    print("+++\nPage is: ", page)
    for api in cfg.config['apis'][page]:
        data = fetch.fetch_data(s_url=api['url'], l_filter=api['filter'])
        raw_posts = parse_feed(data)
        posts = munge_feed(raw_posts)
        # posts = filter_feed(raw_posts)
        db_insert(posts)
        time.sleep(1)
    expire()

def get_new_data(s_file_name):
    print("++++++++++\nIn get_new_data module ...")
    # need to select which apis(s) to check
    for api in cfg.config['apis'][s_file_name]:
        data = fetch.fetch_data(s_url=api, l_filter=cfg.config['apis']['filter'])
        raw_posts = parse_feed(data)
        # posts = munge_feed(raw_posts)
        posts = filter_feed(raw_posts)
        db_insert(posts)
        time.sleep(2)

def get_api():
    db = {}
    data = fetch.fetch_data(cfg.config['api'])
    # extract parties info
    parties_cp = data['Election']['Leading']['Party']
    parties = []
    party_list = ["LIB", "PC", "NDP", "GRN"]
    clr_list = {"LIB": "red", "PC": "blue", "NDP": "orange", "GRN": "green"}
    for x in parties_cp:
        if x['Name'] in party_list:
            obj = {}
            obj['Name'] = x['Name']
            obj['seats'] = x['Leading'] + x['Elected']
            obj['clr'] = clr_list[obj['Name']]
            parties.append(obj)
    db['parties'] = parties
    # print(data['Election']['Riding'])
    # extract ridings of interest info
    ridings_list = [
        "Hamilton Centre", "Hamilton Eastâ\x80\x94Stoney Creek",
        "Hamilton Mountain", "Hamilton Westâ\x80\x94Ancasterâ\x80\x94Dundas",
        "Flamboroughâ\x80\x94Glanbrook", "Burlington", "Niagara West",
        "Haldimandâ\x80\x94Norfolk", "Oakville Northâ\x80\x94Burlington",
        "Brantfordâ\x80\x94Brant"
    ]
    ridings_cp = [
        x for x in data['Election']['Riding'] if x['RNE'] in ridings_list
    ]
    ridings = []
    for x in ridings_cp:
        obj = {}
        obj['name'] = x['RNE'].replace("â\x80\x94", "–")
        obj['candidates'] = []
        for y in x['Candidate']:
            obj2 = {}
            obj2['name'] = y['FN'] + ' ' + y['LN']
            obj2['party'] = y['PE']
            obj2['votes'] = y['V']
            obj['candidates'].append(obj2)
        ridings.append(obj)
    db['ridings'] = ridings
    db['leaders'] = {
        "ford": "Won riding",
        "horwath": "Won riding",
        "schreiner": "Won riding",
        "wynne": "Won riding"
    }
    with io.open("cp.db", "w+", encoding='utf8') as file:
        file.write(json.dumps(db, ensure_ascii=False))
    return db

def test_fetch(self):
    """
    Tests TFL API for fetching data

    Args:
        Self
    Returns:
        None
    Raises:
        None
    """
    res = fetch_data()
    self.assertIsInstance(res, list)

def main():
    data = fetch_data()
    for d in data:
        print "Getting image for", d["EntryId"]
        url_complex = d[config.IMAGE_FIELD]
        try:
            url = url_complex.split("(")[1].split(")")[0]
            ext = url_complex.split("(")[0].split(".")[1]
            name = d["EntryId"]
            path = name + "." + ext
            r = urllib2.urlopen(url)
            with open(config.PROFILE_PICTURE_FOLDER + "/" + path, 'wb') as f:
                f.write(r.read())
        except Exception:
            print "***** No Image Found *****", d["EntryId"]

def test_dict(self):
    """
    Tests formatting of response data from TFL API

    Args:
        Self
    Returns:
        None
    Raises:
        None
    """
    res = fetch_data()
    data = data_for_display(res)
    self.assertIsNotNone(data)

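# A minimal sketch of how the two test methods above might be grouped in a
# unittest.TestCase; the class name is an assumption, not from the original
# test module, and fetch_data/data_for_display come from the code under test.
import unittest


class FetchDataTests(unittest.TestCase):

    def test_fetch_returns_list(self):
        # mirrors test_fetch: fetch_data() is expected to return a list
        self.assertIsInstance(fetch_data(), list)

    def test_display_data_is_not_none(self):
        # mirrors test_dict: formatting the response should yield a usable value
        self.assertIsNotNone(data_for_display(fetch_data()))
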
def main():
    data = fetch_data()
    counter = 1
    dumper = []
    page = 1
    for d in data:
        profile_photo_file = get_file_name_for_entry(d)  # extension
        dumper.append([d[config.NAME_FIELD], d[config.EMAIL_FIELD],
                       d[config.TWITTER_FIELD], d[config.DESCRIPTION_FIELD],
                       profile_photo_file])
        if counter % 5 == 0:
            p = ProfilePage()
            p.run(dumper)
            dumper = []
            p.save(page)
            page += 1
        counter += 1
    p = ProfilePage()
    p.run(dumper)
    dumper = []
    p.save(page)

def get_api():
    db = {}
    data = fetch.fetch_data(cfg.config['api'])
    # extract parties info
    parties_cp = data['Election']['Leading']['Party']
    parties = []
    party_list = ["LIB", "PC", "NDP", "GRN"]
    clr_list = {"LIB": "red", "PC": "blue", "NDP": "orange", "GRN": "green"}
    for x in parties_cp:
        if x['Name'] in party_list:
            print(x['Name'])
            obj = {}
            obj['Name'] = x['Name']
            obj['seats'] = x['Leading'] + x['Elected']
            obj['clr'] = clr_list[obj['Name']]
            parties.append(obj)
    db['parties'] = parties
    # print(data['Election']['Riding'])
    # extract ridings of interest info
    ridings_list = [
        "Hamilton Centre", "Hamilton Eastâ\x80\x94Stoney Creek",
        "Hamilton Mountain", "Hamilton Westâ\x80\x94Ancasterâ\x80\x94Dundas",
        "Flamboroughâ\x80\x94Glanbrook"
    ]
    ridings_cp = [
        x for x in data['Election']['Riding'] if x['RNE'] in ridings_list
    ]
    ridings = []
    for x in ridings_cp:
        obj = {}
        obj['name'] = x['RNE'].replace("â\x80\x94", "–")
        obj['candidates'] = []
        for y in x['Candidate']:
            obj2 = {}
            obj2['name'] = y['FN'] + ' ' + y['LN']
            obj2['party'] = y['PE']
            obj2['votes'] = y['V']
            obj['candidates'].append(obj2)
        ridings.append(obj)
    db['ridings'] = ridings
    return db

def index():
    data = fetch_data()
    display_data = data_for_display(data)
    write_data(display_data)
    return render_template("index.html", data=display_data)

def raw_data():
    data = fetch_data()
    return jsonify(data)

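# Hypothetical route wiring for the two Flask view functions above (index and
# raw_data). The app object and URL paths are assumptions; in the original
# project they are presumably bound with @app.route decorators instead.
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/", view_func=index)
app.add_url_rule("/raw", view_func=raw_data)
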
def fetch_hypos(synset):
    data = fetch.fetch_data(HYPO_URL % synset)
    return data.replace("-", "").split("\r\n")

def fetch_image_urls(synset):
    data = fetch.fetch_data(MAPPING_URL % synset)
    image_mappings = [y.split() for y in data.split("\r\n") if y]
    return image_mappings

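# Hypothetical usage sketch chaining fetch_hypos and fetch_image_urls above:
# the default synset id is an illustrative assumption, and each mapping line is
# assumed to split into an image id followed by its URL.
def example_list_images(root_synset="n02084071"):
    # skip empty entries produced by the trailing newline in the hyponym list
    for hypo_synset in filter(None, fetch_hypos(root_synset)):
        for mapping in fetch_image_urls(hypo_synset):
            image_id, image_url = mapping[0], mapping[-1]
            print(image_id, image_url)
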
#!/usr/bin/env python3 -W ignore::DeprecationWarning
import sys
import warnings

from fetch import fetch_data
from train import train_data

# suppress all warnings
warnings.filterwarnings("ignore")

if __name__ == "__main__":
    fetch_obj = fetch_data(data_type=0)
    test_obj = fetch_data(data_type=1)
    train_obj = train_data(fetch_obj.label_df, fetch_obj.pseudo_df,
                           test_obj.unlabel_df)

    # apply a tf-idf model with SVD
    combine_data = [*train_obj.labeled_data, *train_obj.pseudo_data]
    train_obj.fit_vectorizer(combine_data, min_df=0.010, max_df=0.8,
                             ngram_range=(1, 2), svd=True)

    # using lda
    train_obj.train_model()

    # test data
    train_obj.fit_vectorizer(train_obj.test_data, min_df=0.010, max_df=0.8,
                             ngram_range=(1, 2),
                             svd=True)  # assumed: the original snippet is truncated here; presumably mirrors the training call above

def update():
    """
    Update the database; intended to run asynchronously.
    :return: None
    """
    global update_process
    global update_process_percent
    update_process = "INITIATING"
    update_process_percent = 0.0
    con = sqlite3.connect('essay.db')
    database.init(con)  # initialize the database
    fetch_status, total_essay = fetch.total_essay_number()  # total number of papers currently under the cs.AI category
    if not fetch_status:  # if the fetch failed, report a server error
        raise Exception("Cannot get the count of total essay number")
    start_offset = (total_essay - 1) // request_max_results * request_max_results  # paging runs from the end backwards, so compute the starting offset
    last_updated = database.latest_update_time(con)  # timestamp of the most recently updated paper in the database; anything updated later has not been inserted yet
    update_process = "GETTING ESSAYS INFO"
    essay_to_insert = []
    pdf_to_fetch = []
    break_flag = False
    for i in range(start_offset, -1, -request_max_results):
        update_process_percent = 1 - (i / total_essay)
        essays = list()  # batch of papers
        trail_counter = 0  # failure counter; fetching here is frequent, so allow several attempts
        while essays is None or len(essays) == 0:
            if trail_counter >= 5:  # retry limit exceeded, treat as a server error
                return
            status, essays = fetch.fetch_data(i, request_max_results)  # try to fetch
            trail_counter = trail_counter + 1
        for essay in essays:
            # a paper is inserted if it was updated after the latest paper in the database, or is not in the database yet
            if essay["updated"] > last_updated or len(database.query(con, "id", essay["id"])) == 0:
                essay_to_insert.append(essay)
                if pdf_end_time > essay["updated"] >= pdf_start_time:
                    # published after 2020-10-01 and before 2021-01-01: record the PDF to download first
                    pdf_to_fetch.append((essay["pdf"], essay["id"]))
            else:
                # results run from newest to oldest, so hitting a known paper means the rest are already in the database
                break_flag = True
                break
        if break_flag:
            break
    update_process = "INSERT INTO DATABASE"
    database.insert(con, essay_to_insert)  # push the data into the database
    if os.path.exists("pdf_list.tmp"):  # load the previously cached list of PDFs to fetch
        temp_file = open("pdf_list.tmp")
        pdf_to_fetch.extend(json.loads(temp_file.read()))
        temp_file.close()
    temp_file = open("pdf_list.tmp", "w")  # cache the current list of PDFs to fetch in pdf_list.tmp
    temp_file.write(json.dumps(pdf_to_fetch))
    temp_file.close()
    update_process = "DOWNLOADING PDF"
    count = 1
    for essay in pdf_to_fetch:  # download the PDFs
        update_process_percent = count / len(pdf_to_fetch)
        fetch.download_pdf(essay[0], essay[1])
        count = count + 1
    if os.path.exists("pdf_list.tmp"):  # downloads finished, remove pdf_list.tmp
        os.remove("pdf_list.tmp")
    con.close()