def test_read_cache(self):
    with patch('cache.read_from_file') as mocked_read:
        # [0] one element in the cache, [1] several elements with different urls
        test_cases = [
            self.cache[self.date],
            {
                self.url: {self.news_article_1, self.news_article_2},
                self.another_url: {self.news_article_2}
            }
        ]
        for cache in test_cases:
            mocked_read.return_value = cache
            read_cache(self.date, self.url, limit=100)
            read_cache(self.date, 'ALL', limit=100)
            # a wrong source is provided
            self.assertRaises(CacheNotFoundError, read_cache,
                              self.date, 'wrong source', limit=100)
        # no elements in the cache at all
        mocked_read.return_value = None
        self.assertRaises(CacheNotFoundError, read_cache,
                          self.date, 'ALL', limit=100)
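# A minimal sketch (an assumption, not the project's actual implementation) of the
# read_cache function exercised by the test above: cache.read_from_file is expected
# to return a mapping of source URL -> set of articles for the given date, 'ALL'
# selects every source, and CacheNotFoundError is raised when nothing matches.
def read_cache(date, source, limit):
    cached = read_from_file(date)
    if not cached:
        raise CacheNotFoundError
    if source == 'ALL':
        articles = [article for articles in cached.values() for article in articles]
    elif source in cached:
        articles = list(cached[source])
    else:
        raise CacheNotFoundError
    return articles[:limit]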
def main():
    """Entry point for RSS reader"""
    try:
        args = get_args()
        if args.verbose:
            logging.basicConfig(level=logging.INFO,
                                format='%(asctime)s %(message)s')
        if not args.date:
            response = check_response(go_for_rss(args.source))
            news_articles = xml_parser(response, args.limit)
            save_cache(news_articles, args.source)
        else:
            news_articles = read_cache(args.date, args.source, args.limit)
        if args.to_html or args.to_pdf:
            converter(news_articles, args.to_html, args.to_pdf)
        else:
            result = output_format(news_articles, args.json)
            print_result(result, args.limit)
    except (CacheNotFoundError, GoForRssError,
            WrongResponseTypeError, NoDataToConvertError) as ex:
        # each custom error keeps its user-facing message in the class docstring
        print(ex.__doc__)
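# A hedged sketch (an assumption, not the project's actual code) of how the custom
# exceptions caught in main() could be defined: main() prints ex.__doc__, so the
# user-facing message lives in each exception's docstring. GoForRssError,
# WrongResponseTypeError and NoDataToConvertError would follow the same pattern.
class CacheNotFoundError(Exception):
    """No cached news was found for the given date and source."""


class GoForRssError(Exception):
    """The RSS source could not be reached. Check the URL and your connection."""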
def post_httplib(self, entry_path):
    url, payload = cache.read_cache(entry_path)
    params = urllib.urlencode(payload)
    headers = {"Content-type": "application/x-www-form-urlencoded",
               "Accept": "text/plain"}
    _url = urlparse(SerialGrabber_Paths.urls[url])
    if _url.scheme == "https":
        conn = httplib.HTTPSConnection(_url.hostname)
    else:
        conn = httplib.HTTPConnection(_url.hostname)
    conn.request("POST", _url.path, body=params, headers=headers)
    response = conn.getresponse()
    self.logger.info("HTTP Response: %s %s" % (response.status, response.reason))
    data = response.read()
    self.logger.log(5, data)
    conn.close()
    if response.status == 200:
        cache.decache(entry_path)
def post_requests(self, entry_path):
    url, payload = cache.read_cache(entry_path)
    s = requests.session()
    # legacy (pre-1.0) requests session options used by this code base
    s.config['keep_alive'] = False
    s.config['danger_mode'] = True
    s.config['max_retries'] = 0
    s.config['pool_connections'] = 1
    s.config['pool_maxsize'] = 1
    r = s.post(SerialGrabber_Paths.urls[url], data=payload, verify=False)
    self.logger.info("Response Code: %s" % r.status_code)
    self.logger.debug(r.text.encode('utf8'))
    toRet = False
    if r.status_code == requests.codes.ok:
        print "POSTED"
        cache.decache(entry_path)
        toRet = True
    r.raw.release_conn()
    del r
    del s
    return toRet
def get_growth_data(year, quarter):
    """
    Fetch growth-capability data.

    Parameters
    ----------
    year : int
        year, e.g. 2014
    quarter : int
        quarter: only 1, 2, 3 or 4 are accepted

    Return
    ------
    DataFrame
        mbrg, revenue growth rate of the main business (%)
        nprg, net profit growth rate (%)
        nav, net asset growth rate (%)
        targ, total asset growth rate (%)
        code, stock code
        name, stock name
        EXCHANGE, exchange
        eps, earnings per share
        holderInterests, shareholders' equity
        epsLastYear, earnings per share of the previous year
        holderInterestsLastYear, shareholders' equity of the previous year
        epsg, EPS growth rate (%)
        seg, shareholders' equity growth rate (%)
    """
    if ct._check_input(year, quarter) is True:
        filename = "growth_data_%d_%d.csv" % (year, quarter)
        data = cache.read_cache(filename)
        if data is not None:
            data = data.drop_duplicates('code')
            data['code'] = data['code'].map(lambda x: str(x).zfill(6))
            return data
        # no cache hit: fetch fresh data and write it to the cache
        ct._write_head()
        data = _get_growth_data(year, quarter, 1, pd.DataFrame())
        cache.write_cache(data, filename)
        if data is not None:
            data = data.drop_duplicates('code')
            data['code'] = data['code'].map(lambda x: str(x).zfill(6))
        return data
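# Hedged usage sketch: the arguments follow the docstring above (year, quarter 1-4).
# The first call downloads and caches the data; a later call with the same arguments
# is served from the cached CSV via cache.read_cache instead of re-downloading.
growth = get_growth_data(2014, 3)
print(growth.head())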
def get_debtpaying_data(year, quarter):
    """
    Fetch debt-paying-ability (solvency) data.

    Parameters
    ----------
    year : int
        year, e.g. 2014
    quarter : int
        quarter: only 1, 2, 3 or 4 are accepted

    Return
    ------
    DataFrame
        FinancialRatios1, current ratio (%)
        FinancialRatios2, quick ratio (%)
        FinancialRatios5, cash ratio (%)
        FinancialRatios6, interest coverage ratio
        FinancialRatios8, shareholders' equity ratio (%)
        FinancialRatios56, debt-to-asset ratio (%)
        Symbol, stock code
        SName, stock name
    """
    if ct._check_input(year, quarter) is True:
        filename = "debtpaying_data_%d_%d.csv" % (year, quarter)
        data = cache.read_cache(filename)
        if data is not None:
            return data
        # no cache hit: fetch fresh data, clean it and write it to the cache
        ct._write_head()
        data = _get_debtpaying_data(year, quarter, 1, pd.DataFrame())
        if data is not None:
            data = data.drop('FinancialRatios9', axis=1)
            data = data.drop('FinancialRatios18', axis=1)
            data = data.drop_duplicates('Symbol')
            data['Symbol'] = data['Symbol'].map(lambda x: str(x).zfill(6))
        cache.write_cache(data, filename)
        return data
def __init__(self, cursor, token_idx_lookup, full_token_idx_lookup, lookups_path,
             idf_path, train_size, txt_dataset_path, pkl_dataset_prefix=None):
    self.txt_dataset_path = txt_dataset_path
    self.pkl_dataset_prefix = pkl_dataset_prefix
    if self.pkl_dataset_prefix is not None:
        self.current_part = None
        return
    if self.txt_dataset_path is not None:
        if '.pkl' in self.txt_dataset_path:
            with open(self.txt_dataset_path, 'rb') as fh:
                self.dataset_cache = pickle.load(fh)
            return
        with open(self.txt_dataset_path) as fh:
            self.dataset_cache = [ast.literal_eval(line) for line in fh.readlines()]
        return
    with open(idf_path) as fh:
        self.idf = json.load(fh)
    self.cursor = cursor
    with open('./entity_to_row_id.pkl', 'rb') as fh:
        entity_id_to_row = pickle.load(fh)
    self.desc_fs = DocLookup('./desc_fs.npz',
                             entity_id_to_row,
                             token_idx_mapping=_.invert(token_idx_lookup),
                             default_value={},
                             use_default=True)
    self.desc_fs_unstemmed = DocLookup('./desc_unstemmed_fs.npz',
                                       entity_id_to_row,
                                       token_idx_mapping=_.invert(full_token_idx_lookup),
                                       default_value={'<PAD>': 1},
                                       use_default=True)
    self.embedding_dict = get_embedding_dict('./glove.6B.300d.txt', embedding_dim=300)
    self.stemmer = SnowballStemmer('english')
    lookups = load_entity_candidate_ids_and_label_lookup(lookups_path, train_size)
    label_to_entity_id = _.invert(lookups['entity_labels'])
    self.entity_candidates_prior = {
        entity_text: {label_to_entity_id[label]: candidates
                      for label, candidates in prior.items()}
        for entity_text, prior in lookups['entity_candidates_prior'].items()
    }
    self.prior_approx_mapping = u.get_prior_approx_mapping(self.entity_candidates_prior)
    self.mentions = None
    self.labels = None
    self.mention_doc_id = None
    self.mention_sentences = None
    self.mention_fs = None
    self.mention_fs_unstemmed = None
    self.page_f_lookup = None
    self.with_labels = None
    self._candidate_strs_lookup = read_cache('./candidate_strs_lookup.pkl',
                                             lambda: get_str_lookup(cursor))
    self.stopwords = set(nltk_stopwords.words('english'))
def __init__(self, cursor, page_id_order, entity_candidates_prior, entity_label_lookup,
             embedding, token_idx_lookup, batch_size, num_entities, num_candidates,
             entity_embeds, cheat=False, buffer_scale=1, min_mentions=1,
             use_fast_sampler=False, use_wiki2vec=False, use_sum_encoder=False,
             start_from_page_num=0,
             ablation=['local_context', 'document_context', 'prior']):
    self._candidate_strs_lookup = read_cache('./candidate_strs_lookup.pkl',
                                             lambda: get_str_lookup(cursor))
    self.page_id_order = page_id_order
    self.entity_candidates_prior = entity_candidates_prior
    self.entity_label_lookup = _.map_values(entity_label_lookup, torch.tensor)
    self.entity_id_lookup = {int(label): entity_id
                             for entity_id, label in self.entity_label_lookup.items()}
    self.embedding = embedding
    self.token_idx_lookup = token_idx_lookup
    self.cursor = cursor
    self.batch_size = batch_size
    self.num_entities = num_entities
    self.num_candidates = num_candidates
    self._sentence_spans_lookup = {}
    self._page_content_lookup = {}
    self._embedded_page_content_lookup = {}
    self._page_token_cnts_lookup = {}
    self._entity_page_mentions_lookup = {}
    self._mentions_per_page_ctr = defaultdict(int)
    self._mention_infos = {}
    self._bag_of_nouns_lookup = {}
    self.page_ctr = start_from_page_num
    self.cheat = cheat
    self.buffer_scale = buffer_scale
    self.min_mentions = min_mentions
    self.use_fast_sampler = use_fast_sampler
    self.use_wiki2vec = use_wiki2vec
    self.use_sum_encoder = use_sum_encoder
    # if self.use_fast_sampler: assert not self.use_wiki2vec, 'train wiki2vec locally'
    self.prior_approx_mapping = u.get_prior_approx_mapping(self.entity_candidates_prior)
    self.page_content_lim = 5000
    if self.min_mentions > 1:
        query = 'select id from entities where num_mentions >= ' + str(self.min_mentions)
        cursor.execute(query)
        self.valid_entity_ids = set(row['id'] for row in cursor.fetchall())
    self.ablation = ablation
    self.entity_embeds = entity_embeds
    self._offset = 0
    with open('./entity_to_row_id.pkl', 'rb') as fh:
        entity_id_to_row = pickle.load(fh)
    self.token_ctr_by_entity_id = DocLookup('./desc_unstemmed_fs.npz',
                                            entity_id_to_row,
                                            default_value={1: 1},
                                            use_default=True)
    self.to_entity_id = read_cache('./page_to_entity_id.pkl',
                                   lambda: get_page_id_to_entity_id_lookup(cursor))
def tk_interface(title="UML_downloader", pkg_path="packages/other_packages.txt",
                 credit_path="packages/credits.json", outstream=sys.stdout):
    # Create an installation interface to install mods.
    window = tix.Tk()
    window.title(title)
    # Build the correct pkg_path depending on whether we run under PyInstaller.
    if getattr(sys, 'frozen', False):
        application_path = sys._MEIPASS
    else:
        application_path = os.path.dirname(__file__)
    local_pkg_path = os.path.join(application_path, pkg_path)
    local_credit_path = os.path.join(application_path, credit_path) if credit_path else credit_path
    # Try to find cached information.
    cache_obj_path = cache.DEFAULT_CACHE
    cache_loc = cache.DEFAULT_CACHE_LOC
    cache_obj = cache.read_cache(location=cache_obj_path)
    cached_pkg_path = os.path.join(cache_obj.get("cache_dir", cache_loc), pkg_path)
    # The set used to update data.
    additional_set = set()
    keeper = {"updated": False}

    # Update-sections callback.
    def update_sections():
        if not keeper["updated"]:
            # This prevents redundant repeated downloads. TODO: disable the button instead.
            link = GITHUB_PATTERN_DEFAULT.format(DEFAULT_REPO, pkg_path)
            filehandler.download(cached_pkg_path, link, stream=False, outstream=outstream)
            keeper["updated"] = True
        else:
            return
        sections = read_sections_from_pkg(cached_pkg_path)
        keeper["adtframe"].destroy()
        keeper["adtframe"] = adtframe = treeview_frame(window, sections, additional_set,
                                                       cache_obj=cache_obj, outstream=outstream)
        adtframe.grid(column=0, row=2, columnspan=2)

    # Config frame, handles all the settings (original location, etc.).
    frame, location = control_frame(cache_obj, additional_set,
                                    update_sections_fn=update_sections,
                                    cache_obj_path=cache_obj_path, cache_loc=cache_loc,
                                    credit_path=local_credit_path,
                                    master=window, padx=5, pady=2)
    frame.grid(column=0, row=0, columnspan=2, sticky="w")
    # Additional mods from an external source.
    sections = read_sections_from_pkg(
        cached_pkg_path if os.path.isfile(cached_pkg_path) else local_pkg_path)
    keeper["adtframe"] = adtframe = treeview_frame(window, sections, additional_set,
                                                   cache_obj=cache_obj, outstream=outstream)
    adtframe.grid(column=0, row=2, columnspan=2)
    return window
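# Hedged usage sketch: build the installer window and hand control to the Tk event
# loop (tix.Tk inherits mainloop from tkinter.Tk).
if __name__ == "__main__":
    window = tk_interface()
    window.mainloop()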