Example #1
 def test_read_cache(self):
     with patch('cache.read_from_file') as mocked_read:
         # [0] one element in cache, [1] a few elements with different URLs
         test_cases = [
             self.cache[self.date], {
                 self.url: {self.news_article_1, self.news_article_2},
                 self.another_url: {self.news_article_2}
             }
         ]
         for cache in test_cases:
             mocked_read.return_value = cache
             read_cache(self.date, self.url, limit=100)
             read_cache(self.date, 'ALL', limit=100)
         # if a wrong source is provided
         self.assertRaises(CacheNotFoundError,
                           read_cache,
                           self.date,
                           'wrong source',
                           limit=100)
         # if there is no element in the cache
         mocked_read.return_value = None
         self.assertRaises(CacheNotFoundError,
                           read_cache,
                           self.date,
                           'ALL',
                           limit=100)
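
For context, here is a minimal sketch of the read_cache function this test exercises, inferred only from how the test calls it; the date/source/limit parameters and CacheNotFoundError come from the test itself, while the cache layout and the body below are assumptions, not the project's actual implementation.

def read_cache(date, source, limit):
    # read_from_file is the helper patched as 'cache.read_from_file' in the test
    cached = read_from_file(date)
    if not cached:
        # nothing cached for this date
        raise CacheNotFoundError
    if source == 'ALL':
        # flatten the per-URL article sets into one list
        articles = [article for items in cached.values() for article in items]
    elif source in cached:
        articles = list(cached[source])
    else:
        # the requested source URL is not in the cache
        raise CacheNotFoundError
    return articles[:limit]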
Example #2
def main():
    """Entry point for RSS reader"""
    try:
        args = get_args()
        if args.verbose:
            logging.basicConfig(level=logging.INFO,
                                format='%(asctime)s %(message)s')

        if not args.date:
            response = check_response(go_for_rss(args.source))
            news_articles = xml_parser(response, args.limit)
            save_cache(news_articles, args.source)
        else:
            news_articles = read_cache(args.date, args.source, args.limit)

        if args.to_html or args.to_pdf:
            converter(news_articles, args.to_html, args.to_pdf)
        else:
            result = output_format(news_articles, args.json)
            print_result(result, args.limit)
    except (CacheNotFoundError, GoForRssError,
            WrongResponseTypeError, NoDataToConvertError) as ex:
        # each custom error carries its user-facing message in its class docstring
        print(ex.__doc__)
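
The except clause above relies on each custom exception carrying its user-facing message in its class docstring. A minimal sketch of how those classes might be declared; the class names come from the snippet, the docstring wording is assumed.

class CacheNotFoundError(Exception):
    """No cached news was found for the given date and source."""

class GoForRssError(Exception):
    """The RSS feed could not be fetched from the given source."""

class WrongResponseTypeError(Exception):
    """The response received is not a valid RSS/XML document."""

class NoDataToConvertError(Exception):
    """There are no news articles to convert."""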
Example #3
    def post_httplib(self, entry_path):
        url, payload = cache.read_cache(entry_path)
        params = urllib.urlencode(payload)
        headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain"}

        _url = urlparse(SerialGrabber_Paths.urls[url])
        if _url.scheme == "https":
            conn = httplib.HTTPSConnection(_url.hostname)
        else:
            conn = httplib.HTTPConnection(_url.hostname)
        conn.request("POST", _url.path, body=params, headers=headers)
        response = conn.getresponse()
        self.logger.info("HTTP Response: %s %s" % (response.status, response.reason))
        data = response.read()
        self.logger.log(5, data)
        conn.close()
        if response.status == 200:
            cache.decache(entry_path)
Example #4
    def post_requests(self, entry_path):
        url, payload = cache.read_cache(entry_path)
        s = requests.session()
        s.config['keep_alive'] = False
        s.config['danger_mode'] = True
        s.config['max_retries'] = 0
        s.config['pool_connections'] = 1
        s.config['pool_maxsize'] = 1

        toRet = False
        r = s.post(SerialGrabber_Paths.urls[url], data=payload, verify=False)
        self.logger.info("Response Code: %s" % r.status_code)
        self.logger.debug(r.text.encode('utf8'))
        if r.status_code == requests.codes.ok:
            print "POSTED"
            cache.decache(entry_path)
            toRet = True
        r.raw.release_conn()
        del r
        del s
        return toRet
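
Session.config in the snippet above only exists in the pre-1.0 requests API. Below is a rough sketch of the same POST against modern requests (1.0+); the HTTPAdapter settings approximate the old config keys, and cache, SerialGrabber_Paths and the logger are taken from the snippet.

import requests
from requests.adapters import HTTPAdapter

def post_requests(self, entry_path):
    # sketch only: approximates the legacy session config with per-scheme adapters
    url, payload = cache.read_cache(entry_path)
    session = requests.Session()
    adapter = HTTPAdapter(pool_connections=1, pool_maxsize=1, max_retries=0)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    session.headers["Connection"] = "close"   # roughly what keep_alive=False did
    try:
        r = session.post(SerialGrabber_Paths.urls[url], data=payload, verify=False)
        r.raise_for_status()                  # roughly what danger_mode=True did
    finally:
        session.close()
    self.logger.info("Response Code: %s", r.status_code)
    self.logger.debug(r.text)
    cache.decache(entry_path)
    return True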
Example #6
def get_growth_data(year, quarter):
    """
        Fetch growth-capability data.
    Parameters
    --------
    year:int year, e.g. 2014
    quarter:int quarter: 1, 2, 3 or 4 only

    Return
    --------
    DataFrame
        mbrg, revenue growth rate of the main business (%)
        nprg, net profit growth rate (%)
        nav, net asset growth rate (%)
        targ, total asset growth rate (%)
        code, stock code
        name, stock name
        EXCHANGE, exchange
        eps, earnings per share
        holderInterests, shareholders' equity
        epsLastYear, earnings per share in the previous year
        holderInterestsLastYear, shareholders' equity in the previous year
        epsg, EPS growth rate (%)
        seg, shareholders' equity growth rate (%)
    """
    if ct._check_input(year, quarter) is True:
        filename = "growth_data_%d_%d.csv" % (year, quarter)
        data = cache.read_cache(filename)
        if data is not None:
            data = data.drop_duplicates('code')
            data['code'] = data['code'].map(lambda x: str(x).zfill(6))
            return data
        # no cache hit: download fresh data
        ct._write_head()
        data = _get_growth_data(year, quarter, 1, pd.DataFrame())
        cache.write_cache(data, filename)
        if data is not None:
            data = data.drop_duplicates('code')
            data['code'] = data['code'].map(lambda x: str(x).zfill(6))
        return data
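
The snippet above treats cache.read_cache and cache.write_cache as a filename-keyed cache of DataFrames. A minimal sketch of that pair is shown below, assuming a plain CSV directory; the directory name and behaviour are assumptions, not the project's actual cache module.

import os
import pandas as pd

CACHE_DIR = os.path.expanduser("~/.data_cache")   # assumed location

def read_cache(filename):
    # return the cached DataFrame, or None on a cache miss
    path = os.path.join(CACHE_DIR, filename)
    if os.path.isfile(path):
        return pd.read_csv(path)
    return None

def write_cache(data, filename):
    # persist the DataFrame so later calls can skip the download
    if data is None:
        return
    os.makedirs(CACHE_DIR, exist_ok=True)
    data.to_csv(os.path.join(CACHE_DIR, filename), index=False)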
Example #7
    def post_httplib(self, entry_path):
        url, payload = cache.read_cache(entry_path)
        params = urllib.urlencode(payload)
        headers = {
            "Content-type": "application/x-www-form-urlencoded",
            "Accept": "text/plain"
        }

        _url = urlparse(SerialGrabber_Paths.urls[url])
        if _url.scheme == "https":
            conn = httplib.HTTPSConnection(_url.hostname)
        else:
            conn = httplib.HTTPConnection(_url.hostname)
        conn.request("POST", _url.path, body=params, headers=headers)
        response = conn.getresponse()
        self.logger.info("HTTP Response: %s %s" %
                         (response.status, response.reason))
        data = response.read()
        self.logger.log(5, data)
        conn.close()
        if response.status == 200:
            cache.decache(entry_path)
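
The httplib and urllib.urlencode calls above are Python 2 only. A Python 3 sketch of the same POST using http.client and urllib.parse follows; cache, SerialGrabber_Paths and the logger are taken from the snippet, the rest is an assumption about how a port would look.

import http.client
from urllib.parse import urlencode, urlparse

def post_httplib(self, entry_path):
    url, payload = cache.read_cache(entry_path)
    params = urlencode(payload)
    headers = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/plain",
    }
    _url = urlparse(SerialGrabber_Paths.urls[url])
    if _url.scheme == "https":
        conn = http.client.HTTPSConnection(_url.hostname)
    else:
        conn = http.client.HTTPConnection(_url.hostname)
    conn.request("POST", _url.path, body=params, headers=headers)
    response = conn.getresponse()
    self.logger.info("HTTP Response: %s %s", response.status, response.reason)
    self.logger.log(5, response.read())
    conn.close()
    if response.status == 200:
        # only drop the cached entry once the server has accepted it
        cache.decache(entry_path)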
Example #8
def get_debtpaying_data(year, quarter):
    """
        Fetch solvency (debt-paying ability) data.
    Parameters
    --------
    year:int year, e.g. 2014
    quarter:int quarter: 1, 2, 3 or 4 only

    Return
    --------
    DataFrame
        FinancialRatios1, current ratio (%)
        FinancialRatios2, quick ratio (%)
        FinancialRatios5, cash ratio (%)
        FinancialRatios6, interest coverage ratio
        FinancialRatios8, shareholders' equity ratio (%)
        FinancialRatios56, debt-to-asset ratio (%)
        Symbol, stock code
        SName, stock name
    """
    
    if ct._check_input(year, quarter) is True:
        filename = "debtpaying_data_%d_%d.csv" % (year, quarter)
        data = cache.read_cache(filename)
        if data is not None:
            return data
        # no cache hit: download fresh data
        ct._write_head()
        data = _get_debtpaying_data(year, quarter, 1, pd.DataFrame())
        if data is not None:
            data = data.drop('FinancialRatios9', axis=1)
            data = data.drop('FinancialRatios18', axis=1)
            data = data.drop_duplicates('Symbol')
            data['Symbol'] = data['Symbol'].map(lambda x: str(x).zfill(6))
        cache.write_cache(data, filename)
        return data
Example #9
 def __init__(self,
              cursor,
              token_idx_lookup,
              full_token_idx_lookup,
              lookups_path,
              idf_path,
              train_size,
              txt_dataset_path,
              pkl_dataset_prefix=None):
     self.txt_dataset_path = txt_dataset_path
     self.pkl_dataset_prefix = pkl_dataset_prefix
     if self.pkl_dataset_prefix is not None:
         self.current_part = None
         return
     if self.txt_dataset_path is not None:
         if '.pkl' in self.txt_dataset_path:
             with open(self.txt_dataset_path, 'rb') as fh:
                 self.dataset_cache = pickle.load(fh)
                 return
         with open(self.txt_dataset_path) as fh:
             self.dataset_cache = [
                 ast.literal_eval(line) for line in fh.readlines()
             ]
             return
     with open(idf_path) as fh:
         self.idf = json.load(fh)
     self.cursor = cursor
     with open('./entity_to_row_id.pkl', 'rb') as fh:
         entity_id_to_row = pickle.load(fh)
     self.desc_fs = DocLookup('./desc_fs.npz',
                              entity_id_to_row,
                              token_idx_mapping=_.invert(token_idx_lookup),
                              default_value={},
                              use_default=True)
     self.desc_fs_unstemmed = DocLookup(
         './desc_unstemmed_fs.npz',
         entity_id_to_row,
         token_idx_mapping=_.invert(full_token_idx_lookup),
         default_value={'<PAD>': 1},
         use_default=True)
     self.embedding_dict = get_embedding_dict('./glove.6B.300d.txt',
                                              embedding_dim=300)
     self.stemmer = SnowballStemmer('english')
     lookups = load_entity_candidate_ids_and_label_lookup(
         lookups_path, train_size)
     label_to_entity_id = _.invert(lookups['entity_labels'])
     self.entity_candidates_prior = {
         entity_text: {
             label_to_entity_id[label]: candidates
             for label, candidates in prior.items()
         }
         for entity_text, prior in
         lookups['entity_candidates_prior'].items()
     }
     self.prior_approx_mapping = u.get_prior_approx_mapping(
         self.entity_candidates_prior)
     self.mentions = None
     self.labels = None
     self.mention_doc_id = None
     self.mention_sentences = None
     self.mention_fs = None
     self.mention_fs_unstemmed = None
     self.page_f_lookup = None
     self.with_labels = None
     self._candidate_strs_lookup = read_cache(
         './candidate_strs_lookup.pkl', lambda: get_str_lookup(cursor))
     self.stopwords = set(nltk_stopwords.words('english'))
Example #10
 def __init__(self,
              cursor,
              page_id_order,
              entity_candidates_prior,
              entity_label_lookup,
              embedding,
              token_idx_lookup,
              batch_size,
              num_entities,
              num_candidates,
              entity_embeds,
              cheat=False,
              buffer_scale=1,
              min_mentions=1,
              use_fast_sampler=False,
              use_wiki2vec=False,
              use_sum_encoder=False,
              start_from_page_num=0,
              ablation=['local_context', 'document_context', 'prior']):
     self._candidate_strs_lookup = read_cache(
         './candidate_strs_lookup.pkl', lambda: get_str_lookup(cursor))
     self.page_id_order = page_id_order
     self.entity_candidates_prior = entity_candidates_prior
     self.entity_label_lookup = _.map_values(entity_label_lookup,
                                             torch.tensor)
     self.entity_id_lookup = {
         int(label): entity_id
         for entity_id, label in self.entity_label_lookup.items()
     }
     self.embedding = embedding
     self.token_idx_lookup = token_idx_lookup
     self.cursor = cursor
     self.batch_size = batch_size
     self.num_entities = num_entities
     self.num_candidates = num_candidates
     self._sentence_spans_lookup = {}
     self._page_content_lookup = {}
     self._embedded_page_content_lookup = {}
     self._page_token_cnts_lookup = {}
     self._entity_page_mentions_lookup = {}
     self._mentions_per_page_ctr = defaultdict(int)
     self._mention_infos = {}
     self._bag_of_nouns_lookup = {}
     self.page_ctr = start_from_page_num
     self.cheat = cheat
     self.buffer_scale = buffer_scale
     self.min_mentions = min_mentions
     self.use_fast_sampler = use_fast_sampler
     self.use_wiki2vec = use_wiki2vec
     self.use_sum_encoder = use_sum_encoder
     # if self.use_fast_sampler: assert not self.use_wiki2vec, 'train wiki2vec locally'
     self.prior_approx_mapping = u.get_prior_approx_mapping(
         self.entity_candidates_prior)
     self.page_content_lim = 5000
     if self.min_mentions > 1:
         query = 'select id from entities where num_mentions >= ' + str(
             self.min_mentions)
         cursor.execute(query)
         self.valid_entity_ids = set(row['id'] for row in cursor.fetchall())
     self.ablation = ablation
     self.entity_embeds = entity_embeds
     self._offset = 0
     with open('./entity_to_row_id.pkl', 'rb') as fh:
         entity_id_to_row = pickle.load(fh)
     self.token_ctr_by_entity_id = DocLookup('./desc_unstemmed_fs.npz',
                                             entity_id_to_row,
                                             default_value={1: 1},
                                             use_default=True)
     self.to_entity_id = read_cache(
         './page_to_entity_id.pkl',
         lambda: get_page_id_to_entity_id_lookup(cursor))
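
In the two constructors above, read_cache takes a pickle path plus a factory callable and behaves as a compute-if-missing memoizer. A minimal sketch of such a helper is given below; this is an assumption about the project's utility code, not its actual implementation.

import os
import pickle

def read_cache(path, factory):
    # load the pickled value if it exists, otherwise build it with factory() and cache it
    if os.path.isfile(path):
        with open(path, 'rb') as fh:
            return pickle.load(fh)
    value = factory()
    with open(path, 'wb') as fh:
        pickle.dump(value, fh)
    return value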
Example #11
def tk_interface(title="UML_downloader",
                 pkg_path="packages/other_packages.txt",
                 credit_path="packages/credits.json",
                 outstream=sys.stdout):
    # create an installation interface to install mods
    window = tix.Tk()
    window.title(title)
    # create the correct pkg_path depending on pyinstaller mode
    if getattr(sys, 'frozen', False):
        application_path = sys._MEIPASS
    else:
        application_path = os.path.dirname(__file__)
    local_pkg_path = os.path.join(application_path, pkg_path)
    local_credit_path = os.path.join(
        application_path, credit_path) if credit_path else credit_path
    # try to find cached information
    cache_obj_path = cache.DEFAULT_CACHE
    cache_loc = cache.DEFAULT_CACHE_LOC
    cache_obj = cache.read_cache(location=cache_obj_path)
    cached_pkg_path = os.path.join(cache_obj.get("cache_dir", cache_loc),
                                   pkg_path)
    # the set used to update data
    additional_set = set()
    keeper = {"updated": False}

    # update section function
    def update_sections():
        # this prevents redundant repeated downloads. TODO: disable the button instead
        if not keeper["updated"]:
            link = GITHUB_PATTERN_DEFAULT.format(DEFAULT_REPO, pkg_path)
            filehandler.download(cached_pkg_path,
                                 link,
                                 stream=False,
                                 outstream=outstream)
            keeper["updated"] = True
        else:
            return
        sections = read_sections_from_pkg(cached_pkg_path)
        keeper["adtframe"].destroy()
        keeper["adtframe"] = adtframe = treeview_frame(window,
                                                       sections,
                                                       additional_set,
                                                       cache_obj=cache_obj,
                                                       outstream=outstream)
        adtframe.grid(column=0, row=2, columnspan=2)

    # Config frame, handles all the settings (original location, etc.)
    frame, location = control_frame(cache_obj,
                                    additional_set,
                                    update_sections_fn=update_sections,
                                    cache_obj_path=cache_obj_path,
                                    cache_loc=cache_loc,
                                    credit_path=local_credit_path,
                                    master=window,
                                    padx=5,
                                    pady=2)
    frame.grid(column=0, row=0, columnspan=2, sticky="w")
    # Additional mods from external source
    sections = read_sections_from_pkg(
        cached_pkg_path if os.path.isfile(cached_pkg_path) else local_pkg_path)
    keeper["adtframe"] = adtframe = treeview_frame(window,
                                                   sections,
                                                   additional_set,
                                                   cache_obj=cache_obj,
                                                   outstream=outstream)
    adtframe.grid(column=0, row=2, columnspan=2)
    return window
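
Here cache.read_cache(location=...) is expected to hand back a dict-like settings object (see cache_obj.get("cache_dir", ...)). A minimal sketch of that behaviour, assuming a JSON file on disk and an empty dict on a miss; the real cache module may store more than this.

import json
import os

def read_cache(location):
    # return the saved settings dict, or an empty dict when nothing is cached yet
    if not os.path.isfile(location):
        return {}
    with open(location, "r", encoding="utf-8") as fh:
        return json.load(fh)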