def Cold_boot(self, url, pause=3):
    """Fetch a URL and retry after a delay if an error is met."""
    headers = {'user-agent': self.get_random_user_agent()}
    try:
        requests.packages.urllib3.disable_warnings(
            requests.packages.urllib3.exceptions.InsecureRequestWarning)
        r = requests.get(
            url=url,
            proxies=self.proxies,
            headers=headers,
            allow_redirects=False,
            verify=False,
            timeout=30)
        time.sleep(pause)
        ylog.info(url)
        content = r.content
        charset = cchardet.detect(content)
        bsObj = content.decode(charset['encoding'])
        return bsObj
    except Exception as e:  # ValueError is already covered by Exception
        # print('something')
        print(e)  # exceptions have no `.message` attribute in Python 3
        print("Sleeping for %i" % self.error_delay)
        time.sleep(self.error_delay)
        return self.Cold_boot(url, pause)
def extract(self, text):
    """Parse a Google results page and collect title, URL, abstract, publication and time."""
    soup = BeautifulSoup(text, "lxml")
    papers = soup.find_all('div', class_='g')
    result = []
    for item in papers:
        try:
            pub, created_datetime = item.find(
                'div', class_='slp').find('span').get_text().split('-')
            created_datetime = self.clear_time(created_datetime.strip())
        except ValueError:
            # the span holds only the publication, no date
            ylog.info(item.find('div', class_='slp').find_all('span'))
            pub = item.find('div', class_='slp').find('span').get_text()
            created_datetime = None
        except AttributeError:
            # expected markup is missing, skip this entry
            continue
        Title = item.find('h3').get_text()
        PageURL = item.find('a')['href']
        MatchedAbstract = item.find('div', class_='st').get_text()
        information = {
            'Title': Title,
            'PageURL': PageURL,
            'Publication': pub.replace('\u200e ', ''),
            'MatchedAbstract': MatchedAbstract,
            'CreatedTime': created_datetime
        }
        result.append(information)
        # ylog.debug(Title)
    return result
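# A minimal usage sketch (not part of the original module): it assumes
# `crawler` is an instance of the class that defines Cold_boot() and
# extract(); the query URL is illustrative only.
# html = crawler.Cold_boot('https://www.google.com/search?q=deep+learning&tbm=nws')
# for entry in crawler.extract(html):
#     print(entry['Title'], entry['PageURL'], entry['CreatedTime'])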
def find_meta(title, doi):
    """Find Crossref metadata by title or DOI.

    Keyword Arguments:
    title -- article title to match against Crossref results
    doi   -- DOI of the article (unused in this version)
    """
    ylog.info(title)
    works = Works()
    w1 = works.query(title).sort('relevance').order('desc')
    i = 0
    for item in w1:
        i = i + 1
        try:
            t = item.get('title')[0]
            sub_title = item.get('subtitle')[0]
        except (TypeError, IndexError):
            continue
        # `.ratio` must be called; comparing the bound method to 0.9 is a bug
        if SequenceMatcher(a=title, b=t).ratio() > 0.9 or SequenceMatcher(
                a=title, b=sub_title).ratio() > 0.9:
            return item
        if i > 18:
            ylog.debug('[x]%s' % title)
            # ylog.debug(item['title'])
            return None
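# Standalone illustration of the 0.9 fuzzy-match threshold used above; the
# example strings are made up, the values come from difflib:
# >>> from difflib import SequenceMatcher
# >>> SequenceMatcher(a='Deep Learning', b='Deep learning').ratio()
# 0.923...  # above 0.9, so this Crossref item would be accepted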
def find_meta(self, identifier):
    """Find metadata for an article with its title or DOI.

    Keyword Arguments:
    identifier -- dict holding at least 'article_link' and 'name'
    """
    try:
        # verify=False is dangerous but sci-hub.io
        # requires intermediate certificates to verify
        # and requests doesn't know how to download them.
        # as a hacky fix, you can add them to your store
        # and verifying would work. will fix this later.
        url = self.base_url + identifier['article_link']
        self.sess.headers = {'user-agent': self.get_random_user_agent()}
        res = self.sess.get(url, verify=False, allow_redirects=False)
        re_bracket = re.compile(r"\[(.*?)\]\s")
        title = re.sub(re_bracket, "", identifier['name'])
        ylog.debug('*' * 80)
        ylog.debug("title: %s" % title)
        ylog.debug(res.status_code)
        # self.out.ix[title]['status_code'] = res.status_code
        ylog.debug("headers: %s" % res.headers['Content-Type'])
        ylog.debug('location: %s' % res.headers.get("Location"))
        # self.out.ix[title]['location'] = res.headers.get("Location")
        search_title = True
        if not res.headers.get("Location"):
            content = res.content
            if len(content) > 2:
                import cchardet
                charset = cchardet.detect(content)
                text = content.decode(charset['encoding'])
                soup = BeautifulSoup(text, "lxml")
                script = soup.script.get_text()
                doi_regexp = r'10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+'
                try:
                    doi_match = re.compile(doi_regexp).findall(script)[0]
                    ylog.info("DOI: %s" % doi_match)
                    search_title = False
                    # use crossref API to get metadata
                    works = Works()
                    w1 = works.query(doi_match).sort('relevance').order('desc')
                    for item in w1:
                        # TODO: verify title
                        # self.out.ix[title]['DOI'] = item['DOI']
                        return {'meta': item['DOI'], 'url': url}
                except IndexError:
                    ylog.debug('failed to find regexp')
        elif search_title:
            works = Works()
            w1 = works.query(title).sort('relevance').order('desc')
            i = 0
            for item in w1:
                i = i + 1
                # Crossref items may lack 'title' or 'subtitle'; fall back to ''
                t = (item.get('title') or [''])[0]
                sub_title = (item.get('subtitle') or [''])[0]
                # ylog.debug("ratio: %s" % SequenceMatcher(a=title, b=t).ratio())
                if SequenceMatcher(a=title, b=t).ratio() > 0.9 \
                        or SequenceMatcher(a=title, b=sub_title).ratio() > 0.9 \
                        or t.startswith(title):
                    ylog.debug("DOI %s" % item['DOI'])
                    # self.out.ix[title]['DOI'] = item['DOI']
                    return {'meta': item['DOI'], 'url': url}
                if i > 18:
                    # ylog.debug('[x]%s' % title)
                    # ylog.debug(item['title'])
                    return None
    except requests.exceptions.ConnectionError:
        logger.info('{} cannot access, changing'.format(
            self.available_base_url_list[0]))
        self._change_base_url()
    except requests.exceptions.RequestException:
        return {
            'err': 'Failed to fetch pdf with identifier %s (resolved url %s) '
                   'due to request exception.' % (identifier, url)
        }
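# Quick standalone check of the DOI pattern used above; the sample string is
# made up for illustration:
# >>> import re
# >>> doi_regexp = r'10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+'
# >>> re.findall(doi_regexp, 'var doi = "10.1145/2449396.2449413";')
# ['10.1145/2449396.2449413']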
def upload_node(dict_re_match_object):
    """Upload the regular expression match objects in the dictionary in a batch.

    1. get each value from the input dictionary.
    2. create a graph upload request.
    3. fill node properties. use the encoded original Chinese title plus URL
       as the url property.
    4. if the upload response reports an error, retry.
    5. print upload statistics.

    Keyword Arguments:
    dict_re_match_object -- dict mapping an index to a regex match object
    """
    res = None
    retry = 0
    nodes_fail_retry = 0
    uploaded_number = 0
    while res is None:
        # reset per attempt so that a later successful upload can leave the loop
        error = None
        re_upload_error = None
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            # iterate over the batch of nodes
            for index, value in dict_re_match_object.items():
                if value is not None:
                    item = dict_re_match_object.get(index)
                    # print(item)
                    title = item.group()[1:-1]
                    zh_title = HanziConv.toSimplified(title)
                    # if zh_title in IGNORE_CATEGORIES:
                    #     break
                    node = graph_upload_request.graph.nodes.add()
                    node.props.type = "readonlyDoc"
                    # p1 = node.props.props.entries.add()
                    # p1.key = "url"
                    # p1.value = "https://www.google.com.hk/search?hl=en&source=hp&q=" + quote_plus(title)
                    p2 = node.props.props.entries.add()
                    p2.key = "_s_import_source"
                    p2.value = "word2vec model"
                    node.businessID.url = ("https://www.google.com.hk/search?hl=en&source=hp&q="
                                           + quote_plus(title))
                    node.names.chinese = zh_title
            # other information of the upload request
            graph_upload_request.uploadTag = "UploadWord2VecVocabNodes"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
        except HTTPError as e:
            if e.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (e.code,
                                                                     e.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as e:
            error = 'A retriable error occurred: %s' % e
        try:
            if res.failedNodes:
                re_upload_error = "some nodes failed to upload %s" % res.failedNodes
        except AttributeError:
            pass
        if re_upload_error is not None:
            print(re_upload_error)
            nodes_fail_retry += 1
            res = None
            if nodes_fail_retry > NODES_FAIL_MAX_RETRIES:
                ylog.debug(res)
                res = "continue"
        if error is not None:
            print(error)
            retry += 1
            res = None
            if retry > MAX_RETRIES:
                ylog.debug(res)
                # break
                # exit("no longer attempting to retry.")
            ylog.debug(res)
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    # ylog.debug(res)
    # the retry loop has ended; collect statistics from the final response
    try:
        if res.nodeUpdateResultStatistics:
            ylog.debug(res.nodeUpdateResultStatistics)
            uploaded_number = res.nodeUpdateResultStatistics.numOfCreations + \
                res.nodeUpdateResultStatistics.numOfUpdates + \
                res.nodeUpdateResultStatistics.numOfSkips
        if res.uploadedNodes:
            for updated in res.uploadedNodes:
                ylog.debug("uploaded node GID: %s" % updated.gid)
        if res.failedNodes:
            for err in res.failedNodes:
                if err.error.errorCode != 202001:
                    ylog.info(err.error)
                    ylog.debug(err.error)
    except AttributeError:
        pass
    return uploaded_number
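# Hypothetical way to build the input that upload_node() expects: a dict that
# maps an index to a regex match whose group looks like "[some title]" (the
# surrounding brackets are stripped by item.group()[1:-1]). The vocabulary
# file name and bracket pattern are assumptions, not part of the original
# script.
# import re
# pattern = re.compile(r'\[.+?\]')
# with open('vocab.txt', encoding='utf-8') as f:
#     batch = {i: pattern.search(line) for i, line in enumerate(f)}
# uploaded = upload_node(batch)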
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert skill results into the graphUpload graph format."""
import logging

from ylib import ylog
from lib.gftTools import gftIO
from lib.gftTools.proto import graphUpload_pb2
from lib.gftTools.gftIO import GSError

ylog.set_level(logging.DEBUG)
ylog.console_on()
# ylog.filelog_on("wiki_upload")
ylog.info("start")


def skill_result_2_graph(resp_run_node_action):
    """Convert skill_pb2.RespRunNodeAction to graphUpload.proto.Graph so that
    in Python we only have one graph format.

    Keyword Arguments:
    resp_run_node_action -- skill_pb2.RespRunNodeAction, result from get_graph_from_neo4j

    Return:
    graphUpload_pb2.Graph
    """
    graph_upload_request = graphUpload_pb2.GraphUploadRequest()
    for n in resp_run_node_action.graphs[0].graph.nodes:
        node = graph_upload_request.graph.nodes.add()
def create_continuous_contract(start_date, end_date, contract_data, target):
    '''Parse contract data to get a continuous price series for each group.

    Parameters
    ----------
    start_date: datetime
    end_date: datetime
    contract_data: OOTTV
        contract name, contract code, date, settlement date, close price
    target: list or None
        targets to parse; None will parse all contracts.

    Returns
    -------
    continuous_price: DataFrame
    '''
    if isinstance(contract_data, gftIO.GftTable):
        data = contract_data.asColumnTab().copy()
    if isinstance(target, list):
        target = gftIO.strSet2Np(np.array(target))
    name = {
        'INNERCODE': 'contract_code',
        'OPTIONCODE': 'contract_name',
        'SETTLEMENTDATE': 'settlement_date',
        'ENDDATE': 'date',
        'CLOSEPRICE': 'close_price'
    }
    data.rename(columns=lambda x: name[x], inplace=True)
    data.dropna(subset=['settlement_date'], inplace=True)
    continuous_price = pd.DataFrame()
    if target is None:
        target = data['contract_name'].unique()
    for num_contract, contract in enumerate(target):
        ylog.info(num_contract)
        ylog.info(contract)
        target_data = data[data['contract_name'] == contract]
        target_expiry_dates = target_data[['contract_code', 'settlement_date']].\
            drop_duplicates().sort_values('settlement_date')
        target_expiry_dates.set_index('contract_code', inplace=True)
        target_expiry_dates = target_expiry_dates[target_expiry_dates.columns[0]]
        target_data = target_data.loc[:, ['date', 'contract_code', 'close_price']]
        contract_data = target_data.pivot(
            index='date', columns='contract_code', values='close_price')
        contract_dates = contract_data.index
        continuous_contract_price = pd.Series(
            np.ones(len(contract_dates)), index=contract_dates, name=contract)
        # ylog.info(contract_dates)
        prev_date = contract_dates[0]
        # Loop through each contract and create the specific weightings for
        # each contract depending upon the rollover date and the price
        # adjustment method. For backtesting we roll on the last trading day
        # and use backward ratio price adjustment.
        target_data_with_datetimeindex = target_data.set_index('date')
        price_adjust_ratio = pd.Series(
            np.ones(len(target_expiry_dates)),
            index=target_expiry_dates.values,
            name='ratio')
        adjusted_price = pd.Series(index=contract_dates, name=contract)
        target_data_with_datetimeindex['close_price'].replace(
            to_replace=0, method='bfill', inplace=True)
        target_data_with_datetimeindex['close_price'].replace(
            to_replace=0, method='pad', inplace=True)
        target_data_with_datetimeindex = target_data_with_datetimeindex[
            ~target_data_with_datetimeindex.index.duplicated()]
        for i, (item, ex_date) in enumerate(target_expiry_dates.items()):
            # ylog.info(i)
            # ylog.info(item)
            # ylog.info(ex_date)
            if i < len(target_expiry_dates) - 1 \
                    and ex_date < target_data_with_datetimeindex.index[-1]:
                idx_ex_date = target_data_with_datetimeindex.index.searchsorted(
                    ex_date)
                pre_ex_date = contract_dates[idx_ex_date - 1]
                # ex_date has no price data, move ex_date to the next trading date.
                if ex_date not in target_data_with_datetimeindex.index and \
                        idx_ex_date + 1 < len(target_data_with_datetimeindex.index):
                    ex_date = contract_dates[idx_ex_date + 1]
                else:
                    continue
                price_adjust_ratio.loc[ex_date] = target_data_with_datetimeindex['close_price'].loc[ex_date] / \
                    target_data_with_datetimeindex['close_price'].loc[pre_ex_date]
        # create the adjusted price as the product of the target price and
        # the cumulative adjustment ratio.
        for i, (item, ex_date) in enumerate(target_expiry_dates.items()):
            # print(i, item, ex_date)
            idx_ex_date = contract_data.index.searchsorted(ex_date)
            pre_ex_date = contract_dates[idx_ex_date - 1]
            adjusted_price.loc[prev_date:pre_ex_date] = \
                target_data_with_datetimeindex['close_price'].loc[prev_date:pre_ex_date] * \
                price_adjust_ratio.loc[ex_date:].cumprod().iloc[-1]
            prev_date = ex_date
        continuous_price = pd.concat([continuous_price, adjusted_price], axis=1)
    return continuous_price
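# Hypothetical call, assuming `futures_data` is a gftIO.GftTable in OOTTV
# layout (contract name, contract code, date, settlement date, close price);
# the dates and the variable name are illustrative only.
# continuous = create_continuous_contract(
#     start_date=pd.Timestamp('2016-01-01'),
#     end_date=pd.Timestamp('2017-01-01'),
#     contract_data=futures_data,
#     target=None)  # None -> build a continuous series for every contract group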
ylog.debug('[x]%s' % title)
# ylog.debug(item['title'])
break

dl.download_from_doi('10.1145/2449396.2449413')
with open('/home/weiwu/share/deep_learning/data/My Collection.bib') as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

items = []
for article in bib_database.entries:
    if article['ENTRYTYPE'] == 'article':
        if article.get('doi') is not None:
            title = article['title'][1:-1]
            # title = ' '.join(['+' + x for x in title.split()])
            ylog.info(title)
            result = {'target': title}
            w1 = works.query(title).sort('relevance').order('desc')
            i = 0
            for item in w1:
                i = i + 1
                try:
                    t = item.get('title')[0]
                except (TypeError, IndexError):
                    continue
                if SequenceMatcher(a=title, b=t).ratio() > 0.9:
                    result['result'] = item['title']
                    target_doi = article.get('doi').lower()
                    found_doi = item['DOI'].lower()
                    ylog.debug("target doi: %s" % target_doi)
                    ylog.debug("found doi: %s" % found_doi)